diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..45232b80e
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,161 @@
+---
+Language:        Cpp
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: Left
+AlignConsecutiveAssignments: AcrossComments
+AlignConsecutiveBitFields: AcrossComments
+AlignConsecutiveDeclarations: AcrossComments
+AlignConsecutiveMacros: AcrossComments
+# AlignConsecutiveShortCaseStatements: AcrossComments
+AlignEscapedNewlines: Left # LeftWithLastLine
+AlignOperands:   Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 1
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: true
+BinPackArguments: true
+BinPackParameters: true # OnePerLine
+BitFieldColonSpacing: Both
+BreakBeforeBraces: Custom # Attach
+BraceWrapping:
+  AfterCaseLabel:  true
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+# BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Never
+BreakBeforeBinaryOperators: None
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: false
+# BreakBinaryOperations: Never
+BreakConstructorInitializers: AfterColon
+# BreakFunctionDefinitionParameters: false
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+# BreakTemplateDeclarations: Yes
+ColumnLimit:     120
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineBeforeAccessModifier: Leave
+EmptyLineAfterAccessModifier: Never
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+InsertBraces:    true # NOTE: may lead to incorrect formatting
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: CurrentLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Middle
+QualifierAlignment: Left
+#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+ReferenceAlignment: Middle
+ReflowComments:  false # IndentOnly
+SeparateDefinitionBlocks: Always
+SortIncludes:    CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        c++17
+TabWidth:        4
+UseTab:          Never
+WhitespaceSensitiveMacros: ['STRINGIZE']
+...
+
diff --git a/.clang-tidy b/.clang-tidy
index 3078beacc..310c3d182 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -12,12 +12,15 @@ Checks: >
     -readability-implicit-bool-conversion,
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
+    -readability-simplify-boolean-expr,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
+    -misc-use-anonymous-namespace,
 FormatStyle: none
diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline
index f3a4944f8..af8c0cea6 100644
--- a/.devops/cloud-v-pipeline
+++ b/.devops/cloud-v-pipeline
@@ -15,7 +15,7 @@ node('x86_runner1'){            // Running on x86 runner containing latest vecto
     stage('Running llama.cpp'){
         sh'''#!/bin/bash
             module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc
-            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
+            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
             cat llama_log.txt                   # Printing results
         '''
     }
diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile
new file mode 100644
index 000000000..522ee8147
--- /dev/null
+++ b/.devops/cpu.Dockerfile
@@ -0,0 +1,92 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+ARG TARGETARCH
+
+ARG GGML_CPU_ARM_ARCH=armv8-a
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+    elif [ "$TARGETARCH" = "arm64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
+    else \
+        echo "Unsupported architecture"; \
+        exit 1; \
+    fi && \
+    cmake --build build -j $(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile
new file mode 100644
index 000000000..974dd78a8
--- /dev/null
+++ b/.devops/cuda.Dockerfile
@@ -0,0 +1,94 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
deleted file mode 100644
index 77a9ddc14..000000000
--- a/.devops/full-cuda.Dockerfile
+++ /dev/null
@@ -1,34 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
deleted file mode 100644
index 8b9633dc4..000000000
--- a/.devops/full-rocm.Dockerfile
+++ /dev/null
@@ -1,45 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV LLAMA_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
deleted file mode 100644
index cef1297d3..000000000
--- a/.devops/full.Dockerfile
+++ /dev/null
@@ -1,22 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
new file mode 100644
index 000000000..af783f5e9
--- /dev/null
+++ b/.devops/intel.Dockerfile
@@ -0,0 +1,91 @@
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
+
+## Build Image
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" \
+        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with dynamic libs" && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/lib/ /app
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
+
diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile
new file mode 100644
index 000000000..02dce501c
--- /dev/null
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -0,0 +1,44 @@
+ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+
+FROM ascendai/cann:$ASCEND_VERSION AS build
+
+WORKDIR /app
+
+COPY . .
+
+RUN yum install -y gcc g++ cmake make
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# find libascend_hal.so, because the drive hasn`t been mounted.
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+
+RUN echo "Building with static libs" && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF  && \
+    cmake --build build --config Release --target llama-cli
+
+# TODO: use image with NNRT
+FROM ascendai/cann:$ASCEND_VERSION AS runtime
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+ENTRYPOINT ["/llama-cli" ]
diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec
deleted file mode 100644
index 076f29695..000000000
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ /dev/null
@@ -1,84 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp-clblast
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        OpenCL Inference of LLaMA model in C/C++
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
-Requires:       clblast
-URL:            https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CLBLAST=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamaclblast
-cp -p server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
-/usr/lib/systemd/system/llamaclblast.service
-%config /etc/sysconfig/llama
-
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec
similarity index 79%
rename from .devops/llama-cpp-cublas.srpm.spec
rename to .devops/llama-cpp-cuda.srpm.spec
index f847ebb1e..7425d3a9d 100644
--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
 
@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.
 
-Name:           llama.cpp-cublas
+Name:           llama.cpp-cuda
 Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 
 %build
-make -j LLAMA_CUBLAS=1
+make -j GGML_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
 
 %pre
diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec
index 446213d69..4d5560089 100644
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
 
@@ -38,9 +38,9 @@ make -j
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
-cp -p server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service
@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
 
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
deleted file mode 100644
index 2b7faf7c1..000000000
--- a/.devops/main-cuda.Dockerfile
+++ /dev/null
@@ -1,32 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-
-RUN apt-get update && \
-    apt-get install -y build-essential git
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
-
-RUN make
-
-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
-
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]
diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile
deleted file mode 100644
index 572e5d8ea..000000000
--- a/.devops/main-intel.Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN mkdir build && \
-    cd build && \
-    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-        echo "LLAMA_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build . --config Release --target main
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-COPY --from=build /app/build/bin/main /main
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile
deleted file mode 100644
index 0a706dc73..000000000
--- a/.devops/main-rocm.Dockerfile
+++ /dev/null
@@ -1,45 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV LLAMA_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN make
-
-ENTRYPOINT [ "/app/main" ]
diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile
deleted file mode 100644
index bca460365..000000000
--- a/.devops/main-vulkan.Dockerfile
+++ /dev/null
@@ -1,29 +0,0 @@
-ARG UBUNTU_VERSION=jammy
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-
-# Install Vulkan SDK
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk
-
-# Build it
-WORKDIR /app
-COPY . .
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
-    cmake --build . --config Release --target main
-
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/main /main && \
-    rm -rf /app
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
deleted file mode 100644
index 3ab1decd6..000000000
--- a/.devops/main.Dockerfile
+++ /dev/null
@@ -1,20 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/main /main
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
new file mode 100644
index 000000000..bfd7fc1c1
--- /dev/null
+++ b/.devops/musa.Dockerfile
@@ -0,0 +1,108 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+# MUSA architecture to build for (defaults to all supported archs)
+ARG MUSA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y \
+    build-essential \
+    cmake \
+    python3 \
+    python3-pip \
+    git \
+    libcurl4-openssl-dev \
+    libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default MUSA archs if not specified
+RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_MUSA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix
index b8a12cc0a..0ecf19fc5 100644
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -6,11 +6,10 @@
         let
           inherit (config.packages) default;
           binaries = [
-            "llama"
+            "llama-cli"
             "llama-embedding"
             "llama-server"
-            "quantize"
-            "train-text-from-scratch"
+            "llama-quantize"
           ];
           mkApp = name: {
             type = "app";
diff --git a/.devops/nix/devshells.nix b/.devops/nix/devshells.nix
index 1862f0f08..bfd304af1 100644
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,13 +1,52 @@
+{ inputs, ... }:
+
 {
   perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
     {
       devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                        pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
     };
 }
diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix
index 4a2f81c4b..90d683a71 100644
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -26,16 +26,14 @@
           config.cudaSupport = true;
           config.allowUnfreePredicate =
             p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
+            builtins.all (
+              license:
+              license.free
+              || builtins.elem license.shortName [
+                "CUDA EULA"
+                "cuDNN EULA"
+              ]
+            ) (p.meta.licenses or [ p.meta.license ]);
         };
         # Ensure dependencies use ROCm consistently
         pkgsRocm = import inputs.nixpkgs {
diff --git a/.devops/nix/package-gguf-py.nix b/.devops/nix/package-gguf-py.nix
new file mode 100644
index 000000000..cca2f36a5
--- /dev/null
+++ b/.devops/nix/package-gguf-py.nix
@@ -0,0 +1,36 @@
+{
+  lib,
+  llamaVersion,
+  numpy,
+  tqdm,
+  sentencepiece,
+  pyyaml,
+  poetry-core,
+  buildPythonPackage,
+  pytestCheckHook,
+}:
+
+buildPythonPackage {
+  pname = "gguf";
+  version = llamaVersion;
+  pyproject = true;
+  nativeBuildInputs = [ poetry-core ];
+  propagatedBuildInputs = [
+    numpy
+    tqdm
+    sentencepiece
+    pyyaml
+  ];
+  src = lib.cleanSource ../../gguf-py;
+  pythonImportsCheck = [
+    "numpy"
+    "gguf"
+  ];
+  nativeCheckInputs = [ pytestCheckHook ];
+  doCheck = true;
+  meta = with lib; {
+    description = "Python package for writing binary files in the GGUF format";
+    license = licenses.mit;
+    maintainers = [ maintainers.ditsuke ];
+  };
+}
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 815db6a2d..043c4364b 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -1,36 +1,47 @@
 {
   lib,
+  glibc,
   config,
   stdenv,
-  mkShell,
+  runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
-  python3,
   mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
+  blas,
   cudaPackages,
+  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
-  clblast,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useOpenCL
-    useRocm
-    useVulkan
-  ],
+  curl,
+  shaderc,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
-  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
   useRocm ? config.rocmSupport,
+  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
+  enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-}@inputs:
+
+  # It's necessary to consistently use backendStdenv when building with CUDA support,
+  # otherwise we get libstdc++ errors downstream.
+  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+  precompileMetalShaders ? false,
+}:
 
 let
   inherit (lib)
@@ -38,51 +49,29 @@ let
     cmakeFeature
     optionals
     strings
-    versionOlder
     ;
 
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
   stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
 
   suffices =
     lib.optionals useBlas [ "BLAS" ]
     ++ lib.optionals useCuda [ "CUDA" ]
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
     ++ lib.optionals useRocm [ "ROCm" ]
     ++ lib.optionals useVulkan [ "Vulkan" ];
 
   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
 
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-    ]
-  );
+  xcrunHost = runCommand "xcrunHost" { } ''
+    mkdir -p $out/bin
+    ln -s /usr/bin/xcrun $out/bin
+  '';
 
   # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
   # separately
@@ -96,16 +85,9 @@ let
     ++ optionals useMetalKit [ MetalKit ];
 
   cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
+    cuda_cudart
+    cuda_cccl # <nv/target>
+    libcublas
   ];
 
   rocmBuildInputs = with rocmPackages; [
@@ -117,174 +99,149 @@ let
   vulkanBuildInputs = [
     vulkan-headers
     vulkan-loader
+    shaderc
   ];
 in
 
-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
+effectiveStdenv.mkDerivation (finalAttrs: {
+  pname = "llama-cpp${pnameSuffix}";
+  version = llamaVersion;
 
-    # Note: none of the files discarded here are visible in the sandbox or
-    # affect the output hash. This also means they can be modified without
-    # triggering a rebuild.
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        let
-          noneOf = builtins.all (x: !x);
-          baseName = baseNameOf name;
-        in
-        noneOf [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." baseName) # Skip hidden files and directories
-          (baseName == "flake.lock")
-        ];
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
-    '';
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
+  # Note: none of the files discarded here are visible in the sandbox or
+  # affect the output hash. This also means they can be modified without
+  # triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        noneOf = builtins.all (x: !x);
+        baseName = baseNameOf name;
+      in
+      noneOf [
+        (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+        (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
+        (lib.hasPrefix "." baseName) # Skip hidden files and directories
+        (baseName == "flake.lock")
       ];
+    src = lib.cleanSource ../../.;
+  };
 
-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
-      ++ optionals useRocm rocmBuildInputs
-      ++ optionals useVulkan vulkanBuildInputs;
+  postPatch = ''
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
+  '';
 
-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_NATIVE" false)
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" true)
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
-        (cmakeBool "LLAMA_VULKAN" useVulkan)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # `default.metallib` may be compiled with Metal compiler from XCode
+  # and we need to escape sandbox on MacOS to access Metal compiler.
+  # `xcrun` is used find the path of the Metal compiler, which is varible
+  # and not on $PATH
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pkg-config
+      git
+    ]
+    ++ optionals useCuda [
+      cudaPackages.cuda_nvcc
+
+      autoAddDriverRunpath
+    ]
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+  buildInputs =
+    optionals effectiveStdenv.isDarwin darwinBuildInputs
+    ++ optionals useCuda cudaBuildInputs
+    ++ optionals useMpi [ mpi ]
+    ++ optionals useRocm rocmBuildInputs
+    ++ optionals useBlas [ blas ]
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ optionals enableCurl [ curl ];
+
+  cmakeFlags =
+    [
+      (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
+      (cmakeBool "GGML_NATIVE" false)
+      (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CUDA" useCuda)
+      (cmakeBool "GGML_HIP" useRocm)
+      (cmakeBool "GGML_METAL" useMetalKit)
+      (cmakeBool "GGML_VULKAN" useVulkan)
+      (cmakeBool "GGML_STATIC" enableStatic)
+    ]
+    ++ optionals useCuda [
+      (
+        with cudaPackages.flags;
+        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
         )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
+      )
+    ]
+    ++ optionals useRocm [
+      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+    ]
+    ++ optionals useMetalKit [
+      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+    ];
 
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
+  # Environment variables needed for ROCm
+  env = optionals useRocm {
+    ROCM_PATH = "${rocmPackages.clr}";
+    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+  };
 
-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
-      mkdir -p $out/include
-      cp $src/llama.h $out/include/
-    '';
+  # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+  # if they haven't been added yet.
+  postInstall = ''
+    mkdir -p $out/include
+    cp $src/include/llama.h $out/include/
+  '';
 
-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useOpenCL
-        useRocm
-        useVulkan
-        ;
+  meta = {
+    # Configurations we don't want even the CI to evaluate. Results in the
+    # "unsupported platform" messages. This is mostly a no-op, because
+    # cudaPackages would've refused to evaluate anyway.
+    badPlatforms = optionals useCuda lib.platforms.darwin;
 
-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-        shellHook = ''
-          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-        '';
-      };
+    # Configurations that are known to result in build failures. Can be
+    # overridden by importing Nixpkgs with `allowBroken = true`.
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);
 
-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
+    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
+    license = lib.licenses.mit;
 
-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+    # Accommodates `nix run` and `lib.getExe`
+    mainProgram = "llama-cli";
 
-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
+    # These people might respond, on the best effort basis, if you ping them
+    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+    # Consider adding yourself to this list if you want to ensure this flake
+    # stays maintained and you're willing to invest your time. Do not add
+    # other people without their consent. Consider removing people after
+    # they've been unreachable for long periods of time.
 
-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
+    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+    # an attrset following the same format as in
+    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+    maintainers = with lib.maintainers; [
+      philiptaron
+      SomeoneSerge
+    ];
 
-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
-
-      # These people might respond, on the best effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
+    # Extend `badPlatforms` instead
+    platforms = lib.platforms.all;
+  };
+})
diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix
new file mode 100644
index 000000000..56ea18278
--- /dev/null
+++ b/.devops/nix/python-scripts.nix
@@ -0,0 +1,66 @@
+{
+  lib,
+  stdenv,
+  buildPythonPackage,
+  poetry-core,
+  mkShell,
+  python3Packages,
+  gguf-py,
+}@inputs:
+
+let
+  llama-python-deps = with python3Packages; [
+    numpy
+    sentencepiece
+    transformers
+    protobuf
+    torchWithoutCuda
+    gguf-py
+    tqdm
+
+    # for scripts/compare-llama-bench.py
+    gitpython
+    tabulate
+
+    # for examples/pydantic-models-to-grammar-examples.py
+    docstring-parser
+    pydantic
+
+  ];
+
+  llama-python-test-deps = with python3Packages; [
+    # Server bench
+    matplotlib
+
+    # server tests
+    openai
+    pytest
+    prometheus-client
+  ];
+in
+
+buildPythonPackage ({
+  pname = "llama-scripts";
+  version = "0.0.0";
+  pyproject = true;
+
+  # NOTE: The files filtered out here are not visible in the build sandbox, neither
+  # do they affect the output hash. They can be modified without triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        any = builtins.any (x: x);
+        baseName = builtins.baseNameOf name;
+      in
+      any [
+        (lib.hasSuffix ".py" name)
+        (baseName == "README.md")
+        (baseName == "pyproject.toml")
+      ];
+    src = lib.cleanSource ../../.;
+  };
+  nativeBuildInputs = [ poetry-core ];
+  nativeCheckInputs = llama-python-test-deps;
+  dependencies = llama-python-deps;
+})
diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix
index 78530c9e8..478e8c422 100644
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,19 +1,41 @@
 {
   lib,
   newScope,
+  python3,
   llamaVersion ? "0.0.0",
 }:
 
+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
 
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-    docker = self.callPackage ./docker.nix { };
-    docker-min = self.callPackage ./docker.nix { interactive = false; };
-    sif = self.callPackage ./sif.nix { };
-  }
-)
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix
index 7535ca0f3..7a5e1dd0f 100644
--- a/.devops/nix/sif.nix
+++ b/.devops/nix/sif.nix
@@ -7,7 +7,7 @@
 }:
 
 let
-    optionalInt = cond: x: if cond then x else 0;
+  optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
   inherit (llama-cpp) name;
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
new file mode 100644
index 000000000..a8088ea00
--- /dev/null
+++ b/.devops/rocm.Dockerfile
@@ -0,0 +1,113 @@
+ARG UBUNTU_VERSION=24.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=6.3
+ARG AMDGPU_VERSION=6.3
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+### Build image
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
+# gfx906 is deprecated
+#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
+
+#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
+ARG ROCM_DOCKER_ARCH=gfx1100
+
+# Set nvcc architectured
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+# ENV CC=/opt/rocm/llvm/bin/clang
+# ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN apt-get update \
+    && apt-get install -y \
+    build-essential \
+    cmake \
+    git \
+    libcurl4-openssl-dev \
+    curl \
+    libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
+    && cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib \
+    && find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_ROCM_DEV_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3-pip \
+    python3 \
+    python3-wheel\
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
deleted file mode 100644
index 4f83904bc..000000000
--- a/.devops/server-cuda.Dockerfile
+++ /dev/null
@@ -1,32 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-
-RUN apt-get update && \
-    apt-get install -y build-essential git
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
-
-RUN make
-
-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
-
-COPY --from=build /app/server /server
-
-ENTRYPOINT [ "/server" ]
diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile
deleted file mode 100644
index 312f2df80..000000000
--- a/.devops/server-intel.Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN mkdir build && \
-    cd build && \
-    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-        echo "LLAMA_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build . --config Release --target server
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-COPY --from=build /app/build/bin/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]
diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile
deleted file mode 100644
index e9a31647c..000000000
--- a/.devops/server-rocm.Dockerfile
+++ /dev/null
@@ -1,45 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV LLAMA_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN make
-
-ENTRYPOINT [ "/app/server" ]
diff --git a/.devops/server-vulkan.Dockerfile b/.devops/server-vulkan.Dockerfile
deleted file mode 100644
index e0add6fc3..000000000
--- a/.devops/server-vulkan.Dockerfile
+++ /dev/null
@@ -1,29 +0,0 @@
-ARG UBUNTU_VERSION=jammy
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-
-# Install Vulkan SDK
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk
-
-# Build it
-WORKDIR /app
-COPY . .
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
-    cmake --build . --config Release --target server
-
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/server /server && \
-    rm -rf /app
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]
diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile
deleted file mode 100644
index 134588fe2..000000000
--- a/.devops/server.Dockerfile
+++ /dev/null
@@ -1,20 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential git
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 3a7d274e4..41a6b1e55 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,36 +8,40 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
+    exec ./llama-cli "$@"
+elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
+    exec ./llama-bench "$@"
+elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
+    exec ./llama-perplexity "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+    for i in $(ls $1/$2/ggml-model-f16.bin*); do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
     echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
+    echo "              ex: -m model.gguf"
+    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
+    echo "              ex: -m model.gguf -f file.txt"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "              ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "              ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"
diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
new file mode 100644
index 000000000..9064f3838
--- /dev/null
+++ b/.devops/vulkan.Dockerfile
@@ -0,0 +1,89 @@
+ARG UBUNTU_VERSION=24.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK and cURL
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+
+# Build it
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl libvulkan-dev \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/.dockerignore b/.dockerignore
index 633bbc3a9..064b7c7be 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-.git/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
 .github/
 .gitignore
 .vs/
@@ -12,8 +12,8 @@ build*/
 
 models/*
 
-/main
-/quantize
+/llama-cli
+/llama-quantize
 
 arm_neon.h
 compile_commands.json
diff --git a/.ecrc b/.ecrc
index a3351f4e6..c68877ec2 100644
--- a/.ecrc
+++ b/.ecrc
@@ -1,5 +1,5 @@
 {
-  "Exclude": ["^\\.gitmodules$"],
+  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
   "Disable": {
     "IndentSize": true
   }
diff --git a/.editorconfig b/.editorconfig
index 16d16b3b5..5d63d0a51 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -24,5 +24,27 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
+[examples/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
+[examples/server/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
+
+[models/templates/*.jinja]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
diff --git a/.flake8 b/.flake8
index 18fba2c15..d64c2564a 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,17 @@
 [flake8]
 max-line-length = 125
-ignore = W503
+ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist  # This is generated with `python build .` for package releases
+# max-complexity = 10
diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
new file mode 100644
index 000000000..b85bf5741
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -0,0 +1,87 @@
+name: Bug (compilation)
+description: Something goes wrong when trying to compile llama.cpp.
+title: "Compile bug: "
+labels: ["bug-unconfirmed", "compilation"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for bug reports where the compilation of llama.cpp fails.
+        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
+        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
+        by clearing `~/.cache/ccache` (on Linux).
+  - type: textarea
+    id: commit
+    attributes:
+      label: Git commit
+      description: Which commit are you trying to compile?
+      placeholder: |
+        $git rev-parse HEAD
+        84a07a17b1b08cf2b9747c633a2372782848a27f
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        multiple: true
+    validations:
+      required: true
+  - type: textarea
+    id: info
+    attributes:
+      label: Problem description & steps to reproduce
+      description: >
+        Please give us a summary of the problem and tell us how to reproduce it.
+        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
+      placeholder: >
+        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
+        Here are the exact commands that I used: ...
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Compile command
+      description: >
+        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
new file mode 100644
index 000000000..1ccef0793
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -0,0 +1,101 @@
+name: Bug (model use)
+description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
+title: "Eval bug: "
+labels: ["bug-unconfirmed", "model evaluation"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for bug reports where the model evaluation results
+        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
+        If you encountered the issue while using an external UI (e.g. ollama),
+        please reproduce your issue using one of the examples/binaries in this repository.
+        The `llama-cli` binary can be used for simple and reproducible model inference.
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which version of our software are you running? (use `--version` to get a version string)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+        multiple: true
+    validations:
+      required: true
+  - type: textarea
+    id: hardware
+    attributes:
+      label: Hardware
+      description: Which CPUs/GPUs are you using?
+      placeholder: >
+        e.g. Ryzen 5950X + 2x RTX 4090
+    validations:
+      required: true
+  - type: textarea
+    id: model
+    attributes:
+      label: Models
+      description: >
+        Which model(s) at which quantization were you using when encountering the bug?
+        If you downloaded a GGUF file off of Huggingface, please provide a link.
+      placeholder: >
+        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
+    validations:
+      required: false
+  - type: textarea
+    id: info
+    attributes:
+      label: Problem description & steps to reproduce
+      description: >
+        Please give us a summary of the problem and tell us how to reproduce it.
+        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
+        that information would be very much appreciated by us.
+      placeholder: >
+        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
+        When I use -ngl 0 it works correctly.
+        Here are the exact commands that I used: ...
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
new file mode 100644
index 000000000..1904e31fd
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -0,0 +1,91 @@
+name: Bug (misc.)
+description: Something is not working the way it should (and it's not covered by any of the above cases).
+title: "Misc. bug: "
+labels: ["bug-unconfirmed"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for taking the time to fill out this bug report!
+        This issue template is intended for miscellaneous bugs that don't fit into any other category.
+        If you encountered the issue while using an external UI (e.g. ollama),
+        please reproduce your issue using one of the examples/binaries in this repository.
+  - type: textarea
+    id: version
+    attributes:
+      label: Name and Version
+      description: Which version of our software is affected? (You can use `--version` to get a version string.)
+      placeholder: |
+        $./llama-cli --version
+        version: 2999 (42b4109e)
+        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+    validations:
+      required: true
+  - type: dropdown
+    id: operating-system
+    attributes:
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
+      multiple: true
+      options:
+        - Linux
+        - Mac
+        - Windows
+        - BSD
+        - Other? (Please let us know in description)
+    validations:
+      required: false
+  - type: dropdown
+    id: module
+    attributes:
+      label: Which llama.cpp modules do you know to be affected?
+      multiple: true
+      options:
+        - Documentation/Github
+        - libllama (core library)
+        - llama-cli
+        - llama-server
+        - llama-bench
+        - llama-quantize
+        - Python/Bash scripts
+        - Test code
+        - Other (Please specify in the next section)
+    validations:
+      required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Command line
+      description: >
+        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
+  - type: textarea
+    id: info
+    attributes:
+      label: Problem description & steps to reproduce
+      description: >
+        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
+    validations:
+      required: true
+  - type: textarea
+    id: first_bad_commit
+    attributes:
+      label: First Bad Commit
+      description: >
+        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
+        If possible, please do a git bisect and identify the exact commit that introduced the bug.
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: >
+          If applicable, please copy and paste any relevant log output, including any generated text.
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml
new file mode 100644
index 000000000..02dd4f575
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -0,0 +1,51 @@
+name: Enhancement
+description: Used to request enhancements for llama.cpp.
+title: "Feature Request: "
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+
+  - type: checkboxes
+    id: prerequisites
+    attributes:
+      label: Prerequisites
+      description: Please confirm the following before submitting your enhancement request.
+      options:
+        - label: I am running the latest code. Mention the version if possible as well.
+          required: true
+        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+          required: true
+        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
+          required: true
+        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+          required: true
+
+  - type: textarea
+    id: feature-description
+    attributes:
+      label: Feature Description
+      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+      placeholder: Detailed description of the enhancement
+    validations:
+      required: true
+
+  - type: textarea
+    id: motivation
+    attributes:
+      label: Motivation
+      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+      placeholder: Explanation of why this feature is needed and its benefits
+    validations:
+      required: true
+
+  - type: textarea
+    id: possible-implementation
+    attributes:
+      label: Possible Implementation
+      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
+      placeholder: Detailed description of potential implementation
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml
new file mode 100644
index 000000000..18975dbbf
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -0,0 +1,52 @@
+name: Research
+description: Track new technical research area.
+title: "Research: "
+labels: ["research 🔬"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+
+  - type: checkboxes
+    id: research-stage
+    attributes:
+      label: Research Stage
+      description: Track general state of this research ticket
+      options:
+        - label: Background Research (Let's try to avoid reinventing the wheel)
+        - label: Hypothesis Formed (How do you think this will work and it's effect?)
+        - label: Strategy / Implementation Forming
+        - label: Analysis of results
+        - label: Debrief / Documentation (So people in the future can learn from us)
+
+  - type: textarea
+    id: background
+    attributes:
+      label: Previous existing literature and research
+      description: Whats the current state of the art and whats the motivation for this research?
+
+  - type: textarea
+    id: hypothesis
+    attributes:
+      label: Hypothesis
+      description: How do you think this will work and it's effect?
+
+  - type: textarea
+    id: implementation
+    attributes:
+      label: Implementation
+      description: Got an approach? e.g. a PR ready to go?
+
+  - type: textarea
+    id: analysis
+    attributes:
+      label: Analysis
+      description: How does the proposed implementation behave?
+
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml
new file mode 100644
index 000000000..b6e6ab36d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -0,0 +1,28 @@
+name: Refactor (Maintainers)
+description: Used to track refactoring opportunities.
+title: "Refactor: "
+labels: ["refactor"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+
+  - type: textarea
+    id: background-description
+    attributes:
+      label: Background Description
+      description: Please provide a detailed written description of the pain points you are trying to solve.
+      placeholder: Detailed description behind your motivation to request refactor
+    validations:
+      required: true
+
+  - type: textarea
+    id: possible-approaches
+    attributes:
+      label: Possible Refactor Approaches
+      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
+      placeholder: Your idea of possible refactoring opportunity/approaches
+    validations:
+      required: false
diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md
deleted file mode 100644
index 49812832c..000000000
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug-unconfirmed"]
-assignees: ''
-
----
-
-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-
-If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..eb8c4b472
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,11 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Got an idea?
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+    about: Pop it there. It may then become an enhancement ticket.
+  - name: Got a question?
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+    about: Ask a question there!
+  - name: Want to contribute?
+    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+    about: Head to the contribution guide page of the wiki for areas you can help with
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
deleted file mode 100644
index dcffda750..000000000
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
----
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
-- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
-- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
-- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 000000000..1b47bc968
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,86 @@
+# https://github.com/actions/labeler
+Kompute:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/include/ggml-kompute.h
+            - ggml/src/ggml-kompute/**
+            - README-kompute.md
+Apple Metal:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/include/ggml-metal.h
+            - ggml/src/ggml-metal/**
+            - README-metal.md
+SYCL:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/include/ggml-sycl.h
+            - ggml/src/ggml-sycl/**
+            - docs/backend/SYCL.md
+            - examples/sycl/**
+Nvidia GPU:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/include/ggml-cuda.h
+            - ggml/src/ggml-cuda/**
+Vulkan:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/include/ggml-vulkan.h
+            - ggml/src/ggml-vulkan/**
+documentation:
+    - changed-files:
+        - any-glob-to-any-file:
+            - docs/**
+            - media/**
+testing:
+    - changed-files:
+        - any-glob-to-any-file:
+            - tests/**
+build:
+    - changed-files:
+        - any-glob-to-any-file:
+            - cmake/**
+            - CMakeLists.txt
+            - CMakePresets.json
+examples:
+    - changed-files:
+        - any-glob-to-any-file: examples/**
+devops:
+    - changed-files:
+        - any-glob-to-any-file:
+            - .devops/**
+            - .github/**
+            - ci/**
+python:
+    - changed-files:
+        - any-glob-to-any-file:
+            - "**/*.py"
+            - requirements/**
+            - gguf-py/**
+            - .flake8
+script:
+    - changed-files:
+        - any-glob-to-any-file:
+            - scripts/**
+android:
+    - changed-files:
+        - any-glob-to-any-file:
+            - examples/llama.android/**
+server:
+    - changed-files:
+        - any-glob-to-any-file:
+            - examples/server/**
+ggml:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml/**
+nix:
+    - changed-files:
+        - any-glob-to-any-file:
+            - "**/*.nix"
+            - .github/workflows/nix-*.yml
+            - .devops/nix/nixpkgs-instances.nix
+embedding:
+    - changed-files:
+        - any-glob-to-any-file: examples/embedding/
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 000000000..d9f5bdc23
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1 @@
+*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled
new file mode 100644
index 000000000..1c8787ef7
--- /dev/null
+++ b/.github/workflows/bench.yml.disabled
@@ -0,0 +1,315 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+#       https://github.com/ggerganov/llama.cpp/issues/7893
+#
+# Benchmark
+name: Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      gpu-series:
+        description: 'Azure GPU series to run with'
+        required: true
+        type: choice
+        options:
+          - Standard_NC4as_T4_v3
+          - Standard_NC24ads_A100_v4
+          - Standard_NC80adis_H100_v5
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      duration:
+        description: 'Duration of the bench'
+        type: string
+        default: 10m
+
+  push:
+    branches:
+      - master
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  schedule:
+    -  cron: '04 2 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
+  cancel-in-progress: true
+
+jobs:
+  bench-server-baseline:
+    runs-on: Standard_NC4as_T4_v3
+    env:
+      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
+      N_USERS: 8
+      DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install python env
+        id: pipenv
+        run: |
+          cd examples/server/bench
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+
+      - name: Prometheus
+        id: install_prometheus
+        run: |
+          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+          tar xzf prometheus*.tar.gz --strip-components=1
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          while ! nc -z localhost 9090; do
+            sleep 0.1
+          done
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
+        id: k6_installation
+        run: |
+          cd examples/server/bench
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+              --with github.com/phymbert/xk6-sse
+
+      - name: Build
+        id: cmake_build
+        run: |
+          set -eux
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DLLAMA_CUBLAS=ON \
+              -DCUDAToolkit_ROOT=/usr/local/cuda \
+              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+              -DCMAKE_CUDA_ARCHITECTURES=75 \
+              -DLLAMA_FATAL_WARNINGS=OFF \
+              -DLLAMA_ALL_WARNINGS=OFF \
+              -DCMAKE_BUILD_TYPE=Release;
+          cmake --build build --config Release -j $(nproc) --target llama-server
+
+      - name: Download the dataset
+        id: download_dataset
+        run: |
+          cd examples/server/bench
+          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - name: Server bench
+        id: server_bench
+        env:
+            HEAD_REF: ${{ github.head_ref || github.ref_name }}
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          source venv/bin/activate
+          python bench.py \
+              --runner-label ${{ env.RUNNER_LABEL }} \
+              --name ${{ github.job }} \
+              --branch $HEAD_REF \
+              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
+              --scenario script.js \
+              --duration ${{ github.event.inputs.duration || env.DURATION }} \
+              --hf-repo ggml-org/models	 \
+              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
+              --model-path-prefix /models \
+              --parallel ${{ env.N_USERS }} \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size	256 \
+              --ctx-size 16384 \
+              --n-prompts 1000 \
+              --max-prompt-tokens 1024 \
+              --max-tokens 2048
+
+          cat results.github.env >> $GITHUB_ENV
+
+          # Remove dataset as we do not want it in the artefact
+          rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          compression-level: 9
+          path: |
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log
+
+      - name: Commit status
+        uses: Sibz/github-status-action@v1
+        with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
+          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          description: |
+            ${{ env.BENCH_RESULTS }}
+          state: 'success'
+
+      - name: Upload benchmark images
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        continue-on-error: true # Important as it looks unstable: 503
+        id: imgur_step
+        with:
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+          path: |
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg
+
+      - name: Extract mermaid
+        id: set_mermaid
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Extract image url
+        id: extract_image_url
+        continue-on-error: true
+        run: |
+          set -eux
+
+          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
+      - name: Comment PR
+        uses: mshick/add-pr-comment@v2
+        id: comment_pr
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
+        with:
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          message: |
+            <p align="center">
+
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>
+
+            <details>
+
+            <summary>Expand details for performance related PR only</summary>
+
+            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
+
+            <details>
+                <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            </p>
+
+            <details>
+
+            <summary>Details</summary>
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+            <details>
+                <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
+
+            <details>
+                <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
+
+            </p>
+            </details>
+            </details>
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9144f9266..6841ba589 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,65 +10,133 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  contents: write # for creating release
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
 
 jobs:
-  ubuntu-focal-make:
-    runs-on: ubuntu-20.04
+  macOS-latest-cmake-arm64:
+    runs-on: macos-14
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-arm64
+          evict-old-files: 1d
 
       - name: Dependencies
         id: depends
+        continue-on-error: true
         run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8
-
-      - name: Build
-        id: make_build
-        env:
-            LLAMA_FATAL_WARNINGS: 1
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
-      - name: Test
-        id: make_test
-        run: |
-          CC=gcc-8 make tests -j $(nproc)
-          make test -j $(nproc)
-
-  ubuntu-latest-cmake:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
+          brew update
 
       - name: Build
         id: cmake_build
         run: |
-          mkdir build
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON
-          cmake --build . --config Release -j $(nproc)
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+          name: llama-bin-macos-arm64.zip
+
+  macOS-latest-cmake-x64:
+    runs-on: macos-13
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-x64
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
         id: cmake_test
@@ -76,6 +144,110 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+          name: llama-bin-macos-x64.zip
+
+  ubuntu-cpu-cmake:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-cpu-cmake
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+          name: llama-bin-ubuntu-x64.zip
+
   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
 
@@ -84,12 +256,59 @@ jobs:
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-latest-llguidance:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
 
       - name: Dependencies
         id: depends
@@ -102,8 +321,10 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_LLGUIDANCE=ON
+          cmake --build . --config Release -j $(nproc)
 
       - name: Test
         id: cmake_test
@@ -111,33 +332,34 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
-  ubuntu-latest-cmake-mpi:
+  ubuntu-latest-cmake-rpc:
     runs-on: ubuntu-latest
 
     continue-on-error: true
 
-    strategy:
-      matrix:
-        mpi_library: [mpich, libopenmpi-dev]
-
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-rpc
+          evict-old-files: 1d
 
       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+          sudo apt-get install build-essential
 
       - name: Build
         id: cmake_build
         run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_MPI=ON ..
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)
 
       - name: Test
         id: cmake_test
@@ -151,21 +373,101 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-vulkan
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_VULKAN=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          ctest -L main --verbose --timeout 1800
+
+  ubuntu-22-cmake-hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.0.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
 
       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential libvulkan-dev
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
 
-      - name: Build
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-hip
+          evict-old-files: 1d
+
+      - name: Build with native CMake HIP support
         id: cmake_build
         run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_VULKAN=ON ..
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Build with legacy HIP support
+        id: cmake_build_legacy_hip
+        run: |
+          cmake -B build2 -S . \
+            -DCMAKE_C_COMPILER=hipcc \
+            -DCMAKE_CXX_COMPILER=hipcc \
+            -DGGML_HIP=ON
+          cmake --build build2 --config Release -j $(nproc)
+
+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-musa
+          evict-old-files: 1d
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
 
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
@@ -173,7 +475,7 @@ jobs:
     continue-on-error: true
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: add oneAPI to apt
         shell: bash
@@ -197,16 +499,23 @@ jobs:
 
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-sycl
+          evict-old-files: 1d
 
       - name: Build
         id: cmake_build
         run: |
           source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
-          cmake --build . --config Release -j $(nproc)
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          cmake --build build --config Release -j $(nproc)
 
   ubuntu-22-cmake-sycl-fp16:
     runs-on: ubuntu-22.04
@@ -214,7 +523,7 @@ jobs:
     continue-on-error: true
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: add oneAPI to apt
         shell: bash
@@ -238,79 +547,24 @@ jobs:
 
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-cmake-sycl-fp16
+          evict-old-files: 1d
 
       - name: Build
         id: cmake_build
         run: |
           source /opt/intel/oneapi/setvars.sh
-          mkdir build
-          cd build
-          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
-  macOS-latest-make:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: make_build
-        env:
-            LLAMA_FATAL_WARNINGS: 1
-        run: |
-          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: make_test
-        run: |
-          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
-
-  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  #       would be great if we fix these
-  macOS-latest-cmake:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          cmake --build build --config Release -j $(nproc)
 
   macOS-latest-cmake-ios:
     runs-on: macos-latest
@@ -318,7 +572,13 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-ios
+          evict-old-files: 1d
 
       - name: Dependencies
         id: depends
@@ -330,15 +590,16 @@ jobs:
         id: cmake_build
         run: |
           sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
             -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
   macOS-latest-cmake-tvos:
     runs-on: macos-latest
@@ -346,7 +607,13 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-tvos
+          evict-old-files: 1d
 
       - name: Dependencies
         id: depends
@@ -358,15 +625,16 @@ jobs:
         id: cmake_build
         run: |
           sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
             -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
   macOS-latest-swift:
     runs-on: macos-latest
@@ -378,7 +646,13 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-swift
+          evict-old-files: 1d
 
       - name: Dependencies
         id: depends
@@ -386,83 +660,130 @@ jobs:
         run: |
           brew update
 
+      - name: Build llama.cpp with CMake
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          sudo cmake --install build --config Release
+
       - name: xcodebuild for swift package
         id: xcodebuild
         run: |
-          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+          xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
 
-      - name: Build Swift Example
-        id: make_build_swift_example
+  windows-msys2:
+    runs-on: windows-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
+          - { sys: CLANG64, env: clang-x86_64, build: Release }
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-msys2
+          variant: sccache
+          evict-old-files: 1d
+
+      - name: Setup ${{ matrix.sys }}
+        uses: msys2/setup-msys2@v2
+        with:
+          update: true
+          msystem: ${{matrix.sys}}
+          install: >-
+            base-devel
+            git
+            mingw-w64-${{matrix.env}}-toolchain
+            mingw-w64-${{matrix.env}}-cmake
+            mingw-w64-${{matrix.env}}-openblas
+
+      - name: Build using CMake
+        shell: msys2 {0}
         run: |
-            make swift
+            cmake -B build
+            cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
+      - name: Clean after building using CMake
+        shell: msys2 {0}
+        run: |
+            rm -rf build
+
+      - name: Build using CMake w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+            cmake --build build --config ${{ matrix.build }} -j $(nproc)
 
   windows-latest-cmake:
     runs-on: windows-latest
 
     env:
       OPENBLAS_VERSION: 0.3.23
-      OPENCL_VERSION: 2023.04.17
-      CLBLAST_VERSION: 1.6.0
       SDE_VERSION: 9.33.0-2024-01-07
       VULKAN_VERSION: 1.3.261.1
 
     strategy:
       matrix:
         include:
-          - build: 'noavx'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'clblast'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
-          - build: 'openblas'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'vulkan'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'noavx-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+          - build: 'avx2-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
+          - build: 'avx-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
+          - build: 'avx512-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
+          - build: 'openblas-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'kompute-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
+          - build: 'vulkan-x64'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
+          - build: 'llvm-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+          - build: 'msvc-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 1d
+
       - name: Clone Kompute submodule
         id: clone_kompute
-        if: ${{ matrix.build == 'kompute' }}
+        if: ${{ matrix.build == 'kompute-x64' }}
         run: |
-          git submodule update --init kompute
-
-      - name: Download OpenCL SDK
-        id: get_opencl
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
-          mkdir $env:RUNNER_TEMP/opencl
-          tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
-      - name: Download CLBlast
-        id: get_clblast
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
-          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
-          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
-          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
-            $txt = Get-Content -Path $f -Raw
-            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
-          }
+          git submodule update --init ggml/src/ggml-kompute/kompute
 
       - name: Download OpenBLAS
         id: get_openblas
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
           curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -475,38 +796,54 @@ jobs:
 
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
 
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
-          mkdir build
-          cd build
-          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add clblast.dll
-        id: add_clblast_dll
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
-          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
+          cmake -S . -B build ${{ matrix.defines }}
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
       - name: Add libopenblas.dll
         id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
         run: |
           cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
           cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
 
       - name: Check AVX512F support
         id: check_avx512f
-        if: ${{ matrix.build == 'avx512' }}
+        if: ${{ matrix.build == 'avx512-x64' }}
         continue-on-error: true
         run: |
           cd build
@@ -520,14 +857,14 @@ jobs:
       - name: Test
         id: cmake_test
         # not all machines have native AVX-512
-        if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900
 
       - name: Test (Intel SDE)
         id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
         run: |
           curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
           # for some weird reason windows tar doesn't like sde tar.xz
@@ -535,6 +872,7 @@ jobs:
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
           $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
           cd build
+          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
           & $sde -future -- ctest -L main -C Release --verbose --timeout 900
 
       - name: Determine tag name
@@ -555,44 +893,147 @@ jobs:
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
           Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+          name: llama-bin-win-${{ matrix.build }}.zip
 
-  windows-latest-cmake-cublas:
-    runs-on: windows-latest
+  ubuntu-latest-cmake-cuda:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v4
+          with:
+            fetch-depth: 0
+
+        - name: Install dependencies
+          env:
+            DEBIAN_FRONTEND: noninteractive
+          run: |
+              apt update
+              apt install -y cmake build-essential ninja-build libgomp1 git
+
+        - name: ccache
+          uses: hendrikmuhs/ccache-action@v1.2.16
+          with:
+            key: ubuntu-latest-cmake-cuda
+            evict-old-files: 1d
+
+        - name: Build with CMake
+          run: |
+            cmake -S . -B build -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_CUDA_ARCHITECTURES=89-real \
+              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+              -DLLAMA_FATAL_WARNINGS=ON \
+              -DGGML_NATIVE=OFF \
+              -DGGML_CUDA=ON
+            cmake --build build
+
+  windows-2019-cmake-cuda:
+    runs-on: windows-2019
 
     strategy:
       matrix:
-        cuda: ['12.2.0', '11.7.1']
-        build: ['cublas']
+        cuda: ['12.4', '11.7']
+        build: ['cuda']
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
-          fetch-depth: 0
+            fetch-depth: 0
 
-      - uses: Jimver/cuda-toolkit@v0.2.11
-        id: cuda-toolkit
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          cuda: ${{ matrix.cuda }}
-          method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+          key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
+          variant: sccache
+          evict-old-files: 1d
+
+      - name: Install Cuda Toolkit 11.7
+        if: ${{ matrix.cuda == '11.7' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install Cuda Toolkit 12.4
+        if: ${{ matrix.cuda == '12.4' }}
+        run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
 
       - name: Build
         id: cmake_build
+        shell: cmd
         run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DLLAMA_BUILD_SERVER=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CUDA=ON ^
+            -DGGML_RPC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
+          cmake --build build --config Release
 
       - name: Determine tag name
         id: tag
@@ -615,56 +1056,254 @@ jobs:
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 
       - name: Copy and pack Cuda runtime
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
         run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          echo "Cuda install location: ${{ env.CUDA_PATH }}"
           $dst='.\build\bin\cudart\'
-          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
           7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
 
       - name: Upload Cuda runtime
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          path: |
-            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
 
   windows-latest-cmake-sycl:
     runs-on: windows-latest
+
     defaults:
       run:
         shell: bash
 
     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-
-
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-sycl
+          variant: sccache
+          evict-old-files: 1d
+
       - name: Install
-        run:  scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
 
       - name: Build
         id: cmake_build
         run:  examples/sycl/win-build-sycl.bat
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Build the release package
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
+          echo "cp oneAPI running time dll files to ./build/bin done"
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          name: llama-bin-win-sycl-x64.zip
+
+  windows-latest-cmake-hip:
+    if: ${{ github.event.inputs.create_release != 'true' }}
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Install ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ${{ github.job }}
+          evict-old-files: 1d
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGGML_HIP=ON `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+  windows-latest-cmake-hip-release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        gpu_target: [gfx1100, gfx1101, gfx1030]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+            fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-release
+          evict-old-files: 1d
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
+            -DGGML_HIP=ON `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+
   ios-xcode-build:
     runs-on: macos-latest
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+          sudo cmake --install build --config Release
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
 
       - name: Build Xcode project
         run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -674,7 +1313,13 @@ jobs:
 
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: android-build
+          evict-old-files: 1d
 
       - name: Set up JDK
         uses: actions/setup-java@v3
@@ -693,43 +1338,32 @@ jobs:
 
           ./gradlew build --no-daemon
 
-#  freeBSD-latest:
-#    runs-on: macos-12
-#    steps:
-#    - name: Clone
-#      uses: actions/checkout@v3
-#
-#    - name: Build
-#      uses: cross-platform-actions/action@v0.19.0
-#      with:
-#        operating_system: freebsd
-#        version: '13.2'
-#        hypervisor: 'qemu'
-#        run: |
-#            sudo pkg update
-#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-#            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
-
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 
     runs-on: ubuntu-latest
 
     needs:
-      - ubuntu-focal-make
-      - ubuntu-latest-cmake
-      - macOS-latest-make
-      - macOS-latest-cmake
+      - ubuntu-cpu-cmake
       - windows-latest-cmake
-      - windows-latest-cmake-cublas
+      - windows-2019-cmake-cuda
+      - windows-latest-cmake-hip-release
+      - macOS-latest-cmake-arm64
+      - macOS-latest-cmake-x64
 
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: release
+          evict-old-files: 1d
+
       - name: Determine tag name
         id: tag
         shell: bash
@@ -745,11 +1379,17 @@ jobs:
 
       - name: Download artifacts
         id: download-artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
 
       - name: Create release
         id: create_release
-        uses: anzz1/action-create-release@v1
+        uses: ggml-org/action-create-release@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
@@ -764,7 +1404,7 @@ jobs:
             const path = require('path');
             const fs = require('fs');
             const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact')) {
+            for (let file of await fs.readdirSync('./artifact/release')) {
               if (path.extname(file) === '.zip') {
                 console.log('uploadReleaseAsset', file);
                 await github.repos.uploadReleaseAsset({
@@ -772,7 +1412,7 @@ jobs:
                   repo: context.repo.repo,
                   release_id: release_id,
                   name: file,
-                  data: await fs.readFileSync(`./artifact/${file}`)
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
                 });
               }
             }
@@ -786,7 +1426,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -810,7 +1450,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -834,7 +1474,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -864,7 +1504,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -880,7 +1520,7 @@ jobs:
 #          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
 #
 #      - name: Upload binaries
-#        uses: actions/upload-artifact@v1
+#        uses: actions/upload-artifact@v4
 #        with:
 #          name: llama-bin-${{ matrix.arch }}
 #          path: build/bin/${{ matrix.build }}
@@ -903,7 +1543,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -935,7 +1575,7 @@ jobs:
 #
 #      - name: Upload binaries
 #        if: matrix.blas == 'ON'
-#        uses: actions/upload-artifact@v1
+#        uses: actions/upload-artifact@v4
 #        with:
 #          name: llama-blas-bin-${{ matrix.arch }}
 #          path: build/bin/${{ matrix.build }}
@@ -949,7 +1589,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -969,3 +1609,37 @@ jobs:
 #          popd
 #          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
 #          make
+
+  openEuler-latest-cmake-cann:
+    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
+    defaults:
+      run:
+       shell: bash -el {0}
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        cann:
+          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+        device:
+          - 'ascend910b3'
+        build:
+          - 'Release'
+    container: ascendai/cann:${{ matrix.cann }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake
+
+      - name: Build
+        run: |
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=${{ matrix.device }}
+          cmake --build build -j $(nproc)
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
new file mode 100644
index 000000000..276a217d4
--- /dev/null
+++ b/.github/workflows/close-issue.yml
@@ -0,0 +1,28 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "42 0 * * *"
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  issues: write
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          operations-per-run: 10000
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
deleted file mode 100644
index 392db8a08..000000000
--- a/.github/workflows/code-coverage.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-
-jobs:
-  run:
-    runs-on: ubuntu-20.04
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 lcov
-
-      - name: Build
-        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
-      - name: Run tests
-        run: CC=gcc-8 make test
-
-      - name: Generate coverage report
-        run: |
-          make coverage
-          make lcov-report
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        env:
-           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        with:
-          files: lcov-report/coverage.info
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 94f9161fc..6955a7dc8 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -10,45 +10,50 @@
 name: Publish Docker image
 
 on:
-  pull_request:
-  push:
-    branches:
-      - master
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  packages: write
 
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
 
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     env:
       COMMIT_SHA: ${{ github.sha }}
     strategy:
+      fail-fast: false
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-          #                     have disabled them for now until the reason why
-          #                     is understood.
-          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+          # Multi-stage build
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
     steps:
       - name: Check out the repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number
 
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3
 
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
 
       - name: Log in to Docker Hub
         uses: docker/login-action@v2
@@ -57,9 +62,45 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+
+          # determine tag name postfix (build number, commit hash)
+          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
+          else
+            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
+          fi
+          # list all tags possible
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+              TYPE=""
+          else
+              TYPE="-${{ matrix.config.tag }}"
+          fi
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS"  # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
+          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
+        env:
+          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
       - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
+        if: ${{ matrix.config.free_disk_space == true }}
+        uses: ggml-org/free-disk-space@v1.3.1
         with:
           # this might remove tools that are actually needed,
           # if set to "true" but frees about 6 GB
@@ -74,34 +115,59 @@ jobs:
           docker-images: true
           swap-storage: true
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Build and push Docker image (versioned)
-        if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+      - name: Build and push Full Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+        uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.full_output_tags }}
           file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
 
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
+      - name: Build and push Light Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@v6
         with:
           context: .
-          push: ${{ github.event_name == 'push' }}
+          push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
           file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Server Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.server_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
index 0e0993cd4..f02b7c219 100644
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -14,10 +14,16 @@ on:
     branches:
       - master
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
   editorconfig:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - uses: actions/checkout@v4
+      - uses: editorconfig-checker/action-editorconfig-checker@v2
+        with:
+          version: v3.0.3
       - run: editorconfig-checker
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
index 57db17512..3ca4d3058 100644
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -24,9 +24,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: '3.9.x'
     - name: Install dependencies
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 000000000..368dbdbe5
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,17 @@
+name: "Pull Request Labeler"
+on:
+- pull_request_target
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        repository: "ggerganov/llama.cpp"
+    - uses: actions/labeler@v5
+      with:
+        configuration-path: '.github/labeler.yml'
diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
deleted file mode 100644
index 8d0a3fd7f..000000000
--- a/.github/workflows/nix-ci-aarch64.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-jobs:
-  nix-build-aarch64:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
deleted file mode 100644
index 01c5a9d5a..000000000
--- a/.github/workflows/nix-ci.yml
+++ /dev/null
@@ -1,68 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml
deleted file mode 100644
index 3a6a96e26..000000000
--- a/.github/workflows/nix-flake-update.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml
deleted file mode 100644
index 2c3c1ebda..000000000
--- a/.github/workflows/nix-publish-flake.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index 92e1108b3..46e80aecd 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -3,16 +3,20 @@ name: Python check requirements.txt
 on:
   push:
     paths:
+      - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
   pull_request:
     paths:
+      - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
 
 jobs:
   python-check-requirements:
@@ -20,10 +24,10 @@ jobs:
     name: check-requirements
     steps:
       - name: Check out source repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Set up Python environment
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.11"
       - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh nocleanup
+        run:  bash scripts/check-requirements.sh
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index ea0a05ea1..ddfdf73b8 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -1,6 +1,17 @@
 name: flake8 Lint
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
 
 jobs:
   flake8-lint:
@@ -8,13 +19,12 @@ jobs:
     name: Lint
     steps:
       - name: Check out source repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Set up Python environment
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.11"
       - name: flake8 Lint
         uses: py-actions/flake8@v2
         with:
-            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-            exclude: "examples/*,examples/*/**,*/**/__init__.py"
+            plugins: "flake8-no-print"
diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
new file mode 100644
index 000000000..373bb6010
--- /dev/null
+++ b/.github/workflows/python-type-check.yml
@@ -0,0 +1,40 @@
+name: Python Type-Check
+
+on:
+  push:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
+      - '**.py'
+      - '**/requirements*.txt'
+  pull_request:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
+      - '**.py'
+      - '**/requirements*.txt'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  python-type-check:
+    runs-on: ubuntu-latest
+    name: pyright type-check
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Python dependencies
+        # TODO: use a venv
+        run: pip install -r requirements/requirements-all.txt
+      - name: Type-check with Pyright
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: 1.1.382
+          level: warning
+          warnings: true
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 0b6f6669b..3a29107d0 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -3,13 +3,32 @@ name: Server
 
 on:
   workflow_dispatch: # allows manual triggering
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
   push:
     branches:
       - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+
+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
 
 jobs:
   server:
@@ -17,67 +36,204 @@ jobs:
 
     strategy:
       matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        build_type: [RelWithDebInfo]
         include:
           - build_type: Release
             sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
-            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
-
-    container:
-      image: ubuntu:latest
-      ports:
-        - 8888
-      options: --cpus 4
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
     steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
       - name: Dependencies
         id: depends
         run: |
-          apt-get update
-          apt-get -y install \
+          sudo apt-get update
+          sudo apt-get -y install \
             build-essential \
+            xxd \
             git \
             cmake \
-            python3-pip \
+            curl \
             wget \
-            psmisc
+            language-pack-en \
+            libcurl4-openssl-dev
 
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. \
-              -DLLAMA_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
 
       - name: Tests dependencies
         id: test_dependencies
         run: |
           pip install -r examples/server/tests/requirements.txt
 
-      - name: Download models
-        id: download_models
+      # Setup nodejs (to be used for verifying bundled index.html)
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '22.11.0'
+
+      - name: WebUI - Install dependencies
+        id: webui_lint
         run: |
-          cd examples/server/tests
-          ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
+          cd examples/server/webui
+          npm ci
+
+      - name: WebUI - Check code format
+        id: webui_format
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd examples/server/webui
+          git status
+
+          npm run format
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Files do not follow coding style. To fix: npm run format"
+            echo "${modified_files}"
+            exit 1
+          fi
+
+      - name: Verify bundled index.html
+        id: verify_server_index_html
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd examples/server/webui
+          git status
+
+          npm run build
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Repository is dirty or server/webui is not built as expected"
+            echo "Hint: You may need to follow Web UI build guide in server/README.md"
+            echo "${modified_files}"
+            exit 1
+          fi
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+              -DGGML_OPENMP=OFF ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
 
       - name: Tests
-        id: server_integration_test
+        id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
         run: |
           cd examples/server/tests
-          PORT=8888 ./tests.sh
+          ./tests.sh
+
+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd examples/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        run: |
+          cd examples/server/tests
+          SLOW_TESTS=1 ./tests.sh
+
+
+  server-windows:
+    runs-on: windows-2019
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: libCURL
+        id: get_libcurl
+        env:
+          CURL_VERSION: 8.6.0_6
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+          mkdir $env:RUNNER_TEMP/libcurl
+          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
+      - name: Copy Libcurl
+        id: prepare_libcurl
+        run: |
+          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
+        run: |
+          cd examples/server/tests
+          $env:PYTHONIOENCODING = ":replace"
+          pytest -v -x -m "not slow"
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
+        run: |
+          cd examples/server/tests
+          $env:SLOW_TESTS = "1"
+          pytest -v -x
diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml
deleted file mode 100644
index 03652760c..000000000
--- a/.github/workflows/tidy-post.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: clang-tidy review post comments
-
-on:
-  workflow_dispatch:
-    workflows: ["clang-tidy-review"]
-    types:
-      - completed
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: ZedThree/clang-tidy-review/post@v0.13.0
-        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
-        with:
-          # adjust options as necessary
-          lgtm_comment_body: ''
-          annotations: false
-          max_comments: 25
diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml
deleted file mode 100644
index a4bc8d976..000000000
--- a/.github/workflows/tidy-review.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: clang-tidy-review
-
-on:
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  clang-tidy-review:
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v3
-
-    - uses: ZedThree/clang-tidy-review@v0.13.0
-      id: review
-      with:
-        lgtm_comment_body: ''
-        build_dir: build
-        cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
-        split_workflow: true
-
-    - uses: ZedThree/clang-tidy-review/upload@v0.13.0
diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml
deleted file mode 100644
index 68a698ab9..000000000
--- a/.github/workflows/zig-build.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: Zig CI
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-
-jobs:
-  build:
-    strategy:
-      fail-fast: false
-      matrix:
-        runs-on: [ubuntu-latest, macos-latest, windows-latest]
-    runs-on: ${{ matrix.runs-on }}
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          submodules: recursive
-          fetch-depth: 0
-      - uses: goto-bus-stop/setup-zig@v2
-        with:
-          version: 0.11.0
-      - name: Build Summary
-        run: zig build --summary all -freference-trace
diff --git a/.gitignore b/.gitignore
index 62b6b8b1a..694f36e04 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,94 +1,145 @@
-*.o
+# Extensions
+
 *.a
-*.so
-*.gguf
-*.bin
-*.exe
-*.dll
-*.log
-*.gcov
-*.gcno
-*.gcda
-*.dot
 *.bat
+*.bin
+*.d
+*.dll
+*.dot
+*.etag
+*.exe
+*.gcda
+*.gcno
+*.gcov
+*.gguf
+*.gguf.json
+*.lastModified
+*.log
 *.metallib
-.DS_Store
-.build/
+*.o
+*.so
+*.swp
+*.tmp
+
+# IDE / OS
+
 .cache/
 .ccls-cache/
 .direnv/
+.DS_Store
 .envrc
+.idea/
 .swiftpm
-.venv
-.clang-tidy
 .vs/
 .vscode/
-.idea/
+nppBackup
+
+
+# Coverage
 
-lcov-report/
 gcovr-report/
+lcov-report/
 
+# Build Artifacts
+
+tags
+.build/
 build*
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+!docs/build.md
+/libllama.so
+/llama-*
+/vulkan-shaders-gen
+android-ndk-*
+arm_neon.h
 cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
+/rpc-server
 out/
 tmp/
+autogen-*.md
+
+# Deprecated
+
+/main
+/server
+
+# CI
+
+!.github/workflows/*.yml
+
+# Models
 
 models/*
 models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
 
-/Pipfile
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
-/embedding
-/gguf
-/gguf-llama-simple
-/imatrix
-/infill
-/libllama.so
-/llama-bench
-/llava-cli
-/lookahead
-/lookup
-/main
-/metal
-/passkey
-/perplexity
-/q8dot
-/quantize
-/quantize-stats
-/result
-/save-load-state
-/server
-/simple
-/batched
-/batched-bench
-/export-lora
-/finetune
-/speculative
-/parallel
-/train-text-from-scratch
-/tokenize
-/vdot
-/common/build-info.cpp
-arm_neon.h
-compile_commands.json
-CMakeSettings.json
-
-__pycache__
-dist
-
+# Zig
 zig-out/
 zig-cache/
 
+# Logs
+
 ppl-*.txt
 qnt-*.txt
 perf-*.txt
 
-examples/jeopardy/results.txt
+# Examples
 
-poetry.lock
+examples/jeopardy/results.txt
+examples/server/*.css.hpp
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh
+
+# Server Web UI temporary files
+node_modules
+examples/server/webui/dist
+
+# Python
+
+/.venv
+__pycache__/
+*/poetry.lock
 poetry.toml
-nppBackup
+
+# Nix
+/result
+
+# Test binaries
+/tests/test-backend-ops
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-rope
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-bpe
+/tests/test-tokenizer-1-spm
+
+# Scripts
+!/scripts/install-oneapi.bat
+
+# Test models for lora adapters
+/lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh
diff --git a/.gitmodules b/.gitmodules
index b7e8b8ff2..23ce5ff05 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = kompute
+	path = ggml/src/ggml-kompute/kompute
 	url = https://github.com/nomic-ai/kompute.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 65796fe2e..91d791628 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,13 +3,14 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
+  rev: v4.6.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
   - id: check-yaml
   - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 6.0.0
+  rev: 7.0.0
   hooks:
   -   id: flake8
+      additional_dependencies: [flake8-no-print]
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 000000000..6796b2941
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,1047 @@
+# date: Tue Feb  4 13:04:05 EET 2025
+# this file is auto-generated by scripts/gen-authors.sh
+
+0cc4m <picard12@live.de>
+0xspringtime <110655352+0xspringtime@users.noreply.github.com>
+20kdc <asdd2808@gmail.com>
+2f38b454 <dxf@protonmail.com>
+3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
+44670 <44670@users.noreply.github.com>
+65a <10104049+65a@users.noreply.github.com>
+AN Long <aisk@users.noreply.github.com>
+AT <manyoso@users.noreply.github.com>
+Aarni Koskela <akx@iki.fi>
+Aaron Miller <apage43@ninjawhale.com>
+Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Abheek Gulati <abheekg@hotmail.com>
+Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
+Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
+Adithya Balaji <adithya.b94@gmail.com>
+AdithyanI <adithyan.i4internet@gmail.com>
+Adrian <smith.adriane@gmail.com>
+Adrian Hesketh <a-h@users.noreply.github.com>
+Adrien Gallouët <adrien@gallouet.fr>
+Adrien Gallouët <angt@huggingface.co>
+Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
+Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
+AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+AidanBeltonS <aidan.belton@codeplay.com>
+Aisuko <urakiny@gmail.com>
+Akarshan Biswas <akarshan.biswas@gmail.com>
+Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+Al Mochkin <14274697+amochkin@users.noreply.github.com>
+Albert Jin <albert.jin@gmail.com>
+Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
+Alberto Cabrera Pérez <alberto.cabrera@intel.com>
+Alex <awhill19@icloud.com>
+Alex Azarov <alex@azarov.by>
+Alex Azarov <alexander.azarov@mapbox.com>
+Alex Klinkhamer <from.github.com.917@grencez.dev>
+Alex Klinkhamer <git@grencez.dev>
+Alex Nguyen <tiendung@users.noreply.github.com>
+Alex O'Connell <35843486+acon96@users.noreply.github.com>
+Alex Petenchea <alex.petenchea@gmail.com>
+Alex Renda <alexrenda@users.noreply.github.com>
+Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
+Alex von Gluck IV <kallisti5@unixzen.com>
+Alexey Parfenov <zxed@alkatrazstudio.net>
+Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
+Ali Nehzat <ali.nehzat@thanks.dev>
+Ali Tariq <ali.tariq@10xengineers.ai>
+Alon <alonfaraj@gmail.com>
+AlpinDale <52078762+AlpinDale@users.noreply.github.com>
+Amir <amir_zia@outlook.com>
+AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
+Ananta Bastola <anantarajbastola@gmail.com>
+Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
+András Salamon <ott2@users.noreply.github.com>
+Andreas (Andi) Kunar <andreask@msn.com>
+Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
+Andrei <abetlen@gmail.com>
+Andrew Canis <andrew.canis@gmail.com>
+Andrew Downing <andrew2085@gmail.com>
+Andrew Duffy <a10y@users.noreply.github.com>
+Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
+Andy Salerno <andysalerno@gmail.com>
+Andy Tai <andy-tai@users.noreply.github.com>
+Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+Antonis Makropoulos <benuix@gmail.com>
+Arik Poznanski <arikpoz@users.noreply.github.com>
+Armen Kaleshian <kriation@users.noreply.github.com>
+Artem <guinmoon@gmail.com>
+Artem Zinnatullin <ceo@abstractny.gay>
+Artyom Lebedev <vagran.ast@gmail.com>
+Asbjørn Olling <asbjornolling@gmail.com>
+Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
+Ashish <1856117+ashishdatta@users.noreply.github.com>
+Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
+Ashraful Islam <ashraful.meche@gmail.com>
+Atsushi Tatsuma <yoshoku@outlook.com>
+Austin <77757836+teleprint-me@users.noreply.github.com>
+AustinMroz <austinmroz@utexas.edu>
+BADR <contact@pythops.com>
+Bach Le <bach@bullno1.com>
+Bailey Chittle <39804642+bachittle@users.noreply.github.com>
+BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
+Bartowski <ckealty1182@gmail.com>
+Behnam M <58621210+ibehnam@users.noreply.github.com>
+Ben Ashbaugh <ben.ashbaugh@intel.com>
+Ben Garney <bengarney@users.noreply.github.com>
+Ben Siraphob <bensiraphob@gmail.com>
+Ben Williams <ben@719ben.com>
+Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
+Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
+Benson Wong <mostlygeek@gmail.com>
+Bernat Vadell <hounter.caza@gmail.com>
+Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
+Bert Wagner <github@bertwagner.com>
+Billel Mokeddem <billel.mokeddem.ml@gmail.com>
+Bingan <70050083+binganao@users.noreply.github.com>
+Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
+Bodo Graumann <mail@bodograumann.de>
+Bono Lv <lvscar@users.noreply.github.com>
+Borislav Stanimirov <b.stanimirov@abv.bg>
+Borislav Stanimirov <b@ibob.bg>
+Branden Butler <bwtbutler@hotmail.com>
+Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
+Brian <mofosyne@gmail.com>
+Brian Cunnie <brian.cunnie@gmail.com>
+Bruce MacDonald <brucewmacdonald@gmail.com>
+Bryan Honof <bryanhonof@gmail.com>
+CJ Pais <cj@cjpais.com>
+CRD716 <crd716@gmail.com>
+Calvin Laurenson <calvin@laurenson.dev>
+Cameron <csteele@steelecameron.com>
+Cameron Kaiser <classilla@users.noreply.github.com>
+Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
+CarryFun <76023481+CarryFun@users.noreply.github.com>
+Carsten Kragelund Jørgensen <carsten@kragelund.me>
+CarterLi999 <664681047@qq.com>
+Casey Primozic <casey@cprimozic.net>
+Casey Primozic <me@ameo.link>
+CausalLM <148736309+CausalLM@users.noreply.github.com>
+Cebtenzzre <cebtenzzre@gmail.com>
+CentricStorm <CentricStorm@users.noreply.github.com>
+Chad Brewbaker <crb002@gmail.com>
+Changyeon Kim <cyzero.kim@samsung.com>
+Chao Jiang <jc19chaoj@zoho.com>
+Charles Xu <63788048+chaxu01@users.noreply.github.com>
+Charles Xu <charles.xu@arm.com>
+Chen Xi <xi2.chen@intel.com>
+Chen Xi <xixichen08@foxmail.com>
+Cheng Shao <terrorjack@type.dance>
+Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
+Chris Elrod <elrodc@gmail.com>
+Chris Kuehl <ckuehl@ckuehl.me>
+Christian Demsar <christian@github.email.demsar.us>
+Christian Demsar <crasm@git.vczf.us>
+Christian Falch <875252+chrfalch@users.noreply.github.com>
+Christian Kastner <ckk@kvr.at>
+Christian Kögler <ck3d@gmx.de>
+Christian Köhnenkamp <cvk5@me.com>
+Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
+Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
+Clark Saben <76020733+csaben@users.noreply.github.com>
+Clint Herron <hanclinto@gmail.com>
+Conrad Kramer <conrad@conradkramer.com>
+Corentin REGAL <corentin.regal@gmail.com>
+CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
+Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
+Cuong Trinh Manh <nguoithichkhampha@gmail.com>
+DAN™ <dranger003@gmail.com>
+Damian Stewart <d@damianstewart.com>
+Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
+Dan Johansson <dan.johansson@arm.com>
+Dane Madsen <dane_madsen@hotmail.com>
+DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
+Daniel Bevenius <daniel.bevenius@gmail.com>
+Daniel Drake <drake@endlessos.org>
+Daniel Hiltgen <dhiltgen@users.noreply.github.com>
+Daniel Illescas Romero <illescas.daniel@protonmail.com>
+Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
+Daniele <57776841+daniandtheweb@users.noreply.github.com>
+DannyDaemonic <DannyDaemonic@gmail.com>
+Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
+Dave <dave-fl@users.noreply.github.com>
+Dave Airlie <airlied@gmail.com>
+Dave Airlie <airlied@redhat.com>
+Dave Della Costa <ddellacosta+github@gmail.com>
+David Friehs <david@friehs.info>
+David Kennedy <dakennedyd@gmail.com>
+David Pflug <david@pflug.email>
+David Renshaw <dwrenshaw@gmail.com>
+David Sommers <12738+databyte@users.noreply.github.com>
+David Yang <davidyang6us@gmail.com>
+DavidKorczynski <david@adalogics.com>
+Dawid Potocki <github@dawidpotocki.com>
+Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
+Dean <Dean.Sinaean@gmail.com>
+Deins <deinsegle@gmail.com>
+Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
+Derrick T. Woolworth <dwoolworth@gmail.com>
+Deven Mistry <31466137+deven367@users.noreply.github.com>
+Dibakar Gope <dibakar.gope@arm.com>
+Didzis Gosko <didzis@users.noreply.github.com>
+Diego Devesa <slarengh@gmail.com>
+Diogo Teles Sant'Anna <diogoteles@google.com>
+Djip007 <3705339+Djip007@users.noreply.github.com>
+Djip007 <djip.perois@free.fr>
+Don Mahurin <dmahurin@users.noreply.github.com>
+DooWoong Lee (David) <manics99@naver.com>
+Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Dou Xinpeng <15529241576@163.com>
+Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
+Douglas Hanley <thesecretaryofwar@gmail.com>
+Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
+Ebey Abraham <ebey97@gmail.com>
+Echo Nolan <echo@echonolan.net>
+Ed Lee <edilee@mozilla.com>
+Ed Lepedus <ed.lepedus@googlemail.com>
+Eddie-Wang <wangjinheng1120@163.com>
+Edward Taylor <edeetee@gmail.com>
+Elaine <elaine.zosa@gmail.com>
+Elbios <141279586+Elbios@users.noreply.github.com>
+Elton Kola <eltonkola@gmail.com>
+Emreerdog <34742675+Emreerdog@users.noreply.github.com>
+Engininja2 <139037756+Engininja2@users.noreply.github.com>
+Equim <sayaka@ekyu.moe>
+Eric Curtin <ecurtin@redhat.com>
+Eric Curtin <ericcurtin17@gmail.com>
+Eric Sommerlade <es0m@users.noreply.github.com>
+Eric Zhang <34133756+EZForever@users.noreply.github.com>
+Erik Garrison <erik.garrison@gmail.com>
+Erik Scholz <Green-Sky@users.noreply.github.com>
+Esko Toivonen <eskot98@gmail.com>
+Ettore Di Giacinto <mudler@users.noreply.github.com>
+Evan Jones <evan.q.jones@gmail.com>
+Evan Miller <emmiller@gmail.com>
+Eve <139727413+netrunnereve@users.noreply.github.com>
+Evgeny Kurnevsky <kurnevsky@gmail.com>
+Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
+ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
+FK <sozforex@gmail.com>
+Fabian <cmdrf@users.noreply.github.com>
+Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
+Faez Shakil <faez.shakil@gmail.com>
+Faisal Zaghloul <faisal.zaghloul@gmail.com>
+Faisal Zaghloul <quic_fzaghlou@quicinc.com>
+Fan Shupei <dymarkfan@outlook.com>
+FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
+Fattire <528174+fat-tire@users.noreply.github.com>
+Felix <stenbackfelix@gmail.com>
+Finn Voorhees <finnvoorhees@gmail.com>
+Firat <firatkiral@gmail.com>
+FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
+Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
+Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
+Francisco Melo <43780565+francis2tm@users.noreply.github.com>
+Frank Mai <thxcode0824@gmail.com>
+FrankHB <frankhb1989@gmail.com>
+Frankie Robertson <frankier@users.noreply.github.com>
+Fred Douglas <43351173+fredlas@users.noreply.github.com>
+Frederik Vogel <Schaltfehler@users.noreply.github.com>
+Gabe Goodhart <gabe.l.hart@gmail.com>
+Gabe Goodhart <ghart@us.ibm.com>
+Gaetan Bisson <gaetan@fenua.org>
+GainLee <perfecter.gen@gmail.com>
+Galunid <karolek1231456@gmail.com>
+Gary Linscott <glinscott@gmail.com>
+Gary Mulder <gjmulder@gmail.com>
+Gavin Zhao <gavinzhaojw@protonmail.com>
+Genkagaku.GPT <hlhr202@163.com>
+Georgi Gerganov <ggerganov@gmail.com>
+Gilad S <giladgd@users.noreply.github.com>
+Gilad S. <7817232+giladgd@users.noreply.github.com>
+Giuseppe Scrivano <giuseppe@scrivano.org>
+GiviMAD <GiviMAD@users.noreply.github.com>
+Govlzkoy <gotope@users.noreply.github.com>
+Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
+Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
+Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
+Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
+Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
+Haggai Nuchi <h.nuchi@gmail.com>
+Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
+Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
+HanishKVC <hanishkvc@gmail.com>
+Haohui Mai <ricetons@gmail.com>
+Haoxiang Fei <tonyfettes@tonyfettes.com>
+Harald Fernengel <harald.fernengel@here.com>
+Hatsune Miku <129688334+at8u@users.noreply.github.com>
+HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
+Haus1 <haus.xda@gmail.com>
+Henk Poley <HenkPoley@gmail.com>
+Henri Vasserman <henv@hot.ee>
+Henrik Forstén <henrik.forsten@gmail.com>
+Herman Semenov <GermanAizek@yandex.ru>
+Hesen Peng <hesen.peng@gmail.com>
+HimariO <dsfhe49854@gmail.com>
+Hoang Nguyen <hugo53@users.noreply.github.com>
+Hong Bo PENG <penghb@cn.ibm.com>
+Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
+Howard Su <howard0su@gmail.com>
+Hua Jiang <allenhjiang@outlook.com>
+Huang Qi <huangqi3@xiaomi.com>
+Huawei Lin <huaweilin.cs@gmail.com>
+Hugo Roussel <hugo.rous@gmail.com>
+Huifeng Ou <79071290+ho2103@users.noreply.github.com>
+Ian Bull <irbull@eclipsesource.com>
+Ian Bull <irbull@gmail.com>
+Ian Scrivener <github@zilogy.asia>
+Icecream95 <the.real.icecream95@gmail.com>
+Ido S <ido.pluto@gmail.com>
+IgnacioFDM <ignaciofdm@gmail.com>
+Igor Okulist <okigan@gmail.com>
+Ihar Hrachyshka <ihrachys@redhat.com>
+Ikko Eltociear Ashimine <eltociear@gmail.com>
+Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
+Ionoclast Laboratories <brigham@ionoclast.com>
+Isaac McFadyen <isaac@imcf.me>
+IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ivan <nekotekina@gmail.com>
+Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
+Ivan Komarov <Ivan.Komarov@dfyz.info>
+Ivan Stepanov <ivanstepanovftw@gmail.com>
+JFLFY2255 <JFLFY2255@163.com>
+JH23X <165871467+JH23X@users.noreply.github.com>
+Jack Mousseau <jack@software.inc>
+Jack Mousseau <jmousseau@users.noreply.github.com>
+JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jaeden Amero <jaeden@patater.com>
+Jaemin Son <woalsdnd@gmail.com>
+Jafar Uruç <jafar.uruc@gmail.com>
+Jag Chadha <jagtesh@gmail.com>
+Jakub N <jakubniemczyk97@gmail.com>
+James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
+James Reynolds <magnusviri@users.noreply.github.com>
+Jan Boon <jan.boon@kaetemi.be>
+Jan Boon <kaetemi@gmail.com>
+Jan Ploski <jpl@plosquare.com>
+Jannis Schönleber <joennlae@gmail.com>
+Jared Van Bortel <cebtenzzre@gmail.com>
+Jared Van Bortel <jared@nomic.ai>
+Jason McCartney <jmac@theroot.org>
+Jason Stillerman <jason.t.stillerman@gmail.com>
+Jean-Christophe Hoelt <hoelt@fovea.cc>
+Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
+Jed Fox <git@jedfox.com>
+Jeff Bolz <jbolz@nvidia.com>
+Jeffrey Morgan <jmorganca@gmail.com>
+Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jeroen Mostert <jeroen.mostert@cm.com>
+Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jett Janiak <jettjaniak@gmail.com>
+Jeximo <jeximo@gmail.com>
+Jhen-Jie Hong <iainst0409@gmail.com>
+Jiahao Li <liplus17@163.com>
+Jian Liao <jianliao@users.noreply.github.com>
+JidongZhang-THU <1119708529@qq.com>
+Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
+Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
+Jiří Sejkora <Sejseloid@gmail.com>
+Joan Fontanals <jfontanalsmartinez@gmail.com>
+Joan Fontanals <joan.fontanals.martinez@jina.ai>
+João Dinis Ferreira <hello@joaof.eu>
+Joe Eli McIlvain <joe.eli.mac@gmail.com>
+Joe Todd <joe.todd@codeplay.com>
+Johan <JohanAR@users.noreply.github.com>
+Johannes Gäßler <johannesg@5d6.de>
+Johannes Rudolph <johannes.rudolph@gmail.com>
+John <78893154+cmp-nct@users.noreply.github.com>
+John Balis <phobossystems@gmail.com>
+John Smith <67539080+kingsidelee@users.noreply.github.com>
+JohnnyB <jboero@users.noreply.github.com>
+Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
+Jorge A <161275481+jorgealias@users.noreply.github.com>
+Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
+Joseph Stahl <1269177+josephst@users.noreply.github.com>
+Josh Ramer <josh.ramer@icloud.com>
+Joyce <joycebrum@google.com>
+Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
+Judd <foldl@users.noreply.github.com>
+Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
+Julius Arkenberg <arki05@users.noreply.github.com>
+Jun Hee Yoo <contact.jhyoo@gmail.com>
+Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Junil Kim <logyourself@gmail.com>
+Junyang Lin <justinlin930319@hotmail.com>
+Juraj Bednar <juraj@bednar.io>
+Justin Parker <jparkerweb@gmail.com>
+Justin Suess <justin.suess@westpoint.edu>
+Justina Cho <justcho5@gmail.com>
+Justine Tunney <jtunney@gmail.com>
+Justine Tunney <jtunney@mozilla.com>
+Juuso Alasuutari <juuso.alasuutari@gmail.com>
+KASR <karim.asrih@gmail.com>
+Kamil Tomšík <info@tomsik.cz>
+Karol Kontny <82021046+kkontny@users.noreply.github.com>
+Karsten Weiss <knweiss@gmail.com>
+Karthick <j.karthic2004@gmail.com>
+Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
+Karthik Sethuraman <k.seth1993@gmail.com>
+Kasumi <90275229+kasumi-1@users.noreply.github.com>
+Kawrakow <48489457+ikawrakow@users.noreply.github.com>
+Keiichi Tabata <keiichi.tabata@outlook.com>
+Keke Han <hankeke303@163.com>
+Kenvix ⭐ <kenvixzure@live.com>
+Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
+Kevin Gibbons <bakkot@gmail.com>
+Kevin Ji <1146876+kevinji@users.noreply.github.com>
+Kevin Kwok <antimatter15@gmail.com>
+Kevin Lo <kevlo@kevlo.org>
+Kevin Wang <kevmo314@gmail.com>
+Kolen Cheung <ickc@users.noreply.github.com>
+Konstantin Herud <konstantin.herud@denkbares.com>
+Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
+Kunshang Ji <kunshang.ji@intel.com>
+Kyle Bruene <KyleBruene@users.noreply.github.com>
+Kyle Liang <liangmanlai@gmail.com>
+Kyle Mistele <kyle@mistele.com>
+Kylin <56434533+KyL0N@users.noreply.github.com>
+Lars Grammel <lars.grammel@gmail.com>
+Laura <Tijntje_7@msn.com>
+Lee <44310445+lx200916@users.noreply.github.com>
+Lee Drake <b.lee.drake@gmail.com>
+Leng Yue <lengyue@lengyue.me>
+Leon Knauer <git@leonknauer.com>
+LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
+Leonardo Neumann <leonardo@neumann.dev.br>
+Li Tan <tanliboy@gmail.com>
+Linwei Wang <wanix1988@gmail.com>
+Liu Jia <109258120+Septa2112@users.noreply.github.com>
+Liu Jia <jia3.liu@intel.com>
+LoganDark <github@logandark.mozmail.com>
+Loïc Carrère <loic.carrere@gmail.com>
+LostRuins <39025047+LostRuins@users.noreply.github.com>
+LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
+Luciano <lucianostrika44@gmail.com>
+Luo Tian <lt@basecity.com>
+Lyle Dean <dean@lyle.dev>
+M-A <maruel@gmail.com>
+M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+Ma Mingfei <mingfei.ma@intel.com>
+Maarten ter Huurne <maarten@treewalker.org>
+Mack Straight <eiz@users.noreply.github.com>
+Maël Kerbiriou <m431.kerbiriou@gmail.com>
+MaggotHATE <clay1326@gmail.com>
+Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
+Manuel <44313466+makuche@users.noreply.github.com>
+Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
+Marco Matthies <71844+marcom@users.noreply.github.com>
+Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
+Marian Cepok <marian.cepok@gmail.com>
+Mark Fairbairn <thebaron88@gmail.com>
+Mark Zhuang <zhuangqiubin@gmail.com>
+Marko Tasic <mtasic85@gmail.com>
+Markus Tavenrath <mtavenrath@users.noreply.github.com>
+Martin Delille <martin@delille.org>
+Martin Krasser <krasserm@googlemail.com>
+Martin Schwaighofer <mschwaig@users.noreply.github.com>
+Marvin Gießing <marvin.giessing@gmail.com>
+Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
+MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
+Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
+Matheus C. França <matheus-catarino@hotmail.com>
+Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Baudier <mbaudier@argeo.org>
+Mathieu Geli <mathieu.geli@gmail.com>
+Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
+Mathijs Henquet <mathijs.henquet@gmail.com>
+Mathijs de Bruin <mathijs@mathijsfietst.nl>
+Matt Clayton <156335168+mattjcly@users.noreply.github.com>
+Matt Pulver <matt.pulver@heavy.ai>
+Matt Stephenson <mstephenson6@users.noreply.github.com>
+Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Matteo Mortari <matteo.mortari@gmail.com>
+Mattheus Chediak <shammcity00@gmail.com>
+Matthew Tejo <matthew.tejo@gmail.com>
+Matvey Soloviev <blackhole89@gmail.com>
+Max Krasnyansky <max.krasnyansky@gmail.com>
+Max Krasnyansky <quic_maxk@quicinc.com>
+Maxime <672982+maximegmd@users.noreply.github.com>
+Maximilian Winter <maximilian.winter.91@gmail.com>
+Meng Zhang <meng@tabbyml.com>
+Meng, Hengyu <hengyu.meng@intel.com>
+Mengqing Cao <cmq0113@163.com>
+Merrick Christensen <merrick.christensen@gmail.com>
+Michael Coppola <m18coppola@gmail.com>
+Michael Engel <mengel@redhat.com>
+Michael Francis <edude03@gmail.com>
+Michael Hueschen <m@mhueschen.dev>
+Michael Kesper <mkesper@schokokeks.org>
+Michael Klimenko <mklimenko29@gmail.com>
+Michael Podvitskiy <podvitskiymichael@gmail.com>
+Michael Potter <NanoTekGuy@Gmail.com>
+Michael de Gans <michael.john.degans@gmail.com>
+Michaël de Vries <vriesdemichael@gmail.com>
+Michał Moskal <michal@moskal.me>
+Michał Tuszyński <srgtuszy@gmail.com>
+Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com>
+Mihai <mihai.chirculescu@yahoo.com>
+Mike <ytianhui2004@gmail.com>
+Mikko Juola <mikjuo@gmail.com>
+Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
+Minsoo Cheong <icycle0409@snu.ac.kr>
+Mirko185 <mirkosig@gmail.com>
+Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
+MistApproach <98988043+MistApproach@users.noreply.github.com>
+Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
+Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
+Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
+Molly Sophia <mollysophia379@gmail.com>
+MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
+Murilo Santana <mvrilo@gmail.com>
+Musab Gultekin <musabgultekin@users.noreply.github.com>
+Nam D. Tran <42194884+namtranase@users.noreply.github.com>
+Nathan Epstein <nate2@umbc.edu>
+Natsu <chino@hotococoa.moe>
+NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
+Nebula <infinitewormhole@gmail.com>
+Neo Zhang <14088817+arthw@users.noreply.github.com>
+Neo Zhang <zhang.jianyu@outlook.com>
+Neo Zhang Jianyu <jianyu.zhang@intel.com>
+Neuman Vong <neuman.vong@gmail.com>
+NeverLucky <92274250+nvrxq@users.noreply.github.com>
+Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
+Nexesenex <124105151+Nexesenex@users.noreply.github.com>
+Niall Coates <1349685+Niall-@users.noreply.github.com>
+Nicholai Tukanov <nicholaitukanov@gmail.com>
+Nico Bosshard <nico@bosshome.ch>
+Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
+Nicolás Pérez <nicolas_perez@brown.edu>
+Nicolò Scipione <nicolo.scipione@codeplay.com>
+Nigel Bosch <pnigelb@gmail.com>
+Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
+Niklas Korz <niklas@niklaskorz.de>
+NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
+Nikolaos Pothitos <pothitos@di.uoa.gr>
+Nikolas <127742645+nneubacher@users.noreply.github.com>
+Nindaleth <Nindaleth@users.noreply.github.com>
+Nuno <rare-magma@posteo.eu>
+OSecret <135510162+OLSecret@users.noreply.github.com>
+Oleksandr Nikitin <oleksandr@tvori.info>
+Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
+Olivier Chafik <ochafik@users.noreply.github.com>
+Ondřej Čertík <ondrej@certik.us>
+Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+PAB <pierreantoine.bannier@gmail.com>
+Pablo Duboue <pablo.duboue@gmail.com>
+Pascal Patry <ppatry@mtacitlabs.com>
+Patrice Ferlet <metal3d@gmail.com>
+Paul Tsochantaris <ptsochantaris@icloud.com>
+Pavel Zloi <github.com@drteam.rocks>
+Pavol Rusnak <pavol@rusnak.io>
+Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
+Pedro Cuenca <pedro@huggingface.co>
+Peter <peter277@users.noreply.github.com>
+Peter Sugihara <peter@campsh.com>
+Phil H <5756783+phiharri@users.noreply.github.com>
+Philip Taron <philip.taron@gmail.com>
+Phillip Kravtsov <phillip@kravtsov.net>
+Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
+Pierrick Hymbert <pierrick.hymbert@gmail.com>
+Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
+Plamen Minev <pacominev@gmail.com>
+Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
+Przemysław Pawełczyk <przemoc@gmail.com>
+Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
+Qingyou Meng <meng.qingyou@gmail.com>
+Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+R0CKSTAR <xiaodong.ye@mthreads.com>
+R0CKSTAR <yeahdongcn@gmail.com>
+RJ Adriaansen <adriaansen@eshcc.eur.nl>
+Radoslav Gerganov <rgerganov@gmail.com>
+Radosław Gryta <radek.gryta@gmail.com>
+Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
+Raj Hammeer Singh Hada <hammeerraj@gmail.com>
+Ralph Soika <ralph.soika@imixs.com>
+Rand Xie <randxiexyy29@gmail.com>
+Randall Fitzgerald <randall@dasaku.net>
+Random Fly <renfei8@live.cn>
+Reinforce-II <fate@eastal.com>
+Rémy Oudompheng <oudomphe@phare.normalesup.org>
+Ren Xuancheng <jklj077@users.noreply.github.com>
+Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
+Reza Kakhki <rezakakhki.de@gmail.com>
+RhinoDevel <RhinoDevel@users.noreply.github.com>
+Riccardo Orlando <Riccorl@users.noreply.github.com>
+Riceball LEE <snowyu.lee@gmail.com>
+Rich Dougherty <rich@rd.nz>
+Richard Kiss <him@richardkiss.com>
+Richard Roberson <richardr1126@gmail.com>
+Rick G <26732651+TheFlipbook@users.noreply.github.com>
+Rickard Edén <rickardeden@gmail.com>
+Rickard Hallerbäck <rickard.hallerback@gmail.com>
+Rickey Bowers Jr <bitRAKE@gmail.com>
+Riley Stewart <ristew@users.noreply.github.com>
+Rinne <AsakusaRinne@gmail.com>
+Rinne <liu_yaohui1998@126.com>
+Robert Brisita <986796+rbrisita@users.noreply.github.com>
+Robert Collins <roberto.tomas.cuentas@gmail.com>
+Robert Ormandi <52251610+ormandi@users.noreply.github.com>
+Robert Sung-wook Shin <edp1096@users.noreply.github.com>
+Robey Holderith <robey@flaminglunchbox.net>
+Robyn <robyngraf@users.noreply.github.com>
+Roger Meier <r.meier@siemens.com>
+Roland <14355895+rbur0425@users.noreply.github.com>
+Romain Biessy <romain.biessy@codeplay.com>
+Romain D <90720+Artefact2@users.noreply.github.com>
+Romain Neutron <romain@neutron.io>
+Roman Parykin <donderom@gmail.com>
+Ron Evans <ron@hybridgroup.com>
+Ron Jailall <rojailal@gmail.com>
+Roni <sulpher@gmx.net>
+Ronny Brendel <ronnybrendel@gmail.com>
+Ronsor <ronsor@ronsor.pw>
+Rowan Hart <rowanbhart@gmail.com>
+Ruan <47767371+ruanych@users.noreply.github.com>
+Ruchira Hasaranga <ruchira66@gmail.com>
+Rudi Servo <rudiservo@gmail.com>
+Ruixin Huang <18860020911@163.com>
+Rune <43761327+Rune-AI@users.noreply.github.com>
+RunningLeon <maningsheng@sensetime.com>
+RunningLeon <mnsheng@yeah.net>
+Ryan Landay <rlanday@gmail.com>
+Ryder Wishart <ryderwishart@gmail.com>
+Ryuei <louixs@users.noreply.github.com>
+Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SRHMorris <69468379+SRHMorris@users.noreply.github.com>
+SXX <sxx1136965276@gmail.com>
+SakuraUmi <yukinon244@gmail.com>
+Salvador E. Tropea <stropea@inti.gob.ar>
+Salvatore Mesoraca <s.mesoraca16@gmail.com>
+Sam Spilsbury <smspillaz@gmail.com>
+Sami Farin <3876865+Safari77@users.noreply.github.com>
+Samuel Maynard <samwmaynard@gmail.com>
+Sang-Kil Park <sang.park@42dot.ai>
+Seb C <47074056+Sebby37@users.noreply.github.com>
+Sebastián A <sebastian.aedo29@gmail.com>
+SebastianApel <13675545+SebastianApel@users.noreply.github.com>
+Senemu <10880819+Senemu@users.noreply.github.com>
+Sergey Alirzaev <zl29ah@gmail.com>
+Sergio López <slp@redhat.com>
+Sergio López <slp@sinrega.org>
+Sertaç Özercan <852750+sozercan@users.noreply.github.com>
+SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
+ShadovvBeast <ShadovvBeast@gmail.com>
+Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shane A <shanea@allenai.org>
+Shangning Xu <32517059+xushangning@users.noreply.github.com>
+Shankar <gshankar.87@gmail.com>
+Shanshan Shen <467638484@qq.com>
+Shijie <821898965@qq.com>
+Shintarou Okada <kokuzen@gmail.com>
+Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
+Shouzheng Liu <lshzh.hi@gmail.com>
+Shuichi Tsutsumi <shuichi0526@gmail.com>
+Shupei Fan <dymarkfan@outlook.com>
+Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
+Simon Willison <swillison@gmail.com>
+Siwen Yu <yusiwen@gmail.com>
+Sky Yan <skyan83@gmail.com>
+Slaren <2141330+slaren@users.noreply.github.com>
+Slava Primenko <primenko.s@gmail.com>
+Small Grass Forest <zixuanxcl@gmail.com>
+SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
+Someone <sergei.kozlukov@aalto.fi>
+Someone Serge <sergei.kozlukov@aalto.fi>
+Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
+Spencer Sutton <spencersutton@users.noreply.github.com>
+Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
+Srinivas Billa <nivibilla@gmail.com>
+Stefan Sydow <stefan@sydow.email>
+Steffen Röcker <sroecker@gmail.com>
+Stephan Walter <stephan@walter.name>
+Stephen Nichols <snichols@users.noreply.github.com>
+Steve Bonds <sbonds@gmail.com>
+Steve Grubb <ausearch.1@gmail.com>
+Steven Prichard <spprichard20@gmail.com>
+Steven Roussey <sroussey@gmail.com>
+Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
+StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
+Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
+Sukriti Sharma <Ssukriti@users.noreply.github.com>
+SuperUserNameMan <yoann@terminajones.com>
+Sutou Kouhei <kou@cozmixng.org>
+Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
+Taikono-Himazin <kazu@po.harenet.ne.jp>
+Tameem <113388789+AhmadTameem@users.noreply.github.com>
+Tamotsu Takahashi <ttakah+github@gmail.com>
+Tei Home <taiteitonghome@proton.me>
+Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
+Thatcher Chamberlin <j.thatcher.c@gmail.com>
+Theia Vogel <theia@vgel.me>
+Thérence <13496987+Royalphax@users.noreply.github.com>
+Thibault Terrasson <thibault.terrasson@gmail.com>
+Thomas Klausner <wiz@gatalith.at>
+Thorsten Sommer <SommerEngineering@users.noreply.github.com>
+Tim Miller <drasticactions@users.noreply.github.com>
+Tim Wang <overocean@gmail.com>
+Timmy Knight <r2d2fish@gmail.com>
+Timothy Cronin <40186632+4imothy@users.noreply.github.com>
+Ting Lou <louting@189.cn>
+Ting Lou <ting.lou@gmail.com>
+Ting Sun <suntcrick@gmail.com>
+Tobias Lütke <tobi@shopify.com>
+Tom C <tom.corelis@gmail.com>
+Tom Jobbins <784313+TheBloke@users.noreply.github.com>
+Tomas <tom.tomas.36478119@gmail.com>
+Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tony Wasserka <4840017+neobrain@users.noreply.github.com>
+Tristan Druyen <tristan@vault81.mozmail.com>
+Tristan Ross <rosscomputerguy@protonmail.com>
+Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
+Tungsten842 <886724vf@anonaddy.me>
+Tungsten842 <quantmint@protonmail.com>
+Tushar <ditsuke@protonmail.com>
+UEXTM.com <84163508+uextm@users.noreply.github.com>
+Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
+Ulrich Drepper <drepper@gmail.com>
+Uzo Nweke <uzoechi@gmail.com>
+Vaibhav Srivastav <vaibhavs10@gmail.com>
+Val Kharitonov <mail@kharvd.com>
+Valentin Konovalov <valle.ketsujin@gmail.com>
+Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com>
+Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Vali Malinoiu <0x4139@gmail.com>
+Victor Nogueira <felladrin@gmail.com>
+Victor Z. Peng <ziliangdotme@gmail.com>
+Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
+Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
+Vlad <spitfireage@gmail.com>
+Vladimir <bogdad@gmail.com>
+Vladimir Malyutin <first-leon@yandex.ru>
+Vladimir Zorin <vladimir@deviant.guru>
+VoidIsVoid <343750470@qq.com>
+Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
+Wang Qin <37098874+wangqin0@users.noreply.github.com>
+Wang Ran (汪然) <wangr@smail.nju.edu.cn>
+WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
+Weird Constructor <weirdconstructor@gmail.com>
+Welby Seely <welbyseely@gmail.com>
+Wentai Zhang <rchardx@gmail.com>
+WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
+William Tambellini <william.tambellini@gmail.com>
+William Tambellini <wtambellini@sdl.com>
+Willy Tarreau <w@1wt.eu>
+Woof Dog <197125663+woof-dog@users.noreply.github.com>
+Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
+Wu Jian Ping <wujjpp@hotmail.com>
+Wu Jian Ping <wujp@greatld.com>
+Xiake Sun <xiake.sun@intel.com>
+Xiang (Kevin) Li <kevinli020508@gmail.com>
+Xiao-Yong Jin <jinxiaoyong@gmail.com>
+XiaotaoChen <chenxiaotao1234@gmail.com>
+Xiaoyi Chen <cxychina@gmail.com>
+Xie Yanbo <xieyanbo@gmail.com>
+Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
+Xuan Son Nguyen <thichthat@gmail.com>
+Xuan-Son Nguyen <thichthat@gmail.com>
+Yaiko <elyaiko@hotmail.com>
+Yann Follet <131855179+YannFollet@users.noreply.github.com>
+Yaroslav <yaroslav.yashin@me.com>
+Yazan Agha-Schrader <mountaiin@icloud.com>
+Yiming Cui <conandiy@vip.qq.com>
+Yishuo Wang <MeouSker77@outlook.com>
+Yoshi Suhara <y.suhara@gmail.com>
+Yoshi Suhara <ysuhara@nvidia.com>
+Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
+Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
+Yüg <eugeniosegalaweb@gmail.com>
+Yui <dev@sleepyyui.com>
+Yun Dou <dixyes@gmail.com>
+Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
+Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
+Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
+ZHAOKAI WANG <sanxianwei@163.com>
+Zane Shannon <z@zcs.me>
+Zay <95888118+isaiahbjork@users.noreply.github.com>
+Zenix <zenixls2@gmail.com>
+Zhang Peiyuan <a1286225768@gmail.com>
+Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
+Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
+Zhiyuan Li <lizhiyuan@uniartisan.com>
+Zhiyuan Li <uniartisan2017@gmail.com>
+ZhouYuChen <zhouyuchen@naver.com>
+Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
+Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
+Zsapi <martin1.zsapka@gmail.com>
+a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
+a3sh <38979186+A3shTnT@users.noreply.github.com>
+adel boussaken <netdur@gmail.com>
+afrideva <95653597+afrideva@users.noreply.github.com>
+ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
+agray3 <agray3@users.noreply.github.com>
+akawrykow <142945436+akawrykow@users.noreply.github.com>
+alek3y <44779186+alek3y@users.noreply.github.com>
+alexpinel <93524949+alexpinel@users.noreply.github.com>
+alonfaraj <alonfaraj@gmail.com>
+alwqx <kenan3015@gmail.com>
+amd-dwang <dong.wang@amd.com>
+amd-lalithnc <lalithnc@amd.com>
+amritahs-ibm <amritahs@linux.vnet.ibm.com>
+andrijdavid <david@geek.mg>
+anon998 <131767832+anon998@users.noreply.github.com>
+anzz1 <anzz1@live.com>
+apaz <aarpazdera@gmail.com>
+apcameron <37645737+apcameron@users.noreply.github.com>
+arch-btw <57669023+arch-btw@users.noreply.github.com>
+arcrank <arcrank@gmail.com>
+ardfork <134447697+ardfork@users.noreply.github.com>
+arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
+aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
+at8u <129688334+at8u@users.noreply.github.com>
+automaticcat <daogiatuank54@gmail.com>
+awatuna <23447591+awatuna@users.noreply.github.com>
+b4b4o <zwbao@foxmail.com>
+bandoti <141645996+bandoti@users.noreply.github.com>
+beiller <beiller@gmail.com>
+bhubbb <79117352+bhubbb@users.noreply.github.com>
+bmwl <brian.marshall@tolko.com>
+bobqianic <129547291+bobqianic@users.noreply.github.com>
+brucepro <git@brucepro.net>
+bryanSwk <93190252+bryanSwk@users.noreply.github.com>
+bsilvereagle <bsilvereagle@users.noreply.github.com>
+bssrdf <merlintiger@hotmail.com>
+byte-6174 <88070277+byte-6174@users.noreply.github.com>
+cduk <19917266+cduk@users.noreply.github.com>
+cebtenzzre <cebtenzzre@gmail.com>
+chaihahaha <chai836275709@gmail.com>
+chiranko <96988916+chiranko@users.noreply.github.com>
+clibdev <52199778+clibdev@users.noreply.github.com>
+clyang <clyang@clyang.net>
+cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
+codezjx <code.zjx@gmail.com>
+coezbek <c.oezbek@gmail.com>
+comex <comexk@gmail.com>
+compilade <113953597+compilade@users.noreply.github.com>
+compilade <git@compilade.net>
+cpumaxx <163466046+cpumaxx@users.noreply.github.com>
+crasm <crasm@git.vczf.net>
+crasm <crasm@git.vczf.us>
+daboe01 <daboe01@googlemail.com>
+daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
+daminho <37615795+daminho@users.noreply.github.com>
+david raistrick <keen99@users.noreply.github.com>
+ddh0 <dylanhalladay02@icloud.com>
+ddpasa <112642920+ddpasa@users.noreply.github.com>
+deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+devojony <61173062+devojony@users.noreply.github.com>
+ditsuke <ditsuke@protonmail.com>
+divinity76 <divinity76@gmail.com>
+dm4 <sunrisedm4@gmail.com>
+dotpy314 <33351922+dotpy314@users.noreply.github.com>
+drbh <david.richard.holtz@gmail.com>
+ds5t5 <145942675+ds5t5@users.noreply.github.com>
+dylan <canardleteer@users.noreply.github.com>
+eastriver <lee@eastriver.dev>
+ebraminio <ebrahim@gnu.org>
+ebraminio <ebraminio@gmail.com>
+eiery <19350831+eiery@users.noreply.github.com>
+eric8607242 <e0928021388@gmail.com>
+fairydreaming <166155368+fairydreaming@users.noreply.github.com>
+fengerhu1 <2748250768@qq.com>
+fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
+fraxy-v <65565042+fraxy-v@users.noreply.github.com>
+github-actions[bot] <github-actions[bot]@users.noreply.github.com>
+gliptic <gliptic@users.noreply.github.com>
+gn64 <yukikaze.jp@gmail.com>
+goerch <jhr.walter@t-online.de>
+grahameth <96447521+grahameth@users.noreply.github.com>
+gtygo <gtydoit@gmail.com>
+gwjr <502526+gwjr@users.noreply.github.com>
+h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
+hankcs <cnhankmc@gmail.com>
+haopeng <657407891@qq.com>
+hipudding <huafengchun@gmail.com>
+hoangmit <hoangmit@users.noreply.github.com>
+hongbo.mo <352280764@qq.com>
+hopkins385 <98618192+hopkins385@users.noreply.github.com>
+howlger <eclipse@voormann.de>
+howlger <github@voormann.de>
+hutli <6594598+hutli@users.noreply.github.com>
+hutli <hutli@hutli.hu>
+hutli <jensstaermose@hotmail.com>
+hxer7963 <hxer7963@gmail.com>
+hydai <z54981220@gmail.com>
+iSma <ismail.senhaji@gmail.com>
+iacore <74560659+iacore@users.noreply.github.com>
+icppWorld <124377669+icppWorld@users.noreply.github.com>
+igarnier <igarnier@protonmail.com>
+intelmatt <61025942+intelmatt@users.noreply.github.com>
+iohub <rickyang.pro@gmail.com>
+issixx <46835150+issixx@users.noreply.github.com>
+jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
+jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
+jameswu2014 <545426914@qq.com>
+jdomke <28772296+jdomke@users.noreply.github.com>
+jiahao su <damow890@gmail.com>
+jiez <373447296@qq.com>
+jneem <joeneeman@gmail.com>
+joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
+johnson442 <56517414+johnson442@users.noreply.github.com>
+jojorne <jojorne@users.noreply.github.com>
+jon-chuang <9093549+jon-chuang@users.noreply.github.com>
+jp-x-g <jpxg-dev@protonmail.com>
+jukofyork <69222624+jukofyork@users.noreply.github.com>
+junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
+jwj7140 <32943891+jwj7140@users.noreply.github.com>
+k.h.lai <adrian.k.h.lai@outlook.com>
+kaizau <kaizau@users.noreply.github.com>
+kallewoof <kalle.alm@gmail.com>
+kalomaze <66376113+kalomaze@users.noreply.github.com>
+kang <tpdns9032100@gmail.com>
+katsu560 <118887472+katsu560@users.noreply.github.com>
+kchro3 <62481661+kchro3@users.noreply.github.com>
+khimaros <me@khimaros.com>
+kiltyj <kiltyj@gmail.com>
+klosax <131523366+klosax@users.noreply.github.com>
+krystiancha <krystian@krystianch.com>
+kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
+kunnis <kunnis@users.noreply.github.com>
+kuronekosaiko <EvanChanJ@163.com>
+kustaaya <58045274+kustaaya@users.noreply.github.com>
+kuvaus <22169537+kuvaus@users.noreply.github.com>
+kwin1412 <42286931+kwin1412@users.noreply.github.com>
+l3utterfly <gc.pthzfoldr@gmail.com>
+laik <laik.lj@me.com>
+ldwang <ftgreat@163.com>
+le.chang <cljs118@126.com>
+leejet <leejet714@gmail.com>
+leo-pony <nengjunma@outlook.com>
+lexasub <lexakopp2212@gmail.com>
+lhez <quic_lih@quicinc.com>
+limitedAtonement <limitedAtonement@users.noreply.github.com>
+liuwei-git <14815172+liuwei-git@users.noreply.github.com>
+lon <114724657+longregen@users.noreply.github.com>
+loonerin <132926317+loonerin@users.noreply.github.com>
+ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
+luoyu-intel <yu.luo@intel.com>
+m3ndax <adrian.goessl@outlook.com>
+maddes8cht <55592906+maddes8cht@users.noreply.github.com>
+mahorozte <41834471+mahorozte@users.noreply.github.com>
+makomk <makosoft@googlemail.com>
+manikbhandari <mbbhandarimanik2@gmail.com>
+maor-ps <154728172+maor-ps@users.noreply.github.com>
+mashdragon <122402293+mashdragon@users.noreply.github.com>
+matiaslin <45382001+matiaslin@users.noreply.github.com>
+matt23654 <matthew.webber@protonmail.com>
+matteo <matteogeniaccio@yahoo.it>
+mdrokz <mohammadmunshi@gmail.com>
+mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
+minarchist <minarchist@users.noreply.github.com>
+mj-shifu <77107165+mj-shifu@users.noreply.github.com>
+mmyjona <jonathan.gonse@gmail.com>
+momonga <115213907+mmnga@users.noreply.github.com>
+momonga <146910567+mmngays@users.noreply.github.com>
+moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
+musoles <135031143+musoles@users.noreply.github.com>
+mzcu <milos.cubrilo@gmail.com>
+nanahi <130121847+na-na-hi@users.noreply.github.com>
+ngc92 <7938269+ngc92@users.noreply.github.com>
+nhamanasu <45545786+nhamanasu@users.noreply.github.com>
+niansa/tuxifan <anton-sa@web.de>
+niansa/tuxifan <tuxifan@posteo.de>
+nickp27 <nb.porter@gmail.com>
+ningshanwutuobang <ningshanwutuobang@gmail.com>
+nold <Nold360@users.noreply.github.com>
+nopperl <54780682+nopperl@users.noreply.github.com>
+nusu-github <29514220+nusu-github@users.noreply.github.com>
+olexiyb <olexiyb@gmail.com>
+omahs <73983677+omahs@users.noreply.github.com>
+oobabooga <112222186+oobabooga@users.noreply.github.com>
+opparco <parco.opaai@gmail.com>
+ostix360 <55257054+ostix360@users.noreply.github.com>
+pculliton <phillipculliton@gmail.com>
+peidaqi <peidaqi@gmail.com>
+pengxin99 <pengxin.yuan@intel.com>
+perserk <perserk@gmail.com>
+piDack <104877312+piDack@users.noreply.github.com>
+pmysl <piotr.myslinski@outlook.com>
+postmasters <namnguyen@google.com>
+pudepiedj <pudepiedj@gmail.com>
+qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
+qingy1337 <qxli2@students.everettcc.edu>
+qouoq <qouoq@fastmail.com>
+qunash <anzoria@gmail.com>
+rabidcopy <rabidcopy@yahoo.com>
+rankaiyx <rankaiyx@rankaiyx.com>
+redbeard <bharrington@alticon.net>
+rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
+rhuddleston <ryan.huddleston@percona.com>
+rimoliga <53384203+rimoliga@users.noreply.github.com>
+runfuture <runfuture@users.noreply.github.com>
+sandyiscool <sandyiscool@gmail.com>
+sasha0552 <admin@sasha0552.org>
+semidark <me@semidark.net>
+serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
+sharpHL <132747147+sharpHL@users.noreply.github.com>
+shibe2 <shibe@tuta.io>
+singularity <12184989+singularity-s0@users.noreply.github.com>
+sjinzh <sjinzh@gmail.com>
+sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
+slaren <2141330+slaren@users.noreply.github.com>
+slaren <slarengh@gmail.com>
+snadampal <87143774+snadampal@users.noreply.github.com>
+someone13574 <81528246+someone13574@users.noreply.github.com>
+standby24x7 <standby24x7@gmail.com>
+staviq <staviq@gmail.com>
+stduhpf <stephduh@live.fr>
+strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
+swittk <switt1995@gmail.com>
+takov751 <40316768+takov751@users.noreply.github.com>
+tarcey <cey.tarik@gmail.com>
+tc-mb <157115220+tc-mb@users.noreply.github.com>
+texmex76 <40733439+texmex76@users.noreply.github.com>
+thement <40525767+thement@users.noreply.github.com>
+thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
+tjohnman <tjohnman@users.noreply.github.com>
+toyer <2042519524@qq.com>
+tslmy <tslmy@users.noreply.github.com>
+ubik2 <ubik2@users.noreply.github.com>
+uint256_t <konndennsa@gmail.com>
+uint256_t <maekawatoshiki1017@gmail.com>
+unbounded <haakon@likedan.net>
+uvos <devnull@uvos.xyz>
+uvos <philipp@uvos.xyz>
+valiray <133289098+valiray@users.noreply.github.com>
+vb <vaibhavs10@gmail.com>
+vik <vikhyatk@gmail.com>
+viric <viric@viric.name>
+vodkaslime <646329483@qq.com>
+vvhg1 <94630311+vvhg1@users.noreply.github.com>
+vxiiduu <73044267+vxiiduu@users.noreply.github.com>
+wangshuai09 <391746016@qq.com>
+wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
+whoreson <139810751+whoreson@users.noreply.github.com>
+woachk <24752637+woachk@users.noreply.github.com>
+wonjun Jang <strutive07@gmail.com>
+woodx <124784234+woodx9@users.noreply.github.com>
+wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
+wzy <32936898+Freed-Wu@users.noreply.github.com>
+xaedes <xaedes@gmail.com>
+xaedes <xaedes@googlemail.com>
+xctan <axunlei@gmail.com>
+xloem <0xloem@gmail.com>
+yangli2 <yangli2@gmail.com>
+ymcki <84055651+ymcki@users.noreply.github.com>
+yuiseki <yuiseki@gmail.com>
+yuri@FreeBSD <yurivict@users.noreply.github.com>
+zakkor <edward.partenie@gmail.com>
+zhangkaihuo <zhangkaihuo@gmail.com>
+zhentaoyu <zhentao.yu@intel.com>
+zhouwg <6889919+zhouwg@users.noreply.github.com>
+zhouwg <zhouwg2000@gmail.com>
+zrm <trustiosity.zrm@gmail.com>
+Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
+杨朱 · Kiki <baofa.fan@daocloud.io>
+源文雨 <41315874+fumiama@users.noreply.github.com>
+蕭澧邦 <45505768+shou692199@users.noreply.github.com>
+谢乃闻 <sienaiwun@users.noreply.github.com>
+Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48880f720..7b2a1845e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,10 @@
-cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)
 
+#set(CMAKE_WARN_DEPRECATED YES)
+set(CMAKE_WARN_UNUSED_CLI YES)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -9,11 +12,17 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+# Add path to modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     set(LLAMA_STANDALONE ON)
 
+    include(git-vars)
+
     # configure project version
     # TODO
 else()
@@ -32,1120 +41,138 @@ else()
     endif()
 endif()
 
-
-#
-# Option list
-#
-
-if (APPLE)
-    set(LLAMA_METAL_DEFAULT ON)
-else()
-    set(LLAMA_METAL_DEFAULT OFF)
-endif()
-
-# general
-option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
-option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
-option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
-option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)
-
-# debug
-option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY     "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF                      "llama: enable gprof"                                   OFF)
-
-# build
-option(LLAMA_FATAL_WARNINGS             "llama: enable -Werror flag"                            OFF)
-
-# sanitizers
-option(LLAMA_SANITIZE_THREAD            "llama: enable thread sanitizer"                        OFF)
-option(LLAMA_SANITIZE_ADDRESS           "llama: enable address sanitizer"                       OFF)
-option(LLAMA_SANITIZE_UNDEFINED         "llama: enable undefined sanitizer"                     OFF)
-
-# instruction set specific
-if (LLAMA_NATIVE)
-    set(INS_ENB OFF)
-else()
-    set(INS_ENB ON)
-endif()
-
-option(LLAMA_AVX                             "llama: enable AVX"                                ${INS_ENB})
-option(LLAMA_AVX2                            "llama: enable AVX2"                               ${INS_ENB})
-option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
-option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
-option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
-option(LLAMA_FMA                             "llama: enable FMA"                                ${INS_ENB})
-# in MSVC F16C is implied with AVX2/AVX512
-if (NOT MSVC)
-    option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
-endif()
+option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 
 if (WIN32)
-    set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()
 
+if (MSVC)
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
+endif()
+
+#
+# option list
+#
+
+# debug
+option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+
+# build
+option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
+
+# sanitizers
+option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
+option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+
+# utils
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
+
+# extra artifacts
+option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
+
 # 3rd party libs
-option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
-option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
-#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
-option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
-option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
-set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
-set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
-set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                             "llama: max. batch size for using peer access")
-option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
-option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
-option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
-option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
-option(LLAMA_VULKAN_CHECK_RESULTS            "llama: run Vulkan op checks"                      OFF)
-option(LLAMA_VULKAN_DEBUG                    "llama: enable Vulkan debug output"                OFF)
-option(LLAMA_VULKAN_VALIDATE                 "llama: enable Vulkan validation"                  OFF)
-option(LLAMA_VULKAN_RUN_TESTS                "llama: run Vulkan tests"                          OFF)
-option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
-option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
-option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
-option(LLAMA_METAL_EMBED_LIBRARY             "llama: embed Metal library"                       OFF)
-option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
-option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
-option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
-option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
-option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
-option(LLAMA_CPU_HBM                         "llama: use memkind for CPU HBM"                   OFF)
-
-option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
-
-# add perf arguments
-option(LLAMA_PERF                            "llama: enable perf"                               OFF)
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
 # Required for relocatable CMake package
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 
-#
-# Compile flags
-#
+# override ggml options
+set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
 
-if (LLAMA_SYCL)
-    set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(CMAKE_C_STANDARD_REQUIRED true)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-
-find_package(Threads REQUIRED)
-include(CheckCXXCompilerFlag)
-
-# enable libstdc++ assertions for debug builds
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+    set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()
 
+# transition helpers
+function (llama_option_depr TYPE OLD NEW)
+    if (${OLD})
+        message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
+        set(${NEW} ON PARENT_SCOPE)
+    endif()
+endfunction()
+
+llama_option_depr(FATAL_ERROR LLAMA_CUBLAS              GGML_CUDA)
+llama_option_depr(WARNING     LLAMA_CUDA                GGML_CUDA)
+llama_option_depr(WARNING     LLAMA_KOMPUTE             GGML_KOMPUTE)
+llama_option_depr(WARNING     LLAMA_METAL               GGML_METAL)
+llama_option_depr(WARNING     LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
+llama_option_depr(WARNING     LLAMA_NATIVE              GGML_NATIVE)
+llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
+llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
+llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
+llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)
+
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
+        message(STATUS "Using -fsanitize=thread")
+
         add_compile_options(-fsanitize=thread)
         link_libraries     (-fsanitize=thread)
     endif()
 
     if (LLAMA_SANITIZE_ADDRESS)
+        message(STATUS "Using -fsanitize=address")
+
         add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
         link_libraries     (-fsanitize=address)
     endif()
 
     if (LLAMA_SANITIZE_UNDEFINED)
+        message(STATUS "Using -fsanitize=undefined")
+
         add_compile_options(-fsanitize=undefined)
         link_libraries     (-fsanitize=undefined)
     endif()
 endif()
 
-if (APPLE AND LLAMA_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
+#
+# 3rd-party
+#
 
-        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-
-    message(STATUS "Metal framework found")
-    set(GGML_HEADERS_METAL ggml-metal.h)
-    set(GGML_SOURCES_METAL ggml-metal.m)
-
-    add_compile_definitions(GGML_USE_METAL)
-    if (LLAMA_METAL_NDEBUG)
-        add_compile_definitions(GGML_METAL_NDEBUG)
-    endif()
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-    if (LLAMA_METAL_EMBED_LIBRARY)
-        enable_language(ASM)
-        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
-
-        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
-        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
-
-        add_custom_command(
-            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-            DEPENDS ${METALLIB_SOURCE}
-            COMMENT "Generate assembly for embedded Metal library"
-        )
-
-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-    endif()
-
-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
-        endif()
-
-        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
-            COMMENT "Compiling Metal kernels"
-        )
-
-        add_custom_target(
-            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        )
-    endif()
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        )
-endif()
-if (LLAMA_BLAS)
-    if (LLAMA_STATIC)
-        set(BLA_STATIC ON)
-    endif()
-    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
-
-    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
-    find_package(BLAS)
-
-    if (BLAS_FOUND)
-        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
-
-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
-            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
-            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
-            find_package(PkgConfig REQUIRED)
-            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
-                pkg_check_modules(DepBLAS REQUIRED blas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
-                pkg_check_modules(DepBLAS REQUIRED blis)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
-                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
-                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
-                # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
-                # this doesn't provide pkg-config
-                # suggest to assign BLAS_INCLUDE_DIRS on your own
-                if ("${NVHPC_VERSION}" STREQUAL "")
-                    message(WARNING "Better to set NVHPC_VERSION")
-                else()
-                    set(DepBLAS_FOUND ON)
-                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
-                endif()
-            endif()
-            if (DepBLAS_FOUND)
-                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
-            else()
-                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
-                " detected by pkgconfig, trying to find cblas.h from possible paths...")
-                find_path(BLAS_INCLUDE_DIRS
-                    NAMES cblas.h
-                    HINTS
-                        /usr/include
-                        /usr/local/include
-                        /usr/include/openblas
-                        /opt/homebrew/opt/openblas/include
-                        /usr/local/opt/openblas/include
-                        /usr/include/x86_64-linux-gnu/openblas/include
-                )
-            endif()
-        endif()
-
-        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
-
-        add_compile_options(${BLAS_LINKER_FLAGS})
-
-        add_compile_definitions(GGML_USE_OPENBLAS)
-
-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
-            add_compile_definitions(GGML_BLAS_USE_MKL)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
-    else()
-        message(WARNING "BLAS not found, please refer to "
-        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-        " to set correct LLAMA_BLAS_VENDOR")
-    endif()
-endif()
-
-if (LLAMA_QKK_64)
-    add_compile_definitions(GGML_QKK_64)
-endif()
-
-if (LLAMA_CUBLAS)
-    cmake_minimum_required(VERSION 3.17)
-
-    find_package(CUDAToolkit)
-    if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
-
-        enable_language(CUDA)
-
-        set(GGML_HEADERS_CUDA ggml-cuda.h)
-        set(GGML_SOURCES_CUDA ggml-cuda.cu)
-
-        add_compile_definitions(GGML_USE_CUBLAS)
-        if (LLAMA_CUDA_FORCE_DMMV)
-            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-        endif()
-        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        if (DEFINED LLAMA_CUDA_DMMV_Y)
-            add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
-        endif()
-        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-            add_compile_definitions(GGML_CUDA_F16)
-        endif()
-        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
-
-        if (LLAMA_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
-        else()
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
-
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == f16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
-            #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-    else()
-        message(WARNING "cuBLAS not found")
-    endif()
-endif()
-
-if (LLAMA_MPI)
-    cmake_minimum_required(VERSION 3.10)
-    find_package(MPI)
-    if (MPI_C_FOUND)
-        message(STATUS "MPI found")
-
-        set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
-
-        add_compile_definitions(GGML_USE_MPI)
-        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-
-        if (NOT MSVC)
-            add_compile_options(-Wno-cast-qual)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}     ${MPI_CXX_LIBRARIES})
-        endif()
-    else()
-        message(WARNING "MPI not found")
-    endif()
-endif()
-
-if (LLAMA_CLBLAST)
-    find_package(CLBlast)
-    if (CLBlast_FOUND)
-        message(STATUS "CLBlast found")
-
-        set(GGML_HEADERS_OPENCL ggml-opencl.h)
-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
-
-        add_compile_definitions(GGML_USE_CLBLAST)
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
-    else()
-        message(WARNING "CLBlast not found")
-    endif()
-endif()
-
-if (LLAMA_VULKAN)
-    find_package(Vulkan)
-    if (Vulkan_FOUND)
-        message(STATUS "Vulkan found")
-
-        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
-        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
-
-        add_compile_definitions(GGML_USE_VULKAN)
-
-        if (LLAMA_VULKAN_CHECK_RESULTS)
-            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
-        endif()
-
-        if (LLAMA_VULKAN_DEBUG)
-            add_compile_definitions(GGML_VULKAN_DEBUG)
-        endif()
-
-        if (LLAMA_VULKAN_VALIDATE)
-            add_compile_definitions(GGML_VULKAN_VALIDATE)
-        endif()
-
-        if (LLAMA_VULKAN_RUN_TESTS)
-            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
-    else()
-        message(WARNING "Vulkan not found")
-    endif()
-endif()
-
-if (LLAMA_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
-
-    find_package(hip     REQUIRED)
-    find_package(hipblas REQUIRED)
-    find_package(rocblas REQUIRED)
-
-    message(STATUS "HIP and hipBLAS found")
-
-    set(GGML_HEADERS_ROCM ggml-cuda.h)
-    set(GGML_SOURCES_ROCM ggml-cuda.cu)
-
-    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-
-    if (LLAMA_HIP_UMA)
-        add_compile_definitions(GGML_HIP_UMA)
-    endif()
-
-    if (LLAMA_CUDA_FORCE_DMMV)
-        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-    endif()
-
-    if (LLAMA_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-    endif()
-
-    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-
-    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-
-    if (LLAMA_STATIC)
-        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-    endif()
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-endif()
-
-if (LLAMA_SYCL)
-    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
-    endif()
-    #todo: AOT
-
-    find_package(IntelSYCL REQUIRED)
-
-    message(STATUS "SYCL found")
-
-    add_compile_definitions(GGML_USE_SYCL)
-
-    if (LLAMA_SYCL_F16)
-        add_compile_definitions(GGML_SYCL_F16)
-    endif()
-
-    add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})
-
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
-
-    set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
-
-    if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
-    else()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-    endif()
-endif()
-
-if (LLAMA_KOMPUTE)
-    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
-    find_package(Vulkan COMPONENTS glslc REQUIRED)
-    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
-    if (NOT glslc_executable)
-        message(FATAL_ERROR "glslc not found")
-    endif()
-
-    function(compile_shader)
-        set(options)
-        set(oneValueArgs)
-        set(multiValueArgs SOURCES)
-        cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        foreach(source ${compile_shader_SOURCES})
-            get_filename_component(filename ${source} NAME)
-            set(spv_file ${filename}.spv)
-            add_custom_command(
-                OUTPUT ${spv_file}
-                DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
-                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
-                COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-                COMMENT "Compiling ${source} to ${spv_file}"
-                )
-
-            get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
-            set(FILE_NAME "shader${RAW_FILE_NAME}")
-            string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
-            string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
-            string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
-            set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
-            message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-            if(CMAKE_GENERATOR MATCHES "Visual Studio")
-                add_custom_command(
-                    OUTPUT ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    DEPENDS ${spv_file} xxd
-                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
-                    )
-            else()
-                add_custom_command(
-                    OUTPUT ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-                    DEPENDS ${spv_file} xxd
-                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
-                    )
-            endif()
-        endforeach()
-    endfunction()
-
-    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
-        message(STATUS "Kompute found")
-        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
-        add_subdirectory(kompute)
-
-        # Compile our shaders
-        compile_shader(SOURCES
-            kompute-shaders/op_scale.comp
-            kompute-shaders/op_scale_8.comp
-            kompute-shaders/op_add.comp
-            kompute-shaders/op_addrow.comp
-            kompute-shaders/op_mul.comp
-            kompute-shaders/op_silu.comp
-            kompute-shaders/op_relu.comp
-            kompute-shaders/op_gelu.comp
-            kompute-shaders/op_softmax.comp
-            kompute-shaders/op_norm.comp
-            kompute-shaders/op_rmsnorm.comp
-            kompute-shaders/op_diagmask.comp
-            kompute-shaders/op_mul_mat_mat_f32.comp
-            kompute-shaders/op_mul_mat_f16.comp
-            kompute-shaders/op_mul_mat_q8_0.comp
-            kompute-shaders/op_mul_mat_q4_0.comp
-            kompute-shaders/op_mul_mat_q4_1.comp
-            kompute-shaders/op_mul_mat_q6_k.comp
-            kompute-shaders/op_getrows_f16.comp
-            kompute-shaders/op_getrows_q4_0.comp
-            kompute-shaders/op_getrows_q4_1.comp
-            kompute-shaders/op_getrows_q6_k.comp
-            kompute-shaders/op_rope_f16.comp
-            kompute-shaders/op_rope_f32.comp
-            kompute-shaders/op_cpy_f16_f16.comp
-            kompute-shaders/op_cpy_f16_f32.comp
-            kompute-shaders/op_cpy_f32_f16.comp
-            kompute-shaders/op_cpy_f32_f32.comp
-        )
-
-        # Create a custom target for our generated shaders
-        add_custom_target(generated_shaders DEPENDS
-            shaderop_scale.h
-            shaderop_scale_8.h
-            shaderop_add.h
-            shaderop_addrow.h
-            shaderop_mul.h
-            shaderop_silu.h
-            shaderop_relu.h
-            shaderop_gelu.h
-            shaderop_softmax.h
-            shaderop_norm.h
-            shaderop_rmsnorm.h
-            shaderop_diagmask.h
-            shaderop_mul_mat_mat_f32.h
-            shaderop_mul_mat_f16.h
-            shaderop_mul_mat_q8_0.h
-            shaderop_mul_mat_q4_0.h
-            shaderop_mul_mat_q4_1.h
-            shaderop_mul_mat_q6_k.h
-            shaderop_getrows_f16.h
-            shaderop_getrows_q4_0.h
-            shaderop_getrows_q4_1.h
-            shaderop_getrows_q6_k.h
-            shaderop_rope_f16.h
-            shaderop_rope_f32.h
-            shaderop_cpy_f16_f16.h
-            shaderop_cpy_f16_f32.h
-            shaderop_cpy_f32_f16.h
-            shaderop_cpy_f32_f32.h
-        )
-
-        # Create a custom command that depends on the generated_shaders
-        add_custom_command(
-            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
-            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
-            DEPENDS generated_shaders
-            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
-        )
-
-        # Add the stamp to the main sources to ensure dependency tracking
-        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-        set(GGML_HEADERS_KOMPUTE ggml-kompute.h   ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-
-        add_compile_definitions(GGML_USE_KOMPUTE)
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
-    else()
-        message(WARNING "Kompute not found")
-    endif()
-endif()
-
-if (LLAMA_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-
-    add_compile_definitions(GGML_USE_CPU_HBM)
-
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
-
-if (LLAMA_PERF)
-    add_compile_definitions(GGML_PERF)
-endif()
-
-function(get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            list(APPEND CXX_FLAGS -Wno-format-truncation)
-        endif()
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
-if (LLAMA_FATAL_WARNINGS)
-    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        list(APPEND C_FLAGS   -Werror)
-        list(APPEND CXX_FLAGS -Werror)
-    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-        add_compile_options(/WX)
-    endif()
-endif()
-
-if (LLAMA_ALL_WARNINGS)
-    if (NOT MSVC)
-        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                                  -Werror=implicit-int -Werror=implicit-function-declaration)
-        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
-
-        list(APPEND C_FLAGS   ${WARNING_FLAGS})
-        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
-
-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-    else()
-        # todo : msvc
-        set(C_FLAGS   "")
-        set(CXX_FLAGS "")
-    endif()
-endif()
-
-set(CUDA_CXX_FLAGS "")
-
-if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (LLAMA_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
-    endif()
-
-    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    endif()
-endif()
-
-if (WIN32)
-    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-
-    if (BUILD_SHARED_LIBS)
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
-
-if (LLAMA_LTO)
-    include(CheckIPOSupported)
-    check_ipo_supported(RESULT result OUTPUT output)
-    if (result)
-        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
-    else()
-        message(WARNING "IPO is not supported: ${output}")
-    endif()
-endif()
-
-if (LLAMA_CCACHE)
-    find_program(LLAMA_CCACHE_FOUND ccache)
-    if (LLAMA_CCACHE_FOUND)
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
-        set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.")
-    else()
-        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF")
-    endif ()
-endif()
-
-# this version of Apple ld64 is buggy
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-    OUTPUT_QUIET
-)
-
-if (output MATCHES "dyld-1015\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
-# Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-#       feel free to update the Makefile for your architecture and send a pull request or issue
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-if (MSVC)
-    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
-else ()
-    set(CMAKE_GENERATOR_PLATFORM_LWR "")
-endif ()
-
-if (NOT MSVC)
-    if (LLAMA_STATIC)
-        add_link_options(-static)
-        if (MINGW)
-            add_link_options(-static-libgcc -static-libstdc++)
-        endif()
-    endif()
-    if (LLAMA_GPROF)
-        add_compile_options(-pg)
-    endif()
-endif()
-
-set(ARCH_FLAGS "")
-
-if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-    message(STATUS "ARM detected")
-    if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
-
-        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-        if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        endif ()
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-        endif ()
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
-    else()
-        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
-            # Raspberry Pi 1, Zero
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-                # Android armeabi-v7a
-                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
-            else()
-                # Raspberry Pi 2
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
-            endif()
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-            # Android arm64-v8a
-            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            list(APPEND ARCH_FLAGS -mno-unaligned-access)
-        endif()
-    endif()
-elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
-        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
-    message(STATUS "x86 detected")
-    if (MSVC)
-        # instruction set detection for MSVC only
-        if (LLAMA_NATIVE)
-            include(cmake/FindSIMD.cmake)
-        endif ()
-        if (LLAMA_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (LLAMA_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (LLAMA_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-        elseif (LLAMA_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
-        elseif (LLAMA_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
-        endif()
-    else()
-        if (LLAMA_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (LLAMA_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
-        if (LLAMA_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
-        endif()
-        if (LLAMA_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
-        endif()
-        if (LLAMA_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
-        endif()
-        if (LLAMA_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (LLAMA_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (LLAMA_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
-    else()
-        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
-    endif()
-else()
-    message(STATUS "Unknown architecture")
-endif()
-
-add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-
-if (LLAMA_CUBLAS)
-    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
-    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-endif()
-
-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
 
 #
-# POSIX conformance
+# build the library
 #
 
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
-    add_compile_definitions(_XOPEN_SOURCE=700)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions(_GNU_SOURCE)
-endif()
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (
-    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
-    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
-)
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
+add_subdirectory(src)
 
 #
-# libraries
+# utils, programs, examples and tests
 #
 
-# ggml
-
-add_library(ggml OBJECT
-            ggml.c
-            ggml.h
-            ggml-alloc.c
-            ggml-alloc.h
-            ggml-backend.c
-            ggml-backend.h
-            ggml-quants.c
-            ggml-quants.h
-            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
-            )
-
-target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
-target_compile_features   (ggml PUBLIC c_std_11) # don't bump
-
-target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-
-add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
-    target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-    install(TARGETS ggml_shared LIBRARY)
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
 endif()
 
-# llama
-
-add_library(llama
-            llama.cpp
-            llama.h
-            )
-
-target_include_directories(llama PUBLIC .)
-target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
-
-target_link_libraries(llama PRIVATE
-    ggml
-    ${LLAMA_EXTRA_LIBS}
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    if (LLAMA_METAL)
-        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-    endif()
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    include(CTest)
+    add_subdirectory(tests)
 endif()
 
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+    add_subdirectory(pocs)
+endif()
 
 #
 # install
@@ -1154,46 +181,43 @@ endif()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
-set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
-    CACHE PATH "Location of header files")
-set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
-    CACHE PATH "Location of library files")
-set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
-    CACHE PATH "Location of binary files")
-set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
-set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
-get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
+set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
+
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+
+set_target_properties(llama
+    PROPERTIES
+        PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
+
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
 
 configure_package_config_file(
-        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
-        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
-    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
     PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
               LLAMA_LIB_INSTALL_DIR
               LLAMA_BIN_INSTALL_DIR )
 
 write_basic_package_version_file(
-        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+        ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
     VERSION ${LLAMA_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)
 
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
-              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
-
-set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-        "${GGML_HEADERS_CUDA}"  "${GGML_HEADERS_OPENCL}"
-        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
-
-set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
-install(TARGETS ggml PUBLIC_HEADER)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
-install(TARGETS llama LIBRARY PUBLIC_HEADER)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
 
 install(
-    FILES convert.py
+    FILES convert_hf_to_gguf.py
     PERMISSIONS
         OWNER_READ
         OWNER_WRITE
@@ -1203,40 +227,10 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
-if (LLAMA_METAL)
-    install(
-        FILES ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
-endif()
 
-#
-# programs, examples and tests
-#
+configure_file(cmake/llama.pc.in
+        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        @ONLY)
 
-add_subdirectory(common)
-
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif ()
-
-if (LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-    add_subdirectory(pocs)
-endif()
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 000000000..13bdd7907
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,97 @@
+{
+  "version": 4,
+  "configurePresets": [
+    {
+        "name":  "base",
+        "hidden": true,
+        "generator":   "Ninja",
+        "binaryDir":   "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+        }
+    },
+    {
+        "name": "sycl-base",
+        "hidden": true,
+        "generator": "Ninja",
+        "binaryDir": "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_CXX_COMPILER": "icx",
+            "CMAKE_C_COMPILER": "cl",
+            "GGML_SYCL": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+        }
+    },
+    { "name": "debug",    "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
+    { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
+
+    {
+        "name": "x64-windows-llvm", "hidden": true,
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
+        }
+    },
+
+    {
+        "name": "arm64-windows-msvc", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+        }
+    },
+
+    {
+        "name": "arm64-windows-llvm", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+        }
+    },
+
+    {
+        "name": "arm64-apple-clang", "hidden": true,
+        "architecture": { "value": "arm64",    "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
+        }
+    },
+
+    { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
+
+    { "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+    { "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+    { "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang",  "reldbg", "static" ] },
+
+    { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
+
+    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
+    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
+    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
+    { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
+
+    { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
+
+    { "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
+    { "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
+  ]
+}
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 000000000..72d594b46
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,11 @@
+# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
+
+/ci/ @ggerganov
+/.devops/*.Dockerfile @ngxson
+/examples/server/ @ngxson
+/ggml/src/ggml-cuda/fattn* @JohannesGaessler
+/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
+/ggml/src/ggml-opt.cpp @JohannesGaessler
+/ggml/src/gguf.cpp @JohannesGaessler
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..8d411982b
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,125 @@
+# Pull requests (for contributors)
+
+- Test your changes:
+    - Execute [the full CI locally on your machine](ci/README.md) before publishing
+    - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
+    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
+    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+
+# Pull requests (for collaborators)
+
+- Squash-merge PRs
+- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
+
+# Coding guidelines
+
+- Avoid adding third-party dependencies, extra files, extra headers, etc.
+- Always consider cross-compatibility with other operating systems and architectures
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
+- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
+- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
+    - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
+    ```cpp
+    // OK
+    llama_context * ctx;
+    const llama_rope_type rope_type;
+
+    // not OK
+    struct llama_context * ctx;
+    const enum llama_rope_type rope_type;
+    ```
+
+    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
+
+- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
+- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
+- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)
+
+# Naming guidelines
+
+- Use `snake_case` for function, variable and type names
+- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+
+    ```cpp
+    // not OK
+    int small_number;
+    int big_number;
+
+    // OK
+    int number_small;
+    int number_big;
+    ```
+
+- Enum values are always in upper case and prefixed with the enum name
+
+    ```cpp
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_NONE = 0,
+        LLAMA_VOCAB_TYPE_SPM  = 1,
+        LLAMA_VOCAB_TYPE_BPE  = 2,
+        LLAMA_VOCAB_TYPE_WPM  = 3,
+        LLAMA_VOCAB_TYPE_UGM  = 4,
+        LLAMA_VOCAB_TYPE_RWKV = 5,
+    };
+    ```
+
+- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
+
+    ```cpp
+    llama_model_init();           // class: "llama_model",         method: "init"
+    llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
+    llama_sampler_get_seed();     // class: "llama_sampler",       method: "get_seed"
+    llama_set_embeddings();       // class: "llama_context",       method: "set_embeddings"
+    llama_n_threads();            // class: "llama_context",       method: "n_threads"
+    llama_adapter_lora_free();    // class: "llama_adapter_lora",  method: "free"
+    ```
+
+    - The `get` `<action>` can be omitted
+    - The `<noun>` can be omitted if not necessary
+    - The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
+    - Use `init`/`free` for constructor/destructor `<action>`
+
+- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
+
+    ```cpp
+    typedef struct llama_context * llama_context_t;
+
+    enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
+    ```
+
+    _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
+
+- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
+- Python filenames are all lowercase with underscores
+
+- _(TODO: abbreviations usage)_
+
+# Preprocessor directives
+
+- _(TODO: add guidelines with examples and apply them to the codebase)_
+
+    ```cpp
+    #ifdef FOO
+    #endif // FOO
+    ```
+
+# Documentation
+
+- Documentation is a community effort
+- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
+- When you notice incorrect or outdated documentation, please update it
+
+# Resources
+
+The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
+
+https://github.com/ggerganov/llama.cpp/projects
diff --git a/LICENSE b/LICENSE
index 76f67efdc..acb96ce78 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/Makefile b/Makefile
index 4f26c0463..dc3de3cb1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,161 @@
+ifndef LLAMA_MAKEFILE
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+endif
+
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	libllava.a \
+	llama-batched \
+	llama-batched-bench \
+	llama-bench \
+	llama-cli \
+	llama-convert-llama2c-to-ggml \
+	llama-embedding \
+	llama-eval-callback \
+	llama-export-lora \
+	llama-gbnf-validator \
+	llama-gguf \
+	llama-gguf-hash \
+	llama-gguf-split \
+	llama-gritlm \
+	llama-imatrix \
+	llama-infill \
+	llama-llava-cli \
+	llama-minicpmv-cli\
+	llama-qwen2vl-cli\
+	llama-lookahead \
+	llama-lookup \
+	llama-lookup-create \
+	llama-lookup-merge \
+	llama-lookup-stats \
+	llama-parallel \
+	llama-passkey \
+	llama-perplexity \
+	llama-q8dot \
+	llama-quantize \
+	llama-quantize-stats \
+	llama-retrieval \
+	llama-save-load-state \
+	llama-server \
+	llama-simple \
+	llama-simple-chat \
+	llama-run \
+	llama-speculative \
+	llama-tokenize \
+	llama-vdot \
+	llama-cvector-generator \
+	llama-gen-docs \
+	tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-arg-parser \
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-chat \
+	tests/test-chat-template \
+	tests/test-double-float \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-log \
+	tests/test-model-load-cancel \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
+#	tests/test-opt \
 
-# Code coverage output files
-COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+	retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
+
+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+#  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
+
+# Deprecation aliases
+ifdef LLAMA_CUBLAS
+$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
+endif
+
+ifdef LLAMA_CUDA
+GGML_CUDA := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_KOMPUTE
+GGML_KOMPUTE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_METAL
+GGML_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_RPC
+GGML_RPC := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL
+GGML_SYCL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL_F16
+GGML_SYCL_F16 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS
+GGML_OPENBLAS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS64
+GGML_OPENBLAS64 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_BLIS
+GGML_BLIS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_LLAMAFILE
+GGML_NO_LLAMAFILE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_ACCELERATE
+GGML_NO_ACCELERATE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_OPENMP
+GGML_NO_OPENMP := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_METAL
+GGML_NO_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -26,13 +169,26 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++.  Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
-	ifndef LLAMA_NO_METAL
-		LLAMA_METAL := 1
+	ifndef GGML_NO_METAL
+		GGML_METAL := 1
 	endif
 
+	GGML_NO_OPENMP := 1
+
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)
@@ -43,16 +199,33 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 
-default: $(BUILD_TARGETS)
+ifdef GGML_METAL
+	GGML_METAL_EMBED_LIBRARY := 1
+endif
+
+ifdef GGML_RPC
+	BUILD_TARGETS += rpc-server
+endif
+
+ifdef GGML_VULKAN
+	BUILD_TARGETS += vulkan-shaders-gen
+endif
+
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -73,19 +246,7 @@ test: $(TEST_TARGETS)
 	fi
 	@echo 'All tests passed.'
 
-all: $(BUILD_TARGETS) $(TEST_TARGETS)
-
-coverage: ## Run code coverage
-	gcov -pb tests/*.cpp
-
-lcov-report: coverage ## Generate lcov report
-	mkdir -p lcov-report
-	lcov --capture --directory . --output-file lcov-report/coverage.info
-	genhtml lcov-report/coverage.info --output-directory lcov-report
-
-gcovr-report: coverage ## Generate gcovr report
-	mkdir -p gcovr-report
-	gcovr --root . --html --html-details --output gcovr-report/coverage.html
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
@@ -96,34 +257,28 @@ endif
 # Compile flags
 #
 
-# keep standard at C11 and C++11
-MK_CPPFLAGS  = -I. -Icommon
+# keep standard at C11 and C++17
+MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS  = -std=c++11 -fPIC
-MK_NVCCFLAGS = -std=c++11
+MK_CXXFLAGS  = -std=c++17 -fPIC
+MK_NVCCFLAGS = -std=c++17
 
-# -Ofast tends to produce faster code, but may not be available for some compilers.
-ifdef LLAMA_FAST
-MK_CFLAGS     += -Ofast
-HOST_CXXFLAGS += -Ofast
-MK_NVCCFLAGS  += -O3
-else
-MK_CFLAGS     += -O3
-MK_CXXFLAGS   += -O3
-MK_NVCCFLAGS  += -O3
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
 endif
 
-ifndef LLAMA_NO_CCACHE
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC    := $(CCACHE) $(CC)
 CXX   := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE
 
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -142,6 +297,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	MK_CPPFLAGS += -D_GNU_SOURCE
+	MK_LDFLAGS  += -ldl
 endif
 
 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@@ -167,16 +323,24 @@ ifeq ($(UNAME_S),OpenBSD)
 	MK_CPPFLAGS += -D_BSD_SOURCE
 endif
 
+ifdef GGML_SCHED_MAX_COPIES
+	MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
-	MK_CFLAGS   += -O0 -g
-	MK_CXXFLAGS += -O0 -g
-	MK_LDFLAGS  += -g
+	MK_CFLAGS    += -O0 -g
+	MK_CXXFLAGS  += -O0 -g
+	MK_LDFLAGS   += -g
+	MK_NVCCFLAGS += -O0 -g
 
 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
-	MK_CPPFLAGS += -DNDEBUG
+	MK_CPPFLAGS   += -DNDEBUG
+	MK_CFLAGS     += -O3 -g
+	MK_CXXFLAGS   += -O3 -g
+	MK_NVCCFLAGS  += -O3 -g
 endif
 
 ifdef LLAMA_SANITIZE_THREAD
@@ -197,24 +361,36 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 	MK_LDFLAGS  += -fsanitize=undefined -g
 endif
 
-ifdef LLAMA_SERVER_VERBOSE
-	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+ifdef LLAMA_SERVER_SSL
+	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+	MK_LDFLAGS += -lssl -lcrypto
 endif
 
-
-ifdef LLAMA_CODE_COVERAGE
-	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
 endif
 
-ifdef LLAMA_DISABLE_LOGS
-	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
-
 # warnings
-WARN_FLAGS    = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
-				-Werror=implicit-function-declaration
-MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
+WARN_FLAGS = \
+	-Wall \
+	-Wextra \
+	-Wpedantic \
+	-Wcast-qual \
+	-Wno-unused-function
+
+MK_CFLAGS += \
+	$(WARN_FLAGS) \
+	-Wshadow \
+	-Wstrict-prototypes \
+	-Wpointer-arith \
+	-Wmissing-prototypes \
+	-Werror=implicit-int \
+	-Werror=implicit-function-declaration
+
+MK_CXXFLAGS += \
+	$(WARN_FLAGS) \
+	-Wmissing-declarations \
+	-Wmissing-noreturn
 
 ifeq ($(LLAMA_FATAL_WARNINGS),1)
 	MK_CFLAGS   += -Werror
@@ -259,21 +435,22 @@ ifdef LLAMA_GPROF
 	MK_CFLAGS   += -pg
 	MK_CXXFLAGS += -pg
 endif
-ifdef LLAMA_PERF
-	MK_CPPFLAGS += -DGGML_PERF
-endif
 
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 
-ifndef RISCV
+ifndef RISCV_CROSS_COMPILE
 
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
 	MK_CFLAGS     += -march=native -mtune=native
 	HOST_CXXFLAGS += -march=native -mtune=native
 
+	# Usage AMX build test
+	#MK_CFLAGS     += -march=graniterapids -mtune=graniterapids
+	#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
+
 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
 	#MK_CXXFLAGS += -mfma -mf16c -mavx
@@ -342,231 +519,488 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
 	CUDA_POWER_ARCH = 1
 endif
 
-else
+ifneq ($(filter loongarch64%,$(UNAME_M)),)
+	MK_CFLAGS   += -mlasx
+	MK_CXXFLAGS += -mlasx
+endif
+
+ifneq ($(filter riscv64%,$(UNAME_M)),)
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifdef LLAMA_QKK_64
-	MK_CPPFLAGS += -DGGML_QKK_64
+else # RISC-V CROSS COMPILATION
+	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifndef LLAMA_NO_ACCELERATE
+ifndef GGML_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
-		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
-		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
-		MK_LDFLAGS  += -framework Accelerate
+		MK_CPPFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+		MK_CPPFLAGS  += -DACCELERATE_NEW_LAPACK
+		MK_CPPFLAGS  += -DACCELERATE_LAPACK_ILP64
+		MK_LDFLAGS   += -framework Accelerate
+		OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 	endif
-endif # LLAMA_NO_ACCELERATE
+endif # GGML_NO_ACCELERATE
 
-ifdef LLAMA_MPI
-	MK_CPPFLAGS += -DGGML_USE_MPI
-	MK_CFLAGS   += -Wno-cast-qual
-	MK_CXXFLAGS += -Wno-cast-qual
-	OBJS        += ggml-mpi.o
-endif # LLAMA_MPI
+ifndef GGML_NO_OPENMP
+	MK_CPPFLAGS += -DGGML_USE_OPENMP
+	MK_CFLAGS   += -fopenmp
+	MK_CXXFLAGS += -fopenmp
+endif # GGML_NO_OPENMP
 
-ifdef LLAMA_OPENBLAS
-	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
-	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-endif # LLAMA_OPENBLAS
+ifdef GGML_OPENBLAS
+	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
+	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas)
+	MK_LDFLAGS   += $(shell pkg-config --libs openblas)
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_OPENBLAS
 
-ifdef LLAMA_BLIS
-	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-	MK_LDFLAGS  += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS
+ifdef GGML_OPENBLAS64
+	MK_CPPFLAGS  += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas64)
+	MK_LDFLAGS   += $(shell pkg-config --libs openblas64)
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_OPENBLAS64
 
-ifdef LLAMA_CUBLAS
+ifdef GGML_BLIS
+	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
+	MK_LDFLAGS   += -lblis -L/usr/local/lib
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_BLIS
+
+ifdef GGML_NVPL
+	MK_CPPFLAGS  += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+	MK_LDFLAGS   += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+	OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
+endif # GGML_NVPL
+
+ifndef GGML_NO_LLAMAFILE
+	MK_CPPFLAGS  += -DGGML_USE_LLAMAFILE
+	OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
+endif
+
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o
+endif
+
+# only necessary for the CPU backend files
+MK_CPPFLAGS += -Iggml/src/ggml-cpu
+
+ifdef GGML_RPC
+	MK_CPPFLAGS  += -DGGML_USE_RPC
+	OBJ_GGML_EXT += ggml/src/ggml-rpc.o
+endif # GGML_RPC
+
+OBJ_CUDA_TMPL      = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-mma*.cu))
+OBJ_CUDA_TMPL     += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu))
+else
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+	OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+ifdef GGML_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
-	OBJS         += ggml-cuda.o
+
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	MK_NVCCFLAGS += -use_fast_math
+
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
+
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
-ifdef LLAMA_CUDA_NVCC
-	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
+
+ifdef GGML_CUDA_DEBUG
+	MK_NVCCFLAGS += --device-debug
+endif # GGML_CUDA_DEBUG
+
+ifdef GGML_CUDA_NVCC
+	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
 	NVCC = $(CCACHE) nvcc
-endif #LLAMA_CUDA_NVCC
+endif # GGML_CUDA_NVCC
+
 ifdef CUDA_DOCKER_ARCH
 	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else ifndef CUDA_POWER_ARCH
 	MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
-ifdef LLAMA_CUDA_FORCE_DMMV
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_MMQ
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # LLAMA_CUDA_FORCE_MMQ
-ifdef LLAMA_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # LLAMA_CUDA_DMMV_X
-ifdef LLAMA_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-else ifdef LLAMA_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # LLAMA_CUDA_MMV_Y
-ifdef LLAMA_CUDA_F16
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_F16
-ifdef LLAMA_CUDA_DMMV_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_DMMV_F16
-ifdef LLAMA_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-else
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
-endif
-ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
-endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
-ifdef LLAMA_CUDA_CCBIN
-	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_CCBIN
+	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
+endif # GGML_CUDA_CCBIN
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-endif # LLAMA_CUBLAS
 
-ifdef LLAMA_CLBLAST
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
 
-	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
-	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+endif # GGML_CUDA
 
-	# Mac provides OpenCL as a framework
-	ifeq ($(UNAME_S),Darwin)
-		MK_LDFLAGS += -lclblast -framework OpenCL
-	else
-		MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
-	endif
-	OBJS    += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
-ifdef LLAMA_VULKAN
+ifdef GGML_VULKAN
 	MK_CPPFLAGS  += -DGGML_USE_VULKAN
-	MK_LDFLAGS += -lvulkan
-	OBJS    += ggml-vulkan.o
+	MK_LDFLAGS   += $(shell pkg-config --libs vulkan)
+	OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
 
-ifdef LLAMA_VULKAN_CHECK_RESULTS
+ifdef GGML_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
 endif
 
-ifdef LLAMA_VULKAN_DEBUG
+ifdef GGML_VULKAN_DEBUG
 	MK_CPPFLAGS  += -DGGML_VULKAN_DEBUG
 endif
 
-ifdef LLAMA_VULKAN_VALIDATE
+ifdef GGML_VULKAN_MEMORY_DEBUG
+	MK_CPPFLAGS  += -DGGML_VULKAN_MEMORY_DEBUG
+endif
+
+ifdef GGML_VULKAN_PERF
+	MK_CPPFLAGS  += -DGGML_VULKAN_PERF
+endif
+
+ifdef GGML_VULKAN_VALIDATE
 	MK_CPPFLAGS  += -DGGML_VULKAN_VALIDATE
 endif
 
-ifdef LLAMA_VULKAN_RUN_TESTS
+ifdef GGML_VULKAN_RUN_TESTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_RUN_TESTS
 endif
 
-ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_VULKAN
+GLSLC_CMD  = glslc
+_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
+_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
+_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
+_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
+_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
 
-ifdef LLAMA_HIPBLAS
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
 
+$(_ggml_vk_header): $(_ggml_vk_source)
+
+$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
+	$(_ggml_vk_genshaders_cmd) \
+		--glslc      $(GLSLC_CMD) \
+		--input-dir  $(_ggml_vk_input_dir) \
+		--target-hpp $(_ggml_vk_header) \
+		--target-cpp $(_ggml_vk_source)
+
+vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+
+endif # GGML_VULKAN
+
+ifdef GGML_HIP
 	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH	?= /usr
-		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+		ROCM_PATH      ?= /usr
+		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
-		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
-	HIPCC                   ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
-	LLAMA_CUDA_DMMV_X       ?= 32
-	LLAMA_CUDA_MMV_Y        ?= 1
-	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-ifdef LLAMA_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # LLAMA_HIP_UMA
-	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
-	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-ifdef LLAMA_CUDA_FORCE_DMMV
-	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
-	OBJS        += ggml-cuda.o
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # LLAMA_HIPBLAS
 
-ifdef LLAMA_METAL
-	MK_CPPFLAGS += -DGGML_USE_METAL
-	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJS		+= ggml-metal.o
-ifdef LLAMA_METAL_NDEBUG
+	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
+
+ifdef GGML_HIP_UMA
+	MK_CPPFLAGS += -DGGML_HIP_UMA
+endif # GGML_HIP_UMA
+
+	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
+	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+
+	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
+
+	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
+
+ifdef GGML_CUDA_FORCE_MMQ
+	HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+	HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # GGML_HIP
+
+ifdef GGML_MUSA
+	ifeq ($(wildcard /opt/musa),)
+		MUSA_PATH ?= /usr/local/musa
+	else
+		MUSA_PATH ?= /opt/musa
+	endif
+	MUSA_ARCHITECTURES ?= 21;22
+
+	MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
+	MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
+	MK_LDFLAGS += -lmusa -lmusart -lmublas
+
+	ifndef GGML_NO_OPENMP
+		# For Ubuntu Focal
+		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+		# For Ubuntu Jammy
+		MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
+		MK_LDFLAGS  += -L/usr/lib/llvm-14/lib
+	endif # GGML_NO_OPENMP
+
+	CC  := $(MUSA_PATH)/bin/clang
+	CXX := $(MUSA_PATH)/bin/clang++
+	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
+
+	MUSAFLAGS  = -x musa -mtgpu
+	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
+
+ifdef GGML_CUDA_FORCE_MMQ
+	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+	MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_F16
+	MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+	MUSAFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+	OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
+	OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+	OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda/ggml-cuda.o: \
+	ggml/src/ggml-cuda/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $<
+endif # GGML_MUSA
+
+ifdef GGML_METAL
+	MK_CPPFLAGS  += -DGGML_USE_METAL
+	MK_LDFLAGS   += -framework Foundation -framework Metal -framework MetalKit
+	OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
+
+ifdef GGML_METAL_USE_BF16
+	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
+endif # GGML_METAL_USE_BF16
+ifdef GGML_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
-ifdef LLAMA_METAL_EMBED_LIBRARY
-	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-	OBJS        += ggml-metal-embed.o
+ifdef GGML_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS  += -DGGML_METAL_EMBED_LIBRARY
+	OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
 endif
-endif # LLAMA_METAL
+endif # GGML_METAL
 
-ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ifdef GGML_METAL
+ggml/src/ggml-metal/ggml-metal.o: \
+	ggml/src/ggml-metal/ggml-metal.m \
+	ggml/src/ggml-metal/ggml-metal-impl.h \
+	ggml/include/ggml-metal.h \
+	ggml/include/ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ifdef GGML_METAL_EMBED_LIBRARY
+ggml/src/ggml-metal-embed.o: \
+	ggml/src/ggml-metal/ggml-metal.metal \
+	ggml/src/ggml-metal/ggml-metal-impl.h \
+	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f ${TEMP_ASSEMBLY}
+	@sed -e '/__embed_ggml-common.h__/r      ggml/src/ggml-common.h'                -e '/__embed_ggml-common.h__/d'      < ggml/src/ggml-metal/ggml-metal.metal           > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
+	@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
+	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
+	@echo ".section __DATA, __ggml_metallib"                       >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start"                            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:"                                  >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end"                              >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:"                                    >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
+	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
+	@rmdir ${TEMP_ASSEMBLY}
 endif
-endif # LLAMA_METAL
+endif # GGML_METAL
 
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
+DIR_GGML = ggml
+DIR_LLAMA = src
+DIR_COMMON = common
+
+OBJ_GGML = \
+	$(DIR_GGML)/src/ggml.o \
+	$(DIR_GGML)/src/ggml-alloc.o \
+	$(DIR_GGML)/src/ggml-backend.o \
+	$(DIR_GGML)/src/ggml-backend-reg.o \
+	$(DIR_GGML)/src/ggml-opt.o \
+	$(DIR_GGML)/src/ggml-quants.o \
+	$(DIR_GGML)/src/ggml-threading.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
+	$(OBJ_GGML_EXT)
+
+OBJ_LLAMA = \
+	$(DIR_LLAMA)/llama.o \
+	$(DIR_LLAMA)/llama-vocab.o \
+	$(DIR_LLAMA)/llama-grammar.o \
+	$(DIR_LLAMA)/llama-sampling.o \
+	$(DIR_LLAMA)/unicode.o \
+	$(DIR_LLAMA)/unicode-data.o
+
+OBJ_COMMON = \
+	$(DIR_COMMON)/common.o \
+	$(DIR_COMMON)/arg.o \
+	$(DIR_COMMON)/log.o \
+	$(DIR_COMMON)/console.o \
+	$(DIR_COMMON)/ngram-cache.o \
+	$(DIR_COMMON)/sampling.o \
+	$(DIR_COMMON)/speculative.o \
+	$(DIR_COMMON)/chat.o \
+	$(DIR_COMMON)/build-info.o \
+	$(DIR_COMMON)/json-schema-to-grammar.o
+
+OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
+
+LIB_GGML   = $(LIB_PRE)ggml$(DSO_EXT)
+LIB_GGML_S = $(LIB_PRE)ggml.a
+
+LIB_LLAMA   = $(LIB_PRE)llama$(DSO_EXT)
+LIB_LLAMA_S = $(LIB_PRE)llama.a
+
+LIB_COMMON   = $(LIB_PRE)common$(DSO_EXT)
+LIB_COMMON_S = $(LIB_PRE)common.a
+
+LIB_ALL   = $(LIB_GGML)   $(LIB_LLAMA)   $(LIB_COMMON)
+LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
 
 GF_CC := $(CC)
 include scripts/get-flags.mk
@@ -580,12 +1014,17 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef GGML_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS  := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -600,67 +1039,126 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
 $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef GGML_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
+
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # GGML_CUDA
 $(info )
 
+ifdef DEPRECATE_WARNING
+$(info !!! DEPRECATION WARNING !!!)
+$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
+$(info   - LLAMA_CUDA)
+$(info   - LLAMA_METAL)
+$(info   - LLAMA_METAL_EMBED_LIBRARY)
+$(info   - LLAMA_OPENMP)
+$(info   - LLAMA_RPC)
+$(info   - LLAMA_SYCL)
+$(info   - LLAMA_SYCL_F16)
+$(info   - LLAMA_OPENBLAS)
+$(info   - LLAMA_OPENBLAS64)
+$(info   - LLAMA_BLIS)
+$(info   - LLAMA_NO_LLAMAFILE)
+$(info   - LLAMA_NO_ACCELERATE)
+$(info   - LLAMA_NO_OPENMP)
+$(info   - LLAMA_NO_METAL)
+$(info   - LLAMA_NO_CCACHE)
+$(info )
+endif
+
+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info )
+endif
+
 #
-# Build library
+# Build libraries
 #
 
-ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+# Libraries
+LIB_GGML   = libggml.so
+LIB_GGML_S = libggml.a
 
-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+LIB_LLAMA   = libllama.so
+LIB_LLAMA_S = libllama.a
 
-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+LIB_COMMON   = libcommon.so
+LIB_COMMON_S = libcommon.a
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC) $(CFLAGS)    -c $< -o $@
+# Targets
+BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+# Dependency files
+DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+# Default target
+all: $(BUILD_TARGETS)
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o
+# force c++ build for source file that have same name as c file
+# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
-common.o: common/common.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+# Rules for building object files
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
+	$(CC) $(CFLAGS) -MMD -c $< -o $@
 
-sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
-console.o: common/console.cpp common/console.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
-grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
-train.o: common/train.cpp common/train.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-libllama.so: llama.o ggml.o $(OBJS)
+# Rules for building libraries
+$(LIB_GGML): $(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
-	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+$(LIB_GGML_S): $(OBJ_GGML)
+	ar rcs $(LIB_GGML_S) $^
 
-clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	find examples pocs -type f -name "*.o" -delete
+$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): $(OBJ_LLAMA)
+	ar rcs $(LIB_LLAMA_S) $^
+
+$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_COMMON_S): $(OBJ_COMMON)
+	ar rcs $(LIB_COMMON_S) $^
+
+# Include dependency files
+-include $(DEP_FILES)
+
+# Clean generated server assets
+clean-server-assets:
+	find examples/server -type f -name "*.js.hpp"   -delete
+	find examples/server -type f -name "*.mjs.hpp"  -delete
+	find examples/server -type f -name "*.css.hpp"  -delete
+	find examples/server -type f -name "*.html.hpp" -delete
+
+# Clean rule
+clean: clean-server-assets
+	rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -rvf *.a *.dll *.so *.dot
+	find ggml src common tests examples pocs -type f -name "*.o" -delete
+	find ggml src common tests examples pocs -type f -name "*.d" -delete
 
 #
 # Examples
@@ -673,122 +1171,253 @@ clean:
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
 
-main: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-cli: examples/main/main.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo
-	@echo '====  Run ./main -h for help.  ===='
+	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo
 
-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-infill: examples/infill/infill.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tokenize: examples/tokenize/tokenize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-simple: examples/simple/simple.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-simple-chat: examples/simple-chat/simple-chat.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
+llama-tokenize: examples/tokenize/tokenize.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
+llama-batched: examples/batched/batched.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.o llama.o $(OBJS)
+llama-batched-bench: examples/batched-bench/batched-bench.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-quantize: examples/quantize/quantize.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-imatrix: examples/imatrix/imatrix.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-perplexity: examples/perplexity/perplexity.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-imatrix: examples/imatrix/imatrix.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
-
-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
+llama-embedding: examples/embedding/embedding.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-gritlm: examples/gritlm/gritlm.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
+llama-save-load-state: examples/save-load-state/save-load-state.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gguf: examples/gguf/gguf.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+examples/gguf-hash/deps/sha1/sha1.o: \
+	examples/gguf-hash/deps/sha1/sha1.c
+	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/xxhash/xxhash.o: \
+	examples/gguf-hash/deps/xxhash/xxhash.c
+	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+examples/gguf-hash/deps/sha256/sha256.o: \
+	examples/gguf-hash/deps/sha256/sha256.c
+	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
+
+llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gguf-split: examples/gguf-split/gguf-split.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-eval-callback: examples/eval-callback/eval-callback.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-export-lora: examples/export-lora/export-lora.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-retrieval: examples/retrieval/retrieval.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-speculative: examples/speculative/speculative.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-parallel: examples/parallel/parallel.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookahead: examples/lookahead/lookahead.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup: examples/lookup/lookup.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-create: examples/lookup/lookup-create.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-merge: examples/lookup/lookup-merge.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-stats: examples/lookup/lookup-stats.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-passkey: examples/passkey/passkey.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+ifdef GGML_RPC
+rpc-server: examples/rpc/rpc-server.cpp \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # GGML_RPC
+
+llama-server: \
+	examples/server/server.cpp \
+	examples/server/utils.hpp \
+	examples/server/httplib.h \
+	examples/server/index.html.hpp \
+	examples/server/loading.html.hpp \
+	common/chat.cpp \
+	common/chat.hpp \
+	common/chat-template.hpp \
+	common/json.hpp \
+	common/minja.hpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
+
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% FORCE Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
+llama-gen-docs: examples/gen-docs/gen-docs.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+libllava.a: examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	common/stb_image.h \
+	common/base64.hpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
 
-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
+llama-llava-cli: examples/llava/llava-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
@@ -803,7 +1432,7 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 		rm $@.tmp; \
 	fi
 
-build-info.o: common/build-info.cpp
+common/build-info.o: common/build-info.cpp
 	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
 
 #
@@ -812,90 +1441,165 @@ build-info.o: common/build-info.cpp
 
 tests: $(TEST_TARGETS)
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
+tests/test-arg-parser: tests/test-arg-parser.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-run-benchmark-matmult: benchmark-matmult
-	./$@
-
-.PHONY: run-benchmark-matmult swift
-
-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-log: tests/test-log.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat: tests/test-chat.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
+tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-c.o: tests/test-c.c llama.h
+tests/test-c.o: tests/test-c.c include/llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
 
-tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
+tests/test-backend-ops: tests/test-backend-ops.cpp \
+	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-chat-template: tests/test-chat-template.cpp \
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# PoCs
+#
+
+llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: FORCE main quantize perplexity embedding server
+
+# Define the object file target
+examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
+#  Eventually we will want to remove these target from building all the time.
+main: examples/deprecation-warning/deprecation-warning.o
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+
+server: examples/deprecation-warning/deprecation-warning.o
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
+
+quantize: examples/deprecation-warning/deprecation-warning.o
+ifneq (,$(wildcard quantize))
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+	@echo "  Remove the 'quantize' binary to remove this warning."
+	@echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.o
+ifneq (,$(wildcard perplexity))
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+	@echo "  Remove the 'perplexity' binary to remove this warning."
+	@echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.o
+ifneq (,$(wildcard embedding))
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+	@echo "  Remove the 'embedding' binary to remove this warning."
+	@echo "#########"
+endif
diff --git a/Package.swift b/Package.swift
index b24c9204a..01c996d24 100644
--- a/Package.swift
+++ b/Package.swift
@@ -14,47 +14,6 @@ let package = Package(
         .library(name: "llama", targets: ["llama"]),
     ],
     targets: [
-        .target(
-            name: "llama",
-            path: ".",
-            exclude: [
-               "cmake",
-               "examples",
-               "scripts",
-               "models",
-               "tests",
-               "CMakeLists.txt",
-               "ggml-cuda.cu",
-               "ggml-cuda.h",
-               "Makefile"
-            ],
-            sources: [
-                "ggml.c",
-                "llama.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m",
-            ],
-            resources: [
-                .process("ggml-metal.metal")
-            ],
-            publicHeadersPath: "spm-headers",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
-        )
-    ],
-    cxxLanguageStandard: .cxx11
+        .systemLibrary(name: "llama", pkgConfig: "llama"),
+    ]
 )
diff --git a/README-sycl.md b/README-sycl.md
deleted file mode 100644
index dd5bf9dea..000000000
--- a/README-sycl.md
+++ /dev/null
@@ -1,494 +0,0 @@
-# llama.cpp for SYCL
-
-- [Background](#background)
-- [OS](#os)
-- [Intel GPU](#intel-gpu)
-- [Docker](#docker)
-- [Linux](#linux)
-- [Windows](#windows)
-- [Environment Variable](#environment-variable)
-- [Known Issue](#known-issue)
-- [Q&A](#q&a)
-- [Todo](#todo)
-
-## Background
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
-
-oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
-
-Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
-
-To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
-
-The llama.cpp for SYCL is used to support Intel GPUs.
-
-For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
-
-## OS
-
-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
-
-
-## Intel GPU
-
-### Verified
-
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
-
-Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
-
-### Memory
-
-The memory is a limitation to run LLM on GPUs.
-
-When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors:            buffer size =  3577.56 MiB`.
-
-For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
-
-For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
-
-## Docker
-
-Note:
-- Only docker on Linux is tested. Docker on WSL may not work.
-- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
-
-### Build the image
-
-You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
-
-
-```sh
-# For F16:
-#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
-
-# Or, for F32:
-docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
-
-# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
-```
-
-### Run
-
-```sh
-# Firstly, find all the DRI cards:
-ls -la /dev/dri
-# Then, pick the card that you want to use.
-
-# For example with "/dev/dri/card1"
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-
-## Linux
-
-### Setup Environment
-
-1. Install Intel GPU driver.
-
-a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
-
-Note: for iGPU, please install the client GPU driver.
-
-b. Add user to group: video, render.
-
-```sh
-sudo usermod -aG render username
-sudo usermod -aG video username
-```
-
-Note: re-login to enable it.
-
-c. Check
-
-```sh
-sudo apt install clinfo
-sudo clinfo -l
-```
-
-Output (example):
-
-```
-Platform #0: Intel(R) OpenCL Graphics
- `-- Device #0: Intel(R) Arc(TM) A770 Graphics
-
-
-Platform #0: Intel(R) OpenCL HD Graphics
- `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
-```
-
-2. Install Intel® oneAPI Base toolkit.
-
-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
-
-Recommend to install to default folder: **/opt/intel/oneapi**.
-
-Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
-
-b. Check
-
-```sh
-source /opt/intel/oneapi/setvars.sh
-
-sycl-ls
-```
-
-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
-
-Output (example):
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-
-```
-
-2. Build locally:
-
-Note:
-- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
-- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
-
-```sh
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-
-# For FP16:
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Or, for FP32:
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Build example/main only
-#cmake --build . --config Release --target main
-
-# Or, build all binary
-cmake --build . --config Release -v
-
-cd ..
-```
-
-or
-
-```sh
-./examples/sycl/build.sh
-```
-
-### Run
-
-1. Put model file to folder **models**
-
-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
-
-2. Enable oneAPI running environment
-
-```
-source /opt/intel/oneapi/setvars.sh
-```
-
-3. List device ID
-
-Run without parameter:
-
-```sh
-./build/bin/ls-sycl-device
-
-# or running the "main" executable and look at the output log:
-
-./build/bin/main
-```
-
-Check the ID in startup log, like:
-
-```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-
-```
-
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-
-4. Set device ID and execute llama.cpp
-
-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
-
-```sh
-GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-or run by script:
-
-```sh
-./examples/sycl/run_llama2.sh
-```
-
-Note:
-
-- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
-
-
-5. Check the device ID in output
-
-Like:
-```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
-```
-
-## Windows
-
-### Setup Environment
-
-1. Install Intel GPU driver.
-
-Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
-
-Note: **The driver is mandatory for compute function**.
-
-2. Install Visual Studio.
-
-Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
-
-3. Install Intel® oneAPI Base toolkit.
-
-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
-
-Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
-
-Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
-
-b. Enable oneAPI running environment:
-
-- In Search, input 'oneAPI'.
-
-Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
-
-- In Run:
-
-In CMD:
-```
-"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
-```
-
-c. Check GPU
-
-In oneAPI command line:
-
-```
-sycl-ls
-```
-
-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
-
-Output (example):
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO  [31.0.101.5186]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
-```
-
-4. Install cmake & make
-
-a. Download & install cmake for Windows: https://cmake.org/download/
-
-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-
-- Extract `w64devkit` on your pc.
-
-- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
-
-### Build locally:
-
-In oneAPI command line window:
-
-```
-mkdir -p build
-cd build
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-
-::  for FP16
-::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
-
-::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-
-
-::  build example/main only
-::  make main
-
-::  build all binary
-make -j
-cd ..
-```
-
-or
-
-```
-.\examples\sycl\win-build-sycl.bat
-```
-
-Note:
-
-- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
-
-### Run
-
-1. Put model file to folder **models**
-
-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
-
-2. Enable oneAPI running environment
-
-- In Search, input 'oneAPI'.
-
-Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
-
-- In Run:
-
-In CMD:
-```
-"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
-```
-
-3. List device ID
-
-Run without parameter:
-
-```
-build\bin\ls-sycl-device.exe
-
-or
-
-build\bin\main.exe
-```
-
-Check the ID in startup log, like:
-
-```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-
-```
-
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-
-4. Set device ID and execute llama.cpp
-
-Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
-
-```
-set GGML_SYCL_DEVICE=0
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
-```
-or run by script:
-
-```
-.\examples\sycl\win-run-llama2.bat
-```
-
-Note:
-
-- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
-
-
-5. Check the device ID in output
-
-Like:
-```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
-```
-
-## Environment Variable
-
-#### Build
-
-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
-|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
-|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
-|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
-
-#### Running
-
-
-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-
-## Known Issue
-
-- Hang during startup
-
-  llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
-
-  Solution: add **--no-mmap** or **--mmap 0**.
-
-## Q&A
-
-- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
-
-  Miss to enable oneAPI running environment.
-
-  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
-
-- In Windows, no result, not error.
-
-  Miss to enable oneAPI running environment.
-
-- Meet compile error.
-
-  Remove folder **build** and try again.
-
-- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
-
-  Please run **sudo sycl-ls**.
-
-  If you see it in result, please add video/render group to your ID:
-
-  ```
-  sudo usermod -aG render username
-  sudo usermod -aG video username
-  ```
-
-  Then **relogin**.
-
-  If you do not see it, please check the installation GPU steps again.
-
-## Todo
-
-- Support multiple cards.
diff --git a/README.md b/README.md
index 67717c1e3..11f3d0286 100644
--- a/README.md
+++ b/README.md
@@ -3,95 +3,68 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
 
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
-### Hot topics
+## Recent API changes
 
-- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
-- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
-- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
-- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
+
+## Hot topics
+
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
+- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
+- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
+- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
+- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
+- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----
 
-<details>
-  <summary>Table of Contents</summary>
-  <ol>
-    <li>
-      <a href="#description">Description</a>
-    </li>
-    <li>
-      <a href="#usage">Usage</a>
-      <ul>
-        <li><a href="#get-the-code">Get the Code</a></li>
-        <li><a href="#build">Build</a></li>
-        <li><a href="#blas-build">BLAS Build</a></li>
-        <li><a href="#prepare-and-quantize">Prepare and Quantize</a></li>
-        <li><a href="#run-the-quantized-model">Run the quantized model</a></li>
-        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
-        <li><a href="#quantization">Quantization</a></li>
-        <li><a href="#interactive-mode">Interactive mode</a></li>
-        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
-        <li><a href="#instruct-mode">Instruct mode</a></li>
-        <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
-        <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
-        <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
-        <li><a href="#android">Android</a></li>
-        <li><a href="#docker">Docker</a></li>
-      </ul>
-    </li>
-    <li><a href="#contributing">Contributing</a></li>
-    <li><a href="#coding-guidelines">Coding guidelines</a></li>
-    <li><a href="#docs">Docs</a></li>
-  </ol>
-</details>
-
 ## Description
 
 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-variety of hardware - locally and in the cloud.
+range of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan, SYCL, and (partial) OpenCL backend support
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
-Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
-improved significantly thanks to many contributions. It is the main playground for developing new features for the
-[ggml](https://github.com/ggerganov/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
-**Supported platforms:**
-
-- [X] Mac OS
-- [X] Linux
-- [X] Windows (via CMake)
-- [X] Docker
-- [X] FreeBSD
-
-**Supported models:**
+<details>
+<summary>Models</summary>
 
 Typically finetunes of the base models below are supported as well.
 
+Instructions for adding support for new models: [HOWTO-add-model.md](docs/development/HOWTO-add-model.md)
+
+#### Text-only
+
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
-- [X] Falcon
+- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@@ -100,13 +73,40 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
+- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
+- [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
+- [x] [Xverse](https://huggingface.co/models?search=xverse)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
+- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
+- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+- [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMo 2](https://allenai.org/olmo)
+- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
+- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
+- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
+- [x] [Smaug](https://huggingface.co/models?search=Smaug)
+- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
+- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
+- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
+- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
+- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
+- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
+- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
+- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
+- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
+- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
+- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 
-**Multimodal models:**
+#### Multimodal
 
 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
@@ -114,768 +114,399 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
+- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
 
-**HTTP server**
+</details>
 
-[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
-
-**Bindings:**
+<details>
+<summary>Bindings</summary>
 
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
+- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
+- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
+- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
+- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
+- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
-**UI:**
+</details>
 
-Unless otherwise noted these projects are open-source with permissive licensing:
+<details>
+<summary>UIs</summary>
 
-- [iohub/collama](https://github.com/iohub/coLLaMA)
+*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
+
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
-- [nat/openplayground](https://github.com/nat/openplayground)
-- [Faraday](https://faraday.dev/) (proprietary)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
+- [LARS](https://github.com/abgulati/LARS) (AGPL)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
-- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
-- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
-- [ollama/ollama](https://github.com/ollama/ollama)
-- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
-- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
-- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
-- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
-- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
-- [semperai/amica](https://github.com/semperai/amica)
-- [withcatai/catai](https://github.com/withcatai/catai)
+- [MindMac](https://mindmac.app) (proprietary)
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
-- [Msty](https://msty.app) (proprietary)
-- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0)
+- [nat/openplayground](https://github.com/nat/openplayground) (MIT)
+- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT)
+- [ollama/ollama](https://github.com/ollama/ollama) (MIT)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
+- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT)
+- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
+- [semperai/amica](https://github.com/semperai/amica) (MIT)
+- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
+- [Autopen](https://github.com/blackhole89/autopen) (GPL)
 
----
+</details>
 
-Here is a typical run using LLaMA v2 13B on M2 Ultra:
+<details>
+<summary>Tools</summary>
 
-```
-$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
-I llama.cpp build info:
-I UNAME_S:  Darwin
-I UNAME_P:  arm
-I UNAME_M:  arm64
-I CFLAGS:   -I.            -O3 -std=c11   -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
-I LDFLAGS:   -framework Accelerate
-I CC:       Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-I CXX:      Apple clang version 14.0.3 (clang-1403.0.22.14.1)
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
+- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
 
-make: Nothing to be done for `default'.
-main: build = 1041 (cf658ad)
-main: seed  = 1692823051
-llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
-llama_model_loader: - type  f32:   81 tensors
-llama_model_loader: - type q4_0:  281 tensors
-llama_model_loader: - type q6_K:    1 tensors
-llm_load_print_meta: format         = GGUF V1 (latest)
-llm_load_print_meta: arch           = llama
-llm_load_print_meta: vocab type     = SPM
-llm_load_print_meta: n_vocab        = 32000
-llm_load_print_meta: n_merges       = 0
-llm_load_print_meta: n_ctx_train    = 4096
-llm_load_print_meta: n_ctx          = 512
-llm_load_print_meta: n_embd         = 5120
-llm_load_print_meta: n_head         = 40
-llm_load_print_meta: n_head_kv      = 40
-llm_load_print_meta: n_layer        = 40
-llm_load_print_meta: n_rot          = 128
-llm_load_print_meta: n_gqa          = 1
-llm_load_print_meta: f_norm_eps     = 1.0e-05
-llm_load_print_meta: f_norm_rms_eps = 1.0e-05
-llm_load_print_meta: n_ff           = 13824
-llm_load_print_meta: freq_base      = 10000.0
-llm_load_print_meta: freq_scale     = 1
-llm_load_print_meta: model type     = 13B
-llm_load_print_meta: model ftype    = mostly Q4_0
-llm_load_print_meta: model size     = 13.02 B
-llm_load_print_meta: general.name   = LLaMA v2
-llm_load_print_meta: BOS token = 1 '<s>'
-llm_load_print_meta: EOS token = 2 '</s>'
-llm_load_print_meta: UNK token = 0 '<unk>'
-llm_load_print_meta: LF token  = 13 '<0x0A>'
-llm_load_tensors: ggml ctx size =    0.11 MB
-llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)
-...................................................................................................
-llama_new_context_with_model: kv self size  =  400.00 MB
-llama_new_context_with_model: compute buffer total size =   75.41 MB
+</details>
 
-system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
-sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
+<details>
+<summary>Infrastructure</summary>
 
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
+- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
+- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
+- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
 
- Building a website can be done in 10 simple steps:
-Step 1: Find the right website platform.
-Step 2: Choose your domain name and hosting plan.
-Step 3: Design your website layout.
-Step 4: Write your website content and add images.
-Step 5: Install security features to protect your site from hackers or spammers
-Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
-Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
-Step 8: Start marketing and promoting the website via social media channels or paid ads
-Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
-Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
-How does a Website Work?
-A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
-The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
-How to
-llama_print_timings:        load time =   576.45 ms
-llama_print_timings:      sample time =   283.10 ms /   400 runs   (    0.71 ms per token,  1412.91 tokens per second)
-llama_print_timings: prompt eval time =   599.83 ms /    19 tokens (   31.57 ms per token,    31.68 tokens per second)
-llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms per token,    16.28 tokens per second)
-llama_print_timings:       total time = 25431.49 ms
-```
+</details>
 
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
+<details>
+<summary>Games</summary>
 
-https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
 
-## Usage
+</details>
 
-Here are the end-to-end binary build and model conversion steps for most supported models.
+## Supported backends
 
-### Get the Code
+| Backend | Target devices |
+| --- | --- |
+| [Metal](docs/build.md#metal-build) | Apple Silicon |
+| [BLAS](docs/build.md#blas-build) | All |
+| [BLIS](docs/backend/BLIS.md) | All |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [CUDA](docs/build.md#cuda) | Nvidia GPU |
+| [HIP](docs/build.md#hip) | AMD GPU |
+| [Vulkan](docs/build.md#vulkan) | GPU |
+| [CANN](docs/build.md#cann) | Ascend NPU |
 
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-```
+## Building the project
 
-### Build
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
 
-In order to build llama.cpp you have three different options.
+- Clone this repository and build locally, see [how to build](docs/build.md)
+- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
+- Use a Docker image, see [documentation for Docker](docs/docker.md)
+- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
 
-- Using `make`:
-  - On Linux or MacOS:
+## Obtaining and quantizing models
 
-      ```bash
-      make
-      ```
+The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
 
-  - On Windows:
+- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
+- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
 
-    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-    2. Extract `w64devkit` on your pc.
-    3. Run `w64devkit.exe`.
-    4. Use the `cd` command to reach the `llama.cpp` folder.
-    5. From here you can run:
-        ```bash
-        make
-        ```
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
 
-- Using `CMake`:
+After downloading a model, use the CLI tools to run it locally - see below.
+
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+
+The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
+
+- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
+
+To learn more about model quantization, [read this documentation](examples/quantize/README.md)
+
+## [`llama-cli`](examples/main)
+
+#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
+
+- <details open>
+    <summary>Run in conversation mode</summary>
+
+    Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
 
     ```bash
-    mkdir build
-    cd build
-    cmake ..
-    cmake --build . --config Release
+    llama-cli -m model.gguf
+
+    # > hi, who are you?
+    # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
+    #
+    # > what is 1+1?
+    # Easy peasy! The answer to 1+1 is... 2!
     ```
 
-- Using `Zig` (version 0.11 or later):
-
-    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
-    it's also possible to cross compile for other operating systems and architectures:
-
-    ```bash
-    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
-    ```
-
-    The `zig targets` command will give you valid options to use.
-
--   Using `gmake` (FreeBSD):
-
-    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
-    2. Add your user to **video** group
-    3. Install compilation dependencies.
-
-        ```bash
-        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
-            opencl clblast openblas
-
-        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
-        ```
-
-    **Notes:** With this packages you can build llama.cpp with OPENBLAS and
-    CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
-    the instructions for use and activate this options in this document below.
-
-### Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
-
-### MPI Build
-
-MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
-
-First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
-
-Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
-
-- Using `make`:
-
-  ```bash
-  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
-  ```
-
-- Using `CMake`:
-
-  ```bash
-  cmake -S . -B build -DLLAMA_MPI=ON
-  ```
-
-Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
-
-Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
-
-Here is an example hostfile:
-
-```
-192.168.0.1:2
-malvolio.local:1
-```
-
-The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
-
-Finally, you're ready to run a computation using `mpirun`:
-
-```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
-```
-
-### BLAS Build
-
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
-
-- #### Accelerate Framework:
-
-  This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-
-- #### OpenBLAS:
-
-  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
-
-  - Using `make`:
-    - On Linux:
-      ```bash
-      make LLAMA_OPENBLAS=1
-      ```
-
-    - On Windows:
-
-      1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
-      2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
-      3. Extract `w64devkit` on your pc.
-      4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
-      5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
-      6. Run `w64devkit.exe`.
-      7. Use the `cd` command to reach the `llama.cpp` folder.
-      8. From here you can run:
-
-          ```bash
-          make LLAMA_OPENBLAS=1
-          ```
-
-  - Using `CMake` on Linux:
-
-      ```bash
-      mkdir build
-      cd build
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-      cmake --build . --config Release
-      ```
-
-- #### BLIS
-
-  Check [BLIS.md](docs/BLIS.md) for more information.
-
-- #### SYCL
-  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
-  llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
-  For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
-
-- #### Intel oneMKL
-  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
-
-  - Using manual oneAPI installation:
-    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
-      ```bash
-      mkdir build
-      cd build
-      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
-      ```
-
-  - Using oneAPI docker image:
-    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
-
-  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-
-- #### cuBLAS
-
-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
-
-  - Using `make`:
-    ```bash
-    make LLAMA_CUBLAS=1
-    ```
-  - Using `CMake`:
-
-    ```bash
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CUBLAS=ON
-    cmake --build . --config Release
-    ```
-
-  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-
-<!---
-  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
---->
-  | Option                         | Legal values           | Default | Description |
-  |--------------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_FORCE_DMMV          | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
-  | LLAMA_CUDA_F16                 | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       |     128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
-
-- #### hipBLAS
-
-  This provides BLAS acceleration on HIP-supported AMD GPUs.
-  Make sure to have ROCm installed.
-  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
-
-  - Using `make`:
-    ```bash
-    make LLAMA_HIPBLAS=1
-    ```
-  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
-    ```bash
-    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-        cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build -- -j 16
-    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
-    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
-  - Using `make` (example for target gfx1030, build with 16 CPU threads):
-    ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
-    ```
-
-  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
-    ```bash
-    set PATH=%HIP_PATH%\bin;%PATH%
-    mkdir build
-    cd build
-    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
-    cmake --build .
-    ```
-    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
-    Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
-
-
-  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
-  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
-  | Option                  | Legal values           | Default | Description |
-  |-------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-
-- #### CLBlast
-
-  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
-
-  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
-
-    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
-
-    - <details>
-        <summary>Installing the OpenCL SDK from source</summary>
-
-        ```sh
-        git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
-        mkdir OpenCL-SDK/build
-        cd OpenCL-SDK/build
-        cmake .. -DBUILD_DOCS=OFF \
-          -DBUILD_EXAMPLES=OFF \
-          -DBUILD_TESTING=OFF \
-          -DOPENCL_SDK_BUILD_SAMPLES=OFF \
-          -DOPENCL_SDK_TEST_SAMPLES=OFF
-        cmake --build . --config Release
-        cmake --install . --prefix /some/path
-        ```
-      </details>
-
-  ##### Installing CLBlast
-
-  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
-
-  Alternatively, they may be built from source.
-
-  - <details>
-    <summary>Windows:</summary>
-
-      ```cmd
-      set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
-      git clone https://github.com/CNugteren/CLBlast.git
-      mkdir CLBlast\build
-      cd CLBlast\build
-      cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
-      cmake --build . --config Release
-      cmake --install . --prefix C:/CLBlast
-      ```
-
-  - <details>
-    <summary>Unix:</summary>
-
-      ```sh
-      git clone https://github.com/CNugteren/CLBlast.git
-      mkdir CLBlast/build
-      cd CLBlast/build
-      cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
-      cmake --build . --config Release
-      cmake --install . --prefix /some/path
-      ```
-
-      Where `/some/path` is where the built library will be installed (default is `/usr/local`).
     </details>
 
-  ##### Building Llama with CLBlast
+- <details>
+    <summary>Run in conversation mode with custom chat template</summary>
 
-  - Build with make:
-    ```sh
-    make LLAMA_CLBLAST=1
-    ```
-  - CMake (Unix):
-    ```sh
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-    cmake --build . --config Release
-    ```
-  - CMake (Windows):
-    ```cmd
-    set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
-    git clone https://github.com/ggerganov/llama.cpp
-    cd llama.cpp
-    mkdir build
-    cd build
-    cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
-    cmake --build . --config Release
-    cmake --install . --prefix C:/LlamaCPP
+    ```bash
+    # use the "chatml" template (use -h to see the list of supported templates)
+    llama-cli -m model.gguf -cnv --chat-template chatml
+
+    # use a custom template
+    llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
     ```
 
-  ##### Running Llama with CLBlast
+    </details>
 
-  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+- <details>
+    <summary>Run simple text completion</summary>
 
-  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
-  The selection can be a number (starting from 0) or a text string to search:
+    To disable conversation mode explicitly, use `-no-cnv`
 
-  ```sh
-  GGML_OPENCL_PLATFORM=1 ./main ...
-  GGML_OPENCL_DEVICE=2 ./main ...
-  GGML_OPENCL_PLATFORM=Intel ./main ...
-  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
-  ```
+    ```bash
+    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
 
-  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
-  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+    ```
 
-  You can get a list of platforms and devices from the `clinfo -l` command, etc.
+    </details>
 
-- #### Vulkan
+- <details>
+    <summary>Constrain the output with a custom grammar</summary>
 
-  **With docker**:
+    ```bash
+    llama-cli -m model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
 
-  You don't need to install Vulkan SDK. It will be installed inside the container.
+    # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
+    ```
 
-  ```sh
-  # Build the image
-  docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
+    The [grammars/](grammars/) folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](grammars/README.md).
 
-  # Then, use it:
-  docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-  ```
+    For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/
 
-  **Without docker**:
+    </details>
 
-  Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
 
-  For example, on Ubuntu 22.04 (jammy), use the command below:
+## [`llama-server`](examples/server)
 
-  ```bash
-  wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
-  wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-  apt update -y
-  apt-get install -y vulkan-sdk
-  # To verify the installation, use the command below:
-  vulkaninfo
-  ```
+#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
 
-  Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
+- <details open>
+    <summary>Start a local HTTP server with default configuration on port 8080</summary>
 
-  Then, build llama.cpp using the cmake command below:
+    ```bash
+    llama-server -m model.gguf --port 8080
 
-  ```bash
-  mkdir -p build
-  cd build
-  cmake .. -DLLAMA_VULKAN=1
-  cmake --build . --config Release
-  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
-  ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+    # Basic web UI can be accessed via browser: http://localhost:8080
+    # Chat completion endpoint: http://localhost:8080/v1/chat/completions
+    ```
 
-  # You should see in the output, ggml_vulkan detected your GPU. For example:
-  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
-  ```
+    </details>
 
-### Prepare and Quantize
+- <details>
+    <summary>Support multiple-users and parallel decoding</summary>
 
-To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
+    ```bash
+    # up to 4 concurrent requests, each with 4096 max context
+    llama-server -m model.gguf -c 16384 -np 4
+    ```
 
-```bash
-# obtain the official LLaMA model weights and place them in ./models
-ls ./models
-llama-2-7b tokenizer_checklist.chk tokenizer.model
-# [Optional] for models using BPE tokenizers
-ls ./models
-<folder containing weights and tokenizer json> vocab.json
-# [Optional] for PyTorch .bin models like Mistral-7B
-ls ./models
-<folder containing weights and tokenizer json>
+    </details>
 
-# install Python dependencies
-python3 -m pip install -r requirements.txt
+- <details>
+    <summary>Enable speculative decoding</summary>
 
-# convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+    ```bash
+    # the draft.gguf model should be a small variant of the target model.gguf
+    llama-server -m model.gguf -md draft.gguf
+    ```
 
-# [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+    </details>
 
-# quantize the model to 4-bits (using Q4_K_M method)
-./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
+- <details>
+    <summary>Serve an embedding model</summary>
 
-# update the gguf filetype to current version if older version is now unsupported
-./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
-```
+    ```bash
+    # use the /embedding endpoint
+    llama-server -m model.gguf --embedding --pooling cls -ub 8192
+    ```
 
-### Run the quantized model
+    </details>
 
-```bash
-# start inference on a gguf model
-./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
-```
+- <details>
+    <summary>Serve a reranking model</summary>
 
-When running the larger models, make sure you have enough disk space to store all the intermediate files.
+    ```bash
+    # use the /reranking endpoint
+    llama-server -m model.gguf --reranking
+    ```
 
-### Running on Windows with prebuilt binaries
+    </details>
 
-You will find prebuilt Windows binaries on the release page.
+- <details>
+    <summary>Constrain all outputs with a grammar</summary>
 
-Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
+    ```bash
+    # custom grammar
+    llama-server -m model.gguf --grammar-file grammar.gbnf
 
-From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
+    # JSON
+    llama-server -m model.gguf --grammar-file grammars/json.gbnf
+    ```
 
-```
-.\main -m llama-2-7b.Q4_0.gguf -n 128
-```
+    </details>
 
-### Memory/Disk Requirements
 
-As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+## [`llama-perplexity`](examples/perplexity)
 
-| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-|    7B |         13 GB |                 3.9 GB |
-|   13B |         24 GB |                 7.8 GB |
-|   30B |         60 GB |                19.5 GB |
-|   65B |        120 GB |                38.5 GB |
+#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
 
-### Quantization
+- <details open>
+    <summary>Measure the perplexity over a text file</summary>
 
-Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
+    ```bash
+    llama-perplexity -m model.gguf -f file.txt
 
-*(outdated)*
+    # [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...
+    # Final estimate: PPL = 5.4007 +/- 0.67339
+    ```
 
-| Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
-|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
-|    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
-|    7B | ms/tok @ 4th |    127 |     55 |     54 |     76 |     83 |     72 |
-|    7B | ms/tok @ 8th |    122 |     43 |     45 |     52 |     56 |     67 |
-|    7B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
-|   13B | perplexity   | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
-|   13B | file size    |  25.0G |   6.8G |   7.6G |   8.3G |   9.1G |    13G |
-|   13B | ms/tok @ 4th |      - |    103 |    105 |    148 |    160 |    131 |
-|   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
-|   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
+    </details>
 
-- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
-- recent k-quants improvements and new i-quants
-  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
-  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
-  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
-  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
-  - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
-  - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
-  - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
-  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
-  - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
-  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-  - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
-  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
-  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
-  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
+- <details>
+    <summary>Measure KL divergence</summary>
 
-### Perplexity (measuring model quality)
+    ```bash
+    # TODO
+    ```
 
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
-For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+    </details>
 
-The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
-The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
+[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
+[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 
-#### How to run
+## [`llama-bench`](examples/llama-bench)
 
-1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
-3. Output:
-```
-perplexity : calculating perplexity over 655 chunks
-24.43 seconds per pass - ETA 4.45 hours
-[1]4.5970,[2]5.1807,[3]6.0382,...
-```
-And after 4.45 hours, you will have the final perplexity.
+#### Benchmark the performance of the inference for various parameters.
 
-### Interactive mode
+- <details open>
+    <summary>Run default benchmark</summary>
 
-If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+    ```bash
+    llama-bench -m model.gguf
 
-Here is an example of a few-shot interaction, invoked with the command
+    # Output:
+    # | model               |       size |     params | backend    | threads |          test |                  t/s |
+    # | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |
+    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         pp512 |      5765.41 ± 20.55 |
+    # | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         tg128 |        197.71 ± 0.81 |
+    #
+    # build: 3e0ba0e60 (4229)
+    ```
 
-```bash
-# default arguments using a 7B model
-./examples/chat.sh
+    </details>
 
-# advanced chat with a 13B model
-./examples/chat-13B.sh
+## [`llama-run`](examples/run)
 
-# custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
-```
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
 
-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
+- <details>
+    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
 
-![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
+    ```bash
+    llama-run granite-code
+    ```
 
-### Persistent Interaction
+    </details>
 
-The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+[^3]: [RamaLama](https://github.com/containers/ramalama)
 
-```bash
-# Start a new chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+## [`llama-simple`](examples/simple)
 
-# Resume that chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
 
-# Start a different chat with the same prompt/model
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
+- <details>
+    <summary>Basic text completion</summary>
 
-# Different prompt cache for different prompt/model
-PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
-    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
-```
+    ```bash
+    llama-simple -m model.gguf
 
-### Constrained output with grammars
+    # Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called "The Art of
+    ```
 
-`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+    </details>
 
-```bash
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-```
 
-The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+## Contributing
 
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
+- Collaborators will be invited based on contributions
+- Any help with managing issues, PRs and projects is very appreciated!
+- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
+- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 
-### Instruct mode
+## Other documentation
 
-1. First, download and place the `ggml` model into the `./models` folder
-2. Run the `main` tool like this:
+- [main (cli)](examples/main/README.md)
+- [server](examples/server/README.md)
+- [GBNF grammars](grammars/README.md)
 
-```
-./examples/alpaca.sh
-```
+#### Development documentation
 
-Sample run:
+- [How to build](docs/build.md)
+- [Running on Docker](docs/docker.md)
+- [Build on Android](docs/android.md)
+- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
-```
-== Running in interactive mode. ==
- - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
- - If you want to submit another line, end your input in '\'.
-
- Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-> How many letters are there in the English alphabet?
-There 26 letters in the English Alphabet
-> What is the most common way of transportation in Amsterdam?
-The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
-> List 5 words that start with "ca".
-cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
->
-```
-
-### Obtaining and using the Facebook LLaMA 2 model
-
-- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
-- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
-  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
-  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
-  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
-  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
-  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
-  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
-
-### Seminal papers and background on the models
+#### Seminal papers and background on the models
 
 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
 - LLaMA:
@@ -887,184 +518,5 @@ If your issue is with model generation quality, then please at least scan the fo
     - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
     - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
 
-### Android
+#### References
 
-#### Building the Project using Android NDK
-You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
-
-First, install the essential packages for termux:
-```
-pkg install clang wget git cmake
-```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=<your_ndk_directory>
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
-Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
-
-https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
-
-#### Building the Project using Termux (F-Droid)
-Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
-
-Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
-
-If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
-```
-apt install libopenblas
-```
-
-Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
-```
-apt install ocl-icd opencl-headers opencl-clhpp clinfo
-```
-
-In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
-```
-cmake .
-make
-cp libclblast.so* $PREFIX/lib
-cp ./include/clblast.h ../llama.cpp
-```
-
-Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
-```
-cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
-cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
-make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
-```
-
-Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
-```
-GGML_OPENCL_PLATFORM=0
-GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
-```
-
-(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
-
-For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
-
-Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
-
-### Docker
-
-#### Prerequisites
-* Docker must be installed and running on your system.
-* Create a folder to store big models & intermediate files (ex. /llama/models)
-
-#### Images
-We have three Docker images available for this project:
-
-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
-
-Additionally, there the following images, similar to the above:
-
-- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
-
-#### Usage
-
-The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
-
-Replace `/path/to/models` below with the actual path where you downloaded the models.
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
-```
-
-On completion, you are ready to play!
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a light image:
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a server image:
-
-```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
-```
-
-### Docker With CUDA
-
-Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
-
-#### Building Locally
-
-```bash
-docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
-```
-
-You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
-
-The defaults are:
-
-- `CUDA_VERSION` set to `11.7.1`
-- `CUDA_DOCKER_ARCH` set to `all`
-
-The resulting images, are essentially the same as the non-CUDA images:
-
-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
-
-#### Usage
-
-After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
-
-```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
-```
-
-### Contributing
-
-- Contributors can open PRs
-- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
-- Collaborators will be invited based on contributions
-- Any help with managing issues and PRs is very appreciated!
-- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
-- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
-
-### Coding guidelines
-
-- Avoid adding third-party dependencies, extra files, extra headers, etc.
-- Always consider cross-compatibility with other operating systems and architectures
-- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
-- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
-- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
-- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
-- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
-- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
-
-### Docs
-
-- [main](./examples/main/README.md)
-- [server](./examples/server/README.md)
-- [jeopardy](./examples/jeopardy/README.md)
-- [BLIS](./docs/BLIS.md)
-- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
-- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
-- [GBNF grammars](./grammars/README.md)
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..f4322c6ee
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,67 @@
+# Security Policy
+
+ - [**Using llama.cpp securely**](#using-llamacpp-securely)
+   - [Untrusted models](#untrusted-models)
+   - [Untrusted inputs](#untrusted-inputs)
+   - [Data privacy](#data-privacy)
+   - [Untrusted environments or networks](#untrusted-environments-or-networks)
+   - [Multi-Tenant environments](#multi-tenant-environments)
+ - [**Reporting a vulnerability**](#reporting-a-vulnerability)
+
+## Using llama.cpp securely
+
+### Untrusted models
+Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
+
+*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
+
+> [!NOTE]
+> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
+
+### Untrusted inputs
+
+Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
+
+For maximum security when handling untrusted inputs, you may need to employ the following:
+
+* Sandboxing: Isolate the environment where the inference happens.
+* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
+* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
+* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
+    * Validation: Enforce strict rules on allowed characters and data types.
+    * Filtering: Remove potentially malicious scripts or code fragments.
+    * Encoding: Convert special characters into safe representations.
+    * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
+
+### Data privacy
+
+To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
+
+### Untrusted environments or networks
+
+If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
+* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
+* Encrypt your data if sending it over the network.
+
+### Multi-Tenant environments
+
+If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
+
+1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
+
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+
+## Reporting a vulnerability
+
+Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
+
+<!-- normal version -->
+However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
+
+A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h
new file mode 100644
index 000000000..41725880e
--- /dev/null
+++ b/Sources/llama/llama.h
@@ -0,0 +1,4 @@
+#pragma once
+
+#include <llama.h>
+
diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap
new file mode 100644
index 000000000..d010555b1
--- /dev/null
+++ b/Sources/llama/module.modulemap
@@ -0,0 +1,5 @@
+module llama [system] {
+    header "llama.h"
+    link "llama"
+    export *
+}
diff --git a/build.zig b/build.zig
deleted file mode 100644
index c0af454dc..000000000
--- a/build.zig
+++ /dev/null
@@ -1,139 +0,0 @@
-// Compatible with Zig Version 0.11.0
-const std = @import("std");
-const ArrayList = std.ArrayList;
-const Compile = std.Build.Step.Compile;
-const ConfigHeader = std.Build.Step.ConfigHeader;
-const Mode = std.builtin.Mode;
-const CrossTarget = std.zig.CrossTarget;
-
-const Maker = struct {
-    builder: *std.build.Builder,
-    target: CrossTarget,
-    optimize: Mode,
-    enable_lto: bool,
-
-    include_dirs: ArrayList([]const u8),
-    cflags: ArrayList([]const u8),
-    cxxflags: ArrayList([]const u8),
-    objs: ArrayList(*Compile),
-
-    fn addInclude(m: *Maker, dir: []const u8) !void {
-        try m.include_dirs.append(dir);
-    }
-    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
-        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
-    }
-    fn addCFlag(m: *Maker, flag: []const u8) !void {
-        try m.cflags.append(flag);
-    }
-    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
-        try m.cxxflags.append(flag);
-    }
-    fn addFlag(m: *Maker, flag: []const u8) !void {
-        try m.addCFlag(flag);
-        try m.addCxxFlag(flag);
-    }
-
-    fn init(builder: *std.build.Builder) !Maker {
-        const target = builder.standardTargetOptions(.{});
-        const zig_version = @import("builtin").zig_version_string;
-        const commit_hash = try std.ChildProcess.exec(
-            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
-        );
-        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
-            \\int LLAMA_BUILD_NUMBER = {};
-            \\char const *LLAMA_COMMIT = "{s}";
-            \\char const *LLAMA_COMPILER = "Zig {s}";
-            \\char const *LLAMA_BUILD_TARGET = "{s}";
-            \\
-        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
-        var m = Maker{
-            .builder = builder,
-            .target = target,
-            .optimize = builder.standardOptimizeOption(.{}),
-            .enable_lto = false,
-            .include_dirs = ArrayList([]const u8).init(builder.allocator),
-            .cflags = ArrayList([]const u8).init(builder.allocator),
-            .cxxflags = ArrayList([]const u8).init(builder.allocator),
-            .objs = ArrayList(*Compile).init(builder.allocator),
-        };
-
-        try m.addCFlag("-std=c11");
-        try m.addCxxFlag("-std=c++11");
-        try m.addProjectInclude(&.{});
-        try m.addProjectInclude(&.{"common"});
-        return m;
-    }
-
-    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
-        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        if (o.target.getAbi() != .msvc)
-            o.defineCMacro("_GNU_SOURCE", null);
-
-        if (std.mem.endsWith(u8, src, ".c")) {
-            o.addCSourceFiles(&.{src}, m.cflags.items);
-            o.linkLibC();
-        } else {
-            o.addCSourceFiles(&.{src}, m.cxxflags.items);
-            if (o.target.getAbi() == .msvc) {
-                o.linkLibC(); // need winsdk + crt
-            } else {
-                // linkLibCpp already add (libc++ + libunwind + libc)
-                o.linkLibCpp();
-            }
-        }
-        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
-        o.want_lto = m.enable_lto;
-        return o;
-    }
-
-    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
-        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        e.addCSourceFiles(&.{src}, m.cxxflags.items);
-        for (deps) |d| e.addObject(d);
-        for (m.objs.items) |o| e.addObject(o);
-        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-
-        // https://github.com/ziglang/zig/issues/15448
-        if (e.target.getAbi() == .msvc) {
-            e.linkLibC(); // need winsdk + crt
-        } else {
-            // linkLibCpp already add (libc++ + libunwind + libc)
-            e.linkLibCpp();
-        }
-        m.builder.installArtifact(e);
-        e.want_lto = m.enable_lto;
-        return e;
-    }
-};
-
-pub fn build(b: *std.build.Builder) !void {
-    var make = try Maker.init(b);
-    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
-
-    const ggml = make.obj("ggml", "ggml.c");
-    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
-    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
-    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
-    const llama = make.obj("llama", "llama.cpp");
-    const buildinfo = make.obj("common", "common/build-info.cpp");
-    const common = make.obj("common", "common/common.cpp");
-    const console = make.obj("console", "common/console.cpp");
-    const sampling = make.obj("sampling", "common/sampling.cpp");
-    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
-    const train = make.obj("train", "common/train.cpp");
-    const clip = make.obj("clip", "examples/llava/clip.cpp");
-    const llava = make.obj("llava", "examples/llava/llava.cpp");
-
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
-    if (server.target.isWindows()) {
-        server.linkSystemLibrary("ws2_32");
-    }
-}
diff --git a/ci/run.sh b/ci/run.sh
index 35eb3c7aa..77c32ce00 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,4 +1,4 @@
-#/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #
@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with VULKAN support
+# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -36,20 +39,25 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
     if [ -z ${ONEAPI_ROOT} ]; then
-        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
+        echo "source /opt/intel/oneapi/setvars.sh"
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+fi
+
+if [ ! -z ${GG_BUILD_VULKAN} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
 ## helpers
 
@@ -102,8 +110,11 @@ function gg_run_ctest_debug {
 
     set -e
 
+    # Check cmake, make and ctest are installed
+    gg_check_build_requirements
+
     (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
 
@@ -130,8 +141,11 @@ function gg_run_ctest_release {
 
     set -e
 
+    # Check cmake, make and ctest are installed
+    gg_check_build_requirements
+
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     if [ -z ${GG_BUILD_LOW_PERF} ]; then
         (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -152,13 +166,64 @@ function gg_sum_ctest_release {
     gg_printf '```\n'
 }
 
+# test_scripts_debug
+
+function gg_run_test_scripts_debug {
+    cd ${SRC}
+
+    set -e
+
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release {
+    cd ${SRC}
+
+    set -e
+
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
 function gg_get_model {
-    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
-    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_3b ]]; then
-        echo -n "$gguf_3b"
-    elif [[ -s $gguf_7b ]]; then
-        echo -n "$gguf_7b"
+    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
+    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
+    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_0 ]]; then
+        echo -n "$gguf_0"
+    elif [[ -s $gguf_1 ]]; then
+        echo -n "$gguf_1"
+    elif [[ -s $gguf_2 ]]; then
+        echo -n "$gguf_2"
     else
         echo >&2 "No model found. Can't run gg_run_ctest_with_model."
         exit 1
@@ -207,187 +272,7 @@ function gg_sum_ctest_with_model_release {
     gg_printf '```\n'
 }
 
-# open_llama_3b_v2
-
-function gg_run_open_llama_3b_v2 {
-    cd ${SRC}
-
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/open-llama/3B-v2"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert.py ${path_models}
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/3B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    set +e
-}
-
-function gg_sum_open_llama_3b_v2 {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'OpenLLaMA 3B-v2:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
-}
-
 # open_llama_7b_v2
-# requires: GG_BUILD_CUDA
 
 function gg_run_open_llama_7b_v2 {
     cd ${SRC}
@@ -411,10 +296,10 @@ function gg_run_open_llama_7b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
 
     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -430,44 +315,47 @@ function gg_run_open_llama_7b_v2 {
 
     wiki_test="${path_wiki}/wiki.test.raw"
 
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
 
-    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
         qnt="$1"
@@ -496,48 +384,6 @@ function gg_run_open_llama_7b_v2 {
 
     cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
 
-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/7B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # currently not supported by the CUDA backend
-    # q8_0
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
     set +e
 }
 
@@ -548,7 +394,6 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -561,11 +406,271 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
     gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
+}
+
+# pythia_1.4b
+
+function gg_run_pythia_1_4b {
+    cd ${SRC}
+
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/pythia/1.4B"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_pythia_1_4b {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Pythia 1.4B:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
+# pythia_2_8b
+
+function gg_run_pythia_2_8b {
+    cd ${SRC}
+
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+
+    path_models="../models-mnt/pythia/2.8B"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test="${path_wiki}/wiki.test.raw"
+
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0     ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_pythia_2_8b {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Pythia 2.8B:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
 }
 
 # bge-small
@@ -574,7 +679,7 @@ function gg_run_embd_bge_small {
     cd ${SRC}
 
     gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
-    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
     gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
     gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
     gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
@@ -592,17 +697,17 @@ function gg_run_embd_bge_small {
     set -e
 
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    python3 ../convert-hf-to-gguf.py ${path_models}
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
 
     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
 
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
 
-    (time ./bin/embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
 
     set +e
 }
@@ -616,8 +721,92 @@ function gg_sum_embd_bge_small {
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 }
 
+# rerank_tiny
+
+function gg_run_rerank_tiny {
+    cd ${SRC}
+
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+
+    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
+
+    path_models="../models-mnt/rerank-tiny"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+
+    # for this model, the SEP token is "</s>"
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+
+    # sample output
+    # rerank score 0:    0.029
+    # rerank score 1:    0.029
+    # rerank score 2:    0.135
+
+    # check that the score is in the range [$3, $4]
+    function check_score {
+        qnt="$1"
+        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$score"
+        return 0
+    }
+
+    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
+
+    set +e
+}
+
+function gg_sum_rerank_tiny {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Rerank Tiny (Jina):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
+}
+
+function gg_check_build_requirements {
+    if ! command -v cmake &> /dev/null; then
+        gg_printf 'cmake not found, please install'
+    fi
+
+    if ! command -v make &> /dev/null; then
+        gg_printf 'make not found, please install'
+    fi
+
+    if ! command -v ctest &> /dev/null; then
+        gg_printf 'ctest not found, please install'
+    fi
+}
+
 ## main
 
+export LLAMA_LOG_PREFIX=1
+export LLAMA_LOG_TIMESTAMPS=1
+
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
     rm -rf ${SRC}/models-mnt
@@ -626,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
     ln -sfn ${mnt_models} ${SRC}/models-mnt
 
     # Create a fresh python3 venv and enter it
-    python3 -m venv "$MNT/venv"
+    if ! python3 -m venv "$MNT/venv"; then
+        echo "Error: Failed to create Python virtual environment at $MNT/venv."
+        exit 1
+    fi
     source "$MNT/venv/bin/activate"
 
     pip install -r ${SRC}/requirements.txt --disable-pip-version-check
@@ -640,12 +832,19 @@ test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
+    test $ret -eq 0 && gg_run rerank_tiny
+
+    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
+        test $ret -eq 0 && gg_run test_scripts_debug
+        test $ret -eq 0 && gg_run test_scripts_release
+    fi
 
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run open_llama_3b_v2
+        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
+            test $ret -eq 0 && gg_run pythia_1_4b
         else
-            test $ret -eq 0 && gg_run open_llama_7b_v2
+            test $ret -eq 0 && gg_run pythia_2_8b
+            #test $ret -eq 0 && gg_run open_llama_7b_v2
         fi
         test $ret -eq 0 && gg_run ctest_with_model_debug
         test $ret -eq 0 && gg_run ctest_with_model_release
diff --git a/cmake/arm64-apple-clang.cmake b/cmake/arm64-apple-clang.cmake
new file mode 100644
index 000000000..5fcd2882a
--- /dev/null
+++ b/cmake/arm64-apple-clang.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Darwin )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-apple-darwin-macho )
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake
new file mode 100644
index 000000000..802379680
--- /dev/null
+++ b/cmake/arm64-windows-llvm.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/cmake/arm64-windows-msvc.cmake b/cmake/arm64-windows-msvc.cmake
new file mode 100644
index 000000000..c77631420
--- /dev/null
+++ b/cmake/arm64-windows-msvc.cmake
@@ -0,0 +1,6 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+set( CMAKE_C_COMPILER_TARGET   ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
diff --git a/scripts/build-info.cmake b/cmake/build-info.cmake
similarity index 95%
rename from scripts/build-info.cmake
rename to cmake/build-info.cmake
index ea3dc55c8..c1a456e17 100644
--- a/scripts/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -44,7 +44,7 @@ if(MSVC)
     set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
 else()
     execute_process(
-        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
         OUTPUT_VARIABLE OUT
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
diff --git a/cmake/common.cmake b/cmake/common.cmake
new file mode 100644
index 000000000..0f54871e4
--- /dev/null
+++ b/cmake/common.cmake
@@ -0,0 +1,33 @@
+function(llama_add_compile_flags)
+    if (LLAMA_FATAL_WARNINGS)
+        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+            list(APPEND C_FLAGS   -Werror)
+            list(APPEND CXX_FLAGS -Werror)
+        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+            add_compile_options(/WX)
+        endif()
+    endif()
+
+    if (LLAMA_ALL_WARNINGS)
+        if (NOT MSVC)
+            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                -Werror=implicit-int -Werror=implicit-function-declaration)
+
+            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+
+            list(APPEND C_FLAGS   ${WARNING_FLAGS})
+            list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+        else()
+            # todo : msvc
+            set(C_FLAGS   "" PARENT_SCOPE)
+            set(CXX_FLAGS "" PARENT_SCOPE)
+        endif()
+    endif()
+endfunction()
diff --git a/cmake/git-vars.cmake b/cmake/git-vars.cmake
new file mode 100644
index 000000000..1a4c24ebf
--- /dev/null
+++ b/cmake/git-vars.cmake
@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_SHA1
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_DATE
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%s
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
new file mode 100644
index 000000000..90cbec5b6
--- /dev/null
+++ b/cmake/llama-config.cmake.in
@@ -0,0 +1,30 @@
+set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
+set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
+set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
+set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
+
+@PACKAGE_INIT@
+
+set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
+set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+
+find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
+
+find_library(llama_LIBRARY llama
+    REQUIRED
+    HINTS ${LLAMA_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)
+
+add_library(llama UNKNOWN IMPORTED)
+set_target_properties(llama
+    PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
+        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
+        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+        IMPORTED_LOCATION "${llama_LIBRARY}"
+        INTERFACE_COMPILE_FEATURES c_std_90
+        POSITION_INDEPENDENT_CODE ON)
+
+check_required_components(Llama)
diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in
new file mode 100644
index 000000000..6fb58b5f6
--- /dev/null
+++ b/cmake/llama.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: llama
+Description: Port of Facebook's LLaMA model in C/C++
+Version: @LLAMA_INSTALL_VERSION@
+Libs: -L${libdir} -lggml -lggml-base -lllama
+Cflags: -I${includedir}
diff --git a/cmake/x64-windows-llvm.cmake b/cmake/x64-windows-llvm.cmake
new file mode 100644
index 000000000..0603d738f
--- /dev/null
+++ b/cmake/x64-windows-llvm.cmake
@@ -0,0 +1,11 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR x86_64 )
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( arch_c_flags "-march=native" )
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
+
diff --git a/codecov.yml b/codecov.yml
deleted file mode 100644
index a301c5b2c..000000000
--- a/codecov.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-comment: off
-
-coverage:
-  status:
-    project:
-      default:
-        target: auto
-        threshold: 0
-        base: auto
-    patch:
-      default:
-        target: auto
-        threshold: 0
-        base: auto
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index f79acfef1..e61015d2a 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,5 +1,8 @@
 # common
 
+find_package(Threads REQUIRED)
+
+llama_add_compile_flags()
 
 # Build info header
 #
@@ -19,7 +22,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
         endif()
     endif()
 
-    set(GIT_INDEX "${GIT_DIR}/index")
+    if(EXISTS "${GIT_DIR}/index")
+        set(GIT_INDEX "${GIT_DIR}/index")
+    else()
+        message(WARNING "Git index not found in git repository.")
+        set(GIT_INDEX "")
+    endif()
 else()
     message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
     set(GIT_INDEX "")
@@ -31,7 +39,7 @@ add_custom_command(
     COMMENT "Generating build details from Git"
     COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
             -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
     DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
     VERBATIM
@@ -42,27 +50,75 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
-
 set(TARGET common)
 
 add_library(${TARGET} STATIC
+    arg.cpp
+    arg.h
     base64.hpp
-    common.h
+    chat.cpp
+    chat.hpp
+    chat-template.hpp
     common.cpp
-    sampling.h
-    sampling.cpp
-    console.h
+    common.h
     console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    train.h
-    train.cpp
+    console.h
+    json-schema-to-grammar.cpp
+    json.hpp
+    llguidance.cpp
+    log.cpp
+    log.h
+    minja.hpp
+    ngram-cache.cpp
+    ngram-cache.h
+    sampling.cpp
+    sampling.h
+    speculative.cpp
+    speculative.h
     )
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+# Use curl to download model url
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
+if (LLAMA_LLGUIDANCE)
+    include(ExternalProject)
+    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
+    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
+    ExternalProject_Add(llguidance_ext
+        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
+        # v0.6.12:
+        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        PREFIX ${CMAKE_BINARY_DIR}/llguidance
+        SOURCE_DIR ${LLGUIDANCE_SRC}
+        BUILD_IN_SOURCE TRUE
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND cargo build --release
+        INSTALL_COMMAND ""
+        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
+        UPDATE_COMMAND ""
+    )
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
+
+    add_library(llguidance STATIC IMPORTED)
+    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
+    add_dependencies(llguidance llguidance_ext)
+
+    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_compile_features   (${TARGET} PUBLIC cxx_std_17)
+target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/arg.cpp b/common/arg.cpp
new file mode 100644
index 000000000..152f671ab
--- /dev/null
+++ b/common/arg.cpp
@@ -0,0 +1,2370 @@
+#include "arg.h"
+
+#include "log.h"
+#include "sampling.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdarg>
+#include <fstream>
+#include <regex>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "json-schema-to-grammar.h"
+
+using json = nlohmann::ordered_json;
+
+common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
+    this->examples = std::move(examples);
+    return *this;
+}
+
+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+    this->excludes = std::move(excludes);
+    return *this;
+}
+
+common_arg & common_arg::set_env(const char * env) {
+    help = help + "\n(env: " + env + ")";
+    this->env = env;
+    return *this;
+}
+
+common_arg & common_arg::set_sparam() {
+    is_sparam = true;
+    return *this;
+}
+
+bool common_arg::in_example(enum llama_example ex) {
+    return examples.find(ex) != examples.end();
+}
+
+bool common_arg::is_exclude(enum llama_example ex) {
+    return excludes.find(ex) != excludes.end();
+}
+
+bool common_arg::get_value_from_env(std::string & output) {
+    if (env == nullptr) return false;
+    char * value = std::getenv(env);
+    if (value) {
+        output = value;
+        return true;
+    }
+    return false;
+}
+
+bool common_arg::has_value_from_env() {
+    return env != nullptr && std::getenv(env);
+}
+
+static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
+    std::vector<std::string> result;
+    std::istringstream iss(input);
+    std::string line;
+    auto add_line = [&](const std::string& l) {
+        if (l.length() <= max_char_per_line) {
+            result.push_back(l);
+        } else {
+            std::istringstream line_stream(l);
+            std::string word, current_line;
+            while (line_stream >> word) {
+                if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
+                    if (!current_line.empty()) result.push_back(current_line);
+                    current_line = word;
+                } else {
+                    current_line += (!current_line.empty() ? " " : "") + word;
+                }
+            }
+            if (!current_line.empty()) result.push_back(current_line);
+        }
+    };
+    while (std::getline(iss, line)) {
+        add_line(line);
+    }
+    return result;
+}
+
+std::string common_arg::to_string() {
+    // params for printing to console
+    const static int n_leading_spaces = 40;
+    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
+    std::string leading_spaces(n_leading_spaces, ' ');
+
+    std::ostringstream ss;
+    for (const auto arg : args) {
+        if (arg == args.front()) {
+            if (args.size() == 1) {
+                ss << arg;
+            } else {
+                // first arg is usually abbreviation, we need padding to make it more beautiful
+                auto tmp = std::string(arg) + ", ";
+                auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
+                ss << tmp << spaces;
+            }
+        } else {
+            ss << arg << (arg != args.back() ? ", " : "");
+        }
+    }
+    if (value_hint) ss << " " << value_hint;
+    if (value_hint_2) ss << " " << value_hint_2;
+    if (ss.tellp() > n_leading_spaces - 3) {
+        // current line is too long, add new line
+        ss << "\n" << leading_spaces;
+    } else {
+        // padding between arg and help, same line
+        ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
+    }
+    const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
+    for (const auto & line : help_lines) {
+        ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
+    }
+    return ss.str();
+}
+
+//
+// utils
+//
+
+static void common_params_handle_model_default(
+        std::string & model,
+        const std::string & model_url,
+        std::string & hf_repo,
+        std::string & hf_file,
+        const std::string & hf_token,
+        const std::string & model_default) {
+    if (!hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (hf_file.empty()) {
+            if (model.empty()) {
+                auto auto_detected = common_get_hf_file(hf_repo, hf_token);
+                if (auto_detected.first.empty() || auto_detected.second.empty()) {
+                    exit(1); // built without CURL, error message already printed
+                }
+                hf_repo = auto_detected.first;
+                hf_file = auto_detected.second;
+            } else {
+                hf_file = model;
+            }
+        }
+        // make sure model path is present (for caching purposes)
+        if (model.empty()) {
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = hf_repo + "_" + hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            model = fs_get_cache_file(filename);
+        }
+    } else if (!model_url.empty()) {
+        if (model.empty()) {
+            auto f = string_split<std::string>(model_url, '#').front();
+            f = string_split<std::string>(f, '?').front();
+            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+        }
+    } else if (model.empty()) {
+        model = model_default;
+    }
+}
+
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
+//
+// CLI argument parsing functions
+//
+
+static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
+    std::string arg;
+    const std::string arg_prefix = "--";
+    common_params & params = ctx_arg.params;
+
+    std::unordered_map<std::string, common_arg *> arg_to_options;
+    for (auto & opt : ctx_arg.options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    // handle environment variables
+    for (auto & opt : ctx_arg.options) {
+        std::string value;
+        if (opt.get_value_from_env(value)) {
+            try {
+                if (opt.handler_void && (value == "1" || value == "true")) {
+                    opt.handler_void(params);
+                }
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(value));
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, value);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
+            }
+        }
+    }
+
+    // handle command line arguments
+    auto check_arg = [&](int i) {
+        if (i+1 >= argc) {
+            throw std::invalid_argument("expected value for argument");
+        }
+    };
+
+    for (int i = 1; i < argc; i++) {
+        const std::string arg_prefix = "--";
+
+        std::string arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+        }
+        auto opt = *arg_to_options[arg];
+        if (opt.has_value_from_env()) {
+            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+        }
+        try {
+            if (opt.handler_void) {
+                opt.handler_void(params);
+                continue;
+            }
+
+            // arg with single value
+            check_arg(i);
+            std::string val = argv[++i];
+            if (opt.handler_int) {
+                opt.handler_int(params, std::stoi(val));
+                continue;
+            }
+            if (opt.handler_string) {
+                opt.handler_string(params, val);
+                continue;
+            }
+
+            // arg with 2 values
+            check_arg(i);
+            std::string val2 = argv[++i];
+            if (opt.handler_str_str) {
+                opt.handler_str_str(params, val, val2);
+                continue;
+            }
+        } catch (std::exception & e) {
+            throw std::invalid_argument(string_format(
+                "error while handling argument \"%s\": %s\n\n"
+                "usage:\n%s\n\nto show complete usage, run with -h",
+                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+        }
+    }
+
+    postprocess_cpu_params(params.cpuparams,       nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+
+    postprocess_cpu_params(params.speculative.cpuparams,       &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);
+
+    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+    }
+
+    // TODO: refactor model params in a common struct
+    common_params_handle_model_default(params.model,             params.model_url,             params.hf_repo,             params.hf_file,             params.hf_token, DEFAULT_MODEL_PATH);
+    common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
+    common_params_handle_model_default(params.vocoder.model,     params.vocoder.model_url,     params.vocoder.hf_repo,     params.vocoder.hf_file,     params.hf_token, "");
+
+    if (params.escape) {
+        string_process_escapes(params.prompt);
+        string_process_escapes(params.input_prefix);
+        string_process_escapes(params.input_suffix);
+        for (auto & antiprompt : params.antiprompt) {
+            string_process_escapes(antiprompt);
+        }
+        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
+            string_process_escapes(seq_breaker);
+        }
+    }
+
+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
+    if (params.reranking && params.embedding) {
+        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
+    }
+
+    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
+        throw std::runtime_error(string_format(
+            "error: the supplied chat template is not supported: %s%s\n",
+            params.chat_template.c_str(),
+            params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
+        ));
+    }
+
+    return true;
+}
+
+static void common_params_print_usage(common_params_context & ctx_arg) {
+    auto print_options = [](std::vector<common_arg *> & options) {
+        for (common_arg * opt : options) {
+            printf("%s", opt->to_string().c_str());
+        }
+    };
+
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+    for (auto & opt : ctx_arg.options) {
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+    printf("----- common params -----\n\n");
+    print_options(common_options);
+    printf("\n\n----- sampling params -----\n\n");
+    print_options(sparam_options);
+    // TODO: maybe convert enum llama_example to string
+    printf("\n\n----- example-specific params -----\n\n");
+    print_options(specific_options);
+}
+
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
+static void add_rpc_devices(std::string servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+    ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+        if (dev) {
+            ggml_backend_device_register(dev);
+        } else {
+            throw std::invalid_argument("failed to register RPC device");
+        }
+    }
+}
+
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
+    const common_params params_org = ctx_arg.params; // the example can modify the default params
+
+    try {
+        if (!common_params_parse_ex(argc, argv, ctx_arg)) {
+            ctx_arg.params = params_org;
+            return false;
+        }
+        if (ctx_arg.params.usage) {
+            common_params_print_usage(ctx_arg);
+            if (ctx_arg.print_usage) {
+                ctx_arg.print_usage(argc, argv);
+            }
+            exit(0);
+        }
+    } catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        ctx_arg.params = params_org;
+        return false;
+    }
+
+    return true;
+}
+
+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+    int32_t res = llama_chat_builtin_templates(nullptr, 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::ostringstream msg;
+    for (auto & tmpl : supported_tmpl) {
+        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
+    common_params_context ctx_arg(params);
+    ctx_arg.print_usage = print_usage;
+    ctx_arg.ex          = ex;
+
+    std::string sampler_type_chars;
+    std::string sampler_type_names;
+    for (const auto & sampler : params.sampling.samplers) {
+        sampler_type_chars += common_sampler_type_to_chr(sampler);
+        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
+    }
+    sampler_type_names.pop_back();
+
+
+    /**
+     * filter options by example
+     * rules:
+     * - all examples inherit options from LLAMA_EXAMPLE_COMMON
+     * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
+     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
+     */
+    auto add_opt = [&](common_arg arg) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
+            ctx_arg.options.push_back(std::move(arg));
+        }
+    };
+
+
+    add_opt(common_arg(
+        {"-h", "--help", "--usage"},
+        "print usage and exit",
+        [](common_params & params) {
+            params.usage = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--version"},
+        "show version and build info",
+        [](common_params &) {
+            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            exit(0);
+        }
+    ));
+    add_opt(common_arg(
+        {"--verbose-prompt"},
+        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
+        [](common_params & params) {
+            params.verbose_prompt = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--no-display-prompt"},
+        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
+        [](common_params & params) {
+            params.display_prompt = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-co", "--color"},
+        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
+        [](common_params & params) {
+            params.use_color = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    add_opt(common_arg(
+        {"-t", "--threads"}, "N",
+        string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+        [](common_params & params, int value) {
+            params.cpuparams.n_threads = value;
+            if (params.cpuparams.n_threads <= 0) {
+                params.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_env("LLAMA_ARG_THREADS"));
+    add_opt(common_arg(
+        {"-tb", "--threads-batch"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.cpuparams_batch.n_threads = value;
+            if (params.cpuparams_batch.n_threads <= 0) {
+                params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ));
+    add_opt(common_arg(
+        {"-C", "--cpu-mask"}, "M",
+        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
+        [](common_params & params, const std::string & mask) {
+            params.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ));
+    add_opt(common_arg(
+        {"-Cr", "--cpu-range"}, "lo-hi",
+        "range of CPUs for affinity. Complements --cpu-mask",
+        [](common_params & params, const std::string & range) {
+            params.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-strict"}, "<0|1>",
+        string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
+        [](common_params & params, const std::string & value) {
+            params.cpuparams.strict_cpu = std::stoul(value);
+        }
+    ));
+    add_opt(common_arg(
+        {"--prio"}, "N",
+        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ));
+    add_opt(common_arg(
+        {"--poll"}, "<0...100>",
+        string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
+        [](common_params & params, const std::string & value) {
+            params.cpuparams.poll = std::stoul(value);
+        }
+    ));
+    add_opt(common_arg(
+        {"-Cb", "--cpu-mask-batch"}, "M",
+        "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ));
+    add_opt(common_arg(
+        {"-Crb", "--cpu-range-batch"}, "lo-hi",
+        "ranges of CPUs for affinity. Complements --cpu-mask-batch",
+        [](common_params & params, const std::string & range) {
+            params.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-strict-batch"}, "<0|1>",
+        "use strict CPU placement (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.cpuparams_batch.strict_cpu = value;
+        }
+    ));
+    add_opt(common_arg(
+        {"--prio-batch"}, "N",
+        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ));
+    add_opt(common_arg(
+        {"--poll-batch"}, "<0|1>",
+        "use polling to wait for work (default: same as --poll)",
+        [](common_params & params, int value) {
+            params.cpuparams_batch.poll = value;
+        }
+    ));
+    add_opt(common_arg(
+        {"-lcs", "--lookup-cache-static"}, "FNAME",
+        "path to static lookup cache to use for lookup decoding (not updated by generation)",
+        [](common_params & params, const std::string & value) {
+            params.lookup_cache_static = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+    add_opt(common_arg(
+        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
+        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+        [](common_params & params, const std::string & value) {
+            params.lookup_cache_dynamic = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+    add_opt(common_arg(
+        {"-c", "--ctx-size"}, "N",
+        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+        [](common_params & params, int value) {
+            params.n_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_CTX_SIZE"));
+    add_opt(common_arg(
+        {"-n", "--predict", "--n-predict"}, "N",
+        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        [](common_params & params, int value) {
+            params.n_predict = value;
+        }
+    ).set_env("LLAMA_ARG_N_PREDICT"));
+    add_opt(common_arg(
+        {"-b", "--batch-size"}, "N",
+        string_format("logical maximum batch size (default: %d)", params.n_batch),
+        [](common_params & params, int value) {
+            params.n_batch = value;
+        }
+    ).set_env("LLAMA_ARG_BATCH"));
+    add_opt(common_arg(
+        {"-ub", "--ubatch-size"}, "N",
+        string_format("physical maximum batch size (default: %d)", params.n_ubatch),
+        [](common_params & params, int value) {
+            params.n_ubatch = value;
+        }
+    ).set_env("LLAMA_ARG_UBATCH"));
+    add_opt(common_arg(
+        {"--keep"}, "N",
+        string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
+        [](common_params & params, int value) {
+            params.n_keep = value;
+        }
+    ));
+    add_opt(common_arg(
+        {"--no-context-shift"},
+        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        [](common_params & params) {
+            params.ctx_shift = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    add_opt(common_arg(
+        {"--chunks"}, "N",
+        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
+        [](common_params & params, int value) {
+            params.n_chunks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(common_arg(
+        {"-fa", "--flash-attn"},
+        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.flash_attn = true;
+        }
+    ).set_env("LLAMA_ARG_FLASH_ATTN"));
+    add_opt(common_arg(
+        {"-p", "--prompt"}, "PROMPT",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
+        [](common_params & params, const std::string & value) {
+            params.prompt = value;
+        }
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--no-perf"},
+        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params) {
+            params.no_perf = true;
+            params.sampling.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
+    add_opt(common_arg(
+        {"-f", "--file"}, "FNAME",
+        "a file containing the prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            // store the external file name in params
+            params.prompt_file = value;
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        }
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--in-file"}, "FNAME",
+        "an input file (repeat to specify multiple files)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            params.in_files.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"-bf", "--binary-file"}, "FNAME",
+        "binary file containing the prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value, std::ios::binary);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            // store the external file name in params
+            params.prompt_file = value;
+            std::ostringstream ss;
+            ss << file.rdbuf();
+            params.prompt = ss.str();
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
+        }
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-e", "--escape"},
+        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params) {
+            params.escape = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--no-escape"},
+        "do not process escape sequences",
+        [](common_params & params) {
+            params.escape = false;
+        }
+    ));
+    add_opt(common_arg(
+        {"-ptc", "--print-token-count"}, "N",
+        string_format("print token count every N tokens (default: %d)", params.n_print),
+        [](common_params & params, int value) {
+            params.n_print = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"--prompt-cache"}, "FNAME",
+        "file to cache prompt state for faster startup (default: none)",
+        [](common_params & params, const std::string & value) {
+            params.path_prompt_cache = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"--prompt-cache-all"},
+        "if specified, saves user input and generations to cache as well\n",
+        [](common_params & params) {
+            params.prompt_cache_all = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"--prompt-cache-ro"},
+        "if specified, uses the prompt cache but does not update it",
+        [](common_params & params) {
+            params.prompt_cache_ro = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-r", "--reverse-prompt"}, "PROMPT",
+        "halt generation at PROMPT, return control in interactive mode\n",
+        [](common_params & params, const std::string & value) {
+            params.antiprompt.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-sp", "--special"},
+        string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
+        [](common_params & params) {
+            params.special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-cnv", "--conversation"},
+        "run in conversation mode:\n"
+        "- does not print special tokens and suffix/prefix\n"
+        "- interactive mode is also enabled\n"
+        "(default: auto enabled if chat template is available)",
+        [](common_params & params) {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-no-cnv", "--no-conversation"},
+        "force disable conversation mode (default: false)",
+        [](common_params & params) {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-i", "--interactive"},
+        string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
+        [](common_params & params) {
+            params.interactive = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-if", "--interactive-first"},
+        string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
+        [](common_params & params) {
+            params.interactive_first = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-mli", "--multiline-input"},
+        "allows you to write or paste multiple lines without ending each in '\\'",
+        [](common_params & params) {
+            params.multiline_input = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"--in-prefix-bos"},
+        "prefix BOS to user inputs, preceding the `--in-prefix` string",
+        [](common_params & params) {
+            params.input_prefix_bos = true;
+            params.enable_chat_template = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"--in-prefix"}, "STRING",
+        "string to prefix user inputs with (default: empty)",
+        [](common_params & params, const std::string & value) {
+            params.input_prefix = value;
+            params.enable_chat_template = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    add_opt(common_arg(
+        {"--in-suffix"}, "STRING",
+        "string to suffix after user inputs with (default: empty)",
+        [](common_params & params, const std::string & value) {
+            params.input_suffix = value;
+            params.enable_chat_template = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    add_opt(common_arg(
+        {"--no-warmup"},
+        "skip warming up the model with an empty run",
+        [](common_params & params) {
+            params.warmup = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--spm-infill"},
+        string_format(
+            "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
+            params.spm_infill ? "enabled" : "disabled"
+        ),
+        [](common_params & params) {
+            params.spm_infill = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
+    add_opt(common_arg(
+        {"--samplers"}, "SAMPLERS",
+        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+        [](common_params & params, const std::string & value) {
+            const auto sampler_names = string_split<std::string>(value, ';');
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-s", "--seed"}, "SEED",
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
+        [](common_params & params, const std::string & value) {
+            params.sampling.seed = std::stoul(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.sampling.samplers = common_sampler_types_from_chars(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--ignore-eos"},
+        "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
+        [](common_params & params) {
+            params.sampling.ignore_eos = true;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--temp"}, "N",
+        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
+        [](common_params & params, const std::string & value) {
+            params.sampling.temp = std::stof(value);
+            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--top-k"}, "N",
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
+        [](common_params & params, int value) {
+            params.sampling.top_k = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--top-p"}, "N",
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_p = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--min-p"}, "N",
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
+        [](common_params & params, const std::string & value) {
+            params.sampling.min_p = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--xtc-probability"}, "N",
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
+        [](common_params & params, const std::string & value) {
+            params.sampling.xtc_probability = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--xtc-threshold"}, "N",
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
+        [](common_params & params, const std::string & value) {
+            params.sampling.xtc_threshold = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--typical"}, "N",
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
+        [](common_params & params, const std::string & value) {
+            params.sampling.typ_p = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--repeat-last-n"}, "N",
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
+        [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+            }
+            params.sampling.penalty_last_n = value;
+            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--repeat-penalty"}, "N",
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
+        [](common_params & params, const std::string & value) {
+            params.sampling.penalty_repeat = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--presence-penalty"}, "N",
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
+        [](common_params & params, const std::string & value) {
+            params.sampling.penalty_present = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--frequency-penalty"}, "N",
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
+        [](common_params & params, const std::string & value) {
+            params.sampling.penalty_freq = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-multiplier"}, "N",
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
+        [](common_params & params, const std::string & value) {
+            params.sampling.dry_multiplier = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-base"}, "N",
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
+        [](common_params & params, const std::string & value) {
+            float potential_base = std::stof(value);
+            if (potential_base >= 1.0f)
+            {
+                params.sampling.dry_base = potential_base;
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-allowed-length"}, "N",
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
+        [](common_params & params, int value) {
+            params.sampling.dry_allowed_length = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-penalty-last-n"}, "N",
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
+        [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
+            params.sampling.dry_penalty_last_n = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dry-sequence-breaker"}, "STRING",
+        string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
+                [](const std::string& a, const std::string& b) {
+                    std::string formatted_b = (b == "\n") ? "\\n" : b;
+                    return a + ", '" + formatted_b + "'";
+                }).c_str()),
+        [](common_params & params, const std::string & value) {
+            static bool defaults_cleared = false;
+
+            if (!defaults_cleared) {
+                params.sampling.dry_sequence_breakers.clear();
+                defaults_cleared = true;
+            }
+
+            if (value == "none") {
+                params.sampling.dry_sequence_breakers.clear();
+            } else {
+                params.sampling.dry_sequence_breakers.emplace_back(value);
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dynatemp-range"}, "N",
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
+        [](common_params & params, const std::string & value) {
+            params.sampling.dynatemp_range = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--dynatemp-exp"}, "N",
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
+        [](common_params & params, const std::string & value) {
+            params.sampling.dynatemp_exponent = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--mirostat"}, "N",
+        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
+        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
+        [](common_params & params, int value) {
+            params.sampling.mirostat = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--mirostat-lr"}, "N",
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
+        [](common_params & params, const std::string & value) {
+            params.sampling.mirostat_eta = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--mirostat-ent"}, "N",
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
+        [](common_params & params, const std::string & value) {
+            params.sampling.mirostat_tau = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
+        "modifies the likelihood of token appearing in the completion,\n"
+        "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
+        [](common_params & params, const std::string & value) {
+            std::stringstream ss(value);
+            llama_token key;
+            char sign;
+            std::string value_str;
+            try {
+                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                    const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    params.sampling.logit_bias.push_back({key, bias});
+                } else {
+                    throw std::invalid_argument("invalid input format");
+                }
+            } catch (const std::exception&) {
+                throw std::invalid_argument("invalid input format");
+            }
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--grammar"}, "GRAMMAR",
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--grammar-file"}, "FNAME",
+        "file to read grammar from",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.sampling.grammar)
+            );
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-j", "--json-schema"}, "SCHEMA",
+        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--pooling"}, "{none,mean,cls,last,rank}",
+        "pooling type for embeddings, use model default if unspecified",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
+            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+    add_opt(common_arg(
+        {"--attention"}, "{causal,non-causal}",
+        "attention type for embeddings, use model default if unspecified",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+            else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--rope-scaling"}, "{none,linear,yarn}",
+        "RoPE frequency scaling method, defaults to linear unless specified by the model",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
+    add_opt(common_arg(
+        {"--rope-scale"}, "N",
+        "RoPE context scaling factor, expands context by a factor of N",
+        [](common_params & params, const std::string & value) {
+            params.rope_freq_scale = 1.0f / std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
+    add_opt(common_arg(
+        {"--rope-freq-base"}, "N",
+        "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
+        [](common_params & params, const std::string & value) {
+            params.rope_freq_base = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
+    add_opt(common_arg(
+        {"--rope-freq-scale"}, "N",
+        "RoPE frequency scaling factor, expands context by a factor of 1/N",
+        [](common_params & params, const std::string & value) {
+            params.rope_freq_scale = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
+    add_opt(common_arg(
+        {"--yarn-orig-ctx"}, "N",
+        string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+        [](common_params & params, int value) {
+            params.yarn_orig_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
+    add_opt(common_arg(
+        {"--yarn-ext-factor"}, "N",
+        string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        [](common_params & params, const std::string & value) {
+            params.yarn_ext_factor = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
+    add_opt(common_arg(
+        {"--yarn-attn-factor"}, "N",
+        string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        [](common_params & params, const std::string & value) {
+            params.yarn_attn_factor = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
+    add_opt(common_arg(
+        {"--yarn-beta-slow"}, "N",
+        string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        [](common_params & params, const std::string & value) {
+            params.yarn_beta_slow = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
+    add_opt(common_arg(
+        {"--yarn-beta-fast"}, "N",
+        string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+        [](common_params & params, const std::string & value) {
+            params.yarn_beta_fast = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
+    add_opt(common_arg(
+        {"-gan", "--grp-attn-n"}, "N",
+        string_format("group-attention factor (default: %d)", params.grp_attn_n),
+        [](common_params & params, int value) {
+            params.grp_attn_n = value;
+        }
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+    add_opt(common_arg(
+        {"-gaw", "--grp-attn-w"}, "N",
+        string_format("group-attention width (default: %d)", params.grp_attn_w),
+        [](common_params & params, int value) {
+            params.grp_attn_w = value;
+        }
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-dkvc", "--dump-kv-cache"},
+        "verbose print of the KV cache",
+        [](common_params & params) {
+            params.dump_kv_cache = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"-nkvo", "--no-kv-offload"},
+        "disable KV offload",
+        [](common_params & params) {
+            params.no_kv_offload = true;
+        }
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-ctk", "--cache-type-k"}, "TYPE",
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
+    add_opt(common_arg(
+        {"-ctv", "--cache-type-v"}, "TYPE",
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
+    add_opt(common_arg(
+        {"--perplexity", "--all-logits"},
+        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
+        [](common_params & params) {
+            params.logits_all = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--hellaswag"},
+        "compute HellaSwag score over random tasks from datafile supplied with -f",
+        [](common_params & params) {
+            params.hellaswag = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--hellaswag-tasks"}, "N",
+        string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
+        [](common_params & params, int value) {
+            params.hellaswag_tasks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--winogrande"},
+        "compute Winogrande score over random tasks from datafile supplied with -f",
+        [](common_params & params) {
+            params.winogrande = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--winogrande-tasks"}, "N",
+        string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
+        [](common_params & params, int value) {
+            params.winogrande_tasks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--multiple-choice"},
+        "compute multiple choice score over random tasks from datafile supplied with -f",
+        [](common_params & params) {
+            params.multiple_choice = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--multiple-choice-tasks"}, "N",
+        string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
+        [](common_params & params, int value) {
+            params.multiple_choice_tasks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--kl-divergence"},
+        "computes KL-divergence to logits provided via --kl-divergence-base",
+        [](common_params & params) {
+            params.kl_divergence = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
+        "set logits file",
+        [](common_params & params, const std::string & value) {
+            params.logits_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--ppl-stride"}, "N",
+        string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
+        [](common_params & params, int value) {
+            params.ppl_stride = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"--ppl-output-type"}, "<0|1>",
+        string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
+        [](common_params & params, int value) {
+            params.ppl_output_type = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"-dt", "--defrag-thold"}, "N",
+        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        [](common_params & params, const std::string & value) {
+            params.defrag_thold = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
+    add_opt(common_arg(
+        {"-np", "--parallel"}, "N",
+        string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+        [](common_params & params, int value) {
+            params.n_parallel = value;
+        }
+    ).set_env("LLAMA_ARG_N_PARALLEL"));
+    add_opt(common_arg(
+        {"-ns", "--sequences"}, "N",
+        string_format("number of sequences to decode (default: %d)", params.n_sequences),
+        [](common_params & params, int value) {
+            params.n_sequences = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
+    add_opt(common_arg(
+        {"-cb", "--cont-batching"},
+        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.cont_batching = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
+    add_opt(common_arg(
+        {"-nocb", "--no-cont-batching"},
+        "disable continuous batching",
+        [](common_params & params) {
+            params.cont_batching = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+    add_opt(common_arg(
+        {"--mmproj"}, "FILE",
+        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        [](common_params & params, const std::string & value) {
+            params.mmproj = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    add_opt(common_arg(
+        {"--image"}, "FILE",
+        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        [](common_params & params, const std::string & value) {
+            params.image.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    if (llama_supports_rpc()) {
+        add_opt(common_arg(
+            {"--rpc"}, "SERVERS",
+            "comma separated list of RPC servers",
+            [](common_params & params, const std::string & value) {
+                add_rpc_devices(value);
+                GGML_UNUSED(params);
+            }
+        ).set_env("LLAMA_ARG_RPC"));
+    }
+    add_opt(common_arg(
+        {"--mlock"},
+        "force system to keep model in RAM rather than swapping or compressing",
+        [](common_params & params) {
+            params.use_mlock = true;
+        }
+    ).set_env("LLAMA_ARG_MLOCK"));
+    add_opt(common_arg(
+        {"--no-mmap"},
+        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
+        [](common_params & params) {
+            params.use_mmap = false;
+        }
+    ).set_env("LLAMA_ARG_NO_MMAP"));
+    add_opt(common_arg(
+        {"--numa"}, "TYPE",
+        "attempt optimizations that help on some NUMA systems\n"
+        "- distribute: spread execution evenly over all nodes\n"
+        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
+        "- numactl: use the CPU map provided by numactl\n"
+        "if run without this previously, it is recommended to drop the system page cache before using this\n"
+        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            std::vector<ggml_backend_dev_t> rpc_devices;
+            std::vector<ggml_backend_dev_t> all_devices;
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_devices.push_back(dev);
+                    } else {
+                        all_devices.push_back(dev);
+                    }
+                }
+            }
+            // insert RPC devices in front
+            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
+            printf("Available devices:\n");
+            for (size_t i = 0; i < all_devices.size(); ++i) {
+                auto * dev = all_devices[i];
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
+            exit(0);
+        }
+    ));
+    add_opt(common_arg(
+        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
+        "number of layers to store in VRAM",
+        [](common_params & params, int value) {
+            params.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+            }
+        }
+    ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(common_arg(
+        {"-sm", "--split-mode"}, "{none,layer,row}",
+        "how to split the model across multiple GPUs, one of:\n"
+        "- none: use one GPU only\n"
+        "- layer (default): split layers and KV across GPUs\n"
+        "- row: split rows across GPUs",
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+            if (arg_next == "none") {
+                params.split_mode = LLAMA_SPLIT_MODE_NONE;
+            } else if (arg_next == "layer") {
+                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+            } else if (arg_next == "row") {
+                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
+            }
+        }
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
+    add_opt(common_arg(
+        {"-ts", "--tensor-split"}, "N0,N1,N2,...",
+        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                );
+            }
+            for (size_t i = 0; i < llama_max_devices(); ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
+            }
+        }
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
+    add_opt(common_arg(
+        {"-mg", "--main-gpu"}, "INDEX",
+        string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
+        [](common_params & params, int value) {
+            params.main_gpu = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
+            }
+        }
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        {"--check-tensors"},
+        string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
+        [](common_params & params) {
+            params.check_tensors = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--override-kv"}, "KEY=TYPE:VALUE",
+        "advanced option to override model metadata by key. may be specified multiple times.\n"
+        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+        [](common_params & params, const std::string & value) {
+            if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
+                throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
+            }
+        }
+    ));
+    add_opt(common_arg(
+        {"--lora"}, "FNAME",
+        "path to LoRA adapter (can be repeated to use multiple adapters)",
+        [](common_params & params, const std::string & value) {
+            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+        }
+        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(common_arg(
+        {"--lora-scaled"}, "FNAME", "SCALE",
+        "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
+        [](common_params & params, const std::string & fname, const std::string & scale) {
+            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+        }
+        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(common_arg(
+        {"--control-vector"}, "FNAME",
+        "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+        [](common_params & params, const std::string & value) {
+            params.control_vectors.push_back({ 1.0f, value, });
+        }
+    ));
+    add_opt(common_arg(
+        {"--control-vector-scaled"}, "FNAME", "SCALE",
+        "add a control vector with user defined scaling SCALE\n"
+        "note: this argument can be repeated to add multiple scaled control vectors",
+        [](common_params & params, const std::string & fname, const std::string & scale) {
+            params.control_vectors.push_back({ std::stof(scale), fname });
+        }
+    ));
+    add_opt(common_arg(
+        {"--control-vector-layer-range"}, "START", "END",
+        "layer range to apply the control vector(s) to, start and end inclusive",
+        [](common_params & params, const std::string & start, const std::string & end) {
+            params.control_vector_layer_start = std::stoi(start);
+            params.control_vector_layer_end = std::stoi(end);
+        }
+    ));
+    add_opt(common_arg(
+        {"-a", "--alias"}, "STRING",
+        "set alias for model name (to be used by REST API)",
+        [](common_params & params, const std::string & value) {
+            params.model_alias = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
+    add_opt(common_arg(
+        {"-m", "--model"}, "FNAME",
+        ex == LLAMA_EXAMPLE_EXPORT_LORA
+            ? std::string("model path from which to load base model")
+            : string_format(
+                "model path (default: `models/$filename` with filename from `--hf-file` "
+                "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
+            ),
+        [](common_params & params, const std::string & value) {
+            params.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
+    add_opt(common_arg(
+        {"-mu", "--model-url"}, "MODEL_URL",
+        "model download url (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model_url = value;
+        }
+    ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
+        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "example: unsloth/phi-4-GGUF:q4_k_m\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HF_REPO"));
+    add_opt(common_arg(
+        {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
+        "Same as --hf-repo, but for the draft model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HFD_REPO"));
+    add_opt(common_arg(
+        {"-hff", "--hf-file"}, "FILE",
+        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.hf_file = value;
+        }
+    ).set_env("LLAMA_ARG_HF_FILE"));
+    add_opt(common_arg(
+        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
+        "Hugging Face model repository for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HF_REPO_V"));
+    add_opt(common_arg(
+        {"-hffv", "--hf-file-v"}, "FILE",
+        "Hugging Face model file for the vocoder model (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.hf_file = value;
+        }
+    ).set_env("LLAMA_ARG_HF_FILE_V"));
+    add_opt(common_arg(
+        {"-hft", "--hf-token"}, "TOKEN",
+        "Hugging Face access token (default: value from HF_TOKEN environment variable)",
+        [](common_params & params, const std::string & value) {
+            params.hf_token = value;
+        }
+    ).set_env("HF_TOKEN"));
+    add_opt(common_arg(
+        {"--context-file"}, "FNAME",
+        "file to load context from (repeat to specify multiple files)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value, std::ios::binary);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            params.context_files.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(common_arg(
+        {"--chunk-size"}, "N",
+        string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
+        [](common_params & params, int value) {
+            params.chunk_size = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(common_arg(
+        {"--chunk-separator"}, "STRING",
+        string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.chunk_separator = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(common_arg(
+        {"--junk"}, "N",
+        string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
+        [](common_params & params, int value) {
+            params.n_junk = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    add_opt(common_arg(
+        {"--pos"}, "N",
+        string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
+        [](common_params & params, int value) {
+            params.i_pos = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    add_opt(common_arg(
+        {"-o", "--output", "--output-file"}, "FNAME",
+        string_format("output file (default: '%s')",
+            ex == LLAMA_EXAMPLE_EXPORT_LORA
+                ? params.lora_outfile.c_str()
+                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
+                    ? params.cvector_outfile.c_str()
+                    : params.out_file.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.out_file = value;
+            params.cvector_outfile = value;
+            params.lora_outfile = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(common_arg(
+        {"-ofreq", "--output-frequency"}, "N",
+        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
+        [](common_params & params, int value) {
+            params.n_out_freq = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--save-frequency"}, "N",
+        string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
+        [](common_params & params, int value) {
+            params.n_save_freq = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--process-output"},
+        string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
+        [](common_params & params) {
+            params.process_output = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--no-ppl"},
+        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params) {
+            params.compute_ppl = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--chunk", "--from-chunk"}, "N",
+        string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
+        [](common_params & params, int value) {
+            params.i_chunk = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"-pps"},
+        string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
+        [](common_params & params) {
+            params.is_pp_shared = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(common_arg(
+        {"-npp"}, "n0,n1,...",
+        "number of prompt tokens",
+        [](common_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(common_arg(
+        {"-ntg"}, "n0,n1,...",
+        "number of text generation tokens",
+        [](common_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(common_arg(
+        {"-npl"}, "n0,n1,...",
+        "number of parallel prompts",
+        [](common_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(common_arg(
+        {"--embd-normalize"}, "N",
+        string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        [](common_params & params, int value) {
+            params.embd_normalize = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--embd-output-format"}, "FORMAT",
+        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+        [](common_params & params, const std::string & value) {
+            params.embd_out = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--embd-separator"}, "STRING",
+        "separator of embeddings (default \\n) for example \"<#sep#>\"",
+        [](common_params & params, const std::string & value) {
+            params.embd_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--host"}, "HOST",
+        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.hostname = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+    add_opt(common_arg(
+        {"--port"}, "PORT",
+        string_format("port to listen (default: %d)", params.port),
+        [](common_params & params, int value) {
+            params.port = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+    add_opt(common_arg(
+        {"--path"}, "PATH",
+        string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.public_path = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    add_opt(common_arg(
+        {"--embedding", "--embeddings"},
+        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    add_opt(common_arg(
+        {"--reranking", "--rerank"},
+        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.reranking = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
+    add_opt(common_arg(
+        {"--api-key"}, "KEY",
+        "API key to use for authentication (default: none)",
+        [](common_params & params, const std::string & value) {
+            params.api_keys.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
+    add_opt(common_arg(
+        {"--api-key-file"}, "FNAME",
+        "path to file containing API keys (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream key_file(value);
+            if (!key_file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::string key;
+            while (std::getline(key_file, key)) {
+                if (!key.empty()) {
+                        params.api_keys.push_back(key);
+                }
+            }
+            key_file.close();
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--ssl-key-file"}, "FNAME",
+        "path to file a PEM-encoded SSL private key",
+        [](common_params & params, const std::string & value) {
+            params.ssl_file_key = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
+    add_opt(common_arg(
+        {"--ssl-cert-file"}, "FNAME",
+        "path to file a PEM-encoded SSL certificate",
+        [](common_params & params, const std::string & value) {
+            params.ssl_file_cert = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"-to", "--timeout"}, "N",
+        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+        [](common_params & params, int value) {
+            params.timeout_read  = value;
+            params.timeout_write = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
+    add_opt(common_arg(
+        {"--threads-http"}, "N",
+        string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+        [](common_params & params, int value) {
+            params.n_threads_http = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(common_arg(
+        {"--cache-reuse"}, "N",
+        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        [](common_params & params, int value) {
+            params.n_cache_reuse = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
+        {"--metrics"},
+        string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_metrics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(common_arg(
+        {"--props"},
+        string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
+        {"--no-slots"},
+        "disables slots monitoring endpoint",
+        [](common_params & params) {
+            params.endpoint_slots = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+    add_opt(common_arg(
+        {"--slot-save-path"}, "PATH",
+        "path to save slot kv cache (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.slot_save_path = value;
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                params.slot_save_path += DIRECTORY_SEPARATOR;
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--jinja"},
+        "use jinja template for chat (default: disabled)",
+        [](common_params & params) {
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--chat-template"}, "JINJA_TEMPLATE",
+        string_format(
+            "set custom jinja chat template (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
+        [](common_params & params, const std::string & value) {
+            params.chat_template = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    add_opt(common_arg(
+        {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
+        string_format(
+            "set custom jinja chat template file (default: template taken from model's metadata)\n"
+            "if suffix/prefix are specified, template will be disabled\n"
+            "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
+            "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+        ),
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.chat_template));
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
+        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        [](common_params & params, const std::string & value) {
+            params.slot_prompt_similarity = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--lora-init-without-apply"},
+        string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.lora_init_without_apply = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--simple-io"},
+        "use basic IO for better compatibility in subprocesses and limited consoles",
+        [](common_params & params) {
+            params.simple_io = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    add_opt(common_arg(
+        {"--positive-file"}, "FNAME",
+        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.cvector_positive_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(common_arg(
+        {"--negative-file"}, "FNAME",
+        string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.cvector_negative_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(common_arg(
+        {"--pca-batch"}, "N",
+        string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+        [](common_params & params, int value) {
+            params.n_pca_batch = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(common_arg(
+        {"--pca-iter"}, "N",
+        string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+        [](common_params & params, int value) {
+            params.n_pca_iterations = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(common_arg(
+        {"--method"}, "{pca, mean}",
+        "dimensionality reduction method to be used (default: pca)",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+            else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(common_arg(
+        {"--output-format"}, "{md,jsonl}",
+        "output format for batched-bench results (default: md)",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
+            else if (value == "md") { params.batched_bench_output_jsonl = false; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(common_arg(
+        {"--log-disable"},
+        "Log disable",
+        [](common_params &) {
+            common_log_pause(common_log_main());
+        }
+    ));
+    add_opt(common_arg(
+        {"--log-file"}, "FNAME",
+        "Log to file",
+        [](common_params &, const std::string & value) {
+            common_log_set_file(common_log_main(), value.c_str());
+        }
+    ));
+    add_opt(common_arg(
+        {"--log-colors"},
+        "Enable colored logging",
+        [](common_params &) {
+            common_log_set_colors(common_log_main(), true);
+        }
+    ).set_env("LLAMA_LOG_COLORS"));
+    add_opt(common_arg(
+        {"-v", "--verbose", "--log-verbose"},
+        "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
+        [](common_params & params) {
+            params.verbosity = INT_MAX;
+            common_log_set_verbosity_thold(INT_MAX);
+        }
+    ));
+    add_opt(common_arg(
+        {"-lv", "--verbosity", "--log-verbosity"}, "N",
+        "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+        [](common_params & params, int value) {
+            params.verbosity = value;
+            common_log_set_verbosity_thold(value);
+        }
+    ).set_env("LLAMA_LOG_VERBOSITY"));
+    add_opt(common_arg(
+        {"--log-prefix"},
+        "Enable prefx in log messages",
+        [](common_params &) {
+            common_log_set_prefix(common_log_main(), true);
+        }
+    ).set_env("LLAMA_LOG_PREFIX"));
+    add_opt(common_arg(
+        {"--log-timestamps"},
+        "Enable timestamps in log messages",
+        [](common_params &) {
+            common_log_set_timestamps(common_log_main(), true);
+        }
+    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+
+    // speculative parameters
+    add_opt(common_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.n_threads = value;
+            if (params.speculative.cpuparams.n_threads <= 0) {
+                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.n_threads = value;
+            if (params.speculative.cpuparams_batch.n_threads <= 0) {
+                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-batch-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: --poll-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-max", "--draft", "--draft-n"}, "N",
+        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+        [](common_params & params, int value) {
+            params.speculative.n_max = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    add_opt(common_arg(
+        {"--draft-min", "--draft-n-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+        [](common_params & params, int value) {
+            params.speculative.n_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    add_opt(common_arg(
+        {"--draft-p-split"}, "P",
+        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
+    add_opt(common_arg(
+        {"--draft-p-min"}, "P",
+        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_min = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    add_opt(common_arg(
+        {"-cd", "--ctx-size-draft"}, "N",
+        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+        [](common_params & params, int value) {
+            params.speculative.n_ctx = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](common_params & params, int value) {
+            params.speculative.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    add_opt(common_arg(
+        {"-md", "--model-draft"}, "FNAME",
+        "draft model for speculative decoding (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+
+    add_opt(common_arg(
+        {"-mv", "--model-vocoder"}, "FNAME",
+        "vocoder model for audio generation (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+     add_opt(common_arg(
+        {"--tts-use-guide-tokens"},
+        "Use guide tokens to improve TTS word recall",
+        [](common_params & params) {
+            params.vocoder.use_guide_tokens = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+
+    // model-specific
+    add_opt(common_arg(
+        {"--tts-oute-default"},
+        string_format("use default OuteTTS models (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
+
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    return ctx_arg;
+}
diff --git a/common/arg.h b/common/arg.h
new file mode 100644
index 000000000..49ab8667b
--- /dev/null
+++ b/common/arg.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "common.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+//
+// CLI argument parsing
+//
+
+struct common_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::set<enum llama_example> excludes = {};
+    std::vector<const char *> args;
+    const char * value_hint   = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env          = nullptr;
+    std::string help;
+    bool is_sparam = false; // is current arg a sampling param?
+    void (*handler_void)   (common_params & params) = nullptr;
+    void (*handler_string) (common_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (common_params & params, int) = nullptr;
+
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(common_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(common_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(common_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(common_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
+    common_arg & set_env(const char * env);
+    common_arg & set_sparam();
+    bool in_example(enum llama_example ex);
+    bool is_exclude(enum llama_example ex);
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
+};
+
+struct common_params_context {
+    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    common_params & params;
+    std::vector<common_arg> options;
+    void(*print_usage)(int, char **) = nullptr;
+    common_params_context(common_params & params) : params(params) {}
+};
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+// function to be used by test-arg-parser
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/chat-template.hpp b/common/chat-template.hpp
new file mode 100644
index 000000000..882ba41bd
--- /dev/null
+++ b/common/chat-template.hpp
@@ -0,0 +1,529 @@
+/*
+    Copyright 2024 Google LLC
+
+    Use of this source code is governed by an MIT-style
+    license that can be found in the LICENSE file or at
+    https://opensource.org/licenses/MIT.
+*/
+// SPDX-License-Identifier: MIT
+#pragma once
+
+#include "minja.hpp"
+#include <json.hpp>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+namespace minja {
+
+struct chat_template_caps {
+    bool supports_tools = false;
+    bool supports_tool_calls = false;
+    bool supports_tool_responses = false;
+    bool supports_system_role = false;
+    bool supports_parallel_tool_calls = false;
+    bool supports_tool_call_id = false;
+    // meta-llama/Llama-3.1-8B-Instruct expects arguments to be an object.
+    // Most other templates (and OpenAI's API) expect the arguments object to be stringified.
+    bool requires_object_arguments = false;
+    // CohereForAI/c4ai-command-r-plus simple variant
+    bool requires_non_null_content = false;
+    // MiniMaxAI/MiniMax-Text-01 special
+    bool requires_typed_content = false;
+};
+
+struct chat_template_inputs {
+    nlohmann::ordered_json messages;
+    nlohmann::ordered_json tools;
+    bool add_generation_prompt = true;
+    nlohmann::ordered_json extra_context;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+};
+
+struct chat_template_options {
+    bool apply_polyfills = true;
+    bool use_bos_token = true;
+    bool use_eos_token = true;
+    bool define_strftime_now = true;
+
+    bool polyfill_tools = true;
+    bool polyfill_tool_call_examples = true;
+    bool polyfill_tool_calls = true;
+    bool polyfill_tool_responses = true;
+    bool polyfill_system_role = true;
+    bool polyfill_object_arguments = true;
+    bool polyfill_typed_content = true;
+};
+
+class chat_template {
+
+  private:
+    chat_template_caps caps_;
+    std::string source_;
+    std::string bos_token_;
+    std::string eos_token_;
+    std::shared_ptr<minja::TemplateNode> template_root_;
+    std::string tool_call_example_;
+
+    std::string try_raw_render(
+        const nlohmann::ordered_json & messages,
+        const nlohmann::ordered_json & tools,
+        bool add_generation_prompt,
+        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
+    {
+        try {
+            chat_template_inputs inputs;
+            inputs.messages = messages;
+            inputs.tools = tools;
+            inputs.add_generation_prompt = add_generation_prompt;
+            inputs.extra_context = extra_context;
+            // Use fixed date for tests
+            inputs.now = std::chrono::system_clock::from_time_t(0);
+
+            chat_template_options opts;
+            opts.apply_polyfills = false;
+
+            auto prompt = apply(inputs, opts);
+            // fprintf(stderr, "try_raw_render: %s\n", prompt.c_str());
+            return prompt;
+        } catch (const std::exception & e) {
+            // fprintf(stderr, "try_raw_render error: %s\n", e.what());
+            return "";
+        }
+    }
+
+  public:
+
+    chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
+        : source_(source), bos_token_(bos_token), eos_token_(eos_token)
+    {
+        template_root_ = minja::Parser::parse(source_, {
+            /* .trim_blocks = */ true,
+            /* .lstrip_blocks = */ true,
+            /* .keep_trailing_newline = */ false,
+        });
+
+        auto contains = [](const std::string & haystack, const std::string & needle) {
+            return haystack.find(needle) != std::string::npos;
+        };
+
+        const std::string user_needle = "<User Needle>";
+        const std::string sys_needle = "<System Needle>";
+        const json dummy_str_user_msg = {{"role", "user"}, {"content", user_needle}};
+        const json dummy_typed_user_msg = {{"role", "user"}, {"content", json::array({{{"type", "text"}, {"text", user_needle}}})}};
+
+        caps_.requires_typed_content =
+            !contains(try_raw_render(json::array({dummy_str_user_msg}), {}, false), user_needle)
+            && contains(try_raw_render(json::array({dummy_typed_user_msg}), {}, false), user_needle);
+
+        const auto dummy_user_msg = caps_.requires_typed_content
+            ? dummy_typed_user_msg
+            : dummy_str_user_msg;
+        const json needle_system_msg = {
+            {"role", "system"},
+            {"content", caps_.requires_typed_content ? json::array({{{"type", "text"}, {"text", sys_needle}}}) : json(sys_needle)},
+        };
+
+        caps_.supports_system_role = contains(try_raw_render({needle_system_msg, dummy_user_msg,}, {}, false), sys_needle);
+
+        auto out = try_raw_render(json::array({
+            dummy_user_msg
+        }), json::array({
+            {
+                {"name", "some_tool"},
+                {"type", "function"},
+                {"function", {
+                    {"name", "some_tool"},
+                    {"description", "Some tool."},
+                    {"parameters", {
+                        {"type", "object"},
+                        {"properties", {
+                            {"arg", {
+                                {"type", "string"},
+                                {"description", "Some argument."},
+                            }},
+                        }},
+                        {"required", json::array({ "arg" })},
+                    }},
+                }},
+            },
+        }), false);
+        caps_.supports_tools = contains(out, "some_tool");
+
+        auto make_tool_calls_msg = [&](const json & tool_calls) {
+            return json {
+                {"role", "assistant"},
+                {"content", nullptr},
+                {"tool_calls", tool_calls},
+            };
+        };
+        auto make_tool_call = [](const std::string & tool_name, const json & arguments) {
+            return json {
+                {"id", "call_1___"},
+                {"type", "function"},
+                {"function", {
+                    {"arguments", arguments},
+                    {"name", tool_name},
+                }},
+            };
+        };
+        const json dummy_args_obj {{"argument_needle", "print('Hello, World!')"}};
+
+        // Note: the arguments are rendered in both cases, but may be double-escaped, which we don't want.
+        out = try_raw_render(json::array({
+            dummy_user_msg,
+            make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
+        }), {}, false);
+        auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
+        out = try_raw_render(json::array({
+            dummy_user_msg,
+            make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
+        }), {}, false);
+        auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
+
+        caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
+        caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
+        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
+        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
+        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
+
+        if (caps_.supports_tool_calls) {
+            auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
+            auto tc1 = make_tool_call("test_tool1", dummy_args);
+            auto tc2 = make_tool_call("test_tool2", dummy_args);
+            auto out = try_raw_render(json::array({
+                dummy_user_msg,
+                make_tool_calls_msg(json::array({tc1, tc2})),
+            }), {}, false);
+            caps_.supports_parallel_tool_calls = contains(out, "test_tool1") && contains(out, "test_tool2");
+
+            out = try_raw_render(json::array({
+                dummy_user_msg,
+                make_tool_calls_msg(json::array({tc1})),
+                {
+                    {"role", "tool"},
+                    {"name", "test_tool1"},
+                    {"content", "Some response!"},
+                    {"tool_call_id", "call_911_"},
+                }
+            }), {}, false);
+            caps_.supports_tool_responses = contains(out, "Some response!");
+            caps_.supports_tool_call_id = contains(out, "call_911_");
+        }
+
+        try {
+            if (!caps_.supports_tools) {
+                const json user_msg {
+                    {"role", "user"},
+                    {"content", "Hey"},
+                };
+                const json args {
+                    {"arg1", "some_value"},
+                };
+                const json tool_call_msg {
+                    {"role", "assistant"},
+                    {"content", nullptr},
+                    {"tool_calls", json::array({
+                        {
+                            // TODO: detect if requires numerical id or fixed length == 6 like Nemo
+                            {"id", "call_1___"},
+                            {"type", "function"},
+                            {"function", {
+                                {"name", "tool_name"},
+                                {"arguments", (caps_.requires_object_arguments ? args : json(minja::Value(args).dump(-1, /* to_json= */ true)))},
+                            }},
+                        },
+                    })},
+                };
+                std::string prefix, full;
+                {
+                    chat_template_inputs inputs;
+                    inputs.messages = json::array({user_msg});
+                    inputs.add_generation_prompt = true;
+                    prefix = apply(inputs);
+                }
+                {
+                    chat_template_inputs inputs;
+                    inputs.messages = json::array({user_msg, tool_call_msg});
+                    inputs.add_generation_prompt = false;
+                    full = apply(inputs);
+                }
+                auto eos_pos_last = full.rfind(eos_token_);
+                if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                      (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                    full = full.substr(0, eos_pos_last);
+                }
+                size_t common_prefix_length = 0;
+                for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                    if (prefix[i] != full[i]) {
+                        break;
+                    }
+                    if (prefix[i] == '<') {
+                        // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                        // but it removes thinking tags for past messages.
+                        // The prefix and full strings diverge at <think> vs. <｜tool▁calls▁begin｜>, we avoid consuming the leading <.
+                        continue;
+                    }
+                    common_prefix_length = i + 1;
+                }
+                auto example = full.substr(common_prefix_length);
+                if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
+                    fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+                } else {
+                    tool_call_example_ = example;
+                }
+            }
+        } catch (const std::exception & e) {
+            fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
+        }
+    }
+
+    const std::string & source() const { return source_; }
+    const std::string & bos_token() const { return bos_token_; }
+    const std::string & eos_token() const { return eos_token_; }
+    const chat_template_caps & original_caps() const { return caps_; }
+
+    // Deprecated, please use the form with chat_template_inputs and chat_template_options
+    std::string apply(
+        const nlohmann::ordered_json & messages,
+        const nlohmann::ordered_json & tools,
+        bool add_generation_prompt,
+        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
+        bool apply_polyfills = true)
+    {
+        fprintf(stderr, "[%s] Deprecated!\n", __func__);
+        chat_template_inputs inputs;
+        inputs.messages = messages;
+        inputs.tools = tools;
+        inputs.add_generation_prompt = add_generation_prompt;
+        inputs.extra_context = extra_context;
+        inputs.now = std::chrono::system_clock::now();
+
+        chat_template_options opts;
+        opts.apply_polyfills = apply_polyfills;
+
+        return apply(inputs, opts);
+    }
+
+    std::string apply(
+        const chat_template_inputs & inputs,
+        const chat_template_options & opts = chat_template_options()) const
+    {
+        json actual_messages;
+
+        auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+        auto has_tool_calls = false;
+        auto has_tool_responses = false;
+        auto has_string_content = false;
+        for (const auto & message : inputs.messages) {
+            if (message.contains("tool_calls") && !message["tool_calls"].is_null()) {
+                has_tool_calls = true;
+            }
+            if (message.contains("role") && message["role"] == "tool") {
+                has_tool_responses = true;
+            }
+            if (message.contains("content") && message["content"].is_string()) {
+                has_string_content = true;
+            }
+        }
+
+        auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
+        auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
+        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
+        auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
+        auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
+        auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
+        auto polyfill_typed_content = opts.polyfill_typed_content && has_string_content && caps_.requires_typed_content;
+
+        auto needs_polyfills = opts.apply_polyfills && (false
+            || polyfill_system_role
+            || polyfill_tools
+            || polyfill_tool_calls
+            || polyfill_tool_responses
+            || polyfill_object_arguments
+            || polyfill_typed_content
+        );
+
+        if (needs_polyfills) {
+            actual_messages = json::array();
+
+            auto add_message = [&](const json & msg) {
+                if (polyfill_typed_content && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
+                    actual_messages.push_back({
+                        {"role", msg.at("role")},
+                        {"content", {{
+                            {"type", "text"},
+                            {"text", msg.at("content")},
+                        }}},
+                    });
+                } else {
+                    actual_messages.push_back(msg);
+                }
+            };
+
+            std::string pending_system;
+            auto flush_sys = [&]() {
+                if (!pending_system.empty()) {
+                    add_message({
+                        {"role", "user"},
+                        {"content", pending_system},
+                    });
+                    pending_system.clear();
+                }
+            };
+
+            json adjusted_messages;
+            if (polyfill_tools) {
+                adjusted_messages = add_system(inputs.messages,
+                    "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
+                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
+            } else {
+                adjusted_messages = inputs.messages;
+            }
+
+            for (const auto & message_ : adjusted_messages) {
+                auto message = message_;
+                if (!message.contains("role") || !message.contains("content")) {
+                    throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+                }
+                std::string role = message.at("role");
+
+                if (message.contains("tool_calls")) {
+                    if (polyfill_object_arguments || polyfill_tool_calls) {
+                        for (auto & tool_call : message.at("tool_calls")) {
+                            if (tool_call["type"] == "function") {
+                                auto & function = tool_call.at("function");
+                                auto & arguments = function.at("arguments");
+                                if (arguments.is_string()) {
+                                    try {
+                                        arguments = json::parse(arguments.get<std::string>());
+                                    } catch (const std::exception & ecvt) {
+                                        fprintf(stderr, "Failed to parse arguments: %s\n", ecvt.what());
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    if (polyfill_tool_calls) {
+                        auto content = message.at("content");
+                        auto tool_calls = json::array();
+                        for (const auto & tool_call : message.at("tool_calls")) {
+                            if (tool_call.at("type") != "function") {
+                                continue;
+                            }
+                            const auto & function = tool_call.at("function");
+                            auto tc = json {
+                                {"name", function.at("name")},
+                                {"arguments", function.at("arguments")},
+                            };
+                            if (tool_call.contains("id")) {
+                                tc["id"] = tool_call["id"];
+                            }
+                            tool_calls.push_back(tc);
+                        }
+                        auto obj = json {
+                            {"tool_calls", tool_calls},
+                        };
+                        if (!content.is_null() && content != "") {
+                            obj["content"] = content;
+                        }
+                        message["content"] = obj.dump(2);
+                        message.erase("tool_calls");
+                    }
+                }
+                if (polyfill_tool_responses && role == "tool") {
+                    message["role"] = "user";
+                    auto obj = json {
+                        {"tool_response", {
+                            {"content", message.at("content")},
+                        }},
+                    };
+                    if (message.contains("name")) {
+                        obj["tool_response"]["name"] = message.at("name");
+                    }
+                    if (message.contains("tool_call_id")) {
+                        obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
+                    }
+                    message["content"] = obj.dump(2);
+                    message.erase("name");
+                }
+
+                if (!message["content"].is_null() && polyfill_system_role) {
+                    std::string content = message.at("content");
+                    if (role == "system") {
+                        if (!pending_system.empty()) pending_system += "\n";
+                        pending_system += content;
+                        continue;
+                    } else {
+                        if (role == "user") {
+                            if (!pending_system.empty()) {
+                                message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
+                                pending_system.clear();
+                            }
+                        } else {
+                            flush_sys();
+                        }
+                    }
+                }
+                add_message(message);
+            }
+            flush_sys();
+        } else {
+            actual_messages = inputs.messages;
+        }
+
+        auto context = minja::Context::make(json({
+            {"messages", actual_messages},
+            {"add_generation_prompt", inputs.add_generation_prompt},
+        }));
+        context->set("bos_token", opts.use_bos_token ? bos_token_ : "");
+        context->set("eos_token", opts.use_eos_token ? eos_token_ : "");
+        if (opts.define_strftime_now) {
+            auto now = inputs.now;
+            context->set("strftime_now", Value::callable([now](const std::shared_ptr<minja::Context> &, minja::ArgumentsValue & args) {
+                args.expectArgs("strftime_now", {1, 1}, {0, 0});
+                auto format = args.args[0].get<std::string>();
+
+                auto time = std::chrono::system_clock::to_time_t(now);
+                auto local_time = *std::localtime(&time);
+                std::ostringstream ss;
+                ss << std::put_time(&local_time, format.c_str());
+                return ss.str();
+            }));
+        }
+        if (!inputs.tools.is_null()) {
+            context->set("tools", minja::Value(inputs.tools));
+        }
+        if (!inputs.extra_context.is_null()) {
+            for (auto & kv : inputs.extra_context.items()) {
+                context->set(kv.key(), minja::Value(kv.value()));
+            }
+        }
+
+        auto ret = template_root_->render(context);
+        // fprintf(stderr, "actual_messages: %s\n", actual_messages.dump(2).c_str());
+        // fprintf(stderr, "apply: %s\n\n", ret.c_str());
+        return ret;
+    }
+
+    static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
+        json messages_with_system = messages;
+
+        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+            std::string existing_system = messages_with_system.at(0).at("content");
+            messages_with_system[0] = json {
+                {"role", "system"},
+                {"content", existing_system + "\n\n" + system_prompt},
+            };
+        } else {
+            messages_with_system.insert(messages_with_system.begin(), json {
+                {"role", "system"},
+                {"content", system_prompt},
+            });
+        }
+        return messages_with_system;
+    }
+};
+
+}  // namespace minja
diff --git a/common/chat.cpp b/common/chat.cpp
new file mode 100644
index 000000000..ef1c6fb3d
--- /dev/null
+++ b/common/chat.cpp
@@ -0,0 +1,966 @@
+#include "chat.hpp"
+#include "chat-template.hpp"
+#include "json-schema-to-grammar.h"
+#include "log.h"
+#include "minja.hpp"
+
+std::string common_chat_format_name(common_chat_format format) {
+    switch (format) {
+        case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
+        case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
+        case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
+        case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
+        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
+        case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
+        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
+        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
+        case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        default:
+            throw std::runtime_error("Unknown chat format");
+    }
+}
+
+const common_grammar_options grammar_options {
+    /* .dotall = */ false,
+    /* .compact_spaces = */ false,
+    // /* .compact_spaces = */ true,
+};
+
+static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
+    // // https://json.nlohmann.me/features/parsing/sax_interface/
+    struct json_error_locator : public nlohmann::json_sax<json> {
+        std::size_t position;
+        bool found_error;
+
+        json_error_locator() : position(0), found_error(false) {}
+
+        bool parse_error(std::size_t position, const std::string &, const json::exception &) override {
+            this->position = position - 1;
+            this->found_error = true;
+            return false;
+        }
+        bool null() override { return true; }
+        bool boolean(bool) override { return true; }
+        bool number_integer(number_integer_t) override { return true; }
+        bool number_unsigned(number_unsigned_t) override { return true; }
+        bool number_float(number_float_t, const string_t &) override { return true; }
+        bool string(string_t &) override { return true; }
+        bool binary(binary_t &) override { return true; }
+        bool start_object(std::size_t) override { return true; }
+        bool key(string_t &) override { return true; }
+        bool end_object() override { return true; }
+        bool start_array(std::size_t) override { return true; }
+        bool end_array() override { return true; }
+    };
+    json_error_locator err_loc;
+    json::sax_parse(it, end, &err_loc);
+
+    std::string::const_iterator temptative_end;
+    if (err_loc.found_error) {
+        temptative_end = it + err_loc.position;
+    } else {
+        temptative_end = end;
+    }
+    std::string json_sub {it, temptative_end};
+    try {
+        out = json::parse(json_sub);
+        it = temptative_end;
+        return true;
+    } catch (const std::exception &) {
+        return false;
+    }
+}
+
+
+/**
+ * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
+ * Aggregates the prefix, suffix and in-between text into the content.
+ */
+static common_chat_msg parse_json_tool_calls(
+    const std::string& input,
+    const std::optional<std::regex> & trigger_opt,
+    const std::regex & function_regex,
+    const std::regex & close_regex) {
+    std::smatch match;
+
+    common_chat_msg result;
+    result.role = "assistant";
+
+
+    auto end = input.end();
+    auto it = input.begin();
+
+    if (trigger_opt) {
+        if (!std::regex_search(it, end, match, *trigger_opt)) {
+            result.content = input;
+            return result;
+        }
+        result.content = match.prefix().str();
+        it = match.suffix().first;
+    }
+
+    while (it != end) {
+        std::sregex_iterator rend;
+        std::sregex_iterator rit(it, end, function_regex);
+        if (rit == rend) {
+            fprintf(stderr, "No more tool calls found\n");
+            result.content += std::string(it, end);
+            break;
+        }
+        auto name = rit->str(1);
+        result.content += std::string(it, rit->prefix().second);
+        it = rit->suffix().first;
+
+        json arguments;
+        if (!parse_json(it, end, arguments)) {
+            throw std::runtime_error("Failed to parse json tool call arguments");
+        }
+        if (!std::regex_search(it, end, match, close_regex)) {
+            throw std::runtime_error("Malformed input, missing closing pattern");
+        }
+        it = match.suffix().first;
+        result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
+    }
+    return result;
+}
+
+static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
+    auto content_end = input.find(prefix);
+    size_t tc_start = std::string::npos;
+
+    common_chat_msg result;
+    result.role = "assistant";
+    const auto process_tool_calls = [&](const json & tool_calls) {
+        for (const auto & tool_call : tool_calls) {
+            const auto & arguments = tool_call["arguments"];
+            result.tool_calls.push_back({
+                tool_call["name"],
+                arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
+                tool_call.contains("id") ? tool_call["id"] : "",
+            });
+        }
+    };
+    if (content_end == std::string::npos) {
+        result.content = input;
+    } else {
+        tc_start = content_end + prefix.size() - rstrip_prefix;
+        result.content = input.substr(0, content_end);
+        auto tool_calls = json::parse(input.substr(tc_start));
+        process_tool_calls(tool_calls);
+    }
+    return result;
+}
+
+static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
+    for (const auto & tool : tools) {
+        if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
+            LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
+            continue;
+        }
+        fn(tool);
+    }
+}
+
+static std::string apply(
+    const common_chat_template & tmpl,
+    const nlohmann::ordered_json & messages,
+    const nlohmann::ordered_json & tools,
+    bool add_generation_prompt,
+    const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+{
+    minja::chat_template_inputs tmpl_inputs;
+    tmpl_inputs.messages = messages;
+    tmpl_inputs.tools = tools;
+    tmpl_inputs.add_generation_prompt = add_generation_prompt;
+    tmpl_inputs.extra_context = extra_context;
+    // TODO: add flag to control date/time, if only for testing purposes.
+    // tmpl_inputs.now = std::chrono::system_clock::now();
+
+    minja::chat_template_options tmpl_opts;
+    tmpl_opts.use_bos_token = false;
+    tmpl_opts.use_eos_token = false;
+
+    return tmpl.apply(tmpl_inputs, tmpl_opts);
+}
+
+static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    common_chat_params data;
+
+    auto tool_call_schemas = json::array();
+    foreach_function(inputs.tools, [&](const json & tool) {
+        const auto & function = tool["function"];
+        auto tool_schema = json {
+            {"type", "object"},
+            {"properties", {
+                {"name", {
+                    {"type", "string"},
+                    {"const", function["name"]},
+                }},
+                {"arguments", function["parameters"]},
+            }},
+            {"required", json::array({"name", "arguments"})},
+        };
+        if (function.contains("description")) {
+            tool_schema["description"] = function["description"];
+        }
+        if (inputs.parallel_tool_calls) {
+            tool_schema["properties"]["id"] = {
+                {"type", "string"},
+                {"minLength", 4},
+            };
+            tool_schema["required"].push_back("id");
+        }
+        tool_call_schemas.emplace_back(tool_schema);
+    });
+    const auto tool_call =
+        inputs.parallel_tool_calls
+            ? json {
+                {"type", "object"},
+                {"properties", {
+                    {"tool_calls", {
+                        {"type", "array"},
+                        {"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
+                            {"anyOf", tool_call_schemas},
+                        }},
+                        {"minItems", 1},
+                    }},
+                }},
+                {"required", json::array({"tool_calls"})},
+            }
+            : json {
+                {"type", "object"},
+                {"properties", {
+                    {"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
+                        {"anyOf", tool_call_schemas},
+                    }},
+                }},
+                {"required", json::array({"tool_call"})},
+            };
+    const auto schema =
+        inputs.tool_choice != "required"
+            ? json {
+                {"anyOf", json::array({
+                    tool_call,
+                    {
+                        {"type", "object"},
+                        {"properties", {
+                            {"response", inputs.json_schema.is_null()
+                                ? json {{"type", "string"}}
+                                : inputs.json_schema
+                            },
+                        }},
+                        {"required", json::array({"response"})},
+                    },
+                })}
+            }
+            : tool_call;
+
+    data.grammar_lazy = false;
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        builder.add_schema("root", schema);
+    }, grammar_options);
+
+    auto tweaked_messages = common_chat_template::add_system(
+        inputs.messages,
+        "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
+
+    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.format = COMMON_CHAT_FORMAT_GENERIC;
+    return data;
+}
+static common_chat_msg common_chat_parse_generic(const std::string & input) {
+    json data = json::parse(input);
+    common_chat_msg result;
+    result.role = "assistant";
+    if (data.contains("tool_calls")) {
+        for (const auto & tool_call : data["tool_calls"]) {
+            result.tool_calls.push_back({
+                tool_call["name"],
+                tool_call["arguments"].dump(),
+                tool_call.contains("id") ? tool_call["id"] : "",
+            });
+        }
+    } else if (data.contains("tool_call")) {
+        result.tool_calls.push_back({
+            data["tool_call"]["name"],
+            data["tool_call"]["arguments"].dump(),
+            /* id= */ "",
+        });
+    } else if (data.contains("response")) {
+        const auto & response = data["response"];
+        result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
+    }
+    return result;
+}
+
+static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    common_chat_params data;
+    data.grammar_lazy = inputs.tool_choice != "required";
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        auto schemas = json::array();
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const auto & function = tool["function"];
+            schemas.push_back({
+                {"type", "object"},
+                {"properties", {
+                    // Important note: the model is probably trained to take a JSON stringified arguments value.
+                    // It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
+                    {"name", {
+                        {"type", "string"},
+                        {"const", function["name"]},
+                    }},
+                    {"arguments", function["parameters"]},
+                    {"id", {
+                        {"type", "string"},
+                        // Nemo's template expects a 9-character alphanumeric ID.
+                        {"pattern", "^[a-zA-Z0-9]{9}$"},
+                    }},
+                }},
+                {"required", json::array({"name", "arguments", "id"})},
+            });
+        });
+        auto schema = json {
+            {"type", "array"},
+            {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+            {"minItems", 1},
+        };
+        if (!inputs.parallel_tool_calls) {
+            schema["maxItems"] = 1;
+        }
+        builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
+    }, grammar_options);
+    data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true});
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
+    return data;
+}
+static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
+    return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
+}
+
+static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    common_chat_params data;
+    data.grammar_lazy = inputs.tool_choice != "required";
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        auto schemas = json::array();
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const auto & function = tool["function"];
+            schemas.push_back({
+                {"type", "object"},
+                {"properties", {
+                    {"tool_call_id", {
+                        {"type", "string"},
+                        // Command-R's template expects an integer string.
+                        {"pattern", "^[0-9]{1,10}$"},
+                    }},
+                    {"tool_name", {
+                        {"type", "string"},
+                        {"const", function["name"]},
+                    }},
+                    {"parameters", function["parameters"]},
+                }},
+                {"required", json::array({"tool_call_id", "tool_name", "parameters"})},
+            });
+        });
+        auto schema = json {
+            {"type", "array"},
+            {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+            {"minItems", 1},
+        };
+        if (!inputs.parallel_tool_calls) {
+            schema["maxItems"] = 1;
+        }
+        builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
+    }, grammar_options);
+    data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false});
+    data.preserved_tokens = {
+        "<|START_RESPONSE|>",
+        "<|END_RESPONSE|>",
+        "<|START_THINKING|>",
+        "<|END_THINKING|>",
+        "<|END_ACTION|>",
+    };
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
+    return data;
+}
+static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
+    static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
+    static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
+    std::smatch match;
+
+    common_chat_msg result;
+    result.role = "assistant";
+    if (std::regex_match(input, match, response_regex)) {
+        result.content = match[1].str();
+    } else if (std::regex_match(input, match, thought_action_regex)) {
+        result.tool_plan = match[1].str();
+        auto actions_str = match[2].str();
+        auto actions = json::parse(actions_str);
+        for (const auto & action : actions) {
+            result.tool_calls.push_back({
+                /* .name = */      action["tool_name"],
+                /* .arguments = */ action["parameters"].dump(),
+                /* .id = */        action["tool_call_id"],
+            });
+        }
+    } else {
+        LOG_ERR("Failed to parse command_r output");
+        result.content = input;
+    }
+    return result;
+}
+
+static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
+    if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
+        throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
+    }
+    const auto & parameters_properties = parameters.at("properties");
+    const auto & parameters_required = parameters.at("required");
+    for (const auto & prop : expected_properties) {
+        if (!parameters_properties.contains(prop)) {
+            throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop);
+        }
+        if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
+            throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop);
+        }
+    }
+    if (parameters_properties.size() != expected_properties.size()) {
+        throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
+    }
+}
+
+static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) {
+    auto builtin_tools = json::array();
+    common_chat_params data;
+    data.grammar_lazy = inputs.tool_choice != "required";
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        std::vector<std::string> tool_rules;
+
+        auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
+            if (name == "wolfram_alpha") {
+                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
+                expect_tool_parameters(name, parameters, {"query"});
+            } else if (name == "web_search" || name == "brave_search") {
+                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
+                expect_tool_parameters(name, parameters, {"query"});
+            } else if (name == "python" || name == "code_interpreter") {
+                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
+                expect_tool_parameters(name, parameters, {"code"});
+            } else {
+                return false;
+            }
+
+            std::vector<std::string> kvs;
+            for (const auto & [key, value] : parameters.at("properties").items()) {
+                kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value));
+            }
+
+            tool_rules.push_back(
+                builder.add_rule(
+                    name + "-call",
+                    "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
+            builtin_tools.push_back(name);
+
+            return true;
+        };
+
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const auto & function = tool["function"];
+            std::string name = function["name"];
+            auto parameters = function["parameters"];
+            builder.resolve_refs(parameters);
+
+            // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
+            if (allow_python_tag_builtin_tools) {
+                handle_builtin_tool(name, parameters);
+            }
+            tool_rules.push_back(
+                builder.add_rule(
+                    name + "-call",
+                    "\"{\" space "
+                    "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
+                    "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
+                        builder.add_schema(name + "-args", parameters) +
+                    " \"}\""));
+            data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
+        });
+        data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n  \"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n    \"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n  \"type\": \"function\"", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n    \"type\": \"function\"", /* .at_start = */ true});
+        if (!builtin_tools.empty()) {
+            data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
+        }
+        builder.add_rule("root", string_join(tool_rules, " | "));
+    }, grammar_options);
+    data.additional_stops.push_back("<|eom_id|>");
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+        {"tools_in_user_message", false},
+        {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
+    });
+    data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
+        ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
+        : COMMON_CHAT_FORMAT_LLAMA_3_X;
+    return data;
+}
+static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
+    // TODO: tighten & simplify the parser, don't accept leading text context.
+    static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": ");
+    static std::regex close_regex("\\}");
+    static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)");
+
+    if (with_builtin_tools) {
+        std::smatch match;
+        if (std::regex_match(input, match, builtin_call_regex)) {
+            auto name = match[1].str();
+            auto raw_args = match[2].str();
+
+            // TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing.
+            auto it_eq = raw_args.find('=');
+            auto arg_name = raw_args.substr(0, it_eq);
+            auto arg_value_str = raw_args.substr(it_eq + 1);
+            auto arg_value = json::parse(arg_value_str);
+
+            return {
+                /* .role = */ "assistant",
+                /* .content = */ match.prefix().str(),
+                /* .tool_calls = */ {
+                    {
+                        /* .name = */ match[1],
+                        /* .arguments = */ (json {
+                            {arg_name, arg_value},
+                        }).dump(),
+                        /* .id = */ "",
+                    },
+                },
+            };
+        }
+    }
+    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
+}
+
+static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    common_chat_params data;
+    data.grammar_lazy = inputs.tool_choice != "required";
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        std::vector<std::string> tool_rules;
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const auto & function = tool["function"];
+            std::string name = function["name"];
+            auto parameters = function["parameters"];
+            auto args_rule = builder.add_schema(name + "-args", parameters);
+            tool_rules.push_back(builder.add_rule(name + "-call",
+                "\"<｜tool▁call▁begin｜>function<｜tool▁sep｜>" + name + "\\n```json\\n\" " + args_rule + " \"```<｜tool▁call▁end｜>\""));
+        });
+        data.grammar_triggers.push_back({"<｜tool▁calls▁begin｜>", /* .at_start = */ false});
+        data.preserved_tokens = {
+            "<｜tool▁sep｜>",
+            "<｜tool▁call▁end｜>",
+        };
+        builder.add_rule("root", "\"<｜tool▁calls▁begin｜>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
+    }, grammar_options);
+    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
+    return data;
+}
+static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
+    static std::regex trigger_regex("<｜tool▁calls▁begin｜>");
+    static std::regex function_regex("<｜tool▁call▁begin｜>function<｜tool▁sep｜>([^\n]+)\n```json\n");
+    static std::regex close_regex("```<｜tool▁call▁end｜>");
+    return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
+}
+
+static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    fprintf(stderr, "%s\n", __func__);
+    common_chat_params data;
+    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+        {"datetime", "Jan 29 2025 13:00:00 GMT"},
+        {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
+    });
+    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != "required";
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool["function"];
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function["name"]},
+                        }},
+                        {"arguments", function["parameters"]},
+                    }},
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
+        }, grammar_options);
+        data.grammar_triggers.push_back({" functools[", /* .at_start = */ false});
+        data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
+    } else {
+        data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    }
+    return data;
+}
+static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
+    return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
+}
+
+static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
+    // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
+    common_chat_params data;
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
+    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != "required";
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> first_tool_rules;
+            std::vector<std::string> subsequent_tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool["function"];
+                std::string name = function["name"];
+                auto parameters = function["parameters"];
+                auto args_rule = builder.add_schema(name + "-args", parameters);
+                first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
+                subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
+                data.grammar_triggers.push_back({name, /* .at_start = */ true});
+                data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false});
+            });
+            auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
+            if (inputs.parallel_tool_calls) {
+                auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
+                builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
+            } else {
+                builder.add_rule("root", first_rule);
+            }
+
+        }, grammar_options);
+    }
+    return data;
+}
+
+static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
+    auto expected_it = expected.begin();
+    auto tmp_it = it;
+    while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
+        ++tmp_it;
+        ++expected_it;
+    }
+    if (expected_it == expected.end()) {
+        it = tmp_it;
+        return true;
+    }
+    return false;
+}
+
+static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
+    static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
+    static std::regex close_regex(R"($|(?=>>>))");
+
+    std::string content;
+    auto it = input.begin();
+    const auto end = input.end();
+
+    if (consume(it, end, "all\n")) {
+        std::smatch match;
+        if (std::regex_search(it, end, match, function_regex)) {
+            auto fun_it = match.prefix().second;
+            content = std::string(it, fun_it);
+            it = fun_it;
+        } else {
+            common_chat_msg res;
+            res.role = "assistant";
+            res.content = std::string(it, end);
+            return res;
+        }
+    }
+    // TODO: tighten & simplify.
+    try {
+        auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
+        res.content = content + res.content;
+        return res;
+    } catch (const std::exception & e) {
+        LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
+        common_chat_msg res;
+        res.role = "assistant";
+        res.content = input;
+        return res;
+    }
+}
+
+static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
+    common_chat_params data;
+    json tools = inputs.tools.is_null() ? inputs.tools : json::array();
+    std::string python_code_argument_name;
+    auto has_raw_python = false;
+
+    data.grammar_lazy = inputs.tool_choice != "required";
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        std::vector<std::string> tool_rules;
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const auto & function = tool["function"];
+            const auto & parameters = function["parameters"];
+            std::string name = function["name"];
+            if (name == "python" || name == "ipython") {
+                if (!parameters.contains("type")) {
+                    throw std::runtime_error("Missing type in python tool");
+                }
+                has_raw_python = true;
+                auto type = parameters.at("type");
+                if (type == "object") {
+                    auto properties = parameters.at("properties");
+                    for (auto it = properties.begin(); it != properties.end(); ++it) {
+                        if (it.value().at("type") == "string") {
+                            if (!python_code_argument_name.empty()) {
+                                throw std::runtime_error("Multiple string arguments found in python tool");
+                            }
+                            python_code_argument_name = it.key();
+                        }
+                    }
+                    if (python_code_argument_name.empty()) {
+                        throw std::runtime_error("No string argument found in python tool");
+                    }
+                } else if (type != "string") {
+                    throw std::runtime_error("Invalid type in python tool: " + type.dump());
+                }
+            }
+            tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
+        });
+        if (has_raw_python) {
+            tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
+            data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
+        }
+        auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
+        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
+        data.grammar_triggers.push_back({"<function=", /* .at_start = */ false});
+    }, grammar_options);
+
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    // TODO: if (has_raw_python)
+    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
+    return data;
+}
+static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
+    // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
+    static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
+    std::smatch match;
+    if (std::regex_search(input, match, python_tag_regex)) {
+        auto code = match[1].str();
+        return {
+            /* .role = */ "assistant",
+            /* .content = */ match.prefix().str(),
+            /* .tool_calls = */ {
+                {
+                    /* .name = */ "python",
+                    /* .arguments = */ (json {{"code", code}}).dump(),
+                    /* .id = */ "",
+                },
+            }
+        };
+    }
+    static std::regex function_regex(R"(<function=(\w+)>)");
+    static std::regex close_regex(R"(</function>)");
+    // TODO: tighten & simplify.
+    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
+}
+
+static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    common_chat_params data;
+    // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
+    data.grammar_lazy = inputs.tool_choice != "required";
+    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+        std::vector<std::string> tool_rules;
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const auto & function = tool["function"];
+            std::string name = function["name"];
+            auto parameters = function["parameters"];
+            builder.resolve_refs(parameters);
+            tool_rules.push_back(builder.add_schema(name + "-call", {
+                {"type", "object"},
+                {"properties", json {
+                    {"name", json {{"const", name}}},
+                    {"arguments", parameters},
+                }},
+                {"required", json::array({"name", "arguments"})},
+            }));
+        });
+        auto tool_call = "\"<tool_call>\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"</tool_call>\" space";
+        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
+        data.grammar_triggers.push_back({"<tool_call>", /* .at_start = */ false});
+        data.preserved_tokens = { "</tool_call>" };
+    }, grammar_options);
+
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
+    return data;
+}
+static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) {
+    try {
+        std::regex start_pattern(R"([\n\s]*<tool_call>)");
+        std::regex middle_pattern(R"([\n\s]*</tool_call>[\n\s]*<tool_call>)");
+        std::regex end_pattern(R"([\n\s]*</tool_call>[\n\s]*$)");
+
+        auto end = input.end();
+        std::sregex_iterator rend;
+        std::sregex_iterator rit(input.begin(), end, start_pattern);
+        if (rit == rend) {
+            return {
+                /* .role = */ "assistant",
+                /* .content = */ input,
+                /* .tool_calls = */ {},
+            };
+        }
+
+        common_chat_msg result;
+        result.role = "assistant";
+        result.content = rit->prefix();
+
+        auto it = rit->suffix().first;
+        while (it != end) {
+            json call;
+            if (!parse_json(it, end, call)) {
+                throw std::runtime_error("Failed to parse json tool call");
+            }
+            const auto & arguments = call["arguments"];
+            result.tool_calls.push_back({
+                call["name"],
+                arguments.dump(),
+                // arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
+                /* id= */ "",
+            });
+            rit = {it, end, middle_pattern};
+            if (rit != rend) {
+                it = rit->suffix().first;
+            } else {
+                rit = {it, end, end_pattern};
+                if (rit == rend) {
+                    throw std::runtime_error("Malformed input, missing </tool_call>");
+                }
+                break;
+            }
+        }
+        return result;
+    } catch (const std::exception & e) {
+        return {
+            /* .role = */ "assistant",
+            /* .content = */ input,
+            /* .tool_calls = */ {},
+        };
+    }
+}
+
+static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    common_chat_params data;
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    data.grammar_lazy = false;
+    if (!inputs.json_schema.is_null()) {
+        if (!inputs.grammar.empty()) {
+            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+        }
+        data.grammar = json_schema_to_grammar(inputs.json_schema);
+    } else {
+        data.grammar = inputs.grammar.empty();
+    }
+    return data;
+}
+
+common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
+    auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
+    LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");
+
+    if (has_tools && !inputs.grammar.empty()) {
+        throw std::runtime_error("Cannot specify grammar with tools");
+    }
+
+    const auto & src = tmpl.source();
+    if (src.find(">>>all") != std::string::npos) {
+        // Functionary prepends "all\n" to plain content outputs, so we use the parser no matter when
+        return common_chat_params_init_functionary_v3_2(tmpl, inputs);
+    }
+    if (src.find(" functools[") != std::string::npos) {
+        // Firefunction v2 requires datetime and functions in the context, even w/o tools.
+        return common_chat_params_init_firefunction_v2(tmpl, inputs);
+    }
+
+    if (!has_tools) {
+        return common_chat_params_init_without_tools(tmpl, inputs);
+    }
+
+    if (src.find("<tool_call>") != std::string::npos) {
+        return common_chat_params_init_hermes_2_pro(tmpl, inputs);
+    }
+    if (src.find("<|start_header_id|>") != std::string::npos
+        && src.find("<function=") != std::string::npos) {
+        return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
+    }
+    if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
+        auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
+        return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
+    }
+    if (src.find("<｜tool▁calls▁begin｜>") != std::string::npos) {
+        return common_chat_params_init_deepseek_r1(tmpl, inputs);
+    }
+    if (src.find("[TOOL_CALLS]") != std::string::npos) {
+        return common_chat_params_init_mistral_nemo(tmpl, inputs);
+    }
+    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
+        return common_chat_params_init_command_r7b(tmpl, inputs);
+    }
+    return common_chat_params_init_generic(tmpl, inputs);
+}
+
+static common_chat_msg common_chat_parse_content_only(const std::string & input) {
+    return {
+        /* .role = */ "assistant",
+        /* .content = */ input,
+        /* .tool_calls = */ {},
+    };
+}
+
+common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
+    switch (format) {
+        case COMMON_CHAT_FORMAT_CONTENT_ONLY:
+            return common_chat_parse_content_only(input);
+        case COMMON_CHAT_FORMAT_GENERIC:
+            return common_chat_parse_generic(input);
+        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
+            return common_chat_parse_mistral_nemo(input);
+        case COMMON_CHAT_FORMAT_LLAMA_3_X:
+            return common_chat_parse_llama_3_1(input);
+        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
+            return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
+        case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
+            return common_chat_parse_deepseek_r1(input);
+        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
+            return common_chat_parse_functionary_v3_2(input);
+        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
+            return common_chat_parse_functionary_v3_1_llama_3_1(input);
+        case COMMON_CHAT_FORMAT_HERMES_2_PRO:
+            return common_chat_parse_hermes_2_pro(input);
+        case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
+            return common_chat_parse_firefunction_v2(input);
+        case COMMON_CHAT_FORMAT_COMMAND_R7B:
+            return common_chat_parse_command_r7b(input);
+        default:
+            throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
+    }
+}
diff --git a/common/chat.hpp b/common/chat.hpp
new file mode 100644
index 000000000..33e64a430
--- /dev/null
+++ b/common/chat.hpp
@@ -0,0 +1,52 @@
+// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+#pragma once
+
+#include "common.h"
+#include <json.hpp>
+#include <optional>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+struct common_chat_inputs {
+    json messages;
+    json tools;
+    json tool_choice;
+    json json_schema;
+    bool parallel_tool_calls;
+    bool stream;
+    std::string grammar;
+    bool add_generation_prompt = true;
+};
+
+enum common_chat_format {
+    COMMON_CHAT_FORMAT_CONTENT_ONLY,
+    COMMON_CHAT_FORMAT_GENERIC,
+    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_LLAMA_3_X,
+    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_COMMAND_R7B,
+
+    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+};
+
+struct common_chat_params {
+    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    json                                prompt;
+    std::string                         grammar;
+    bool                                grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_triggers;
+    std::vector<std::string>            preserved_tokens;
+    std::vector<std::string>            additional_stops;
+};
+
+struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
+std::string               common_chat_format_name(common_chat_format format);
+common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
diff --git a/scripts/gen-build-info-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake
similarity index 86%
rename from scripts/gen-build-info-cpp.cmake
rename to common/cmake/build-info-gen-cpp.cmake
index d89338920..fbc92b52c 100644
--- a/scripts/gen-build-info-cpp.cmake
+++ b/common/cmake/build-info-gen-cpp.cmake
@@ -1,7 +1,7 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+set(OUTPUT_FILE   "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 
 # Only write the build info if it changed
 if(EXISTS ${OUTPUT_FILE})
diff --git a/common/common.cpp b/common/common.cpp
index 938c428cf..8661e164a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,21 +1,39 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
+#include "log.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+#include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "chat.hpp"
+#include "chat-template.hpp"
 
 #include <algorithm>
-#include <cassert>
+#include <cinttypes>
+#include <climits>
 #include <cmath>
+#include <codecvt>
+#include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
-#include <iterator>
 #include <iostream>
+#include <iterator>
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <cinttypes>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -27,7 +45,6 @@
 #ifndef NOMINMAX
 #   define NOMINMAX
 #endif
-#include <codecvt>
 #include <locale>
 #include <windows.h>
 #include <fcntl.h>
@@ -37,25 +54,57 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if defined(LLAMA_USE_CURL)
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#   if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+#   endif
+#else
+#include <sys/syslimits.h>
 #endif
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
-#endif
+//
+// CURL utils
+//
 
-int32_t get_num_physical_cores() {
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
+#endif // LLAMA_USE_CURL
+
+using json = nlohmann::ordered_json;
+
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
             + std::to_string(cpu) + "/topology/thread_siblings");
         if (!thread_siblings.is_open()) {
             break; // no more cpus
@@ -79,14 +128,494 @@ int32_t get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-void process_escapes(std::string& input) {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#include <pthread.h>
+
+static void cpuid(unsigned leaf, unsigned subleaf,
+                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+    __asm__("movq\t%%rbx,%%rsi\n\t"
+            "cpuid\n\t"
+            "xchgq\t%%rbx,%%rsi"
+            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
+            : "0"(leaf), "2"(subleaf));
+}
+
+static int pin_cpu(int cpu) {
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(cpu, &mask);
+    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
+}
+
+static bool is_hybrid_cpu(void) {
+    unsigned eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15));
+}
+
+static bool is_running_on_efficiency_core(void) {
+    unsigned eax, ebx, ecx, edx;
+    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
+    int intel_atom = 0x20;
+    int core_type = (eax & 0xff000000u) >> 24;
+    return core_type == intel_atom;
+}
+
+static int cpu_count_math_cpus(int n_cpu) {
+    int result = 0;
+    for (int cpu = 0; cpu < n_cpu; ++cpu) {
+        if (pin_cpu(cpu)) {
+            return -1;
+        }
+        if (is_running_on_efficiency_core()) {
+            continue; // efficiency cores harm lockstep threading
+        }
+        ++cpu; // hyperthreading isn't useful for linear algebra
+        ++result;
+    }
+    return result;
+}
+
+#endif // __x86_64__ && __linux__
+
+/**
+ * Returns number of CPUs on system that are useful for math.
+ */
+int32_t cpu_get_num_math() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if (n_cpu < 1) {
+        return cpu_get_num_physical_cores();
+    }
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity;
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            int result = cpu_count_math_cpus(n_cpu);
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+            if (result > 0) {
+                return result;
+            }
+        }
+    }
+#endif
+    return cpu_get_num_physical_cores();
+}
+
+// Helper for setting process priority
+
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    DWORD p = NORMAL_PRIORITY_CLASS;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
+    }
+
+    if (!SetPriorityClass(GetCurrentProcess(), p)) {
+        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        return false;
+    }
+
+    return true;
+}
+
+#else // MacOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    int p = 0;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
+        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        return false;
+    }
+    return true;
+}
+
+#endif
+
+//
+// CLI argument parsing
+//
+
+
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+    int32_t n_set = 0;
+
+    if (cpuparams.n_threads < 0) {
+        // Assuming everything about cpuparams is invalid
+        if (role_model != nullptr) {
+            cpuparams = *role_model;
+        } else {
+            cpuparams.n_threads = cpu_get_num_math();
+        }
+    }
+
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (cpuparams.cpumask[i]) {
+            n_set++;
+        }
+    }
+
+    if (n_set && n_set < cpuparams.n_threads) {
+        // Not enough set bits, may experience performance issues.
+        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+    }
+}
+
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    size_t dash_loc = range.find('-');
+    if (dash_loc == std::string::npos) {
+        LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+
+    size_t start_i;
+    size_t end_i;
+
+    if (dash_loc == 0) {
+        start_i = 0;
+    } else {
+        start_i = std::stoull(range.substr(0, dash_loc));
+        if (start_i >= GGML_MAX_N_THREADS) {
+            LOG_ERR("Start index out of bounds!\n");
+            return false;
+        }
+    }
+
+    if (dash_loc == range.length() - 1) {
+        end_i = GGML_MAX_N_THREADS - 1;
+    } else {
+        end_i = std::stoull(range.substr(dash_loc + 1));
+        if (end_i >= GGML_MAX_N_THREADS) {
+            LOG_ERR("End index out of bounds!\n");
+            return false;
+        }
+    }
+
+    for (size_t i = start_i; i <= end_i; i++) {
+        boolmask[i] = true;
+    }
+
+    return true;
+}
+
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    // Discard potential 0x prefix
+    size_t start_i = 0;
+    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
+        start_i = 2;
+    }
+
+    size_t num_digits = mask.length() - start_i;
+    if (num_digits > 128) num_digits = 128;
+
+    size_t end_i = num_digits + start_i;
+
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+        char c = mask.at(i);
+        int8_t id = c;
+
+        if ((c >= '0' && c <= '9')) {
+            id -= '0';
+        } else if (c >= 'a' && c <= 'f') {
+            id -= 'a' - 10;
+        } else if (c >= 'A' && c <= 'F') {
+            id -= 'A' - 10;
+        } else {
+            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            return false;
+        }
+
+        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
+        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
+    }
+
+    return true;
+}
+
+void common_init() {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+            common_log_add(common_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
+#ifdef NDEBUG
+    const char * build_type = "";
+#else
+    const char * build_type = " (debug)";
+#endif
+
+    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+}
+
+std::string common_params_get_system_info(const common_params & params) {
+    std::ostringstream os;
+
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
+    }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
+    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
+
+    return os.str();
+}
+
+//
+// String utils
+//
+
+std::string string_format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
+
+std::string string_get_sortable_timestamp() {
+    using clock = std::chrono::system_clock;
+
+    const clock::time_point current_time = clock::now();
+    const time_t as_time_t = clock::to_time_t(current_time);
+    char timestamp_no_ns[100];
+    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+
+    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        current_time.time_since_epoch() % 1000000000).count();
+    char timestamp_ns[11];
+    snprintf(timestamp_ns, 11, "%09" PRId64, ns);
+
+    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+}
+
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+    std::ostringstream result;
+    for (size_t i = 0; i < values.size(); ++i) {
+        if (i > 0) {
+            result << separator;
+        }
+        result << values[i];
+    }
+    return result.str();
+}
+
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> parts;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        parts.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    parts.push_back(str.substr(start));
+
+    return parts;
+}
+
+std::string string_repeat(const std::string & str, size_t n) {
+    if (n == 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(str.length() * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        result += str;
+    }
+
+    return result;
+}
+
+std::string string_from(bool value) {
+    return value ? "true" : "false";
+}
+
+std::string string_from(const std::vector<int> & values) {
+    std::stringstream buf;
+
+    buf << "[ ";
+    bool first = true;
+    for (auto e : values) {
+        if (first) {
+            first = false;
+        } else {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::stringstream buf;
+
+    buf << "[ ";
+
+    bool first = true;
+    for (const auto & token : tokens) {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = common_token_to_piece(ctx, token);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf << "'" << detokenized << "'"
+            << ":" << std::to_string(token);
+    }
+
+    buf << " ]";
+
+    return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
+    std::stringstream buf;
+
+    buf << "[ ";
+
+    bool first = true;
+    for (int i = 0; i < batch.n_tokens; ++i) {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = common_token_to_piece(ctx, batch.token[i]);
+
+        detokenized.erase(
+                std::remove_if(
+                    detokenized.begin(),
+                    detokenized.end(),
+                    [](const unsigned char c) { return !std::isprint(c); }),
+                detokenized.end());
+
+        buf << "\n"          << std::to_string(i)
+            << ", token '"   << detokenized << "'"
+            << ", pos "      << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id "   << std::to_string(batch.seq_id[i][0])
+            << ", logits "   << std::to_string(batch.logits[i]);
+    }
+
+    buf << " ]";
+
+    return buf.str();
+}
+
+void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
 
@@ -123,1348 +652,139 @@ void process_escapes(std::string& input) {
     input.resize(output_idx);
 }
 
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    bool result = true;
-    try {
-        if (!gpt_params_parse_ex(argc, argv, params)) {
-            gpt_print_usage(argc, argv, gpt_params());
-            exit(0);
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
         }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
     }
-    catch (const std::invalid_argument & ex) {
-        fprintf(stderr, "%s\n", ex.what());
-        gpt_print_usage(argc, argv, gpt_params());
-        exit(1);
-    }
-    return result;
+    overrides.emplace_back(std::move(kvo));
+    return true;
 }
 
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
-    bool invalid_param = false;
-    std::string arg;
-    const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sparams;
+//
+// Filesystem utils
+//
 
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
+// Validate if a filename is safe to use
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool fs_validate_filename(const std::string & filename) {
+    if (!filename.length()) {
+        // Empty filename invalid
+        return false;
+    }
+    if (filename.length() > 255) {
+        // Limit at common largest possible filename on Linux filesystems
+        // to avoid unnecessary further validation
+        // (On systems with smaller limits it will be caught by the OS)
+        return false;
+    }
 
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoul(argv[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-            if (params.n_threads <= 0) {
-                params.n_threads = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-tb" || arg == "--threads-batch") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_batch = std::stoi(argv[i]);
-            if (params.n_threads_batch <= 0) {
-                params.n_threads_batch = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-td" || arg == "--threads-draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_draft = std::stoi(argv[i]);
-            if (params.n_threads_draft <= 0) {
-                params.n_threads_draft = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-tbd" || arg == "--threads-batch-draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_batch_draft = std::stoi(argv[i]);
-            if (params.n_threads_batch_draft <= 0) {
-                params.n_threads_batch_draft = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "-e" || arg == "--escape") {
-            params.escape = true;
-        } else if (arg == "--prompt-cache") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.path_prompt_cache = argv[i];
-        } else if (arg == "--prompt-cache-all") {
-            params.prompt_cache_all = true;
-        } else if (arg == "--prompt-cache-ro") {
-            params.prompt_cache_ro = true;
-        } else if (arg == "-bf" || arg == "--binary-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i], std::ios::binary);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            // store the external file name in params
-            params.prompt_file = argv[i];
-            std::ostringstream ss;
-            ss << file.rdbuf();
-            params.prompt = ss.str();
-            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            // store the external file name in params
-            params.prompt_file = argv[i];
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (!params.prompt.empty() && params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        } else if (arg == "-n" || arg == "--n-predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top-k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx-size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-n" || arg == "-gan") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
+    std::u32string filename_utf32;
+    try {
+#if defined(__clang__)
+        // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
-            params.grp_attn_n = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
 
-            params.grp_attn_w = std::stoi(argv[i]);
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_base = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_scale = std::stof(argv[i]);
-        } else if (arg == "--rope-scaling") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
-            else { invalid_param = true; break; }
-        } else if (arg == "--rope-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
-        } else if (arg == "--yarn-orig-ctx") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_orig_ctx = std::stoi(argv[i]);
-        } else if (arg == "--yarn-ext-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_ext_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-attn-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_attn_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-fast") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_fast = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-slow") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--defrag-thold" || arg == "-dt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.defrag_thold = std::stof(argv[i]);
-        } else if (arg == "--samplers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            const auto sampler_names = string_split(argv[i], ';');
-            sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
-        } else if (arg == "--sampling-seq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
-        } else if (arg == "--top-p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.top_p = std::stof(argv[i]);
-        } else if (arg == "--min-p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.min_p = std::stof(argv[i]);
-        } else if (arg == "--temp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.temp = std::stof(argv[i]);
-            sparams.temp = std::max(sparams.temp, 0.0f);
-        } else if (arg == "--tfs") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.tfs_z = std::stof(argv[i]);
-        } else if (arg == "--typical") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.typical_p = std::stof(argv[i]);
-        } else if (arg == "--repeat-last-n") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.penalty_last_n = std::stoi(argv[i]);
-            sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
-        } else if (arg == "--repeat-penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.penalty_repeat = std::stof(argv[i]);
-        } else if (arg == "--frequency-penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.penalty_freq = std::stof(argv[i]);
-        } else if (arg == "--presence-penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.penalty_present = std::stof(argv[i]);
-        } else if (arg == "--dynatemp-range") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.dynatemp_range = std::stof(argv[i]);
-        } else if (arg == "--dynatemp-exp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.dynatemp_exponent = std::stof(argv[i]);
-        } else if (arg == "--mirostat") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat-lr") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat-ent") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "--cfg-negative-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.cfg_negative_prompt = argv[i];
-        } else if (arg == "--cfg-negative-prompt-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
-            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
-                sparams.cfg_negative_prompt.pop_back();
-            }
-        } else if (arg == "--cfg-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.cfg_scale = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch-size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-        } else if (arg == "--keep") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_keep = std::stoi(argv[i]);
-        } else if (arg == "--draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_draft = std::stoi(argv[i]);
-        } else if (arg == "--chunks") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_chunks = std::stoi(argv[i]);
-        } else if (arg == "-np" || arg == "--parallel") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parallel = std::stoi(argv[i]);
-        } else if (arg == "-ns" || arg == "--sequences") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_sequences = std::stoi(argv[i]);
-        } else if (arg == "--p-accept" || arg == "-pa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.p_accept = std::stof(argv[i]);
-        } else if (arg == "--p-split" || arg == "-ps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.p_split = std::stof(argv[i]);
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        } else if (arg == "-md" || arg == "--model-draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model_draft = argv[i];
-        } else if (arg == "-a" || arg == "--alias") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model_alias = argv[i];
-        } else if (arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.emplace_back(argv[i], 1.0f);
-            params.use_mmap = false;
-        } else if (arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            const char * lora_adapter = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-            params.use_mmap = false;
-        } else if (arg == "--lora-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        } else if (arg == "--mmproj") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mmproj = argv[i];
-        } else if (arg == "--image") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.image = argv[i];
-        } else if (arg == "-i" || arg == "--interactive") {
-            params.interactive = true;
-        } else if (arg == "--embedding") {
-            params.embedding = true;
-        } else if (arg == "--interactive-first") {
-            params.interactive_first = true;
-        } else if (arg == "-ins" || arg == "--instruct") {
-            params.instruct = true;
-        } else if (arg == "-cml" || arg == "--chatml") {
-            params.chatml = true;
-        } else if (arg == "--infill") {
-            params.infill = true;
-        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
-            params.dump_kv_cache = true;
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            params.no_kv_offload = true;
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
-            params.cache_type_k = argv[++i];
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            params.cache_type_v = argv[++i];
-        } else if (arg == "--multiline-input") {
-            params.multiline_input = true;
-        } else if (arg == "--simple-io") {
-            params.simple_io = true;
-        } else if (arg == "-cb" || arg == "--cont-batching") {
-            params.cont_batching = true;
-        } else if (arg == "--color") {
-            params.use_color = true;
-        } else if (arg == "--mlock") {
-            params.use_mlock = true;
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gpu_layers = std::stoi(argv[i]);
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gpu_layers_draft = std::stoi(argv[i]);
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        } else if (arg == "--main-gpu" || arg == "-mg") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
-        } else if (arg == "--split-mode" || arg == "-sm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string arg_next = argv[i];
-            if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
-            } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            } else if (arg_next == "row") {
-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            } else {
-                invalid_param = true;
-                break;
-            }
-#ifndef GGML_USE_CUBLAS_SYCL
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+        filename_utf32 = converter.from_bytes(filename);
 
-        } else if (arg == "--tensor-split" || arg == "-ts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string arg_next = argv[i];
-
-            // split string by , and /
-            const std::regex regex{R"([,/]+)"};
-            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-            std::vector<std::string> split_arg{it, {}};
-            if (split_arg.size() >= llama_max_devices()) {
-                invalid_param = true;
-                break;
-            }
-            for (size_t i = 0; i < llama_max_devices(); ++i) {
-                if (i < split_arg.size()) {
-                    params.tensor_split[i] = std::stof(split_arg[i]);
-                } else {
-                    params.tensor_split[i] = 0.0f;
-                }
-            }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
-        } else if (arg == "--no-mmap") {
-            params.use_mmap = false;
-        } else if (arg == "--numa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-            else { invalid_param = true; break; }
-        } else if (arg == "--verbose-prompt") {
-            params.verbose_prompt = true;
-        } else if (arg == "--no-display-prompt") {
-            params.display_prompt = false;
-        } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.antiprompt.emplace_back(argv[i]);
-        } else if (arg == "-ld" || arg == "--logdir") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.logdir = argv[i];
-
-            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-                params.logdir += DIRECTORY_SEPARATOR;
-            }
-        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.logits_file = argv[i];
-        } else if (arg == "--perplexity" || arg == "--all-logits") {
-            params.logits_all = true;
-        } else if (arg == "--ppl-stride") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.ppl_stride = std::stoi(argv[i]);
-        } else if (arg == "-ptc" || arg == "--print-token-count") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_print = std::stoi(argv[i]);
-        } else if (arg == "--ppl-output-type") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.ppl_output_type = std::stoi(argv[i]);
-        } else if (arg == "--hellaswag") {
-            params.hellaswag = true;
-        } else if (arg == "--hellaswag-tasks") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.hellaswag_tasks = std::stoi(argv[i]);
-        } else if (arg == "--winogrande") {
-            params.winogrande = true;
-        } else if (arg == "--winogrande-tasks") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.winogrande_tasks = std::stoi(argv[i]);
-        } else if (arg == "--multiple-choice") {
-            params.multiple_choice = true;
-        } else if (arg == "--multiple-choice-tasks") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.multiple_choice_tasks = std::stoi(argv[i]);
-        } else if (arg == "--kl-divergence") {
-            params.kl_divergence = true;
-        } else if (arg == "--ignore-eos") {
-            params.ignore_eos = true;
-        } else if (arg == "--no-penalize-nl") {
-            sparams.penalize_nl = false;
-        } else if (arg == "-l" || arg == "--logit-bias") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::stringstream ss(argv[i]);
-            llama_token key;
-            char sign;
-            std::string value_str;
-            try {
-                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                } else {
-                    throw std::exception();
-                }
-            } catch (const std::exception&) {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "-h" || arg == "--help") {
+        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
+        // or invalid encodings were encountered. Reject such attempts
+        std::string filename_reencoded = converter.to_bytes(filename_utf32);
+        if (filename_reencoded != filename) {
             return false;
-
-        } else if (arg == "--version") {
-            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-            exit(0);
-        } else if (arg == "--random-prompt") {
-            params.random_prompt = true;
-        } else if (arg == "--in-prefix-bos") {
-            params.input_prefix_bos = true;
-        } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_prefix = argv[i];
-        } else if (arg == "--in-suffix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_suffix = argv[i];
-        } else if (arg == "--grammar") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.grammar = argv[i];
-        } else if (arg == "--grammar-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(sparams.grammar)
-            );
-        } else if (arg == "--override-kv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
-                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            params.kv_overrides.push_back(kvo);
-#ifndef LOG_DISABLE_LOGS
-        // Parse args for logging parameters
-        } else if ( log_param_single_parse( argv[i] ) ) {
-            // Do nothing, log_param_single_parse automatically does it's thing
-            //  and returns if a match was found and parsed.
-        } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
-            // We have a matching known parameter requiring an argument,
-            //  now we need to check if there is anything after this argv
-            //  and flag invalid_param or parse it.
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
-                invalid_param = true;
-                break;
-            }
-        // End of Parse args for logging parameters
-#endif // LOG_DISABLE_LOGS
-        } else {
-            throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-    }
-    if (params.prompt_cache_all &&
-            (params.interactive || params.interactive_first ||
-             params.instruct)) {
-
-        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+    } catch (const std::exception &) {
+        return false;
     }
 
-    if (params.escape) {
-        process_escapes(params.prompt);
-        process_escapes(params.input_prefix);
-        process_escapes(params.input_suffix);
-        process_escapes(sparams.cfg_negative_prompt);
-        for (auto & antiprompt : params.antiprompt) {
-            process_escapes(antiprompt);
+    // Check for forbidden codepoints:
+    // - Control characters
+    // - Unicode equivalents of illegal characters
+    // - UTF-16 surrogate pairs
+    // - UTF-8 replacement character
+    // - Byte order mark (BOM)
+    // - Illegal characters: / \ : * ? " < > |
+    for (char32_t c : filename_utf32) {
+        if (c <= 0x1F // Control characters (C0)
+            || c == 0x7F // Control characters (DEL)
+            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
+            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
+            || c == 0x2215 // Division Slash (forward slash equivalent)
+            || c == 0x2216 // Set Minus (backslash equivalent)
+            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
+            || c == 0xFFFD // Replacement Character (UTF-8)
+            || c == 0xFEFF // Byte Order Mark (BOM)
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
+            return false;
         }
     }
 
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
+    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
+    // Unicode and other whitespace is not affected, only 0x20 space
+    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
+        return false;
+    }
+
+    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
+    if (filename.find("..") != std::string::npos) {
+        return false;
+    }
+
+    // Reject "."
+    if (filename == ".") {
+        return false;
     }
 
     return true;
 }
 
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sparams;
-
-    std::string sampler_type_chars;
-    std::string sampler_type_names;
-    for (const auto sampler_type : sparams.samplers_sequence) {
-        sampler_type_chars += static_cast<char>(sampler_type);
-        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
-    }
-    sampler_type_names.pop_back();
-
-    printf("\n");
-    printf("usage: %s [options]\n", argv[0]);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help            show this help message and exit\n");
-    printf("  --version             show version and build info\n");
-    printf("  -i, --interactive     run in interactive mode\n");
-    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
-    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
-    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    printf("  -r PROMPT, --reverse-prompt PROMPT\n");
-    printf("                        halt generation at PROMPT, return control in interactive mode\n");
-    printf("                        (can be specified more than once for multiple prompts).\n");
-    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
-    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
-    printf("  -tb N, --threads-batch N\n");
-    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  -td N, --threads-draft N");
-    printf("                        number of threads to use during generation (default: same as --threads)\n");
-    printf("  -tbd N, --threads-batch-draft N\n");
-    printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
-    printf("  -p PROMPT, --prompt PROMPT\n");
-    printf("                        prompt to start generation with (default: empty)\n");
-    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    printf("  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    printf("  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    printf("                        not supported with --interactive or other interactive options\n");
-    printf("  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    printf("  --random-prompt       start with a randomized prompt.\n");
-    printf("  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
-    printf("  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    printf("  -f FNAME, --file FNAME\n");
-    printf("                        prompt file to start generation.\n");
-    printf("  -bf FNAME, --binary-file FNAME\n");
-    printf("                        binary file containing multiple choice tasks.\n");
-    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
-    printf("                        (default: %s)\n", sampler_type_names.c_str());
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
-    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
-    printf("  --dynatemp-range N    dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
-    printf("  --dynatemp-exp N      dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
-    printf("  --mirostat N          use Mirostat sampling.\n");
-    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
-    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    printf("                        modifies the likelihood of token appearing in the completion,\n");
-    printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
-    printf("  --grammar-file FNAME  file to read grammar from\n");
-    printf("  --cfg-negative-prompt PROMPT\n");
-    printf("                        negative prompt to use for guidance. (default: empty)\n");
-    printf("  --cfg-negative-prompt-file FNAME\n");
-    printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
-    printf("  --rope-scaling {none,linear,yarn}\n");
-    printf("                        RoPE frequency scaling method, defaults to linear unless specified by the model\n");
-    printf("  --rope-scale N        RoPE context scaling factor, expands context by a factor of N\n");
-    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor, expands context by a factor of 1/N\n");
-    printf("  --yarn-orig-ctx N     YaRN: original context size of model (default: 0 = model training context size)\n");
-    printf("  --yarn-ext-factor N   YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
-    printf("  --yarn-attn-factor N  YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
-    printf("  --yarn-beta-slow N    YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
-    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf("  -dt N, --defrag-thold N\n");
-    printf("                        KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
-    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    printf("  --no-penalize-nl      do not penalize newline token\n");
-    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
-    printf("  --all-logits          return logits for all tokens in the batch (default: disabled)\n");
-    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
-    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
-    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
-    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
-    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
-    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base\n");
-    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
-    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
-    printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
-    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
-    printf("  -pa N, --p-accept N   speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
-    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
-    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
-    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
-    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
-    if (llama_supports_mlock()) {
-        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (llama_supports_mmap()) {
-        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
-    printf("                          - distribute: spread execution evenly over all nodes\n");
-    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
-    printf("                          - numactl: use the CPU map provided by numactl\n");
-    printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
-    if (llama_supports_gpu_offload()) {
-        printf("  -ngl N, --n-gpu-layers N\n");
-        printf("                        number of layers to store in VRAM\n");
-        printf("  -ngld N, --n-gpu-layers-draft N\n");
-        printf("                        number of layers to store in VRAM for the draft model\n");
-        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-        printf("                        how to split the model across multiple GPUs, one of:\n");
-        printf("                          - none: use one GPU only\n");
-        printf("                          - layer (default): split layers and KV across GPUs\n");
-        printf("                          - row: split rows across GPUs\n");
-        printf("  -ts SPLIT, --tensor-split SPLIT\n");
-        printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-        printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
-        printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-    }
-    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
-    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
-    printf("  -gan N, --grp-attn-n N\n");
-    printf("                        group-attention factor (default: %d)\n", params.grp_attn_n);
-    printf("  -gaw N, --grp-attn-w N\n");
-    printf("                        group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
-    printf("  -dkvc, --dump-kv-cache\n");
-    printf("                        verbose print of the KV cache\n");
-    printf("  -nkvo, --no-kv-offload\n");
-    printf("                        disable KV offload\n");
-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
-    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
-    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
-    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
-    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
-    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: %s)\n", params.model.c_str());
-    printf("  -md FNAME, --model-draft FNAME\n");
-    printf("                        draft model for speculative decoding\n");
-    printf("  -ld LOGDIR, --logdir LOGDIR\n");
-    printf("                        path under which to save YAML logs (no logging if unset)\n");
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -ptc N, --print-token-count N\n");
-    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
-    printf("\n");
-#ifndef LOG_DISABLE_LOGS
-    log_print_usage();
-#endif // LOG_DISABLE_LOGS
-}
-
-std::string get_system_info(const gpt_params & params) {
-    std::ostringstream os;
-
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
-    }
-    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
-
-    return os.str();
-}
-
-std::string gpt_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-    }
-
-    GGML_UNREACHABLE();
-}
-
-//
-// String utils
-//
-
-std::vector<std::string> string_split(std::string input, char separator) {
-    std::vector<std::string> parts;
-    size_t separator_pos = input.find(separator);
-    while (separator_pos != std::string::npos) {
-        std::string part = input.substr(0, separator_pos);
-        parts.emplace_back(part);
-        input = input.substr(separator_pos + 1);
-        separator_pos = input.find(separator);
-    }
-    parts.emplace_back(input);
-    return parts;
-}
-
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
-    };
-
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
-                }
-            }
-        }
-    }
-    return sampler_types;
-}
-
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
-        }
-    }
-    return sampler_types;
-}
-
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temperature";
-        default : return "";
-    }
-}
-
-//
-// Model utils
-//
-
-struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
-    auto mparams = llama_model_default_params();
-
-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-    mparams.main_gpu        = params.main_gpu;
-    mparams.split_mode      = params.split_mode;
-    mparams.tensor_split    = params.tensor_split;
-    mparams.use_mmap        = params.use_mmap;
-    mparams.use_mlock       = params.use_mlock;
-    if (params.kv_overrides.empty()) {
-        mparams.kv_overrides = NULL;
-    } else {
-        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
-        mparams.kv_overrides = params.kv_overrides.data();
-    }
-
-    return mparams;
-}
-
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Invalid cache type: " + s);
-}
-
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
-    auto cparams = llama_context_default_params();
-
-    cparams.n_ctx             = params.n_ctx;
-    cparams.n_batch           = params.n_batch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.seed              = params.seed;
-    cparams.logits_all        = params.logits_all;
-    cparams.embedding         = params.embedding;
-    cparams.rope_scaling_type = params.rope_scaling_type;
-    cparams.rope_freq_base    = params.rope_freq_base;
-    cparams.rope_freq_scale   = params.rope_freq_scale;
-    cparams.yarn_ext_factor   = params.yarn_ext_factor;
-    cparams.yarn_attn_factor  = params.yarn_attn_factor;
-    cparams.yarn_beta_fast    = params.yarn_beta_fast;
-    cparams.yarn_beta_slow    = params.yarn_beta_slow;
-    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
-    cparams.defrag_thold      = params.defrag_thold;
-    cparams.offload_kqv       = !params.no_kv_offload;
-
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
-
-    return cparams;
-}
-
-void llama_batch_clear(struct llama_batch & batch) {
-    batch.n_tokens = 0;
-}
-
-void llama_batch_add(
-                 struct llama_batch & batch,
-                        llama_token   id,
-                          llama_pos   pos,
-    const std::vector<llama_seq_id> & seq_ids,
-                               bool   logits) {
-    batch.token   [batch.n_tokens] = id;
-    batch.pos     [batch.n_tokens] = pos;
-    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
-    for (size_t i = 0; i < seq_ids.size(); ++i) {
-        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
-    }
-    batch.logits  [batch.n_tokens] = logits;
-
-    batch.n_tokens++;
-}
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
-    auto mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model  = llama_load_model_from_file(params.model.c_str(), mparams);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
-    }
-
-    auto cparams = llama_context_params_from_gpt_params(params);
-
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
-    if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
-    }
-
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-    }
-
-    if (params.ignore_eos) {
-        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
-    }
-
-    {
-        LOG("warming up the model with an empty run\n");
-
-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_clear(lctx);
-        llama_reset_timings(lctx);
-    }
-
-    return std::make_tuple(model, lctx);
-}
-
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(
-  const struct llama_context * ctx,
-           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
-}
-
-std::vector<llama_token> llama_tokenize(
-    const struct llama_model * model,
-           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
-    // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
-
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
-    }
-
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
-    }
-
-    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
-}
-
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
-//
-// YAML utils
-//
-
 // returns true if successful, false otherwise
-bool create_directory_with_parents(const std::string & path) {
+bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
     std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
     std::wstring wpath = converter.from_bytes(path);
@@ -1535,257 +855,1106 @@ bool create_directory_with_parents(const std::string & path) {
 #endif // _WIN32
 }
 
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
+std::string fs_get_cache_directory() {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
+    if (getenv("LLAMA_CACHE")) {
+        cache_directory = std::getenv("LLAMA_CACHE");
+    } else {
+#ifdef __linux__
+        if (std::getenv("XDG_CACHE_HOME")) {
+            cache_directory = std::getenv("XDG_CACHE_HOME");
+        } else {
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        }
+#elif defined(__APPLE__)
+        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+        cache_directory = std::getenv("LOCALAPPDATA");
+#endif // __linux__
+        cache_directory = ensure_trailing_slash(cache_directory);
+        cache_directory += "llama.cpp";
     }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
+    return ensure_trailing_slash(cache_directory);
 }
 
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
+std::string fs_get_cache_file(const std::string & filename) {
+    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+    std::string cache_directory = fs_get_cache_directory();
+    const bool success = fs_create_directory_with_parents(cache_directory);
+    if (!success) {
+        throw std::runtime_error("failed to create cache directory: " + cache_directory);
     }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
+    return cache_directory + filename;
 }
 
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
 
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
+//
+// Model utils
+//
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
+
+    llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
+    } else if (!params.model_url.empty()) {
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
+    } else {
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return iparams;
     }
 
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    if (params.reranking) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a  BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a  SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_model_free(model);
+
+            return iparams;
+        }
     }
 
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
+    auto cparams = common_context_params_to_llama(params);
+
+    llama_context * lctx = llama_init_from_model(model, cparams);
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_model_free(model);
+        return iparams;
+    }
+
+    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
+    }
+
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_model_n_layer(model);
+
+        const auto cvec = common_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            llama_free(lctx);
+            llama_model_free(model);
+            return iparams;
+        }
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
+
+    if (!params.lora_init_without_apply) {
+        common_set_adapter_lora(lctx, params.lora_adapters);
+    }
+
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.warmup) {
+        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != LLAMA_TOKEN_NULL) {
+            tmp.push_back(bos);
+        }
+        if (eos != LLAMA_TOKEN_NULL) {
+            tmp.push_back(eos);
+        }
+        if (tmp.empty()) {
+            tmp.push_back(0);
+        }
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
+        }
+        llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
+        llama_perf_context_reset(lctx);
+    }
+
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
+
+    return iparams;
+}
+
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
+    for (auto & la : lora) {
+        if (la.scale != 0.0f) {
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
+        }
     }
 }
 
-std::string get_sortable_timestamp() {
-    using clock = std::chrono::system_clock;
+struct llama_model_params common_model_params_to_llama(common_params & params) {
+    auto mparams = llama_model_default_params();
 
-    const clock::time_point current_time = clock::now();
-    const time_t as_time_t = clock::to_time_t(current_time);
-    char timestamp_no_ns[100];
-    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
+    mparams.main_gpu        = params.main_gpu;
+    mparams.split_mode      = params.split_mode;
+    mparams.tensor_split    = params.tensor_split;
+    mparams.use_mmap        = params.use_mmap;
+    mparams.use_mlock       = params.use_mlock;
+    mparams.check_tensors   = params.check_tensors;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }
 
-    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
-        current_time.time_since_epoch() % 1000000000).count();
-    char timestamp_ns[11];
-    snprintf(timestamp_ns, 11, "%09" PRId64, ns);
-
-    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+    return mparams;
 }
 
-void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
-                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sparams;
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
+    auto cparams = llama_context_default_params();
 
-    fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n",    ggml_cpu_has_avx_vnni()    ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n",      ggml_cpu_has_vulkan()      ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n",     ggml_cpu_has_kompute()     ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+    cparams.n_ctx             = params.n_ctx;
+    cparams.n_seq_max         = params.n_parallel;
+    cparams.n_batch           = params.n_batch;
+    cparams.n_ubatch          = params.n_ubatch;
+    cparams.n_threads         = params.cpuparams.n_threads;
+    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
+                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+    cparams.logits_all        = params.logits_all;
+    cparams.embeddings        = params.embedding;
+    cparams.rope_scaling_type = params.rope_scaling_type;
+    cparams.rope_freq_base    = params.rope_freq_base;
+    cparams.rope_freq_scale   = params.rope_freq_scale;
+    cparams.yarn_ext_factor   = params.yarn_ext_factor;
+    cparams.yarn_attn_factor  = params.yarn_attn_factor;
+    cparams.yarn_beta_fast    = params.yarn_beta_fast;
+    cparams.yarn_beta_slow    = params.yarn_beta_slow;
+    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
+    cparams.pooling_type      = params.pooling_type;
+    cparams.attention_type    = params.attention_type;
+    cparams.defrag_thold      = params.defrag_thold;
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+    cparams.offload_kqv       = !params.no_kv_offload;
+    cparams.flash_attn        = params.flash_attn;
+    cparams.no_perf           = params.no_perf;
+
+    if (params.reranking) {
+        cparams.embeddings    = true;
+        cparams.pooling_type  = LLAMA_POOLING_TYPE_RANK;
+    }
+
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
+
+    return cparams;
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp;
+
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
+#ifdef LLAMA_USE_CURL
+
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+
+    while (remaining_attempts > 0) {
+        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+    return false;
+}
+
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+    // Initialize libcurl
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
+    bool force_download = false;
+
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+
+    // Check if hf-token or bearer-token was specified
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+    }
+
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    //   operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata;
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata.at("url").is_string()) {
+                    auto previous_url = metadata.at("url").get<std::string>();
+                    if (previous_url != url) {
+                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
+            }
+        }
+    } else {
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+
+    {
+        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
+            }
+            return n_items;
+        };
+
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
+            return false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code != 200) {
+            // HEAD not supported, we don't know if the file has changed
+            // force trigger downloading
+            force_download = true;
+            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        }
+    }
+
+    bool should_download = !file_exists || force_download;
+    if (!should_download) {
+        if (!etag.empty() && etag != headers.etag) {
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
+        }
+    }
+    if (should_download) {
+        std::string path_temporary = path + ".downloadInProgress";
+        if (file_exists) {
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                return false;
+            }
+        }
+
+        // Set the output file
+
+        struct FILE_deleter {
+            void operator()(FILE * f) const {
+                fclose(f);
+            }
+        };
+
+        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
+        if (!outfile) {
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            return false;
+        }
+
+        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+            return fwrite(data, size, nmemb, (FILE *)fd);
+        };
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+
+        //  display download progress
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+
+        // helper function to hide password in URL
+        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+            std::size_t protocol_pos = url.find("://");
+            if (protocol_pos == std::string::npos) {
+                return url;  // Malformed URL
+            }
+
+            std::size_t at_pos = url.find('@', protocol_pos + 3);
+            if (at_pos == std::string::npos) {
+                return url;  // No password in URL
+            }
+
+            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+        };
+
+        // start the download
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
+            return false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code < 200 || http_code >= 400) {
+            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+            return false;
+        }
+
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();
+
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        std::ofstream(metadata_path) << metadata.dump(4);
+        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            return false;
+        }
+    }
+
+    return true;
+}
+
+struct llama_model * common_load_model_from_url(
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
+        const struct llama_model_params & params) {
+    // Basic validation of the model_url
+    if (model_url.empty()) {
+        LOG_ERR("%s: invalid model_url\n", __func__);
+        return NULL;
+    }
+
+    if (!common_download_file(model_url, local_path, hf_token)) {
+        return NULL;
+    }
+
+    // check for additional GGUFs split to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
+        if (!ctx_gguf) {
+            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, local_path.c_str());
+            return NULL;
+        }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
+    }
+
+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
+                return NULL;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
+                return NULL;
+            }
+        }
+
+        // Prepare download in parallel
+        std::vector<std::future<bool>> futures_download;
+        for (int idx = 1; idx < n_split; idx++) {
+            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
+                char split_path[PATH_MAX] = {0};
+                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
+
+                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
+
+                return common_download_file(split_url, split_path, hf_token);
+            }, idx));
+        }
+
+        // Wait for all downloads to complete
+        for (auto & f : futures_download) {
+            if (!f.get()) {
+                return NULL;
+            }
+        }
+    }
+
+    return llama_model_load_from_file(local_path.c_str(), params);
+}
+
+struct llama_model * common_load_model_from_hf(
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
+        const struct llama_model_params & params) {
+    // construct hugging face model url:
+    //
+    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
+    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
+    //
+    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
+    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
+    //
+
+    std::string model_url = "https://huggingface.co/";
+    model_url += repo;
+    model_url += "/resolve/main/";
+    model_url += remote_path;
+
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
+}
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
 
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
 #else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
 
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d  # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+struct llama_model * common_load_model_from_url(
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
+        const struct llama_model_params & /*params*/) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+    return nullptr;
+}
 
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
+struct llama_model * common_load_model_from_hf(
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
+        const struct llama_model_params & /*params*/) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return nullptr;
+}
 
-    fprintf(stream, "time: %s\n", timestamp.c_str());
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
 
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
+#endif // LLAMA_USE_CURL
 
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+//
+// Batch utils
+//
 
-    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
-    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
-    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+void common_batch_clear(struct llama_batch & batch) {
+    batch.n_tokens = 0;
+}
 
-    dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+void common_batch_add(
+                 struct llama_batch & batch,
+                        llama_token   id,
+                          llama_pos   pos,
+    const std::vector<llama_seq_id> & seq_ids,
+                               bool   logits) {
+    GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
 
-    fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
-        if (ignore_eos && lb.first == logit_bias_eos->first) {
-            continue;
-        }
-        fprintf(stream, "  %d: %f", lb.first, lb.second);
+    batch.token   [batch.n_tokens] = id;
+    batch.pos     [batch.n_tokens] = pos;
+    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+    for (size_t i = 0; i < seq_ids.size(); ++i) {
+        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
+    }
+    batch.logits  [batch.n_tokens] = logits;
+
+    batch.n_tokens++;
+}
+
+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
     }
 
-    fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
-        }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
-    }
-    fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
-        }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
-    }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
-    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-    dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
-    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-    dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
 
-    fprintf(stream, "reverse_prompt:\n");
-    for (std::string ap : params.antiprompt) {
-        size_t pos = 0;
-        while ((pos = ap.find('\n', pos)) != std::string::npos) {
-            ap.replace(pos, 1, "\\n");
-            pos += 1;
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
         }
 
-        fprintf(stream, "  - %s\n", ap.c_str());
+        // update the previous row for the next iteration
+        prev_row = curr_row;
     }
 
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+    // return the maximum length of the LCS
+    return max_length;
+}
 
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
+//
+// Vocab utils
+//
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+std::vector<llama_token> common_tokenize(
+  const struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_special,
+                        bool   parse_special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
+}
+
+std::vector<llama_token> common_tokenize(
+    const struct llama_vocab * vocab,
+           const std::string & text,
+                        bool   add_special,
+                        bool   parse_special) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
+    }
+
+    return piece;
+}
+
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
+    }
+
+    text.resize(n_chars);
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
+
+//
+// Chat template utils
+//
+
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+    if (use_jinja) {
+        try {
+            auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
+            common_chat_inputs inputs;
+            inputs.messages = json::array({{
+                {"role", "user"},
+                {"content", "test"},
+            }});
+            common_chat_params_init(chat_template, inputs);
+            return true;
+        } catch (const std::exception & e) {
+            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+            return false;
+        }
+    }
+    llama_chat_message chat[] = {{"user", "test"}};
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
+    return res >= 0;
+}
+
+std::string common_chat_apply_template(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & msgs,
+        bool add_ass,
+        bool use_jinja) {
+    if (use_jinja) {
+        auto messages = json::array();
+        for (const auto & msg : msgs) {
+            messages.push_back({{"role", msg.role}, {"content", msg.content}});
+        }
+        common_chat_inputs inputs;
+        inputs.messages = messages;
+        inputs.add_generation_prompt = add_ass;
+        return common_chat_params_init(tmpl, inputs).prompt;
+    }
+
+    int alloc_size = 0;
+    std::vector<llama_chat_message> chat;
+    for (const auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()});
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+    }
+
+    std::vector<char> buf(alloc_size);
+
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // error: chat template is not supported
+    if (res < 0) {
+        // if the custom "tmpl" is not supported, we throw an error
+        // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+        throw std::runtime_error("this custom template is not supported");
+    }
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
+}
+
+std::string common_chat_format_single(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja) {
+    std::ostringstream ss;
+    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
+    std::vector<common_chat_msg> chat_new(past_msg);
+    // if the past_msg ends with a newline, we must preserve it in the formatted version
+    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+        ss << "\n";
+    };
+    // format chat with new_msg
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
+    // get the diff part
+    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return ss.str();
+}
+
+std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
+    std::vector<common_chat_msg> msgs = {
+        {"system",    "You are a helpful assistant", {}},
+        {"user",      "Hello", {}},
+        {"assistant", "Hi there", {}},
+        {"user",      "How are you?", {}},
+    };
+    return common_chat_apply_template(tmpl, msgs, true, use_jinja);
+}
+
+#define CHATML_TEMPLATE_SRC \
+    "{%- for message in messages -%}\n" \
+    "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+    "{%- endfor -%}\n" \
+    "{%- if add_generation_prompt -%}\n" \
+    "  {{- '<|im_start|>assistant\n' -}}\n" \
+    "{%- endif -%}"
+
+common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
+{
+    std::string default_template_src;
+    std::string template_tool_use_src;
+
+    bool has_explicit_template = !chat_template_override.empty();
+    if (chat_template_override.empty()) {
+        auto str = llama_model_chat_template(model, /* name */ nullptr);
+        if (str) {
+            default_template_src = str;
+            has_explicit_template = true;
+        }
+        str = llama_model_chat_template(model, /* name */ "tool_use");
+        if (str) {
+            template_tool_use_src = str;
+            has_explicit_template = true;
+        }
+    } else {
+        default_template_src = chat_template_override;
+    }
+    if (default_template_src.empty() || default_template_src == "chatml") {
+        if (!template_tool_use_src.empty()) {
+            default_template_src = template_tool_use_src;
+        } else {
+            default_template_src = CHATML_TEMPLATE_SRC;
+        }
+    }
+    auto vocab = llama_model_get_vocab(model);
+    const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+        if (token == LLAMA_TOKEN_NULL) {
+            if (default_template_src.find(jinja_variable_name) != std::string::npos
+                || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+                LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
+            }
+            return std::string();
+        } else {
+            return common_token_to_piece(vocab, token, true);
+        }
+    };
+    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+    try {
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+            template_tool_use_src.empty()
+                ? nullptr
+                : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+        };
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+            nullptr,
+        };
+    }
 }
 
 //
 // KV cache utils
 //
 
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
 
     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;
 
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
         int seq_count = 0;
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) { seq_count++; }
         }
         putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
@@ -1794,18 +1963,18 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
 
     std::unordered_map<llama_seq_id, size_t> seqs;
     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;
 
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        for (int j = 0; j < view.n_max_seq; j++) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
@@ -1824,11 +1993,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
 
     c_curr = view.cells;
     cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) {
                 const auto & it = seqs.find(cs_curr[j]);
                 putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
@@ -1841,3 +2010,189 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
 
     printf("\n=== Done dumping\n");
 }
+
+//
+// Embedding utils
+//
+
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+    double sum = 0.0;
+
+    switch (embd_norm) {
+        case -1: // no normalisation
+            sum = 1.0;
+            break;
+        case 0: // max absolute
+            for (int i = 0; i < n; i++) {
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
+            }
+            sum /= 32760.0; // make an int16 range
+            break;
+        case 2: // euclidean
+            for (int i = 0; i < n; i++) {
+                sum += inp[i] * inp[i];
+            }
+            sum = std::sqrt(sum);
+            break;
+        default: // p-norm (euclidean is p-norm p=2)
+            for (int i = 0; i < n; i++) {
+                sum += std::pow(std::abs(inp[i]), embd_norm);
+            }
+            sum = std::pow(sum, 1.0 / embd_norm);
+            break;
+    }
+
+    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;
+
+    for (int i = 0; i < n; i++) {
+        out[i] = inp[i] * norm;
+    }
+}
+
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+    double sum  = 0.0;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum  += embd1[i] * embd2[i];
+        sum1 += embd1[i] * embd1[i];
+        sum2 += embd2[i] * embd2[i];
+    }
+
+    // Handle the case where one or both vectors are zero vectors
+    if (sum1 == 0.0 || sum2 == 0.0) {
+        if (sum1 == 0.0 && sum2 == 0.0) {
+            return 1.0f; // two zero vectors are similar
+        }
+        return 0.0f;
+    }
+
+    return sum / (sqrt(sum1) * sqrt(sum2));
+}
+
+//
+// Control vector utils
+//
+
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+    common_control_vector_data result = { -1, {} };
+
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
+    }
+
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
+    if (n_tensors == 0) {
+        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+    }
+
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
+
+        int layer_idx = -1;
+
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
+            }
+        }
+        if (layer_idx < 0) {
+            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
+        }
+
+    }
+
+    if (result.n_embd == -1) {
+        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
+    return result;
+}
+
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+    common_control_vector_data result = { -1, {} };
+
+    for (const auto & info : load_infos) {
+        auto cur = common_control_vector_load_one(info);
+
+        if (cur.n_embd == -1) {
+            result.n_embd = -1;
+            break;
+        }
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result = std::move(cur);
+        } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f);  // extend if necessary
+            for (size_t i = 0; i < cur.data.size(); i++) {
+                result.data[i] += cur.data[i];
+            }
+        }
+    }
+
+    if (result.n_embd == -1) {
+        LOG_ERR("%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
+    }
+
+    return result;
+}
+
diff --git a/common/common.h b/common/common.h
index ab62bdb82..b208d0c7e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -2,20 +2,12 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
+#include <set>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -27,234 +19,699 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 
 #define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
+    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)
 
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
+struct common_adapter_lora_info {
+    std::string path;
+    float scale;
+
+    struct llama_adapter_lora * ptr;
+};
+
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
+
+struct common_control_vector_load_info;
 
 //
-// CLI argument parsing
+// CPU utils
 //
-int32_t get_num_physical_cores();
 
-struct gpt_params {
-    uint32_t seed                 = -1;    // RNG seed
+struct cpu_params {
+    int      n_threads                   = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid                  = false;   // Default: any CPU
+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
 
-    int32_t n_threads             = get_num_physical_cores();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
-    int32_t n_predict             = -1;    // new tokens to predict
-    int32_t n_ctx                 = 512;   // context size
-    int32_t n_batch               = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft               = 8;     // number of tokens to draft during speculative decoding
-    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel            = 1;     // number of parallel sequences to decode
-    int32_t n_sequences           = 1;     // number of sequences to decode
-    float   p_accept              = 0.5f;  // speculative decoding accept probability
-    float   p_split               = 0.1f;  // speculative decoding split probability
-    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
-    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
-    int32_t grp_attn_n            = 1;     // group-attention factor
-    int32_t grp_attn_w            = 512;   // group-attention width
-    int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled)
-    float   rope_freq_base        = 0.0f;  // RoPE base frequency
-    float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();
+
+//
+// Common params
+//
+
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+};
+
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
+enum common_conversation_mode {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
+    COMMON_CONVERSATION_MODE_ENABLED  = 1,
+    COMMON_CONVERSATION_MODE_AUTO     = 2,
+};
+
+struct common_grammar_trigger {
+    std::string word;
+    bool at_start;
+};
+
+// sampling parameters
+struct common_params_sampling {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev             = 64;    // number of previous tokens to remember
+    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k              = 40;    // <= 0 to use vocab size
+    float   top_p              = 0.95f; // 1.0 = disabled
+    float   min_p              = 0.05f; // 0.0 = disabled
+    float   xtc_probability    = 0.00f; // 0.0 = disabled
+    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
+    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
+    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range     = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat     = 1.00f; // 1.0 = disabled
+    float   penalty_freq       = 0.00f; // 0.0 = disabled
+    float   penalty_present    = 0.00f; // 0.0 = disabled
+    float   dry_multiplier     = 0.0f;  // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
+    float   dry_base           = 1.75f; // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
+    bool    ignore_eos         = false;
+    bool    no_perf            = false; // disable performance metrics
+    bool    timing_per_token   = false;
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
+    };
+
+    std::string                         grammar; // optional BNF-like grammar to constrain sampling
+    bool                                grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words;  // optional trigger words to trigger lazy grammar
+    std::vector<llama_token>            grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+    std::set<llama_token>               preserved_tokens;
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
+
+struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_ctx        =     0; // draft context size
+    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float   p_split      =  0.1f; // speculative decoding split probability
+    float   p_min        =  0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string hf_repo = ""; // HF repo                                                     // NOLINT
+    std::string hf_file = ""; // HF file                                                     // NOLINT
+
+    std::string model = "";     // draft model for speculative decoding                      // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
+};
+
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo                                                     // NOLINT
+    std::string hf_file = ""; // HF file                                                     // NOLINT
+
+    std::string model     = ""; // model path                                                // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
+};
+
+struct common_params {
+    int32_t n_predict             =    -1; // new tokens to predict
+    int32_t n_ctx                 =  4096; // context size
+    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
+    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel            =     1; // number of parallel sequences to decode
+    int32_t n_sequences           =     1; // number of sequences to decode
+    int32_t grp_attn_n            =     1; // group-attention factor
+    int32_t grp_attn_w            =   512; // group-attention width
+    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base        =  0.0f; // RoPE base frequency
+    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
     float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor
+    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
     float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
-    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
-    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
-    int32_t rope_scaling_type     = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    ggml_numa_strategy numa       = GGML_NUMA_STRATEGY_DISABLED;
+    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx         =     0; // YaRN original context length
+    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
-    std::string model_draft       = "";                              // draft model for speculative decoding
-    std::string model_alias       = "unknown"; // model alias
-    std::string prompt            = "";
-    std::string prompt_file       = "";  // store the external prompt file name
-    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
-    std::string input_prefix      = "";  // string to prefix user inputs with
-    std::string input_suffix      = "";  // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir            = "";  // directory in which to save YAML log files
-    std::string logits_file       = "";  // file for saving *all* logits
+    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
 
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data                 = nullptr;
+
+    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+
+    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+
+    struct common_params_sampling    sampling;
+    struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;
+
+    std::string model                = ""; // model path                                                    // NOLINT
+    std::string model_alias          = ""; // model alias                                                   // NOLINT
+    std::string model_url            = ""; // model url to download                                         // NOLINT
+    std::string hf_token             = ""; // HF token                                                      // NOLINT
+    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
+    std::string hf_file              = ""; // HF file                                                       // NOLINT
+    std::string prompt               = "";                                                                  // NOLINT
+    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
+    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
+    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
+
+    std::vector<std::string> in_files;   // all input files
+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base  = "";                              // base model path for the lora adapter
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
-    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-                                    //                                       (which is more convenient to use for plotting)
-                                    //
-    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
-    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    int32_t verbosity                  = 0;
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end   = -1; // layer range for control vector
 
-    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                     //                                       (which is more convenient to use for plotting)
+                                     //
+    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
 
-    bool   kl_divergence   = false; // compute KL-divergence
+    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
 
-    bool random_prompt     = false; // do not randomize prompt if none provided
+    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+    bool   kl_divergence    = false; // compute KL divergence
+
+    bool usage             = false; // print usage
     bool use_color         = false; // use color to distinguish generations and inputs
+    bool special           = false; // enable special token output
     bool interactive       = false; // interactive mode
-    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
+    bool interactive_first = false; // wait for user input immediately
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
     bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
 
-    bool embedding         = false; // get only sentence embedding
-    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
+    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input   = false; // reverse the usage of `\`
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
-    bool cont_batching     = false; // insert new sequences for decoding on-the-fly
+    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
+    bool flash_attn        = false; // flash attention
+    bool no_perf           = false; // disable performance metrics
+    bool ctx_shift         = true;  // context shift on inifinite text generation
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos        = false; // ignore generated EOS tokens
-    bool instruct          = false; // instruction mode (used for Alpaca models)
     bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
-    bool infill            = false; // use infill mode
     bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
+    bool warmup            = true;  // warmup run
+    bool check_tensors     = false; // validate tensor data
 
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
-    std::string image  = ""; // path to an image file
+    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    std::vector<std::string> image; // path to image file(s)
+
+    // embedding
+    bool embedding         = false; // get only sentence embedding
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep   = "\n";  // separator of embeddings
+    bool reranking         = false; // enable reranking support on server
+
+    // server params
+    int32_t port           = 8080;         // server listens on this network port
+    int32_t timeout_read   = 600;          // http read timeout in seconds
+    int32_t timeout_write  = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
+
+    std::string hostname      = "127.0.0.1";
+    std::string public_path   = "";                                                                         // NOLINT
+    std::string chat_template = "";                                                                         // NOLINT
+    bool use_jinja = false;                                                                                 // NOLINT
+    bool enable_chat_template = true;
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key  = "";                                                                         // NOLINT
+    std::string ssl_file_cert = "";                                                                         // NOLINT
+
+    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
+    bool endpoint_slots   = false;
+    bool endpoint_props   = false; // only control POST requests, not GET
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    float slot_prompt_similarity = 0.5f;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos  = -1;  // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk     =  0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl    = true;  // whether to compute perplexity
+
+    // cvector-generator params
+    int n_pca_batch = 100;
+    int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile       = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };
 
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void common_init();
 
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string get_system_info(const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-void process_escapes(std::string& input);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
 
 //
 // String utils
 //
 
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
-std::vector<std::string> string_split(std::string input, char separator);
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
+
+std::string string_strip(const std::string & str);
+std::string string_get_sortable_timestamp();
+
+std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+std::string string_repeat(const std::string & str, size_t n);
+
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}
+
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
+static bool string_ends_with(const std::string & str,
+                               const std::string & suffix) {  // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+void string_process_escapes(std::string & input);
+
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
+//
+// Filesystem utils
+//
+
+bool fs_validate_filename(const std::string & filename);
+bool fs_create_directory_with_parents(const std::string & path);
+
+std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
 
 //
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+// note: defines object's lifetime
+struct common_init_result {
+    llama_model_ptr   model;
+    llama_context_ptr context;
 
-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+    std::vector<llama_adapter_lora_ptr> lora;
+};
 
+struct common_init_result     common_init_from_params(common_params & params);
+
+struct llama_model_params     common_model_params_to_llama  (      common_params & params);
+struct llama_context_params   common_context_params_to_llama(const common_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
+
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
+
+// clear LoRA adapters from context, then apply new list of adapters
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
+
+//
 // Batch utils
+//
 
-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);
 
-void llama_batch_add(
+void common_batch_add(
                  struct llama_batch & batch,
                         llama_token   id,
                           llama_pos   pos,
     const std::vector<llama_seq_id> & seq_ids,
                                bool   logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
-    const struct llama_model * model,
+std::vector<llama_token> common_tokenize(
+    const struct llama_vocab * vocab,
            const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
         const struct llama_context * ctx,
-                       llama_token   token);
+                       llama_token   token,
+                       bool          special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-//       that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-                         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+std::string common_token_to_piece(
+          const struct llama_vocab * vocab,
+                       llama_token   token,
+                       bool          special = true);
 
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-                         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+// optionally renders special/control tokens
+std::string common_detokenize(
+            const struct llama_context * ctx,
+        const std::vector<llama_token> & tokens,
+                                  bool   special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
+std::string common_detokenize(
+              const struct llama_vocab * vocab,
+        const std::vector<llama_token> & tokens,
+                                  bool   special = true);
 
 //
-// YAML utils
+// Chat template utils
 //
 
-bool create_directory_with_parents(const std::string & path);
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();
+struct common_tool_call {
+    std::string name;
+    std::string arguments;
+    std::string id;
+};
 
-void dump_non_result_info_yaml(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+// same with llama_chat_message, but uses std::string
+struct common_chat_msg {
+    std::string role;
+    std::string content;
+    std::vector<common_tool_call> tool_calls;
+    std::string tool_plan = "";
+};
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+namespace minja {
+    class chat_template;
+}
+
+typedef minja::chat_template common_chat_template;
+
+struct common_chat_templates {
+    bool has_explicit_template; // Model had builtin template or template overridde was specified.
+    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+    std::unique_ptr<common_chat_template> template_tool_use;
+};
+
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string common_chat_apply_template(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & chat,
+        bool add_ass,
+        bool use_jinja);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string common_chat_format_single(
+        const common_chat_template & tmpl,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja);
+
+// Returns an example of formatted chat
+std::string common_chat_format_example(
+    const common_chat_template & tmpl, bool use_jinja);
+
+common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
 
 //
 // KV cache utils
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
+
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+//
+// Control vector utils
+//
+
+struct common_control_vector_data {
+    int n_embd;
+
+    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+    std::vector<float> data;
+};
+
+struct common_control_vector_load_info {
+    float strength;
+
+    std::string fname;
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
+
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
diff --git a/common/console.cpp b/common/console.cpp
index f65cbc6ed..078a8d678 100644
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -94,6 +94,9 @@ namespace console {
                 simple_io = true;
             }
         }
+        if (simple_io) {
+            _setmode(_fileno(stdin), _O_U8TEXT);
+        }
 #else
         // POSIX-specific console initialization
         if (!simple_io) {
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
deleted file mode 100644
index bf89a96f3..000000000
--- a/common/grammar-parser.cpp
+++ /dev/null
@@ -1,424 +0,0 @@
-#include "grammar-parser.h"
-#include <cstdint>
-#include <cwchar>
-#include <string>
-#include <utility>
-#include <stdexcept>
-#include <exception>
-
-namespace grammar_parser {
-    // NOTE: assumes valid utf8 (but checks for overrun)
-    // copied from llama.cpp
-    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-        uint8_t  first_byte = static_cast<uint8_t>(*src);
-        uint8_t  highbits   = first_byte >> 4;
-        int      len        = lookup[highbits];
-        uint8_t  mask       = (1 << (8 - len)) - 1;
-        uint32_t value      = first_byte & mask;
-        const char * end    = src + len; // may overrun!
-        const char * pos    = src + 1;
-        for ( ; pos < end && *pos; pos++) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
-        return result.first->second;
-    }
-
-    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
-        return next_id;
-    }
-
-    static void add_rule(
-            parse_state & state,
-            uint32_t      rule_id,
-            const std::vector<llama_grammar_element> & rule) {
-        if (state.rules.size() <= rule_id) {
-            state.rules.resize(rule_id + 1);
-        }
-        state.rules[rule_id] = rule;
-    }
-
-    static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
-    }
-
-    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
-        const char * pos   = src;
-        const char * end   = src + size;
-        uint32_t     value = 0;
-        for ( ; pos < end && *pos; pos++) {
-            value <<= 4;
-            char c = *pos;
-            if ('a' <= c && c <= 'f') {
-                value += c - 'a' + 10;
-            } else if ('A' <= c && c <= 'F') {
-                value += c - 'A' + 10;
-            } else if ('0' <= c && c <= '9') {
-                value += c - '0';
-            } else {
-                break;
-            }
-        }
-        if (pos != end) {
-            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static const char * parse_space(const char * src, bool newline_ok) {
-        const char * pos = src;
-        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
-                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
-            if (*pos == '#') {
-                while (*pos && *pos != '\r' && *pos != '\n') {
-                    pos++;
-                }
-            } else {
-                pos++;
-            }
-        }
-        return pos;
-    }
-
-    static const char * parse_name(const char * src) {
-        const char * pos = src;
-        while (is_word_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting name at ") + src);
-        }
-        return pos;
-    }
-
-    static std::pair<uint32_t, const char *> parse_char(const char * src) {
-        if (*src == '\\') {
-            switch (src[1]) {
-                case 'x': return parse_hex(src + 2, 2);
-                case 'u': return parse_hex(src + 2, 4);
-                case 'U': return parse_hex(src + 2, 8);
-                case 't': return std::make_pair('\t', src + 2);
-                case 'r': return std::make_pair('\r', src + 2);
-                case 'n': return std::make_pair('\n', src + 2);
-                case '\\':
-                case '"':
-                case '[':
-                case ']':
-                    return std::make_pair(src[1], src + 2);
-                default:
-                    throw std::runtime_error(std::string("unknown escape at ") + src);
-            }
-        } else if (*src) {
-            return decode_utf8(src);
-        }
-        throw std::runtime_error("unexpected end of input");
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested);
-
-    static const char * parse_sequence(
-            parse_state                        & state,
-            const char                         * src,
-            const std::string                  & rule_name,
-            std::vector<llama_grammar_element> & out_elements,
-            bool                                 is_nested) {
-        size_t last_sym_start = out_elements.size();
-        const char * pos = src;
-        while (*pos) {
-            if (*pos == '"') { // literal string
-                pos++;
-                last_sym_start = out_elements.size();
-                while (*pos != '"') {
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '[') { // char range(s)
-                pos++;
-                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
-                if (*pos == '^') {
-                    pos++;
-                    start_type = LLAMA_GRETYPE_CHAR_NOT;
-                }
-                last_sym_start = out_elements.size();
-                while (*pos != ']') {
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    enum llama_gretype type = last_sym_start < out_elements.size()
-                        ? LLAMA_GRETYPE_CHAR_ALT
-                        : start_type;
-
-                    out_elements.push_back({type, char_pair.first});
-                    if (pos[0] == '-' && pos[1] != ']') {
-                        auto endchar_pair = parse_char(pos + 1);
-                             pos          = endchar_pair.second;
-                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-                    }
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (is_word_char(*pos)) { // rule reference
-                const char * name_end    = parse_name(pos);
-                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-                pos = parse_space(name_end, is_nested);
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
-            } else if (*pos == '(') { // grouping
-                // parse nested alternates into synthesized rule
-                pos = parse_space(pos + 1, true);
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-                last_sym_start = out_elements.size();
-                // output reference to synthesized rule
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                if (*pos != ')') {
-                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-
-                pos = parse_space(pos + 1, is_nested);
-            } else {
-                break;
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested) {
-        std::vector<llama_grammar_element> rule;
-        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
-        while (*pos == '|') {
-            rule.push_back({LLAMA_GRETYPE_ALT, 0});
-            pos = parse_space(pos + 1, true);
-            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
-        }
-        rule.push_back({LLAMA_GRETYPE_END, 0});
-        add_rule(state, rule_id, rule);
-        return pos;
-    }
-
-    static const char * parse_rule(parse_state & state, const char * src) {
-        const char * name_end = parse_name(src);
-        const char * pos      = parse_space(name_end, false);
-        size_t       name_len = name_end - src;
-        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
-        const std::string name(src, name_len);
-
-        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
-            throw std::runtime_error(std::string("expecting ::= at ") + pos);
-        }
-        pos = parse_space(pos + 3, true);
-
-        pos = parse_alternates(state, pos, name, rule_id, false);
-
-        if (*pos == '\r') {
-            pos += pos[1] == '\n' ? 2 : 1;
-        } else if (*pos == '\n') {
-            pos++;
-        } else if (*pos) {
-            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
-        }
-        return parse_space(pos, true);
-    }
-
-    parse_state parse(const char * src) {
-        try {
-            parse_state state;
-            const char * pos = parse_space(src, true);
-            while (*pos) {
-                pos = parse_rule(state, pos);
-            }
-            return state;
-        } catch (const std::exception & err) {
-            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
-            return parse_state();
-        }
-    }
-
-    static void print_grammar_char(FILE * file, uint32_t c) {
-        if (0x20 <= c && c <= 0x7f) {
-            fprintf(file, "%c", static_cast<char>(c));
-        } else {
-            // cop out of encoding UTF-8
-            fprintf(file, "<U+%04X>", c);
-        }
-    }
-
-    static bool is_char_element(llama_grammar_element elem) {
-        switch (elem.type) {
-            case LLAMA_GRETYPE_CHAR:           return true;
-            case LLAMA_GRETYPE_CHAR_NOT:       return true;
-            case LLAMA_GRETYPE_CHAR_ALT:       return true;
-            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
-            default:                           return false;
-        }
-    }
-
-    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
-        for (auto elem : rule) {
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
-                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
-                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
-                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
-                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
-                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-            }
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                case LLAMA_GRETYPE_ALT:
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "(%u) ", elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                case LLAMA_GRETYPE_CHAR_NOT:
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    fprintf(file, "(\"");
-                    print_grammar_char(file, elem.value);
-                    fprintf(file, "\") ");
-                    break;
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    static void print_rule(
-            FILE     * file,
-            uint32_t   rule_id,
-            const std::vector<llama_grammar_element> & rule,
-            const std::map<uint32_t, std::string>    & symbol_id_names) {
-        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
-            throw std::runtime_error(
-                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
-        }
-        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
-        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
-            llama_grammar_element elem = rule[i];
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                    throw std::runtime_error(
-                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
-                        std::to_string(i));
-                case LLAMA_GRETYPE_ALT:
-                    fprintf(file, "| ");
-                    break;
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                    fprintf(file, "[");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_NOT:
-                    fprintf(file, "[^");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    fprintf(file, "-");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    print_grammar_char(file, elem.value);
-                    break;
-            }
-            if (is_char_element(elem)) {
-                switch (rule[i + 1].type) {
-                    case LLAMA_GRETYPE_CHAR_ALT:
-                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                        break;
-                    default:
-                        fprintf(file, "] ");
-                }
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    void print_grammar(FILE * file, const parse_state & state) {
-        try {
-            std::map<uint32_t, std::string> symbol_id_names;
-            for (const auto & kv : state.symbol_ids) {
-                symbol_id_names[kv.second] = kv.first;
-            }
-            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
-                // fprintf(file, "%zu: ", i);
-                // print_rule_binary(file, state.rules[i]);
-                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
-                // fprintf(file, "\n");
-            }
-        } catch (const std::exception & err) {
-            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
-        }
-    }
-
-    std::vector<const llama_grammar_element *> parse_state::c_rules() {
-        std::vector<const llama_grammar_element *> ret;
-        ret.reserve(rules.size());
-        for (const auto & rule : rules) {
-            ret.push_back(rule.data());
-        }
-        return ret;
-    }
-}
diff --git a/common/grammar-parser.h b/common/grammar-parser.h
deleted file mode 100644
index 9037d7272..000000000
--- a/common/grammar-parser.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Implements a parser for an extended Backus-Naur form (BNF), producing the
-// binary context-free grammar format specified by llama.h. Supports character
-// ranges, grouping, and repetition operators. As an example, a grammar for
-// arithmetic might look like:
-//
-// root  ::= expr
-// expr  ::= term ([-+*/] term)*
-// term  ::= num | "(" space expr ")" space
-// num   ::= [0-9]+ space
-// space ::= [ \t\n]*
-
-#pragma once
-#include "llama.h"
-#include <vector>
-#include <map>
-#include <cstdint>
-#include <string>
-
-namespace grammar_parser {
-    struct parse_state {
-        std::map<std::string, uint32_t>                 symbol_ids;
-        std::vector<std::vector<llama_grammar_element>> rules;
-
-        std::vector<const llama_grammar_element *> c_rules();
-    };
-
-    parse_state parse(const char * src);
-    void print_grammar(FILE * file, const parse_state & state);
-}
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
new file mode 100644
index 000000000..3ebcc3d9f
--- /dev/null
+++ b/common/json-schema-to-grammar.cpp
@@ -0,0 +1,1025 @@
+#include "json-schema-to-grammar.h"
+#include "common.h"
+
+#include <algorithm>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+    auto has_max = max_items != std::numeric_limits<int>::max();
+
+    if (min_items == 0 && max_items == 1) {
+        return item_rule + "?";
+    }
+
+    if (separator_rule.empty()) {
+        if (min_items == 1 && !has_max) {
+            return item_rule + "+";
+        } else if (min_items == 0 && !has_max) {
+            return item_rule + "*";
+        } else {
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
+        }
+    }
+
+    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+    if (min_items == 0) {
+        result = "(" + result + ")?";
+    }
+    return result;
+}
+
+/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
+class string_view {
+    const std::string & _str;
+    const size_t _start;
+    const size_t _end;
+public:
+    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
+
+    size_t size() const {
+        return _end - _start;
+    }
+
+    size_t length() const {
+        return size();
+    }
+
+    operator std::string() const {
+        return str();
+    }
+
+    std::string str() const {
+        return _str.substr(_start, _end - _start);
+    }
+
+    string_view substr(size_t pos, size_t len = std::string::npos) const {
+        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
+    }
+
+    char operator[](size_t pos) const {
+        auto index = _start + pos;
+        if (index >= _end) {
+            throw std::out_of_range("string_view index out of range");
+        }
+        return _str[_start + pos];
+    }
+
+    bool operator==(const string_view & other) const {
+        std::string this_str = *this;
+        std::string other_str = other;
+        return this_str == other_str;
+    }
+};
+
+static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int>::min();
+    auto has_max = max_value != std::numeric_limits<int>::max();
+
+    auto digit_range = [&](char from, char to) {
+        out << "[";
+        if (from == to) {
+            out << from;
+        } else {
+            out << from << "-" << to;
+        }
+        out << "]";
+    };
+    auto more_digits = [&](int min_digits, int max_digits) {
+        out << "[0-9]";
+        if (min_digits == max_digits && min_digits == 1) {
+            return;
+        }
+        out << "{";
+        out << min_digits;
+        if (max_digits != min_digits) {
+            out << ",";
+            if (max_digits != std::numeric_limits<int>::max()) {
+                out << max_digits;
+            }
+        }
+        out << "}";
+    };
+    std::function<void(const string_view &, const string_view &)> uniform_range =
+        [&](const string_view & from, const string_view & to) {
+            size_t i = 0;
+            while (i < from.length() && i < to.length() && from[i] == to[i]) {
+                i++;
+            }
+            if (i > 0) {
+                out << "\"" << from.substr(0, i).str() << "\"";
+            }
+            if (i < from.length() && i < to.length()) {
+                if (i > 0) {
+                    out << " ";
+                }
+                auto sub_len = from.length() - i - 1;
+                if (sub_len > 0) {
+                    auto from_sub = from.substr(i + 1);
+                    auto to_sub = to.substr(i + 1);
+                    auto sub_zeros = string_repeat("0", sub_len);
+                    auto sub_nines = string_repeat("9", sub_len);
+
+                    auto to_reached = false;
+                    out << "(";
+                    if (from_sub == sub_zeros) {
+                        digit_range(from[i], to[i] - 1);
+                        out << " ";
+                        more_digits(sub_len, sub_len);
+                    } else {
+                        out << "[" << from[i] << "] ";
+                        out << "(";
+                        uniform_range(from_sub, sub_nines);
+                        out << ")";
+                        if (from[i] < to[i] - 1) {
+                            out << " | ";
+                            if (to_sub == sub_nines) {
+                                digit_range(from[i] + 1, to[i]);
+                                to_reached = true;
+                            } else {
+                                digit_range(from[i] + 1, to[i] - 1);
+                            }
+                            out << " ";
+                            more_digits(sub_len, sub_len);
+                        }
+                    }
+                    if (!to_reached) {
+                        out << " | ";
+                        digit_range(to[i], to[i]);
+                        out << " ";
+                        uniform_range(sub_zeros, to_sub);
+                    }
+                    out << ")";
+                } else {
+                    out << "[" << from[i] << "-" << to[i] << "]";
+                }
+            }
+        };
+
+    if (has_min && has_max) {
+        if (min_value < 0 && max_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ")";
+            return;
+        }
+
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ") | ";
+            min_value = 0;
+        }
+
+        auto min_s = std::to_string(min_value);
+        auto max_s = std::to_string(max_value);
+        auto min_digits = min_s.length();
+        auto max_digits = max_s.length();
+
+        for (auto digits = min_digits; digits < max_digits; digits++) {
+            uniform_range(min_s, string_repeat("9", digits));
+            min_s = "1" + string_repeat("0", digits);
+            out << " | ";
+        }
+        uniform_range(min_s, max_s);
+        return;
+    }
+
+    auto less_decimals = std::max(decimals_left - 1, 1);
+
+    if (has_min) {
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            out << ") | [0] | [1-9] ";
+            more_digits(0, decimals_left - 1);
+        } else if (min_value == 0) {
+            if (top_level) {
+                out << "[0] | [1-9] ";
+                more_digits(0, less_decimals);
+            } else {
+                more_digits(1, decimals_left);
+            }
+        } else if (min_value <= 9) {
+            char c = '0' + min_value;
+            auto range_start = top_level ? '1' : '0';
+            if (c > range_start) {
+                digit_range(range_start, c - 1);
+                out << " ";
+                more_digits(1, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, '9');
+            out << " ";
+            more_digits(0, less_decimals);
+        } else {
+            auto min_s = std::to_string(min_value);
+            auto len = min_s.length();
+            auto c = min_s[0];
+
+            if (c > '1') {
+                digit_range(top_level ? '1' : '0', c - 1);
+                out << " ";
+                more_digits(len, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, c);
+            out << " (";
+            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            out << ")";
+            if (c < '9') {
+                out << " | ";
+                digit_range(c + 1, '9');
+                out << " ";
+                more_digits(len - 1, less_decimals);
+            }
+        }
+        return;
+    }
+
+    if (has_max) {
+        if (max_value >= 0) {
+            if (top_level) {
+                out << "\"-\" [1-9] ";
+                more_digits(0, less_decimals);
+                out << " | ";
+            }
+            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+        } else {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            out << ")";
+        }
+        return;
+    }
+
+    throw std::runtime_error("At least one of min_value or max_value must be set");
+}
+
+const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+
+struct BuiltinRule {
+    std::string content;
+    std::vector<std::string> deps;
+};
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+    {"boolean", {"(\"true\" | \"false\") space", {}}},
+    {"decimal-part", {"[0-9]{1,16}", {}}},
+    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+    {"null", {"\"null\" space", {}}},
+};
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date-time", {"date \"T\" time", {"date", "time"}}},
+    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
+};
+
+static bool is_reserved_name(const std::string & name) {
+    static std::unordered_set<std::string> RESERVED_NAMES;
+    if (RESERVED_NAMES.empty()) {
+        RESERVED_NAMES.insert("root");
+        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
+        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
+    }
+    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
+}
+
+std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
+std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+};
+
+std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+
+static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
+    std::smatch match;
+    std::string result;
+
+    std::string::const_iterator searchStart(input.cbegin());
+    std::string::const_iterator searchEnd(input.cend());
+
+    while (std::regex_search(searchStart, searchEnd, match, regex)) {
+        result.append(searchStart, searchStart + match.position());
+        result.append(replacement(match));
+        searchStart = match.suffix().first;
+    }
+
+    result.append(searchStart, searchEnd);
+
+    return result;
+}
+
+static std::string format_literal(const std::string & literal) {
+    std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
+        char c = match.str()[0];
+        return GRAMMAR_LITERAL_ESCAPES.at(c);
+    });
+    return "\"" + escaped + "\"";
+}
+
+class SchemaConverter {
+private:
+    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
+    std::function<json(const std::string &)> _fetch_json;
+    bool _dotall;
+    std::map<std::string, std::string> _rules;
+    std::unordered_map<std::string, json> _refs;
+    std::unordered_set<std::string> _refs_being_resolved;
+    std::vector<std::string> _errors;
+    std::vector<std::string> _warnings;
+
+    std::string _add_rule(const std::string & name, const std::string & rule) {
+        std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-");
+        if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) {
+            _rules[esc_name] = rule;
+            return esc_name;
+        } else {
+            int i = 0;
+            while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
+                i++;
+            }
+            std::string key = esc_name + std::to_string(i);
+            _rules[key] = rule;
+            return key;
+        }
+    }
+
+    std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
+        std::vector<std::string> rules;
+        for (size_t i = 0; i < alt_schemas.size(); i++) {
+            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
+        }
+        return string_join(rules, " | ");
+    }
+
+    std::string _visit_pattern(const std::string & pattern, const std::string & name) {
+        if (!(pattern.front() == '^' && pattern.back() == '$')) {
+            _errors.push_back("Pattern must start with '^' and end with '$'");
+            return "";
+        }
+        std::string sub_pattern = pattern.substr(1, pattern.length() - 2);
+        std::unordered_map<std::string, std::string> sub_rule_ids;
+
+        size_t i = 0;
+        size_t length = sub_pattern.length();
+
+        using literal_or_rule = std::pair<std::string, bool>;
+        auto to_rule = [&](const literal_or_rule & ls) {
+            auto is_literal = ls.second;
+            auto s = ls.first;
+            return is_literal ? "\"" + s + "\"" : s;
+        };
+        std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
+            size_t start = i;
+            std::vector<literal_or_rule> seq;
+
+            auto get_dot = [&]() {
+                std::string rule;
+                if (_dotall) {
+                    rule = "[\\U00000000-\\U0010FFFF]";
+                } else {
+                    rule = "[^\\x0A\\x0D]";
+                }
+                return _add_rule("dot", rule);
+            };
+
+            // Joins the sequence, merging consecutive literals together.
+            auto join_seq = [&]() {
+                std::vector<literal_or_rule> ret;
+
+                std::string literal;
+                auto flush_literal = [&]() {
+                    if (literal.empty()) {
+                        return false;
+                    }
+                    ret.emplace_back(literal, true);
+                    literal.clear();
+                    return true;
+                };
+
+                for (const auto & item : seq) {
+                    auto is_literal = item.second;
+                    if (is_literal) {
+                        literal += item.first;
+                    } else {
+                        flush_literal();
+                        ret.push_back(item);
+                    }
+                }
+                flush_literal();
+
+                std::vector<std::string> results;
+                for (const auto & item : ret) {
+                    results.push_back(to_rule(item));
+                }
+                return std::make_pair(string_join(results, " "), false);
+            };
+
+            while (i < length) {
+                char c = sub_pattern[i];
+                if (c == '.') {
+                    seq.emplace_back(get_dot(), false);
+                    i++;
+                } else if (c == '(') {
+                    i++;
+                    if (i < length) {
+                        if (sub_pattern[i] == '?') {
+                            _warnings.push_back("Unsupported pattern syntax");
+                        }
+                    }
+                    seq.emplace_back("(" + to_rule(transform()) + ")", false);
+                } else if (c == ')') {
+                    i++;
+                    if (start > 0 && sub_pattern[start - 1] != '(') {
+                        _errors.push_back("Unbalanced parentheses");
+                    }
+                    return join_seq();
+                } else if (c == '[') {
+                    std::string square_brackets = std::string(1, c);
+                    i++;
+                    while (i < length && sub_pattern[i] != ']') {
+                        if (sub_pattern[i] == '\\') {
+                            square_brackets += sub_pattern.substr(i, 2);
+                            i += 2;
+                        } else {
+                            square_brackets += sub_pattern[i];
+                            i++;
+                        }
+                    }
+                    if (i >= length) {
+                        _errors.push_back("Unbalanced square brackets");
+                    }
+                    square_brackets += ']';
+                    i++;
+                    seq.emplace_back(square_brackets, false);
+                } else if (c == '|') {
+                    seq.emplace_back("|", false);
+                    i++;
+                } else if (c == '*' || c == '+' || c == '?') {
+                    seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
+                    i++;
+                } else if (c == '{') {
+                    std::string curly_brackets = std::string(1, c);
+                    i++;
+                    while (i < length && sub_pattern[i] != '}') {
+                        curly_brackets += sub_pattern[i];
+                        i++;
+                    }
+                    if (i >= length) {
+                        _errors.push_back("Unbalanced curly brackets");
+                    }
+                    curly_brackets += '}';
+                    i++;
+                    auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                    int min_times = 0;
+                    int max_times = std::numeric_limits<int>::max();
+                    try {
+                        if (nums.size() == 1) {
+                            min_times = max_times = std::stoi(nums[0]);
+                        } else if (nums.size() != 2) {
+                            _errors.push_back("Wrong number of values in curly brackets");
+                        } else {
+                            if (!nums[0].empty()) {
+                                min_times = std::stoi(nums[0]);
+                            }
+                            if (!nums[1].empty()) {
+                                max_times = std::stoi(nums[1]);
+                            }
+                        }
+                    } catch (const std::invalid_argument & e) {
+                        _errors.push_back("Invalid number in curly brackets");
+                        return std::make_pair("", false);
+                    }
+                    auto &last = seq.back();
+                    auto &sub = last.first;
+                    auto sub_is_literal = last.second;
+
+                    if (!sub_is_literal) {
+                        std::string & sub_id = sub_rule_ids[sub];
+                        if (sub_id.empty()) {
+                            sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
+                        }
+                        sub = sub_id;
+                    }
+                    seq.back().first = build_repetition(
+                        sub_is_literal ? "\"" + sub + "\"" : sub,
+                        min_times,
+                        max_times,
+                        ""
+                    );
+                    seq.back().second = false;
+                } else {
+                    std::string literal;
+                    auto is_non_literal = [&](char c) {
+                        return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
+                    };
+                    while (i < length) {
+                        if (sub_pattern[i] == '\\' && i < length - 1) {
+                            char next = sub_pattern[i + 1];
+                            if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
+                                i++;
+                                literal += sub_pattern[i];
+                                i++;
+                            } else {
+                                literal += sub_pattern.substr(i, 2);
+                                i += 2;
+                            }
+                        } else if (sub_pattern[i] == '"') {
+                            literal += "\\\"";
+                            i++;
+                        } else if (!is_non_literal(sub_pattern[i]) &&
+                                (i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
+                            literal += sub_pattern[i];
+                            i++;
+                        } else {
+                            break;
+                        }
+                    }
+                    if (!literal.empty()) {
+                        seq.emplace_back(literal, true);
+                    }
+                }
+            }
+            return join_seq();
+        };
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
+    }
+
+    /*
+        Returns a rule that matches a JSON string that is none of the provided strings
+
+        not_strings({"a"})
+            -> ["] ( [a] char+ | [^"a] char* )? ["] space
+        not_strings({"and", "also"})
+            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+    */
+    std::string _not_strings(const std::vector<std::string> & strings) {
+
+        struct TrieNode {
+            std::map<char, TrieNode> children;
+            bool is_end_of_string;
+
+            TrieNode() : is_end_of_string(false) {}
+
+            void insert(const std::string & string) {
+                auto node = this;
+                for (char c : string) {
+                    node = &node->children[c];
+                }
+                node->is_end_of_string = true;
+            }
+        };
+
+        TrieNode trie;
+        for (const auto & s : strings) {
+            trie.insert(s);
+        }
+
+        std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+        std::ostringstream out;
+        out << "[\"] ( ";
+        std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
+            std::ostringstream rejects;
+            auto first = true;
+            for (const auto & kv : node.children) {
+                rejects << kv.first;
+                if (first) {
+                    first = false;
+                } else {
+                    out << " | ";
+                }
+                out << "[" << kv.first << "]";
+                if (!kv.second.children.empty()) {
+                    out << " (";
+                    visit(kv.second);
+                    out << ")";
+                } else if (kv.second.is_end_of_string) {
+                    out << " " << char_rule << "+";
+                }
+            }
+            if (!node.children.empty()) {
+                if (!first) {
+                    out << " | ";
+                }
+                out << "[^\"" << rejects.str() << "] " << char_rule << "*";
+            }
+        };
+        visit(trie);
+
+        out << " )";
+        if (!trie.is_end_of_string) {
+            out << "?";
+        }
+        out << " [\"] space";
+        return out.str();
+    }
+
+    std::string _resolve_ref(const std::string & ref) {
+        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
+            _refs_being_resolved.insert(ref);
+            json resolved = _refs[ref];
+            ref_name = visit(resolved, ref_name);
+            _refs_being_resolved.erase(ref);
+        }
+        return ref_name;
+    }
+
+    std::string _build_object_rule(
+        const std::vector<std::pair<std::string, json>> & properties,
+        const std::unordered_set<std::string> & required,
+        const std::string & name,
+        const json & additional_properties)
+    {
+        std::vector<std::string> required_props;
+        std::vector<std::string> optional_props;
+        std::unordered_map<std::string, std::string> prop_kv_rule_names;
+        std::vector<std::string> prop_names;
+        for (const auto & kv : properties) {
+            const auto &prop_name = kv.first;
+            const auto &prop_schema = kv.second;
+
+            std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
+            prop_kv_rule_names[prop_name] = _add_rule(
+                name + (name.empty() ? "" : "-") + prop_name + "-kv",
+                format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
+            );
+            if (required.find(prop_name) != required.end()) {
+                required_props.push_back(prop_name);
+            } else {
+                optional_props.push_back(prop_name);
+            }
+            prop_names.push_back(prop_name);
+        }
+        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
+            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
+            std::string value_rule =
+                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
+                : _add_primitive("value", PRIMITIVE_RULES.at("value"));
+
+            auto key_rule =
+                prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
+                : _add_rule(sub_name + "-k", _not_strings(prop_names));
+            std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
+            prop_kv_rule_names["*"] = kv_rule;
+            optional_props.push_back("*");
+        }
+
+        std::string rule = "\"{\" space ";
+        for (size_t i = 0; i < required_props.size(); i++) {
+            if (i > 0) {
+                rule += " \",\" space ";
+            }
+            rule += prop_kv_rule_names[required_props[i]];
+        }
+
+        if (!optional_props.empty()) {
+            rule += " (";
+            if (!required_props.empty()) {
+                rule += " \",\" space ( ";
+            }
+
+            std::function<std::string(const std::vector<std::string> &, bool)> get_recursive_refs = [&](const std::vector<std::string> & ks, bool first_is_optional) {
+                std::string res;
+                if (ks.empty()) {
+                    return res;
+                }
+                std::string k = ks[0];
+                std::string kv_rule_name = prop_kv_rule_names[k];
+                std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
+                if (first_is_optional) {
+                    res = comma_ref + (k == "*" ? "*" : "?");
+                } else {
+                    res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
+                }
+                if (ks.size() > 1) {
+                    res += " " + _add_rule(
+                        name + (name.empty() ? "" : "-") + k + "-rest",
+                        get_recursive_refs(std::vector<std::string>(ks.begin() + 1, ks.end()), true)
+                    );
+                }
+                return res;
+            };
+
+            for (size_t i = 0; i < optional_props.size(); i++) {
+                if (i > 0) {
+                    rule += " | ";
+                }
+                rule += get_recursive_refs(std::vector<std::string>(optional_props.begin() + i, optional_props.end()), false);
+            }
+            if (!required_props.empty()) {
+                rule += " )";
+            }
+            rule += " )?";
+        }
+
+        rule += " \"}\" space";
+
+        return rule;
+    }
+
+    std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+        auto n = _add_rule(name, rule.content);
+        for (const auto & dep : rule.deps) {
+            BuiltinRule dep_rule;
+            auto it = PRIMITIVE_RULES.find(dep);
+            if (it == PRIMITIVE_RULES.end()) {
+                it = STRING_FORMAT_RULES.find(dep);
+                if (it == STRING_FORMAT_RULES.end()) {
+                    _errors.push_back("Rule " + dep + " not known");
+                    continue;
+                }
+            }
+            if (_rules.find(dep) == _rules.end()) {
+                _add_primitive(dep, it->second);
+            }
+        }
+        return n;
+    }
+
+public:
+    SchemaConverter(
+        const std::function<json(const std::string &)> & fetch_json,
+        bool dotall,
+        bool compact_spaces)
+          : _fetch_json(fetch_json), _dotall(dotall)
+    {
+        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
+    }
+
+    void resolve_refs(json & schema, const std::string & url) {
+        /*
+        * Resolves all $ref fields in the given schema, fetching any remote schemas,
+        * replacing each $ref with absolute reference URL and populates _refs with the
+        * respective referenced (sub)schema dictionaries.
+        */
+        std::function<void(json &)> visit_refs = [&](json & n) {
+            if (n.is_array()) {
+                for (auto & x : n) {
+                    visit_refs(x);
+                }
+            } else if (n.is_object()) {
+                if (n.contains("$ref")) {
+                    std::string ref = n["$ref"];
+                    if (_refs.find(ref) == _refs.end()) {
+                        json target;
+                        if (ref.find("https://") == 0) {
+                            std::string base_url = ref.substr(0, ref.find('#'));
+                            auto it = _refs.find(base_url);
+                            if (it != _refs.end()) {
+                                target = it->second;
+                            } else {
+                                // Fetch the referenced schema and resolve its refs
+                                auto referenced = _fetch_json(ref);
+                                resolve_refs(referenced, base_url);
+                                _refs[base_url] = referenced;
+                            }
+                            if (ref.find('#') == std::string::npos || ref.substr(ref.find('#') + 1).empty()) {
+                                return;
+                            }
+                        } else if (ref.find("#/") == 0) {
+                            target = schema;
+                            n["$ref"] = url + ref;
+                            ref = url + ref;
+                        } else {
+                            _errors.push_back("Unsupported ref: " + ref);
+                            return;
+                        }
+                        std::string pointer = ref.substr(ref.find('#') + 1);
+                        std::vector<std::string> tokens = string_split(pointer, "/");
+                        for (size_t i = 1; i < tokens.size(); ++i) {
+                            std::string sel = tokens[i];
+                            if (target.is_null() || !target.contains(sel)) {
+                                _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                                return;
+                            }
+                            target = target[sel];
+                        }
+                        _refs[ref] = target;
+                    }
+                } else {
+                    for (auto & kv : n.items()) {
+                        visit_refs(kv.value());
+                    }
+                }
+            }
+        };
+
+        visit_refs(schema);
+    }
+
+    std::string _generate_constant_rule(const json & value) {
+        return format_literal(value.dump());
+    }
+
+    std::string visit(const json & schema, const std::string & name) {
+        json schema_type = schema.contains("type") ? schema["type"] : json();
+        std::string schema_format = schema.contains("format") ? schema["format"].get<std::string>() : "";
+        std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name;
+
+        if (schema.contains("$ref")) {
+            return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
+        } else if (schema.contains("oneOf") || schema.contains("anyOf")) {
+            std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
+            return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
+        } else if (schema_type.is_array()) {
+            std::vector<json> schema_types;
+            for (const auto & t : schema_type) {
+                json schema_copy(schema);
+                schema_copy["type"] = t;
+                schema_types.push_back(schema_copy);
+            }
+            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
+        } else if (schema.contains("const")) {
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
+        } else if (schema.contains("enum")) {
+            std::vector<std::string> enum_values;
+            for (const auto & v : schema["enum"]) {
+                enum_values.push_back(_generate_constant_rule(v));
+            }
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
+        } else if ((schema_type.is_null() || schema_type == "object")
+                && (schema.contains("properties") ||
+                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
+            std::unordered_set<std::string> required;
+            if (schema.contains("required") && schema["required"].is_array()) {
+                for (const auto & item : schema["required"]) {
+                    if (item.is_string()) {
+                        required.insert(item.get<std::string>());
+                    }
+                }
+            }
+            std::vector<std::pair<std::string, json>> properties;
+            if (schema.contains("properties")) {
+                for (const auto & prop : schema["properties"].items()) {
+                    properties.emplace_back(prop.key(), prop.value());
+                }
+            }
+            return _add_rule(rule_name,
+                _build_object_rule(
+                    properties, required, name,
+                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
+        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+            std::unordered_set<std::string> required;
+            std::vector<std::pair<std::string, json>> properties;
+            std::string hybrid_name = name;
+            std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
+                if (comp_schema.contains("$ref")) {
+                    add_component(_refs[comp_schema["$ref"]], is_required);
+                } else if (comp_schema.contains("properties")) {
+                    for (const auto & prop : comp_schema["properties"].items()) {
+                        properties.emplace_back(prop.key(), prop.value());
+                        if (is_required) {
+                            required.insert(prop.key());
+                        }
+                    }
+                } else {
+                  // todo warning
+                }
+            };
+            for (auto & t : schema["allOf"]) {
+                if (t.contains("anyOf")) {
+                    for (auto & tt : t["anyOf"]) {
+                        add_component(tt, false);
+                    }
+                } else {
+                    add_component(t, true);
+                }
+            }
+            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
+        } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
+            json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
+            if (items.is_array()) {
+                std::string rule = "\"[\" space ";
+                for (size_t i = 0; i < items.size(); i++) {
+                    if (i > 0) {
+                        rule += " \",\" space ";
+                    }
+                    rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
+                }
+                rule += " \"]\" space";
+                return _add_rule(rule_name, rule);
+            } else {
+                std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
+                int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
+                json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
+                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+
+                return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
+            }
+        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
+            return _visit_pattern(schema["pattern"], rule_name);
+        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
+            return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+        } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+            auto prim_name = schema_format + "-string";
+            return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+        } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+            int min_value = std::numeric_limits<int>::min();
+            int max_value = std::numeric_limits<int>::max();
+            if (schema.contains("minimum")) {
+                min_value = schema["minimum"].get<int>();
+            } else if (schema.contains("exclusiveMinimum")) {
+                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+            }
+            if (schema.contains("maximum")) {
+                max_value = schema["maximum"].get<int>();
+            } else if (schema.contains("exclusiveMaximum")) {
+                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+            }
+            std::stringstream out;
+            out << "(";
+            _build_min_max_int(min_value, max_value, out);
+            out << ") space";
+            return _add_rule(rule_name, out.str());
+        } else if (schema.empty() || schema_type == "object") {
+            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
+        } else {
+            if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
+                _errors.push_back("Unrecognized schema: " + schema.dump());
+                return "";
+            }
+            // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+            return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+        }
+    }
+
+    void check_errors() {
+        if (!_errors.empty()) {
+            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+        }
+        if (!_warnings.empty()) {
+            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
+        }
+    }
+
+    std::string format_grammar() {
+        std::stringstream ss;
+        for (const auto & kv : _rules) {
+            ss << kv.first << " ::= " << kv.second << std::endl;
+        }
+        return ss.str();
+    }
+};
+
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+#ifdef LLAMA_USE_LLGUIDANCE
+    if (!force_gbnf) {
+        return "%llguidance {}\nstart: %json " + schema.dump();
+    }
+#else
+    (void)force_gbnf;
+#endif // LLAMA_USE_LLGUIDANCE
+    return build_grammar([&](const common_grammar_builder & callbacks) {
+        auto copy = schema;
+        callbacks.resolve_refs(copy);
+        callbacks.add_schema("", copy);
+    });
+}
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+    common_grammar_builder builder {
+        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
+            return converter._add_rule(name, rule);
+        },
+        /* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
+            return converter.visit(schema, name == "root" ? "" : name);
+        },
+        /* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
+            converter.resolve_refs(schema, "");
+        }
+    };
+    cb(builder);
+    converter.check_errors();
+    return converter.format_grammar();
+}
diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h
new file mode 100644
index 000000000..62a3b0a44
--- /dev/null
+++ b/common/json-schema-to-grammar.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+
+std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
+                                   bool force_gbnf = false);
+
+struct common_grammar_builder {
+    std::function<std::string(const std::string &, const std::string &)> add_rule;
+    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
+    std::function<void(nlohmann::ordered_json &)> resolve_refs;
+};
+
+struct common_grammar_options {
+    bool dotall = false;
+    bool compact_spaces = false;
+};
+
+std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
diff --git a/examples/server/json.hpp b/common/json.hpp
similarity index 93%
rename from examples/server/json.hpp
rename to common/json.hpp
index ea945f346..a858728c4 100644
--- a/examples/server/json.hpp
+++ b/common/json.hpp
@@ -1,9 +1,9 @@
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 /****************************************************************************\
@@ -27,7 +27,6 @@
 #endif  // JSON_NO_IO
 #include <iterator> // random_access_iterator_tag
 #include <memory> // unique_ptr
-#include <numeric> // accumulate
 #include <string> // string, stoi, to_string
 #include <utility> // declval, forward, move, pair, swap
 #include <vector> // vector
@@ -35,10 +34,10 @@
 // #include <nlohmann/adl_serializer.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -48,10 +47,10 @@
 // #include <nlohmann/detail/abi_macros.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -60,7 +59,7 @@
 
 #ifndef JSON_SKIP_LIBRARY_VERSION_CHECK
     #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH)
-        #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 2
+        #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3
             #warning "Already included a different version of the library!"
         #endif
     #endif
@@ -68,7 +67,7 @@
 
 #define NLOHMANN_JSON_VERSION_MAJOR 3   // NOLINT(modernize-macro-to-enum)
 #define NLOHMANN_JSON_VERSION_MINOR 11  // NOLINT(modernize-macro-to-enum)
-#define NLOHMANN_JSON_VERSION_PATCH 2   // NOLINT(modernize-macro-to-enum)
+#define NLOHMANN_JSON_VERSION_PATCH 3   // NOLINT(modernize-macro-to-enum)
 
 #ifndef JSON_DIAGNOSTICS
     #define JSON_DIAGNOSTICS 0
@@ -150,10 +149,10 @@
 // #include <nlohmann/detail/conversions/from_json.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -173,16 +172,19 @@
 // #include <nlohmann/detail/exceptions.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
 
 #include <cstddef> // nullptr_t
 #include <exception> // exception
+#if JSON_DIAGNOSTICS
+    #include <numeric> // accumulate
+#endif
 #include <stdexcept> // runtime_error
 #include <string> // to_string
 #include <vector> // vector
@@ -190,10 +192,10 @@
 // #include <nlohmann/detail/value_t.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -206,10 +208,10 @@
 // #include <nlohmann/detail/macro_scope.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -218,10 +220,10 @@
 // #include <nlohmann/detail/meta/detected.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -231,10 +233,10 @@
 // #include <nlohmann/detail/meta/void_t.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -318,10 +320,10 @@ NLOHMANN_JSON_NAMESPACE_END
 
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-FileCopyrightText: 2016-2021 Evan Nemerson <evan@nemerson.com>
 // SPDX-License-Identifier: MIT
 
@@ -2483,6 +2485,14 @@ JSON_HEDLEY_DIAGNOSTIC_POP
     #endif
 #endif
 
+#ifndef JSON_HAS_STATIC_RTTI
+    #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0
+        #define JSON_HAS_STATIC_RTTI 1
+    #else
+        #define JSON_HAS_STATIC_RTTI 0
+    #endif
+#endif
+
 #ifdef JSON_HAS_CPP_17
     #define JSON_INLINE_VARIABLE inline
 #else
@@ -2590,12 +2600,13 @@ JSON_HEDLEY_DIAGNOSTIC_POP
              class NumberUnsignedType, class NumberFloatType,              \
              template<typename> class AllocatorType,                       \
              template<typename, typename = void> class JSONSerializer,     \
-             class BinaryType>
+             class BinaryType,                                             \
+             class CustomBaseClass>
 
 #define NLOHMANN_BASIC_JSON_TPL                                            \
     basic_json<ObjectType, ArrayType, StringType, BooleanType,             \
     NumberIntegerType, NumberUnsignedType, NumberFloatType,                \
-    AllocatorType, JSONSerializer, BinaryType>
+    AllocatorType, JSONSerializer, BinaryType, CustomBaseClass>
 
 // Macros to simplify conversion from/to types
 
@@ -2745,7 +2756,10 @@ JSON_HEDLEY_DIAGNOSTIC_POP
 
 #define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...)  \
     friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
-    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { Type nlohmann_json_default_obj; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }
+    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }
+
+#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...)  \
+    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) }
 
 /*!
 @brief macro
@@ -2756,10 +2770,12 @@ JSON_HEDLEY_DIAGNOSTIC_POP
     inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
     inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }
 
+#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...)  \
+    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) }
+
 #define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...)  \
     inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
-    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { Type nlohmann_json_default_obj; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }
-
+    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) }
 
 // inspired from https://stackoverflow.com/a/26745591
 // allows to call any std function as if (e.g. with begin):
@@ -2923,10 +2939,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/string_escape.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -2998,10 +3014,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/input/position_t.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -3040,10 +3056,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/cpp_future.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-FileCopyrightText: 2018 The Abseil Authors
 // SPDX-License-Identifier: MIT
 
@@ -3214,10 +3230,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/type_traits.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -3226,14 +3242,15 @@ NLOHMANN_JSON_NAMESPACE_END
 #include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
 #include <utility> // declval
 #include <tuple> // tuple
+#include <string> // char_traits
 
 // #include <nlohmann/detail/iterators/iterator_traits.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -3298,10 +3315,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/call_std/begin.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -3318,10 +3335,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/call_std/end.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -3342,10 +3359,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/json_fwd.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 #ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
@@ -3389,7 +3406,8 @@ NLOHMANN_JSON_NAMESPACE_END
     template<typename U> class AllocatorType = std::allocator,
     template<typename T, typename SFINAE = void> class JSONSerializer =
     adl_serializer,
-    class BinaryType = std::vector<std::uint8_t>>
+    class BinaryType = std::vector<std::uint8_t>, // cppcheck-suppress syntaxError
+    class CustomBaseClass = void>
     class basic_json;
 
     /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
@@ -3577,6 +3595,63 @@ struct actual_object_comparator
 template<typename BasicJsonType>
 using actual_object_comparator_t = typename actual_object_comparator<BasicJsonType>::type;
 
+/////////////////
+// char_traits //
+/////////////////
+
+// Primary template of char_traits calls std char_traits
+template<typename T>
+struct char_traits : std::char_traits<T>
+{};
+
+// Explicitly define char traits for unsigned char since it is not standard
+template<>
+struct char_traits<unsigned char> : std::char_traits<char>
+{
+    using char_type = unsigned char;
+    using int_type = uint64_t;
+
+    // Redefine to_int_type function
+    static int_type to_int_type(char_type c) noexcept
+    {
+        return static_cast<int_type>(c);
+    }
+
+    static char_type to_char_type(int_type i) noexcept
+    {
+        return static_cast<char_type>(i);
+    }
+
+    static constexpr int_type eof() noexcept
+    {
+        return static_cast<int_type>(EOF);
+    }
+};
+
+// Explicitly define char traits for signed char since it is not standard
+template<>
+struct char_traits<signed char> : std::char_traits<char>
+{
+    using char_type = signed char;
+    using int_type = uint64_t;
+
+    // Redefine to_int_type function
+    static int_type to_int_type(char_type c) noexcept
+    {
+        return static_cast<int_type>(c);
+    }
+
+    static char_type to_char_type(int_type i) noexcept
+    {
+        return static_cast<char_type>(i);
+    }
+
+    static constexpr int_type eof() noexcept
+    {
+        return static_cast<int_type>(EOF);
+    }
+};
+
 ///////////////////
 // is_ functions //
 ///////////////////
@@ -3613,7 +3688,6 @@ template <typename... Ts>
 struct is_default_constructible<const std::tuple<Ts...>>
             : conjunction<is_default_constructible<Ts>...> {};
 
-
 template <typename T, typename... Args>
 struct is_constructible : std::is_constructible<T, Args...> {};
 
@@ -3629,7 +3703,6 @@ struct is_constructible<std::tuple<Ts...>> : is_default_constructible<std::tuple
 template <typename... Ts>
 struct is_constructible<const std::tuple<Ts...>> : is_default_constructible<const std::tuple<Ts...>> {};
 
-
 template<typename T, typename = void>
 struct is_iterator_traits : std::false_type {};
 
@@ -4039,7 +4112,6 @@ struct value_in_range_of_impl2<OfType, T, false, true>
     }
 };
 
-
 template<typename OfType, typename T>
 struct value_in_range_of_impl2<OfType, T, true, true>
 {
@@ -4138,10 +4210,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/string_concat.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -4165,28 +4237,28 @@ inline std::size_t concat_length()
 }
 
 template<typename... Args>
-inline std::size_t concat_length(const char* cstr, Args&& ... rest);
+inline std::size_t concat_length(const char* cstr, const Args& ... rest);
 
 template<typename StringType, typename... Args>
-inline std::size_t concat_length(const StringType& str, Args&& ... rest);
+inline std::size_t concat_length(const StringType& str, const Args& ... rest);
 
 template<typename... Args>
-inline std::size_t concat_length(const char /*c*/, Args&& ... rest)
+inline std::size_t concat_length(const char /*c*/, const Args& ... rest)
 {
-    return 1 + concat_length(std::forward<Args>(rest)...);
+    return 1 + concat_length(rest...);
 }
 
 template<typename... Args>
-inline std::size_t concat_length(const char* cstr, Args&& ... rest)
+inline std::size_t concat_length(const char* cstr, const Args& ... rest)
 {
     // cppcheck-suppress ignoredReturnValue
-    return ::strlen(cstr) + concat_length(std::forward<Args>(rest)...);
+    return ::strlen(cstr) + concat_length(rest...);
 }
 
 template<typename StringType, typename... Args>
-inline std::size_t concat_length(const StringType& str, Args&& ... rest)
+inline std::size_t concat_length(const StringType& str, const Args& ... rest)
 {
-    return str.size() + concat_length(std::forward<Args>(rest)...);
+    return str.size() + concat_length(rest...);
 }
 
 template<typename OutStringType>
@@ -4277,7 +4349,7 @@ template<typename OutStringType = std::string, typename... Args>
 inline OutStringType concat(Args && ... args)
 {
     OutStringType str;
-    str.reserve(concat_length(std::forward<Args>(args)...));
+    str.reserve(concat_length(args...));
     concat_into(str, std::forward<Args>(args)...);
     return str;
 }
@@ -4286,7 +4358,6 @@ inline OutStringType concat(Args && ... args)
 NLOHMANN_JSON_NAMESPACE_END
 
 
-
 NLOHMANN_JSON_NAMESPACE_BEGIN
 namespace detail
 {
@@ -4334,9 +4405,9 @@ class exception : public std::exception
             {
                 case value_t::array:
                 {
-                    for (std::size_t i = 0; i < current->m_parent->m_value.array->size(); ++i)
+                    for (std::size_t i = 0; i < current->m_parent->m_data.m_value.array->size(); ++i)
                     {
-                        if (&current->m_parent->m_value.array->operator[](i) == current)
+                        if (&current->m_parent->m_data.m_value.array->operator[](i) == current)
                         {
                             tokens.emplace_back(std::to_string(i));
                             break;
@@ -4347,7 +4418,7 @@ class exception : public std::exception
 
                 case value_t::object:
                 {
-                    for (const auto& element : *current->m_parent->m_value.object)
+                    for (const auto& element : *current->m_parent->m_data.m_value.object)
                     {
                         if (&element.second == current)
                         {
@@ -4410,17 +4481,17 @@ class parse_error : public exception
     template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
     static parse_error create(int id_, const position_t& pos, const std::string& what_arg, BasicJsonContext context)
     {
-        std::string w = concat(exception::name("parse_error", id_), "parse error",
-                               position_string(pos), ": ", exception::diagnostics(context), what_arg);
+        const std::string w = concat(exception::name("parse_error", id_), "parse error",
+                                     position_string(pos), ": ", exception::diagnostics(context), what_arg);
         return {id_, pos.chars_read_total, w.c_str()};
     }
 
     template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
     static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, BasicJsonContext context)
     {
-        std::string w = concat(exception::name("parse_error", id_), "parse error",
-                               (byte_ != 0 ? (concat(" at byte ", std::to_string(byte_))) : ""),
-                               ": ", exception::diagnostics(context), what_arg);
+        const std::string w = concat(exception::name("parse_error", id_), "parse error",
+                                     (byte_ != 0 ? (concat(" at byte ", std::to_string(byte_))) : ""),
+                                     ": ", exception::diagnostics(context), what_arg);
         return {id_, byte_, w.c_str()};
     }
 
@@ -4454,7 +4525,7 @@ class invalid_iterator : public exception
     template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
     static invalid_iterator create(int id_, const std::string& what_arg, BasicJsonContext context)
     {
-        std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg);
+        const std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg);
         return {id_, w.c_str()};
     }
 
@@ -4472,7 +4543,7 @@ class type_error : public exception
     template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
     static type_error create(int id_, const std::string& what_arg, BasicJsonContext context)
     {
-        std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg);
+        const std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg);
         return {id_, w.c_str()};
     }
 
@@ -4489,7 +4560,7 @@ class out_of_range : public exception
     template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
     static out_of_range create(int id_, const std::string& what_arg, BasicJsonContext context)
     {
-        std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg);
+        const std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg);
         return {id_, w.c_str()};
     }
 
@@ -4506,7 +4577,7 @@ class other_error : public exception
     template<typename BasicJsonContext, enable_if_t<is_basic_json_context<BasicJsonContext>::value, int> = 0>
     static other_error create(int id_, const std::string& what_arg, BasicJsonContext context)
     {
-        std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg);
+        const std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg);
         return {id_, w.c_str()};
     }
 
@@ -4525,10 +4596,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/identity_tag.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -4549,10 +4620,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/std_fs.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -5055,10 +5126,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/conversions/to_json.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -5075,10 +5146,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/iterators/iteration_proxy.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -5147,10 +5218,10 @@ template<typename IteratorType> class iteration_proxy_value
     // older GCCs are a bit fussy and require explicit noexcept specifiers on defaulted functions
     iteration_proxy_value(iteration_proxy_value&&)
     noexcept(std::is_nothrow_move_constructible<IteratorType>::value
-             && std::is_nothrow_move_constructible<string_type>::value) = default;
+             && std::is_nothrow_move_constructible<string_type>::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations)
     iteration_proxy_value& operator=(iteration_proxy_value&&)
     noexcept(std::is_nothrow_move_assignable<IteratorType>::value
-             && std::is_nothrow_move_assignable<string_type>::value) = default;
+             && std::is_nothrow_move_assignable<string_type>::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations)
     ~iteration_proxy_value() = default;
 
     /// dereference operator (needed for range-based for)
@@ -5297,11 +5368,11 @@ namespace std
     #pragma clang diagnostic ignored "-Wmismatched-tags"
 #endif
 template<typename IteratorType>
-class tuple_size<::nlohmann::detail::iteration_proxy_value<IteratorType>>
+class tuple_size<::nlohmann::detail::iteration_proxy_value<IteratorType>> // NOLINT(cert-dcl58-cpp)
             : public std::integral_constant<std::size_t, 2> {};
 
 template<std::size_t N, typename IteratorType>
-class tuple_element<N, ::nlohmann::detail::iteration_proxy_value<IteratorType >>
+class tuple_element<N, ::nlohmann::detail::iteration_proxy_value<IteratorType >> // NOLINT(cert-dcl58-cpp)
 {
   public:
     using type = decltype(
@@ -5340,7 +5411,7 @@ namespace detail
 
 /*
  * Note all external_constructor<>::construct functions need to call
- * j.m_value.destroy(j.m_type) to avoid a memory leak in case j contains an
+ * j.m_data.m_value.destroy(j.m_data.m_type) to avoid a memory leak in case j contains an
  * allocated value (e.g., a string). See bug issue
  * https://github.com/nlohmann/json/issues/2865 for more information.
  */
@@ -5353,9 +5424,9 @@ struct external_constructor<value_t::boolean>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::boolean;
-        j.m_value = b;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::boolean;
+        j.m_data.m_value = b;
         j.assert_invariant();
     }
 };
@@ -5366,18 +5437,18 @@ struct external_constructor<value_t::string>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::string;
-        j.m_value = s;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::string;
+        j.m_data.m_value = s;
         j.assert_invariant();
     }
 
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::string;
-        j.m_value = std::move(s);
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::string;
+        j.m_data.m_value = std::move(s);
         j.assert_invariant();
     }
 
@@ -5386,9 +5457,9 @@ struct external_constructor<value_t::string>
                              int > = 0 >
     static void construct(BasicJsonType& j, const CompatibleStringType& str)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::string;
-        j.m_value.string = j.template create<typename BasicJsonType::string_t>(str);
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::string;
+        j.m_data.m_value.string = j.template create<typename BasicJsonType::string_t>(str);
         j.assert_invariant();
     }
 };
@@ -5399,18 +5470,18 @@ struct external_constructor<value_t::binary>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::binary;
-        j.m_value = typename BasicJsonType::binary_t(b);
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::binary;
+        j.m_data.m_value = typename BasicJsonType::binary_t(b);
         j.assert_invariant();
     }
 
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::binary;
-        j.m_value = typename BasicJsonType::binary_t(std::move(b));
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::binary;
+        j.m_data.m_value = typename BasicJsonType::binary_t(std::move(b));
         j.assert_invariant();
     }
 };
@@ -5421,9 +5492,9 @@ struct external_constructor<value_t::number_float>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::number_float;
-        j.m_value = val;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::number_float;
+        j.m_data.m_value = val;
         j.assert_invariant();
     }
 };
@@ -5434,9 +5505,9 @@ struct external_constructor<value_t::number_unsigned>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::number_unsigned;
-        j.m_value = val;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::number_unsigned;
+        j.m_data.m_value = val;
         j.assert_invariant();
     }
 };
@@ -5447,9 +5518,9 @@ struct external_constructor<value_t::number_integer>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::number_integer;
-        j.m_value = val;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::number_integer;
+        j.m_data.m_value = val;
         j.assert_invariant();
     }
 };
@@ -5460,9 +5531,9 @@ struct external_constructor<value_t::array>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::array;
-        j.m_value = arr;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::array;
+        j.m_data.m_value = arr;
         j.set_parents();
         j.assert_invariant();
     }
@@ -5470,9 +5541,9 @@ struct external_constructor<value_t::array>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::array;
-        j.m_value = std::move(arr);
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::array;
+        j.m_data.m_value = std::move(arr);
         j.set_parents();
         j.assert_invariant();
     }
@@ -5485,9 +5556,9 @@ struct external_constructor<value_t::array>
         using std::begin;
         using std::end;
 
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::array;
-        j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::array;
+        j.m_data.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
         j.set_parents();
         j.assert_invariant();
     }
@@ -5495,14 +5566,14 @@ struct external_constructor<value_t::array>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, const std::vector<bool>& arr)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::array;
-        j.m_value = value_t::array;
-        j.m_value.array->reserve(arr.size());
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::array;
+        j.m_data.m_value = value_t::array;
+        j.m_data.m_value.array->reserve(arr.size());
         for (const bool x : arr)
         {
-            j.m_value.array->push_back(x);
-            j.set_parent(j.m_value.array->back());
+            j.m_data.m_value.array->push_back(x);
+            j.set_parent(j.m_data.m_value.array->back());
         }
         j.assert_invariant();
     }
@@ -5511,13 +5582,13 @@ struct external_constructor<value_t::array>
              enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
     static void construct(BasicJsonType& j, const std::valarray<T>& arr)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::array;
-        j.m_value = value_t::array;
-        j.m_value.array->resize(arr.size());
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::array;
+        j.m_data.m_value = value_t::array;
+        j.m_data.m_value.array->resize(arr.size());
         if (arr.size() > 0)
         {
-            std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin());
+            std::copy(std::begin(arr), std::end(arr), j.m_data.m_value.array->begin());
         }
         j.set_parents();
         j.assert_invariant();
@@ -5530,9 +5601,9 @@ struct external_constructor<value_t::object>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::object;
-        j.m_value = obj;
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::object;
+        j.m_data.m_value = obj;
         j.set_parents();
         j.assert_invariant();
     }
@@ -5540,9 +5611,9 @@ struct external_constructor<value_t::object>
     template<typename BasicJsonType>
     static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
     {
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::object;
-        j.m_value = std::move(obj);
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::object;
+        j.m_data.m_value = std::move(obj);
         j.set_parents();
         j.assert_invariant();
     }
@@ -5554,9 +5625,9 @@ struct external_constructor<value_t::object>
         using std::begin;
         using std::end;
 
-        j.m_value.destroy(j.m_type);
-        j.m_type = value_t::object;
-        j.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
+        j.m_data.m_value.destroy(j.m_data.m_type);
+        j.m_data.m_type = value_t::object;
+        j.m_data.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
         j.set_parents();
         j.assert_invariant();
     }
@@ -5626,7 +5697,8 @@ template<typename BasicJsonType, typename EnumType,
 inline void to_json(BasicJsonType& j, EnumType e) noexcept
 {
     using underlying_type = typename std::underlying_type<EnumType>::type;
-    external_constructor<value_t::number_integer>::construct(j, static_cast<underlying_type>(e));
+    static constexpr value_t integral_value_t = std::is_unsigned<underlying_type>::value ? value_t::number_unsigned : value_t::number_integer;
+    external_constructor<integral_value_t>::construct(j, static_cast<underlying_type>(e));
 }
 #endif  // JSON_DISABLE_ENUM_SERIALIZATION
 
@@ -5796,10 +5868,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/byte_container_with_subtype.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -5908,10 +5980,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/hash.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -6041,10 +6113,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/input/binary_reader.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -6067,10 +6139,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/input/input_adapters.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -6094,6 +6166,8 @@ NLOHMANN_JSON_NAMESPACE_END
 
 // #include <nlohmann/detail/macro_scope.hpp>
 
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
 
 NLOHMANN_JSON_NAMESPACE_BEGIN
 namespace detail
@@ -6140,7 +6214,6 @@ class file_input_adapter
     std::FILE* m_file;
 };
 
-
 /*!
 Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at
 beginning of input. Does not support changing the underlying std::streambuf
@@ -6214,16 +6287,16 @@ class iterator_input_adapter
         : current(std::move(first)), end(std::move(last))
     {}
 
-    typename std::char_traits<char_type>::int_type get_character()
+    typename char_traits<char_type>::int_type get_character()
     {
         if (JSON_HEDLEY_LIKELY(current != end))
         {
-            auto result = std::char_traits<char_type>::to_int_type(*current);
+            auto result = char_traits<char_type>::to_int_type(*current);
             std::advance(current, 1);
             return result;
         }
 
-        return std::char_traits<char_type>::eof();
+        return char_traits<char_type>::eof();
     }
 
   private:
@@ -6239,7 +6312,6 @@ class iterator_input_adapter
     }
 };
 
-
 template<typename BaseInputAdapter, size_t T>
 struct wide_string_input_helper;
 
@@ -6363,7 +6435,7 @@ struct wide_string_input_helper<BaseInputAdapter, 2>
     }
 };
 
-// Wraps another input apdater to convert wide character types into individual bytes.
+// Wraps another input adapter to convert wide character types into individual bytes.
 template<typename BaseInputAdapter, typename WideCharType>
 class wide_string_input_adapter
 {
@@ -6408,7 +6480,6 @@ class wide_string_input_adapter
     std::size_t utf8_bytes_filled = 0;
 };
 
-
 template<typename IteratorType, typename Enable = void>
 struct iterator_input_adapter_factory
 {
@@ -6565,10 +6636,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/input/json_sax.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -6710,7 +6781,6 @@ struct json_sax
     virtual ~json_sax() = default;
 };
 
-
 namespace detail
 {
 /*!
@@ -6812,7 +6882,7 @@ class json_sax_dom_parser
         JSON_ASSERT(ref_stack.back()->is_object());
 
         // add null at given key and store the reference for later
-        object_element = &(ref_stack.back()->m_value.object->operator[](val));
+        object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val));
         return true;
     }
 
@@ -6887,8 +6957,8 @@ class json_sax_dom_parser
 
         if (ref_stack.back()->is_array())
         {
-            ref_stack.back()->m_value.array->emplace_back(std::forward<Value>(v));
-            return &(ref_stack.back()->m_value.array->back());
+            ref_stack.back()->m_data.m_value.array->emplace_back(std::forward<Value>(v));
+            return &(ref_stack.back()->m_data.m_value.array->back());
         }
 
         JSON_ASSERT(ref_stack.back()->is_object());
@@ -7007,7 +7077,7 @@ class json_sax_dom_callback_parser
         // add discarded value at given key and store the reference for later
         if (keep && ref_stack.back())
         {
-            object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded);
+            object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val) = discarded);
         }
 
         return true;
@@ -7092,7 +7162,7 @@ class json_sax_dom_callback_parser
         // remove discarded value
         if (!keep && !ref_stack.empty() && ref_stack.back()->is_array())
         {
-            ref_stack.back()->m_value.array->pop_back();
+            ref_stack.back()->m_data.m_value.array->pop_back();
         }
 
         return true;
@@ -7159,7 +7229,7 @@ class json_sax_dom_callback_parser
         if (ref_stack.empty())
         {
             root = std::move(value);
-            return {true, &root};
+            return {true, & root};
         }
 
         // skip this value if we already decided to skip the parent
@@ -7175,8 +7245,8 @@ class json_sax_dom_callback_parser
         // array
         if (ref_stack.back()->is_array())
         {
-            ref_stack.back()->m_value.array->emplace_back(std::move(value));
-            return {true, &(ref_stack.back()->m_value.array->back())};
+            ref_stack.back()->m_data.m_value.array->emplace_back(std::move(value));
+            return {true, & (ref_stack.back()->m_data.m_value.array->back())};
         }
 
         // object
@@ -7201,9 +7271,9 @@ class json_sax_dom_callback_parser
     /// stack to model hierarchy of values
     std::vector<BasicJsonType*> ref_stack {};
     /// stack to manage which values to keep
-    std::vector<bool> keep_stack {};
+    std::vector<bool> keep_stack {}; // NOLINT(readability-redundant-member-init)
     /// stack to manage which object keys to keep
-    std::vector<bool> key_keep_stack {};
+    std::vector<bool> key_keep_stack {}; // NOLINT(readability-redundant-member-init)
     /// helper to hold the reference for the next object element
     BasicJsonType* object_element = nullptr;
     /// whether a syntax error occurred
@@ -7298,10 +7368,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/input/lexer.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -7322,6 +7392,8 @@ NLOHMANN_JSON_NAMESPACE_END
 
 // #include <nlohmann/detail/macro_scope.hpp>
 
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
 
 NLOHMANN_JSON_NAMESPACE_BEGIN
 namespace detail
@@ -7416,7 +7488,7 @@ class lexer : public lexer_base<BasicJsonType>
     using number_float_t = typename BasicJsonType::number_float_t;
     using string_t = typename BasicJsonType::string_t;
     using char_type = typename InputAdapterType::char_type;
-    using char_int_type = typename std::char_traits<char_type>::int_type;
+    using char_int_type = typename char_traits<char_type>::int_type;
 
   public:
     using token_type = typename lexer_base<BasicJsonType>::token_type;
@@ -7523,7 +7595,7 @@ class lexer : public lexer_base<BasicJsonType>
         for (auto range = ranges.begin(); range != ranges.end(); ++range)
         {
             get();
-            if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
+            if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions)
             {
                 add(current);
             }
@@ -7566,7 +7638,7 @@ class lexer : public lexer_base<BasicJsonType>
             switch (get())
             {
                 // end of file while parsing string
-                case std::char_traits<char_type>::eof():
+                case char_traits<char_type>::eof():
                 {
                     error_message = "invalid string: missing closing quote";
                     return token_type::parse_error;
@@ -8155,7 +8227,7 @@ class lexer : public lexer_base<BasicJsonType>
                     {
                         case '\n':
                         case '\r':
-                        case std::char_traits<char_type>::eof():
+                        case char_traits<char_type>::eof():
                         case '\0':
                             return true;
 
@@ -8172,7 +8244,7 @@ class lexer : public lexer_base<BasicJsonType>
                 {
                     switch (get())
                     {
-                        case std::char_traits<char_type>::eof():
+                        case char_traits<char_type>::eof():
                         case '\0':
                         {
                             error_message = "invalid comment; missing closing '*/'";
@@ -8601,10 +8673,10 @@ scan_number_done:
     token_type scan_literal(const char_type* literal_text, const std::size_t length,
                             token_type return_type)
     {
-        JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
+        JSON_ASSERT(char_traits<char_type>::to_char_type(current) == literal_text[0]);
         for (std::size_t i = 1; i < length; ++i)
         {
-            if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
+            if (JSON_HEDLEY_UNLIKELY(char_traits<char_type>::to_char_type(get()) != literal_text[i]))
             {
                 error_message = "invalid literal";
                 return token_type::parse_error;
@@ -8622,7 +8694,7 @@ scan_number_done:
     {
         token_buffer.clear();
         token_string.clear();
-        token_string.push_back(std::char_traits<char_type>::to_char_type(current));
+        token_string.push_back(char_traits<char_type>::to_char_type(current));
     }
 
     /*
@@ -8630,7 +8702,7 @@ scan_number_done:
 
     This function provides the interface to the used input adapter. It does
     not throw in case the input reached EOF, but returns a
-    `std::char_traits<char>::eof()` in that case.  Stores the scanned characters
+    `char_traits<char>::eof()` in that case.  Stores the scanned characters
     for use in error messages.
 
     @return character read from the input
@@ -8650,9 +8722,9 @@ scan_number_done:
             current = ia.get_character();
         }
 
-        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
+        if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
         {
-            token_string.push_back(std::char_traits<char_type>::to_char_type(current));
+            token_string.push_back(char_traits<char_type>::to_char_type(current));
         }
 
         if (current == '\n')
@@ -8691,7 +8763,7 @@ scan_number_done:
             --position.chars_read_current_line;
         }
 
-        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
+        if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
         {
             JSON_ASSERT(!token_string.empty());
             token_string.pop_back();
@@ -8885,7 +8957,7 @@ scan_number_done:
             // end of input (the null byte is needed when parsing from
             // string literals)
             case '\0':
-            case std::char_traits<char_type>::eof():
+            case char_traits<char_type>::eof():
                 return token_type::end_of_input;
 
             // error
@@ -8903,7 +8975,7 @@ scan_number_done:
     const bool ignore_comments = false;
 
     /// the current character
-    char_int_type current = std::char_traits<char_type>::eof();
+    char_int_type current = char_traits<char_type>::eof();
 
     /// whether the next get() call should just return current
     bool next_unget = false;
@@ -8937,10 +9009,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/meta/is_sax.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -9129,7 +9201,6 @@ static inline bool little_endianness(int num = 1) noexcept
     return *reinterpret_cast<char*>(&num) == 1;
 }
 
-
 ///////////////////
 // binary reader //
 ///////////////////
@@ -9147,7 +9218,7 @@ class binary_reader
     using binary_t = typename BasicJsonType::binary_t;
     using json_sax_t = SAX;
     using char_type = typename InputAdapterType::char_type;
-    using char_int_type = typename std::char_traits<char_type>::int_type;
+    using char_int_type = typename char_traits<char_type>::int_type;
 
   public:
     /*!
@@ -9220,7 +9291,7 @@ class binary_reader
                 get();
             }
 
-            if (JSON_HEDLEY_UNLIKELY(current != std::char_traits<char_type>::eof()))
+            if (JSON_HEDLEY_UNLIKELY(current != char_traits<char_type>::eof()))
             {
                 return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read,
                                         exception_message(input_format, concat("expected end of input; last byte: 0x", get_token_string()), "value"), nullptr));
@@ -9303,7 +9374,7 @@ class binary_reader
                                     exception_message(input_format_t::bson, concat("string length must be at least 1, is ", std::to_string(len)), "string"), nullptr));
         }
 
-        return get_string(input_format_t::bson, len - static_cast<NumberType>(1), result) && get() != std::char_traits<char_type>::eof();
+        return get_string(input_format_t::bson, len - static_cast<NumberType>(1), result) && get() != char_traits<char_type>::eof();
     }
 
     /*!
@@ -9404,7 +9475,7 @@ class binary_reader
             {
                 std::array<char, 3> cr{{}};
                 static_cast<void>((std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(element_type))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
-                std::string cr_str{cr.data()};
+                const std::string cr_str{cr.data()};
                 return sax->parse_error(element_type_parse_position, cr_str,
                                         parse_error::create(114, element_type_parse_position, concat("Unsupported BSON record type 0x", cr_str), nullptr));
             }
@@ -9497,7 +9568,7 @@ class binary_reader
         switch (get_char ? get() : current)
         {
             // EOF
-            case std::char_traits<char_type>::eof():
+            case char_traits<char_type>::eof():
                 return unexpect_eof(input_format_t::cbor, "value");
 
             // Integer 0x00..0x17 (0..23)
@@ -10272,7 +10343,7 @@ class binary_reader
         switch (get())
         {
             // EOF
-            case std::char_traits<char_type>::eof():
+            case char_traits<char_type>::eof():
                 return unexpect_eof(input_format_t::msgpack, "value");
 
             // positive fixint
@@ -11339,7 +11410,7 @@ class binary_reader
                                         exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr));
             }
 
-            bool is_error = get_ubjson_size_value(result.first, is_ndarray);
+            const bool is_error = get_ubjson_size_value(result.first, is_ndarray);
             if (input_format == input_format_t::bjdata && is_ndarray)
             {
                 if (inside_ndarray)
@@ -11354,7 +11425,7 @@ class binary_reader
 
         if (current == '#')
         {
-            bool is_error = get_ubjson_size_value(result.first, is_ndarray);
+            const bool is_error = get_ubjson_size_value(result.first, is_ndarray);
             if (input_format == input_format_t::bjdata && is_ndarray)
             {
                 return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read,
@@ -11374,7 +11445,7 @@ class binary_reader
     {
         switch (prefix)
         {
-            case std::char_traits<char_type>::eof():  // EOF
+            case char_traits<char_type>::eof():  // EOF
                 return unexpect_eof(input_format, "value");
 
             case 'T':  // true
@@ -11819,7 +11890,7 @@ class binary_reader
 
     This function provides the interface to the used input adapter. It does
     not throw in case the input reached EOF, but returns a -'ve valued
-    `std::char_traits<char_type>::eof()` in that case.
+    `char_traits<char_type>::eof()` in that case.
 
     @return character read from the input
     */
@@ -11961,7 +12032,7 @@ class binary_reader
     JSON_HEDLEY_NON_NULL(3)
     bool unexpect_eof(const input_format_t format, const char* context) const
     {
-        if (JSON_HEDLEY_UNLIKELY(current == std::char_traits<char_type>::eof()))
+        if (JSON_HEDLEY_UNLIKELY(current == char_traits<char_type>::eof()))
         {
             return sax->parse_error(chars_read, "<end of file>",
                                     parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr));
@@ -12028,7 +12099,7 @@ class binary_reader
     InputAdapterType ia;
 
     /// the current character
-    char_int_type current = std::char_traits<char_type>::eof();
+    char_int_type current = char_traits<char_type>::eof();
 
     /// the number of characters read
     std::size_t chars_read = 0;
@@ -12090,10 +12161,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/input/parser.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -12439,13 +12510,25 @@ class parser
                                                 m_lexer.get_token_string(),
                                                 parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), nullptr));
                     }
+                    case token_type::end_of_input:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(m_lexer.get_position().chars_read_total == 1))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(),
+                                                            "attempting to parse an empty input; check that your input string or stream contains the expected JSON", nullptr));
+                        }
 
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), nullptr));
+                    }
                     case token_type::uninitialized:
                     case token_type::end_array:
                     case token_type::end_object:
                     case token_type::name_separator:
                     case token_type::value_separator:
-                    case token_type::end_of_input:
                     case token_type::literal_or_value:
                     default: // the last token was unexpected
                     {
@@ -12607,10 +12690,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/iterators/internal_iterator.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -12620,10 +12703,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/iterators/primitive_iterator.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -12779,10 +12862,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/iterators/iter_impl.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -12887,7 +12970,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
@@ -12984,17 +13067,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
-                m_it.object_iterator = m_object->m_value.object->begin();
+                m_it.object_iterator = m_object->m_data.m_value.object->begin();
                 break;
             }
 
             case value_t::array:
             {
-                m_it.array_iterator = m_object->m_value.array->begin();
+                m_it.array_iterator = m_object->m_data.m_value.array->begin();
                 break;
             }
 
@@ -13028,17 +13111,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
-                m_it.object_iterator = m_object->m_value.object->end();
+                m_it.object_iterator = m_object->m_data.m_value.object->end();
                 break;
             }
 
             case value_t::array:
             {
-                m_it.array_iterator = m_object->m_value.array->end();
+                m_it.array_iterator = m_object->m_data.m_value.array->end();
                 break;
             }
 
@@ -13067,17 +13150,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
-                JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
+                JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end());
                 return m_it.object_iterator->second;
             }
 
             case value_t::array:
             {
-                JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
+                JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end());
                 return *m_it.array_iterator;
             }
 
@@ -13111,17 +13194,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
-                JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
+                JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end());
                 return &(m_it.object_iterator->second);
             }
 
             case value_t::array:
             {
-                JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
+                JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end());
                 return &*m_it.array_iterator;
             }
 
@@ -13164,7 +13247,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
@@ -13215,7 +13298,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
             {
@@ -13262,7 +13345,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
 
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
                 return (m_it.object_iterator == other.m_it.object_iterator);
@@ -13307,7 +13390,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
 
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
                 JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", m_object));
@@ -13363,7 +13446,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
                 JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object));
@@ -13442,7 +13525,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
                 JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object));
@@ -13471,7 +13554,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci
     {
         JSON_ASSERT(m_object != nullptr);
 
-        switch (m_object->m_type)
+        switch (m_object->m_data.m_type)
         {
             case value_t::object:
                 JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", m_object));
@@ -13541,10 +13624,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/iterators/json_reverse_iterator.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -13673,13 +13756,55 @@ NLOHMANN_JSON_NAMESPACE_END
 
 // #include <nlohmann/detail/iterators/primitive_iterator.hpp>
 
+// #include <nlohmann/detail/json_custom_base_class.hpp>
+//     __ _____ _____ _____
+//  __|  |   __|     |   | |  JSON for Modern C++
+// |  |  |__   |  |  | | | |  version 3.11.3
+// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
+//
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
+// SPDX-License-Identifier: MIT
+
+
+
+#include <type_traits> // conditional, is_same
+
+// #include <nlohmann/detail/abi_macros.hpp>
+
+
+NLOHMANN_JSON_NAMESPACE_BEGIN
+namespace detail
+{
+
+/*!
+@brief Default base class of the @ref basic_json class.
+
+So that the correct implementations of the copy / move ctors / assign operators
+of @ref basic_json do not require complex case distinctions
+(no base class / custom base class used as customization point),
+@ref basic_json always has a base class.
+By default, this class is used because it is empty and thus has no effect
+on the behavior of @ref basic_json.
+*/
+struct json_default_base {};
+
+template<class T>
+using json_base_class = typename std::conditional <
+                        std::is_same<T, void>::value,
+                        json_default_base,
+                        T
+                        >::type;
+
+}  // namespace detail
+NLOHMANN_JSON_NAMESPACE_END
+
 // #include <nlohmann/detail/json_pointer.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -13911,7 +14036,7 @@ class json_pointer
         const char* p = s.c_str();
         char* p_end = nullptr;
         errno = 0; // strtoull doesn't reset errno
-        unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int)
+        const unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int)
         if (p == p_end // invalid input or empty string
                 || errno == ERANGE // out of range
                 || JSON_HEDLEY_UNLIKELY(static_cast<std::size_t>(p_end - p) != s.size())) // incomplete read
@@ -14067,7 +14192,7 @@ class json_pointer
                     if (reference_token == "-")
                     {
                         // explicitly treat "-" as index beyond the end
-                        ptr = &ptr->operator[](ptr->m_value.array->size());
+                        ptr = &ptr->operator[](ptr->m_data.m_value.array->size());
                     }
                     else
                     {
@@ -14119,7 +14244,7 @@ class json_pointer
                     {
                         // "-" always fails the range check
                         JSON_THROW(detail::out_of_range::create(402, detail::concat(
-                                "array index '-' (", std::to_string(ptr->m_value.array->size()),
+                                "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()),
                                 ") is out of range"), ptr));
                     }
 
@@ -14176,7 +14301,7 @@ class json_pointer
                     if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
                     {
                         // "-" cannot be used for const access
-                        JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_value.array->size()), ") is out of range"), ptr));
+                        JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr));
                     }
 
                     // use unchecked array access
@@ -14226,7 +14351,7 @@ class json_pointer
                     {
                         // "-" always fails the range check
                         JSON_THROW(detail::out_of_range::create(402, detail::concat(
-                                "array index '-' (", std::to_string(ptr->m_value.array->size()),
+                                "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()),
                                 ") is out of range"), ptr));
                     }
 
@@ -14421,7 +14546,7 @@ class json_pointer
         {
             case detail::value_t::array:
             {
-                if (value.m_value.array->empty())
+                if (value.m_data.m_value.array->empty())
                 {
                     // flatten empty array as null
                     result[reference_string] = nullptr;
@@ -14429,10 +14554,10 @@ class json_pointer
                 else
                 {
                     // iterate array and use index as reference string
-                    for (std::size_t i = 0; i < value.m_value.array->size(); ++i)
+                    for (std::size_t i = 0; i < value.m_data.m_value.array->size(); ++i)
                     {
                         flatten(detail::concat(reference_string, '/', std::to_string(i)),
-                                value.m_value.array->operator[](i), result);
+                                value.m_data.m_value.array->operator[](i), result);
                     }
                 }
                 break;
@@ -14440,7 +14565,7 @@ class json_pointer
 
             case detail::value_t::object:
             {
-                if (value.m_value.object->empty())
+                if (value.m_data.m_value.object->empty())
                 {
                     // flatten empty object as null
                     result[reference_string] = nullptr;
@@ -14448,7 +14573,7 @@ class json_pointer
                 else
                 {
                     // iterate object and use keys as reference string
-                    for (const auto& element : *value.m_value.object)
+                    for (const auto& element : *value.m_data.m_value.object)
                     {
                         flatten(detail::concat(reference_string, '/', detail::escape(element.first)), element.second, result);
                     }
@@ -14495,7 +14620,7 @@ class json_pointer
         BasicJsonType result;
 
         // iterate the JSON object values
-        for (const auto& element : *value.m_value.object)
+        for (const auto& element : *value.m_data.m_value.object)
         {
             if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive()))
             {
@@ -14671,10 +14796,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/json_ref.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -14763,10 +14888,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/output/binary_writer.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -14789,10 +14914,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/output/output_adapters.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -14978,7 +15103,7 @@ class binary_writer
         {
             case value_t::object:
             {
-                write_bson_object(*j.m_value.object);
+                write_bson_object(*j.m_data.m_value.object);
                 break;
             }
 
@@ -15013,7 +15138,7 @@ class binary_writer
 
             case value_t::boolean:
             {
-                oa->write_character(j.m_value.boolean
+                oa->write_character(j.m_data.m_value.boolean
                                     ? to_char_type(0xF5)
                                     : to_char_type(0xF4));
                 break;
@@ -15021,42 +15146,42 @@ class binary_writer
 
             case value_t::number_integer:
             {
-                if (j.m_value.number_integer >= 0)
+                if (j.m_data.m_value.number_integer >= 0)
                 {
                     // CBOR does not differentiate between positive signed
                     // integers and unsigned integers. Therefore, we used the
                     // code from the value_t::number_unsigned case here.
-                    if (j.m_value.number_integer <= 0x17)
+                    if (j.m_data.m_value.number_integer <= 0x17)
                     {
-                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+                    else if (j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
                     {
                         oa->write_character(to_char_type(0x18));
-                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
+                    else if (j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
                     {
                         oa->write_character(to_char_type(0x19));
-                        write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
+                    else if (j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
                     {
                         oa->write_character(to_char_type(0x1A));
-                        write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_integer));
                     }
                     else
                     {
                         oa->write_character(to_char_type(0x1B));
-                        write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_integer));
                     }
                 }
                 else
                 {
                     // The conversions below encode the sign in the first
                     // byte, and the value is converted to a positive number.
-                    const auto positive_number = -1 - j.m_value.number_integer;
-                    if (j.m_value.number_integer >= -24)
+                    const auto positive_number = -1 - j.m_data.m_value.number_integer;
+                    if (j.m_data.m_value.number_integer >= -24)
                     {
                         write_number(static_cast<std::uint8_t>(0x20 + positive_number));
                     }
@@ -15086,52 +15211,52 @@ class binary_writer
 
             case value_t::number_unsigned:
             {
-                if (j.m_value.number_unsigned <= 0x17)
+                if (j.m_data.m_value.number_unsigned <= 0x17)
                 {
-                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_unsigned));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                 {
                     oa->write_character(to_char_type(0x18));
-                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_unsigned));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                 {
                     oa->write_character(to_char_type(0x19));
-                    write_number(static_cast<std::uint16_t>(j.m_value.number_unsigned));
+                    write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_unsigned));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                 {
                     oa->write_character(to_char_type(0x1A));
-                    write_number(static_cast<std::uint32_t>(j.m_value.number_unsigned));
+                    write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_unsigned));
                 }
                 else
                 {
                     oa->write_character(to_char_type(0x1B));
-                    write_number(static_cast<std::uint64_t>(j.m_value.number_unsigned));
+                    write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_unsigned));
                 }
                 break;
             }
 
             case value_t::number_float:
             {
-                if (std::isnan(j.m_value.number_float))
+                if (std::isnan(j.m_data.m_value.number_float))
                 {
                     // NaN is 0xf97e00 in CBOR
                     oa->write_character(to_char_type(0xF9));
                     oa->write_character(to_char_type(0x7E));
                     oa->write_character(to_char_type(0x00));
                 }
-                else if (std::isinf(j.m_value.number_float))
+                else if (std::isinf(j.m_data.m_value.number_float))
                 {
                     // Infinity is 0xf97c00, -Infinity is 0xf9fc00
                     oa->write_character(to_char_type(0xf9));
-                    oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC));
+                    oa->write_character(j.m_data.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC));
                     oa->write_character(to_char_type(0x00));
                 }
                 else
                 {
-                    write_compact_float(j.m_value.number_float, detail::input_format_t::cbor);
+                    write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::cbor);
                 }
                 break;
             }
@@ -15139,7 +15264,7 @@ class binary_writer
             case value_t::string:
             {
                 // step 1: write control byte and the string length
-                const auto N = j.m_value.string->size();
+                const auto N = j.m_data.m_value.string->size();
                 if (N <= 0x17)
                 {
                     write_number(static_cast<std::uint8_t>(0x60 + N));
@@ -15169,15 +15294,15 @@ class binary_writer
 
                 // step 2: write the string
                 oa->write_characters(
-                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
-                    j.m_value.string->size());
+                    reinterpret_cast<const CharType*>(j.m_data.m_value.string->c_str()),
+                    j.m_data.m_value.string->size());
                 break;
             }
 
             case value_t::array:
             {
                 // step 1: write control byte and the array size
-                const auto N = j.m_value.array->size();
+                const auto N = j.m_data.m_value.array->size();
                 if (N <= 0x17)
                 {
                     write_number(static_cast<std::uint8_t>(0x80 + N));
@@ -15206,7 +15331,7 @@ class binary_writer
                 // LCOV_EXCL_STOP
 
                 // step 2: write each element
-                for (const auto& el : *j.m_value.array)
+                for (const auto& el : *j.m_data.m_value.array)
                 {
                     write_cbor(el);
                 }
@@ -15215,32 +15340,32 @@ class binary_writer
 
             case value_t::binary:
             {
-                if (j.m_value.binary->has_subtype())
+                if (j.m_data.m_value.binary->has_subtype())
                 {
-                    if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint8_t>::max)())
+                    if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint8_t>::max)())
                     {
                         write_number(static_cast<std::uint8_t>(0xd8));
-                        write_number(static_cast<std::uint8_t>(j.m_value.binary->subtype()));
+                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.binary->subtype()));
                     }
-                    else if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint16_t>::max)())
+                    else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint16_t>::max)())
                     {
                         write_number(static_cast<std::uint8_t>(0xd9));
-                        write_number(static_cast<std::uint16_t>(j.m_value.binary->subtype()));
+                        write_number(static_cast<std::uint16_t>(j.m_data.m_value.binary->subtype()));
                     }
-                    else if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint32_t>::max)())
+                    else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint32_t>::max)())
                     {
                         write_number(static_cast<std::uint8_t>(0xda));
-                        write_number(static_cast<std::uint32_t>(j.m_value.binary->subtype()));
+                        write_number(static_cast<std::uint32_t>(j.m_data.m_value.binary->subtype()));
                     }
-                    else if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint64_t>::max)())
+                    else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits<std::uint64_t>::max)())
                     {
                         write_number(static_cast<std::uint8_t>(0xdb));
-                        write_number(static_cast<std::uint64_t>(j.m_value.binary->subtype()));
+                        write_number(static_cast<std::uint64_t>(j.m_data.m_value.binary->subtype()));
                     }
                 }
 
                 // step 1: write control byte and the binary array size
-                const auto N = j.m_value.binary->size();
+                const auto N = j.m_data.m_value.binary->size();
                 if (N <= 0x17)
                 {
                     write_number(static_cast<std::uint8_t>(0x40 + N));
@@ -15270,7 +15395,7 @@ class binary_writer
 
                 // step 2: write each element
                 oa->write_characters(
-                    reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+                    reinterpret_cast<const CharType*>(j.m_data.m_value.binary->data()),
                     N);
 
                 break;
@@ -15279,7 +15404,7 @@ class binary_writer
             case value_t::object:
             {
                 // step 1: write control byte and the object size
-                const auto N = j.m_value.object->size();
+                const auto N = j.m_data.m_value.object->size();
                 if (N <= 0x17)
                 {
                     write_number(static_cast<std::uint8_t>(0xA0 + N));
@@ -15308,7 +15433,7 @@ class binary_writer
                 // LCOV_EXCL_STOP
 
                 // step 2: write each element
-                for (const auto& el : *j.m_value.object)
+                for (const auto& el : *j.m_data.m_value.object)
                 {
                     write_cbor(el.first);
                     write_cbor(el.second);
@@ -15337,7 +15462,7 @@ class binary_writer
 
             case value_t::boolean: // true and false
             {
-                oa->write_character(j.m_value.boolean
+                oa->write_character(j.m_data.m_value.boolean
                                     ? to_char_type(0xC3)
                                     : to_char_type(0xC2));
                 break;
@@ -15345,75 +15470,75 @@ class binary_writer
 
             case value_t::number_integer:
             {
-                if (j.m_value.number_integer >= 0)
+                if (j.m_data.m_value.number_integer >= 0)
                 {
                     // MessagePack does not differentiate between positive
                     // signed integers and unsigned integers. Therefore, we used
                     // the code from the value_t::number_unsigned case here.
-                    if (j.m_value.number_unsigned < 128)
+                    if (j.m_data.m_value.number_unsigned < 128)
                     {
                         // positive fixnum
-                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                     {
                         // uint 8
                         oa->write_character(to_char_type(0xCC));
-                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                     {
                         // uint 16
                         oa->write_character(to_char_type(0xCD));
-                        write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                     {
                         // uint 32
                         oa->write_character(to_char_type(0xCE));
-                        write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
+                    else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                     {
                         // uint 64
                         oa->write_character(to_char_type(0xCF));
-                        write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_integer));
                     }
                 }
                 else
                 {
-                    if (j.m_value.number_integer >= -32)
+                    if (j.m_data.m_value.number_integer >= -32)
                     {
                         // negative fixnum
-                        write_number(static_cast<std::int8_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::int8_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int8_t>::min)() &&
-                             j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
+                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int8_t>::min)() &&
+                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
                     {
                         // int 8
                         oa->write_character(to_char_type(0xD0));
-                        write_number(static_cast<std::int8_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::int8_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int16_t>::min)() &&
-                             j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
+                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int16_t>::min)() &&
+                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
                     {
                         // int 16
                         oa->write_character(to_char_type(0xD1));
-                        write_number(static_cast<std::int16_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::int16_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int32_t>::min)() &&
-                             j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
+                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int32_t>::min)() &&
+                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
                     {
                         // int 32
                         oa->write_character(to_char_type(0xD2));
-                        write_number(static_cast<std::int32_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::int32_t>(j.m_data.m_value.number_integer));
                     }
-                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int64_t>::min)() &&
-                             j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
+                    else if (j.m_data.m_value.number_integer >= (std::numeric_limits<std::int64_t>::min)() &&
+                             j.m_data.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
                     {
                         // int 64
                         oa->write_character(to_char_type(0xD3));
-                        write_number(static_cast<std::int64_t>(j.m_value.number_integer));
+                        write_number(static_cast<std::int64_t>(j.m_data.m_value.number_integer));
                     }
                 }
                 break;
@@ -15421,48 +15546,48 @@ class binary_writer
 
             case value_t::number_unsigned:
             {
-                if (j.m_value.number_unsigned < 128)
+                if (j.m_data.m_value.number_unsigned < 128)
                 {
                     // positive fixnum
-                    write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                 {
                     // uint 8
                     oa->write_character(to_char_type(0xCC));
-                    write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                    write_number(static_cast<std::uint8_t>(j.m_data.m_value.number_integer));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                 {
                     // uint 16
                     oa->write_character(to_char_type(0xCD));
-                    write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+                    write_number(static_cast<std::uint16_t>(j.m_data.m_value.number_integer));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                 {
                     // uint 32
                     oa->write_character(to_char_type(0xCE));
-                    write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+                    write_number(static_cast<std::uint32_t>(j.m_data.m_value.number_integer));
                 }
-                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
+                else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                 {
                     // uint 64
                     oa->write_character(to_char_type(0xCF));
-                    write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+                    write_number(static_cast<std::uint64_t>(j.m_data.m_value.number_integer));
                 }
                 break;
             }
 
             case value_t::number_float:
             {
-                write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack);
+                write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::msgpack);
                 break;
             }
 
             case value_t::string:
             {
                 // step 1: write control byte and the string length
-                const auto N = j.m_value.string->size();
+                const auto N = j.m_data.m_value.string->size();
                 if (N <= 31)
                 {
                     // fixstr
@@ -15489,15 +15614,15 @@ class binary_writer
 
                 // step 2: write the string
                 oa->write_characters(
-                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
-                    j.m_value.string->size());
+                    reinterpret_cast<const CharType*>(j.m_data.m_value.string->c_str()),
+                    j.m_data.m_value.string->size());
                 break;
             }
 
             case value_t::array:
             {
                 // step 1: write control byte and the array size
-                const auto N = j.m_value.array->size();
+                const auto N = j.m_data.m_value.array->size();
                 if (N <= 15)
                 {
                     // fixarray
@@ -15517,7 +15642,7 @@ class binary_writer
                 }
 
                 // step 2: write each element
-                for (const auto& el : *j.m_value.array)
+                for (const auto& el : *j.m_data.m_value.array)
                 {
                     write_msgpack(el);
                 }
@@ -15528,10 +15653,10 @@ class binary_writer
             {
                 // step 0: determine if the binary type has a set subtype to
                 // determine whether or not to use the ext or fixext types
-                const bool use_ext = j.m_value.binary->has_subtype();
+                const bool use_ext = j.m_data.m_value.binary->has_subtype();
 
                 // step 1: write control byte and the byte string length
-                const auto N = j.m_value.binary->size();
+                const auto N = j.m_data.m_value.binary->size();
                 if (N <= (std::numeric_limits<std::uint8_t>::max)())
                 {
                     std::uint8_t output_type{};
@@ -15576,18 +15701,18 @@ class binary_writer
                 }
                 else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                 {
-                    std::uint8_t output_type = use_ext
-                                               ? 0xC8 // ext 16
-                                               : 0xC5; // bin 16
+                    const std::uint8_t output_type = use_ext
+                                                     ? 0xC8 // ext 16
+                                                     : 0xC5; // bin 16
 
                     oa->write_character(to_char_type(output_type));
                     write_number(static_cast<std::uint16_t>(N));
                 }
                 else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                 {
-                    std::uint8_t output_type = use_ext
-                                               ? 0xC9 // ext 32
-                                               : 0xC6; // bin 32
+                    const std::uint8_t output_type = use_ext
+                                                     ? 0xC9 // ext 32
+                                                     : 0xC6; // bin 32
 
                     oa->write_character(to_char_type(output_type));
                     write_number(static_cast<std::uint32_t>(N));
@@ -15596,12 +15721,12 @@ class binary_writer
                 // step 1.5: if this is an ext type, write the subtype
                 if (use_ext)
                 {
-                    write_number(static_cast<std::int8_t>(j.m_value.binary->subtype()));
+                    write_number(static_cast<std::int8_t>(j.m_data.m_value.binary->subtype()));
                 }
 
                 // step 2: write the byte string
                 oa->write_characters(
-                    reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+                    reinterpret_cast<const CharType*>(j.m_data.m_value.binary->data()),
                     N);
 
                 break;
@@ -15610,7 +15735,7 @@ class binary_writer
             case value_t::object:
             {
                 // step 1: write control byte and the object size
-                const auto N = j.m_value.object->size();
+                const auto N = j.m_data.m_value.object->size();
                 if (N <= 15)
                 {
                     // fixmap
@@ -15630,7 +15755,7 @@ class binary_writer
                 }
 
                 // step 2: write each element
-                for (const auto& el : *j.m_value.object)
+                for (const auto& el : *j.m_data.m_value.object)
                 {
                     write_msgpack(el.first);
                     write_msgpack(el.second);
@@ -15670,7 +15795,7 @@ class binary_writer
             {
                 if (add_prefix)
                 {
-                    oa->write_character(j.m_value.boolean
+                    oa->write_character(j.m_data.m_value.boolean
                                         ? to_char_type('T')
                                         : to_char_type('F'));
                 }
@@ -15679,19 +15804,19 @@ class binary_writer
 
             case value_t::number_integer:
             {
-                write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix, use_bjdata);
+                write_number_with_ubjson_prefix(j.m_data.m_value.number_integer, add_prefix, use_bjdata);
                 break;
             }
 
             case value_t::number_unsigned:
             {
-                write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix, use_bjdata);
+                write_number_with_ubjson_prefix(j.m_data.m_value.number_unsigned, add_prefix, use_bjdata);
                 break;
             }
 
             case value_t::number_float:
             {
-                write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix, use_bjdata);
+                write_number_with_ubjson_prefix(j.m_data.m_value.number_float, add_prefix, use_bjdata);
                 break;
             }
 
@@ -15701,10 +15826,10 @@ class binary_writer
                 {
                     oa->write_character(to_char_type('S'));
                 }
-                write_number_with_ubjson_prefix(j.m_value.string->size(), true, use_bjdata);
+                write_number_with_ubjson_prefix(j.m_data.m_value.string->size(), true, use_bjdata);
                 oa->write_characters(
-                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
-                    j.m_value.string->size());
+                    reinterpret_cast<const CharType*>(j.m_data.m_value.string->c_str()),
+                    j.m_data.m_value.string->size());
                 break;
             }
 
@@ -15716,7 +15841,7 @@ class binary_writer
                 }
 
                 bool prefix_required = true;
-                if (use_type && !j.m_value.array->empty())
+                if (use_type && !j.m_data.m_value.array->empty())
                 {
                     JSON_ASSERT(use_count);
                     const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata);
@@ -15739,10 +15864,10 @@ class binary_writer
                 if (use_count)
                 {
                     oa->write_character(to_char_type('#'));
-                    write_number_with_ubjson_prefix(j.m_value.array->size(), true, use_bjdata);
+                    write_number_with_ubjson_prefix(j.m_data.m_value.array->size(), true, use_bjdata);
                 }
 
-                for (const auto& el : *j.m_value.array)
+                for (const auto& el : *j.m_data.m_value.array)
                 {
                     write_ubjson(el, use_count, use_type, prefix_required, use_bjdata);
                 }
@@ -15762,7 +15887,7 @@ class binary_writer
                     oa->write_character(to_char_type('['));
                 }
 
-                if (use_type && !j.m_value.binary->empty())
+                if (use_type && !j.m_data.m_value.binary->empty())
                 {
                     JSON_ASSERT(use_count);
                     oa->write_character(to_char_type('$'));
@@ -15772,21 +15897,21 @@ class binary_writer
                 if (use_count)
                 {
                     oa->write_character(to_char_type('#'));
-                    write_number_with_ubjson_prefix(j.m_value.binary->size(), true, use_bjdata);
+                    write_number_with_ubjson_prefix(j.m_data.m_value.binary->size(), true, use_bjdata);
                 }
 
                 if (use_type)
                 {
                     oa->write_characters(
-                        reinterpret_cast<const CharType*>(j.m_value.binary->data()),
-                        j.m_value.binary->size());
+                        reinterpret_cast<const CharType*>(j.m_data.m_value.binary->data()),
+                        j.m_data.m_value.binary->size());
                 }
                 else
                 {
-                    for (size_t i = 0; i < j.m_value.binary->size(); ++i)
+                    for (size_t i = 0; i < j.m_data.m_value.binary->size(); ++i)
                     {
                         oa->write_character(to_char_type('U'));
-                        oa->write_character(j.m_value.binary->data()[i]);
+                        oa->write_character(j.m_data.m_value.binary->data()[i]);
                     }
                 }
 
@@ -15800,9 +15925,9 @@ class binary_writer
 
             case value_t::object:
             {
-                if (use_bjdata && j.m_value.object->size() == 3 && j.m_value.object->find("_ArrayType_") != j.m_value.object->end() && j.m_value.object->find("_ArraySize_") != j.m_value.object->end() && j.m_value.object->find("_ArrayData_") != j.m_value.object->end())
+                if (use_bjdata && j.m_data.m_value.object->size() == 3 && j.m_data.m_value.object->find("_ArrayType_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArraySize_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArrayData_") != j.m_data.m_value.object->end())
                 {
-                    if (!write_bjdata_ndarray(*j.m_value.object, use_count, use_type))  // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata)
+                    if (!write_bjdata_ndarray(*j.m_data.m_value.object, use_count, use_type))  // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata)
                     {
                         break;
                     }
@@ -15814,7 +15939,7 @@ class binary_writer
                 }
 
                 bool prefix_required = true;
-                if (use_type && !j.m_value.object->empty())
+                if (use_type && !j.m_data.m_value.object->empty())
                 {
                     JSON_ASSERT(use_count);
                     const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata);
@@ -15837,10 +15962,10 @@ class binary_writer
                 if (use_count)
                 {
                     oa->write_character(to_char_type('#'));
-                    write_number_with_ubjson_prefix(j.m_value.object->size(), true, use_bjdata);
+                    write_number_with_ubjson_prefix(j.m_data.m_value.object->size(), true, use_bjdata);
                 }
 
-                for (const auto& el : *j.m_value.object)
+                for (const auto& el : *j.m_data.m_value.object)
                 {
                     write_number_with_ubjson_prefix(el.first.size(), true, use_bjdata);
                     oa->write_characters(
@@ -15990,19 +16115,19 @@ class binary_writer
     void write_bson_unsigned(const string_t& name,
                              const BasicJsonType& j)
     {
-        if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+        if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
         {
             write_bson_entry_header(name, 0x10 /* int32 */);
-            write_number<std::int32_t>(static_cast<std::int32_t>(j.m_value.number_unsigned), true);
+            write_number<std::int32_t>(static_cast<std::int32_t>(j.m_data.m_value.number_unsigned), true);
         }
-        else if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+        else if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
         {
             write_bson_entry_header(name, 0x12 /* int64 */);
-            write_number<std::int64_t>(static_cast<std::int64_t>(j.m_value.number_unsigned), true);
+            write_number<std::int64_t>(static_cast<std::int64_t>(j.m_data.m_value.number_unsigned), true);
         }
         else
         {
-            JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j));
+            JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_data.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j));
         }
     }
 
@@ -16083,13 +16208,13 @@ class binary_writer
         switch (j.type())
         {
             case value_t::object:
-                return header_size + calc_bson_object_size(*j.m_value.object);
+                return header_size + calc_bson_object_size(*j.m_data.m_value.object);
 
             case value_t::array:
-                return header_size + calc_bson_array_size(*j.m_value.array);
+                return header_size + calc_bson_array_size(*j.m_data.m_value.array);
 
             case value_t::binary:
-                return header_size + calc_bson_binary_size(*j.m_value.binary);
+                return header_size + calc_bson_binary_size(*j.m_data.m_value.binary);
 
             case value_t::boolean:
                 return header_size + 1ul;
@@ -16098,13 +16223,13 @@ class binary_writer
                 return header_size + 8ul;
 
             case value_t::number_integer:
-                return header_size + calc_bson_integer_size(j.m_value.number_integer);
+                return header_size + calc_bson_integer_size(j.m_data.m_value.number_integer);
 
             case value_t::number_unsigned:
-                return header_size + calc_bson_unsigned_size(j.m_value.number_unsigned);
+                return header_size + calc_bson_unsigned_size(j.m_data.m_value.number_unsigned);
 
             case value_t::string:
-                return header_size + calc_bson_string_size(*j.m_value.string);
+                return header_size + calc_bson_string_size(*j.m_data.m_value.string);
 
             case value_t::null:
                 return header_size + 0ul;
@@ -16130,28 +16255,28 @@ class binary_writer
         switch (j.type())
         {
             case value_t::object:
-                return write_bson_object_entry(name, *j.m_value.object);
+                return write_bson_object_entry(name, *j.m_data.m_value.object);
 
             case value_t::array:
-                return write_bson_array(name, *j.m_value.array);
+                return write_bson_array(name, *j.m_data.m_value.array);
 
             case value_t::binary:
-                return write_bson_binary(name, *j.m_value.binary);
+                return write_bson_binary(name, *j.m_data.m_value.binary);
 
             case value_t::boolean:
-                return write_bson_boolean(name, j.m_value.boolean);
+                return write_bson_boolean(name, j.m_data.m_value.boolean);
 
             case value_t::number_float:
-                return write_bson_double(name, j.m_value.number_float);
+                return write_bson_double(name, j.m_data.m_value.number_float);
 
             case value_t::number_integer:
-                return write_bson_integer(name, j.m_value.number_integer);
+                return write_bson_integer(name, j.m_data.m_value.number_integer);
 
             case value_t::number_unsigned:
                 return write_bson_unsigned(name, j);
 
             case value_t::string:
-                return write_bson_string(name, *j.m_value.string);
+                return write_bson_string(name, *j.m_data.m_value.string);
 
             case value_t::null:
                 return write_bson_null(name);
@@ -16173,8 +16298,8 @@ class binary_writer
     */
     static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value)
     {
-        std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast<std::size_t>(0),
-                                    [](size_t result, const typename BasicJsonType::object_t::value_type & el)
+        const std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast<std::size_t>(0),
+                                          [](size_t result, const typename BasicJsonType::object_t::value_type & el)
         {
             return result += calc_bson_element_size(el.first, el.second);
         });
@@ -16424,35 +16549,35 @@ class binary_writer
                 return 'Z';
 
             case value_t::boolean:
-                return j.m_value.boolean ? 'T' : 'F';
+                return j.m_data.m_value.boolean ? 'T' : 'F';
 
             case value_t::number_integer:
             {
-                if ((std::numeric_limits<std::int8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
+                if ((std::numeric_limits<std::int8_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
                 {
                     return 'i';
                 }
-                if ((std::numeric_limits<std::uint8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+                if ((std::numeric_limits<std::uint8_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
                 {
                     return 'U';
                 }
-                if ((std::numeric_limits<std::int16_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
+                if ((std::numeric_limits<std::int16_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
                 {
                     return 'I';
                 }
-                if (use_bjdata && ((std::numeric_limits<std::uint16_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)()))
+                if (use_bjdata && ((std::numeric_limits<std::uint16_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)()))
                 {
                     return 'u';
                 }
-                if ((std::numeric_limits<std::int32_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
+                if ((std::numeric_limits<std::int32_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
                 {
                     return 'l';
                 }
-                if (use_bjdata && ((std::numeric_limits<std::uint32_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)()))
+                if (use_bjdata && ((std::numeric_limits<std::uint32_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)()))
                 {
                     return 'm';
                 }
-                if ((std::numeric_limits<std::int64_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
+                if ((std::numeric_limits<std::int64_t>::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
                 {
                     return 'L';
                 }
@@ -16462,35 +16587,35 @@ class binary_writer
 
             case value_t::number_unsigned:
             {
-                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
+                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
                 {
                     return 'i';
                 }
-                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint8_t>::max)()))
+                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint8_t>::max)()))
                 {
                     return 'U';
                 }
-                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
+                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
                 {
                     return 'I';
                 }
-                if (use_bjdata && j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint16_t>::max)()))
+                if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint16_t>::max)()))
                 {
                     return 'u';
                 }
-                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
                 {
                     return 'l';
                 }
-                if (use_bjdata && j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint32_t>::max)()))
+                if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint32_t>::max)()))
                 {
                     return 'm';
                 }
-                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+                if (j.m_data.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
                 {
                     return 'L';
                 }
-                if (use_bjdata && j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
+                if (use_bjdata && j.m_data.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                 {
                     return 'M';
                 }
@@ -16499,7 +16624,7 @@ class binary_writer
             }
 
             case value_t::number_float:
-                return get_ubjson_float_prefix(j.m_value.number_float);
+                return get_ubjson_float_prefix(j.m_data.m_value.number_float);
 
             case value_t::string:
                 return 'S';
@@ -16548,7 +16673,7 @@ class binary_writer
         std::size_t len = (value.at(key).empty() ? 0 : 1);
         for (const auto& el : value.at(key))
         {
-            len *= static_cast<std::size_t>(el.m_value.number_unsigned);
+            len *= static_cast<std::size_t>(el.m_data.m_value.number_unsigned);
         }
 
         key = "_ArrayData_";
@@ -16570,70 +16695,70 @@ class binary_writer
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::uint8_t>(el.m_value.number_unsigned), true);
+                write_number(static_cast<std::uint8_t>(el.m_data.m_value.number_unsigned), true);
             }
         }
         else if (dtype == 'i')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::int8_t>(el.m_value.number_integer), true);
+                write_number(static_cast<std::int8_t>(el.m_data.m_value.number_integer), true);
             }
         }
         else if (dtype == 'u')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::uint16_t>(el.m_value.number_unsigned), true);
+                write_number(static_cast<std::uint16_t>(el.m_data.m_value.number_unsigned), true);
             }
         }
         else if (dtype == 'I')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::int16_t>(el.m_value.number_integer), true);
+                write_number(static_cast<std::int16_t>(el.m_data.m_value.number_integer), true);
             }
         }
         else if (dtype == 'm')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::uint32_t>(el.m_value.number_unsigned), true);
+                write_number(static_cast<std::uint32_t>(el.m_data.m_value.number_unsigned), true);
             }
         }
         else if (dtype == 'l')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::int32_t>(el.m_value.number_integer), true);
+                write_number(static_cast<std::int32_t>(el.m_data.m_value.number_integer), true);
             }
         }
         else if (dtype == 'M')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::uint64_t>(el.m_value.number_unsigned), true);
+                write_number(static_cast<std::uint64_t>(el.m_data.m_value.number_unsigned), true);
             }
         }
         else if (dtype == 'L')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<std::int64_t>(el.m_value.number_integer), true);
+                write_number(static_cast<std::int64_t>(el.m_data.m_value.number_integer), true);
             }
         }
         else if (dtype == 'd')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<float>(el.m_value.number_float), true);
+                write_number(static_cast<float>(el.m_data.m_value.number_float), true);
             }
         }
         else if (dtype == 'D')
         {
             for (const auto& el : value.at(key))
             {
-                write_number(static_cast<double>(el.m_value.number_float), true);
+                write_number(static_cast<double>(el.m_data.m_value.number_float), true);
             }
         }
         return false;
@@ -16757,11 +16882,11 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/output/serializer.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
 // SPDX-FileCopyrightText: 2008-2009 Björn Hoehrmann <bjoern@hoehrmann.de>
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -16782,11 +16907,11 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/detail/conversions/to_chars.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
 // SPDX-FileCopyrightText: 2009 Florian Loitsch <https://florian.loitsch.com/>
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -17692,7 +17817,7 @@ void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value)
     // NB: If the neighbors are computed for single-precision numbers, there is a single float
     //     (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision
     //     value is off by 1 ulp.
-#if 0
+#if 0 // NOLINT(readability-avoid-unconditional-preprocessor-if)
     const boundaries w = compute_boundaries(static_cast<double>(value));
 #else
     const boundaries w = compute_boundaries(value);
@@ -17994,11 +18119,11 @@ class serializer
               const unsigned int indent_step,
               const unsigned int current_indent = 0)
     {
-        switch (val.m_type)
+        switch (val.m_data.m_type)
         {
             case value_t::object:
             {
-                if (val.m_value.object->empty())
+                if (val.m_data.m_value.object->empty())
                 {
                     o->write_characters("{}", 2);
                     return;
@@ -18016,8 +18141,8 @@ class serializer
                     }
 
                     // first n-1 elements
-                    auto i = val.m_value.object->cbegin();
-                    for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
+                    auto i = val.m_data.m_value.object->cbegin();
+                    for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i)
                     {
                         o->write_characters(indent_string.c_str(), new_indent);
                         o->write_character('\"');
@@ -18028,8 +18153,8 @@ class serializer
                     }
 
                     // last element
-                    JSON_ASSERT(i != val.m_value.object->cend());
-                    JSON_ASSERT(std::next(i) == val.m_value.object->cend());
+                    JSON_ASSERT(i != val.m_data.m_value.object->cend());
+                    JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend());
                     o->write_characters(indent_string.c_str(), new_indent);
                     o->write_character('\"');
                     dump_escaped(i->first, ensure_ascii);
@@ -18045,8 +18170,8 @@ class serializer
                     o->write_character('{');
 
                     // first n-1 elements
-                    auto i = val.m_value.object->cbegin();
-                    for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
+                    auto i = val.m_data.m_value.object->cbegin();
+                    for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i)
                     {
                         o->write_character('\"');
                         dump_escaped(i->first, ensure_ascii);
@@ -18056,8 +18181,8 @@ class serializer
                     }
 
                     // last element
-                    JSON_ASSERT(i != val.m_value.object->cend());
-                    JSON_ASSERT(std::next(i) == val.m_value.object->cend());
+                    JSON_ASSERT(i != val.m_data.m_value.object->cend());
+                    JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend());
                     o->write_character('\"');
                     dump_escaped(i->first, ensure_ascii);
                     o->write_characters("\":", 2);
@@ -18071,7 +18196,7 @@ class serializer
 
             case value_t::array:
             {
-                if (val.m_value.array->empty())
+                if (val.m_data.m_value.array->empty())
                 {
                     o->write_characters("[]", 2);
                     return;
@@ -18089,8 +18214,8 @@ class serializer
                     }
 
                     // first n-1 elements
-                    for (auto i = val.m_value.array->cbegin();
-                            i != val.m_value.array->cend() - 1; ++i)
+                    for (auto i = val.m_data.m_value.array->cbegin();
+                            i != val.m_data.m_value.array->cend() - 1; ++i)
                     {
                         o->write_characters(indent_string.c_str(), new_indent);
                         dump(*i, true, ensure_ascii, indent_step, new_indent);
@@ -18098,9 +18223,9 @@ class serializer
                     }
 
                     // last element
-                    JSON_ASSERT(!val.m_value.array->empty());
+                    JSON_ASSERT(!val.m_data.m_value.array->empty());
                     o->write_characters(indent_string.c_str(), new_indent);
-                    dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent);
+                    dump(val.m_data.m_value.array->back(), true, ensure_ascii, indent_step, new_indent);
 
                     o->write_character('\n');
                     o->write_characters(indent_string.c_str(), current_indent);
@@ -18111,16 +18236,16 @@ class serializer
                     o->write_character('[');
 
                     // first n-1 elements
-                    for (auto i = val.m_value.array->cbegin();
-                            i != val.m_value.array->cend() - 1; ++i)
+                    for (auto i = val.m_data.m_value.array->cbegin();
+                            i != val.m_data.m_value.array->cend() - 1; ++i)
                     {
                         dump(*i, false, ensure_ascii, indent_step, current_indent);
                         o->write_character(',');
                     }
 
                     // last element
-                    JSON_ASSERT(!val.m_value.array->empty());
-                    dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent);
+                    JSON_ASSERT(!val.m_data.m_value.array->empty());
+                    dump(val.m_data.m_value.array->back(), false, ensure_ascii, indent_step, current_indent);
 
                     o->write_character(']');
                 }
@@ -18131,7 +18256,7 @@ class serializer
             case value_t::string:
             {
                 o->write_character('\"');
-                dump_escaped(*val.m_value.string, ensure_ascii);
+                dump_escaped(*val.m_data.m_value.string, ensure_ascii);
                 o->write_character('\"');
                 return;
             }
@@ -18153,24 +18278,24 @@ class serializer
 
                     o->write_characters("\"bytes\": [", 10);
 
-                    if (!val.m_value.binary->empty())
+                    if (!val.m_data.m_value.binary->empty())
                     {
-                        for (auto i = val.m_value.binary->cbegin();
-                                i != val.m_value.binary->cend() - 1; ++i)
+                        for (auto i = val.m_data.m_value.binary->cbegin();
+                                i != val.m_data.m_value.binary->cend() - 1; ++i)
                         {
                             dump_integer(*i);
                             o->write_characters(", ", 2);
                         }
-                        dump_integer(val.m_value.binary->back());
+                        dump_integer(val.m_data.m_value.binary->back());
                     }
 
                     o->write_characters("],\n", 3);
                     o->write_characters(indent_string.c_str(), new_indent);
 
                     o->write_characters("\"subtype\": ", 11);
-                    if (val.m_value.binary->has_subtype())
+                    if (val.m_data.m_value.binary->has_subtype())
                     {
-                        dump_integer(val.m_value.binary->subtype());
+                        dump_integer(val.m_data.m_value.binary->subtype());
                     }
                     else
                     {
@@ -18184,21 +18309,21 @@ class serializer
                 {
                     o->write_characters("{\"bytes\":[", 10);
 
-                    if (!val.m_value.binary->empty())
+                    if (!val.m_data.m_value.binary->empty())
                     {
-                        for (auto i = val.m_value.binary->cbegin();
-                                i != val.m_value.binary->cend() - 1; ++i)
+                        for (auto i = val.m_data.m_value.binary->cbegin();
+                                i != val.m_data.m_value.binary->cend() - 1; ++i)
                         {
                             dump_integer(*i);
                             o->write_character(',');
                         }
-                        dump_integer(val.m_value.binary->back());
+                        dump_integer(val.m_data.m_value.binary->back());
                     }
 
                     o->write_characters("],\"subtype\":", 12);
-                    if (val.m_value.binary->has_subtype())
+                    if (val.m_data.m_value.binary->has_subtype())
                     {
-                        dump_integer(val.m_value.binary->subtype());
+                        dump_integer(val.m_data.m_value.binary->subtype());
                         o->write_character('}');
                     }
                     else
@@ -18211,7 +18336,7 @@ class serializer
 
             case value_t::boolean:
             {
-                if (val.m_value.boolean)
+                if (val.m_data.m_value.boolean)
                 {
                     o->write_characters("true", 4);
                 }
@@ -18224,19 +18349,19 @@ class serializer
 
             case value_t::number_integer:
             {
-                dump_integer(val.m_value.number_integer);
+                dump_integer(val.m_data.m_value.number_integer);
                 return;
             }
 
             case value_t::number_unsigned:
             {
-                dump_integer(val.m_value.number_unsigned);
+                dump_integer(val.m_data.m_value.number_unsigned);
                 return;
             }
 
             case value_t::number_float:
             {
-                dump_float(val.m_value.number_float);
+                dump_float(val.m_data.m_value.number_float);
                 return;
             }
 
@@ -18810,8 +18935,8 @@ class serializer
                 ? (byte & 0x3fu) | (codep << 6u)
                 : (0xFFu >> type) & (byte);
 
-        std::size_t index = 256u + static_cast<size_t>(state) * 16u + static_cast<size_t>(type);
-        JSON_ASSERT(index < 400);
+        const std::size_t index = 256u + static_cast<size_t>(state) * 16u + static_cast<size_t>(type);
+        JSON_ASSERT(index < utf8d.size());
         state = utf8d[index];
         return state;
     }
@@ -18878,10 +19003,10 @@ NLOHMANN_JSON_NAMESPACE_END
 // #include <nlohmann/ordered_map.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -18998,7 +19123,7 @@ template <class Key, class T, class IgnoredLess = std::less<Key>,
 
     template<class KeyType, detail::enable_if_t<
                  detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
-    T & at(KeyType && key)
+    T & at(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward)
     {
         for (auto it = this->begin(); it != this->end(); ++it)
         {
@@ -19026,7 +19151,7 @@ template <class Key, class T, class IgnoredLess = std::less<Key>,
 
     template<class KeyType, detail::enable_if_t<
                  detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
-    const T & at(KeyType && key) const
+    const T & at(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward)
     {
         for (auto it = this->begin(); it != this->end(); ++it)
         {
@@ -19060,7 +19185,7 @@ template <class Key, class T, class IgnoredLess = std::less<Key>,
 
     template<class KeyType, detail::enable_if_t<
                  detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
-    size_type erase(KeyType && key)
+    size_type erase(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward)
     {
         for (auto it = this->begin(); it != this->end(); ++it)
         {
@@ -19151,7 +19276,7 @@ template <class Key, class T, class IgnoredLess = std::less<Key>,
 
     template<class KeyType, detail::enable_if_t<
                  detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
-    size_type count(KeyType && key) const
+    size_type count(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward)
     {
         for (auto it = this->begin(); it != this->end(); ++it)
         {
@@ -19177,7 +19302,7 @@ template <class Key, class T, class IgnoredLess = std::less<Key>,
 
     template<class KeyType, detail::enable_if_t<
                  detail::is_usable_as_key_type<key_compare, key_type, KeyType>::value, int> = 0>
-    iterator find(KeyType && key)
+    iterator find(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward)
     {
         for (auto it = this->begin(); it != this->end(); ++it)
         {
@@ -19240,7 +19365,9 @@ NLOHMANN_JSON_NAMESPACE_END
 
 
 #if defined(JSON_HAS_CPP_17)
-    #include <any>
+    #if JSON_HAS_STATIC_RTTI
+        #include <any>
+    #endif
     #include <string_view>
 #endif
 
@@ -19271,6 +19398,7 @@ The invariants are checked by member function assert_invariant().
 */
 NLOHMANN_BASIC_JSON_TPL_DECLARATION
 class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions)
+    : public ::nlohmann::detail::json_base_class<CustomBaseClass>
 {
   private:
     template<detail::value_t> friend struct detail::external_constructor;
@@ -19297,6 +19425,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// workaround type for MSVC
     using basic_json_t = NLOHMANN_BASIC_JSON_TPL;
+    using json_base_class_t = ::nlohmann::detail::json_base_class<CustomBaseClass>;
 
   JSON_PRIVATE_UNLESS_TESTED:
     // convenience aliases for types residing in namespace detail;
@@ -19368,7 +19497,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     /////////////////////
     // container types //
     /////////////////////
@@ -19410,7 +19538,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     /// @brief returns the allocator associated with the container
     /// @sa https://json.nlohmann.me/api/basic_json/get_allocator/
     static allocator_type get_allocator()
@@ -19425,7 +19552,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     {
         basic_json result;
 
-        result["copyright"] = "(C) 2013-2022 Niels Lohmann";
+        result["copyright"] = "(C) 2013-2023 Niels Lohmann";
         result["name"] = "JSON for Modern C++";
         result["url"] = "https://github.com/nlohmann/json";
         result["version"]["string"] =
@@ -19473,7 +19600,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}};
 #endif
 
-
 #if defined(_MSVC_LANG)
         result["compiler"]["c++"] = std::to_string(_MSVC_LANG);
 #elif defined(__cplusplus)
@@ -19484,7 +19610,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         return result;
     }
 
-
     ///////////////////////////
     // JSON value data types //
     ///////////////////////////
@@ -19692,7 +19817,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                     object = nullptr;  // silence warning, see #821
                     if (JSON_HEDLEY_UNLIKELY(t == value_t::null))
                     {
-                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.2", nullptr)); // LCOV_EXCL_LINE
+                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.3", nullptr)); // LCOV_EXCL_LINE
                     }
                     break;
                 }
@@ -19731,6 +19856,16 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         void destroy(value_t t)
         {
+            if (
+                (t == value_t::object && object == nullptr) ||
+                (t == value_t::array && array == nullptr) ||
+                (t == value_t::string && string == nullptr) ||
+                (t == value_t::binary && binary == nullptr)
+            )
+            {
+                //not initialized (e.g. due to exception in the ctor)
+                return;
+            }
             if (t == value_t::array || t == value_t::object)
             {
                 // flatten the current json_value to a heap-allocated stack
@@ -19761,18 +19896,18 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                     // its children to the stack to be processed later
                     if (current_item.is_array())
                     {
-                        std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(), std::back_inserter(stack));
+                        std::move(current_item.m_data.m_value.array->begin(), current_item.m_data.m_value.array->end(), std::back_inserter(stack));
 
-                        current_item.m_value.array->clear();
+                        current_item.m_data.m_value.array->clear();
                     }
                     else if (current_item.is_object())
                     {
-                        for (auto&& it : *current_item.m_value.object)
+                        for (auto&& it : *current_item.m_data.m_value.object)
                         {
                             stack.push_back(std::move(it.second));
                         }
 
-                        current_item.m_value.object->clear();
+                        current_item.m_data.m_value.object->clear();
                     }
 
                     // it's now safe that current_item get destructed
@@ -19849,10 +19984,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     */
     void assert_invariant(bool check_parents = true) const noexcept
     {
-        JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr);
-        JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr);
-        JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr);
-        JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr);
+        JSON_ASSERT(m_data.m_type != value_t::object || m_data.m_value.object != nullptr);
+        JSON_ASSERT(m_data.m_type != value_t::array || m_data.m_value.array != nullptr);
+        JSON_ASSERT(m_data.m_type != value_t::string || m_data.m_value.string != nullptr);
+        JSON_ASSERT(m_data.m_type != value_t::binary || m_data.m_value.binary != nullptr);
 
 #if JSON_DIAGNOSTICS
         JSON_TRY
@@ -19871,11 +20006,11 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     void set_parents()
     {
 #if JSON_DIAGNOSTICS
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::array:
             {
-                for (auto& element : *m_value.array)
+                for (auto& element : *m_data.m_value.array)
                 {
                     element.m_parent = this;
                 }
@@ -19884,7 +20019,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
             case value_t::object:
             {
-                for (auto& element : *m_value.object)
+                for (auto& element : *m_data.m_value.object)
                 {
                     element.second.m_parent = this;
                 }
@@ -19925,7 +20060,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         {
             // see https://github.com/nlohmann/json/issues/2838
             JSON_ASSERT(type() == value_t::array);
-            if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity))
+            if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity))
             {
                 // capacity has changed: update all parents
                 set_parents();
@@ -19981,7 +20116,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @brief create an empty value with a given type
     /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
     basic_json(const value_t v)
-        : m_type(v), m_value(v)
+        : m_data(v)
     {
         assert_invariant();
     }
@@ -20055,12 +20190,12 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                 *this = nullptr;
                 break;
             case value_t::discarded:
-                m_type = value_t::discarded;
+                m_data.m_type = value_t::discarded;
                 break;
             default:            // LCOV_EXCL_LINE
                 JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
         }
-        JSON_ASSERT(m_type == val.type());
+        JSON_ASSERT(m_data.m_type == val.type());
         set_parents();
         assert_invariant();
     }
@@ -20076,7 +20211,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         bool is_an_object = std::all_of(init.begin(), init.end(),
                                         [](const detail::json_ref<basic_json>& element_ref)
         {
-            return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string();
+            // The cast is to ensure op[size_type] is called, bearing in mind size_type may not be int;
+            // (many string types can be constructed from 0 via its null-pointer guise, so we get a
+            // broken call to op[key_type], the wrong semantics and a 4804 warning on Windows)
+            return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[static_cast<size_type>(0)].is_string();
         });
 
         // adjust type if type deduction is not wanted
@@ -20098,22 +20236,22 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         if (is_an_object)
         {
             // the initializer list is a list of pairs -> create object
-            m_type = value_t::object;
-            m_value = value_t::object;
+            m_data.m_type = value_t::object;
+            m_data.m_value = value_t::object;
 
             for (auto& element_ref : init)
             {
                 auto element = element_ref.moved_or_copied();
-                m_value.object->emplace(
-                    std::move(*((*element.m_value.array)[0].m_value.string)),
-                    std::move((*element.m_value.array)[1]));
+                m_data.m_value.object->emplace(
+                    std::move(*((*element.m_data.m_value.array)[0].m_data.m_value.string)),
+                    std::move((*element.m_data.m_value.array)[1]));
             }
         }
         else
         {
             // the initializer list describes an array -> create array
-            m_type = value_t::array;
-            m_value.array = create<array_t>(init.begin(), init.end());
+            m_data.m_type = value_t::array;
+            m_data.m_value.array = create<array_t>(init.begin(), init.end());
         }
 
         set_parents();
@@ -20126,8 +20264,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     static basic_json binary(const typename binary_t::container_type& init)
     {
         auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = init;
+        res.m_data.m_type = value_t::binary;
+        res.m_data.m_value = init;
         return res;
     }
 
@@ -20137,8 +20275,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     static basic_json binary(const typename binary_t::container_type& init, typename binary_t::subtype_type subtype)
     {
         auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = binary_t(init, subtype);
+        res.m_data.m_type = value_t::binary;
+        res.m_data.m_value = binary_t(init, subtype);
         return res;
     }
 
@@ -20148,8 +20286,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     static basic_json binary(typename binary_t::container_type&& init)
     {
         auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = std::move(init);
+        res.m_data.m_type = value_t::binary;
+        res.m_data.m_value = std::move(init);
         return res;
     }
 
@@ -20159,8 +20297,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     static basic_json binary(typename binary_t::container_type&& init, typename binary_t::subtype_type subtype)
     {
         auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = binary_t(std::move(init), subtype);
+        res.m_data.m_type = value_t::binary;
+        res.m_data.m_value = binary_t(std::move(init), subtype);
         return res;
     }
 
@@ -20182,10 +20320,9 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @brief construct an array with count copies of given value
     /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
-    basic_json(size_type cnt, const basic_json& val)
-        : m_type(value_t::array)
+    basic_json(size_type cnt, const basic_json& val):
+        m_data{cnt, val}
     {
-        m_value.array = create<array_t>(cnt, val);
         set_parents();
         assert_invariant();
     }
@@ -20207,10 +20344,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         }
 
         // copy type from first iterator
-        m_type = first.m_object->m_type;
+        m_data.m_type = first.m_object->m_data.m_type;
 
         // check if iterator range is complete for primitive values
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::boolean:
             case value_t::number_float:
@@ -20235,55 +20372,55 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                 break;
         }
 
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::number_integer:
             {
-                m_value.number_integer = first.m_object->m_value.number_integer;
+                m_data.m_value.number_integer = first.m_object->m_data.m_value.number_integer;
                 break;
             }
 
             case value_t::number_unsigned:
             {
-                m_value.number_unsigned = first.m_object->m_value.number_unsigned;
+                m_data.m_value.number_unsigned = first.m_object->m_data.m_value.number_unsigned;
                 break;
             }
 
             case value_t::number_float:
             {
-                m_value.number_float = first.m_object->m_value.number_float;
+                m_data.m_value.number_float = first.m_object->m_data.m_value.number_float;
                 break;
             }
 
             case value_t::boolean:
             {
-                m_value.boolean = first.m_object->m_value.boolean;
+                m_data.m_value.boolean = first.m_object->m_data.m_value.boolean;
                 break;
             }
 
             case value_t::string:
             {
-                m_value = *first.m_object->m_value.string;
+                m_data.m_value = *first.m_object->m_data.m_value.string;
                 break;
             }
 
             case value_t::object:
             {
-                m_value.object = create<object_t>(first.m_it.object_iterator,
-                                                  last.m_it.object_iterator);
+                m_data.m_value.object = create<object_t>(first.m_it.object_iterator,
+                                        last.m_it.object_iterator);
                 break;
             }
 
             case value_t::array:
             {
-                m_value.array = create<array_t>(first.m_it.array_iterator,
-                                                last.m_it.array_iterator);
+                m_data.m_value.array = create<array_t>(first.m_it.array_iterator,
+                                                       last.m_it.array_iterator);
                 break;
             }
 
             case value_t::binary:
             {
-                m_value = *first.m_object->m_value.binary;
+                m_data.m_value = *first.m_object->m_data.m_value.binary;
                 break;
             }
 
@@ -20297,7 +20434,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         assert_invariant();
     }
 
-
     ///////////////////////////////////////
     // other constructors and destructor //
     ///////////////////////////////////////
@@ -20310,58 +20446,59 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @brief copy constructor
     /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
     basic_json(const basic_json& other)
-        : m_type(other.m_type)
+        : json_base_class_t(other)
     {
+        m_data.m_type = other.m_data.m_type;
         // check of passed value is valid
         other.assert_invariant();
 
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::object:
             {
-                m_value = *other.m_value.object;
+                m_data.m_value = *other.m_data.m_value.object;
                 break;
             }
 
             case value_t::array:
             {
-                m_value = *other.m_value.array;
+                m_data.m_value = *other.m_data.m_value.array;
                 break;
             }
 
             case value_t::string:
             {
-                m_value = *other.m_value.string;
+                m_data.m_value = *other.m_data.m_value.string;
                 break;
             }
 
             case value_t::boolean:
             {
-                m_value = other.m_value.boolean;
+                m_data.m_value = other.m_data.m_value.boolean;
                 break;
             }
 
             case value_t::number_integer:
             {
-                m_value = other.m_value.number_integer;
+                m_data.m_value = other.m_data.m_value.number_integer;
                 break;
             }
 
             case value_t::number_unsigned:
             {
-                m_value = other.m_value.number_unsigned;
+                m_data.m_value = other.m_data.m_value.number_unsigned;
                 break;
             }
 
             case value_t::number_float:
             {
-                m_value = other.m_value.number_float;
+                m_data.m_value = other.m_data.m_value.number_float;
                 break;
             }
 
             case value_t::binary:
             {
-                m_value = *other.m_value.binary;
+                m_data.m_value = *other.m_data.m_value.binary;
                 break;
             }
 
@@ -20378,15 +20515,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @brief move constructor
     /// @sa https://json.nlohmann.me/api/basic_json/basic_json/
     basic_json(basic_json&& other) noexcept
-        : m_type(std::move(other.m_type)),
-          m_value(std::move(other.m_value))
+        : json_base_class_t(std::forward<json_base_class_t>(other)),
+          m_data(std::move(other.m_data))
     {
         // check that passed value is valid
         other.assert_invariant(false);
 
         // invalidate payload
-        other.m_type = value_t::null;
-        other.m_value = {};
+        other.m_data.m_type = value_t::null;
+        other.m_data.m_value = {};
 
         set_parents();
         assert_invariant();
@@ -20398,15 +20535,17 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         std::is_nothrow_move_constructible<value_t>::value&&
         std::is_nothrow_move_assignable<value_t>::value&&
         std::is_nothrow_move_constructible<json_value>::value&&
-        std::is_nothrow_move_assignable<json_value>::value
+        std::is_nothrow_move_assignable<json_value>::value&&
+        std::is_nothrow_move_assignable<json_base_class_t>::value
     )
     {
         // check that passed value is valid
         other.assert_invariant();
 
         using std::swap;
-        swap(m_type, other.m_type);
-        swap(m_value, other.m_value);
+        swap(m_data.m_type, other.m_data.m_type);
+        swap(m_data.m_value, other.m_data.m_value);
+        json_base_class_t::operator=(std::move(other));
 
         set_parents();
         assert_invariant();
@@ -20418,7 +20557,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     ~basic_json() noexcept
     {
         assert_invariant(false);
-        m_value.destroy(m_type);
     }
 
     /// @}
@@ -20458,7 +20596,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/type/
     constexpr value_t type() const noexcept
     {
-        return m_type;
+        return m_data.m_type;
     }
 
     /// @brief return whether type is primitive
@@ -20479,14 +20617,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/is_null/
     constexpr bool is_null() const noexcept
     {
-        return m_type == value_t::null;
+        return m_data.m_type == value_t::null;
     }
 
     /// @brief return whether value is a boolean
     /// @sa https://json.nlohmann.me/api/basic_json/is_boolean/
     constexpr bool is_boolean() const noexcept
     {
-        return m_type == value_t::boolean;
+        return m_data.m_type == value_t::boolean;
     }
 
     /// @brief return whether value is a number
@@ -20500,63 +20638,63 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/is_number_integer/
     constexpr bool is_number_integer() const noexcept
     {
-        return m_type == value_t::number_integer || m_type == value_t::number_unsigned;
+        return m_data.m_type == value_t::number_integer || m_data.m_type == value_t::number_unsigned;
     }
 
     /// @brief return whether value is an unsigned integer number
     /// @sa https://json.nlohmann.me/api/basic_json/is_number_unsigned/
     constexpr bool is_number_unsigned() const noexcept
     {
-        return m_type == value_t::number_unsigned;
+        return m_data.m_type == value_t::number_unsigned;
     }
 
     /// @brief return whether value is a floating-point number
     /// @sa https://json.nlohmann.me/api/basic_json/is_number_float/
     constexpr bool is_number_float() const noexcept
     {
-        return m_type == value_t::number_float;
+        return m_data.m_type == value_t::number_float;
     }
 
     /// @brief return whether value is an object
     /// @sa https://json.nlohmann.me/api/basic_json/is_object/
     constexpr bool is_object() const noexcept
     {
-        return m_type == value_t::object;
+        return m_data.m_type == value_t::object;
     }
 
     /// @brief return whether value is an array
     /// @sa https://json.nlohmann.me/api/basic_json/is_array/
     constexpr bool is_array() const noexcept
     {
-        return m_type == value_t::array;
+        return m_data.m_type == value_t::array;
     }
 
     /// @brief return whether value is a string
     /// @sa https://json.nlohmann.me/api/basic_json/is_string/
     constexpr bool is_string() const noexcept
     {
-        return m_type == value_t::string;
+        return m_data.m_type == value_t::string;
     }
 
     /// @brief return whether value is a binary array
     /// @sa https://json.nlohmann.me/api/basic_json/is_binary/
     constexpr bool is_binary() const noexcept
     {
-        return m_type == value_t::binary;
+        return m_data.m_type == value_t::binary;
     }
 
     /// @brief return whether value is discarded
     /// @sa https://json.nlohmann.me/api/basic_json/is_discarded/
     constexpr bool is_discarded() const noexcept
     {
-        return m_type == value_t::discarded;
+        return m_data.m_type == value_t::discarded;
     }
 
     /// @brief return the type of the JSON value (implicit)
     /// @sa https://json.nlohmann.me/api/basic_json/operator_value_t/
     constexpr operator value_t() const noexcept
     {
-        return m_type;
+        return m_data.m_type;
     }
 
     /// @}
@@ -20571,7 +20709,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     {
         if (JSON_HEDLEY_LIKELY(is_boolean()))
         {
-            return m_value.boolean;
+            return m_data.m_value.boolean;
         }
 
         JSON_THROW(type_error::create(302, detail::concat("type must be boolean, but is ", type_name()), this));
@@ -20580,97 +20718,97 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// get a pointer to the value (object)
     object_t* get_impl_ptr(object_t* /*unused*/) noexcept
     {
-        return is_object() ? m_value.object : nullptr;
+        return is_object() ? m_data.m_value.object : nullptr;
     }
 
     /// get a pointer to the value (object)
     constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept
     {
-        return is_object() ? m_value.object : nullptr;
+        return is_object() ? m_data.m_value.object : nullptr;
     }
 
     /// get a pointer to the value (array)
     array_t* get_impl_ptr(array_t* /*unused*/) noexcept
     {
-        return is_array() ? m_value.array : nullptr;
+        return is_array() ? m_data.m_value.array : nullptr;
     }
 
     /// get a pointer to the value (array)
     constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept
     {
-        return is_array() ? m_value.array : nullptr;
+        return is_array() ? m_data.m_value.array : nullptr;
     }
 
     /// get a pointer to the value (string)
     string_t* get_impl_ptr(string_t* /*unused*/) noexcept
     {
-        return is_string() ? m_value.string : nullptr;
+        return is_string() ? m_data.m_value.string : nullptr;
     }
 
     /// get a pointer to the value (string)
     constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept
     {
-        return is_string() ? m_value.string : nullptr;
+        return is_string() ? m_data.m_value.string : nullptr;
     }
 
     /// get a pointer to the value (boolean)
     boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept
     {
-        return is_boolean() ? &m_value.boolean : nullptr;
+        return is_boolean() ? &m_data.m_value.boolean : nullptr;
     }
 
     /// get a pointer to the value (boolean)
     constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept
     {
-        return is_boolean() ? &m_value.boolean : nullptr;
+        return is_boolean() ? &m_data.m_value.boolean : nullptr;
     }
 
     /// get a pointer to the value (integer number)
     number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept
     {
-        return is_number_integer() ? &m_value.number_integer : nullptr;
+        return is_number_integer() ? &m_data.m_value.number_integer : nullptr;
     }
 
     /// get a pointer to the value (integer number)
     constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept
     {
-        return is_number_integer() ? &m_value.number_integer : nullptr;
+        return is_number_integer() ? &m_data.m_value.number_integer : nullptr;
     }
 
     /// get a pointer to the value (unsigned number)
     number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept
     {
-        return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
+        return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr;
     }
 
     /// get a pointer to the value (unsigned number)
     constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept
     {
-        return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
+        return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr;
     }
 
     /// get a pointer to the value (floating-point number)
     number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept
     {
-        return is_number_float() ? &m_value.number_float : nullptr;
+        return is_number_float() ? &m_data.m_value.number_float : nullptr;
     }
 
     /// get a pointer to the value (floating-point number)
     constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept
     {
-        return is_number_float() ? &m_value.number_float : nullptr;
+        return is_number_float() ? &m_data.m_value.number_float : nullptr;
     }
 
     /// get a pointer to the value (binary)
     binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept
     {
-        return is_binary() ? m_value.binary : nullptr;
+        return is_binary() ? m_data.m_value.binary : nullptr;
     }
 
     /// get a pointer to the value (binary)
     constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept
     {
-        return is_binary() ? m_value.binary : nullptr;
+        return is_binary() ? m_data.m_value.binary : nullptr;
     }
 
     /*!
@@ -21053,7 +21191,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 #if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914))
                                                 detail::negation<std::is_same<ValueType, std::string_view>>,
 #endif
-#if defined(JSON_HAS_CPP_17)
+#if defined(JSON_HAS_CPP_17) && JSON_HAS_STATIC_RTTI
                                                 detail::negation<std::is_same<ValueType, std::any>>,
 #endif
                                                 detail::is_detected_lazy<detail::get_template_function, const basic_json_t&, ValueType>
@@ -21090,7 +21228,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     ////////////////////
     // element access //
     ////////////////////
@@ -21108,7 +21245,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         {
             JSON_TRY
             {
-                return set_parent(m_value.array->at(idx));
+                return set_parent(m_data.m_value.array->at(idx));
             }
             JSON_CATCH (std::out_of_range&)
             {
@@ -21131,7 +21268,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         {
             JSON_TRY
             {
-                return m_value.array->at(idx);
+                return m_data.m_value.array->at(idx);
             }
             JSON_CATCH (std::out_of_range&)
             {
@@ -21155,8 +21292,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
         }
 
-        auto it = m_value.object->find(key);
-        if (it == m_value.object->end())
+        auto it = m_data.m_value.object->find(key);
+        if (it == m_data.m_value.object->end())
         {
             JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this));
         }
@@ -21175,8 +21312,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
         }
 
-        auto it = m_value.object->find(std::forward<KeyType>(key));
-        if (it == m_value.object->end())
+        auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
+        if (it == m_data.m_value.object->end())
         {
             JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward<KeyType>(key)), "' not found"), this));
         }
@@ -21193,8 +21330,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
         }
 
-        auto it = m_value.object->find(key);
-        if (it == m_value.object->end())
+        auto it = m_data.m_value.object->find(key);
+        if (it == m_data.m_value.object->end())
         {
             JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this));
         }
@@ -21213,8 +21350,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this));
         }
 
-        auto it = m_value.object->find(std::forward<KeyType>(key));
-        if (it == m_value.object->end())
+        auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
+        if (it == m_data.m_value.object->end())
         {
             JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward<KeyType>(key)), "' not found"), this));
         }
@@ -21228,8 +21365,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // implicitly convert null value to an empty array
         if (is_null())
         {
-            m_type = value_t::array;
-            m_value.array = create<array_t>();
+            m_data.m_type = value_t::array;
+            m_data.m_value.array = create<array_t>();
             assert_invariant();
         }
 
@@ -21237,17 +21374,17 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         if (JSON_HEDLEY_LIKELY(is_array()))
         {
             // fill up array with null values if given idx is outside range
-            if (idx >= m_value.array->size())
+            if (idx >= m_data.m_value.array->size())
             {
 #if JSON_DIAGNOSTICS
                 // remember array size & capacity before resizing
-                const auto old_size = m_value.array->size();
-                const auto old_capacity = m_value.array->capacity();
+                const auto old_size = m_data.m_value.array->size();
+                const auto old_capacity = m_data.m_value.array->capacity();
 #endif
-                m_value.array->resize(idx + 1);
+                m_data.m_value.array->resize(idx + 1);
 
 #if JSON_DIAGNOSTICS
-                if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity))
+                if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity))
                 {
                     // capacity has changed: update all parents
                     set_parents();
@@ -21261,7 +21398,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                 assert_invariant();
             }
 
-            return m_value.array->operator[](idx);
+            return m_data.m_value.array->operator[](idx);
         }
 
         JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this));
@@ -21274,7 +21411,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // const operator[] only works for arrays
         if (JSON_HEDLEY_LIKELY(is_array()))
         {
-            return m_value.array->operator[](idx);
+            return m_data.m_value.array->operator[](idx);
         }
 
         JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this));
@@ -21287,15 +21424,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // implicitly convert null value to an empty object
         if (is_null())
         {
-            m_type = value_t::object;
-            m_value.object = create<object_t>();
+            m_data.m_type = value_t::object;
+            m_data.m_value.object = create<object_t>();
             assert_invariant();
         }
 
         // operator[] only works for objects
         if (JSON_HEDLEY_LIKELY(is_object()))
         {
-            auto result = m_value.object->emplace(std::move(key), nullptr);
+            auto result = m_data.m_value.object->emplace(std::move(key), nullptr);
             return set_parent(result.first->second);
         }
 
@@ -21309,8 +21446,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // const operator[] only works for objects
         if (JSON_HEDLEY_LIKELY(is_object()))
         {
-            auto it = m_value.object->find(key);
-            JSON_ASSERT(it != m_value.object->end());
+            auto it = m_data.m_value.object->find(key);
+            JSON_ASSERT(it != m_data.m_value.object->end());
             return it->second;
         }
 
@@ -21340,15 +21477,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // implicitly convert null value to an empty object
         if (is_null())
         {
-            m_type = value_t::object;
-            m_value.object = create<object_t>();
+            m_data.m_type = value_t::object;
+            m_data.m_value.object = create<object_t>();
             assert_invariant();
         }
 
         // operator[] only works for objects
         if (JSON_HEDLEY_LIKELY(is_object()))
         {
-            auto result = m_value.object->emplace(std::forward<KeyType>(key), nullptr);
+            auto result = m_data.m_value.object->emplace(std::forward<KeyType>(key), nullptr);
             return set_parent(result.first->second);
         }
 
@@ -21364,8 +21501,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // const operator[] only works for objects
         if (JSON_HEDLEY_LIKELY(is_object()))
         {
-            auto it = m_value.object->find(std::forward<KeyType>(key));
-            JSON_ASSERT(it != m_value.object->end());
+            auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
+            JSON_ASSERT(it != m_data.m_value.object->end());
             return it->second;
         }
 
@@ -21602,7 +21739,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         IteratorType result = end();
 
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::boolean:
             case value_t::number_float:
@@ -21619,32 +21756,32 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                 if (is_string())
                 {
                     AllocatorType<string_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
-                    m_value.string = nullptr;
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.string);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.string, 1);
+                    m_data.m_value.string = nullptr;
                 }
                 else if (is_binary())
                 {
                     AllocatorType<binary_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
-                    m_value.binary = nullptr;
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.binary);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.binary, 1);
+                    m_data.m_value.binary = nullptr;
                 }
 
-                m_type = value_t::null;
+                m_data.m_type = value_t::null;
                 assert_invariant();
                 break;
             }
 
             case value_t::object:
             {
-                result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
+                result.m_it.object_iterator = m_data.m_value.object->erase(pos.m_it.object_iterator);
                 break;
             }
 
             case value_t::array:
             {
-                result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
+                result.m_it.array_iterator = m_data.m_value.array->erase(pos.m_it.array_iterator);
                 break;
             }
 
@@ -21672,7 +21809,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         IteratorType result = end();
 
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::boolean:
             case value_t::number_float:
@@ -21690,33 +21827,33 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                 if (is_string())
                 {
                     AllocatorType<string_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
-                    m_value.string = nullptr;
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.string);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.string, 1);
+                    m_data.m_value.string = nullptr;
                 }
                 else if (is_binary())
                 {
                     AllocatorType<binary_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
-                    m_value.binary = nullptr;
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_data.m_value.binary);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_data.m_value.binary, 1);
+                    m_data.m_value.binary = nullptr;
                 }
 
-                m_type = value_t::null;
+                m_data.m_type = value_t::null;
                 assert_invariant();
                 break;
             }
 
             case value_t::object:
             {
-                result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
+                result.m_it.object_iterator = m_data.m_value.object->erase(first.m_it.object_iterator,
                                               last.m_it.object_iterator);
                 break;
             }
 
             case value_t::array:
             {
-                result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
+                result.m_it.array_iterator = m_data.m_value.array->erase(first.m_it.array_iterator,
                                              last.m_it.array_iterator);
                 break;
             }
@@ -21741,7 +21878,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
         }
 
-        return m_value.object->erase(std::forward<KeyType>(key));
+        return m_data.m_value.object->erase(std::forward<KeyType>(key));
     }
 
     template < typename KeyType, detail::enable_if_t <
@@ -21754,10 +21891,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this));
         }
 
-        const auto it = m_value.object->find(std::forward<KeyType>(key));
-        if (it != m_value.object->end())
+        const auto it = m_data.m_value.object->find(std::forward<KeyType>(key));
+        if (it != m_data.m_value.object->end())
         {
-            m_value.object->erase(it);
+            m_data.m_value.object->erase(it);
             return 1;
         }
         return 0;
@@ -21795,7 +21932,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                 JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), this));
             }
 
-            m_value.array->erase(m_value.array->begin() + static_cast<difference_type>(idx));
+            m_data.m_value.array->erase(m_data.m_value.array->begin() + static_cast<difference_type>(idx));
         }
         else
         {
@@ -21805,7 +21942,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     ////////////
     // lookup //
     ////////////
@@ -21821,7 +21957,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         if (is_object())
         {
-            result.m_it.object_iterator = m_value.object->find(key);
+            result.m_it.object_iterator = m_data.m_value.object->find(key);
         }
 
         return result;
@@ -21835,7 +21971,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         if (is_object())
         {
-            result.m_it.object_iterator = m_value.object->find(key);
+            result.m_it.object_iterator = m_data.m_value.object->find(key);
         }
 
         return result;
@@ -21851,7 +21987,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         if (is_object())
         {
-            result.m_it.object_iterator = m_value.object->find(std::forward<KeyType>(key));
+            result.m_it.object_iterator = m_data.m_value.object->find(std::forward<KeyType>(key));
         }
 
         return result;
@@ -21867,7 +22003,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
         if (is_object())
         {
-            result.m_it.object_iterator = m_value.object->find(std::forward<KeyType>(key));
+            result.m_it.object_iterator = m_data.m_value.object->find(std::forward<KeyType>(key));
         }
 
         return result;
@@ -21878,7 +22014,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     size_type count(const typename object_t::key_type& key) const
     {
         // return 0 for all nonobject types
-        return is_object() ? m_value.object->count(key) : 0;
+        return is_object() ? m_data.m_value.object->count(key) : 0;
     }
 
     /// @brief returns the number of occurrences of a key in a JSON object
@@ -21888,14 +22024,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     size_type count(KeyType && key) const
     {
         // return 0 for all nonobject types
-        return is_object() ? m_value.object->count(std::forward<KeyType>(key)) : 0;
+        return is_object() ? m_data.m_value.object->count(std::forward<KeyType>(key)) : 0;
     }
 
     /// @brief check the existence of an element in a JSON object
     /// @sa https://json.nlohmann.me/api/basic_json/contains/
     bool contains(const typename object_t::key_type& key) const
     {
-        return is_object() && m_value.object->find(key) != m_value.object->end();
+        return is_object() && m_data.m_value.object->find(key) != m_data.m_value.object->end();
     }
 
     /// @brief check the existence of an element in a JSON object
@@ -21904,7 +22040,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                  detail::is_usable_as_basic_json_key_type<basic_json_t, KeyType>::value, int> = 0>
     bool contains(KeyType && key) const
     {
-        return is_object() && m_value.object->find(std::forward<KeyType>(key)) != m_value.object->end();
+        return is_object() && m_data.m_value.object->find(std::forward<KeyType>(key)) != m_data.m_value.object->end();
     }
 
     /// @brief check the existence of an element in a JSON object given a JSON pointer
@@ -21923,7 +22059,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     ///////////////
     // iterators //
     ///////////////
@@ -22062,7 +22197,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     //////////////
     // capacity //
     //////////////
@@ -22074,7 +22208,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/empty/
     bool empty() const noexcept
     {
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::null:
             {
@@ -22085,13 +22219,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             case value_t::array:
             {
                 // delegate call to array_t::empty()
-                return m_value.array->empty();
+                return m_data.m_value.array->empty();
             }
 
             case value_t::object:
             {
                 // delegate call to object_t::empty()
-                return m_value.object->empty();
+                return m_data.m_value.object->empty();
             }
 
             case value_t::string:
@@ -22113,7 +22247,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/size/
     size_type size() const noexcept
     {
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::null:
             {
@@ -22124,13 +22258,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             case value_t::array:
             {
                 // delegate call to array_t::size()
-                return m_value.array->size();
+                return m_data.m_value.array->size();
             }
 
             case value_t::object:
             {
                 // delegate call to object_t::size()
-                return m_value.object->size();
+                return m_data.m_value.object->size();
             }
 
             case value_t::string:
@@ -22152,18 +22286,18 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/max_size/
     size_type max_size() const noexcept
     {
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::array:
             {
                 // delegate call to array_t::max_size()
-                return m_value.array->max_size();
+                return m_data.m_value.array->max_size();
             }
 
             case value_t::object:
             {
                 // delegate call to object_t::max_size()
-                return m_value.object->max_size();
+                return m_data.m_value.object->max_size();
             }
 
             case value_t::null:
@@ -22184,7 +22318,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @}
 
-
     ///////////////
     // modifiers //
     ///////////////
@@ -22196,53 +22329,53 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     /// @sa https://json.nlohmann.me/api/basic_json/clear/
     void clear() noexcept
     {
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::number_integer:
             {
-                m_value.number_integer = 0;
+                m_data.m_value.number_integer = 0;
                 break;
             }
 
             case value_t::number_unsigned:
             {
-                m_value.number_unsigned = 0;
+                m_data.m_value.number_unsigned = 0;
                 break;
             }
 
             case value_t::number_float:
             {
-                m_value.number_float = 0.0;
+                m_data.m_value.number_float = 0.0;
                 break;
             }
 
             case value_t::boolean:
             {
-                m_value.boolean = false;
+                m_data.m_value.boolean = false;
                 break;
             }
 
             case value_t::string:
             {
-                m_value.string->clear();
+                m_data.m_value.string->clear();
                 break;
             }
 
             case value_t::binary:
             {
-                m_value.binary->clear();
+                m_data.m_value.binary->clear();
                 break;
             }
 
             case value_t::array:
             {
-                m_value.array->clear();
+                m_data.m_value.array->clear();
                 break;
             }
 
             case value_t::object:
             {
-                m_value.object->clear();
+                m_data.m_value.object->clear();
                 break;
             }
 
@@ -22266,15 +22399,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // transform null object into an array
         if (is_null())
         {
-            m_type = value_t::array;
-            m_value = value_t::array;
+            m_data.m_type = value_t::array;
+            m_data.m_value = value_t::array;
             assert_invariant();
         }
 
         // add element to array (move semantics)
-        const auto old_capacity = m_value.array->capacity();
-        m_value.array->push_back(std::move(val));
-        set_parent(m_value.array->back(), old_capacity);
+        const auto old_capacity = m_data.m_value.array->capacity();
+        m_data.m_value.array->push_back(std::move(val));
+        set_parent(m_data.m_value.array->back(), old_capacity);
         // if val is moved from, basic_json move constructor marks it null, so we do not call the destructor
     }
 
@@ -22299,15 +22432,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // transform null object into an array
         if (is_null())
         {
-            m_type = value_t::array;
-            m_value = value_t::array;
+            m_data.m_type = value_t::array;
+            m_data.m_value = value_t::array;
             assert_invariant();
         }
 
         // add element to array
-        const auto old_capacity = m_value.array->capacity();
-        m_value.array->push_back(val);
-        set_parent(m_value.array->back(), old_capacity);
+        const auto old_capacity = m_data.m_value.array->capacity();
+        m_data.m_value.array->push_back(val);
+        set_parent(m_data.m_value.array->back(), old_capacity);
     }
 
     /// @brief add an object to an array
@@ -22331,13 +22464,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // transform null object into an object
         if (is_null())
         {
-            m_type = value_t::object;
-            m_value = value_t::object;
+            m_data.m_type = value_t::object;
+            m_data.m_value = value_t::object;
             assert_invariant();
         }
 
         // add element to object
-        auto res = m_value.object->insert(val);
+        auto res = m_data.m_value.object->insert(val);
         set_parent(res.first->second);
     }
 
@@ -22387,15 +22520,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // transform null object into an array
         if (is_null())
         {
-            m_type = value_t::array;
-            m_value = value_t::array;
+            m_data.m_type = value_t::array;
+            m_data.m_value = value_t::array;
             assert_invariant();
         }
 
         // add element to array (perfect forwarding)
-        const auto old_capacity = m_value.array->capacity();
-        m_value.array->emplace_back(std::forward<Args>(args)...);
-        return set_parent(m_value.array->back(), old_capacity);
+        const auto old_capacity = m_data.m_value.array->capacity();
+        m_data.m_value.array->emplace_back(std::forward<Args>(args)...);
+        return set_parent(m_data.m_value.array->back(), old_capacity);
     }
 
     /// @brief add an object to an object if key does not exist
@@ -22412,13 +22545,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // transform null object into an object
         if (is_null())
         {
-            m_type = value_t::object;
-            m_value = value_t::object;
+            m_data.m_type = value_t::object;
+            m_data.m_value = value_t::object;
             assert_invariant();
         }
 
         // add element to array (perfect forwarding)
-        auto res = m_value.object->emplace(std::forward<Args>(args)...);
+        auto res = m_data.m_value.object->emplace(std::forward<Args>(args)...);
         set_parent(res.first->second);
 
         // create result iterator and set iterator to the result of emplace
@@ -22436,14 +22569,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     iterator insert_iterator(const_iterator pos, Args&& ... args)
     {
         iterator result(this);
-        JSON_ASSERT(m_value.array != nullptr);
+        JSON_ASSERT(m_data.m_value.array != nullptr);
 
-        auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator);
-        m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
-        result.m_it.array_iterator = m_value.array->begin() + insert_pos;
+        auto insert_pos = std::distance(m_data.m_value.array->begin(), pos.m_it.array_iterator);
+        m_data.m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
+        result.m_it.array_iterator = m_data.m_value.array->begin() + insert_pos;
 
         // This could have been written as:
-        // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val);
+        // result.m_it.array_iterator = m_data.m_value.array->insert(pos.m_it.array_iterator, cnt, val);
         // but the return value of insert is missing in GCC 4.8, so it is written this way instead.
 
         set_parents();
@@ -22570,7 +22703,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", this));
         }
 
-        m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);
+        m_data.m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);
     }
 
     /// @brief updates a JSON object from another object, overwriting existing keys
@@ -22587,8 +22720,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         // implicitly convert null value to an empty object
         if (is_null())
         {
-            m_type = value_t::object;
-            m_value.object = create<object_t>();
+            m_data.m_type = value_t::object;
+            m_data.m_value.object = create<object_t>();
             assert_invariant();
         }
 
@@ -22613,16 +22746,16 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         {
             if (merge_objects && it.value().is_object())
             {
-                auto it2 = m_value.object->find(it.key());
-                if (it2 != m_value.object->end())
+                auto it2 = m_data.m_value.object->find(it.key());
+                if (it2 != m_data.m_value.object->end())
                 {
                     it2->second.update(it.value(), true);
                     continue;
                 }
             }
-            m_value.object->operator[](it.key()) = it.value();
+            m_data.m_value.object->operator[](it.key()) = it.value();
 #if JSON_DIAGNOSTICS
-            m_value.object->operator[](it.key()).m_parent = this;
+            m_data.m_value.object->operator[](it.key()).m_parent = this;
 #endif
         }
     }
@@ -22632,12 +22765,12 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     void swap(reference other) noexcept (
         std::is_nothrow_move_constructible<value_t>::value&&
         std::is_nothrow_move_assignable<value_t>::value&&
-        std::is_nothrow_move_constructible<json_value>::value&&
+        std::is_nothrow_move_constructible<json_value>::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
         std::is_nothrow_move_assignable<json_value>::value
     )
     {
-        std::swap(m_type, other.m_type);
-        std::swap(m_value, other.m_value);
+        std::swap(m_data.m_type, other.m_data.m_type);
+        std::swap(m_data.m_value, other.m_data.m_value);
 
         set_parents();
         other.set_parents();
@@ -22649,7 +22782,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     friend void swap(reference left, reference right) noexcept (
         std::is_nothrow_move_constructible<value_t>::value&&
         std::is_nothrow_move_assignable<value_t>::value&&
-        std::is_nothrow_move_constructible<json_value>::value&&
+        std::is_nothrow_move_constructible<json_value>::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
         std::is_nothrow_move_assignable<json_value>::value
     )
     {
@@ -22658,13 +22791,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @brief exchanges the values
     /// @sa https://json.nlohmann.me/api/basic_json/swap/
-    void swap(array_t& other) // NOLINT(bugprone-exception-escape)
+    void swap(array_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
     {
         // swap only works for arrays
         if (JSON_HEDLEY_LIKELY(is_array()))
         {
             using std::swap;
-            swap(*(m_value.array), other);
+            swap(*(m_data.m_value.array), other);
         }
         else
         {
@@ -22674,13 +22807,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @brief exchanges the values
     /// @sa https://json.nlohmann.me/api/basic_json/swap/
-    void swap(object_t& other) // NOLINT(bugprone-exception-escape)
+    void swap(object_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
     {
         // swap only works for objects
         if (JSON_HEDLEY_LIKELY(is_object()))
         {
             using std::swap;
-            swap(*(m_value.object), other);
+            swap(*(m_data.m_value.object), other);
         }
         else
         {
@@ -22690,13 +22823,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @brief exchanges the values
     /// @sa https://json.nlohmann.me/api/basic_json/swap/
-    void swap(string_t& other) // NOLINT(bugprone-exception-escape)
+    void swap(string_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
     {
         // swap only works for strings
         if (JSON_HEDLEY_LIKELY(is_string()))
         {
             using std::swap;
-            swap(*(m_value.string), other);
+            swap(*(m_data.m_value.string), other);
         }
         else
         {
@@ -22706,13 +22839,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 
     /// @brief exchanges the values
     /// @sa https://json.nlohmann.me/api/basic_json/swap/
-    void swap(binary_t& other) // NOLINT(bugprone-exception-escape)
+    void swap(binary_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
     {
         // swap only works for strings
         if (JSON_HEDLEY_LIKELY(is_binary()))
         {
             using std::swap;
-            swap(*(m_value.binary), other);
+            swap(*(m_data.m_value.binary), other);
         }
         else
         {
@@ -22728,7 +22861,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         if (JSON_HEDLEY_LIKELY(is_binary()))
         {
             using std::swap;
-            swap(*(m_value.binary), other);
+            swap(*(m_data.m_value.binary), other);
         }
         else
         {
@@ -22756,31 +22889,31 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         switch (lhs_type)                                                                                \
         {                                                                                                \
             case value_t::array:                                                                         \
-                return (*lhs.m_value.array) op (*rhs.m_value.array);                                     \
+                return (*lhs.m_data.m_value.array) op (*rhs.m_data.m_value.array);                                     \
                 \
             case value_t::object:                                                                        \
-                return (*lhs.m_value.object) op (*rhs.m_value.object);                                   \
+                return (*lhs.m_data.m_value.object) op (*rhs.m_data.m_value.object);                                   \
                 \
             case value_t::null:                                                                          \
                 return (null_result);                                                                    \
                 \
             case value_t::string:                                                                        \
-                return (*lhs.m_value.string) op (*rhs.m_value.string);                                   \
+                return (*lhs.m_data.m_value.string) op (*rhs.m_data.m_value.string);                                   \
                 \
             case value_t::boolean:                                                                       \
-                return (lhs.m_value.boolean) op (rhs.m_value.boolean);                                   \
+                return (lhs.m_data.m_value.boolean) op (rhs.m_data.m_value.boolean);                                   \
                 \
             case value_t::number_integer:                                                                \
-                return (lhs.m_value.number_integer) op (rhs.m_value.number_integer);                     \
+                return (lhs.m_data.m_value.number_integer) op (rhs.m_data.m_value.number_integer);                     \
                 \
             case value_t::number_unsigned:                                                               \
-                return (lhs.m_value.number_unsigned) op (rhs.m_value.number_unsigned);                   \
+                return (lhs.m_data.m_value.number_unsigned) op (rhs.m_data.m_value.number_unsigned);                   \
                 \
             case value_t::number_float:                                                                  \
-                return (lhs.m_value.number_float) op (rhs.m_value.number_float);                         \
+                return (lhs.m_data.m_value.number_float) op (rhs.m_data.m_value.number_float);                         \
                 \
             case value_t::binary:                                                                        \
-                return (*lhs.m_value.binary) op (*rhs.m_value.binary);                                   \
+                return (*lhs.m_data.m_value.binary) op (*rhs.m_data.m_value.binary);                                   \
                 \
             case value_t::discarded:                                                                     \
             default:                                                                                     \
@@ -22789,27 +22922,27 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     }                                                                                                    \
     else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)                   \
     {                                                                                                    \
-        return static_cast<number_float_t>(lhs.m_value.number_integer) op rhs.m_value.number_float;      \
+        return static_cast<number_float_t>(lhs.m_data.m_value.number_integer) op rhs.m_data.m_value.number_float;      \
     }                                                                                                    \
     else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)                   \
     {                                                                                                    \
-        return lhs.m_value.number_float op static_cast<number_float_t>(rhs.m_value.number_integer);      \
+        return lhs.m_data.m_value.number_float op static_cast<number_float_t>(rhs.m_data.m_value.number_integer);      \
     }                                                                                                    \
     else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)                  \
     {                                                                                                    \
-        return static_cast<number_float_t>(lhs.m_value.number_unsigned) op rhs.m_value.number_float;     \
+        return static_cast<number_float_t>(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_float;     \
     }                                                                                                    \
     else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)                  \
     {                                                                                                    \
-        return lhs.m_value.number_float op static_cast<number_float_t>(rhs.m_value.number_unsigned);     \
+        return lhs.m_data.m_value.number_float op static_cast<number_float_t>(rhs.m_data.m_value.number_unsigned);     \
     }                                                                                                    \
     else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)                \
     {                                                                                                    \
-        return static_cast<number_integer_t>(lhs.m_value.number_unsigned) op rhs.m_value.number_integer; \
+        return static_cast<number_integer_t>(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_integer; \
     }                                                                                                    \
     else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)                \
     {                                                                                                    \
-        return lhs.m_value.number_integer op static_cast<number_integer_t>(rhs.m_value.number_unsigned); \
+        return lhs.m_data.m_value.number_integer op static_cast<number_integer_t>(rhs.m_data.m_value.number_unsigned); \
     }                                                                                                    \
     else if(compares_unordered(lhs, rhs))\
     {\
@@ -22826,8 +22959,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     // an operation is computed as an odd number of inverses of others
     static bool compares_unordered(const_reference lhs, const_reference rhs, bool inverse = false) noexcept
     {
-        if ((lhs.is_number_float() && std::isnan(lhs.m_value.number_float) && rhs.is_number())
-                || (rhs.is_number_float() && std::isnan(rhs.m_value.number_float) && lhs.is_number()))
+        if ((lhs.is_number_float() && std::isnan(lhs.m_data.m_value.number_float) && rhs.is_number())
+                || (rhs.is_number_float() && std::isnan(rhs.m_data.m_value.number_float) && lhs.is_number()))
         {
             return true;
         }
@@ -23171,7 +23304,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
 #endif  // JSON_NO_IO
     /// @}
 
-
     /////////////////////
     // deserialization //
     /////////////////////
@@ -23328,7 +23460,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
     JSON_HEDLEY_RETURNS_NON_NULL
     const char* type_name() const noexcept
     {
-        switch (m_type)
+        switch (m_data.m_type)
         {
             case value_t::null:
                 return "null";
@@ -23352,17 +23484,43 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         }
     }
 
-
   JSON_PRIVATE_UNLESS_TESTED:
     //////////////////////
     // member variables //
     //////////////////////
 
-    /// the type of the current element
-    value_t m_type = value_t::null;
+    struct data
+    {
+        /// the type of the current element
+        value_t m_type = value_t::null;
 
-    /// the value of the current element
-    json_value m_value = {};
+        /// the value of the current element
+        json_value m_value = {};
+
+        data(const value_t v)
+            : m_type(v), m_value(v)
+        {
+        }
+
+        data(size_type cnt, const basic_json& val)
+            : m_type(value_t::array)
+        {
+            m_value.array = create<array_t>(cnt, val);
+        }
+
+        data() noexcept = default;
+        data(data&&) noexcept = default;
+        data(const data&) noexcept = delete;
+        data& operator=(data&&) noexcept = delete;
+        data& operator=(const data&) noexcept = delete;
+
+        ~data() noexcept
+        {
+            m_value.destroy(m_type);
+        }
+    };
+
+    data m_data = {};
 
 #if JSON_DIAGNOSTICS
     /// a pointer to a parent value (for debugging purposes)
@@ -23543,7 +23701,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler);
     }
 
-
     JSON_HEDLEY_WARN_UNUSED_RESULT
     JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
     static basic_json from_cbor(detail::span_input_adapter&& i,
@@ -23667,7 +23824,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         return res ? result : basic_json(value_t::discarded);
     }
 
-
     /// @brief create a JSON value from an input in BJData format
     /// @sa https://json.nlohmann.me/api/basic_json/from_bjdata/
     template<typename InputType>
@@ -23890,7 +24046,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             }
 
             // make sure the top element of the pointer exists
-            json_pointer top_pointer = ptr.top();
+            json_pointer const top_pointer = ptr.top();
             if (top_pointer != ptr)
             {
                 result.at(top_pointer);
@@ -23902,7 +24058,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
             // parent must exist when performing patch add per RFC6902 specs
             basic_json& parent = result.at(ptr);
 
-            switch (parent.m_type)
+            switch (parent.m_data.m_type)
             {
                 case value_t::null:
                 case value_t::object:
@@ -23948,7 +24104,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
         };
 
         // wrapper for "remove" operation; remove value at ptr
-        const auto operation_remove = [this, &result](json_pointer & ptr)
+        const auto operation_remove = [this, & result](json_pointer & ptr)
         {
             // get reference to parent of JSON pointer ptr
             const auto last_path = ptr.back();
@@ -23991,13 +24147,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                                           bool string_type) -> basic_json &
             {
                 // find value
-                auto it = val.m_value.object->find(member);
+                auto it = val.m_data.m_value.object->find(member);
 
                 // context-sensitive error message
-                const auto error_msg = (op == "op") ? "operation" : detail::concat("operation '", op, '\'');
+                const auto error_msg = (op == "op") ? "operation" : detail::concat("operation '", op, '\''); // NOLINT(bugprone-unused-local-non-trivial-variable)
 
                 // check if desired value is present
-                if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end()))
+                if (JSON_HEDLEY_UNLIKELY(it == val.m_data.m_value.object->end()))
                 {
                     // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
                     JSON_THROW(parse_error::create(105, 0, detail::concat(error_msg, " must have member '", member, "'"), &val));
@@ -24052,7 +24208,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                     json_pointer from_ptr(from_path);
 
                     // the "from" location must exist - use at()
-                    basic_json v = result.at(from_ptr);
+                    basic_json const v = result.at(from_ptr);
 
                     // The move operation is functionally identical to a
                     // "remove" operation on the "from" location, followed
@@ -24069,7 +24225,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec
                     const json_pointer from_ptr(from_path);
 
                     // the "from" location must exist - use at()
-                    basic_json v = result.at(from_ptr);
+                    basic_json const v = result.at(from_ptr);
 
                     // The copy is functionally identical to an "add"
                     // operation at the target location using the value
@@ -24311,7 +24467,11 @@ inline namespace json_literals
 /// @brief user-defined string literal for JSON values
 /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json/
 JSON_HEDLEY_NON_NULL(1)
-inline nlohmann::json operator "" _json(const char* s, std::size_t n)
+#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
+    inline nlohmann::json operator ""_json(const char* s, std::size_t n)
+#else
+    inline nlohmann::json operator "" _json(const char* s, std::size_t n)
+#endif
 {
     return nlohmann::json::parse(s, s + n);
 }
@@ -24319,7 +24479,11 @@ inline nlohmann::json operator "" _json(const char* s, std::size_t n)
 /// @brief user-defined string literal for JSON pointer
 /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json_pointer/
 JSON_HEDLEY_NON_NULL(1)
-inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
+#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
+    inline nlohmann::json::json_pointer operator ""_json_pointer(const char* s, std::size_t n)
+#else
+    inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
+#endif
 {
     return nlohmann::json::json_pointer(std::string(s, n));
 }
@@ -24338,7 +24502,7 @@ namespace std // NOLINT(cert-dcl58-cpp)
 /// @brief hash value for JSON objects
 /// @sa https://json.nlohmann.me/api/basic_json/std_hash/
 NLOHMANN_BASIC_JSON_TPL_DECLARATION
-struct hash<nlohmann::NLOHMANN_BASIC_JSON_TPL>
+struct hash<nlohmann::NLOHMANN_BASIC_JSON_TPL> // NOLINT(cert-dcl58-cpp)
 {
     std::size_t operator()(const nlohmann::NLOHMANN_BASIC_JSON_TPL& j) const
     {
@@ -24371,8 +24535,8 @@ struct less< ::nlohmann::detail::value_t> // do not remove the space after '<',
 /// @brief exchanges the values of two JSON objects
 /// @sa https://json.nlohmann.me/api/basic_json/std_swap/
 NLOHMANN_BASIC_JSON_TPL_DECLARATION
-inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept(  // NOLINT(readability-inconsistent-declaration-parameter-name)
-    is_nothrow_move_constructible<nlohmann::NLOHMANN_BASIC_JSON_TPL>::value&&                          // NOLINT(misc-redundant-expression)
+inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept(  // NOLINT(readability-inconsistent-declaration-parameter-name, cert-dcl58-cpp)
+    is_nothrow_move_constructible<nlohmann::NLOHMANN_BASIC_JSON_TPL>::value&&                          // NOLINT(misc-redundant-expression,cppcoreguidelines-noexcept-swap,performance-noexcept-swap)
     is_nothrow_move_assignable<nlohmann::NLOHMANN_BASIC_JSON_TPL>::value)
 {
     j1.swap(j2);
@@ -24383,17 +24547,22 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC
 }  // namespace std
 
 #if JSON_USE_GLOBAL_UDLS
-    using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers)
-    using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers)
+    #if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
+        using nlohmann::literals::json_literals::operator ""_json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers)
+        using nlohmann::literals::json_literals::operator ""_json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers)
+    #else
+        using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers)
+        using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers)
+    #endif
 #endif
 
 // #include <nlohmann/detail/macro_unscope.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
@@ -24428,16 +24597,17 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC
     #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM
     #undef JSON_HAS_THREE_WAY_COMPARISON
     #undef JSON_HAS_RANGES
+    #undef JSON_HAS_STATIC_RTTI
     #undef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
 #endif
 
 // #include <nlohmann/thirdparty/hedley/hedley_undef.hpp>
 //     __ _____ _____ _____
 //  __|  |   __|     |   | |  JSON for Modern C++
-// |  |  |__   |  |  | | | |  version 3.11.2
+// |  |  |__   |  |  | | | |  version 3.11.3
 // |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 //
-// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
+// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
 // SPDX-License-Identifier: MIT
 
 
diff --git a/common/llguidance.cpp b/common/llguidance.cpp
new file mode 100644
index 000000000..2feeb93c8
--- /dev/null
+++ b/common/llguidance.cpp
@@ -0,0 +1,270 @@
+#include "sampling.h"
+#include "log.h"
+
+#ifdef LLAMA_USE_LLGUIDANCE
+
+#    include "llguidance.h"
+#    include <cmath>
+
+struct llama_sampler_llg {
+    const llama_vocab * vocab;
+    std::string         grammar_kind;
+    std::string         grammar_data;
+    LlgTokenizer *      tokenizer;
+    LlgConstraint *     grammar;
+    LlgMaskResult       llg_res;
+    bool                has_llg_res;
+};
+
+static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                             const char * grammar_data) {
+    LlgConstraintInit cinit;
+    llg_constraint_init_set_defaults(&cinit, tokenizer);
+    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
+    if (log_level && *log_level) {
+        cinit.log_stderr_level = atoi(log_level);
+    }
+    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
+    if (llg_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_get_error(c));
+        llg_free_constraint(c);
+        return nullptr;
+    }
+    return c;
+}
+
+static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
+    return "llguidance";
+}
+
+static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (ctx->grammar) {
+        LlgCommitResult res;
+        llg_commit_token(ctx->grammar, token, &res);
+        ctx->has_llg_res = false;
+    }
+}
+
+static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (ctx->grammar) {
+        if (!ctx->has_llg_res) {
+            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
+                ctx->has_llg_res = true;
+            } else {
+                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
+                llg_free_constraint(ctx->grammar);
+                ctx->grammar = nullptr;
+            }
+        }
+        if (ctx->has_llg_res) {
+            if (ctx->llg_res.is_stop) {
+                for (size_t i = 0; i < cur_p->size; ++i) {
+                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
+                        cur_p->data[i].logit = -INFINITY;
+                    }
+                }
+            } else {
+                const uint32_t * mask = ctx->llg_res.sample_mask;
+                for (size_t i = 0; i < cur_p->size; ++i) {
+                    auto token = cur_p->data[i].id;
+                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                        cur_p->data[i].logit = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void llama_sampler_llg_reset(llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (!ctx->grammar) {
+        return;
+    }
+
+    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
+    llg_free_constraint(ctx->grammar);
+    ctx->grammar     = grammar_new;
+    ctx->has_llg_res = false;
+}
+
+static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
+
+    auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_llg *) result->ctx;
+
+        if (ctx->grammar) {
+            result_ctx->grammar_kind = ctx->grammar_kind;
+            result_ctx->grammar_data = ctx->grammar_data;
+            result_ctx->grammar      = llg_clone_constraint(ctx->grammar);
+            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
+        }
+    }
+
+    return result;
+}
+
+static void llama_sampler_llg_free(llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_llg *) smpl->ctx;
+
+    if (ctx->grammar) {
+        llg_free_constraint(ctx->grammar);
+        llg_free_tokenizer(ctx->tokenizer);
+    }
+
+    delete ctx;
+}
+
+static llama_sampler_i llama_sampler_llg_i = {
+    /* .name   = */ llama_sampler_llg_name,
+    /* .accept = */ llama_sampler_llg_accept_impl,
+    /* .apply  = */ llama_sampler_llg_apply,
+    /* .reset  = */ llama_sampler_llg_reset,
+    /* .clone  = */ llama_sampler_llg_clone,
+    /* .free   = */ llama_sampler_llg_free,
+};
+
+static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
+                                            uint32_t * output_tokens, size_t output_tokens_len) {
+    const llama_vocab * vocab = (const llama_vocab *) user_data;
+    int                 r     = 0;
+    try {
+        r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
+                           true);
+    } catch (const std::exception & e) {
+        GGML_ABORT("llama_tokenize failed: %s\n", e.what());
+    }
+    if (r < 0) {
+        return -r;
+    }
+    return r;
+}
+
+static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
+    // TODO store the tokenizer in the vocab somehow
+    static const llama_vocab * vocab_cache;
+    static LlgTokenizer *      tokenizer_cache;
+
+    if (vocab_cache == vocab) {
+        return llg_clone_tokenizer(tokenizer_cache);
+    }
+
+    auto tok_eos = llama_vocab_eot(vocab);
+    if (tok_eos == LLAMA_TOKEN_NULL) {
+        tok_eos = llama_vocab_eos(vocab);
+    }
+
+    size_t vocab_size = llama_vocab_n_tokens(vocab);
+
+    auto token_lens       = new uint32_t[vocab_size];
+    // we typically have ~7 bytes per token; let's go on the safe side here
+    auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
+    auto token_bytes      = new uint8_t[token_bytes_size];
+
+    size_t offset = 0;
+    for (size_t i = 0; i < vocab_size; i++) {
+        size_t max_token = 1024;
+        if (token_bytes_size - offset < max_token) {
+            GGML_ABORT("token_bytes buffer too small\n");
+        }
+
+        llama_token token = i;
+        auto        dp    = (char *) token_bytes + offset;
+        auto        size  = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
+        if (size < 0) {
+            GGML_ABORT("llama_detokenize failed\n");
+        }
+        if (size == 0) {
+            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
+            if (size < 0) {
+                GGML_ABORT("llama_detokenize failed\n");
+            }
+            if (size != 0) {
+                *dp = '\xff';  // special token prefix marker
+                size += 1;
+            }
+        }
+
+        token_lens[i] = size;
+        offset += size;
+    }
+
+    LlgTokenizerInit tinit = {
+        /* .vocab_size                         = */ (uint32_t) vocab_size,
+        /* .tok_eos                            = */ (uint32_t) tok_eos,
+        /* .token_lens                         = */ token_lens,
+        /* .token_bytes                        = */ token_bytes,
+        /* .tokenizer_json                     = */ nullptr,
+        /* .tokenize_assumes_string            = */ true,
+        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
+        /* .use_approximate_greedy_tokenize_fn = */ false,
+        /* .tokenize_user_data                 = */ vocab,
+    };
+
+    char           error_buffer[1024];
+    LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
+
+    delete[] token_bytes;
+    delete[] token_lens;
+
+    if (tokenizer == nullptr) {
+        LOG_ERR("llg tokenizer error: %s\n", error_buffer);
+        return tokenizer;
+    }
+
+    if (tokenizer_cache) {
+        llg_free_tokenizer(tokenizer_cache);
+    }
+    vocab_cache     = vocab;
+    tokenizer_cache = tokenizer;
+
+    return llg_clone_tokenizer(tokenizer_cache);
+}
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
+                                       const char * grammar_data) {
+    auto * ctx = new llama_sampler_llg;
+
+    if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
+        auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
+        *ctx           = {
+            /* .vocab        = */ vocab,
+            /* .grammar_kind = */ grammar_kind,
+            /* .grammar_data = */ grammar_data,
+            /* .tokenizer    = */ tokenizer,
+            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
+            /* .llg_res      = */ {},
+            /* .has_llg_res  = */ false,
+        };
+    } else {
+        *ctx = {
+            /* .vocab        = */ vocab,
+            /* .grammar_kind = */ {},
+            /* .grammar_data = */ {},
+            /* .tokenizer    = */ nullptr,
+            /* .grammar      = */ nullptr,
+            /* .llg_res      = */ {},
+            /* .has_llg_res  = */ false,
+        };
+    }
+
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_llg_i,
+        /* .ctx   = */ ctx
+    );
+}
+
+#else
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
+    LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+    return nullptr;
+}
+
+#endif  // LLAMA_USE_LLGUIDANCE
diff --git a/common/log.cpp b/common/log.cpp
new file mode 100644
index 000000000..4bfbecf15
--- /dev/null
+++ b/common/log.cpp
@@ -0,0 +1,392 @@
+#include "log.h"
+
+#include <condition_variable>
+#include <cstdarg>
+#include <cstdio>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
+}
+
+static int64_t t_us() {
+    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+// colors
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
+};
+
+// disable colors by default
+static std::vector<const char *> g_col = {
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+    "",
+};
+
+struct common_log_entry {
+    enum ggml_log_level level;
+
+    bool prefix;
+
+    int64_t timestamp;
+
+    std::vector<char> msg;
+
+    // signals the worker thread to stop
+    bool is_end;
+
+    void print(FILE * file = nullptr) const {
+        FILE * fcur = file;
+        if (!fcur) {
+            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
+            // these messages will still be logged to a file
+            if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+                return;
+            }
+
+            fcur = stdout;
+
+            if (level != GGML_LOG_LEVEL_NONE) {
+                fcur = stderr;
+            }
+        }
+
+        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
+            if (timestamp) {
+                // [M.s.ms.us]
+                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
+                        g_col[COMMON_LOG_COL_BLUE],
+                        (int) (timestamp / 1000000 / 60),
+                        (int) (timestamp / 1000000 % 60),
+                        (int) (timestamp / 1000 % 1000),
+                        (int) (timestamp % 1000),
+                        g_col[COMMON_LOG_COL_DEFAULT]);
+            }
+
+            switch (level) {
+                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                        ); break;
+                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                        ); break;
+                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                        ); break;
+                default:
+                    break;
+            }
+        }
+
+        fprintf(fcur, "%s", msg.data());
+
+        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
+            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
+        }
+
+        fflush(fcur);
+    }
+};
+
+struct common_log {
+    // default capacity - will be expanded if needed
+    common_log() : common_log(256) {}
+
+    common_log(size_t capacity) {
+        file = nullptr;
+        prefix = false;
+        timestamps = false;
+        running = false;
+        t_start = t_us();
+
+        // initial message size - will be expanded if longer messages arrive
+        entries.resize(capacity);
+        for (auto & entry : entries) {
+            entry.msg.resize(256);
+        }
+
+        head = 0;
+        tail = 0;
+
+        resume();
+    }
+
+    ~common_log() {
+        pause();
+        if (file) {
+            fclose(file);
+        }
+    }
+
+private:
+    std::mutex mtx;
+    std::thread thrd;
+    std::condition_variable cv;
+
+    FILE * file;
+
+    bool prefix;
+    bool timestamps;
+    bool running;
+
+    int64_t t_start;
+
+    // ring buffer of entries
+    std::vector<common_log_entry> entries;
+    size_t head;
+    size_t tail;
+
+    // worker thread copies into this
+    common_log_entry cur;
+
+public:
+    void add(enum ggml_log_level level, const char * fmt, va_list args) {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        if (!running) {
+            // discard messages while the worker thread is paused
+            return;
+        }
+
+        auto & entry = entries[tail];
+
+        {
+            // cannot use args twice, so make a copy in case we need to expand the buffer
+            va_list args_copy;
+            va_copy(args_copy, args);
+
+#if 1
+            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
+            if (n >= entry.msg.size()) {
+                entry.msg.resize(n + 1);
+                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
+            }
+#else
+            // hack for bolding arguments
+
+            std::stringstream ss;
+            for (int i = 0; fmt[i] != 0; i++) {
+                if (fmt[i] == '%') {
+                    ss << LOG_COL_BOLD;
+                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
+                    ss << LOG_COL_DEFAULT;
+                    if (fmt[i] == 0) break;
+                }
+                ss << fmt[i];
+            }
+            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
+            if (n >= entry.msg.size()) {
+                entry.msg.resize(n + 1);
+                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
+            }
+#endif
+            va_end(args_copy);
+        }
+
+        entry.level = level;
+        entry.prefix = prefix;
+        entry.timestamp = 0;
+        if (timestamps) {
+            entry.timestamp = t_us() - t_start;
+        }
+        entry.is_end = false;
+
+        tail = (tail + 1) % entries.size();
+        if (tail == head) {
+            // expand the buffer
+            std::vector<common_log_entry> new_entries(2*entries.size());
+
+            size_t new_tail = 0;
+
+            do {
+                new_entries[new_tail] = std::move(entries[head]);
+
+                head     = (head     + 1) % entries.size();
+                new_tail = (new_tail + 1);
+            } while (head != tail);
+
+            head = 0;
+            tail = new_tail;
+
+            for (size_t i = tail; i < new_entries.size(); i++) {
+                new_entries[i].msg.resize(256);
+            }
+
+            entries = std::move(new_entries);
+        }
+
+        cv.notify_one();
+    }
+
+    void resume() {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        if (running) {
+            return;
+        }
+
+        running = true;
+
+        thrd = std::thread([this]() {
+            while (true) {
+                {
+                    std::unique_lock<std::mutex> lock(mtx);
+                    cv.wait(lock, [this]() { return head != tail; });
+
+                    cur = entries[head];
+
+                    head = (head + 1) % entries.size();
+                }
+
+                if (cur.is_end) {
+                    break;
+                }
+
+                cur.print(); // stdout and stderr
+
+                if (file) {
+                    cur.print(file);
+                }
+            }
+        });
+    }
+
+    void pause() {
+        {
+            std::lock_guard<std::mutex> lock(mtx);
+
+            if (!running) {
+                return;
+            }
+
+            running = false;
+
+            // push an entry to signal the worker thread to stop
+            {
+                auto & entry = entries[tail];
+                entry.is_end = true;
+
+                tail = (tail + 1) % entries.size();
+            }
+
+            cv.notify_one();
+        }
+
+        thrd.join();
+    }
+
+    void set_file(const char * path) {
+        pause();
+
+        if (file) {
+            fclose(file);
+        }
+
+        if (path) {
+            file = fopen(path, "w");
+        } else {
+            file = nullptr;
+        }
+
+        resume();
+    }
+
+    void set_colors(bool colors) {
+        pause();
+
+        if (colors) {
+            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
+        } else {
+            for (size_t i = 0; i < g_col.size(); i++) {
+                g_col[i] = "";
+            }
+        }
+
+        resume();
+    }
+
+    void set_prefix(bool prefix) {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        this->prefix = prefix;
+    }
+
+    void set_timestamps(bool timestamps) {
+        std::lock_guard<std::mutex> lock(mtx);
+
+        this->timestamps = timestamps;
+    }
+};
+
+//
+// public API
+//
+
+struct common_log * common_log_init() {
+    return new common_log;
+}
+
+struct common_log * common_log_main() {
+    static struct common_log log;
+
+    return &log;
+}
+
+void common_log_pause(struct common_log * log) {
+    log->pause();
+}
+
+void common_log_resume(struct common_log * log) {
+    log->resume();
+}
+
+void common_log_free(struct common_log * log) {
+    delete log;
+}
+
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    log->add(level, fmt, args);
+    va_end(args);
+}
+
+void common_log_set_file(struct common_log * log, const char * file) {
+    log->set_file(file);
+}
+
+void common_log_set_colors(struct common_log * log, bool colors) {
+    log->set_colors(colors);
+}
+
+void common_log_set_prefix(struct common_log * log, bool prefix) {
+    log->set_prefix(prefix);
+}
+
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
+    log->set_timestamps(timestamps);
+}
diff --git a/common/log.h b/common/log.h
index e4e1b9f4f..4ebc6314b 100644
--- a/common/log.h
+++ b/common/log.h
@@ -1,723 +1,103 @@
 #pragma once
 
-#include <chrono>
-#include <cstring>
-#include <sstream>
-#include <iostream>
-#include <thread>
-#include <vector>
-#include <algorithm>
-#include <cinttypes>
+#include "ggml.h" // for ggml_log_level
 
-// --------------------------------
-//
-// Basic usage:
-//
-// --------
-//
-//  The LOG() and LOG_TEE() macros are ready to go by default
-//   they do not require any initialization.
-//
-//  LOGLN() and LOG_TEELN() are variants which automatically
-//   include \n character at the end of the log string.
-//
-//  LOG() behaves exactly like printf, by default writing to a logfile.
-//  LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
-//
-//  Default logfile is named
-//   "llama.<threadID>.log"
-//  Default LOG_TEE() secondary output target is
-//   stderr
-//
-//  Logs can be dynamically disabled or enabled using functions:
-//   log_disable()
-//  and
-//   log_enable()
-//
-//  A log target can be changed with:
-//   log_set_target( string )
-//    creating and opening, or re-opening a file by string filename
-//  or
-//   log_set_target( FILE* )
-//    allowing to point at stderr, stdout, or any valid FILE* file handler.
-//
-// --------
-//
-// End of Basic usage.
-//
-// --------------------------------
+#define LOG_CLR_TO_EOL  "\033[K\r"
+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD    "\033[1m"
+#define LOG_COL_RED     "\033[31m"
+#define LOG_COL_GREEN   "\033[32m"
+#define LOG_COL_YELLOW  "\033[33m"
+#define LOG_COL_BLUE    "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN    "\033[36m"
+#define LOG_COL_WHITE   "\033[37m"
 
-// Specifies a log target.
-//  default uses log_handler() with "llama.log" log file
-//  this can be changed, by defining LOG_TARGET
-//  like so:
-//
-//  #define LOG_TARGET (a valid FILE*)
-//  #include "log.h"
-//
-//  or it can be simply redirected to stdout or stderr
-//  like so:
-//
-//  #define LOG_TARGET stderr
-//  #include "log.h"
-//
-//  The log target can also be redirected to a different function
-//  like so:
-//
-//  #define LOG_TARGET log_handler_different()
-//  #include "log.h"
-//
-//  FILE* log_handler_different()
-//  {
-//      return stderr;
-//  }
-//
-//  or:
-//
-//  #define LOG_TARGET log_handler_another_one("somelog.log")
-//  #include "log.h"
-//
-//  FILE* log_handler_another_one(char*filename)
-//  {
-//      static FILE* logfile = nullptr;
-//      (...)
-//      if( !logfile )
-//      {
-//          fopen(...)
-//      }
-//      (...)
-//      return logfile
-//  }
-//
-#ifndef LOG_TARGET
-    #define LOG_TARGET log_handler()
-#endif
-
-#ifndef LOG_TEE_TARGET
-    #define LOG_TEE_TARGET stderr
-#endif
-
-// Utility for synchronizing log configuration state
-//  since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
-// Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
-   static std::string pid;
-   if (pid.empty())
-   {
-       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-       //  it's not the same as "pid" but is unique enough to solve multiple instances
-       //  trying to write to the same log.
-       std::stringstream ss;
-       ss << std::this_thread::get_id();
-       pid = ss.str();
-   }
-
-   return pid;
-}
-
-// Utility function for generating log file names with unique id based on thread id.
-//  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
-//  where the number is a runtime id of the current thread.
-
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
-
-// INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
-{
-    static bool _multilog = false;
-
-    if (multilog != LogTriStateSame)
-    {
-        _multilog = multilog == LogTriStateTrue;
-    }
-
-    std::stringstream buf;
-
-    buf << log_file_basename;
-    if (_multilog)
-    {
-        buf << ".";
-        buf << log_get_pid();
-    }
-    buf << ".";
-    buf << log_file_extension;
-
-    return buf.str();
-}
-
-#ifndef LOG_DEFAULT_FILE_NAME
-    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
-#endif
-
-// Utility for turning #define values into string literals
-//  so we can have a define for stderr and
-//  we can print "stderr" instead of literal stderr, etc.
-#define LOG_STRINGIZE1(s) #s
-#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
-
-#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
-
-// Allows disabling timestamps.
-//  in order to disable, define LOG_NO_TIMESTAMPS
-//  like so:
-//
-//  #define LOG_NO_TIMESTAMPS
-//  #include "log.h"
-//
-#ifndef LOG_NO_TIMESTAMPS
-    #ifndef _MSC_VER
-        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #else
-        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #endif
+#ifndef __GNUC__
+#    define LOG_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
-    #define LOG_TIMESTAMP_FMT "%s"
-    #define LOG_TIMESTAMP_VAL ,""
+#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
-#ifdef LOG_TEE_TIMESTAMPS
-    #ifndef _MSC_VER
-        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #else
-        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #endif
-#else
-    #define LOG_TEE_TIMESTAMP_FMT "%s"
-    #define LOG_TEE_TIMESTAMP_VAL ,""
-#endif
+#define LOG_DEFAULT_DEBUG 1
+#define LOG_DEFAULT_LLAMA 0
 
-// Allows disabling file/line/function prefix
-//  in order to disable, define LOG_NO_FILE_LINE_FUNCTION
-//  like so:
+// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;
+
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe
+
+// the common_log uses an internal worker thread to print/write log messages
+// when the worker thread is paused, incoming log messages are discarded
+struct common_log;
+
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void                common_log_pause (struct common_log * log); // pause  the worker thread, not thread-safe
+void                common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void                common_log_free  (struct common_log * log);
+
+LOG_ATTRIBUTE_FORMAT(3, 4)
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
+
+// defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
-//  #define LOG_NO_FILE_LINE_FUNCTION
-//  #include "log.h"
+// regular log output:
 //
-#ifndef LOG_NO_FILE_LINE_FUNCTION
-    #ifndef _MSC_VER
-        #define LOG_FLF_FMT "[%24s:%5d][%24s] "
-        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #else
-        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #endif
-#else
-    #define LOG_FLF_FMT "%s"
-    #define LOG_FLF_VAL ,""
-#endif
-
-#ifdef LOG_TEE_FILE_LINE_FUNCTION
-    #ifndef _MSC_VER
-        #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #else
-        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #endif
-#else
-    #define LOG_TEE_FLF_FMT "%s"
-    #define LOG_TEE_FLF_VAL ,""
-#endif
-
-// INTERNAL, DO NOT USE
-//  USE LOG() INSTEAD
+//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
+//   llm_load_tensors: ggml ctx size =    0.27 MiB
+//   llm_load_tensors: offloading 32 repeating layers to GPU
+//   llm_load_tensors: offloading non-repeating layers to GPU
 //
-#ifndef _MSC_VER
-    #define LOG_IMPL(str, ...)                                                                                      \
-    do {                                                                                                            \
-        if (LOG_TARGET != nullptr)                                                                                  \
-        {                                                                                                           \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
-            fflush(LOG_TARGET);                                                                                     \
-        }                                                                                                           \
+// with prefix = true, timestamps = true, the log output will look like this:
+//
+//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
+//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
+//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
+//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
+//
+// I - info    (stdout, V = 0)
+// W - warning (stderr, V = 0)
+// E - error   (stderr, V = 0)
+// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
+//
+
+void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
+void common_log_set_colors    (struct common_log * log,       bool   colors);     // not thread-safe
+void common_log_set_prefix    (struct common_log * log,       bool   prefix);     // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+
+// helper macros for logging
+// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
+//
+// for example:
+//
+//   LOG_DBG("this is a debug message: %d\n", expensive_function());
+//
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
+//
+
+#define LOG_TMPL(level, verbosity, ...) \
+    do { \
+        if ((verbosity) <= common_log_verbosity_thold) { \
+            common_log_add(common_log_main(), (level), __VA_ARGS__); \
+        } \
     } while (0)
-#else
-    #define LOG_IMPL(str, ...)                                                                                           \
-    do {                                                                                                                 \
-        if (LOG_TARGET != nullptr)                                                                                       \
-        {                                                                                                                \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
-            fflush(LOG_TARGET);                                                                                          \
-        }                                                                                                                \
-    } while (0)
-#endif
 
-// INTERNAL, DO NOT USE
-//  USE LOG_TEE() INSTEAD
-//
-#ifndef _MSC_VER
-    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
-    do {                                                                                                                                \
-        if (LOG_TARGET != nullptr)                                                                                                      \
-        {                                                                                                                               \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__);                     \
-            fflush(LOG_TARGET);                                                                                                         \
-        }                                                                                                                               \
-        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                         \
-        {                                                                                                                               \
-            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
-            fflush(LOG_TEE_TARGET);                                                                                                     \
-        }                                                                                                                               \
-    } while (0)
-#else
-    #define LOG_TEE_IMPL(str, ...)                                                                                                           \
-    do {                                                                                                                                     \
-        if (LOG_TARGET != nullptr)                                                                                                           \
-        {                                                                                                                                    \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__);                     \
-            fflush(LOG_TARGET);                                                                                                              \
-        }                                                                                                                                    \
-        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                              \
-        {                                                                                                                                    \
-            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
-            fflush(LOG_TEE_TARGET);                                                                                                          \
-        }                                                                                                                                    \
-    } while (0)
-#endif
+#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
 
-// The '\0' as a last argument, is a trick to bypass the silly
-//  "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
-//  so we can have a single macro which can be called just like printf.
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)
 
-// Main LOG macro.
-//  behaves like printf, and supports arguments the exact same way.
-//
-#ifndef _MSC_VER
-    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
-#else
-    #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
-#endif
-
-// Main TEE macro.
-//  does the same as LOG
-//  and
-//  simultaneously writes stderr.
-//
-// Secondary target can be changed just like LOG_TARGET
-//  by defining LOG_TEE_TARGET
-//
-#ifndef _MSC_VER
-    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
-#else
-    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
-#endif
-
-// LOG macro variants with auto endline.
-#ifndef _MSC_VER
-    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
-    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
-#else
-    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
-    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
-#endif
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
-{
-    static bool _initialized = false;
-    static bool _append = false;
-    static bool _disabled = filename.empty() && target == nullptr;
-    static std::string log_current_filename{filename};
-    static FILE *log_current_target{target};
-    static FILE *logfile = nullptr;
-
-    if (change)
-    {
-        if (append != LogTriStateSame)
-        {
-            _append = append == LogTriStateTrue;
-            return logfile;
-        }
-
-        if (disable == LogTriStateTrue)
-        {
-            // Disable primary target
-            _disabled = true;
-        }
-        // If previously disabled, only enable, and keep previous target
-        else if (disable == LogTriStateFalse)
-        {
-            _disabled = false;
-        }
-        // Otherwise, process the arguments
-        else if (log_current_filename != filename || log_current_target != target)
-        {
-            _initialized = false;
-        }
-    }
-
-    if (_disabled)
-    {
-        // Log is disabled
-        return nullptr;
-    }
-
-    if (_initialized)
-    {
-        // with fallback in case something went wrong
-        return logfile ? logfile : stderr;
-    }
-
-    // do the (re)initialization
-    if (target != nullptr)
-    {
-        if (logfile != nullptr && logfile != stdout && logfile != stderr)
-        {
-            fclose(logfile);
-        }
-
-        log_current_filename = LOG_DEFAULT_FILE_NAME;
-        log_current_target = target;
-
-        logfile = target;
-    }
-    else
-    {
-        if (log_current_filename != filename)
-        {
-            if (logfile != nullptr && logfile != stdout && logfile != stderr)
-            {
-                fclose(logfile);
-            }
-        }
-
-        logfile = fopen(filename.c_str(), _append ? "a" : "w");
-    }
-
-    if (!logfile)
-    {
-        //  Verify whether the file was opened, otherwise fallback to stderr
-        logfile = stderr;
-
-        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
-        fflush(stderr);
-
-        // At this point we let the init flag be to true below, and let the target fallback to stderr
-        //  otherwise we would repeatedly fopen() which was already unsuccessful
-    }
-
-    _initialized = true;
-
-    return logfile ? logfile : stderr;
-}
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
-{
-    return log_handler1_impl(change, append, disable, filename, target);
-}
-
-// Disables logs entirely at runtime.
-//  Makes LOG() and LOG_TEE() produce no output,
-//  until enabled back.
-#define log_disable() log_disable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_disable_impl()
-{
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
-}
-
-// Enables logs at runtime.
-#define log_enable() log_enable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_enable_impl()
-{
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
-}
-
-// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
-#define log_set_target(target) log_set_target_impl(target)
-
-// INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler() { return log_handler1_impl(); }
-
-// Enable or disable creating separate log files for each run.
-//  can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-//  can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
-    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
-inline void log_test()
-{
-    log_disable();
-    LOG("01 Hello World to nobody, because logs are disabled!\n");
-    log_enable();
-    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
-    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
-    log_set_target(stderr);
-    LOG("04 Hello World to stderr!\n");
-    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
-    log_set_target(LOG_DEFAULT_FILE_NAME);
-    LOG("06 Hello World to default log file!\n");
-    log_set_target(stdout);
-    LOG("07 Hello World to stdout!\n");
-    log_set_target(LOG_DEFAULT_FILE_NAME);
-    LOG("08 Hello World to default log file again!\n");
-    log_disable();
-    LOG("09 Hello World _1_ into the void!\n");
-    log_enable();
-    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
-    log_disable();
-    log_set_target("llama.anotherlog.log");
-    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
-    log_enable();
-    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
-    log_set_target("llama.yetanotherlog.log");
-    LOG("13 Hello World this time in yet new file?\n");
-    log_set_target(log_filename_generator("llama_autonamed", "log"));
-    LOG("14 Hello World in log with generated filename!\n");
-#ifdef _MSC_VER
-    LOG_TEE("15 Hello msvc TEE without arguments\n");
-    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
-    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
-    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
-    LOG("19 Hello msvc LOG without arguments\n");
-    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
-    LOGLN("21 Hello msvc LOGLN without arguments\n");
-    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
-#endif
-}
-
-inline bool log_param_single_parse(const std::string & param)
-{
-    if ( param == "--log-test")
-    {
-        log_test();
-        return true;
-    }
-
-    if ( param == "--log-disable")
-    {
-        log_disable();
-        return true;
-    }
-
-    if ( param == "--log-enable")
-    {
-        log_enable();
-        return true;
-    }
-
-    if (param == "--log-new")
-    {
-        log_multilog(true);
-        return true;
-    }
-
-    if (param == "--log-append")
-    {
-        log_append(true);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
-{
-    if ( param == "--log-file")
-    {
-        if (!check_but_dont_parse)
-        {
-            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
-        }
-
-        return true;
-    }
-
-    return false;
-}
-
-inline void log_print_usage()
-{
-    printf("log options:\n");
-    /* format
-    printf("  -h, --help            show this help message and exit\n");*/
-    /* spacing
-    printf("__-param----------------Description\n");*/
-    printf("  --log-test            Run simple logging test\n");
-    printf("  --log-disable         Disable trace logs\n");
-    printf("  --log-enable          Enable trace logs\n");
-    printf("  --log-file            Specify a log filename (without extension)\n");
-    printf("  --log-new             Create a separate new log file on start. "
-                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
-    printf("  --log-append          Don't truncate the old log file.\n");
-}
-
-#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
-
-// INTERNAL, DO NOT USE
-inline void log_dump_cmdline_impl(int argc, char **argv)
-{
-    std::stringstream buf;
-    for (int i = 0; i < argc; ++i)
-    {
-        if (std::string(argv[i]).find(' ') != std::string::npos)
-        {
-            buf << " \"" << argv[i] <<"\"";
-        }
-        else
-        {
-            buf << " " << argv[i];
-        }
-    }
-    LOGLN("Cmd:%s", buf.str().c_str());
-}
-
-#define log_tostr(var) log_var_to_string_impl(var).c_str()
-
-inline std::string log_var_to_string_impl(bool var)
-{
-    return var ? "true" : "false";
-}
-
-inline std::string log_var_to_string_impl(std::string var)
-{
-    return var;
-}
-
-inline std::string log_var_to_string_impl(const std::vector<int> & var)
-{
-    std::stringstream buf;
-    buf << "[ ";
-    bool first = true;
-    for (auto e : var)
-    {
-        if (first)
-        {
-            first = false;
-        }
-        else
-        {
-            buf << ", ";
-        }
-        buf << std::to_string(e);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-template <typename C, typename T>
-inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
-{
-    std::stringstream buf;
-    buf << "[ ";
-
-    bool first = true;
-    for (const auto &token : tokens)
-    {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, token);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf
-            << "'" << detokenized << "'"
-            << ":" << std::to_string(token);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-template <typename C, typename B>
-inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
-{
-    std::stringstream buf;
-    buf << "[ ";
-
-    bool first = true;
-    for (int i = 0; i < batch.n_tokens; ++i)
-    {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf
-            << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-#ifdef LOG_DISABLE_LOGS
-
-#undef LOG
-#define LOG(...) // dummy stub
-#undef LOGLN
-#define LOGLN(...) // dummy stub
-
-#undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_DISABLE
-#define LOG_DISABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
-
-#undef LOG_SET_TARGET
-#define LOG_SET_TARGET(...) // dummy stub
-
-#undef LOG_DUMP_CMDLINE
-#define LOG_DUMP_CMDLINE(...) // dummy stub
-
-#endif // LOG_DISABLE_LOGS
+#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
+#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
+#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
diff --git a/common/minja.hpp b/common/minja.hpp
new file mode 100644
index 000000000..c58dd66e0
--- /dev/null
+++ b/common/minja.hpp
@@ -0,0 +1,2883 @@
+/*
+    Copyright 2024 Google LLC
+
+    Use of this source code is governed by an MIT-style
+    license that can be found in the LICENSE file or at
+    https://opensource.org/licenses/MIT.
+*/
+// SPDX-License-Identifier: MIT
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <regex>
+#include <memory>
+#include <stdexcept>
+#include <sstream>
+#include <unordered_set>
+#include <json.hpp>
+
+using json = nlohmann::ordered_json;
+
+namespace minja {
+
+class Context;
+
+struct Options {
+    bool trim_blocks;  // removes the first newline after a block
+    bool lstrip_blocks;  // removes leading whitespace on the line of the block
+    bool keep_trailing_newline;  // don't remove last newline
+};
+
+struct ArgumentsValue;
+
+inline std::string normalize_newlines(const std::string & s) {
+#ifdef _WIN32
+  static const std::regex nl_regex("\r\n");
+  return std::regex_replace(s, nl_regex, "\n");
+#else
+  return s;
+#endif
+}
+
+/* Values that behave roughly like in Python. */
+class Value : public std::enable_shared_from_this<Value> {
+public:
+  using CallableType = std::function<Value(const std::shared_ptr<Context> &, ArgumentsValue &)>;
+  using FilterType = std::function<Value(const std::shared_ptr<Context> &, ArgumentsValue &)>;
+
+private:
+  using ObjectType = nlohmann::ordered_map<json, Value>;  // Only contains primitive keys
+  using ArrayType = std::vector<Value>;
+
+  std::shared_ptr<ArrayType> array_;
+  std::shared_ptr<ObjectType> object_;
+  std::shared_ptr<CallableType> callable_;
+  json primitive_;
+
+  Value(const std::shared_ptr<ArrayType> & array) : array_(array) {}
+  Value(const std::shared_ptr<ObjectType> & object) : object_(object) {}
+  Value(const std::shared_ptr<CallableType> & callable) : object_(std::make_shared<ObjectType>()), callable_(callable) {}
+
+  /* Python-style string repr */
+  static void dump_string(const json & primitive, std::ostringstream & out, char string_quote = '\'') {
+    if (!primitive.is_string()) throw std::runtime_error("Value is not a string: " + primitive.dump());
+    auto s = primitive.dump();
+    if (string_quote == '"' || s.find('\'') != std::string::npos) {
+      out << s;
+      return;
+    }
+    // Reuse json dump, just changing string quotes
+    out << string_quote;
+    for (size_t i = 1, n = s.size() - 1; i < n; ++i) {
+      if (s[i] == '\\' && s[i + 1] == '"') {
+        out << '"';
+        i++;
+      } else if (s[i] == string_quote) {
+        out << '\\' << string_quote;
+      } else {
+        out << s[i];
+      }
+    }
+    out << string_quote;
+  }
+  void dump(std::ostringstream & out, int indent = -1, int level = 0, bool to_json = false) const {
+    auto print_indent = [&](int level) {
+      if (indent > 0) {
+          out << "\n";
+          for (int i = 0, n = level * indent; i < n; ++i) out << ' ';
+      }
+    };
+    auto print_sub_sep = [&]() {
+      out << ',';
+      if (indent < 0) out << ' ';
+      else print_indent(level + 1);
+    };
+
+    auto string_quote = to_json ? '"' : '\'';
+
+    if (is_null()) out << "null";
+    else if (array_) {
+      out << "[";
+      print_indent(level + 1);
+      for (size_t i = 0; i < array_->size(); ++i) {
+        if (i) print_sub_sep();
+        (*array_)[i].dump(out, indent, level + 1, to_json);
+      }
+      print_indent(level);
+      out << "]";
+    } else if (object_) {
+      out << "{";
+      print_indent(level + 1);
+      for (auto begin = object_->begin(), it = begin; it != object_->end(); ++it) {
+        if (it != begin) print_sub_sep();
+        if (it->first.is_string()) {
+          dump_string(it->first, out, string_quote);
+        } else {
+          out << string_quote << it->first.dump() << string_quote;
+        }
+        out << ": ";
+        it->second.dump(out, indent, level + 1, to_json);
+      }
+      print_indent(level);
+      out << "}";
+    } else if (callable_) {
+      throw std::runtime_error("Cannot dump callable to JSON");
+    } else if (is_boolean() && !to_json) {
+      out << (this->to_bool() ? "True" : "False");
+    } else if (is_string() && !to_json) {
+      dump_string(primitive_, out, string_quote);
+    } else {
+      out << primitive_.dump();
+    }
+  }
+
+public:
+  Value() {}
+  Value(const bool& v) : primitive_(v) {}
+  Value(const int64_t & v) : primitive_(v) {}
+  Value(const double& v) : primitive_(v) {}
+  Value(const std::nullptr_t &) {}
+  Value(const std::string & v) : primitive_(v) {}
+  Value(const char * v) : primitive_(std::string(v)) {}
+
+  Value(const json & v) {
+    if (v.is_object()) {
+      auto object = std::make_shared<ObjectType>();
+      for (auto it = v.begin(); it != v.end(); ++it) {
+        (*object)[it.key()] = it.value();
+      }
+      object_ = std::move(object);
+    } else if (v.is_array()) {
+      auto array = std::make_shared<ArrayType>();
+      for (const auto& item : v) {
+        array->push_back(Value(item));
+      }
+      array_ = array;
+    } else {
+      primitive_ = v;
+    }
+  }
+
+  std::vector<Value> keys() {
+    if (!object_) throw std::runtime_error("Value is not an object: " + dump());
+    std::vector<Value> res;
+    for (const auto& item : *object_) {
+      res.push_back(item.first);
+    }
+    return res;
+  }
+
+  size_t size() const {
+    if (is_object()) return object_->size();
+    if (is_array()) return array_->size();
+    if (is_string()) return primitive_.get<std::string>().length();
+    throw std::runtime_error("Value is not an array or object: " + dump());
+  }
+
+  static Value array(const std::vector<Value> values = {}) {
+    auto array = std::make_shared<ArrayType>();
+    for (const auto& item : values) {
+      array->push_back(item);
+    }
+    return Value(array);
+  }
+  static Value object(const std::shared_ptr<ObjectType> object = std::make_shared<ObjectType>()) {
+    return Value(object);
+  }
+  static Value callable(const CallableType & callable) {
+    return Value(std::make_shared<CallableType>(callable));
+  }
+
+  void insert(size_t index, const Value& v) {
+    if (!array_)
+      throw std::runtime_error("Value is not an array: " + dump());
+    array_->insert(array_->begin() + index, v);
+  }
+  void push_back(const Value& v) {
+    if (!array_)
+      throw std::runtime_error("Value is not an array: " + dump());
+    array_->push_back(v);
+  }
+  Value pop(const Value& index) {
+    if (is_array()) {
+      if (array_->empty())
+        throw std::runtime_error("pop from empty list");
+      if (index.is_null()) {
+        auto ret = array_->back();
+        array_->pop_back();
+        return ret;
+      } else if (!index.is_number_integer()) {
+        throw std::runtime_error("pop index must be an integer: " + index.dump());
+      } else {
+        auto i = index.get<int>();
+        if (i < 0 || i >= static_cast<int>(array_->size()))
+          throw std::runtime_error("pop index out of range: " + index.dump());
+        auto it = array_->begin() + (i < 0 ? array_->size() + i : i);
+        auto ret = *it;
+        array_->erase(it);
+        return ret;
+      }
+    } else if (is_object()) {
+      if (!index.is_hashable())
+        throw std::runtime_error("Unashable type: " + index.dump());
+      auto it = object_->find(index.primitive_);
+      if (it == object_->end())
+        throw std::runtime_error("Key not found: " + index.dump());
+      auto ret = it->second;
+      object_->erase(it);
+      return ret;
+    } else {
+      throw std::runtime_error("Value is not an array or object: " + dump());
+    }
+  }
+  Value get(const Value& key) {
+    if (array_) {
+      if (!key.is_number_integer()) {
+        return Value();
+      }
+      auto index = key.get<int>();
+      return array_->at(index < 0 ? array_->size() + index : index);
+    } else if (object_) {
+      if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+      auto it = object_->find(key.primitive_);
+      if (it == object_->end()) return Value();
+      return it->second;
+    }
+    return Value();
+  }
+  void set(const Value& key, const Value& value) {
+    if (!object_) throw std::runtime_error("Value is not an object: " + dump());
+    if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+    (*object_)[key.primitive_] = value;
+  }
+  Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
+    if (!callable_) throw std::runtime_error("Value is not callable: " + dump());
+    return (*callable_)(context, args);
+  }
+
+  bool is_object() const { return !!object_; }
+  bool is_array() const { return !!array_; }
+  bool is_callable() const { return !!callable_; }
+  bool is_null() const { return !object_ && !array_ && primitive_.is_null() && !callable_; }
+  bool is_boolean() const { return primitive_.is_boolean(); }
+  bool is_number_integer() const { return primitive_.is_number_integer(); }
+  bool is_number_float() const { return primitive_.is_number_float(); }
+  bool is_number() const { return primitive_.is_number(); }
+  bool is_string() const { return primitive_.is_string(); }
+  bool is_iterable() const { return is_array() || is_object() || is_string(); }
+
+  bool is_primitive() const { return !array_ && !object_ && !callable_; }
+  bool is_hashable() const { return is_primitive(); }
+
+  bool empty() const {
+    if (is_null())
+      throw std::runtime_error("Undefined value or reference");
+    if (is_string()) return primitive_.empty();
+    if (is_array()) return array_->empty();
+    if (is_object()) return object_->empty();
+    return false;
+  }
+
+  void for_each(const std::function<void(Value &)> & callback) const {
+    if (is_null())
+      throw std::runtime_error("Undefined value or reference");
+    if (array_) {
+      for (auto& item : *array_) {
+        callback(item);
+      }
+    } else if (object_) {
+      for (auto & item : *object_) {
+        Value key(item.first);
+        callback(key);
+      }
+    } else if (is_string()) {
+      for (char c : primitive_.get<std::string>()) {
+        auto val = Value(std::string(1, c));
+        callback(val);
+      }
+    } else {
+      throw std::runtime_error("Value is not iterable: " + dump());
+    }
+  }
+
+  bool to_bool() const {
+    if (is_null()) return false;
+    if (is_boolean()) return get<bool>();
+    if (is_number()) return get<double>() != 0;
+    if (is_string()) return !get<std::string>().empty();
+    if (is_array()) return !empty();
+    return true;
+  }
+
+  int64_t to_int() const {
+    if (is_null()) return 0;
+    if (is_boolean()) return get<bool>() ? 1 : 0;
+    if (is_number()) return static_cast<int64_t>(get<double>());
+    if (is_string()) {
+      try {
+        return std::stol(get<std::string>());
+      } catch (const std::exception &) {
+        return 0;
+      }
+    }
+    return 0;
+  }
+
+  bool operator<(const Value & other) const {
+    if (is_null())
+      throw std::runtime_error("Undefined value or reference");
+    if (is_number() && other.is_number()) return get<double>() < other.get<double>();
+    if (is_string() && other.is_string()) return get<std::string>() < other.get<std::string>();
+    throw std::runtime_error("Cannot compare values: " + dump() + " < " + other.dump());
+  }
+  bool operator>=(const Value & other) const { return !(*this < other); }
+
+  bool operator>(const Value & other) const {
+    if (is_null())
+      throw std::runtime_error("Undefined value or reference");
+    if (is_number() && other.is_number()) return get<double>() > other.get<double>();
+    if (is_string() && other.is_string()) return get<std::string>() > other.get<std::string>();
+    throw std::runtime_error("Cannot compare values: " + dump() + " > " + other.dump());
+  }
+  bool operator<=(const Value & other) const { return !(*this > other); }
+
+  bool operator==(const Value & other) const {
+    if (callable_ || other.callable_) {
+      if (callable_.get() != other.callable_.get()) return false;
+    }
+    if (array_) {
+      if (!other.array_) return false;
+      if (array_->size() != other.array_->size()) return false;
+      for (size_t i = 0; i < array_->size(); ++i) {
+        if (!(*array_)[i].to_bool() || !(*other.array_)[i].to_bool() || (*array_)[i] != (*other.array_)[i]) return false;
+      }
+      return true;
+    } else if (object_) {
+      if (!other.object_) return false;
+      if (object_->size() != other.object_->size()) return false;
+      for (const auto& item : *object_) {
+        if (!item.second.to_bool() || !other.object_->count(item.first) || item.second != other.object_->at(item.first)) return false;
+      }
+      return true;
+    } else {
+      return primitive_ == other.primitive_;
+    }
+  }
+  bool operator!=(const Value & other) const { return !(*this == other); }
+
+  bool contains(const char * key) const { return contains(std::string(key)); }
+  bool contains(const std::string & key) const {
+    if (array_) {
+      return false;
+    } else if (object_) {
+      return object_->find(key) != object_->end();
+    } else {
+      throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
+    }
+  }
+  bool contains(const Value & value) const {
+    if (is_null())
+      throw std::runtime_error("Undefined value or reference");
+    if (array_) {
+      for (const auto& item : *array_) {
+        if (item.to_bool() && item == value) return true;
+      }
+      return false;
+    } else if (object_) {
+      if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
+      return object_->find(value.primitive_) != object_->end();
+    } else {
+      throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
+    }
+  }
+  void erase(size_t index) {
+    if (!array_) throw std::runtime_error("Value is not an array: " + dump());
+    array_->erase(array_->begin() + index);
+  }
+  void erase(const std::string & key) {
+    if (!object_) throw std::runtime_error("Value is not an object: " + dump());
+    object_->erase(key);
+  }
+  const Value& at(const Value & index) const {
+    return const_cast<Value*>(this)->at(index);
+  }
+  Value& at(const Value & index) {
+    if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+    if (is_array()) return array_->at(index.get<int>());
+    if (is_object()) return object_->at(index.primitive_);
+    throw std::runtime_error("Value is not an array or object: " + dump());
+  }
+  const Value& at(size_t index) const {
+    return const_cast<Value*>(this)->at(index);
+  }
+  Value& at(size_t index) {
+    if (is_null())
+      throw std::runtime_error("Undefined value or reference");
+    if (is_array()) return array_->at(index);
+    if (is_object()) return object_->at(index);
+    throw std::runtime_error("Value is not an array or object: " + dump());
+  }
+
+  template <typename T>
+  T get(const std::string & key, T default_value) const {
+    if (!contains(key)) return default_value;
+    return at(key).get<T>();
+  }
+
+  template <typename T>
+  T get() const {
+    if (is_primitive()) return primitive_.get<T>();
+    throw std::runtime_error("get<T> not defined for this value type: " + dump());
+  }
+
+  std::string dump(int indent=-1, bool to_json=false) const {
+    std::ostringstream out;
+    dump(out, indent, 0, to_json);
+    return out.str();
+  }
+
+  Value operator-() const {
+      if (is_number_integer())
+        return -get<int64_t>();
+      else
+        return -get<double>();
+  }
+  std::string to_str() const {
+    if (is_string()) return get<std::string>();
+    if (is_number_integer()) return std::to_string(get<int64_t>());
+    if (is_number_float()) return std::to_string(get<double>());
+    if (is_boolean()) return get<bool>() ? "True" : "False";
+    if (is_null()) return "None";
+    return dump();
+  }
+  Value operator+(const Value& rhs) const {
+      if (is_string() || rhs.is_string()) {
+        return to_str() + rhs.to_str();
+      } else if (is_number_integer() && rhs.is_number_integer()) {
+        return get<int64_t>() + rhs.get<int64_t>();
+      } else if (is_array() && rhs.is_array()) {
+        auto res = Value::array();
+        for (const auto& item : *array_) res.push_back(item);
+        for (const auto& item : *rhs.array_) res.push_back(item);
+        return res;
+      } else {
+        return get<double>() + rhs.get<double>();
+      }
+  }
+  Value operator-(const Value& rhs) const {
+      if (is_number_integer() && rhs.is_number_integer())
+        return get<int64_t>() - rhs.get<int64_t>();
+      else
+        return get<double>() - rhs.get<double>();
+  }
+  Value operator*(const Value& rhs) const {
+      if (is_string() && rhs.is_number_integer()) {
+        std::ostringstream out;
+        for (int64_t i = 0, n = rhs.get<int64_t>(); i < n; ++i) {
+          out << to_str();
+        }
+        return out.str();
+      }
+      else if (is_number_integer() && rhs.is_number_integer())
+        return get<int64_t>() * rhs.get<int64_t>();
+      else
+        return get<double>() * rhs.get<double>();
+  }
+  Value operator/(const Value& rhs) const {
+      if (is_number_integer() && rhs.is_number_integer())
+        return get<int64_t>() / rhs.get<int64_t>();
+      else
+        return get<double>() / rhs.get<double>();
+  }
+  Value operator%(const Value& rhs) const {
+    return get<int64_t>() % rhs.get<int64_t>();
+  }
+};
+
+struct ArgumentsValue {
+  std::vector<Value> args;
+  std::vector<std::pair<std::string, Value>> kwargs;
+
+  bool has_named(const std::string & name) {
+    for (const auto & p : kwargs) {
+      if (p.first == name) return true;
+    }
+    return false;
+  }
+
+  Value get_named(const std::string & name) {
+    for (const auto & [key, value] : kwargs) {
+      if (key == name) return value;
+    }
+    return Value();
+  }
+
+  bool empty() {
+    return args.empty() && kwargs.empty();
+  }
+
+  void expectArgs(const std::string & method_name, const std::pair<size_t, size_t> & pos_count, const std::pair<size_t, size_t> & kw_count) {
+    if (args.size() < pos_count.first || args.size() > pos_count.second || kwargs.size() < kw_count.first || kwargs.size() > kw_count.second) {
+      std::ostringstream out;
+      out << method_name << " must have between " << pos_count.first << " and " << pos_count.second << " positional arguments and between " << kw_count.first << " and " << kw_count.second << " keyword arguments";
+      throw std::runtime_error(out.str());
+    }
+  }
+};
+
+template <>
+inline json Value::get<json>() const {
+  if (is_primitive()) return primitive_;
+  if (is_null()) return json();
+  if (array_) {
+    std::vector<json> res;
+    for (const auto& item : *array_) {
+      res.push_back(item.get<json>());
+    }
+    return res;
+  }
+  if (object_) {
+    json res = json::object();
+    for (const auto& [key, value] : *object_) {
+      if (key.is_string()) {
+        res[key.get<std::string>()] = value.get<json>();
+      } else if (key.is_primitive()) {
+        res[key.dump()] = value.get<json>();
+      } else {
+        throw std::runtime_error("Invalid key type for conversion to JSON: " + key.dump());
+      }
+    }
+    if (is_callable()) {
+      res["__callable__"] = true;
+    }
+    return res;
+  }
+  throw std::runtime_error("get<json> not defined for this value type: " + dump());
+}
+
+} // namespace minja
+
+namespace std {
+  template <>
+  struct hash<minja::Value> {
+    size_t operator()(const minja::Value & v) const {
+      if (!v.is_hashable())
+        throw std::runtime_error("Unsupported type for hashing: " + v.dump());
+      return std::hash<json>()(v.get<json>());
+    }
+  };
+} // namespace std
+
+namespace minja {
+
+static std::string error_location_suffix(const std::string & source, size_t pos) {
+  auto get_line = [&](size_t line) {
+    auto start = source.begin();
+    for (size_t i = 1; i < line; ++i) {
+      start = std::find(start, source.end(), '\n') + 1;
+    }
+    auto end = std::find(start, source.end(), '\n');
+    return std::string(start, end);
+  };
+  auto start = source.begin();
+  auto end = source.end();
+  auto it = start + pos;
+  auto line = std::count(start, it, '\n') + 1;
+  auto max_line = std::count(start, end, '\n') + 1;
+  auto col = pos - std::string(start, it).rfind('\n');
+  std::ostringstream out;
+  out << " at row " << line << ", column " << col << ":\n";
+  if (line > 1) out << get_line(line - 1) << "\n";
+  out << get_line(line) << "\n";
+  out << std::string(col - 1, ' ') << "^\n";
+  if (line < max_line) out << get_line(line + 1) << "\n";
+
+  return out.str();
+}
+
+class Context : public std::enable_shared_from_this<Context> {
+  protected:
+    Value values_;
+    std::shared_ptr<Context> parent_;
+  public:
+    Context(Value && values, const std::shared_ptr<Context> & parent = nullptr) : values_(std::move(values)), parent_(parent) {
+        if (!values_.is_object()) throw std::runtime_error("Context values must be an object: " + values_.dump());
+    }
+    virtual ~Context() {}
+
+    static std::shared_ptr<Context> builtins();
+    static std::shared_ptr<Context> make(Value && values, const std::shared_ptr<Context> & parent = builtins());
+
+    std::vector<Value> keys() {
+        return values_.keys();
+    }
+    virtual Value get(const Value & key) {
+        if (values_.contains(key)) return values_.at(key);
+        if (parent_) return parent_->get(key);
+        return Value();
+    }
+    virtual Value & at(const Value & key) {
+        if (values_.contains(key)) return values_.at(key);
+        if (parent_) return parent_->at(key);
+        throw std::runtime_error("Undefined variable: " + key.dump());
+    }
+    virtual bool contains(const Value & key) {
+        if (values_.contains(key)) return true;
+        if (parent_) return parent_->contains(key);
+        return false;
+    }
+    virtual void set(const Value & key, const Value & value) {
+        values_.set(key, value);
+    }
+};
+
+struct Location {
+    std::shared_ptr<std::string> source;
+    size_t pos;
+};
+
+class Expression {
+protected:
+    virtual Value do_evaluate(const std::shared_ptr<Context> & context) const = 0;
+public:
+    using Parameters = std::vector<std::pair<std::string, std::shared_ptr<Expression>>>;
+
+    Location location;
+
+    Expression(const Location & location) : location(location) {}
+    virtual ~Expression() = default;
+
+    Value evaluate(const std::shared_ptr<Context> & context) const {
+        try {
+            return do_evaluate(context);
+        } catch (const std::exception & e) {
+            std::ostringstream out;
+            out << e.what();
+            if (location.source) out << error_location_suffix(*location.source, location.pos);
+            throw std::runtime_error(out.str());
+        }
+    }
+};
+
+class VariableExpr : public Expression {
+    std::string name;
+public:
+    VariableExpr(const Location & location, const std::string& n)
+      : Expression(location), name(n) {}
+    std::string get_name() const { return name; }
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        if (!context->contains(name)) {
+            return Value();
+        }
+        return context->at(name);
+    }
+};
+
+static void destructuring_assign(const std::vector<std::string> & var_names, const std::shared_ptr<Context> & context, Value& item) {
+  if (var_names.size() == 1) {
+      Value name(var_names[0]);
+      context->set(name, item);
+  } else {
+      if (!item.is_array() || item.size() != var_names.size()) {
+          throw std::runtime_error("Mismatched number of variables and items in destructuring assignment");
+      }
+      for (size_t i = 0; i < var_names.size(); ++i) {
+          context->set(var_names[i], item.at(i));
+      }
+  }
+}
+
+enum SpaceHandling { Keep, Strip, StripSpaces, StripNewline };
+
+class TemplateToken {
+public:
+    enum class Type { Text, Expression, If, Else, Elif, EndIf, For, EndFor, Generation, EndGeneration, Set, EndSet, Comment, Macro, EndMacro, Filter, EndFilter, Break, Continue };
+
+    static std::string typeToString(Type t) {
+        switch (t) {
+            case Type::Text: return "text";
+            case Type::Expression: return "expression";
+            case Type::If: return "if";
+            case Type::Else: return "else";
+            case Type::Elif: return "elif";
+            case Type::EndIf: return "endif";
+            case Type::For: return "for";
+            case Type::EndFor: return "endfor";
+            case Type::Set: return "set";
+            case Type::EndSet: return "endset";
+            case Type::Comment: return "comment";
+            case Type::Macro: return "macro";
+            case Type::EndMacro: return "endmacro";
+            case Type::Filter: return "filter";
+            case Type::EndFilter: return "endfilter";
+            case Type::Generation: return "generation";
+            case Type::EndGeneration: return "endgeneration";
+            case Type::Break: return "break";
+            case Type::Continue: return "continue";
+        }
+        return "Unknown";
+    }
+
+    TemplateToken(Type type, const Location & location, SpaceHandling pre, SpaceHandling post) : type(type), location(location), pre_space(pre), post_space(post) {}
+    virtual ~TemplateToken() = default;
+
+    Type type;
+    Location location;
+    SpaceHandling pre_space = SpaceHandling::Keep;
+    SpaceHandling post_space = SpaceHandling::Keep;
+};
+
+struct TextTemplateToken : public TemplateToken {
+    std::string text;
+    TextTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, location, pre, post), text(t) {}
+};
+
+struct ExpressionTemplateToken : public TemplateToken {
+    std::shared_ptr<Expression> expr;
+    ExpressionTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, location, pre, post), expr(std::move(e)) {}
+};
+
+struct IfTemplateToken : public TemplateToken {
+    std::shared_ptr<Expression> condition;
+    IfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, location, pre, post), condition(std::move(c)) {}
+};
+
+struct ElifTemplateToken : public TemplateToken {
+    std::shared_ptr<Expression> condition;
+    ElifTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, location, pre, post), condition(std::move(c)) {}
+};
+
+struct ElseTemplateToken : public TemplateToken {
+    ElseTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, location, pre, post) {}
+};
+
+struct EndIfTemplateToken : public TemplateToken {
+    EndIfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, location, pre, post) {}
+};
+
+struct MacroTemplateToken : public TemplateToken {
+    std::shared_ptr<VariableExpr> name;
+    Expression::Parameters params;
+    MacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
+      : TemplateToken(Type::Macro, location, pre, post), name(std::move(n)), params(std::move(p)) {}
+};
+
+struct EndMacroTemplateToken : public TemplateToken {
+    EndMacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, location, pre, post) {}
+};
+
+struct FilterTemplateToken : public TemplateToken {
+    std::shared_ptr<Expression> filter;
+    FilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
+      : TemplateToken(Type::Filter, location, pre, post), filter(std::move(filter)) {}
+};
+
+struct EndFilterTemplateToken : public TemplateToken {
+    EndFilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, location, pre, post) {}
+};
+
+struct ForTemplateToken : public TemplateToken {
+    std::vector<std::string> var_names;
+    std::shared_ptr<Expression> iterable;
+    std::shared_ptr<Expression> condition;
+    bool recursive;
+    ForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
+      std::shared_ptr<Expression> && c, bool r)
+      : TemplateToken(Type::For, location, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
+};
+
+struct EndForTemplateToken : public TemplateToken {
+    EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {}
+};
+
+struct GenerationTemplateToken : public TemplateToken {
+    GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {}
+};
+
+struct EndGenerationTemplateToken : public TemplateToken {
+    EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {}
+};
+
+struct SetTemplateToken : public TemplateToken {
+    std::string ns;
+    std::vector<std::string> var_names;
+    std::shared_ptr<Expression> value;
+    SetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
+      : TemplateToken(Type::Set, location, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
+};
+
+struct EndSetTemplateToken : public TemplateToken {
+    EndSetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, location, pre, post) {}
+};
+
+struct CommentTemplateToken : public TemplateToken {
+    std::string text;
+    CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {}
+};
+
+enum class LoopControlType { Break, Continue };
+
+class LoopControlException : public std::runtime_error {
+public:
+    LoopControlType control_type;
+    LoopControlException(const std::string & message, LoopControlType control_type) : std::runtime_error(message), control_type(control_type) {}
+    LoopControlException(LoopControlType control_type)
+      : std::runtime_error((control_type == LoopControlType::Continue ? "continue" : "break") + std::string(" outside of a loop")),
+        control_type(control_type) {}
+};
+
+struct LoopControlTemplateToken : public TemplateToken {
+    LoopControlType control_type;
+    LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {}
+};
+
+class TemplateNode {
+    Location location_;
+protected:
+    virtual void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const = 0;
+
+public:
+    TemplateNode(const Location & location) : location_(location) {}
+    void render(std::ostringstream & out, const std::shared_ptr<Context> & context) const {
+        try {
+            do_render(out, context);
+        } catch (const LoopControlException & e) {
+            // TODO: make stack creation lazy. Only needed if it was thrown outside of a loop.
+            std::ostringstream err;
+            err << e.what();
+            if (location_.source) err << error_location_suffix(*location_.source, location_.pos);
+            throw LoopControlException(err.str(), e.control_type);
+        } catch (const std::exception & e) {
+            std::ostringstream err;
+            err << e.what();
+            if (location_.source) err << error_location_suffix(*location_.source, location_.pos);
+            throw std::runtime_error(err.str());
+        }
+    }
+    const Location & location() const { return location_; }
+    virtual ~TemplateNode() = default;
+    std::string render(const std::shared_ptr<Context> & context) const {
+        std::ostringstream out;
+        render(out, context);
+        return out.str();
+    }
+};
+
+class SequenceNode : public TemplateNode {
+    std::vector<std::shared_ptr<TemplateNode>> children;
+public:
+    SequenceNode(const Location & location, std::vector<std::shared_ptr<TemplateNode>> && c)
+      : TemplateNode(location), children(std::move(c)) {}
+    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
+        for (const auto& child : children) child->render(out, context);
+    }
+};
+
+class TextNode : public TemplateNode {
+    std::string text;
+public:
+    TextNode(const Location & location, const std::string& t) : TemplateNode(location), text(t) {}
+    void do_render(std::ostringstream & out, const std::shared_ptr<Context> &) const override {
+      out << text;
+    }
+};
+
+class ExpressionNode : public TemplateNode {
+    std::shared_ptr<Expression> expr;
+public:
+    ExpressionNode(const Location & location, std::shared_ptr<Expression> && e) : TemplateNode(location), expr(std::move(e)) {}
+    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
+      if (!expr) throw std::runtime_error("ExpressionNode.expr is null");
+      auto result = expr->evaluate(context);
+      if (result.is_string()) {
+          out << result.get<std::string>();
+      } else if (result.is_boolean()) {
+          out << (result.get<bool>() ? "True" : "False");
+      } else if (!result.is_null()) {
+          out << result.dump();
+      }
+  }
+};
+
+class IfNode : public TemplateNode {
+    std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> cascade;
+public:
+    IfNode(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
+        : TemplateNode(location), cascade(std::move(c)) {}
+    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
+      for (const auto& branch : cascade) {
+          auto enter_branch = true;
+          if (branch.first) {
+            enter_branch = branch.first->evaluate(context).to_bool();
+          }
+          if (enter_branch) {
+            if (!branch.second) throw std::runtime_error("IfNode.cascade.second is null");
+              branch.second->render(out, context);
+              return;
+          }
+      }
+    }
+};
+
+class LoopControlNode : public TemplateNode {
+    LoopControlType control_type_;
+  public:
+    LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {}
+    void do_render(std::ostringstream &, const std::shared_ptr<Context> &) const override {
+      throw LoopControlException(control_type_);
+    }
+};
+
+class ForNode : public TemplateNode {
+    std::vector<std::string> var_names;
+    std::shared_ptr<Expression> iterable;
+    std::shared_ptr<Expression> condition;
+    std::shared_ptr<TemplateNode> body;
+    bool recursive;
+    std::shared_ptr<TemplateNode> else_body;
+public:
+    ForNode(const Location & location, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
+      std::shared_ptr<Expression> && condition, std::shared_ptr<TemplateNode> && body, bool recursive, std::shared_ptr<TemplateNode> && else_body)
+            : TemplateNode(location), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
+
+    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
+      // https://jinja.palletsprojects.com/en/3.0.x/templates/#for
+      if (!iterable) throw std::runtime_error("ForNode.iterable is null");
+      if (!body) throw std::runtime_error("ForNode.body is null");
+
+      auto iterable_value = iterable->evaluate(context);
+      Value::CallableType loop_function;
+
+      std::function<void(Value&)> visit = [&](Value& iter) {
+          auto filtered_items = Value::array();
+          if (!iter.is_null()) {
+            if (!iterable_value.is_iterable()) {
+              throw std::runtime_error("For loop iterable must be iterable: " + iterable_value.dump());
+            }
+            iterable_value.for_each([&](Value & item) {
+                destructuring_assign(var_names, context, item);
+                if (!condition || condition->evaluate(context).to_bool()) {
+                  filtered_items.push_back(item);
+                }
+            });
+          }
+          if (filtered_items.empty()) {
+            if (else_body) {
+              else_body->render(out, context);
+            }
+          } else {
+              auto loop = recursive ? Value::callable(loop_function) : Value::object();
+              loop.set("length", (int64_t) filtered_items.size());
+
+              size_t cycle_index = 0;
+              loop.set("cycle", Value::callable([&](const std::shared_ptr<Context> &, ArgumentsValue & args) {
+                  if (args.args.empty() || !args.kwargs.empty()) {
+                      throw std::runtime_error("cycle() expects at least 1 positional argument and no named arg");
+                  }
+                  auto item = args.args[cycle_index];
+                  cycle_index = (cycle_index + 1) % args.args.size();
+                  return item;
+              }));
+              auto loop_context = Context::make(Value::object(), context);
+              loop_context->set("loop", loop);
+              for (size_t i = 0, n = filtered_items.size(); i < n; ++i) {
+                  auto & item = filtered_items.at(i);
+                  destructuring_assign(var_names, loop_context, item);
+                  loop.set("index", (int64_t) i + 1);
+                  loop.set("index0", (int64_t) i);
+                  loop.set("revindex", (int64_t) (n - i));
+                  loop.set("revindex0", (int64_t) (n - i - 1));
+                  loop.set("length", (int64_t) n);
+                  loop.set("first", i == 0);
+                  loop.set("last", i == (n - 1));
+                  loop.set("previtem", i > 0 ? filtered_items.at(i - 1) : Value());
+                  loop.set("nextitem", i < n - 1 ? filtered_items.at(i + 1) : Value());
+                  try {
+                      body->render(out, loop_context);
+                  } catch (const LoopControlException & e) {
+                      if (e.control_type == LoopControlType::Break) break;
+                      if (e.control_type == LoopControlType::Continue) continue;
+                  }
+              }
+          }
+      };
+
+      if (recursive) {
+        loop_function = [&](const std::shared_ptr<Context> &, ArgumentsValue & args) {
+            if (args.args.size() != 1 || !args.kwargs.empty() || !args.args[0].is_array()) {
+                throw std::runtime_error("loop() expects exactly 1 positional iterable argument");
+            }
+            auto & items = args.args[0];
+            visit(items);
+            return Value();
+        };
+      }
+
+      visit(iterable_value);
+  }
+};
+
+class MacroNode : public TemplateNode {
+    std::shared_ptr<VariableExpr> name;
+    Expression::Parameters params;
+    std::shared_ptr<TemplateNode> body;
+    std::unordered_map<std::string, size_t> named_param_positions;
+public:
+    MacroNode(const Location & location, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
+        : TemplateNode(location), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
+        for (size_t i = 0; i < params.size(); ++i) {
+          const auto & name = params[i].first;
+          if (!name.empty()) {
+            named_param_positions[name] = i;
+          }
+        }
+    }
+    void do_render(std::ostringstream &, const std::shared_ptr<Context> & macro_context) const override {
+        if (!name) throw std::runtime_error("MacroNode.name is null");
+        if (!body) throw std::runtime_error("MacroNode.body is null");
+        auto callable = Value::callable([&](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
+            auto call_context = macro_context;
+            std::vector<bool> param_set(params.size(), false);
+            for (size_t i = 0, n = args.args.size(); i < n; i++) {
+                auto & arg = args.args[i];
+                if (i >= params.size()) throw std::runtime_error("Too many positional arguments for macro " + name->get_name());
+                param_set[i] = true;
+                auto & param_name = params[i].first;
+                call_context->set(param_name, arg);
+            }
+            for (auto & [arg_name, value] : args.kwargs) {
+                auto it = named_param_positions.find(arg_name);
+                if (it == named_param_positions.end()) throw std::runtime_error("Unknown parameter name for macro " + name->get_name() + ": " + arg_name);
+
+                call_context->set(arg_name, value);
+                param_set[it->second] = true;
+            }
+            // Set default values for parameters that were not passed
+            for (size_t i = 0, n = params.size(); i < n; i++) {
+                if (!param_set[i] && params[i].second != nullptr) {
+                    auto val = params[i].second->evaluate(context);
+                    call_context->set(params[i].first, val);
+                }
+            }
+            return body->render(call_context);
+        });
+        macro_context->set(name->get_name(), callable);
+    }
+};
+
+class FilterNode : public TemplateNode {
+    std::shared_ptr<Expression> filter;
+    std::shared_ptr<TemplateNode> body;
+
+public:
+    FilterNode(const Location & location, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
+        : TemplateNode(location), filter(std::move(f)), body(std::move(b)) {}
+
+    void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
+        if (!filter) throw std::runtime_error("FilterNode.filter is null");
+        if (!body) throw std::runtime_error("FilterNode.body is null");
+        auto filter_value = filter->evaluate(context);
+        if (!filter_value.is_callable()) {
+            throw std::runtime_error("Filter must be a callable: " + filter_value.dump());
+        }
+        std::string rendered_body = body->render(context);
+
+        ArgumentsValue filter_args = {{Value(rendered_body)}, {}};
+        auto result = filter_value.call(context, filter_args);
+        out << result.to_str();
+    }
+};
+
+class SetNode : public TemplateNode {
+    std::string ns;
+    std::vector<std::string> var_names;
+    std::shared_ptr<Expression> value;
+public:
+    SetNode(const Location & location, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
+        : TemplateNode(location), ns(ns), var_names(vns), value(std::move(v)) {}
+    void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
+      if (!value) throw std::runtime_error("SetNode.value is null");
+      if (!ns.empty()) {
+        if (var_names.size() != 1) {
+          throw std::runtime_error("Namespaced set only supports a single variable name");
+        }
+        auto & name = var_names[0];
+        auto ns_value = context->get(ns);
+        if (!ns_value.is_object()) throw std::runtime_error("Namespace '" + ns + "' is not an object");
+        ns_value.set(name, this->value->evaluate(context));
+      } else {
+        auto val = value->evaluate(context);
+        destructuring_assign(var_names, context, val);
+      }
+    }
+};
+
+class SetTemplateNode : public TemplateNode {
+    std::string name;
+    std::shared_ptr<TemplateNode> template_value;
+public:
+    SetTemplateNode(const Location & location, const std::string & name, std::shared_ptr<TemplateNode> && tv)
+        : TemplateNode(location), name(name), template_value(std::move(tv)) {}
+    void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
+      if (!template_value) throw std::runtime_error("SetTemplateNode.template_value is null");
+      Value value { template_value->render(context) };
+      context->set(name, value);
+    }
+};
+
+class IfExpr : public Expression {
+    std::shared_ptr<Expression> condition;
+    std::shared_ptr<Expression> then_expr;
+    std::shared_ptr<Expression> else_expr;
+public:
+    IfExpr(const Location & location, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
+        : Expression(location), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+      if (!condition) throw std::runtime_error("IfExpr.condition is null");
+      if (!then_expr) throw std::runtime_error("IfExpr.then_expr is null");
+      if (condition->evaluate(context).to_bool()) {
+        return then_expr->evaluate(context);
+      }
+      if (else_expr) {
+        return else_expr->evaluate(context);
+      }
+      return nullptr;
+    }
+};
+
+class LiteralExpr : public Expression {
+    Value value;
+public:
+    LiteralExpr(const Location & location, const Value& v)
+      : Expression(location), value(v) {}
+    Value do_evaluate(const std::shared_ptr<Context> &) const override { return value; }
+};
+
+class ArrayExpr : public Expression {
+    std::vector<std::shared_ptr<Expression>> elements;
+public:
+    ArrayExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && e)
+      : Expression(location), elements(std::move(e)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        auto result = Value::array();
+        for (const auto& e : elements) {
+            if (!e) throw std::runtime_error("Array element is null");
+            result.push_back(e->evaluate(context));
+        }
+        return result;
+    }
+};
+
+class DictExpr : public Expression {
+    std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> elements;
+public:
+    DictExpr(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
+      : Expression(location), elements(std::move(e)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        auto result = Value::object();
+        for (const auto& [key, value] : elements) {
+            if (!key) throw std::runtime_error("Dict key is null");
+            if (!value) throw std::runtime_error("Dict value is null");
+            result.set(key->evaluate(context), value->evaluate(context));
+        }
+        return result;
+    }
+};
+
+class SliceExpr : public Expression {
+public:
+    std::shared_ptr<Expression> start, end;
+    SliceExpr(const Location & location, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
+      : Expression(location), start(std::move(s)), end(std::move(e)) {}
+    Value do_evaluate(const std::shared_ptr<Context> &) const override {
+        throw std::runtime_error("SliceExpr not implemented");
+    }
+};
+
+class SubscriptExpr : public Expression {
+    std::shared_ptr<Expression> base;
+    std::shared_ptr<Expression> index;
+public:
+    SubscriptExpr(const Location & location, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
+        : Expression(location), base(std::move(b)), index(std::move(i)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        if (!base) throw std::runtime_error("SubscriptExpr.base is null");
+        if (!index) throw std::runtime_error("SubscriptExpr.index is null");
+        auto target_value = base->evaluate(context);
+        if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
+          auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
+          auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
+          if (target_value.is_string()) {
+            std::string s = target_value.get<std::string>();
+            if (start < 0) start = s.size() + start;
+            if (end < 0) end = s.size() + end;
+            return s.substr(start, end - start);
+          } else if (target_value.is_array()) {
+            if (start < 0) start = target_value.size() + start;
+            if (end < 0) end = target_value.size() + end;
+            auto result = Value::array();
+            for (auto i = start; i < end; ++i) {
+              result.push_back(target_value.at(i));
+            }
+            return result;
+          } else {
+            throw std::runtime_error(target_value.is_null() ? "Cannot subscript null" : "Subscripting only supported on arrays and strings");
+          }
+        } else {
+          auto index_value = index->evaluate(context);
+          if (target_value.is_null()) {
+            if (auto t = dynamic_cast<VariableExpr*>(base.get())) {
+              throw std::runtime_error("'" + t->get_name() + "' is " + (context->contains(t->get_name()) ? "null" : "not defined"));
+            }
+            throw std::runtime_error("Trying to access property '" +  index_value.dump() + "' on null!");
+          }
+          return target_value.get(index_value);
+        }
+    }
+};
+
+class UnaryOpExpr : public Expression {
+public:
+    enum class Op { Plus, Minus, LogicalNot, Expansion, ExpansionDict };
+    std::shared_ptr<Expression> expr;
+    Op op;
+    UnaryOpExpr(const Location & location, std::shared_ptr<Expression> && e, Op o)
+      : Expression(location), expr(std::move(e)), op(o) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        if (!expr) throw std::runtime_error("UnaryOpExpr.expr is null");
+        auto e = expr->evaluate(context);
+        switch (op) {
+            case Op::Plus: return e;
+            case Op::Minus: return -e;
+            case Op::LogicalNot: return !e.to_bool();
+            case Op::Expansion:
+            case Op::ExpansionDict:
+                throw std::runtime_error("Expansion operator is only supported in function calls and collections");
+
+        }
+        throw std::runtime_error("Unknown unary operator");
+    }
+};
+
+class BinaryOpExpr : public Expression {
+public:
+    enum class Op { StrConcat, Add, Sub, Mul, MulMul, Div, DivDiv, Mod, Eq, Ne, Lt, Gt, Le, Ge, And, Or, In, NotIn, Is, IsNot };
+private:
+    std::shared_ptr<Expression> left;
+    std::shared_ptr<Expression> right;
+    Op op;
+public:
+    BinaryOpExpr(const Location & location, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
+        : Expression(location), left(std::move(l)), right(std::move(r)), op(o) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        if (!left) throw std::runtime_error("BinaryOpExpr.left is null");
+        if (!right) throw std::runtime_error("BinaryOpExpr.right is null");
+        auto l = left->evaluate(context);
+
+        auto do_eval = [&](const Value & l) -> Value {
+          if (op == Op::Is || op == Op::IsNot) {
+            auto t = dynamic_cast<VariableExpr*>(right.get());
+            if (!t) throw std::runtime_error("Right side of 'is' operator must be a variable");
+
+            auto eval = [&]() {
+              const auto & name = t->get_name();
+              if (name == "none") return l.is_null();
+              if (name == "boolean") return l.is_boolean();
+              if (name == "integer") return l.is_number_integer();
+              if (name == "float") return l.is_number_float();
+              if (name == "number") return l.is_number();
+              if (name == "string") return l.is_string();
+              if (name == "mapping") return l.is_object();
+              if (name == "iterable") return l.is_iterable();
+              if (name == "sequence") return l.is_array();
+              if (name == "defined") return !l.is_null();
+              throw std::runtime_error("Unknown type for 'is' operator: " + name);
+            };
+            auto value = eval();
+            return Value(op == Op::Is ? value : !value);
+          }
+
+          if (op == Op::And) {
+            if (!l.to_bool()) return Value(false);
+            return right->evaluate(context).to_bool();
+          } else if (op == Op::Or) {
+            if (l.to_bool()) return l;
+            return right->evaluate(context);
+          }
+
+          auto r = right->evaluate(context);
+          switch (op) {
+              case Op::StrConcat: return l.to_str() + r.to_str();
+              case Op::Add:       return l + r;
+              case Op::Sub:       return l - r;
+              case Op::Mul:       return l * r;
+              case Op::Div:       return l / r;
+              case Op::MulMul:    return std::pow(l.get<double>(), r.get<double>());
+              case Op::DivDiv:    return l.get<int64_t>() / r.get<int64_t>();
+              case Op::Mod:       return l.get<int64_t>() % r.get<int64_t>();
+              case Op::Eq:        return l == r;
+              case Op::Ne:        return l != r;
+              case Op::Lt:        return l < r;
+              case Op::Gt:        return l > r;
+              case Op::Le:        return l <= r;
+              case Op::Ge:        return l >= r;
+              case Op::In:        return (r.is_array() || r.is_object()) && r.contains(l);
+              case Op::NotIn:     return !(r.is_array() && r.contains(l));
+              default:            break;
+          }
+          throw std::runtime_error("Unknown binary operator");
+        };
+
+        if (l.is_callable()) {
+          return Value::callable([l, do_eval](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
+            auto ll = l.call(context, args);
+            return do_eval(ll); //args[0].second);
+          });
+        } else {
+          return do_eval(l);
+        }
+    }
+};
+
+struct ArgumentsExpression {
+    std::vector<std::shared_ptr<Expression>> args;
+    std::vector<std::pair<std::string, std::shared_ptr<Expression>>> kwargs;
+
+    ArgumentsValue evaluate(const std::shared_ptr<Context> & context) const {
+        ArgumentsValue vargs;
+        for (const auto& arg : this->args) {
+            if (auto un_expr = std::dynamic_pointer_cast<UnaryOpExpr>(arg)) {
+                if (un_expr->op == UnaryOpExpr::Op::Expansion) {
+                    auto array = un_expr->expr->evaluate(context);
+                    if (!array.is_array()) {
+                        throw std::runtime_error("Expansion operator only supported on arrays");
+                    }
+                    array.for_each([&](Value & value) {
+                        vargs.args.push_back(value);
+                    });
+                    continue;
+                } else if (un_expr->op == UnaryOpExpr::Op::ExpansionDict) {
+                    auto dict = un_expr->expr->evaluate(context);
+                    if (!dict.is_object()) {
+                        throw std::runtime_error("ExpansionDict operator only supported on objects");
+                    }
+                    dict.for_each([&](const Value & key) {
+                        vargs.kwargs.push_back({key.get<std::string>(), dict.at(key)});
+                    });
+                    continue;
+                }
+            }
+            vargs.args.push_back(arg->evaluate(context));
+        }
+        for (const auto& [name, value] : this->kwargs) {
+            vargs.kwargs.push_back({name, value->evaluate(context)});
+        }
+        return vargs;
+    }
+};
+
+static std::string strip(const std::string & s) {
+  auto start = s.find_first_not_of(" \t\n\r");
+  if (start == std::string::npos) return "";
+  auto end = s.find_last_not_of(" \t\n\r");
+  return s.substr(start, end - start + 1);
+}
+
+static std::string capitalize(const std::string & s) {
+  if (s.empty()) return s;
+  auto result = s;
+  result[0] = std::toupper(result[0]);
+  return result;
+}
+
+static std::string html_escape(const std::string & s) {
+  std::string result;
+  result.reserve(s.size());
+  for (const auto & c : s) {
+    switch (c) {
+      case '&': result += "&amp;"; break;
+      case '<': result += "&lt;"; break;
+      case '>': result += "&gt;"; break;
+      case '"': result += "&#34;"; break;
+      case '\'': result += "&apos;"; break;
+      default: result += c; break;
+    }
+  }
+  return result;
+}
+
+class MethodCallExpr : public Expression {
+    std::shared_ptr<Expression> object;
+    std::shared_ptr<VariableExpr> method;
+    ArgumentsExpression args;
+public:
+    MethodCallExpr(const Location & location, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
+        : Expression(location), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        if (!object) throw std::runtime_error("MethodCallExpr.object is null");
+        if (!method) throw std::runtime_error("MethodCallExpr.method is null");
+        auto obj = object->evaluate(context);
+        auto vargs = args.evaluate(context);
+        if (obj.is_null()) {
+          throw std::runtime_error("Trying to call method '" + method->get_name() + "' on null");
+        }
+        if (obj.is_array()) {
+          if (method->get_name() == "append") {
+              vargs.expectArgs("append method", {1, 1}, {0, 0});
+              obj.push_back(vargs.args[0]);
+              return Value();
+          } else if (method->get_name() == "pop") {
+              vargs.expectArgs("pop method", {0, 1}, {0, 0});
+              return obj.pop(vargs.args.empty() ? Value() : vargs.args[0]);
+          } else if (method->get_name() == "insert") {
+              vargs.expectArgs("insert method", {2, 2}, {0, 0});
+              auto index = vargs.args[0].get<int64_t>();
+              if (index < 0 || index > (int64_t) obj.size()) throw std::runtime_error("Index out of range for insert method");
+              obj.insert(index, vargs.args[1]);
+              return Value();
+          }
+        } else if (obj.is_object()) {
+          if (method->get_name() == "items") {
+            vargs.expectArgs("items method", {0, 0}, {0, 0});
+            auto result = Value::array();
+            for (const auto& key : obj.keys()) {
+              result.push_back(Value::array({key, obj.at(key)}));
+            }
+            return result;
+          } else if (method->get_name() == "pop") {
+            vargs.expectArgs("pop method", {1, 1}, {0, 0});
+            return obj.pop(vargs.args[0]);
+          } else if (method->get_name() == "get") {
+            vargs.expectArgs("get method", {1, 2}, {0, 0});
+            auto key = vargs.args[0];
+            if (vargs.args.size() == 1) {
+              return obj.contains(key) ? obj.at(key) : Value();
+            } else {
+              return obj.contains(key) ? obj.at(key) : vargs.args[1];
+            }
+          } else if (obj.contains(method->get_name())) {
+            auto callable = obj.at(method->get_name());
+            if (!callable.is_callable()) {
+              throw std::runtime_error("Property '" + method->get_name() + "' is not callable");
+            }
+            return callable.call(context, vargs);
+          }
+        } else if (obj.is_string()) {
+          auto str = obj.get<std::string>();
+          if (method->get_name() == "strip") {
+            vargs.expectArgs("strip method", {0, 0}, {0, 0});
+            return Value(strip(str));
+          } else if (method->get_name() == "capitalize") {
+            vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
+            return Value(capitalize(str));
+          } else if (method->get_name() == "endswith") {
+            vargs.expectArgs("endswith method", {1, 1}, {0, 0});
+            auto suffix = vargs.args[0].get<std::string>();
+            return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
+          } else if (method->get_name() == "title") {
+            vargs.expectArgs("title method", {0, 0}, {0, 0});
+            auto res = str;
+            for (size_t i = 0, n = res.size(); i < n; ++i) {
+              if (i == 0 || std::isspace(res[i - 1])) res[i] = std::toupper(res[i]);
+              else res[i] = std::tolower(res[i]);
+            }
+            return res;
+          }
+        }
+        throw std::runtime_error("Unknown method: " + method->get_name());
+    }
+};
+
+class CallExpr : public Expression {
+public:
+    std::shared_ptr<Expression> object;
+    ArgumentsExpression args;
+    CallExpr(const Location & location, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
+        : Expression(location), object(std::move(obj)), args(std::move(a)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        if (!object) throw std::runtime_error("CallExpr.object is null");
+        auto obj = object->evaluate(context);
+        if (!obj.is_callable()) {
+          throw std::runtime_error("Object is not callable: " + obj.dump(2));
+        }
+        auto vargs = args.evaluate(context);
+        return obj.call(context, vargs);
+    }
+};
+
+class FilterExpr : public Expression {
+    std::vector<std::shared_ptr<Expression>> parts;
+public:
+    FilterExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && p)
+      : Expression(location), parts(std::move(p)) {}
+    Value do_evaluate(const std::shared_ptr<Context> & context) const override {
+        Value result;
+        bool first = true;
+        for (const auto& part : parts) {
+          if (!part) throw std::runtime_error("FilterExpr.part is null");
+          if (first) {
+            first = false;
+            result = part->evaluate(context);
+          } else {
+            if (auto ce = dynamic_cast<CallExpr*>(part.get())) {
+              auto target = ce->object->evaluate(context);
+              ArgumentsValue args = ce->args.evaluate(context);
+              args.args.insert(args.args.begin(), result);
+              result = target.call(context, args);
+            } else {
+              auto callable = part->evaluate(context);
+              ArgumentsValue args;
+              args.args.insert(args.args.begin(), result);
+              result = callable.call(context, args);
+            }
+          }
+        }
+        return result;
+    }
+
+    void prepend(std::shared_ptr<Expression> && e) {
+        parts.insert(parts.begin(), std::move(e));
+    }
+};
+
+class Parser {
+private:
+    using CharIterator = std::string::const_iterator;
+
+    std::shared_ptr<std::string> template_str;
+    CharIterator start, end, it;
+    Options options;
+
+    Parser(const std::shared_ptr<std::string>& template_str, const Options & options) : template_str(template_str), options(options) {
+      if (!template_str) throw std::runtime_error("Template string is null");
+      start = it = this->template_str->begin();
+      end = this->template_str->end();
+    }
+
+    bool consumeSpaces(SpaceHandling space_handling = SpaceHandling::Strip) {
+      if (space_handling == SpaceHandling::Strip) {
+        while (it != end && std::isspace(*it)) ++it;
+      }
+      return true;
+    }
+
+    std::unique_ptr<std::string> parseString() {
+      auto doParse = [&](char quote) -> std::unique_ptr<std::string> {
+        if (it == end || *it != quote) return nullptr;
+        std::string result;
+        bool escape = false;
+        for (++it; it != end; ++it) {
+          if (escape) {
+            escape = false;
+            switch (*it) {
+              case 'n': result += '\n'; break;
+              case 'r': result += '\r'; break;
+              case 't': result += '\t'; break;
+              case 'b': result += '\b'; break;
+              case 'f': result += '\f'; break;
+              case '\\': result += '\\'; break;
+              default:
+                if (*it == quote) {
+                  result += quote;
+                } else {
+                  result += *it;
+                }
+                break;
+            }
+          } else if (*it == '\\') {
+            escape = true;
+          } else if (*it == quote) {
+              ++it;
+            return std::make_unique<std::string>(std::move(result));
+          } else {
+            result += *it;
+          }
+        }
+        return nullptr;
+      };
+
+      consumeSpaces();
+      if (it == end) return nullptr;
+      if (*it == '"') return doParse('"');
+      if (*it == '\'') return doParse('\'');
+      return nullptr;
+    }
+
+    json parseNumber(CharIterator& it, const CharIterator& end) {
+        auto before = it;
+        consumeSpaces();
+        auto start = it;
+        bool hasDecimal = false;
+        bool hasExponent = false;
+
+        if (it != end && (*it == '-' || *it == '+')) ++it;
+
+        while (it != end) {
+          if (std::isdigit(*it)) {
+            ++it;
+          } else if (*it == '.') {
+            if (hasDecimal) throw std::runtime_error("Multiple decimal points");
+            hasDecimal = true;
+            ++it;
+          } else if (it != start && (*it == 'e' || *it == 'E')) {
+            if (hasExponent) throw std::runtime_error("Multiple exponents");
+            hasExponent = true;
+            ++it;
+          } else {
+            break;
+          }
+        }
+        if (start == it) {
+          it = before;
+          return json(); // No valid characters found
+        }
+
+        std::string str(start, it);
+        try {
+          return json::parse(str);
+        } catch (json::parse_error& e) {
+          throw std::runtime_error("Failed to parse number: '" + str + "' (" + std::string(e.what()) + ")");
+          return json();
+        }
+    }
+
+    /** integer, float, bool, string */
+    std::shared_ptr<Value> parseConstant() {
+      auto start = it;
+      consumeSpaces();
+      if (it == end) return nullptr;
+      if (*it == '"' || *it == '\'') {
+        auto str = parseString();
+        if (str) return std::make_shared<Value>(*str);
+      }
+      static std::regex prim_tok(R"(true\b|True\b|false\b|False\b|None\b)");
+      auto token = consumeToken(prim_tok);
+      if (!token.empty()) {
+        if (token == "true" || token == "True") return std::make_shared<Value>(true);
+        if (token == "false" || token == "False") return std::make_shared<Value>(false);
+        if (token == "None") return std::make_shared<Value>(nullptr);
+        throw std::runtime_error("Unknown constant token: " + token);
+      }
+
+      auto number = parseNumber(it, end);
+      if (!number.is_null()) return std::make_shared<Value>(number);
+
+      it = start;
+      return nullptr;
+    }
+
+    class expression_parsing_error : public std::runtime_error {
+        const CharIterator it;
+      public:
+        expression_parsing_error(const std::string & message, const CharIterator & it)
+            : std::runtime_error(message), it(it) {}
+        size_t get_pos(const CharIterator & begin) const {
+            return std::distance(begin, it);
+      }
+    };
+
+    bool peekSymbols(const std::vector<std::string> & symbols) const {
+        for (const auto & symbol : symbols) {
+            if (std::distance(it, end) >= (int64_t) symbol.size() && std::string(it, it + symbol.size()) == symbol) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    std::vector<std::string> consumeTokenGroups(const std::regex & regex, SpaceHandling space_handling = SpaceHandling::Strip) {
+        auto start = it;
+        consumeSpaces(space_handling);
+        std::smatch match;
+        if (std::regex_search(it, end, match, regex) && match.position() == 0) {
+            it += match[0].length();
+            std::vector<std::string> ret;
+            for (size_t i = 0, n = match.size(); i < n; ++i) {
+                ret.push_back(match[i].str());
+            }
+            return ret;
+        }
+        it = start;
+        return {};
+    }
+    std::string consumeToken(const std::regex & regex, SpaceHandling space_handling = SpaceHandling::Strip) {
+        auto start = it;
+        consumeSpaces(space_handling);
+        std::smatch match;
+        if (std::regex_search(it, end, match, regex) && match.position() == 0) {
+            it += match[0].length();
+            return match[0].str();
+        }
+        it = start;
+        return "";
+    }
+
+    std::string consumeToken(const std::string & token, SpaceHandling space_handling = SpaceHandling::Strip) {
+        auto start = it;
+        consumeSpaces(space_handling);
+        if (std::distance(it, end) >= (int64_t) token.size() && std::string(it, it + token.size()) == token) {
+            it += token.size();
+            return token;
+        }
+        it = start;
+        return "";
+    }
+
+    std::shared_ptr<Expression> parseExpression(bool allow_if_expr = true) {
+        auto left = parseLogicalOr();
+        if (it == end) return left;
+
+        if (!allow_if_expr) return left;
+
+        static std::regex if_tok(R"(if\b)");
+        if (consumeToken(if_tok).empty()) {
+          return left;
+        }
+
+        auto location = get_location();
+        auto [condition, else_expr] = parseIfExpression();
+        return std::make_shared<IfExpr>(location, std::move(condition), std::move(left), std::move(else_expr));
+    }
+
+    Location get_location() const {
+        return {template_str, (size_t) std::distance(start, it)};
+    }
+
+    std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>> parseIfExpression() {
+        auto condition = parseLogicalOr();
+        if (!condition) throw std::runtime_error("Expected condition expression");
+
+        static std::regex else_tok(R"(else\b)");
+        std::shared_ptr<Expression> else_expr;
+        if (!consumeToken(else_tok).empty()) {
+          else_expr = parseExpression();
+          if (!else_expr) throw std::runtime_error("Expected 'else' expression");
+        }
+        return std::pair(std::move(condition), std::move(else_expr));
+    }
+
+    std::shared_ptr<Expression> parseLogicalOr() {
+        auto left = parseLogicalAnd();
+        if (!left) throw std::runtime_error("Expected left side of 'logical or' expression");
+
+        static std::regex or_tok(R"(or\b)");
+        auto location = get_location();
+        while (!consumeToken(or_tok).empty()) {
+            auto right = parseLogicalAnd();
+            if (!right) throw std::runtime_error("Expected right side of 'or' expression");
+            left = std::make_shared<BinaryOpExpr>(location, std::move(left), std::move(right), BinaryOpExpr::Op::Or);
+        }
+        return left;
+    }
+
+    std::shared_ptr<Expression> parseLogicalNot() {
+        static std::regex not_tok(R"(not\b)");
+        auto location = get_location();
+
+        if (!consumeToken(not_tok).empty()) {
+          auto sub = parseLogicalNot();
+          if (!sub) throw std::runtime_error("Expected expression after 'not' keyword");
+          return std::make_shared<UnaryOpExpr>(location, std::move(sub), UnaryOpExpr::Op::LogicalNot);
+        }
+        return parseLogicalCompare();
+    }
+
+    std::shared_ptr<Expression> parseLogicalAnd() {
+        auto left = parseLogicalNot();
+        if (!left) throw std::runtime_error("Expected left side of 'logical and' expression");
+
+        static std::regex and_tok(R"(and\b)");
+        auto location = get_location();
+        while (!consumeToken(and_tok).empty()) {
+            auto right = parseLogicalNot();
+            if (!right) throw std::runtime_error("Expected right side of 'and' expression");
+            left = std::make_shared<BinaryOpExpr>(location, std::move(left), std::move(right), BinaryOpExpr::Op::And);
+        }
+        return left;
+    }
+
+    std::shared_ptr<Expression> parseLogicalCompare() {
+        auto left = parseStringConcat();
+        if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
+
+        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
+        static std::regex not_tok(R"(not\b)");
+        std::string op_str;
+        while (!(op_str = consumeToken(compare_tok)).empty()) {
+            auto location = get_location();
+            if (op_str == "is") {
+              auto negated = !consumeToken(not_tok).empty();
+
+              auto identifier = parseIdentifier();
+              if (!identifier) throw std::runtime_error("Expected identifier after 'is' keyword");
+
+              return std::make_shared<BinaryOpExpr>(
+                  left->location,
+                  std::move(left), std::move(identifier),
+                  negated ? BinaryOpExpr::Op::IsNot : BinaryOpExpr::Op::Is);
+            }
+            auto right = parseStringConcat();
+            if (!right) throw std::runtime_error("Expected right side of 'logical compare' expression");
+            BinaryOpExpr::Op op;
+            if (op_str == "==") op = BinaryOpExpr::Op::Eq;
+            else if (op_str == "!=") op = BinaryOpExpr::Op::Ne;
+            else if (op_str == "<") op = BinaryOpExpr::Op::Lt;
+            else if (op_str == ">") op = BinaryOpExpr::Op::Gt;
+            else if (op_str == "<=") op = BinaryOpExpr::Op::Le;
+            else if (op_str == ">=") op = BinaryOpExpr::Op::Ge;
+            else if (op_str == "in") op = BinaryOpExpr::Op::In;
+            else if (op_str.substr(0, 3) == "not") op = BinaryOpExpr::Op::NotIn;
+            else throw std::runtime_error("Unknown comparison operator: " + op_str);
+            left = std::make_shared<BinaryOpExpr>(get_location(), std::move(left), std::move(right), op);
+        }
+        return left;
+    }
+
+    Expression::Parameters parseParameters() {
+        consumeSpaces();
+        if (consumeToken("(").empty()) throw std::runtime_error("Expected opening parenthesis in param list");
+
+        Expression::Parameters result;
+
+        while (it != end) {
+            if (!consumeToken(")").empty()) {
+                return result;
+            }
+            auto expr = parseExpression();
+            if (!expr) throw std::runtime_error("Expected expression in call args");
+
+            if (auto ident = dynamic_cast<VariableExpr*>(expr.get())) {
+                if (!consumeToken("=").empty()) {
+                    auto value = parseExpression();
+                    if (!value) throw std::runtime_error("Expected expression in for named arg");
+                    result.emplace_back(ident->get_name(), std::move(value));
+                } else {
+                    result.emplace_back(ident->get_name(), nullptr);
+                }
+            } else {
+                result.emplace_back(std::string(), std::move(expr));
+            }
+            if (consumeToken(",").empty()) {
+              if (consumeToken(")").empty()) {
+                throw std::runtime_error("Expected closing parenthesis in call args");
+              }
+              return result;
+            }
+        }
+        throw std::runtime_error("Expected closing parenthesis in call args");
+    }
+
+    ArgumentsExpression parseCallArgs() {
+        consumeSpaces();
+        if (consumeToken("(").empty()) throw std::runtime_error("Expected opening parenthesis in call args");
+
+        ArgumentsExpression result;
+
+        while (it != end) {
+            if (!consumeToken(")").empty()) {
+                return result;
+            }
+            auto expr = parseExpression();
+            if (!expr) throw std::runtime_error("Expected expression in call args");
+
+            if (auto ident = dynamic_cast<VariableExpr*>(expr.get())) {
+                if (!consumeToken("=").empty()) {
+                    auto value = parseExpression();
+                    if (!value) throw std::runtime_error("Expected expression in for named arg");
+                    result.kwargs.emplace_back(ident->get_name(), std::move(value));
+                } else {
+                    result.args.emplace_back(std::move(expr));
+                }
+            } else {
+                result.args.emplace_back(std::move(expr));
+            }
+            if (consumeToken(",").empty()) {
+              if (consumeToken(")").empty()) {
+                throw std::runtime_error("Expected closing parenthesis in call args");
+              }
+              return result;
+            }
+        }
+        throw std::runtime_error("Expected closing parenthesis in call args");
+    }
+
+    std::shared_ptr<VariableExpr> parseIdentifier() {
+        static std::regex ident_regex(R"((?!(?:not|is|and|or|del)\b)[a-zA-Z_]\w*)");
+        auto location = get_location();
+        auto ident = consumeToken(ident_regex);
+        if (ident.empty())
+          return nullptr;
+        return std::make_shared<VariableExpr>(location, ident);
+    }
+
+    std::shared_ptr<Expression> parseStringConcat() {
+        auto left = parseMathPow();
+        if (!left) throw std::runtime_error("Expected left side of 'string concat' expression");
+
+        static std::regex concat_tok(R"(~(?!\}))");
+        if (!consumeToken(concat_tok).empty()) {
+            auto right = parseLogicalAnd();
+            if (!right) throw std::runtime_error("Expected right side of 'string concat' expression");
+            left = std::make_shared<BinaryOpExpr>(get_location(), std::move(left), std::move(right), BinaryOpExpr::Op::StrConcat);
+        }
+        return left;
+    }
+
+    std::shared_ptr<Expression> parseMathPow() {
+        auto left = parseMathPlusMinus();
+        if (!left) throw std::runtime_error("Expected left side of 'math pow' expression");
+
+        while (!consumeToken("**").empty()) {
+            auto right = parseMathPlusMinus();
+            if (!right) throw std::runtime_error("Expected right side of 'math pow' expression");
+            left = std::make_shared<BinaryOpExpr>(get_location(), std::move(left), std::move(right), BinaryOpExpr::Op::MulMul);
+        }
+        return left;
+    }
+
+    std::shared_ptr<Expression> parseMathPlusMinus() {
+        static std::regex plus_minus_tok(R"(\+|-(?![}%#]\}))");
+
+        auto left = parseMathMulDiv();
+        if (!left) throw std::runtime_error("Expected left side of 'math plus/minus' expression");
+        std::string op_str;
+        while (!(op_str = consumeToken(plus_minus_tok)).empty()) {
+            auto right = parseMathMulDiv();
+            if (!right) throw std::runtime_error("Expected right side of 'math plus/minus' expression");
+            auto op = op_str == "+" ? BinaryOpExpr::Op::Add : BinaryOpExpr::Op::Sub;
+            left = std::make_shared<BinaryOpExpr>(get_location(), std::move(left), std::move(right), op);
+        }
+        return left;
+    }
+
+    std::shared_ptr<Expression> parseMathMulDiv() {
+        auto left = parseMathUnaryPlusMinus();
+        if (!left) throw std::runtime_error("Expected left side of 'math mul/div' expression");
+
+        static std::regex mul_div_tok(R"(\*\*?|//?|%(?!\}))");
+        std::string op_str;
+        while (!(op_str = consumeToken(mul_div_tok)).empty()) {
+            auto right = parseMathUnaryPlusMinus();
+            if (!right) throw std::runtime_error("Expected right side of 'math mul/div' expression");
+            auto op = op_str == "*" ? BinaryOpExpr::Op::Mul
+                : op_str == "**" ? BinaryOpExpr::Op::MulMul
+                : op_str == "/" ? BinaryOpExpr::Op::Div
+                : op_str == "//" ? BinaryOpExpr::Op::DivDiv
+                : BinaryOpExpr::Op::Mod;
+            left = std::make_shared<BinaryOpExpr>(get_location(), std::move(left), std::move(right), op);
+        }
+
+        if (!consumeToken("|").empty()) {
+            auto expr = parseMathMulDiv();
+            if (auto filter = dynamic_cast<FilterExpr*>(expr.get())) {
+                filter->prepend(std::move(left));
+                return expr;
+            } else {
+                std::vector<std::shared_ptr<Expression>> parts;
+                parts.emplace_back(std::move(left));
+                parts.emplace_back(std::move(expr));
+                return std::make_shared<FilterExpr>(get_location(), std::move(parts));
+            }
+        }
+        return left;
+    }
+
+    std::shared_ptr<Expression> call_func(const std::string & name, ArgumentsExpression && args) const {
+        return std::make_shared<CallExpr>(get_location(), std::make_shared<VariableExpr>(get_location(), name), std::move(args));
+    }
+
+    std::shared_ptr<Expression> parseMathUnaryPlusMinus() {
+        static std::regex unary_plus_minus_tok(R"(\+|-(?![}%#]\}))");
+        auto op_str = consumeToken(unary_plus_minus_tok);
+        auto expr = parseExpansion();
+        if (!expr) throw std::runtime_error("Expected expr of 'unary plus/minus/expansion' expression");
+
+        if (!op_str.empty()) {
+            auto op = op_str == "+" ? UnaryOpExpr::Op::Plus : UnaryOpExpr::Op::Minus;
+            return std::make_shared<UnaryOpExpr>(get_location(), std::move(expr), op);
+        }
+        return expr;
+    }
+
+    std::shared_ptr<Expression> parseExpansion() {
+      static std::regex expansion_tok(R"(\*\*?)");
+      auto op_str = consumeToken(expansion_tok);
+      auto expr = parseValueExpression();
+      if (op_str.empty()) return expr;
+      if (!expr) throw std::runtime_error("Expected expr of 'expansion' expression");
+      return std::make_shared<UnaryOpExpr>(get_location(), std::move(expr), op_str == "*" ? UnaryOpExpr::Op::Expansion : UnaryOpExpr::Op::ExpansionDict);
+    }
+
+    std::shared_ptr<Expression> parseValueExpression() {
+      auto parseValue = [&]() -> std::shared_ptr<Expression> {
+        auto location = get_location();
+        auto constant = parseConstant();
+        if (constant) return std::make_shared<LiteralExpr>(location, *constant);
+
+        static std::regex null_regex(R"(null\b)");
+        if (!consumeToken(null_regex).empty()) return std::make_shared<LiteralExpr>(location, Value());
+
+        auto identifier = parseIdentifier();
+        if (identifier) return identifier;
+
+        auto braced = parseBracedExpressionOrArray();
+        if (braced) return braced;
+
+        auto array = parseArray();
+        if (array) return array;
+
+        auto dictionary = parseDictionary();
+        if (dictionary) return dictionary;
+
+        throw std::runtime_error("Expected value expression");
+      };
+
+      auto value = parseValue();
+
+      while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
+        if (!consumeToken("[").empty()) {
+            std::shared_ptr<Expression> index;
+            if (!consumeToken(":").empty()) {
+              auto slice_end = parseExpression();
+              index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
+            } else {
+              auto slice_start = parseExpression();
+              if (!consumeToken(":").empty()) {
+                consumeSpaces();
+                if (peekSymbols({ "]" })) {
+                  index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
+                } else {
+                  auto slice_end = parseExpression();
+                  index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
+                }
+              } else {
+                index = std::move(slice_start);
+              }
+            }
+            if (!index) throw std::runtime_error("Empty index in subscript");
+            if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
+
+            value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
+        } else if (!consumeToken(".").empty()) {
+            auto identifier = parseIdentifier();
+            if (!identifier) throw std::runtime_error("Expected identifier in subscript");
+
+            consumeSpaces();
+            if (peekSymbols({ "(" })) {
+              auto callParams = parseCallArgs();
+              value = std::make_shared<MethodCallExpr>(identifier->location, std::move(value), std::move(identifier), std::move(callParams));
+            } else {
+              auto key = std::make_shared<LiteralExpr>(identifier->location, Value(identifier->get_name()));
+              value = std::make_shared<SubscriptExpr>(identifier->location, std::move(value), std::move(key));
+            }
+        }
+        consumeSpaces();
+      }
+
+      if (peekSymbols({ "(" })) {
+        auto location = get_location();
+        auto callParams = parseCallArgs();
+        value = std::make_shared<CallExpr>(location, std::move(value), std::move(callParams));
+      }
+      return value;
+    }
+
+    std::shared_ptr<Expression> parseBracedExpressionOrArray() {
+        if (consumeToken("(").empty()) return nullptr;
+
+        auto expr = parseExpression();
+        if (!expr) throw std::runtime_error("Expected expression in braced expression");
+
+        if (!consumeToken(")").empty()) {
+            return expr;  // Drop the parentheses
+        }
+
+        std::vector<std::shared_ptr<Expression>> tuple;
+        tuple.emplace_back(std::move(expr));
+
+        while (it != end) {
+          if (consumeToken(",").empty()) throw std::runtime_error("Expected comma in tuple");
+          auto next = parseExpression();
+          if (!next) throw std::runtime_error("Expected expression in tuple");
+          tuple.push_back(std::move(next));
+
+          if (!consumeToken(")").empty()) {
+              return std::make_shared<ArrayExpr>(get_location(), std::move(tuple));
+          }
+        }
+        throw std::runtime_error("Expected closing parenthesis");
+    }
+
+    std::shared_ptr<Expression> parseArray() {
+        if (consumeToken("[").empty()) return nullptr;
+
+        std::vector<std::shared_ptr<Expression>> elements;
+        if (!consumeToken("]").empty()) {
+            return std::make_shared<ArrayExpr>(get_location(), std::move(elements));
+        }
+        auto first_expr = parseExpression();
+        if (!first_expr) throw std::runtime_error("Expected first expression in array");
+        elements.push_back(std::move(first_expr));
+
+        while (it != end) {
+            if (!consumeToken(",").empty()) {
+              auto expr = parseExpression();
+              if (!expr) throw std::runtime_error("Expected expression in array");
+              elements.push_back(std::move(expr));
+            } else if (!consumeToken("]").empty()) {
+                return std::make_shared<ArrayExpr>(get_location(), std::move(elements));
+            } else {
+                throw std::runtime_error("Expected comma or closing bracket in array");
+            }
+        }
+        throw std::runtime_error("Expected closing bracket");
+    }
+
+    std::shared_ptr<Expression> parseDictionary() {
+        if (consumeToken("{").empty()) return nullptr;
+
+        std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> elements;
+        if (!consumeToken("}").empty()) {
+            return std::make_shared<DictExpr>(get_location(), std::move(elements));
+        }
+
+        auto parseKeyValuePair = [&]() {
+            auto key = parseExpression();
+            if (!key) throw std::runtime_error("Expected key in dictionary");
+            if (consumeToken(":").empty()) throw std::runtime_error("Expected colon betweek key & value in dictionary");
+            auto value = parseExpression();
+            if (!value) throw std::runtime_error("Expected value in dictionary");
+            elements.emplace_back(std::pair(std::move(key), std::move(value)));
+        };
+
+        parseKeyValuePair();
+
+        while (it != end) {
+            if (!consumeToken(",").empty()) {
+                parseKeyValuePair();
+            } else if (!consumeToken("}").empty()) {
+                return std::make_shared<DictExpr>(get_location(), std::move(elements));
+            } else {
+                throw std::runtime_error("Expected comma or closing brace in dictionary");
+            }
+        }
+        throw std::runtime_error("Expected closing brace");
+    }
+
+    SpaceHandling parsePreSpace(const std::string& s) const {
+        if (s == "-")
+          return SpaceHandling::Strip;
+        return SpaceHandling::Keep;
+    }
+
+    SpaceHandling parsePostSpace(const std::string& s) const {
+        if (s == "-") return SpaceHandling::Strip;
+        return SpaceHandling::Keep;
+    }
+
+    using TemplateTokenVector = std::vector<std::unique_ptr<TemplateToken>>;
+    using TemplateTokenIterator = TemplateTokenVector::const_iterator;
+
+    std::vector<std::string> parseVarNames() {
+      static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
+
+      std::vector<std::string> group;
+      if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
+      std::vector<std::string> varnames;
+      std::istringstream iss(group[1]);
+      std::string varname;
+      while (std::getline(iss, varname, ',')) {
+        varnames.push_back(strip(varname));
+      }
+      return varnames;
+    }
+
+    std::runtime_error unexpected(const TemplateToken & token) const {
+      return std::runtime_error("Unexpected " + TemplateToken::typeToString(token.type)
+        + error_location_suffix(*template_str, token.location.pos));
+    }
+    std::runtime_error unterminated(const TemplateToken & token) const {
+      return std::runtime_error("Unterminated " + TemplateToken::typeToString(token.type)
+        + error_location_suffix(*template_str, token.location.pos));
+    }
+
+    TemplateTokenVector tokenize() {
+      static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
+      static std::regex expr_open_regex(R"(\{\{([-~])?)");
+      static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
+      static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
+      static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
+      static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
+      static std::regex block_close_regex(R"(\s*([-~])?%\})");
+
+      TemplateTokenVector tokens;
+      std::vector<std::string> group;
+      std::string text;
+      std::smatch match;
+
+      try {
+        while (it != end) {
+          auto location = get_location();
+
+          if (!(group = consumeTokenGroups(comment_tok, SpaceHandling::Keep)).empty()) {
+            auto pre_space = parsePreSpace(group[1]);
+            auto content = group[2];
+            auto post_space = parsePostSpace(group[3]);
+            tokens.push_back(std::make_unique<CommentTemplateToken>(location, pre_space, post_space, content));
+          } else if (!(group = consumeTokenGroups(expr_open_regex, SpaceHandling::Keep)).empty()) {
+            auto pre_space = parsePreSpace(group[1]);
+            auto expr = parseExpression();
+
+            if ((group = consumeTokenGroups(expr_close_regex)).empty()) {
+              throw std::runtime_error("Expected closing expression tag");
+            }
+
+            auto post_space = parsePostSpace(group[1]);
+            tokens.push_back(std::make_unique<ExpressionTemplateToken>(location, pre_space, post_space, std::move(expr)));
+          } else if (!(group = consumeTokenGroups(block_open_regex, SpaceHandling::Keep)).empty()) {
+            auto pre_space = parsePreSpace(group[1]);
+
+            std::string keyword;
+
+            auto parseBlockClose = [&]() -> SpaceHandling {
+              if ((group = consumeTokenGroups(block_close_regex)).empty()) throw std::runtime_error("Expected closing block tag");
+              return parsePostSpace(group[1]);
+            };
+
+            if ((keyword = consumeToken(block_keyword_tok)).empty()) throw std::runtime_error("Expected block keyword");
+
+            if (keyword == "if") {
+              auto condition = parseExpression();
+              if (!condition) throw std::runtime_error("Expected condition in if block");
+
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<IfTemplateToken>(location, pre_space, post_space, std::move(condition)));
+            } else if (keyword == "elif") {
+              auto condition = parseExpression();
+              if (!condition) throw std::runtime_error("Expected condition in elif block");
+
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<ElifTemplateToken>(location, pre_space, post_space, std::move(condition)));
+            } else if (keyword == "else") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<ElseTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "endif") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<EndIfTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "for") {
+              static std::regex recursive_tok(R"(recursive\b)");
+              static std::regex if_tok(R"(if\b)");
+
+              auto varnames = parseVarNames();
+              static std::regex in_tok(R"(in\b)");
+              if (consumeToken(in_tok).empty()) throw std::runtime_error("Expected 'in' keyword in for block");
+              auto iterable = parseExpression(/* allow_if_expr = */ false);
+              if (!iterable) throw std::runtime_error("Expected iterable in for block");
+
+              std::shared_ptr<Expression> condition;
+              if (!consumeToken(if_tok).empty()) {
+                condition = parseExpression();
+              }
+              auto recursive = !consumeToken(recursive_tok).empty();
+
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<ForTemplateToken>(location, pre_space, post_space, std::move(varnames), std::move(iterable), std::move(condition), recursive));
+            } else if (keyword == "endfor") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<EndForTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "generation") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<GenerationTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "endgeneration") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "set") {
+              static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
+
+              std::string ns;
+              std::vector<std::string> var_names;
+              std::shared_ptr<Expression> value;
+              if (!(group = consumeTokenGroups(namespaced_var_regex)).empty()) {
+                ns = group[1];
+                var_names.push_back(group[2]);
+
+                if (consumeToken("=").empty()) throw std::runtime_error("Expected equals sign in set block");
+
+                value = parseExpression();
+                if (!value) throw std::runtime_error("Expected value in set block");
+              } else {
+                var_names = parseVarNames();
+
+                if (!consumeToken("=").empty()) {
+                  value = parseExpression();
+                  if (!value) throw std::runtime_error("Expected value in set block");
+                }
+              }
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<SetTemplateToken>(location, pre_space, post_space, ns, var_names, std::move(value)));
+            } else if (keyword == "endset") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<EndSetTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "macro") {
+              auto macroname = parseIdentifier();
+              if (!macroname) throw std::runtime_error("Expected macro name in macro block");
+              auto params = parseParameters();
+
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<MacroTemplateToken>(location, pre_space, post_space, std::move(macroname), std::move(params)));
+            } else if (keyword == "endmacro") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<EndMacroTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "filter") {
+              auto filter = parseExpression();
+              if (!filter) throw std::runtime_error("Expected expression in filter block");
+
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<FilterTemplateToken>(location, pre_space, post_space, std::move(filter)));
+            } else if (keyword == "endfilter") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<EndFilterTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "break" || keyword == "continue") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<LoopControlTemplateToken>(location, pre_space, post_space, keyword == "break" ? LoopControlType::Break : LoopControlType::Continue));
+            } else {
+              throw std::runtime_error("Unexpected block: " + keyword);
+            }
+          } else if (std::regex_search(it, end, match, non_text_open_regex)) {
+            if (!match.position()) {
+                if (match[0] != "{#")
+                    throw std::runtime_error("Internal error: Expected a comment");
+                throw std::runtime_error("Missing end of comment tag");
+            }
+            auto text_end = it + match.position();
+            text = std::string(it, text_end);
+            it = text_end;
+            tokens.push_back(std::make_unique<TextTemplateToken>(location, SpaceHandling::Keep, SpaceHandling::Keep, text));
+          } else {
+            text = std::string(it, end);
+            it = end;
+            tokens.push_back(std::make_unique<TextTemplateToken>(location, SpaceHandling::Keep, SpaceHandling::Keep, text));
+          }
+        }
+        return tokens;
+      } catch (const std::exception & e) {
+        throw std::runtime_error(e.what() + error_location_suffix(*template_str, std::distance(start, it)));
+      }
+    }
+
+    std::shared_ptr<TemplateNode> parseTemplate(
+          const TemplateTokenIterator & begin,
+          TemplateTokenIterator & it,
+          const TemplateTokenIterator & end,
+          bool fully = false) const {
+        std::vector<std::shared_ptr<TemplateNode>> children;
+        while (it != end) {
+          const auto start = it;
+          const auto & token = *(it++);
+          if (auto if_token = dynamic_cast<IfTemplateToken*>(token.get())) {
+              std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> cascade;
+              cascade.emplace_back(std::move(if_token->condition), parseTemplate(begin, it, end));
+
+              while (it != end && (*it)->type == TemplateToken::Type::Elif) {
+                  auto elif_token = dynamic_cast<ElifTemplateToken*>((*(it++)).get());
+                  cascade.emplace_back(std::move(elif_token->condition), parseTemplate(begin, it, end));
+              }
+
+              if (it != end && (*it)->type == TemplateToken::Type::Else) {
+                cascade.emplace_back(nullptr, parseTemplate(begin, ++it, end));
+              }
+              if (it == end || (*(it++))->type != TemplateToken::Type::EndIf) {
+                  throw unterminated(**start);
+              }
+              children.emplace_back(std::make_shared<IfNode>(token->location, std::move(cascade)));
+          } else if (auto for_token = dynamic_cast<ForTemplateToken*>(token.get())) {
+              auto body = parseTemplate(begin, it, end);
+              auto else_body = std::shared_ptr<TemplateNode>();
+              if (it != end && (*it)->type == TemplateToken::Type::Else) {
+                else_body = parseTemplate(begin, ++it, end);
+              }
+              if (it == end || (*(it++))->type != TemplateToken::Type::EndFor) {
+                  throw unterminated(**start);
+              }
+              children.emplace_back(std::make_shared<ForNode>(token->location, std::move(for_token->var_names), std::move(for_token->iterable), std::move(for_token->condition), std::move(body), for_token->recursive, std::move(else_body)));
+          } else if (dynamic_cast<GenerationTemplateToken*>(token.get())) {
+              auto body = parseTemplate(begin, it, end);
+              if (it == end || (*(it++))->type != TemplateToken::Type::EndGeneration) {
+                  throw unterminated(**start);
+              }
+              // Treat as a no-op, as our scope is templates for inference, not training (`{% generation %}` wraps generated tokens for masking).
+              children.emplace_back(std::move(body));
+          } else if (auto text_token = dynamic_cast<TextTemplateToken*>(token.get())) {
+              SpaceHandling pre_space = (it - 1) != begin ? (*(it - 2))->post_space : SpaceHandling::Keep;
+              SpaceHandling post_space = it != end ? (*it)->pre_space : SpaceHandling::Keep;
+
+              auto text = text_token->text;
+              if (post_space == SpaceHandling::Strip) {
+                static std::regex trailing_space_regex(R"(\s+$)");
+                text = std::regex_replace(text, trailing_space_regex, "");
+              } else if (options.lstrip_blocks && it != end) {
+                auto i = text.size();
+                while (i > 0 && (text[i - 1] == ' ' || text[i - 1] == '\t')) i--;
+                if ((i == 0 && (it - 1) == begin) || (i > 0 && text[i - 1] == '\n')) {
+                  text.resize(i);
+                }
+              }
+              if (pre_space == SpaceHandling::Strip) {
+                static std::regex leading_space_regex(R"(^\s+)");
+                text = std::regex_replace(text, leading_space_regex, "");
+              } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
+                if (text.length() > 0 && text[0] == '\n') {
+                  text.erase(0, 1);
+                }
+              }
+              if (it == end && !options.keep_trailing_newline) {
+                auto i = text.size();
+                if (i > 0 && text[i - 1] == '\n') {
+                  i--;
+                  if (i > 0 && text[i - 1] == '\r') i--;
+                  text.resize(i);
+                }
+              }
+              children.emplace_back(std::make_shared<TextNode>(token->location, text));
+          } else if (auto expr_token = dynamic_cast<ExpressionTemplateToken*>(token.get())) {
+              children.emplace_back(std::make_shared<ExpressionNode>(token->location, std::move(expr_token->expr)));
+          } else if (auto set_token = dynamic_cast<SetTemplateToken*>(token.get())) {
+            if (set_token->value) {
+              children.emplace_back(std::make_shared<SetNode>(token->location, set_token->ns, set_token->var_names, std::move(set_token->value)));
+            } else {
+              auto value_template = parseTemplate(begin, it, end);
+              if (it == end || (*(it++))->type != TemplateToken::Type::EndSet) {
+                  throw unterminated(**start);
+              }
+              if (!set_token->ns.empty()) throw std::runtime_error("Namespaced set not supported in set with template value");
+              if (set_token->var_names.size() != 1) throw std::runtime_error("Structural assignment not supported in set with template value");
+              auto & name = set_token->var_names[0];
+              children.emplace_back(std::make_shared<SetTemplateNode>(token->location, name, std::move(value_template)));
+            }
+          } else if (auto macro_token = dynamic_cast<MacroTemplateToken*>(token.get())) {
+              auto body = parseTemplate(begin, it, end);
+              if (it == end || (*(it++))->type != TemplateToken::Type::EndMacro) {
+                  throw unterminated(**start);
+              }
+              children.emplace_back(std::make_shared<MacroNode>(token->location, std::move(macro_token->name), std::move(macro_token->params), std::move(body)));
+          } else if (auto filter_token = dynamic_cast<FilterTemplateToken*>(token.get())) {
+              auto body = parseTemplate(begin, it, end);
+              if (it == end || (*(it++))->type != TemplateToken::Type::EndFilter) {
+                  throw unterminated(**start);
+              }
+              children.emplace_back(std::make_shared<FilterNode>(token->location, std::move(filter_token->filter), std::move(body)));
+          } else if (dynamic_cast<CommentTemplateToken*>(token.get())) {
+              // Ignore comments
+          } else if (auto ctrl_token = dynamic_cast<LoopControlTemplateToken*>(token.get())) {
+              children.emplace_back(std::make_shared<LoopControlNode>(token->location, ctrl_token->control_type));
+          } else if (dynamic_cast<EndForTemplateToken*>(token.get())
+                  || dynamic_cast<EndSetTemplateToken*>(token.get())
+                  || dynamic_cast<EndMacroTemplateToken*>(token.get())
+                  || dynamic_cast<EndFilterTemplateToken*>(token.get())
+                  || dynamic_cast<EndIfTemplateToken*>(token.get())
+                  || dynamic_cast<ElseTemplateToken*>(token.get())
+                  || dynamic_cast<EndGenerationTemplateToken*>(token.get())
+                  || dynamic_cast<ElifTemplateToken*>(token.get())) {
+              it--;  // unconsume the token
+              break;  // exit the loop
+          } else {
+              throw unexpected(**(it-1));
+          }
+        }
+        if (fully && it != end) {
+            throw unexpected(**it);
+        }
+        if (children.empty()) {
+          return std::make_shared<TextNode>(Location { template_str, 0 }, std::string());
+        } else if (children.size() == 1) {
+          return std::move(children[0]);
+        } else {
+          return std::make_shared<SequenceNode>(children[0]->location(), std::move(children));
+        }
+    }
+
+public:
+
+    static std::shared_ptr<TemplateNode> parse(const std::string& template_str, const Options & options) {
+        Parser parser(std::make_shared<std::string>(normalize_newlines(template_str)), options);
+        auto tokens = parser.tokenize();
+        TemplateTokenIterator begin = tokens.begin();
+        auto it = begin;
+        TemplateTokenIterator end = tokens.end();
+        return parser.parseTemplate(begin, it, end, /* full= */ true);
+    }
+};
+
+static Value simple_function(const std::string & fn_name, const std::vector<std::string> & params, const std::function<Value(const std::shared_ptr<Context> &, Value & args)> & fn) {
+  std::map<std::string, size_t> named_positions;
+  for (size_t i = 0, n = params.size(); i < n; i++) named_positions[params[i]] = i;
+
+  return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) -> Value {
+    auto args_obj = Value::object();
+    std::vector<bool> provided_args(params.size());
+    for (size_t i = 0, n = args.args.size(); i < n; i++) {
+      auto & arg = args.args[i];
+      if (i < params.size()) {
+        args_obj.set(params[i], arg);
+        provided_args[i] = true;
+      } else {
+        throw std::runtime_error("Too many positional params for " + fn_name);
+      }
+    }
+    for (auto & [name, value] : args.kwargs) {
+      auto named_pos_it = named_positions.find(name);
+      if (named_pos_it == named_positions.end()) {
+        throw std::runtime_error("Unknown argument " + name + " for function " + fn_name);
+      }
+      provided_args[named_pos_it->second] = true;
+      args_obj.set(name, value);
+    }
+    return fn(context, args_obj);
+  });
+}
+
+inline std::shared_ptr<Context> Context::builtins() {
+  auto globals = Value::object();
+
+  globals.set("raise_exception", simple_function("raise_exception", { "message" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+    throw std::runtime_error(args.at("message").get<std::string>());
+  }));
+  globals.set("tojson", simple_function("tojson", { "value", "indent" }, [](const std::shared_ptr<Context> &, Value & args) {
+    return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* tojson= */ true));
+  }));
+  globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto items = Value::array();
+    if (args.contains("object")) {
+      auto & obj = args.at("object");
+      if (obj.is_string()) {
+        auto json_obj = json::parse(obj.get<std::string>());
+        for (const auto & kv : json_obj.items()) {
+          items.push_back(Value::array({kv.key(), kv.value()}));
+        }
+      } else if (!obj.is_null()) {
+        for (auto & key : obj.keys()) {
+          items.push_back(Value::array({key, obj.at(key)}));
+        }
+      }
+    }
+    return items;
+  }));
+  globals.set("last", simple_function("last", { "items" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto items = args.at("items");
+    if (!items.is_array()) throw std::runtime_error("object is not a list");
+    if (items.size() == 0) return Value();
+    return items.at(items.size() - 1);
+  }));
+  globals.set("trim", simple_function("trim", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto & text = args.at("text");
+    return text.is_null() ? text : Value(strip(text.get<std::string>()));
+  }));
+  globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto text = args.at("text");
+    if (text.is_null()) return text;
+    std::string res;
+    auto str = text.get<std::string>();
+    std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
+    return Value(res);
+  }));
+  globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
+    args.expectArgs("default", {2, 3}, {0, 1});
+    auto & value = args.args[0];
+    auto & default_value = args.args[1];
+    bool boolean = false;
+    if (args.args.size() == 3) {
+      boolean = args.args[2].get<bool>();
+    } else {
+      Value bv = args.get_named("boolean");
+      if (!bv.is_null()) {
+        boolean = bv.get<bool>();
+      }
+    }
+    return boolean ? (value.to_bool() ? value : default_value) : value.is_null() ? default_value : value;
+  }));
+  auto escape = simple_function("escape", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
+    return Value(html_escape(args.at("text").get<std::string>()));
+  });
+  globals.set("e", escape);
+  globals.set("escape", escape);
+  globals.set("joiner", simple_function("joiner", { "sep" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto sep = args.get<std::string>("sep", "");
+    auto first = std::make_shared<bool>(true);
+    return simple_function("", {}, [sep, first](const std::shared_ptr<Context> &, const Value &) -> Value {
+      if (*first) {
+        *first = false;
+        return "";
+      }
+      return sep;
+    });
+    return Value(html_escape(args.at("text").get<std::string>()));
+  }));
+  globals.set("count", simple_function("count", { "items" }, [](const std::shared_ptr<Context> &, Value & args) {
+    return Value((int64_t) args.at("items").size());
+  }));
+  globals.set("dictsort", simple_function("dictsort", { "value" }, [](const std::shared_ptr<Context> &, Value & args) {
+    if (args.size() != 1) throw std::runtime_error("dictsort expects exactly 1 argument (TODO: fix implementation)");
+    auto & value = args.at("value");
+    auto keys = value.keys();
+    std::sort(keys.begin(), keys.end());
+    auto res = Value::array();
+    for (auto & key : keys) {
+      res.push_back(Value::array({key, value.at(key)}));
+    }
+    return res;
+  }));
+  globals.set("join", simple_function("join", { "items", "d" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto do_join = [](Value & items, const std::string & sep) {
+      if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
+      std::ostringstream oss;
+      auto first = true;
+      for (size_t i = 0, n = items.size(); i < n; ++i) {
+        if (first) first = false;
+        else oss << sep;
+        oss << items.at(i).to_str();
+      }
+      return Value(oss.str());
+    };
+    auto sep = args.get<std::string>("d", "");
+    if (args.contains("items")) {
+        auto & items = args.at("items");
+        return do_join(items, sep);
+    } else {
+      return simple_function("", {"items"}, [sep, do_join](const std::shared_ptr<Context> &, Value & args) {
+        auto & items = args.at("items");
+        if (!items.to_bool() || !items.is_array()) throw std::runtime_error("join expects an array for items, got: " + items.dump());
+        return do_join(items, sep);
+      });
+    }
+  }));
+  globals.set("namespace", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
+    auto ns = Value::object();
+    args.expectArgs("namespace", {0, 0}, {0, (std::numeric_limits<size_t>::max)()});
+    for (auto & [name, value] : args.kwargs) {
+      ns.set(name, value);
+    }
+    return ns;
+  }));
+  auto equalto = simple_function("equalto", { "expected", "actual" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      return args.at("actual") == args.at("expected");
+  });
+  globals.set("equalto", equalto);
+  globals.set("==", equalto);
+  globals.set("length", simple_function("length", { "items" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      auto & items = args.at("items");
+      return (int64_t) items.size();
+  }));
+  globals.set("safe", simple_function("safe", { "value" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      return args.at("value").to_str();
+  }));
+  globals.set("string", simple_function("string", { "value" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      return args.at("value").to_str();
+  }));
+  globals.set("int", simple_function("int", { "value" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      return args.at("value").to_int();
+  }));
+  globals.set("list", simple_function("list", { "items" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      auto & items = args.at("items");
+      if (!items.is_array()) throw std::runtime_error("object is not iterable");
+      return items;
+  }));
+  globals.set("unique", simple_function("unique", { "items" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
+      auto & items = args.at("items");
+      if (!items.is_array()) throw std::runtime_error("object is not iterable");
+      std::unordered_set<Value> seen;
+      auto result = Value::array();
+      for (size_t i = 0, n = items.size(); i < n; i++) {
+        auto pair = seen.insert(items.at(i));
+        if (pair.second) {
+          result.push_back(items.at(i));
+        }
+      }
+      return result;
+  }));
+  auto make_filter = [](const Value & filter, Value & extra_args) -> Value {
+    return simple_function("", { "value" }, [=](const std::shared_ptr<Context> & context, Value & args) {
+      auto & value = args.at("value");
+      ArgumentsValue actual_args;
+      actual_args.args.emplace_back(value);
+      for (size_t i = 0, n = extra_args.size(); i < n; i++) {
+        actual_args.args.emplace_back(extra_args.at(i));
+      }
+      return filter.call(context, actual_args);
+    });
+  };
+  auto select_or_reject = [make_filter](bool is_select) {
+    return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
+      args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
+      auto & items = args.args[0];
+      if (items.is_null())
+        return Value::array();
+      if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
+
+      auto filter_fn = context->get(args.args[1]);
+      if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
+
+      auto filter_args = Value::array();
+      for (size_t i = 2, n = args.args.size(); i < n; i++) {
+        filter_args.push_back(args.args[i]);
+      }
+      auto filter = make_filter(filter_fn, filter_args);
+
+      auto res = Value::array();
+      for (size_t i = 0, n = items.size(); i < n; i++) {
+        auto & item = items.at(i);
+        ArgumentsValue filter_args;
+        filter_args.args.emplace_back(item);
+        auto pred_res = filter.call(context, filter_args);
+        if (pred_res.to_bool() == (is_select ? true : false)) {
+          res.push_back(item);
+        }
+      }
+      return res;
+    });
+  };
+  globals.set("select", select_or_reject(/* is_select= */ true));
+  globals.set("reject", select_or_reject(/* is_select= */ false));
+  globals.set("map", Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
+    auto res = Value::array();
+    if (args.args.size() == 1 &&
+      ((args.has_named("attribute") && args.kwargs.size() == 1) || (args.has_named("default") && args.kwargs.size() == 2))) {
+      auto & items = args.args[0];
+      auto attr_name = args.get_named("attribute");
+      auto default_value = args.get_named("default");
+      for (size_t i = 0, n = items.size(); i < n; i++) {
+        auto & item = items.at(i);
+        auto attr = item.get(attr_name);
+        res.push_back(attr.is_null() ? default_value : attr);
+      }
+    } else if (args.kwargs.empty() && args.args.size() >= 2) {
+      auto fn = context->get(args.args[1]);
+      if (fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
+      ArgumentsValue filter_args { {Value()}, {} };
+      for (size_t i = 2, n = args.args.size(); i < n; i++) {
+        filter_args.args.emplace_back(args.args[i]);
+      }
+      for (size_t i = 0, n = args.args[0].size(); i < n; i++) {
+        auto & item = args.args[0].at(i);
+        filter_args.args[0] = item;
+        res.push_back(fn.call(context, filter_args));
+      }
+    } else {
+      throw std::runtime_error("Invalid or unsupported arguments for map");
+    }
+    return res;
+  }));
+  globals.set("indent", simple_function("indent", { "text", "indent", "first" }, [](const std::shared_ptr<Context> &, Value & args) {
+    auto text = args.at("text").get<std::string>();
+    auto first = args.get<bool>("first", false);
+    std::string out;
+    std::string indent(args.get<int64_t>("indent", 0), ' ');
+    std::istringstream iss(text);
+    std::string line;
+    auto is_first = true;
+    while (std::getline(iss, line, '\n')) {
+      auto needs_indent = !is_first || first;
+      if (is_first) is_first = false;
+      else out += "\n";
+      if (needs_indent) out += indent;
+      out += line;
+    }
+    if (!text.empty() && text.back() == '\n') out += "\n";
+    return out;
+  }));
+  auto select_or_reject_attr = [](bool is_select) {
+    return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
+      args.expectArgs(is_select ? "selectattr" : "rejectattr", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
+      auto & items = args.args[0];
+      if (items.is_null())
+        return Value::array();
+      if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
+      auto attr_name = args.args[1].get<std::string>();
+
+      bool has_test = false;
+      Value test_fn;
+      ArgumentsValue test_args {{Value()}, {}};
+      if (args.args.size() >= 3) {
+        has_test = true;
+        test_fn = context->get(args.args[2]);
+        if (test_fn.is_null()) throw std::runtime_error("Undefined test: " + args.args[2].dump());
+        for (size_t i = 3, n = args.args.size(); i < n; i++) {
+          test_args.args.emplace_back(args.args[i]);
+        }
+        test_args.kwargs = args.kwargs;
+      }
+
+      auto res = Value::array();
+      for (size_t i = 0, n = items.size(); i < n; i++) {
+        auto & item = items.at(i);
+        auto attr = item.get(attr_name);
+        if (has_test) {
+          test_args.args[0] = attr;
+          if (test_fn.call(context, test_args).to_bool() == (is_select ? true : false)) {
+            res.push_back(item);
+          }
+        } else {
+          res.push_back(attr);
+        }
+      }
+      return res;
+    });
+  };
+  globals.set("selectattr", select_or_reject_attr(/* is_select= */ true));
+  globals.set("rejectattr", select_or_reject_attr(/* is_select= */ false));
+  globals.set("range", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
+    std::vector<int64_t> startEndStep(3);
+    std::vector<bool> param_set(3);
+    if (args.args.size() == 1) {
+      startEndStep[1] = args.args[0].get<int64_t>();
+      param_set[1] = true;
+    } else {
+      for (size_t i = 0; i < args.args.size(); i++) {
+        auto & arg = args.args[i];
+        auto v = arg.get<int64_t>();
+        startEndStep[i] = v;
+        param_set[i] = true;
+        }
+      }
+      for (auto & [name, value] : args.kwargs) {
+        size_t i;
+        if (name == "start") i = 0;
+        else if (name == "end") i = 1;
+        else if (name == "step") i = 2;
+        else throw std::runtime_error("Unknown argument " + name + " for function range");
+
+        if (param_set[i]) {
+          throw std::runtime_error("Duplicate argument " + name + " for function range");
+        }
+        startEndStep[i] = value.get<int64_t>();
+        param_set[i] = true;
+    }
+    if (!param_set[1]) {
+      throw std::runtime_error("Missing required argument 'end' for function range");
+    }
+    int64_t start = param_set[0] ? startEndStep[0] : 0;
+    int64_t end = startEndStep[1];
+    int64_t step = param_set[2] ? startEndStep[2] : 1;
+
+    auto res = Value::array();
+    if (step > 0) {
+      for (int64_t i = start; i < end; i += step) {
+        res.push_back(Value(i));
+      }
+    } else {
+      for (int64_t i = start; i > end; i += step) {
+        res.push_back(Value(i));
+      }
+    }
+    return res;
+  }));
+
+  return std::make_shared<Context>(std::move(globals));
+}
+
+inline std::shared_ptr<Context> Context::make(Value && values, const std::shared_ptr<Context> & parent) {
+  return std::make_shared<Context>(values.is_null() ? Value::object() : std::move(values), parent);
+}
+
+}  // namespace minja
diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp
new file mode 100644
index 000000000..a057ae45f
--- /dev/null
+++ b/common/ngram-cache.cpp
@@ -0,0 +1,285 @@
+#include "ngram-cache.h"
+#include "common.h"
+#include "log.h"
+
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <thread>
+
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
+    const int64_t t_start_ms = ggml_time_ms();
+    const int64_t inp_size = inp.size();
+
+    const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
+    int64_t n_done = 0;
+
+    for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
+        const int64_t i_start = std::max(inp_size - nnew, ngram_size);
+        for (int64_t i = i_start; i < inp_size; ++i) {
+            const int64_t ngram_start = i - ngram_size;
+            common_ngram ngram(&inp[ngram_start], ngram_size);
+            const llama_token token = inp[i];
+
+            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            if (part_it == ngram_cache.end()) {
+                common_ngram_cache_part part;
+                part.emplace(token, 1);
+                ngram_cache.emplace(ngram, part);
+            } else {
+                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                if (token_count_it == part_it->second.end()) {
+                    part_it->second.emplace(token, 1);
+                } else {
+                    token_count_it->second++;
+                }
+            }
+            ++n_done;
+
+            if (print_progress && n_done % 10000000 == 0) {
+                const int64_t t_now_ms = ggml_time_ms();
+                const int64_t eta_ms   = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
+                const int64_t eta_min  = eta_ms / (60*1000);
+                const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;
+
+                fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
+            }
+        }
+    }
+}
+
+// Helper function to get a token from the combined, speculative sequence of inp and draft.
+static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
+    return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
+}
+
+// If sample size or percentage are below these thresholds the draft is aborted early:
+constexpr int    draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2,  2,  1,  1};
+constexpr int        draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
+constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
+constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
+
+// Helper function that tries to draft a token from only the static ngram cache:
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+    if (part_static_it == nc_static.end()) {
+        return LLAMA_TOKEN_NULL;
+    }
+    const common_ngram_cache_part part_static = part_static_it->second;
+
+    int max_count_static  = 0;
+    int sum_count_static  = 0;
+    llama_token max_token = LLAMA_TOKEN_NULL;
+
+    for (std::pair<llama_token, int> token_count_static : part_static) {
+        const llama_token token = token_count_static.first;
+        const int32_t count_static  = token_count_static.second;
+
+        if (count_static > max_count_static) {
+            max_token        = token;
+            max_count_static = count_static;
+        }
+        sum_count_static += count_static;
+    }
+
+    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
+        return LLAMA_TOKEN_NULL;
+    }
+    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
+        return LLAMA_TOKEN_NULL;
+    }
+    return max_token;
+}
+
+// Try to draft a token from primary cache (context/dynamic), validate with static cache:
+static llama_token try_draft(
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
+    const int * min_sample_size, const int * min_percent) {
+
+    llama_token drafted_token = LLAMA_TOKEN_NULL;
+
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
+        const common_ngram ngram_primary = ngrams_primary[i];
+
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        if (part_primary_it == nc_primary.end()) {
+            continue;
+        }
+        const common_ngram_cache_part part_primary = part_primary_it->second;
+
+        int max_count_primary = 0;
+        int max_count_static  = 0;
+        int sum_count_primary = 0;
+        llama_token max_token = LLAMA_TOKEN_NULL;
+
+        for (std::pair<llama_token, int> token_count_primary : part_primary) {
+            const llama_token token = token_count_primary.first;
+
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+
+            const int32_t count_primary = token_count_primary.second;
+            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
+
+            if (count_primary*count_static > max_count_primary*max_count_static) {
+                max_token         = token;
+                max_count_primary = count_primary;
+                max_count_static  = count_static;
+            }
+            sum_count_primary += count_primary;
+        }
+
+        if (sum_count_primary < min_sample_size[i]) {
+            continue;
+        }
+        if (100*max_count_primary < min_percent[i]*sum_count_primary) {
+            continue;;
+        }
+        drafted_token = max_token;
+    }
+
+    return drafted_token;
+}
+
+void common_ngram_cache_draft(
+    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
+) {
+    GGML_ASSERT(draft.size() == 1);
+    const int inp_size = inp.size();
+
+    if (inp_size < LLAMA_NGRAM_STATIC) {
+        return;
+    }
+
+    while ((int) draft.size()-1 < n_draft) {
+        llama_token drafted_token = LLAMA_TOKEN_NULL;
+
+        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
+        common_ngram ngram_static;
+        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
+            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
+        }
+        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        common_ngram_cache_part part_static;
+        if (part_static_it != nc_static.end()) {
+            part_static = part_static_it->second;
+        }
+
+        // cd = context + dynamic
+        std::vector<common_ngram> ngrams_cd;
+        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
+            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
+            common_ngram ngram_cd;
+            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
+                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
+            }
+            ngrams_cd.push_back(ngram_cd);
+        }
+        if (drafted_token == LLAMA_TOKEN_NULL) {
+            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
+        }
+        if (drafted_token == LLAMA_TOKEN_NULL) {
+            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
+        }
+        if (drafted_token == LLAMA_TOKEN_NULL) {
+            drafted_token = try_draft(nc_static, ngram_static);
+        }
+
+        if (drafted_token == LLAMA_TOKEN_NULL) {
+            break;
+        }
+
+        LOG(" - draft candidate: token=%d\n", drafted_token);
+        draft.push_back(drafted_token);
+    }
+}
+
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
+    std::ofstream file_out(filename, std::ios::binary);
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram      ngram        = item.first;
+        common_ngram_cache_part token_counts = item.second;
+        GGML_ASSERT(!token_counts.empty());
+        const int32_t ntokens = token_counts.size();
+        GGML_ASSERT(ntokens > 0);
+
+        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(common_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
+        for (std::pair<llama_token, int32_t> item2 : token_counts) {
+            const llama_token token = item2.first;
+            const int32_t     count = item2.second;
+            GGML_ASSERT(count > 0);
+
+            file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
+            file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
+        }
+    }
+
+}
+
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
+    std::ifstream hashmap_file(filename, std::ios::binary);
+    if (!hashmap_file) {
+        throw std::ifstream::failure("Unable to open file " + filename);
+    }
+    common_ngram_cache ngram_cache;
+
+    common_ngram ngram;
+    int32_t     ntokens;
+    llama_token token;
+    int32_t     count;
+
+    char * ngramc   = reinterpret_cast<char*>(&ngram);
+    char * ntokensc = reinterpret_cast<char*>(&ntokens);
+    char * tokenc   = reinterpret_cast<char*>(&token);
+    char * countc   = reinterpret_cast<char*>(&count);
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
+        GGML_ASSERT(!hashmap_file.eof());
+        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
+        GGML_ASSERT(ntokens > 0);
+        common_ngram_cache_part token_counts;
+
+        for (int i = 0; i < ntokens; ++i) {
+            GGML_ASSERT(!hashmap_file.eof());
+            GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
+            GGML_ASSERT(!hashmap_file.eof());
+            GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
+            GGML_ASSERT(count > 0);
+            token_counts.emplace(token, count);
+        }
+
+        ngram_cache.emplace(ngram, token_counts);
+    }
+    GGML_ASSERT(hashmap_file.eof());
+
+    return ngram_cache;
+}
+
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram      ngram = ngram_part.first;
+        common_ngram_cache_part  part = ngram_part.second;
+
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        if (part_merged_it == ngram_cache_target.end()) {
+            ngram_cache_target.emplace(ngram, part);
+            continue;
+        }
+
+        for (std::pair<llama_token, int32_t> token_count : part) {
+            const llama_token token = token_count.first;
+            const int32_t     count = token_count.second;
+            GGML_ASSERT(count > 0);
+
+            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            if (token_count_merged_it == part_merged_it->second.end()) {
+                part_merged_it->second.emplace(token, count);
+                continue;
+            }
+
+            token_count_merged_it->second += count;
+        }
+    }
+}
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
new file mode 100644
index 000000000..dfe012abe
--- /dev/null
+++ b/common/ngram-cache.h
@@ -0,0 +1,101 @@
+#pragma once
+
+#include "llama.h"
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+
+#define LLAMA_NGRAM_MIN    1
+#define LLAMA_NGRAM_MAX    4
+#define LLAMA_NGRAM_STATIC 2
+
+// Data structures to map n-grams to empirical token probabilities:
+
+struct common_ngram {
+    llama_token tokens[LLAMA_NGRAM_MAX];
+
+    common_ngram() {
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            tokens[i] = LLAMA_TOKEN_NULL;
+        }
+    }
+
+    common_ngram(const llama_token * input, const int ngram_size) {
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
+        }
+    }
+
+    bool operator==(const common_ngram & other) const {
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            if (tokens[i] != other.tokens[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+};
+
+struct common_token_hash_function {
+    size_t operator()(const llama_token token) const {
+        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+        return token * 11400714819323198485llu;
+    }
+};
+
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
+        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
+        }
+        return hash;
+    }
+};
+
+// token -> number of times token has been seen
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
+
+// n-gram -> empirical distribution of following tokens
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
+
+
+// Update an ngram cache with tokens.
+// ngram_cache:         the cache to modify.
+// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
+// inp_data:            the token sequence with which to update ngram_cache.
+// nnew:                how many new tokens have been appended to inp_data since the last call to this function.
+// print_progress:      whether to print progress to stderr.
+//
+// In order to get correct results inp_data can ONLY BE APPENDED TO.
+// Changes in the middle need a complete rebuild.
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+
+// Try to draft tokens from ngram caches.
+// inp:                the tokens generated so far.
+// draft:              the token sequence to draft. Expected to initially contain the previously sampled token.
+// n_draft:            maximum number of tokens to add to draft.
+// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
+// nc_context:         ngram cache based on current context.
+// nc_dynamic:         ngram cache based on previous user generations.
+// nc_static:          ngram cache generated from a large text corpus, used for validation.
+void common_ngram_cache_draft(
+    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
+
+// Save an ngram cache to a file.
+// ngram_cache: the ngram cache to save.
+// filename:    the path under which to save the ngram cache.
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
+
+// Load an ngram cache saved with common_ngram_cache_save.
+// filename: the path from which to load the ngram cache.
+// returns:  an ngram cache containing the information saved to filename.
+common_ngram_cache common_ngram_cache_load(std::string & filename);
+
+// Merge two ngram caches.
+// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
+// ngram_cache_add:    the ngram cache to add to ngram_cache_target.
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
diff --git a/common/sampling.cpp b/common/sampling.cpp
index e67096bea..e4b21ca10 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,318 +1,526 @@
 #include "sampling.h"
 
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    struct llama_sampling_context * result = new llama_sampling_context();
+#include "common.h"
 
-    result->params  = params;
-    result->grammar = nullptr;
+#include <cmath>
+#include <unordered_map>
 
-    // if there is a grammar, parse it
-    if (!params.grammar.empty()) {
-        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+// TODO: deduplicate with llama-impl.h
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
 
-        // will be empty (default) if there are parse errors
-        if (result->parsed_grammar.rules.empty()) {
-            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
-            delete result;
-            return nullptr;
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+    std::vector<T> data;
+};
+
+struct common_sampler {
+    common_params_sampling params;
+
+    struct llama_sampler * grmr;
+    struct llama_sampler * chain;
+
+    ring_buffer<llama_token> prev;
+
+    std::vector<llama_token_data> cur;
+
+    llama_token_data_array cur_p;
+
+    void set_logits(struct llama_context * ctx, int idx) {
+        const auto * logits = llama_get_logits_ith(ctx, idx);
+
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
         }
 
-        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
-
-        result->grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+        cur_p = { cur.data(), cur.size(), -1, false };
     }
+};
 
-    result->prev.resize(params.n_prev);
-
-    return result;
-}
-
-void llama_sampling_free(struct llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-    }
-
-    delete ctx;
-}
-
-void llama_sampling_reset(llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
-    }
-
-    if (!ctx->parsed_grammar.rules.empty()) {
-        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
-
-        ctx->grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
-    }
-
-    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
-    ctx->cur.clear();
-}
-
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
-    if (dst->grammar) {
-        llama_grammar_free(dst->grammar);
-        dst->grammar = nullptr;
-    }
-
-    if (src->grammar) {
-        dst->grammar = llama_grammar_copy(src->grammar);
-    }
-
-    dst->prev = src->prev;
-}
-
-llama_token llama_sampling_last(llama_sampling_context * ctx) {
-    return ctx->prev.back();
-}
-
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
-    const int size = ctx_sampling->prev.size();
-
-    n = std::min(n, size);
-
-    std::string result;
-
-    for (int i = size - n; i < size; i++) {
-        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
-    }
-
-    return result;
-}
-
-std::string llama_sampling_print(const llama_sampling_params & params) {
+std::string common_params_sampling::print() const {
     char result[1024];
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
 }
 
-std::string llama_sampling_order_print(const llama_sampling_params & params) {
-    std::string result = "CFG -> Penalties ";
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
+
+    lparams.no_perf = params.no_perf;
+
+    std::vector<const char *> trigger_words;
+    trigger_words.reserve(params.grammar_trigger_words.size());
+    for (const auto & str : params.grammar_trigger_words) {
+        trigger_words.push_back(str.word.c_str());
+    }
+
+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        grmr = params.grammar_lazy
+             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
+                                               trigger_words.data(), trigger_words.size(),
+                                               params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
+    auto * result = new common_sampler {
+        /* .params = */ params,
+        /* .grmr   = */ grmr,
+        /* .chain  = */ llama_sampler_chain_init(lparams),
+        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur    = */ {},
+        /* .cur_p  = */ {},
+    };
+
+    llama_sampler_chain_add(result->chain,
+            llama_sampler_init_logit_bias(
+                llama_vocab_n_tokens(vocab),
+                params.logit_bias.size(),
+                params.logit_bias.data()));
+
     if (params.mirostat == 0) {
-        for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
-            if (!sampler_type_name.empty()) {
-                result += "-> " + sampler_type_name + " ";
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char *> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto & str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                    break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
             }
         }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
     } else {
-        result += "-> mirostat ";
+        GGML_ASSERT(false && "unknown mirostat version");
     }
 
     return result;
 }
 
-// no reasons to expose this function in header
-static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
-                 llama_token_data_array & cur_p,
-                                 size_t   min_keep) {
-    const float         temp              = params.temp;
-    const float         dynatemp_range    = params.dynatemp_range;
-    const float         dynatemp_exponent = params.dynatemp_exponent;
-    const int32_t       top_k             = params.top_k;
-    const float         top_p             = params.top_p;
-    const float         min_p             = params.min_p;
-    const float         tfs_z             = params.tfs_z;
-    const float         typical_p         = params.typical_p;
-    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
+void common_sampler_free(struct common_sampler * gsmpl) {
+    if (gsmpl) {
+        llama_sampler_free(gsmpl->grmr);
 
-    for (auto sampler_type : samplers_sequence) {
-        switch (sampler_type) {
-            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case llama_sampler_type::TEMPERATURE:
-                if (dynatemp_range > 0) {
-                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
-                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
-                } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
-                }
-                break;
-            default : break;
-        }
+        llama_sampler_free(gsmpl->chain);
+
+        delete gsmpl;
     }
 }
 
-static llama_token llama_sampling_sample_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const float   temp            = params.temp;
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-    const bool    penalize_nl     = params.penalize_nl;
-
-    auto & prev = ctx_sampling->prev;
-    auto & cur  = ctx_sampling->cur;
-
-    llama_token id = 0;
-
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
-
-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
-
-    if (!is_resampling) {
-        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
-        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    if (accept_grammar) {
+        llama_sampler_accept(gsmpl->grmr, token);
     }
 
-    // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
+    llama_sampler_accept(gsmpl->chain, token);
+
+    gsmpl->prev.push_back(token);
+}
+
+void common_sampler_reset(struct common_sampler * gsmpl) {
+    llama_sampler_reset(gsmpl->grmr);
+
+    llama_sampler_reset(gsmpl->chain);
+}
+
+struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+    return new common_sampler {
+        /* .params = */ gsmpl->params,
+        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev   = */ gsmpl->prev,
+        /* .cur    = */ gsmpl->cur,
+        /* .cur_p  = */ gsmpl->cur_p,
+    };
+}
+
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
+    // TODO: measure grammar performance
+
+    if (gsmpl) {
+        llama_perf_sampler_print(gsmpl->chain);
+    }
+    if (ctx) {
+        llama_perf_context_print(ctx);
+    }
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    gsmpl->set_logits(ctx, idx);
+
+    auto & grmr  = gsmpl->grmr;
+    auto & chain = gsmpl->chain;
+    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
+
+    if (grammar_first) {
+        llama_sampler_apply(grmr, &cur_p);
     }
 
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+    llama_sampler_apply(chain, &cur_p);
+
+    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
+
+    const llama_token id = cur_p.data[cur_p.selected].id;
+
+    if (grammar_first) {
+        return id;
     }
 
-    cur.clear();
+    // check if it the sampled token fits the grammar
+    {
+        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
 
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        llama_sampler_apply(grmr, &single_token_data_array);
+
+        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+        if (is_valid) {
+            return id;
+        }
     }
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    // resampling:
+    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+    gsmpl->set_logits(ctx, idx);
 
-    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+    llama_sampler_apply(grmr,  &cur_p);
+    llama_sampler_apply(chain, &cur_p);
 
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
 
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
+    return cur_p.data[cur_p.selected].id;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
+// helpers
+
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
+    return &gsmpl->cur_p;
+}
+
+llama_token common_sampler_last(const struct common_sampler * gsmpl) {
+    return gsmpl->prev.rat(0);
+}
+
+std::string common_sampler_print(const struct common_sampler * gsmpl) {
+    std::string result = "logits ";
+
+    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+    }
+
+    return result;
+}
+
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
+    n = std::min(n, (int) gsmpl->prev.size());
+
+    if (n <= 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
+
+    for (int i = n - 1; i >= 0; i--) {
+        const llama_token id = gsmpl->prev.rat(i);
+
+        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
+
+        result += common_token_to_piece(ctx_main, id);
+    }
+
+    return result;
+}
+
+char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
+    switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY:         return 'd';
+        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
+        default : return '?';
+    }
+}
+
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
+    switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY:         return "dry";
+        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
+        default : return "";
+    }
+}
+
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry",         COMMON_SAMPLER_TYPE_DRY },
+        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
+    };
+
+    // since samplers names are written multiple ways
+    // make it ready for both system names and input names
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
+    };
+
+    std::vector<common_sampler_type> samplers;
+    samplers.reserve(names.size());
+
+    for (const auto & name : names) {
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
+        } else {
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
                 }
             }
         }
     }
 
-    // If we are in the resampling phase, apply grammar checks before sampling logic
-    if (is_resampling && ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
-    }
+    return samplers;
+}
 
-    if (temp < 0.0) {
-        // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
-        id = cur_p.data[0].id;
-    } else if (temp == 0.0) {
-        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx_main, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
-        } else if (mirostat == 2) {
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
-        } else {
-            // temperature sampling
-            size_t min_keep = std::max(1, params.min_keep);
+std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
+    };
 
-            sampler_queue(ctx_main, params, cur_p, min_keep);
+    std::vector<common_sampler_type> samplers;
+    samplers.reserve(chars.size());
 
-            id = llama_sample_token(ctx_main, &cur_p);
-
-            //{
-            //    const int n_top = 10;
-            //    LOG("top %d candidates:\n", n_top);
-
-            //    for (int i = 0; i < n_top; i++) {
-            //        const llama_token id = cur_p.data[i].id;
-            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
-            //    }
-            //}
-
-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+    for (const auto & c : chars) {
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
         }
     }
 
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        // Create an array with a single token data element for the sampled id
-        llama_token_data single_token_data = {id, logits[id], 0.0f};
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-        // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
-
-        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-        // If the token is not valid according to the grammar, perform resampling
-        if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
-
-            // Restore logits from the copy
-            std::copy(original_logits.begin(), original_logits.end(), logits);
-
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
-        }
-    }
-
-    return id;
-}
-
-llama_token llama_sampling_sample(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
-}
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar) {
-    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
-    ctx_sampling->prev.push_back(id);
-
-    if (ctx_sampling->grammar != NULL && apply_grammar) {
-        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
-    }
+    return samplers;
 }
diff --git a/common/sampling.h b/common/sampling.h
index 95d875394..2064421db 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -2,137 +2,106 @@
 
 #include "llama.h"
 
-#include "grammar-parser.h"
+#include "common.h"
 
 #include <string>
 #include <vector>
-#include <unordered_map>
 
-// sampler types
-enum class llama_sampler_type : char {
-    TOP_K       = 'k',
-    TOP_P       = 'p',
-    MIN_P       = 'm',
-    TFS_Z       = 'f',
-    TYPICAL_P   = 'y',
-    TEMPERATURE = 't'
-};
-
-// sampling parameters
-typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = true;     // consider newlines as a repeatable token
-
-    std::vector<llama_sampler_type> samplers_sequence = {
-        llama_sampler_type::TOP_K,
-        llama_sampler_type::TFS_Z,
-        llama_sampler_type::TYPICAL_P,
-        llama_sampler_type::TOP_P,
-        llama_sampler_type::MIN_P,
-        llama_sampler_type::TEMPERATURE
-    };
-
-    std::string grammar;  // optional BNF-like grammar to constrain sampling
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale     = 1.f; // how strong is guidance
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool                     use_penalty_prompt_tokens = false;
-} llama_sampling_params;
-
-// general sampler context
-// TODO: move to llama.h
-struct llama_sampling_context {
-    // parameters that will be used for sampling
-    llama_sampling_params params;
-
-    // mirostat sampler state
-    float mirostat_mu;
-
-    llama_grammar * grammar;
-
-    // internal
-    grammar_parser::parse_state parsed_grammar;
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token>      prev;
-    std::vector<llama_token_data> cur;
-};
-
-#include "common.h"
-
-// Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
-
-void llama_sampling_free(struct llama_sampling_context * ctx);
-
-// Reset the sampler context
-// - clear prev tokens
-// - reset grammar
-void llama_sampling_reset(llama_sampling_context * ctx);
-
-// Copy the sampler context
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
-
-// Get the last sampled token
-llama_token llama_sampling_last(llama_sampling_context * ctx);
-
-// Get a string representation of the last sampled tokens
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
-
-// Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
-
-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-// Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_reset when a sequence ends
+// common_sampler extends llama_sampler with additional functionality:
 //
-// required:
-//  - ctx_main:     context to use for sampling
-//  - ctx_sampling: sampling-specific context
+//  - grammar support
+//  - custom sampler logic based on the parameters
+//  - history of the last accepted tokens
+//  - performance metrics
 //
-// optional:
-//  - ctx_cfg:      context to use for classifier-free guidance
-//  - idx:          sample from llama_get_logits_ith(ctx, idx)
+// This goal is to have a common implementation of the sampling logic shared across the examples.
+// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
+// complex (top-k, top-p, etc).
 //
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
+// Another example is related to the grammar. In general, the grammar constraints applied on the full
+// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
+// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
+// grammar constraints are applied to the full vocabulary and the token is resampled.
+//
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
+// be moved into the core llama library.
+//
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
+// This can be used to access the probabilities of the rest of the non-sampled tokens.
+//
+// TODO: measure grammar performance
 //
-llama_token llama_sampling_sample(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0);
 
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar);
+struct common_sampler;
+
+// llama_sampler API overloads
+
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+
+void common_sampler_free(struct common_sampler * gsmpl);
+
+// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                    common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
+
+// arguments can be nullptr to skip printing
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+
+// extended sampling implementation:
+//
+// - set logits
+// - apply the configured sampler chain
+// - check if the token fits the grammar (if any)
+// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+//
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
+
+// helpers
+
+// access the internal list of current candidate tokens
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+
+// get the last accepted token
+llama_token common_sampler_last(const struct common_sampler * gsmpl);
+
+// print the sampler chain into a string
+std::string common_sampler_print(const struct common_sampler * gsmpl);
+
+// get a string representation of the last accepted tokens
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
+
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+                const char * grammar_kind, const char * grammar_data);
diff --git a/common/speculative.cpp b/common/speculative.cpp
new file mode 100644
index 000000000..318e96ea3
--- /dev/null
+++ b/common/speculative.cpp
@@ -0,0 +1,277 @@
+#include "speculative.h"
+
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+
+#include <cstring>
+
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
+struct common_speculative {
+    struct llama_context * ctx;
+    struct common_sampler * smpl;
+
+    llama_batch batch;
+    llama_tokens prompt;
+};
+
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_dft) {
+    auto * result = new common_speculative {
+        /* .ctx    = */ ctx_dft,
+        /* .smpl   = */ nullptr,
+        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt = */ {},
+    };
+
+    // TODO: optimize or pass from outside?
+#if 0
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 40;
+        params.top_p = 0.9;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+            COMMON_SAMPLER_TYPE_TOP_P,
+            COMMON_SAMPLER_TYPE_INFILL,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#else
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 10;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#endif
+
+    return result;
+}
+
+void common_speculative_free(struct common_speculative * spec) {
+    if (spec == nullptr) {
+        return;
+    }
+
+    common_sampler_free(spec->smpl);
+
+    llama_batch_free(spec->batch);
+
+    delete spec;
+}
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
+    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
+    const struct llama_model * model_dft = llama_get_model(ctx_dft);
+
+    const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+    const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
+    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
+                     "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        return false;
+    }
+
+    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
+        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
+        return false;
+    }
+
+    {
+        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
+        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
+
+        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
+                         "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return false;
+        }
+
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
+            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
+                             "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt_tgt,
+        llama_token id_last) {
+    auto & batch  = spec->batch;
+    auto & ctx    = spec->ctx;
+    auto & smpl   = spec->smpl;
+    auto & prompt = spec->prompt;
+
+    int reuse_i = 0;
+    int reuse_n = 0;
+
+    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+
+    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
+
+    // reuse as much as possible from the old draft context
+    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
+    for (int i = 0; i < (int) prompt.size(); ++i) {
+        int cur = 0;
+        while (i_start + cur < (int) prompt_tgt.size() &&
+               i       + cur < (int) prompt.size() &&
+               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+            cur++;
+        }
+
+        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
+            reuse_i = i;
+            reuse_n = cur;
+        }
+    }
+
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+
+    llama_tokens result;
+    result.reserve(params.n_draft);
+
+    if (reuse_n == 0) {
+        llama_kv_cache_clear(ctx);
+
+        prompt.clear();
+    } else {
+        // this happens when a previous draft has been discarded (for example, due to being too small), but the
+        // target model agreed with it. in this case, we simply pass back the previous results to save compute
+        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
+                result.push_back(prompt[i]);
+
+                if (params.n_draft <= (int) result.size()) {
+                    break;
+                }
+            }
+
+            return result;
+        }
+
+        if (reuse_i > 0) {
+            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+
+            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+        }
+
+        if (reuse_n < (int) prompt.size()) {
+            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+
+            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        }
+    }
+
+    // prepare a batch to evaluate any new tokens in the prompt
+    common_batch_clear(batch);
+
+    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
+        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
+        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+
+        prompt.push_back(prompt_tgt[i]);
+    }
+
+    // we should rarely end-up here during normal decoding
+    if (batch.n_tokens > 0) {
+        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+
+        llama_decode(ctx, batch);
+    }
+
+    const llama_pos n_past = prompt.size();
+
+    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
+
+    common_batch_clear(batch);
+    common_batch_add  (batch, id_last, n_past, { 0 }, true);
+
+    prompt.push_back(id_last);
+
+    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+
+    llama_decode(ctx, batch);
+
+    common_sampler_reset(smpl);
+
+    // sample n_draft tokens from the draft model
+    for (int i = 0; i < params.n_draft; ++i) {
+        common_batch_clear(batch);
+
+        common_sampler_sample(smpl, ctx, 0, true);
+
+        const auto * cur_p = common_sampler_get_candidates(smpl);
+
+        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+        }
+
+        // add drafted token for each sequence
+        const llama_token id = cur_p->data[0].id;
+
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
+        common_sampler_accept(smpl, id, true);
+
+        result.push_back(id);
+
+        if (params.n_draft <= (int) result.size()) {
+            break;
+        }
+
+        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+
+        // evaluate the drafted tokens on the draft model
+        llama_decode(ctx, batch);
+
+        prompt.push_back(id);
+    }
+
+    return result;
+}
diff --git a/common/speculative.h b/common/speculative.h
new file mode 100644
index 000000000..50ec03446
--- /dev/null
+++ b/common/speculative.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "llama.h"
+#include "common.h"
+
+struct common_speculative;
+
+struct common_speculative_params {
+    int n_draft = 16;  // max drafted tokens
+    int n_reuse = 256;
+
+    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+};
+
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+
+void common_speculative_free(struct common_speculative * spec);
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+               struct common_speculative * spec,
+        struct common_speculative_params   params,
+                      const llama_tokens & prompt,
+                             llama_token   id_last);
diff --git a/common/stb_image.h b/common/stb_image.h
index 4766d7e67..9eedabedc 100644
--- a/common/stb_image.h
+++ b/common/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -48,6 +48,8 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.30  (2024-05-31) avoid erroneous gcc warning
+      2.29  (2023-05-xx) optimizations
       2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
       2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
       2.26  (2020-07-13) many minor fixes
@@ -371,13 +373,14 @@ RECENT REVISION HISTORY:
 
 #define STBI_VERSION 1
 
-enum {
-    STBI_default = 0, // only used for desired_channels
+enum
+{
+   STBI_default = 0, // only used for desired_channels
 
-    STBI_grey = 1,
-    STBI_grey_alpha = 2,
-    STBI_rgb = 3,
-    STBI_rgb_alpha = 4
+   STBI_grey       = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb        = 3,
+   STBI_rgb_alpha  = 4
 };
 
 #include <stdlib.h>
@@ -405,11 +408,11 @@ extern "C" {
 // load image by filename, open file, or memory buffer
 //
 
-typedef struct {
-    int (*read)(void * user, char * data,
-                int size);            // fill 'data' with 'size' bytes.  return number of bytes actually read
-    void (*skip)(void * user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
-    int (*eof)(void * user);          // returns nonzero if we are at end of file/data
+typedef struct
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
 ////////////////////////////////////
@@ -417,24 +420,21 @@ typedef struct {
 // 8-bits-per-channel interface
 //
 
-STBIDEF stbi_uc * stbi_load_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file,
-                                        int desired_channels);
-STBIDEF stbi_uc * stbi_load_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y,
-                                           int * channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_uc * stbi_load(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels);
-STBIDEF stbi_uc * stbi_load_from_file(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
 // for stbi_load_from_file, file pointer is left pointing immediately after image
 #endif
 
 #ifndef STBI_NO_GIF
-STBIDEF stbi_uc * stbi_load_gif_from_memory(stbi_uc const * buffer, int len, int ** delays, int * x, int * y, int * z,
-                                            int * comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
 #endif
 
 #ifdef STBI_WINDOWS_UTF8
-STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wchar_t * input);
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
 #endif
 
 ////////////////////////////////////
@@ -442,14 +442,12 @@ STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wc
 // 16-bits-per-channel interface
 //
 
-STBIDEF stbi_us * stbi_load_16_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file,
-                                           int desired_channels);
-STBIDEF stbi_us * stbi_load_16_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y,
-                                              int * channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_us * stbi_load_16(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels);
-STBIDEF stbi_us * stbi_load_from_file_16(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
 #endif
 
 ////////////////////////////////////
@@ -457,55 +455,56 @@ STBIDEF stbi_us * stbi_load_from_file_16(FILE * f, int * x, int * y, int * chann
 // float-per-channel interface
 //
 #ifndef STBI_NO_LINEAR
-STBIDEF float * stbi_loadf_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file,
-                                       int desired_channels);
-STBIDEF float * stbi_loadf_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * channels_in_file,
-                                          int desired_channels);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
 
-#ifndef STBI_NO_STDIO
-STBIDEF float * stbi_loadf(char const * filename, int * x, int * y, int * channels_in_file, int desired_channels);
-STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * channels_in_file, int desired_channels);
-#endif
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
 #endif
 
 #ifndef STBI_NO_HDR
-STBIDEF void stbi_hdr_to_ldr_gamma(float gamma);
-STBIDEF void stbi_hdr_to_ldr_scale(float scale);
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
 #endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
-STBIDEF void stbi_ldr_to_hdr_gamma(float gamma);
-STBIDEF void stbi_ldr_to_hdr_scale(float scale);
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
 #endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
-STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const * clbk, void * user);
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const * buffer, int len);
+STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
 #ifndef STBI_NO_STDIO
-STBIDEF int stbi_is_hdr(char const * filename);
-STBIDEF int stbi_is_hdr_from_file(FILE * f);
+STBIDEF int      stbi_is_hdr          (char const *filename);
+STBIDEF int      stbi_is_hdr_from_file(FILE *f);
 #endif // STBI_NO_STDIO
 
+
 // get a VERY brief reason for failure
 // on most compilers (and ALL modern mainstream compilers) this is threadsafe
-STBIDEF const char * stbi_failure_reason(void);
+STBIDEF const char *stbi_failure_reason  (void);
 
 // free the loaded image -- this is just free()
-STBIDEF void stbi_image_free(void * retval_from_stbi_load);
+STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
 
 // get image dimensions & components without fully decoding
-STBIDEF int stbi_info_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp);
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp);
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const * buffer, int len);
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * clbk, void * user);
+STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
 
 #ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const * filename, int * x, int * y, int * comp);
-STBIDEF int stbi_info_from_file(FILE * f, int * x, int * y, int * comp);
-STBIDEF int stbi_is_16_bit(char const * filename);
-STBIDEF int stbi_is_16_bit_from_file(FILE * f);
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
 #endif
 
+
+
 // for image formats that explicitly notate that they have premultiplied alpha,
 // we just return the colors as stored in the file. set this flag to force
 // unpremultiplication. results are undefined if the unpremultiply overflow.
@@ -527,14 +526,14 @@ STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_fli
 
 // ZLIB client - used by PNG, available for other purposes
 
-STBIDEF char * stbi_zlib_decode_malloc_guesssize(const char * buffer, int len, int initial_size, int * outlen);
-STBIDEF char * stbi_zlib_decode_malloc_guesssize_headerflag(const char * buffer, int len, int initial_size, int * outlen,
-                                                            int parse_header);
-STBIDEF char * stbi_zlib_decode_malloc(const char * buffer, int len, int * outlen);
-STBIDEF int stbi_zlib_decode_buffer(char * obuffer, int olen, const char * ibuffer, int ilen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
 
-STBIDEF char * stbi_zlib_decode_noheader_malloc(const char * buffer, int len, int * outlen);
-STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const char * ibuffer, int ilen);
 
 #ifdef __cplusplus
 }
@@ -547,50 +546,52 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const cha
 
 #ifdef STB_IMAGE_IMPLEMENTATION
 
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) ||                   \
-    defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) ||                    \
-    defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
-#ifndef STBI_ONLY_JPEG
-#define STBI_NO_JPEG
-#endif
-#ifndef STBI_ONLY_PNG
-#define STBI_NO_PNG
-#endif
-#ifndef STBI_ONLY_BMP
-#define STBI_NO_BMP
-#endif
-#ifndef STBI_ONLY_PSD
-#define STBI_NO_PSD
-#endif
-#ifndef STBI_ONLY_TGA
-#define STBI_NO_TGA
-#endif
-#ifndef STBI_ONLY_GIF
-#define STBI_NO_GIF
-#endif
-#ifndef STBI_ONLY_HDR
-#define STBI_NO_HDR
-#endif
-#ifndef STBI_ONLY_PIC
-#define STBI_NO_PIC
-#endif
-#ifndef STBI_ONLY_PNM
-#define STBI_NO_PNM
-#endif
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
 #endif
 
 #if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
 #define STBI_NO_ZLIB
 #endif
 
-#include <limits.h>
+
 #include <stdarg.h>
 #include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h> // ldexp, pow
+#include <math.h>  // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -608,54 +609,55 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const cha
 #define STBI_EXTERN extern
 #endif
 
+
 #ifndef _MSC_VER
-#ifdef __cplusplus
-#define stbi_inline inline
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
 #else
-#define stbi_inline
-#endif
-#else
-#define stbi_inline __forceinline
+   #define stbi_inline __forceinline
 #endif
 
 #ifndef STBI_NO_THREAD_LOCALS
-#if defined(__cplusplus) && __cplusplus >= 201103L
-#define STBI_THREAD_LOCAL thread_local
-#elif defined(__GNUC__) && __GNUC__ < 5
-#define STBI_THREAD_LOCAL __thread
-#elif defined(_MSC_VER)
-#define STBI_THREAD_LOCAL __declspec(thread)
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
-#define STBI_THREAD_LOCAL _Thread_local
-#endif
+   #if defined(__cplusplus) &&  __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL       thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL       __thread
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL       __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL       _Thread_local
+   #endif
 
-#ifndef STBI_THREAD_LOCAL
-#if defined(__GNUC__)
-#define STBI_THREAD_LOCAL __thread
-#endif
-#endif
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+        #define STBI_THREAD_LOCAL       __thread
+      #endif
+   #endif
 #endif
 
 #if defined(_MSC_VER) || defined(__SYMBIAN32__)
 typedef unsigned short stbi__uint16;
-typedef signed short stbi__int16;
-typedef unsigned int stbi__uint32;
-typedef signed int stbi__int32;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
 #else
 #include <stdint.h>
 typedef uint16_t stbi__uint16;
-typedef int16_t stbi__int16;
+typedef int16_t  stbi__int16;
 typedef uint32_t stbi__uint32;
-typedef int32_t stbi__int32;
+typedef int32_t  stbi__int32;
 #endif
 
 // should produce compiler error if size is wrong
-typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 
 #ifdef _MSC_VER
-#define STBI_NOTUSED(v) (void)(v)
+#define STBI_NOTUSED(v)  (void)(v)
 #else
-#define STBI_NOTUSED(v) (void)sizeof(v)
+#define STBI_NOTUSED(v)  (void)sizeof(v)
 #endif
 
 #ifdef _MSC_VER
@@ -663,9 +665,9 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 #endif
 
 #ifdef STBI_HAS_LROTL
-#define stbi_lrot(x, y) _lrotl(x, y)
+   #define stbi_lrot(x,y)  _lrotl(x,y)
 #else
-#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (-(y)&31)))
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
 #endif
 
 #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
@@ -677,13 +679,13 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz) malloc(sz)
-#define STBI_REALLOC(p, newsz) realloc(p, newsz)
-#define STBI_FREE(p) free(p)
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
 #endif
 
 #ifndef STBI_REALLOC_SIZED
-#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz)
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
 #endif
 
 // x86/x64 detection
@@ -725,31 +727,34 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
 
 #ifdef _MSC_VER
 
-#if _MSC_VER >= 1400 // not VC6
-#include <intrin.h>  // __cpuid
-static int stbi__cpuid3(void) {
-    int info[4];
-    __cpuid(info, 1);
-    return info[3];
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
 }
 #else
-static int stbi__cpuid3(void) {
-    int res;
-    __asm {
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
       mov  eax,1
       cpuid
       mov  res,edx
-    }
-    return res;
+   }
+   return res;
 }
 #endif
 
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
 #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void) {
-    int info3 = stbi__cpuid3();
-    return ((info3 >> 26) & 1) != 0;
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
 }
 #endif
 
@@ -757,11 +762,12 @@ static int stbi__sse2_available(void) {
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
 #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void) {
-    // If we're even attempting to compile this on GCC/Clang, that means
-    // -msse2 is on, which means the compiler is allowed to use SSE2
-    // instructions at will, and so are we.
-    return 1;
+static int stbi__sse2_available(void)
+{
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
 }
 #endif
 
@@ -796,162 +802,190 @@ static int stbi__sse2_available(void) {
 
 // stbi__context structure is our basic context used by all images, so it
 // contains all the IO context, plus some basic image information
-typedef struct {
-    stbi__uint32 img_x, img_y;
-    int img_n, img_out_n;
+typedef struct
+{
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
 
-    stbi_io_callbacks io;
-    void * io_user_data;
+   stbi_io_callbacks io;
+   void *io_user_data;
 
-    int read_from_callbacks;
-    int buflen;
-    stbi_uc buffer_start[128];
-    int callback_already_read;
+   int read_from_callbacks;
+   int buflen;
+   stbi_uc buffer_start[128];
+   int callback_already_read;
 
-    stbi_uc *img_buffer, *img_buffer_end;
-    stbi_uc *img_buffer_original, *img_buffer_original_end;
+   stbi_uc *img_buffer, *img_buffer_end;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
 } stbi__context;
 
-static void stbi__refill_buffer(stbi__context * s);
+
+static void stbi__refill_buffer(stbi__context *s);
 
 // initialize a memory-decode context
-static void stbi__start_mem(stbi__context * s, stbi_uc const * buffer, int len) {
-    s->io.read = NULL;
-    s->read_from_callbacks = 0;
-    s->callback_already_read = 0;
-    s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer;
-    s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len;
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
 }
 
 // initialize a callback-based context
-static void stbi__start_callbacks(stbi__context * s, stbi_io_callbacks * c, void * user) {
-    s->io = *c;
-    s->io_user_data = user;
-    s->buflen = sizeof(s->buffer_start);
-    s->read_from_callbacks = 1;
-    s->callback_already_read = 0;
-    s->img_buffer = s->img_buffer_original = s->buffer_start;
-    stbi__refill_buffer(s);
-    s->img_buffer_original_end = s->img_buffer_end;
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->buflen = sizeof(s->buffer_start);
+   s->read_from_callbacks = 1;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = s->buffer_start;
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
 }
 
 #ifndef STBI_NO_STDIO
 
-static int stbi__stdio_read(void * user, char * data, int size) { return (int)fread(data, 1, size, (FILE *)user); }
-
-static void stbi__stdio_skip(void * user, int n) {
-    int ch;
-    fseek((FILE *)user, n, SEEK_CUR);
-    ch = fgetc((FILE *)user); /* have to read a byte to reset feof()'s flag */
-    if (ch != EOF) {
-        ungetc(ch, (FILE *)user); /* push byte back onto stream if valid. */
-    }
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   return (int) fread(data,1,size,(FILE*) user);
 }
 
-static int stbi__stdio_eof(void * user) { return feof((FILE *)user) || ferror((FILE *)user); }
+static void stbi__stdio_skip(void *user, int n)
+{
+   int ch;
+   fseek((FILE*) user, n, SEEK_CUR);
+   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
+   if (ch != EOF) {
+      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. */
+   }
+}
 
-static stbi_io_callbacks stbi__stdio_callbacks = {
-    stbi__stdio_read,
-    stbi__stdio_skip,
-    stbi__stdio_eof,
+static int stbi__stdio_eof(void *user)
+{
+   return feof((FILE*) user) || ferror((FILE *) user);
+}
+
+static stbi_io_callbacks stbi__stdio_callbacks =
+{
+   stbi__stdio_read,
+   stbi__stdio_skip,
+   stbi__stdio_eof,
 };
 
-static void stbi__start_file(stbi__context * s, FILE * f) { stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f); }
+static void stbi__start_file(stbi__context *s, FILE *f)
+{
+   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
+}
 
-// static void stop_file(stbi__context *s) { }
+//static void stop_file(stbi__context *s) { }
 
 #endif // !STBI_NO_STDIO
 
-static void stbi__rewind(stbi__context * s) {
-    // conceptually rewind SHOULD rewind to the beginning of the stream,
-    // but we just rewind to the beginning of the initial buffer, because
-    // we only use it after doing 'test', which only ever looks at at most 92 bytes
-    s->img_buffer = s->img_buffer_original;
-    s->img_buffer_end = s->img_buffer_original_end;
+static void stbi__rewind(stbi__context *s)
+{
+   // conceptually rewind SHOULD rewind to the beginning of the stream,
+   // but we just rewind to the beginning of the initial buffer, because
+   // we only use it after doing 'test', which only ever looks at at most 92 bytes
+   s->img_buffer = s->img_buffer_original;
+   s->img_buffer_end = s->img_buffer_original_end;
 }
 
-enum { STBI_ORDER_RGB, STBI_ORDER_BGR };
+enum
+{
+   STBI_ORDER_RGB,
+   STBI_ORDER_BGR
+};
 
-typedef struct {
-    int bits_per_channel;
-    int num_channels;
-    int channel_order;
+typedef struct
+{
+   int bits_per_channel;
+   int num_channels;
+   int channel_order;
 } stbi__result_info;
 
 #ifndef STBI_NO_JPEG
-static int stbi__jpeg_test(stbi__context * s);
-static void * stbi__jpeg_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp);
+static int      stbi__jpeg_test(stbi__context *s);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNG
-static int stbi__png_test(stbi__context * s);
-static void * stbi__png_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__png_info(stbi__context * s, int * x, int * y, int * comp);
-static int stbi__png_is16(stbi__context * s);
+static int      stbi__png_test(stbi__context *s);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_BMP
-static int stbi__bmp_test(stbi__context * s);
-static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__bmp_info(stbi__context * s, int * x, int * y, int * comp);
+static int      stbi__bmp_test(stbi__context *s);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_TGA
-static int stbi__tga_test(stbi__context * s);
-static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp);
+static int      stbi__tga_test(stbi__context *s);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context * s);
-static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc);
-static int stbi__psd_info(stbi__context * s, int * x, int * y, int * comp);
-static int stbi__psd_is16(stbi__context * s);
+static int      stbi__psd_test(stbi__context *s);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test(stbi__context * s);
-static float * stbi__hdr_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__hdr_info(stbi__context * s, int * x, int * y, int * comp);
+static int      stbi__hdr_test(stbi__context *s);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PIC
-static int stbi__pic_test(stbi__context * s);
-static void * stbi__pic_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp);
+static int      stbi__pic_test(stbi__context *s);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_GIF
-static int stbi__gif_test(stbi__context * s);
-static void * stbi__gif_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static void * stbi__load_gif_main(stbi__context * s, int ** delays, int * x, int * y, int * z, int * comp, int req_comp);
-static int stbi__gif_info(stbi__context * s, int * x, int * y, int * comp);
+static int      stbi__gif_test(stbi__context *s);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNM
-static int stbi__pnm_test(stbi__context * s);
-static void * stbi__pnm_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri);
-static int stbi__pnm_info(stbi__context * s, int * x, int * y, int * comp);
-static int stbi__pnm_is16(stbi__context * s);
+static int      stbi__pnm_test(stbi__context *s);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
 #endif
 
 static
 #ifdef STBI_THREAD_LOCAL
-    STBI_THREAD_LOCAL
+STBI_THREAD_LOCAL
 #endif
-    const char * stbi__g_failure_reason;
+const char *stbi__g_failure_reason;
 
-STBIDEF const char * stbi_failure_reason(void) { return stbi__g_failure_reason; }
+STBIDEF const char *stbi_failure_reason(void)
+{
+   return stbi__g_failure_reason;
+}
 
 #ifndef STBI_NO_FAILURE_STRINGS
-static int stbi__err(const char * str) {
-    stbi__g_failure_reason = str;
-    return 0;
+static int stbi__err(const char *str)
+{
+   stbi__g_failure_reason = str;
+   return 0;
 }
 #endif
 
-static void * stbi__malloc(size_t size) { return STBI_MALLOC(size); }
+static void *stbi__malloc(size_t size)
+{
+    return STBI_MALLOC(size);
+}
 
 // stb_image uses ints pervasively, including for offset calculations.
 // therefore the largest decoded image size we can support with the
@@ -965,88 +999,88 @@ static void * stbi__malloc(size_t size) { return STBI_MALLOC(size); }
 
 // return 1 if the sum is valid, 0 on overflow.
 // negative terms are considered invalid.
-static int stbi__addsizes_valid(int a, int b) {
-    if (b < 0)
-        return 0;
-    // now 0 <= b <= INT_MAX, hence also
-    // 0 <= INT_MAX - b <= INTMAX.
-    // And "a + b <= INT_MAX" (which might overflow) is the
-    // same as a <= INT_MAX - b (no overflow)
-    return a <= INT_MAX - b;
+static int stbi__addsizes_valid(int a, int b)
+{
+   if (b < 0) return 0;
+   // now 0 <= b <= INT_MAX, hence also
+   // 0 <= INT_MAX - b <= INTMAX.
+   // And "a + b <= INT_MAX" (which might overflow) is the
+   // same as a <= INT_MAX - b (no overflow)
+   return a <= INT_MAX - b;
 }
 
 // returns 1 if the product is valid, 0 on overflow.
 // negative factors are considered invalid.
-static int stbi__mul2sizes_valid(int a, int b) {
-    if (a < 0 || b < 0)
-        return 0;
-    if (b == 0)
-        return 1; // mul-by-0 is always safe
-    // portable way to check for no overflows in a*b
-    return a <= INT_MAX / b;
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a < 0 || b < 0) return 0;
+   if (b == 0) return 1; // mul-by-0 is always safe
+   // portable way to check for no overflows in a*b
+   return a <= INT_MAX/b;
 }
 
 #if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad2sizes_valid(int a, int b, int add) {
-    return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add);
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
 }
 #endif
 
 // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad3sizes_valid(int a, int b, int c, int add) {
-    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__addsizes_valid(a * b * c, add);
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__addsizes_valid(a*b*c, add);
 }
 
 // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
-static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) {
-    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && stbi__mul2sizes_valid(a * b * c, d) &&
-           stbi__addsizes_valid(a * b * c * d, add);
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
 }
 #endif
 
 #if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 // mallocs with size overflow checking
-static void * stbi__malloc_mad2(int a, int b, int add) {
-    if (!stbi__mad2sizes_valid(a, b, add))
-        return NULL;
-    return stbi__malloc(a * b + add);
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
+   return stbi__malloc(a*b + add);
 }
 #endif
 
-static void * stbi__malloc_mad3(int a, int b, int c, int add) {
-    if (!stbi__mad3sizes_valid(a, b, c, add))
-        return NULL;
-    return stbi__malloc(a * b * c + add);
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
+   return stbi__malloc(a*b*c + add);
 }
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
-static void * stbi__malloc_mad4(int a, int b, int c, int d, int add) {
-    if (!stbi__mad4sizes_valid(a, b, c, d, add))
-        return NULL;
-    return stbi__malloc(a * b * c * d + add);
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
+   return stbi__malloc(a*b*c*d + add);
 }
 #endif
 
 // returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
-static int stbi__addints_valid(int a, int b) {
-    if ((a >= 0) != (b >= 0))
-        return 1; // a and b have different signs, so no overflow
-    if (a < 0 && b < 0)
-        return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
-    return a <= INT_MAX - b;
+static int stbi__addints_valid(int a, int b)
+{
+   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
+   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
+   return a <= INT_MAX - b;
 }
 
-// returns 1 if the product of two signed shorts is valid, 0 on overflow.
-static int stbi__mul2shorts_valid(short a, short b) {
-    if (b == 0 || b == -1)
-        return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
-    if ((a >= 0) == (b >= 0))
-        return a <= SHRT_MAX / b; // product is positive, so similar to mul2sizes_valid
-    if (b < 0)
-        return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
-    return a >= SHRT_MIN / b;
+// returns 1 if the product of two ints fits in a signed short, 0 on overflow.
+static int stbi__mul2shorts_valid(int a, int b)
+{
+   if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
+   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
+   if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
+   return a >= SHRT_MIN / b;
 }
 
 // stbi__err - error
@@ -1054,411 +1088,423 @@ static int stbi__mul2shorts_valid(short a, short b) {
 // stbi__errpuc - error returning pointer to unsigned char
 
 #ifdef STBI_NO_FAILURE_STRINGS
-#define stbi__err(x, y) 0
+   #define stbi__err(x,y)  0
 #elif defined(STBI_FAILURE_USERMSG)
-#define stbi__err(x, y) stbi__err(y)
+   #define stbi__err(x,y)  stbi__err(y)
 #else
-#define stbi__err(x, y) stbi__err(x)
+   #define stbi__err(x,y)  stbi__err(x)
 #endif
 
-#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL))
-#define stbi__errpuc(x, y) ((unsigned char *)(size_t)(stbi__err(x, y) ? NULL : NULL))
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
 
-STBIDEF void stbi_image_free(void * retval_from_stbi_load) { STBI_FREE(retval_from_stbi_load); }
+STBIDEF void stbi_image_free(void *retval_from_stbi_load)
+{
+   STBI_FREE(retval_from_stbi_load);
+}
 
 #ifndef STBI_NO_LINEAR
-static float * stbi__ldr_to_hdr(stbi_uc * data, int x, int y, int comp);
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 #endif
 
 #ifndef STBI_NO_HDR
-static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp);
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
 #endif
 
 static int stbi__vertically_flip_on_load_global = 0;
 
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) {
-    stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
 }
 
 #ifndef STBI_THREAD_LOCAL
-#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global
+#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
 #else
 static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
 
-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) {
-    stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
-    stbi__vertically_flip_on_load_set = 1;
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_set = 1;
 }
 
-#define stbi__vertically_flip_on_load                                                                                          \
-    (stbi__vertically_flip_on_load_set ? stbi__vertically_flip_on_load_local : stbi__vertically_flip_on_load_global)
+#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
+                                         ? stbi__vertically_flip_on_load_local  \
+                                         : stbi__vertically_flip_on_load_global)
 #endif // STBI_THREAD_LOCAL
 
-static void * stbi__load_main(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc) {
-    memset(ri, 0, sizeof(*ri));         // make sure it's initialized if we add new fields
-    ri->bits_per_channel = 8;           // default is 8 so most paths don't have to be changed
-    ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
-    ri->num_channels = 0;
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
 
-// test the formats with a very explicit header first (at least a FOURCC
-// or distinctive magic number first)
-#ifndef STBI_NO_PNG
-    if (stbi__png_test(s))
-        return stbi__png_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_BMP
-    if (stbi__bmp_test(s))
-        return stbi__bmp_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_GIF
-    if (stbi__gif_test(s))
-        return stbi__gif_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PSD
-    if (stbi__psd_test(s))
-        return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc);
-#else
-    STBI_NOTUSED(bpc);
-#endif
-#ifndef STBI_NO_PIC
-    if (stbi__pic_test(s))
-        return stbi__pic_load(s, x, y, comp, req_comp, ri);
-#endif
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number first)
+   #ifndef STBI_NO_PNG
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
+   #else
+   STBI_NOTUSED(bpc);
+   #endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
+   #endif
 
-// then the formats that can end up attempting to load with just 1 or 2
-// bytes matching expectations; these are prone to false positives, so
-// try them later
-#ifndef STBI_NO_JPEG
-    if (stbi__jpeg_test(s))
-        return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PNM
-    if (stbi__pnm_test(s))
-        return stbi__pnm_load(s, x, y, comp, req_comp, ri);
-#endif
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
+   #endif
 
-#ifndef STBI_NO_HDR
-    if (stbi__hdr_test(s)) {
-        float * hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri);
-        return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
-    }
-#endif
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
+      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
+   }
+   #endif
 
-#ifndef STBI_NO_TGA
-    // test tga last because it's a crappy test!
-    if (stbi__tga_test(s))
-        return stbi__tga_load(s, x, y, comp, req_comp, ri);
-#endif
+   #ifndef STBI_NO_TGA
+   // test tga last because it's a crappy test!
+   if (stbi__tga_test(s))
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
+   #endif
 
-    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
+   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 }
 
-static stbi_uc * stbi__convert_16_to_8(stbi__uint16 * orig, int w, int h, int channels) {
-    int i;
-    int img_len = w * h * channels;
-    stbi_uc * reduced;
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi_uc *reduced;
 
-    reduced = (stbi_uc *)stbi__malloc(img_len);
-    if (reduced == NULL)
-        return stbi__errpuc("outofmem", "Out of memory");
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
 
-    for (i = 0; i < img_len; ++i)
-        reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
 
-    STBI_FREE(orig);
-    return reduced;
+   STBI_FREE(orig);
+   return reduced;
 }
 
-static stbi__uint16 * stbi__convert_8_to_16(stbi_uc * orig, int w, int h, int channels) {
-    int i;
-    int img_len = w * h * channels;
-    stbi__uint16 * enlarged;
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi__uint16 *enlarged;
 
-    enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2);
-    if (enlarged == NULL)
-        return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
+   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
+   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
 
-    for (i = 0; i < img_len; ++i)
-        enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
 
-    STBI_FREE(orig);
-    return enlarged;
+   STBI_FREE(orig);
+   return enlarged;
 }
 
-static void stbi__vertical_flip(void * image, int w, int h, int bytes_per_pixel) {
-    int row;
-    size_t bytes_per_row = (size_t)w * bytes_per_pixel;
-    stbi_uc temp[2048];
-    stbi_uc * bytes = (stbi_uc *)image;
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
+{
+   int row;
+   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
+   stbi_uc temp[2048];
+   stbi_uc *bytes = (stbi_uc *)image;
 
-    for (row = 0; row < (h >> 1); row++) {
-        stbi_uc * row0 = bytes + row * bytes_per_row;
-        stbi_uc * row1 = bytes + (h - row - 1) * bytes_per_row;
-        // swap row0 with row1
-        size_t bytes_left = bytes_per_row;
-        while (bytes_left) {
-            size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
-            memcpy(temp, row0, bytes_copy);
-            memcpy(row0, row1, bytes_copy);
-            memcpy(row1, temp, bytes_copy);
-            row0 += bytes_copy;
-            row1 += bytes_copy;
-            bytes_left -= bytes_copy;
-        }
-    }
+   for (row = 0; row < (h>>1); row++) {
+      stbi_uc *row0 = bytes + row*bytes_per_row;
+      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
+      // swap row0 with row1
+      size_t bytes_left = bytes_per_row;
+      while (bytes_left) {
+         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
+         memcpy(temp, row0, bytes_copy);
+         memcpy(row0, row1, bytes_copy);
+         memcpy(row1, temp, bytes_copy);
+         row0 += bytes_copy;
+         row1 += bytes_copy;
+         bytes_left -= bytes_copy;
+      }
+   }
 }
 
 #ifndef STBI_NO_GIF
-static void stbi__vertical_flip_slices(void * image, int w, int h, int z, int bytes_per_pixel) {
-    int slice;
-    int slice_size = w * h * bytes_per_pixel;
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
+{
+   int slice;
+   int slice_size = w * h * bytes_per_pixel;
 
-    stbi_uc * bytes = (stbi_uc *)image;
-    for (slice = 0; slice < z; ++slice) {
-        stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
-        bytes += slice_size;
-    }
+   stbi_uc *bytes = (stbi_uc *)image;
+   for (slice = 0; slice < z; ++slice) {
+      stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
+      bytes += slice_size;
+   }
 }
 #endif
 
-static unsigned char * stbi__load_and_postprocess_8bit(stbi__context * s, int * x, int * y, int * comp, int req_comp) {
-    stbi__result_info ri;
-    void * result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
 
-    if (result == NULL)
-        return NULL;
+   if (result == NULL)
+      return NULL;
 
-    // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
-    STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
 
-    if (ri.bits_per_channel != 8) {
-        result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y, req_comp == 0 ? *comp : req_comp);
-        ri.bits_per_channel = 8;
-    }
+   if (ri.bits_per_channel != 8) {
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 8;
+   }
 
-    // @TODO: move stbi__convert_format to here
+   // @TODO: move stbi__convert_format to here
 
-    if (stbi__vertically_flip_on_load) {
-        int channels = req_comp ? req_comp : *comp;
-        stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
-    }
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
 
-    return (unsigned char *)result;
+   return (unsigned char *) result;
 }
 
-static stbi__uint16 * stbi__load_and_postprocess_16bit(stbi__context * s, int * x, int * y, int * comp, int req_comp) {
-    stbi__result_info ri;
-    void * result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
 
-    if (result == NULL)
-        return NULL;
+   if (result == NULL)
+      return NULL;
 
-    // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
-    STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
 
-    if (ri.bits_per_channel != 16) {
-        result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y, req_comp == 0 ? *comp : req_comp);
-        ri.bits_per_channel = 16;
-    }
+   if (ri.bits_per_channel != 16) {
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 16;
+   }
 
-    // @TODO: move stbi__convert_format16 to here
-    // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
 
-    if (stbi__vertically_flip_on_load) {
-        int channels = req_comp ? req_comp : *comp;
-        stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
-    }
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
 
-    return (stbi__uint16 *)result;
+   return (stbi__uint16 *) result;
 }
 
 #if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
-static void stbi__float_postprocess(float * result, int * x, int * y, int * comp, int req_comp) {
-    if (stbi__vertically_flip_on_load && result != NULL) {
-        int channels = req_comp ? req_comp : *comp;
-        stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
-    }
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
+{
+   if (stbi__vertically_flip_on_load && result != NULL) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
+   }
 }
 #endif
 
 #ifndef STBI_NO_STDIO
 
 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char * str,
-                                                                    int cbmb, wchar_t * widestr, int cchwide);
-STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags,
-                                                                    const wchar_t * widestr, int cchwide, char * str, int cbmb,
-                                                                    const char * defchar, int * used_default);
+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
 #endif
 
 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-STBIDEF int stbi_convert_wchar_to_utf8(char * buffer, size_t bufferlen, const wchar_t * input) {
-    return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int)bufferlen, NULL, NULL);
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
 }
 #endif
 
-static FILE * stbi__fopen(char const * filename, char const * mode) {
-    FILE * f;
+static FILE *stbi__fopen(char const *filename, char const *mode)
+{
+   FILE *f;
 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-    wchar_t wMode[64];
-    wchar_t wFilename[1024];
-    if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename) / sizeof(*wFilename)))
-        return 0;
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
 
-    if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode) / sizeof(*wMode)))
-        return 0;
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
 
 #if defined(_MSC_VER) && _MSC_VER >= 1400
-    if (0 != _wfopen_s(&f, wFilename, wMode))
-        f = 0;
+	if (0 != _wfopen_s(&f, wFilename, wMode))
+		f = 0;
 #else
-    f = _wfopen(wFilename, wMode);
+   f = _wfopen(wFilename, wMode);
 #endif
 
 #elif defined(_MSC_VER) && _MSC_VER >= 1400
-    if (0 != fopen_s(&f, filename, mode))
-        f = 0;
+   if (0 != fopen_s(&f, filename, mode))
+      f=0;
 #else
-    f = fopen(filename, mode);
+   f = fopen(filename, mode);
 #endif
-    return f;
+   return f;
 }
 
-STBIDEF stbi_uc * stbi_load(char const * filename, int * x, int * y, int * comp, int req_comp) {
-    FILE * f = stbi__fopen(filename, "rb");
-    unsigned char * result;
-    if (!f)
-        return stbi__errpuc("can't fopen", "Unable to open file");
-    result = stbi_load_from_file(f, x, y, comp, req_comp);
-    fclose(f);
-    return result;
+
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   FILE *f = stbi__fopen(filename, "rb");
+   unsigned char *result;
+   if (!f) return stbi__errpuc("can't fopen", "Unable to open file");
+   result = stbi_load_from_file(f,x,y,comp,req_comp);
+   fclose(f);
+   return result;
 }
 
-STBIDEF stbi_uc * stbi_load_from_file(FILE * f, int * x, int * y, int * comp, int req_comp) {
-    unsigned char * result;
-    stbi__context s;
-    stbi__start_file(&s, f);
-    result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
-    if (result) {
-        // need to 'unget' all the characters in the IO buffer
-        fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
-    }
-    return result;
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
+   }
+   return result;
 }
 
-STBIDEF stbi__uint16 * stbi_load_from_file_16(FILE * f, int * x, int * y, int * comp, int req_comp) {
-    stbi__uint16 * result;
-    stbi__context s;
-    stbi__start_file(&s, f);
-    result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
-    if (result) {
-        // need to 'unget' all the characters in the IO buffer
-        fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
-    }
-    return result;
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
+   }
+   return result;
 }
 
-STBIDEF stbi_us * stbi_load_16(char const * filename, int * x, int * y, int * comp, int req_comp) {
-    FILE * f = stbi__fopen(filename, "rb");
-    stbi__uint16 * result;
-    if (!f)
-        return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file");
-    result = stbi_load_from_file_16(f, x, y, comp, req_comp);
-    fclose(f);
-    return result;
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   FILE *f = stbi__fopen(filename, "rb");
+   stbi__uint16 *result;
+   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
+   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
+   fclose(f);
+   return result;
 }
 
-#endif //! STBI_NO_STDIO
 
-STBIDEF stbi_us * stbi_load_16_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * channels_in_file,
-                                           int desired_channels) {
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
-    return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels);
+#endif //!STBI_NO_STDIO
+
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
 }
 
-STBIDEF stbi_us * stbi_load_16_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y,
-                                              int * channels_in_file, int desired_channels) {
-    stbi__context s;
-    stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-    return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
 }
 
-STBIDEF stbi_uc * stbi_load_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp, int req_comp) {
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
-    return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 }
 
-STBIDEF stbi_uc * stbi_load_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp,
-                                           int req_comp) {
-    stbi__context s;
-    stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-    return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 }
 
 #ifndef STBI_NO_GIF
-STBIDEF stbi_uc * stbi_load_gif_from_memory(stbi_uc const * buffer, int len, int ** delays, int * x, int * y, int * z,
-                                            int * comp, int req_comp) {
-    unsigned char * result;
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
 
-    result = (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-    if (stbi__vertically_flip_on_load) {
-        stbi__vertical_flip_slices(result, *x, *y, *z, *comp);
-    }
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   if (stbi__vertically_flip_on_load) {
+      stbi__vertical_flip_slices( result, *x, *y, *z, *comp );
+   }
 
-    return result;
+   return result;
 }
 #endif
 
 #ifndef STBI_NO_LINEAR
-static float * stbi__loadf_main(stbi__context * s, int * x, int * y, int * comp, int req_comp) {
-    unsigned char * data;
-#ifndef STBI_NO_HDR
-    if (stbi__hdr_test(s)) {
-        stbi__result_info ri;
-        float * hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri);
-        if (hdr_data)
-            stbi__float_postprocess(hdr_data, x, y, comp, req_comp);
-        return hdr_data;
-    }
-#endif
-    data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-    if (data)
-        return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
-    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *data;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
+      if (hdr_data)
+         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
+      return hdr_data;
+   }
+   #endif
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
+   if (data)
+      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
+   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
 }
 
-STBIDEF float * stbi_loadf_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp, int req_comp) {
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
-    return stbi__loadf_main(&s, x, y, comp, req_comp);
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
 }
 
-STBIDEF float * stbi_loadf_from_callbacks(stbi_io_callbacks const * clbk, void * user, int * x, int * y, int * comp,
-                                          int req_comp) {
-    stbi__context s;
-    stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-    return stbi__loadf_main(&s, x, y, comp, req_comp);
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
 }
 
 #ifndef STBI_NO_STDIO
-STBIDEF float * stbi_loadf(char const * filename, int * x, int * y, int * comp, int req_comp) {
-    float * result;
-    FILE * f = stbi__fopen(filename, "rb");
-    if (!f)
-        return stbi__errpf("can't fopen", "Unable to open file");
-    result = stbi_loadf_from_file(f, x, y, comp, req_comp);
-    fclose(f);
-    return result;
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   float *result;
+   FILE *f = stbi__fopen(filename, "rb");
+   if (!f) return stbi__errpf("can't fopen", "Unable to open file");
+   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
+   fclose(f);
+   return result;
 }
 
-STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * comp, int req_comp) {
-    stbi__context s;
-    stbi__start_file(&s, f);
-    return stbi__loadf_main(&s, x, y, comp, req_comp);
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
 }
 #endif // !STBI_NO_STDIO
 
@@ -1468,208 +1514,222 @@ STBIDEF float * stbi_loadf_from_file(FILE * f, int * x, int * y, int * comp, int
 // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
 // reports false!
 
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const * buffer, int len) {
-#ifndef STBI_NO_HDR
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
-    return stbi__hdr_test(&s);
-#else
-    STBI_NOTUSED(buffer);
-    STBI_NOTUSED(len);
-    return 0;
-#endif
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(buffer);
+   STBI_NOTUSED(len);
+   return 0;
+   #endif
 }
 
 #ifndef STBI_NO_STDIO
-STBIDEF int stbi_is_hdr(char const * filename) {
-    FILE * f = stbi__fopen(filename, "rb");
-    int result = 0;
-    if (f) {
-        result = stbi_is_hdr_from_file(f);
-        fclose(f);
-    }
-    return result;
+STBIDEF int      stbi_is_hdr          (char const *filename)
+{
+   FILE *f = stbi__fopen(filename, "rb");
+   int result=0;
+   if (f) {
+      result = stbi_is_hdr_from_file(f);
+      fclose(f);
+   }
+   return result;
 }
 
-STBIDEF int stbi_is_hdr_from_file(FILE * f) {
-#ifndef STBI_NO_HDR
-    long pos = ftell(f);
-    int res;
-    stbi__context s;
-    stbi__start_file(&s, f);
-    res = stbi__hdr_test(&s);
-    fseek(f, pos, SEEK_SET);
-    return res;
-#else
-    STBI_NOTUSED(f);
-    return 0;
-#endif
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
+{
+   #ifndef STBI_NO_HDR
+   long pos = ftell(f);
+   int res;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);
+   return res;
+   #else
+   STBI_NOTUSED(f);
+   return 0;
+   #endif
 }
 #endif // !STBI_NO_STDIO
 
-STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const * clbk, void * user) {
-#ifndef STBI_NO_HDR
-    stbi__context s;
-    stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-    return stbi__hdr_test(&s);
-#else
-    STBI_NOTUSED(clbk);
-    STBI_NOTUSED(user);
-    return 0;
-#endif
+STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
+   return 0;
+   #endif
 }
 
 #ifndef STBI_NO_LINEAR
-static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f;
+static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
 
-STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
-STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 #endif
 
-static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f;
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
 
-STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1 / gamma; }
-STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1 / scale; }
 
 //////////////////////////////////////////////////////////////////////////////
 //
 // Common code used by all image loaders
 //
 
-enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header };
+enum
+{
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,
+   STBI__SCAN_header
+};
 
-static void stbi__refill_buffer(stbi__context * s) {
-    int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen);
-    s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
-    if (n == 0) {
-        // at end of file, treat same as if from memory, but need to handle case
-        // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
-        s->read_from_callbacks = 0;
-        s->img_buffer = s->buffer_start;
-        s->img_buffer_end = s->buffer_start + 1;
-        *s->img_buffer = 0;
-    } else {
-        s->img_buffer = s->buffer_start;
-        s->img_buffer_end = s->buffer_start + n;
-    }
+static void stbi__refill_buffer(stbi__context *s)
+{
+   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
+   if (n == 0) {
+      // at end of file, treat same as if from memory, but need to handle case
+      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+      s->read_from_callbacks = 0;
+      s->img_buffer = s->buffer_start;
+      s->img_buffer_end = s->buffer_start+1;
+      *s->img_buffer = 0;
+   } else {
+      s->img_buffer = s->buffer_start;
+      s->img_buffer_end = s->buffer_start + n;
+   }
 }
 
-stbi_inline static stbi_uc stbi__get8(stbi__context * s) {
-    if (s->img_buffer < s->img_buffer_end)
-        return *s->img_buffer++;
-    if (s->read_from_callbacks) {
-        stbi__refill_buffer(s);
-        return *s->img_buffer++;
-    }
-    return 0;
+stbi_inline static stbi_uc stbi__get8(stbi__context *s)
+{
+   if (s->img_buffer < s->img_buffer_end)
+      return *s->img_buffer++;
+   if (s->read_from_callbacks) {
+      stbi__refill_buffer(s);
+      return *s->img_buffer++;
+   }
+   return 0;
 }
 
 #if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 // nothing
 #else
-stbi_inline static int stbi__at_eof(stbi__context * s) {
-    if (s->io.read) {
-        if (!(s->io.eof)(s->io_user_data))
-            return 0;
-        // if feof() is true, check if buffer = end
-        // special case: we've only got the special 0 character at the end
-        if (s->read_from_callbacks == 0)
-            return 1;
-    }
+stbi_inline static int stbi__at_eof(stbi__context *s)
+{
+   if (s->io.read) {
+      if (!(s->io.eof)(s->io_user_data)) return 0;
+      // if feof() is true, check if buffer = end
+      // special case: we've only got the special 0 character at the end
+      if (s->read_from_callbacks == 0) return 1;
+   }
 
-    return s->img_buffer >= s->img_buffer_end;
+   return s->img_buffer >= s->img_buffer_end;
 }
 #endif
 
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) &&   \
-    defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
 // nothing
 #else
-static void stbi__skip(stbi__context * s, int n) {
-    if (n == 0)
-        return; // already there!
-    if (n < 0) {
-        s->img_buffer = s->img_buffer_end;
-        return;
-    }
-    if (s->io.read) {
-        int blen = (int)(s->img_buffer_end - s->img_buffer);
-        if (blen < n) {
-            s->img_buffer = s->img_buffer_end;
-            (s->io.skip)(s->io_user_data, n - blen);
-            return;
-        }
-    }
-    s->img_buffer += n;
+static void stbi__skip(stbi__context *s, int n)
+{
+   if (n == 0) return;  // already there!
+   if (n < 0) {
+      s->img_buffer = s->img_buffer_end;
+      return;
+   }
+   if (s->io.read) {
+      int blen = (int) (s->img_buffer_end - s->img_buffer);
+      if (blen < n) {
+         s->img_buffer = s->img_buffer_end;
+         (s->io.skip)(s->io_user_data, n - blen);
+         return;
+      }
+   }
+   s->img_buffer += n;
 }
 #endif
 
 #if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
 // nothing
 #else
-static int stbi__getn(stbi__context * s, stbi_uc * buffer, int n) {
-    if (s->io.read) {
-        int blen = (int)(s->img_buffer_end - s->img_buffer);
-        if (blen < n) {
-            int res, count;
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
+{
+   if (s->io.read) {
+      int blen = (int) (s->img_buffer_end - s->img_buffer);
+      if (blen < n) {
+         int res, count;
 
-            memcpy(buffer, s->img_buffer, blen);
+         memcpy(buffer, s->img_buffer, blen);
 
-            count = (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen);
-            res = (count == (n - blen));
-            s->img_buffer = s->img_buffer_end;
-            return res;
-        }
-    }
+         count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
+         res = (count == (n-blen));
+         s->img_buffer = s->img_buffer_end;
+         return res;
+      }
+   }
 
-    if (s->img_buffer + n <= s->img_buffer_end) {
-        memcpy(buffer, s->img_buffer, n);
-        s->img_buffer += n;
-        return 1;
-    } else
-        return 0;
+   if (s->img_buffer+n <= s->img_buffer_end) {
+      memcpy(buffer, s->img_buffer, n);
+      s->img_buffer += n;
+      return 1;
+   } else
+      return 0;
 }
 #endif
 
 #if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
 // nothing
 #else
-static int stbi__get16be(stbi__context * s) {
-    int z = stbi__get8(s);
-    return (z << 8) + stbi__get8(s);
+static int stbi__get16be(stbi__context *s)
+{
+   int z = stbi__get8(s);
+   return (z << 8) + stbi__get8(s);
 }
 #endif
 
 #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
 // nothing
 #else
-static stbi__uint32 stbi__get32be(stbi__context * s) {
-    stbi__uint32 z = stbi__get16be(s);
-    return (z << 16) + stbi__get16be(s);
+static stbi__uint32 stbi__get32be(stbi__context *s)
+{
+   stbi__uint32 z = stbi__get16be(s);
+   return (z << 16) + stbi__get16be(s);
 }
 #endif
 
 #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
 // nothing
 #else
-static int stbi__get16le(stbi__context * s) {
-    int z = stbi__get8(s);
-    return z + (stbi__get8(s) << 8);
+static int stbi__get16le(stbi__context *s)
+{
+   int z = stbi__get8(s);
+   return z + (stbi__get8(s) << 8);
 }
 #endif
 
 #ifndef STBI_NO_BMP
-static stbi__uint32 stbi__get32le(stbi__context * s) {
-    stbi__uint32 z = stbi__get16le(s);
-    z += (stbi__uint32)stbi__get16le(s) << 16;
-    return z;
+static stbi__uint32 stbi__get32le(stbi__context *s)
+{
+   stbi__uint32 z = stbi__get16le(s);
+   z += (stbi__uint32)stbi__get16le(s) << 16;
+   return z;
 }
 #endif
 
-#define STBI__BYTECAST(x) ((stbi_uc)((x)&255)) // truncate int to byte without warnings
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) &&   \
-    defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 // nothing
 #else
 //////////////////////////////////////////////////////////////////////////////
@@ -1683,264 +1743,169 @@ static stbi__uint32 stbi__get32le(stbi__context * s) {
 //  assume data buffer is malloced, so malloc a new one and free that one
 //  only failure mode is malloc failing
 
-static stbi_uc stbi__compute_y(int r, int g, int b) { return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); }
+static stbi_uc stbi__compute_y(int r, int g, int b)
+{
+   return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
+}
 #endif
 
-#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) &&    \
-    defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 // nothing
 #else
-static unsigned char * stbi__convert_format(unsigned char * data, int img_n, int req_comp, unsigned int x, unsigned int y) {
-    int i, j;
-    unsigned char * good;
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   unsigned char *good;
 
-    if (req_comp == img_n)
-        return data;
-    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-    good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0);
-    if (good == NULL) {
-        STBI_FREE(data);
-        return stbi__errpuc("outofmem", "Out of memory");
-    }
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
 
-    for (j = 0; j < (int)y; ++j) {
-        unsigned char * src = data + j * x * img_n;
-        unsigned char * dest = good + j * x * req_comp;
+   for (j=0; j < (int) y; ++j) {
+      unsigned char *src  = data + j * x * img_n   ;
+      unsigned char *dest = good + j * x * req_comp;
 
-#define STBI__COMBO(a, b) ((a)*8 + (b))
-#define STBI__CASE(a, b)                                                                                                       \
-    case STBI__COMBO(a, b):                                                                                                    \
-        for (i = x - 1; i >= 0; --i, src += a, dest += b)
-        // convert source image with img_n components to one with req_comp components;
-        // avoid switch per pixel, so use switch per scanline and massive macros
-        switch (STBI__COMBO(img_n, req_comp)) {
-            STBI__CASE(1, 2) {
-                dest[0] = src[0];
-                dest[1] = 255;
-            }
-            break;
-            STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-            break;
-            STBI__CASE(1, 4) {
-                dest[0] = dest[1] = dest[2] = src[0];
-                dest[3] = 255;
-            }
-            break;
-            STBI__CASE(2, 1) { dest[0] = src[0]; }
-            break;
-            STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-            break;
-            STBI__CASE(2, 4) {
-                dest[0] = dest[1] = dest[2] = src[0];
-                dest[3] = src[1];
-            }
-            break;
-            STBI__CASE(3, 4) {
-                dest[0] = src[0];
-                dest[1] = src[1];
-                dest[2] = src[2];
-                dest[3] = 255;
-            }
-            break;
-            STBI__CASE(3, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); }
-            break;
-            STBI__CASE(3, 2) {
-                dest[0] = stbi__compute_y(src[0], src[1], src[2]);
-                dest[1] = 255;
-            }
-            break;
-            STBI__CASE(4, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); }
-            break;
-            STBI__CASE(4, 2) {
-                dest[0] = stbi__compute_y(src[0], src[1], src[2]);
-                dest[1] = src[3];
-            }
-            break;
-            STBI__CASE(4, 3) {
-                dest[0] = src[0];
-                dest[1] = src[1];
-                dest[2] = src[2];
-            }
-            break;
-        default:
-            STBI_ASSERT(0);
-            STBI_FREE(data);
-            STBI_FREE(good);
-            return stbi__errpuc("unsupported", "Unsupported format conversion");
-        }
-#undef STBI__CASE
-    }
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
 
-    STBI_FREE(data);
-    return good;
+   STBI_FREE(data);
+   return good;
 }
 #endif
 
 #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
 // nothing
 #else
-static stbi__uint16 stbi__compute_y_16(int r, int g, int b) { return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8); }
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
+}
 #endif
 
 #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
 // nothing
 #else
-static stbi__uint16 * stbi__convert_format16(stbi__uint16 * data, int img_n, int req_comp, unsigned int x, unsigned int y) {
-    int i, j;
-    stbi__uint16 * good;
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
 
-    if (req_comp == img_n)
-        return data;
-    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-    good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2);
-    if (good == NULL) {
-        STBI_FREE(data);
-        return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
-    }
+   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
 
-    for (j = 0; j < (int)y; ++j) {
-        stbi__uint16 * src = data + j * x * img_n;
-        stbi__uint16 * dest = good + j * x * req_comp;
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
 
-#define STBI__COMBO(a, b) ((a)*8 + (b))
-#define STBI__CASE(a, b)                                                                                                       \
-    case STBI__COMBO(a, b):                                                                                                    \
-        for (i = x - 1; i >= 0; --i, src += a, dest += b)
-        // convert source image with img_n components to one with req_comp components;
-        // avoid switch per pixel, so use switch per scanline and massive macros
-        switch (STBI__COMBO(img_n, req_comp)) {
-            STBI__CASE(1, 2) {
-                dest[0] = src[0];
-                dest[1] = 0xffff;
-            }
-            break;
-            STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-            break;
-            STBI__CASE(1, 4) {
-                dest[0] = dest[1] = dest[2] = src[0];
-                dest[3] = 0xffff;
-            }
-            break;
-            STBI__CASE(2, 1) { dest[0] = src[0]; }
-            break;
-            STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-            break;
-            STBI__CASE(2, 4) {
-                dest[0] = dest[1] = dest[2] = src[0];
-                dest[3] = src[1];
-            }
-            break;
-            STBI__CASE(3, 4) {
-                dest[0] = src[0];
-                dest[1] = src[1];
-                dest[2] = src[2];
-                dest[3] = 0xffff;
-            }
-            break;
-            STBI__CASE(3, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); }
-            break;
-            STBI__CASE(3, 2) {
-                dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
-                dest[1] = 0xffff;
-            }
-            break;
-            STBI__CASE(4, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); }
-            break;
-            STBI__CASE(4, 2) {
-                dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
-                dest[1] = src[3];
-            }
-            break;
-            STBI__CASE(4, 3) {
-                dest[0] = src[0];
-                dest[1] = src[1];
-                dest[2] = src[2];
-            }
-            break;
-        default:
-            STBI_ASSERT(0);
-            STBI_FREE(data);
-            STBI_FREE(good);
-            return (stbi__uint16 *)stbi__errpuc("unsupported", "Unsupported format conversion");
-        }
-#undef STBI__CASE
-    }
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
 
-    STBI_FREE(data);
-    return good;
+   STBI_FREE(data);
+   return good;
 }
 #endif
 
 #ifndef STBI_NO_LINEAR
-static float * stbi__ldr_to_hdr(stbi_uc * data, int x, int y, int comp) {
-    int i, k, n;
-    float * output;
-    if (!data)
-        return NULL;
-    output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-    if (output == NULL) {
-        STBI_FREE(data);
-        return stbi__errpf("outofmem", "Out of memory");
-    }
-    // compute number of non-alpha components
-    if (comp & 1)
-        n = comp;
-    else
-        n = comp - 1;
-    for (i = 0; i < x * y; ++i) {
-        for (k = 0; k < n; ++k) {
-            output[i * comp + k] = (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
-        }
-    }
-    if (n < comp) {
-        for (i = 0; i < x * y; ++i) {
-            output[i * comp + n] = data[i * comp + n] / 255.0f;
-        }
-    }
-    STBI_FREE(data);
-    return output;
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
+{
+   int i,k,n;
+   float *output;
+   if (!data) return NULL;
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
+   // compute number of non-alpha components
+   if (comp & 1) n = comp; else n = comp-1;
+   for (i=0; i < x*y; ++i) {
+      for (k=0; k < n; ++k) {
+         output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+      }
+   }
+   if (n < comp) {
+      for (i=0; i < x*y; ++i) {
+         output[i*comp + n] = data[i*comp + n]/255.0f;
+      }
+   }
+   STBI_FREE(data);
+   return output;
 }
 #endif
 
 #ifndef STBI_NO_HDR
-#define stbi__float2int(x) ((int)(x))
-static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp) {
-    int i, k, n;
-    stbi_uc * output;
-    if (!data)
-        return NULL;
-    output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0);
-    if (output == NULL) {
-        STBI_FREE(data);
-        return stbi__errpuc("outofmem", "Out of memory");
-    }
-    // compute number of non-alpha components
-    if (comp & 1)
-        n = comp;
-    else
-        n = comp - 1;
-    for (i = 0; i < x * y; ++i) {
-        for (k = 0; k < n; ++k) {
-            float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
-            if (z < 0)
-                z = 0;
-            if (z > 255)
-                z = 255;
-            output[i * comp + k] = (stbi_uc)stbi__float2int(z);
-        }
-        if (k < comp) {
-            float z = data[i * comp + k] * 255 + 0.5f;
-            if (z < 0)
-                z = 0;
-            if (z > 255)
-                z = 255;
-            output[i * comp + k] = (stbi_uc)stbi__float2int(z);
-        }
-    }
-    STBI_FREE(data);
-    return output;
+#define stbi__float2int(x)   ((int) (x))
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
+{
+   int i,k,n;
+   stbi_uc *output;
+   if (!data) return NULL;
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
+   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
+   // compute number of non-alpha components
+   if (comp & 1) n = comp; else n = comp-1;
+   for (i=0; i < x*y; ++i) {
+      for (k=0; k < n; ++k) {
+         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
+      }
+      if (k < comp) {
+         float z = data[i*comp+k] * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
+      }
+   }
+   STBI_FREE(data);
+   return output;
 }
 #endif
 
@@ -1968,783 +1933,763 @@ static stbi_uc * stbi__hdr_to_ldr(float * data, int x, int y, int comp) {
 #ifndef STBI_NO_JPEG
 
 // huffman decoding acceleration
-#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
 
-typedef struct {
-    stbi_uc fast[1 << FAST_BITS];
-    // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
-    stbi__uint16 code[256];
-    stbi_uc values[256];
-    stbi_uc size[257];
-    unsigned int maxcode[18];
-    int delta[17]; // old 'firstsymbol' - old 'firstcode'
+typedef struct
+{
+   stbi_uc  fast[1 << FAST_BITS];
+   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+   stbi__uint16 code[256];
+   stbi_uc  values[256];
+   stbi_uc  size[257];
+   unsigned int maxcode[18];
+   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
 } stbi__huffman;
 
-typedef struct {
-    stbi__context * s;
-    stbi__huffman huff_dc[4];
-    stbi__huffman huff_ac[4];
-    stbi__uint16 dequant[4][64];
-    stbi__int16 fast_ac[4][1 << FAST_BITS];
+typedef struct
+{
+   stbi__context *s;
+   stbi__huffman huff_dc[4];
+   stbi__huffman huff_ac[4];
+   stbi__uint16 dequant[4][64];
+   stbi__int16 fast_ac[4][1 << FAST_BITS];
 
-    // sizes for components, interleaved MCUs
-    int img_h_max, img_v_max;
-    int img_mcu_x, img_mcu_y;
-    int img_mcu_w, img_mcu_h;
+// sizes for components, interleaved MCUs
+   int img_h_max, img_v_max;
+   int img_mcu_x, img_mcu_y;
+   int img_mcu_w, img_mcu_h;
 
-    // definition of jpeg image component
-    struct {
-        int id;
-        int h, v;
-        int tq;
-        int hd, ha;
-        int dc_pred;
+// definition of jpeg image component
+   struct
+   {
+      int id;
+      int h,v;
+      int tq;
+      int hd,ha;
+      int dc_pred;
 
-        int x, y, w2, h2;
-        stbi_uc * data;
-        void *raw_data, *raw_coeff;
-        stbi_uc * linebuf;
-        short * coeff;        // progressive only
-        int coeff_w, coeff_h; // number of 8x8 coefficient blocks
-    } img_comp[4];
+      int x,y,w2,h2;
+      stbi_uc *data;
+      void *raw_data, *raw_coeff;
+      stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+   } img_comp[4];
 
-    stbi__uint32 code_buffer; // jpeg entropy-coded buffer
-    int code_bits;            // number of valid bits
-    unsigned char marker;     // marker seen while filling entropy buffer
-    int nomore;               // flag if we saw a marker so must stop
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
+   int            code_bits;   // number of valid bits
+   unsigned char  marker;      // marker seen while filling entropy buffer
+   int            nomore;      // flag if we saw a marker so must stop
 
-    int progressive;
-    int spec_start;
-    int spec_end;
-    int succ_high;
-    int succ_low;
-    int eob_run;
-    int jfif;
-    int app14_color_transform; // Adobe APP14 tag
-    int rgb;
+   int            progressive;
+   int            spec_start;
+   int            spec_end;
+   int            succ_high;
+   int            succ_low;
+   int            eob_run;
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
 
-    int scan_n, order[4];
-    int restart_interval, todo;
+   int scan_n, order[4];
+   int restart_interval, todo;
 
-    // kernels
-    void (*idct_block_kernel)(stbi_uc * out, int out_stride, short data[64]);
-    void (*YCbCr_to_RGB_kernel)(stbi_uc * out, const stbi_uc * y, const stbi_uc * pcb, const stbi_uc * pcr, int count,
-                                int step);
-    stbi_uc * (*resample_row_hv_2_kernel)(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs);
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
 } stbi__jpeg;
 
-static int stbi__build_huffman(stbi__huffman * h, int * count) {
-    int i, j, k = 0;
-    unsigned int code;
-    // build size list for each symbol (from JPEG spec)
-    for (i = 0; i < 16; ++i) {
-        for (j = 0; j < count[i]; ++j) {
-            h->size[k++] = (stbi_uc)(i + 1);
-            if (k >= 257)
-                return stbi__err("bad size list", "Corrupt JPEG");
-        }
-    }
-    h->size[k] = 0;
+static int stbi__build_huffman(stbi__huffman *h, int *count)
+{
+   int i,j,k=0;
+   unsigned int code;
+   // build size list for each symbol (from JPEG spec)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
+         h->size[k++] = (stbi_uc) (i+1);
+         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
+      }
+   }
+   h->size[k] = 0;
 
-    // compute actual symbols (from jpeg spec)
-    code = 0;
-    k = 0;
-    for (j = 1; j <= 16; ++j) {
-        // compute delta to add to code to compute symbol id
-        h->delta[j] = k - code;
-        if (h->size[k] == j) {
-            while (h->size[k] == j)
-                h->code[k++] = (stbi__uint16)(code++);
-            if (code - 1 >= (1u << j))
-                return stbi__err("bad code lengths", "Corrupt JPEG");
-        }
-        // compute largest code + 1 for this size, preshifted as needed later
-        h->maxcode[j] = code << (16 - j);
-        code <<= 1;
-    }
-    h->maxcode[j] = 0xffffffff;
+   // compute actual symbols (from jpeg spec)
+   code = 0;
+   k = 0;
+   for(j=1; j <= 16; ++j) {
+      // compute delta to add to code to compute symbol id
+      h->delta[j] = k - code;
+      if (h->size[k] == j) {
+         while (h->size[k] == j)
+            h->code[k++] = (stbi__uint16) (code++);
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+      }
+      // compute largest code + 1 for this size, preshifted as needed later
+      h->maxcode[j] = code << (16-j);
+      code <<= 1;
+   }
+   h->maxcode[j] = 0xffffffff;
 
-    // build non-spec acceleration table; 255 is flag for not-accelerated
-    memset(h->fast, 255, 1 << FAST_BITS);
-    for (i = 0; i < k; ++i) {
-        int s = h->size[i];
-        if (s <= FAST_BITS) {
-            int c = h->code[i] << (FAST_BITS - s);
-            int m = 1 << (FAST_BITS - s);
-            for (j = 0; j < m; ++j) {
-                h->fast[c + j] = (stbi_uc)i;
-            }
-        }
-    }
-    return 1;
+   // build non-spec acceleration table; 255 is flag for not-accelerated
+   memset(h->fast, 255, 1 << FAST_BITS);
+   for (i=0; i < k; ++i) {
+      int s = h->size[i];
+      if (s <= FAST_BITS) {
+         int c = h->code[i] << (FAST_BITS-s);
+         int m = 1 << (FAST_BITS-s);
+         for (j=0; j < m; ++j) {
+            h->fast[c+j] = (stbi_uc) i;
+         }
+      }
+   }
+   return 1;
 }
 
 // build a table that decodes both magnitude and value of small ACs in
 // one go.
-static void stbi__build_fast_ac(stbi__int16 * fast_ac, stbi__huffman * h) {
-    int i;
-    for (i = 0; i < (1 << FAST_BITS); ++i) {
-        stbi_uc fast = h->fast[i];
-        fast_ac[i] = 0;
-        if (fast < 255) {
-            int rs = h->values[fast];
-            int run = (rs >> 4) & 15;
-            int magbits = rs & 15;
-            int len = h->size[fast];
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{
+   int i;
+   for (i=0; i < (1 << FAST_BITS); ++i) {
+      stbi_uc fast = h->fast[i];
+      fast_ac[i] = 0;
+      if (fast < 255) {
+         int rs = h->values[fast];
+         int run = (rs >> 4) & 15;
+         int magbits = rs & 15;
+         int len = h->size[fast];
 
-            if (magbits && len + magbits <= FAST_BITS) {
-                // magnitude code followed by receive_extend code
-                int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
-                int m = 1 << (magbits - 1);
-                if (k < m)
-                    k += (~0U << magbits) + 1;
-                // if the result is small enough, we can fit it in fast_ac table
-                if (k >= -128 && k <= 127)
-                    fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits));
-            }
-        }
-    }
+         if (magbits && len + magbits <= FAST_BITS) {
+            // magnitude code followed by receive_extend code
+            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+            int m = 1 << (magbits - 1);
+            if (k < m) k += (~0U << magbits) + 1;
+            // if the result is small enough, we can fit it in fast_ac table
+            if (k >= -128 && k <= 127)
+               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
+         }
+      }
+   }
 }
 
-static void stbi__grow_buffer_unsafe(stbi__jpeg * j) {
-    do {
-        unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-        if (b == 0xff) {
-            int c = stbi__get8(j->s);
-            while (c == 0xff)
-                c = stbi__get8(j->s); // consume fill bytes
-            if (c != 0) {
-                j->marker = (unsigned char)c;
-                j->nomore = 1;
-                return;
-            }
-        }
-        j->code_buffer |= b << (24 - j->code_bits);
-        j->code_bits += 8;
-    } while (j->code_bits <= 24);
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
+{
+   do {
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
+      if (b == 0xff) {
+         int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
+         if (c != 0) {
+            j->marker = (unsigned char) c;
+            j->nomore = 1;
+            return;
+         }
+      }
+      j->code_buffer |= b << (24 - j->code_bits);
+      j->code_bits += 8;
+   } while (j->code_bits <= 24);
 }
 
 // (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17] = {0,   1,    3,    7,    15,   31,    63,    127,  255,
-                                             511, 1023, 2047, 4095, 8191, 16383, 32767, 65535};
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 
 // decode a jpeg huffman value from the bitstream
-stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg * j, stbi__huffman * h) {
-    unsigned int temp;
-    int c, k;
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
+{
+   unsigned int temp;
+   int c,k;
 
-    if (j->code_bits < 16)
-        stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 
-    // look at the top FAST_BITS and determine what symbol ID it is,
-    // if the code is <= FAST_BITS
-    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
-    k = h->fast[c];
-    if (k < 255) {
-        int s = h->size[k];
-        if (s > j->code_bits)
-            return -1;
-        j->code_buffer <<= s;
-        j->code_bits -= s;
-        return h->values[k];
-    }
+   // look at the top FAST_BITS and determine what symbol ID it is,
+   // if the code is <= FAST_BITS
+   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+   k = h->fast[c];
+   if (k < 255) {
+      int s = h->size[k];
+      if (s > j->code_bits)
+         return -1;
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      return h->values[k];
+   }
 
-    // naive test is to shift the code_buffer down so k bits are
-    // valid, then test against maxcode. To speed this up, we've
-    // preshifted maxcode left so that it has (16-k) 0s at the
-    // end; in other words, regardless of the number of bits, it
-    // wants to be compared against something shifted to have 16;
-    // that way we don't need to shift inside the loop.
-    temp = j->code_buffer >> 16;
-    for (k = FAST_BITS + 1;; ++k)
-        if (temp < h->maxcode[k])
-            break;
-    if (k == 17) {
-        // error! code not found
-        j->code_bits -= 16;
-        return -1;
-    }
+   // naive test is to shift the code_buffer down so k bits are
+   // valid, then test against maxcode. To speed this up, we've
+   // preshifted maxcode left so that it has (16-k) 0s at the
+   // end; in other words, regardless of the number of bits, it
+   // wants to be compared against something shifted to have 16;
+   // that way we don't need to shift inside the loop.
+   temp = j->code_buffer >> 16;
+   for (k=FAST_BITS+1 ; ; ++k)
+      if (temp < h->maxcode[k])
+         break;
+   if (k == 17) {
+      // error! code not found
+      j->code_bits -= 16;
+      return -1;
+   }
 
-    if (k > j->code_bits)
-        return -1;
+   if (k > j->code_bits)
+      return -1;
 
-    // convert the huffman code to the symbol id
-    c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
-    if (c < 0 || c >= 256) // symbol id out of bounds!
-        return -1;
-    STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if(c < 0 || c >= 256) // symbol id out of bounds!
+       return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
 
-    // convert the id to a symbol
-    j->code_bits -= k;
-    j->code_buffer <<= k;
-    return h->values[c];
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
 }
 
 // bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767};
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
-stbi_inline static int stbi__extend_receive(stbi__jpeg * j, int n) {
-    unsigned int k;
-    int sgn;
-    if (j->code_bits < n)
-        stbi__grow_buffer_unsafe(j);
-    if (j->code_bits < n)
-        return 0; // ran out of bits from stream, return 0s intead of continuing
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
 
-    sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
-    k = stbi_lrot(j->code_buffer, n);
-    j->code_buffer = k & ~stbi__bmask[n];
-    k &= stbi__bmask[n];
-    j->code_bits -= n;
-    return k + (stbi__jbias[n] & (sgn - 1));
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1));
 }
 
 // get some unsigned bits
-stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg * j, int n) {
-    unsigned int k;
-    if (j->code_bits < n)
-        stbi__grow_buffer_unsafe(j);
-    if (j->code_bits < n)
-        return 0; // ran out of bits from stream, return 0s intead of continuing
-    k = stbi_lrot(j->code_buffer, n);
-    j->code_buffer = k & ~stbi__bmask[n];
-    k &= stbi__bmask[n];
-    j->code_bits -= n;
-    return k;
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k;
 }
 
-stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg * j) {
-    unsigned int k;
-    if (j->code_bits < 1)
-        stbi__grow_buffer_unsafe(j);
-    if (j->code_bits < 1)
-        return 0; // ran out of bits from stream, return 0s intead of continuing
-    k = j->code_buffer;
-    j->code_buffer <<= 1;
-    --j->code_bits;
-    return k & 0x80000000;
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
 }
 
 // given a value that's at position X in the zigzag stream,
 // where does it appear in the 8x8 matrix coded as row-major?
-static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = {
-    0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35,
-    42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
-    // let corrupt input sample past end
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63};
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg * j, short data[64], stbi__huffman * hdc, stbi__huffman * hac, stbi__int16 * fac,
-                                   int b, stbi__uint16 * dequant) {
-    int diff, dc, k;
-    int t;
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
 
-    if (j->code_bits < 16)
-        stbi__grow_buffer_unsafe(j);
-    t = stbi__jpeg_huff_decode(j, hdc);
-    if (t < 0 || t > 15)
-        return stbi__err("bad huffman code", "Corrupt JPEG");
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
 
-    // 0 all the ac values now so we can do it 32-bits at a time
-    memset(data, 0, 64 * sizeof(data[0]));
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
 
-    diff = t ? stbi__extend_receive(j, t) : 0;
-    if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff))
-        return stbi__err("bad delta", "Corrupt JPEG");
-    dc = j->img_comp[b].dc_pred + diff;
-    j->img_comp[b].dc_pred = dc;
-    if (!stbi__mul2shorts_valid(dc, dequant[0]))
-        return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-    data[0] = (short)(dc * dequant[0]);
+   diff = t ? stbi__extend_receive(j, t) : 0;
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+   data[0] = (short) (dc * dequant[0]);
 
-    // decode AC components, see JPEG spec
-    k = 1;
-    do {
-        unsigned int zig;
-        int c, r, s;
-        if (j->code_bits < 16)
-            stbi__grow_buffer_unsafe(j);
-        c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
-        r = fac[c];
-        if (r) {                // fast-AC path
-            k += (r >> 4) & 15; // run
-            s = r & 15;         // combined length
-            if (s > j->code_bits)
-                return stbi__err("bad huffman code", "Combined length longer than code bits available");
-            j->code_buffer <<= s;
-            j->code_bits -= s;
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;
+         r = rs >> 4;
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16;
+         } else {
+            k += r;
             // decode into unzigzag'd location
             zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short)((r >> 8) * dequant[zig]);
-        } else {
-            int rs = stbi__jpeg_huff_decode(j, hac);
-            if (rs < 0)
-                return stbi__err("bad huffman code", "Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-                if (rs != 0xf0)
-                    break; // end block
-                k += 16;
-            } else {
-                k += r;
-                // decode into unzigzag'd location
-                zig = stbi__jpeg_dezigzag[k++];
-                data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
-            }
-        }
-    } while (k < 64);
-    return 1;
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
 }
 
-static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg * j, short data[64], stbi__huffman * hdc, int b) {
-    int diff, dc;
-    int t;
-    if (j->spec_end != 0)
-        return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-    if (j->code_bits < 16)
-        stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 
-    if (j->succ_high == 0) {
-        // first scan for DC coefficient, must be first
-        memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
-        t = stbi__jpeg_huff_decode(j, hdc);
-        if (t < 0 || t > 15)
-            return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-        diff = t ? stbi__extend_receive(j, t) : 0;
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      diff = t ? stbi__extend_receive(j, t) : 0;
 
-        if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff))
-            return stbi__err("bad delta", "Corrupt JPEG");
-        dc = j->img_comp[b].dc_pred + diff;
-        j->img_comp[b].dc_pred = dc;
-        if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low))
-            return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-        data[0] = (short)(dc * (1 << j->succ_low));
-    } else {
-        // refinement scan for DC coefficient
-        if (stbi__jpeg_get_bit(j))
-            data[0] += (short)(1 << j->succ_low);
-    }
-    return 1;
+      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      data[0] = (short) (dc * (1 << j->succ_low));
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
 }
 
 // @OPTIMIZE: store non-zigzagged during the decode passes,
 // and only de-zigzag when dequantizing
-static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg * j, short data[64], stbi__huffman * hac, stbi__int16 * fac) {
-    int k;
-    if (j->spec_start == 0)
-        return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 
-    if (j->succ_high == 0) {
-        int shift = j->succ_low;
+   if (j->succ_high == 0) {
+      int shift = j->succ_low;
 
-        if (j->eob_run) {
-            --j->eob_run;
-            return 1;
-        }
+      if (j->eob_run) {
+         --j->eob_run;
+         return 1;
+      }
 
-        k = j->spec_start;
-        do {
-            unsigned int zig;
-            int c, r, s;
-            if (j->code_bits < 16)
-                stbi__grow_buffer_unsafe(j);
-            c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
-            r = fac[c];
-            if (r) {                // fast-AC path
-                k += (r >> 4) & 15; // run
-                s = r & 15;         // combined length
-                if (s > j->code_bits)
-                    return stbi__err("bad huffman code", "Combined length longer than code bits available");
-                j->code_buffer <<= s;
-                j->code_bits -= s;
-                zig = stbi__jpeg_dezigzag[k++];
-                data[zig] = (short)((r >> 8) * (1 << shift));
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) * (1 << shift));
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16;
             } else {
-                int rs = stbi__jpeg_huff_decode(j, hac);
-                if (rs < 0)
-                    return stbi__err("bad huffman code", "Corrupt JPEG");
-                s = rs & 15;
-                r = rs >> 4;
-                if (s == 0) {
-                    if (r < 15) {
-                        j->eob_run = (1 << r);
-                        if (r)
-                            j->eob_run += stbi__jpeg_get_bits(j, r);
-                        --j->eob_run;
-                        break;
-                    }
-                    k += 16;
-                } else {
-                    k += r;
-                    zig = stbi__jpeg_dezigzag[k++];
-                    data[zig] = (short)(stbi__extend_receive(j, s) * (1 << shift));
-                }
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
             }
-        } while (k <= j->spec_end);
-    } else {
-        // refinement scan for these AC coefficients
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
 
-        short bit = (short)(1 << j->succ_low);
+      short bit = (short) (1 << j->succ_low);
 
-        if (j->eob_run) {
-            --j->eob_run;
-            for (k = j->spec_start; k <= j->spec_end; ++k) {
-                short * p = &data[stbi__jpeg_dezigzag[k]];
-                if (*p != 0)
-                    if (stbi__jpeg_get_bit(j))
-                        if ((*p & bit) == 0) {
-                            if (*p > 0)
-                                *p += bit;
-                            else
-                                *p -= bit;
-                        }
+      if (j->eob_run) {
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
             }
-        } else {
-            k = j->spec_start;
-            do {
-                int r, s;
-                int rs = stbi__jpeg_huff_decode(
-                    j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
-                if (rs < 0)
-                    return stbi__err("bad huffman code", "Corrupt JPEG");
-                s = rs & 15;
-                r = rs >> 4;
-                if (s == 0) {
-                    if (r < 15) {
-                        j->eob_run = (1 << r) - 1;
-                        if (r)
-                            j->eob_run += stbi__jpeg_get_bits(j, r);
-                        r = 64; // force end of block
-                    } else {
-                        // r=15 s=0 should write 16 0s, so we just do
-                        // a run of 15 0s and then write s (which is 0),
-                        // so we don't have to do anything special here
-                    }
-                } else {
-                    if (s != 1)
-                        return stbi__err("bad huffman code", "Corrupt JPEG");
-                    // sign bit
-                    if (stbi__jpeg_get_bit(j))
-                        s = bit;
-                    else
-                        s = -bit;
-                }
 
-                // advance by r
-                while (k <= j->spec_end) {
-                    short * p = &data[stbi__jpeg_dezigzag[k++]];
-                    if (*p != 0) {
-                        if (stbi__jpeg_get_bit(j))
-                            if ((*p & bit) == 0) {
-                                if (*p > 0)
-                                    *p += bit;
-                                else
-                                    *p -= bit;
-                            }
-                    } else {
-                        if (r == 0) {
-                            *p = (short)s;
-                            break;
-                        }
-                        --r;
-                    }
-                }
-            } while (k <= j->spec_end);
-        }
-    }
-    return 1;
+            // advance by r
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) {
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) {
+                     *p = (short) s;
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
 }
 
 // take a -128..127 value and stbi__clamp it and convert to 0..255
-stbi_inline static stbi_uc stbi__clamp(int x) {
-    // trick to use a single test to catch both cases
-    if ((unsigned int)x > 255) {
-        if (x < 0)
-            return 0;
-        if (x > 255)
-            return 255;
-    }
-    return (stbi_uc)x;
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   // trick to use a single test to catch both cases
+   if ((unsigned int) x > 255) {
+      if (x < 0) return 0;
+      if (x > 255) return 255;
+   }
+   return (stbi_uc) x;
 }
 
-#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
-#define stbi__fsh(x) ((x)*4096)
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
+#define stbi__fsh(x)  ((x) * 4096)
 
 // derived from jidctint -- DCT_ISLOW
-#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7)                                                                          \
-    int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3;                                                                    \
-    p2 = s2;                                                                                                                   \
-    p3 = s6;                                                                                                                   \
-    p1 = (p2 + p3) * stbi__f2f(0.5411961f);                                                                                    \
-    t2 = p1 + p3 * stbi__f2f(-1.847759065f);                                                                                   \
-    t3 = p1 + p2 * stbi__f2f(0.765366865f);                                                                                    \
-    p2 = s0;                                                                                                                   \
-    p3 = s4;                                                                                                                   \
-    t0 = stbi__fsh(p2 + p3);                                                                                                   \
-    t1 = stbi__fsh(p2 - p3);                                                                                                   \
-    x0 = t0 + t3;                                                                                                              \
-    x3 = t0 - t3;                                                                                                              \
-    x1 = t1 + t2;                                                                                                              \
-    x2 = t1 - t2;                                                                                                              \
-    t0 = s7;                                                                                                                   \
-    t1 = s5;                                                                                                                   \
-    t2 = s3;                                                                                                                   \
-    t3 = s1;                                                                                                                   \
-    p3 = t0 + t2;                                                                                                              \
-    p4 = t1 + t3;                                                                                                              \
-    p1 = t0 + t3;                                                                                                              \
-    p2 = t1 + t2;                                                                                                              \
-    p5 = (p3 + p4) * stbi__f2f(1.175875602f);                                                                                  \
-    t0 = t0 * stbi__f2f(0.298631336f);                                                                                         \
-    t1 = t1 * stbi__f2f(2.053119869f);                                                                                         \
-    t2 = t2 * stbi__f2f(3.072711026f);                                                                                         \
-    t3 = t3 * stbi__f2f(1.501321110f);                                                                                         \
-    p1 = p5 + p1 * stbi__f2f(-0.899976223f);                                                                                   \
-    p2 = p5 + p2 * stbi__f2f(-2.562915447f);                                                                                   \
-    p3 = p3 * stbi__f2f(-1.961570560f);                                                                                        \
-    p4 = p4 * stbi__f2f(-0.390180644f);                                                                                        \
-    t3 += p1 + p4;                                                                                                             \
-    t2 += p2 + p3;                                                                                                             \
-    t1 += p2 + p4;                                                                                                             \
-    t0 += p1 + p3;
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
 
-static void stbi__idct_block(stbi_uc * out, int out_stride, short data[64]) {
-    int i, val[64], *v = val;
-    stbi_uc * o;
-    short * d = data;
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{
+   int i,val[64],*v=val;
+   stbi_uc *o;
+   short *d = data;
 
-    // columns
-    for (i = 0; i < 8; ++i, ++d, ++v) {
-        // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-        if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) {
-            //    no shortcut                 0     seconds
-            //    (1|2|3|4|5|6|7)==0          0     seconds
-            //    all separate               -0.047 seconds
-            //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-            int dcterm = d[0] * 4;
-            v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
-        } else {
-            STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
-            // constants scaled things up by 1<<12; let's bring them back
-            // down, but keep 2 extra bits of precision
-            x0 += 512;
-            x1 += 512;
-            x2 += 512;
-            x3 += 512;
-            v[0] = (x0 + t3) >> 10;
-            v[56] = (x0 - t3) >> 10;
-            v[8] = (x1 + t2) >> 10;
-            v[48] = (x1 - t2) >> 10;
-            v[16] = (x2 + t1) >> 10;
-            v[40] = (x2 - t1) >> 10;
-            v[24] = (x3 + t0) >> 10;
-            v[32] = (x3 - t0) >> 10;
-        }
-    }
+   // columns
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         int dcterm = d[0]*4;
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
 
-    for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
-        // no fast case since the first 1D IDCT spread components out
-        STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
-        // constants scaled things up by 1<<12, plus we had 1<<2 from first
-        // loop, plus horizontal and vertical each scale by sqrt(8) so together
-        // we've got an extra 1<<3, so 1<<17 total we need to remove.
-        // so we want to round that, which means adding 0.5 * 1<<17,
-        // aka 65536. Also, we'll end up with -128 to 127 that we want
-        // to encode as 0..255 by adding 128, so we'll add that before the shift
-        x0 += 65536 + (128 << 17);
-        x1 += 65536 + (128 << 17);
-        x2 += 65536 + (128 << 17);
-        x3 += 65536 + (128 << 17);
-        // tried computing the shifts into temps, or'ing the temps to see
-        // if any were out of range, but that was slower
-        o[0] = stbi__clamp((x0 + t3) >> 17);
-        o[7] = stbi__clamp((x0 - t3) >> 17);
-        o[1] = stbi__clamp((x1 + t2) >> 17);
-        o[6] = stbi__clamp((x1 - t2) >> 17);
-        o[2] = stbi__clamp((x2 + t1) >> 17);
-        o[5] = stbi__clamp((x2 - t1) >> 17);
-        o[3] = stbi__clamp((x3 + t0) >> 17);
-        o[4] = stbi__clamp((x3 - t0) >> 17);
-    }
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
 }
 
 #ifdef STBI_SSE2
 // sse2 integer IDCT. not the fastest possible implementation but it
 // produces bit-identical results to the generic C version so it's
 // fully "transparent".
-static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) {
-    // This is constructed to match our regular (generic) integer IDCT exactly.
-    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-    __m128i tmp;
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
 
-// dot product constant: even elems=x, odd elems=y
-#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
 
-// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
-// out(1) = c1[even]*x + c1[odd]*y
-#define dct_rot(out0, out1, x, y, c0, c1)                                                                                      \
-    __m128i c0##lo = _mm_unpacklo_epi16((x), (y));                                                                             \
-    __m128i c0##hi = _mm_unpackhi_epi16((x), (y));                                                                             \
-    __m128i out0##_l = _mm_madd_epi16(c0##lo, c0);                                                                             \
-    __m128i out0##_h = _mm_madd_epi16(c0##hi, c0);                                                                             \
-    __m128i out1##_l = _mm_madd_epi16(c0##lo, c1);                                                                             \
-    __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
 
-// out = in << 12  (in 16-bit, out 32-bit)
-#define dct_widen(out, in)                                                                                                     \
-    __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4);                                        \
-    __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+   // out = in << 12  (in 16-bit, out 32-bit)
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
 
-// wide add
-#define dct_wadd(out, a, b)                                                                                                    \
-    __m128i out##_l = _mm_add_epi32(a##_l, b##_l);                                                                             \
-    __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
 
-// wide sub
-#define dct_wsub(out, a, b)                                                                                                    \
-    __m128i out##_l = _mm_sub_epi32(a##_l, b##_l);                                                                             \
-    __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
 
-// butterfly a/b, add bias, then shift by "s" and pack
-#define dct_bfly32o(out0, out1, a, b, bias, s)                                                                                 \
-    {                                                                                                                          \
-        __m128i abiased_l = _mm_add_epi32(a##_l, bias);                                                                        \
-        __m128i abiased_h = _mm_add_epi32(a##_h, bias);                                                                        \
-        dct_wadd(sum, abiased, b);                                                                                             \
-        dct_wsub(dif, abiased, b);                                                                                             \
-        out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s));                                            \
-        out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s));                                            \
-    }
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
 
-// 8-bit interleave step (for transposes)
-#define dct_interleave8(a, b)                                                                                                  \
-    tmp = a;                                                                                                                   \
-    a = _mm_unpacklo_epi8(a, b);                                                                                               \
-    b = _mm_unpackhi_epi8(tmp, b)
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
 
-// 16-bit interleave step (for transposes)
-#define dct_interleave16(a, b)                                                                                                 \
-    tmp = a;                                                                                                                   \
-    a = _mm_unpacklo_epi16(a, b);                                                                                              \
-    b = _mm_unpackhi_epi16(tmp, b)
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
 
-#define dct_pass(bias, shift)                                                                                                  \
-    {                                                                                                                          \
-        /* even part */                                                                                                        \
-        dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1);                                                                         \
-        __m128i sum04 = _mm_add_epi16(row0, row4);                                                                             \
-        __m128i dif04 = _mm_sub_epi16(row0, row4);                                                                             \
-        dct_widen(t0e, sum04);                                                                                                 \
-        dct_widen(t1e, dif04);                                                                                                 \
-        dct_wadd(x0, t0e, t3e);                                                                                                \
-        dct_wsub(x3, t0e, t3e);                                                                                                \
-        dct_wadd(x1, t1e, t2e);                                                                                                \
-        dct_wsub(x2, t1e, t2e);                                                                                                \
-        /* odd part */                                                                                                         \
-        dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1);                                                                         \
-        dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1);                                                                         \
-        __m128i sum17 = _mm_add_epi16(row1, row7);                                                                             \
-        __m128i sum35 = _mm_add_epi16(row3, row5);                                                                             \
-        dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1);                                                                       \
-        dct_wadd(x4, y0o, y4o);                                                                                                \
-        dct_wadd(x5, y1o, y5o);                                                                                                \
-        dct_wadd(x6, y2o, y5o);                                                                                                \
-        dct_wadd(x7, y3o, y4o);                                                                                                \
-        dct_bfly32o(row0, row7, x0, x7, bias, shift);                                                                          \
-        dct_bfly32o(row1, row6, x1, x6, bias, shift);                                                                          \
-        dct_bfly32o(row2, row5, x2, x5, bias, shift);                                                                          \
-        dct_bfly32o(row3, row4, x3, x4, bias, shift);                                                                          \
-    }
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
 
-    __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
-    __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f), stbi__f2f(0.5411961f));
-    __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
-    __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
-    __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f), stbi__f2f(-1.961570560f));
-    __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
-    __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f), stbi__f2f(-0.390180644f));
-    __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
 
-    // rounding biases in column/row passes, see stbi__idct_block for explanation.
-    __m128i bias_0 = _mm_set1_epi32(512);
-    __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
 
-    // load
-    row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
-    row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
-    row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
-    row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
-    row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
-    row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
-    row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
-    row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
+   // load
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
 
-    // column pass
-    dct_pass(bias_0, 10);
+   // column pass
+   dct_pass(bias_0, 10);
 
-    {
-        // 16bit 8x8 transpose pass 1
-        dct_interleave16(row0, row4);
-        dct_interleave16(row1, row5);
-        dct_interleave16(row2, row6);
-        dct_interleave16(row3, row7);
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
 
-        // transpose pass 2
-        dct_interleave16(row0, row2);
-        dct_interleave16(row1, row3);
-        dct_interleave16(row4, row6);
-        dct_interleave16(row5, row7);
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
 
-        // transpose pass 3
-        dct_interleave16(row0, row1);
-        dct_interleave16(row2, row3);
-        dct_interleave16(row4, row5);
-        dct_interleave16(row6, row7);
-    }
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
 
-    // row pass
-    dct_pass(bias_1, 17);
+   // row pass
+   dct_pass(bias_1, 17);
 
-    {
-        // pack
-        __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
-        __m128i p1 = _mm_packus_epi16(row2, row3);
-        __m128i p2 = _mm_packus_epi16(row4, row5);
-        __m128i p3 = _mm_packus_epi16(row6, row7);
+   {
+      // pack
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
 
-        // 8bit 8x8 transpose pass 1
-        dct_interleave8(p0, p2); // a0e0a1e1...
-        dct_interleave8(p1, p3); // c0g0c1g1...
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
 
-        // transpose pass 2
-        dct_interleave8(p0, p1); // a0c0e0g0...
-        dct_interleave8(p2, p3); // b0d0f0h0...
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
 
-        // transpose pass 3
-        dct_interleave8(p0, p2); // a0b0c0d0...
-        dct_interleave8(p1, p3); // a4b4c4d4...
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
 
-        // store
-        _mm_storel_epi64((__m128i *)out, p0);
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, p2);
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, p1);
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, p3);
-        out += out_stride;
-        _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
-    }
+      // store
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
 
 #undef dct_const
 #undef dct_rot
@@ -2763,235 +2708,198 @@ static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) {
 
 // NEON integer IDCT. should produce bit-identical
 // results to the generic C version.
-static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) {
-    int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
 
-    int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
-    int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
-    int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
-    int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
-    int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
-    int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
-    int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
-    int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
-    int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
-    int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
-    int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
-    int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
 
-#define dct_long_mul(out, inq, coeff)                                                                                          \
-    int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff);                                                                   \
-    int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
 
-#define dct_long_mac(out, acc, inq, coeff)                                                                                     \
-    int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff);                                                          \
-    int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
 
-#define dct_widen(out, inq)                                                                                                    \
-    int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12);                                                                    \
-    int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
 
 // wide add
-#define dct_wadd(out, a, b)                                                                                                    \
-    int32x4_t out##_l = vaddq_s32(a##_l, b##_l);                                                                               \
-    int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
 
 // wide sub
-#define dct_wsub(out, a, b)                                                                                                    \
-    int32x4_t out##_l = vsubq_s32(a##_l, b##_l);                                                                               \
-    int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
 
 // butterfly a/b, then shift using "shiftop" by "s" and pack
-#define dct_bfly32o(out0, out1, a, b, shiftop, s)                                                                              \
-    {                                                                                                                          \
-        dct_wadd(sum, a, b);                                                                                                   \
-        dct_wsub(dif, a, b);                                                                                                   \
-        out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s));                                                             \
-        out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s));                                                             \
-    }
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
 
-#define dct_pass(shiftop, shift)                                                                                               \
-    {                                                                                                                          \
-        /* even part */                                                                                                        \
-        int16x8_t sum26 = vaddq_s16(row2, row6);                                                                               \
-        dct_long_mul(p1e, sum26, rot0_0);                                                                                      \
-        dct_long_mac(t2e, p1e, row6, rot0_1);                                                                                  \
-        dct_long_mac(t3e, p1e, row2, rot0_2);                                                                                  \
-        int16x8_t sum04 = vaddq_s16(row0, row4);                                                                               \
-        int16x8_t dif04 = vsubq_s16(row0, row4);                                                                               \
-        dct_widen(t0e, sum04);                                                                                                 \
-        dct_widen(t1e, dif04);                                                                                                 \
-        dct_wadd(x0, t0e, t3e);                                                                                                \
-        dct_wsub(x3, t0e, t3e);                                                                                                \
-        dct_wadd(x1, t1e, t2e);                                                                                                \
-        dct_wsub(x2, t1e, t2e);                                                                                                \
-        /* odd part */                                                                                                         \
-        int16x8_t sum15 = vaddq_s16(row1, row5);                                                                               \
-        int16x8_t sum17 = vaddq_s16(row1, row7);                                                                               \
-        int16x8_t sum35 = vaddq_s16(row3, row5);                                                                               \
-        int16x8_t sum37 = vaddq_s16(row3, row7);                                                                               \
-        int16x8_t sumodd = vaddq_s16(sum17, sum35);                                                                            \
-        dct_long_mul(p5o, sumodd, rot1_0);                                                                                     \
-        dct_long_mac(p1o, p5o, sum17, rot1_1);                                                                                 \
-        dct_long_mac(p2o, p5o, sum35, rot1_2);                                                                                 \
-        dct_long_mul(p3o, sum37, rot2_0);                                                                                      \
-        dct_long_mul(p4o, sum15, rot2_1);                                                                                      \
-        dct_wadd(sump13o, p1o, p3o);                                                                                           \
-        dct_wadd(sump24o, p2o, p4o);                                                                                           \
-        dct_wadd(sump23o, p2o, p3o);                                                                                           \
-        dct_wadd(sump14o, p1o, p4o);                                                                                           \
-        dct_long_mac(x4, sump13o, row7, rot3_0);                                                                               \
-        dct_long_mac(x5, sump24o, row5, rot3_1);                                                                               \
-        dct_long_mac(x6, sump23o, row3, rot3_2);                                                                               \
-        dct_long_mac(x7, sump14o, row1, rot3_3);                                                                               \
-        dct_bfly32o(row0, row7, x0, x7, shiftop, shift);                                                                       \
-        dct_bfly32o(row1, row6, x1, x6, shiftop, shift);                                                                       \
-        dct_bfly32o(row2, row5, x2, x5, shiftop, shift);                                                                       \
-        dct_bfly32o(row3, row4, x3, x4, shiftop, shift);                                                                       \
-    }
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
 
-    // load
-    row0 = vld1q_s16(data + 0 * 8);
-    row1 = vld1q_s16(data + 1 * 8);
-    row2 = vld1q_s16(data + 2 * 8);
-    row3 = vld1q_s16(data + 3 * 8);
-    row4 = vld1q_s16(data + 4 * 8);
-    row5 = vld1q_s16(data + 5 * 8);
-    row6 = vld1q_s16(data + 6 * 8);
-    row7 = vld1q_s16(data + 7 * 8);
+   // load
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
 
-    // add DC bias
-    row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+   // add DC bias
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
 
-    // column pass
-    dct_pass(vrshrn_n_s32, 10);
+   // column pass
+   dct_pass(vrshrn_n_s32, 10);
 
-    // 16bit 8x8 transpose
-    {
+   // 16bit 8x8 transpose
+   {
 // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 // whether compilers actually get this is another story, sadly.
-#define dct_trn16(x, y)                                                                                                        \
-    {                                                                                                                          \
-        int16x8x2_t t = vtrnq_s16(x, y);                                                                                       \
-        x = t.val[0];                                                                                                          \
-        y = t.val[1];                                                                                                          \
-    }
-#define dct_trn32(x, y)                                                                                                        \
-    {                                                                                                                          \
-        int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y));                                         \
-        x = vreinterpretq_s16_s32(t.val[0]);                                                                                   \
-        y = vreinterpretq_s16_s32(t.val[1]);                                                                                   \
-    }
-#define dct_trn64(x, y)                                                                                                        \
-    {                                                                                                                          \
-        int16x8_t x0 = x;                                                                                                      \
-        int16x8_t y0 = y;                                                                                                      \
-        x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0));                                                                  \
-        y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0));                                                                \
-    }
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
 
-        // pass 1
-        dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
-        dct_trn16(row2, row3);
-        dct_trn16(row4, row5);
-        dct_trn16(row6, row7);
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
 
-        // pass 2
-        dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
-        dct_trn32(row1, row3);
-        dct_trn32(row4, row6);
-        dct_trn32(row5, row7);
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
 
-        // pass 3
-        dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
-        dct_trn64(row1, row5);
-        dct_trn64(row2, row6);
-        dct_trn64(row3, row7);
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
 
 #undef dct_trn16
 #undef dct_trn32
 #undef dct_trn64
-    }
+   }
 
-    // row pass
-    // vrshrn_n_s32 only supports shifts up to 16, we need
-    // 17. so do a non-rounding shift of 16 first then follow
-    // up with a rounding shift by 1.
-    dct_pass(vshrn_n_s32, 16);
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
 
-    {
-        // pack and round
-        uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
-        uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
-        uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
-        uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
-        uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
-        uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
-        uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
-        uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+   {
+      // pack and round
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
 
-        // again, these can translate into one instruction, but often don't.
-#define dct_trn8_8(x, y)                                                                                                       \
-    {                                                                                                                          \
-        uint8x8x2_t t = vtrn_u8(x, y);                                                                                         \
-        x = t.val[0];                                                                                                          \
-        y = t.val[1];                                                                                                          \
-    }
-#define dct_trn8_16(x, y)                                                                                                      \
-    {                                                                                                                          \
-        uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y));                                             \
-        x = vreinterpret_u8_u16(t.val[0]);                                                                                     \
-        y = vreinterpret_u8_u16(t.val[1]);                                                                                     \
-    }
-#define dct_trn8_32(x, y)                                                                                                      \
-    {                                                                                                                          \
-        uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y));                                             \
-        x = vreinterpret_u8_u32(t.val[0]);                                                                                     \
-        y = vreinterpret_u8_u32(t.val[1]);                                                                                     \
-    }
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
 
-        // sadly can't use interleaved stores here since we only write
-        // 8 bytes to each scan line!
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
 
-        // 8x8 8-bit transpose pass 1
-        dct_trn8_8(p0, p1);
-        dct_trn8_8(p2, p3);
-        dct_trn8_8(p4, p5);
-        dct_trn8_8(p6, p7);
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
 
-        // pass 2
-        dct_trn8_16(p0, p2);
-        dct_trn8_16(p1, p3);
-        dct_trn8_16(p4, p6);
-        dct_trn8_16(p5, p7);
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
 
-        // pass 3
-        dct_trn8_32(p0, p4);
-        dct_trn8_32(p1, p5);
-        dct_trn8_32(p2, p6);
-        dct_trn8_32(p3, p7);
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
 
-        // store
-        vst1_u8(out, p0);
-        out += out_stride;
-        vst1_u8(out, p1);
-        out += out_stride;
-        vst1_u8(out, p2);
-        out += out_stride;
-        vst1_u8(out, p3);
-        out += out_stride;
-        vst1_u8(out, p4);
-        out += out_stride;
-        vst1_u8(out, p5);
-        out += out_stride;
-        vst1_u8(out, p6);
-        out += out_stride;
-        vst1_u8(out, p7);
+      // store
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
 
 #undef dct_trn8_8
 #undef dct_trn8_16
 #undef dct_trn8_32
-    }
+   }
 
 #undef dct_long_mul
 #undef dct_long_mac
@@ -3004,1267 +2912,1169 @@ static void stbi__idct_simd(stbi_uc * out, int out_stride, short data[64]) {
 
 #endif // STBI_NEON
 
-#define STBI__MARKER_none 0xff
+#define STBI__MARKER_none  0xff
 // if there's a pending marker from the entropy stream, return that
 // otherwise, fetch from the stream and get a marker. if there's no
 // marker, return 0xff, which is never a valid marker value
-static stbi_uc stbi__get_marker(stbi__jpeg * j) {
-    stbi_uc x;
-    if (j->marker != STBI__MARKER_none) {
-        x = j->marker;
-        j->marker = STBI__MARKER_none;
-        return x;
-    }
-    x = stbi__get8(j->s);
-    if (x != 0xff)
-        return STBI__MARKER_none;
-    while (x == 0xff)
-        x = stbi__get8(j->s); // consume repeated 0xff fill bytes
-    return x;
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+   stbi_uc x;
+   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
+   x = stbi__get8(j->s);
+   if (x != 0xff) return STBI__MARKER_none;
+   while (x == 0xff)
+      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
+   return x;
 }
 
 // in each scan, we'll have scan_n components, and the order
 // of the components is specified by order[]
-#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
+#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
 
 // after a restart interval, stbi__jpeg_reset the entropy decoder and
 // the dc prediction
-static void stbi__jpeg_reset(stbi__jpeg * j) {
-    j->code_bits = 0;
-    j->code_buffer = 0;
-    j->nomore = 0;
-    j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
-    j->marker = STBI__MARKER_none;
-    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
-    j->eob_run = 0;
-    // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
-    // since we don't even allow 1<<30 pixels
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+   j->code_bits = 0;
+   j->code_buffer = 0;
+   j->nomore = 0;
+   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
+   j->marker = STBI__MARKER_none;
+   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+   j->eob_run = 0;
+   // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
+   // since we don't even allow 1<<30 pixels
 }
 
-static int stbi__parse_entropy_coded_data(stbi__jpeg * z) {
-    stbi__jpeg_reset(z);
-    if (!z->progressive) {
-        if (z->scan_n == 1) {
-            int i, j;
-            STBI_SIMD_ALIGN(short, data[64]);
-            int n = z->order[0];
-            // non-interleaved data, we just need to process one block at a time,
-            // in trivial scanline order
-            // number of blocks to do just depends on how many actual "pixels" this
-            // component has, independent of interleaved MCU blocking and such
-            int w = (z->img_comp[n].x + 7) >> 3;
-            int h = (z->img_comp[n].y + 7) >> 3;
-            for (j = 0; j < h; ++j) {
-                for (i = 0; i < w; ++i) {
-                    int ha = z->img_comp[n].ha;
-                    if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n,
-                                                 z->dequant[z->img_comp[n].tq]))
-                        return 0;
-                    z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
-                    // every data block is an MCU, so countdown the restart interval
-                    if (--z->todo <= 0) {
-                        if (z->code_bits < 24)
-                            stbi__grow_buffer_unsafe(z);
-                        // if it's NOT a restart, then just bail, so we get corrupt data
-                        // rather than no data
-                        if (!STBI__RESTART(z->marker))
-                            return 1;
-                        stbi__jpeg_reset(z);
-                    }
-                }
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
+{
+   stbi__jpeg_reset(z);
+   if (!z->progressive) {
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  // if it's NOT a restart, then just bail, so we get corrupt data
+                  // rather than no data
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
             }
-            return 1;
-        } else { // interleaved
-            int i, j, k, x, y;
-            STBI_SIMD_ALIGN(short, data[64]);
-            for (j = 0; j < z->img_mcu_y; ++j) {
-                for (i = 0; i < z->img_mcu_x; ++i) {
-                    // scan an interleaved mcu... process scan_n components in order
-                    for (k = 0; k < z->scan_n; ++k) {
-                        int n = z->order[k];
-                        // scan out an mcu's worth of this component; that's just determined
-                        // by the basic H and V specified for the component
-                        for (y = 0; y < z->img_comp[n].v; ++y) {
-                            for (x = 0; x < z->img_comp[n].h; ++x) {
-                                int x2 = (i * z->img_comp[n].h + x) * 8;
-                                int y2 = (j * z->img_comp[n].v + y) * 8;
-                                int ha = z->img_comp[n].ha;
-                                if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha,
-                                                             z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
-                                    return 0;
-                                z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2,
-                                                     data);
-                            }
-                        }
-                    }
-                    // after all interleaved components, that's an interleaved MCU,
-                    // so now count down the restart interval
-                    if (--z->todo <= 0) {
-                        if (z->code_bits < 24)
-                            stbi__grow_buffer_unsafe(z);
-                        if (!STBI__RESTART(z->marker))
-                            return 1;
-                        stbi__jpeg_reset(z);
-                    }
-                }
-            }
-            return 1;
-        }
-    } else {
-        if (z->scan_n == 1) {
-            int i, j;
-            int n = z->order[0];
-            // non-interleaved data, we just need to process one block at a time,
-            // in trivial scanline order
-            // number of blocks to do just depends on how many actual "pixels" this
-            // component has, independent of interleaved MCU blocking and such
-            int w = (z->img_comp[n].x + 7) >> 3;
-            int h = (z->img_comp[n].y + 7) >> 3;
-            for (j = 0; j < h; ++j) {
-                for (i = 0; i < w; ++i) {
-                    short * data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-                    if (z->spec_start == 0) {
-                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                            return 0;
-                    } else {
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x)*8;
+                        int y2 = (j*z->img_comp[n].v + y)*8;
                         int ha = z->img_comp[n].ha;
-                        if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
-                            return 0;
-                    }
-                    // every data block is an MCU, so countdown the restart interval
-                    if (--z->todo <= 0) {
-                        if (z->code_bits < 24)
-                            stbi__grow_buffer_unsafe(z);
-                        if (!STBI__RESTART(z->marker))
-                            return 1;
-                        stbi__jpeg_reset(z);
-                    }
-                }
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
             }
-            return 1;
-        } else { // interleaved
-            int i, j, k, x, y;
-            for (j = 0; j < z->img_mcu_y; ++j) {
-                for (i = 0; i < z->img_mcu_x; ++i) {
-                    // scan an interleaved mcu... process scan_n components in order
-                    for (k = 0; k < z->scan_n; ++k) {
-                        int n = z->order[k];
-                        // scan out an mcu's worth of this component; that's just determined
-                        // by the basic H and V specified for the component
-                        for (y = 0; y < z->img_comp[n].v; ++y) {
-                            for (x = 0; x < z->img_comp[n].h; ++x) {
-                                int x2 = (i * z->img_comp[n].h + x);
-                                int y2 = (j * z->img_comp[n].v + y);
-                                short * data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                                if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                                    return 0;
-                            }
-                        }
-                    }
-                    // after all interleaved components, that's an interleaved MCU,
-                    // so now count down the restart interval
-                    if (--z->todo <= 0) {
-                        if (z->code_bits < 24)
-                            stbi__grow_buffer_unsafe(z);
-                        if (!STBI__RESTART(z->marker))
-                            return 1;
-                        stbi__jpeg_reset(z);
-                    }
-                }
+         }
+         return 1;
+      }
+   } else {
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               if (z->spec_start == 0) {
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
             }
-            return 1;
-        }
-    }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x);
+                        int y2 = (j*z->img_comp[n].v + y);
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                           return 0;
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   }
 }
 
-static void stbi__jpeg_dequantize(short * data, stbi__uint16 * dequant) {
-    int i;
-    for (i = 0; i < 64; ++i)
-        data[i] *= dequant[i];
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
+{
+   int i;
+   for (i=0; i < 64; ++i)
+      data[i] *= dequant[i];
 }
 
-static void stbi__jpeg_finish(stbi__jpeg * z) {
-    if (z->progressive) {
-        // dequantize and idct the data
-        int i, j, n;
-        for (n = 0; n < z->s->img_n; ++n) {
-            int w = (z->img_comp[n].x + 7) >> 3;
-            int h = (z->img_comp[n].y + 7) >> 3;
-            for (j = 0; j < h; ++j) {
-                for (i = 0; i < w; ++i) {
-                    short * data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-                    stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
-                    z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
-                }
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{
+   if (z->progressive) {
+      // dequantize and idct the data
+      int i,j,n;
+      for (n=0; n < z->s->img_n; ++n) {
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
             }
-        }
-    }
+         }
+      }
+   }
 }
 
-static int stbi__process_marker(stbi__jpeg * z, int m) {
-    int L;
-    switch (m) {
-    case STBI__MARKER_none: // no marker found
-        return stbi__err("expected marker", "Corrupt JPEG");
+static int stbi__process_marker(stbi__jpeg *z, int m)
+{
+   int L;
+   switch (m) {
+      case STBI__MARKER_none: // no marker found
+         return stbi__err("expected marker","Corrupt JPEG");
 
-    case 0xDD: // DRI - specify restart interval
-        if (stbi__get16be(z->s) != 4)
-            return stbi__err("bad DRI len", "Corrupt JPEG");
-        z->restart_interval = stbi__get16be(z->s);
-        return 1;
+      case 0xDD: // DRI - specify restart interval
+         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
+         z->restart_interval = stbi__get16be(z->s);
+         return 1;
 
-    case 0xDB: // DQT - define quantization table
-        L = stbi__get16be(z->s) - 2;
-        while (L > 0) {
+      case 0xDB: // DQT - define quantization table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
             int q = stbi__get8(z->s);
             int p = q >> 4, sixteen = (p != 0);
-            int t = q & 15, i;
-            if (p != 0 && p != 1)
-                return stbi__err("bad DQT type", "Corrupt JPEG");
-            if (t > 3)
-                return stbi__err("bad DQT table", "Corrupt JPEG");
+            int t = q & 15,i;
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
 
-            for (i = 0; i < 64; ++i)
-                z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            for (i=0; i < 64; ++i)
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
             L -= (sixteen ? 129 : 65);
-        }
-        return L == 0;
+         }
+         return L==0;
 
-    case 0xC4: // DHT - define huffman table
-        L = stbi__get16be(z->s) - 2;
-        while (L > 0) {
-            stbi_uc * v;
-            int sizes[16], i, n = 0;
+      case 0xC4: // DHT - define huffman table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            stbi_uc *v;
+            int sizes[16],i,n=0;
             int q = stbi__get8(z->s);
             int tc = q >> 4;
             int th = q & 15;
-            if (tc > 1 || th > 3)
-                return stbi__err("bad DHT header", "Corrupt JPEG");
-            for (i = 0; i < 16; ++i) {
-                sizes[i] = stbi__get8(z->s);
-                n += sizes[i];
+            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
+            for (i=0; i < 16; ++i) {
+               sizes[i] = stbi__get8(z->s);
+               n += sizes[i];
             }
-            if (n > 256)
-                return stbi__err("bad DHT header", "Corrupt JPEG"); // Loop over i < n would write past end of values!
+            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
             L -= 17;
             if (tc == 0) {
-                if (!stbi__build_huffman(z->huff_dc + th, sizes))
-                    return 0;
-                v = z->huff_dc[th].values;
+               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
+               v = z->huff_dc[th].values;
             } else {
-                if (!stbi__build_huffman(z->huff_ac + th, sizes))
-                    return 0;
-                v = z->huff_ac[th].values;
+               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
+               v = z->huff_ac[th].values;
             }
-            for (i = 0; i < n; ++i)
-                v[i] = stbi__get8(z->s);
+            for (i=0; i < n; ++i)
+               v[i] = stbi__get8(z->s);
             if (tc != 0)
-                stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
             L -= n;
-        }
-        return L == 0;
-    }
+         }
+         return L==0;
+   }
 
-    // check for comment block or APP blocks
-    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-        L = stbi__get16be(z->s);
-        if (L < 2) {
-            if (m == 0xFE)
-                return stbi__err("bad COM len", "Corrupt JPEG");
-            else
-                return stbi__err("bad APP len", "Corrupt JPEG");
-        }
-        L -= 2;
+   // check for comment block or APP blocks
+   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
 
-        if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
-            static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'};
-            int ok = 1;
-            int i;
-            for (i = 0; i < 5; ++i)
-                if (stbi__get8(z->s) != tag[i])
-                    ok = 0;
-            L -= 5;
-            if (ok)
-                z->jfif = 1;
-        } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
-            static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
-            int ok = 1;
-            int i;
-            for (i = 0; i < 6; ++i)
-                if (stbi__get8(z->s) != tag[i])
-                    ok = 0;
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
             L -= 6;
-            if (ok) {
-                stbi__get8(z->s);                            // version
-                stbi__get16be(z->s);                         // flags0
-                stbi__get16be(z->s);                         // flags1
-                z->app14_color_transform = stbi__get8(z->s); // color transform
-                L -= 6;
-            }
-        }
+         }
+      }
 
-        stbi__skip(z->s, L);
-        return 1;
-    }
+      stbi__skip(z->s, L);
+      return 1;
+   }
 
-    return stbi__err("unknown marker", "Corrupt JPEG");
+   return stbi__err("unknown marker","Corrupt JPEG");
 }
 
 // after we see SOS
-static int stbi__process_scan_header(stbi__jpeg * z) {
-    int i;
-    int Ls = stbi__get16be(z->s);
-    z->scan_n = stbi__get8(z->s);
-    if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n)
-        return stbi__err("bad SOS component count", "Corrupt JPEG");
-    if (Ls != 6 + 2 * z->scan_n)
-        return stbi__err("bad SOS len", "Corrupt JPEG");
-    for (i = 0; i < z->scan_n; ++i) {
-        int id = stbi__get8(z->s), which;
-        int q = stbi__get8(z->s);
-        for (which = 0; which < z->s->img_n; ++which)
-            if (z->img_comp[which].id == id)
-                break;
-        if (which == z->s->img_n)
-            return 0; // no match
-        z->img_comp[which].hd = q >> 4;
-        if (z->img_comp[which].hd > 3)
-            return stbi__err("bad DC huff", "Corrupt JPEG");
-        z->img_comp[which].ha = q & 15;
-        if (z->img_comp[which].ha > 3)
-            return stbi__err("bad AC huff", "Corrupt JPEG");
-        z->order[i] = which;
-    }
+static int stbi__process_scan_header(stbi__jpeg *z)
+{
+   int i;
+   int Ls = stbi__get16be(z->s);
+   z->scan_n = stbi__get8(z->s);
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
+   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
+   for (i=0; i < z->scan_n; ++i) {
+      int id = stbi__get8(z->s), which;
+      int q = stbi__get8(z->s);
+      for (which = 0; which < z->s->img_n; ++which)
+         if (z->img_comp[which].id == id)
+            break;
+      if (which == z->s->img_n) return 0; // no match
+      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
+      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
+      z->order[i] = which;
+   }
 
-    {
-        int aa;
-        z->spec_start = stbi__get8(z->s);
-        z->spec_end = stbi__get8(z->s); // should be 63, but might be 0
-        aa = stbi__get8(z->s);
-        z->succ_high = (aa >> 4);
-        z->succ_low = (aa & 15);
-        if (z->progressive) {
-            if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
-                return stbi__err("bad SOS", "Corrupt JPEG");
-        } else {
-            if (z->spec_start != 0)
-                return stbi__err("bad SOS", "Corrupt JPEG");
-            if (z->succ_high != 0 || z->succ_low != 0)
-                return stbi__err("bad SOS", "Corrupt JPEG");
-            z->spec_end = 63;
-        }
-    }
+   {
+      int aa;
+      z->spec_start = stbi__get8(z->s);
+      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+      aa = stbi__get8(z->s);
+      z->succ_high = (aa >> 4);
+      z->succ_low  = (aa & 15);
+      if (z->progressive) {
+         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+            return stbi__err("bad SOS", "Corrupt JPEG");
+      } else {
+         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         z->spec_end = 63;
+      }
+   }
 
-    return 1;
+   return 1;
 }
 
-static int stbi__free_jpeg_components(stbi__jpeg * z, int ncomp, int why) {
-    int i;
-    for (i = 0; i < ncomp; ++i) {
-        if (z->img_comp[i].raw_data) {
-            STBI_FREE(z->img_comp[i].raw_data);
-            z->img_comp[i].raw_data = NULL;
-            z->img_comp[i].data = NULL;
-        }
-        if (z->img_comp[i].raw_coeff) {
-            STBI_FREE(z->img_comp[i].raw_coeff);
-            z->img_comp[i].raw_coeff = 0;
-            z->img_comp[i].coeff = 0;
-        }
-        if (z->img_comp[i].linebuf) {
-            STBI_FREE(z->img_comp[i].linebuf);
-            z->img_comp[i].linebuf = NULL;
-        }
-    }
-    return why;
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
+{
+   int i;
+   for (i=0; i < ncomp; ++i) {
+      if (z->img_comp[i].raw_data) {
+         STBI_FREE(z->img_comp[i].raw_data);
+         z->img_comp[i].raw_data = NULL;
+         z->img_comp[i].data = NULL;
+      }
+      if (z->img_comp[i].raw_coeff) {
+         STBI_FREE(z->img_comp[i].raw_coeff);
+         z->img_comp[i].raw_coeff = 0;
+         z->img_comp[i].coeff = 0;
+      }
+      if (z->img_comp[i].linebuf) {
+         STBI_FREE(z->img_comp[i].linebuf);
+         z->img_comp[i].linebuf = NULL;
+      }
+   }
+   return why;
 }
 
-static int stbi__process_frame_header(stbi__jpeg * z, int scan) {
-    stbi__context * s = z->s;
-    int Lf, p, i, q, h_max = 1, v_max = 1, c;
-    Lf = stbi__get16be(s);
-    if (Lf < 11)
-        return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
-    p = stbi__get8(s);
-    if (p != 8)
-        return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline
-    s->img_y = stbi__get16be(s);
-    if (s->img_y == 0)
-        return stbi__err("no header height",
-                         "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
-    s->img_x = stbi__get16be(s);
-    if (s->img_x == 0)
-        return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
-    if (s->img_y > STBI_MAX_DIMENSIONS)
-        return stbi__err("too large", "Very large image (corrupt?)");
-    if (s->img_x > STBI_MAX_DIMENSIONS)
-        return stbi__err("too large", "Very large image (corrupt?)");
-    c = stbi__get8(s);
-    if (c != 3 && c != 1 && c != 4)
-        return stbi__err("bad component count", "Corrupt JPEG");
-    s->img_n = c;
-    for (i = 0; i < c; ++i) {
-        z->img_comp[i].data = NULL;
-        z->img_comp[i].linebuf = NULL;
-    }
+static int stbi__process_frame_header(stbi__jpeg *z, int scan)
+{
+   stbi__context *s = z->s;
+   int Lf,p,i,q, h_max=1,v_max=1,c;
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   c = stbi__get8(s);
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
+   s->img_n = c;
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
 
-    if (Lf != 8 + 3 * s->img_n)
-        return stbi__err("bad SOF len", "Corrupt JPEG");
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 
-    z->rgb = 0;
-    for (i = 0; i < s->img_n; ++i) {
-        static const unsigned char rgb[3] = {'R', 'G', 'B'};
-        z->img_comp[i].id = stbi__get8(s);
-        if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
-            ++z->rgb;
-        q = stbi__get8(s);
-        z->img_comp[i].h = (q >> 4);
-        if (!z->img_comp[i].h || z->img_comp[i].h > 4)
-            return stbi__err("bad H", "Corrupt JPEG");
-        z->img_comp[i].v = q & 15;
-        if (!z->img_comp[i].v || z->img_comp[i].v > 4)
-            return stbi__err("bad V", "Corrupt JPEG");
-        z->img_comp[i].tq = stbi__get8(s);
-        if (z->img_comp[i].tq > 3)
-            return stbi__err("bad TQ", "Corrupt JPEG");
-    }
+   z->rgb = 0;
+   for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
+      z->img_comp[i].id = stbi__get8(s);
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
+      q = stbi__get8(s);
+      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
 
-    if (scan != STBI__SCAN_load)
-        return 1;
+   if (scan != STBI__SCAN_load) return 1;
 
-    if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
-        return stbi__err("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
 
-    for (i = 0; i < s->img_n; ++i) {
-        if (z->img_comp[i].h > h_max)
-            h_max = z->img_comp[i].h;
-        if (z->img_comp[i].v > v_max)
-            v_max = z->img_comp[i].v;
-    }
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
 
-    // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
-    // and I've never seen a non-corrupted JPEG file actually use them
-    for (i = 0; i < s->img_n; ++i) {
-        if (h_max % z->img_comp[i].h != 0)
-            return stbi__err("bad H", "Corrupt JPEG");
-        if (v_max % z->img_comp[i].v != 0)
-            return stbi__err("bad V", "Corrupt JPEG");
-    }
+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
 
-    // compute interleaved mcu info
-    z->img_h_max = h_max;
-    z->img_v_max = v_max;
-    z->img_mcu_w = h_max * 8;
-    z->img_mcu_h = v_max * 8;
-    // these sizes can't be more than 17 bits
-    z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
-    z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 
-    for (i = 0; i < s->img_n; ++i) {
-        // number of effective pixels (e.g. for non-interleaved MCU)
-        z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
-        z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
-        // to simplify generation, we'll allocate enough memory to decode
-        // the bogus oversized data from using interleaved MCUs and their
-        // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
-        // discard the extra data until colorspace conversion
-        //
-        // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
-        // so these muls can't overflow with 32-bit ints (which we require)
-        z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
-        z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-        z->img_comp[i].coeff = 0;
-        z->img_comp[i].raw_coeff = 0;
-        z->img_comp[i].linebuf = NULL;
-        z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-        if (z->img_comp[i].raw_data == NULL)
-            return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory"));
-        // align blocks for idct using mmx/sse
-        z->img_comp[i].data = (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
-        if (z->progressive) {
-            // w2, h2 are multiples of 8 (see above)
-            z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
-            z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
-            z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-            if (z->img_comp[i].raw_coeff == NULL)
-                return stbi__free_jpeg_components(z, i + 1, stbi__err("outofmem", "Out of memory"));
-            z->img_comp[i].coeff = (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
-        }
-    }
+   for (i=0; i < s->img_n; ++i) {
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+      // align blocks for idct using mmx/sse
+      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
+      if (z->progressive) {
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+      }
+   }
 
-    return 1;
+   return 1;
 }
 
 // use comparisons since in some cases we handle more than one case (e.g. SOF)
-#define stbi__DNL(x) ((x) == 0xdc)
-#define stbi__SOI(x) ((x) == 0xd8)
-#define stbi__EOI(x) ((x) == 0xd9)
-#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
-#define stbi__SOS(x) ((x) == 0xda)
+#define stbi__DNL(x)         ((x) == 0xdc)
+#define stbi__SOI(x)         ((x) == 0xd8)
+#define stbi__EOI(x)         ((x) == 0xd9)
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
+#define stbi__SOS(x)         ((x) == 0xda)
 
-#define stbi__SOF_progressive(x) ((x) == 0xc2)
+#define stbi__SOF_progressive(x)   ((x) == 0xc2)
 
-static int stbi__decode_jpeg_header(stbi__jpeg * z, int scan) {
-    int m;
-    z->jfif = 0;
-    z->app14_color_transform = -1; // valid values are 0,1,2
-    z->marker = STBI__MARKER_none; // initialize cached marker to empty
-    m = stbi__get_marker(z);
-    if (!stbi__SOI(m))
-        return stbi__err("no SOI", "Corrupt JPEG");
-    if (scan == STBI__SCAN_type)
-        return 1;
-    m = stbi__get_marker(z);
-    while (!stbi__SOF(m)) {
-        if (!stbi__process_marker(z, m))
-            return 0;
-        m = stbi__get_marker(z);
-        while (m == STBI__MARKER_none) {
-            // some files have extra padding after their blocks, so ok, we'll scan
-            if (stbi__at_eof(z->s))
-                return stbi__err("no SOF", "Corrupt JPEG");
-            m = stbi__get_marker(z);
-        }
-    }
-    z->progressive = stbi__SOF_progressive(m);
-    if (!stbi__process_frame_header(z, scan))
-        return 0;
-    return 1;
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
+{
+   int m;
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
+   z->marker = STBI__MARKER_none; // initialize cached marker to empty
+   m = stbi__get_marker(z);
+   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type) return 1;
+   m = stbi__get_marker(z);
+   while (!stbi__SOF(m)) {
+      if (!stbi__process_marker(z,m)) return 0;
+      m = stbi__get_marker(z);
+      while (m == STBI__MARKER_none) {
+         // some files have extra padding after their blocks, so ok, we'll scan
+         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
+         m = stbi__get_marker(z);
+      }
+   }
+   z->progressive = stbi__SOF_progressive(m);
+   if (!stbi__process_frame_header(z, scan)) return 0;
+   return 1;
 }
 
-static int stbi__skip_jpeg_junk_at_end(stbi__jpeg * j) {
-    // some JPEGs have junk at end, skip over it but if we find what looks
-    // like a valid marker, resume there
-    while (!stbi__at_eof(j->s)) {
-        int x = stbi__get8(j->s);
-        while (x == 255) { // might be a marker
-            if (stbi__at_eof(j->s))
-                return STBI__MARKER_none;
-            x = stbi__get8(j->s);
-            if (x != 0x00 && x != 0xff) {
-                // not a stuffed zero or lead-in to another marker, looks
-                // like an actual marker, return it
-                return x;
-            }
-            // stuffed zero has x=0 now which ends the loop, meaning we go
-            // back to regular scan loop.
-            // repeated 0xff keeps trying to read the next byte of the marker.
-        }
-    }
-    return STBI__MARKER_none;
+static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+{
+   // some JPEGs have junk at end, skip over it but if we find what looks
+   // like a valid marker, resume there
+   while (!stbi__at_eof(j->s)) {
+      stbi_uc x = stbi__get8(j->s);
+      while (x == 0xff) { // might be a marker
+         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
+         x = stbi__get8(j->s);
+         if (x != 0x00 && x != 0xff) {
+            // not a stuffed zero or lead-in to another marker, looks
+            // like an actual marker, return it
+            return x;
+         }
+         // stuffed zero has x=0 now which ends the loop, meaning we go
+         // back to regular scan loop.
+         // repeated 0xff keeps trying to read the next byte of the marker.
+      }
+   }
+   return STBI__MARKER_none;
 }
 
 // decode image to YCbCr format
-static int stbi__decode_jpeg_image(stbi__jpeg * j) {
-    int m;
-    for (m = 0; m < 4; m++) {
-        j->img_comp[m].raw_data = NULL;
-        j->img_comp[m].raw_coeff = NULL;
-    }
-    j->restart_interval = 0;
-    if (!stbi__decode_jpeg_header(j, STBI__SCAN_load))
-        return 0;
-    m = stbi__get_marker(j);
-    while (!stbi__EOI(m)) {
-        if (stbi__SOS(m)) {
-            if (!stbi__process_scan_header(j))
-                return 0;
-            if (!stbi__parse_entropy_coded_data(j))
-                return 0;
-            if (j->marker == STBI__MARKER_none) {
-                j->marker = stbi__skip_jpeg_junk_at_end(j);
-                // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
-            }
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
+{
+   int m;
+   for (m = 0; m < 4; m++) {
+      j->img_comp[m].raw_data = NULL;
+      j->img_comp[m].raw_coeff = NULL;
+   }
+   j->restart_interval = 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
+   m = stbi__get_marker(j);
+   while (!stbi__EOI(m)) {
+      if (stbi__SOS(m)) {
+         if (!stbi__process_scan_header(j)) return 0;
+         if (!stbi__parse_entropy_coded_data(j)) return 0;
+         if (j->marker == STBI__MARKER_none ) {
+         j->marker = stbi__skip_jpeg_junk_at_end(j);
+            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
+         }
+         m = stbi__get_marker(j);
+         if (STBI__RESTART(m))
             m = stbi__get_marker(j);
-            if (STBI__RESTART(m))
-                m = stbi__get_marker(j);
-        } else if (stbi__DNL(m)) {
-            int Ld = stbi__get16be(j->s);
-            stbi__uint32 NL = stbi__get16be(j->s);
-            if (Ld != 4)
-                return stbi__err("bad DNL len", "Corrupt JPEG");
-            if (NL != j->s->img_y)
-                return stbi__err("bad DNL height", "Corrupt JPEG");
-            m = stbi__get_marker(j);
-        } else {
-            if (!stbi__process_marker(j, m))
-                return 1;
-            m = stbi__get_marker(j);
-        }
-    }
-    if (j->progressive)
-        stbi__jpeg_finish(j);
-    return 1;
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+         m = stbi__get_marker(j);
+      } else {
+         if (!stbi__process_marker(j, m)) return 1;
+         m = stbi__get_marker(j);
+      }
+   }
+   if (j->progressive)
+      stbi__jpeg_finish(j);
+   return 1;
 }
 
 // static jfif-centered resampling (across block boundaries)
 
-typedef stbi_uc * (*resample_row_func)(stbi_uc * out, stbi_uc * in0, stbi_uc * in1, int w, int hs);
+typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
+                                    int w, int hs);
 
-#define stbi__div4(x) ((stbi_uc)((x) >> 2))
+#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
 
-static stbi_uc * resample_row_1(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) {
-    STBI_NOTUSED(out);
-    STBI_NOTUSED(in_far);
-    STBI_NOTUSED(w);
-    STBI_NOTUSED(hs);
-    return in_near;
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   STBI_NOTUSED(out);
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(w);
+   STBI_NOTUSED(hs);
+   return in_near;
 }
 
-static stbi_uc * stbi__resample_row_v_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) {
-    // need to generate two samples vertically for every one in input
-    int i;
-    STBI_NOTUSED(hs);
-    for (i = 0; i < w; ++i)
-        out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
-    return out;
+static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // need to generate two samples vertically for every one in input
+   int i;
+   STBI_NOTUSED(hs);
+   for (i=0; i < w; ++i)
+      out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2);
+   return out;
 }
 
-static stbi_uc * stbi__resample_row_h_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) {
-    // need to generate two samples horizontally for every one in input
-    int i;
-    stbi_uc * input = in_near;
+static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // need to generate two samples horizontally for every one in input
+   int i;
+   stbi_uc *input = in_near;
 
-    if (w == 1) {
-        // if only one sample, can't do any interpolation
-        out[0] = out[1] = input[0];
-        return out;
-    }
+   if (w == 1) {
+      // if only one sample, can't do any interpolation
+      out[0] = out[1] = input[0];
+      return out;
+   }
 
-    out[0] = input[0];
-    out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
-    for (i = 1; i < w - 1; ++i) {
-        int n = 3 * input[i] + 2;
-        out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
-        out[i * 2 + 1] = stbi__div4(n + input[i + 1]);
-    }
-    out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2);
-    out[i * 2 + 1] = input[w - 1];
+   out[0] = input[0];
+   out[1] = stbi__div4(input[0]*3 + input[1] + 2);
+   for (i=1; i < w-1; ++i) {
+      int n = 3*input[i]+2;
+      out[i*2+0] = stbi__div4(n+input[i-1]);
+      out[i*2+1] = stbi__div4(n+input[i+1]);
+   }
+   out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2);
+   out[i*2+1] = input[w-1];
 
-    STBI_NOTUSED(in_far);
-    STBI_NOTUSED(hs);
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(hs);
 
-    return out;
+   return out;
 }
 
-#define stbi__div16(x) ((stbi_uc)((x) >> 4))
+#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
 
-static stbi_uc * stbi__resample_row_hv_2(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) {
-    // need to generate 2x2 samples for every one in input
-    int i, t0, t1;
-    if (w == 1) {
-        out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
-        return out;
-    }
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // need to generate 2x2 samples for every one in input
+   int i,t0,t1;
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
 
-    t1 = 3 * in_near[0] + in_far[0];
-    out[0] = stbi__div4(t1 + 2);
-    for (i = 1; i < w; ++i) {
-        t0 = t1;
-        t1 = 3 * in_near[i] + in_far[i];
-        out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
-        out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
-    }
-    out[w * 2 - 1] = stbi__div4(t1 + 2);
+   t1 = 3*in_near[0] + in_far[0];
+   out[0] = stbi__div4(t1+2);
+   for (i=1; i < w; ++i) {
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2);
 
-    STBI_NOTUSED(hs);
+   STBI_NOTUSED(hs);
 
-    return out;
+   return out;
 }
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
-static stbi_uc * stbi__resample_row_hv_2_simd(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) {
-    // need to generate 2x2 samples for every one in input
-    int i = 0, t0, t1;
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // need to generate 2x2 samples for every one in input
+   int i=0,t0,t1;
 
-    if (w == 1) {
-        out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
-        return out;
-    }
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
 
-    t1 = 3 * in_near[0] + in_far[0];
-    // process groups of 8 pixels for as long as we can.
-    // note we can't handle the last pixel in a row in this loop
-    // because we need to handle the filter boundary conditions.
-    for (; i < ((w - 1) & ~7); i += 8) {
+   t1 = 3*in_near[0] + in_far[0];
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) {
 #if defined(STBI_SSE2)
-        // load and perform the vertical filtering pass
-        // this uses 3*x + y = 4*x + (y - x)
-        __m128i zero = _mm_setzero_si128();
-        __m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i));
-        __m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i));
-        __m128i farw = _mm_unpacklo_epi8(farb, zero);
-        __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
-        __m128i diff = _mm_sub_epi16(farw, nearw);
-        __m128i nears = _mm_slli_epi16(nearw, 2);
-        __m128i curr = _mm_add_epi16(nears, diff); // current row
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
 
-        // horizontal filter works the same based on shifted vers of current
-        // row. "prev" is current row shifted right by 1 pixel; we need to
-        // insert the previous pixel value (from t1).
-        // "next" is current row shifted left by 1 pixel, with first pixel
-        // of next block of 8 pixels added in.
-        __m128i prv0 = _mm_slli_si128(curr, 2);
-        __m128i nxt0 = _mm_srli_si128(curr, 2);
-        __m128i prev = _mm_insert_epi16(prv0, t1, 0);
-        __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
 
-        // horizontal filter, polyphase implementation since it's convenient:
-        // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-        // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-        // note the shared term.
-        __m128i bias = _mm_set1_epi16(8);
-        __m128i curs = _mm_slli_epi16(curr, 2);
-        __m128i prvd = _mm_sub_epi16(prev, curr);
-        __m128i nxtd = _mm_sub_epi16(next, curr);
-        __m128i curb = _mm_add_epi16(curs, bias);
-        __m128i even = _mm_add_epi16(prvd, curb);
-        __m128i odd = _mm_add_epi16(nxtd, curb);
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
 
-        // interleave even and odd pixels, then undo scaling.
-        __m128i int0 = _mm_unpacklo_epi16(even, odd);
-        __m128i int1 = _mm_unpackhi_epi16(even, odd);
-        __m128i de0 = _mm_srli_epi16(int0, 4);
-        __m128i de1 = _mm_srli_epi16(int1, 4);
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
 
-        // pack and write output
-        __m128i outv = _mm_packus_epi16(de0, de1);
-        _mm_storeu_si128((__m128i *)(out + i * 2), outv);
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv);
 #elif defined(STBI_NEON)
-        // load and perform the vertical filtering pass
-        // this uses 3*x + y = 4*x + (y - x)
-        uint8x8_t farb = vld1_u8(in_far + i);
-        uint8x8_t nearb = vld1_u8(in_near + i);
-        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
-        int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-        int16x8_t curr = vaddq_s16(nears, diff); // current row
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
 
-        // horizontal filter works the same based on shifted vers of current
-        // row. "prev" is current row shifted right by 1 pixel; we need to
-        // insert the previous pixel value (from t1).
-        // "next" is current row shifted left by 1 pixel, with first pixel
-        // of next block of 8 pixels added in.
-        int16x8_t prv0 = vextq_s16(curr, curr, 7);
-        int16x8_t nxt0 = vextq_s16(curr, curr, 1);
-        int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
-        int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
 
-        // horizontal filter, polyphase implementation since it's convenient:
-        // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-        // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-        // note the shared term.
-        int16x8_t curs = vshlq_n_s16(curr, 2);
-        int16x8_t prvd = vsubq_s16(prev, curr);
-        int16x8_t nxtd = vsubq_s16(next, curr);
-        int16x8_t even = vaddq_s16(curs, prvd);
-        int16x8_t odd = vaddq_s16(curs, nxtd);
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
 
-        // undo scaling and round, then store with even/odd phases interleaved
-        uint8x8x2_t o;
-        o.val[0] = vqrshrun_n_s16(even, 4);
-        o.val[1] = vqrshrun_n_s16(odd, 4);
-        vst2_u8(out + i * 2, o);
+      // undo scaling and round, then store with even/odd phases interleaved
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
 #endif
 
-        // "previous" value for next iter
-        t1 = 3 * in_near[i + 7] + in_far[i + 7];
-    }
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
 
-    t0 = t1;
-    t1 = 3 * in_near[i] + in_far[i];
-    out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
+   t0 = t1;
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8);
 
-    for (++i; i < w; ++i) {
-        t0 = t1;
-        t1 = 3 * in_near[i] + in_far[i];
-        out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
-        out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
-    }
-    out[w * 2 - 1] = stbi__div4(t1 + 2);
+   for (++i; i < w; ++i) {
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2);
 
-    STBI_NOTUSED(hs);
+   STBI_NOTUSED(hs);
 
-    return out;
+   return out;
 }
 #endif
 
-static stbi_uc * stbi__resample_row_generic(stbi_uc * out, stbi_uc * in_near, stbi_uc * in_far, int w, int hs) {
-    // resample with nearest-neighbor
-    int i, j;
-    STBI_NOTUSED(in_far);
-    for (i = 0; i < w; ++i)
-        for (j = 0; j < hs; ++j)
-            out[i * hs + j] = in_near[i];
-    return out;
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // resample with nearest-neighbor
+   int i,j;
+   STBI_NOTUSED(in_far);
+   for (i=0; i < w; ++i)
+      for (j=0; j < hs; ++j)
+         out[i*hs+j] = in_near[i];
+   return out;
 }
 
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
-static void stbi__YCbCr_to_RGB_row(stbi_uc * out, const stbi_uc * y, const stbi_uc * pcb, const stbi_uc * pcr, int count,
-                                   int step) {
-    int i;
-    for (i = 0; i < count; ++i) {
-        int y_fixed = (y[i] << 20) + (1 << 19); // rounding
-        int r, g, b;
-        int cr = pcr[i] - 128;
-        int cb = pcb[i] - 128;
-        r = y_fixed + cr * stbi__float2fixed(1.40200f);
-        g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) + ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
-        b = y_fixed + cb * stbi__float2fixed(1.77200f);
-        r >>= 20;
-        g >>= 20;
-        b >>= 20;
-        if ((unsigned)r > 255) {
-            if (r < 0)
-                r = 0;
-            else
-                r = 255;
-        }
-        if ((unsigned)g > 255) {
-            if (g < 0)
-                g = 0;
-            else
-                g = 255;
-        }
-        if ((unsigned)b > 255) {
-            if (b < 0)
-                b = 0;
-            else
-                b = 255;
-        }
-        out[0] = (stbi_uc)r;
-        out[1] = (stbi_uc)g;
-        out[2] = (stbi_uc)b;
-        out[3] = 255;
-        out += step;
-    }
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int i;
+   for (i=0; i < count; ++i) {
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+      out += step;
+   }
 }
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
-static void stbi__YCbCr_to_RGB_simd(stbi_uc * out, stbi_uc const * y, stbi_uc const * pcb, stbi_uc const * pcr, int count,
-                                    int step) {
-    int i = 0;
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
+{
+   int i = 0;
 
 #ifdef STBI_SSE2
-    // step == 3 is pretty ugly on the final interleave, and i'm not convinced
-    // it's useful in practice (you wouldn't use it for textures, for example).
-    // so just accelerate step == 4 case.
-    if (step == 4) {
-        // this is a fairly straightforward implementation and not super-optimized.
-        __m128i signflip = _mm_set1_epi8(-0x80);
-        __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
-        __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
-        __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
-        __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
-        __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
-        __m128i xw = _mm_set1_epi16(255); // alpha channel
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
 
-        for (; i + 7 < count; i += 8) {
-            // load
-            __m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
-            __m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
-            __m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
-            __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
-            __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+      for (; i+7 < count; i += 8) {
+         // load
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
 
-            // unpack to short (and left-shift cr, cb by 8)
-            __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
-            __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
-            __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
 
-            // color transform
-            __m128i yws = _mm_srli_epi16(yw, 4);
-            __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
-            __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
-            __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
-            __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
-            __m128i rws = _mm_add_epi16(cr0, yws);
-            __m128i gwt = _mm_add_epi16(cb0, yws);
-            __m128i bws = _mm_add_epi16(yws, cb1);
-            __m128i gws = _mm_add_epi16(gwt, cr1);
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
 
-            // descale
-            __m128i rw = _mm_srai_epi16(rws, 4);
-            __m128i bw = _mm_srai_epi16(bws, 4);
-            __m128i gw = _mm_srai_epi16(gws, 4);
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
 
-            // back to byte, set up for transpose
-            __m128i brb = _mm_packus_epi16(rw, bw);
-            __m128i gxb = _mm_packus_epi16(gw, xw);
+         // back to byte, set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
 
-            // transpose to interleave channels
-            __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
-            __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
-            __m128i o0 = _mm_unpacklo_epi16(t0, t1);
-            __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
 
-            // store
-            _mm_storeu_si128((__m128i *)(out + 0), o0);
-            _mm_storeu_si128((__m128i *)(out + 16), o1);
-            out += 32;
-        }
-    }
+         // store
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32;
+      }
+   }
 #endif
 
 #ifdef STBI_NEON
-    // in this version, step=3 support would be easy to add. but is there demand?
-    if (step == 4) {
-        // this is a fairly straightforward implementation and not super-optimized.
-        uint8x8_t signflip = vdup_n_u8(0x80);
-        int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
-        int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
-        int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
-        int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
 
-        for (; i + 7 < count; i += 8) {
-            // load
-            uint8x8_t y_bytes = vld1_u8(y + i);
-            uint8x8_t cr_bytes = vld1_u8(pcr + i);
-            uint8x8_t cb_bytes = vld1_u8(pcb + i);
-            int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
-            int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+      for (; i+7 < count; i += 8) {
+         // load
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
 
-            // expand to s16
-            int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
-            int16x8_t crw = vshll_n_s8(cr_biased, 7);
-            int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
 
-            // color transform
-            int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
-            int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
-            int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
-            int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
-            int16x8_t rws = vaddq_s16(yws, cr0);
-            int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
-            int16x8_t bws = vaddq_s16(yws, cb1);
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
 
-            // undo scaling, round, convert to byte
-            uint8x8x4_t o;
-            o.val[0] = vqrshrun_n_s16(rws, 4);
-            o.val[1] = vqrshrun_n_s16(gws, 4);
-            o.val[2] = vqrshrun_n_s16(bws, 4);
-            o.val[3] = vdup_n_u8(255);
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
 
-            // store, interleaving r/g/b/a
-            vst4_u8(out, o);
-            out += 8 * 4;
-        }
-    }
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
 #endif
 
-    for (; i < count; ++i) {
-        int y_fixed = (y[i] << 20) + (1 << 19); // rounding
-        int r, g, b;
-        int cr = pcr[i] - 128;
-        int cb = pcb[i] - 128;
-        r = y_fixed + cr * stbi__float2fixed(1.40200f);
-        g = y_fixed + cr * -stbi__float2fixed(0.71414f) + ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
-        b = y_fixed + cb * stbi__float2fixed(1.77200f);
-        r >>= 20;
-        g >>= 20;
-        b >>= 20;
-        if ((unsigned)r > 255) {
-            if (r < 0)
-                r = 0;
-            else
-                r = 255;
-        }
-        if ((unsigned)g > 255) {
-            if (g < 0)
-                g = 0;
-            else
-                g = 255;
-        }
-        if ((unsigned)b > 255) {
-            if (b < 0)
-                b = 0;
-            else
-                b = 255;
-        }
-        out[0] = (stbi_uc)r;
-        out[1] = (stbi_uc)g;
-        out[2] = (stbi_uc)b;
-        out[3] = 255;
-        out += step;
-    }
+   for (; i < count; ++i) {
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+      out += step;
+   }
 }
 #endif
 
 // set up the kernels
-static void stbi__setup_jpeg(stbi__jpeg * j) {
-    j->idct_block_kernel = stbi__idct_block;
-    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
-    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   j->idct_block_kernel = stbi__idct_block;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
 
 #ifdef STBI_SSE2
-    if (stbi__sse2_available()) {
-        j->idct_block_kernel = stbi__idct_simd;
-        j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-        j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-    }
+   if (stbi__sse2_available()) {
+      j->idct_block_kernel = stbi__idct_simd;
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   }
 #endif
 
 #ifdef STBI_NEON
-    j->idct_block_kernel = stbi__idct_simd;
-    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   j->idct_block_kernel = stbi__idct_simd;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
 }
 
 // clean up the temporary component buffers
-static void stbi__cleanup_jpeg(stbi__jpeg * j) { stbi__free_jpeg_components(j, j->s->img_n, 0); }
+static void stbi__cleanup_jpeg(stbi__jpeg *j)
+{
+   stbi__free_jpeg_components(j, j->s->img_n, 0);
+}
 
-typedef struct {
-    resample_row_func resample;
-    stbi_uc *line0, *line1;
-    int hs, vs;  // expansion factor in each axis
-    int w_lores; // horizontal pixels pre-expansion
-    int ystep;   // how far through vertical expansion we are
-    int ypos;    // which pre-expansion row we're on
+typedef struct
+{
+   resample_row_func resample;
+   stbi_uc *line0,*line1;
+   int hs,vs;   // expansion factor in each axis
+   int w_lores; // horizontal pixels pre-expansion
+   int ystep;   // how far through vertical expansion we are
+   int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
 // fast 0..255 * 0..255 => 0..255 rounded multiplication
-static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) {
-    unsigned int t = x * y + 128;
-    return (stbi_uc)((t + (t >> 8)) >> 8);
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int t = x*y + 128;
+   return (stbi_uc) ((t + (t >>8)) >> 8);
 }
 
-static stbi_uc * load_jpeg_image(stbi__jpeg * z, int * out_x, int * out_y, int * comp, int req_comp) {
-    int n, decode_n, is_rgb;
-    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
+{
+   int n, decode_n, is_rgb;
+   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
-    // validate req_comp
-    if (req_comp < 0 || req_comp > 4)
-        return stbi__errpuc("bad req_comp", "Internal error");
+   // validate req_comp
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
 
-    // load a jpeg image from whichever source, but leave in YCbCr format
-    if (!stbi__decode_jpeg_image(z)) {
-        stbi__cleanup_jpeg(z);
-        return NULL;
-    }
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 
-    // determine actual number of components to generate
-    n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+   // determine actual number of components to generate
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
 
-    is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 
-    if (z->s->img_n == 3 && n < 3 && !is_rgb)
-        decode_n = 1;
-    else
-        decode_n = z->s->img_n;
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
+      decode_n = 1;
+   else
+      decode_n = z->s->img_n;
 
-    // nothing to do if no components requested; check this now to avoid
-    // accessing uninitialized coutput[0] later
-    if (decode_n <= 0) {
-        stbi__cleanup_jpeg(z);
-        return NULL;
-    }
+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
 
-    // resample and color-convert
-    {
-        int k;
-        unsigned int i, j;
-        stbi_uc * output;
-        stbi_uc * coutput[4] = {NULL, NULL, NULL, NULL};
+   // resample and color-convert
+   {
+      int k;
+      unsigned int i,j;
+      stbi_uc *output;
+      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
 
-        stbi__resample res_comp[4];
+      stbi__resample res_comp[4];
 
-        for (k = 0; k < decode_n; ++k) {
-            stbi__resample * r = &res_comp[k];
+      for (k=0; k < decode_n; ++k) {
+         stbi__resample *r = &res_comp[k];
 
-            // allocate line buffer big enough for upsampling off the edges
-            // with upsample factor of 4
-            z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3);
-            if (!z->img_comp[k].linebuf) {
-                stbi__cleanup_jpeg(z);
-                return stbi__errpuc("outofmem", "Out of memory");
+         // allocate line buffer big enough for upsampling off the edges
+         // with upsample factor of 4
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+         r->hs      = z->img_h_max / z->img_comp[k].h;
+         r->vs      = z->img_v_max / z->img_comp[k].v;
+         r->ystep   = r->vs >> 1;
+         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
+         r->ypos    = 0;
+         r->line0   = r->line1 = z->img_comp[k].data;
+
+         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+         else                               r->resample = stbi__resample_row_generic;
+      }
+
+      // can't error after this so, this is safe
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+      // now go ahead and resample
+      for (j=0; j < z->s->img_y; ++j) {
+         stbi_uc *out = output + n * z->s->img_x * j;
+         for (k=0; k < decode_n; ++k) {
+            stbi__resample *r = &res_comp[k];
+            int y_bot = r->ystep >= (r->vs >> 1);
+            coutput[k] = r->resample(z->img_comp[k].linebuf,
+                                     y_bot ? r->line1 : r->line0,
+                                     y_bot ? r->line0 : r->line1,
+                                     r->w_lores, r->hs);
+            if (++r->ystep >= r->vs) {
+               r->ystep = 0;
+               r->line0 = r->line1;
+               if (++r->ypos < z->img_comp[k].y)
+                  r->line1 += z->img_comp[k].w2;
             }
-
-            r->hs = z->img_h_max / z->img_comp[k].h;
-            r->vs = z->img_v_max / z->img_comp[k].v;
-            r->ystep = r->vs >> 1;
-            r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
-            r->ypos = 0;
-            r->line0 = r->line1 = z->img_comp[k].data;
-
-            if (r->hs == 1 && r->vs == 1)
-                r->resample = resample_row_1;
-            else if (r->hs == 1 && r->vs == 2)
-                r->resample = stbi__resample_row_v_2;
-            else if (r->hs == 2 && r->vs == 1)
-                r->resample = stbi__resample_row_h_2;
-            else if (r->hs == 2 && r->vs == 2)
-                r->resample = z->resample_row_hv_2_kernel;
-            else
-                r->resample = stbi__resample_row_generic;
-        }
-
-        // can't error after this so, this is safe
-        output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-        if (!output) {
-            stbi__cleanup_jpeg(z);
-            return stbi__errpuc("outofmem", "Out of memory");
-        }
-
-        // now go ahead and resample
-        for (j = 0; j < z->s->img_y; ++j) {
-            stbi_uc * out = output + n * z->s->img_x * j;
-            for (k = 0; k < decode_n; ++k) {
-                stbi__resample * r = &res_comp[k];
-                int y_bot = r->ystep >= (r->vs >> 1);
-                coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, y_bot ? r->line0 : r->line1,
-                                         r->w_lores, r->hs);
-                if (++r->ystep >= r->vs) {
-                    r->ystep = 0;
-                    r->line0 = r->line1;
-                    if (++r->ypos < z->img_comp[k].y)
-                        r->line1 += z->img_comp[k].w2;
-                }
-            }
-            if (n >= 3) {
-                stbi_uc * y = coutput[0];
-                if (z->s->img_n == 3) {
-                    if (is_rgb) {
-                        for (i = 0; i < z->s->img_x; ++i) {
-                            out[0] = y[i];
-                            out[1] = coutput[1][i];
-                            out[2] = coutput[2][i];
-                            out[3] = 255;
-                            out += n;
-                        }
-                    } else {
-                        z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                    }
-                } else if (z->s->img_n == 4) {
-                    if (z->app14_color_transform == 0) { // CMYK
-                        for (i = 0; i < z->s->img_x; ++i) {
-                            stbi_uc m = coutput[3][i];
-                            out[0] = stbi__blinn_8x8(coutput[0][i], m);
-                            out[1] = stbi__blinn_8x8(coutput[1][i], m);
-                            out[2] = stbi__blinn_8x8(coutput[2][i], m);
-                            out[3] = 255;
-                            out += n;
-                        }
-                    } else if (z->app14_color_transform == 2) { // YCCK
-                        z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                        for (i = 0; i < z->s->img_x; ++i) {
-                            stbi_uc m = coutput[3][i];
-                            out[0] = stbi__blinn_8x8(255 - out[0], m);
-                            out[1] = stbi__blinn_8x8(255 - out[1], m);
-                            out[2] = stbi__blinn_8x8(255 - out[2], m);
-                            out += n;
-                        }
-                    } else { // YCbCr + alpha?  Ignore the fourth channel for now
-                        z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                    }
-                } else
-                    for (i = 0; i < z->s->img_x; ++i) {
-                        out[0] = out[1] = out[2] = y[i];
-                        out[3] = 255; // not used if n==3
-                        out += n;
-                    }
+         }
+         if (n >= 3) {
+            stbi_uc *y = coutput[0];
+            if (z->s->img_n == 3) {
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = out[1] = out[2] = y[i];
+                  out[3] = 255; // not used if n==3
+                  out += n;
+               }
+         } else {
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
             } else {
-                if (is_rgb) {
-                    if (n == 1)
-                        for (i = 0; i < z->s->img_x; ++i)
-                            *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-                    else {
-                        for (i = 0; i < z->s->img_x; ++i, out += 2) {
-                            out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-                            out[1] = 255;
-                        }
-                    }
-                } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
-                    for (i = 0; i < z->s->img_x; ++i) {
-                        stbi_uc m = coutput[3][i];
-                        stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
-                        stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
-                        stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
-                        out[0] = stbi__compute_y(r, g, b);
-                        out[1] = 255;
-                        out += n;
-                    }
-                } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
-                    for (i = 0; i < z->s->img_x; ++i) {
-                        out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
-                        out[1] = 255;
-                        out += n;
-                    }
-                } else {
-                    stbi_uc * y = coutput[0];
-                    if (n == 1)
-                        for (i = 0; i < z->s->img_x; ++i)
-                            out[i] = y[i];
-                    else
-                        for (i = 0; i < z->s->img_x; ++i) {
-                            *out++ = y[i];
-                            *out++ = 255;
-                        }
-                }
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
             }
-        }
-        stbi__cleanup_jpeg(z);
-        *out_x = z->s->img_x;
-        *out_y = z->s->img_y;
-        if (comp)
-            *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
-        return output;
-    }
+         }
+      }
+      stbi__cleanup_jpeg(z);
+      *out_x = z->s->img_x;
+      *out_y = z->s->img_y;
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
+      return output;
+   }
 }
 
-static void * stbi__jpeg_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    unsigned char * result;
-    stbi__jpeg * j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
-    if (!j)
-        return stbi__errpuc("outofmem", "Out of memory");
-    memset(j, 0, sizeof(stbi__jpeg));
-    STBI_NOTUSED(ri);
-    j->s = s;
-    stbi__setup_jpeg(j);
-    result = load_jpeg_image(j, x, y, comp, req_comp);
-    STBI_FREE(j);
-    return result;
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   unsigned char* result;
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__errpuc("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
+   STBI_NOTUSED(ri);
+   j->s = s;
+   stbi__setup_jpeg(j);
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j);
+   return result;
 }
 
-static int stbi__jpeg_test(stbi__context * s) {
-    int r;
-    stbi__jpeg * j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
-    if (!j)
-        return stbi__err("outofmem", "Out of memory");
-    memset(j, 0, sizeof(stbi__jpeg));
-    j->s = s;
-    stbi__setup_jpeg(j);
-    r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
-    stbi__rewind(s);
-    STBI_FREE(j);
-    return r;
+static int stbi__jpeg_test(stbi__context *s)
+{
+   int r;
+   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
+   j->s = s;
+   stbi__setup_jpeg(j);
+   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
+   stbi__rewind(s);
+   STBI_FREE(j);
+   return r;
 }
 
-static int stbi__jpeg_info_raw(stbi__jpeg * j, int * x, int * y, int * comp) {
-    if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
-        stbi__rewind(j->s);
-        return 0;
-    }
-    if (x)
-        *x = j->s->img_x;
-    if (y)
-        *y = j->s->img_y;
-    if (comp)
-        *comp = j->s->img_n >= 3 ? 3 : 1;
-    return 1;
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+      stbi__rewind( j->s );
+      return 0;
+   }
+   if (x) *x = j->s->img_x;
+   if (y) *y = j->s->img_y;
+   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
+   return 1;
 }
 
-static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp) {
-    int result;
-    stbi__jpeg * j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg)));
-    if (!j)
-        return stbi__err("outofmem", "Out of memory");
-    memset(j, 0, sizeof(stbi__jpeg));
-    j->s = s;
-    result = stbi__jpeg_info_raw(j, x, y, comp);
-    STBI_FREE(j);
-    return result;
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int result;
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
+   j->s = s;
+   result = stbi__jpeg_info_raw(j, x, y, comp);
+   STBI_FREE(j);
+   return result;
 }
 #endif
 
@@ -4278,81 +4088,84 @@ static int stbi__jpeg_info(stbi__context * s, int * x, int * y, int * comp) {
 #ifndef STBI_NO_ZLIB
 
 // fast-way is faster to check than jpeg huffman, but slow way is slower
-#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
-#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
 #define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
 
 // zlib-style huffman encoding
 // (jpegs packs from left, zlib from right, so can't share code)
-typedef struct {
-    stbi__uint16 fast[1 << STBI__ZFAST_BITS];
-    stbi__uint16 firstcode[16];
-    int maxcode[17];
-    stbi__uint16 firstsymbol[16];
-    stbi_uc size[STBI__ZNSYMS];
-    stbi__uint16 value[STBI__ZNSYMS];
+typedef struct
+{
+   stbi__uint16 fast[1 << STBI__ZFAST_BITS];
+   stbi__uint16 firstcode[16];
+   int maxcode[17];
+   stbi__uint16 firstsymbol[16];
+   stbi_uc  size[STBI__ZNSYMS];
+   stbi__uint16 value[STBI__ZNSYMS];
 } stbi__zhuffman;
 
-stbi_inline static int stbi__bitreverse16(int n) {
-    n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
-    n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
-    n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
-    n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
-    return n;
+stbi_inline static int stbi__bitreverse16(int n)
+{
+  n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
+  n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
+  n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
+  n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
+  return n;
 }
 
-stbi_inline static int stbi__bit_reverse(int v, int bits) {
-    STBI_ASSERT(bits <= 16);
-    // to bit reverse n bits, reverse 16 and shift
-    // e.g. 11 bits, bit reverse and shift away 5
-    return stbi__bitreverse16(v) >> (16 - bits);
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+   STBI_ASSERT(bits <= 16);
+   // to bit reverse n bits, reverse 16 and shift
+   // e.g. 11 bits, bit reverse and shift away 5
+   return stbi__bitreverse16(v) >> (16-bits);
 }
 
-static int stbi__zbuild_huffman(stbi__zhuffman * z, const stbi_uc * sizelist, int num) {
-    int i, k = 0;
-    int code, next_code[16], sizes[17];
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
+{
+   int i,k=0;
+   int code, next_code[16], sizes[17];
 
-    // DEFLATE spec for generating codes
-    memset(sizes, 0, sizeof(sizes));
-    memset(z->fast, 0, sizeof(z->fast));
-    for (i = 0; i < num; ++i)
-        ++sizes[sizelist[i]];
-    sizes[0] = 0;
-    for (i = 1; i < 16; ++i)
-        if (sizes[i] > (1 << i))
-            return stbi__err("bad sizes", "Corrupt PNG");
-    code = 0;
-    for (i = 1; i < 16; ++i) {
-        next_code[i] = code;
-        z->firstcode[i] = (stbi__uint16)code;
-        z->firstsymbol[i] = (stbi__uint16)k;
-        code = (code + sizes[i]);
-        if (sizes[i])
-            if (code - 1 >= (1 << i))
-                return stbi__err("bad codelengths", "Corrupt PNG");
-        z->maxcode[i] = code << (16 - i); // preshift for inner loop
-        code <<= 1;
-        k += sizes[i];
-    }
-    z->maxcode[16] = 0x10000; // sentinel
-    for (i = 0; i < num; ++i) {
-        int s = sizelist[i];
-        if (s) {
-            int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
-            stbi__uint16 fastv = (stbi__uint16)((s << 9) | i);
-            z->size[c] = (stbi_uc)s;
-            z->value[c] = (stbi__uint16)i;
-            if (s <= STBI__ZFAST_BITS) {
-                int j = stbi__bit_reverse(next_code[s], s);
-                while (j < (1 << STBI__ZFAST_BITS)) {
-                    z->fast[j] = fastv;
-                    j += (1 << s);
-                }
+   // DEFLATE spec for generating codes
+   memset(sizes, 0, sizeof(sizes));
+   memset(z->fast, 0, sizeof(z->fast));
+   for (i=0; i < num; ++i)
+      ++sizes[sizelist[i]];
+   sizes[0] = 0;
+   for (i=1; i < 16; ++i)
+      if (sizes[i] > (1 << i))
+         return stbi__err("bad sizes", "Corrupt PNG");
+   code = 0;
+   for (i=1; i < 16; ++i) {
+      next_code[i] = code;
+      z->firstcode[i] = (stbi__uint16) code;
+      z->firstsymbol[i] = (stbi__uint16) k;
+      code = (code + sizes[i]);
+      if (sizes[i])
+         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
+      z->maxcode[i] = code << (16-i); // preshift for inner loop
+      code <<= 1;
+      k += sizes[i];
+   }
+   z->maxcode[16] = 0x10000; // sentinel
+   for (i=0; i < num; ++i) {
+      int s = sizelist[i];
+      if (s) {
+         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
+         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
+         z->size [c] = (stbi_uc     ) s;
+         z->value[c] = (stbi__uint16) i;
+         if (s <= STBI__ZFAST_BITS) {
+            int j = stbi__bit_reverse(next_code[s],s);
+            while (j < (1 << STBI__ZFAST_BITS)) {
+               z->fast[j] = fastv;
+               j += (1 << s);
             }
-            ++next_code[s];
-        }
-    }
-    return 1;
+         }
+         ++next_code[s];
+      }
+   }
+   return 1;
 }
 
 // zlib-from-memory implementation for PNG reading
@@ -4361,298 +4174,297 @@ static int stbi__zbuild_huffman(stbi__zhuffman * z, const stbi_uc * sizelist, in
 //    we require PNG read all the IDATs and combine them into a single
 //    memory buffer
 
-typedef struct {
-    stbi_uc *zbuffer, *zbuffer_end;
-    int num_bits;
-    stbi__uint32 code_buffer;
+typedef struct
+{
+   stbi_uc *zbuffer, *zbuffer_end;
+   int num_bits;
+   int hit_zeof_once;
+   stbi__uint32 code_buffer;
 
-    char * zout;
-    char * zout_start;
-    char * zout_end;
-    int z_expandable;
+   char *zout;
+   char *zout_start;
+   char *zout_end;
+   int   z_expandable;
 
-    stbi__zhuffman z_length, z_distance;
+   stbi__zhuffman z_length, z_distance;
 } stbi__zbuf;
 
-stbi_inline static int stbi__zeof(stbi__zbuf * z) { return (z->zbuffer >= z->zbuffer_end); }
-
-stbi_inline static stbi_uc stbi__zget8(stbi__zbuf * z) { return stbi__zeof(z) ? 0 : *z->zbuffer++; }
-
-static void stbi__fill_bits(stbi__zbuf * z) {
-    do {
-        if (z->code_buffer >= (1U << z->num_bits)) {
-            z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */
-            return;
-        }
-        z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
-        z->num_bits += 8;
-    } while (z->num_bits <= 24);
-}
-
-stbi_inline static unsigned int stbi__zreceive(stbi__zbuf * z, int n) {
-    unsigned int k;
-    if (z->num_bits < n)
-        stbi__fill_bits(z);
-    k = z->code_buffer & ((1 << n) - 1);
-    z->code_buffer >>= n;
-    z->num_bits -= n;
-    return k;
-}
-
-static int stbi__zhuffman_decode_slowpath(stbi__zbuf * a, stbi__zhuffman * z) {
-    int b, s, k;
-    // not resolved by fast table, so compute it the slow way
-    // use jpeg approach, which requires MSbits at top
-    k = stbi__bit_reverse(a->code_buffer, 16);
-    for (s = STBI__ZFAST_BITS + 1;; ++s)
-        if (k < z->maxcode[s])
-            break;
-    if (s >= 16)
-        return -1; // invalid code!
-    // code size is s, so:
-    b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
-    if (b >= STBI__ZNSYMS)
-        return -1; // some data was corrupt somewhere!
-    if (z->size[b] != s)
-        return -1; // was originally an assert, but report failure instead.
-    a->code_buffer >>= s;
-    a->num_bits -= s;
-    return z->value[b];
-}
-
-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf * a, stbi__zhuffman * z) {
-    int b, s;
-    if (a->num_bits < 16) {
-        if (stbi__zeof(a)) {
-            return -1; /* report error for unexpected end of data. */
-        }
-        stbi__fill_bits(a);
-    }
-    b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-    if (b) {
-        s = b >> 9;
-        a->code_buffer >>= s;
-        a->num_bits -= s;
-        return b & 511;
-    }
-    return stbi__zhuffman_decode_slowpath(a, z);
-}
-
-static int stbi__zexpand(stbi__zbuf * z, char * zout, int n) // need to make room for n bytes
+stbi_inline static int stbi__zeof(stbi__zbuf *z)
 {
-    char * q;
-    unsigned int cur, limit, old_limit;
-    z->zout = zout;
-    if (!z->z_expandable)
-        return stbi__err("output buffer limit", "Corrupt PNG");
-    cur = (unsigned int)(z->zout - z->zout_start);
-    limit = old_limit = (unsigned)(z->zout_end - z->zout_start);
-    if (UINT_MAX - cur < (unsigned)n)
-        return stbi__err("outofmem", "Out of memory");
-    while (cur + n > limit) {
-        if (limit > UINT_MAX / 2)
-            return stbi__err("outofmem", "Out of memory");
-        limit *= 2;
-    }
-    q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
-    STBI_NOTUSED(old_limit);
-    if (q == NULL)
-        return stbi__err("outofmem", "Out of memory");
-    z->zout_start = q;
-    z->zout = q + cur;
-    z->zout_end = q + limit;
-    return 1;
+   return (z->zbuffer >= z->zbuffer_end);
 }
 
-static const int stbi__zlength_base[31] = {3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
-                                           35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
-
-static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
-                                            3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 0, 0};
-
-static const int stbi__zdist_base[32] = {1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
-                                         49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
-                                         2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
-
-static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2,  3,  3,  4,  4,  5,  5,  6,
-                                          6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
-
-static int stbi__parse_huffman_block(stbi__zbuf * a) {
-    char * zout = a->zout;
-    for (;;) {
-        int z = stbi__zhuffman_decode(a, &a->z_length);
-        if (z < 256) {
-            if (z < 0)
-                return stbi__err("bad huffman code", "Corrupt PNG"); // error in huffman codes
-            if (zout >= a->zout_end) {
-                if (!stbi__zexpand(a, zout, 1))
-                    return 0;
-                zout = a->zout;
-            }
-            *zout++ = (char)z;
-        } else {
-            stbi_uc * p;
-            int len, dist;
-            if (z == 256) {
-                a->zout = zout;
-                return 1;
-            }
-            if (z >= 286)
-                return stbi__err("bad huffman code",
-                                 "Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
-            z -= 257;
-            len = stbi__zlength_base[z];
-            if (stbi__zlength_extra[z])
-                len += stbi__zreceive(a, stbi__zlength_extra[z]);
-            z = stbi__zhuffman_decode(a, &a->z_distance);
-            if (z < 0 || z >= 30)
-                return stbi__err("bad huffman code",
-                                 "Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
-            dist = stbi__zdist_base[z];
-            if (stbi__zdist_extra[z])
-                dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-            if (zout - a->zout_start < dist)
-                return stbi__err("bad dist", "Corrupt PNG");
-            if (zout + len > a->zout_end) {
-                if (!stbi__zexpand(a, zout, len))
-                    return 0;
-                zout = a->zout;
-            }
-            p = (stbi_uc *)(zout - dist);
-            if (dist == 1) { // run of one byte; common in images.
-                stbi_uc v = *p;
-                if (len) {
-                    do
-                        *zout++ = v;
-                    while (--len);
-                }
-            } else {
-                if (len) {
-                    do
-                        *zout++ = *p++;
-                    while (--len);
-                }
-            }
-        }
-    }
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
+{
+   return stbi__zeof(z) ? 0 : *z->zbuffer++;
 }
 
-static int stbi__compute_huffman_codes(stbi__zbuf * a) {
-    static const stbi_uc length_dezigzag[19] = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
-    stbi__zhuffman z_codelength;
-    stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op
-    stbi_uc codelength_sizes[19];
-    int i, n;
+static void stbi__fill_bits(stbi__zbuf *z)
+{
+   do {
+      if (z->code_buffer >= (1U << z->num_bits)) {
+        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+        return;
+      }
+      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
+      z->num_bits += 8;
+   } while (z->num_bits <= 24);
+}
 
-    int hlit = stbi__zreceive(a, 5) + 257;
-    int hdist = stbi__zreceive(a, 5) + 1;
-    int hclen = stbi__zreceive(a, 4) + 4;
-    int ntot = hlit + hdist;
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
+{
+   unsigned int k;
+   if (z->num_bits < n) stbi__fill_bits(z);
+   k = z->code_buffer & ((1 << n) - 1);
+   z->code_buffer >>= n;
+   z->num_bits -= n;
+   return k;
+}
 
-    memset(codelength_sizes, 0, sizeof(codelength_sizes));
-    for (i = 0; i < hclen; ++i) {
-        int s = stbi__zreceive(a, 3);
-        codelength_sizes[length_dezigzag[i]] = (stbi_uc)s;
-    }
-    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
-        return 0;
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s,k;
+   // not resolved by fast table, so compute it the slow way
+   // use jpeg approach, which requires MSbits at top
+   k = stbi__bit_reverse(a->code_buffer, 16);
+   for (s=STBI__ZFAST_BITS+1; ; ++s)
+      if (k < z->maxcode[s])
+         break;
+   if (s >= 16) return -1; // invalid code!
+   // code size is s, so:
+   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
+   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
+   a->code_buffer >>= s;
+   a->num_bits -= s;
+   return z->value[b];
+}
 
-    n = 0;
-    while (n < ntot) {
-        int c = stbi__zhuffman_decode(a, &z_codelength);
-        if (c < 0 || c >= 19)
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s;
+   if (a->num_bits < 16) {
+      if (stbi__zeof(a)) {
+         if (!a->hit_zeof_once) {
+            // This is the first time we hit eof, insert 16 extra padding btis
+            // to allow us to keep going; if we actually consume any of them
+            // though, that is invalid data. This is caught later.
+            a->hit_zeof_once = 1;
+            a->num_bits += 16; // add 16 implicit zero bits
+         } else {
+            // We already inserted our extra 16 padding bits and are again
+            // out, this stream is actually prematurely terminated.
+            return -1;
+         }
+      } else {
+         stbi__fill_bits(a);
+      }
+   }
+   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+   if (b) {
+      s = b >> 9;
+      a->code_buffer >>= s;
+      a->num_bits -= s;
+      return b & 511;
+   }
+   return stbi__zhuffman_decode_slowpath(a, z);
+}
+
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
+{
+   char *q;
+   unsigned int cur, limit, old_limit;
+   z->zout = zout;
+   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
+   cur   = (unsigned int) (z->zout - z->zout_start);
+   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
+   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
+   while (cur + n > limit) {
+      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
+      limit *= 2;
+   }
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
+   if (q == NULL) return stbi__err("outofmem", "Out of memory");
+   z->zout_start = q;
+   z->zout       = q + cur;
+   z->zout_end   = q + limit;
+   return 1;
+}
+
+static const int stbi__zlength_base[31] = {
+   3,4,5,6,7,8,9,10,11,13,
+   15,17,19,23,27,31,35,43,51,59,
+   67,83,99,115,131,163,195,227,258,0,0 };
+
+static const int stbi__zlength_extra[31]=
+{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
+
+static const int stbi__zdist_extra[32] =
+{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+static int stbi__parse_huffman_block(stbi__zbuf *a)
+{
+   char *zout = a->zout;
+   for(;;) {
+      int z = stbi__zhuffman_decode(a, &a->z_length);
+      if (z < 256) {
+         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout;
+         }
+         *zout++ = (char) z;
+      } else {
+         stbi_uc *p;
+         int len,dist;
+         if (z == 256) {
+            a->zout = zout;
+            if (a->hit_zeof_once && a->num_bits < 16) {
+               // The first time we hit zeof, we inserted 16 extra zero bits into our bit
+               // buffer so the decoder can just do its speculative decoding. But if we
+               // actually consumed any of those bits (which is the case when num_bits < 16),
+               // the stream actually read past the end so it is malformed.
+               return stbi__err("unexpected end","Corrupt PNG");
+            }
+            return 1;
+         }
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
+         z -= 257;
+         len = stbi__zlength_base[z];
+         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
+         z = stbi__zhuffman_decode(a, &a->z_distance);
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
+         dist = stbi__zdist_base[z];
+         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
+         if (len > a->zout_end - zout) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout;
+         }
+         p = (stbi_uc *) (zout - dist);
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            if (len) { do *zout++ = v; while (--len); }
+         } else {
+            if (len) { do *zout++ = *p++; while (--len); }
+         }
+      }
+   }
+}
+
+static int stbi__compute_huffman_codes(stbi__zbuf *a)
+{
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   stbi__zhuffman z_codelength;
+   stbi_uc lencodes[286+32+137];//padding for maximum single op
+   stbi_uc codelength_sizes[19];
+   int i,n;
+
+   int hlit  = stbi__zreceive(a,5) + 257;
+   int hdist = stbi__zreceive(a,5) + 1;
+   int hclen = stbi__zreceive(a,4) + 4;
+   int ntot  = hlit + hdist;
+
+   memset(codelength_sizes, 0, sizeof(codelength_sizes));
+   for (i=0; i < hclen; ++i) {
+      int s = stbi__zreceive(a,3);
+      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
+   }
+   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+   n = 0;
+   while (n < ntot) {
+      int c = stbi__zhuffman_decode(a, &z_codelength);
+      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
+      if (c < 16)
+         lencodes[n++] = (stbi_uc) c;
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) {
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17) {
+            c = stbi__zreceive(a,3)+3;
+         } else if (c == 18) {
+            c = stbi__zreceive(a,7)+11;
+         } else {
             return stbi__err("bad codelengths", "Corrupt PNG");
-        if (c < 16)
-            lencodes[n++] = (stbi_uc)c;
-        else {
-            stbi_uc fill = 0;
-            if (c == 16) {
-                c = stbi__zreceive(a, 2) + 3;
-                if (n == 0)
-                    return stbi__err("bad codelengths", "Corrupt PNG");
-                fill = lencodes[n - 1];
-            } else if (c == 17) {
-                c = stbi__zreceive(a, 3) + 3;
-            } else if (c == 18) {
-                c = stbi__zreceive(a, 7) + 11;
-            } else {
-                return stbi__err("bad codelengths", "Corrupt PNG");
-            }
-            if (ntot - n < c)
-                return stbi__err("bad codelengths", "Corrupt PNG");
-            memset(lencodes + n, fill, c);
-            n += c;
-        }
-    }
-    if (n != ntot)
-        return stbi__err("bad codelengths", "Corrupt PNG");
-    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
-        return 0;
-    if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
-        return 0;
-    return 1;
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
+         n += c;
+      }
+   }
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
+   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
+   return 1;
 }
 
-static int stbi__parse_uncompressed_block(stbi__zbuf * a) {
-    stbi_uc header[4];
-    int len, nlen, k;
-    if (a->num_bits & 7)
-        stbi__zreceive(a, a->num_bits & 7); // discard
-    // drain the bit-packed data into header
-    k = 0;
-    while (a->num_bits > 0) {
-        header[k++] = (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check
-        a->code_buffer >>= 8;
-        a->num_bits -= 8;
-    }
-    if (a->num_bits < 0)
-        return stbi__err("zlib corrupt", "Corrupt PNG");
-    // now fill header the normal way
-    while (k < 4)
-        header[k++] = stbi__zget8(a);
-    len = header[1] * 256 + header[0];
-    nlen = header[3] * 256 + header[2];
-    if (nlen != (len ^ 0xffff))
-        return stbi__err("zlib corrupt", "Corrupt PNG");
-    if (a->zbuffer + len > a->zbuffer_end)
-        return stbi__err("read past buffer", "Corrupt PNG");
-    if (a->zout + len > a->zout_end)
-        if (!stbi__zexpand(a, a->zout, len))
-            return 0;
-    memcpy(a->zout, a->zbuffer, len);
-    a->zbuffer += len;
-    a->zout += len;
-    return 1;
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
+{
+   stbi_uc header[4];
+   int len,nlen,k;
+   if (a->num_bits & 7)
+      stbi__zreceive(a, a->num_bits & 7); // discard
+   // drain the bit-packed data into header
+   k = 0;
+   while (a->num_bits > 0) {
+      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
+      a->code_buffer >>= 8;
+      a->num_bits -= 8;
+   }
+   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
+   // now fill header the normal way
+   while (k < 4)
+      header[k++] = stbi__zget8(a);
+   len  = header[1] * 256 + header[0];
+   nlen = header[3] * 256 + header[2];
+   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
+   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
+   if (a->zout + len > a->zout_end)
+      if (!stbi__zexpand(a, a->zout, len)) return 0;
+   memcpy(a->zout, a->zbuffer, len);
+   a->zbuffer += len;
+   a->zout += len;
+   return 1;
 }
 
-static int stbi__parse_zlib_header(stbi__zbuf * a) {
-    int cmf = stbi__zget8(a);
-    int cm = cmf & 15;
-    /* int cinfo = cmf >> 4; */
-    int flg = stbi__zget8(a);
-    if (stbi__zeof(a))
-        return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
-    if ((cmf * 256 + flg) % 31 != 0)
-        return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
-    if (flg & 32)
-        return stbi__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png
-    if (cm != 8)
-        return stbi__err("bad compression", "Corrupt PNG"); // DEFLATE required for png
-    // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
-    return 1;
+static int stbi__parse_zlib_header(stbi__zbuf *a)
+{
+   int cmf   = stbi__zget8(a);
+   int cm    = cmf & 15;
+   /* int cinfo = cmf >> 4; */
+   int flg   = stbi__zget8(a);
+   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
+   if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
+   // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
+   return 1;
 }
 
-static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8};
-static const stbi_uc stbi__zdefault_distance[32] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-                                                    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] =
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
 /*
 Init algorithm:
 {
@@ -4666,122 +4478,118 @@ Init algorithm:
 }
 */
 
-static int stbi__parse_zlib(stbi__zbuf * a, int parse_header) {
-    int final, type;
-    if (parse_header)
-        if (!stbi__parse_zlib_header(a))
-            return 0;
-    a->num_bits = 0;
-    a->code_buffer = 0;
-    do {
-        final = stbi__zreceive(a, 1);
-        type = stbi__zreceive(a, 2);
-        if (type == 0) {
-            if (!stbi__parse_uncompressed_block(a))
-                return 0;
-        } else if (type == 3) {
-            return 0;
-        } else {
-            if (type == 1) {
-                // use fixed code lengths
-                if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, STBI__ZNSYMS))
-                    return 0;
-                if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
-                    return 0;
-            } else {
-                if (!stbi__compute_huffman_codes(a))
-                    return 0;
-            }
-            if (!stbi__parse_huffman_block(a))
-                return 0;
-        }
-    } while (!final);
-    return 1;
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
+{
+   int final, type;
+   if (parse_header)
+      if (!stbi__parse_zlib_header(a)) return 0;
+   a->num_bits = 0;
+   a->code_buffer = 0;
+   a->hit_zeof_once = 0;
+   do {
+      final = stbi__zreceive(a,1);
+      type = stbi__zreceive(a,2);
+      if (type == 0) {
+         if (!stbi__parse_uncompressed_block(a)) return 0;
+      } else if (type == 3) {
+         return 0;
+      } else {
+         if (type == 1) {
+            // use fixed code lengths
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
+         } else {
+            if (!stbi__compute_huffman_codes(a)) return 0;
+         }
+         if (!stbi__parse_huffman_block(a)) return 0;
+      }
+   } while (!final);
+   return 1;
 }
 
-static int stbi__do_zlib(stbi__zbuf * a, char * obuf, int olen, int exp, int parse_header) {
-    a->zout_start = obuf;
-    a->zout = obuf;
-    a->zout_end = obuf + olen;
-    a->z_expandable = exp;
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
+{
+   a->zout_start = obuf;
+   a->zout       = obuf;
+   a->zout_end   = obuf + olen;
+   a->z_expandable = exp;
 
-    return stbi__parse_zlib(a, parse_header);
+   return stbi__parse_zlib(a, parse_header);
 }
 
-STBIDEF char * stbi_zlib_decode_malloc_guesssize(const char * buffer, int len, int initial_size, int * outlen) {
-    stbi__zbuf a;
-    char * p = (char *)stbi__malloc(initial_size);
-    if (p == NULL)
-        return NULL;
-    a.zbuffer = (stbi_uc *)buffer;
-    a.zbuffer_end = (stbi_uc *)buffer + len;
-    if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
-        if (outlen)
-            *outlen = (int)(a.zout - a.zout_start);
-        return a.zout_start;
-    } else {
-        STBI_FREE(a.zout_start);
-        return NULL;
-    }
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
+{
+   stbi__zbuf a;
+   char *p = (char *) stbi__malloc(initial_size);
+   if (p == NULL) return NULL;
+   a.zbuffer = (stbi_uc *) buffer;
+   a.zbuffer_end = (stbi_uc *) buffer + len;
+   if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
+      if (outlen) *outlen = (int) (a.zout - a.zout_start);
+      return a.zout_start;
+   } else {
+      STBI_FREE(a.zout_start);
+      return NULL;
+   }
 }
 
-STBIDEF char * stbi_zlib_decode_malloc(char const * buffer, int len, int * outlen) {
-    return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
+{
+   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
 }
 
-STBIDEF char * stbi_zlib_decode_malloc_guesssize_headerflag(const char * buffer, int len, int initial_size, int * outlen,
-                                                            int parse_header) {
-    stbi__zbuf a;
-    char * p = (char *)stbi__malloc(initial_size);
-    if (p == NULL)
-        return NULL;
-    a.zbuffer = (stbi_uc *)buffer;
-    a.zbuffer_end = (stbi_uc *)buffer + len;
-    if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
-        if (outlen)
-            *outlen = (int)(a.zout - a.zout_start);
-        return a.zout_start;
-    } else {
-        STBI_FREE(a.zout_start);
-        return NULL;
-    }
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
+{
+   stbi__zbuf a;
+   char *p = (char *) stbi__malloc(initial_size);
+   if (p == NULL) return NULL;
+   a.zbuffer = (stbi_uc *) buffer;
+   a.zbuffer_end = (stbi_uc *) buffer + len;
+   if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
+      if (outlen) *outlen = (int) (a.zout - a.zout_start);
+      return a.zout_start;
+   } else {
+      STBI_FREE(a.zout_start);
+      return NULL;
+   }
 }
 
-STBIDEF int stbi_zlib_decode_buffer(char * obuffer, int olen, char const * ibuffer, int ilen) {
-    stbi__zbuf a;
-    a.zbuffer = (stbi_uc *)ibuffer;
-    a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
-    if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
-        return (int)(a.zout - a.zout_start);
-    else
-        return -1;
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
+{
+   stbi__zbuf a;
+   a.zbuffer = (stbi_uc *) ibuffer;
+   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
+      return (int) (a.zout - a.zout_start);
+   else
+      return -1;
 }
 
-STBIDEF char * stbi_zlib_decode_noheader_malloc(char const * buffer, int len, int * outlen) {
-    stbi__zbuf a;
-    char * p = (char *)stbi__malloc(16384);
-    if (p == NULL)
-        return NULL;
-    a.zbuffer = (stbi_uc *)buffer;
-    a.zbuffer_end = (stbi_uc *)buffer + len;
-    if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
-        if (outlen)
-            *outlen = (int)(a.zout - a.zout_start);
-        return a.zout_start;
-    } else {
-        STBI_FREE(a.zout_start);
-        return NULL;
-    }
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
+{
+   stbi__zbuf a;
+   char *p = (char *) stbi__malloc(16384);
+   if (p == NULL) return NULL;
+   a.zbuffer = (stbi_uc *) buffer;
+   a.zbuffer_end = (stbi_uc *) buffer+len;
+   if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
+      if (outlen) *outlen = (int) (a.zout - a.zout_start);
+      return a.zout_start;
+   } else {
+      STBI_FREE(a.zout_start);
+      return NULL;
+   }
 }
 
-STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const char * ibuffer, int ilen) {
-    stbi__zbuf a;
-    a.zbuffer = (stbi_uc *)ibuffer;
-    a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
-    if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
-        return (int)(a.zout - a.zout_start);
-    else
-        return -1;
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
+{
+   stbi__zbuf a;
+   a.zbuffer = (stbi_uc *) ibuffer;
+   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
+      return (int) (a.zout - a.zout_start);
+   else
+      return -1;
 }
 #endif
 
@@ -4796,1303 +4604,1131 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char * obuffer, int olen, const cha
 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
 
 #ifndef STBI_NO_PNG
-typedef struct {
-    stbi__uint32 length;
-    stbi__uint32 type;
+typedef struct
+{
+   stbi__uint32 length;
+   stbi__uint32 type;
 } stbi__pngchunk;
 
-static stbi__pngchunk stbi__get_chunk_header(stbi__context * s) {
-    stbi__pngchunk c;
-    c.length = stbi__get32be(s);
-    c.type = stbi__get32be(s);
-    return c;
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
+{
+   stbi__pngchunk c;
+   c.length = stbi__get32be(s);
+   c.type   = stbi__get32be(s);
+   return c;
 }
 
-static int stbi__check_png_header(stbi__context * s) {
-    static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
-    int i;
-    for (i = 0; i < 8; ++i)
-        if (stbi__get8(s) != png_sig[i])
-            return stbi__err("bad png sig", "Not a PNG");
-    return 1;
+static int stbi__check_png_header(stbi__context *s)
+{
+   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   int i;
+   for (i=0; i < 8; ++i)
+      if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
+   return 1;
 }
 
-typedef struct {
-    stbi__context * s;
-    stbi_uc *idata, *expanded, *out;
-    int depth;
+typedef struct
+{
+   stbi__context *s;
+   stbi_uc *idata, *expanded, *out;
+   int depth;
 } stbi__png;
 
+
 enum {
-    STBI__F_none = 0,
-    STBI__F_sub = 1,
-    STBI__F_up = 2,
-    STBI__F_avg = 3,
-    STBI__F_paeth = 4,
-    // synthetic filters used for first scanline to avoid needing a dummy row of 0s
-    STBI__F_avg_first,
-    STBI__F_paeth_first
+   STBI__F_none=0,
+   STBI__F_sub=1,
+   STBI__F_up=2,
+   STBI__F_avg=3,
+   STBI__F_paeth=4,
+   // synthetic filter used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first
 };
 
-static stbi_uc first_row_filter[5] = {STBI__F_none, STBI__F_sub, STBI__F_none, STBI__F_avg_first, STBI__F_paeth_first};
+static stbi_uc first_row_filter[5] =
+{
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none,
+   STBI__F_avg_first,
+   STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub
+};
 
-static int stbi__paeth(int a, int b, int c) {
-    int p = a + b - c;
-    int pa = abs(p - a);
-    int pb = abs(p - b);
-    int pc = abs(p - c);
-    if (pa <= pb && pa <= pc)
-        return a;
-    if (pb <= pc)
-        return b;
-    return c;
+static int stbi__paeth(int a, int b, int c)
+{
+   // This formulation looks very different from the reference in the PNG spec, but is
+   // actually equivalent and has favorable data dependencies and admits straightforward
+   // generation of branch-free code, which helps performance significantly.
+   int thresh = c*3 - (a + b);
+   int lo = a < b ? a : b;
+   int hi = a < b ? b : a;
+   int t0 = (hi <= thresh) ? lo : c;
+   int t1 = (thresh <= lo) ? hi : t0;
+   return t1;
 }
 
-static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0, 0x11, 0, 0, 0, 0x01};
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+// adds an extra all-255 alpha channel
+// dest == src is legal
+// img_n must be 1 or 3
+static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n)
+{
+   int i;
+   // must process data backwards since we allow dest==src
+   if (img_n == 1) {
+      for (i=x-1; i >= 0; --i) {
+         dest[i*2+1] = 255;
+         dest[i*2+0] = src[i];
+      }
+   } else {
+      STBI_ASSERT(img_n == 3);
+      for (i=x-1; i >= 0; --i) {
+         dest[i*4+3] = 255;
+         dest[i*4+2] = src[i*3+2];
+         dest[i*4+1] = src[i*3+1];
+         dest[i*4+0] = src[i*3+0];
+      }
+   }
+}
 
 // create the png data from post-deflated data
-static int stbi__create_png_image_raw(stbi__png * a, stbi_uc * raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x,
-                                      stbi__uint32 y, int depth, int color) {
-    int bytes = (depth == 16 ? 2 : 1);
-    stbi__context * s = a->s;
-    stbi__uint32 i, j, stride = x * out_n * bytes;
-    stbi__uint32 img_len, img_width_bytes;
-    int k;
-    int img_n = s->img_n; // copy it into a local for later
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
+{
+   int bytes = (depth == 16 ? 2 : 1);
+   stbi__context *s = a->s;
+   stbi__uint32 i,j,stride = x*out_n*bytes;
+   stbi__uint32 img_len, img_width_bytes;
+   stbi_uc *filter_buf;
+   int all_ok = 1;
+   int k;
+   int img_n = s->img_n; // copy it into a local for later
 
-    int output_bytes = out_n * bytes;
-    int filter_bytes = img_n * bytes;
-    int width = x;
+   int output_bytes = out_n*bytes;
+   int filter_bytes = img_n*bytes;
+   int width = x;
 
-    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
-    a->out = (stbi_uc *)stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
-    if (!a->out)
-        return stbi__err("outofmem", "Out of memory");
+   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+   if (!a->out) return stbi__err("outofmem", "Out of memory");
 
-    if (!stbi__mad3sizes_valid(img_n, x, depth, 7))
-        return stbi__err("too large", "Corrupt PNG");
-    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
-    img_len = (img_width_bytes + 1) * y;
+   // note: error exits here don't need to clean up a->out individually,
+   // stbi__do_png always does on error.
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+   if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG");
+   img_len = (img_width_bytes + 1) * y;
 
-    // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
-    // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
-    // so just check for raw_len < img_len always.
-    if (raw_len < img_len)
-        return stbi__err("not enough pixels", "Corrupt PNG");
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 
-    for (j = 0; j < y; ++j) {
-        stbi_uc * cur = a->out + stride * j;
-        stbi_uc * prior;
-        int filter = *raw++;
+   // Allocate two scan lines worth of filter workspace buffer.
+   filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0);
+   if (!filter_buf) return stbi__err("outofmem", "Out of memory");
 
-        if (filter > 4)
-            return stbi__err("invalid filter", "Corrupt PNG");
+   // Filtering for low-bit-depth images
+   if (depth < 8) {
+      filter_bytes = 1;
+      width = img_width_bytes;
+   }
 
-        if (depth < 8) {
-            if (img_width_bytes > x)
-                return stbi__err("invalid width", "Corrupt PNG");
-            cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
-            filter_bytes = 1;
-            width = img_width_bytes;
-        }
-        prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+   for (j=0; j < y; ++j) {
+      // cur/prior filter buffers alternate
+      stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes;
+      stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes;
+      stbi_uc *dest = a->out + stride*j;
+      int nk = width * filter_bytes;
+      int filter = *raw++;
 
-        // if first row, use special filter that doesn't sample previous row
-        if (j == 0)
-            filter = first_row_filter[filter];
+      // check filter type
+      if (filter > 4) {
+         all_ok = stbi__err("invalid filter","Corrupt PNG");
+         break;
+      }
 
-        // handle first byte explicitly
-        for (k = 0; k < filter_bytes; ++k) {
-            switch (filter) {
-            case STBI__F_none:
-                cur[k] = raw[k];
-                break;
-            case STBI__F_sub:
-                cur[k] = raw[k];
-                break;
-            case STBI__F_up:
-                cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-                break;
-            case STBI__F_avg:
-                cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
-                break;
-            case STBI__F_paeth:
-                cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
-                break;
-            case STBI__F_avg_first:
-                cur[k] = raw[k];
-                break;
-            case STBI__F_paeth_first:
-                cur[k] = raw[k];
-                break;
+      // if first row, use special filter that doesn't sample previous row
+      if (j == 0) filter = first_row_filter[filter];
+
+      // perform actual filtering
+      switch (filter) {
+      case STBI__F_none:
+         memcpy(cur, raw, nk);
+         break;
+      case STBI__F_sub:
+         memcpy(cur, raw, filter_bytes);
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]);
+         break;
+      case STBI__F_up:
+         for (k = 0; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+         break;
+      case STBI__F_avg:
+         for (k = 0; k < filter_bytes; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1));
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1));
+         break;
+      case STBI__F_paeth:
+         for (k = 0; k < filter_bytes; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0)
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes]));
+         break;
+      case STBI__F_avg_first:
+         memcpy(cur, raw, filter_bytes);
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1));
+         break;
+      }
+
+      raw += nk;
+
+      // expand decoded bits in cur to dest, also adding an extra alpha channel if desired
+      if (depth < 8) {
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+         stbi_uc *in = cur;
+         stbi_uc *out = dest;
+         stbi_uc inb = 0;
+         stbi__uint32 nsmp = x*img_n;
+
+         // expand bits to bytes first
+         if (depth == 4) {
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 1) == 0) inb = *in++;
+               *out++ = scale * (inb >> 4);
+               inb <<= 4;
             }
-        }
-
-        if (depth == 8) {
-            if (img_n != out_n)
-                cur[img_n] = 255; // first pixel
-            raw += img_n;
-            cur += out_n;
-            prior += out_n;
-        } else if (depth == 16) {
-            if (img_n != out_n) {
-                cur[filter_bytes] = 255;     // first pixel top byte
-                cur[filter_bytes + 1] = 255; // first pixel bottom byte
+         } else if (depth == 2) {
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 3) == 0) inb = *in++;
+               *out++ = scale * (inb >> 6);
+               inb <<= 2;
             }
-            raw += filter_bytes;
-            cur += output_bytes;
-            prior += output_bytes;
-        } else {
-            raw += 1;
-            cur += 1;
-            prior += 1;
-        }
-
-        // this is a little gross, so that we don't switch per-pixel or per-component
-        if (depth < 8 || img_n == out_n) {
-            int nk = (width - 1) * filter_bytes;
-#define STBI__CASE(f)                                                                                                          \
-    case f:                                                                                                                    \
-        for (k = 0; k < nk; ++k)
-            switch (filter) {
-            // "none" filter turns into a memcpy here; make that explicit.
-            case STBI__F_none:
-                memcpy(cur, raw, nk);
-                break;
-                STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); }
-                break;
-                STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); }
-                break;
-                STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); }
-                break;
-                STBI__CASE(STBI__F_paeth) {
-                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes]));
-                }
-                break;
-                STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); }
-                break;
-                STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); }
-                break;
+         } else {
+            STBI_ASSERT(depth == 1);
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 7) == 0) inb = *in++;
+               *out++ = scale * (inb >> 7);
+               inb <<= 1;
             }
-#undef STBI__CASE
-            raw += nk;
-        } else {
-            STBI_ASSERT(img_n + 1 == out_n);
-#define STBI__CASE(f)                                                                                                          \
-    case f:                                                                                                                    \
-        for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, cur += output_bytes, prior += output_bytes) \
-            for (k = 0; k < filter_bytes; ++k)
-            switch (filter) {
-                STBI__CASE(STBI__F_none) { cur[k] = raw[k]; }
-                break;
-                STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]); }
-                break;
-                STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); }
-                break;
-                STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); }
-                break;
-                STBI__CASE(STBI__F_paeth) {
-                    cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes]));
-                }
-                break;
-                STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); }
-                break;
-                STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0)); }
-                break;
+         }
+
+         // insert alpha=255 values if desired
+         if (img_n != out_n)
+            stbi__create_png_alpha_expand8(dest, dest, x, img_n);
+      } else if (depth == 8) {
+         if (img_n == out_n)
+            memcpy(dest, cur, x*img_n);
+         else
+            stbi__create_png_alpha_expand8(dest, cur, x, img_n);
+      } else if (depth == 16) {
+         // convert the image data from big-endian to platform-native
+         stbi__uint16 *dest16 = (stbi__uint16*)dest;
+         stbi__uint32 nsmp = x*img_n;
+
+         if (img_n == out_n) {
+            for (i = 0; i < nsmp; ++i, ++dest16, cur += 2)
+               *dest16 = (cur[0] << 8) | cur[1];
+         } else {
+            STBI_ASSERT(img_n+1 == out_n);
+            if (img_n == 1) {
+               for (i = 0; i < x; ++i, dest16 += 2, cur += 2) {
+                  dest16[0] = (cur[0] << 8) | cur[1];
+                  dest16[1] = 0xffff;
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (i = 0; i < x; ++i, dest16 += 4, cur += 6) {
+                  dest16[0] = (cur[0] << 8) | cur[1];
+                  dest16[1] = (cur[2] << 8) | cur[3];
+                  dest16[2] = (cur[4] << 8) | cur[5];
+                  dest16[3] = 0xffff;
+               }
             }
-#undef STBI__CASE
+         }
+      }
+   }
 
-            // the loop above sets the high byte of the pixels' alpha, but for
-            // 16 bit png files we also need the low byte set. we'll do that here.
-            if (depth == 16) {
-                cur = a->out + stride * j; // start at the beginning of the row again
-                for (i = 0; i < x; ++i, cur += output_bytes) {
-                    cur[filter_bytes + 1] = 255;
-                }
-            }
-        }
-    }
+   STBI_FREE(filter_buf);
+   if (!all_ok) return 0;
 
-    // we make a separate pass to expand bits to pixels; for performance,
-    // this could run two scanlines behind the above code, so it won't
-    // intefere with filtering but will still be in the cache.
-    if (depth < 8) {
-        for (j = 0; j < y; ++j) {
-            stbi_uc * cur = a->out + stride * j;
-            stbi_uc * in = a->out + stride * j + x * out_n - img_width_bytes;
-            // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for
-            // 1/2/4-bit png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that
-            // will be skipped in the later loop
-            stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
-
-            // note that the final byte might overshoot and write more data than desired.
-            // we can allocate enough data that this never writes out of memory, but it
-            // could also overwrite the next scanline. can it overwrite non-empty data
-            // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
-            // so we need to explicitly clamp the final ones
-
-            if (depth == 4) {
-                for (k = x * img_n; k >= 2; k -= 2, ++in) {
-                    *cur++ = scale * ((*in >> 4));
-                    *cur++ = scale * ((*in) & 0x0f);
-                }
-                if (k > 0)
-                    *cur++ = scale * ((*in >> 4));
-            } else if (depth == 2) {
-                for (k = x * img_n; k >= 4; k -= 4, ++in) {
-                    *cur++ = scale * ((*in >> 6));
-                    *cur++ = scale * ((*in >> 4) & 0x03);
-                    *cur++ = scale * ((*in >> 2) & 0x03);
-                    *cur++ = scale * ((*in) & 0x03);
-                }
-                if (k > 0)
-                    *cur++ = scale * ((*in >> 6));
-                if (k > 1)
-                    *cur++ = scale * ((*in >> 4) & 0x03);
-                if (k > 2)
-                    *cur++ = scale * ((*in >> 2) & 0x03);
-            } else if (depth == 1) {
-                for (k = x * img_n; k >= 8; k -= 8, ++in) {
-                    *cur++ = scale * ((*in >> 7));
-                    *cur++ = scale * ((*in >> 6) & 0x01);
-                    *cur++ = scale * ((*in >> 5) & 0x01);
-                    *cur++ = scale * ((*in >> 4) & 0x01);
-                    *cur++ = scale * ((*in >> 3) & 0x01);
-                    *cur++ = scale * ((*in >> 2) & 0x01);
-                    *cur++ = scale * ((*in >> 1) & 0x01);
-                    *cur++ = scale * ((*in) & 0x01);
-                }
-                if (k > 0)
-                    *cur++ = scale * ((*in >> 7));
-                if (k > 1)
-                    *cur++ = scale * ((*in >> 6) & 0x01);
-                if (k > 2)
-                    *cur++ = scale * ((*in >> 5) & 0x01);
-                if (k > 3)
-                    *cur++ = scale * ((*in >> 4) & 0x01);
-                if (k > 4)
-                    *cur++ = scale * ((*in >> 3) & 0x01);
-                if (k > 5)
-                    *cur++ = scale * ((*in >> 2) & 0x01);
-                if (k > 6)
-                    *cur++ = scale * ((*in >> 1) & 0x01);
-            }
-            if (img_n != out_n) {
-                int q;
-                // insert alpha = 255
-                cur = a->out + stride * j;
-                if (img_n == 1) {
-                    for (q = x - 1; q >= 0; --q) {
-                        cur[q * 2 + 1] = 255;
-                        cur[q * 2 + 0] = cur[q];
-                    }
-                } else {
-                    STBI_ASSERT(img_n == 3);
-                    for (q = x - 1; q >= 0; --q) {
-                        cur[q * 4 + 3] = 255;
-                        cur[q * 4 + 2] = cur[q * 3 + 2];
-                        cur[q * 4 + 1] = cur[q * 3 + 1];
-                        cur[q * 4 + 0] = cur[q * 3 + 0];
-                    }
-                }
-            }
-        }
-    } else if (depth == 16) {
-        // force the image data from big-endian to platform-native.
-        // this is done in a separate pass due to the decoding relying
-        // on the data being untouched, but could probably be done
-        // per-line during decode if care is taken.
-        stbi_uc * cur = a->out;
-        stbi__uint16 * cur16 = (stbi__uint16 *)cur;
-
-        for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) {
-            *cur16 = (cur[0] << 8) | cur[1];
-        }
-    }
-
-    return 1;
+   return 1;
 }
 
-static int stbi__create_png_image(stbi__png * a, stbi_uc * image_data, stbi__uint32 image_data_len, int out_n, int depth,
-                                  int color, int interlaced) {
-    int bytes = (depth == 16 ? 2 : 1);
-    int out_bytes = out_n * bytes;
-    stbi_uc * final;
-    int p;
-    if (!interlaced)
-        return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+{
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes;
+   stbi_uc *final;
+   int p;
+   if (!interlaced)
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
-    // de-interlacing
-    final = (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-    if (!final)
-        return stbi__err("outofmem", "Out of memory");
-    for (p = 0; p < 7; ++p) {
-        int xorig[] = {0, 4, 0, 2, 0, 1, 0};
-        int yorig[] = {0, 0, 4, 0, 2, 0, 1};
-        int xspc[] = {8, 8, 4, 4, 2, 2, 1};
-        int yspc[] = {8, 8, 8, 4, 4, 2, 2};
-        int i, j, x, y;
-        // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
-        x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
-        y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
-        if (x && y) {
-            stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-            if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
-                STBI_FREE(final);
-                return 0;
+   // de-interlacing
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
+   for (p=0; p < 7; ++p) {
+      int xorig[] = { 0,4,0,2,0,1,0 };
+      int yorig[] = { 0,0,4,0,2,0,1 };
+      int xspc[]  = { 8,8,4,4,2,2,1 };
+      int yspc[]  = { 8,8,8,4,4,2,2 };
+      int i,j,x,y;
+      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
+      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
+      if (x && y) {
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(final);
+            return 0;
+         }
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
             }
-            for (j = 0; j < y; ++j) {
-                for (i = 0; i < x; ++i) {
-                    int out_y = j * yspc[p] + yorig[p];
-                    int out_x = i * xspc[p] + xorig[p];
-                    memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, a->out + (j * x + i) * out_bytes,
-                           out_bytes);
-                }
-            }
-            STBI_FREE(a->out);
-            image_data += img_len;
-            image_data_len -= img_len;
-        }
-    }
-    a->out = final;
+         }
+         STBI_FREE(a->out);
+         image_data += img_len;
+         image_data_len -= img_len;
+      }
+   }
+   a->out = final;
 
-    return 1;
+   return 1;
 }
 
-static int stbi__compute_transparency(stbi__png * z, stbi_uc tc[3], int out_n) {
-    stbi__context * s = z->s;
-    stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-    stbi_uc * p = z->out;
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+   stbi_uc *p = z->out;
 
-    // compute color-based transparency, assuming we've
-    // already got 255 as the alpha value in the output
-    STBI_ASSERT(out_n == 2 || out_n == 4);
+   // compute color-based transparency, assuming we've
+   // already got 255 as the alpha value in the output
+   STBI_ASSERT(out_n == 2 || out_n == 4);
 
-    if (out_n == 2) {
-        for (i = 0; i < pixel_count; ++i) {
-            p[1] = (p[0] == tc[0] ? 0 : 255);
-            p += 2;
-        }
-    } else {
-        for (i = 0; i < pixel_count; ++i) {
-            if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-                p[3] = 0;
-            p += 4;
-        }
-    }
-    return 1;
+   if (out_n == 2) {
+      for (i=0; i < pixel_count; ++i) {
+         p[1] = (p[0] == tc[0] ? 0 : 255);
+         p += 2;
+      }
+   } else {
+      for (i=0; i < pixel_count; ++i) {
+         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            p[3] = 0;
+         p += 4;
+      }
+   }
+   return 1;
 }
 
-static int stbi__compute_transparency16(stbi__png * z, stbi__uint16 tc[3], int out_n) {
-    stbi__context * s = z->s;
-    stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-    stbi__uint16 * p = (stbi__uint16 *)z->out;
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+   stbi__uint16 *p = (stbi__uint16*) z->out;
 
-    // compute color-based transparency, assuming we've
-    // already got 65535 as the alpha value in the output
-    STBI_ASSERT(out_n == 2 || out_n == 4);
+   // compute color-based transparency, assuming we've
+   // already got 65535 as the alpha value in the output
+   STBI_ASSERT(out_n == 2 || out_n == 4);
 
-    if (out_n == 2) {
-        for (i = 0; i < pixel_count; ++i) {
-            p[1] = (p[0] == tc[0] ? 0 : 65535);
-            p += 2;
-        }
-    } else {
-        for (i = 0; i < pixel_count; ++i) {
-            if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-                p[3] = 0;
-            p += 4;
-        }
-    }
-    return 1;
+   if (out_n == 2) {
+      for (i = 0; i < pixel_count; ++i) {
+         p[1] = (p[0] == tc[0] ? 0 : 65535);
+         p += 2;
+      }
+   } else {
+      for (i = 0; i < pixel_count; ++i) {
+         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            p[3] = 0;
+         p += 4;
+      }
+   }
+   return 1;
 }
 
-static int stbi__expand_png_palette(stbi__png * a, stbi_uc * palette, int len, int pal_img_n) {
-    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
-    stbi_uc *p, *temp_out, *orig = a->out;
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+   stbi_uc *p, *temp_out, *orig = a->out;
 
-    p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-    if (p == NULL)
-        return stbi__err("outofmem", "Out of memory");
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+   if (p == NULL) return stbi__err("outofmem", "Out of memory");
 
-    // between here and free(out) below, exitting would leak
-    temp_out = p;
+   // between here and free(out) below, exitting would leak
+   temp_out = p;
 
-    if (pal_img_n == 3) {
-        for (i = 0; i < pixel_count; ++i) {
-            int n = orig[i] * 4;
-            p[0] = palette[n];
-            p[1] = palette[n + 1];
-            p[2] = palette[n + 2];
-            p += 3;
-        }
-    } else {
-        for (i = 0; i < pixel_count; ++i) {
-            int n = orig[i] * 4;
-            p[0] = palette[n];
-            p[1] = palette[n + 1];
-            p[2] = palette[n + 2];
-            p[3] = palette[n + 3];
-            p += 4;
-        }
-    }
-    STBI_FREE(a->out);
-    a->out = temp_out;
+   if (pal_img_n == 3) {
+      for (i=0; i < pixel_count; ++i) {
+         int n = orig[i]*4;
+         p[0] = palette[n  ];
+         p[1] = palette[n+1];
+         p[2] = palette[n+2];
+         p += 3;
+      }
+   } else {
+      for (i=0; i < pixel_count; ++i) {
+         int n = orig[i]*4;
+         p[0] = palette[n  ];
+         p[1] = palette[n+1];
+         p[2] = palette[n+2];
+         p[3] = palette[n+3];
+         p += 4;
+      }
+   }
+   STBI_FREE(a->out);
+   a->out = temp_out;
 
-    STBI_NOTUSED(len);
+   STBI_NOTUSED(len);
 
-    return 1;
+   return 1;
 }
 
 static int stbi__unpremultiply_on_load_global = 0;
 static int stbi__de_iphone_flag_global = 0;
 
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) {
-    stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
 }
 
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) {
-    stbi__de_iphone_flag_global = flag_true_if_should_convert;
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
 }
 
 #ifndef STBI_THREAD_LOCAL
-#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global
-#define stbi__de_iphone_flag stbi__de_iphone_flag_global
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
 #else
 static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
 static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
 
-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) {
-    stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
-    stbi__unpremultiply_on_load_set = 1;
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
 }
 
-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) {
-    stbi__de_iphone_flag_local = flag_true_if_should_convert;
-    stbi__de_iphone_flag_set = 1;
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
 }
 
-#define stbi__unpremultiply_on_load                                                                                            \
-    (stbi__unpremultiply_on_load_set ? stbi__unpremultiply_on_load_local : stbi__unpremultiply_on_load_global)
-#define stbi__de_iphone_flag (stbi__de_iphone_flag_set ? stbi__de_iphone_flag_local : stbi__de_iphone_flag_global)
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
 #endif // STBI_THREAD_LOCAL
 
-static void stbi__de_iphone(stbi__png * z) {
-    stbi__context * s = z->s;
-    stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-    stbi_uc * p = z->out;
+static void stbi__de_iphone(stbi__png *z)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+   stbi_uc *p = z->out;
 
-    if (s->img_out_n == 3) { // convert bgr to rgb
-        for (i = 0; i < pixel_count; ++i) {
+   if (s->img_out_n == 3) {  // convert bgr to rgb
+      for (i=0; i < pixel_count; ++i) {
+         stbi_uc t = p[0];
+         p[0] = p[2];
+         p[2] = t;
+         p += 3;
+      }
+   } else {
+      STBI_ASSERT(s->img_out_n == 4);
+      if (stbi__unpremultiply_on_load) {
+         // convert bgr to rgb and unpremultiply
+         for (i=0; i < pixel_count; ++i) {
+            stbi_uc a = p[3];
+            stbi_uc t = p[0];
+            if (a) {
+               stbi_uc half = a / 2;
+               p[0] = (p[2] * 255 + half) / a;
+               p[1] = (p[1] * 255 + half) / a;
+               p[2] = ( t   * 255 + half) / a;
+            } else {
+               p[0] = p[2];
+               p[2] = t;
+            }
+            p += 4;
+         }
+      } else {
+         // convert bgr to rgb
+         for (i=0; i < pixel_count; ++i) {
             stbi_uc t = p[0];
             p[0] = p[2];
             p[2] = t;
-            p += 3;
-        }
-    } else {
-        STBI_ASSERT(s->img_out_n == 4);
-        if (stbi__unpremultiply_on_load) {
-            // convert bgr to rgb and unpremultiply
-            for (i = 0; i < pixel_count; ++i) {
-                stbi_uc a = p[3];
-                stbi_uc t = p[0];
-                if (a) {
-                    stbi_uc half = a / 2;
-                    p[0] = (p[2] * 255 + half) / a;
-                    p[1] = (p[1] * 255 + half) / a;
-                    p[2] = (t * 255 + half) / a;
-                } else {
-                    p[0] = p[2];
-                    p[2] = t;
-                }
-                p += 4;
-            }
-        } else {
-            // convert bgr to rgb
-            for (i = 0; i < pixel_count; ++i) {
-                stbi_uc t = p[0];
-                p[0] = p[2];
-                p[2] = t;
-                p += 4;
-            }
-        }
-    }
+            p += 4;
+         }
+      }
+   }
 }
 
-#define STBI__PNG_TYPE(a, b, c, d) (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + (unsigned)(d))
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
 
-static int stbi__parse_png_file(stbi__png * z, int scan, int req_comp) {
-    stbi_uc palette[1024], pal_img_n = 0;
-    stbi_uc has_trans = 0, tc[3] = {0};
-    stbi__uint16 tc16[3];
-    stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
-    int first = 1, k, interlace = 0, color = 0, is_iphone = 0;
-    stbi__context * s = z->s;
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
+{
+   stbi_uc palette[1024], pal_img_n=0;
+   stbi_uc has_trans=0, tc[3]={0};
+   stbi__uint16 tc16[3];
+   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
+   stbi__context *s = z->s;
 
-    z->expanded = NULL;
-    z->idata = NULL;
-    z->out = NULL;
+   z->expanded = NULL;
+   z->idata = NULL;
+   z->out = NULL;
 
-    if (!stbi__check_png_header(s))
-        return 0;
+   if (!stbi__check_png_header(s)) return 0;
 
-    if (scan == STBI__SCAN_type)
-        return 1;
+   if (scan == STBI__SCAN_type) return 1;
 
-    for (;;) {
-        stbi__pngchunk c = stbi__get_chunk_header(s);
-        switch (c.type) {
-        case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
+   for (;;) {
+      stbi__pngchunk c = stbi__get_chunk_header(s);
+      switch (c.type) {
+         case STBI__PNG_TYPE('C','g','B','I'):
             is_iphone = 1;
             stbi__skip(s, c.length);
             break;
-        case STBI__PNG_TYPE('I', 'H', 'D', 'R'): {
-            int comp, filter;
-            if (!first)
-                return stbi__err("multiple IHDR", "Corrupt PNG");
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
+            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
             first = 0;
-            if (c.length != 13)
-                return stbi__err("bad IHDR len", "Corrupt PNG");
+            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
             s->img_x = stbi__get32be(s);
             s->img_y = stbi__get32be(s);
-            if (s->img_y > STBI_MAX_DIMENSIONS)
-                return stbi__err("too large", "Very large image (corrupt?)");
-            if (s->img_x > STBI_MAX_DIMENSIONS)
-                return stbi__err("too large", "Very large image (corrupt?)");
-            z->depth = stbi__get8(s);
-            if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)
-                return stbi__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
-            color = stbi__get8(s);
-            if (color > 6)
-                return stbi__err("bad ctype", "Corrupt PNG");
-            if (color == 3 && z->depth == 16)
-                return stbi__err("bad ctype", "Corrupt PNG");
-            if (color == 3)
-                pal_img_n = 3;
-            else if (color & 1)
-                return stbi__err("bad ctype", "Corrupt PNG");
-            comp = stbi__get8(s);
-            if (comp)
-                return stbi__err("bad comp method", "Corrupt PNG");
-            filter = stbi__get8(s);
-            if (filter)
-                return stbi__err("bad filter method", "Corrupt PNG");
-            interlace = stbi__get8(s);
-            if (interlace > 1)
-                return stbi__err("bad interlace method", "Corrupt PNG");
-            if (!s->img_x || !s->img_y)
-                return stbi__err("0-pixel image", "Corrupt PNG");
+            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
+            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
+            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
+            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
+            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
             if (!pal_img_n) {
-                s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-                if ((1 << 30) / s->img_x / s->img_n < s->img_y)
-                    return stbi__err("too large", "Image too large to decode");
+               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
             } else {
-                // if paletted, then pal_n is our final components, and
-                // img_n is # components to decompress/filter.
-                s->img_n = 1;
-                if ((1 << 30) / s->img_x / 4 < s->img_y)
-                    return stbi__err("too large", "Corrupt PNG");
+               // if paletted, then pal_n is our final components, and
+               // img_n is # components to decompress/filter.
+               s->img_n = 1;
+               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
             }
             // even with SCAN_header, have to scan to see if we have a tRNS
             break;
-        }
+         }
 
-        case STBI__PNG_TYPE('P', 'L', 'T', 'E'): {
-            if (first)
-                return stbi__err("first not IHDR", "Corrupt PNG");
-            if (c.length > 256 * 3)
-                return stbi__err("invalid PLTE", "Corrupt PNG");
+         case STBI__PNG_TYPE('P','L','T','E'):  {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
             pal_len = c.length / 3;
-            if (pal_len * 3 != c.length)
-                return stbi__err("invalid PLTE", "Corrupt PNG");
-            for (i = 0; i < pal_len; ++i) {
-                palette[i * 4 + 0] = stbi__get8(s);
-                palette[i * 4 + 1] = stbi__get8(s);
-                palette[i * 4 + 2] = stbi__get8(s);
-                palette[i * 4 + 3] = 255;
+            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
+            for (i=0; i < pal_len; ++i) {
+               palette[i*4+0] = stbi__get8(s);
+               palette[i*4+1] = stbi__get8(s);
+               palette[i*4+2] = stbi__get8(s);
+               palette[i*4+3] = 255;
             }
             break;
-        }
+         }
 
-        case STBI__PNG_TYPE('t', 'R', 'N', 'S'): {
-            if (first)
-                return stbi__err("first not IHDR", "Corrupt PNG");
-            if (z->idata)
-                return stbi__err("tRNS after IDAT", "Corrupt PNG");
+         case STBI__PNG_TYPE('t','R','N','S'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
             if (pal_img_n) {
-                if (scan == STBI__SCAN_header) {
-                    s->img_n = 4;
-                    return 1;
-                }
-                if (pal_len == 0)
-                    return stbi__err("tRNS before PLTE", "Corrupt PNG");
-                if (c.length > pal_len)
-                    return stbi__err("bad tRNS len", "Corrupt PNG");
-                pal_img_n = 4;
-                for (i = 0; i < c.length; ++i)
-                    palette[i * 4 + 3] = stbi__get8(s);
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
+               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
+               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
+               pal_img_n = 4;
+               for (i=0; i < c.length; ++i)
+                  palette[i*4+3] = stbi__get8(s);
             } else {
-                if (!(s->img_n & 1))
-                    return stbi__err("tRNS with alpha", "Corrupt PNG");
-                if (c.length != (stbi__uint32)s->img_n * 2)
-                    return stbi__err("bad tRNS len", "Corrupt PNG");
-                has_trans = 1;
-                // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
-                if (scan == STBI__SCAN_header) {
-                    ++s->img_n;
-                    return 1;
-                }
-                if (z->depth == 16) {
-                    for (k = 0; k < s->img_n; ++k)
-                        tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
-                } else {
-                    for (k = 0; k < s->img_n; ++k)
-                        tc[k] = (stbi_uc)(stbi__get16be(s) & 255) *
-                                stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
-                }
+               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
+               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
+               has_trans = 1;
+               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
+               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning
+                     tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n && k < 3; ++k)
+                     tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
             }
             break;
-        }
+         }
 
-        case STBI__PNG_TYPE('I', 'D', 'A', 'T'): {
-            if (first)
-                return stbi__err("first not IHDR", "Corrupt PNG");
-            if (pal_img_n && !pal_len)
-                return stbi__err("no PLTE", "Corrupt PNG");
+         case STBI__PNG_TYPE('I','D','A','T'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
             if (scan == STBI__SCAN_header) {
-                // header scan definitely stops at first IDAT
-                if (pal_img_n)
-                    s->img_n = pal_img_n;
-                return 1;
+               // header scan definitely stops at first IDAT
+               if (pal_img_n)
+                  s->img_n = pal_img_n;
+               return 1;
             }
-            if (c.length > (1u << 30))
-                return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
-            if ((int)(ioff + c.length) < (int)ioff)
-                return 0;
+            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
+            if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {
-                stbi__uint32 idata_limit_old = idata_limit;
-                stbi_uc * p;
-                if (idata_limit == 0)
-                    idata_limit = c.length > 4096 ? c.length : 4096;
-                while (ioff + c.length > idata_limit)
-                    idata_limit *= 2;
-                STBI_NOTUSED(idata_limit_old);
-                p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit);
-                if (p == NULL)
-                    return stbi__err("outofmem", "Out of memory");
-                z->idata = p;
+               stbi__uint32 idata_limit_old = idata_limit;
+               stbi_uc *p;
+               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+               while (ioff + c.length > idata_limit)
+                  idata_limit *= 2;
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               z->idata = p;
             }
-            if (!stbi__getn(s, z->idata + ioff, c.length))
-                return stbi__err("outofdata", "Corrupt PNG");
+            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
             ioff += c.length;
             break;
-        }
+         }
 
-        case STBI__PNG_TYPE('I', 'E', 'N', 'D'): {
+         case STBI__PNG_TYPE('I','E','N','D'): {
             stbi__uint32 raw_len, bpl;
-            if (first)
-                return stbi__err("first not IHDR", "Corrupt PNG");
-            if (scan != STBI__SCAN_load)
-                return 1;
-            if (z->idata == NULL)
-                return stbi__err("no IDAT", "Corrupt PNG");
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load) return 1;
+            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
             // initial guess for decoded data size to avoid unnecessary reallocs
             bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
             raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
-            z->expanded = (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag((char *)z->idata, ioff, raw_len,
-                                                                                  (int *)&raw_len, !is_iphone);
-            if (z->expanded == NULL)
-                return 0; // zlib should set error
-            STBI_FREE(z->idata);
-            z->idata = NULL;
-            if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
-                s->img_out_n = s->img_n + 1;
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
+            if (z->expanded == NULL) return 0; // zlib should set error
+            STBI_FREE(z->idata); z->idata = NULL;
+            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
+               s->img_out_n = s->img_n+1;
             else
-                s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace))
-                return 0;
+               s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
             if (has_trans) {
-                if (z->depth == 16) {
-                    if (!stbi__compute_transparency16(z, tc16, s->img_out_n))
-                        return 0;
-                } else {
-                    if (!stbi__compute_transparency(z, tc, s->img_out_n))
-                        return 0;
-                }
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
             }
             if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-                stbi__de_iphone(z);
+               stbi__de_iphone(z);
             if (pal_img_n) {
-                // pal_img_n == 3 or 4
-                s->img_n = pal_img_n; // record the actual colors we had
-                s->img_out_n = pal_img_n;
-                if (req_comp >= 3)
-                    s->img_out_n = req_comp;
-                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
-                    return 0;
+               // pal_img_n == 3 or 4
+               s->img_n = pal_img_n; // record the actual colors we had
+               s->img_out_n = pal_img_n;
+               if (req_comp >= 3) s->img_out_n = req_comp;
+               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                  return 0;
             } else if (has_trans) {
-                // non-paletted image with tRNS -> source image has (constant) alpha
-                ++s->img_n;
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
             }
-            STBI_FREE(z->expanded);
-            z->expanded = NULL;
+            STBI_FREE(z->expanded); z->expanded = NULL;
             // end of PNG chunk, read and skip CRC
             stbi__get32be(s);
             return 1;
-        }
+         }
 
-        default:
+         default:
             // if critical, fail
-            if (first)
-                return stbi__err("first not IHDR", "Corrupt PNG");
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if ((c.type & (1 << 29)) == 0) {
-#ifndef STBI_NO_FAILURE_STRINGS
-                // not threadsafe
-                static char invalid_chunk[] = "XXXX PNG chunk not known";
-                invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-                invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-                invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
-                invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
-#endif
-                return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+               #ifndef STBI_NO_FAILURE_STRINGS
+               // not threadsafe
+               static char invalid_chunk[] = "XXXX PNG chunk not known";
+               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
+               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
+               #endif
+               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
             }
             stbi__skip(s, c.length);
             break;
-        }
-        // end of PNG chunk, read and skip CRC
-        stbi__get32be(s);
-    }
+      }
+      // end of PNG chunk, read and skip CRC
+      stbi__get32be(s);
+   }
 }
 
-static void * stbi__do_png(stbi__png * p, int * x, int * y, int * n, int req_comp, stbi__result_info * ri) {
-    void * result = NULL;
-    if (req_comp < 0 || req_comp > 4)
-        return stbi__errpuc("bad req_comp", "Internal error");
-    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-        if (p->depth <= 8)
-            ri->bits_per_channel = 8;
-        else if (p->depth == 16)
-            ri->bits_per_channel = 16;
-        else
-            return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
-        result = p->out;
-        p->out = NULL;
-        if (req_comp && req_comp != p->s->img_out_n) {
-            if (ri->bits_per_channel == 8)
-                result = stbi__convert_format((unsigned char *)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-            else
-                result = stbi__convert_format16((stbi__uint16 *)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-            p->s->img_out_n = req_comp;
-            if (result == NULL)
-                return result;
-        }
-        *x = p->s->img_x;
-        *y = p->s->img_y;
-        if (n)
-            *n = p->s->img_n;
-    }
-    STBI_FREE(p->out);
-    p->out = NULL;
-    STBI_FREE(p->expanded);
-    p->expanded = NULL;
-    STBI_FREE(p->idata);
-    p->idata = NULL;
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
+{
+   void *result=NULL;
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      if (p->depth <= 8)
+         ri->bits_per_channel = 8;
+      else if (p->depth == 16)
+         ri->bits_per_channel = 16;
+      else
+         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
+      result = p->out;
+      p->out = NULL;
+      if (req_comp && req_comp != p->s->img_out_n) {
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         p->s->img_out_n = req_comp;
+         if (result == NULL) return result;
+      }
+      *x = p->s->img_x;
+      *y = p->s->img_y;
+      if (n) *n = p->s->img_n;
+   }
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
 
-    return result;
+   return result;
 }
 
-static void * stbi__png_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    stbi__png p;
-    p.s = s;
-    return stbi__do_png(&p, x, y, comp, req_comp, ri);
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__do_png(&p, x,y,comp,req_comp, ri);
 }
 
-static int stbi__png_test(stbi__context * s) {
-    int r;
-    r = stbi__check_png_header(s);
-    stbi__rewind(s);
-    return r;
+static int stbi__png_test(stbi__context *s)
+{
+   int r;
+   r = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return r;
 }
 
-static int stbi__png_info_raw(stbi__png * p, int * x, int * y, int * comp) {
-    if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
-        stbi__rewind(p->s);
-        return 0;
-    }
-    if (x)
-        *x = p->s->img_x;
-    if (y)
-        *y = p->s->img_y;
-    if (comp)
-        *comp = p->s->img_n;
-    return 1;
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
+      stbi__rewind( p->s );
+      return 0;
+   }
+   if (x) *x = p->s->img_x;
+   if (y) *y = p->s->img_y;
+   if (comp) *comp = p->s->img_n;
+   return 1;
 }
 
-static int stbi__png_info(stbi__context * s, int * x, int * y, int * comp) {
-    stbi__png p;
-    p.s = s;
-    return stbi__png_info_raw(&p, x, y, comp);
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__png_info_raw(&p, x, y, comp);
 }
 
-static int stbi__png_is16(stbi__context * s) {
-    stbi__png p;
-    p.s = s;
-    if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
-        return 0;
-    if (p.depth != 16) {
-        stbi__rewind(p.s);
-        return 0;
-    }
-    return 1;
+static int stbi__png_is16(stbi__context *s)
+{
+   stbi__png p;
+   p.s = s;
+   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
+	   return 0;
+   if (p.depth != 16) {
+      stbi__rewind(p.s);
+      return 0;
+   }
+   return 1;
 }
 #endif
 
 // Microsoft/Windows BMP image
 
 #ifndef STBI_NO_BMP
-static int stbi__bmp_test_raw(stbi__context * s) {
-    int r;
-    int sz;
-    if (stbi__get8(s) != 'B')
-        return 0;
-    if (stbi__get8(s) != 'M')
-        return 0;
-    stbi__get32le(s); // discard filesize
-    stbi__get16le(s); // discard reserved
-    stbi__get16le(s); // discard reserved
-    stbi__get32le(s); // discard data offset
-    sz = stbi__get32le(s);
-    r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
-    return r;
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   int r;
+   int sz;
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+   sz = stbi__get32le(s);
+   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
+   return r;
 }
 
-static int stbi__bmp_test(stbi__context * s) {
-    int r = stbi__bmp_test_raw(s);
-    stbi__rewind(s);
-    return r;
+static int stbi__bmp_test(stbi__context *s)
+{
+   int r = stbi__bmp_test_raw(s);
+   stbi__rewind(s);
+   return r;
 }
 
+
 // returns 0..31 for the highest set bit
-static int stbi__high_bit(unsigned int z) {
-    int n = 0;
-    if (z == 0)
-        return -1;
-    if (z >= 0x10000) {
-        n += 16;
-        z >>= 16;
-    }
-    if (z >= 0x00100) {
-        n += 8;
-        z >>= 8;
-    }
-    if (z >= 0x00010) {
-        n += 4;
-        z >>= 4;
-    }
-    if (z >= 0x00004) {
-        n += 2;
-        z >>= 2;
-    }
-    if (z >= 0x00002) {
-        n += 1; /* >>=  1;*/
-    }
-    return n;
+static int stbi__high_bit(unsigned int z)
+{
+   int n=0;
+   if (z == 0) return -1;
+   if (z >= 0x10000) { n += 16; z >>= 16; }
+   if (z >= 0x00100) { n +=  8; z >>=  8; }
+   if (z >= 0x00010) { n +=  4; z >>=  4; }
+   if (z >= 0x00004) { n +=  2; z >>=  2; }
+   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
+   return n;
 }
 
-static int stbi__bitcount(unsigned int a) {
-    a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
-    a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
-    a = (a + (a >> 4)) & 0x0f0f0f0f;                // max 8 per 4, now 8 bits
-    a = (a + (a >> 8));                             // max 16 per 8 bits
-    a = (a + (a >> 16));                            // max 32 per 8 bits
-    return a & 0xff;
+static int stbi__bitcount(unsigned int a)
+{
+   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
+   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
+   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
+   a = (a + (a >> 8)); // max 16 per 8 bits
+   a = (a + (a >> 16)); // max 32 per 8 bits
+   return a & 0xff;
 }
 
 // extract an arbitrarily-aligned N-bit value (N=bits)
 // from v, and then make it 8-bits long and fractionally
 // extend it to full full range.
-static int stbi__shiftsigned(unsigned int v, int shift, int bits) {
-    static unsigned int mul_table[9] = {
-        0,
-        0xff /*0b11111111*/,
-        0x55 /*0b01010101*/,
-        0x49 /*0b01001001*/,
-        0x11 /*0b00010001*/,
-        0x21 /*0b00100001*/,
-        0x41 /*0b01000001*/,
-        0x81 /*0b10000001*/,
-        0x01 /*0b00000001*/,
-    };
-    static unsigned int shift_table[9] = {
-        0, 0, 0, 1, 0, 2, 4, 6, 0,
-    };
-    if (shift < 0)
-        v <<= -shift;
-    else
-        v >>= shift;
-    STBI_ASSERT(v < 256);
-    v >>= (8 - bits);
-    STBI_ASSERT(bits >= 0 && bits <= 8);
-    return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits];
+static int stbi__shiftsigned(unsigned int v, int shift, int bits)
+{
+   static unsigned int mul_table[9] = {
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static unsigned int shift_table[9] = {
+      0, 0,0,1,0,2,4,6,0,
+   };
+   if (shift < 0)
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v < 256);
+   v >>= (8-bits);
+   STBI_ASSERT(bits >= 0 && bits <= 8);
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
 }
 
-typedef struct {
-    int bpp, offset, hsz;
-    unsigned int mr, mg, mb, ma, all_a;
-    int extra_read;
+typedef struct
+{
+   int bpp, offset, hsz;
+   unsigned int mr,mg,mb,ma, all_a;
+   int extra_read;
 } stbi__bmp_data;
 
-static int stbi__bmp_set_mask_defaults(stbi__bmp_data * info, int compress) {
-    // BI_BITFIELDS specifies masks explicitly, don't override
-    if (compress == 3)
-        return 1;
+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
+{
+   // BI_BITFIELDS specifies masks explicitly, don't override
+   if (compress == 3)
+      return 1;
 
-    if (compress == 0) {
-        if (info->bpp == 16) {
-            info->mr = 31u << 10;
-            info->mg = 31u << 5;
-            info->mb = 31u << 0;
-        } else if (info->bpp == 32) {
-            info->mr = 0xffu << 16;
-            info->mg = 0xffu << 8;
-            info->mb = 0xffu << 0;
-            info->ma = 0xffu << 24;
-            info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-        } else {
-            // otherwise, use defaults, which is all-0
-            info->mr = info->mg = info->mb = info->ma = 0;
-        }
-        return 1;
-    }
-    return 0; // error
+   if (compress == 0) {
+      if (info->bpp == 16) {
+         info->mr = 31u << 10;
+         info->mg = 31u <<  5;
+         info->mb = 31u <<  0;
+      } else if (info->bpp == 32) {
+         info->mr = 0xffu << 16;
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24;
+         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+      } else {
+         // otherwise, use defaults, which is all-0
+         info->mr = info->mg = info->mb = info->ma = 0;
+      }
+      return 1;
+   }
+   return 0; // error
 }
 
-static void * stbi__bmp_parse_header(stbi__context * s, stbi__bmp_data * info) {
-    int hsz;
-    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
-        return stbi__errpuc("not BMP", "Corrupt BMP");
-    stbi__get32le(s); // discard filesize
-    stbi__get16le(s); // discard reserved
-    stbi__get16le(s); // discard reserved
-    info->offset = stbi__get32le(s);
-    info->hsz = hsz = stbi__get32le(s);
-    info->mr = info->mg = info->mb = info->ma = 0;
-    info->extra_read = 14;
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{
+   int hsz;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+   info->extra_read = 14;
 
-    if (info->offset < 0)
-        return stbi__errpuc("bad BMP", "bad BMP");
+   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
 
-    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
-        return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-    if (hsz == 12) {
-        s->img_x = stbi__get16le(s);
-        s->img_y = stbi__get16le(s);
-    } else {
-        s->img_x = stbi__get32le(s);
-        s->img_y = stbi__get32le(s);
-    }
-    if (stbi__get16le(s) != 1)
-        return stbi__errpuc("bad BMP", "bad BMP");
-    info->bpp = stbi__get16le(s);
-    if (hsz != 12) {
-        int compress = stbi__get32le(s);
-        if (compress == 1 || compress == 2)
-            return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-        if (compress >= 4)
-            return stbi__errpuc("BMP JPEG/PNG",
-                                "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
-        if (compress == 3 && info->bpp != 16 && info->bpp != 32)
-            return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
-        stbi__get32le(s);                              // discard sizeof
-        stbi__get32le(s);                              // discard hres
-        stbi__get32le(s);                              // discard vres
-        stbi__get32le(s);                              // discard colorsused
-        stbi__get32le(s);                              // discard max important
-        if (hsz == 40 || hsz == 56) {
-            if (hsz == 56) {
-                stbi__get32le(s);
-                stbi__get32le(s);
-                stbi__get32le(s);
-                stbi__get32le(s);
-            }
-            if (info->bpp == 16 || info->bpp == 32) {
-                if (compress == 0) {
-                    stbi__bmp_set_mask_defaults(info, compress);
-                } else if (compress == 3) {
-                    info->mr = stbi__get32le(s);
-                    info->mg = stbi__get32le(s);
-                    info->mb = stbi__get32le(s);
-                    info->extra_read += 12;
-                    // not documented, but generated by photoshop and handled by mspaint
-                    if (info->mr == info->mg && info->mg == info->mb) {
-                        // ?!?!?
-                        return stbi__errpuc("bad BMP", "bad BMP");
-                    }
-                } else
-                    return stbi__errpuc("bad BMP", "bad BMP");
-            }
-        } else {
-            // V4/V5 header
-            int i;
-            if (hsz != 108 && hsz != 124)
-                return stbi__errpuc("bad BMP", "bad BMP");
-            info->mr = stbi__get32le(s);
-            info->mg = stbi__get32le(s);
-            info->mb = stbi__get32le(s);
-            info->ma = stbi__get32le(s);
-            if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
-                stbi__bmp_set_mask_defaults(info, compress);
-            stbi__get32le(s); // discard color space
-            for (i = 0; i < 12; ++i)
-                stbi__get32le(s); // discard color space parameters
-            if (hsz == 124) {
-                stbi__get32le(s); // discard rendering intent
-                stbi__get32le(s); // discard offset of profile data
-                stbi__get32le(s); // discard size of profile data
-                stbi__get32le(s); // discard reserved
-            }
-        }
-    }
-    return (void *)1;
+   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+   if (hsz == 12) {
+      s->img_x = stbi__get16le(s);
+      s->img_y = stbi__get16le(s);
+   } else {
+      s->img_x = stbi__get32le(s);
+      s->img_y = stbi__get32le(s);
+   }
+   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
+   info->bpp = stbi__get16le(s);
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
+      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
+      stbi__get32le(s); // discard sizeof
+      stbi__get32le(s); // discard hres
+      stbi__get32le(s); // discard vres
+      stbi__get32le(s); // discard colorsused
+      stbi__get32le(s); // discard max important
+      if (hsz == 40 || hsz == 56) {
+         if (hsz == 56) {
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+         }
+         if (info->bpp == 16 || info->bpp == 32) {
+            if (compress == 0) {
+               stbi__bmp_set_mask_defaults(info, compress);
+            } else if (compress == 3) {
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
+               info->extra_read += 12;
+               // not documented, but generated by photoshop and handled by mspaint
+               if (info->mr == info->mg && info->mg == info->mb) {
+                  // ?!?!?
+                  return stbi__errpuc("bad BMP", "bad BMP");
+               }
+            } else
+               return stbi__errpuc("bad BMP", "bad BMP");
+         }
+      } else {
+         // V4/V5 header
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress);
+         stbi__get32le(s); // discard color space
+         for (i=0; i < 12; ++i)
+            stbi__get32le(s); // discard color space parameters
+         if (hsz == 124) {
+            stbi__get32le(s); // discard rendering intent
+            stbi__get32le(s); // discard offset of profile data
+            stbi__get32le(s); // discard size of profile data
+            stbi__get32le(s); // discard reserved
+         }
+      }
+   }
+   return (void *) 1;
 }
 
-static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    stbi_uc * out;
-    unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a;
-    stbi_uc pal[256][4];
-    int psize = 0, i, j, width;
-    int flip_vertically, pad, target;
-    stbi__bmp_data info;
-    STBI_NOTUSED(ri);
 
-    info.all_a = 255;
-    if (stbi__bmp_parse_header(s, &info) == NULL)
-        return NULL; // error code already set
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
 
-    flip_vertically = ((int)s->img_y) > 0;
-    s->img_y = abs((int)s->img_y);
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
 
-    if (s->img_y > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
-    if (s->img_x > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
+   flip_vertically = ((int) s->img_y) > 0;
+   s->img_y = abs((int) s->img_y);
 
-    mr = info.mr;
-    mg = info.mg;
-    mb = info.mb;
-    ma = info.ma;
-    all_a = info.all_a;
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
 
-    if (info.hsz == 12) {
-        if (info.bpp < 24)
-            psize = (info.offset - info.extra_read - 24) / 3;
-    } else {
-        if (info.bpp < 16)
-            psize = (info.offset - info.extra_read - info.hsz) >> 2;
-    }
-    if (psize == 0) {
-        // accept some number of extra bytes after the header, but if the offset points either to before
-        // the header ends or implies a large amount of extra data, reject the file as malformed
-        int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
-        int header_limit = 1024;        // max we actually read is below 256 bytes currently.
-        int extra_data_limit = 256 * 4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
-        if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
-            return stbi__errpuc("bad header", "Corrupt BMP");
-        }
-        // we established that bytes_read_so_far is positive and sensible.
-        // the first half of this test rejects offsets that are either too small positives, or
-        // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
-        // ensures the number computed in the second half of the test can't overflow.
-        if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
-            return stbi__errpuc("bad offset", "Corrupt BMP");
-        } else {
-            stbi__skip(s, info.offset - bytes_read_so_far);
-        }
-    }
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
 
-    if (info.bpp == 24 && ma == 0xff000000)
-        s->img_n = 3;
-    else
-        s->img_n = ma ? 4 : 3;
-    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
-        target = req_comp;
-    else
-        target = s->img_n; // if they want monochrome, we'll post-convert
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - info.extra_read - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - info.extra_read - info.hsz) >> 2;
+   }
+   if (psize == 0) {
+      // accept some number of extra bytes after the header, but if the offset points either to before
+      // the header ends or implies a large amount of extra data, reject the file as malformed
+      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+      int header_limit = 1024; // max we actually read is below 256 bytes currently.
+      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+         return stbi__errpuc("bad header", "Corrupt BMP");
+      }
+      // we established that bytes_read_so_far is positive and sensible.
+      // the first half of this test rejects offsets that are either too small positives, or
+      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+      // ensures the number computed in the second half of the test can't overflow.
+      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+         return stbi__errpuc("bad offset", "Corrupt BMP");
+      } else {
+         stbi__skip(s, info.offset - bytes_read_so_far);
+      }
+   }
 
-    // sanity-check size
-    if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
-        return stbi__errpuc("too large", "Corrupt BMP");
+   if (info.bpp == 24 && ma == 0xff000000)
+      s->img_n = 3;
+   else
+      s->img_n = ma ? 4 : 3;
+   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+      target = req_comp;
+   else
+      target = s->img_n; // if they want monochrome, we'll post-convert
 
-    out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-    if (!out)
-        return stbi__errpuc("outofmem", "Out of memory");
-    if (info.bpp < 16) {
-        int z = 0;
-        if (psize == 0 || psize > 256) {
-            STBI_FREE(out);
-            return stbi__errpuc("invalid", "Corrupt BMP");
-        }
-        for (i = 0; i < psize; ++i) {
-            pal[i][2] = stbi__get8(s);
-            pal[i][1] = stbi__get8(s);
-            pal[i][0] = stbi__get8(s);
-            if (info.hsz != 12)
-                stbi__get8(s);
-            pal[i][3] = 255;
-        }
-        stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-        if (info.bpp == 1)
-            width = (s->img_x + 7) >> 3;
-        else if (info.bpp == 4)
-            width = (s->img_x + 1) >> 1;
-        else if (info.bpp == 8)
-            width = s->img_x;
-        else {
-            STBI_FREE(out);
-            return stbi__errpuc("bad bpp", "Corrupt BMP");
-        }
-        pad = (-width) & 3;
-        if (info.bpp == 1) {
-            for (j = 0; j < (int)s->img_y; ++j) {
-                int bit_offset = 7, v = stbi__get8(s);
-                for (i = 0; i < (int)s->img_x; ++i) {
-                    int color = (v >> bit_offset) & 0x1;
-                    out[z++] = pal[color][0];
-                    out[z++] = pal[color][1];
-                    out[z++] = pal[color][2];
-                    if (target == 4)
-                        out[z++] = 255;
-                    if (i + 1 == (int)s->img_x)
-                        break;
-                    if ((--bit_offset) < 0) {
-                        bit_offset = 7;
-                        v = stbi__get8(s);
-                    }
-                }
-                stbi__skip(s, pad);
-            }
-        } else {
-            for (j = 0; j < (int)s->img_y; ++j) {
-                for (i = 0; i < (int)s->img_x; i += 2) {
-                    int v = stbi__get8(s), v2 = 0;
-                    if (info.bpp == 4) {
-                        v2 = v & 15;
-                        v >>= 4;
-                    }
-                    out[z++] = pal[v][0];
-                    out[z++] = pal[v][1];
-                    out[z++] = pal[v][2];
-                    if (target == 4)
-                        out[z++] = 255;
-                    if (i + 1 == (int)s->img_x)
-                        break;
-                    v = (info.bpp == 8) ? stbi__get8(s) : v2;
-                    out[z++] = pal[v][0];
-                    out[z++] = pal[v][1];
-                    out[z++] = pal[v][2];
-                    if (target == 4)
-                        out[z++] = 255;
-                }
-                stbi__skip(s, pad);
-            }
-        }
-    } else {
-        int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0, gcount = 0, bcount = 0, acount = 0;
-        int z = 0;
-        int easy = 0;
-        stbi__skip(s, info.offset - info.extra_read - info.hsz);
-        if (info.bpp == 24)
-            width = 3 * s->img_x;
-        else if (info.bpp == 16)
-            width = 2 * s->img_x;
-        else /* bpp = 32 and pad = 0 */
-            width = 0;
-        pad = (-width) & 3;
-        if (info.bpp == 24) {
-            easy = 1;
-        } else if (info.bpp == 32) {
-            if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
-                easy = 2;
-        }
-        if (!easy) {
-            if (!mr || !mg || !mb) {
-                STBI_FREE(out);
-                return stbi__errpuc("bad masks", "Corrupt BMP");
-            }
-            // right shift amt to put high bit in position #7
-            rshift = stbi__high_bit(mr) - 7;
-            rcount = stbi__bitcount(mr);
-            gshift = stbi__high_bit(mg) - 7;
-            gcount = stbi__bitcount(mg);
-            bshift = stbi__high_bit(mb) - 7;
-            bcount = stbi__bitcount(mb);
-            ashift = stbi__high_bit(ma) - 7;
-            acount = stbi__bitcount(ma);
-            if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) {
-                STBI_FREE(out);
-                return stbi__errpuc("bad masks", "Corrupt BMP");
-            }
-        }
-        for (j = 0; j < (int)s->img_y; ++j) {
-            if (easy) {
-                for (i = 0; i < (int)s->img_x; ++i) {
-                    unsigned char a;
-                    out[z + 2] = stbi__get8(s);
-                    out[z + 1] = stbi__get8(s);
-                    out[z + 0] = stbi__get8(s);
-                    z += 3;
-                    a = (easy == 2 ? stbi__get8(s) : 255);
-                    all_a |= a;
-                    if (target == 4)
-                        out[z++] = a;
-                }
-            } else {
-                int bpp = info.bpp;
-                for (i = 0; i < (int)s->img_x; ++i) {
-                    stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s));
-                    unsigned int a;
-                    out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
-                    out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
-                    out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
-                    a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
-                    all_a |= a;
-                    if (target == 4)
-                        out[z++] = STBI__BYTECAST(a);
-                }
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (info.bpp < 16) {
+      int z=0;
+      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      for (i=0; i < psize; ++i) {
+         pal[i][2] = stbi__get8(s);
+         pal[i][1] = stbi__get8(s);
+         pal[i][0] = stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
+         pal[i][3] = 255;
+      }
+      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      pad = (-width)&3;
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s);
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
             }
             stbi__skip(s, pad);
-        }
-    }
-
-    // if alpha channel is all 0s, replace with all 255s
-    if (target == 4 && all_a == 0)
-        for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
-            out[i] = 255;
-
-    if (flip_vertically) {
-        stbi_uc t;
-        for (j = 0; j < (int)s->img_y >> 1; ++j) {
-            stbi_uc * p1 = out + j * s->img_x * target;
-            stbi_uc * p2 = out + (s->img_y - 1 - j) * s->img_x * target;
-            for (i = 0; i < (int)s->img_x * target; ++i) {
-                t = p1[i];
-                p1[i] = p2[i];
-                p2[i] = t;
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) {
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
             }
-        }
-    }
+            stbi__skip(s, pad);
+         }
+      }
+   } else {
+      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
+      int z = 0;
+      int easy=0;
+      stbi__skip(s, info.offset - info.extra_read - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
+      else /* bpp = 32 and pad = 0 */ width=0;
+      pad = (-width) & 3;
+      if (info.bpp == 24) {
+         easy = 1;
+      } else if (info.bpp == 32) {
+         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            easy = 2;
+      }
+      if (!easy) {
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         // right shift amt to put high bit in position #7
+         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
+         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
+         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
+         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+      }
+      for (j=0; j < (int) s->img_y; ++j) {
+         if (easy) {
+            for (i=0; i < (int) s->img_x; ++i) {
+               unsigned char a;
+               out[z+2] = stbi__get8(s);
+               out[z+1] = stbi__get8(s);
+               out[z+0] = stbi__get8(s);
+               z += 3;
+               a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = a;
+            }
+         } else {
+            int bpp = info.bpp;
+            for (i=0; i < (int) s->img_x; ++i) {
+               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
+               unsigned int a;
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
+            }
+         }
+         stbi__skip(s, pad);
+      }
+   }
 
-    if (req_comp && req_comp != target) {
-        out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-        if (out == NULL)
-            return out; // stbi__convert_format frees input on failure
-    }
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
 
-    *x = s->img_x;
-    *y = s->img_y;
-    if (comp)
-        *comp = s->img_n;
-    return out;
+   if (flip_vertically) {
+      stbi_uc t;
+      for (j=0; j < (int) s->img_y>>1; ++j) {
+         stbi_uc *p1 = out +      j     *s->img_x*target;
+         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
+         for (i=0; i < (int) s->img_x*target; ++i) {
+            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
+         }
+      }
+   }
+
+   if (req_comp && req_comp != target) {
+      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+   return out;
 }
 #endif
 
@@ -6100,74 +5736,68 @@ static void * stbi__bmp_load(stbi__context * s, int * x, int * y, int * comp, in
 // by Jonathan Dummer
 #ifndef STBI_NO_TGA
 // returns STBI_rgb or whatever, 0 on error
-static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int * is_rgb16) {
-    // only RGB or RGBA (incl. 16bit) or grey allowed
-    if (is_rgb16)
-        *is_rgb16 = 0;
-    switch (bits_per_pixel) {
-    case 8:
-        return STBI_grey;
-    case 16:
-        if (is_grey)
-            return STBI_grey_alpha;
-        // fallthrough
-    case 15:
-        if (is_rgb16)
-            *is_rgb16 = 1;
-        return STBI_rgb;
-    case 24: // fallthrough
-    case 32:
-        return bits_per_pixel / 8;
-    default:
-        return 0;
-    }
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // only RGB or RGBA (incl. 16bit) or grey allowed
+   if (is_rgb16) *is_rgb16 = 0;
+   switch(bits_per_pixel) {
+      case 8:  return STBI_grey;
+      case 16: if(is_grey) return STBI_grey_alpha;
+               // fallthrough
+      case 15: if(is_rgb16) *is_rgb16 = 1;
+               return STBI_rgb;
+      case 24: // fallthrough
+      case 32: return bits_per_pixel/8;
+      default: return 0;
+   }
 }
 
-static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp) {
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
+{
     int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
     int sz, tga_colormap_type;
-    stbi__get8(s);                     // discard Offset
+    stbi__get8(s);                   // discard Offset
     tga_colormap_type = stbi__get8(s); // colormap type
-    if (tga_colormap_type > 1) {
+    if( tga_colormap_type > 1 ) {
         stbi__rewind(s);
-        return 0; // only RGB or indexed allowed
+        return 0;      // only RGB or indexed allowed
     }
     tga_image_type = stbi__get8(s); // image type
-    if (tga_colormap_type == 1) {   // colormapped (paletted) image
+    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
         if (tga_image_type != 1 && tga_image_type != 9) {
             stbi__rewind(s);
             return 0;
         }
-        stbi__skip(s, 4);   // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s); //   check bits per palette color entry
-        if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) {
+        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s);    //   check bits per palette color entry
+        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
             stbi__rewind(s);
             return 0;
         }
-        stbi__skip(s, 4); // skip image x and y origin
+        stbi__skip(s,4);       // skip image x and y origin
         tga_colormap_bpp = sz;
     } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-        if ((tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11)) {
+        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
             stbi__rewind(s);
             return 0; // only RGB or grey allowed, +/- RLE
         }
-        stbi__skip(s, 9); // skip colormap specification and image x/y origin
+        stbi__skip(s,9); // skip colormap specification and image x/y origin
         tga_colormap_bpp = 0;
     }
     tga_w = stbi__get16le(s);
-    if (tga_w < 1) {
+    if( tga_w < 1 ) {
         stbi__rewind(s);
-        return 0; // test width
+        return 0;   // test width
     }
     tga_h = stbi__get16le(s);
-    if (tga_h < 1) {
+    if( tga_h < 1 ) {
         stbi__rewind(s);
-        return 0; // test height
+        return 0;   // test height
     }
     tga_bits_per_pixel = stbi__get8(s); // bits per pixel
-    stbi__get8(s);                      // ignore alpha bits
+    stbi__get8(s); // ignore alpha bits
     if (tga_colormap_bpp != 0) {
-        if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
+        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
             // when using a colormap, tga_bits_per_pixel is the size of the indexes
             // I don't think anything but 8 or 16bit indexes makes sense
             stbi__rewind(s);
@@ -6177,268 +5807,270 @@ static int stbi__tga_info(stbi__context * s, int * x, int * y, int * comp) {
     } else {
         tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
     }
-    if (!tga_comp) {
-        stbi__rewind(s);
-        return 0;
+    if(!tga_comp) {
+      stbi__rewind(s);
+      return 0;
     }
-    if (x)
-        *x = tga_w;
-    if (y)
-        *y = tga_h;
-    if (comp)
-        *comp = tga_comp;
-    return 1; // seems to have passed everything
+    if (x) *x = tga_w;
+    if (y) *y = tga_h;
+    if (comp) *comp = tga_comp;
+    return 1;                   // seems to have passed everything
 }
 
-static int stbi__tga_test(stbi__context * s) {
-    int res = 0;
-    int sz, tga_color_type;
-    stbi__get8(s);                  //   discard Offset
-    tga_color_type = stbi__get8(s); //   color type
-    if (tga_color_type > 1)
-        goto errorEnd;         //   only RGB or indexed allowed
-    sz = stbi__get8(s);        //   image type
-    if (tga_color_type == 1) { // colormapped (paletted) image
-        if (sz != 1 && sz != 9)
-            goto errorEnd;  // colortype 1 demands image type 1 or 9
-        stbi__skip(s, 4);   // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s); //   check bits per palette color entry
-        if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
-            goto errorEnd;
-        stbi__skip(s, 4); // skip image x and y origin
-    } else {              // "normal" image w/o colormap
-        if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
-            goto errorEnd; // only RGB or grey allowed, +/- RLE
-        stbi__skip(s, 9);  // skip colormap specification and image x/y origin
-    }
-    if (stbi__get16le(s) < 1)
-        goto errorEnd; //   test width
-    if (stbi__get16le(s) < 1)
-        goto errorEnd;  //   test height
-    sz = stbi__get8(s); //   bits per pixel
-    if ((tga_color_type == 1) && (sz != 8) && (sz != 16))
-        goto errorEnd; // for colormapped images, bpp is size of an index
-    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
-        goto errorEnd;
+static int stbi__tga_test(stbi__context *s)
+{
+   int res = 0;
+   int sz, tga_color_type;
+   stbi__get8(s);      //   discard Offset
+   tga_color_type = stbi__get8(s);   //   color type
+   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
+   sz = stbi__get8(s);   //   image type
+   if ( tga_color_type == 1 ) { // colormapped (paletted) image
+      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
+      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+      sz = stbi__get8(s);    //   check bits per palette color entry
+      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+      stbi__skip(s,4);       // skip image x and y origin
+   } else { // "normal" image w/o colormap
+      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
+      stbi__skip(s,9); // skip colormap specification and image x/y origin
+   }
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
+   sz = stbi__get8(s);   //   bits per pixel
+   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
+   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
 
-    res = 1; // if we got this far, everything's good and we can return 1 instead of 0
+   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
 
 errorEnd:
-    stbi__rewind(s);
-    return res;
+   stbi__rewind(s);
+   return res;
 }
 
 // read 16bit value and convert to 24bit RGB
-static void stbi__tga_read_rgb16(stbi__context * s, stbi_uc * out) {
-    stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
-    stbi__uint16 fiveBitMask = 31;
-    // we have 3 channels with 5bits each
-    int r = (px >> 10) & fiveBitMask;
-    int g = (px >> 5) & fiveBitMask;
-    int b = px & fiveBitMask;
-    // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
-    out[0] = (stbi_uc)((r * 255) / 31);
-    out[1] = (stbi_uc)((g * 255) / 31);
-    out[2] = (stbi_uc)((b * 255) / 31);
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+   stbi__uint16 fiveBitMask = 31;
+   // we have 3 channels with 5bits each
+   int r = (px >> 10) & fiveBitMask;
+   int g = (px >> 5) & fiveBitMask;
+   int b = px & fiveBitMask;
+   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
+   out[0] = (stbi_uc)((r * 255)/31);
+   out[1] = (stbi_uc)((g * 255)/31);
+   out[2] = (stbi_uc)((b * 255)/31);
 
-    // some people claim that the most significant bit might be used for alpha
-    // (possibly if an alpha-bit is set in the "image descriptor byte")
-    // but that only made 16bit test images completely translucent..
-    // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
 }
 
-static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    //   read in the TGA header stuff
-    int tga_offset = stbi__get8(s);
-    int tga_indexed = stbi__get8(s);
-    int tga_image_type = stbi__get8(s);
-    int tga_is_RLE = 0;
-    int tga_palette_start = stbi__get16le(s);
-    int tga_palette_len = stbi__get16le(s);
-    int tga_palette_bits = stbi__get8(s);
-    int tga_x_origin = stbi__get16le(s);
-    int tga_y_origin = stbi__get16le(s);
-    int tga_width = stbi__get16le(s);
-    int tga_height = stbi__get16le(s);
-    int tga_bits_per_pixel = stbi__get8(s);
-    int tga_comp, tga_rgb16 = 0;
-    int tga_inverted = stbi__get8(s);
-    // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
-    //   image data
-    unsigned char * tga_data;
-    unsigned char * tga_palette = NULL;
-    int i, j;
-    unsigned char raw_data[4] = {0};
-    int RLE_count = 0;
-    int RLE_repeating = 0;
-    int read_next_pixel = 1;
-    STBI_NOTUSED(ri);
-    STBI_NOTUSED(tga_x_origin); // @TODO
-    STBI_NOTUSED(tga_y_origin); // @TODO
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   //   read in the TGA header stuff
+   int tga_offset = stbi__get8(s);
+   int tga_indexed = stbi__get8(s);
+   int tga_image_type = stbi__get8(s);
+   int tga_is_RLE = 0;
+   int tga_palette_start = stbi__get16le(s);
+   int tga_palette_len = stbi__get16le(s);
+   int tga_palette_bits = stbi__get8(s);
+   int tga_x_origin = stbi__get16le(s);
+   int tga_y_origin = stbi__get16le(s);
+   int tga_width = stbi__get16le(s);
+   int tga_height = stbi__get16le(s);
+   int tga_bits_per_pixel = stbi__get8(s);
+   int tga_comp, tga_rgb16=0;
+   int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+   //   image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4] = {0};
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+   STBI_NOTUSED(tga_x_origin); // @TODO
+   STBI_NOTUSED(tga_y_origin); // @TODO
 
-    if (tga_height > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
-    if (tga_width > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
 
-    //   do a tiny bit of precessing
-    if (tga_image_type >= 8) {
-        tga_image_type -= 8;
-        tga_is_RLE = 1;
-    }
-    tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+   //   do a tiny bit of precessing
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
 
-    //   If I'm paletted, then I'll use the number of bits from the palette
-    if (tga_indexed)
-        tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
-    else
-        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+   //   If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
 
-    if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
-        return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
 
-    //   tga info
-    *x = tga_width;
-    *y = tga_height;
-    if (comp)
-        *comp = tga_comp;
+   //   tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
 
-    if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
-        return stbi__errpuc("too large", "Corrupt TGA");
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
 
-    tga_data = (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-    if (!tga_data)
-        return stbi__errpuc("outofmem", "Out of memory");
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
 
-    // skip to the data's starting position (offset usually = 0)
-    stbi__skip(s, tga_offset);
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
 
-    if (!tga_indexed && !tga_is_RLE && !tga_rgb16) {
-        for (i = 0; i < tga_height; ++i) {
-            int row = tga_inverted ? tga_height - i - 1 : i;
-            stbi_uc * tga_row = tga_data + row * tga_width * tga_comp;
-            stbi__getn(s, tga_row, tga_width * tga_comp);
-        }
-    } else {
-        //   do I need to load a palette?
-        if (tga_indexed) {
-            if (tga_palette_len == 0) { /* you have to have at least one entry! */
-                STBI_FREE(tga_data);
-                return stbi__errpuc("bad palette", "Corrupt TGA");
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else  {
+      //   do I need to load a palette?
+      if ( tga_indexed)
+      {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
+         //   any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         //   load the palette
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
             }
-
-            //   any data to skip? (offset usually = 0)
-            stbi__skip(s, tga_palette_start);
-            //   load the palette
-            tga_palette = (unsigned char *)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-            if (!tga_palette) {
-                STBI_FREE(tga_data);
-                return stbi__errpuc("outofmem", "Out of memory");
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      //   load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               //   yep, get the next byte as a RLE command
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
             }
-            if (tga_rgb16) {
-                stbi_uc * pal_entry = tga_palette;
-                STBI_ASSERT(tga_comp == STBI_rgb);
-                for (i = 0; i < tga_palette_len; ++i) {
-                    stbi__tga_read_rgb16(s, pal_entry);
-                    pal_entry += tga_comp;
-                }
-            } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
-                STBI_FREE(tga_data);
-                STBI_FREE(tga_palette);
-                return stbi__errpuc("bad palette", "Corrupt TGA");
-            }
-        }
-        //   load the data
-        for (i = 0; i < tga_width * tga_height; ++i) {
-            //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
-            if (tga_is_RLE) {
-                if (RLE_count == 0) {
-                    //   yep, get the next byte as a RLE command
-                    int RLE_cmd = stbi__get8(s);
-                    RLE_count = 1 + (RLE_cmd & 127);
-                    RLE_repeating = RLE_cmd >> 7;
-                    read_next_pixel = 1;
-                } else if (!RLE_repeating) {
-                    read_next_pixel = 1;
-                }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         //   OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            //   load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
+                  pal_idx = 0;
+               }
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = tga_palette[pal_idx+j];
+               }
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
             } else {
-                read_next_pixel = 1;
+               //   read in the data raw
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = stbi__get8(s);
+               }
             }
-            //   OK, if I need to read a pixel, do it now
-            if (read_next_pixel) {
-                //   load however much data we did have
-                if (tga_indexed) {
-                    // read in index, then perform the lookup
-                    int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-                    if (pal_idx >= tga_palette_len) {
-                        // invalid index
-                        pal_idx = 0;
-                    }
-                    pal_idx *= tga_comp;
-                    for (j = 0; j < tga_comp; ++j) {
-                        raw_data[j] = tga_palette[pal_idx + j];
-                    }
-                } else if (tga_rgb16) {
-                    STBI_ASSERT(tga_comp == STBI_rgb);
-                    stbi__tga_read_rgb16(s, raw_data);
-                } else {
-                    //   read in the data raw
-                    for (j = 0; j < tga_comp; ++j) {
-                        raw_data[j] = stbi__get8(s);
-                    }
-                }
-                //   clear the reading flag for the next pixel
-                read_next_pixel = 0;
-            } // end of reading a pixel
+            //   clear the reading flag for the next pixel
+            read_next_pixel = 0;
+         } // end of reading a pixel
 
-            // copy data
-            for (j = 0; j < tga_comp; ++j)
-                tga_data[i * tga_comp + j] = raw_data[j];
+         // copy data
+         for (j = 0; j < tga_comp; ++j)
+           tga_data[i*tga_comp+j] = raw_data[j];
 
-            //   in case we're in RLE mode, keep counting down
-            --RLE_count;
-        }
-        //   do I need to invert the image?
-        if (tga_inverted) {
-            for (j = 0; j * 2 < tga_height; ++j) {
-                int index1 = j * tga_width * tga_comp;
-                int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-                for (i = tga_width * tga_comp; i > 0; --i) {
-                    unsigned char temp = tga_data[index1];
-                    tga_data[index1] = tga_data[index2];
-                    tga_data[index2] = temp;
-                    ++index1;
-                    ++index2;
-                }
+         //   in case we're in RLE mode, keep counting down
+         --RLE_count;
+      }
+      //   do I need to invert the image?
+      if ( tga_inverted )
+      {
+         for (j = 0; j*2 < tga_height; ++j)
+         {
+            int index1 = j * tga_width * tga_comp;
+            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+            for (i = tga_width * tga_comp; i > 0; --i)
+            {
+               unsigned char temp = tga_data[index1];
+               tga_data[index1] = tga_data[index2];
+               tga_data[index2] = temp;
+               ++index1;
+               ++index2;
             }
-        }
-        //   clear my palette, if I had one
-        if (tga_palette != NULL) {
-            STBI_FREE(tga_palette);
-        }
-    }
+         }
+      }
+      //   clear my palette, if I had one
+      if ( tga_palette != NULL )
+      {
+         STBI_FREE( tga_palette );
+      }
+   }
 
-    // swap RGB - if the source data was RGB16, it already is in the right order
-    if (tga_comp >= 3 && !tga_rgb16) {
-        unsigned char * tga_pixel = tga_data;
-        for (i = 0; i < tga_width * tga_height; ++i) {
-            unsigned char temp = tga_pixel[0];
-            tga_pixel[0] = tga_pixel[2];
-            tga_pixel[2] = temp;
-            tga_pixel += tga_comp;
-        }
-    }
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
+   {
+      unsigned char* tga_pixel = tga_data;
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         unsigned char temp = tga_pixel[0];
+         tga_pixel[0] = tga_pixel[2];
+         tga_pixel[2] = temp;
+         tga_pixel += tga_comp;
+      }
+   }
 
-    // convert to target component count
-    if (req_comp && req_comp != tga_comp)
-        tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+   // convert to target component count
+   if (req_comp && req_comp != tga_comp)
+      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
 
-    //   the things I do to get rid of an error message, and yet keep
-    //   Microsoft's C compilers happy... [8^(
-    tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin = tga_y_origin = 0;
-    STBI_NOTUSED(tga_palette_start);
-    //   OK, done
-    return tga_data;
+   //   the things I do to get rid of an error message, and yet keep
+   //   Microsoft's C compilers happy... [8^(
+   tga_palette_start = tga_palette_len = tga_palette_bits =
+         tga_x_origin = tga_y_origin = 0;
+   STBI_NOTUSED(tga_palette_start);
+   //   OK, done
+   return tga_data;
 }
 #endif
 
@@ -6446,253 +6078,250 @@ static void * stbi__tga_load(stbi__context * s, int * x, int * y, int * comp, in
 // Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
 
 #ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context * s) {
-    int r = (stbi__get32be(s) == 0x38425053);
-    stbi__rewind(s);
-    return r;
+static int stbi__psd_test(stbi__context *s)
+{
+   int r = (stbi__get32be(s) == 0x38425053);
+   stbi__rewind(s);
+   return r;
 }
 
-static int stbi__psd_decode_rle(stbi__context * s, stbi_uc * p, int pixelCount) {
-    int count, nleft, len;
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
+{
+   int count, nleft, len;
 
-    count = 0;
-    while ((nleft = pixelCount - count) > 0) {
-        len = stbi__get8(s);
-        if (len == 128) {
-            // No-op.
-        } else if (len < 128) {
-            // Copy next len+1 bytes literally.
-            len++;
-            if (len > nleft)
-                return 0; // corrupt data
-            count += len;
-            while (len) {
-                *p = stbi__get8(s);
-                p += 4;
-                len--;
-            }
-        } else if (len > 128) {
-            stbi_uc val;
-            // Next -len+1 bytes in the dest are replicated from next source byte.
-            // (Interpret len as a negative 8-bit int.)
-            len = 257 - len;
-            if (len > nleft)
-                return 0; // corrupt data
-            val = stbi__get8(s);
-            count += len;
-            while (len) {
-                *p = val;
-                p += 4;
-                len--;
-            }
-        }
-    }
+   count = 0;
+   while ((nleft = pixelCount - count) > 0) {
+      len = stbi__get8(s);
+      if (len == 128) {
+         // No-op.
+      } else if (len < 128) {
+         // Copy next len+1 bytes literally.
+         len++;
+         if (len > nleft) return 0; // corrupt data
+         count += len;
+         while (len) {
+            *p = stbi__get8(s);
+            p += 4;
+            len--;
+         }
+      } else if (len > 128) {
+         stbi_uc   val;
+         // Next -len+1 bytes in the dest are replicated from next source byte.
+         // (Interpret len as a negative 8-bit int.)
+         len = 257 - len;
+         if (len > nleft) return 0; // corrupt data
+         val = stbi__get8(s);
+         count += len;
+         while (len) {
+            *p = val;
+            p += 4;
+            len--;
+         }
+      }
+   }
 
-    return 1;
+   return 1;
 }
 
-static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri, int bpc) {
-    int pixelCount;
-    int channelCount, compression;
-    int channel, i;
-    int bitdepth;
-    int w, h;
-    stbi_uc * out;
-    STBI_NOTUSED(ri);
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   int pixelCount;
+   int channelCount, compression;
+   int channel, i;
+   int bitdepth;
+   int w,h;
+   stbi_uc *out;
+   STBI_NOTUSED(ri);
 
-    // Check identifier
-    if (stbi__get32be(s) != 0x38425053) // "8BPS"
-        return stbi__errpuc("not PSD", "Corrupt PSD image");
+   // Check identifier
+   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
+      return stbi__errpuc("not PSD", "Corrupt PSD image");
 
-    // Check file type version.
-    if (stbi__get16be(s) != 1)
-        return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+   // Check file type version.
+   if (stbi__get16be(s) != 1)
+      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
 
-    // Skip 6 reserved bytes.
-    stbi__skip(s, 6);
+   // Skip 6 reserved bytes.
+   stbi__skip(s, 6 );
 
-    // Read the number of channels (R, G, B, A, etc).
-    channelCount = stbi__get16be(s);
-    if (channelCount < 0 || channelCount > 16)
-        return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
+   // Read the number of channels (R, G, B, A, etc).
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
 
-    // Read the rows and columns of the image.
-    h = stbi__get32be(s);
-    w = stbi__get32be(s);
+   // Read the rows and columns of the image.
+   h = stbi__get32be(s);
+   w = stbi__get32be(s);
 
-    if (h > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
-    if (w > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
+   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
 
-    // Make sure the depth is 8 bits.
-    bitdepth = stbi__get16be(s);
-    if (bitdepth != 8 && bitdepth != 16)
-        return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
+   // Make sure the depth is 8 bits.
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
 
-    // Make sure the color mode is RGB.
-    // Valid options are:
-    //   0: Bitmap
-    //   1: Grayscale
-    //   2: Indexed color
-    //   3: RGB color
-    //   4: CMYK color
-    //   7: Multichannel
-    //   8: Duotone
-    //   9: Lab color
-    if (stbi__get16be(s) != 3)
-        return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+   // Make sure the color mode is RGB.
+   // Valid options are:
+   //   0: Bitmap
+   //   1: Grayscale
+   //   2: Indexed color
+   //   3: RGB color
+   //   4: CMYK color
+   //   7: Multichannel
+   //   8: Duotone
+   //   9: Lab color
+   if (stbi__get16be(s) != 3)
+      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
 
-    // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
-    stbi__skip(s, stbi__get32be(s));
+   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
+   stbi__skip(s,stbi__get32be(s) );
 
-    // Skip the image resources.  (resolution, pen tool paths, etc)
-    stbi__skip(s, stbi__get32be(s));
+   // Skip the image resources.  (resolution, pen tool paths, etc)
+   stbi__skip(s, stbi__get32be(s) );
 
-    // Skip the reserved data.
-    stbi__skip(s, stbi__get32be(s));
+   // Skip the reserved data.
+   stbi__skip(s, stbi__get32be(s) );
 
-    // Find out if the data is compressed.
-    // Known values:
-    //   0: no compression
-    //   1: RLE compressed
-    compression = stbi__get16be(s);
-    if (compression > 1)
-        return stbi__errpuc("bad compression", "PSD has an unknown compression format");
+   // Find out if the data is compressed.
+   // Known values:
+   //   0: no compression
+   //   1: RLE compressed
+   compression = stbi__get16be(s);
+   if (compression > 1)
+      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
-    // Check size
-    if (!stbi__mad3sizes_valid(4, w, h, 0))
-        return stbi__errpuc("too large", "Corrupt PSD");
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
 
-    // Create the destination image.
+   // Create the destination image.
 
-    if (!compression && bitdepth == 16 && bpc == 16) {
-        out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0);
-        ri->bits_per_channel = 16;
-    } else
-        out = (stbi_uc *)stbi__malloc(4 * w * h);
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
 
-    if (!out)
-        return stbi__errpuc("outofmem", "Out of memory");
-    pixelCount = w * h;
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   pixelCount = w*h;
 
-    // Initialize the data to zero.
-    // memset( out, 0, pixelCount * 4 );
+   // Initialize the data to zero.
+   //memset( out, 0, pixelCount * 4 );
 
-    // Finally, the image data.
-    if (compression) {
-        // RLE as used by .PSD and .TIFF
-        // Loop until you get the number of unpacked bytes you are expecting:
-        //     Read the next source byte into n.
-        //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
-        //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
-        //     Else if n is 128, noop.
-        // Endloop
+   // Finally, the image data.
+   if (compression) {
+      // RLE as used by .PSD and .TIFF
+      // Loop until you get the number of unpacked bytes you are expecting:
+      //     Read the next source byte into n.
+      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+      //     Else if n is 128, noop.
+      // Endloop
 
-        // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
-        // which we're going to just skip.
-        stbi__skip(s, h * channelCount * 2);
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
+      // which we're going to just skip.
+      stbi__skip(s, h * channelCount * 2 );
 
-        // Read the RLE data by channel.
-        for (channel = 0; channel < 4; channel++) {
-            stbi_uc * p;
+      // Read the RLE data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
 
-            p = out + channel;
-            if (channel >= channelCount) {
-                // Fill this channel with default data.
-                for (i = 0; i < pixelCount; i++, p += 4)
-                    *p = (channel == 3 ? 255 : 0);
+         p = out+channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = (channel == 3 ? 255 : 0);
+         } else {
+            // Read the RLE data.
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
+            }
+         }
+      }
+
+   } else {
+      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
+
+      // Read the data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
             } else {
-                // Read the RLE data.
-                if (!stbi__psd_decode_rle(s, p, pixelCount)) {
-                    STBI_FREE(out);
-                    return stbi__errpuc("corrupt", "bad RLE data");
-                }
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
             }
-        }
-    } else {
-        // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-        // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
-
-        // Read the data by channel.
-        for (channel = 0; channel < 4; channel++) {
-            if (channel >= channelCount) {
-                // Fill this channel with default data.
-                if (bitdepth == 16 && bpc == 16) {
-                    stbi__uint16 * q = ((stbi__uint16 *)out) + channel;
-                    stbi__uint16 val = channel == 3 ? 65535 : 0;
-                    for (i = 0; i < pixelCount; i++, q += 4)
-                        *q = val;
-                } else {
-                    stbi_uc * p = out + channel;
-                    stbi_uc val = channel == 3 ? 255 : 0;
-                    for (i = 0; i < pixelCount; i++, p += 4)
-                        *p = val;
-                }
+         } else {
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
             } else {
-                if (ri->bits_per_channel == 16) { // output bpc
-                    stbi__uint16 * q = ((stbi__uint16 *)out) + channel;
-                    for (i = 0; i < pixelCount; i++, q += 4)
-                        *q = (stbi__uint16)stbi__get16be(s);
-                } else {
-                    stbi_uc * p = out + channel;
-                    if (bitdepth == 16) { // input bpc
-                        for (i = 0; i < pixelCount; i++, p += 4)
-                            *p = (stbi_uc)(stbi__get16be(s) >> 8);
-                    } else {
-                        for (i = 0; i < pixelCount; i++, p += 4)
-                            *p = stbi__get8(s);
-                    }
-                }
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
             }
-        }
-    }
+         }
+      }
+   }
 
-    // remove weird white matte from PSD
-    if (channelCount >= 4) {
-        if (ri->bits_per_channel == 16) {
-            for (i = 0; i < w * h; ++i) {
-                stbi__uint16 * pixel = (stbi__uint16 *)out + 4 * i;
-                if (pixel[3] != 0 && pixel[3] != 65535) {
-                    float a = pixel[3] / 65535.0f;
-                    float ra = 1.0f / a;
-                    float inv_a = 65535.0f * (1 - ra);
-                    pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a);
-                    pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a);
-                    pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a);
-                }
+   // remove weird white matte from PSD
+   if (channelCount >= 4) {
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
             }
-        } else {
-            for (i = 0; i < w * h; ++i) {
-                unsigned char * pixel = out + 4 * i;
-                if (pixel[3] != 0 && pixel[3] != 255) {
-                    float a = pixel[3] / 255.0f;
-                    float ra = 1.0f / a;
-                    float inv_a = 255.0f * (1 - ra);
-                    pixel[0] = (unsigned char)(pixel[0] * ra + inv_a);
-                    pixel[1] = (unsigned char)(pixel[1] * ra + inv_a);
-                    pixel[2] = (unsigned char)(pixel[2] * ra + inv_a);
-                }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
             }
-        }
-    }
+         }
+      }
+   }
 
-    // convert to desired output format
-    if (req_comp && req_comp != 4) {
-        if (ri->bits_per_channel == 16)
-            out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4, req_comp, w, h);
-        else
-            out = stbi__convert_format(out, 4, req_comp, w, h);
-        if (out == NULL)
-            return out; // stbi__convert_format frees input on failure
-    }
+   // convert to desired output format
+   if (req_comp && req_comp != 4) {
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
 
-    if (comp)
-        *comp = 4;
-    *y = h;
-    *x = w;
+   if (comp) *comp = 4;
+   *y = h;
+   *x = w;
 
-    return out;
+   return out;
 }
 #endif
 
@@ -6704,221 +6333,216 @@ static void * stbi__psd_load(stbi__context * s, int * x, int * y, int * comp, in
 // See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
 
 #ifndef STBI_NO_PIC
-static int stbi__pic_is4(stbi__context * s, const char * str) {
-    int i;
-    for (i = 0; i < 4; ++i)
-        if (stbi__get8(s) != (stbi_uc)str[i])
-            return 0;
+static int stbi__pic_is4(stbi__context *s,const char *str)
+{
+   int i;
+   for (i=0; i<4; ++i)
+      if (stbi__get8(s) != (stbi_uc)str[i])
+         return 0;
 
-    return 1;
+   return 1;
 }
 
-static int stbi__pic_test_core(stbi__context * s) {
-    int i;
+static int stbi__pic_test_core(stbi__context *s)
+{
+   int i;
 
-    if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
-        return 0;
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
+      return 0;
 
-    for (i = 0; i < 84; ++i)
-        stbi__get8(s);
+   for(i=0;i<84;++i)
+      stbi__get8(s);
 
-    if (!stbi__pic_is4(s, "PICT"))
-        return 0;
+   if (!stbi__pic_is4(s,"PICT"))
+      return 0;
 
-    return 1;
+   return 1;
 }
 
-typedef struct {
-    stbi_uc size, type, channel;
+typedef struct
+{
+   stbi_uc size,type,channel;
 } stbi__pic_packet;
 
-static stbi_uc * stbi__readval(stbi__context * s, int channel, stbi_uc * dest) {
-    int mask = 0x80, i;
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
+{
+   int mask=0x80, i;
 
-    for (i = 0; i < 4; ++i, mask >>= 1) {
-        if (channel & mask) {
-            if (stbi__at_eof(s))
-                return stbi__errpuc("bad file", "PIC file too short");
-            dest[i] = stbi__get8(s);
-        }
-    }
+   for (i=0; i<4; ++i, mask>>=1) {
+      if (channel & mask) {
+         if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
+         dest[i]=stbi__get8(s);
+      }
+   }
 
-    return dest;
+   return dest;
 }
 
-static void stbi__copyval(int channel, stbi_uc * dest, const stbi_uc * src) {
-    int mask = 0x80, i;
+static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
+{
+   int mask=0x80,i;
 
-    for (i = 0; i < 4; ++i, mask >>= 1)
-        if (channel & mask)
-            dest[i] = src[i];
+   for (i=0;i<4; ++i, mask>>=1)
+      if (channel&mask)
+         dest[i]=src[i];
 }
 
-static stbi_uc * stbi__pic_load_core(stbi__context * s, int width, int height, int * comp, stbi_uc * result) {
-    int act_comp = 0, num_packets = 0, y, chained;
-    stbi__pic_packet packets[10];
+static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
+{
+   int act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
 
-    // this will (should...) cater for even some bizarre stuff like having data
+   // this will (should...) cater for even some bizarre stuff like having data
     // for the same channel in multiple packets.
-    do {
-        stbi__pic_packet * packet;
+   do {
+      stbi__pic_packet *packet;
 
-        if (num_packets == sizeof(packets) / sizeof(packets[0]))
-            return stbi__errpuc("bad format", "too many packets");
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
 
-        packet = &packets[num_packets++];
+      packet = &packets[num_packets++];
 
-        chained = stbi__get8(s);
-        packet->size = stbi__get8(s);
-        packet->type = stbi__get8(s);
-        packet->channel = stbi__get8(s);
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
 
-        act_comp |= packet->channel;
+      act_comp |= packet->channel;
 
-        if (stbi__at_eof(s))
-            return stbi__errpuc("bad file", "file too short (reading packets)");
-        if (packet->size != 8)
-            return stbi__errpuc("bad format", "packet isn't 8bpp");
-    } while (chained);
+      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
 
-    *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
 
-    for (y = 0; y < height; ++y) {
-        int packet_idx;
+   for(y=0; y<height; ++y) {
+      int packet_idx;
 
-        for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) {
-            stbi__pic_packet * packet = &packets[packet_idx];
-            stbi_uc * dest = result + y * width * 4;
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
 
-            switch (packet->type) {
+         switch (packet->type) {
             default:
-                return stbi__errpuc("bad format", "packet has bad compression type");
+               return stbi__errpuc("bad format","packet has bad compression type");
 
-            case 0: { // uncompressed
-                int x;
+            case 0: {//uncompressed
+               int x;
 
-                for (x = 0; x < width; ++x, dest += 4)
-                    if (!stbi__readval(s, packet->channel, dest))
-                        return 0;
-                break;
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
             }
 
-            case 1: // Pure RLE
-            {
-                int left = width, i;
+            case 1://Pure RLE
+               {
+                  int left=width, i;
 
-                while (left > 0) {
-                    stbi_uc count, value[4];
+                  while (left>0) {
+                     stbi_uc count,value[4];
 
-                    count = stbi__get8(s);
-                    if (stbi__at_eof(s))
-                        return stbi__errpuc("bad file", "file too short (pure read count)");
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
 
-                    if (count > left)
-                        count = (stbi_uc)left;
+                     if (count > left)
+                        count = (stbi_uc) left;
 
-                    if (!stbi__readval(s, packet->channel, value))
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
                         return 0;
 
-                    for (i = 0; i < count; ++i, dest += 4)
-                        stbi__copyval(packet->channel, dest, value);
-                    left -= count;
-                }
-            } break;
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
 
-            case 2: { // Mixed RLE
-                int left = width;
-                while (left > 0) {
-                    int count = stbi__get8(s), i;
-                    if (stbi__at_eof(s))
-                        return stbi__errpuc("bad file", "file too short (mixed read count)");
-
-                    if (count >= 128) { // Repeated
-                        stbi_uc value[4];
-
-                        if (count == 128)
-                            count = stbi__get16be(s);
-                        else
-                            count -= 127;
-                        if (count > left)
-                            return stbi__errpuc("bad file", "scanline overrun");
-
-                        if (!stbi__readval(s, packet->channel, value))
-                            return 0;
-
-                        for (i = 0; i < count; ++i, dest += 4)
-                            stbi__copyval(packet->channel, dest, value);
-                    } else { // Raw
-                        ++count;
-                        if (count > left)
-                            return stbi__errpuc("bad file", "scanline overrun");
-
-                        for (i = 0; i < count; ++i, dest += 4)
-                            if (!stbi__readval(s, packet->channel, dest))
-                                return 0;
-                    }
-                    left -= count;
-                }
-                break;
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
             }
-            }
-        }
-    }
+         }
+      }
+   }
 
-    return result;
+   return result;
 }
 
-static void * stbi__pic_load(stbi__context * s, int * px, int * py, int * comp, int req_comp, stbi__result_info * ri) {
-    stbi_uc * result;
-    int i, x, y, internal_comp;
-    STBI_NOTUSED(ri);
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
 
-    if (!comp)
-        comp = &internal_comp;
+   if (!comp) comp = &internal_comp;
 
-    for (i = 0; i < 92; ++i)
-        stbi__get8(s);
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
 
-    x = stbi__get16be(s);
-    y = stbi__get16be(s);
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
 
-    if (y > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
-    if (x > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
 
-    if (stbi__at_eof(s))
-        return stbi__errpuc("bad file", "file too short (pic header)");
-    if (!stbi__mad3sizes_valid(x, y, 4, 0))
-        return stbi__errpuc("too large", "PIC image too large to decode");
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
 
-    stbi__get32be(s); // skip `ratio'
-    stbi__get16be(s); // skip `fields'
-    stbi__get16be(s); // skip `pad'
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
 
-    // intermediate buffer is RGBA
-    result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0);
-    if (!result)
-        return stbi__errpuc("outofmem", "Out of memory");
-    memset(result, 0xff, x * y * 4);
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
 
-    if (!stbi__pic_load_core(s, x, y, comp, result)) {
-        STBI_FREE(result);
-        result = 0;
-    }
-    *px = x;
-    *py = y;
-    if (req_comp == 0)
-        req_comp = *comp;
-    result = stbi__convert_format(result, 4, req_comp, x, y);
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      result=0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
 
-    return result;
+   return result;
 }
 
-static int stbi__pic_test(stbi__context * s) {
-    int r = stbi__pic_test_core(s);
-    stbi__rewind(s);
-    return r;
+static int stbi__pic_test(stbi__context *s)
+{
+   int r = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return r;
 }
 #endif
 
@@ -6926,968 +6550,931 @@ static int stbi__pic_test(stbi__context * s) {
 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
 
 #ifndef STBI_NO_GIF
-typedef struct {
-    stbi__int16 prefix;
-    stbi_uc first;
-    stbi_uc suffix;
+typedef struct
+{
+   stbi__int16 prefix;
+   stbi_uc first;
+   stbi_uc suffix;
 } stbi__gif_lzw;
 
-typedef struct {
-    int w, h;
-    stbi_uc * out;        // output buffer (always 4 components)
-    stbi_uc * background; // The current "background" as far as a gif is concerned
-    stbi_uc * history;
-    int flags, bgindex, ratio, transparent, eflags;
-    stbi_uc pal[256][4];
-    stbi_uc lpal[256][4];
-    stbi__gif_lzw codes[8192];
-    stbi_uc * color_table;
-    int parse, step;
-    int lflags;
-    int start_x, start_y;
-    int max_x, max_y;
-    int cur_x, cur_y;
-    int line_size;
-    int delay;
+typedef struct
+{
+   int w,h;
+   stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history;
+   int flags, bgindex, ratio, transparent, eflags;
+   stbi_uc  pal[256][4];
+   stbi_uc lpal[256][4];
+   stbi__gif_lzw codes[8192];
+   stbi_uc *color_table;
+   int parse, step;
+   int lflags;
+   int start_x, start_y;
+   int max_x, max_y;
+   int cur_x, cur_y;
+   int line_size;
+   int delay;
 } stbi__gif;
 
-static int stbi__gif_test_raw(stbi__context * s) {
-    int sz;
-    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
-        return 0;
-    sz = stbi__get8(s);
-    if (sz != '9' && sz != '7')
-        return 0;
-    if (stbi__get8(s) != 'a')
-        return 0;
-    return 1;
+static int stbi__gif_test_raw(stbi__context *s)
+{
+   int sz;
+   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0;
+   sz = stbi__get8(s);
+   if (sz != '9' && sz != '7') return 0;
+   if (stbi__get8(s) != 'a') return 0;
+   return 1;
 }
 
-static int stbi__gif_test(stbi__context * s) {
-    int r = stbi__gif_test_raw(s);
-    stbi__rewind(s);
-    return r;
+static int stbi__gif_test(stbi__context *s)
+{
+   int r = stbi__gif_test_raw(s);
+   stbi__rewind(s);
+   return r;
 }
 
-static void stbi__gif_parse_colortable(stbi__context * s, stbi_uc pal[256][4], int num_entries, int transp) {
-    int i;
-    for (i = 0; i < num_entries; ++i) {
-        pal[i][2] = stbi__get8(s);
-        pal[i][1] = stbi__get8(s);
-        pal[i][0] = stbi__get8(s);
-        pal[i][3] = transp == i ? 0 : 255;
-    }
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
+{
+   int i;
+   for (i=0; i < num_entries; ++i) {
+      pal[i][2] = stbi__get8(s);
+      pal[i][1] = stbi__get8(s);
+      pal[i][0] = stbi__get8(s);
+      pal[i][3] = transp == i ? 0 : 255;
+   }
 }
 
-static int stbi__gif_header(stbi__context * s, stbi__gif * g, int * comp, int is_info) {
-    stbi_uc version;
-    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
-        return stbi__err("not GIF", "Corrupt GIF");
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
+{
+   stbi_uc version;
+   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+      return stbi__err("not GIF", "Corrupt GIF");
 
-    version = stbi__get8(s);
-    if (version != '7' && version != '9')
-        return stbi__err("not GIF", "Corrupt GIF");
-    if (stbi__get8(s) != 'a')
-        return stbi__err("not GIF", "Corrupt GIF");
+   version = stbi__get8(s);
+   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
 
-    stbi__g_failure_reason = "";
-    g->w = stbi__get16le(s);
-    g->h = stbi__get16le(s);
-    g->flags = stbi__get8(s);
-    g->bgindex = stbi__get8(s);
-    g->ratio = stbi__get8(s);
-    g->transparent = -1;
+   stbi__g_failure_reason = "";
+   g->w = stbi__get16le(s);
+   g->h = stbi__get16le(s);
+   g->flags = stbi__get8(s);
+   g->bgindex = stbi__get8(s);
+   g->ratio = stbi__get8(s);
+   g->transparent = -1;
 
-    if (g->w > STBI_MAX_DIMENSIONS)
-        return stbi__err("too large", "Very large image (corrupt?)");
-    if (g->h > STBI_MAX_DIMENSIONS)
-        return stbi__err("too large", "Very large image (corrupt?)");
+   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
 
-    if (comp != 0)
-        *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments
+   if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
 
-    if (is_info)
-        return 1;
+   if (is_info) return 1;
 
-    if (g->flags & 0x80)
-        stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1);
+   if (g->flags & 0x80)
+      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
 
-    return 1;
+   return 1;
 }
 
-static int stbi__gif_info_raw(stbi__context * s, int * x, int * y, int * comp) {
-    stbi__gif * g = (stbi__gif *)stbi__malloc(sizeof(stbi__gif));
-    if (!g)
-        return stbi__err("outofmem", "Out of memory");
-    if (!stbi__gif_header(s, g, comp, 1)) {
-        STBI_FREE(g);
-        stbi__rewind(s);
-        return 0;
-    }
-    if (x)
-        *x = g->w;
-    if (y)
-        *y = g->h;
-    STBI_FREE(g);
-    return 1;
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!g) return stbi__err("outofmem", "Out of memory");
+   if (!stbi__gif_header(s, g, comp, 1)) {
+      STBI_FREE(g);
+      stbi__rewind( s );
+      return 0;
+   }
+   if (x) *x = g->w;
+   if (y) *y = g->h;
+   STBI_FREE(g);
+   return 1;
 }
 
-static void stbi__out_gif_code(stbi__gif * g, stbi__uint16 code) {
-    stbi_uc *p, *c;
-    int idx;
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
+{
+   stbi_uc *p, *c;
+   int idx;
 
-    // recurse to decode the prefixes, since the linked-list is backwards,
-    // and working backwards through an interleaved image would be nasty
-    if (g->codes[code].prefix >= 0)
-        stbi__out_gif_code(g, g->codes[code].prefix);
+   // recurse to decode the prefixes, since the linked-list is backwards,
+   // and working backwards through an interleaved image would be nasty
+   if (g->codes[code].prefix >= 0)
+      stbi__out_gif_code(g, g->codes[code].prefix);
 
-    if (g->cur_y >= g->max_y)
-        return;
+   if (g->cur_y >= g->max_y) return;
 
-    idx = g->cur_x + g->cur_y;
-    p = &g->out[idx];
-    g->history[idx / 4] = 1;
+   idx = g->cur_x + g->cur_y;
+   p = &g->out[idx];
+   g->history[idx / 4] = 1;
 
-    c = &g->color_table[g->codes[code].suffix * 4];
-    if (c[3] > 128) { // don't render transparent pixels;
-        p[0] = c[2];
-        p[1] = c[1];
-        p[2] = c[0];
-        p[3] = c[3];
-    }
-    g->cur_x += 4;
+   c = &g->color_table[g->codes[code].suffix * 4];
+   if (c[3] > 128) { // don't render transparent pixels;
+      p[0] = c[2];
+      p[1] = c[1];
+      p[2] = c[0];
+      p[3] = c[3];
+   }
+   g->cur_x += 4;
 
-    if (g->cur_x >= g->max_x) {
-        g->cur_x = g->start_x;
-        g->cur_y += g->step;
+   if (g->cur_x >= g->max_x) {
+      g->cur_x = g->start_x;
+      g->cur_y += g->step;
 
-        while (g->cur_y >= g->max_y && g->parse > 0) {
-            g->step = (1 << g->parse) * g->line_size;
-            g->cur_y = g->start_y + (g->step >> 1);
-            --g->parse;
-        }
-    }
+      while (g->cur_y >= g->max_y && g->parse > 0) {
+         g->step = (1 << g->parse) * g->line_size;
+         g->cur_y = g->start_y + (g->step >> 1);
+         --g->parse;
+      }
+   }
 }
 
-static stbi_uc * stbi__process_gif_raster(stbi__context * s, stbi__gif * g) {
-    stbi_uc lzw_cs;
-    stbi__int32 len, init_code;
-    stbi__uint32 first;
-    stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
-    stbi__gif_lzw * p;
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
+{
+   stbi_uc lzw_cs;
+   stbi__int32 len, init_code;
+   stbi__uint32 first;
+   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+   stbi__gif_lzw *p;
 
-    lzw_cs = stbi__get8(s);
-    if (lzw_cs > 12)
-        return NULL;
-    clear = 1 << lzw_cs;
-    first = 1;
-    codesize = lzw_cs + 1;
-    codemask = (1 << codesize) - 1;
-    bits = 0;
-    valid_bits = 0;
-    for (init_code = 0; init_code < clear; init_code++) {
-        g->codes[init_code].prefix = -1;
-        g->codes[init_code].first = (stbi_uc)init_code;
-        g->codes[init_code].suffix = (stbi_uc)init_code;
-    }
+   lzw_cs = stbi__get8(s);
+   if (lzw_cs > 12) return NULL;
+   clear = 1 << lzw_cs;
+   first = 1;
+   codesize = lzw_cs + 1;
+   codemask = (1 << codesize) - 1;
+   bits = 0;
+   valid_bits = 0;
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
+   }
 
-    // support no starting clear code
-    avail = clear + 2;
-    oldcode = -1;
+   // support no starting clear code
+   avail = clear+2;
+   oldcode = -1;
 
-    len = 0;
-    for (;;) {
-        if (valid_bits < codesize) {
-            if (len == 0) {
-                len = stbi__get8(s); // start new block
-                if (len == 0)
-                    return g->out;
+   len = 0;
+   for(;;) {
+      if (valid_bits < codesize) {
+         if (len == 0) {
+            len = stbi__get8(s); // start new block
+            if (len == 0)
+               return g->out;
+         }
+         --len;
+         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
+         valid_bits += 8;
+      } else {
+         stbi__int32 code = bits & codemask;
+         bits >>= codesize;
+         valid_bits -= codesize;
+         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+         if (code == clear) {  // clear code
+            codesize = lzw_cs + 1;
+            codemask = (1 << codesize) - 1;
+            avail = clear + 2;
+            oldcode = -1;
+            first = 0;
+         } else if (code == clear + 1) { // end of stream code
+            stbi__skip(s, len);
+            while ((len = stbi__get8(s)) > 0)
+               stbi__skip(s,len);
+            return g->out;
+         } else if (code <= avail) {
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
             }
-            --len;
-            bits |= (stbi__int32)stbi__get8(s) << valid_bits;
-            valid_bits += 8;
-        } else {
-            stbi__int32 code = bits & codemask;
-            bits >>= codesize;
-            valid_bits -= codesize;
-            // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-            if (code == clear) { // clear code
-                codesize = lzw_cs + 1;
-                codemask = (1 << codesize) - 1;
-                avail = clear + 2;
-                oldcode = -1;
-                first = 0;
-            } else if (code == clear + 1) { // end of stream code
-                stbi__skip(s, len);
-                while ((len = stbi__get8(s)) > 0)
-                    stbi__skip(s, len);
-                return g->out;
-            } else if (code <= avail) {
-                if (first) {
-                    return stbi__errpuc("no clear code", "Corrupt GIF");
-                }
 
-                if (oldcode >= 0) {
-                    p = &g->codes[avail++];
-                    if (avail > 8192) {
-                        return stbi__errpuc("too many codes", "Corrupt GIF");
-                    }
+            if (oldcode >= 0) {
+               p = &g->codes[avail++];
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
 
-                    p->prefix = (stbi__int16)oldcode;
-                    p->first = g->codes[oldcode].first;
-                    p->suffix = (code == avail) ? p->first : g->codes[code].first;
-                } else if (code == avail)
-                    return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+               p->prefix = (stbi__int16) oldcode;
+               p->first = g->codes[oldcode].first;
+               p->suffix = (code == avail) ? p->first : g->codes[code].first;
+            } else if (code == avail)
+               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
 
-                stbi__out_gif_code(g, (stbi__uint16)code);
+            stbi__out_gif_code(g, (stbi__uint16) code);
 
-                if ((avail & codemask) == 0 && avail <= 0x0FFF) {
-                    codesize++;
-                    codemask = (1 << codesize) - 1;
-                }
-
-                oldcode = code;
-            } else {
-                return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
+               codesize++;
+               codemask = (1 << codesize) - 1;
             }
-        }
-    }
+
+            oldcode = code;
+         } else {
+            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+         }
+      }
+   }
 }
 
 // this function is designed to support animated gifs, although stb_image doesn't support it
 // two back is the image from two frames ago, used for a very specific disposal format
-static stbi_uc * stbi__gif_load_next(stbi__context * s, stbi__gif * g, int * comp, int req_comp, stbi_uc * two_back) {
-    int dispose;
-    int first_frame;
-    int pi;
-    int pcount;
-    STBI_NOTUSED(req_comp);
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
+{
+   int dispose;
+   int first_frame;
+   int pi;
+   int pcount;
+   STBI_NOTUSED(req_comp);
 
-    // on first frame, any non-written pixels get the background colour (non-transparent)
-    first_frame = 0;
-    if (g->out == 0) {
-        if (!stbi__gif_header(s, g, comp, 0))
-            return 0; // stbi__g_failure_reason set by stbi__gif_header
-        if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
-            return stbi__errpuc("too large", "GIF image is too large");
-        pcount = g->w * g->h;
-        g->out = (stbi_uc *)stbi__malloc(4 * pcount);
-        g->background = (stbi_uc *)stbi__malloc(4 * pcount);
-        g->history = (stbi_uc *)stbi__malloc(pcount);
-        if (!g->out || !g->background || !g->history)
-            return stbi__errpuc("outofmem", "Out of memory");
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0;
+   if (g->out == 0) {
+      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
+      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
+         return stbi__errpuc("too large", "GIF image is too large");
+      pcount = g->w * g->h;
+      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->history = (stbi_uc *) stbi__malloc(pcount);
+      if (!g->out || !g->background || !g->history)
+         return stbi__errpuc("outofmem", "Out of memory");
 
-        // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
-        // background colour is only used for pixels that are not rendered first frame, after that "background"
-        // color refers to the color that was there the previous frame.
-        memset(g->out, 0x00, 4 * pcount);
-        memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
-        memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
-        first_frame = 1;
-    } else {
-        // second frame - how do we dispose of the previous one?
-        dispose = (g->eflags & 0x1C) >> 2;
-        pcount = g->w * g->h;
+      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
+      // background colour is only used for pixels that are not rendered first frame, after that "background"
+      // color refers to the color that was there the previous frame.
+      memset(g->out, 0x00, 4 * pcount);
+      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
+      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
+      first_frame = 1;
+   } else {
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2;
+      pcount = g->w * g->h;
 
-        if ((dispose == 3) && (two_back == 0)) {
-            dispose = 2; // if I don't have an image to revert back to, default to the old background
-        }
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
 
-        if (dispose == 3) { // use previous graphic
-            for (pi = 0; pi < pcount; ++pi) {
-                if (g->history[pi]) {
-                    memcpy(&g->out[pi * 4], &two_back[pi * 4], 4);
-                }
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
             }
-        } else if (dispose == 2) {
-            // restore what was changed last frame to background before that frame;
-            for (pi = 0; pi < pcount; ++pi) {
-                if (g->history[pi]) {
-                    memcpy(&g->out[pi * 4], &g->background[pi * 4], 4);
-                }
+         }
+      } else if (dispose == 2) {
+         // restore what was changed last frame to background before that frame;
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
             }
-        } else {
-            // This is a non-disposal case eithe way, so just
-            // leave the pixels as is, and they will become the new background
-            // 1: do not dispose
-            // 0:  not specified.
-        }
+         }
+      } else {
+         // This is a non-disposal case eithe way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
+      }
 
-        // background is what out is after the undoing of the previou frame;
-        memcpy(g->background, g->out, 4 * g->w * g->h);
-    }
+      // background is what out is after the undoing of the previou frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h );
+   }
 
-    // clear my history;
-    memset(g->history, 0x00, g->w * g->h); // pixels that were affected previous frame
+   // clear my history;
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
 
-    for (;;) {
-        int tag = stbi__get8(s);
-        switch (tag) {
-        case 0x2C: /* Image Descriptor */
-        {
+   for (;;) {
+      int tag = stbi__get8(s);
+      switch (tag) {
+         case 0x2C: /* Image Descriptor */
+         {
             stbi__int32 x, y, w, h;
-            stbi_uc * o;
+            stbi_uc *o;
 
             x = stbi__get16le(s);
             y = stbi__get16le(s);
             w = stbi__get16le(s);
             h = stbi__get16le(s);
             if (((x + w) > (g->w)) || ((y + h) > (g->h)))
-                return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
 
             g->line_size = g->w * 4;
             g->start_x = x * 4;
             g->start_y = y * g->line_size;
-            g->max_x = g->start_x + w * 4;
-            g->max_y = g->start_y + h * g->line_size;
-            g->cur_x = g->start_x;
-            g->cur_y = g->start_y;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
 
             // if the width of the specified rectangle is 0, that means
             // we may not see *any* pixels or the image is malformed;
             // to make sure this is caught, move the current y down to
             // max_y (which is what out_gif_code checks).
             if (w == 0)
-                g->cur_y = g->max_y;
+               g->cur_y = g->max_y;
 
             g->lflags = stbi__get8(s);
 
             if (g->lflags & 0x40) {
-                g->step = 8 * g->line_size; // first interlaced spacing
-                g->parse = 3;
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
             } else {
-                g->step = g->line_size;
-                g->parse = 0;
+               g->step = g->line_size;
+               g->parse = 0;
             }
 
             if (g->lflags & 0x80) {
-                stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
-                g->color_table = (stbi_uc *)g->lpal;
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
             } else if (g->flags & 0x80) {
-                g->color_table = (stbi_uc *)g->pal;
+               g->color_table = (stbi_uc *) g->pal;
             } else
-                return stbi__errpuc("missing color table", "Corrupt GIF");
+               return stbi__errpuc("missing color table", "Corrupt GIF");
 
             o = stbi__process_gif_raster(s, g);
-            if (!o)
-                return NULL;
+            if (!o) return NULL;
 
             // if this was the first frame,
             pcount = g->w * g->h;
             if (first_frame && (g->bgindex > 0)) {
-                // if first frame, any pixel not drawn to gets the background color
-                for (pi = 0; pi < pcount; ++pi) {
-                    if (g->history[pi] == 0) {
-                        g->pal[g->bgindex][3] =
-                            255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
-                        memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
-                    }
-                }
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
+                  }
+               }
             }
 
             return o;
-        }
+         }
 
-        case 0x21: // Comment Extension.
-        {
+         case 0x21: // Comment Extension.
+         {
             int len;
             int ext = stbi__get8(s);
             if (ext == 0xF9) { // Graphic Control Extension.
-                len = stbi__get8(s);
-                if (len == 4) {
-                    g->eflags = stbi__get8(s);
-                    g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
 
-                    // unset old transparent
-                    if (g->transparent >= 0) {
-                        g->pal[g->transparent][3] = 255;
-                    }
-                    if (g->eflags & 0x01) {
-                        g->transparent = stbi__get8(s);
-                        if (g->transparent >= 0) {
-                            g->pal[g->transparent][3] = 0;
-                        }
-                    } else {
-                        // don't need transparent
-                        stbi__skip(s, 1);
-                        g->transparent = -1;
-                    }
-                } else {
-                    stbi__skip(s, len);
-                    break;
-                }
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255;
+                  }
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0;
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1);
+                     g->transparent = -1;
+                  }
+               } else {
+                  stbi__skip(s, len);
+                  break;
+               }
             }
             while ((len = stbi__get8(s)) != 0) {
-                stbi__skip(s, len);
+               stbi__skip(s, len);
             }
             break;
-        }
+         }
 
-        case 0x3B:               // gif stream termination code
-            return (stbi_uc *)s; // using '1' causes warning on some compilers
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
 
-        default:
+         default:
             return stbi__errpuc("unknown code", "Corrupt GIF");
-        }
-    }
+      }
+   }
 }
 
-static void * stbi__load_gif_main_outofmem(stbi__gif * g, stbi_uc * out, int ** delays) {
-    STBI_FREE(g->out);
-    STBI_FREE(g->history);
-    STBI_FREE(g->background);
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
 
-    if (out)
-        STBI_FREE(out);
-    if (delays && *delays)
-        STBI_FREE(*delays);
-    return stbi__errpuc("outofmem", "Out of memory");
+   if (out) STBI_FREE(out);
+   if (delays && *delays) STBI_FREE(*delays);
+   return stbi__errpuc("outofmem", "Out of memory");
 }
 
-static void * stbi__load_gif_main(stbi__context * s, int ** delays, int * x, int * y, int * z, int * comp, int req_comp) {
-    if (stbi__gif_test(s)) {
-        int layers = 0;
-        stbi_uc * u = 0;
-        stbi_uc * out = 0;
-        stbi_uc * two_back = 0;
-        stbi__gif g;
-        int stride;
-        int out_size = 0;
-        int delays_size = 0;
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   if (stbi__gif_test(s)) {
+      int layers = 0;
+      stbi_uc *u = 0;
+      stbi_uc *out = 0;
+      stbi_uc *two_back = 0;
+      stbi__gif g;
+      int stride;
+      int out_size = 0;
+      int delays_size = 0;
 
-        STBI_NOTUSED(out_size);
-        STBI_NOTUSED(delays_size);
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
 
-        memset(&g, 0, sizeof(g));
-        if (delays) {
-            *delays = 0;
-        }
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0;
+      }
 
-        do {
-            u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-            if (u == (stbi_uc *)s)
-                u = 0; // end of animated gif marker
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
 
-            if (u) {
-                *x = g.w;
-                *y = g.h;
-                ++layers;
-                stride = g.w * g.h * 4;
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
 
-                if (out) {
-                    void * tmp = (stbi_uc *)STBI_REALLOC_SIZED(out, out_size, layers * stride);
-                    if (!tmp)
-                        return stbi__load_gif_main_outofmem(&g, out, delays);
-                    else {
-                        out = (stbi_uc *)tmp;
-                        out_size = layers * stride;
-                    }
+            if (out) {
+               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               else {
+                   out = (stbi_uc*) tmp;
+                   out_size = layers * stride;
+               }
 
-                    if (delays) {
-                        int * new_delays = (int *)STBI_REALLOC_SIZED(*delays, delays_size, sizeof(int) * layers);
-                        if (!new_delays)
-                            return stbi__load_gif_main_outofmem(&g, out, delays);
-                        *delays = new_delays;
-                        delays_size = layers * sizeof(int);
-                    }
-                } else {
-                    out = (stbi_uc *)stbi__malloc(layers * stride);
-                    if (!out)
-                        return stbi__load_gif_main_outofmem(&g, out, delays);
-                    out_size = layers * stride;
-                    if (delays) {
-                        *delays = (int *)stbi__malloc(layers * sizeof(int));
-                        if (!*delays)
-                            return stbi__load_gif_main_outofmem(&g, out, delays);
-                        delays_size = layers * sizeof(int);
-                    }
-                }
-                memcpy(out + ((layers - 1) * stride), u, stride);
-                if (layers >= 2) {
-                    two_back = out - 2 * stride;
-                }
-
-                if (delays) {
-                    (*delays)[layers - 1U] = g.delay;
-                }
+               if (delays) {
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
+                  delays_size = layers * sizeof(int);
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               out_size = layers * stride;
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  delays_size = layers * sizeof(int);
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            if (layers >= 2) {
+               two_back = out - 2 * stride;
             }
-        } while (u != 0);
 
-        // free temp buffer;
-        STBI_FREE(g.out);
-        STBI_FREE(g.history);
-        STBI_FREE(g.background);
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay;
+            }
+         }
+      } while (u != 0);
 
-        // do the final conversion after loading everything;
-        if (req_comp && req_comp != 4)
-            out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+      // free temp buffer;
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
 
-        *z = layers;
-        return out;
-    } else {
-        return stbi__errpuc("not GIF", "Image was not as a gif type.");
-    }
+      // do the final conversion after loading everything;
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type.");
+   }
 }
 
-static void * stbi__gif_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    stbi_uc * u = 0;
-    stbi__gif g;
-    memset(&g, 0, sizeof(g));
-    STBI_NOTUSED(ri);
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *u = 0;
+   stbi__gif g;
+   memset(&g, 0, sizeof(g));
+   STBI_NOTUSED(ri);
 
-    u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-    if (u == (stbi_uc *)s)
-        u = 0; // end of animated gif marker
-    if (u) {
-        *x = g.w;
-        *y = g.h;
+   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
+   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+   if (u) {
+      *x = g.w;
+      *y = g.h;
 
-        // moved conversion to after successful load so that the same
-        // can be done for multiple frames.
-        if (req_comp && req_comp != 4)
-            u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
-    } else if (g.out) {
-        // if there was an error and we allocated an image buffer, free it!
-        STBI_FREE(g.out);
-    }
+      // moved conversion to after successful load so that the same
+      // can be done for multiple frames.
+      if (req_comp && req_comp != 4)
+         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
+   } else if (g.out) {
+      // if there was an error and we allocated an image buffer, free it!
+      STBI_FREE(g.out);
+   }
 
-    // free buffers needed for multiple frame loading;
-    STBI_FREE(g.history);
-    STBI_FREE(g.background);
+   // free buffers needed for multiple frame loading;
+   STBI_FREE(g.history);
+   STBI_FREE(g.background);
 
-    return u;
+   return u;
 }
 
-static int stbi__gif_info(stbi__context * s, int * x, int * y, int * comp) { return stbi__gif_info_raw(s, x, y, comp); }
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   return stbi__gif_info_raw(s,x,y,comp);
+}
 #endif
 
 // *************************************************************************************************
 // Radiance RGBE HDR loader
 // originally by Nicolas Schulz
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context * s, const char * signature) {
-    int i;
-    for (i = 0; signature[i]; ++i)
-        if (stbi__get8(s) != signature[i])
-            return 0;
-    stbi__rewind(s);
-    return 1;
+static int stbi__hdr_test_core(stbi__context *s, const char *signature)
+{
+   int i;
+   for (i=0; signature[i]; ++i)
+      if (stbi__get8(s) != signature[i])
+          return 0;
+   stbi__rewind(s);
+   return 1;
 }
 
-static int stbi__hdr_test(stbi__context * s) {
-    int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
-    stbi__rewind(s);
-    if (!r) {
-        r = stbi__hdr_test_core(s, "#?RGBE\n");
-        stbi__rewind(s);
-    }
-    return r;
+static int stbi__hdr_test(stbi__context* s)
+{
+   int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
+   stbi__rewind(s);
+   if(!r) {
+       r = stbi__hdr_test_core(s, "#?RGBE\n");
+       stbi__rewind(s);
+   }
+   return r;
 }
 
-#define STBI__HDR_BUFLEN 1024
-static char * stbi__hdr_gettoken(stbi__context * z, char * buffer) {
-    int len = 0;
-    char c = '\0';
+#define STBI__HDR_BUFLEN  1024
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
+{
+   int len=0;
+   char c = '\0';
 
-    c = (char)stbi__get8(z);
+   c = (char) stbi__get8(z);
 
-    while (!stbi__at_eof(z) && c != '\n') {
-        buffer[len++] = c;
-        if (len == STBI__HDR_BUFLEN - 1) {
-            // flush to end of line
-            while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
-                ;
-            break;
-        }
-        c = (char)stbi__get8(z);
-    }
+   while (!stbi__at_eof(z) && c != '\n') {
+      buffer[len++] = c;
+      if (len == STBI__HDR_BUFLEN-1) {
+         // flush to end of line
+         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
+            ;
+         break;
+      }
+      c = (char) stbi__get8(z);
+   }
 
-    buffer[len] = 0;
-    return buffer;
+   buffer[len] = 0;
+   return buffer;
 }
 
-static void stbi__hdr_convert(float * output, stbi_uc * input, int req_comp) {
-    if (input[3] != 0) {
-        float f1;
-        // Exponent
-        f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8));
-        if (req_comp <= 2)
-            output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
-        else {
-            output[0] = input[0] * f1;
-            output[1] = input[1] * f1;
-            output[2] = input[2] * f1;
-        }
-        if (req_comp == 2)
-            output[1] = 1;
-        if (req_comp == 4)
-            output[3] = 1;
-    } else {
-        switch (req_comp) {
-        case 4:
-            output[3] = 1; /* fallthrough */
-        case 3:
-            output[0] = output[1] = output[2] = 0;
-            break;
-        case 2:
-            output[1] = 1; /* fallthrough */
-        case 1:
-            output[0] = 0;
-            break;
-        }
-    }
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
+{
+   if ( input[3] != 0 ) {
+      float f1;
+      // Exponent
+      f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
+      if (req_comp <= 2)
+         output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
+      else {
+         output[0] = input[0] * f1;
+         output[1] = input[1] * f1;
+         output[2] = input[2] * f1;
+      }
+      if (req_comp == 2) output[1] = 1;
+      if (req_comp == 4) output[3] = 1;
+   } else {
+      switch (req_comp) {
+         case 4: output[3] = 1; /* fallthrough */
+         case 3: output[0] = output[1] = output[2] = 0;
+                 break;
+         case 2: output[1] = 1; /* fallthrough */
+         case 1: output[0] = 0;
+                 break;
+      }
+   }
 }
 
-static float * stbi__hdr_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    char buffer[STBI__HDR_BUFLEN];
-    char * token;
-    int valid = 0;
-    int width, height;
-    stbi_uc * scanline;
-    float * hdr_data;
-    int len;
-    unsigned char count, value;
-    int i, j, k, c1, c2, z;
-    const char * headerToken;
-    STBI_NOTUSED(ri);
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int width, height;
+   stbi_uc *scanline;
+   float *hdr_data;
+   int len;
+   unsigned char count, value;
+   int i, j, k, c1,c2, z;
+   const char *headerToken;
+   STBI_NOTUSED(ri);
 
-    // Check identifier
-    headerToken = stbi__hdr_gettoken(s, buffer);
-    if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
-        return stbi__errpf("not HDR", "Corrupt HDR image");
+   // Check identifier
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
+      return stbi__errpf("not HDR", "Corrupt HDR image");
 
-    // Parse header
-    for (;;) {
-        token = stbi__hdr_gettoken(s, buffer);
-        if (token[0] == 0)
-            break;
-        if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
-            valid = 1;
-    }
+   // Parse header
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
 
-    if (!valid)
-        return stbi__errpf("unsupported format", "Unsupported HDR format");
+   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
 
-    // Parse width and height
-    // can't use sscanf() if we're not using stdio!
-    token = stbi__hdr_gettoken(s, buffer);
-    if (strncmp(token, "-Y ", 3))
-        return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-    token += 3;
-    height = (int)strtol(token, &token, 10);
-    while (*token == ' ')
-        ++token;
-    if (strncmp(token, "+X ", 3))
-        return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-    token += 3;
-    width = (int)strtol(token, NULL, 10);
+   // Parse width and height
+   // can't use sscanf() if we're not using stdio!
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
 
-    if (height > STBI_MAX_DIMENSIONS)
-        return stbi__errpf("too large", "Very large image (corrupt?)");
-    if (width > STBI_MAX_DIMENSIONS)
-        return stbi__errpf("too large", "Very large image (corrupt?)");
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
 
-    *x = width;
-    *y = height;
+   *x = width;
+   *y = height;
 
-    if (comp)
-        *comp = 3;
-    if (req_comp == 0)
-        req_comp = 3;
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
 
-    if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
-        return stbi__errpf("too large", "HDR image is too large");
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
 
-    // Read data
-    hdr_data = (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-    if (!hdr_data)
-        return stbi__errpf("outofmem", "Out of memory");
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
 
-    // Load image data
-    // image data is stored as some number of sca
-    if (width < 8 || width >= 32768) {
-        // Read flat data
-        for (j = 0; j < height; ++j) {
-            for (i = 0; i < width; ++i) {
-                stbi_uc rgbe[4];
-            main_decode_loop:
-                stbi__getn(s, rgbe, 4);
-                stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
-            }
-        }
-    } else {
-        // Read RLE-encoded data
-        scanline = NULL;
+   // Load image data
+   // image data is stored as some number of sca
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
 
-        for (j = 0; j < height; ++j) {
-            c1 = stbi__get8(s);
-            c2 = stbi__get8(s);
-            len = stbi__get8(s);
-            if (c1 != 2 || c2 != 2 || (len & 0x80)) {
-                // not run-length encoded, so we have to actually use THIS data as a decoded
-                // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
-                stbi_uc rgbe[4];
-                rgbe[0] = (stbi_uc)c1;
-                rgbe[1] = (stbi_uc)c2;
-                rgbe[2] = (stbi_uc)len;
-                rgbe[3] = (stbi_uc)stbi__get8(s);
-                stbi__hdr_convert(hdr_data, rgbe, req_comp);
-                i = 1;
-                j = 0;
-                STBI_FREE(scanline);
-                goto main_decode_loop; // yes, this makes no sense
-            }
-            len <<= 8;
-            len |= stbi__get8(s);
-            if (len != width) {
-                STBI_FREE(hdr_data);
-                STBI_FREE(scanline);
-                return stbi__errpf("invalid decoded scanline length", "corrupt HDR");
-            }
-            if (scanline == NULL) {
-                scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0);
-                if (!scanline) {
-                    STBI_FREE(hdr_data);
-                    return stbi__errpf("outofmem", "Out of memory");
-                }
-            }
-
-            for (k = 0; k < 4; ++k) {
-                int nleft;
-                i = 0;
-                while ((nleft = width - i) > 0) {
-                    count = stbi__get8(s);
-                    if (count > 128) {
-                        // Run
-                        value = stbi__get8(s);
-                        count -= 128;
-                        if ((count == 0) || (count > nleft)) {
-                            STBI_FREE(hdr_data);
-                            STBI_FREE(scanline);
-                            return stbi__errpf("corrupt", "bad RLE data in HDR");
-                        }
-                        for (z = 0; z < count; ++z)
-                            scanline[i++ * 4 + k] = value;
-                    } else {
-                        // Dump
-                        if ((count == 0) || (count > nleft)) {
-                            STBI_FREE(hdr_data);
-                            STBI_FREE(scanline);
-                            return stbi__errpf("corrupt", "bad RLE data in HDR");
-                        }
-                        for (z = 0; z < count; ++z)
-                            scanline[i++ * 4 + k] = stbi__get8(s);
-                    }
-                }
-            }
-            for (i = 0; i < width; ++i)
-                stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, scanline + i * 4, req_comp);
-        }
-        if (scanline)
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            i = 1;
+            j = 0;
             STBI_FREE(scanline);
-    }
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
 
-    return hdr_data;
+         for (k = 0; k < 4; ++k) {
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
 }
 
-static int stbi__hdr_info(stbi__context * s, int * x, int * y, int * comp) {
-    char buffer[STBI__HDR_BUFLEN];
-    char * token;
-    int valid = 0;
-    int dummy;
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int dummy;
 
-    if (!x)
-        x = &dummy;
-    if (!y)
-        y = &dummy;
-    if (!comp)
-        comp = &dummy;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
 
-    if (stbi__hdr_test(s) == 0) {
-        stbi__rewind(s);
-        return 0;
-    }
+   if (stbi__hdr_test(s) == 0) {
+       stbi__rewind( s );
+       return 0;
+   }
 
-    for (;;) {
-        token = stbi__hdr_gettoken(s, buffer);
-        if (token[0] == 0)
-            break;
-        if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
-            valid = 1;
-    }
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
 
-    if (!valid) {
-        stbi__rewind(s);
-        return 0;
-    }
-    token = stbi__hdr_gettoken(s, buffer);
-    if (strncmp(token, "-Y ", 3)) {
-        stbi__rewind(s);
-        return 0;
-    }
-    token += 3;
-    *y = (int)strtol(token, &token, 10);
-    while (*token == ' ')
-        ++token;
-    if (strncmp(token, "+X ", 3)) {
-        stbi__rewind(s);
-        return 0;
-    }
-    token += 3;
-    *x = (int)strtol(token, NULL, 10);
-    *comp = 3;
-    return 1;
+   if (!valid) {
+       stbi__rewind( s );
+       return 0;
+   }
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3)) {
+       stbi__rewind( s );
+       return 0;
+   }
+   token += 3;
+   *y = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3)) {
+       stbi__rewind( s );
+       return 0;
+   }
+   token += 3;
+   *x = (int) strtol(token, NULL, 10);
+   *comp = 3;
+   return 1;
 }
 #endif // STBI_NO_HDR
 
 #ifndef STBI_NO_BMP
-static int stbi__bmp_info(stbi__context * s, int * x, int * y, int * comp) {
-    void * p;
-    stbi__bmp_data info;
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   void *p;
+   stbi__bmp_data info;
 
-    info.all_a = 255;
-    p = stbi__bmp_parse_header(s, &info);
-    if (p == NULL) {
-        stbi__rewind(s);
-        return 0;
-    }
-    if (x)
-        *x = s->img_x;
-    if (y)
-        *y = s->img_y;
-    if (comp) {
-        if (info.bpp == 24 && info.ma == 0xff000000)
-            *comp = 3;
-        else
-            *comp = info.ma ? 4 : 3;
-    }
-    return 1;
+   info.all_a = 255;
+   p = stbi__bmp_parse_header(s, &info);
+   if (p == NULL) {
+      stbi__rewind( s );
+      return 0;
+   }
+   if (x) *x = s->img_x;
+   if (y) *y = s->img_y;
+   if (comp) {
+      if (info.bpp == 24 && info.ma == 0xff000000)
+         *comp = 3;
+      else
+         *comp = info.ma ? 4 : 3;
+   }
+   return 1;
 }
 #endif
 
 #ifndef STBI_NO_PSD
-static int stbi__psd_info(stbi__context * s, int * x, int * y, int * comp) {
-    int channelCount, dummy, depth;
-    if (!x)
-        x = &dummy;
-    if (!y)
-        y = &dummy;
-    if (!comp)
-        comp = &dummy;
-    if (stbi__get32be(s) != 0x38425053) {
-        stbi__rewind(s);
-        return 0;
-    }
-    if (stbi__get16be(s) != 1) {
-        stbi__rewind(s);
-        return 0;
-    }
-    stbi__skip(s, 6);
-    channelCount = stbi__get16be(s);
-    if (channelCount < 0 || channelCount > 16) {
-        stbi__rewind(s);
-        return 0;
-    }
-    *y = stbi__get32be(s);
-    *x = stbi__get32be(s);
-    depth = stbi__get16be(s);
-    if (depth != 8 && depth != 16) {
-        stbi__rewind(s);
-        return 0;
-    }
-    if (stbi__get16be(s) != 3) {
-        stbi__rewind(s);
-        return 0;
-    }
-    *comp = 4;
-    return 1;
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int channelCount, dummy, depth;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+   if (stbi__get32be(s) != 0x38425053) {
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   *y = stbi__get32be(s);
+   *x = stbi__get32be(s);
+   depth = stbi__get16be(s);
+   if (depth != 8 && depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 3) {
+       stbi__rewind( s );
+       return 0;
+   }
+   *comp = 4;
+   return 1;
 }
 
-static int stbi__psd_is16(stbi__context * s) {
-    int channelCount, depth;
-    if (stbi__get32be(s) != 0x38425053) {
-        stbi__rewind(s);
-        return 0;
-    }
-    if (stbi__get16be(s) != 1) {
-        stbi__rewind(s);
-        return 0;
-    }
-    stbi__skip(s, 6);
-    channelCount = stbi__get16be(s);
-    if (channelCount < 0 || channelCount > 16) {
-        stbi__rewind(s);
-        return 0;
-    }
-    STBI_NOTUSED(stbi__get32be(s));
-    STBI_NOTUSED(stbi__get32be(s));
-    depth = stbi__get16be(s);
-    if (depth != 16) {
-        stbi__rewind(s);
-        return 0;
-    }
-    return 1;
+static int stbi__psd_is16(stbi__context *s)
+{
+   int channelCount, depth;
+   if (stbi__get32be(s) != 0x38425053) {
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   STBI_NOTUSED(stbi__get32be(s));
+   STBI_NOTUSED(stbi__get32be(s));
+   depth = stbi__get16be(s);
+   if (depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1;
 }
 #endif
 
 #ifndef STBI_NO_PIC
-static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp) {
-    int act_comp = 0, num_packets = 0, chained, dummy;
-    stbi__pic_packet packets[10];
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int act_comp=0,num_packets=0,chained,dummy;
+   stbi__pic_packet packets[10];
 
-    if (!x)
-        x = &dummy;
-    if (!y)
-        y = &dummy;
-    if (!comp)
-        comp = &dummy;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
 
-    if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) {
-        stbi__rewind(s);
-        return 0;
-    }
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
 
-    stbi__skip(s, 88);
+   stbi__skip(s, 88);
 
-    *x = stbi__get16be(s);
-    *y = stbi__get16be(s);
-    if (stbi__at_eof(s)) {
-        stbi__rewind(s);
-        return 0;
-    }
-    if ((*x) != 0 && (1 << 28) / (*x) < (*y)) {
-        stbi__rewind(s);
-        return 0;
-    }
+   *x = stbi__get16be(s);
+   *y = stbi__get16be(s);
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
+   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
+      stbi__rewind( s );
+      return 0;
+   }
 
-    stbi__skip(s, 8);
+   stbi__skip(s, 8);
 
-    do {
-        stbi__pic_packet * packet;
+   do {
+      stbi__pic_packet *packet;
 
-        if (num_packets == sizeof(packets) / sizeof(packets[0]))
-            return 0;
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return 0;
 
-        packet = &packets[num_packets++];
-        chained = stbi__get8(s);
-        packet->size = stbi__get8(s);
-        packet->type = stbi__get8(s);
-        packet->channel = stbi__get8(s);
-        act_comp |= packet->channel;
+      packet = &packets[num_packets++];
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+      act_comp |= packet->channel;
 
-        if (stbi__at_eof(s)) {
-            stbi__rewind(s);
-            return 0;
-        }
-        if (packet->size != 8) {
-            stbi__rewind(s);
-            return 0;
-        }
-    } while (chained);
+      if (stbi__at_eof(s)) {
+          stbi__rewind( s );
+          return 0;
+      }
+      if (packet->size != 8) {
+          stbi__rewind( s );
+          return 0;
+      }
+   } while (chained);
 
-    *comp = (act_comp & 0x10 ? 4 : 3);
+   *comp = (act_comp & 0x10 ? 4 : 3);
 
-    return 1;
+   return 1;
 }
 #endif
 
@@ -7904,271 +7491,272 @@ static int stbi__pic_info(stbi__context * s, int * x, int * y, int * comp) {
 
 #ifndef STBI_NO_PNM
 
-static int stbi__pnm_test(stbi__context * s) {
-    char p, t;
-    p = (char)stbi__get8(s);
-    t = (char)stbi__get8(s);
-    if (p != 'P' || (t != '5' && t != '6')) {
-        stbi__rewind(s);
-        return 0;
-    }
-    return 1;
+static int      stbi__pnm_test(stbi__context *s)
+{
+   char p, t;
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1;
 }
 
-static void * stbi__pnm_load(stbi__context * s, int * x, int * y, int * comp, int req_comp, stbi__result_info * ri) {
-    stbi_uc * out;
-    STBI_NOTUSED(ri);
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   STBI_NOTUSED(ri);
 
-    ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
-    if (ri->bits_per_channel == 0)
-        return 0;
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
+      return 0;
 
-    if (s->img_y > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
-    if (s->img_x > STBI_MAX_DIMENSIONS)
-        return stbi__errpuc("too large", "Very large image (corrupt?)");
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
 
-    *x = s->img_x;
-    *y = s->img_y;
-    if (comp)
-        *comp = s->img_n;
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
 
-    if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
-        return stbi__errpuc("too large", "PNM too large");
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
+      return stbi__errpuc("too large", "PNM too large");
 
-    out = (stbi_uc *)stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
-    if (!out)
-        return stbi__errpuc("outofmem", "Out of memory");
-    if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
-        STBI_FREE(out);
-        return stbi__errpuc("bad PNM", "PNM file truncated");
-    }
+   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
+      STBI_FREE(out);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
 
-    if (req_comp && req_comp != s->img_n) {
-        if (ri->bits_per_channel == 16) {
-            out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, s->img_n, req_comp, s->img_x, s->img_y);
-        } else {
-            out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-        }
-        if (out == NULL)
-            return out; // stbi__convert_format frees input on failure
-    }
-    return out;
+   if (req_comp && req_comp != s->img_n) {
+      if (ri->bits_per_channel == 16) {
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
+      } else {
+         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      }
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+   return out;
 }
 
-static int stbi__pnm_isspace(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; }
-
-static void stbi__pnm_skip_whitespace(stbi__context * s, char * c) {
-    for (;;) {
-        while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-            *c = (char)stbi__get8(s);
-
-        if (stbi__at_eof(s) || *c != '#')
-            break;
-
-        while (!stbi__at_eof(s) && *c != '\n' && *c != '\r')
-            *c = (char)stbi__get8(s);
-    }
+static int      stbi__pnm_isspace(char c)
+{
+   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
 }
 
-static int stbi__pnm_isdigit(char c) { return c >= '0' && c <= '9'; }
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   for (;;) {
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
 
-static int stbi__pnm_getinteger(stbi__context * s, char * c) {
-    int value = 0;
+      if (stbi__at_eof(s) || *c != '#')
+         break;
 
-    while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
-        value = value * 10 + (*c - '0');
-        *c = (char)stbi__get8(s);
-        if ((value > 214748364) || (value == 214748364 && *c > '7'))
-            return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
-    }
-
-    return value;
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+         *c = (char) stbi__get8(s);
+   }
 }
 
-static int stbi__pnm_info(stbi__context * s, int * x, int * y, int * comp) {
-    int maxv, dummy;
-    char c, p, t;
-
-    if (!x)
-        x = &dummy;
-    if (!y)
-        y = &dummy;
-    if (!comp)
-        comp = &dummy;
-
-    stbi__rewind(s);
-
-    // Get identifier
-    p = (char)stbi__get8(s);
-    t = (char)stbi__get8(s);
-    if (p != 'P' || (t != '5' && t != '6')) {
-        stbi__rewind(s);
-        return 0;
-    }
-
-    *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
-
-    c = (char)stbi__get8(s);
-    stbi__pnm_skip_whitespace(s, &c);
-
-    *x = stbi__pnm_getinteger(s, &c); // read width
-    if (*x == 0)
-        return stbi__err("invalid width", "PPM image header had zero or overflowing width");
-    stbi__pnm_skip_whitespace(s, &c);
-
-    *y = stbi__pnm_getinteger(s, &c); // read height
-    if (*y == 0)
-        return stbi__err("invalid width", "PPM image header had zero or overflowing width");
-    stbi__pnm_skip_whitespace(s, &c);
-
-    maxv = stbi__pnm_getinteger(s, &c); // read max value
-    if (maxv > 65535)
-        return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
-    else if (maxv > 255)
-        return 16;
-    else
-        return 8;
+static int      stbi__pnm_isdigit(char c)
+{
+   return c >= '0' && c <= '9';
 }
 
-static int stbi__pnm_is16(stbi__context * s) {
-    if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
-        return 1;
-    return 0;
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   int value = 0;
+
+   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
+      value = value*10 + (*c - '0');
+      *c = (char) stbi__get8(s);
+      if((value > 214748364) || (value == 214748364 && *c > '7'))
+          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
+   }
+
+   return value;
+}
+
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv, dummy;
+   char c, p, t;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind(s);
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
+   else
+      return 8;
+}
+
+static int stbi__pnm_is16(stbi__context *s)
+{
+   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
+	   return 1;
+   return 0;
 }
 #endif
 
-static int stbi__info_main(stbi__context * s, int * x, int * y, int * comp) {
-#ifndef STBI_NO_JPEG
-    if (stbi__jpeg_info(s, x, y, comp))
-        return 1;
-#endif
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp)) return 1;
+   #endif
 
-#ifndef STBI_NO_PNG
-    if (stbi__png_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))  return 1;
+   #endif
 
-#ifndef STBI_NO_GIF
-    if (stbi__gif_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))  return 1;
+   #endif
 
-#ifndef STBI_NO_BMP
-    if (stbi__bmp_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))  return 1;
+   #endif
 
-#ifndef STBI_NO_PSD
-    if (stbi__psd_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))  return 1;
+   #endif
 
-#ifndef STBI_NO_PIC
-    if (stbi__pic_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))  return 1;
+   #endif
 
-#ifndef STBI_NO_PNM
-    if (stbi__pnm_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))  return 1;
+   #endif
 
-#ifndef STBI_NO_HDR
-    if (stbi__hdr_info(s, x, y, comp))
-        return 1;
-#endif
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_info(s, x, y, comp))  return 1;
+   #endif
 
-// test tga last because it's a crappy test!
-#ifndef STBI_NO_TGA
-    if (stbi__tga_info(s, x, y, comp))
-        return 1;
-#endif
-    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
+   // test tga last because it's a crappy test!
+   #ifndef STBI_NO_TGA
+   if (stbi__tga_info(s, x, y, comp))
+       return 1;
+   #endif
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
 }
 
-static int stbi__is_16_main(stbi__context * s) {
-#ifndef STBI_NO_PNG
-    if (stbi__png_is16(s))
-        return 1;
-#endif
+static int stbi__is_16_main(stbi__context *s)
+{
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
 
-#ifndef STBI_NO_PSD
-    if (stbi__psd_is16(s))
-        return 1;
-#endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
 
-#ifndef STBI_NO_PNM
-    if (stbi__pnm_is16(s))
-        return 1;
-#endif
-    return 0;
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
+   return 0;
 }
 
 #ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const * filename, int * x, int * y, int * comp) {
-    FILE * f = stbi__fopen(filename, "rb");
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{
+    FILE *f = stbi__fopen(filename, "rb");
     int result;
-    if (!f)
-        return stbi__err("can't fopen", "Unable to open file");
+    if (!f) return stbi__err("can't fopen", "Unable to open file");
     result = stbi_info_from_file(f, x, y, comp);
     fclose(f);
     return result;
 }
 
-STBIDEF int stbi_info_from_file(FILE * f, int * x, int * y, int * comp) {
-    int r;
-    stbi__context s;
-    long pos = ftell(f);
-    stbi__start_file(&s, f);
-    r = stbi__info_main(&s, x, y, comp);
-    fseek(f, pos, SEEK_SET);
-    return r;
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f);
+   stbi__start_file(&s, f);
+   r = stbi__info_main(&s,x,y,comp);
+   fseek(f,pos,SEEK_SET);
+   return r;
 }
 
-STBIDEF int stbi_is_16_bit(char const * filename) {
-    FILE * f = stbi__fopen(filename, "rb");
+STBIDEF int stbi_is_16_bit(char const *filename)
+{
+    FILE *f = stbi__fopen(filename, "rb");
     int result;
-    if (!f)
-        return stbi__err("can't fopen", "Unable to open file");
+    if (!f) return stbi__err("can't fopen", "Unable to open file");
     result = stbi_is_16_bit_from_file(f);
     fclose(f);
     return result;
 }
 
-STBIDEF int stbi_is_16_bit_from_file(FILE * f) {
-    int r;
-    stbi__context s;
-    long pos = ftell(f);
-    stbi__start_file(&s, f);
-    r = stbi__is_16_main(&s);
-    fseek(f, pos, SEEK_SET);
-    return r;
+STBIDEF int stbi_is_16_bit_from_file(FILE *f)
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f);
+   stbi__start_file(&s, f);
+   r = stbi__is_16_main(&s);
+   fseek(f,pos,SEEK_SET);
+   return r;
 }
 #endif // !STBI_NO_STDIO
 
-STBIDEF int stbi_info_from_memory(stbi_uc const * buffer, int len, int * x, int * y, int * comp) {
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
-    return stbi__info_main(&s, x, y, comp);
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__info_main(&s,x,y,comp);
 }
 
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const * c, void * user, int * x, int * y, int * comp) {
-    stbi__context s;
-    stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
-    return stbi__info_main(&s, x, y, comp);
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
+   return stbi__info_main(&s,x,y,comp);
 }
 
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const * buffer, int len) {
-    stbi__context s;
-    stbi__start_mem(&s, buffer, len);
-    return stbi__is_16_main(&s);
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__is_16_main(&s);
 }
 
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * c, void * user) {
-    stbi__context s;
-    stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
-    return stbi__is_16_main(&s);
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
+   return stbi__is_16_main(&s);
 }
 
 #endif // STB_IMAGE_IMPLEMENTATION
@@ -8279,9 +7867,12 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * c, void * us
       1.30  (2011-06-11)
               added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
               removed deprecated format-specific test/load functions
-              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks
-   anyway error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) fix inefficiency in
-   decoding 32-bit BMP (David Woo) 1.29  (2010-08-16) various warning fixes from Aurelien Pocheville 1.28  (2010-08-01)
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
               fix bug in GIF palette transparency (SpartanJ)
       1.27  (2010-08-01)
               cast-to-stbi_uc to fix warnings
@@ -8353,6 +7944,7 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const * c, void * us
               first released version
 */
 
+
 /*
 ------------------------------------------------------------------------------
 This software is available under 2 licenses -- choose whichever you prefer.
diff --git a/common/train.cpp b/common/train.cpp
deleted file mode 100644
index 0dbfd24df..000000000
--- a/common/train.cpp
+++ /dev/null
@@ -1,1513 +0,0 @@
-#include "train.h"
-#include "common.h"
-
-#include <random>
-#include <sstream>
-#include <functional>
-
-struct random_normal_distribution {
-    std::mt19937 gen;
-    std::normal_distribution<float> rd;
-    float min;
-    float max;
-};
-
-struct random_uniform_distribution {
-    std::mt19937 gen;
-    std::uniform_real_distribution<float> rd;
-};
-
-struct train_state  * init_train_state() {
-    struct train_state * state = new struct train_state;
-    state->train_its     = 0;
-    state->train_samples = 0;
-    state->train_tokens  = 0;
-    state->train_epochs  = 0;
-    state->shuffle_samples_hash  = 0;
-    state->shuffle_sample_count  = 0;
-    state->shuffle_next_sample   = 0;
-    state->shuffle_rng_state_current = "";
-    state->shuffle_rng_state_next    = "";
-
-    state->opt = new struct ggml_opt_context;
-    state->opt->ctx = NULL;
-    state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
-    state->opt->loss_after = 0.0f;
-
-    return state;
-}
-
-void free_train_state(struct train_state  * state) {
-    delete state->opt;
-    delete state;
-}
-
-struct random_normal_distribution * init_random_normal_distribution(
-    int seed, float mean, float std, float min, float max
-) {
-    struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
-    rnd->gen = std::mt19937(seed);
-    rnd->rd = std::normal_distribution<float>{mean, std};
-    rnd->min = min;
-    rnd->max = max;
-    return rnd;
-}
-
-struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
-    struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution));
-    rnd->gen = std::mt19937(seed);
-    rnd->rd = std::uniform_real_distribution<float>{min, max};
-    return rnd;
-}
-
-void free_random_normal_distribution (struct random_normal_distribution  * rnd) {
-    free(rnd);
-}
-
-void free_random_uniform_distribution(struct random_uniform_distribution * rnd) {
-    free(rnd);
-}
-
-struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
-    float scale = 1.0f; // xavier
-    switch (ggml_n_dims(tensor)) {
-        case 1:
-            scale /= sqrtf((float) tensor->ne[0]);
-            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
-                *dst = scale * frand_normal(rnd);
-            }
-            break;
-        case 2:
-            scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-                    *dst = scale * frand_normal(rnd);
-                }
-            }
-            break;
-        case 3:
-            scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-                        *dst = scale * frand_normal(rnd);
-                    }
-                }
-            }
-            break;
-        case 4:
-            scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
-            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
-                            *dst = scale * frand_normal(rnd);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            die("Unsupported tensor->n_dims");
-    };
-    return tensor;
-}
-
-struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (ggml_n_dims(tensor)) {
-        case 1:
-            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
-                *dst = frand_uniform(rnd);
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-                    *dst = frand_uniform(rnd);
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-                        *dst = frand_uniform(rnd);
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
-                            *dst = frand_uniform(rnd);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            die("Unsupported tensor->n_dims");
-    };
-    return tensor;
-}
-
-float frand() {
-    return (float)rand()/((float)(RAND_MAX) + 1.0f);
-}
-
-float frand_normal(struct random_normal_distribution * rnd) {
-    return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
-}
-
-float frand_uniform(struct random_uniform_distribution * rnd) {
-    return rnd->rd(rnd->gen);
-}
-
-int clamp(const int v, const int min, const int max) {
-    return ((v < min) ? (min) : (v > max) ? (max) : v);
-}
-
-float fclamp(const float v, const float min, const float max) {
-    return ((v < min) ? (min) : (v > max) ? (max) : v);
-}
-
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == 1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
-}
-
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
-}
-
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == 1);
-}
-
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
-int64_t get_example_targets_batch(
-    struct llama_context * lctx,
-    struct ggml_tensor   * tokens_input,
-    struct ggml_tensor   * target_probs,
-    int64_t                example_id,
-    const size_t         * samples_offs,
-    const size_t         * samples_begin,
-    const size_t         * samples_size,
-          size_t           samples_count,
-    const llama_token    * train_data,
-    size_t                 n_train_data,
-    bool                   separate_with_eos,
-    bool                   separate_with_bos,
-    bool                   fill_with_next_samples,
-    bool                   sample_random_offsets
-) {
-    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(target_probs));
-    int64_t n_vocab  = target_probs->ne[0];
-    int64_t n_tokens = tokens_input->ne[0];
-    int64_t n_batch  = tokens_input->ne[1];
-    GGML_ASSERT(n_vocab  == target_probs->ne[0]);
-    GGML_ASSERT(n_tokens == target_probs->ne[1]);
-    GGML_ASSERT(n_batch  == target_probs->ne[2]);
-
-    int64_t used_samples = 0;
-
-    ggml_set_f32(target_probs, 0.0f);
-    llama_token bos = llama_token_bos(llama_get_model(lctx));
-    llama_token eos = llama_token_eos(llama_get_model(lctx));
-    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
-    for (int k=0; k<n_batch; ++k) {
-        // printf("%s: batch %d\n", __func__, k);
-        size_t sample_idx   = (example_id + used_samples) % samples_count;
-        size_t sample_offs  = sample_random_offsets ? samples_offs[sample_idx] : 0;
-        size_t sample_begin = samples_begin[sample_idx];
-        size_t sample_size  = samples_size[sample_idx];
-        ++used_samples;
-
-        // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample);
-        GGML_ASSERT(sample_begin+sample_size-1 < n_train_data);
-
-        ggml_set_i32_nd(tokens_input, 0, k, 0, 0, bos);
-        bool sample_separation_eos = !separate_with_eos;
-        bool sample_separation_bos = !separate_with_bos;
-        for (int64_t i=0; i<n_tokens; ++i) {
-            llama_token token = eos;
-            if (sample_offs >= sample_size && fill_with_next_samples) {
-                if (!sample_separation_eos) {
-                    // insert eos token to separate samples
-                    sample_separation_eos = true;
-                } else if (!sample_separation_bos) {
-                    // insert bos token to separate samples
-                    sample_separation_bos = true;
-                    token = bos;
-                } else {
-                    // sample separation is done, continue with next sample
-                    sample_separation_eos = !separate_with_eos;
-                    sample_separation_bos = !separate_with_bos;
-                    sample_offs  = 0;
-                    sample_idx   = (example_id + used_samples) % samples_count;
-                    sample_begin = samples_begin[sample_idx];
-                    sample_size  = samples_size[sample_idx];
-                    ++used_samples;
-                }
-            }
-            // note: no else-if here
-            if (sample_offs < sample_size) {
-                token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
-                ++sample_offs;
-            }
-            ggml_set_f32_nd(target_probs,  token, (int) i, (int) k, 0, +1.0f);
-            if (i+1<n_tokens) {
-                ggml_set_i32_nd(tokens_input, (int) (i + 1), (int) k, 0, 0, token);
-            }
-        }
-    }
-
-    return used_samples;
-}
-
-void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) {
-    std::stringstream s_rng_state;
-    s_rng_state.imbue(std::locale::classic());
-    s_rng_state.exceptions(std::stringstream::failbit);
-    s_rng_state.str(rng_state);
-    s_rng_state >> rng;
-}
-
-std::string mt19937_get_state(const std::mt19937& rng) {
-    std::stringstream s_rng_state;
-    s_rng_state.imbue(std::locale::classic());
-    s_rng_state << rng;
-    return s_rng_state.str();
-}
-
-std::string mt19937_seed_to_state(unsigned seed) {
-    std::mt19937 rng(seed);
-    return mt19937_get_state(rng);
-}
-
-std::string shuffle_samples(
-        const std::string & rng_state,
-        size_t            * shuffled_offs,
-        size_t            * shuffled_begins,
-        size_t            * shuffled_sizes,
-        const size_t      * begins,
-        const size_t      * sizes,
-        size_t              count) {
-    if (count == 0) return rng_state;
-
-    std::mt19937 rng;
-    mt19937_set_state(rng, rng_state);
-
-    // sort indices by random value for each index
-    std::vector<size_t> idcs;
-    {
-        std::vector<unsigned> rnd;
-        idcs.resize(count);
-        rnd.resize(count);
-        for (unsigned i=0; i<count; ++i) {
-            idcs[i] = i;
-            rnd[i]  = rng();
-        }
-
-        std::sort(idcs.begin(), idcs.end(), [&rnd](size_t a, size_t b){
-            // stable sort for reproducibility
-            return (rnd[a] == rnd[b]) ? (a < b) : (rnd[a] < rnd[b]);
-        });
-    }
-
-    // create random offsets
-    for (unsigned i=0; i<count; ++i) {
-        shuffled_offs[i] = (size_t) ((sizes[idcs[i]] - 1) * ((double) rng() / (double) (rng.max()-1)));
-    }
-
-    // reorder begins and sizes by sorted indices
-    for (unsigned i=0; i<count; ++i) {
-        shuffled_begins[i] = begins[idcs[i]];
-    }
-
-    for (unsigned i=0; i<count; ++i) {
-        shuffled_sizes[i] = sizes[idcs[i]];
-    }
-
-    return mt19937_get_state(rng);
-}
-
-size_t hash_combine(size_t h1, size_t h2) {
-    return h1 ^ (h2 << 1);
-}
-
-size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
-    std::hash<std::string> h_string;
-    std::hash<unsigned long long> h_ull;
-    size_t h = h_string(std::string(fn));
-    h = hash_combine(h, h_ull((unsigned long long) sample_count));
-    for (size_t i=0; i< sample_count; ++i) {
-        h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
-        h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
-    }
-    return h;
-}
-
-std::string replace_str(const char * s, const char * needle, const char * replacement) {
-    std::string str = s;
-    size_t pos = str.find(needle);
-    if (pos != std::string::npos) {
-        str.replace(pos, strlen(needle), replacement);
-    }
-    return str;
-}
-
-void print_duration(double fmillis) {
-    if (fmillis < 1000.0f) {
-        printf("%.1fms", (float) fmillis);
-        return;
-    }
-    const int64_t one_sec  = 1000;
-    const int64_t one_min  = one_sec  * 60;
-    const int64_t one_hour = one_min  * 60;
-    const int64_t one_day  = one_hour * 24;
-
-    int64_t millis  = (int64_t) fmillis;
-    int64_t days    = millis/one_day;
-    int64_t hours   = (millis - days*one_day)/one_hour;
-    int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
-    int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
-
-    // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
-    if (days > 0) {
-        printf("%lldd ", (long long int) days);
-    }
-    printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
-}
-
-float cosine_decay(int64_t step, int64_t decay_steps, float minimum) {
-    if (step > decay_steps) {
-        step = decay_steps;
-    }
-    const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
-    const float decay = (1 - minimum)*cosine_decay + minimum;
-    return decay;
-}
-
-float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) {
-    while (step > decay_steps) {
-        step -= decay_steps;
-        decay_steps = (int64_t) (restart_step_mult * decay_steps);
-    }
-    return cosine_decay(step, decay_steps, minimum);
-}
-
-float learning_schedule(
-    int64_t step,
-    int64_t warmup_steps,
-    int64_t cos_decay_steps,
-    float   learning_rate,
-    float   overall_minimum,
-    float   cos_decay_minimum,
-    float   cos_decay_restart_step_mult,
-    bool    enable_restart) {
-
-    float result =
-        (step < warmup_steps)
-            ? (float) step / (float) warmup_steps
-            : enable_restart
-                ? cosine_decay_restart(
-                    step - warmup_steps,
-                    cos_decay_steps,
-                    cos_decay_minimum,
-                    cos_decay_restart_step_mult)
-                : cosine_decay(
-                    step,
-                    cos_decay_steps,
-                    cos_decay_minimum);
-
-    float min = overall_minimum / learning_rate;
-    result = min + result * (1.0f - min);
-    return result;
-}
-
-static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
-    GGML_ASSERT(a != NULL);
-    GGML_ASSERT(b != NULL);
-    GGML_ASSERT(a->type == b->type);
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b));
-
-    return true;
-}
-
-void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
-    if (dst == NULL) {
-        return;
-    }
-    struct ggml_tensor * t  = ggml_get_tensor(ctx, name);
-    GGML_ASSERT(are_same_layout(dst, t));
-    memcpy(dst->data, t->data, ggml_nbytes(t));
-
-    if (strlen(ggml_get_name(dst)) == 0) {
-        ggml_set_name(dst, name);
-    }
-}
-
-// gguf constants
-static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type";
-static const char * LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam";
-static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs";
-static const char * LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version";
-static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count";
-static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count";
-static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count";
-static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized";
-static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss";
-static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss";
-static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count";
-static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count";
-static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k";
-static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end";
-static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count";
-
-static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments";
-static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments";
-static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values";
-
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s";
-static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y";
-
-static const char * LLM_KV_TRAINING_FILE_VERSION         = "training.file_version";
-static const char * LLM_KV_TRAINING_ITERATION_COUNT      = "training.iteration_count";
-static const char * LLM_KV_TRAINING_SAMPLE_COUNT         = "training.sample_count";
-static const char * LLM_KV_TRAINING_TOKEN_COUNT          = "training.token_count";
-static const char * LLM_KV_TRAINING_EPOCH_COUNT          = "training.epoch_count";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
-static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE    = "training.shuffle.rng_state";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
-static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE  = "training.shuffle.next_sample";
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-}
-
-void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-
-    uint32_t file_version;
-    GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION);
-    GGML_ASSERT(file_version == 0);
-
-    GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT);
-    GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT);
-    GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED);
-
-    uint64_t nx;
-    GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT);
-    opt->nx = (size_t) nx;
-
-    // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know
-
-    std::string opt_type;
-    GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
-    if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-        opt->params.type = GGML_OPT_TYPE_ADAM;
-
-        GGUF_GET_KEY(fctx, opt->adam.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
-        GGUF_GET_KEY(fctx, opt->adam.fx_prev,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
-        GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT);
-
-        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
-
-        copy_tensor_by_name(opt->adam.m,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
-        copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
-        copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
-    } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-        opt->params.type = GGML_OPT_TYPE_LBFGS;
-
-        GGUF_GET_KEY(fctx, opt->params.lbfgs.m,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
-        GGUF_GET_KEY(fctx, opt->lbfgs.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
-        GGUF_GET_KEY(fctx, opt->lbfgs.step,             gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP);
-        GGUF_GET_KEY(fctx, opt->lbfgs.j,                gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J);
-        GGUF_GET_KEY(fctx, opt->lbfgs.k,                gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K);
-        GGUF_GET_KEY(fctx, opt->lbfgs.end,              gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END);
-        GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT);
-
-        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
-
-        copy_tensor_by_name(opt->lbfgs.x,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
-        copy_tensor_by_name(opt->lbfgs.xp,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
-        copy_tensor_by_name(opt->lbfgs.g,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
-        copy_tensor_by_name(opt->lbfgs.gp,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
-        copy_tensor_by_name(opt->lbfgs.d,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
-        copy_tensor_by_name(opt->lbfgs.pf,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
-        copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
-        copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
-        copy_tensor_by_name(opt->lbfgs.lms,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
-        copy_tensor_by_name(opt->lbfgs.lmy,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
-    } else {
-        die("unknown optimizer type\n");
-    }
-}
-
-void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) {
-    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0);
-    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past);
-    gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx);
-    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter);
-    gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
-
-    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS,            opt->adam.fx_best);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS,        opt->adam.fx_prev);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement);
-
-                ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
-                ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
-                if (opt->adam.pf) {
-                    ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
-                }
-
-                gguf_add_tensor(fctx, opt->adam.m);
-                gguf_add_tensor(fctx, opt->adam.v);
-                if (opt->adam.pf) {
-                    gguf_add_tensor(fctx, opt->adam.pf);
-                }
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS,            opt->lbfgs.fx_best);
-                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP,     opt->lbfgs.step);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J,        opt->lbfgs.j);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K,        opt->lbfgs.k);
-                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END,      opt->lbfgs.end);
-                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);
-
-                ggml_set_name(opt->lbfgs.x,    LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
-                ggml_set_name(opt->lbfgs.xp,   LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
-                ggml_set_name(opt->lbfgs.g,    LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
-                ggml_set_name(opt->lbfgs.gp,   LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
-                ggml_set_name(opt->lbfgs.d,    LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
-                if (opt->lbfgs.pf) {
-                    ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
-                }
-                ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
-                ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
-                ggml_set_name(opt->lbfgs.lms,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
-                ggml_set_name(opt->lbfgs.lmy,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
-
-                gguf_add_tensor(fctx, opt->lbfgs.x);
-                gguf_add_tensor(fctx, opt->lbfgs.xp);
-                gguf_add_tensor(fctx, opt->lbfgs.g);
-                gguf_add_tensor(fctx, opt->lbfgs.gp);
-                gguf_add_tensor(fctx, opt->lbfgs.d);
-                if (opt->lbfgs.pf) {
-                    gguf_add_tensor(fctx, opt->lbfgs.pf);
-                }
-                gguf_add_tensor(fctx, opt->lbfgs.lmal);
-                gguf_add_tensor(fctx, opt->lbfgs.lmys);
-                gguf_add_tensor(fctx, opt->lbfgs.lms);
-                gguf_add_tensor(fctx, opt->lbfgs.lmy);
-            } break;
-    }
-}
-
-bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) {
-    if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) {
-        return false;
-    }
-
-    uint32_t file_version;
-    GGUF_GET_KEY(fctx, file_version,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
-    GGML_ASSERT(file_version <= 1);
-
-    if (file_version == 0) {
-
-        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
-        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);
-
-    } else if (file_version == 1) {
-
-        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
-        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
-        GGUF_GET_KEY(fctx, train->train_epochs,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT);
-
-        GGUF_GET_KEY(fctx, train->shuffle_samples_hash,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH);
-        GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE);
-        GGUF_GET_KEY(fctx, train->shuffle_sample_count,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, train->shuffle_next_sample,       gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE);
-    }
-
-    load_opt_context_gguf(fctx, f_ggml_ctx, train->opt);
-    return true;
-}
-
-void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) {
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION,    1);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT,    train->train_samples);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT,     train->train_tokens);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT,     train->train_epochs);
-
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash);
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE,    train->shuffle_rng_state_current.c_str());
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE,  (uint64_t) train->shuffle_next_sample);
-
-    save_opt_context_gguf(fctx, train->opt);
-}
-
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-static size_t utf8_len(char src) {
-    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-    return lookup[highbits];
-}
-
-// mark each byte with its utf8 unit number.
-// returns the number of utf8 characters.
-// e.g. when bytes == '\x61\xD0\xB0\x62',
-// then utf8_units will become [0,0,1,0]
-// utf8_nunits will become [1,2,2,1] and 3 is returned.
-// bytes where utf8_units is zero, are the begin of an utf8 character.
-static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
-    size_t offs = 0;
-    size_t count_utf8 = 0;
-    while(offs < count) {
-        int len = (int) utf8_len(bytes[offs]);
-        for (int i=0; i<len; ++i) {
-            utf8_units[offs+i]  = i;
-            utf8_nunits[offs+i] = len;
-        }
-        offs += len;
-        ++count_utf8;
-    }
-    return count_utf8;
-}
-
-size_t tokenize_file(
-        struct llama_context     * lctx,
-        const char               * filename,
-        const std::string        & sample_start,
-        bool                       include_sample_start,
-        bool                       overlapping_samples,
-        unsigned                   context_length,
-        std::vector<llama_token> & out_tokens,
-        std::vector<size_t>      & out_samples_begin,
-        std::vector<size_t>      & out_samples_size) {
-    struct llama_file f(filename, "rb");
-
-    if (f.size == 0) {
-        out_tokens.clear();
-        out_samples_begin.clear();
-        out_samples_size.clear();
-        printf("%s: warning: empty or not existing training data file '%s'\n",
-            __func__, filename);
-        return out_tokens.size();
-    }
-
-    // account for possible leading whitespace that will be added by tokenizer
-    // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
-    const int n_max_tokens_overhead = 1;
-
-    std::vector<char> buf;
-    buf.resize(f.size);
-
-    f.read_raw(buf.data(), f.size);
-
-    std::vector<int> utf8_units;
-    std::vector<int> utf8_nunits;
-    utf8_units.resize(buf.size());
-    utf8_nunits.resize(buf.size());
-    mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
-
-    if (sample_start.size() == 0) {
-        // tokenize all data at once
-        out_tokens.resize(buf.size() + n_max_tokens_overhead);
-
-        int n_tokens = llama_tokenize(
-            llama_get_model(lctx),
-            buf.data(),
-            (int) buf.size(),
-            out_tokens.data(),
-            (int) out_tokens.size(),
-            false, false);
-        if (n_tokens < 0) {
-            out_tokens.resize(-n_tokens);
-            n_tokens = llama_tokenize(
-                llama_get_model(lctx),
-                buf.data(),
-                (int) buf.size(),
-                out_tokens.data(),
-                (int) out_tokens.size(),
-                false, false);
-        }
-        if (n_tokens >= 0) {
-            out_tokens.resize(n_tokens);
-        }
-
-        // generate sample starts at all token positions
-        out_samples_begin.clear();
-        out_samples_begin.push_back(0);
-        out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
-        size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
-        for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
-            out_samples_begin.push_back(sample_begin);
-            out_samples_size.push_back(context_length);
-        }
-    } else {
-        // split data into samples and tokenize each sample
-        std::string data_str(buf.data(), buf.size());
-        out_samples_begin.clear();
-        out_samples_size.clear();
-        out_tokens.clear();
-
-        // find all positions of pattern sample_start
-        size_t sample_begin = data_str.find(sample_start, 0);
-        while (sample_begin != std::string::npos) {
-            out_samples_begin.push_back(sample_begin);
-            const size_t search_start = sample_begin + sample_start.size();
-            sample_begin = data_str.find(sample_start, search_start);
-        }
-        if (out_samples_begin.size() == 0) {
-            printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n",
-                __func__, sample_start.c_str());
-            out_samples_begin.push_back(0);
-        }
-
-        out_samples_size.resize(out_samples_begin.size(), 0);
-
-        std::vector<char>        buf_sample;
-        std::vector<llama_token> tok_sample;
-
-        const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
-        size_t found_too_big_sample   = 0;
-        size_t found_too_small_sample = 0;
-        size_t found_empty_sample     = 0;
-        size_t found_min_sample_size  = SIZE_MAX;
-        size_t found_max_sample_size  = 0;
-
-        size_t max_token_text_size = 0;
-        int n_vocab = llama_n_vocab(llama_get_model(lctx));
-        for (llama_token token=0; token < n_vocab; ++token) {
-            max_token_text_size = std::max(
-                max_token_text_size,
-                strlen(llama_token_get_text(llama_get_model(lctx), token)));
-        }
-
-        // upper bound of context byte length.
-        // strings with this byte length should always tokenize to at least context_length tokens.
-        size_t context_byte_len = max_token_text_size*context_length;
-
-        for (unsigned i=0; i<out_samples_begin.size(); ++i) {
-            // determine sample begin and end from pattern positions
-            size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
-            size_t sample_end   = overlapping_samples
-                                    ? std::min(
-                                        data_str.size(),
-                                        sample_begin + context_byte_len)
-                                    : (i+1 < out_samples_begin.size()
-                                        ? out_samples_begin[i+1]
-                                        : data_str.size());
-            if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
-                // sample end is in the middle of an utf8 character.
-                // advance sample_end to the begin of the next utf8 character.
-                sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
-            }
-            size_t sample_size = sample_end - sample_begin;
-            if (sample_size == 0) {
-                ++found_empty_sample;
-            }
-
-            if (sample_size > 0) {
-                // llama_tokenize expects zero terminated string,
-                // copy sample into buffer and zero terminate it.
-                buf_sample.resize(sample_size);
-                memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
-
-                // printf("sample: '%s'\n", buf_sample.data());
-
-                // tokenize the sample
-                tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
-                int n_tokens = llama_tokenize(llama_get_model(lctx),
-                    buf_sample.data(),
-                    (int) buf_sample.size(),
-                    tok_sample.data(),
-                    (int) tok_sample.size(),
-                    false, false);
-                if (n_tokens < 0) {
-                    tok_sample.resize(-n_tokens);
-                    n_tokens = llama_tokenize(llama_get_model(lctx),
-                        buf_sample.data(),
-                        (int) buf_sample.size(),
-                        tok_sample.data(),
-                        (int) tok_sample.size(),
-                        false, false);
-                    GGML_ASSERT(n_tokens >= 0);
-                }
-                GGML_ASSERT(n_tokens <= (int) tok_sample.size());
-
-                if ((size_t) n_tokens > context_length) {
-                    ++found_too_big_sample;
-                } else if ((size_t) n_tokens < context_length) {
-                    ++found_too_small_sample;
-                }
-                found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
-                found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
-
-                // write out tokens, start and size of sample
-                // overwrite the string start position with the token start position
-                out_samples_begin[i] = out_tokens.size();
-                out_samples_size[i] = (size_t) n_tokens;
-                out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
-            } else {
-                out_samples_begin[i] = out_tokens.size();
-                out_samples_size[i] = 0;
-            }
-
-        }
-        if (found_too_big_sample > 0) {
-            printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n",
-                __func__, found_too_big_sample, found_max_sample_size, context_length);
-        }
-
-        if (found_too_small_sample > 0) {
-            printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
-                __func__, found_too_small_sample, found_min_sample_size, context_length);
-        }
-
-        if (found_empty_sample) {
-            printf("%s: warning: found %zu empty samples.\n",
-                __func__, found_empty_sample);
-        }
-    }
-    printf("%s: total number of samples: %zu\n",
-        __func__, out_samples_begin.size());
-
-    GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
-
-    return out_tokens.size();
-}
-
-std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) {
-    std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
-    return replace_str(filename, pattern_it, sit.c_str());
-}
-
-struct train_params_common get_default_train_params_common() {
-    struct train_params_common params;
-    params.fn_train_data     = "shakespeare.txt";
-    params.fn_checkpoint_in  = "checkpoint.gguf";
-    params.fn_checkpoint_out = "checkpoint-ITERATION.gguf";
-    params.pattern_fn_it     = "ITERATION";
-    params.fn_latest         = "LATEST";
-
-    params.print_usage = false;
-
-    params.save_every = 10;
-
-    params.seed       =   -1;
-
-    params.n_ctx      =  128;
-    params.n_threads  =    6;
-    params.n_batch    =    8;
-    params.n_gradient_accumulation = 1;
-    params.n_epochs   = -1;
-    params.n_gpu_layers = 0;
-
-    params.custom_n_ctx = false;
-
-    params.use_flash              = true;
-    params.use_checkpointing      = true;
-
-    params.sample_start           = "";
-    params.include_sample_start   = false;
-    params.escape                 = false;
-    params.overlapping_samples    = false;
-    params.fill_with_next_samples = false;
-    params.separate_with_eos      = false;
-    params.separate_with_bos      = true;
-    params.sample_random_offsets  = false;
-    params.force_reshuffle        = false;
-
-    params.opt_past               = 0;
-    params.opt_delta              = 1e-5f;
-    params.opt_max_no_improvement = 0;
-
-    params.warmup            =  100;
-    params.cos_decay_steps   = 1000;
-    params.cos_decay_restart = 1.1f;
-    params.cos_decay_min     = 0.1f;
-    params.enable_restart    = false;
-
-    params.adam_n_iter         = 256;
-    params.adam_alpha          = 1e-3f;
-    params.adam_min_alpha      = 0;
-    params.adam_decay          = 1e-1f;
-    params.adam_decay_min_ndim = 2;
-    params.adam_beta1          = 0.9f;
-    params.adam_beta2          = 0.999f;
-    params.adam_gclip          = 1.0f;
-    params.adam_eps_f          = 0.0f;
-
-    return params;
-}
-
-void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) {
-    // fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    // fprintf(stderr, "\n");
-    // fprintf(stderr, "options:\n");
-    // fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-    fprintf(stderr, "  --train-data FNAME         path from which to load training data (default '%s')\n", params->fn_train_data);
-    fprintf(stderr, "  --checkpoint-in FNAME      path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
-    fprintf(stderr, "  --checkpoint-out FNAME     path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
-    fprintf(stderr, "  --pattern-fn-it STR        pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it);
-    fprintf(stderr, "  --fn-latest STR            string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest);
-    fprintf(stderr, "  --save-every N             save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every);
-    fprintf(stderr, "  -s SEED, --seed SEED       RNG seed (default: -1, use random seed for -1)\n");
-    fprintf(stderr, "  -c N, --ctx N              Context size used during training (default %d)\n", params->n_ctx);
-    fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
-    fprintf(stderr, "  -b N, --batch N            Parallel batch size (default %d)\n", params->n_batch);
-    fprintf(stderr, "  --grad-acc N               Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation);
-    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
-    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
-    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
-    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
-    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
-    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
-    fprintf(stderr, "  --no-separate-with-eos     When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : "");
-    fprintf(stderr, "  --no-separate-with-bos     When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : "");
-    fprintf(stderr, "  --sample-random-offsets    Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : "");
-    fprintf(stderr, "  --force-reshuffle          Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n");
-    fprintf(stderr, "  --no-flash                 Don't use flash attention \n");
-    fprintf(stderr, "  --use-flash                Use flash attention (default)\n");
-    fprintf(stderr, "  --no-checkpointing         Don't use gradient checkpointing\n");
-    fprintf(stderr, "  --use-checkpointing        Use gradient checkpointing (default)\n");
-    fprintf(stderr, "  --warmup N                 Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
-    fprintf(stderr, "  --cos-decay-steps N        Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
-    fprintf(stderr, "  --cos-decay-restart N      Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
-    fprintf(stderr, "  --cos-decay-min N          Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
-    fprintf(stderr, "  --enable-restart N         Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
-    fprintf(stderr, "  --disable-restart N        Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
-    fprintf(stderr, "  --opt-past N               Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
-    fprintf(stderr, "  --opt-delta N              Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
-    fprintf(stderr, "  --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
-    fprintf(stderr, "  --epochs N                 Maximum number epochs to process. (default %d)\n", params->n_epochs);
-    fprintf(stderr, "  --adam-iter N              Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
-    fprintf(stderr, "  --adam-alpha N             Adam learning rate alpha (default %f)\n", params->adam_alpha);
-    fprintf(stderr, "  --adam-min-alpha N         Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
-    fprintf(stderr, "  --adam-decay N             AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay);
-    fprintf(stderr, "  --adam-decay-min-ndim N    Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim);
-    fprintf(stderr, "  --adam-beta1 N             AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
-    fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
-    fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
-    fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N   Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
-    fprintf(stderr, "\n");
-}
-
-bool consume_common_train_arg(
-    int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param
-) {
-    int& i = *idx;
-    std::string arg = argv[i];
-    const std::string arg_prefix = "--";
-    if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-        std::replace(arg.begin(), arg.end(), '_', '-');
-    }
-    if (arg == "--train-data") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_train_data = argv[i];
-    } else if (arg == "--checkpoint-in") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_checkpoint_in = argv[i];
-    } else if (arg == "--checkpoint-out") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_checkpoint_out = argv[i];
-    } else if (arg == "--pattern-fn-it") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->pattern_fn_it = argv[i];
-    } else if (arg == "--fn-latest") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->fn_latest = argv[i];
-    } else if (arg == "--save-every") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->save_every = std::stoi(argv[i]);
-    } else if (arg == "-s" || arg == "--seed") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->seed = std::stoi(argv[i]);
-    } else if (arg == "-c" || arg == "--ctx") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_ctx = std::stoi(argv[i]);
-        params->custom_n_ctx = true;
-    } else if (arg == "-t" || arg == "--threads") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_threads = std::stoi(argv[i]);
-    } else if (arg == "-b" || arg == "--batch") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_batch = std::stoi(argv[i]);
-    } else if (arg == "--grad-acc") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_gradient_accumulation = std::max(1, std::stoi(argv[i]));
-    } else if (arg == "--sample-start") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->sample_start = std::string(argv[i]);
-    } else if (arg == "--escape") {
-        params->escape = true;
-    } else if (arg == "--include-sample-start") {
-        params->include_sample_start = true;
-    } else if (arg == "--overlapping-samples") {
-        params->overlapping_samples = true;
-    } else if (arg == "--fill-with-next-samples") {
-        params->fill_with_next_samples = true;
-    } else if (arg == "--separate-with-eos") {
-        params->separate_with_eos = true;
-    } else if (arg == "--separate-with-bos") {
-        params->separate_with_bos = true;
-    } else if (arg == "--no-separate-with-eos") {
-        params->separate_with_eos = false;
-    } else if (arg == "--no-separate-with-bos") {
-        params->separate_with_bos = false;
-    } else if (arg == "--sample-random-offsets") {
-        params->sample_random_offsets = true;
-    } else if (arg == "--force-reshuffle") {
-        params->force_reshuffle = true;
-    } else if (arg == "--no-flash") {
-        params->use_flash = false;
-    } else if (arg == "--use-flash") {
-        params->use_flash = true;
-    } else if (arg == "--no-checkpointing") {
-        params->use_checkpointing = false;
-    } else if (arg == "--use-checkpointing") {
-        params->use_checkpointing = true;
-    } else if (arg == "--warmup") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->warmup = std::stoi(argv[i]);
-    } else if (arg == "--cos-decay-steps") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->cos_decay_steps = std::stoi(argv[i]);
-    } else if (arg == "--cos-decay-restart") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->cos_decay_restart = std::stof(argv[i]);
-    } else if (arg == "--cos-decay-min") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->cos_decay_min = std::stof(argv[i]);
-    } else if (arg == "--enable-restart") {
-        params->enable_restart = true;
-    } else if (arg == "--disable-restart") {
-        params->enable_restart = false;
-    } else if (arg == "--opt-past") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->opt_past = std::stoi(argv[i]);
-    } else if (arg == "--opt-delta") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->opt_delta = std::stof(argv[i]);
-    } else if (arg == "--opt-max-no-improvement") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->opt_max_no_improvement = std::stoi(argv[i]);
-    } else if (arg == "--adam-epsf") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_eps_f = std::stof(argv[i]);
-    } else if (arg == "--epochs") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->n_epochs = std::stoi(argv[i]);
-    } else if (arg == "--adam-iter") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_n_iter = std::stoi(argv[i]);
-    } else if (arg == "--adam-alpha") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_alpha = std::stof(argv[i]);
-    } else if (arg == "--adam-min-alpha") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_min_alpha = std::stof(argv[i]);
-    } else if (arg == "--adam-decay") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_decay = std::stof(argv[i]);
-    } else if (arg == "--adam-decay-min-ndim") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_decay_min_ndim = std::stoi(argv[i]);
-    } else if (arg == "--adam-beta1") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_beta1 = std::stof(argv[i]);
-    } else if (arg == "--adam-beta2") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_beta2 = std::stof(argv[i]);
-    } else if (arg == "--adam-gclip") {
-        if (++i >= argc) {
-            *invalid_param = true;
-            return true;
-        }
-        params->adam_gclip = std::stof(argv[i]);
-    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                *invalid_param = true;
-                return true;
-            }
-            if (llama_supports_gpu_offload()) {
-                params->n_gpu_layers = std::stoi(argv[i]);
-            } else {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-    } else if (arg == "-h" || arg == "--help") {
-        params->print_usage = true;
-        return true;
-    } else {
-        return false;
-    }
-    return true;
-}
-
-void finish_processing_train_args(struct train_params_common * params) {
-    if (params->escape) {
-        process_escapes(params->sample_start);
-    }
-}
-
-void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
-    struct train_opt_callback_data * data   = (struct train_opt_callback_data *) vdata;
-    struct train_params_common     * params = data->params;
-    struct train_state             * train  = data->train;
-    struct ggml_opt_context        * opt    = train->opt;
-    int n_batch = params->n_batch;
-    int n_ctx = params->n_ctx;
-
-    if (accum_step == 0) {
-        // time measurement
-        int64_t now = ggml_time_ms();
-        if (now > data->last_time && opt->iter > data->first_iter) {
-            double dt = (double) (now - data->last_time);
-            if (data->millis_per_iter == 0.0) {
-                data->millis_per_iter = dt;
-            } else {
-                const double gain = 0.7;
-                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
-            }
-        }
-
-        double remaining_millis = 0.0;
-        if (data->millis_per_iter > 0.0) {
-            const int n_iter = params->adam_n_iter;
-            const int done_iter = opt->iter - data->first_iter;
-            const int remaining_iter = n_iter - done_iter;
-            remaining_millis = remaining_iter * data->millis_per_iter;
-        }
-
-        // file saving
-        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
-        if (save_now) {
-            int new_iters = opt->iter - data->last_save_iter;
-            train->train_its    += new_iters;
-            train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
-
-            if (data->save_cb) {
-                data->save_cb(data->save_data, train);
-            }
-
-            data->last_save_iter = opt->iter;
-        }
-
-        // exclude file saving from time measurement, by measuring last_time after saving
-        data->last_time = ggml_time_ms();
-
-        *sched = learning_schedule(
-            opt->iter,
-            params->warmup,
-            params->cos_decay_steps,
-            params->adam_alpha,
-            params->adam_min_alpha,
-            params->cos_decay_min,
-            params->cos_decay_restart,
-            params->enable_restart);
-
-        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
-        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
-        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
-            *sched, opt->loss_after);
-
-
-        if (data->millis_per_iter > 0) {
-            printf(" dt=");
-            print_duration(data->millis_per_iter);
-            printf(" eta=");
-            print_duration(remaining_millis);
-        }
-
-        float improvement = opt->loss_before - opt->loss_after;
-        const float plot_scale = 10.0f;
-        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
-        printf(" |");
-        for (int i=0; i<bar_len; ++i) {
-            printf("-");
-        }
-        printf(">");
-        printf("\n");
-    }
-
-    int64_t used_samples = get_example_targets_batch(
-        data->lctx,
-        data->tokens_input,
-        data->target_probs,
-        train->shuffle_next_sample,
-        data->shuffled_samples_offs,
-        data->shuffled_samples_begin,
-        data->shuffled_samples_size,
-        data->samples_count,
-        data->tokens_data,
-        data->tokens_size,
-        params->separate_with_eos,
-        params->separate_with_bos,
-        params->fill_with_next_samples,
-        params->sample_random_offsets);
-
-    train->train_samples += used_samples;
-    train->shuffle_next_sample += used_samples;
-
-    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
-        ++train->train_epochs;
-        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
-        // note: we may have used some samples from the current shuffling more than once
-        train->shuffle_rng_state_current = train->shuffle_rng_state_next;
-        train->shuffle_rng_state_next = shuffle_samples(
-            train->shuffle_rng_state_current,
-            data->shuffled_samples_offs,
-            data->shuffled_samples_begin,
-            data->shuffled_samples_size,
-            data->samples_begin,
-            data->samples_size,
-            data->samples_count);
-        train->shuffle_next_sample = 0;
-    }
-
-    const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs);
-    if (last_epoch_reached) {
-        // allow optimization iteration at last epoch to be completed before canceling
-        if (data->iter_at_last_epoch < 0) {
-            data->iter_at_last_epoch = opt->iter;
-        } else if (opt->iter > data->iter_at_last_epoch) {
-            *cancel = true;
-        }
-    }
-}
diff --git a/common/train.h b/common/train.h
deleted file mode 100644
index 263d940c0..000000000
--- a/common/train.h
+++ /dev/null
@@ -1,233 +0,0 @@
-// Various helper functions and utilities for training
-
-#pragma once
-
-#include <string>
-#include <random>
-#include <vector>
-
-#include "ggml.h"
-#include "llama.h"
-
-#define LLAMA_TRAIN_MAX_NODES 16384
-
-typedef std::string mt19937_state;
-
-struct train_state {
-    struct ggml_opt_context * opt;
-
-    uint64_t train_its;
-    uint64_t train_samples;
-    uint64_t train_tokens;
-    uint64_t train_epochs;
-
-    size_t        shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
-    mt19937_state shuffle_rng_state_current;
-    mt19937_state shuffle_rng_state_next;
-    size_t        shuffle_sample_count;
-    size_t        shuffle_next_sample;
-};
-
-struct train_params_common {
-    const char * fn_train_data;
-    const char * fn_checkpoint_in;
-    const char * fn_checkpoint_out;
-    const char * pattern_fn_it;
-    const char * fn_latest;
-
-    bool print_usage;
-
-    int save_every;
-
-    uint32_t seed;
-
-    int n_ctx;
-    int n_threads;
-    int n_batch;
-    int n_gradient_accumulation;
-    int n_epochs;
-    int n_gpu_layers;
-
-    bool custom_n_ctx;
-
-    bool use_flash;
-    bool use_checkpointing;
-
-    std::string sample_start;
-    bool include_sample_start;
-    bool escape;
-    bool overlapping_samples;
-    bool fill_with_next_samples;
-    bool separate_with_eos;
-    bool separate_with_bos;
-    bool sample_random_offsets;
-
-    bool force_reshuffle;
-
-    int   warmup;
-    int   cos_decay_steps;
-    float cos_decay_restart;
-    float cos_decay_min;
-    bool  enable_restart;
-
-    int   opt_past;
-    float opt_delta;
-    int   opt_max_no_improvement;
-
-    int   adam_n_iter;
-    float adam_alpha;
-    float adam_min_alpha;
-    float adam_decay;
-    int   adam_decay_min_ndim;
-    float adam_beta1;
-    float adam_beta2;
-    float adam_gclip;
-    float adam_eps_f;
-};
-
-typedef void (*save_train_files_callback)(void * data, struct train_state * train);
-
-struct train_opt_callback_data {
-    struct train_params_common * params;
-    struct train_state         * train;
-    save_train_files_callback    save_cb;
-    void                       * save_data;
-    struct llama_context       * lctx;
-    int                          last_save_iter;
-    llama_token                * tokens_data;
-    size_t                       tokens_size;
-    size_t                     * samples_begin;
-    size_t                     * samples_size;
-    size_t                     * shuffled_samples_offs;
-    size_t                     * shuffled_samples_begin;
-    size_t                     * shuffled_samples_size;
-    size_t                       samples_count;
-    struct ggml_tensor         * tokens_input;
-    struct ggml_tensor         * target_probs;
-    int                          first_iter;
-    int                          first_epoch;
-    int                          iter_at_last_epoch;
-    int64_t                      last_time;
-    double                       millis_per_iter;
-};
-
-struct train_state * init_train_state();
-void free_train_state(struct train_state  * state);
-
-struct train_params_common get_default_train_params_common();
-void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
-
-bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
-void finish_processing_train_args(struct train_params_common * params);
-
-struct random_normal_distribution;
-struct random_uniform_distribution;
-
-struct random_normal_distribution  * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
-struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
-
-void free_random_normal_distribution (struct random_normal_distribution  * rnd);
-void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
-
-struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
-struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
-
-// generate random float in interval [0,1)
-float frand();
-float frand_normal (struct random_normal_distribution * rnd);
-float frand_uniform(struct random_uniform_distribution * rnd);
-
-int   clamp (const int v, const int min, const int max);
-float fclamp(const float v, const float min, const float max);
-
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
-
-size_t tokenize_file(
-        struct llama_context     * lctx,
-        const char               * filename,
-        const std::string        & sample_start,
-        bool                       include_sample_start,
-        bool                       overlapping_samples,
-        unsigned                   context_length,
-        std::vector<llama_token> & out_tokens,
-        std::vector<size_t>      & out_samples_begin,
-        std::vector<size_t>      & out_samples_size);
-
-int64_t get_example_targets_batch(
-        struct llama_context * lctx,
-        struct ggml_tensor   * tokens_input,
-        struct ggml_tensor   * target_probs,
-        int64_t                example_id,
-        const size_t         * samples_offs,
-        const size_t         * samples_begin,
-        const size_t         * samples_size,
-              size_t           samples_count,
-        const llama_token    * train_data,
-        size_t                 n_train_data,
-        bool                   separate_with_eos,
-        bool                   separate_with_bos,
-        bool                   fill_with_next_samples,
-        bool                   sample_random_offsets);
-
-
-void          mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
-mt19937_state mt19937_get_state(const std::mt19937& rng);
-mt19937_state mt19937_seed_to_state(unsigned seed);
-
-mt19937_state shuffle_samples(
-        const mt19937_state & rng_state,
-        size_t              * shuffled_offs,
-        size_t              * shuffled_begins,
-        size_t              * shuffled_sizes,
-        const size_t        * begins,
-        const size_t        * sizes,
-        size_t                count);
-
-size_t hash_combine(size_t h1, size_t h2);
-
-size_t compute_samples_hash(
-    const char* fn,
-    const size_t* samples_begin,
-    const size_t* samples_size,
-    size_t sample_count);
-
-
-std::string replace_str(const char * s, const char * needle, const char * replacement);
-
-void print_duration(double milliseconds);
-
-float cosine_decay(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum);
-
-float cosine_decay_restart(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum,
-    float   restart_step_mult);
-
-float learning_schedule(
-    int64_t step,
-    int64_t warmup_steps,
-    int64_t decay_steps,
-    float   learning_rate,
-    float   overall_minimum,
-    float   cos_decay_minimum,
-    float   cos_decay_restart_step_mult,
-    bool    enable_restart);
-
-void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
-
-void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
-void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
-
-bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
-void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
-
-std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
-
-void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
deleted file mode 100755
index 28b92ac38..000000000
--- a/convert-hf-to-gguf.py
+++ /dev/null
@@ -1,1939 +0,0 @@
-#!/usr/bin/env python3
-
-from __future__ import annotations
-
-import argparse
-import contextlib
-import json
-import os
-import re
-import sys
-from enum import IntEnum
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
-
-import numpy as np
-import torch
-
-if TYPE_CHECKING:
-    from torch import Tensor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-from convert import HfVocab
-
-
-###### MODEL DEFINITIONS ######
-
-class SentencePieceTokenTypes(IntEnum):
-    NORMAL = 1
-    UNKNOWN = 2
-    CONTROL = 3
-    USER_DEFINED = 4
-    UNUSED = 5
-    BYTE = 6
-
-
-class Model:
-    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
-        self.dir_model = dir_model
-        self.ftype = ftype
-        self.fname_out = fname_out
-        self.is_big_endian = is_big_endian
-        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
-        self.is_safetensors = self._is_model_safetensors()
-        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
-        self.part_names = self._get_part_names()
-        self.hparams = Model.load_hparams(self.dir_model)
-        self.model_arch = self._get_model_architecture()
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
-
-    def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
-        key = next((k for k in keys if k in self.hparams), None)
-        if key is not None:
-            return self.hparams[key]
-        if optional:
-            return None
-        raise KeyError(f"could not find any of: {keys}")
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
-        for part_name in self.part_names:
-            print(f"gguf: loading model part '{part_name}'")
-            ctx: ContextManager[Any]
-            if self.is_safetensors:
-                from safetensors import safe_open
-                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
-            else:
-                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
-
-            with ctx as model_part:
-                for name in model_part.keys():
-                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
-                    yield name, data
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_block_count(self.block_count)
-
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
-            self.gguf_writer.add_context_length(n_ctx)
-
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        self.gguf_writer.add_embedding_length(n_embd)
-
-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
-            self.gguf_writer.add_feed_forward_length(n_ff)
-
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        self.gguf_writer.add_head_count(n_head)
-
-        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
-            self.gguf_writer.add_head_count_kv(n_head_kv)
-
-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
-        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
-            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
-            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-    def write(self):
-        self.write_tensors()
-        self.gguf_writer.write_header_to_file()
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.write_tensors_to_file()
-        self.gguf_writer.close()
-
-    def write_vocab(self):
-        self.gguf_writer.write_header_to_file()
-        self.gguf_writer.write_kv_data_to_file()
-        self.gguf_writer.close()
-
-    @staticmethod
-    def count_model_parts(dir_model: Path, prefix: str) -> int:
-        num_parts = 0
-        for filename in os.listdir(dir_model):
-            if filename.endswith(prefix):
-                num_parts += 1
-
-        return num_parts
-
-    @staticmethod
-    def load_hparams(dir_model):
-        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
-
-    @staticmethod
-    def from_model_architecture(model_architecture):
-        if model_architecture == "GPTNeoXForCausalLM":
-            return GPTNeoXModel
-        if model_architecture == "BloomForCausalLM":
-            return BloomModel
-        if model_architecture == "MPTForCausalLM":
-            return MPTModel
-        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
-            return BaichuanModel
-        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
-            return FalconModel
-        if model_architecture == "GPTBigCodeForCausalLM":
-            return StarCoderModel
-        if model_architecture == "GPTRefactForCausalLM":
-            return RefactModel
-        if model_architecture == "PersimmonForCausalLM":
-            return PersimmonModel
-        if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
-            return StableLMModel
-        if model_architecture == "QWenLMHeadModel":
-            return QwenModel
-        if model_architecture == "Qwen2ForCausalLM":
-            return Model
-        if model_architecture == "MixtralForCausalLM":
-            return MixtralModel
-        if model_architecture == "GPT2LMHeadModel":
-            return GPT2Model
-        if model_architecture == "PhiForCausalLM":
-            return Phi2Model
-        if model_architecture == "PlamoForCausalLM":
-            return PlamoModel
-        if model_architecture == "CodeShellForCausalLM":
-            return CodeShellModel
-        if model_architecture == "OrionForCausalLM":
-            return OrionModel
-        if model_architecture == "InternLM2ForCausalLM":
-            return InternLM2Model
-        if model_architecture == "MiniCPMForCausalLM":
-            return MiniCPMModel
-        if model_architecture == "BertModel":
-            return BertModel
-        if model_architecture == "NomicBertModel":
-            return NomicBertModel
-        if model_architecture == "GemmaForCausalLM":
-            return GemmaModel
-        if model_architecture == "Starcoder2ForCausalLM":
-            return Model
-        return Model
-
-    def _is_model_safetensors(self) -> bool:
-        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
-
-    def _get_part_names(self):
-        if self.is_safetensors:
-            if self.num_parts == 1:  # there's only one .safetensors file
-                return ("model.safetensors",)
-            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
-
-        if self.num_parts == 1:  # there's only one .bin file
-            return ("pytorch_model.bin",)
-        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
-
-    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
-        arch = self.hparams["architectures"][0]
-        if arch == "GPTNeoXForCausalLM":
-            return gguf.MODEL_ARCH.GPTNEOX
-        if arch == "BloomForCausalLM":
-            return gguf.MODEL_ARCH.BLOOM
-        if arch == "MPTForCausalLM":
-            return gguf.MODEL_ARCH.MPT
-        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
-            return gguf.MODEL_ARCH.BAICHUAN
-        if arch in ("FalconForCausalLM", "RWForCausalLM"):
-            return gguf.MODEL_ARCH.FALCON
-        if arch == "GPTBigCodeForCausalLM":
-            return gguf.MODEL_ARCH.STARCODER
-        if arch == "GPTRefactForCausalLM":
-            return gguf.MODEL_ARCH.REFACT
-        if arch == "PersimmonForCausalLM":
-            return gguf.MODEL_ARCH.PERSIMMON
-        if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
-            return gguf.MODEL_ARCH.STABLELM
-        if arch == "QWenLMHeadModel":
-            return gguf.MODEL_ARCH.QWEN
-        if arch == "Qwen2ForCausalLM":
-            return gguf.MODEL_ARCH.QWEN2
-        if arch == "MixtralForCausalLM":
-            return gguf.MODEL_ARCH.LLAMA
-        if arch == "GPT2LMHeadModel":
-            return gguf.MODEL_ARCH.GPT2
-        if arch == "PhiForCausalLM":
-            return gguf.MODEL_ARCH.PHI2
-        if arch == "PlamoForCausalLM":
-            return gguf.MODEL_ARCH.PLAMO
-        if arch == "CodeShellForCausalLM":
-            return gguf.MODEL_ARCH.CODESHELL
-        if arch == "OrionForCausalLM":
-            return gguf.MODEL_ARCH.ORION
-        if arch == "InternLM2ForCausalLM":
-            return gguf.MODEL_ARCH.INTERNLM2
-        if arch == "MiniCPMForCausalLM":
-            return gguf.MODEL_ARCH.MINICPM
-        if arch == "BertModel":
-            return gguf.MODEL_ARCH.BERT
-        if arch == "NomicBertModel":
-            return gguf.MODEL_ARCH.NOMIC_BERT
-        if arch == "GemmaForCausalLM":
-            return gguf.MODEL_ARCH.GEMMA
-        if arch == "Starcoder2ForCausalLM":
-            return gguf.MODEL_ARCH.STARCODER2
-
-        raise NotImplementedError(f'Architecture "{arch}" not supported!')
-
-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def _set_vocab_qwen(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[QwenModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
-        # only add special tokens when they were not already loaded from config.json
-        if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
-        # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def _set_vocab_sentencepiece(self):
-        from sentencepiece import SentencePieceProcessor
-
-        tokenizer_path = self.dir_model / 'tokenizer.model'
-
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
-
-        if not tokenizer_path.is_file():
-            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
-            sys.exit(1)
-
-        tokenizer = SentencePieceProcessor(str(tokenizer_path))
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        for token_id in range(vocab_size):
-            piece = tokenizer.id_to_piece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.get_score(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.is_unknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.is_control(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.is_unused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.is_byte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        added_tokens_file = self.dir_model / 'added_tokens.json'
-        if added_tokens_file.is_file():
-            with open(added_tokens_file, "r", encoding="utf-8") as f:
-                added_tokens_json = json.load(f)
-
-                for key in added_tokens_json:
-                    tokens.append(key.encode("utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
-        tokens = []
-        scores = []
-        toktypes = []
-
-        for text, score, toktype in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        assert len(tokens) == vocab.vocab_size
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-
-class GPTNeoXModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(
-            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
-        )
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-
-
-class BloomModel(Model):
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Bloom")
-        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
-        self.gguf_writer.add_embedding_length(n_embed)
-        self.gguf_writer.add_feed_forward_length(4 * n_embed)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams["n_layer"]
-        tensors = dict(self.get_tensors())
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        has_lm_head = True
-        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
-        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-
-        for name, data_torch in tensors.items():
-            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
-                has_lm_head = False
-
-            name = re.sub(r'transformer\.', '', name)
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
-                # Map bloom-style qkv_linear to gpt-style qkv_linear
-                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
-                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
-                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
-                data = np.concatenate(
-                    (
-                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
-                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
-                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
-                    ),
-                    axis=0,
-                )
-                print("re-format attention.linear_qkv.weight")
-            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
-                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
-                data = np.concatenate(
-                    (
-                        qkv_bias[:, 0, :].reshape((n_embed,)),
-                        qkv_bias[:, 1, :].reshape((n_embed,)),
-                        qkv_bias[:, 2, :].reshape((n_embed,)),
-                    ),
-                    axis=0,
-                )
-                print("re-format attention.linear_qkv.bias")
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            if not has_lm_head and name == "word_embeddings.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-
-class MPTModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams["n_layers"]
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
-        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
-        self.gguf_writer.add_head_count(self.hparams["n_heads"])
-        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
-            self.gguf_writer.add_head_count_kv(kv_n_heads)
-        self.gguf_writer.add_layer_norm_eps(1e-5)
-        if self.hparams["attn_config"]["clip_qkv"] is not None:
-            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
-        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            if "scales" in name:
-                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                if new_name is not None:
-                    new_name = new_name.replace("scales", "act.scales")
-            else:
-                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class OrionModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            print("gguf: can not find ctx length parameter.")
-            sys.exit()
-
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
-        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        # note: config provides rms norm but it is actually layer norm
-        # ref:  https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
-        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
-
-    def write_tensors(self):
-        # Collect tensors from generator object
-        model_kv = dict(self.get_tensors())
-        block_count = self.hparams["num_hidden_layers"]
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class BaichuanModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            print("gguf: can not find ctx length parameter.")
-            sys.exit()
-
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
-        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-
-    def write_tensors(self):
-        # Collect tensors from generator object
-        model_kv = dict(self.get_tensors())
-        block_count = self.hparams["num_hidden_layers"]
-        head_count = self.hparams["num_attention_heads"]
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        for i in range(block_count):
-            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
-                print(f"Unpacking and permuting layer {i}")
-                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
-                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
-                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
-                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
-                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
-                    self._reverse_hf_part(w, 2)
-                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
-
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
-
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
-
-    def _reverse_hf_permute_part(
-        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
-    ) -> Tensor:
-        r = weights.shape[0] // 3
-        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
-
-    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
-        r = weights.shape[0] // 3
-        return weights[r * n_part:r * n_part + r, ...]
-
-
-class FalconModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams.get("num_hidden_layers")
-        if block_count is None:
-            block_count = self.hparams["n_layer"]  # old name
-
-        n_head = self.hparams.get("num_attention_heads")
-        if n_head is None:
-            n_head = self.hparams["n_head"]  # old name
-
-        n_head_kv = self.hparams.get("num_kv_heads")
-        if n_head_kv is None:
-            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
-
-        self.gguf_writer.add_name("Falcon")
-        self.gguf_writer.add_context_length(2048)  # not in config.json
-        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head_kv)
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("num_hidden_layers")
-        if block_count is None:
-            block_count = self.hparams["n_layer"]  # old name
-
-        n_head = self.hparams.get("num_attention_heads")
-        if n_head is None:
-            n_head = self.hparams["n_head"]  # old name
-
-        n_head_kv = self.hparams.get("num_kv_heads")
-        if n_head_kv is None:
-            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
-
-        head_dim = self.hparams["hidden_size"] // n_head
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # QKV tensor transform
-            # The original query_key_value tensor contains n_head_kv "kv groups",
-            # each consisting of n_head/n_head_kv query weights followed by one key
-            # and one value weight (shared by all query heads in the kv group).
-            # This layout makes it a big pain to work with in GGML.
-            # So we rearrange them here,, so that we have n_head query weights
-            # followed by n_head_kv key weights followed by n_head_kv value weights,
-            # in contiguous fashion.
-            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
-            if "query_key_value" in name:
-                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
-                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class StarCoderModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams["n_layer"]
-
-        self.gguf_writer.add_name("StarCoder")
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_head_count_kv(1)
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-
-class RefactModel(Model):
-    def set_gguf_parameters(self):
-        hidden_dim = self.hparams["n_embd"]
-        inner_dim = 4 * hidden_dim
-        hidden_dim = int(2 * inner_dim / 3)
-        multiple_of = 256
-        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
-        block_count = self.hparams["n_layer"]
-
-        self.gguf_writer.add_name("Refact")
-        # refact uses Alibi. So this is from config.json which might be used by training.
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-
-        self.gguf_writer.add_feed_forward_length(ff_dim)
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_head_count_kv(1)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        hidden_dim = self.hparams["n_embd"]
-        inner_dim = 4 * hidden_dim
-        hidden_dim = int(2 * inner_dim / 3)
-        multiple_of = 256
-        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-        n_head = self.hparams["n_head"]
-        n_head_kv = 1
-        head_dim = self.hparams["n_embd"] // n_head
-        block_count = self.hparams["n_layer"]
-
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        tensors = dict(self.get_tensors())
-        for i in range(block_count):
-            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
-                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
-                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
-                del tensors[f"transformer.h.{i}.attn.kv.weight"]
-            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
-                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
-                del tensors[f"transformer.h.{i}.attn.q.weight"]
-            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
-                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
-                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
-                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
-
-        for name, data_torch in tensors.items():
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class PersimmonModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = head_count
-        hidden_size = self.hparams["hidden_size"]
-
-        self.gguf_writer.add_name('persimmon-8b-chat')
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hidden_size)
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-
-        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
-        #       than the head size?
-        #       ref: https://github.com/ggerganov/llama.cpp/pull/4889
-        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
-        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
-
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-        # self.gguf_writer.add_bos_token_id(71013)
-        # self.gguf_writer.add_eos_token_id(71013)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            if name.endswith(".self_attention.rotary_emb.inv_freq"):
-                continue
-            old_dtype = data_torch.dtype
-            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-            data = data_torch.to(torch.float32).squeeze().numpy()
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-            n_dims = len(data.shape)
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class StableLMModel(Model):
-    def set_vocab(self):
-        if (self.dir_model / "tokenizer.json").is_file():
-            self._set_vocab_gpt2()
-        else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
-            self._set_vocab_qwen()
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
-        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
-
-
-class MixtralModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-
-class MiniCPMModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_name("MiniCPM")
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def set_vocab(self):
-        self._set_vocab_hf()
-
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
-
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # HF models permute some of the tensors, so we need to undo that
-            if name.endswith(("q_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
-            if name.endswith(("k_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class QwenModel(Model):
-    @staticmethod
-    def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
-        byte_encoder = bytes_to_unicode()
-        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
-
-    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
-        parts = [bytes([b]) for b in token]
-        while True:
-            min_idx = None
-            min_rank = None
-            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
-                rank = mergeable_ranks.get(pair[0] + pair[1])
-                if rank is not None and (min_rank is None or rank < min_rank):
-                    min_idx = i
-                    min_rank = rank
-            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
-                break
-            assert min_idx is not None
-            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
-        return parts
-
-    def set_vocab(self):
-        self._set_vocab_qwen()
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Qwen")
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-
-    def write_tensors(self):
-        block_count = self.hparams["num_hidden_layers"]
-        model_kv = dict(self.get_tensors())
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class GPT2Model(Model):
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
-                continue
-
-            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
-                data_torch = data_torch.transpose(1, 0)
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            # note: GPT2 output is tied to (same as) wte in original model
-            if new_name == "token_embd.weight":
-                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-                self.gguf_writer.add_tensor("output.weight", data)
-
-
-class Phi2Model(Model):
-    def set_gguf_parameters(self):
-        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
-        rot_pct = self.find_hparam(["partial_rotary_factor"])
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-
-        self.gguf_writer.add_name("Phi2")
-        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
-        self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(4 * n_embd)
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_add_bos_token(False)
-
-
-class PlamoModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name("PLaMo")
-        self.gguf_writer.add_context_length(4096)  # not in config.json
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
-        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
-
-    def shuffle_attn_q_weight(self, data_torch):
-        assert data_torch.size() == (5120, 5120)
-        data_torch = data_torch.reshape(8, 5, 128, 5120)
-        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
-        data_torch = torch.reshape(data_torch, (5120, 5120))
-        return data_torch
-
-    def shuffle_attn_output_weight(self, data_torch):
-        assert data_torch.size() == (5120, 5120)
-        data_torch = data_torch.reshape(5120, 8, 5, 128)
-        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
-        data_torch = torch.reshape(data_torch, (5120, 5120))
-        return data_torch
-
-    def write_tensors(self):
-        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            if "self_attn.rotary_emb.inv_freq" in name:
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            # shuffle for broadcasting of gqa in ggml_mul_mat
-            if new_name.endswith("attn_q.weight"):
-                data_torch = self.shuffle_attn_q_weight(data_torch)
-            elif new_name.endswith("attn_output.weight"):
-                data_torch = self.shuffle_attn_output_weight(data_torch)
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class CodeShellModel(Model):
-    def set_gguf_parameters(self):
-        block_count = self.hparams["n_layer"]
-
-        self.gguf_writer.add_name("CodeShell")
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_freq_base(10000.0)
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-        self.gguf_writer.add_rope_scaling_factor(1.0)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        tensors = dict(self.get_tensors())
-        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
-        for name, data_torch in tensors.items():
-            # we don't need these
-            if name.endswith((".attn.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            if not has_lm_head and name == "transformer.wte.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-
-class InternLM2Model(Model):
-    def set_vocab(self):
-        # (TODO): Is there a better way?
-        # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
-        # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
-        # recognized as an empty string in C++.
-        from sentencepiece import SentencePieceProcessor
-        from sentencepiece import sentencepiece_model_pb2 as model
-
-        tokenizer_path = self.dir_model / 'tokenizer.model'
-
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
-
-        if not tokenizer_path.is_file():
-            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
-            sys.exit(1)
-
-        sentencepiece_model = model.ModelProto()
-        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-
-        tokenizer = SentencePieceProcessor(str(tokenizer_path))
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        for token_id in range(vocab_size):
-            piece = tokenizer.id_to_piece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.get_score(token_id)
-            if text == b"\x00":
-                # (TODO): fixme
-                # Hack here and replace the \x00 characters.
-                print(f"InternLM2 convert token '{text}' to '🐉'!")
-                text = "🐉"
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.is_unknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.is_control(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.is_unused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.is_byte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        added_tokens_file = self.dir_model / 'added_tokens.json'
-        if added_tokens_file.is_file():
-            with open(added_tokens_file, "r", encoding="utf-8") as f:
-                added_tokens_json = json.load(f)
-
-                for key in added_tokens_json:
-                    tokens.append(key.encode("utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_add_space_prefix(add_prefix)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
-            # For the chat model, we replace the eos with '<|im_end|>'.
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
-
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.encode('<|im_end|>')
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        return eos_token
-
-    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name("InternLM2")
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-
-    def post_write_tensors(self, tensor_map, name, data_torch):
-        old_dtype = data_torch.dtype
-
-        # convert any unsupported data types to float32
-        if data_torch.dtype not in (torch.float16, torch.float32):
-            data_torch = data_torch.to(torch.float32)
-
-        data = data_torch.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-        if new_name is None:
-            print(f"Can not map tensor {name!r}")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if self.ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-        self.gguf_writer.add_tensor(new_name, data)
-
-    def write_tensors(self):
-        from einops import rearrange
-
-        num_heads = self.hparams.get("num_attention_heads")
-        num_kv_heads = self.hparams.get("num_key_value_heads")
-        hidden_size = self.hparams.get("hidden_size")
-        q_per_kv = num_heads // num_kv_heads
-        head_dim = hidden_size // num_heads
-        num_groups = num_heads // q_per_kv
-
-        block_count = self.hparams["num_hidden_layers"]
-        model_kv = dict(self.get_tensors())
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            if re.match(qkv_pattern, name):
-                bid = re.findall(qkv_pattern, name)[0]
-                qkv = data_torch
-                qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
-                q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
-                # The model weights of q and k equire additional reshape.
-                q = self._hf_permute_qk(rearrange(q, " o g n i ->  o (g n i)").T, num_heads, num_heads)
-                k = self._hf_permute_qk(rearrange(k, " o g n i ->  o (g n i)").T, num_heads, num_kv_heads)
-                v = rearrange(v, " o g n i ->  o (g n i)").T
-                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
-                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
-                self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
-            else:
-                self.post_write_tensors(tensor_map, name, data_torch)
-
-
-class BertModel(Model):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.vocab_size = None
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_causal_attention(False)
-
-        # get pooling path
-        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
-            modules = json.load(f)
-        pooling_path = None
-        for mod in modules:
-            if mod["type"] == "sentence_transformers.models.Pooling":
-                pooling_path = mod["path"]
-                break
-
-        # get pooling type
-        pooling_type = gguf.PoolingType.NONE
-        if pooling_path is not None:
-            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
-                pooling = json.load(f)
-            if pooling["pooling_mode_mean_tokens"]:
-                pooling_type = gguf.PoolingType.MEAN
-            elif pooling["pooling_mode_cls_token"]:
-                pooling_type = gguf.PoolingType.CLS
-            else:
-                raise NotImplementedError("Only MEAN and CLS pooling types supported")
-
-        self.gguf_writer.add_pooling_type(pooling_type.value)
-
-    def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
-        # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
-
-        # we need this to validate the size of the token_type embeddings
-        # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
-
-        # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
-                return tok
-            if tok.startswith(b"##"):
-                return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
-
-        # add vocab to gguf
-        self.gguf_writer.add_tokenizer_model("bert")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        # handle special tokens
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def write_tensors(self):
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-        tensors = dict(self.get_tensors())
-        for name, data_torch in tensors.items():
-            # we are only using BERT for embeddings so we don't need the pooling layer
-            if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
-                continue  # we don't need these
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            data = data_torch.squeeze().numpy()
-            n_dims = len(data.shape)
-            new_dtype: type[np.floating[Any]]
-
-            if (
-                self.ftype == 1 and name.endswith(".weight") and n_dims == 2
-                and name != "embeddings.token_type_embeddings.weight"  # not used with get_rows, must be F32
-            ):
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                new_dtype = np.float16
-            else:
-                # if f32 desired, convert any float16 to float32
-                new_dtype = np.float32
-
-            print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
-
-            if data.dtype != new_dtype:
-                data = data.astype(new_dtype)
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class NomicBertModel(BertModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # the HF config claims n_ctx=8192, but it uses RoPE scaling
-        self.hparams["n_ctx"] = 2048
-
-        # SwigLU activation
-        assert self.hparams["activation_function"] == "swiglu"
-        # this doesn't do anything in the HF version
-        assert self.hparams["causal"] is False
-        # no bias tensors
-        assert self.hparams["qkv_proj_bias"] is False
-        assert self.hparams["mlp_fc1_bias"] is False
-        assert self.hparams["mlp_fc2_bias"] is False
-        # norm at end of layer
-        assert self.hparams["prenorm"] is False
-        # standard RoPE
-        assert self.hparams["rotary_emb_fraction"] == 1.0
-        assert self.hparams["rotary_emb_interleaved"] is False
-        assert self.hparams["rotary_emb_scale_base"] is None
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-
-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-
-
-class GemmaModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_key_length(hparams["head_dim"])
-        self.gguf_writer.add_value_length(hparams["head_dim"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
-            if name.endswith("norm.weight"):
-                data_torch = data_torch + 1
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-###### CONVERSION LOGIC ######
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a huggingface model to a GGML compatible file")
-    parser.add_argument(
-        "--vocab-only", action="store_true",
-        help="extract only the vocab",
-    )
-    parser.add_argument(
-        "--awq-path", type=Path, default=None,
-        help="Path to scale awq cache file")
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input",
-    )
-    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16"], default="f16",
-        help="output format - use f32 for float32, f16 for float16",
-    )
-    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
-    parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
-    )
-
-    return parser.parse_args()
-
-
-def main() -> None:
-    args = parse_args()
-
-    dir_model = args.model
-
-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
-        tmp_model_path = args.model / "weighted_model"
-        dir_model = tmp_model_path
-        if tmp_model_path.is_dir():
-            print(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            print("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            print(f"Saved weighted model at {tmp_model_path}.")
-
-    if not dir_model.is_dir():
-        print(f'Error: {args.model} is not a directory', file=sys.stderr)
-        sys.exit(1)
-
-    ftype_map = {
-        "f32": gguf.GGMLQuantizationType.F32,
-        "f16": gguf.GGMLQuantizationType.F16,
-    }
-
-    if args.outfile is not None:
-        fname_out = args.outfile
-    else:
-        # output in the same directory as the model by default
-        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
-
-    print(f"Loading model: {dir_model.name}")
-
-    hparams = Model.load_hparams(dir_model)
-
-    with torch.inference_mode():
-        model_class = Model.from_model_architecture(hparams["architectures"][0])
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
-
-        print("Set model parameters")
-        model_instance.set_gguf_parameters()
-
-        print("Set model tokenizer")
-        model_instance.set_vocab()
-
-        if args.vocab_only:
-            print(f"Exporting model vocab to '{fname_out}'")
-            model_instance.write_vocab()
-        else:
-            print(f"Exporting model to '{fname_out}'")
-            model_instance.write()
-
-        print(f"Model successfully exported to '{fname_out}'")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
deleted file mode 100755
index 9a9936dec..000000000
--- a/convert-lora-to-ggml.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any, BinaryIO, Sequence
-
-import numpy as np
-import torch
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
-
-
-def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
-    fout.write(b"ggla"[::-1])  # magic (ggml lora)
-    fout.write(struct.pack("i", 1))  # file version
-    fout.write(struct.pack("i", params["r"]))
-    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
-    # but some models ship a float value instead
-    # let's convert to int, but fail if lossless conversion is not possible
-    assert (
-        int(params["lora_alpha"]) == params["lora_alpha"]
-    ), "cannot convert float to int losslessly"
-    fout.write(struct.pack("i", int(params["lora_alpha"])))
-
-
-def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
-    sname = name.encode("utf-8")
-    fout.write(
-        struct.pack(
-            "iii",
-            len(shape),
-            len(sname),
-            NUMPY_TYPE_TO_FTYPE[data_type.name],
-        )
-    )
-    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-    fout.write(sname)
-    fout.seek((fout.tell() + 31) & -32)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print(f"Usage: python {sys.argv[0]} <path> [arch]")
-        print(
-            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
-        )
-        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
-        sys.exit(1)
-
-    input_json = os.path.join(sys.argv[1], "adapter_config.json")
-    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
-    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
-
-    if os.path.exists(input_model):
-        model = torch.load(input_model, map_location="cpu")
-    else:
-        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
-        # lazy import load_file only if lora is in safetensors format.
-        from safetensors.torch import load_file
-        model = load_file(input_model, device="cpu")
-
-    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
-
-    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-        print(f"Error: unsupported architecture {arch_name}")
-        sys.exit(1)
-
-    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
-    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
-
-    with open(input_json, "r") as f:
-        params = json.load(f)
-
-    if params["peft_type"] != "LORA":
-        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
-        sys.exit(1)
-
-    if params["fan_in_fan_out"] is True:
-        print("Error: param fan_in_fan_out is not supported")
-        sys.exit(1)
-
-    if params["bias"] is not None and params["bias"] != "none":
-        print("Error: param bias is not supported")
-        sys.exit(1)
-
-    # TODO: these seem to be layers that have been trained but without lora.
-    # doesn't seem widely used but eventually should be supported
-    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-        print("Error: param modules_to_save is not supported")
-        sys.exit(1)
-
-    with open(output_path, "wb") as fout:
-        fout.truncate()
-
-        write_file_header(fout, params)
-        for k, v in model.items():
-            orig_k = k
-            if k.endswith(".default.weight"):
-                k = k.replace(".default.weight", ".weight")
-            if k in ["llama_proj.weight", "llama_proj.bias"]:
-                continue
-            if k.endswith("lora_A.weight"):
-                if v.dtype != torch.float16 and v.dtype != torch.float32:
-                    v = v.float()
-                v = v.T
-            else:
-                v = v.float()
-
-            t = v.detach().numpy()
-
-            prefix = "base_model.model."
-            if k.startswith(prefix):
-                k = k[len(prefix) :]
-
-            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
-            if k.endswith(lora_suffixes):
-                suffix = k[-len(lora_suffixes[0]):]
-                k = k[: -len(lora_suffixes[0])]
-            else:
-                print(f"Error: unrecognized tensor name {orig_k}")
-                sys.exit(1)
-
-            tname = name_map.get_name(k)
-            if tname is None:
-                print(f"Error: could not map tensor name {orig_k}")
-                print(" Note: the arch parameter must be specified if the model is not llama")
-                sys.exit(1)
-
-            if suffix == ".lora_A.weight":
-                tname += ".weight.loraA"
-            elif suffix == ".lora_B.weight":
-                tname += ".weight.loraB"
-            else:
-                assert False
-
-            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
-            write_tensor_header(fout, tname, t.shape, t.dtype)
-            t.tofile(fout)
-
-    print(f"Converted {input_json} and {input_model} to {output_path}")
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
deleted file mode 100755
index def210531..000000000
--- a/convert-persimmon-to-gguf.py
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import os
-import sys
-from pathlib import Path
-from pprint import pprint
-
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-
-def _flatten_dict(dct, tensors, prefix=None):
-    assert isinstance(dct, dict)
-    for key in dct.keys():
-        new_prefix = prefix + '.' + key if prefix is not None else key
-        if isinstance(dct[key], torch.Tensor):
-            tensors[new_prefix] = dct[key]
-        elif isinstance(dct[key], dict):
-            _flatten_dict(dct[key], tensors, new_prefix)
-        else:
-            raise ValueError(type(dct[key]))
-    return None
-
-
-def _get_sentencepiece_tokenizer_info(dir_model: Path):
-    tokenizer_path = dir_model / 'adept_vocab.model'
-    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
-    tokenizer = SentencePieceProcessor(str(tokenizer_path))
-    print('gguf: adding tokens')
-    tokens: list[bytes] = []
-    scores: list[float] = []
-    toktypes: list[int] = []
-
-    for i in range(tokenizer.vocab_size()):
-        text: bytes
-        score: float
-
-        piece = tokenizer.id_to_piece(i)
-        text = piece.encode("utf-8")
-        score = tokenizer.get_score(i)
-
-        toktype = 1
-        if tokenizer.is_unknown(i):
-            toktype = 2
-        if tokenizer.is_control(i):
-            toktype = 3
-        if tokenizer.is_unused(i):
-            toktype = 5
-        if tokenizer.is_byte(i):
-            toktype = 6
-
-        tokens.append(text)
-        scores.append(score)
-        toktypes.append(toktype)
-        pass
-    return tokens, scores, toktypes
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
-    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
-    parser.add_argument("--ckpt-path",           type=Path, help="path to persimmon checkpoint .pt file")
-    parser.add_argument("--model-dir",           type=Path, help="directory containing model e.g. 8b_chat_model_release")
-    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
-    args = parser.parse_args()
-    sys.path.append(str(args.adept_inference_dir))
-    persimmon_model = torch.load(args.ckpt_path)
-    hparams = persimmon_model['args']
-    pprint(hparams)
-    tensors: dict[str, torch.Tensor] = {}
-    _flatten_dict(persimmon_model['model'], tensors, None)
-
-    arch = gguf.MODEL_ARCH.PERSIMMON
-    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
-
-    block_count = hparams.num_layers
-    head_count = hparams.num_attention_heads
-    head_count_kv = head_count
-    ctx_length = hparams.seq_length
-    hidden_size = hparams.hidden_size
-
-    gguf_writer.add_name('persimmon-8b-chat')
-    gguf_writer.add_context_length(ctx_length)
-    gguf_writer.add_embedding_length(hidden_size)
-    gguf_writer.add_block_count(block_count)
-    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
-    gguf_writer.add_head_count(head_count)
-    gguf_writer.add_head_count_kv(head_count_kv)
-    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
-    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
-
-    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
-    gguf_writer.add_tokenizer_model('llama')
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_scores(scores)
-    gguf_writer.add_token_types(toktypes)
-    gguf_writer.add_bos_token_id(71013)
-    gguf_writer.add_eos_token_id(71013)
-
-    tensor_map = gguf.get_tensor_name_map(arch, block_count)
-    print(tensor_map)
-    for name in tensors.keys():
-        data = tensors[name]
-        if name.endswith(".self_attention.rotary_emb.inv_freq"):
-            continue
-        old_dtype = data.dtype
-        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-        data = data.to(torch.float32).squeeze().numpy()
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-        n_dims = len(data.shape)
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-        gguf_writer.add_tensor(new_name, data)
-    print("gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-    gguf_writer.close()
-
-    print(f"gguf: model successfully exported to '{args.outfile}'")
-    print("")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
new file mode 100755
index 000000000..018a2a588
--- /dev/null
+++ b/convert_hf_to_gguf.py
@@ -0,0 +1,5112 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+import ast
+import logging
+import argparse
+import contextlib
+import json
+import os
+import re
+import sys
+from enum import IntEnum
+from pathlib import Path
+from hashlib import sha256
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from itertools import chain
+
+import math
+import numpy as np
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+logger = logging.getLogger("hf-to-gguf")
+
+
+###### MODEL DEFINITIONS ######
+
+class SentencePieceTokenTypes(IntEnum):
+    NORMAL = 1
+    UNKNOWN = 2
+    CONTROL = 3
+    USER_DEFINED = 4
+    UNUSED = 5
+    BYTE = 6
+
+
+AnyModel = TypeVar("AnyModel", bound="type[Model]")
+
+
+class Model:
+    _model_classes: dict[str, type[Model]] = {}
+
+    dir_model: Path
+    ftype: gguf.LlamaFileType
+    fname_out: Path
+    is_big_endian: bool
+    endianess: gguf.GGUFEndian
+    use_temp_file: bool
+    lazy: bool
+    part_names: list[str]
+    is_safetensors: bool
+    hparams: dict[str, Any]
+    block_count: int
+    tensor_map: gguf.TensorNameMap
+    tensor_names: set[str] | None
+    gguf_writer: gguf.GGUFWriter
+    model_name: str | None
+    metadata_override: Path | None
+    dir_model_card: Path
+
+    # subclasses should define this!
+    model_arch: gguf.MODEL_ARCH
+
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+                 use_temp_file: bool = False, eager: bool = False,
+                 metadata_override: Path | None = None, model_name: str | None = None,
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
+        if type(self) is Model:
+            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
+
+        self.dir_model = dir_model
+        self.ftype = ftype
+        self.fname_out = fname_out
+        self.is_big_endian = is_big_endian
+        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.use_temp_file = use_temp_file
+        self.lazy = not eager
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
+        self.is_safetensors = len(self.part_names) > 0
+        if not self.is_safetensors:
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
+        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        self.tensor_names = None
+        self.metadata_override = metadata_override
+        self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+
+        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
+        if self.ftype == gguf.LlamaFileType.GUESSED:
+            # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
+            _, first_tensor = next(self.get_tensors())
+            if first_tensor.dtype == torch.float16:
+                logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
+                self.ftype = gguf.LlamaFileType.MOSTLY_F16
+            else:
+                logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
+                self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+
+        # Configure GGUF Writer
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
+
+    @classmethod
+    def __init_subclass__(cls):
+        # can't use an abstract property, because overriding it without type errors
+        # would require using decorated functions instead of simply defining the property
+        if "model_arch" not in cls.__dict__:
+            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in self.hparams), None)
+        if key is not None:
+            return self.hparams[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        tensor_names_from_parts: set[str] = set()
+
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
+            self.tensor_names = set()
+            logger.info(f"gguf: loading model weight map from '{index_name}'")
+            with open(index_file, "r", encoding="utf-8") as f:
+                index: dict[str, Any] = json.load(f)
+                weight_map = index.get("weight_map")
+                if weight_map is None or not isinstance(weight_map, dict):
+                    raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
+                self.tensor_names.update(weight_map.keys())
+        else:
+            self.tensor_names = tensor_names_from_parts
+            weight_map = {}
+
+        for part_name in self.part_names:
+            logger.info(f"gguf: loading model part '{part_name}'")
+            ctx: ContextManager[Any]
+            if self.is_safetensors:
+                from safetensors import safe_open
+                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
+            else:
+                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
+
+            with ctx as model_part:
+                tensor_names_from_parts.update(model_part.keys())
+
+                for name in model_part.keys():
+                    if self.is_safetensors:
+                        if self.lazy:
+                            data = model_part.get_slice(name)
+                            data = LazyTorchTensor.from_safetensors_slice(data)
+                        else:
+                            data = model_part.get_tensor(name)
+                    else:
+                        data = model_part[name]
+                        if self.lazy:
+                            data = LazyTorchTensor.from_eager(data)
+                    yield name, data
+
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
+
+    def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in name:
+            assert bid is not None
+            name = name.format(bid=bid)
+        return name + suffix
+
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+        new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
+        if new_name is None:
+            raise ValueError(f"Can not map tensor {name!r}")
+        return new_name
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.block_count)
+
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
+            self.gguf_writer.add_context_length(n_ctx)
+            logger.info(f"gguf: context length = {n_ctx}")
+
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")
+
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+            self.gguf_writer.add_feed_forward_length(n_ff)
+            logger.info(f"gguf: feed forward length = {n_ff}")
+
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")
+
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+            logger.info(f"gguf: key-value head count = {n_head_kv}")
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+            logger.info(f"gguf: rope theta = {rope_theta}")
+        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+            logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
+            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+            logger.info(f"gguf: expert count = {n_experts}")
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+            logger.info(f"gguf: experts used count = {n_experts_used}")
+
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        logger.info(f"gguf: file type = {self.ftype}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        del name, new_name, bid, n_dims  # unused
+
+        return False
+
+    # some models need extra generated tensors (like rope_freqs)
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        return ()
+
+    def prepare_tensors(self):
+        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+
+        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # use the first number-like part of the tensor name as the block id
+            bid = None
+            for part in name.split("."):
+                if part.isdecimal():
+                    bid = int(part)
+                    break
+
+            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # TODO: why do we squeeze here?
+                # data = data_torch.squeeze().numpy()
+                data = data_torch.numpy()
+
+                # if data ends up empty, it means data_torch was a scalar tensor -> restore
+                if len(data.shape) == 0:
+                    data = data_torch.numpy()
+
+                n_dims = len(data.shape)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
+
+                # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32
+
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+                # Some tensor types are always in float32
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                            gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                            gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+                            gguf.MODEL_TENSOR.POSNET_NORM1,
+                            gguf.MODEL_TENSOR.POSNET_NORM2,
+                        )
+                    )
+                    or not new_name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
+
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
+
+                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
+                # reverse shape to make it similar to the internal ggml dimension order
+                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+
+                # n_dims is implicit in the shape
+                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+
+                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
+
+    def set_type(self):
+        self.gguf_writer.add_type(gguf.GGUFType.MODEL)
+
+    def prepare_metadata(self, vocab_only: bool):
+
+        total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
+
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+
+        # Fallback to model directory name if metadata name is still missing
+        if self.metadata.name is None:
+            self.metadata.name = self.dir_model.name
+
+        # Generate parameter weight class (useful for leader boards) if not yet determined
+        if self.metadata.size_label is None and total_params > 0:
+            self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
+
+        # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        # Filename Output
+        if self.fname_out.is_dir():
+            # Generate default filename based on model specification and available metadata
+            if not vocab_only:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
+            else:
+                fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
+
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            #       file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
+        self.set_type()
+
+        logger.info("Set meta model")
+        self.metadata.set_gguf_meta_model(self.gguf_writer)
+
+        logger.info("Set model parameters")
+        self.set_gguf_parameters()
+
+        logger.info("Set model tokenizer")
+        self.set_vocab()
+
+        logger.info("Set model quantization version")
+        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
+    def write(self):
+        self.prepare_tensors()
+        self.prepare_metadata(vocab_only=False)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.write_tensors_to_file(progress=True)
+        self.gguf_writer.close()
+
+    def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
+
+        self.prepare_metadata(vocab_only=True)
+        self.gguf_writer.write_header_to_file(path=self.fname_out)
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.close()
+
+    @staticmethod
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
+        part_names: list[str] = []
+        for filename in os.listdir(dir_model):
+            if filename.startswith(prefix) and filename.endswith(suffix):
+                part_names.append(filename)
+
+        part_names.sort()
+
+        return part_names
+
+    @staticmethod
+    def load_hparams(dir_model: Path):
+        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    @classmethod
+    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
+        assert names
+
+        def func(modelcls: AnyModel) -> AnyModel:
+            for name in names:
+                cls._model_classes[name] = modelcls
+            return modelcls
+        return func
+
+    @classmethod
+    def print_registered_models(cls):
+        for name in sorted(cls._model_classes.keys()):
+            logger.error(f"- {name}")
+
+    @classmethod
+    def from_model_architecture(cls, arch: str) -> type[Model]:
+        try:
+            return cls._model_classes[arch]
+        except KeyError:
+            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
+
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        return tokens, toktypes, tokpre
+
+    # NOTE: this function is generated by convert_hf_to_gguf_update.py
+    #       do not modify it manually!
+    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.debug(f"chktok: {chktok}")
+        logger.debug(f"chkhsh: {chkhsh}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
+        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+            res = "llama-bpe"
+        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+            res = "deepseek-llm"
+        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+            res = "deepseek-coder"
+        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+            # ref: https://huggingface.co/tiiuae/falcon-7b
+            res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
+            res = "bert-bge"
+        if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
+            # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
+            res = "bert-bge-large"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/mosaicml/mpt-7b
+            res = "mpt"
+        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
+            # ref: https://huggingface.co/bigcode/starcoder2-3b
+            res = "starcoder"
+        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+            # ref: https://huggingface.co/openai-community/gpt2
+            res = "gpt-2"
+        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
+            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
+            res = "stablelm2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
+        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
+            res = "command-r"
+        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
+            res = "qwen2"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
+            res = "olmo"
+        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
+            # ref: https://huggingface.co/databricks/dbrx-base
+            res = "dbrx"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+            res = "jina-v1-en"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
+            res = "jina-v2-en"
+        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+            res = "jina-v2-es"
+        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+            res = "jina-v2-de"
+        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
+            res = "smaug-bpe"
+        if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+            # ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+            res = "poro-chat"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
+        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
+            # ref: https://huggingface.co/core42/jais-13b
+            res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
+        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
+            # ref: https://huggingface.co/facebook/chameleon-7b
+            res = "chameleon"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
+        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
+            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
+            res = "roberta-bpe"
+        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
+            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
+            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
+        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+            res = "deepseek-r1-qwen"
+
+        if res is None:
+            logger.warning("\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("**          There are 2 possible reasons for this:")
+            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
+            logger.warning("**          - the pre-tokenization config has changed upstream")
+            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
+            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh:  {chkhsh}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
+        logger.debug(f"chkhsh: {chkhsh}")
+
+        return res
+        # Marker: End get_vocab_base_pre
+
+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _set_vocab_qwen(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        return tokens, scores, toktypes
+
+    def _set_vocab_llama_hf(self):
+        vocab = gguf.LlamaHfVocab(self.dir_model)
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
+        tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
+        logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+        vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+        default_pre = "mpt" if model_name == "gpt-neox" else "default"
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+        assert field  # tokenizer model
+        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
+        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
+        assert field  # token list
+        self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
+        if model_name == "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
+            assert field  # token scores
+            self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+        assert field  # token types
+        self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+        if model_name != "llama-spm":
+            field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            assert field  # token merges
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
+            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
+            self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
+        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
+            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
+
+
+@Model.register("GPTNeoXForCausalLM")
+class GPTNeoXModel(Model):
+    model_arch = gguf.MODEL_ARCH.GPTNEOX
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(
+            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
+        )
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data_torch = torch.cat(
+                (
+                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.weight")
+        elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
+            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+            data_torch = torch.cat(
+                (
+                    qkv_bias[:, 0, :].reshape((n_embed,)),
+                    qkv_bias[:, 1, :].reshape((n_embed,)),
+                    qkv_bias[:, 2, :].reshape((n_embed,)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.bias")
+
+        tensors.append((self.map_tensor_name(name), data_torch))
+
+        return tensors
+
+
+@Model.register("BloomForCausalLM", "BloomModel")
+class BloomModel(Model):
+    model_arch = gguf.MODEL_ARCH.BLOOM
+
+    def set_gguf_parameters(self):
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(4 * n_embed)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        name = re.sub(r'transformer\.', '', name)
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data_torch = torch.cat(
+                (
+                    qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                    qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.weight")
+        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+            qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+            data_torch = torch.cat(
+                (
+                    qkv_bias[:, 0, :].reshape((n_embed,)),
+                    qkv_bias[:, 1, :].reshape((n_embed,)),
+                    qkv_bias[:, 2, :].reshape((n_embed,)),
+                ),
+                dim=0,
+            )
+            logger.info("re-format attention.linear_qkv.bias")
+
+        tensors.append((self.map_tensor_name(name), data_torch))
+
+        if name == "word_embeddings.weight":
+            assert self.tensor_names is not None
+
+            # TODO: tie them at runtime, don't duplicate in the model file
+            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+
+        return tensors
+
+
+@Model.register("MPTForCausalLM")
+class MPTModel(Model):
+    model_arch = gguf.MODEL_ARCH.MPT
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_gpt2()
+        except Exception:
+            # Fallback for SEA-LION model
+            self._set_vocab_sentencepiece()
+            self.gguf_writer.add_add_bos_token(False)
+            self.gguf_writer.add_pad_token_id(3)
+            self.gguf_writer.add_eos_token_id(1)
+            self.gguf_writer.add_unk_token_id(0)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layers"]
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
+            self.gguf_writer.add_head_count_kv(kv_n_heads)
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+        if self.hparams["attn_config"]["clip_qkv"] is not None:
+            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
+        if self.hparams["attn_config"]["alibi"]:
+            self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+        else:
+            self.gguf_writer.add_max_alibi_bias(0.0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if "scales" in name:
+            new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales"))
+            new_name = new_name.replace("scales", "act.scales")
+        else:
+            new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias"))
+
+        return [(new_name, data_torch)]
+
+
+@Model.register("OrionForCausalLM")
+class OrionModel(Model):
+    model_arch = gguf.MODEL_ARCH.ORION
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        ctx_length = 0
+        if "max_sequence_length" in self.hparams:
+            ctx_length = self.hparams["max_sequence_length"]
+        elif "max_position_embeddings" in self.hparams:
+            ctx_length = self.hparams["max_position_embeddings"]
+        elif "model_max_length" in self.hparams:
+            ctx_length = self.hparams["model_max_length"]
+        else:
+            raise ValueError("gguf: can not find ctx length parameter.")
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        # note: config provides rms norm but it is actually layer norm
+        # ref:  https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
+        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
+
+
+@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
+class BaichuanModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAICHUAN
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        ctx_length = 0
+        if "max_sequence_length" in self.hparams:
+            ctx_length = self.hparams["max_sequence_length"]
+        elif "max_position_embeddings" in self.hparams:
+            ctx_length = self.hparams["max_position_embeddings"]
+        elif "model_max_length" in self.hparams:
+            ctx_length = self.hparams["model_max_length"]
+        else:
+            raise ValueError("gguf: can not find ctx length parameter.")
+
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
+            logger.info(f"Unpacking and permuting layer {bid}")
+            tensors = [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
+                    self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
+                    self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
+                    self._reverse_hf_part(data_torch, 2)),
+            ]
+        else:
+            tensors = [(self.map_tensor_name(name), data_torch)]
+
+        return tensors
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+    def _reverse_hf_permute_part(
+        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
+    ) -> Tensor:
+        r = weights.shape[0] // 3
+        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
+
+    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
+        r = weights.shape[0] // 3
+        return weights[r * n_part:r * n_part + r, ...]
+
+
+@Model.register("XverseForCausalLM")
+class XverseModel(Model):
+    model_arch = gguf.MODEL_ARCH.XVERSE
+
+    def set_vocab(self):
+        assert (self.dir_model / "tokenizer.json").is_file()
+        dir_model = self.dir_model
+        hparams = self.hparams
+
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")
+
+        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for token_id in range(vocab_size):
+            token_text = reverse_vocab[token_id].encode('utf-8')
+            # replace "\x00" to string with length > 0
+            if token_text == b"\x00":
+                toktype = gguf.TokenType.BYTE  # special
+                token_text = f"<{token_text}>".encode('utf-8')
+            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                toktype = gguf.TokenType.BYTE  # special
+            elif reverse_vocab[token_id] in added_vocab:
+                if tokenizer.added_tokens_decoder[token_id].special:
+                    toktype = gguf.TokenType.CONTROL
+                else:
+                    toktype = gguf.TokenType.USER_DEFINED
+            else:
+                toktype = gguf.TokenType.NORMAL
+
+            tokens.append(token_text)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        ctx_length = 0
+        if "max_sequence_length" in self.hparams:
+            ctx_length = self.hparams["max_sequence_length"]
+        elif "max_position_embeddings" in self.hparams:
+            ctx_length = self.hparams["max_position_embeddings"]
+        elif "model_max_length" in self.hparams:
+            ctx_length = self.hparams["model_max_length"]
+        else:
+            raise ValueError("gguf: can not find ctx length parameter.")
+
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        # HF models permute some of the tensors, so we need to undo that
+        if name.endswith("q_proj.weight"):
+            data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
+        if name.endswith("k_proj.weight"):
+            data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
+@Model.register("FalconForCausalLM", "RWForCausalLM")
+class FalconModel(Model):
+    model_arch = gguf.MODEL_ARCH.FALCON
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        self.gguf_writer.add_context_length(2048)  # not in config.json
+        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here,, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            n_head = self.find_hparam(["num_attention_heads", "n_head"])
+            n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
+            head_dim = self.hparams["hidden_size"] // n_head
+
+            qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("GPTBigCodeForCausalLM")
+class StarCoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.STARCODER
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(1)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+
+@Model.register("GPTRefactForCausalLM")
+class RefactModel(Model):
+    model_arch = gguf.MODEL_ARCH.REFACT
+
+    def set_vocab(self):
+        super().set_vocab()
+
+        # TODO: how to determine special FIM tokens automatically?
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+        special_vocab._set_special_token("prefix", 1)
+        special_vocab._set_special_token("suffix", 3)
+        special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        hidden_dim = self.hparams["n_embd"]
+        inner_dim = 4 * hidden_dim
+        hidden_dim = int(2 * inner_dim / 3)
+        multiple_of = 256
+        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        block_count = self.hparams["n_layer"]
+
+        # refact uses Alibi. So this is from config.json which might be used by training.
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+
+        self.gguf_writer.add_feed_forward_length(ff_dim)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(1)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        hidden_dim = self.hparams["n_embd"]
+        inner_dim = 4 * hidden_dim
+        hidden_dim = int(2 * inner_dim / 3)
+        multiple_of = 256
+        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        n_head = self.hparams["n_head"]
+        n_head_kv = 1
+        head_dim = self.hparams["n_embd"] // n_head
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if bid is not None:
+            if name == f"transformer.h.{bid}.attn.kv.weight":
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim]))
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:]))
+            elif name == f"transformer.h.{bid}.attn.q.weight":
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch))
+            elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight":
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]))
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]))
+
+        if len(tensors) == 0:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
+        return tensors
+
+
+@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
+class StableLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.STABLELM
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
+            self._set_vocab_qwen()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)
+
+    _q_norms: list[dict[str, Tensor]] | None = None
+    _k_norms: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams["num_key_value_heads"]
+
+        if name.find("q_layernorm.norms") != -1:
+            assert bid is not None
+
+            if self._q_norms is None:
+                self._q_norms = [{} for _ in range(self.block_count)]
+
+            self._q_norms[bid][name] = data_torch
+
+            if len(self._q_norms[bid]) >= n_head:
+                return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm")
+            else:
+                return []
+
+        if name.find("k_layernorm.norms") != -1:
+            assert bid is not None
+
+            if self._k_norms is None:
+                self._k_norms = [{} for _ in range(self.block_count)]
+
+            self._k_norms[bid][name] = data_torch
+
+            if len(self._k_norms[bid]) >= n_kv_head:
+                return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm")
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"):
+        datas: list[Tensor] = []
+        # extract the norms in order
+        for xid in range(n_head):
+            ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+            datas.append(norms[ename])
+            del norms[ename]
+        data_torch = torch.stack(datas, dim=0)
+
+        merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+        new_name = self.map_tensor_name(merged_name)
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._q_norms is not None or self._k_norms is not None:
+            # flatten two `list[dict[str, Tensor]]` into a single `list[str]`
+            norms = (
+                [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else []
+            ) + (
+                [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else []
+            )
+            if len(norms) > 0:
+                raise ValueError(f"Unprocessed norms: {norms}")
+
+
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+class LlamaModel(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            try:
+                self._set_vocab_llama_hf()
+            except (FileNotFoundError, TypeError):
+                # Llama 3
+                self._set_vocab_gpt2()
+
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=False,
+                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+            )
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot",    32010)
+            special_vocab.add_to_gguf(self.gguf_writer)
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
+@Model.register("BitnetForCausalLM")
+class BitnetModel(Model):
+    model_arch = gguf.MODEL_ARCH.BITNET
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def weight_quant(self, weight: Tensor) -> Tensor:
+        dtype = weight.dtype
+        weight = weight.float()
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
+
+
+@Model.register("GrokForCausalLM")
+class GrokModel(Model):
+    model_arch = gguf.MODEL_ARCH.GROK
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find(".moe.") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["linear", "linear_1", "linear_v"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+    model_arch = gguf.MODEL_ARCH.DBRX
+
+    def set_gguf_parameters(self):
+        ffn_config = self.hparams["ffn_config"]
+        attn_config = self.hparams["attn_config"]
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])
+
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
+
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+
+        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+
+        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
+
+        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        logger.info(f"gguf: file type = {self.ftype}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+        n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+        n_embd = self.hparams["d_model"]
+
+        # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+        # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+        # But llama.cpp moe graph works differently
+        # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+        # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+        exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
+                            "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
+                            "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
+        experts = False
+
+        for exp_tensor_name in exp_tensor_names.keys():
+            if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+                experts = True
+                data_torch = data_torch.view(n_expert, n_ff, n_embd)
+                if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+                    data_torch = data_torch.permute(*permute_tensor)
+                break
+
+        # map tensor names
+        # In MoE models the ffn tensors are typically most of the model weights,
+        # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+        # Every other model has the weight names ending in .weight,
+        # let's assume that is the convention which is not the case for dbrx:
+        # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+        new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+
+        return [(new_name, data_torch)]
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        del name, new_name, bid  # unused
+
+        return n_dims > 1
+
+
+@Model.register("MiniCPMForCausalLM")
+class MiniCPMModel(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        embedding_scale = float(self.hparams["scale_emb"])
+        self.gguf_writer.add_embedding_scale(embedding_scale)
+        logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
+        residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
+        self.gguf_writer.add_residual_scale(residual_scale)
+        logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
+        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
+        self.gguf_writer.add_logit_scale(logit_scale)
+        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
+        if self.hparams.get("rope_scaling") is not None:
+            if self.hparams["rope_scaling"].get("type") == "longrope":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
+                logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        # HF models permute some of the tensors, so we need to undo that
+        if name.endswith(("q_proj.weight")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            rope_dims = self.hparams["qk_rope_head_dim"]
+
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
+            if long_factors is None or short_factors is None:
+                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
+@Model.register("QWenLMHeadModel")
+class QwenModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        self._set_vocab_qwen()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+
+@Model.register("Qwen2ForCausalLM")
+class Qwen2Model(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
+
+@Model.register("WavTokenizerDec")
+class WavTokenizerDecModel(Model):
+    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
+        self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
+        self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
+        self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
+
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
+
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
+
+        self.gguf_writer.add_causal_attention(False)
+
+
+@Model.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("GPT2LMHeadModel")
+class GPT2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GPT2
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias", ".attn.masked_bias")):
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        tensors.append((new_name, data_torch))
+
+        # note: GPT2 output is tied to (same as) wte in original model
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+
+        return tensors
+
+
+@Model.register("PhiForCausalLM")
+class Phi2Model(Model):
+    model_arch = gguf.MODEL_ARCH.PHI2
+
+    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
+        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(4 * n_embd)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_add_bos_token(False)
+
+
+@Model.register("Phi3ForCausalLM")
+class Phi3MiniModel(Model):
+    model_arch = gguf.MODEL_ARCH.PHI3
+
+    def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            raise ValueError(f'Error: Missing {tokenizer_path}')
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+        rms_eps = self.find_hparam(["rms_norm_eps"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
+
+        self.gguf_writer.add_context_length(max_pos_embds)
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
+        self.gguf_writer.add_rope_dimension_count(rope_dims)
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_file_type(self.ftype)
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
+
+        # write rope scaling for long context (128k) model
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is None:
+            return
+
+        scale = max_pos_embds / orig_max_pos_embds
+
+        rope_scaling_type = rope_scaling.get('type', '').lower()
+        if len(rope_scaling_type) == 0:
+            raise KeyError('Missing the required key rope_scaling.type')
+
+        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
+            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
+        elif rope_scaling_type == 'yarn':
+            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
+        else:
+            raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
+
+        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
+
+        long_factors = rope_scaling.get('long_factor', None)
+        short_factors = rope_scaling.get('short_factor', None)
+
+        if long_factors is None or short_factors is None:
+            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
+        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+
+
+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("PlamoForCausalLM")
+class PlamoModel(Model):
+    model_arch = gguf.MODEL_ARCH.PLAMO
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(4096)  # not in config.json
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def shuffle_attn_q_weight(self, data_torch):
+        assert data_torch.size() == (5120, 5120)
+        data_torch = data_torch.reshape(8, 5, 128, 5120)
+        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
+        data_torch = torch.reshape(data_torch, (5120, 5120))
+        return data_torch
+
+    def shuffle_attn_output_weight(self, data_torch):
+        assert data_torch.size() == (5120, 5120)
+        data_torch = data_torch.reshape(5120, 8, 5, 128)
+        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
+        data_torch = torch.reshape(data_torch, (5120, 5120))
+        return data_torch
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        new_name = self.map_tensor_name(name)
+
+        # shuffle for broadcasting of gqa in ggml_mul_mat
+        if new_name.endswith("attn_q.weight"):
+            data_torch = self.shuffle_attn_q_weight(data_torch)
+        elif new_name.endswith("attn_output.weight"):
+            data_torch = self.shuffle_attn_output_weight(data_torch)
+
+        return [(new_name, data_torch)]
+
+
+@Model.register("CodeShellForCausalLM")
+class CodeShellModel(Model):
+    model_arch = gguf.MODEL_ARCH.CODESHELL
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(10000.0)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        new_name = self.map_tensor_name(name)
+
+        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            assert self.tensor_names is not None
+
+            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
+                # copy tok_embd.weight to output.weight
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+
+        return tensors
+
+
+@Model.register("InternLM2ForCausalLM")
+class InternLM2Model(Model):
+    model_arch = gguf.MODEL_ARCH.INTERNLM2
+
+    def set_vocab(self):
+        # (TODO): Is there a better way?
+        # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
+        # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
+        # recognized as an empty string in C++.
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        tokens: list[bytes] = []
+        scores: list[float] = []
+        toktypes: list[int] = []
+
+        if not tokenizer_path.is_file():
+            logger.error(f'Error: Missing {tokenizer_path}')
+            sys.exit(1)
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        for token_id in range(vocab_size):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+            if text == b"\x00":
+                # (TODO): fixme
+                # Hack here and replace the \x00 characters.
+                logger.warning(f"InternLM2 convert token '{text}' to '🐉'!")
+                text = "🐉".encode("utf-8")
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+            # take care of ununsed raw token
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNUSED
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+
+                for key in added_tokens_json:
+                    tokens.append(key.encode("utf-8"))
+                    scores.append(-1000.0)
+                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        old_eos = special_vocab.special_token_ids["eos"]
+        if chat_eos_token_id is not None:
+            # For the chat model, we replace the eos with '<|im_end|>'.
+            # TODO: this is a hack, should be fixed
+            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        num_heads = self.hparams["num_attention_heads"]
+        num_kv_heads = self.hparams["num_key_value_heads"]
+        n_embd = self.hparams["hidden_size"]
+        q_per_kv = num_heads // num_kv_heads
+        head_dim = n_embd // num_heads
+        num_groups = num_heads // q_per_kv
+
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
+            qkv = data_torch
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
+            # The model weights of q and k equire additional reshape.
+            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+            v = v.reshape((-1, v.shape[-1]))
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v),
+            ]
+        else:
+            return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("InternLM3ForCausalLM")
+class InternLM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
+                        if token_data.get("special"):
+                            token_id = int(token_id)
+                            token = token_data["content"]
+                            special_vocab._set_special_token(token, token_id)
+                            # update eos token
+                            if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
+                                special_vocab.special_token_ids["eos"] = token_id
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
+class BertModel(Model):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.vocab_size = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_causal_attention(False)
+
+        # get pooling path
+        pooling_path = None
+        module_path = self.dir_model / "modules.json"
+        if module_path.is_file():
+            with open(module_path, encoding="utf-8") as f:
+                modules = json.load(f)
+            for mod in modules:
+                if mod["type"] == "sentence_transformers.models.Pooling":
+                    pooling_path = mod["path"]
+                    break
+
+        # get pooling type
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            else:
+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
+            self.gguf_writer.add_pooling_type(pooling_type)
+
+    def set_vocab(self):
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.vocab_size = len(tokens)
+
+        # we need this to validate the size of the token_type embeddings
+        # though currently we are passing all zeros to the token_type embeddings
+        # "Sequence A" or "Sequence B"
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        # convert to phantom space vocab
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
+                return tok
+            if tok.startswith("##"):
+                return tok[2:]
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))
+
+        # add vocab to gguf
+        self.gguf_writer.add_tokenizer_model("bert")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # handle special tokens
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
+        # we are only using BERT for embeddings so we don't need the pooling layer
+        if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
+            return [] # we don't need these
+
+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("NomicBertModel")
+class NomicBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.NOMIC_BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # the HF config claims n_ctx=8192, but it uses RoPE scaling
+        self.hparams["n_ctx"] = 2048
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors
+        assert self.hparams["qkv_proj_bias"] is False
+        assert self.hparams["mlp_fc1_bias"] is False
+        assert self.hparams["mlp_fc2_bias"] is False
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+
+
+@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("GemmaForCausalLM")
+class GemmaModel(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        # TODO: these special tokens should be exported only for the CodeGemma family
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 67)
+        special_vocab._set_special_token("suffix", 69)
+        special_vocab._set_special_token("middle", 68)
+        special_vocab._set_special_token("fsep",   70)
+        special_vocab._set_special_token("eot",    107)
+        special_vocab.chat_template = None  # do not add it twice
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("Gemma2ForCausalLM")
+class Gemma2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA2
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_attn_logit_softcapping(
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_final_logit_softcapping(
+            self.hparams["final_logit_softcapping"]
+        )
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("Starcoder2ForCausalLM")
+class StarCoder2Model(Model):
+    model_arch = gguf.MODEL_ARCH.STARCODER2
+
+
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
+            data_torch = data_torch.squeeze()
+
+        try:
+            rescale_every_n_layers = self.hparams["rescale_every"]
+            if rescale_every_n_layers > 0:
+                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        except KeyError:
+            pass
+
+        # concat time_mix_lerp weights to reduce some cpu overhead
+        # also reduces the number of tensors in the model
+        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+            try:
+                self.lerp_weights[bid][new_name] = data_torch
+            except KeyError:
+                self.lerp_weights[bid] = {new_name: data_torch}
+            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return
+
+        yield (new_name, data_torch)
+
+
+@Model.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 use grouped key/value like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+                # permute them here to avoid code changes
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
+class MambaModel(Model):
+    model_arch = gguf.MODEL_ARCH.MAMBA
+
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 8
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        elif (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            self._set_vocab_builtin("gpt-neox", vocab_size)
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size",       "d_model"])
+        d_conv  = self.find_hparam(["conv_kernel",       "d_conv"],  optional=True) or 4
+        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_state = self.find_hparam(["state_size",        "d_state"], optional=True) or 16
+        # ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+        dt_rank      = self.find_hparam(["time_step_rank",     "dt_rank"],      optional=True) or -(d_model // -16)
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+        use_dt_b_c_norm = False
+        # For falconmamba we do apply RMS norm on B / DT and C layers
+        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+            use_dt_b_c_norm = True
+        # Fail early for models which don't have a block expansion factor of 2
+        assert d_inner == 2 * d_model
+
+        self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
+        self.gguf_writer.add_file_type(self.ftype)
+
+    _tok_embd = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
+
+        new_name = self.map_tensor_name(name)
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        # assuming token_embd.weight is seen before output.weight
+        if self._tok_embd is not None and new_name == output_name:
+            if torch.equal(self._tok_embd, data_torch):
+                logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting")
+                return []
+        elif new_name == tok_embd_name:
+            self._tok_embd = data_torch
+
+        return [(new_name, data_torch)]
+
+
+@Model.register("CohereForCausalLM")
+class CommandR2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COMMAND_R
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # max_position_embeddings = 8192 in config.json but model was actually
+        # trained on 128k context length
+        # aya-23 models don't have model_max_length specified
+        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
+@Model.register("OlmoForCausalLM")
+@Model.register("OLMoForCausalLM")
+class OlmoModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+        clip_qkv = self.hparams.get("clip_qkv")
+        if clip_qkv is not None:
+            self.gguf_writer.add_clamp_kqv(clip_qkv)
+
+    # Same as super class, but permuting q_proj, k_proj
+    # Copied from: LlamaModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2
+
+
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("JinaBertModel", "JinaBertForMaskedLM")
+class JinaBertV2Model(BertModel):
+    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.intermediate_size = self.hparams["intermediate_size"]
+
+    def get_tensors(self):
+        for name, data in super().get_tensors():
+            if 'gated_layer' in name:
+                d1 = data[:self.intermediate_size, :]
+                name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
+                d2 = data[self.intermediate_size:, :]
+                name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
+                yield name1, d1
+                yield name2, d2
+                continue
+
+            yield name, data
+
+    def set_vocab(self):
+        tokenizer_class = 'BertTokenizer'
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_class = json.load(f)['tokenizer_class']
+
+        if tokenizer_class == 'BertTokenizer':
+            super().set_vocab()
+        elif tokenizer_class == 'RobertaTokenizer':
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_token_type_count(2)
+        else:
+            raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "bert.", remove the prefix
+        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+        if name.startswith("bert."):
+            name = name[5:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("OpenELMForCausalLM")
+class OpenELMModel(Model):
+    model_arch = gguf.MODEL_ARCH.OPENELM
+
+    @staticmethod
+    def _make_divisible(v: float | int, divisor: int) -> int:
+        # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
+        new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
+        # Make sure that round down does not go down by more than 10%.
+        if new_v < 0.9 * v:
+            new_v += divisor
+        return new_v
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
+        ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
+        self._n_embd: int = self.hparams["model_dim"]
+        self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
+        self._num_query_heads: list[int] = self.hparams["num_query_heads"]
+        self._ffn_dims: list[int] = [
+            OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
+            for multiplier in ffn_multipliers
+        ]
+        assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+        assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
+
+    # Uses the tokenizer from meta-llama/Llama-2-7b-hf
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
+
+    def set_gguf_parameters(self):
+        n_embd = self._n_embd
+        head_dim = self.hparams["head_dim"]
+        rot_pct = 1.0
+        assert self.block_count == len(self._num_kv_heads)
+        assert self.block_count == len(self._num_query_heads)
+        assert self.block_count == len(self._ffn_dims)
+
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams["max_context_length"])
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+        self.gguf_writer.add_head_count(self._num_query_heads)
+        self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
+        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
+        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
+        self.gguf_writer.add_key_length(head_dim)
+        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        if "n_layers" in keys:
+            return self.hparams["num_transformer_layers"]
+
+        return super().find_hparam(keys, optional)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # split ff
+        if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
+            ff_dim = self._ffn_dims[bid]
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
+            return
+
+        yield (self.map_tensor_name(name), data_torch)
+
+
+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            logger.error(f'Error: Missing {tokenizer_path}')
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokeniser_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if token_id >= vocab_size:
+                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                            token_score = 0.0
+
+                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("DeepseekForCausalLM")
+class DeepseekModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("T5WithLMHeadModel")
+@Model.register("T5ForConditionalGeneration")
+@Model.register("MT5ForConditionalGeneration")
+@Model.register("UMT5ForConditionalGeneration")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("T5EncoderModel")
+class T5EncoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.T5ENCODER
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.shared_token_embeddings_found = False
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        # many older models use spiece.model tokenizer model filename
+        if not tokenizer_path.is_file():
+            tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+
+        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
+        if sentencepiece_model.trainer_spec.model_type == 2:  # BPE
+            # assure the tokenizer model file name is correct
+            assert tokenizer_path.name == 'tokenizer.model'
+            return self._set_vocab_sentencepiece()
+        else:
+            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if token_id >= vocab_size:
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
+            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
+            n_ctx = 512
+        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
+        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
+        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
+        # and decoder and ignore the remaining ones.
+        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
+            if not self.shared_token_embeddings_found:
+                name = "shared.weight"
+                self.shared_token_embeddings_found = True
+            else:
+                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("JAISLMHeadModel")
+class JaisModel(Model):
+    model_arch = gguf.MODEL_ARCH.JAIS
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # ALiBi position embedding
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+        # Embeddings scale
+        self.embeddings_scale = 1.0
+        if 'mup_embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['mup_embeddings_scale']
+        elif 'embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['embeddings_scale']
+        else:
+            assert False
+
+        self.width_scale = 1.0
+        if 'mup_output_alpha' in self.hparams:
+            assert 'mup_width_scale' in self.hparams
+            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
+        elif 'width_scale' in self.hparams:
+            self.width_scale = self.hparams['width_scale']
+        else:
+            assert False
+
+        self.max_alibi_bias = 8.0
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias")):
+            return tensors
+
+        if name.endswith(("relative_pe.slopes")):
+            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
+            # Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
+            # but Jais's PyTorch model simply precalculates the slope values and places them
+            # in relative_pes.slopes
+            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
+            first_val = float(data_torch[0].item())
+            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((new_name, data_torch * self.embeddings_scale))
+        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            tensors.append((new_name, data_torch * self.width_scale))
+        else:
+            tensors.append((new_name, data_torch))
+
+        return tensors
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+
+@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
+                else:
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
+        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
+        self.gguf_writer.add_file_type(self.ftype)
+        if "attention_dim" in self.hparams:
+            rope_dim = self.hparams["attention_dim"]
+        else:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        #   model.layers.{l}.input_layernorm.weight
+        #   model.layers.{l}.post_attention_layernorm.weight
+        #   model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert (hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+            - attention_scale
+            - embedding_scale
+            - residual_scale
+        - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        #   consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@Model.register("ChameleonForConditionalGeneration")
+@Model.register("ChameleonForCausalLM")  # obsolete
+class ChameleonModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHAMELEON
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # ignore image tokenizer for now
+        # TODO: remove this once image support is implemented for Chameleon
+        if name.startswith("model.vqmodel"):
+            return []
+
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        hidden_dim = self.hparams.get("hidden_size")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if name.endswith(("q_norm.weight", "q_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
+        if name.endswith(("k_norm.weight", "k_norm.bias")):
+            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
+    @staticmethod
+    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
+        head_dim = hidden_dim // n_heads
+        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
+        data_torch = data_torch.repeat_interleave(n_heads, 0)
+        return data_torch
+
+
+###### CONVERSION LOGIC ######
+
+
+# tree of lazy tensors
+class LazyTorchTensor(gguf.LazyBase):
+    _tensor_type = torch.Tensor
+    # to keep the type-checker happy
+    dtype: torch.dtype
+    shape: torch.Size
+
+    # only used when converting a torch.Tensor to a np.ndarray
+    _dtype_map: dict[torch.dtype, type] = {
+        torch.float16: np.float16,
+        torch.float32: np.float32,
+    }
+
+    # used for safetensors slices
+    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
+    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
+    _dtype_str_map: dict[str, torch.dtype] = {
+        "F64": torch.float64,
+        "F32": torch.float32,
+        "BF16": torch.bfloat16,
+        "F16": torch.float16,
+        # "U64": torch.uint64,
+        "I64": torch.int64,
+        # "U32": torch.uint32,
+        "I32": torch.int32,
+        # "U16": torch.uint16,
+        "I16": torch.int16,
+        "U8": torch.uint8,
+        "I8": torch.int8,
+        "BOOL": torch.bool,
+        "F8_E4M3": torch.float8_e4m3fn,
+        "F8_E5M2": torch.float8_e5m2,
+    }
+
+    def numpy(self) -> gguf.LazyNumpyTensor:
+        dtype = self._dtype_map[self.dtype]
+        return gguf.LazyNumpyTensor(
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
+            args=(self,),
+            func=(lambda s: s.numpy())
+        )
+
+    @classmethod
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")
+
+    @classmethod
+    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
+        dtype = cls._dtype_str_map[st_slice.get_dtype()]
+        shape: tuple[int, ...] = tuple(st_slice.get_shape())
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        return cast(torch.Tensor, lazy)
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        del types  # unused
+
+        if kwargs is None:
+            kwargs = {}
+
+        if func is torch.Tensor.numpy:
+            return args[0].numpy()
+
+        return cls._wrap_fn(func)(*args, **kwargs)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Convert a huggingface model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
+    )
+    parser.add_argument(
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+    )
+    parser.add_argument(
+        "--bigendian", action="store_true",
+        help="model is executed on big endian machine",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--use-temp-file", action="store_true",
+        help="use the tempfile library while processing (helpful when running out of memory, process killed)",
+    )
+    parser.add_argument(
+        "--no-lazy", action="store_true",
+        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
+    )
+    parser.add_argument(
+        "--model-name", type=str, default=None,
+        help="name of the model",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true",
+        help="increase output verbosity",
+    )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )
+    parser.add_argument(
+        "--metadata", type=Path,
+        help="Specify the path for an authorship metadata override file"
+    )
+    parser.add_argument(
+        "--print-supported-models", action="store_true",
+        help="Print the supported models"
+    )
+
+    args = parser.parse_args()
+    if not args.print_supported_models and args.model is None:
+        parser.error("the following arguments are required: model")
+    return args
+
+
+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
+def main() -> None:
+    args = parse_args()
+
+    if args.print_supported_models:
+        logger.error("Supported models:")
+        Model.print_registered_models()
+        sys.exit(0)
+
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    dir_model = args.model
+
+    if not dir_model.is_dir():
+        logger.error(f'Error: {args.model} is not a directory')
+        sys.exit(1)
+
+    ftype_map: dict[str, gguf.LlamaFileType] = {
+        "f32": gguf.LlamaFileType.ALL_F32,
+        "f16": gguf.LlamaFileType.MOSTLY_F16,
+        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
+        "auto": gguf.LlamaFileType.GUESSED,
+    }
+
+    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
+    if args.use_temp_file and is_split:
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
+    if args.outfile is not None:
+        fname_out = args.outfile
+    else:
+        fname_out = dir_model
+
+    logger.info(f"Loading model: {dir_model.name}")
+
+    hparams = Model.load_hparams(dir_model)
+
+    with torch.inference_mode():
+        output_type = ftype_map[args.outtype]
+        model_architecture = hparams["architectures"][0]
+
+        try:
+            model_class = Model.from_model_architecture(model_architecture)
+        except NotImplementedError:
+            logger.error(f"Model {model_architecture} is not supported")
+            sys.exit(1)
+
+        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
+                                     eager=args.no_lazy,
+                                     metadata_override=args.metadata, model_name=args.model_name,
+                                     split_max_tensors=args.split_max_tensors,
+                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                     small_first_shard=args.no_tensor_first_split)
+
+        if args.vocab_only:
+            logger.info("Exporting model vocab...")
+            model_instance.write_vocab()
+            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
+        else:
+            logger.info("Exporting model...")
+            model_instance.write()
+            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
+            logger.info(f"Model successfully exported to {out_path}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
new file mode 100755
index 000000000..cea34413f
--- /dev/null
+++ b/convert_hf_to_gguf_update.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# This script downloads the tokenizer models of the specified models from Huggingface and
+# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+#
+# This is necessary in order to analyze the type of pre-tokenizer used by the model and
+# provide the necessary information to llama.cpp via the GGUF header in order to implement
+# the same pre-tokenizer.
+#
+# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+#
+# Instructions:
+#
+# - Add a new model to the "models" list
+# - Run the script with your huggingface token:
+#
+#   python3 convert_hf_to_gguf_update.py <huggingface_token>
+#
+# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+# - Update llama.cpp with the new pre-tokenizer if necessary
+#
+# TODO: generate tokenizer tests for llama.cpp
+#
+
+import logging
+import os
+import pathlib
+import re
+
+import requests
+import sys
+import json
+import shutil
+
+from hashlib import sha256
+from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger("convert_hf_to_gguf_update")
+sess = requests.Session()
+
+
+class TOKENIZER_TYPE(IntEnum):
+    SPM = auto()
+    BPE = auto()
+    WPM = auto()
+    UGM = auto()
+
+
+# TODO: this string has to exercise as much pre-tokenizer functionality as possible
+#       will be updated with time - contributions welcome
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+if len(sys.argv) == 2:
+    token = sys.argv[1]
+    if not token.startswith("hf_"):
+        logger.info("Huggingface token seems invalid")
+        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
+        sys.exit(1)
+else:
+    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
+    sys.exit(1)
+
+# TODO: add models here, base models preferred
+models = [
+    {"name": "llama-spm",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3",            "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge",         "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "falcon3",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
+    {"name": "bert-bge-large",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
+    {"name": "mpt",              "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "stablelm2",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
+    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v1-en",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
+    {"name": "jina-v2-en",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
+    {"name": "jina-v2-es",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    {"name": "jina-v2-de",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "smaug-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "poro-chat",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
+    {"name": "jina-v2-code",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
+    {"name": "gemma",            "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
+    {"name": "gemma-2",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
+    {"name": "jais",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
+    {"name": "t5",               "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "chameleon",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
+    {"name": "minerva-7b",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
+    {"name": "roberta-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
+    {"name": "gigachat",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
+    {"name": "megrez",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
+    {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
+    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+]
+
+
+def download_file_with_auth(url, token, save_path):
+    headers = {"Authorization": f"Bearer {token}"}
+    response = sess.get(url, headers=headers)
+    response.raise_for_status()
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
+    logger.info(f"File {save_path} downloaded successfully")
+
+
+def download_model(model):
+    name = model["name"]
+    repo = model["repo"]
+    tokt = model["tokt"]
+
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
+
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+
+    if tokt == TOKENIZER_TYPE.SPM:
+        files.append("tokenizer.model")
+
+    if tokt == TOKENIZER_TYPE.UGM:
+        files.append("spiece.model")
+
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
+        for file in files:
+            save_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(save_path):
+                logger.info(f"{name}: File {save_path} already exists - skipping")
+                continue
+            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+
+
+for model in models:
+    try:
+        download_model(model)
+    except Exception as e:
+        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+
+
+# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
+
+src_ifs = ""
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
+        continue
+
+    # Skip if the tokenizer folder does not exist or there are other download issues previously
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
+    # create the tokenizer
+    try:
+        if name == "t5":
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+        continue  # Skip to the next model if the tokenizer can't be loaded
+
+    chktok = tokenizer.encode(CHK_TXT)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")
+
+    # print the "pre_tokenizer" content from the tokenizer.json
+    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+        cfg = json.load(f)
+        normalizer = cfg["normalizer"]
+        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+        pre_tokenizer = cfg["pre_tokenizer"]
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        if "ignore_merges" in cfg["model"]:
+            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+
+    logger.info("")
+
+    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
+    src_ifs += f"            # ref: {model['repo']}\n"
+    src_ifs += f"            res = \"{name}\"\n"
+
+src_func = f"""
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = {repr(CHK_TXT)}
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.debug(f"chktok: {{chktok}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
+{src_ifs}
+        if res is None:
+            logger.warning("\\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("**          There are 2 possible reasons for this:")
+            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
+            logger.warning("**          - the pre-tokenization config has changed upstream")
+            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
+            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh:  {{chkhsh}}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")
+
+        return res
+"""
+
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
+
+convert_py_pth.write_text(convert_py, encoding="utf-8")
+
+logger.info("+++ convert_hf_to_gguf.py was updated")
+
+# generate tests for each tokenizer model
+
+tests = [
+    "ied 4 ½ months",
+    "Führer",
+    "",
+    " ",
+    "  ",
+    "   ",
+    "\t",
+    "\n",
+    "\n\n",
+    "\n\n\n",
+    "\t\n",
+    "Hello world",
+    " Hello world",
+    "Hello World",
+    " Hello World",
+    " Hello World!",
+    "Hello, world!",
+    " Hello, world!",
+    " this is 🦙.cpp",
+    "w048 7tuijk dsdfhu",
+    "нещо на Български",
+    "កាន់តែពិសេសអាចខលចេញ",
+    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+    "Hello",
+    " Hello",
+    "  Hello",
+    "   Hello",
+    "    Hello",
+    "    Hello\n    Hello",
+    " (",
+    "\n =",
+    "' era",
+    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+    "!!!!!!",
+    "3",
+    "33",
+    "333",
+    "3333",
+    "33333",
+    "333333",
+    "3333333",
+    "33333333",
+    "333333333",
+    "Cửa Việt", # llama-bpe fails on this
+    " discards",
+    CHK_TXT,
+]
+
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
+# the format is:
+#
+# test0
+# __ggml_vocab_test__
+# test1
+# __ggml_vocab_test__
+# ...
+#
+
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
+# for each test, write the resulting tokens on a separate line
+
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # Skip if the tokenizer folder does not exist or there are other download issues previously
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
+    # create the tokenizer
+    try:
+        if name == "t5":
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
+        continue  # Skip this model and continue with the next one in the loop
+
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
+        for text in tests:
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
+            for r in res:
+                f.write(f" {r}")
+            f.write("\n")
+
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+    name = model["name"]
+
+    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
+
+logger.info("\n")
diff --git a/convert-llama-ggml-to-gguf.py b/convert_llama_ggml_to_gguf.py
similarity index 87%
rename from convert-llama-ggml-to-gguf.py
rename to convert_llama_ggml_to_gguf.py
index b33108062..29b14e98d 100755
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 
+import logging
 import argparse
 import os
 import struct
@@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
+logger = logging.getLogger("ggml-to-gguf")
+
 
 class GGMLFormat(IntEnum):
     GGML = 0
@@ -113,7 +116,7 @@ class Tensor:
         assert quant is not None, 'Unknown tensor type'
         (blksize, tysize) = quant
         offset += 12
-        self.dtype= dtype
+        self.dtype= gguf.GGMLQuantizationType(dtype)
         self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
@@ -125,11 +128,14 @@ class Tensor:
         self.start_offset = offset
         self.len_bytes = n_bytes
         offset += n_bytes
-        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset
 
 
 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
@@ -175,7 +181,7 @@ class GGMLModel:
         offset += self.validate_header(data, offset)
         hp = Hyperparameters()
         offset += hp.load(data, offset)
-        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
         self.validate_conversion(hp.ftype)
         vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
         offset += vocab.load(data, offset, hp.n_vocab)
@@ -215,12 +221,12 @@ class GGMLToGGUF:
                     if float(hp.n_head) / float(x) == gqa:
                         n_kv_head = x
                 assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
-                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
         self.n_kv_head = n_kv_head
         self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
 
     def save(self):
-        print('* Preparing to save GGUF file')
+        logger.info('* Preparing to save GGUF file')
         gguf_writer = gguf.GGUFWriter(
             self.cfg.output,
             gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@@ -230,11 +236,11 @@ class GGMLToGGUF:
         if self.special_vocab is not None:
             self.special_vocab.add_to_gguf(gguf_writer)
         self.add_tensors(gguf_writer)
-        print("    gguf: write header")
+        logger.info("    gguf: write header")
         gguf_writer.write_header_to_file()
-        print("    gguf: write metadata")
+        logger.info("    gguf: write metadata")
         gguf_writer.write_kv_data_to_file()
-        print("    gguf: write tensors")
+        logger.info("    gguf: write tensors")
         gguf_writer.write_tensors_to_file()
         gguf_writer.close()
 
@@ -250,7 +256,7 @@ class GGMLToGGUF:
             name = cfg.name if cfg.name is not None else cfg.input.name
         except UnicodeDecodeError:
             name = None
-        print('* Adding model parameters and KV items')
+        logger.info('* Adding model parameters and KV items')
         if name is not None:
             gguf_writer.add_name(name)
         gguf_writer.add_description(desc)
@@ -281,13 +287,14 @@ class GGMLToGGUF:
     def add_vocab(self, gguf_writer):
         hp = self.model.hyperparameters
         gguf_writer.add_tokenizer_model('llama')
+        gguf_writer.add_tokenizer_pre('default')
         tokens = []
         scores = []
         toktypes = []
         if self.vocab_override is not None:
             vo = self.vocab_override
-            print('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            logger.info('* Adding vocab item(s)')
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
@@ -298,7 +305,7 @@ class GGMLToGGUF:
             if len(toktypes) > 0:
                 gguf_writer.add_token_types(toktypes)
             return
-        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
         assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
         for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
             tt = 1 # Normal
@@ -333,7 +340,7 @@ class GGMLToGGUF:
     def add_tensors(self, gguf_writer):
         tensor_map = self.name_map
         data = self.data
-        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
         for tensor in self.model.tensors:
             name = str(tensor.name, 'UTF-8')
             mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@@ -343,7 +350,6 @@ class GGMLToGGUF:
                 temp = tempdims[1]
                 tempdims[1] = tempdims[0]
                 tempdims[0] = temp
-            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
             gguf_writer.add_tensor(
                 mapped_name,
                 data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@@ -352,7 +358,8 @@ class GGMLToGGUF:
 
 
 def handle_metadata(cfg, hp):
-    import convert
+    import examples.convert_legacy_llama as convert
+
     assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
     hf_config_path   = cfg.model_metadata_dir / "config.json"
     orig_config_path = cfg.model_metadata_dir / "params.json"
@@ -373,7 +380,7 @@ def handle_metadata(cfg, hp):
         raise ValueError('Unable to load metadata')
     vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
     vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
     return params, vocab, special_vocab
 
@@ -398,35 +405,37 @@ def handle_args():
                         help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
     parser.add_argument("--vocab-dir", type=Path,
                         help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+    parser.add_argument("--vocabtype", default="spm,hfft",
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     return parser.parse_args()
 
 
 def main():
     cfg = handle_args()
-    print(f'* Using config: {cfg}')
-    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
+    logger.info(f'* Using config: {cfg}')
+    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
     if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
-        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
+        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLModel()
-    print('* Scanning GGML input file')
+    logger.info('* Scanning GGML input file')
     offset = model.load(data, 0)  # noqa
-    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
     special_vocab = None
     if cfg.model_metadata_dir is not None:
         (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
-        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
-        print(f'* Overriding params: {params_override}')
-        print(f'* Overriding vocab: {vocab_override}')
-        print(f'* Special vocab: {special_vocab}')
+        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        logger.info(f'* Overriding params: {params_override}')
+        logger.info(f'* Overriding vocab: {vocab_override}')
+        logger.info(f'* Special vocab: {special_vocab}')
     else:
-        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+        logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
         if model.file_format == GGMLFormat.GGML:
-            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
     converter = GGMLToGGUF(
         model, data, cfg,
         params_override = params_override,
@@ -434,7 +443,7 @@ def main():
         special_vocab = special_vocab
     )
     converter.save()
-    print(f'* Successful completion. Output saved to: {cfg.output}')
+    logger.info(f'* Successful completion. Output saved to: {cfg.output}')
 
 
 if __name__ == '__main__':
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
new file mode 100755
index 000000000..6dea14a23
--- /dev/null
+++ b/convert_lora_to_gguf.py
@@ -0,0 +1,461 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+import logging
+import argparse
+import os
+import sys
+import json
+from math import prod
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+from transformers import AutoConfig
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+# reuse model definitions from convert_hf_to_gguf.py
+from convert_hf_to_gguf import LazyTorchTensor, Model
+
+logger = logging.getLogger("lora-to-gguf")
+
+
+@dataclass
+class PartialLoraTensor:
+    A: Tensor | None = None
+    B: Tensor | None = None
+
+
+# magic to support tensor shape modifications and splitting
+class LoraTorchTensor:
+    _lora_A: Tensor  # (n_rank, row_size)
+    _lora_B: Tensor  # (col_size, n_rank)
+    _rank: int
+
+    def __init__(self, A: Tensor, B: Tensor):
+        assert len(A.shape) == len(B.shape)
+        assert A.shape[-2] == B.shape[-1]
+        if A.dtype != B.dtype:
+            A = A.to(torch.float32)
+            B = B.to(torch.float32)
+        self._lora_A = A
+        self._lora_B = B
+        self._rank = B.shape[-1]
+
+    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
+        return (self._lora_A, self._lora_B)
+
+    def __getitem__(
+        self,
+        indices: (
+            SupportsIndex
+            | slice
+            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
+        ),
+    ) -> LoraTorchTensor:
+        shape = self.shape
+        if isinstance(indices, SupportsIndex):
+            if len(shape) > 2:
+                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
+            else:
+                raise NotImplementedError  # can't return a vector
+        elif isinstance(indices, slice):
+            if len(shape) > 2:
+                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
+            else:
+                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
+        elif isinstance(indices, tuple):
+            assert len(indices) > 0
+            if indices[-1] is Ellipsis:
+                return self[indices[:-1]]
+            # expand ellipsis
+            indices = tuple(
+                u
+                for v in (
+                    (
+                        (slice(None, None) for _ in range(len(indices) - 1))
+                        if i is Ellipsis
+                        else (i,)
+                    )
+                    for i in indices
+                )
+                for u in v
+            )
+
+            if len(indices) < len(shape):
+                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
+
+            # TODO: make sure this is correct
+            indices_A = (
+                *(
+                    (
+                        j.__index__() % self._lora_A.shape[i]
+                        if isinstance(j, SupportsIndex)
+                        else slice(None, None)
+                    )
+                    for i, j in enumerate(indices[:-2])
+                ),
+                slice(None, None),
+                indices[-1],
+            )
+            indices_B = indices[:-1]
+            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
+        else:
+            raise NotImplementedError  # unknown indice type
+
+    @property
+    def dtype(self) -> torch.dtype:
+        assert self._lora_A.dtype == self._lora_B.dtype
+        return self._lora_A.dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        assert len(self._lora_A.shape) == len(self._lora_B.shape)
+        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
+
+    def size(self, dim=None):
+        assert dim is None
+        return self.shape
+
+    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
+        if isinstance(shape[0], tuple):
+            new_shape: tuple[int, ...] = shape[0]
+        else:
+            new_shape = cast(tuple[int, ...], shape)
+        orig_shape = self.shape
+        if len(new_shape) < 2:
+            raise NotImplementedError  # can't become a vector
+
+        # expand -1 in the shape
+        if any(dim == -1 for dim in new_shape):
+            n_elems = prod(orig_shape)
+            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
+            assert n_elems % n_new_elems == 0
+            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
+
+        if new_shape[-1] != orig_shape[-1]:
+            raise NotImplementedError  # can't reshape the row size trivially
+
+        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
+        shape_B = (*new_shape[:-1], self._rank)
+        return LoraTorchTensor(
+            self._lora_A.reshape(shape_A),
+            self._lora_B.reshape(shape_B),
+        )
+
+    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
+        return self.reshape(*other.shape)
+
+    def view(self, *size: int) -> LoraTorchTensor:
+        return self.reshape(*size)
+
+    def permute(self, *dims: int) -> LoraTorchTensor:
+        shape = self.shape
+        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
+        if dims[-1] == -1:
+            # TODO: support higher dimensional A shapes bigger than 1
+            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
+            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
+        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
+            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
+        else:
+            # TODO: compose the above two
+            raise NotImplementedError
+
+    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
+        shape = self.shape
+        dims = [i for i in range(len(shape))]
+        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
+        return self.permute(*dims)
+
+    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
+        return self.transpose(axis0, axis1)
+
+    def to(self, *args, **kwargs):
+        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
+
+    @classmethod
+    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
+        del types  # unused
+
+        if kwargs is None:
+            kwargs = {}
+
+        if func is torch.permute:
+            return type(args[0]).permute(*args, **kwargs)
+        elif func is torch.reshape:
+            return type(args[0]).reshape(*args, **kwargs)
+        elif func is torch.stack:
+            assert isinstance(args[0], Sequence)
+            dim = kwargs.get("dim", 0)
+            assert dim == 0
+            return LoraTorchTensor(
+                torch.stack([a._lora_A for a in args[0]], dim),
+                torch.stack([b._lora_B for b in args[0]], dim),
+            )
+        elif func is torch.cat:
+            assert isinstance(args[0], Sequence)
+            dim = kwargs.get("dim", 0)
+            assert dim == 0
+            if len(args[0][0].shape) > 2:
+                return LoraTorchTensor(
+                    torch.cat([a._lora_A for a in args[0]], dim),
+                    torch.cat([b._lora_B for b in args[0]], dim),
+                )
+            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
+                return LoraTorchTensor(
+                    args[0][0]._lora_A,
+                    torch.cat([b._lora_B for b in args[0]], dim),
+                )
+            else:
+                raise NotImplementedError
+        else:
+            raise NotImplementedError
+
+
+def get_base_tensor_name(lora_tensor_name: str) -> str:
+    base_name = lora_tensor_name.replace("base_model.model.", "")
+    base_name = base_name.replace(".lora_A.weight", ".weight")
+    base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
+    return base_name
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
+    )
+    parser.add_argument(
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+    )
+    parser.add_argument(
+        "--bigendian", action="store_true",
+        help="model is executed on big endian machine",
+    )
+    parser.add_argument(
+        "--no-lazy", action="store_true",
+        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true",
+        help="increase output verbosity",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out what will be done, without writing any new files",
+    )
+    parser.add_argument(
+        "--base", type=Path,
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
+    )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
+    parser.add_argument(
+        "lora_path", type=Path,
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
+    )
+
+    return parser.parse_args()
+
+
+def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+    # normally, adapter does not come with base model config, we need to load it from AutoConfig
+    config = AutoConfig.from_pretrained(hf_model_id)
+    return config.to_dict()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    ftype_map: dict[str, gguf.LlamaFileType] = {
+        "f32": gguf.LlamaFileType.ALL_F32,
+        "f16": gguf.LlamaFileType.MOSTLY_F16,
+        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "auto": gguf.LlamaFileType.GUESSED,
+    }
+
+    ftype = ftype_map[args.outtype]
+
+    dir_base_model: Path | None = args.base
+    dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
+    lora_config = dir_lora / "adapter_config.json"
+    input_model = dir_lora / "adapter_model.safetensors"
+
+    if args.outfile is not None:
+        fname_out = args.outfile
+    else:
+        # output in the same directory as the model by default
+        fname_out = dir_lora
+
+    if os.path.exists(input_model):
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+
+        lora_model = load_file(input_model, device="cpu")
+    else:
+        input_model = os.path.join(dir_lora, "adapter_model.bin")
+        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
+
+    # load LoRA config
+    with open(lora_config, "r") as f:
+        lparams: dict[str, Any] = json.load(f)
+
+    # load base model
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
+        if "base_model_name_or_path" in lparams:
+            model_id = lparams["base_model_name_or_path"]
+            logger.info(f"Loading base model from Hugging Face: {model_id}")
+            try:
+                hparams = load_hparams_from_hf(model_id)
+            except OSError as e:
+                logger.error(f"Failed to load base model config: {e}")
+                logger.error("Please try downloading the base model and add its path to --base")
+                sys.exit(1)
+        else:
+            logger.error("'base_model_name_or_path' is not found in adapter_config.json")
+            logger.error("Base model config is required. Please download the base model and add its path to --base")
+            sys.exit(1)
+    else:
+        logger.info(f"Loading base model: {dir_base_model.name}")
+        hparams = Model.load_hparams(dir_base_model)
+
+    with torch.inference_mode():
+        try:
+            model_class = Model.from_model_architecture(hparams["architectures"][0])
+        except NotImplementedError:
+            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            sys.exit(1)
+
+        class LoraModel(model_class):
+            model_arch = model_class.model_arch
+
+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
+            def set_vocab(self):
+                pass
+
+            def set_type(self):
+                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
+                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+
+            def set_gguf_parameters(self):
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
+
+            def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+                # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
+                return ()
+
+            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+                tensor_map: dict[str, PartialLoraTensor] = {}
+
+                for name, tensor in lora_model.items():
+                    if self.lazy:
+                        tensor = LazyTorchTensor.from_eager(tensor)
+                    base_name = get_base_tensor_name(name)
+                    # note: mergekit-extract-lora also adds token embeddings to the adapter
+                    is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+                    is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
+                    if not is_lora_a and not is_lora_b:
+                        if ".base_layer.weight" in name:
+                            continue
+                        # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
+                        if "_layernorm" in name or ".norm" in name:
+                            yield (base_name, tensor)
+                            continue
+                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                        if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+                            logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
+                        sys.exit(1)
+
+                    if base_name in tensor_map:
+                        if is_lora_a:
+                            tensor_map[base_name].A = tensor
+                        else:
+                            tensor_map[base_name].B = tensor
+                    else:
+                        if is_lora_a:
+                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
+                        else:
+                            tensor_map[base_name] = PartialLoraTensor(B=tensor)
+
+                for name, tensor in tensor_map.items():
+                    assert tensor.A is not None
+                    assert tensor.B is not None
+                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
+
+            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+                dest = list(super().modify_tensors(data_torch, name, bid))
+                # some archs may have the same tensor for lm_head and output (tie word embeddings)
+                # in this case, adapters targeting lm_head will fail when using llama-export-lora
+                # therefore, we ignore them for now
+                # see: https://github.com/ggerganov/llama.cpp/issues/9065
+                if name == "lm_head.weight" and len(dest) == 0:
+                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
+                for dest_name, dest_data in dest:
+                    # mergekit-extract-lora add these layernorm to the adapter
+                    if "_norm" in dest_name:
+                        assert dest_data.dim() == 1
+                        yield (dest_name, dest_data)
+                        continue
+
+                    # otherwise, we must get the lora_A and lora_B tensors
+                    assert isinstance(dest_data, LoraTorchTensor)
+                    lora_a, lora_b = dest_data.get_lora_A_B()
+
+                    # note: mergekit-extract-lora flip and transpose A and B
+                    # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
+                    if "token_embd.weight" in dest_name:
+                        lora_a = lora_a.T
+
+                    yield (dest_name + ".lora_a", lora_a)
+                    yield (dest_name + ".lora_b", lora_b)
+
+        alpha: float = lparams["lora_alpha"]
+
+        model_instance = LoraModel(
+            dir_base_model,
+            ftype,
+            fname_out,
+            is_big_endian=args.bigendian,
+            use_temp_file=False,
+            eager=args.no_lazy,
+            dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
+            hparams=hparams,
+        )
+
+        logger.info("Exporting model...")
+        model_instance.write()
+        logger.info(f"Model successfully exported to {model_instance.fname_out}")
diff --git a/docs/android.md b/docs/android.md
new file mode 100644
index 000000000..47530c6c1
--- /dev/null
+++ b/docs/android.md
@@ -0,0 +1,83 @@
+
+# Android
+
+## Build on Android using Termux
+
+[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
+
+With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
+
+```
+$ apt update && apt upgrade -y
+$ apt install git cmake
+```
+
+Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
+
+Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
+
+```
+$ curl -L {model-url} -o ~/{model}.gguf
+```
+
+Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
+
+```
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+```
+
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+
+To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
+
+https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
+
+## Cross-compile using Android NDK
+It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
+
+Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
+
+```
+$ cmake \
+  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DCMAKE_C_FLAGS="-march=armv8.7a" \
+  -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
+  -DGGML_OPENMP=OFF \
+  -DGGML_LLAMAFILE=OFF \
+  -B build-android
+```
+
+Notes:
+  - While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
+  - `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
+
+The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
+
+Feel free to adjust the Android ABI for your target. Once the project is configured:
+
+```
+$ cmake --build build-android --config Release -j{n}
+$ cmake --install build-android --prefix {install-dir} --config Release
+```
+
+After installing, go ahead and download the model of your choice to your host system. Then:
+
+```
+$ adb shell "mkdir /data/local/tmp/llama.cpp"
+$ adb push {install-dir} /data/local/tmp/llama.cpp/
+$ adb push {model}.gguf /data/local/tmp/llama.cpp/
+$ adb shell
+```
+
+In the `adb shell`:
+
+```
+$ cd /data/local/tmp/llama.cpp
+$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
+```
+
+That's it!
+
+Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`.
diff --git a/docs/BLIS.md b/docs/backend/BLIS.md
similarity index 91%
rename from docs/BLIS.md
rename to docs/backend/BLIS.md
index 0bcd6eeef..904548577 100644
--- a/docs/BLIS.md
+++ b/docs/backend/BLIS.md
@@ -23,23 +23,16 @@ Install BLIS:
 sudo make install
 ```
 
-We recommend using openmp since it's easier to modify the cores been used.
+We recommend using openmp since it's easier to modify the cores being used.
 
 ### llama.cpp compilation
 
-Makefile:
-
-```bash
-make LLAMA_BLIS=1 -j
-# make LLAMA_BLIS=1 benchmark-matmult
-```
-
 CMake:
 
 ```bash
 mkdir build
 cd build
-cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
+cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
 make -j
 ```
 
diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
new file mode 100644
index 000000000..23f10175a
--- /dev/null
+++ b/docs/backend/CANN.md
@@ -0,0 +1,263 @@
+# llama.cpp for CANN
+
+ - [Background](#background)
+ - [News](#news)
+ - [OS](#os)
+ - [Hardware](#hardware)
+ - [Model Supports](#model-supports)
+ - [DataType Supports](#datatype-supports)
+ - [Docker](#docker)
+ - [Linux](#linux)
+ - [TODO](#todo)
+
+
+## Background
+
+**Ascend NPU** is a range of AI processors using Neural Processing Unit. It will efficiently handle matrix-matrix multiplication, dot-product and scalars.
+
+**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
+
+**Llama.cpp + CANN**
+
+The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
+
+## News
+
+- 2024.11
+  - Support F16 and F32 data type model for Ascend 310P NPU.
+- 2024.8
+  - Support `Q4_0` and `Q8_0` data type for Ascend NPU.
+- 2024.7
+  - Create CANN backend for Ascend NPU.
+
+## OS
+
+| OS      | Status  | Verified                                       |
+|:-------:|:-------:|:----------------------------------------------:|
+| Linux   | Support | Ubuntu 22.04, OpenEuler22.03                   |
+
+
+## Hardware
+
+### Ascend NPU
+
+**Verified devices**
+
+| Ascend NPU                    | Status  |
+|:-----------------------------:|:-------:|
+| Atlas 300T A2                 | Support |
+| Atlas 300I Duo                | Support |
+
+*Notes:*
+
+- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
+- If you run successfully with your Ascend NPU device, please help update the upper table.
+
+
+## Model Supports
+
+| Model Name                  | FP16  | Q8_0 | Q4_0 |
+|:----------------------------|:-----:|:----:|:----:|
+| AquilaChat2-7B              |   √   |   √  |   √  |
+| Baichuan-7b                 |   √   |   √  |   √  |
+| Baichuan2-7B-Chat           |   √   |   √  |   √  |
+| bitnet_b1_58-large          |   √   |   √  |   √  |
+| bloom-560m                  |   √   |   x  |   √  |
+| bloomz-alpaca-560m          |   √   |   x  |   √  |
+| c4ai-command-r-35B-v01      |   x   |   x  |   x  |
+| chatglm3-6B                 |   x   |   x  |   x  |
+| chinese-alpaca-2-1.3b       |   √   |   √  |   √  |
+| CodeShell-7B                |   √   |   √  |   √  |
+| deepseek-ai_deepseek-coder-1.3B-base | x |   x  |   x  |
+| deepseek-ai_DeepSeek-V2-Lite | x   |   x  |   x   |
+| deepseek-coder-6.7B-instruct | x   |   x  |   x   |
+| DeepSeek-V2-Lite-64x1.5B    |   x   |   x  |   x  |
+| falcon-7b-instruct          |   √   |   √  |   √  |
+| flan-t5-large               |   √   |   √  |   √  |
+| gemma-2-9b-it               |   √   |   √  |   √  |
+| glm-4-9B                    |   x   |   x  |   x  |
+| gpt2                        |   √   |   √  |   √  |
+| Gpt2-163M                   |   √   |   √  |   √  |
+| granite-3B-code-instruct    |   √   |   √  |   √  |
+| GritLM-7B                   |   √   |   √  |   √  |
+| internlm2_5-7b-chat         |   √   |   √  |   √  |
+| koala-7B-HF                 |   √   |   √  |   √  |
+| Llama-2-7b-chat-hf          |   √   |   √  |   √  |
+| Llama-3-Smaug-8B            |   √   |   √  |   √  |
+| Llama2-Chinese-7b-Chat      |   √   |   √  |   √  |
+| Llama3-8B                   |   √   |   √  |   √  |
+| Llama3-8b-chinese           |   √   |   √  |   √  |
+| mamba-130m-hf               |   √   |   √  |   √  |
+| Mistral-7B-Instruct-v0.2    |   √   |   √  |   √  |
+| Mixtral-8x7B-Instruct-v0.1  |   x   |   √  |   √  |
+| mpt-7B                      |   √   |   √  |   √  |
+| OLMo-1B-hf                  |   √   |   √  |   √  |
+| OpenELM-3B-Instruct         |   √   |   √  |   √  |
+| Orion-14b-base              |   √   |   √  |   √  |
+| phi1                        |   x   |   x  |   x  |
+| phi2                        |   x   |   x  |   x  |
+| Phi-3-mini-4k-instruct      |   √   |   √  |   √  |
+| plamo-13b                   |   √   |   √  |   √  |
+| pythia-70M                  |   x   |   x  |   x  |
+| Qwen-7B                     |   √   |   √  |   √  |
+| Qwen2-1.5B-Instruct         |   √   |   x  |   √  |
+| Refact-1_6B-fim             |   √   |   √  |   √  |
+| SmolLM-135M                 |   √   |   √  |   √  |
+| stablelm-zephyr             |   x   |   x  |   x  |
+| stablelm-2-zephyr-1_6b      |   x   |   x  |   x  |
+| starcoderbase-1b            |   √   |   √  |   √  |
+| starcoder2-3b               |   √   |   √  |   √  |
+| vigogne-7b-chat             |   √   |   √  |   √  |
+| xverse-7b-chat              |   √   |   √  |   √  |
+| Yi-6b-Chat                  |   √   |   √  |   √  |
+
+
+
+## DataType Supports
+
+| DataType               | Status  |
+|:----------------------:|:-------:|
+| FP16                   | Support |
+| Q8_0                   | Support |
+| Q4_0                   | Support |
+
+## Docker
+
+### Build Images
+You can get a image with llama.cpp in one command.
+```sh
+docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
+```
+
+### Run container
+
+```sh
+# Find all cards.
+npu-smi info
+
+# Select the cards that you want to use, make sure these cards are not used by someone.
+# Following using cards of device0.
+docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
+```
+
+*Notes:*
+
+- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
+
+## Linux
+
+### I. Setup Environment
+
+1. **Install Ascend Driver and firmware**
+
+    ```sh
+    # create driver running user.
+    sudo groupadd -g HwHiAiUser
+    sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
+    sudo usermod -aG HwHiAiUser $USER
+
+    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
+    # and install driver.
+    sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
+    ```
+
+    Once installed, run `npu-smi info` to check whether driver is installed successfully.
+    ```sh
+    +-------------------------------------------------------------------------------------------+
+    | npu-smi 24.1.rc2               Version: 24.1.rc2                                          |
+    +----------------------+---------------+----------------------------------------------------+
+    | NPU   Name           | Health        | Power(W)    Temp(C)           Hugepages-Usage(page)|
+    | Chip                 | Bus-Id        | AICore(%)   Memory-Usage(MB)  HBM-Usage(MB)        |
+    +======================+===============+====================================================+
+    | 2     xxx            | OK            | 64.4        51                15   / 15            |
+    | 0                    | 0000:01:00.0  | 0           1873 / 15077      0    / 32768         |
+    +======================+===============+====================================================+
+    | 5     xxx            | OK            | 64.0        52                15   / 15            |
+    | 0                    | 0000:81:00.0  | 0           1874 / 15077      0    / 32768         |
+    +======================+===============+====================================================+
+    | No running processes found in NPU 2                                                       |
+    +======================+===============+====================================================+
+    | No running processes found in NPU 5                                                       |
+    +======================+===============+====================================================+
+    ```
+
+2. **Install Ascend Firmware**
+    ```sh
+    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
+    # and install driver.
+    sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
+    ```
+    If the following messaage appers, firmware is installed successfully.
+    ```sh
+    Firmware package installed successfully!
+    ```
+
+
+3. **Install CANN toolkit and kernels**
+
+    CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
+
+    Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
+    ```sh
+    pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
+    sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
+    sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
+    ```
+
+    Set Ascend Variables:
+    ```sh
+    echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
+    source ~/.bashrc
+    ```
+
+Upon a successful installation, CANN is enabled for the available ascend devices.
+
+### II. Build llama.cpp
+
+```sh
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+cmake --build build --config release
+```
+
+### III. Run the inference
+
+1. **Retrieve and prepare model**
+
+    You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model prepration.
+
+    **Notes**:
+
+      - CANN backend only supports FP16/Q4_0/Q8_0 models currently.
+
+2. **Launch inference**
+
+    There are two device selection modes:
+
+    - Single device: Use one device target specified by the user.
+    - Multiple devices: Automatically choose the devices with the same backend.
+
+    | Device selection | Parameter                              |
+    |:----------------:|:--------------------------------------:|
+    | Single device    | --split-mode none --main-gpu DEVICE_ID |
+    | Multiple devices | --split-mode layer (default)           |
+
+    Examples:
+
+    - Use device 0:
+
+    ```sh
+    ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+    ```
+
+    - Use multiple devices:
+
+    ```sh
+    ./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+    ```
+
+### **GitHub contribution**:
+Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
+
+
+## TODO
+- Support more models and data types.
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
new file mode 100644
index 000000000..89ddbd669
--- /dev/null
+++ b/docs/backend/SYCL.md
@@ -0,0 +1,714 @@
+# llama.cpp for SYCL
+
+- [Background](#background)
+- [Recommended Release](#recommended-release)
+- [News](#news)
+- [OS](#os)
+- [Hardware](#hardware)
+- [Docker](#docker)
+- [Linux](#linux)
+- [Windows](#windows)
+- [Environment Variable](#environment-variable)
+- [Known Issue](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)
+
+## Background
+
+**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
+
+**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
+
+- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
+- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
+- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
+
+### Llama.cpp + SYCL
+
+The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
+
+## Recommended Release
+
+The SYCL backend would be broken by some PRs due to no online CI.
+
+The following release is verified with good quality:
+
+|Commit ID|Tag|Release|Verified  Platform| Update date|
+|-|-|-|-|-|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+
+
+## News
+
+- 2024.11
+  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
+
+- 2024.8
+  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
+
+- 2024.5
+  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
+  - Arch Linux is verified successfully.
+
+- 2024.4
+  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
+
+- 2024.3
+  - Release binary files of Windows.
+  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
+  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
+  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
+  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
+  - Support detecting all GPUs with level-zero and same top **Max compute units**.
+  - Support OPs
+    - hardsigmoid
+    - hardswish
+    - pool2d
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build
+
+## OS
+
+| OS      | Status  | Verified                                       |
+|---------|---------|------------------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
+| Windows | Support | Windows 11                                     |
+
+
+## Hardware
+
+### Intel GPU
+
+SYCL backend supports Intel GPU Family:
+
+- Intel Data Center Max Series
+- Intel Flex Series, Arc Series
+- Intel Built-in Arc GPU
+- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
+
+#### Verified devices
+
+| Intel GPU                     | Status  | Verified Model                        |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series  | Support | Max 1550, 1100                        |
+| Intel Data Center Flex Series | Support | Flex 170                              |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
+| Intel iGPU                    | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
+
+*Notes:*
+
+- **Memory**
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+
+  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
+
+- **Execution Unit (EU)**
+  - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
+
+### Other Vendor GPU
+
+**Verified devices**
+
+| Nvidia GPU               | Status    | Verified Model |
+|--------------------------|-----------|----------------|
+| Ampere Series            | Supported | A100, A4000    |
+| Ampere Series *(Mobile)* | Supported | RTX 40 Series  |
+
+| AMD GPU                  | Status       | Verified Model |
+|--------------------------|--------------|----------------|
+| Radeon Pro               | Experimental | W6800          |
+| Radeon RX                | Experimental | 6700 XT        |
+
+Note: AMD GPU support is highly experimental and is incompatible with F16.
+Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
+
+## Docker
+The docker build option is currently limited to *intel GPU* targets.
+
+### Build image
+```sh
+# Using FP16
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
+```
+
+*Notes*:
+
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
+
+You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
+
+### Run container
+
+```sh
+# First, find all the DRI cards
+ls -la /dev/dri
+# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+
+*Notes:*
+- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
+- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
+
+## Linux
+
+### I. Setup Environment
+
+1. **Install GPU drivers**
+
+  - **Intel GPU**
+
+Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
+
+*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
+
+Once installed, add the user(s) to the `video` and `render` groups.
+
+```sh
+sudo usermod -aG render $USER
+sudo usermod -aG video $USER
+```
+
+*Note*: logout/re-login for the changes to take effect.
+
+Verify installation through `clinfo`:
+
+```sh
+sudo apt install clinfo
+sudo clinfo -l
+```
+
+Sample output:
+
+```sh
+Platform #0: Intel(R) OpenCL Graphics
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
+
+Platform #0: Intel(R) OpenCL HD Graphics
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
+```
+
+- **Nvidia GPU**
+
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
+
+- **AMD GPU**
+
+To target AMD GPUs with SYCL, the ROCm stack must be installed first.
+
+2. **Install Intel® oneAPI Base toolkit**
+
+- **For Intel GPU**
+
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+
+Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
+
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
+
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
+
+- **Adding support to Nvidia GPUs**
+
+**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+
+
+**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+
+```sh
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+cmake --build buildWithCublas --config Release
+```
+
+- **Adding support to AMD GPUs**
+
+**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
+
+**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
+
+```sh
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+# Find your HIPTARGET with rocminfo, under the key 'Name:'
+cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
+cmake --build buildWithrocBLAS --config Release
+```
+
+3. **Verify installation and environment**
+
+In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
+```sh
+source /opt/intel/oneapi/setvars.sh
+sycl-ls
+```
+
+- **Intel GPU**
+
+When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
+
+```
+[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+```
+
+- **Nvidia GPU**
+
+Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
+
+```
+[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
+[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
+[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
+```
+
+- **AMD GPU**
+
+For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
+
+```
+[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
+[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
+```
+
+### II. Build llama.cpp
+
+#### Intel GPU
+
+```
+./examples/sycl/build.sh
+```
+
+or
+
+```sh
+# Export relevant ENV variables
+source /opt/intel/oneapi/setvars.sh
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+# build all binary
+cmake --build build --config Release -j -v
+```
+
+#### Nvidia GPU
+
+```sh
+# Export relevant ENV variables
+export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+
+# Build LLAMA with Nvidia BLAS acceleration through SYCL
+# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
+GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+# build all binary
+cmake --build build --config Release -j -v
+```
+
+#### AMD GPU
+
+```sh
+# Export relevant ENV variables
+export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
+
+# Build LLAMA with rocBLAS acceleration through SYCL
+
+## AMD
+# Use FP32, FP16 is not supported
+# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
+GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# build all binary
+cmake --build build --config Release -j -v
+```
+
+### III. Run the inference
+
+#### Retrieve and prepare model
+
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+
+##### Check device
+
+1. Enable oneAPI running environment
+
+```sh
+source /opt/intel/oneapi/setvars.sh
+```
+
+2. List devices information
+
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
+
+```sh
+./build/bin/llama-ls-sycl-device
+```
+
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+```
+found 2 SYCL devices:
+
+|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
+|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
+| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+```
+
+#### Choose level-zero devices
+
+|Chosen Device ID|Setting|
+|-|-|
+|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
+|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
+|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
+
+#### Execute
+
+Choose one of following methods to run.
+
+1. Script
+
+- Use device 0:
+
+```sh
+./examples/sycl/run-llama2.sh 0
+```
+- Use multiple devices:
+
+```sh
+./examples/sycl/run-llama2.sh
+```
+
+2. Command line
+Launch inference
+
+There are two device selection modes:
+
+- Single device: Use one device assigned by user. Default device id is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
+
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |
+
+Examples:
+
+- Use device 0:
+
+```sh
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+```
+
+- Use multiple devices:
+
+```sh
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+```
+
+*Notes:*
+
+- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
+
+```sh
+detect 1 SYCL GPUs: [0] with top Max compute units:512
+```
+Or
+```sh
+use 1 SYCL GPUs: [0] with Max compute units:512
+```
+
+## Windows
+
+### I. Setup Environment
+
+1. Install GPU driver
+
+Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
+
+2. Install Visual Studio
+
+If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
+
+3. Install Intel® oneAPI Base toolkit
+
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+
+Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.
+
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
+
+b. Enable oneAPI running environment:
+
+- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
+
+- On the command prompt, enable the runtime environment with the following:
+```
+"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
+```
+
+c. Verify installation
+
+In the oneAPI command line, run the following to print the available SYCL devices:
+
+```
+sycl-ls.exe
+```
+
+There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
+
+Output (example):
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO  [31.0.101.5186]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
+```
+
+4. Install build tools
+
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
+
+
+### II. Build llama.cpp
+
+You could download the release package for Windows directly, which including binary files and depended oneAPI dll files.
+
+Choose one of following methods to build from source code.
+
+1. Script
+
+```sh
+.\examples\sycl\win-build-sycl.bat
+```
+
+2. CMake
+
+On the oneAPI command line window, step into the llama.cpp main directory and run the following:
+
+```
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+
+# Option 2: Or FP16
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+
+cmake --build build --config Release -j
+```
+
+Or, use CMake presets to build:
+
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+3. Visual Studio
+
+You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
+
+*Notes:*
+
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
+
+### III. Run the inference
+
+#### Retrieve and prepare model
+
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+
+##### Check device
+
+1. Enable oneAPI running environment
+
+On the oneAPI command line window, run the following and step into the llama.cpp directory:
+```
+"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
+```
+
+2. List devices information
+
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
+
+```
+build\bin\llama-ls-sycl-device.exe
+```
+
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+```
+found 2 SYCL devices:
+|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
+|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
+| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+
+```
+#### Choose level-zero devices
+
+|Chosen Device ID|Setting|
+|-|-|
+|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
+|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
+|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
+
+#### Execute
+
+Choose one of following methods to run.
+
+1. Script
+
+```
+examples\sycl\win-run-llama2.bat
+```
+
+2. Command line
+
+Launch inference
+
+There are two device selection modes:
+
+- Single device: Use one device assigned by user. Default device id is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
+
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |
+
+Examples:
+
+- Use device 0:
+
+```
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+```
+
+- Use multiple devices:
+
+```
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+```
+
+
+Note:
+
+- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
+
+```sh
+detect 1 SYCL GPUs: [0] with top Max compute units:512
+```
+Or
+```sh
+use 1 SYCL GPUs: [0] with Max compute units:512
+```
+
+
+## Environment Variable
+
+#### Build
+
+| Name               | Value                                 | Function                                    |
+|--------------------|---------------------------------------|---------------------------------------------|
+| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
+| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
+| CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
+| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
+
+#### Runtime
+
+| Name              | Value            | Function                                                                                                                  |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
+
+## Known Issues
+
+- `Split-mode:[row]` is not supported.
+
+## Q&A
+
+- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+
+  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
+  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
+
+- General compiler error:
+
+  - Remove **build** folder or try a clean-build.
+
+- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
+
+  Please double-check with `sudo sycl-ls`.
+
+  If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:
+
+  ```
+  sudo usermod -aG render $USER
+  sudo usermod -aG video $USER
+  ```
+  Otherwise, please double-check the GPU driver installation steps.
+
+- Can I report Ollama issue on Intel GPU to llama.cpp SYCL backend?
+
+  No. We can't support Ollama issue directly, because we aren't familiar with Ollama.
+
+  Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it.
+
+  It's same for other projects including llama.cpp SYCL backend.
+
+- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
+
+  Device Memory is not enough.
+
+  |Reason|Solution|
+  |-|-|
+  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
+  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
+
+### **GitHub contribution**:
+Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
+
+## TODO
+
+- NA
diff --git a/docs/build.md b/docs/build.md
new file mode 100644
index 000000000..afb7a0402
--- /dev/null
+++ b/docs/build.md
@@ -0,0 +1,408 @@
+# Build llama.cpp locally
+
+**To get the Code:**
+
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+The following sections describe how to build with different backends and options.
+
+## CPU Build
+
+Build llama.cpp using `CMake`:
+
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+**Notes**:
+
+- For faster compilation, add the `-j` argument to run multiple jobs in parallel, or use a generator that does this automatically such as Ninja. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
+- For debug builds, there are two cases:
+
+    1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+
+       ```bash
+       cmake -B build -DCMAKE_BUILD_TYPE=Debug
+       cmake --build build
+       ```
+
+    2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
+
+       ```bash
+       cmake -B build -G "Xcode"
+       cmake --build build --config Debug
+       ```
+
+    For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+  ```
+  cmake -B build -DBUILD_SHARED_LIBS=OFF
+  cmake --build build --config Release
+  ```
+
+- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+    - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+    - Tab Workload: Desktop-development with C++
+    - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+    - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+    - For Windows on ARM (arm64, WoA) build with:
+    ```bash
+    cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+    cmake --build build-arm64-windows-llvm-release
+    ```
+    Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
+
+    For building with ninja generator and clang compiler as default:
+      -set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
+      ```bash
+      cmake --preset x64-windows-llvm-release
+      cmake --build build-x64-windows-llvm-release
+      ```
+
+## BLAS Build
+
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use:
+
+### Accelerate Framework
+
+This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
+
+### OpenBLAS
+
+This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
+
+- Using `CMake` on Linux:
+
+    ```bash
+    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+    cmake --build build --config Release
+    ```
+
+### BLIS
+
+Check [BLIS.md](./backend/BLIS.md) for more information.
+
+### Intel oneMKL
+
+Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+- Using manual oneAPI installation:
+  By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+    ```bash
+    source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
+    cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
+    cmake --build build --config Release
+    ```
+
+- Using oneAPI docker image:
+  If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
+
+Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
+
+### Other BLAS libraries
+
+Any other BLAS library can be used by setting the `GGML_BLAS_VENDOR` option. See the [CMake documentation](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) for a list of supported vendors.
+
+## Metal Build
+
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `-DGGML_METAL=OFF` cmake option.
+
+When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers 0` command-line argument.
+
+## SYCL
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
+
+## CUDA
+
+This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.
+
+#### Download directly from NVIDIA
+You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
+
+
+#### Compile and run inside a Fedora Toolbox Container
+We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+
+**Recommended for:**
+
+- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
+
+
+### Compilation
+```bash
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release
+```
+
+### Override Compute Capability Specifications
+
+If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+ ```text
+nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
+```
+
+To override the `native` GPU detection:
+
+#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus).
+
+```text
+GeForce RTX 4090      8.9
+GeForce RTX 3080 Ti   8.6
+GeForce RTX 3070      8.6
+```
+
+#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
+```
+
+### Runtime CUDA environmental variables
+
+You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
+
+```bash
+# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
+CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+### Performance Tuning
+
+The following compilation options are also available to tweak performance:
+
+| Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
+|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
+| GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
+| GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
+| GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
+
+## MUSA
+
+This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON
+  cmake --build build --config Release
+  ```
+
+The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+## HIP
+
+This provides GPU acceleration on HIP-supported AMD GPUs.
+Make sure to have ROCm installed.
+You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
+
+- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
+  ```bash
+  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      && cmake --build build --config Release -- -j 16
+  ```
+  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
+  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
+  Note that if you get the following error:
+  ```
+  clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+  ```
+  Try searching for a directory under `HIP_PATH` that contains the file
+  `oclc_abi_version_400.bc`. Then, add the following to the start of the
+  command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
+  like:
+  ```bash
+  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
+  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
+      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      && cmake --build build -- -j 16
+  ```
+
+- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
+  ```bash
+  set PATH=%HIP_PATH%\bin;%PATH%
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake --build build
+  ```
+  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
+  Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
+
+
+The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
+
+## Vulkan
+
+**Windows**
+
+### w64devkit
+
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
+```sh
+SDK_VERSION=1.3.283.0
+cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
+cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
+cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
+cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
+Name: Vulkan-Loader
+Description: Vulkan Loader
+Version: $SDK_VERSION
+Libs: -lvulkan-1
+EOF
+
+```
+
+Switch into the `llama.cpp` directory and build using CMake.
+```sh
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+### Git Bash MINGW64
+
+Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
+
+Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`
+
+Download and install [`CMake`](https://cmake.org/download/) with the default settings
+
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
+
+Go into your `llama.cpp` directory and right click, select `Open Git Bash Here` and then run the following commands
+
+```
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+Now you can load the model in conversation mode using `Vulkan`
+
+```sh
+build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
+```
+
+### MSYS2
+Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
+```sh
+pacman -S git \
+    mingw-w64-ucrt-x86_64-gcc \
+    mingw-w64-ucrt-x86_64-cmake \
+    mingw-w64-ucrt-x86_64-vulkan-devel \
+    mingw-w64-ucrt-x86_64-shaderc
+```
+
+Switch into the `llama.cpp` directory and build using CMake.
+```sh
+cmake -B build -DGGML_VULKAN=ON
+cmake --build build --config Release
+```
+
+**With docker**:
+
+You don't need to install Vulkan SDK. It will be installed inside the container.
+
+```sh
+# Build the image
+docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
+
+# Then, use it:
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+
+**Without docker**:
+
+Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
+
+For example, on Ubuntu 22.04 (jammy), use the command below:
+
+```bash
+wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
+wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+apt update -y
+apt-get install -y vulkan-sdk
+# To verify the installation, use the command below:
+vulkaninfo
+```
+
+Alternatively your package manager might be able to provide the appropriate libraries.
+For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
+For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
+
+Then, build llama.cpp using the cmake command below:
+
+```bash
+cmake -B build -DGGML_VULKAN=1
+cmake --build build --config Release
+# Test the output binary (with "-ngl 33" to offload all layers to GPU)
+./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+
+# You should see in the output, ggml_vulkan detected your GPU. For example:
+# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
+```
+
+## CANN
+This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
+
+For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
+
+Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
+
+Go to `llama.cpp` directory and build using CMake.
+```bash
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+cmake --build build --config release
+```
+
+You can test with:
+
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32
+```
+
+If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
+```bash
+llm_load_tensors:       CANN model buffer size = 13313.00 MiB
+llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB
+```
+
+For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
+
+## Android
+
+To read documentation for how to build on Android, [click here](./android.md)
+
+## Notes about GPU-accelerated backends
+
+The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
+
+In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option.
+
+Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building.
diff --git a/docs/cuda-fedora.md b/docs/cuda-fedora.md
new file mode 100644
index 000000000..9c88b7694
--- /dev/null
+++ b/docs/cuda-fedora.md
@@ -0,0 +1,270 @@
+# Setting Up CUDA on Fedora
+
+In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for:
+
+- [Fedora Workstation](https://fedoraproject.org/workstation/)
+- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
+- [Fedora Spins](https://fedoraproject.org/spins)
+- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.5`, `Arch Linux`, and `Ubuntu`.
+
+## Table of Contents
+
+- [Prerequisites](#prerequisites)
+- [Using the Fedora 41 CUDA Repository](#using-the-fedora-41-cuda-repository)
+- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
+- [Installing Essential Development Tools](#installing-essential-development-tools)
+- [Adding the CUDA Repository](#adding-the-cuda-repository)
+- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
+- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
+- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
+- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
+- [Configuring the Environment](#configuring-the-environment)
+- [Verifying the Installation](#verifying-the-installation)
+- [Conclusion](#conclusion)
+- [Troubleshooting](#troubleshooting)
+- [Additional Notes](#additional-notes)
+- [References](#references)
+
+## Prerequisites
+
+- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
+- **NVIDIA Drivers and Graphics Card installed on Host System (recommended)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
+- **Internet connectivity** to download packages.
+
+### Using the Fedora 41 CUDA Repository
+
+The latest release is 41.
+
+- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)
+
+**Note:** We recommend using a toolbox environment to prevent system conflicts.
+
+## Creating a Fedora Toolbox Environment
+
+This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using the Fedora Toolbox allows us to install the necessary packages without affecting the host system.
+
+**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.
+
+1. **Create a Fedora 41 Toolbox:**
+
+   ```bash
+   toolbox create --image registry.fedoraproject.org/fedora-toolbox:41 --container fedora-toolbox-41-cuda
+   ```
+
+2. **Enter the Toolbox:**
+
+   ```bash
+   toolbox enter --container fedora-toolbox-41-cuda
+   ```
+
+   Inside the toolbox, you have root privileges and can install packages without affecting the host system.
+
+## Installing Essential Development Tools
+
+1. **Synchronize the DNF Package Manager:**
+
+   ```bash
+   sudo dnf distro-sync
+   ```
+
+2. **Install the Default Text Editor (Optional):**
+
+   ```bash
+   sudo dnf install vim-default-editor --allowerasing
+   ```
+
+   The `--allowerasing` flag will allow the removal of the conflicting `nano-default-editor` package.
+
+3. **Install Development Tools and Libraries:**
+
+   ```bash
+   sudo dnf install @c-development @development-tools cmake
+   ```
+
+   This installs essential packages for compiling software, including `gcc`, `make`, and other development headers.
+
+## Adding the CUDA Repository
+
+Add the NVIDIA CUDA repository to your DNF configuration:
+
+```bash
+sudo dnf config-manager addrepo --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/cuda-fedora41.repo
+```
+
+After adding the repository, synchronize the package manager again:
+
+```bash
+sudo dnf distro-sync
+```
+
+## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+
+We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
+
+```bash
+ls -la /usr/lib64/libcuda.so.1
+```
+
+**Explanation:**
+
+- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
+  on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
+
+### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
+
+```bash
+sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+```
+
+### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
+
+If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
+
+#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
+
+```bash
+sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
+```
+
+#### 2. Update the RPM database to assume the installation of these packages.
+
+```bash
+sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
+```
+
+**Note:**
+
+- The `--justdb` option only updates the RPM database, without touching the filesystem.
+
+#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+
+After manually installing the dependencies, run:
+
+```bash
+sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+```
+
+You should receive a message indicating the package is already installed:
+
+```
+Updating and loading repositories:
+Repositories loaded.
+Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+
+Nothing to do.
+```
+
+## Installing the CUDA Meta-Package
+
+Now that the driver libraries are installed, proceed to install CUDA:
+
+```bash
+sudo dnf install cuda
+```
+
+This installs the CUDA toolkit and associated packages.
+
+## Configuring the Environment
+
+To use CUDA, add its binary directory to your system's `PATH`.
+
+1. **Create a Profile Script:**
+
+   ```bash
+   sudo sh -c 'echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /etc/profile.d/cuda.sh'
+   ```
+
+   **Explanation:**
+
+   - We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system.
+   - The backslash `\` before `$PATH` ensures the variable is correctly written into the script.
+
+2. **Make the Script Executable:**
+
+   ```bash
+   sudo chmod +x /etc/profile.d/cuda.sh
+   ```
+
+3. **Source the Script to Update Your Environment:**
+
+   ```bash
+   source /etc/profile.d/cuda.sh
+   ```
+
+   **Note:** This command updates your current shell session with the new `PATH`. The `/etc/profile.d/cuda.sh` script ensures that the CUDA binaries are available in your `PATH` for all future sessions.
+
+## Verifying the Installation
+
+To confirm that CUDA is correctly installed and configured, check the version of the NVIDIA CUDA Compiler (`nvcc`):
+
+```bash
+nvcc --version
+```
+
+You should see output similar to:
+
+```
+nvcc: NVIDIA (R) Cuda compiler driver
+Copyright (c) 2005-2025 NVIDIA Corporation
+Built on Wed_Jan_15_19:20:09_PST_2025
+Cuda compilation tools, release 12.8, V12.8.61
+Build cuda_12.8.r12.8/compiler.35404655_0
+```
+
+This output confirms that the CUDA compiler is accessible and indicates the installed version.
+
+## Conclusion
+
+You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 41 CUDA repository. By manually updating the RPM db and configuring the environment, you can develop CUDA applications without affecting your host system.
+
+## Troubleshooting
+
+- **Installation Failures:**
+
+  - If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
+  - You may use the `--excludepath` option with `rpm` to exclude conflicting files during manual RPM installations.
+
+- **Rebooting the Container:**
+
+  - Sometimes there may be a bug in the NVIDIA driver host passthrough (such as missing a shared library). Rebooting the container may solve this issue:
+
+  ```bash
+  # on the host system
+  podman container restart --all
+  ```
+
+- **Environment Variables Not Set:**
+  - If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
+  - Run `echo $PATH` to check if the path is included.
+  - Re-source the profile script or open a new terminal session.
+
+## Additional Notes
+
+- **Updating CUDA in the Future:**
+
+  - Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
+  - When an updated repository becomes available, adjust your `dnf` configuration accordingly.
+
+- **Building `llama.cpp`:**
+
+  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
+  - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.
+
+- **Using the Toolbox Environment:**
+  - The toolbox environment is isolated from your host system, which helps prevent conflicts.
+  - Remember that system files and configurations inside the toolbox are separate from the host. By default the home directory of the user is shared between the host and the toolbox.
+
+---
+
+**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox.
+
+**Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide.
+
+## References
+
+- [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/)
+- [NVIDIA CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
+- [Podman Documentation](https://podman.io/get-started)
+
+---
diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md
new file mode 100644
index 000000000..8fcd70811
--- /dev/null
+++ b/docs/development/HOWTO-add-model.md
@@ -0,0 +1,119 @@
+# Add a new model architecture to `llama.cpp`
+
+Adding a model requires few steps:
+
+1. Convert the model to GGUF
+2. Define the model architecture in `llama.cpp`
+3. Build the GGML graph implementation
+
+After following these steps, you can open PR.
+
+Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
+- [main](/examples/main/)
+- [imatrix](/examples/imatrix/)
+- [quantize](/examples/quantize/)
+- [server](/examples/server/)
+
+### 1. Convert the model to GGUF
+
+This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
+Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
+
+The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
+
+The required steps to implement for an HF model are:
+
+1. Define the model `Model.register` annotation in a new `Model` subclass, example:
+
+```python
+@Model.register("MyModelForCausalLM")
+class MyModel(Model):
+    model_arch = gguf.MODEL_ARCH.MYMODEL
+```
+
+2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
+
+Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
+
+Example for `falcon` model:
+```python
+    MODEL_ARCH.FALCON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ]
+```
+
+3. Map the original tensor names to the standardize equivalent in GGUF
+
+As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
+
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
+
+If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
+
+Example for the normalization tensor in attention layers:
+
+```python
+block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
+        # Attention norm
+        MODEL_TENSOR.ATTN_NORM: (
+            "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
+            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
+            "transformer.blocks.{bid}.norm_1",                      # mpt
+            ...
+        )
+}
+```
+
+`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
+
+Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
+- `Model#set_gguf_parameters`
+- `Model#set_vocab`
+- `Model#write_tensors`
+
+NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
+
+### 2. Define the model architecture in `llama.cpp`
+
+The model params and tensors layout must be defined in `llama.cpp`:
+1. Define a new `llm_arch`
+2. Define the tensors layout in `LLM_TENSOR_NAMES`
+3. Add any non-standard metadata in `llm_load_hparams`
+4. Create the tensors for inference in `llm_load_tensors`
+5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+
+NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
+
+### 3. Build the GGML graph implementation
+
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
+
+Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+
+Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
+
+Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
+
+## GGUF specification
+
+https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+
+## Resources
+
+- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
+- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
diff --git a/docs/development/debugging-tests.md b/docs/development/debugging-tests.md
new file mode 100644
index 000000000..18407f688
--- /dev/null
+++ b/docs/development/debugging-tests.md
@@ -0,0 +1,104 @@
+# Debugging Tests Tips
+
+## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
+
+There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
+
+For example, running the following command will output an interactive list from which you can select a test. It takes this form:
+
+`debug-test.sh [OPTION]... <test_regex> <test_number>`
+
+It will then build & run in the debugger for you.
+
+To just execute a test and get back a PASS or FAIL message run:
+
+```bash
+./scripts/debug-test.sh test-tokenizer
+```
+
+To test in GDB use the `-g` flag to enable gdb test mode.
+
+```bash
+./scripts/debug-test.sh -g test-tokenizer
+
+# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
+>>> b main
+```
+
+To speed up the testing loop, if you know your test number you can just run it similar to below:
+
+```bash
+./scripts/debug-test.sh test 23
+```
+
+For further reference use `debug-test.sh -h` to print help.
+
+&nbsp;
+
+### How does the script work?
+If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
+
+#### Step 1: Reset and Setup folder context
+
+From base of this repository, let's create `build-ci-debug` as our build context.
+
+```bash
+rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+```
+
+#### Step 2: Setup Build Environment and Compile Test Binaries
+
+Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
+
+```bash
+cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
+make -j
+```
+
+#### Step 3: Find all tests available that matches REGEX
+
+The output of this command will give you the command & arguments needed to run GDB.
+
+* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
+* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
+* `-V` : Verbose Mode
+
+```bash
+ctest -R "test-tokenizer" -V -N
+```
+
+This may return output similar to below (focusing on key lines to pay attention to):
+
+```bash
+...
+1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
+1: Working Directory: .
+Labels: main
+  Test  #1: test-tokenizer-0-llama-spm
+...
+4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
+4: Working Directory: .
+Labels: main
+  Test  #4: test-tokenizer-0-falcon
+...
+```
+
+#### Step 4: Identify Test Command for Debugging
+
+So for test #1 above we can tell these two pieces of relevant information:
+* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
+* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
+
+#### Step 5: Run GDB on test command
+
+Based on the ctest 'test command' report above we can then run a gdb session via this command below:
+
+```bash
+gdb --args ${Test Binary} ${Test GGUF Model}
+```
+
+Example:
+
+```bash
+gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
+```
diff --git a/docs/llama-star/idea-arch.key b/docs/development/llama-star/idea-arch.key
similarity index 100%
rename from docs/llama-star/idea-arch.key
rename to docs/development/llama-star/idea-arch.key
diff --git a/docs/llama-star/idea-arch.pdf b/docs/development/llama-star/idea-arch.pdf
similarity index 100%
rename from docs/llama-star/idea-arch.pdf
rename to docs/development/llama-star/idea-arch.pdf
diff --git a/docs/token_generation_performance_tips.md b/docs/development/token_generation_performance_tips.md
similarity index 83%
rename from docs/token_generation_performance_tips.md
rename to docs/development/token_generation_performance_tips.md
index d7e863dff..41b7232c9 100644
--- a/docs/token_generation_performance_tips.md
+++ b/docs/development/token_generation_performance_tips.md
@@ -1,9 +1,9 @@
 # Token generation performance troubleshooting
 
-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with CUDA
+Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
+./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 
 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -27,7 +27,7 @@ RAM: 32GB
 
 Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
 
-Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
 
 Result:
 
diff --git a/docs/docker.md b/docs/docker.md
new file mode 100644
index 000000000..dac9a9ec1
--- /dev/null
+++ b/docs/docker.md
@@ -0,0 +1,123 @@
+# Docker
+
+## Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (ex. /llama/models)
+
+## Images
+We have three Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+
+Additionally, there the following images, similar to the above:
+
+- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
+
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
+
+## Usage
+
+The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
+
+Replace `/path/to/models` below with the actual path where you downloaded the models.
+
+```bash
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+On completion, you are ready to play!
+
+```bash
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with a light image:
+
+```bash
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with a server image:
+
+```bash
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+```
+
+## Docker With CUDA
+
+Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
+
+## Building Docker locally
+
+```bash
+docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
+
+The resulting images, are essentially the same as the non-CUDA images:
+
+1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
+
+## Usage
+
+After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+```
+
+## Docker With MUSA
+
+Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container.
+
+## Building Docker locally
+
+```bash
+docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile .
+docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile .
+docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `MUSA_VERSION` set to `rc3.1.0`
+
+The resulting images, are essentially the same as the non-MUSA images:
+
+1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
+3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+
+## Usage
+
+After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+```
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 000000000..10a568506
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,39 @@
+# Install pre-built version of llama.cpp
+
+## Homebrew
+
+On Mac and Linux, the homebrew package manager can be used via
+
+```sh
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+
+## Nix
+
+On Mac and Linux, the Nix package manager can be used via
+
+```sh
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+
+```sh
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+## Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+
+```sh
+flox install llama-cpp
+```
+
+Flox follows the nixpkgs build of llama.cpp.
diff --git a/docs/llguidance.md b/docs/llguidance.md
new file mode 100644
index 000000000..792d20704
--- /dev/null
+++ b/docs/llguidance.md
@@ -0,0 +1,51 @@
+# LLGuidance Support in llama.cpp
+
+[LLGuidance](https://github.com/guidance-ai/llguidance) is a library for constrained decoding (also called constrained sampling or structured outputs) for Large Language Models (LLMs). Initially developed as the backend for the [Guidance](https://github.com/guidance-ai/guidance) library, it can also be used independently.
+
+LLGuidance supports JSON Schemas and arbitrary context-free grammars (CFGs) written in a [variant](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md) of Lark syntax. It is [very fast](https://github.com/guidance-ai/jsonschemabench/tree/main/maskbench) and has [excellent](https://github.com/guidance-ai/llguidance/blob/main/docs/json_schema.md) JSON Schema coverage but requires the Rust compiler, which complicates the llama.cpp build process.
+
+## Building
+
+To enable LLGuidance support, build llama.cpp with the `LLAMA_LLGUIDANCE` option:
+
+```sh
+cmake -B build -DLLAMA_LLGUIDANCE=ON
+make -C build -j
+```
+
+This requires the Rust compiler and the `cargo` tool to be [installed](https://www.rust-lang.org/tools/install).
+
+## Interface
+
+There are no new command-line arguments or modifications to `common_params`. When enabled, grammars starting with `%llguidance` are passed to LLGuidance instead of the [current](../grammars/README.md) llama.cpp grammars. Additionally, JSON Schema requests (e.g., using the `-j` argument in `llama-cli`) are also passed to LLGuidance.
+
+For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format.
+
+## Performance
+
+Computing a "token mask" (i.e., the set of allowed tokens) for a llama3 tokenizer with 128k tokens takes, on average, 50μs of single-core CPU time for the [JSON Schema Bench](https://github.com/guidance-ai/jsonschemabench). The p99 time is 0.5ms, and the p100 time is 20ms. These results are due to the lexer/parser split and several [optimizations](https://github.com/guidance-ai/llguidance/blob/main/docs/optimizations.md).
+
+## JSON Schema
+
+LLGuidance adheres closely to the JSON Schema specification. For example:
+
+- `additionalProperties` defaults to `true`, unlike current grammars, though you can set `"additionalProperties": false` if needed.
+- any whitespace is allowed.
+- The definition order in the `"properties": {}` object is maintained, regardless of whether properties are required (current grammars always puts required properties first).
+
+Unsupported schemas result in an error message—no keywords are silently ignored.
+
+## Why Not Reuse GBNF Format?
+
+GBNF lacks the concept of a lexer.
+
+Most programming languages, including JSON, use a two-step process: a lexer (built with regular expressions) converts a byte stream into lexemes, which are then processed by a CFG parser. This approach is faster because lexers are cheaper to evaluate, and there is ~10x fewer lexemes than bytes.
+LLM tokens often align with lexemes, so the parser is engaged in under 0.5% of tokens, with the lexer handling the rest.
+
+However, the user has to provide the distinction between lexemes and CFG symbols. In [Lark](https://github.com/lark-parser/lark), lexeme names are uppercase, while CFG symbols are lowercase.
+The [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) can often take care of this automatically.
+See [LLGuidance syntax docs](https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#terminals-vs-rules) for more details.
+
+## Error Handling
+
+Errors are currently printed to `stderr`, and generation continues. Improved error handling may be added in the future.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 653abc73a..66cfab2c3 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,43 +6,68 @@ find_package(Threads REQUIRED)
 
 # ...
 
+# flags
+
+llama_add_compile_flags()
+
 # examples
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(baby-llama)
-    add_subdirectory(batched)
     add_subdirectory(batched-bench)
-    add_subdirectory(beam-search)
-    add_subdirectory(benchmark)
-    add_subdirectory(convert-llama2c-to-ggml)
+    add_subdirectory(batched)
     add_subdirectory(embedding)
-    add_subdirectory(finetune)
+    add_subdirectory(eval-callback)
+
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
+        add_subdirectory(gbnf-validator)
+    endif()
+
+    add_subdirectory(gguf-hash)
+    add_subdirectory(gguf-split)
+    add_subdirectory(gguf)
+    add_subdirectory(gritlm)
+    add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
-    if (LLAMA_SYCL)
-        add_subdirectory(sycl)
-    endif()
-    add_subdirectory(main)
-    add_subdirectory(tokenize)
-    add_subdirectory(parallel)
-    add_subdirectory(perplexity)
-    add_subdirectory(quantize)
-    add_subdirectory(quantize-stats)
-    add_subdirectory(save-load-state)
-    add_subdirectory(simple)
-    add_subdirectory(passkey)
-    add_subdirectory(speculative)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
-    add_subdirectory(gguf)
-    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(imatrix)
+    add_subdirectory(main)
+    add_subdirectory(parallel)
+    add_subdirectory(passkey)
+    add_subdirectory(perplexity)
+    add_subdirectory(quantize)
+    add_subdirectory(retrieval)
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
-    add_subdirectory(export-lora)
+    add_subdirectory(save-load-state)
+    add_subdirectory(run)
+    add_subdirectory(simple)
+    add_subdirectory(simple-chat)
+    add_subdirectory(speculative)
+    add_subdirectory(speculative-simple)
+    add_subdirectory(tokenize)
+    add_subdirectory(tts)
+    add_subdirectory(gen-docs)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        if (NOT WIN32)
+            # disabled on Windows because it uses internal functions not exported with LLAMA_API
+            add_subdirectory(quantize-stats)
+        endif()
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()
diff --git a/examples/Miku.sh b/examples/Miku.sh
index b9174b4e6..0f6c8c878 100755
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
-./main "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
     --in-prefix " " \
     --in-suffix "${AI_NAME}:" \
diff --git a/examples/alpaca.sh b/examples/alpaca.sh
deleted file mode 100755
index 8d2bae691..000000000
--- a/examples/alpaca.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
-       --color \
-       -f ./prompts/alpaca.txt \
-       --ctx_size 2048 \
-       -n -1 \
-       -ins -b 256 \
-       --top_k 10000 \
-       --temp 0.2 \
-       --repeat_penalty 1.1 \
-       -t 7
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
deleted file mode 100644
index 7b70227a5..000000000
--- a/examples/baby-llama/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET baby-llama)
-add_executable(${TARGET} baby-llama.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
deleted file mode 100644
index bf0125e75..000000000
--- a/examples/baby-llama/baby-llama.cpp
+++ /dev/null
@@ -1,1640 +0,0 @@
-#include "ggml.h"
-#include "train.h"
-
-#include <vector>
-#include <cassert>
-#include <cstdlib>
-#include <cstring>
-#include <random>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#ifdef LLAMA_DEFAULT_RMS_EPS
-constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-#else
-constexpr float rms_norm_eps = 5e-6f;
-#endif
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static struct ggml_tensor * randomize_tensor(
-    struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
-) {
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return tensor;
-}
-
-struct llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
-    uint32_t n_embd  = 4096;
-    uint32_t n_mult  = 4;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-
-    bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
-    }
-};
-
-static uint32_t get_n_ff(const struct llama_hparams* hparams) {
-    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
-    return n_ff;
-}
-
-struct llama_hparams_lora {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
-    uint32_t n_embd  = 4096;
-    uint32_t n_mult  = 4;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-    uint32_t n_lora  = 64;
-
-    bool operator!=(const llama_hparams_lora & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
-    }
-};
-
-struct llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
-};
-
-struct llama_layer_lora {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wqa;
-    struct ggml_tensor * wqb;
-    struct ggml_tensor * wka;
-    struct ggml_tensor * wkb;
-    struct ggml_tensor * wva;
-    struct ggml_tensor * wvb;
-    struct ggml_tensor * woa;
-    struct ggml_tensor * wob;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
-};
-
-
-struct llama_kv_cache {
-    struct ggml_context * ctx = NULL;
-
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    // llama_ctx_buffer buf;
-
-    int n; // number of tokens currently in the cache
-};
-
-struct llama_model {
-    struct ggml_context * ctx = NULL;
-
-    llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<llama_layer> layers;
-};
-
-struct llama_model_lora {
-    struct ggml_context * ctx = NULL;
-
-    llama_hparams_lora hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * outputa;
-    struct ggml_tensor * outputb;
-
-    std::vector<llama_layer_lora> layers;
-};
-
-static void init_model(struct llama_model * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-
-    const uint32_t n_ff = get_n_ff(&hparams);
-
-    struct ggml_context * ctx = model->ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
-    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight",         {n_embd, n_vocab});
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        // std::string layers_i = "layers." + std::to_string(i);
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
-
-        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
-        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);       // (layers_i + ".ffn_norm.weight", {n_embd});
-
-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);     // (layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
-    }
-}
-
-
-static void init_model_lora(struct llama_model_lora * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_mult  = hparams.n_mult;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_lora  = hparams.n_lora;
-
-    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
-
-    struct ggml_context * ctx = model->ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
-    model->outputa        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight",         {n_embd, n_vocab});
-    model->outputb        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,  n_lora); // ("output.weight",         {n_embd, n_vocab});
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        // std::string layers_i = "layers." + std::to_string(i);
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
-
-        layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
-        layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
-        layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
-        layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
-        layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
-        layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
-        layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
-        layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);       // (layers_i + ".ffn_norm.weight", {n_embd});
-
-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);     // (layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
-    }
-}
-
-static void set_param_model(struct llama_model * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
-}
-
-static void set_param_model_lora(struct llama_model_lora * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->outputa);
-    ggml_set_param(ctx, model->outputb);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wqa);
-        ggml_set_param(ctx, layer.wqb);
-        ggml_set_param(ctx, layer.wka);
-        ggml_set_param(ctx, layer.wkb);
-        ggml_set_param(ctx, layer.wva);
-        ggml_set_param(ctx, layer.wvb);
-        ggml_set_param(ctx, layer.woa);
-        ggml_set_param(ctx, layer.wob);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.w1);
-        ggml_set_param(ctx, layer.w2);
-        ggml_set_param(ctx, layer.w3);
-    }
-}
-
-static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings , rnd);
-    randomize_tensor_normal(model->norm           , rnd);
-    randomize_tensor_normal(model->output         , rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wq, rnd);
-        randomize_tensor_normal(layer.wk, rnd);
-        randomize_tensor_normal(layer.wv, rnd);
-        randomize_tensor_normal(layer.wo, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.w1, rnd);
-        randomize_tensor_normal(layer.w2, rnd);
-        randomize_tensor_normal(layer.w3, rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-
-static void randomize_model_lora(
-    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
-) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings, rnd);
-    randomize_tensor_normal(model->norm          , rnd);
-    randomize_tensor_normal(model->outputa       , rnd);
-    randomize_tensor_normal(model->outputb       , rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wqa, rnd);
-        randomize_tensor_normal(layer.wqb, rnd);
-        randomize_tensor_normal(layer.wka, rnd);
-        randomize_tensor_normal(layer.wkb, rnd);
-        randomize_tensor_normal(layer.wva, rnd);
-        randomize_tensor_normal(layer.wvb, rnd);
-        randomize_tensor_normal(layer.woa, rnd);
-        randomize_tensor_normal(layer.wob, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.w1, rnd);
-        randomize_tensor_normal(layer.w2, rnd);
-        randomize_tensor_normal(layer.w3, rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_ctx   = hparams.n_ctx;
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = n_layer*n_ctx*n_batch;
-    const int64_t n_elements = n_embd*n_mem;
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-
-    // struct ggml_init_params params;
-    // params.mem_size   = cache.buf.size;
-    // params.mem_buffer = cache.buf.addr;
-    // params.no_alloc   = false;
-    if (!cache->ctx) {
-        struct ggml_init_params params;
-        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
-        params.mem_buffer = NULL;
-        params.no_alloc   = false;
-
-        cache->ctx = ggml_init(params);
-
-        if (!cache->ctx) {
-            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            exit(1);
-        }
-    }
-
-    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-}
-
-static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_ctx   = hparams.n_ctx;
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = n_layer*n_ctx*n_batch;
-    const int64_t n_elements = n_embd*n_mem;
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-
-    // struct ggml_init_params params;
-    // params.mem_size   = cache.buf.size;
-    // params.mem_buffer = cache.buf.addr;
-    // params.no_alloc   = false;
-    if (!cache->ctx) {
-        struct ggml_init_params params;
-        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
-        params.mem_buffer = NULL;
-        params.no_alloc   = false;
-
-        cache->ctx = ggml_init(params);
-
-        if (!cache->ctx) {
-            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            return false;
-        }
-    }
-
-    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-
-    return true;
-}
-
-static struct ggml_tensor * forward(
-    struct llama_model    * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context   * ctx0,
-    struct ggml_cgraph    * gf,
-    struct ggml_tensor    * tokens_input,
-    const  int              n_tokens,
-    const  int              n_past
-) {
-    const int N = n_tokens;
-
-    struct llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
-
-    // inpL shape [n_embd,N,1,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // lctx.use_buf(ctx0, 0);
-
-        // norm
-        {
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [n_embd, N, 1, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
-
-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
-                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
-                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Q shape    [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-            // K shape [n_embd/n_head, n_past + N, n_head, 1]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // split cached V into n_head heads
-            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
-            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(vc),
-                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(vc)*n_embd);
-
-            // KQV shape [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
-                    cur);
-        }
-
-        // lctx.use_buf(ctx0, 1);
-
-        // inpFF shape [n_embd,N,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-            }
-
-            // tmp shape [n_ff,N,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-
-            // SILU activation
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_silu(ctx0, cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-        }
-
-        // cur shape [n_embd,N,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        // inpL shape [n_embd,N,1,1]
-        inpL = cur;
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        //embeddings = inpL;
-    }
-
-    // lm_head
-    // inpL shape [n_vocab,N,1,1]
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-static struct ggml_tensor * forward_batch(
-    struct llama_model    * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context   * ctx0,
-    struct ggml_cgraph    * gf,
-    struct ggml_tensor    * tokens_input,
-    const  int              n_tokens,
-    const  int              n_past,
-    const  int              n_batch
-) {
-    const int N = n_tokens;
-
-    struct llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_vocab = hparams.n_vocab;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-    const int n_ff    = get_n_ff(&hparams);
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
-    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
-
-    // inpL shape [n_embd,N*n_batch,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    assert_shape_2d(inpL, n_embd, N*n_batch);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // lctx.use_buf(ctx0, 0);
-
-        // norm
-        {
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
-            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
-            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
-            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [N, n_embd, n_batch, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_mul_mat(ctx0,
-                                model->layers[il].wv,
-                                cur),
-                        n_embd, N, n_batch),
-                        1, 0, 2, 3));
-
-                assert_shape_3d(Vcur, N, n_embd, n_batch);
-
-                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
-                // k         shape [n_embd * N, n_batch]   == kv_self.k[:,n_past:n_past+N,:,il]
-                // v         shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_2d(ctx0, kc,
-                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
-                        ggml_element_size(kc)*n_embd*n_ctx,
-                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
-                vc = ggml_set_2d(ctx0, vc,
-                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
-                        ggml_element_size(vc)*n_ctx*n_embd,
-                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));
-
-                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
-                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
-            // Q shape    [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
-
-            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_4d(ctx0,
-                            ggml_view_3d(ctx0,
-                                kc,
-                                n_embd,
-                                (n_past + N),
-                                n_batch,
-                                n_embd*ggml_element_size(kc),
-                                n_ctx*n_embd*ggml_element_size(kc),
-                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
-                            n_embd/n_head, n_head, n_past + N, n_batch),
-                        0, 2, 1, 3);
-            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
-            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
-
-            // split cached V into n_head heads
-            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
-            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
-            struct ggml_tensor * V =
-                ggml_view_4d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head, n_batch,
-                        ggml_element_size(vc)*n_ctx,
-                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
-                        ggml_element_size(vc)*n_ctx*n_embd,
-                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
-            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);
-
-            // KQV shape [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // lctx.use_buf(ctx0, 1);
-
-        // inpFF shape [n_embd,N*n_batch,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-        assert_shape_2d(inpFF, n_embd, N*n_batch);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-            }
-
-            // tmp shape [n_ff,N*n_batch,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-            assert_shape_2d(tmp, n_ff, N*n_batch);
-
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // SILU activation
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_silu(ctx0, cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // cur shape [n_embd,N*n_batch,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-        assert_shape_2d(cur, n_embd, N*n_batch);
-
-        // input for next layer
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = cur;
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        //embeddings = inpL;
-    }
-
-    // lm_head
-    // inpL shape [n_vocab,N*n_batch,1,1]
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
-    assert_shape_2d(inpL, n_vocab, N*n_batch);
-
-    {
-        // inpL shape [n_vocab,N,n_batch,1]
-        inpL = ggml_reshape_3d(ctx0,
-                        inpL,
-                        n_vocab, N, n_batch);
-        assert_shape_3d(inpL, n_vocab, N, n_batch);
-    }
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-static struct ggml_tensor * forward_lora(
-    struct llama_model_lora * model,
-    struct llama_kv_cache   * cache,
-    struct ggml_context     * ctx0,
-    struct ggml_cgraph      * gf,
-    struct ggml_tensor      * tokens_input,
-    const  int                n_tokens,
-    const  int                n_past
-) {
-    const int N = n_tokens;
-
-    struct llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-
-    const int n_ctx   = hparams.n_ctx;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
-
-    // inpL shape [n_embd,N,1,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0,
-                                            ggml_reshape_3d(ctx0,
-                                                ggml_mul_mat(ctx0,
-                                                    model->layers[il].wqa,
-                                                    ggml_mul_mat(ctx0,
-                                                        model->layers[il].wqb,
-                                                        cur)),
-                                                n_embd/n_head, n_head, N),
-                                            KQ_pos, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0,
-                                            ggml_reshape_3d(ctx0,
-                                                ggml_mul_mat(ctx0,
-                                                    model->layers[il].wka,
-                                                    ggml_mul_mat(ctx0,
-                                                        model->layers[il].wkb,
-                                                        cur)),
-                                                n_embd/n_head, n_head, N),
-                                            KQ_pos, n_rot, 0, 0);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [n_embd, N, 1, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0,
-                                                ggml_transpose(ctx0,
-                                                    ggml_reshape_2d(ctx0,
-                                                        ggml_mul_mat(ctx0,
-                                                            model->layers[il].wva,
-                                                            ggml_mul_mat(ctx0,
-                                                                model->layers[il].wvb,
-                                                                cur)),
-                                                        n_embd, N)));
-
-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
-                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
-                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Q shape    [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-            // K shape [n_embd/n_head, n_past + N, n_head, 1]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // split cached V into n_head heads
-            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
-            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(vc),
-                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(vc)*n_embd);
-
-            // KQV shape [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].woa,
-                    ggml_mul_mat(ctx0,
-                        model->layers[il].wob,
-                        cur));
-        }
-
-        // inpFF shape [n_embd,N,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-            }
-
-            // tmp shape [n_ff,N,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-
-            // SILU activation
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_silu(ctx0, cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-        }
-
-        // cur shape [n_embd,N,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        // inpL shape [n_embd,N,1,1]
-        inpL = cur;
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        //embeddings = inpL;
-    }
-
-
-    // lm_head
-    // inpL shape [n_vocab,N,1,1]
-    inpL = ggml_mul_mat(ctx0,
-                model->outputa,
-                    ggml_mul_mat(ctx0,
-                        model->outputb,
-                        inpL));
-
-    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(ggml_is_matrix(logits));
-    assert(ggml_is_matrix(probs));
-    assert(ggml_is_vector(best_samples));
-    assert(logits->ne[1] == best_samples->ne[0]);
-    assert(logits->ne[0] == probs->ne[0]);
-    assert(logits->ne[1] == probs->ne[1]);
-    for (int i = 0; i < logits->ne[1]; ++i) {
-        float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
-        ggml_set_i32_1d(best_samples, i, 0);
-        for (int k = 0; k < logits->ne[0]; ++k) {
-            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
-            if (logit > max_logit) {
-                max_logit = logit;
-                ggml_set_i32_1d(best_samples, i, k);
-            }
-        }
-        float psum = 0;
-        for (int k = 0; k < logits->ne[0]; ++k) {
-            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
-            float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
-            psum += p;
-            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
-        }
-        for (int k = 0; k < logits->ne[0]; ++k) {
-            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
-            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
-        }
-    }
-}
-
-static void sample_softmax_batch(
-    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
-    struct ggml_tensor * best_samples
-) {
-    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(ggml_is_3d(probs));
-    int n_tokens = best_samples->ne[0];
-    int n_batch  = best_samples->ne[1];
-    int n_vocab  = logits->ne[0];
-    GGML_ASSERT(n_tokens == logits->ne[1]);
-    GGML_ASSERT(n_batch  == logits->ne[2]);
-    GGML_ASSERT(n_vocab  == probs->ne[0]);
-    GGML_ASSERT(n_tokens == probs->ne[1]);
-    GGML_ASSERT(n_batch  == probs->ne[2]);
-
-    for (int k = 0; k < n_batch; ++k) {
-        struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
-                                                best_samples,
-                                                best_samples->ne[0],
-                                                k*best_samples->nb[1]);
-        struct ggml_tensor * logits_k       = ggml_view_2d(ctx,
-                                                logits,
-                                                logits->ne[0],
-                                                logits->ne[1],
-                                                logits->nb[1],
-                                                k*logits->nb[2]);
-        struct ggml_tensor * probs_k        = ggml_view_2d(ctx,
-                                                probs,
-                                                probs->ne[0],
-                                                probs->ne[1],
-                                                probs->nb[1],
-                                                k*probs->nb[2]);
-        sample_softmax(logits_k, probs_k, best_samples_k);
-    }
-}
-
-static void print_row(struct ggml_tensor * probs, int i) {
-    for (int k = 0; k < probs->ne[0]; ++k) {
-        float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
-        printf(" %.2f", p);
-    }
-    printf("\n");
-}
-
-static void print_matrix(struct ggml_tensor * probs) {
-    assert(ggml_is_matrix(probs));
-    for (int i = 0; i < probs->ne[1]; ++i) {
-        for (int k = 0; k < probs->ne[0]; ++k) {
-            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
-            printf(" %.2f", p);
-        }
-        printf("\n");
-    }
-}
-
-static void print_token(int token, int n_vocab) {
-    for (int k = 0; k < token; ++k) {
-        printf(" ");
-    }
-    printf("X");
-    for (int k = token+1; k < n_vocab; ++k) {
-        printf(" ");
-    }
-    printf("\n");
-}
-
-static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
-    for (int i=0; i<tokens->ne[0]; ++i) {
-        int token = ggml_get_i32_1d(tokens, i);
-        print_token(token, n_vocab);
-    }
-}
-
-static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
-    int n_tokens = tokens_input->ne[0];
-    int n_vocab = targets->ne[0];
-    float randomness = 0.0f;
-    // ggml_set_zero(targets);
-    ggml_set_f32(targets, -1.0f);
-    ggml_set_i32_1d(tokens_input, 0, 0);
-    for (int i=1; i<n_tokens+1; ++i) {
-        float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
-        float y = sinf(x);//*cosf(x*1.1f+1.0f);
-        float z = (y+1.0f)*0.5f; // scale to [0..1]
-        z += (frand()-0.5f)*(randomness/n_vocab);
-        z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
-        int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
-        ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
-        if (i<n_tokens) {
-            ggml_set_i32_1d(tokens_input, i, token);
-        }
-    }
-}
-
-static void get_example_targets_batch(
-    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
-) {
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(targets));
-    int n_tokens = tokens_input->ne[0];
-    int n_batch  = tokens_input->ne[1];
-    GGML_ASSERT(n_tokens == targets->ne[1]);
-    GGML_ASSERT(n_batch  == targets->ne[2]);
-
-    for (int k=0; k<n_batch; ++k) {
-        struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
-                                                tokens_input,
-                                                tokens_input->ne[0],
-                                                k*tokens_input->nb[1]);
-        struct ggml_tensor * targets_k    = ggml_view_2d(ctx,
-                                                targets,
-                                                targets->ne[0],
-                                                targets->ne[1],
-                                                targets->nb[1],
-                                                k*targets->nb[2]);
-        get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
-    }
-}
-
-static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
-    int n_tokens = tokens_input->ne[0];
-    int n_vocab = targets->ne[0];
-    for (int i=0; i<n_tokens-n_shift; ++i) {
-        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
-        for (int k=0; k<n_vocab; ++k) {
-            ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
-        }
-    }
-}
-
-static struct ggml_tensor * square_error_loss(
-    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
-) {
-    // todo: instead of a-b: a[1:]-b[:-1]
-    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
-}
-
-static struct ggml_tensor * cross_entropy_loss(
-    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
-) {
-    const float eps = 1e-3f;
-    return
-        ggml_sum(ctx,
-            ggml_neg(ctx,
-                ggml_sum_rows(ctx,
-                    ggml_mul(ctx,
-                        ggml_soft_max(ctx, a),
-                        ggml_log(ctx,
-                            ggml_add1(ctx,
-                                ggml_soft_max(ctx, b),
-                                ggml_new_f32(ctx, eps)))))));
-}
-
-int main(int argc, char ** argv) {
-    if (argc < 1) {
-        fprintf(stderr, "usage: %s\n", argv[0]);
-
-        return 1;
-    }
-
-    struct ggml_init_params lcparams;
-    lcparams.mem_size   = 1024ll*1024ll*1024ll;
-    lcparams.mem_buffer = NULL;
-    lcparams.no_alloc   = false;
-
-    struct llama_model model;
-    model.hparams.n_vocab = 8;
-    model.hparams.n_ctx   = 8;
-    model.hparams.n_embd  = 32;
-    model.hparams.n_mult  = 2;
-    model.hparams.n_head  = 8;
-    model.hparams.n_layer = 1;
-    model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
-
-    // model.hparams.n_embd  = 32;
-    // model.hparams.n_mult  = 2;
-    // model.hparams.n_head  = 4;
-    // model.hparams.n_layer = 8;
-    // model.hparams.n_rot   = 8;
-
-    model.ctx = ggml_init(lcparams);
-    printf("init model\n");
-    init_model(&model);
-    set_param_model(&model);
-
-    randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
-
-/*
-    struct llama_model_lora model_lora;
-    // model.hparams.n_vocab = 6;
-    // model.hparams.n_ctx   = 64;
-    // model.hparams.n_embd  = 128;
-    // model.hparams.n_mult  = 2;
-    // model.hparams.n_head  = 8;
-    // model.hparams.n_layer = 6;
-    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
-
-    model_lora.hparams.n_vocab = 16;
-    model_lora.hparams.n_ctx   = 32;
-    model_lora.hparams.n_embd  = 256;
-    model_lora.hparams.n_mult  = 2;
-    model_lora.hparams.n_head  = 16;
-    model_lora.hparams.n_layer = 1;
-    model_lora.hparams.n_lora  = 64;
-    model_lora.hparams.n_rot   = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
-    // model.hparams.n_rot   = (model.hparams.n_embd / model.hparams.n_head) / 2;
-
-    // model.hparams.n_embd  = 32;
-    // model.hparams.n_mult  = 2;
-    // model.hparams.n_head  = 4;
-    // model.hparams.n_layer = 8;
-    // model.hparams.n_rot   = 8;
-
-    model_lora.ctx = ggml_init(lcparams);
-    printf("init model_lora\n");
-    init_model_lora(&model_lora);
-    set_param_model_lora(&model_lora);
-
-    randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
-*/
-    int n_batch = 8;
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
-    printf("init_kv_cache\n");
-    kv_self.ctx = model.ctx;
-    init_kv_cache(&kv_self, &model, n_batch);
-    //init_kv_cache_lora(&kv_self, &model_lora);
-
-    size_t    compute_size = 1024ll*1024ll*1024ll;
-    uint8_t * compute_addr = new uint8_t[compute_size];
-
-    int n_examples = 256;
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-
-    std::vector<uint8_t> work_buffer;
-
-    for (int ex=0; ex<n_examples; ++ex) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ compute_size,
-            /*.mem_buffer =*/ compute_addr,
-            /*.no_alloc   =*/ false,
-        };
-
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        struct ggml_tensor * after_opt_best_samples  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        struct ggml_tensor * after_opt_probs         = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-        struct ggml_tensor * tokens_input            = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        struct ggml_tensor * targets                 = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
-
-        int n_past = 0;
-
-        struct ggml_cgraph * gf = NULL;
-        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
-
-        get_example_targets_batch(ctx0, 64*ex+0,  tokens_input, targets);
-
-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
-        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
-        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
-
-        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
-
-        float error_before_opt = ggml_get_f32_1d(e, 0);
-
-        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
-        opt_params_lbfgs.print_forward_graph = false;
-        opt_params_lbfgs.print_backward_graph = false;
-        opt_params_lbfgs.lbfgs.n_iter = 16;
-        ggml_opt(ctx0, opt_params_lbfgs, e);
-        //
-        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
-
-        float error_after_opt = ggml_get_f32_1d(e, 0);
-
-        if (ex % 8 == 0) {
-            printf("Example %d\n", (ex+1));
-            printf("error_before_opt: %.2f\n", error_before_opt);
-            printf("error_after_opt:  %.2f\n", error_after_opt);
-        }
-
-        if (ex % 64 == 0) {
-            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
-            // printf("probabilities after optimization:\n");
-            // print_matrix(after_opt_probs);
-            printf("best samples after optimization:\n");
-            print_tokens(after_opt_best_samples, n_vocab);
-        }
-
-        ggml_free(ctx0);
-    }
-
-    {
-        int n_gen = 128;
-        int sample_ctx = n_tokens-n_tokens/8;
-
-        printf("Generating %d tokens.\n", n_gen);
-
-        struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
-        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
-
-        get_example_targets(137, tokens_input, targets);
-        for (int i=sample_ctx; i<n_tokens; ++i) {
-            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
-        }
-
-        for (int i=0; i<sample_ctx-1; ++i) {
-            print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
-        }
-        printf("---\n");
-        for (int i=0; i<n_gen; ++i) {
-            struct ggml_init_params params = {
-                /*.mem_size   =*/ compute_size,
-                /*.mem_buffer =*/ compute_addr,
-                /*.no_alloc   =*/ false,
-            };
-            struct ggml_context * ctx0 = ggml_init(params);
-
-            struct ggml_cgraph * gf = NULL;
-            gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
-
-            int n_past = 0;
-            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
-
-            ggml_build_forward_expand(gf, logits);
-            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
-
-            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
-            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
-
-            sample_softmax(logits, probs, best_samples);
-
-            // int sample_at = n_tokens-1;
-            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
-
-            // print_row(probs, sample_at);
-            print_token(token, n_vocab);
-
-            lshift_examples(tokens_input, targets, 1);
-            ggml_set_i32_1d(tokens_input, 0, 0);
-            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
-
-            ggml_free(ctx0);
-        }
-    }
-
-    print_matrix(model.tok_embeddings);
-    printf("done\n");
-
-    // ggml_free(kv_self.ctx);
-    // ggml_free(model_lora.ctx);
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/examples/base-translate.sh b/examples/base-translate.sh
deleted file mode 100755
index 00dedd0df..000000000
--- a/examples/base-translate.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt
index 40a032c51..68ad707f3 100644
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET batched-bench)
+set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
index 34b343f66..df67c47e3 100644
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
 
 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
 
 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
 
 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
 
 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
 ```
 
 ## Sample results
@@ -49,3 +49,12 @@ There are 2 modes of operation:
 |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
 |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
 |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
+
+```json lines
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
+```
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 19aff18ae..0659ab6f1 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,79 +1,33 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
-        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
-        return 1 ;
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
+        return 1;
     }
 
-    int n_kv_max     = 2048;
-    int is_pp_shared = 0;
-    int n_gpu_layers = 0;
+    common_init();
 
-    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
-    std::vector<int> n_tg = { 128, 256, };
-    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
-    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
+    int is_pp_shared = params.is_pp_shared;
 
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_kv_max = std::atoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        is_pp_shared = std::atoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        n_gpu_layers = std::atoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        n_pp = parse_list(argv[5]);
-    }
-
-    if (argc >= 7) {
-        n_tg = parse_list(argv[6]);
-    }
-
-    if (argc >= 8) {
-        n_pl = parse_list(argv[7]);
-    }
+    std::vector<int> n_pp = params.n_pp;
+    std::vector<int> n_tg = params.n_tg;
+    std::vector<int> n_pl = params.n_pl;
 
     // init LLM
 
@@ -82,36 +36,29 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
+    llama_model_params model_params = common_model_params_to_llama(params);
 
-    const std::vector<float> t_split(llama_max_devices(), 0.0f);
-
-    model_params.n_gpu_layers = n_gpu_layers;
-    model_params.tensor_split = t_split.data();
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
-    ctx_params.seed      = 1234;
-    ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = 512;
+    // ensure enough sequences are available
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
+    const int32_t n_kv_max = llama_n_ctx(ctx);
+
     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
 
     // decode in batches of ctx_params.n_batch tokens
@@ -127,14 +74,15 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                 return false;
             }
+
+            llama_synchronize(ctx);
         }
 
         return true;
@@ -143,21 +91,22 @@ int main(int argc, char ** argv) {
     // warm up
     {
         for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
+            common_batch_add(batch, 0, i, { 0 }, false);
         }
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
     }
 
-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    if (!params.batched_bench_output_jsonl) {
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }
 
     for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
         for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -172,12 +121,12 @@ int main(int argc, char ** argv) {
                     continue;
                 }
 
-                llama_batch_clear(batch);
+                common_batch_clear(batch);
 
-                const int n_tokens = is_pp_shared ? pp : pl*pp;
-
-                for (int i = 0; i < n_tokens; ++i) {
-                    llama_batch_add(batch, 0, i, { 0 }, false);
+                for (int i = 0; i < pp; ++i) {
+                    for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
+                        common_batch_add(batch, 0, i, { j }, false);
+                    }
                 }
                 batch.logits[batch.n_tokens - 1] = true;
 
@@ -186,13 +135,13 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_clear(ctx);
 
                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }
 
                 if (is_pp_shared) {
                     for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+                        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
                     }
                 }
 
@@ -201,14 +150,14 @@ int main(int argc, char ** argv) {
                 const auto t_tg_start = ggml_time_us();
 
                 for (int i = 0; i < tg; ++i) {
-                    llama_batch_clear(batch);
+                    common_batch_clear(batch);
 
                     for (int j = 0; j < pl; ++j) {
-                        llama_batch_add(batch, 0, pp + i, { j }, true);
+                        common_batch_add(batch, 0, pp + i, { j }, true);
                     }
 
                     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_TEE("%s: llama_decode() failed\n", __func__);
+                        LOG_ERR("%s: llama_decode() failed\n", __func__);
                         return 1;
                     }
                 }
@@ -225,21 +174,31 @@ int main(int argc, char ** argv) {
                 const float speed_tg = pl*tg / t_tg;
                 const float speed    = n_kv / t;
 
-                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                if(params.batched_bench_output_jsonl) {
+                    LOG(
+                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                    );
+                } else {
+                    LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                }
             }
         }
     }
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile
index 2afb24fb8..1f9156e58 100755
--- a/examples/batched.swift/Makefile
+++ b/examples/batched.swift/Makefile
@@ -1,6 +1,6 @@
 .PHONY: build
 
 build:
-	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
-	rm -f ./batched_swift
-	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
+	xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
+	rm -f ./llama-batched-swift
+	ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift
index 826491def..7e8afd084 100644
--- a/examples/batched.swift/Package.swift
+++ b/examples/batched.swift/Package.swift
@@ -4,7 +4,7 @@
 import PackageDescription
 
 let package = Package(
-    name: "batched_swift",
+    name: "llama-batched-swift",
     platforms: [.macOS(.v12)],
     dependencies: [
         .package(name: "llama", path: "../../"),
@@ -13,7 +13,7 @@ let package = Package(
         // Targets are the basic building blocks of a package, defining a module or a test suite.
         // Targets can depend on other targets in this package and products from dependencies.
         .executableTarget(
-            name: "batched_swift",
+            name: "llama-batched-swift",
             dependencies: ["llama"],
             path: "Sources",
             linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md
index 4c2721fe8..7f2e2fcdc 100644
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.
 
 $ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index d75c503d5..55c31166c 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -23,13 +23,17 @@ defer {
 }
 
 let model_params = llama_model_default_params()
-guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
+guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else {
     print("Failed to load model")
     exit(1)
 }
-
 defer {
-    llama_free_model(model)
+    llama_model_free(model)
+}
+
+guard let vocab = llama_model_get_vocab(model) else {
+    print("Failed to get vocab")
+    exit(1)
 }
 
 var tokens = tokenize(text: prompt, add_bos: true)
@@ -37,22 +41,36 @@ var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
 
 var context_params = llama_context_default_params()
-context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
 
-let context = llama_new_context_with_model(model, context_params)
+let context = llama_init_from_model(model, context_params)
 guard context != nil else {
     print("Failed to initialize context")
     exit(1)
 }
-
 defer {
     llama_free(context)
 }
 
+var sparams = llama_sampler_chain_default_params()
+
+let smpl = llama_sampler_chain_init(sparams)
+guard smpl != nil else {
+    print("Failed to initialize sampling")
+    exit(1)
+}
+defer {
+    llama_sampler_free(smpl)
+}
+
+llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
+llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
+
 let n_ctx = llama_n_ctx(context)
 
 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -125,35 +143,10 @@ while n_cur <= n_len {
             continue
         }
 
-        var n_vocab = llama_n_vocab(model)
-        var logits = llama_get_logits_ith(context, i_batch[i])
-
-        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
-
-        for token_id in 0 ..< n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-
-        var candidates_p: llama_token_data_array = .init(
-            data: &candidates,
-            size: candidates.count,
-            sorted: false
-        )
-
-        let top_k: Int32 = 40
-        let top_p: Float = 0.9
-        let temp: Float = 0.4
-
-        llama_sample_top_k(context, &candidates_p, top_k, 1)
-        llama_sample_top_p(context, &candidates_p, top_p, 1)
-        llama_sample_temp(context, &candidates_p, temp)
-
-        let new_token_id = llama_sample_token(context, &candidates_p)
-
-        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
         // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
             i_batch[i] = -1
             // print("")
             if n_parallel > 1 {
@@ -210,15 +203,16 @@ if n_parallel > 1 {
 
 let t_main_end = ggml_time_us()
 
-print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
 
-llama_print_timings(context)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
     let n_tokens = utf8Count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])
@@ -229,15 +223,17 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
         let check = llama_token_to_piece(
-            model,
+            vocab,
             token,
             &result,
-            Int32(result.count)
+            Int32(result.count),
+            0,
+            false
         )
         assert(check == actualTokensCount)
     } else {
diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt
index 6aa178d4d..0d439f498 100644
--- a/examples/batched/CMakeLists.txt
+++ b/examples/batched/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET batched)
+set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/batched/README.md b/examples/batched/README.md
index 5d7303317..6013aab01 100644
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt
 
 ```bash
-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
 
 ...
 
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 9be7eb56b..21b95ef5e 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,52 +1,36 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-int main(int argc, char ** argv) {
-    gpt_params params;
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
+}
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
-        return 1 ;
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.prompt = "Hello my name is";
+    params.n_predict = 32;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
     }
 
+    common_init();
+
     // number of parallel batches
-    int n_parallel = 1;
+    int n_parallel = params.n_parallel;
 
     // total length of the sequences including the prompt
-    int n_len = 32;
-
-    // number of layers to offload to the GPU
-    int n_gpu_layers = 0;
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        params.prompt = argv[2];
-    }
-
-    if (argc >= 4) {
-        n_parallel = std::atoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        n_len = std::atoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        n_gpu_layers = std::atoi(argv[5]);
-    }
-
-    if (params.prompt.empty()) {
-        params.prompt = "Hello my name is";
-    }
+    int n_predict = params.n_predict;
 
     // init LLM
 
@@ -55,88 +39,113 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
+    llama_model_params model_params = common_model_params_to_llama(params);
 
-    model_params.n_gpu_layers = n_gpu_layers;
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = common_tokenize(vocab, params.prompt, true);
 
-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = n_kv_req;
-    ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.n_ctx   = n_kv_req;
+    ctx_params.n_batch = std::max(n_predict, n_parallel);
 
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
 
     if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
-    const int n_ctx    = llama_n_ctx(ctx);
+    const int n_ctx = llama_n_ctx(ctx);
 
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_ERR("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }
 
     // print the prompt token-by-token
 
-    fprintf(stderr, "\n");
+    LOG("\n");
 
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
-    fflush(stderr);
-
     // create a llama_batch
     // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
+
+    std::vector<llama_seq_id> seq_ids(n_parallel, 0);
+    for (int32_t i = 0; i < n_parallel; ++i) {
+        seq_ids[i] = i;
+    }
 
     // evaluate the initial prompt
     for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+        common_batch_add(batch, tokens_list[i], i, seq_ids, false);
     }
     GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
 
+    if (llama_model_has_encoder(model)) {
+        if (llama_encode(ctx, batch)) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+            decoder_start_token_id = llama_vocab_bos(vocab);
+        }
+
+        common_batch_clear(batch);
+        common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+    }
+
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
     if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
-    // assign the system KV cache to all parallel sequences
-    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-    for (int32_t i = 1; i < n_parallel; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
-    }
+    //// assign the system KV cache to all parallel sequences
+    //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+    //for (int32_t i = 1; i < n_parallel; ++i) {
+    //    llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+    //}
 
     if (n_parallel > 1) {
-        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
     }
 
     // main loop
@@ -153,9 +162,9 @@ int main(int argc, char ** argv) {
 
     const auto t_main_start = ggml_time_us();
 
-    while (n_cur <= n_len) {
+    while (n_cur <= n_predict) {
         // prepare the next batch
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // sample the next token for each parallel sequence / stream
         for (int32_t i = 0; i < n_parallel; ++i) {
@@ -164,36 +173,14 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            const int   top_k = 40;
-            const float top_p = 0.9f;
-            const float temp  = 0.4f;
-
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
-
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation? -> mark the stream as finished
+            if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
-                LOG_TEE("\n");
+                LOG("\n");
                 if (n_parallel > 1) {
-                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }
 
                 continue;
@@ -201,16 +188,15 @@ int main(int argc, char ** argv) {
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-                fflush(stdout);
+                LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
             }
 
-            streams[i] += llama_token_to_piece(ctx, new_token_id);
+            streams[i] += common_token_to_piece(ctx, new_token_id);
 
             i_batch[i] = batch.n_tokens;
 
             // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+            common_batch_add(batch, new_token_id, n_cur, { i }, true);
 
             n_decode += 1;
         }
@@ -224,34 +210,35 @@ int main(int argc, char ** argv) {
 
         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
     }
 
-    LOG_TEE("\n");
-
     if (n_parallel > 1) {
-        LOG_TEE("\n");
+        LOG("\n");
 
         for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
         }
     }
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
     llama_batch_free(batch);
 
+    llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     llama_backend_free();
 
diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt
deleted file mode 100644
index f0e37468b..000000000
--- a/examples/beam-search/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
deleted file mode 100644
index 866c6d7a6..000000000
--- a/examples/beam-search/beam-search.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
-    llama_context * ctx;
-    llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
-    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
-    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
-    }
-    return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
-    llama_context * ctx;
-    std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-//  * Show progress by printing ',' following by number of convergent beam tokens if any.
-//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-//    This is also called when the stop condition is met.
-//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
-    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
-    // Mark beams as EOS as needed.
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        llama_beam_view& beam_view = beams_state.beam_views[i];
-        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
-            beam_view.eob = true;
-        }
-    }
-    printf(",");  // Show progress
-    if (const size_t n = beams_state.common_prefix_length) {
-        callback_data.response.resize(callback_data.response.size() + n);
-        assert(0u < beams_state.n_beams);
-        const llama_token * tokens = beams_state.beam_views[0].tokens;
-        std::copy(tokens, tokens + n, callback_data.response.end() - n);
-        printf("%zu", n);
-    }
-    fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
-    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
-    }
-#endif
-}
-
-int main(int argc, char ** argv)
-{
-    gpt_params params;
-    //params.n_gpu_layers = 200;
-
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc < 2 || argv[1][0] == '-' )
-    {
-        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
-        return 1 ;
-    }
-
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    params.model = argv[1];
-
-    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
-
-    if ( argc > 3 )
-    {
-        params.prompt = argv[3];
-    }
-
-    if ( params.prompt.empty() )
-    {
-        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
-    }
-
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
-        return 1;
-    }
-
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
-
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
-    const size_t max_context_size     = llama_n_ctx( ctx );
-    const size_t max_tokens_list_size = max_context_size - 4 ;
-
-    if (tokens_list.size() > max_tokens_list_size)
-    {
-        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
-             __func__ , tokens_list.size() , max_tokens_list_size );
-        return 1;
-    }
-
-    fprintf( stderr, "\n\n" );
-
-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        std::cout << llama_token_to_piece(ctx, id);
-    }
-    std::cout << std::flush;
-
-    int n_past = 0;
-
-    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
-    {
-        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
-        return 1;
-    }
-    n_past += tokens_list.size();
-
-    beam_search_callback_data callback_data{ctx, {}};
-    size_t const beam_width = static_cast<size_t>(params.n_beams);
-    int const n_predict = 256;
-    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
-    std::cout << "\n\n";
-    for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_piece(ctx,token_id);
-    }
-    std::cout << std::endl;
-
-    llama_free( ctx );
-    llama_free_model( model );
-
-    llama_backend_free();
-
-    return 0;
-}
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
deleted file mode 100644
index 2bb47bab5..000000000
--- a/examples/benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-set(TARGET benchmark)
-add_executable(${TARGET} benchmark-matmult.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
deleted file mode 100644
index e89f3de2f..000000000
--- a/examples/benchmark/benchmark-matmult.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-#include "common.h"
-#include "ggml.h"
-
-#include <locale.h>
-#include <assert.h>
-#include <math.h>
-#include <cstring>
-#include <cstdio>
-#include <cinttypes>
-#include <unordered_map>
-#include <queue>
-#include <string.h>
-#include <cassert>
-#include <fstream>
-#include <string>
-#include <iterator>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static float tensor_sum_elements(const ggml_tensor * tensor) {
-    double sum = 0;
-    if (tensor->type == GGML_TYPE_F32) {
-        for (int j = 0; j < tensor->ne[1]; j++) {
-            for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
-            }
-        }
-    }
-    return sum;
-}
-
-static void tensor_dump(const ggml_tensor * tensor, const char * name) {
-    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
-        tensor->type, ggml_type_name(tensor->type),
-        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
-    float sum = tensor_sum_elements(tensor);
-    printf("Sum of tensor %s is %6.2f\n", name, sum);
-}
-
-#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
-
-struct benchmark_params_struct {
-    int32_t n_threads     = 1;
-    int32_t n_iterations  = 10;
-};
-
-static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
-    fprintf(stderr, "\n");
-}
-
-int main(int argc, char ** argv)  {
-    struct benchmark_params_struct benchmark_params;
-
-    bool invalid_param = false;
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            benchmark_params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-i" || arg == "--iter") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            benchmark_params.n_iterations = std::stoi(argv[i]);
-        }  else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv, benchmark_params);
-            exit(0);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        print_usage(argc, argv, benchmark_params);
-        exit(1);
-    }
-
-    print_build_info();
-    printf("Starting Test\n");
-
-    // create the ggml context
-    struct ggml_context * ctx;
-    //const int sizex = 4096;
-    //const int sizey = 11008;
-
-#undef VERBOSE_DEBUGGING
-#ifndef VERBOSE_DEBUGGING
-    const int sizey = 4096;
-    const int sizex = 11008;
-    const int sizez = 128;
-#else
-    /* Working - let's increase size */
-    const int sizey = 1;
-    const int sizex = (8*32);
-    const int sizez = 1;
-
-    /*const int sizey = 1;
-    const int sizex = 3*(8*32);
-    const int sizez = 1;*/
-#endif
-
-    //printf("Memsize required = %i\n", sizex*sizex);
-
-    // TODO: perform the bench for all types or for a user specified type
-    const ggml_type qtype = GGML_TYPE_Q4_1;
-
-    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += 1024*1024*16;
-
-    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /* no_alloc   =*/ 0
-    };
-
-    ctx = ggml_init(params);
-    if (!ctx) {
-        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return 1;
-    }
-
-
-    printf("Creating new tensors\n");
-    // printf("Creating new tensor m1\n");
-    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_f32(m11, 1.0f);
-
-    // printf("Creating new tensor m1\n");
-    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_f32(m12, 1.5f);
-
-    // printf("Creating new tensor m2\n");
-    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
-    ggml_set_f32(m2, 2.0f);
-
-    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
-    // printf("Creating new tensor m11xm2\n");
-    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
-
-    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, m11xm2);
-
-    printf("n_threads=%i\n", benchmark_params.n_threads);
-
-    TENSOR_DUMP(m11);
-    TENSOR_DUMP(m2);
-
-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
-
-    TENSOR_DUMP(gf->nodes[0]);
-
-    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
-
-    int32_t nelements = sizex*sizey;
-
-    std::vector<int64_t> hist_cur(1 << 4, 0);
-
-    // Set up a the benchmark matrices
-    // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
-
-    // Set up a the compute graph
-    // printf("Creating new tensor q31\n");
-    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
-
-    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf31, q31);
-
-    // Set up a second graph computation to make sure we override the CPU cache lines
-    // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
-
-    // printf("Creating new tensor q32\n");
-    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
-
-    //printf("Creating compute graph\n");
-    struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf32, q32);
-    printf("n_threads=%i\n", benchmark_params.n_threads);
-
-    const int dimx = sizex;
-    const int dimy = sizey;
-    const int dimz = sizez;
-    long long int flops_per_dot_product = dimy + dimy;
-    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
-
-
-    // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
-
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
-    printf("=====================================================================================\n");
-
-    double  gflops_sum = 0;
-    for (int i=0;i<benchmark_params.n_iterations ;i++) {
-
-        long long int start = ggml_time_us();
-        //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
-
-        long long int stop = ggml_time_us();
-        long long int usec = stop-start;
-        double gflops = (double)(flops_per_matrix)/usec/1000.0;
-        gflops_sum += gflops;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
-            i,
-            benchmark_params.n_threads,
-            sizex, sizey, sizez, flops_per_matrix,
-            usec,gflops);
-
-#ifdef VERBOSE_DEBUGGING
-        TENSOR_DUMP("res",gf31.nodes[0])
-#endif
-
-        // Check that the matrix multiplication result is in the right ballpark
-        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
-        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
-        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
-
-        if (delta > allowed_delta)  {
-            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
-                sum_of_F32_reference,
-                sum_of_Q4_result,
-                delta,
-                allowed_delta
-            );
-            exit(0);
-        }
-
-        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
-    }
-    printf("\n");
-    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
-    printf("=====================================================================================\n");
-}
diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index 35c089d57..1828903c3 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
      $PROMPT_TEMPLATE > $PROMPT_FILE
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama-cli $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --n_predict "$N_PREDICTS" \
diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
index 22f5b83d3..9d761ebb8 100755
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
 NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
 NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
 
-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
 
 CTX_SIZE=2048
@@ -62,7 +63,7 @@ fi
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
     echo 'Prompt cache does not exist, building...'
     # Default batch_size to 64 here for better user feedback during initial prompt processing
-    ./main 2>>"$LOG" \
+    ./llama-cli 2>>"$LOG" \
         --batch_size 64 \
         "${OPTS[@]}" \
         --prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +110,13 @@ while read -e line; do
 
     printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
 
-    ./main 2>>"$LOG" "${OPTS[@]}" \
+    ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
             --prompt-cache "$CUR_PROMPT_CACHE" \
             --prompt-cache-all \
             --file "$CUR_PROMPT_FILE" \
             --reverse-prompt "${USER_NAME}:" \
             --n_predict "$n_predict" |
-        skip_bytes 1 |                  # skip BOS token added by ./main
+        skip_bytes 1 |                  # skip BOS token added by ./llama-cli
         tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
         skip_bytes "$n_prompt_len_pre"  # print generation
 
@@ -129,22 +130,19 @@ while read -e line; do
 
     printf ' '
 
-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-        ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-        echo >&2 "Couldn't get number of tokens from ./main output!"
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
+        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
         exit 1
     fi
 
-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
 
     if ((n_tokens > CTX_ROTATE_POINT)); then
         tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
     fi
 
     # Update cache for next prompt in background, ideally during user input
-    ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+    ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
           --prompt-cache "$NEXT_PROMPT_CACHE" \
           --file "$NEXT_PROMPT_FILE" \
           --n_predict 1 &
diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh
index 8c7b7bef4..ffdd20084 100755
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
      $PROMPT_TEMPLATE > $PROMPT_FILE
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama-cli $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --n_predict "$N_PREDICTS" \
diff --git a/examples/chat.sh b/examples/chat.sh
index d567acecd..9f85d1e26 100755
--- a/examples/chat.sh
+++ b/examples/chat.sh
@@ -11,6 +11,6 @@ cd ..
 #
 #   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
     --repeat_penalty 1.0 --color -i \
     -r "User:" -f prompts/chat-with-bob.txt
diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt
index e262d44f9..44e5f722a 100644
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET convert-llama2c-to-ggml)
+set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md
index 0f37d295b..46a42da69 100644
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -2,13 +2,10 @@
 
 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
 
-To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
 
-`$ make -j`
-
-After successful compilation, following usage options are available:
 ```
-usage: ./convert-llama2c-to-ggml [options]
+usage: ./llama-convert-llama2c-to-ggml [options]
 
 options:
   -h, --help                       show this help message and exit
@@ -19,8 +16,10 @@ options:
 
 An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
 
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+
+Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
 
 Now you can use the model with a command like:
 
-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 8209dcb64..bdf0eed2a 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -1,6 +1,9 @@
 #include "ggml.h"
+#include "gguf.h"
+
 #include "llama.h"
 #include "common.h"
+#include "log.h"
 
 #include <unordered_map>
 #include <vector>
@@ -8,6 +11,7 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
+#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -78,111 +82,101 @@ typedef struct {
 
 struct TransformerWeights {
     // token embedding table
-    float* token_embedding_table;    // (vocab_size, dim)
+    std::vector<float> token_embedding_table;    // (vocab_size, dim)
     // weights for rmsnorms
-    float* rms_att_weight; // (layer, dim) rmsnorm weights
-    float* rms_ffn_weight; // (layer, dim)
+    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
+    std::vector<float> rms_ffn_weight; // (layer, dim)
     // weights for matmuls
-    float* wq; // (layer, dim, dim)
-    float* wk; // (layer, dim, dim)
-    float* wv; // (layer, dim, dim)
-    float* wo; // (layer, dim, dim)
+    std::vector<float> wq; // (layer, dim, dim)
+    std::vector<float> wk; // (layer, dim, dim)
+    std::vector<float> wv; // (layer, dim, dim)
+    std::vector<float> wo; // (layer, dim, dim)
     // weights for ffn
-    float* w1; // (layer, hidden_dim, dim)
-    float* w2; // (layer, dim, hidden_dim)
-    float* w3; // (layer, hidden_dim, dim)
+    std::vector<float> w1; // (layer, hidden_dim, dim)
+    std::vector<float> w2; // (layer, dim, hidden_dim)
+    std::vector<float> w3; // (layer, hidden_dim, dim)
     // final rmsnorm
-    float* rms_final_weight; // (dim,)
+    std::vector<float> rms_final_weight; // (dim,)
     // freq_cis for RoPE relatively positional embeddings
-    // float* freq_cis_real; // (seq_len, dim/2)
-    // float* freq_cis_imag; // (seq_len, dim/2)
+    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
+    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    float* wcls;
-
-    ~TransformerWeights() {
-        delete[] token_embedding_table;
-        delete[] rms_att_weight;
-        delete[] rms_ffn_weight;
-        delete[] wq;
-        delete[] wk;
-        delete[] wv;
-        delete[] wo;
-        delete[] w1;
-        delete[] w2;
-        delete[] w3;
-        delete[] rms_final_weight;
-        delete[] wcls;
-    }
+    std::vector<float> wcls;
 };
 
-static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
-    // we calloc instead of malloc to keep valgrind happy
-    w->token_embedding_table = new float[p->vocab_size * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
+    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
+    try {
+        w->token_embedding_table.resize(p->vocab_size * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
 
-    w->rms_att_weight = new float[p->n_layers * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        w->rms_att_weight.resize(p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
 
-    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        w->rms_ffn_weight.resize(p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
 
-    w->wq = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wq.resize(p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
-    w->wk = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
-    w->wv = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
-    w->wo = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wo.resize(p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
-    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
-    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
 
-    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
-    w->rms_final_weight = new float[p->dim]();
-    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        w->rms_final_weight.resize(p->dim);
+        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
 
-    if (shared_weights) {
-        w->wcls = NULL;
-    } else {
-        w->wcls = new float[p->vocab_size * p->dim]();
-        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        if (shared_weights) {
+            w->wcls = {};
+        } else {
+            w->wcls.resize(p->vocab_size * p->dim);
+            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        }
+    }
+    catch (std::length_error &) {
+        die("Invalid configuration. Failed to allocate memory for weights");
     }
 }
 
-static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
-    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
-    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
-    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
-    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
-    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
-    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
-    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
+    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
+    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
+    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
+    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
+    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
+    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
+    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
+    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
+    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
+    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
+    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
 
     // Skip freq_cis_real & freq_cis_imag
     int head_size = p->dim / p->n_heads;
     fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
 
-    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
 
     // Check we didn't forget to read anything
     auto curr = ftell(f);
     fseek(f, 0, SEEK_END);
     auto end = ftell(f);
     if (curr != end) {
-        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", curr, end);
+        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
         return 1;
     }
 
@@ -190,26 +184,26 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
 }
 
 static void print_sample_weights(TransformerWeights *w){
-    printf("----- Quick print of first of the weight vales of all the variables\n");
-    printf("%f\n", w->token_embedding_table[0]);
-    printf("%f\n", w->rms_att_weight[0]);
-    printf("%f\n", w->rms_ffn_weight[0]);
+    LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+    LOG_INF("%f\n", w->token_embedding_table[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    LOG_INF("%f\n", w->rms_ffn_weight[0]);
 
-    printf("%f\n", w->wq[0]);
-    printf("%f\n", w->wk[0]);
-    printf("%f\n", w->wv[0]);
-    printf("%f\n", w->wo[0]);
-    printf("%f\n", w->w1[0]);
-    printf("%f\n", w->w2[0]);
-    printf("%f\n", w->w3[0]);
-    printf("%f\n", w->rms_att_weight[0]);
-    if (w->wcls) printf("%f\n", w->wcls[0]);
+    LOG_INF("%f\n", w->wq[0]);
+    LOG_INF("%f\n", w->wk[0]);
+    LOG_INF("%f\n", w->wv[0]);
+    LOG_INF("%f\n", w->wo[0]);
+    LOG_INF("%f\n", w->w1[0]);
+    LOG_INF("%f\n", w->w2[0]);
+    LOG_INF("%f\n", w->w3[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
 
-struct llama_vocab {
+struct my_llama_vocab {
     using id    = int32_t;
     using token = std::string;
     using ttype = llama_token_type;
@@ -225,14 +219,16 @@ struct llama_vocab {
 };
 
 struct my_llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
-    uint32_t n_embd  = 4096;
-    uint32_t n_ff    = 11008;
-    uint32_t n_mult  = 4;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
+    uint32_t n_vocab   = 32000;
+    uint32_t n_ctx     = 512;   // this is provided as user input?
+    uint32_t n_embd    = 4096;
+    uint32_t n_ff      = 11008;
+    uint32_t n_mult    = 4;
+    uint32_t n_head    = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer   = 32;
+    uint32_t n_rot     = 64;
+
     bool operator!=(const my_llama_hparams& other) const {
         return memcmp(this, &other, sizeof(my_llama_hparams));
     }
@@ -325,14 +321,30 @@ struct train_params {
 };
 
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
-    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
+    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
+    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
+    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
+    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
+    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
+    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
+}
+
+static void print_tensor_info(const struct ggml_context * ctx) {
+    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        LOG_INF("%s: Allocating ", __func__);
+        int64_t total = 1;
+        int i = 0;
+        for (; i < ggml_n_dims(t); ++i) {
+            if (i > 0) LOG("x ");
+            LOG("[%" PRId64 "] ", t->ne[i]);
+            total *= t->ne[i];
+        }
+        if (i > 1) LOG("= [%" PRId64 "] ", total);
+        LOG("float space for %s\n", ggml_get_name(t));
+    }
 }
 
 static void init_model(struct my_llama_model * model) {
@@ -342,6 +354,8 @@ static void init_model(struct my_llama_model * model) {
     const uint32_t n_layer = hparams.n_layer;
     const uint32_t n_vocab = hparams.n_vocab;
 
+    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
+
     const uint32_t n_ff = hparams.n_ff;
     struct ggml_context * ctx = model->ctx;
 
@@ -350,25 +364,8 @@ static void init_model(struct my_llama_model * model) {
     model->train_tokens = 0;
 
     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
-
     model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
-
     model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
-
-    // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-
-    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
-
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
 
     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm,           "norm.weight");
@@ -383,8 +380,8 @@ static void init_model(struct my_llama_model * model) {
         layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
         layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
         layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
 
         layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -406,6 +403,8 @@ static void init_model(struct my_llama_model * model) {
         ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
         ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
     }
+
+    print_tensor_info(ctx);
 }
 
 static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -421,9 +420,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
 static void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = get_f32_2d(probs, k, i);
-        printf(" %f", p);
+        LOG(" %f", p);
     }
-    printf("\n");
+    LOG("\n");
 }
 
 static void print_matrix(struct ggml_tensor * probs) {
@@ -431,39 +430,18 @@ static void print_matrix(struct ggml_tensor * probs) {
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
             float p = get_f32_2d(probs, k, i);
-            printf(" %.2f", p);
+            LOG(" %.2f", p);
         }
-        printf("\n");
+        LOG("\n");
     }
 }
 
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-struct llama_file {
+struct my_llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
 
-    llama_file(const char * fname, const char * mode) {
+    my_llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
             size = 0;
@@ -524,7 +502,7 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    ~llama_file() {
+    ~my_llama_file() {
         if (fp) {
             std::fclose(fp);
         }
@@ -532,7 +510,7 @@ struct llama_file {
 };
 
 static bool is_ggml_file(const char * filename) {
-    llama_file file(filename, "rb");
+    my_llama_file file(filename, "rb");
     if (file.size < 4) {
         return false;
     }
@@ -549,8 +527,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
     return out.str();
 }
 
-static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
     if (is_ggml_file(filename)) {
+        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
         struct ggml_context * ctx_data = NULL;
 
         struct gguf_init_params params = {
@@ -578,6 +557,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
         const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
 
         const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
+            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
+        }
 
         vocab->id_to_token.resize(n_vocab);
 
@@ -595,21 +577,21 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
         gguf_free(ctx);
     } else {
         // assume llama2.c vocabulary
-        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
-        llama_file file(filename, "rb");
+        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        my_llama_file file(filename, "rb");
         if (!file.fp) {
             die_fmt("%s: %s", strerror(errno), filename);
         }
         const int  n_vocab = config->vocab_size;
         /* uint32_t max_token_length =  */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+        for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
 
             unsigned char byte_val;
-            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
             if (id == UNKNOWN_TOKEN_ID) {
                 text = "<unk>";
                 type = LLAMA_TOKEN_TYPE_UNKNOWN;
@@ -638,57 +620,36 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
 }
 
 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
-    int ct;
-    switch (ggml_n_dims(gg_weights)) {
-        case 1:
-            ct = 0;
-            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
-                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
-                *ptr = karpathy_weights[ct];
-                ct++;
-            }
-            break;
-        case 2:
-            ct = 0;
-            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
-                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
-                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
-                    *ptr = karpathy_weights[ct];
-                    ct++;
-                }
-            }
-            break;
-        case 3:
-            ct = 0;
-            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
-                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
-                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
-                        *ptr = karpathy_weights[ct];
-                        ct++;
-                    }
-                }
-            }
-            break;
+    int size = 1;
+    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
+        size *= gg_weights->ne[dim];
+    }
+    for (int ct = 0; ct < size; ++ct) {
+        int64_t i0 = 0; int64_t i1 = 0;
+        int64_t i2 = 0; int64_t i3 = 0;
+        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
+        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
     }
 }
 
 static void save_as_llama_model(
-    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
 ) {
     // convert AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
     // float*                   -> struct ggml_tensor
-    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
-    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
+    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
 
-    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
+    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
     //print_row(model->norm, 0);
 
     // for rms-att-weight
     int row_length = model->hparams.n_embd;
     int n_ff = model->hparams.n_ff;
 
+    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
+
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
         // 1d
@@ -697,9 +658,10 @@ static void save_as_llama_model(
 
         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
         convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length]);
         convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
+        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length/n_multiqueries]);
+        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length/n_multiqueries]);
 
         convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
         convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
@@ -711,7 +673,7 @@ static void save_as_llama_model(
     std::vector<const char*> tokens;
     std::vector<float> scores;
     std::vector<llama_token_type> token_types;
-    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
         tokens.push_back(token_data.text.c_str());
         scores.push_back(token_data.score);
         token_types.push_back(token_data.type);
@@ -729,15 +691,15 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);
 
     gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
     gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
     gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
     gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
-    // n_head_kv is optional, default to n_head
-    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
     gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
     gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
     gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -789,12 +751,12 @@ static void save_as_llama_model(
 
 static struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model    = "models/7B/ggml-model-f16.gguf";
+    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
-    params.fn_train_data     = "shakespeare.txt";
-    params.fn_checkpoint_in  = "checkpoint.bin";
-    params.fn_checkpoint_out = "checkpoint.bin";
-    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+    params.fn_train_data           = "shakespeare.txt";
+    params.fn_checkpoint_in        = "checkpoint.bin";
+    params.fn_checkpoint_out       = "checkpoint.bin";
+    params.fn_model_out            = "ggml-checkpoint-f32.bin";
 
     params.seed       =   -1;
 
@@ -815,7 +777,7 @@ static struct train_params get_default_train_params() {
 
     params.samples_start_after_nl = false;
     params.use_adam               = true;
-    params.use_flash              = true;
+    params.use_flash              = false;
     params.use_scratch            = true;
 
     // only adam
@@ -829,8 +791,8 @@ static struct train_params get_default_train_params() {
     params.adam_alpha        = 1e-3f;
     params.adam_decay        = 1e-3f;
 
-    params.mem_model_gb   = 2;
-    params.mem_compute_gb = 24;
+    params.mem_model_gb    = 2;
+    params.mem_compute_gb  = 24;
     params.mem_compute0_gb = 8;
     params.mem_compute1_gb = 2;
 
@@ -912,39 +874,55 @@ static std::string basename(const std::string &path) {
 }
 
 int main(int argc, char ** argv) {
+    common_init();
+
     struct train_params params = get_default_train_params();
     if (!params_parse(argc, argv, &params)) {
         return 1;
     }
+
     Config config;
     TransformerWeights weights = {};
     {
-        FILE *file = fopen(params.fn_llama2c_model, "rb");
-        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        FILE * file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) {
+            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            return 1;
+        }
         // read in the config header
-        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        if (fread(&config, sizeof(Config), 1, file) != 1) {
+            LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            return 1;
+        }
         auto shared_weights = config.vocab_size > 0;
         config.vocab_size = abs(config.vocab_size);
 
         // read in the Transformer weights
-        malloc_weights(&weights, &config, shared_weights);
-        if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
+        alloc_weights(&weights, &config, shared_weights);
+        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
+            LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+            return 1;
+        }
         fclose(file);
     }
 
-    struct llama_vocab vocab;
+    struct my_llama_vocab vocab;
     load_vocab(params.fn_vocab_model, &config, &vocab);
 
     struct my_llama_model model;
-    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
-    model.hparams.n_ctx   = params.n_ctx;
-    model.hparams.n_embd  = config.dim; //params.n_embd;
-    model.hparams.n_ff    = config.hidden_dim;
-    model.hparams.n_mult  = 32;//params.n_mult;
-    model.hparams.n_head  = config.n_heads; //params.n_head;
-    model.hparams.n_layer = config.n_layers; //params.n_layer;
-    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+    model.hparams.n_vocab   = config.vocab_size; //llama_vocab_n_vocab(lctx);
+    model.hparams.n_ctx     = params.n_ctx;
+    model.hparams.n_embd    = config.dim; //params.n_embd;
+    model.hparams.n_ff      = config.hidden_dim;
+    model.hparams.n_mult    = 32;//params.n_mult;
+    model.hparams.n_head    = config.n_heads; //params.n_head;
+    model.hparams.n_head_kv = config.n_kv_heads;
+    model.hparams.n_layer   = config.n_layers; //params.n_layer;
+    model.hparams.n_rot     = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+
     print_params(&model.hparams);
+
     struct ggml_init_params lcparams;
     lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
     lcparams.mem_buffer = NULL;
@@ -956,7 +934,7 @@ int main(int argc, char ** argv) {
     model.name = basename(params.fn_llama2c_model);
     save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
 
-    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
 
     ggml_free(model.ctx);
     return 0;
diff --git a/convert.py b/examples/convert_legacy_llama.py
similarity index 65%
rename from convert.py
rename to examples/convert_legacy_llama.py
index 63a0a5d78..c4ec5c524 100755
--- a/convert.py
+++ b/examples/convert_legacy_llama.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 
+import logging
 import argparse
 import concurrent.futures
 import enum
@@ -16,23 +17,28 @@ import re
 import signal
 import struct
 import sys
+import textwrap
 import time
 import zipfile
-from abc import ABCMeta, abstractmethod
+from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
 
 import numpy as np
-from sentencepiece import SentencePieceProcessor
 
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    # use .parent.parent since we are in "examples" directory
+    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
+
 import gguf
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
 
 if TYPE_CHECKING:
-    from typing import TypeAlias
+    from typing_extensions import Self, TypeAlias
+
+logger = logging.getLogger("convert")
 
 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
     faulthandler.register(signal.SIGUSR1)
@@ -43,6 +49,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
 
 DEFAULT_CONCURRENCY = 8
 
+ADDED_TOKENS_FILE = 'added_tokens.json'
+FAST_TOKENIZER_FILE = 'tokenizer.json'
+
 #
 # data types
 #
@@ -135,7 +144,8 @@ class GGMLFileType(enum.IntEnum):
         dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
         if dt is None:
             raise ValueError(self)
-        # 1D tensors are always F32.
+        # Convert all 1D tensors to F32.  Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+        #  Also The 1d tensors aren't much of a performance/size issue.  So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
         return dt if len(tensor.shape) > 1 else DT_F32
 
 
@@ -166,7 +176,7 @@ class Params:
     rope_scaling_type: gguf.RopeScalingType | None = None
     f_rope_freq_base: float | None = None
     f_rope_scale: float | None = None
-    n_orig_ctx: int | None = None
+    n_ctx_orig: int | None = None
     rope_finetuned: bool | None = None
 
     ftype: GGMLFileType | None = None
@@ -188,8 +198,10 @@ class Params:
             n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
         if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            msg = """\
+                failed to guess 'n_layer'. This model is unknown or unsupported.
+                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+            raise KeyError(textwrap.dedent(msg))
 
         n_head = n_embd // 128 # guessed
         n_mult = 256           # guessed
@@ -211,9 +223,10 @@ class Params:
 
     @staticmethod
     def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
-        config = json.load(open(config_path))
+        with open(config_path) as f:
+            config = json.load(f)
 
-        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
+        rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
         rope_scaling = config.get("rope_scaling")
 
         if rope_scaling is not None and (typ := rope_scaling.get("type")):
@@ -223,7 +236,7 @@ class Params:
                 rope_scaling_type = gguf.RopeScalingType.LINEAR
             elif typ == "yarn":
                 rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                n_ctx_orig = rope_scaling['original_max_position_embeddings']
                 rope_finetuned = rope_scaling['finetuned']
             else:
                 raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@@ -233,8 +246,10 @@ class Params:
         elif "max_position_embeddings" in config:
             n_ctx = config["max_position_embeddings"]
         else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            msg = """\
+                failed to guess 'n_ctx'. This model is unknown or unsupported.
+                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+            raise KeyError(textwrap.dedent(msg))
 
         n_experts      = None
         n_experts_used = None
@@ -257,7 +272,7 @@ class Params:
             f_rope_freq_base  = config.get("rope_theta"),
             rope_scaling_type = rope_scaling_type,
             f_rope_scale      = f_rope_scale,
-            n_orig_ctx        = n_orig_ctx,
+            n_ctx_orig        = n_ctx_orig,
             rope_finetuned    = rope_finetuned,
         )
 
@@ -265,11 +280,13 @@ class Params:
     # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
     @staticmethod
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
-        config = json.load(open(config_path))
+        with open(config_path) as f:
+            config = json.load(f)
 
         n_experts      = None
         n_experts_used = None
         f_rope_freq_base = None
+        n_ff = None
 
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -294,6 +311,8 @@ class Params:
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6
 
+        assert n_ff is not None
+
         return Params(
             n_vocab          = model["tok_embeddings.weight"].shape[0],
             n_embd           = config["dim"],
@@ -327,235 +346,6 @@ class Params:
         return params
 
 
-#
-# vocab
-#
-
-class BpeVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        if isinstance(self.bpe_tokenizer.get('model'), dict):
-            self.vocab = self.bpe_tokenizer["model"]["vocab"]
-        else:
-            self.vocab = self.bpe_tokenizer
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
-            if not tokenizer_json_file.is_file():
-                added_tokens = {}
-            else:
-                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
-                added_tokens = dict(
-                    (item['content'], item['id'])
-                    for item in tokenizer_json.get('added_tokens', [])
-                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.bpe_tokenizer)
-
-        vocab_size: int = len(self.vocab)
-        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids      = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
-
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict    = added_tokens
-        self.added_tokens_list    = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer      = fname_tokenizer
-        self.fname_added_tokens   = fname_added_tokens
-
-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
-        for i, _ in enumerate(self.vocab):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            added_tokens = {}
-
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-
-        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids   = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base    = vocab_size
-        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer    = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
-
-    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
-            text: bytes = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
-
-            toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
-                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
-                toktype = gguf.TokenType.CONTROL
-
-            # NOTE: I think added_tokens are user defined.
-            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
-            if tokenizer.is_unused(i):
-                toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
-                toktype = gguf.TokenType.BYTE
-
-            yield text, score, toktype
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class HfVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use HfVocab, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        print("fname_tokenizer:", fname_tokenizer)
-        # Allow the tokenizer to default to slow or fast versions.
-        # Explicitly set tokenizer to use local paths.
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            fname_tokenizer,
-            cache_dir=fname_tokenizer,
-            local_files_only=True,
-        )
-
-        # Initialize lists and dictionaries for added tokens
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids  = set()
-
-        # Process added tokens
-        for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
-        ):
-            # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
-                self.added_tokens_list.append(tok)
-                self.added_tokens_dict[tok] = tokidx
-                self.added_tokens_ids.add(tokidx)
-
-        # Store special tokens and their IDs
-        self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
-        }
-        self.special_ids = set(self.tokenizer.all_special_ids)
-
-        # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
-
-        self.fname_tokenizer    = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
-
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
-        }
-
-        for token_id in range(self.vocab_size_base):
-            # Skip processing added tokens here
-            if token_id in self.added_tokens_ids:
-                continue
-
-            # Convert token text to bytes
-            token_text = reverse_vocab[token_id].encode("utf-8")
-
-            # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, token_text, self.special_ids  # Reuse already stored special IDs
-            )
-
-    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
-        # Special case for byte tokens
-        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-            return gguf.TokenType.BYTE
-
-        # Determine token type based on whether it's a special token
-        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
-    def get_token_score(self, token_id: int) -> float:
-        # Placeholder for actual logic to determine the token's score
-        # This needs to be implemented based on specific requirements
-        return -1000.0  # Default score
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
-                score = self.get_token_score(self.specials[text])
-            else:
-                toktype = gguf.TokenType.USER_DEFINED
-                score = -1000.0
-
-            yield text.encode("utf-8"), score, toktype
-
-    def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.hf_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
-
-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
@@ -563,7 +353,6 @@ Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
 
 
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -571,17 +360,18 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
             .reshape(weights.shape))
 
 
-class Tensor(metaclass=ABCMeta):
+class Tensor(ABC):
+    ndarray: NDArray
     data_type: DataType
 
     @abstractmethod
-    def astype(self, data_type: DataType) -> Tensor: ...
+    def astype(self, data_type: DataType) -> Self: ...
     @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
     @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
     @abstractmethod
-    def part(self, n_part: int) -> UnquantizedTensor: ...
+    def part(self, n_part: int) -> Self: ...
     @abstractmethod
     def to_ggml(self) -> GGMLCompatibleTensor: ...
 
@@ -593,18 +383,18 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
 
 
 class UnquantizedTensor(Tensor):
-    def __init__(self, ndarray: NDArray) -> None:
+    def __init__(self, ndarray: NDArray):
         assert isinstance(ndarray, np.ndarray)
         self.ndarray = ndarray
         self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
 
-    def astype(self, data_type: DataType) -> Tensor:
+    def astype(self, data_type: DataType) -> UnquantizedTensor:
         dtype = data_type.dtype
         if self.data_type == DT_BF16:
             self.ndarray = bf16_to_fp32(self.ndarray)
         return UnquantizedTensor(self.ndarray.astype(dtype))
 
-    def to_ggml(self) -> UnquantizedTensor:
+    def to_ggml(self) -> Self:
         return self
 
     def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -666,13 +456,14 @@ class LazyTensor:
 
 LazyModel: TypeAlias = 'dict[str, LazyTensor]'
 
+ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
 
 @dataclass
 class ModelPlus:
     model: LazyModel
     paths: list[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors', 'none']
-    vocab: Vocab | None  # For GGML models (which have vocab built in), the vocab.
+    format: ModelFormat
+    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.
 
 
 def merge_sharded(models: list[LazyModel]) -> LazyModel:
@@ -681,7 +472,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
     names = {name: None for model in models for name in model}
 
     def convert(name: str) -> LazyTensor:
-        lazy_tensors: list[LazyTensor] = [model[name] for model in models]
+        lazy_tensors = [model[name] for model in models]
         if len(lazy_tensors) == 1:
             # only one file; don't go through this procedure since there might
             # be quantized tensors
@@ -702,7 +493,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
 
         def load() -> UnquantizedTensor:
             ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            concatenated = np.concatenate(ndarrays, axis=axis)
             return UnquantizedTensor(concatenated)
         description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
         return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@@ -710,7 +501,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
 
 
 def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
+    formats: set[ModelFormat] = set(mp.format for mp in models_plus)
     assert len(formats) == 1, "different formats?"
     format = formats.pop()
     paths = [path for mp in models_plus for path in mp.paths]
@@ -729,7 +520,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
     else:
         model = merge_sharded([mp.model for mp in models_plus])
 
-    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+    return ModelPlus(model, paths, format, vocab)
 
 
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -754,6 +545,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
     return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
 
 
+def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
+    def load() -> Tensor:
+        tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
+        return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
+    s = lazy_tensors[0].shape.copy()
+    s.insert(0, len(lazy_tensors))
+    return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
+
+
 # Functionality that simulates `torch.load` but where individual tensors are
 # only loaded into memory on demand, not all at once.
 # PyTorch can't do this natively as of time of writing:
@@ -790,10 +590,10 @@ class LazyUnpickler(pickle.Unpickler):
 
         def load(offset: int, elm_count: int) -> NDArray:
             dtype = data_type.dtype
-            fp = self.zip_file.open(info)
-            fp.seek(offset * dtype.itemsize)
-            size = elm_count * dtype.itemsize
-            data = fp.read(size)
+            with self.zip_file.open(info) as fp:
+                fp.seek(offset * dtype.itemsize)
+                size = elm_count * dtype.itemsize
+                data = fp.read(size)
             assert len(data) == size
             return np.frombuffer(data, dtype)
         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@@ -814,7 +614,7 @@ class LazyUnpickler(pickle.Unpickler):
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)
 
-    CLASSES: dict[tuple[str, str], Any] = {
+    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -873,7 +673,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
 def must_read(fp: IO[bytes], length: int) -> bytes:
     ret = fp.read(length)
     if len(ret) < length:
-        raise Exception("unexpectedly reached end of file")
+        raise EOFError("unexpectedly reached end of file")
     return ret
 
 
@@ -931,21 +731,24 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
             yield result
 
 
-def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
+def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            "The model's vocab size is set to -1 in params.json. Please update it manually."
+            + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
         )
+    if not isinstance(vocab, Vocab):
+        return  # model has no vocab
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
-        print("Ignoring added_tokens.json since model matches vocab size without it.")
+        logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
         return
 
     if pad_vocab and params.n_vocab > vocab.vocab_size:
         pad_count = params.n_vocab - vocab.vocab_size
-        print(
+        logger.debug(
             f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
         )
         for i in range(1, pad_count + 1):
@@ -960,27 +763,128 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     if vocab.vocab_size < params.n_vocab:
         msg += " Add the --pad-vocab option and try again."
 
-    raise Exception(msg)
+    raise ValueError(msg)
 
 
 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
-    def add_meta_arch(self, params: Params) -> None:
+    def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
+        # Metadata About The Model And Its Provenence
         name = "LLaMA"
-
-        # TODO: better logic to determine model name
-        if params.n_ctx == 4096:
-            name = "LLaMA v2"
+        if metadata is not None and metadata.name is not None:
+            name = metadata.name
         elif params.path_model is not None:
-            name = str(params.path_model.parent).split('/')[-1]
+            name = params.path_model.name
+        elif params.n_ctx == 4096:
+            # Heuristic detection of LLaMA v2 model
+            name = "LLaMA v2"
 
-        self.gguf.add_name                (name)
-        self.gguf.add_context_length      (params.n_ctx)
-        self.gguf.add_embedding_length    (params.n_embd)
-        self.gguf.add_block_count         (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
+        self.gguf.add_name(name)
+
+        if metadata is not None:
+            if metadata.author is not None:
+                self.gguf.add_author(metadata.author)
+            if metadata.version is not None:
+                self.gguf.add_version(metadata.version)
+            if metadata.organization is not None:
+                self.gguf.add_organization(metadata.organization)
+
+            if metadata.finetune is not None:
+                self.gguf.add_finetune(metadata.finetune)
+            if metadata.basename is not None:
+                self.gguf.add_basename(metadata.basename)
+
+            if metadata.description is not None:
+                self.gguf.add_description(metadata.description)
+            if metadata.quantized_by is not None:
+                self.gguf.add_quantized_by(metadata.quantized_by)
+
+            if metadata.size_label is not None:
+                self.gguf.add_size_label(metadata.size_label)
+
+            if metadata.license is not None:
+                self.gguf.add_license(metadata.license)
+            if metadata.license_name is not None:
+                self.gguf.add_license_name(metadata.license_name)
+            if metadata.license_link is not None:
+                self.gguf.add_license_link(metadata.license_link)
+
+            if metadata.url is not None:
+                self.gguf.add_url(metadata.url)
+            if metadata.doi is not None:
+                self.gguf.add_doi(metadata.doi)
+            if metadata.uuid is not None:
+                self.gguf.add_uuid(metadata.uuid)
+            if metadata.repo_url is not None:
+                self.gguf.add_repo_url(metadata.repo_url)
+
+            if metadata.source_url is not None:
+                self.gguf.add_source_url(metadata.source_url)
+            if metadata.source_doi is not None:
+                self.gguf.add_source_doi(metadata.source_doi)
+            if metadata.source_uuid is not None:
+                self.gguf.add_source_uuid(metadata.source_uuid)
+            if metadata.source_repo_url is not None:
+                self.gguf.add_source_repo_url(metadata.source_repo_url)
+
+            if metadata.base_models is not None:
+                self.gguf.add_base_model_count(len(metadata.base_models))
+                for key, base_model_entry in enumerate(metadata.base_models):
+                    if "name" in base_model_entry:
+                        self.gguf.add_base_model_name(key, base_model_entry["name"])
+                    if "author" in base_model_entry:
+                        self.gguf.add_base_model_author(key, base_model_entry["author"])
+                    if "version" in base_model_entry:
+                        self.gguf.add_base_model_version(key, base_model_entry["version"])
+                    if "organization" in base_model_entry:
+                        self.gguf.add_base_model_organization(key, base_model_entry["organization"])
+                    if "description" in base_model_entry:
+                        self.gguf.add_base_model_description(key, base_model_entry["description"])
+                    if "url" in base_model_entry:
+                        self.gguf.add_base_model_url(key, base_model_entry["url"])
+                    if "doi" in base_model_entry:
+                        self.gguf.add_base_model_doi(key, base_model_entry["doi"])
+                    if "uuid" in base_model_entry:
+                        self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
+                    if "repo_url" in base_model_entry:
+                        self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
+
+            if metadata.datasets is not None:
+                self.gguf.add_dataset_count(len(metadata.datasets))
+                for key, dataset_entry in enumerate(metadata.datasets):
+                    if "name" in dataset_entry:
+                        self.gguf.add_dataset_name(key, dataset_entry["name"])
+                    if "author" in dataset_entry:
+                        self.gguf.add_dataset_author(key, dataset_entry["author"])
+                    if "version" in dataset_entry:
+                        self.gguf.add_dataset_version(key, dataset_entry["version"])
+                    if "organization" in dataset_entry:
+                        self.gguf.add_dataset_organization(key, dataset_entry["organization"])
+                    if "description" in dataset_entry:
+                        self.gguf.add_dataset_description(key, dataset_entry["description"])
+                    if "url" in dataset_entry:
+                        self.gguf.add_dataset_url(key, dataset_entry["url"])
+                    if "doi" in dataset_entry:
+                        self.gguf.add_dataset_doi(key, dataset_entry["doi"])
+                    if "uuid" in dataset_entry:
+                        self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
+                    if "repo_url" in dataset_entry:
+                        self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
+            if metadata.tags is not None:
+                self.gguf.add_tags(metadata.tags)
+            if metadata.languages is not None:
+                self.gguf.add_languages(metadata.languages)
+
+    def add_meta_arch(self, params: Params) -> None:
+        # Metadata About The Neural Architecture Itself
+        self.gguf.add_vocab_size(params.n_vocab)
+        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_feed_forward_length(params.n_ff)
         self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
         self.gguf.add_head_count          (params.n_head)
         self.gguf.add_head_count_kv       (params.n_head_kv)
@@ -1004,8 +908,8 @@ class OutputFile:
             self.gguf.add_rope_scaling_type(params.rope_scaling_type)
             self.gguf.add_rope_scaling_factor(params.f_rope_scale)
 
-        if params.n_orig_ctx is not None:
-            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+        if params.n_ctx_orig is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
 
         if params.rope_finetuned is not None:
             self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
@@ -1013,20 +917,6 @@ class OutputFile:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
         tokens = []
         scores = []
@@ -1043,11 +933,8 @@ class OutputFile:
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,19 +961,40 @@ class OutputFile:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()
 
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            logger.info(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
     def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
+        of.add_meta_model(params, metadata)
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
         of.add_meta_special_vocab(svocab)
@@ -1110,18 +1018,23 @@ class OutputFile:
 
     @staticmethod
     def write_all(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
         concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
         pad_vocab: bool = False,
+        metadata: gguf.Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
+        of.add_meta_model(params, metadata)
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, Vocab):
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
+        else:  # NoVocab
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # tensor info
         for name, lazy_tensor in model.items():
@@ -1131,24 +1044,7 @@ class OutputFile:
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)
 
         of.close()
 
@@ -1156,16 +1052,44 @@ class OutputFile:
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
     wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
 
-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
         return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
         return GGMLFileType.MostlyF16
     if output_type_str == "q8_0":
         return GGMLFileType.MostlyQ8_0
 
     name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
 
-    raise Exception(f"Unexpected combination of types: {name_to_type}")
+    raise ValueError(f"Unexpected combination of types: {name_to_type}")
+
+
+def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
+    total_params = 0
+    shared_params = 0
+    expert_params = 0
+
+    for name, lazy_tensor in tensors:
+        # We don't need these
+        if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Got A Tensor
+        sum_weights_in_tensor: int = 1
+
+        # Tensor Volume
+        for dim in lazy_tensor.shape:
+            sum_weights_in_tensor *= dim
+
+        if ".experts." in name:
+            if ".experts.0." in name:
+                expert_params += sum_weights_in_tensor
+        else:
+            shared_params += sum_weights_in_tensor
+
+        total_params += sum_weights_in_tensor
+
+    return total_params, shared_params, expert_params
 
 
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1175,19 +1099,35 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
 
 def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
-    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
+    should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
 
     tmp = model
 
+    # merge experts into one tensor
+    if params.n_experts and params.n_experts > 0:
+        for i_l in range(params.n_layer):
+            for w in range(1, 4):
+                experts = []
+                for e in range(params.n_experts):
+                    if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
+                        experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
+                        del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
+                    elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
+                        experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
+                        del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
+                    else:
+                        raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
+                tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
+
     # HF models permut or pack some of the tensors, so we need to undo that
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            print(f"Permuting layer {i}")
+            logger.debug(f"Permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
             # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            print(f"Unpacking and permuting layer {i}")
+            logger.debug(f"Unpacking and permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
             tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@@ -1200,16 +1140,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
         tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
         if name_new is None:
             if skip_unknown:
-                print(f"Unexpected tensor name: {name} - skipping")
+                logger.warning(f"Unexpected tensor name: {name} - skipping")
                 continue
-            else:
-                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
+            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
 
         if tensor_type in should_skip:
-            print(f"skipping tensor {name_new}")
+            logger.debug(f"skipping tensor {name_new}")
             continue
 
-        print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
+        logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
         out[name_new] = lazy_tensor
 
     return out
@@ -1220,7 +1159,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
     the nth path in the model.
     '''
     # Support the following patterns:
-    patterns: list[tuple[str, str]] = [
+    patterns = [
         # - x.00.pth, x.01.pth, etc.
         (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
         # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1259,22 +1198,22 @@ def load_some_model(path: Path) -> ModelPlus:
     # Be extra-friendly and accept either a file or a directory:
     if path.is_dir():
         # Check if it's a set of safetensors files first
-        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+        globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
         files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try the PyTorch patterns too, with lower priority
             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
             files = [file for glob in globs for file in path.glob(glob)]
         if not files:
-            raise Exception(f"Can't find model in directory {path}")
+            raise FileNotFoundError(f"Can't find model in directory {path}")
         if len(files) > 1:
-            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+            raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
         path = files[0]
 
     paths = find_multifile_paths(path)
     models_plus: list[ModelPlus] = []
     for path in paths:
-        print(f"Loading model file {path}")
+        logger.info(f"Loading model file {path}")
         models_plus.append(lazy_load_file(path))
 
     model_plus = merge_multifile_models(models_plus)
@@ -1282,39 +1221,14 @@ def load_some_model(path: Path) -> ModelPlus:
 
 
 class VocabFactory:
+    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
+
     def __init__(self, path: Path):
         self.path = path
-        self.files: dict[str, Path | None] = {
-            "tokenizer.model": None,
-            "vocab.json": None,
-            "tokenizer.json": None,
-        }
-        self._detect_files()
 
-    def _detect_files(self):
-        for file in self.files.keys():
-            file_path = self.path / file
-            parent_file_path = self.path.parent / file
-            if file_path.exists():
-                self.files[file] = file_path
-            elif parent_file_path.exists():
-                self.files[file] = parent_file_path
-        print(f"Found vocab files: {self.files}")
-
-    def _select_file(self, vocabtype: str | None) -> Path:
-        if vocabtype in ["spm", "bpe"]:
-            for file_key in self.files.keys():
-                if (file := self.files[file_key]) is not None:
-                    return file
-            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        if vocabtype == "hfft":
-            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
-            return self.path
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
-
-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
-        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
+    def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
+        n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
         return gguf.SpecialVocab(
             model_parent_path,
             load_merges=load_merges,
@@ -1322,56 +1236,74 @@ class VocabFactory:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
-        path = self._select_file(vocabtype)
-        print(f"Loading vocab file '{path}', type '{vocabtype}'")
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
+        vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
+        selected_vocabs: dict[str, type[Vocab]] = {}
+        for vtype in vocab_types:
+            try:
+                selected_vocabs[vtype] = vocab_classes[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
 
-        added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
-        if vocabtype == "bpe":
-            vocab = BpeVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        elif vocabtype == "spm":
-            vocab = SentencePieceVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        elif vocabtype == "hfft":
-            vocab = HfVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
+        for vtype, cls in selected_vocabs.items():
+            try:
+                vocab = cls(self.path)
+                break
+            except FileNotFoundError:
+                pass  # ignore unavailable tokenizers
         else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+
+        logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        return vocab
+
+    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
+        vocab: BaseVocab
+        if vocab_types is None:
+            vocab = NoVocab()
+        else:
+            vocab = self._create_vocab_by_path(vocab_types)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocabtype,
             model_parent_path,
         )
         return vocab, special_vocab
 
 
-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32:    "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0:"q8_0",
+def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
+    name = metadata.name if metadata.name is not None else None
+    basename = metadata.basename if metadata.basename is not None else None
+    finetune = metadata.finetune if metadata.finetune is not None else None
+    version = metadata.version if metadata.version is not None else None
+    size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
+
+    output_type = {
+        GGMLFileType.AllF32:    "F32",
+        GGMLFileType.MostlyF16: "F16",
+        GGMLFileType.MostlyQ8_0: "Q8_0",
     }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+
+    return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
+    ret = model_paths[0].parent / f"{default_filename}.gguf"
     if ret in model_paths:
-        sys.stderr.write(
+        logger.error(
             f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n")
+            "Please explicitly specify a path using --outfile.")
         sys.exit(1)
     return ret
 
 
 def do_dump_model(model_plus: ModelPlus) -> None:
-    print(f"model_plus.paths = {model_plus.paths!r}")
-    print(f"model_plus.format = {model_plus.format!r}")
-    print(f"model_plus.vocab = {model_plus.vocab!r}")
+    print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
+    print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
+    print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
     for name, lazy_tensor in model_plus.model.items():
-        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
 
 
 def main(args_in: list[str] | None = None) -> None:
@@ -1379,15 +1311,14 @@ def main(args_in: list[str] | None = None) -> None:
     if np.uint32(1) == np.uint32(1).newbyteorder("<"):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
-    vocab_types = ["spm", "bpe", "hfft"]
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--awq-path",     type=Path,              help="Path to scale awq cache file", default=None)
+    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
     parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
+    parser.add_argument("--no-vocab",     action="store_true",    help="store model without the vocab")
     parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type",   choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
     parser.add_argument("--outfile",      type=Path,              help="path to write to; default: based on input")
     parser.add_argument("model",          type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     parser.add_argument("--ctx",          type=int,               help="model training context (default: based on input)")
@@ -1395,88 +1326,136 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--big-endian",   action="store_true",    help="model is executed on big endian machine")
     parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
+    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
+    parser.add_argument("--metadata",     type=Path,              help="Specify the path for an authorship metadata override file")
+    parser.add_argument("--get-outfile",  action="store_true",    help="get calculated default outfile name")
+    parser.add_argument("--model-name",   type=str, default=None, help="name of the model")
 
     args = parser.parse_args(args_in)
-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
-        tmp_model_path = args.model / "weighted_model"
-        if tmp_model_path.is_dir():
-            print(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            print("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            print(f"Saved weighted model at {tmp_model_path}.")
-        args.model = tmp_model_path
+
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    elif args.dump_single or args.dump or args.get_outfile:
+        # Avoid printing anything besides the dump output
+        logging.basicConfig(level=logging.WARNING)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    model_name = args.model_name
+    dir_model = args.model
+
+    metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
+
+    if args.get_outfile:
+        model_plus = load_some_model(dir_model)
+        params = Params.load(model_plus)
+        model = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = per_model_weight_count_estimation(model_plus.model.items())
+        ftype = pick_output_type(model, args.outtype)
+
+        if (metadata is None or metadata.name is None) and params.path_model is not None:
+            metadata.name = params.path_model.name
+
+        print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
+        return
+
+    if args.no_vocab and args.vocab_only:
+        raise ValueError("--vocab-only does not make sense with --no-vocab")
 
     if args.dump_single:
-        model_plus = lazy_load_file(args.model)
+        model_plus = lazy_load_file(dir_model)
         do_dump_model(model_plus)
         return
 
     if not args.vocab_only:
-        model_plus = load_some_model(args.model)
+        model_plus = load_some_model(dir_model)
     else:
-        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+        model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
 
     if args.dump:
         do_dump_model(model_plus)
         return
+
     endianess = gguf.GGUFEndian.LITTLE
     if args.big_endian:
         endianess = gguf.GGUFEndian.BIG
 
-    params = Params.load(model_plus)
-    if params.n_ctx == -1:
-        if args.ctx is None:
-            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                            "Please specify one with --ctx:\n"
-                            " - LLaMA v1: --ctx 2048\n"
-                            " - LLaMA v2: --ctx 4096\n")
-        params.n_ctx = args.ctx
+    params = None
+    if args.pad_vocab or not args.vocab_only:
+        params = Params.load(model_plus)
+        if params.n_ctx == -1:
+            if args.ctx is None:
+                msg = """\
+                    The model doesn't have a context size, and you didn't specify one with --ctx
+                    Please specify one with --ctx:
+                     - LLaMA v1: --ctx 2048
+                     - LLaMA v2: --ctx 4096"""
+                parser.error(textwrap.dedent(msg))
+            params.n_ctx = args.ctx
 
-    if args.outtype:
-        params.ftype = {
-            "f32": GGMLFileType.AllF32,
-            "f16": GGMLFileType.MostlyF16,
-            "q8_0": GGMLFileType.MostlyQ8_0,
-        }[args.outtype]
+        if args.outtype:
+            params.ftype = {
+                "f32": GGMLFileType.AllF32,
+                "f16": GGMLFileType.MostlyF16,
+                "q8_0": GGMLFileType.MostlyQ8_0,
+            }[args.outtype]
 
-    print(f"params = {params}")
+        logger.info(f"params = {params}")
 
     model_parent_path = model_plus.paths[0].parent
-    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
+    vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
     vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
+    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
+    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
 
     if args.vocab_only:
+        assert isinstance(vocab, Vocab)
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         outfile = args.outfile
+        if params is None:
+            params = Params(
+                n_vocab    = vocab.vocab_size,
+                n_embd     = 1,
+                n_layer    = 1,
+                n_ctx      = 1,
+                n_ff       = 1,
+                n_head     = 1,
+                n_head_kv  = 1,
+                f_norm_eps = 1e-5,
+            )
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
-        print(f"Wrote {outfile}")
+                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
+        logger.info(f"Wrote {outfile}")
         return
 
-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
 
-    print(f"Vocab info: {vocab}")
-    print(f"Special vocab info: {special_vocab}")
+    assert params is not None
 
+    if metadata.name is None and params.path_model is not None:
+        metadata.name = params.path_model.name
+
+    model_params_count = per_model_weight_count_estimation(model_plus.model.items())
+    logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
+
+    logger.info(f"Vocab info: {vocab}")
+    logger.info(f"Special vocab info: {special_vocab}")
     model   = model_plus.model
     model   = convert_model_names(model, params, args.skip_unknown)
     ftype   = pick_output_type(model, args.outtype)
     model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
+
+    metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
 
     params.ftype = ftype
-    print(f"Writing {outfile}, format {ftype}")
+    logger.info(f"Writing {outfile}, format {ftype}")
 
     OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
-    print(f"Wrote {outfile}")
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
+    logger.info(f"Wrote {outfile}")
 
 
 if __name__ == '__main__':
diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt
new file mode 100644
index 000000000..49ad9561c
--- /dev/null
+++ b/examples/cvector-generator/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-cvector-generator)
+add_executable(${TARGET} cvector-generator.cpp pca.hpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md
new file mode 100644
index 000000000..be4dd5250
--- /dev/null
+++ b/examples/cvector-generator/README.md
@@ -0,0 +1,45 @@
+# cvector-generator
+
+This example demonstrates how to generate a control vector using gguf models.
+
+Related PRs:
+- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+
+## Examples
+
+```sh
+# CPU only
+./cvector-generator -m ./llama-3.Q4_K_M.gguf
+
+# With GPU
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
+
+# With advanced options
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
+
+# Using mean value instead of PCA
+./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
+
+# To see help message
+./cvector-generator -h
+# Then, have a look at "cvector" section
+```
+
+## Tips and tricks
+
+If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
+
+```
+<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
+<|im_start|>system\nYou are in a very good mood today<|im_end|>
+```
+
+Example to use output file with `llama-cli`:
+
+(Tips: The control vector works better when apply to layers higher than 10)
+
+```sh
+./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
+```
diff --git a/examples/cvector-generator/completions.txt b/examples/cvector-generator/completions.txt
new file mode 100644
index 000000000..abc45ffd8
--- /dev/null
+++ b/examples/cvector-generator/completions.txt
@@ -0,0 +1,582 @@
+
+That game
+I can see
+Hmm, this
+I can relate to
+Who is
+I understand the
+Ugh,
+What the hell was
+Hey, did anyone
+Although
+Thank you for choosing
+What are you
+Oh w
+How dare you open
+It was my pleasure
+I'm hon
+I appreciate that you
+Are you k
+Whoever left this
+It's always
+Ew,
+Hey, I l
+Hello? Is someone
+I understand that
+That poem
+Aww, poor
+Hey, it
+Alright, who
+I didn't
+Well, life
+The document
+Oh no, this
+I'm concerned
+Hello, this is
+This art
+Hmm, this drink
+Hi there!
+It seems
+Is
+Good
+I can't
+Ex
+Who are
+I can see that
+Wow,
+Today is a
+Hey friend
+Sometimes friends
+Oh, this old
+The weather outside
+This place is sur
+I appreciate your input
+Thank you for the
+Look at
+I'm disappoint
+To my
+How dare you
+That's an
+This piece of art
+Eww
+This park is
+This is incredible
+Oh no, someone
+Exc
+Well, it'
+I warned
+Hey, I understand
+Hey, I saw
+How dare you go
+What the he
+Hey
+It's
+Hello? Hello?
+It
+Oh no!
+This is the perfect
+Good morning,
+Oh no, there
+It's so
+Yeah
+Uh,
+Hello everyone
+Who turned off
+The weather
+Who'
+Hey, this
+Wait,
+Eww, gross
+Excuse
+It seems like you
+Thank you so
+What happened?
+Oh my g
+I am deeply sad
+I war
+Okay, let'
+Hey, that
+That was a beautiful
+Oh no! That
+What happened
+Hey there
+The artist'
+What?!
+Hey, it'
+I am disappoint
+It seems like
+Oh no! The
+This park is a
+If you
+Yes! I did
+It sounds
+What
+Who is it
+Hmm, that
+That's strange
+Yeah, that was
+That's interesting
+This park
+What the hell
+Who is that
+I feel like my
+Oh well
+What the hell is
+Hello? Hello
+To my dearest
+Bless you!\"
+Thank you for
+Oh, looks like
+Can you please
+This place is
+Eww, what
+Bless you
+Is everything
+Hey, I just
+Whoever left these
+Well, that'
+I feel
+Hey, do you
+It's sad
+Oh no, it
+Hey, that'
+Oh my god,
+Thank you,
+Hello little one,
+I apolog
+Hey team, I
+How dare you read
+Who is this and
+Whoever left
+Hi there! W
+A
+If you have
+I was
+U
+Bless
+Well, this
+Oh, I'
+It's a
+Eww,
+Is everything okay?
+Oh, I
+Hello, can you
+Al
+That was a great
+What are
+I understand that not
+Oh no, not
+Who is it?\"
+Hey, can we
+Whoever is taking
+I would love to
+Hey, I noticed
+Hey, could
+I understand that there
+Hello?
+D
+Oh man, I
+Thank you so much
+Oh no, my
+Dear [Name
+Uh
+I remember
+Hey, who
+Well, it
+Are you
+I understand that it
+Hey, is
+I would
+Who is this
+Excuse me
+Alright
+I am thrilled
+Sometimes friends have
+Who the
+It's interesting
+I would love
+E
+Hello? Is anyone
+Well, this is
+This place
+Well,
+I warned you
+Hey, watch where
+Oh my
+That'
+Sometimes friends have different
+I understand that everyone
+What?
+What do these notes
+I can relate
+I'm not
+I understand
+To my dear
+Guys
+Well
+Hey, I appreciate
+Wow, what
+Dear
+That melody
+Who the hell
+Today is
+Hello little
+Wow, look
+That's great
+Love is never wrong
+I'm having
+Whoa, did
+Ugh
+Can you please provide
+I miss you,
+I feel uncom
+I know
+Ugh, this
+Hey, watch
+Oh great, a
+I didn
+Okay
+That game of char
+Oh
+I appreciate
+Who's there
+I am so
+Oh great, someone
+Hey, could you
+I remember wondering
+Wait, what?
+What do
+Hello? Can
+Hey there,
+That game of
+This is incred
+Oh my gosh
+Oh great, f
+I appreciate your
+It sounds like
+What the heck
+Okay, I understand
+Ew
+I understand that this
+Uh, hi
+Hi everyone!
+What the hell?
+Thank you for your
+Oh no, the
+Wow, I
+Who turned
+Dear [
+Whoever
+This is a
+Whoa, he
+What in the world
+Although the physical
+Hello, who is
+That's amaz
+Hey, I know
+Okay, that
+Hi everyone
+Hey, is everything
+I understand your fr
+Oh no, poor
+Oh, look
+Good morning
+Ew, gross
+Oh no, did
+Look at the family
+Hey team
+Yes!
+Hey, can I
+Okay, that'
+It's great
+Love is
+Hey, what
+Good morning, world
+Who is it?
+That poem really reson
+I
+That's
+I understand the task
+Gu
+Hello? Who'
+This postcard is
+Whoa,
+Oh, that
+I understand that I
+Whoever is
+Hello? Who is
+I'm really
+Wow, this
+Can
+This artwork really
+This is a shame
+I miss you too
+Who are you?
+Today is a difficult
+Hey, just
+Are you okay
+I am
+Hi,
+Wow, that
+Hey there! Can
+Okay, stay
+Oh great, just
+Yeah,
+Hello? Can you
+Oh, looks
+Thank you for sharing
+I'm glad
+Hey, is that
+Hmm
+It was my
+It sounds like you
+Wow, your
+I was promised certain
+That was such a
+Thank
+Excuse you
+That was
+Hey team,
+I feel un
+It was
+What'
+Hey friend, I
+How
+Saying goodbye
+That
+It's heart
+How dare
+Oh,
+Hello, may
+What's this
+Thank you for recogn
+Aww, that
+Oh, I remember
+Hmm, that'
+I miss
+I know this
+Wait
+Is everything okay
+Who is that person
+Wow, you
+Oh great
+I'm sad
+Wow, the
+I am very disappoint
+Who turned off the
+I understand that things
+I'm very
+Hi
+That's very
+Okay, I
+Oh no,
+Wow, there
+What's wrong
+I apologize for
+Hey, I
+Can I help you
+Oh, I didn
+Alright,
+Oh wow,
+Oh my goodness
+I know this event
+What in the
+Saying
+Yeah, that
+Guys, I
+Hey, this v
+This post
+Are
+Hey, can
+Hello? Is
+I can only imagine
+Oh, that sounds
+Hey, is anyone
+I am disappointed
+Hello,
+Hey everyone, I
+That was such
+It's okay
+The artist
+Whoa
+I understand that mistakes
+Can I help
+Who
+Hi everyone! I
+Hey, can you
+Wow, how
+Today
+Oh no, I
+Oh well, I
+Well, that
+This is the
+Yes! I finally
+Hey there little
+Hello everyone!
+Love is never
+Look at the
+This postcard
+Oh great,
+Can I
+Hmm, this is
+I understand your
+Oh, look at
+B
+I'm so
+Whoa, this
+W
+Oh, this
+Sometimes
+This piece of
+What the
+That was a
+Hey, do
+Oh no
+Whoa, what
+I feel like I
+The documentary
+Hello
+Hello little one
+I understand that my
+Eww, that
+Wow, an
+Yes! Finally,
+Although the physical location
+Whoever is watching
+That movie
+I remember wondering about
+Hey there, little
+Who's
+Hello, who
+Hello everyone! Thank
+Hello, can
+That's too
+Hey, just wanted
+Hey there, I
+Saying good
+Hey there!
+Who is there?
+Oh my good
+I am very
+Oh no, what
+Wow, thank
+I was promised
+Hi, is
+Hey, I'
+Guys, the
+Oh no, that
+Who is there
+Hello, this
+That movie really touched
+If you have something
+The documentary was
+I'm starting
+Are you kidd
+That movie really
+Hey everyone,
+Thank you for considering
+I didn'
+Yes! I
+Can you
+Oh my god
+Hey, whoever
+That melody really
+Thank you, little
+Hello, may I
+Look
+Wow, we
+It looks
+What do these
+Oh wow
+I apologize
+What are you all
+It's such
+It's clear
+Hey, I was
+Hey friend,
+I can only
+The weather outside is
+Eww, this
+I miss you
+Wow
+Aww,
+Hi, is there
+This artwork
+Okay,
+Oh well,
+This
+I'
+Say
+Hey there little gu
+Hmm,
+Whoa, who
+I am thr
+Oh man
+Okay, stay calm
+I'm happy
+Oh, this cur
+Oh man,
+I'm sorry
+Hello? Who
+What?! That
+This piece
+Hey everyone
+That's so
+Are you okay?
+What happened? Where
+Hi there
+The
+Who the hell entered
+I can
+Guys,
+What's
+What in
+It's important
+I'm
+I'm coming
+It'
+Yes! Finally
+Wait, what
+Wow, reading
+I'm surprised
+Hey, did
+Hey,
+Okay, let
+I understand that you
+Who the hell threw
+Eww, who
+Thank you for thinking
+Who is this?\"
+I am deeply
+Thank you for including
+Oh no, an
+It looks like you
+Aww
+I'm confused
+Wow, it
+That poem really
+Yes
+Hey there, is
+Hey, what'
+Thank you for remember
+To
+This is
+Thank you for making
+I can'
+That mel
+Wow, they
+I feel like
+Although the
+Who are you
+Love
+If
+What the hell are
+I am so sad
+Oh, I found
+Thank you
+It looks like
+Well, life is
+I appreciate that
+The artist's
+Whoa, that
+It's never
\ No newline at end of file
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
new file mode 100644
index 000000000..413b71d34
--- /dev/null
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -0,0 +1,506 @@
+#include "ggml.h"
+#include "gguf.h"
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+#include "pca.hpp"
+#include "mean.hpp"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+
+//////////////////////////////////////////////////
+// utils
+
+template <class Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string ret;
+    for (; begin != end; ++begin) {
+        ret += common_token_to_piece(ctx, *begin);
+    }
+
+    return ret;
+}
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
+    printf("\n");
+}
+
+//////////////////////////////////////////////////
+
+
+// cb_eval is reused for each pair of positive - negative prompt
+struct callback_data {
+    ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered
+
+    int n_layers = 0;
+    int n_tokens = 0;
+    bool is_eval_pos = true;
+
+    // each element of the vector correspond to one layer
+    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_diff_filtered;   // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
+
+    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
+    void save_tensor_for_layer(struct ggml_tensor * t) {
+        GGML_ASSERT(t->type == GGML_TYPE_F32);
+
+        if (ctx_ggml == nullptr) {
+            // alloc a new ctx_ggml if needed
+            struct ggml_init_params params_ggml = {
+                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ctx_ggml = ggml_init(params_ggml);
+        }
+
+        // copy tensor data
+        auto n_bytes = ggml_nbytes(t);
+        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
+        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
+        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
+        ggml_set_name(t_layer, ggml_get_name(t));
+        //print_debug_tensor(t_layer);
+
+        if (is_eval_pos) {
+            v_pos.push_back(t_layer);
+        } else {
+            v_neg.push_back(t_layer);
+        }
+    }
+
+    // calculate diff (v_pos - v_neg) and place the result back to v_pos
+    // all zero rows in the diff tensor will also be removed
+    // NOTE: final layer is ignored. we only have (n_layers - 1) to process
+    std::vector<struct ggml_tensor *> calc_diff() {
+        for (float il = 0; il < v_pos.size(); il++) {
+            float * a = (float *) v_pos[il]->data;
+            float * b = (float *) v_neg[il]->data;
+            size_t n_elem = ggml_nelements(v_pos[il]);
+            for (size_t j = 0; j < n_elem; j++) {
+                a[j] -= b[j];
+            }
+            //print_debug_tensor(v_pos[i]);
+            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
+            v_diff_filtered.push_back(diff_filtered);
+        }
+        return v_diff_filtered; // for convinient, we return the result std::vector
+    }
+
+    // delete zero rows from a given 2D tensor
+    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
+        //printf("filter_nonzero_rows\n");
+        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
+            // check if given row containing all zero elements
+            int n_cols = t->ne[0]; // hint: should be equal to n_embd
+            for (int col = 0; col < n_cols; ++col) {
+                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
+                    return false;
+                }
+            }
+            return true;
+        };
+        std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
+        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
+            if (!is_row_all_zeros(a, i_row, 1e-6)) {
+                rows_to_copy.push_back(i_row);
+            }
+        }
+
+        // get "n_nonzero_rows" for the output "diff_filtered"
+        int n_nonzero_rows = rows_to_copy.size();
+        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        int n_embd = a->ne[0];
+        GGML_ASSERT(n_nonzero_rows > 0);
+
+        // diff_filtered: [n_embd, n_nonzero_rows]
+        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
+            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
+        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
+        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
+
+        // copy non-zero rows
+        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
+            int src_row = rows_to_copy[dest_row];
+            for (int i = 0; i < n_embd; i++) {
+                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
+                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
+            }
+        }
+
+        //print_debug_tensor(diff_filtered);
+
+        return diff_filtered;
+    }
+
+    // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
+    void reset() {
+        for (auto ptr : v_pos) free(ptr->data);
+        for (auto ptr : v_neg) free(ptr->data);
+        for (auto ptr : v_diff_filtered) free(ptr->data);
+        v_pos.clear();
+        v_neg.clear();
+        v_diff_filtered.clear();
+        if (ctx_ggml) {
+            ggml_free(ctx_ggml);
+        }
+        ctx_ggml = nullptr;
+    }
+};
+
+/**
+ * process_ctx is used to store the ggml context for pre-post processing the diff vectors
+ * in short, input => v_diff and output => v_final
+ */
+struct train_context {
+    ggml_context * ctx_ggml;
+    int n_embd;
+    int n_layers;
+
+    /* pair of prompts to be used for generating final vector */
+    std::vector<std::string> positive_entries;
+    std::vector<std::string> negative_entries;
+
+    // each element of the vector correspond to one layer
+    // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
+    // NOTE (2): v_diff is transposed from v_diff_tmp
+    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
+
+    // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
+    // v_diff_tmp will get converted unto v_diff later on
+    std::vector<std::vector<uint8_t>> v_diff_tmp;
+
+    train_context(int n_embd_, int n_layers_) {
+        n_embd = n_embd_;
+        n_layers = n_layers_;
+        struct ggml_init_params params_ggml = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_ggml = ggml_init(params_ggml);
+        for (int il = 0; il < n_layers - 1; il++) {
+            std::vector<uint8_t> empty;
+            v_diff_tmp.push_back(empty);
+            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+            v_final.push_back(t);
+        }
+    }
+
+    // add new rows into existing tensor in v_diff_tmp
+    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
+        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto t = diff_filtered[il];
+            auto & diff_tmp = v_diff_tmp[il];
+            size_t curr_size = diff_tmp.size();
+            diff_tmp.resize(curr_size + ggml_nbytes(t));
+            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
+        }
+    }
+
+    // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
+    // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
+    void build_v_diff(bool transpose) {
+        printf("build_v_diff\n");
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto & diff_tmp = v_diff_tmp[il];
+            int n_elem = diff_tmp.size() / sizeof(float);
+            GGML_ASSERT(n_elem % n_embd == 0);
+            int n_rows = n_elem / n_embd;
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
+            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
+            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            if (transpose) {
+                // copy data & transpose
+                float * arr = (float *) diff_tmp.data();
+                for (int ir = 0; ir < n_rows; ++ir) {
+                    for (int ic = 0; ic < n_embd; ++ic) {
+                        float f = arr[ir*n_embd + ic];
+                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                    }
+                }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
+            }
+            v_diff.push_back(diff);
+            print_debug_tensor(diff);
+            // free memory of diff_tmp
+            diff_tmp.resize(0);
+        }
+    }
+
+    ~train_context() {
+        for (auto ptr : v_final) free(ptr->data);
+        for (auto ptr : v_diff) free(ptr->data);
+        // no need to free v_diff_tmp, since we didn't use malloc
+        ggml_free(ctx_ggml);
+    }
+};
+
+struct tokenized_prompt {
+    std::vector<llama_token> tokens_pos;
+    std::vector<llama_token> tokens_neg;
+    size_t max_seq_len;
+
+    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const bool add_bos = llama_vocab_get_add_bos(vocab);
+        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
+        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
+        padding_seq(ctx, tokens_pos, max_seq_len);
+        padding_seq(ctx, tokens_neg, max_seq_len);
+    }
+
+    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
+        // TODO: customize padding token
+        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
+        llama_token pad_tok = pad_tokens.back();
+        while (tokens.size() < len) {
+            tokens.push_back(pad_tok);
+        }
+    }
+};
+
+//////////////////////////////////////////////////
+
+template <typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
+    std::vector<std::string> output;
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
+        exit(1);
+    }
+    std::string line;
+    while (std::getline(file, line)) {
+        bool is_skip = skip_empty_lines && line.empty();
+        if (!is_skip) {
+            string_process_escapes(line);
+            output.push_back(line);
+        }
+    }
+    file.close();
+    return output;
+}
+
+//////////////////////////////////////////////////
+
+static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+    static const char * l_out_name = "l_out";
+    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
+
+    if (ask) {
+        return is_l_out;
+    }
+
+    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
+        return true;
+    }
+
+    // save the tensor to current context
+    cb_data->save_tensor_for_layer(t);
+    return true;
+}
+
+static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
+    llama_kv_cache_clear(ctx);
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return false;
+    }
+    return true;
+}
+
+static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    const std::string arch = "controlvector";
+    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
+    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
+    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
+
+    for (size_t i = 0; i < v_ctrl.size(); ++i) {
+        gguf_add_tensor(ctx, v_ctrl[i]);
+        print_debug_tensor(v_ctrl[i]);
+        printf("Added tensor: %s\n", v_ctrl[i]->name);
+    }
+
+    printf("%s: writing file...\n", __func__);
+    gguf_write_to_file(ctx, fname.c_str(), false);
+    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
+    gguf_free(ctx);
+}
+
+/**
+ * Load prompt files and completion file.
+ * Then format each pair of prompt + completion to make an entry.
+ */
+static int prepare_entries(common_params & params, train_context & ctx_train) {
+    // load prompts
+    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
+    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
+    if (positive_prompts.size() != negative_prompts.size()) {
+        fprintf(stderr, "number of positive and negative prompts must be equal\n");
+        return 1;
+    }
+    if (positive_prompts.empty()) {
+        fprintf(stderr, "must provide at least one prompt pair\n");
+        return 1;
+    }
+    ctx_train.positive_entries = positive_prompts;
+    ctx_train.negative_entries = negative_prompts;
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+        return 1;
+    }
+
+    if (params.n_pca_iterations % params.n_pca_batch != 0) {
+        fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
+        return 1;
+    }
+
+
+    callback_data cb_data;
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = cb_eval;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    print_build_info();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model to get hparams
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    // int n_ctx = llama_n_ctx(ctx);
+    int n_layers = llama_model_n_layer(model);
+    int n_embd = llama_model_n_embd(model);
+
+    // get model hint param (a.k.a model arch name)
+    char model_hint[128];
+    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
+
+    // init train_context
+    train_context ctx_train(n_embd, n_layers);
+
+    // load and prepare entries for training
+    prepare_entries(params, ctx_train);
+
+    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
+    std::vector<tokenized_prompt> tokenized_prompts;
+    size_t n_total_tokens = 0;
+    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
+        n_total_tokens += 2 * t.max_seq_len;
+        tokenized_prompts.push_back(std::move(t));
+    }
+
+    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
+
+    for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+        bool success = false;
+        tokenized_prompt t = tokenized_prompts[i];
+        cb_data.n_layers = n_layers;
+        cb_data.n_tokens = t.max_seq_len;
+
+        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
+            (int) i+1, (int) ctx_train.positive_entries.size(),
+            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
+            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
+            (int) t.max_seq_len);
+
+        cb_data.is_eval_pos = true;
+        success = get_hidden_layers(ctx, t.tokens_pos);
+        if (!success) break;
+
+        cb_data.is_eval_pos = false;
+        success = get_hidden_layers(ctx, t.tokens_neg);
+        if (!success) break;
+
+        // calculate diff and remove all zero rows
+        auto v_diff_filtered = cb_data.calc_diff();
+
+        // save & concat the filtered v_diff to ctx_train
+        ctx_train.concat_diff_tmp(v_diff_filtered);
+
+        // reset for next iteration
+        cb_data.reset();
+    }
+
+    // done with the model, we can now free it to make gain some memory
+    printf("Done evaluate prompts, unload model...\n");
+
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
+
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff(use_pca);
+
+    if (use_pca) {
+        // run PCA
+        PCA::pca_params pca_params;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_iterations = params.n_pca_iterations;
+        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }
+
+    // write output vectors to gguf
+    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp
new file mode 100644
index 000000000..4eeac1eeb
--- /dev/null
+++ b/examples/cvector-generator/mean.hpp
@@ -0,0 +1,48 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+
+namespace mean {
+
+static void run(
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running mean...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
+
+        // calculate mean vector
+        struct ggml_tensor * t_layer = v_input[il];
+        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
+        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
+            float f = 0.0;
+            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
+                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
+            }
+            f /= t_layer->ne[1];
+            ggml_set_f32_1d(ctrl_out, ic, f);
+        }
+
+        // normalize output vector
+        float norm = 0.0;
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            norm += f*f;
+        }
+        norm = sqrt(norm);
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            ggml_set_f32_1d(ctrl_out, i, f / norm);
+        }
+
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt
new file mode 100644
index 000000000..45b9384b3
--- /dev/null
+++ b/examples/cvector-generator/negative.txt
@@ -0,0 +1,4 @@
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
+<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
\ No newline at end of file
diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
new file mode 100644
index 000000000..e88bbdde9
--- /dev/null
+++ b/examples/cvector-generator/pca.hpp
@@ -0,0 +1,315 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <ctime>
+#include <random>
+#include <string>
+#include <vector>
+
+#define DEBUG_POS 5
+
+static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
+    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
+    if (!with_data) return;
+    printf("%s: %s[0] = [", __func__, t->name);
+    for (size_t i = 0; i <= DEBUG_POS; i++) {
+        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
+    }
+    printf(" ... ]\n");
+}
+
+namespace PCA {
+
+// input params for PCA computations
+struct pca_params {
+    int n_threads = 1;
+    int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
+    int n_iterations = 1000;
+    float tolerance = 1e-7;
+
+    // for debugging
+    int i_layer = 0;
+    int n_layers = 0;
+};
+
+// result from each iteration
+struct pca_result {
+    struct ggml_tensor * calculated_square = NULL;
+    std::vector<struct ggml_tensor *> eigenvectors;
+    std::vector<float> distances;
+};
+
+struct pca_model {
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx;      // context to compute graph on target device
+    struct ggml_context * ctx_host; // host context to store results
+
+    // tensors on target device
+    struct ggml_tensor * dev_input;
+    struct ggml_tensor * dev_square;
+    struct ggml_tensor * dev_eigenvector;
+
+    pca_model(struct ggml_tensor * t_input) {
+#ifdef GGML_USE_CUDA
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0); // init device 0
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+#endif
+
+// TODO: enable Metal support when support for GGML_OP_SQRT is added
+// #ifdef GGML_USE_METAL
+//         fprintf(stderr, "%s: using Metal backend\n", __func__);
+//         backend = ggml_backend_metal_init();
+//         if (!backend) {
+//             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+//         }
+// #endif
+
+        // if there aren't GPU Backends fallback to CPU backend
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
+        }
+
+        const int num_tensors = 4;
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx = ggml_init(params);
+
+        auto n_samples = t_input->ne[0];
+        auto n_embd    = t_input->ne[1];
+
+        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,    n_embd);
+        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        ggml_set_name(dev_input,       "dev_input");
+        ggml_set_name(dev_square,      "dev_square");
+        ggml_set_name(dev_eigenvector, "dev_eigenvector");
+        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
+
+        // initialize eigenvector to random normalized vector
+        {
+            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+            std::uniform_real_distribution<float> distribution(0.0, 1.0);
+            float sum_sqr = 0.0; // for normalizing random_vec
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                float f = distribution(generator);
+                sum_sqr += f * f;
+                random_vec[i] = f;
+            }
+            // normalize it
+            float random_vec_norm = std::sqrt(sum_sqr);
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                random_vec[i] /= random_vec_norm;
+            }
+            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+        }
+    }
+
+    ~pca_model() {
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_backend_free(backend);
+    }
+};
+
+static struct ggml_cgraph * build_graph_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        bool calc_square = false) {
+    GGML_ASSERT(params.n_batch > 0);
+    // TODO: buf_size must be able to scale with params.n_batch
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+    // create a temporally context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // turn v_diff_original into square matrix if needed
+    struct ggml_tensor * tmp_square;
+    if (calc_square) {
+        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+        ggml_set_name(tmp_square, "tmp_square");
+    }
+
+    struct ggml_tensor * b_tensor;
+    struct ggml_tensor * distance;
+    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
+    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
+
+    for (int i = 0; i < params.n_batch; ++i) {
+        // b_tensor = square * eigenvector^T
+        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
+        ggml_set_name(b_tensor, "b_tensor");
+
+        // normalize
+        b_tensor = ggml_div_inplace(ctx0,
+            b_tensor,
+            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
+        );
+        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
+
+        // calculate distance(new eigenvector - old eigenvector)
+        // we don't use ggml_sub because it may not be implemented on GPU backend
+        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
+        distance = ggml_sqrt_inplace(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+        ggml_format_name(distance, "distance_%d", i);
+
+        old_eigen = b_tensor;
+
+        // build operations nodes
+        ggml_build_forward_expand(gf, distance);
+    }
+
+    // delete the temporally context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+static ggml_status compute_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        struct ggml_cgraph * gf,
+        ggml_gallocr_t allocr,
+        struct pca_result & result) {
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
+    }
+
+    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+    if (res == GGML_STATUS_SUCCESS) {
+        auto extract_i = [](std::string prefix, std::string str) -> int {
+            int i = -1;
+            if (str.rfind(prefix, 0) == 0) {
+                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+            }
+            return i;
+        };
+        result.calculated_square = NULL;
+        result.eigenvectors.clear();
+        result.distances.clear();
+        result.eigenvectors.resize(params.n_batch);
+        result.distances.resize(params.n_batch);
+        // get output nodes
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            auto node = ggml_graph_node(gf, i);
+            int iter = -1;
+            // find b_tensor (without copying data from device)
+            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                result.eigenvectors[iter] = node;
+            }
+            // find distances, then copy data from device
+            if ((iter = extract_i("distance_", node->name)) > -1) {
+                float d;
+                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                result.distances[iter] = d;
+                // std::cout << node->name << " = " << d << "\n";
+            }
+            // find tmp_square if it exists (without copying data from device)
+            if (std::string(node->name) == "tmp_square") {
+                result.calculated_square = node;
+            }
+        }
+    }
+    return res;
+}
+
+static void power_iteration(
+        const struct pca_params & params,
+        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+        struct ggml_tensor * output) {
+    //printf("in power iteration\n");
+    struct pca_model model(input);
+
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    struct pca_result result;
+    struct ggml_tensor * last_eigenvector = NULL;
+
+    int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iters; ++iter) {
+        bool calc_square = (iter == 0); // only need to calculate square for first iteration
+        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
+        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+        compute_piter(params, model, gf, allocr, result);
+
+        for (size_t k = 0; k < result.distances.size(); ++k) {
+            last_eigenvector = result.eigenvectors[k];
+            if (result.distances[k] < params.tolerance) {
+                break; // done
+            }
+        }
+
+        if (calc_square) {
+            // copy and store the square matrix if needed
+            GGML_ASSERT(result.calculated_square != NULL);
+            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
+        }
+
+        {
+            // copy last eigen vector and store as input for next iteration
+            GGML_ASSERT(last_eigenvector != NULL);
+            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
+        }
+
+        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
+    }
+
+    // get output tensor
+    GGML_ASSERT(last_eigenvector);
+    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
+    //print_debug_tensor(output);
+    ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
+}
+
+static void run_pca(
+        struct pca_params & params,
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running PCA...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
+
+        // run power_iteration
+        params.i_layer = il;
+        params.n_layers = v_input.size();
+        power_iteration(params, v_input[il], ctrl_out);
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
diff --git a/examples/cvector-generator/positive.txt b/examples/cvector-generator/positive.txt
new file mode 100644
index 000000000..fea736225
--- /dev/null
+++ b/examples/cvector-generator/positive.txt
@@ -0,0 +1,4 @@
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
\ No newline at end of file
diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md
new file mode 100644
index 000000000..59918ec2b
--- /dev/null
+++ b/examples/deprecation-warning/README.md
@@ -0,0 +1,49 @@
+# Migration notice for binary filenames
+
+> [!IMPORTANT]
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
+This migration was important, but it is a breaking change that may not always be immediately obvious to users.
+
+Please update all scripts and workflows to use the new binary names.
+
+| Old Filename | New Filename |
+| ---- | ---- |
+| main | llama-cli |
+| server | llama-server |
+| llama-bench | llama-bench |
+| embedding | llama-embedding |
+| quantize | llama-quantize |
+| tokenize | llama-tokenize |
+| export-lora | llama-export-lora |
+| libllava.a | libllava.a |
+| baby-llama | llama-baby-llama |
+| batched | llama-batched |
+| batched-bench | llama-batched-bench |
+| benchmark-matmult | llama-benchmark-matmult |
+| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
+| eval-callback | llama-eval-callback |
+| gbnf-validator | llama-gbnf-validator |
+| gguf | llama-gguf |
+| gguf-split | llama-gguf-split |
+| gritlm | llama-gritlm |
+| imatrix | llama-imatrix |
+| infill | llama-infill |
+| llava-cli | llama-llava-cli |
+| lookahead | llama-lookahead |
+| lookup | llama-lookup |
+| lookup-create | llama-lookup-create |
+| lookup-merge | llama-lookup-merge |
+| lookup-stats | llama-lookup-stats |
+| parallel | llama-parallel |
+| passkey | llama-passkey |
+| perplexity | llama-perplexity |
+| q8dot | llama-q8dot |
+| quantize-stats | llama-quantize-stats |
+| retrieval | llama-retrieval |
+| save-load-state | llama-save-load-state |
+| simple | llama-simple |
+| speculative | llama-speculative |
+| vdot | llama-vdot |
+| tests/test-c.o | tests/test-c.o |
+
diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp
new file mode 100644
index 000000000..c2958ea12
--- /dev/null
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of("/\\");
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index 8ffc33868..809040307 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET embedding)
+set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/embedding/README.md b/examples/embedding/README.md
index 6929454c5..12b372bf1 100644
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -9,13 +9,52 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
 ```
 
 ### Windows:
 
 ```powershell
-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
 ```
 
 The above command will output space-separated float values.
+
+## extra parameters
+### --embd-normalize $integer$
+| $integer$ | description         | formula |
+|-----------|---------------------|---------|
+| $-1$      | none                |
+| $0$       | max absolute int16  | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
+| $1$       | taxicab             | $\Large{x_i \over\sum \lvert x_i\rvert}$
+| $2$       | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
+| $>2$      | p-norm              | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
+
+### --embd-output-format $'string'$
+| $'string'$ | description                  |  |
+|------------|------------------------------|--|
+| ''         | same as before               | (default)
+| 'array'    | single embeddings            | $[[x_1,...,x_n]]$
+|            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
+| 'json'     | openai style                 |
+| 'json+'    | add cosine similarity matrix |
+
+### --embd-separator $"string"$
+| $"string"$   | |
+|--------------|-|
+| "\n"         | (default)
+| "<#embSep#>" | for exemple
+| "<#sep#>"    | other exemple
+
+## examples
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index acff715e9..38d22c90f 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <ctime>
@@ -7,153 +9,200 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
-    std::stringstream ss(s);
-    while (std::getline(ss, line)) {
-        lines.push_back(line);
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
     }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
     return lines;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
-static void normalize(float * vec, float * out, int n) {
-    float norm = 0;
-    for (int i = 0; i < n; i++) {
-        norm += vec[i] * vec[i];
-    }
-    norm = sqrt(norm);
-    for (int i = 0; i < n; i++) {
-        out[i] = vec[i] / norm;
-    }
-}
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    const struct llama_model * model = llama_get_model(ctx);
 
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+        // encoder-only model
+        if (llama_encode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to encode\n", __func__);
+        }
+    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        // decoder-only model
+        if (llama_decode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to decode\n", __func__);
+        }
     }
 
-    // normalize on copy
-    for (int k = 0; k < n_seq; k++) {
-        float * emb = llama_get_embeddings_ith(ctx, k);
-        float * out = output + k * n_embd;
-        normalize(emb, out, n_embd);
+    for (int i = 0; i < batch.n_tokens; i++) {
+        if (!batch.logits[i]) {
+            continue;
+        }
+
+        const float * embd = nullptr;
+        int embd_pos = 0;
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            // try to get token embeddings
+            embd = llama_get_embeddings_ith(ctx, i);
+            embd_pos = i;
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            embd_pos = batch.seq_id[i][0];
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
+
+        float * out = output + embd_pos * n_embd;
+        common_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
+    common_init();
+
     params.embedding = true;
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
+    // For non-causal models, batch size must be equal to ubatch size
+    params.n_ubatch = params.n_batch;
 
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
 
     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch == params.n_ctx);
+    GGML_ASSERT(params.n_batch >= params.n_ctx);
 
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = common_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
-            inp.resize(n_batch);
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                    __func__, (long long int) inp.size(), (long long int) n_batch);
+            return 1;
         }
         inputs.push_back(inp);
     }
 
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
         }
     }
 
     // initialize batch
     const int n_prompts = prompts.size();
-    struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
+    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+    // count number of embeddings
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
 
     // allocate output
-    const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    const int n_embd = llama_model_n_embd(model);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();
 
     // break into batches
-    int p = 0; // number of prompts processed already
+    int e = 0; // number of embeddings already stored
     int s = 0; // number of prompts in current batch
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
         auto & inp = inputs[k];
+
         const uint64_t n_toks = inp.size();
 
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
-            llama_batch_clear(batch);
-            p += s;
+            float * out = emb + e * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
+            common_batch_clear(batch);
         }
 
         // add to batch
@@ -162,23 +211,114 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    float * out = emb + e * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
-    // print first 3 embeddings
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+    if (params.embd_out.empty()) {
+        LOG("\n");
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            for (int j = 0; j < n_embd_count; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG(" ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+            }
+        } else {
+            // print the first part of the embeddings or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+
+            // print cosine similarity matrix
+            if (n_prompts > 1) {
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    LOG("%6.6s ", prompts[i].c_str());
+                }
+                LOG("\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
+                    }
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
+                }
+            }
         }
-        fprintf(stderr, "\n\n");
     }
-    fprintf(stderr, "\n");
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        LOG(notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            LOG("[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) LOG(","); else break;
+            }
+            LOG(notArray ? "]\n    }" : "]");
+            j++;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
+        }
+        LOG(notArray ? "\n  ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            LOG(",\n  \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
+                LOG("    [");
+                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
+                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    LOG("%6.2f", sim);
+                    j++;
+                    if (j < n_embd_count) LOG(", "); else break;
+                }
+                LOG(" ]");
+                i++;
+                if (i < n_embd_count) LOG(",\n"); else break;
+            }
+            LOG("\n  ]");
+        }
+
+        if (notArray) LOG("\n}\n");
+    }
+
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     // clean up
-    llama_print_timings(ctx);
-    llama_free(ctx);
-    llama_free_model(model);
+    llama_batch_free(batch);
     llama_backend_free();
 
     return 0;
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
new file mode 100644
index 000000000..95915ed91
--- /dev/null
+++ b/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(TARGET llama-eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET}
+        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md
new file mode 100644
index 000000000..63a57ad6b
--- /dev/null
+++ b/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use callback during the inference.
+It simply prints to the console all operations and tensor data.
+
+Usage:
+
+```shell
+llama-eval-callback \
+  --hf-repo ggml-org/models \
+  --hf-file phi-2/ggml-model-q4_0.gguf \
+  --model phi-2-q4_0.gguf \
+  --prompt hello \
+  --seed 42 \
+  -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx      = 512
+...
+llama_new_context_with_model:      CUDA0 compute buffer size =   105.00 MiB
+llama_new_context_with_model:  CUDA_Host compute buffer size =     6.01 MiB
+llama_new_context_with_model: graph nodes  = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug:                 inp_embd = (f32)   GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.0181,   0.0272,   0.0272, ...],
+                                      ],
+                                     ]
+ggml_debug:                   norm-0 = (f32)       NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.6989,   1.0636,   1.0636, ...],
+                                      ],
+                                     ]
+ggml_debug:                 norm_w-0 = (f32)        MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.1800,   0.2817,   0.2632, ...],
+                                      ],
+                                     ]
+ggml_debug:              attn_norm-0 = (f32)        ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.1863,   0.2970,   0.2604, ...],
+                                      ],
+                                     ]
+ggml_debug:                   wqkv-0 = (f32)    MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1238,   1.2876,  -1.8086, ...],
+                                      ],
+                                     ]
+ggml_debug:                   bqkv-0 = (f32)        ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:            bqkv-0 (view) = (f32)       VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:                   Qcur-0 = (f32)       CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:        Qcur-0 (reshaped) = (f32)    RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                       [ -0.3608,   0.5076,  -1.8866, ...],
+                                       [  1.7643,   0.0273,  -2.1065, ...],
+                                       ...
+                                      ],
+                                     ]
+ggml_debug:                   Qcur-0 = (f32)       ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                       [ -0.3608,   0.5076,  -1.8866, ...],
+                                       [  1.7643,   0.0273,  -2.1065, ...],
+                                       ...
+                                      ],
+                                     ]
+```
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
new file mode 100644
index 000000000..fb188f5a9
--- /dev/null
+++ b/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,194 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+/**
+ * This the arbitrary data which will be passed to each callback.
+ * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+    std::vector<uint8_t> data;
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    GGML_ASSERT(n > 0);
+    float sum = 0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        LOG("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) {
+                LOG("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            LOG("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) {
+                    LOG("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                LOG("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) {
+                        LOG("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) &data[i];
+                    } else {
+                        GGML_ABORT("fatal error");
+                    }
+                    LOG("%12.4f", v);
+                    sum += v;
+                    if (i0 < ne[0] - 1) LOG(", ");
+                }
+                LOG("],\n");
+            }
+            LOG("                                      ],\n");
+        }
+        LOG("                                     ]\n");
+        LOG("                                     sum = %f\n", sum);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    char src1_str[128] = {0};
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+         t->name, ggml_type_name(t->type), ggml_op_desc(t),
+         src0->name, ggml_ne_string(src0).c_str(),
+         src1 ? src1_str : "",
+         ggml_ne_string(t).c_str());
+
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type)) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+static bool run(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
+        LOG_ERR("%s : failed to eval\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    callback_data cb_data;
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = ggml_debug;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    // init
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    if (model == nullptr || ctx == nullptr) {
+        LOG_ERR("%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+    }
+
+    bool OK = run(ctx, params);
+    if (!OK) {
+        return 1;
+    }
+
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt
index cbbdaec67..310455787 100644
--- a/examples/export-lora/CMakeLists.txt
+++ b/examples/export-lora/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET export-lora)
+set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 0cf3e8e45..7dce99c9a 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -3,24 +3,31 @@
 Apply LORA adapters to base model and export the resulting model.
 
 ```
-usage: export-lora [options]
+usage: llama-export-lora [options]
 
 options:
-  -h, --help                         show this help message and exit
-  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
-  -o FNAME, --model-out FNAME        path to save exported model (default '')
-  -l FNAME, --lora FNAME             apply LoRA adapter
-  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
-  -t N, --threads N                  number of threads to use during computation (default: 4)
+  -m,    --model                  model path from which to load base model (default '')
+         --lora FNAME             path to LoRA adapter  (can be repeated to use multiple adapters)
+         --lora-scaled FNAME S    path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)
+  -t,    --threads N              number of threads to use during computation (default: 4)
+  -o,    --output FNAME           output file (default: 'ggml-lora-merged-f16.gguf')
 ```
 
 For example:
 
 ```bash
-./bin/export-lora \
-    -m open-llama-3b-v2-q8_0.gguf \
-    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+./bin/llama-export-lora \
+    -m open-llama-3b-v2.gguf \
+    -o open-llama-3b-v2-english2tokipona-chat.gguf \
+    --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
 ```
 
-Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
+
+```bash
+./bin/llama-export-lora \
+    -m your_base_model.gguf \
+    -o your_merged_model.gguf \
+    --lora-scaled lora_task_A.gguf 0.5 \
+    --lora-scaled lora_task_B.gguf 0.5
+```
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 08413f57e..91238e4be 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,462 +1,432 @@
-
-#include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "gguf.h"
 
+#include "arg.h"
+#include "common.h"
+
+#include <map>
 #include <vector>
 #include <string>
-#include <thread>
+#include <fstream>
 
-struct lora_info {
-    std::string filename;
+static bool g_verbose = false;
+
+struct tensor_transformation {
+    struct ggml_tensor * in;
+    struct ggml_tensor * out;
+    bool is_copy;
+};
+
+static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+}
+
+static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+}
+
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ ctx_ggml,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load input GGUF from " + fname);
+    }
+    return ctx_gguf;
+}
+
+struct file_input {
+    struct ggml_context * ctx_meta = nullptr;
+    struct gguf_context * ctx_gguf = nullptr;
+    std::ifstream f_in;
+    std::map<std::string, ggml_tensor *> tensors;
+    float alpha;
     float scale;
+
+    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+        if (!f_in.is_open()) {
+            throw std::runtime_error("failed to open input gguf from " + fname);
+        }
+
+        ctx_gguf = load_gguf(fname, &ctx_meta);
+        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
+        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
+
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
+            std::string name(cur->name);
+            tensors[name] = cur;
+            if (g_verbose) {
+                printf("%s: %s\n", __func__, cur->name);
+            }
+        }
+    }
+
+    ggml_tensor * get_tensor(std::string name) {
+        if (tensors.find(name) == tensors.end()) {
+            return nullptr;
+        }
+        return tensors[name];
+    }
+
+    void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("cannot find tensor with name: " + name);
+        }
+        auto len = ggml_nbytes(tensors[name]);
+        if (buf.size() < len) {
+            buf.resize(len);
+        }
+        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+        f_in.seekg(offset);
+        f_in.read((char* )buf.data(), len);
+    }
+
+    ~file_input() {
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
+    }
 };
 
-struct export_lora_params {
-    std::string fn_model_base;
-    std::string fn_model_out;
-    std::vector<struct lora_info> lora;
+struct lora_merge_ctx {
+    // input base model + adapters
+    file_input base_model;
+    std::vector<std::unique_ptr<file_input>> adapters;
+
+    // for computing merged tensor
     int n_threads;
-};
+    ggml_backend_t backend = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    std::vector<uint8_t> read_buf;
 
-struct lora_data {
-    struct lora_info     info;
-    std::vector<uint8_t> data;
-    struct ggml_context * ctx;
+    // output file
+    struct gguf_context * ctx_out;
+    struct ggml_context * ctx_out_ggml;
+    std::ofstream fout;
 
-    uint32_t lora_r;
-    uint32_t lora_alpha;
-};
+    lora_merge_ctx(
+            std::string & base_fname,
+            std::vector<common_adapter_lora_info> & lora_files,
+            std::string & outfile,
+            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
+        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
+            throw std::runtime_error("split model is not yet supported");
+        }
 
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
+        for (auto & lora_inp : lora_files) {
+            auto fname = lora_inp.path;
+            auto scale = lora_inp.scale;
+            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
+            check_metadata_lora(adapter.get());
+            adapters.push_back(std::move(adapter));
+        }
+
+        ctx_out = gguf_init_empty();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_out_ggml = ggml_init(params);
+        backend = ggml_backend_cpu_init();
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
+
+    void check_metadata_lora(file_input * adapter) {
+        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
+        auto general_arch_lora = get_kv_str(adapter->ctx_gguf,   "general.architecture");
+        if (general_arch_base != general_arch_lora) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+    }
+
+    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
+        if (t->type == GGML_TYPE_F32) {
+            return GGML_TYPE_F32;
         } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
+            return GGML_TYPE_F16;
         }
     }
 
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
+    void run_merge() {
+        // prepare metadata
+        gguf_set_kv(ctx_out, base_model.ctx_gguf);
+        // output is forced to f16 for now
+        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
 
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // check if all lora adapters have the same tensors
+        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
+        if (adapters.size() > 1) {
+            for (size_t i = 1; i < adapters.size(); ++i) {
+                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
+                    throw std::runtime_error(err_no_subset_adapter);
+                }
+                for (auto & it : adapters[i]->tensors) {
+                    if (adapters[0]->get_tensor(it.first) == nullptr) {
+                        throw std::runtime_error(err_no_subset_adapter);
+                    }
+                }
+            }
         }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
+
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        std::vector<tensor_transformation> trans;
+        for (auto & it : base_model.tensors) {
+            bool t_a = true;
+            bool t_b = true;
+            for (auto & adapter : adapters) {
+                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
+                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
+            }
+            auto base_tensor = it.second;
+            if (!t_a && !t_b) {
+                // only copy
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                trans.push_back({
+                    cpy_tensor,
+                    cpy_tensor,
+                    true,
+                });
+                gguf_add_tensor(ctx_out, cpy_tensor);
+            } else if (t_a && t_b) {
+                // need merging
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
+                ggml_set_name(out_tensor, base_tensor->name);
+                trans.push_back({
+                    base_tensor,
+                    out_tensor,
+                    false,
+                });
+                gguf_add_tensor(ctx_out, out_tensor);
+            } else {
+                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
+            }
         }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
+
+        // placeholder for the meta data
+        {
+            size_t meta_size = gguf_get_meta_size(ctx_out);
+            zeros(fout, meta_size);
         }
-    }
 
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // process base model tensors
+        size_t n_merged = 0;
+        for (auto & it : trans) {
+            if (!it.is_copy) {
+                merge_tensor(it.in, it.out);
+                n_merged++;
+            } else {
+                copy_tensor(it.in);
+            }
         }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
+
+        // write output metadata
+        {
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+            gguf_get_meta_data(ctx_out, data.data());
+            fout.seekp(0);
+            fout.write((const char *)data.data(), data.size());
         }
+
+        printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
     }
 
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
+    void copy_tensor(struct ggml_tensor * base) {
+        printf("%s :  %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+        size_t len = ggml_nbytes(base);
+        base_model.read_tensor_data(base->name, read_buf);
+        fout.write((char* )read_buf.data(), len);
+        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    bool eof() {
-        return tell() >= size;
-    }
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
+        std::string name_base(base->name);
+        std::string name_lora_a = name_base + ".lora_a";
+        std::string name_lora_b = name_base + ".lora_b";
 
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
+        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+
+        // context for input tensor
+        std::vector<struct ggml_tensor *> inp_a(adapters.size());
+        std::vector<struct ggml_tensor *> inp_b(adapters.size());
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+
+        // alloc tensors
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            auto t_a = adapters[i]->get_tensor(name_lora_a);
+            auto t_b = adapters[i]->get_tensor(name_lora_b);
+            // TODO: add support for quantized lora
+            if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
+                throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
+            }
+            inp_a[i] = ggml_dup_tensor(ctx, t_a);
+            inp_b[i] = ggml_dup_tensor(ctx, t_b);
         }
+        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+        // load base tensor to backend buffer
+        base_model.read_tensor_data(name_base, read_buf);
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            const auto * qtype = ggml_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            adapters[i]->read_tensor_data(name_lora_a, read_buf);
+            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
+            adapters[i]->read_tensor_data(name_lora_b, read_buf);
+            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+        }
+
+        // build graph
+        struct ggml_cgraph * gf;
+        {
+            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+            static std::vector<uint8_t> buf(buf_size);
+            struct ggml_init_params params0 = {
+                /*.mem_size   =*/ buf_size,
+                /*.mem_buffer =*/ buf.data(),
+                /*.no_alloc   =*/ true,
+            };
+            struct ggml_context * ctx0 = ggml_init(params0);
+            gf = ggml_new_graph(ctx0);
+            struct ggml_tensor * cur = inp_base;
+            for (size_t i = 0; i < adapters.size(); ++i) {
+                struct ggml_tensor * delta;
+                bool is_tok_embd = string_starts_with(name_base, "token_embd");
+                if (is_tok_embd) {
+                    printf("%s :     detected token embeddings tensor\n", __func__);
+                    delta = ggml_mul_mat(ctx0,
+                        ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
+                        ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
+                } else {
+                    delta = ggml_mul_mat(ctx0,
+                        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
+                        ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                }
+                // scale
+                const float alpha = adapters[i]->alpha;
+                const float rank  = (float) inp_b[i]->ne[0];
+                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
+                delta = ggml_scale(ctx0, delta, scale);
+                cur = ggml_add(ctx0, delta, cur);
+                printf("%s :   + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
+            }
+            cur = ggml_cast(ctx0, cur, out->type);
+            printf("%s :   + output type is %s\n", __func__, ggml_type_name(out->type));
+            ggml_build_forward_expand(gf, cur);
+            ggml_free(ctx0);
+        }
+
+        // compute
+        {
+            ggml_gallocr_alloc_graph(allocr, gf);
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+            ggml_backend_graph_compute(backend, gf);
+        }
+
+        // write data to output file
+        {
+            auto * result = ggml_graph_node(gf, -1);
+            size_t len = ggml_nbytes(result);
+            if (read_buf.size() < len) {
+                read_buf.resize(len);
+            }
+            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
+            fout.write((char* )read_buf.data(), len);
+            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+        }
+
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+    }
+
+    ~lora_merge_ctx() {
+        ggml_gallocr_free(allocr);
+        ggml_backend_free(backend);
+        gguf_free(ctx_out);
+        ggml_free(ctx_out_ggml);
     }
 };
 
-static struct export_lora_params get_default_export_lora_params() {
-    struct export_lora_params result;
-    result.fn_model_base = "";
-    result.fn_model_out  = "";
-    result.n_threads = GGML_DEFAULT_N_THREADS;
-    return result;
-}
-
-static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
-    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
-    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
-    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
-    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
-    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
-}
-
-static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct export_lora_params default_params = get_default_export_lora_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (arg == "-m" || arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "-o" || arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "-l" || arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            lora.scale = 1.0f;
-            params->lora.push_back(lora);
-        } else if (arg == "-s" || arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            lora.scale = std::stof(argv[i]);
-            params->lora.push_back(lora);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-            if (params->n_threads <= 0) {
-                params->n_threads = std::thread::hardware_concurrency();
-            }
-        } else {
-            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-
-    if (params->fn_model_base == default_params.fn_model_base) {
-        fprintf(stderr, "error: please specify a filename for model-base.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (params->fn_model_out == default_params.fn_model_out) {
-        fprintf(stderr, "error: please specify a filename for model-out.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    return true;
-}
-
-static void free_lora(struct lora_data * lora) {
-    if (lora->ctx != NULL) {
-        ggml_free(lora->ctx);
-    }
-    delete lora;
-}
-
-static struct lora_data * load_lora(struct lora_info * info) {
-    struct lora_data * result = new struct lora_data;
-    result->info = *info;
-    result->ctx = NULL;
-    result->lora_r     = 1;
-    result->lora_alpha = 1;
-
-    struct llama_file file(info->filename.c_str(), "rb");
-    if (file.fp == NULL) {
-        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
-            info->filename.c_str());
-        free_lora(result);
-        return NULL;
-    }
-
-    struct ggml_init_params params_ggml;
-    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
-    params_ggml.mem_buffer = NULL;
-    params_ggml.no_alloc   = true;
-    result->ctx = ggml_init(params_ggml);
-
-    uint32_t magic   = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_GGLA) {
-        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
-    }
-    uint32_t version = file.read_u32();
-    if (version != 1) {
-        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
-    }
-    result->lora_r     = file.read_u32();
-    result->lora_alpha = file.read_u32();
-    // read tensor infos from file
-    std::vector<char> name_buf;
-    std::vector<struct ggml_tensor *> tensors;
-    std::vector<size_t> tensors_offset;
-    size_t total_nbytes_pad = 0;
-    while(!file.eof()) {
-        int64_t ne[4]   = {1,1,1,1};
-        uint32_t n_dims  = file.read_u32();
-        uint32_t namelen = file.read_u32();
-        uint32_t type    = file.read_u32();
-        for (uint32_t k = 0; k < n_dims; ++k) {
-            ne[k] = (int64_t)file.read_u32();
-        }
-        name_buf.clear();
-        name_buf.resize(namelen + 1, '\0');
-        file.read_raw(name_buf.data(), namelen);
-        file.seek((0-file.tell()) & 31, SEEK_CUR);
-        size_t offset = file.tell();
-        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
-        ggml_set_name(tensor, name_buf.data());
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        total_nbytes_pad += nbytes_pad;
-        tensors.push_back(tensor);
-        tensors_offset.push_back(offset);
-        file.seek(nbytes, SEEK_CUR);
-    }
-    // read tensor data
-    result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        struct ggml_tensor * tensor = tensors[i];
-        size_t offset     = tensors_offset[i];
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
-    }
-    return result;
-}
-
-
-static struct ggml_cgraph * build_graph_lora(
-    struct ggml_context * ctx,
-    struct ggml_tensor * tensor,
-    struct ggml_tensor * lora_a,
-    struct ggml_tensor * lora_b,
-    float scaling
-) {
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
-    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
-    }
-    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand (gf, res);
-    return gf;
-}
-
-static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
-    if (lora->ctx == NULL) {
-        return false;
-    }
-    std::string name = ggml_get_name(tensor);
-    std::string name_a = name + std::string(".loraA");
-    std::string name_b = name + std::string(".loraB");
-    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
-    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
-    if (lora_a == NULL || lora_b == NULL) {
-        return false;
-    }
-
-    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
-
-    struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
-    params.mem_buffer = NULL;
-    params.no_alloc   = true;
-    struct ggml_context * ctx = NULL;
-    struct ggml_gallocr * alloc = NULL;
-    struct ggml_cgraph  * gf = NULL;
-
-    ctx   = ggml_init(params);
-    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-
-    ggml_gallocr_alloc_graph(alloc, gf);
-
-    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> data_work;
-    data_work.resize(cplan.work_size);
-    cplan.work_data = data_work.data();
-
-    ggml_graph_compute(gf, &cplan);
-
-    ggml_gallocr_free(alloc);
-    ggml_free(ctx);
-    return true;
-}
-
-static void export_lora(struct export_lora_params * params) {
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < params->lora.size(); ++i) {
-        struct lora_data * lora = load_lora(&params->lora[i]);
-        if (lora != NULL) {
-            loras.push_back(lora);
-        }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-
-    // open input file
-    struct llama_file fin(params->fn_model_base.c_str(), "rb");
-    if (!fin.fp) {
-        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
-    }
-
-    // open base model gguf, read tensors without their data
-    struct ggml_context * ctx_in;
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx      = &ctx_in;
-    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
-    // create new gguf
-    struct gguf_context * gguf_out = gguf_init_empty();
-
-    // copy meta data from base model: kv and tensors
-    gguf_set_kv(gguf_out, gguf_in);
-    int n_tensors = gguf_get_n_tensors(gguf_in);
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-        gguf_add_tensor(gguf_out, tensor);
-    }
-
-    // create output file
-    struct llama_file fout(params->fn_model_out.c_str(), "wb");
-    if (!fout.fp) {
-        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
-    }
-
-    // write gguf meta data
-    std::vector<uint8_t> meta;
-    meta.resize(gguf_get_meta_size(gguf_out));
-    gguf_get_meta_data(gguf_out, meta.data());
-    fout.write_raw(meta.data(), meta.size());
-
-    std::vector<uint8_t> data;
-    std::vector<uint8_t> padding;
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
-        // read tensor data
-        data.resize(ggml_nbytes(tensor));
-        tensor->data = data.data();
-        size_t offset = gguf_get_tensor_offset(gguf_in, i);
-        fin.seek(offset + meta.size(), SEEK_SET);
-        fin.read_raw(data.data(), data.size());
-
-        // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
-        }
-
-        // write tensor data + padding
-        padding.clear();
-        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
-        GGML_ASSERT(fout.tell() == offset + meta.size());
-        // fout.seek(offset + meta.size(), SEEK_SET);
-        fout.write_raw(data.data(), data.size());
-        fout.write_raw(padding.data(), padding.size());
-
-        if (i % 2 == 0) {
-            printf(".");
-        }
-    }
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+    printf("\nNOTE: output model is F16\n");
     printf("\n");
-
-    // close gguf
-    gguf_free(gguf_out);
-    gguf_free(gguf_in);
-
-    // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
-    }
 }
 
 int main(int argc, char ** argv) {
-    struct export_lora_params params = get_default_export_lora_params();
+    common_params params;
 
-    if (!export_lora_params_parse(argc, argv, &params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
-    export_lora(&params);
+    g_verbose = (params.verbosity > 1);
+    try {
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
+        ctx.run_merge();
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s\n", err.what());
+        exit(EXIT_FAILURE);
+    }
+
+    printf("done, output file is %s\n", params.lora_outfile.c_str());
 
     return 0;
 }
diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt
deleted file mode 100644
index 2b52d21cf..000000000
--- a/examples/finetune/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET finetune)
-add_executable(${TARGET} finetune.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/finetune/README.md b/examples/finetune/README.md
deleted file mode 100644
index 2fafd505e..000000000
--- a/examples/finetune/README.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# finetune
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# finetune LORA adapter
-./bin/finetune \
-        --model-base open-llama-3b-v2-q8_0.gguf \
-        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
-        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
-        --train-data "shakespeare.txt" \
-        --save-every 10 \
-        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
-
-# predict
-./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-```
-
-**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
-The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
-So in above example after 10 iterations these files will be written:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-After 10 more iterations:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
-
-llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
-
-In `main` you can also load multiple LORA adapters, which will then be mixed together.
-
-For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
-
-```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
-  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
-  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
-```
-
-You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
-
-For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
-
-```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
-  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
-```
-
-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
-
-Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
-If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
-
-The default LORA rank can be specified with `--lora-r N`.
-The LORA rank can be configured for each model tensor type separately with these command line options:
-
-```bash
-  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
-  --rank-att-norm N          LORA rank for attention norm tensor (default 1)
-  --rank-ffn-norm N          LORA rank for feed-forward norm tensor (default 1)
-  --rank-out-norm N          LORA rank for output norm tensor (default 1)
-  --rank-tok-embd N          LORA rank for token embeddings tensor (default 4)
-  --rank-out N               LORA rank for output tensor (default 4)
-  --rank-wq N                LORA rank for wq tensor (default 4)
-  --rank-wk N                LORA rank for wk tensor (default 4)
-  --rank-wv N                LORA rank for wv tensor (default 4)
-  --rank-wo N                LORA rank for wo tensor (default 4)
-  --rank-ffn_gate N          LORA rank for ffn_gate tensor (default 4)
-  --rank-ffn_down N          LORA rank for ffn_down tensor (default 4)
-  --rank-ffn_up N            LORA rank for ffn_up tensor (default 4)
-```
-
-The LORA rank of 'norm' tensors should always be 1.
-
-To see all available options use `finetune --help`.
diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert-finetune-checkpoint-to-gguf.py
deleted file mode 100644
index c89090918..000000000
--- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py
+++ /dev/null
@@ -1,487 +0,0 @@
-#!/usr/bin/env python3
-# finetune checkpoint --> gguf conversion
-
-import argparse
-import gguf
-import struct
-import numpy as np
-from pathlib import Path
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output"
-LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm"
-LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q"
-LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k"
-LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v"
-LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output"
-LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm"
-LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate"
-LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down"
-LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.product(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        if tuple(ne) != tuple(self.ne):
-            raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version != 1:
-            raise ValueError('Invalid version of optimization context in checkpoint file')
-
-        self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-        self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-        self.adam_m  = Tensor('f', [self.nx])
-        self.adam_v  = Tensor('f', [self.nx])
-        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-        self.lbfgs_x    = Tensor('f', [self.nx])
-        self.lbfgs_xp   = Tensor('f', [self.nx])
-        self.lbfgs_g    = Tensor('f', [self.nx])
-        self.lbfgs_gp   = Tensor('f', [self.nx])
-        self.lbfgs_d    = Tensor('f', [self.nx])
-        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-        # forgot to save type in version 1:
-        # guess self.type from number of remaining bytes
-        size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                [self.adam_m, self.adam_v]
-                                +([self.adam_pf] if (self.past > 0) else [])])
-        size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                 self.lbfgs_lmal, self.lbfgs_lmys,
-                                 self.lbfgs_lms, self.lbfgs_lmy]
-                                 +([self.lbfgs_pf] if (self.past > 0) else [])])
-        # due to alignment padding the size might not by exact
-        # but the difference in size for both types is significant,
-        # so we can just use whichever is closest
-        remaining = len(data) - offset
-        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-            self.type = 0
-        else:
-            self.type = 1
-
-        if self.type == 0:
-            offset = self.adam_m.load(data, offset)
-            offset = self.adam_v.load(data, offset)
-            offset = self.adam_pf.load(data,offset)
-
-            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        elif self.type == 1:
-            offset = self.lbfgs_x.load(data, offset)
-            offset = self.lbfgs_xp.load(data, offset)
-            offset = self.lbfgs_g.load(data, offset)
-            offset = self.lbfgs_gp.load(data, offset)
-            offset = self.lbfgs_d.load(data, offset)
-            offset = self.lbfgs_pf.load(data, offset)
-            offset = self.lbfgs_lmal.load(data, offset)
-            offset = self.lbfgs_lmys.load(data, offset)
-            offset = self.lbfgs_lms.load(data, offset)
-            offset = self.lbfgs_lmy.load(data, offset)
-
-            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError(f"Invalid optimizer type '{self.type}'")
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class LoraParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_rank_attention_norm  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wq              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wk              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wv              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wo              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_ffn_norm        = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w1              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w2              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w3              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_tok_embeddings  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_norm            = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_output          = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,  self.n_rank_tok_embeddings)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT,      self.n_rank_output)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,   self.n_rank_attention_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q,      self.n_rank_wq)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K,      self.n_rank_wk)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V,      self.n_rank_wv)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,    self.n_rank_wo)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM,    self.n_rank_ffn_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE,    self.n_rank_w1)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,    self.n_rank_w2)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP,      self.n_rank_w3)
-
-class ModelParams:
-    def __init__(self, n_ff = None):
-        self.n_ff = n_ff
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        if self.n_ff is None:
-            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-        else:
-            return self.n_ff
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None, suffix=".weight"):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
-
-class Layer:
-    def __init__(self, params, lora_params, bid):
-        self.bid = bid
-        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
-        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
-        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
-        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
-        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
-        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
-        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
-        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
-        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
-        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm_a.load(data, offset)
-        offset = self.att_norm_b.load(data, offset)
-        offset = self.wq_a.load(data, offset)
-        offset = self.wq_b.load(data, offset)
-        offset = self.wk_a.load(data, offset)
-        offset = self.wk_b.load(data, offset)
-        offset = self.wv_a.load(data, offset)
-        offset = self.wv_b.load(data, offset)
-        offset = self.wo_a.load(data, offset)
-        offset = self.wo_b.load(data, offset)
-        offset = self.ffn_norm_a.load(data, offset)
-        offset = self.ffn_norm_b.load(data, offset)
-        offset = self.w1_a.load(data, offset)
-        offset = self.w1_b.load(data, offset)
-        offset = self.w2_a.load(data, offset)
-        offset = self.w2_b.load(data, offset)
-        offset = self.w3_a.load(data, offset)
-        offset = self.w3_b.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
-        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
-        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
-        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
-        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
-        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
-        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
-        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
-        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
-        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
-        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
-        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
-        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
-        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
-        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
-        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
-        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
-        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))
-
-class LoraModel:
-    def __init__(self, n_ff = None):
-        self.params = ModelParams(n_ff = n_ff)
-        self.lora_params = LoraParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-        offset = self.lora_params.load(data, offset)
-
-        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
-        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
-        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
-        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
-        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
-        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
-
-        offset = self.tok_embd_a.load(data, offset)
-        offset = self.tok_embd_b.load(data, offset)
-        offset = self.norm_a.load(data, offset)
-        offset = self.norm_b.load(data, offset)
-        offset = self.output_a.load(data, offset)
-        offset = self.output_b.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, self.lora_params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-        self.lora_params.save_gguf(gguf_writer)
-
-        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
-        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
-        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
-        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
-        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
-        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class LoraCheckpoint:
-    def __init__(self, n_ff = None):
-        self.model = LoraModel(n_ff = n_ff)
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcl':
-            raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
-    parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    print(cfg)
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = LoraCheckpoint(n_ff = cfg.ff)
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
deleted file mode 100644
index 3da5317b3..000000000
--- a/examples/finetune/finetune.cpp
+++ /dev/null
@@ -1,1861 +0,0 @@
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "llama.h"
-#include "common.h"
-#include "train.h"
-#include <vector>
-#include <cstring>
-#include <ctime>
-#include <algorithm>
-#include <string>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct my_llama_hparams {
-    uint32_t n_vocab    = 32000;
-    uint32_t n_ctx      = 512;
-    uint32_t n_embd     = 4096;
-    uint32_t n_ff       = 11008;
-    uint32_t n_head     = 32;
-    uint32_t n_head_kv  = 32;
-    uint32_t n_layer    = 32;
-
-    // float f_norm_eps     = 1e-5f; // falcon
-    float f_norm_rms_eps = 1e-5f; // llama
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
-
-    uint32_t n_gqa() const {
-        return n_head/n_head_kv;
-    }
-
-    uint32_t n_embd_head() const {
-        return n_embd/n_head;
-    }
-
-    uint32_t n_embd_gqa() const {
-        return n_embd/n_gqa();
-    }
-
-    bool operator!=(const my_llama_hparams& other) const {
-        return memcmp(this, &other, sizeof(other));
-    }
-};
-
-struct my_llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-};
-
-struct my_llama_model {
-    struct my_llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<my_llama_layer> layers;
-};
-
-struct my_llama_lora_hparams {
-    uint32_t lora_r = 1;
-    uint32_t lora_alpha = 1;
-    uint32_t n_rank_attention_norm = 1;
-    uint32_t n_rank_wq = 4;
-    uint32_t n_rank_wk = 4;
-    uint32_t n_rank_wv = 4;
-    uint32_t n_rank_wo = 4;
-    uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_ffn_gate = 4;
-    uint32_t n_rank_ffn_down = 4;
-    uint32_t n_rank_ffn_up = 4;
-    uint32_t n_rank_tok_embeddings = 4;
-    uint32_t n_rank_norm = 1;
-    uint32_t n_rank_output = 4;
-
-    bool operator!=(const my_llama_lora_hparams& other) const {
-        return memcmp(this, &other, sizeof(other));
-    }
-};
-
-struct my_llama_lora_layer {
-    // normalization
-    struct ggml_tensor * attention_norm_a;
-    struct ggml_tensor * attention_norm_b;
-
-    // attention
-    struct ggml_tensor * wq_a;
-    struct ggml_tensor * wq_b;
-    struct ggml_tensor * wk_a;
-    struct ggml_tensor * wk_b;
-    struct ggml_tensor * wv_a;
-    struct ggml_tensor * wv_b;
-    struct ggml_tensor * wo_a;
-    struct ggml_tensor * wo_b;
-
-    // normalization
-    struct ggml_tensor * ffn_norm_a;
-    struct ggml_tensor * ffn_norm_b;
-
-    // ff
-    struct ggml_tensor * ffn_gate_a;
-    struct ggml_tensor * ffn_gate_b;
-    struct ggml_tensor * ffn_down_a;
-    struct ggml_tensor * ffn_down_b;
-    struct ggml_tensor * ffn_up_a;
-    struct ggml_tensor * ffn_up_b;
-};
-
-struct my_llama_lora {
-    struct ggml_context * ctx = NULL;
-    ggml_backend_buffer_t data;
-
-    my_llama_lora_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings_a;
-    struct ggml_tensor * tok_embeddings_b;
-
-    struct ggml_tensor * norm_a;
-    struct ggml_tensor * norm_b;
-    struct ggml_tensor * output_a;
-    struct ggml_tensor * output_b;
-
-    std::vector<my_llama_lora_layer> layers;
-};
-
-// gguf constants
-static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA   = "finetune_lora";
-static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
-
-static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd";
-static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up";
-
-// gguf constants (sync with gguf.py)
-
-static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
-static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
-
-static const char * LLM_KV_CONTEXT_LENGTH              = "%s.context_length";
-static const char * LLM_KV_EMBEDDING_LENGTH            = "%s.embedding_length";
-static const char * LLM_KV_BLOCK_COUNT                 = "%s.block_count";
-static const char * LLM_KV_FEED_FORWARD_LENGTH         = "%s.feed_forward_length";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT        = "%s.attention.head_count";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV     = "%s.attention.head_count_kv";
-static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
-static const char * LLM_KV_ROPE_DIMENSION_COUNT        = "%s.rope.dimension_count";
-static const char * LLM_KV_ROPE_FREQ_BASE              = "%s.rope.freq_base"; // TODO load in llama.cpp
-static const char * LLM_KV_ROPE_SCALE_LINEAR           = "%s.rope.scale_linear";
-
-static const char * LLM_TENSOR_TOKEN_EMBD    = "token_embd";
-static const char * LLM_TENSOR_OUTPUT_NORM   = "output_norm";
-static const char * LLM_TENSOR_OUTPUT        = "output";
-static const char * LLM_TENSOR_ATTN_NORM     = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q        = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K        = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V        = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT      = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM      = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE      = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
-    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
-    printf("%s: n_head                : %u\n", __func__, params->n_head);
-    printf("%s: n_head_kv             : %u\n", __func__, params->n_head_kv);
-    printf("%s: n_layer               : %u\n", __func__, params->n_layer);
-    printf("%s: norm_rms_eps          : %f\n", __func__, params->f_norm_rms_eps);
-    printf("%s: rope_freq_base        : %f\n", __func__, params->rope_freq_base);
-    printf("%s: rope_freq_scale       : %f\n", __func__, params->rope_freq_scale);
-}
-
-static void print_lora_params(struct my_llama_lora_hparams * params) {
-    printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm);
-    printf("%s: n_rank_wq             : %u\n", __func__, params->n_rank_wq);
-    printf("%s: n_rank_wk             : %u\n", __func__, params->n_rank_wk);
-    printf("%s: n_rank_wv             : %u\n", __func__, params->n_rank_wv);
-    printf("%s: n_rank_wo             : %u\n", __func__, params->n_rank_wo);
-    printf("%s: n_rank_ffn_norm       : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_ffn_gate       : %u\n", __func__, params->n_rank_ffn_gate);
-    printf("%s: n_rank_ffn_down       : %u\n", __func__, params->n_rank_ffn_down);
-    printf("%s: n_rank_ffn_up         : %u\n", __func__, params->n_rank_ffn_up);
-    printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
-    printf("%s: n_rank_norm           : %u\n", __func__, params->n_rank_norm);
-    printf("%s: n_rank_output         : %u\n", __func__, params->n_rank_output);
-}
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-}
-
-static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) {
-    std::string arch;
-
-    GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    if (expected_arch != NULL) {
-        if (arch != expected_arch) {
-            printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch);
-        }
-        GGML_ASSERT(arch == expected_arch);
-    }
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [&arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
-        return keybuf.data();
-    };
-
-    GGUF_GET_KEY(ctx, hparams->n_embd,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ctx,          gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ff,           gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_head,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(ctx, hparams->n_layer,        gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_BLOCK_COUNT));
-
-    // n_head_kv is optional, default to n_head
-    hparams->n_head_kv = hparams->n_head;
-    GGUF_GET_KEY(ctx, hparams->n_head_kv,      gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
-
-    float rope_freq_scale = 1.0f;
-    GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
-    GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale,         gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    if (rope_freq_scale != 1.0f) {
-        hparams->rope_freq_scale = 1.0f / rope_freq_scale;
-    }
-}
-
-static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
-    auto & hparams = model->hparams;
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-
-    // get parameters directly from gguf file
-    {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ NULL,
-        };
-        struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
-
-        load_model_hparams_gguf(mctx, &hparams, "llama");
-
-        gguf_free(mctx);
-    }
-    hparams.n_vocab = llama_n_vocab(input);
-    hparams.n_ctx = n_ctx;
-
-    // get tensors from llama_model (possibly mmapped)
-    model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
-    model->norm           = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM));
-    model->output         = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT));
-
-    assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab);
-    assert_shape_1d(model->norm,           hparams.n_embd);
-    assert_shape_2d(model->output,         hparams.n_embd, hparams.n_vocab);
-
-    model->layers.resize(hparams.n_layer);
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i));
-        layer.wq             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i));
-        layer.wk             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i));
-        layer.wv             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
-        layer.wo             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
-        layer.ffn_norm       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.ffn_gate       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.ffn_down       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.ffn_up         = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
-
-        assert_shape_1d(layer.attention_norm, hparams.n_embd);
-        assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
-        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
-        assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
-        assert_shape_2d(layer.ffn_gate,       hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.ffn_down,       hparams.n_ff,   hparams.n_embd);
-        assert_shape_2d(layer.ffn_up,         hparams.n_embd, hparams.n_ff);
-    }
-}
-
-static void set_param_lora(struct my_llama_lora * lora) {
-    const uint32_t n_layer = lora->layers.size();
-
-    struct ggml_context* ctx = lora->ctx;
-
-    ggml_set_param(ctx, lora->tok_embeddings_a);
-    ggml_set_param(ctx, lora->tok_embeddings_b);
-    ggml_set_param(ctx, lora->norm_a);
-    ggml_set_param(ctx, lora->norm_b);
-    ggml_set_param(ctx, lora->output_a);
-    ggml_set_param(ctx, lora->output_b);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm_a);
-        ggml_set_param(ctx, layer.attention_norm_b);
-        ggml_set_param(ctx, layer.wq_a);
-        ggml_set_param(ctx, layer.wq_b);
-        ggml_set_param(ctx, layer.wk_a);
-        ggml_set_param(ctx, layer.wk_b);
-        ggml_set_param(ctx, layer.wv_a);
-        ggml_set_param(ctx, layer.wv_b);
-        ggml_set_param(ctx, layer.wo_a);
-        ggml_set_param(ctx, layer.wo_b);
-        ggml_set_param(ctx, layer.ffn_norm_a);
-        ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.ffn_gate_a);
-        ggml_set_param(ctx, layer.ffn_gate_b);
-        ggml_set_param(ctx, layer.ffn_down_a);
-        ggml_set_param(ctx, layer.ffn_down_b);
-        ggml_set_param(ctx, layer.ffn_up_a);
-        ggml_set_param(ctx, layer.ffn_up_b);
-    }
-}
-
-static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
-    const auto & lparams = lora->hparams;
-
-    const uint32_t n_embd     = model->hparams.n_embd;
-    const uint32_t n_embd_gqa = model->hparams.n_embd_gqa();
-    const uint32_t n_layer    = model->hparams.n_layer;
-    const uint32_t n_vocab    = model->hparams.n_vocab;
-    const uint32_t n_ff       = model->hparams.n_ff;
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
-        return tn_buf.data();
-    };
-
-    // context for lora tensors without their data
-    struct ggml_init_params ctx_lora_params;
-    ctx_lora_params.mem_size   = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_lora_params.mem_buffer = NULL;
-    ctx_lora_params.no_alloc   = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_lora_params);
-    lora->ctx = ctx;
-
-    lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd);
-    lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab);
-    lora->norm_a           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd);
-    lora->norm_b           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1);
-    lora->output_a         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd);
-    lora->output_b         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab);
-
-    ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.lora_a"));
-    ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.lora_b"));
-    ggml_set_name(lora->norm_a,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a"));
-    ggml_set_name(lora->norm_b,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b"));
-    ggml_set_name(lora->output_a,         tn(LLM_TENSOR_OUTPUT,      ".weight.lora_a"));
-    ggml_set_name(lora->output_b,         tn(LLM_TENSOR_OUTPUT,      ".weight.lora_b"));
-
-    lora->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-
-        layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd);
-        layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1);
-
-        layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
-        layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
-        layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd);
-        layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa);
-        layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd);
-        layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa);
-        layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
-        layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
-
-        layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
-        layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
-
-        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
-        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
-        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
-        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
-        layer.ffn_up_a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_embd);
-        layer.ffn_up_b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_ff);
-
-        ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
-        ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
-        ggml_set_name(layer.wq_a,             tni(LLM_TENSOR_ATTN_Q,    ".weight.lora_a", i));
-        ggml_set_name(layer.wq_b,             tni(LLM_TENSOR_ATTN_Q,    ".weight.lora_b", i));
-        ggml_set_name(layer.wk_a,             tni(LLM_TENSOR_ATTN_K,    ".weight.lora_a", i));
-        ggml_set_name(layer.wk_b,             tni(LLM_TENSOR_ATTN_K,    ".weight.lora_b", i));
-        ggml_set_name(layer.wv_a,             tni(LLM_TENSOR_ATTN_V,    ".weight.lora_a", i));
-        ggml_set_name(layer.wv_b,             tni(LLM_TENSOR_ATTN_V,    ".weight.lora_b", i));
-        ggml_set_name(layer.wo_a,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_a", i));
-        ggml_set_name(layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
-    }
-
-    set_param_lora(lora);
-
-    // allocate data for lora tensors
-    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-}
-
-static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
-    const uint32_t n_layer = lora->layers.size();
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    ggml_set_zero(lora->tok_embeddings_b);
-    randomize_tensor_normal(lora->norm_a,           rnd);
-    ggml_set_zero(lora->norm_b);
-    randomize_tensor_normal(lora->output_a,         rnd);
-    ggml_set_zero(lora->output_b);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-        randomize_tensor_normal(layer.attention_norm_a, rnd);
-        ggml_set_zero(layer.attention_norm_b);
-
-        randomize_tensor_normal(layer.wq_a, rnd);
-        ggml_set_zero(layer.wq_b);
-        randomize_tensor_normal(layer.wk_a, rnd);
-        ggml_set_zero(layer.wk_b);
-        randomize_tensor_normal(layer.wv_a, rnd);
-        ggml_set_zero(layer.wv_b);
-        randomize_tensor_normal(layer.wo_a, rnd);
-        ggml_set_zero(layer.wo_b);
-
-        randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        ggml_set_zero(layer.ffn_norm_b);
-
-        randomize_tensor_normal(layer.ffn_gate_a, rnd);
-        ggml_set_zero(layer.ffn_gate_b);
-        randomize_tensor_normal(layer.ffn_down_a, rnd);
-        ggml_set_zero(layer.ffn_down_b);
-        randomize_tensor_normal(layer.ffn_up_a, rnd);
-        ggml_set_zero(layer.ffn_up_b);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static struct ggml_tensor * llama_build_lora_finetune_graphs(
-        struct my_llama_model * model,
-        struct my_llama_lora  * lora,
-        ggml_gallocr_t          alloc,
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        const  int              n_tokens,
-        const  int              n_batch,
-        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing,
-        const  bool             measure_only) {
-
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
-    const int n_past = 0;
-    const int N = n_tokens;
-    const auto & hparams  = model->hparams;
-    const int n_ctx       = hparams.n_ctx;
-    const int n_vocab     = hparams.n_vocab;
-    const int n_embd      = hparams.n_embd;
-    const int n_layer     = hparams.n_layer;
-    const int n_head      = hparams.n_head;
-    const int n_head_kv   = hparams.n_head_kv;
-    const int n_ff        = hparams.n_ff;
-    const int n_rot       = hparams.n_embd_head();
-    const int n_embd_head = hparams.n_embd_head();
-    const int n_embd_gqa  = hparams.n_embd_gqa();
-
-    const float rms_norm_eps    = hparams.f_norm_rms_eps;
-    const float rope_freq_base  = hparams.rope_freq_base;
-    const float rope_freq_scale = hparams.rope_freq_scale;
-
-    GGML_ASSERT((size_t) n_layer == lora->layers.size());
-
-    auto set_name = [](struct ggml_tensor * t, const char * n) {
-        ggml_set_name(t, n);
-        if (t->grad) {
-            ggml_format_name(t->grad, "%s->grad", n);
-        }
-    };
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_set_input(KQ_pos);
-
-    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
-                (struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silcence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_custom(ctx,
-            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
-            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets,      "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-
-    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
-            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
-        } else if (a->type == GGML_TYPE_F32) {
-            return ggml_add(ctx, a, b);
-        } else {
-            die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n",
-                __func__, ggml_type_name(a->type));
-        }
-    };
-
-    struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b));
-    struct ggml_tensor * norm           = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b));
-    struct ggml_tensor * output         = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b));
-
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00);        set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    if (enable_checkpointing) {
-        checkpoints.push_back(tokens_input);
-        checkpoints.push_back(targets);
-        checkpoints.push_back(t00);
-        checkpoints.push_back(t01);
-    }
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct my_llama_lora_layer & llayer = lora->layers[il];
-
-        struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
-        struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
-        struct ggml_tensor * wq       = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
-        struct ggml_tensor * wk       = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
-        struct ggml_tensor * wv       = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
-        struct ggml_tensor * wo       = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
-        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
-        struct ggml_tensor * ffn_up   = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
-
-        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                       set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, attention_norm, t02);                     set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                                set_name(t04, "t04");     assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, wq, t04);                                 set_name(t05, "t05");     assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd_head, n_head, N, n_batch);    set_name(t06, "t06");     assert_shape_4d(t06, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope              (t06);                                          set_name(t07, "t07");     assert_shape_4d(t07, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, wk, t04);                                 set_name(t08, "t08");     assert_shape_2d(t08, n_embd_gqa, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09");     assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch);
-        struct ggml_tensor * t10 = rope              (t09);                                          set_name(t10, "t10");     assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch);
-
-        struct ggml_tensor * t11;
-        if (ggml_is_quantized(wv->type)) {
-            struct ggml_tensor * t11_1 = ggml_mul_mat  (ctx, wv, t04);                               set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch);
-            struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1);                                 set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa);
-                                 t11   = ggml_cont     (ctx, t11_2);                                 set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd_gqa);
-        } else {
-                                 t11   = ggml_mul_mat  (ctx, t04, wv);                               set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd_gqa);
-        }
-
-        struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12");     assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv);
-        struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                         set_name(t13, "t13");     assert_shape_4d(t13, n_embd_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                         set_name(t14, "t14");     assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch);
-        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                         set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                  set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);           set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past);             set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_3 = ggml_soft_max_inplace     (ctx, t16_2);                     set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
-            t16 = ggml_mul_mat(ctx, t15, t16_3);                                                     set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
-        }
-        struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                         set_name(t17, "t17");     assert_shape_4d(t17, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                     set_name(t18, "t18");     assert_shape_4d(t18, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                  set_name(t19, "t19");     assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, wo, t19);                                 set_name(t20, "t20");     assert_shape_2d(t20, n_embd, N*n_batch);
-        struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                                set_name(t21, "t21");     assert_shape_2d(t21, n_embd, N*n_batch);
-        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, rms_norm_eps);                       set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, ffn_norm, t22);                           set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
-        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                                set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, ffn_up, t24);                             set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, ffn_gate, t24);                           set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
-        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                     set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
-        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                                set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, ffn_down, t28);                           set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
-        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                                set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
-        cur = t30;
-        if (enable_checkpointing) {
-            checkpoints.push_back(cur);
-        }
-    }
-    struct ggml_tensor * t31   = ggml_rms_norm          (ctx, cur, rms_norm_eps);                    set_name(t31, "t31");     assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat            (ctx, norm, t31);                            set_name(t32, "t32");     assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = ggml_mul               (ctx, t32, t31);                             set_name(t33, "t33");     assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat           (ctx, output, t33);                          set_name(t34, "t34");     assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = ggml_reshape_3d        (ctx, t34, n_vocab, N, n_batch);             set_name(t35, "t35");     assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                         set_name(t36, "t36");     assert_shape_1d(t36, 1);
-
-    if (enable_checkpointing) {
-        checkpoints.push_back(t31);
-        checkpoints.push_back(t32);
-        checkpoints.push_back(t33);
-        checkpoints.push_back(t34);
-        checkpoints.push_back(t35);
-        checkpoints.push_back(t36);
-    }
-
-    ggml_build_forward_expand(gf, t36);
-
-    if (enable_checkpointing) {
-        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
-    } else {
-        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, true);
-    }
-
-    GGML_ASSERT(alloc != NULL);
-
-    // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
-    int n_leafs_before = gb->n_leafs;
-    int n_nodes_before = gb->n_nodes;
-
-    // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
-    // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
-    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-    ggml_set_input(t36->grad);
-    // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
-
-    // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
-    }
-
-    // allocating checkpoints in one block to reduce memory fragmentation
-    // note: they will be freed in reverse order
-    for (unsigned int i = 0; i < checkpoints.size(); ++i) {
-        if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_set_input(checkpoints[i]);
-        }
-    }
-
-    if (measure_only) {
-        ggml_gallocr_reserve(alloc, gb);
-    } else {
-        ggml_gallocr_alloc_graph(alloc, gb);
-
-        // set KQ_pos
-        {
-            int * data = (int *) KQ_pos->data;
-            for (int i = 0; i < N; ++i) {
-                data[i] = n_past + i;
-            }
-        }
-    }
-
-    // remove the additional nodes and leafs
-    for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
-        gb->leafs[i] = NULL;
-    }
-    for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
-        gb->nodes[i] = NULL;
-    }
-    gb->n_leafs = n_leafs_before;
-    gb->n_nodes = n_nodes_before;
-
-    *logits = t35;
-    return t36;
-}
-
-static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-
-    std::string arch;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-
-    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    GGML_ASSERT(arch == "llama");
-
-    uint32_t ftype_u;
-    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
-    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-
-    struct my_llama_hparams hparams;
-    load_model_hparams_gguf(fctx, &hparams, arch.c_str());
-
-    // parameters that define tensor shapes must match
-    GGML_ASSERT(hparams.n_embd    == model->hparams.n_embd);
-    GGML_ASSERT(hparams.n_ff      == model->hparams.n_ff);
-    GGML_ASSERT(hparams.n_head    == model->hparams.n_head);
-    GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv);
-    GGML_ASSERT(hparams.n_layer   == model->hparams.n_layer);
-
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm,           gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_output,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
-
-    init_lora(model, lora);
-
-    copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a));
-    copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b));
-    copy_tensor_by_name(lora->norm_a,           f_ggml_ctx, ggml_get_name(lora->norm_a));
-    copy_tensor_by_name(lora->norm_b,           f_ggml_ctx, ggml_get_name(lora->norm_b));
-    copy_tensor_by_name(lora->output_a,         f_ggml_ctx, ggml_get_name(lora->output_a));
-    copy_tensor_by_name(lora->output_b,         f_ggml_ctx, ggml_get_name(lora->output_b));
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
-        copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
-        copy_tensor_by_name(layer.wq_a,             f_ggml_ctx, ggml_get_name(layer.wq_a));
-        copy_tensor_by_name(layer.wq_b,             f_ggml_ctx, ggml_get_name(layer.wq_b));
-        copy_tensor_by_name(layer.wk_a,             f_ggml_ctx, ggml_get_name(layer.wk_a));
-        copy_tensor_by_name(layer.wk_b,             f_ggml_ctx, ggml_get_name(layer.wk_b));
-        copy_tensor_by_name(layer.wv_a,             f_ggml_ctx, ggml_get_name(layer.wv_a));
-        copy_tensor_by_name(layer.wv_b,             f_ggml_ctx, ggml_get_name(layer.wv_b));
-        copy_tensor_by_name(layer.wo_a,             f_ggml_ctx, ggml_get_name(layer.wo_a));
-        copy_tensor_by_name(layer.wo_b,             f_ggml_ctx, ggml_get_name(layer.wo_b));
-        copy_tensor_by_name(layer.ffn_norm_a,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
-        copy_tensor_by_name(layer.ffn_norm_b,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
-        copy_tensor_by_name(layer.ffn_gate_a,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
-        copy_tensor_by_name(layer.ffn_gate_b,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
-        copy_tensor_by_name(layer.ffn_down_a,       f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
-        copy_tensor_by_name(layer.ffn_down_b,       f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
-        copy_tensor_by_name(layer.ffn_up_a,         f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
-        copy_tensor_by_name(layer.ffn_up_b,         f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
-    }
-}
-
-static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) {
-    const char * arch = "llama";
-    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch);
-        return keybuf.data();
-    };
-
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
-    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
-
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx);
-    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd);
-    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV),     model->hparams.n_head_kv);
-    gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT),                 model->hparams.n_layer);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT),        model->hparams.n_embd_head());
-    gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps);
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE),              model->hparams.rope_freq_base);
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR),           model->hparams.rope_freq_scale);
-
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,   lora->hparams.n_rank_tok_embeddings);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM,  lora->hparams.n_rank_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT,       lora->hparams.n_rank_output);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,    lora->hparams.n_rank_attention_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q,       lora->hparams.n_rank_wq);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K,       lora->hparams.n_rank_wk);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V,       lora->hparams.n_rank_wv);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,     lora->hparams.n_rank_wo);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM,     lora->hparams.n_rank_ffn_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_ffn_gate);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_ffn_down);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_ffn_up);
-
-    gguf_add_tensor(fctx, lora->tok_embeddings_a);
-    gguf_add_tensor(fctx, lora->tok_embeddings_b);
-    gguf_add_tensor(fctx, lora->norm_a);
-    gguf_add_tensor(fctx, lora->norm_b);
-    gguf_add_tensor(fctx, lora->output_a);
-    gguf_add_tensor(fctx, lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-
-        gguf_add_tensor(fctx, layer.attention_norm_a);
-        gguf_add_tensor(fctx, layer.attention_norm_b);
-        gguf_add_tensor(fctx, layer.wq_a);
-        gguf_add_tensor(fctx, layer.wq_b);
-        gguf_add_tensor(fctx, layer.wk_a);
-        gguf_add_tensor(fctx, layer.wk_b);
-        gguf_add_tensor(fctx, layer.wv_a);
-        gguf_add_tensor(fctx, layer.wv_b);
-        gguf_add_tensor(fctx, layer.wo_a);
-        gguf_add_tensor(fctx, layer.wo_b);
-        gguf_add_tensor(fctx, layer.ffn_norm_a);
-        gguf_add_tensor(fctx, layer.ffn_norm_b);
-        gguf_add_tensor(fctx, layer.ffn_gate_a);
-        gguf_add_tensor(fctx, layer.ffn_gate_b);
-        gguf_add_tensor(fctx, layer.ffn_down_a);
-        gguf_add_tensor(fctx, layer.ffn_down_b);
-        gguf_add_tensor(fctx, layer.ffn_up_a);
-        gguf_add_tensor(fctx, layer.ffn_up_b);
-    }
-}
-
-static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
-    GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-    GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-
-    load_train_state_gguf(fctx, f_ggml_ctx, train);
-    load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
-}
-
-static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-    save_llama_lora_gguf(fctx, model, lora);
-    save_train_state_gguf(fctx, train);
-}
-
-static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
-    struct gguf_context * fctx = gguf_init_from_file(filename, params);
-    if (fctx == NULL) {
-        return false;
-    }
-
-    load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train);
-
-    gguf_free(fctx);
-    return true;
-}
-
-static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_checkpoint_lora_gguf(fctx, model, lora, train);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) {
-    if (tensor == NULL) {
-        file->write_u32(0);
-        file->write_u32(0);
-        file->write_u32(GGML_TYPE_F32);
-        file->seek((0-file->tell()) & 31, SEEK_CUR);
-        return;
-    }
-    if (name == NULL) {
-        name = ggml_get_name(tensor);
-    }
-    uint32_t name_len = strlen(name);
-    uint32_t nd = ggml_n_dims(tensor);
-    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
-                       (uint32_t)tensor->ne[1],
-                       (uint32_t)tensor->ne[2],
-                       (uint32_t)tensor->ne[3] };
-    file->write_u32(nd);
-    file->write_u32(name_len);
-    file->write_u32(tensor->type);
-    file->write_raw(ne, sizeof(ne[0]) * nd);
-    file->write_raw(name, name_len);
-    file->seek((0-file->tell()) & 31, SEEK_CUR);
-    file->write_raw(tensor->data, ggml_nbytes(tensor));
-}
-
-static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct llama_file file(filename, "wb");
-    if (file.fp == NULL) {
-        return;
-    }
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-
-    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
-        return tn_buf.data();
-    };
-
-    auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
-        return tn_buf.data();
-    };
-
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
-    file.write_u32(1); // version
-    // write_hparams
-    file.write_u32(lora->hparams.lora_r);
-    file.write_u32(lora->hparams.lora_alpha);
-    // write tensors
-    write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.loraA"));
-    write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.loraB"));
-    write_tensor(&file, lora->norm_a,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA"));
-    write_tensor(&file, lora->norm_b,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB"));
-    write_tensor(&file, lora->output_a,         tn(LLM_TENSOR_OUTPUT,      ".weight.loraA"));
-    write_tensor(&file, lora->output_b,         tn(LLM_TENSOR_OUTPUT,      ".weight.loraB"));
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA"));
-        write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB"));
-        write_tensor(&file, layer.wq_a,             tni(LLM_TENSOR_ATTN_Q,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wq_b,             tni(LLM_TENSOR_ATTN_Q,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wk_a,             tni(LLM_TENSOR_ATTN_K,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wk_b,             tni(LLM_TENSOR_ATTN_K,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wv_a,             tni(LLM_TENSOR_ATTN_V,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wv_b,             tni(LLM_TENSOR_ATTN_V,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wo_a,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraA"));
-        write_tensor(&file, layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
-    }
-}
-
-struct train_params {
-    struct train_params_common common;
-
-    const char * fn_model_base;
-    const char * fn_lora_out;
-
-    bool only_write_lora;
-
-    float f_norm_rms_eps;
-    float rope_freq_base;
-    float rope_freq_scale;
-
-    bool custom_f_norm_rms_eps;
-    bool custom_rope_freq_base;
-    bool custom_rope_freq_scale;
-
-    int32_t lora_r;
-    int32_t lora_alpha;
-    bool custom_lora_alpha;
-
-    uint32_t n_rank_attention_norm;
-    uint32_t n_rank_wq;
-    uint32_t n_rank_wk;
-    uint32_t n_rank_wv;
-    uint32_t n_rank_wo;
-    uint32_t n_rank_ffn_norm;
-    uint32_t n_rank_ffn_gate;
-    uint32_t n_rank_ffn_down;
-    uint32_t n_rank_ffn_up;
-    uint32_t n_rank_tok_embeddings;
-    uint32_t n_rank_norm;
-    uint32_t n_rank_output;
-
-    bool custom_n_rank_attention_norm;
-    bool custom_n_rank_wq;
-    bool custom_n_rank_wk;
-    bool custom_n_rank_wv;
-    bool custom_n_rank_wo;
-    bool custom_n_rank_ffn_norm;
-    bool custom_n_rank_ffn_gate;
-    bool custom_n_rank_ffn_down;
-    bool custom_n_rank_ffn_up;
-    bool custom_n_rank_tok_embeddings;
-    bool custom_n_rank_norm;
-    bool custom_n_rank_output;
-};
-
-static struct train_params get_default_train_params() {
-    struct train_params params;
-    params.common = get_default_train_params_common();
-    params.fn_model_base     = "";
-    params.fn_lora_out       = "ggml-lora-ITERATION-f32.gguf";
-
-    params.only_write_lora = false;
-
-    params.f_norm_rms_eps  = 1e-5f;
-    params.rope_freq_base  = 10000.0f;
-    params.rope_freq_scale = 1.0f;
-
-    params.custom_f_norm_rms_eps  = false;
-    params.custom_rope_freq_base  = false;
-    params.custom_rope_freq_scale = false;
-
-    params.lora_r      = 4;
-    params.lora_alpha  = 4;
-    params.custom_lora_alpha = false;
-
-    params.n_rank_attention_norm = 1;
-    params.n_rank_wq             = 4;
-    params.n_rank_wk             = 4;
-    params.n_rank_wv             = 4;
-    params.n_rank_wo             = 4;
-    params.n_rank_ffn_norm       = 1;
-    params.n_rank_ffn_gate       = 4;
-    params.n_rank_ffn_down       = 4;
-    params.n_rank_ffn_up         = 4;
-    params.n_rank_tok_embeddings = 4;
-    params.n_rank_norm           = 1;
-    params.n_rank_output         = 4;
-
-    params.custom_n_rank_attention_norm = false;
-    params.custom_n_rank_wq             = false;
-    params.custom_n_rank_wk             = false;
-    params.custom_n_rank_wv             = false;
-    params.custom_n_rank_wo             = false;
-    params.custom_n_rank_ffn_norm       = false;
-    params.custom_n_rank_ffn_gate       = false;
-    params.custom_n_rank_ffn_down       = false;
-    params.custom_n_rank_ffn_up         = false;
-    params.custom_n_rank_tok_embeddings = false;
-    params.custom_n_rank_norm           = false;
-    params.custom_n_rank_output         = false;
-
-    return params;
-}
-
-static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-
-    fprintf(stderr, "  --model-base FNAME         model path from which to load base model (default '%s')\n", params->fn_model_base);
-    fprintf(stderr, "  --lora-out FNAME           path to save llama lora (default '%s')\n", params->fn_lora_out);
-    fprintf(stderr, "  --only-write-lora          only save llama lora, don't do any training.  use this if you only want to convert a checkpoint to a lora adapter.\n");
-    fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
-    fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-    fprintf(stderr, "  --lora-alpha N             LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha);
-    fprintf(stderr, "  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r);
-    fprintf(stderr, "  --rank-att-norm N          LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-ffn-norm N          LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-out-norm N          LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-tok-embd N          LORA rank for token embeddings tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-out N               LORA rank for output tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wq N                LORA rank for wq tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wk N                LORA rank for wk tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wv N                LORA rank for wv tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wo N                LORA rank for wo tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_gate N          LORA rank for ffn_gate tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_down N          LORA rank for ffn_down tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_up N            LORA rank for ffn_up tensor, overrides default rank.\n");
-
-    print_common_train_usage(argc, argv, &params->common);
-}
-
-static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct train_params default_params = get_default_train_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
-            if (invalid_param) {
-                break;
-            } else if (params->common.print_usage) {
-                train_print_usage(argc, argv, &default_params);
-                exit(0);
-            }
-        } else if (arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "--lora-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_lora_out = argv[i];
-        } else if (arg == "--only-write-lora") {
-            params->only_write_lora = true;
-        } else if (arg == "--norm-rms-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->f_norm_rms_eps = std::stof(argv[i]);
-            params->custom_f_norm_rms_eps = true;
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-            params->custom_rope_freq_base = true;
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-            params->custom_rope_freq_scale = true;
-        } else if (arg == "--lora-alpha") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_alpha = std::stoi(argv[i]);
-            params->custom_lora_alpha = true;
-        } else if (arg == "--lora-r") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_r = std::stoi(argv[i]);
-        } else if (arg == "--rank-att-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_attention_norm = std::stoi(argv[i]);
-            params->custom_n_rank_attention_norm = true;
-        } else if (arg == "--rank-ffn-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_norm = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_norm = true;
-        } else if (arg == "--rank-out-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_norm = std::stoi(argv[i]);
-            params->custom_n_rank_norm = true;
-        } else if (arg == "--rank-tok-embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_tok_embeddings = std::stoi(argv[i]);
-            params->custom_n_rank_tok_embeddings = true;
-        } else if (arg == "--rank-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_output = std::stoi(argv[i]);
-            params->custom_n_rank_output = true;
-        } else if (arg == "--rank-wq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wq = std::stoi(argv[i]);
-            params->custom_n_rank_wq = true;
-        } else if (arg == "--rank-wk") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wk = std::stoi(argv[i]);
-            params->custom_n_rank_wk = true;
-        } else if (arg == "--rank-wv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wv = std::stoi(argv[i]);
-            params->custom_n_rank_wv = true;
-        } else if (arg == "--rank-wo") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wo = std::stoi(argv[i]);
-            params->custom_n_rank_wo = true;
-        } else if (arg == "--rank-ffn_gate") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_gate = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_gate = true;
-        } else if (arg == "--rank-ffn_down") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_down = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_down = true;
-        } else if (arg == "--rank-ffn_up") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_up = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_up = true;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-    return true;
-}
-
-struct save_train_files_data {
-    const char            * fn_checkpoint_out;
-    const char            * fn_lora_out;
-    const char            * pattern_fn_it;
-    const char            * fn_latest;
-    struct my_llama_model * model;
-    struct my_llama_lora  * lora;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
-
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train);
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->model, data->lora, train);
-    }
-    if (strlen(data->fn_lora_out) > 0) {
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora);
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->lora);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_lora* lora) {
-    int64_t nx = 0;
-    nx += ggml_nelements(lora->tok_embeddings_a);
-    nx += ggml_nelements(lora->tok_embeddings_b);
-    nx += ggml_nelements(lora->norm_a);
-    nx += ggml_nelements(lora->norm_b);
-    nx += ggml_nelements(lora->output_a);
-    nx += ggml_nelements(lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        nx += ggml_nelements(layer.attention_norm_a);
-        nx += ggml_nelements(layer.attention_norm_b);
-        nx += ggml_nelements(layer.wq_a);
-        nx += ggml_nelements(layer.wq_b);
-        nx += ggml_nelements(layer.wk_a);
-        nx += ggml_nelements(layer.wk_b);
-        nx += ggml_nelements(layer.wv_a);
-        nx += ggml_nelements(layer.wv_b);
-        nx += ggml_nelements(layer.wo_a);
-        nx += ggml_nelements(layer.wo_b);
-        nx += ggml_nelements(layer.ffn_norm_a);
-        nx += ggml_nelements(layer.ffn_norm_b);
-        nx += ggml_nelements(layer.ffn_gate_a);
-        nx += ggml_nelements(layer.ffn_gate_b);
-        nx += ggml_nelements(layer.ffn_down_a);
-        nx += ggml_nelements(layer.ffn_down_b);
-        nx += ggml_nelements(layer.ffn_up_a);
-        nx += ggml_nelements(layer.ffn_up_b);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params llama_mparams = llama_model_default_params();
-    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
-    llama_mparams.vocab_only = false;
-
-    printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams);
-
-    struct llama_context_params llama_cparams = llama_context_default_params();
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams);
-
-    struct my_llama_model model;
-    init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx);
-
-    struct my_llama_lora lora;
-
-    struct train_state      * train = init_train_state();
-    struct ggml_opt_context * opt   = train->opt;
-
-    // set params from command line
-    if (params.custom_f_norm_rms_eps) {
-        model.hparams.f_norm_rms_eps  = params.f_norm_rms_eps;
-    }
-    if (params.custom_rope_freq_base) {
-        model.hparams.rope_freq_base  = params.rope_freq_base;
-    }
-    if (params.custom_rope_freq_scale) {
-        model.hparams.rope_freq_scale = params.rope_freq_scale;
-    }
-    lora.hparams.lora_r                = params.lora_r;
-    lora.hparams.lora_alpha            = params.custom_lora_alpha            ? params.lora_alpha            : params.lora_r;
-    uint32_t n_rank_attention_norm     = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1;
-    uint32_t n_rank_wq                 = params.custom_n_rank_wq             ? params.n_rank_wq             : params.lora_r;
-    uint32_t n_rank_wk                 = params.custom_n_rank_wk             ? params.n_rank_wk             : params.lora_r;
-    uint32_t n_rank_wv                 = params.custom_n_rank_wv             ? params.n_rank_wv             : params.lora_r;
-    uint32_t n_rank_wo                 = params.custom_n_rank_wo             ? params.n_rank_wo             : params.lora_r;
-    uint32_t n_rank_ffn_norm           = params.custom_n_rank_ffn_norm       ? params.n_rank_ffn_norm       : 1;
-    uint32_t n_rank_ffn_gate           = params.custom_n_rank_ffn_gate       ? params.n_rank_ffn_gate       : params.lora_r;
-    uint32_t n_rank_ffn_down           = params.custom_n_rank_ffn_down       ? params.n_rank_ffn_down       : params.lora_r;
-    uint32_t n_rank_ffn_up             = params.custom_n_rank_ffn_up         ? params.n_rank_ffn_up         : params.lora_r;
-    uint32_t n_rank_tok_embeddings     = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
-    uint32_t n_rank_norm               = params.custom_n_rank_norm           ? params.n_rank_norm           : 1;
-    uint32_t n_rank_output             = params.custom_n_rank_output         ? params.n_rank_output         : params.lora_r;
-    lora.hparams.n_rank_attention_norm = n_rank_attention_norm;
-    lora.hparams.n_rank_wq             = n_rank_wq;
-    lora.hparams.n_rank_wk             = n_rank_wk;
-    lora.hparams.n_rank_wv             = n_rank_wv;
-    lora.hparams.n_rank_wo             = n_rank_wo;
-    lora.hparams.n_rank_ffn_norm       = n_rank_ffn_norm;
-    lora.hparams.n_rank_ffn_gate       = n_rank_ffn_gate;
-    lora.hparams.n_rank_ffn_down       = n_rank_ffn_down;
-    lora.hparams.n_rank_ffn_up         = n_rank_ffn_up;
-    lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
-    lora.hparams.n_rank_norm           = n_rank_norm;
-    lora.hparams.n_rank_output         = n_rank_output;
-
-    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    opt->params.print_forward_graph     = false;
-    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
-    opt->params.n_threads               = params.common.n_threads;
-    opt->params.past                    = params.common.opt_past;
-    opt->params.delta                   = params.common.opt_delta;
-    opt->params.max_no_improvement      = params.common.opt_max_no_improvement;
-    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
-    opt->params.adam.n_iter             = params.common.adam_n_iter;
-    opt->params.adam.sched              = 1.0f;
-    opt->params.adam.alpha              = params.common.adam_alpha;
-    opt->params.adam.decay              = params.common.adam_decay;
-    opt->params.adam.decay_min_ndim     = params.common.adam_decay_min_ndim;
-    opt->params.adam.beta1              = params.common.adam_beta1;
-    opt->params.adam.beta2              = params.common.adam_beta2;
-    opt->params.adam.gclip              = params.common.adam_gclip;
-    opt->params.adam.eps_f              = params.common.adam_eps_f;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
-
-    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
-        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
-        }
-
-        const bool opt_param_count_changed = (
-           (lora.hparams.n_rank_attention_norm != n_rank_attention_norm)
-        || (lora.hparams.n_rank_wq             != n_rank_wq)
-        || (lora.hparams.n_rank_wk             != n_rank_wk)
-        || (lora.hparams.n_rank_wv             != n_rank_wv)
-        || (lora.hparams.n_rank_wo             != n_rank_wo)
-        || (lora.hparams.n_rank_ffn_norm       != n_rank_ffn_norm)
-        || (lora.hparams.n_rank_ffn_gate       != n_rank_ffn_gate)
-        || (lora.hparams.n_rank_ffn_down       != n_rank_ffn_down)
-        || (lora.hparams.n_rank_ffn_up         != n_rank_ffn_up)
-        || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
-        || (lora.hparams.n_rank_norm           != n_rank_norm)
-        || (lora.hparams.n_rank_output         != n_rank_output)
-        );
-
-        const bool opt_past_changed = opt->params.past != params.common.opt_past;
-
-        if (opt_param_count_changed) {
-            print_lora_params(&lora.hparams);
-            die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting.");
-            // need to discard previous optimizer gradient statistics and opt_init with new shapes
-            // TODO
-        }
-        if (opt_past_changed) {
-            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
-            // need to discard previous optimizer past function value statistics and opt_init with new shapes
-            // TODO
-        }
-    } else { // existed == false
-        init_lora(&model, &lora);
-        randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
-        if (!params.only_write_lora) {
-            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora));
-        }
-    }
-    opt->iter = train->train_its;
-
-    print_params(&model.hparams);
-    print_lora_params(&lora.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
-    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
-    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
-
-    if (params.only_write_lora) {
-        save_train_files_data save_data;
-        save_data.fn_checkpoint_out = "";
-        save_data.fn_lora_out       = params.fn_lora_out;
-        save_data.pattern_fn_it     = params.common.pattern_fn_it;
-        save_data.fn_latest         = params.common.fn_latest;
-        save_data.model             = &model;
-        save_data.lora              = &lora;
-
-        save_train_files(&save_data, train);
-
-        free_train_state(train);
-        ggml_free(lora.ctx);
-        llama_free(lctx);
-        llama_free_model(lmodel);
-        return 0;
-    }
-
-    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.common.n_batch;
-
-    // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        ggml_tensor_overhead() * 2, // mem_size
-        NULL,                       // mem_buffer
-        true,                       // no_alloc
-    };
-    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
-
-    // the input tensors
-    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
-
-    // allocate input tensors
-    // measure required memory for input tensors
-    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
-    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL,                           // mem_buffer
-        true,                           // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss   = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf     = NULL;
-    struct ggml_cgraph * gb     = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_lora_finetune_graphs(
-            &model, &lora, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_gallocr_free(alloc);
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_lora_finetune_graphs(
-        &model, &lora, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    // tokenize data
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
-    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
-    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
-    tokenize_file(lctx,
-            params.common.fn_train_data,
-            params.common.sample_start,
-            params.common.include_sample_start,
-            params.common.overlapping_samples,
-            n_tokens,
-            train_tokens,
-            train_samples_begin,
-            train_samples_size);
-    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
-
-    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
-
-    std::vector<size_t> token_noccurs;
-    token_noccurs.resize(model.hparams.n_vocab, 0);
-    for (unsigned int i = 0; i < train_tokens.size(); ++i) {
-        ++token_noccurs[train_tokens[i]];
-    }
-    int n_unique_tokens = 0;
-    for (unsigned int i = 0; i < token_noccurs.size(); ++i) {
-        if (token_noccurs[i] == 0) continue;
-        ++n_unique_tokens;
-    }
-    printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
-
-    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
-    if (changed_train_data) {
-        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
-    }
-    if (params.common.force_reshuffle) {
-        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
-    }
-    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
-        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
-        train->shuffle_sample_count = train_samples_size.size();
-        train->shuffle_next_sample = 0;
-        train->shuffle_samples_hash = shuffle_samples_hash;
-    }
-    std::vector<size_t> train_shuffled_samples_offs;
-    std::vector<size_t> train_shuffled_samples_begin;
-    std::vector<size_t> train_shuffled_samples_size;
-    train_shuffled_samples_offs.resize(train_samples_begin.size());
-    train_shuffled_samples_begin.resize(train_samples_begin.size());
-    train_shuffled_samples_size.resize(train_samples_size.size());
-    train->shuffle_rng_state_next = shuffle_samples(
-        train->shuffle_rng_state_current,
-        train_shuffled_samples_offs.data(),
-        train_shuffled_samples_begin.data(),
-        train_shuffled_samples_size.data(),
-        train_samples_begin.data(),
-        train_samples_size.data(),
-        train_samples_size.size());
-
-    printf("%s: begin training\n", __func__);
-
-    save_train_files_data save_data;
-    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
-    save_data.fn_lora_out       = params.fn_lora_out;
-    save_data.pattern_fn_it     = params.common.pattern_fn_it;
-    save_data.fn_latest         = params.common.fn_latest;
-    save_data.model             = &model;
-    save_data.lora              = &lora;
-
-    struct train_opt_callback_data opt_cb_data;
-    opt_cb_data.params                 = &params.common;
-    opt_cb_data.train                  = train;
-    opt_cb_data.save_cb                = &save_train_files;
-    opt_cb_data.save_data              = &save_data;
-    opt_cb_data.lctx                   = lctx;
-    opt_cb_data.last_save_iter         = opt->iter;
-    opt_cb_data.tokens_data            = train_tokens.data();
-    opt_cb_data.tokens_size            = train_tokens.size();
-    opt_cb_data.samples_begin          = train_samples_begin.data();
-    opt_cb_data.samples_size           = train_samples_size.data();
-    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
-    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
-    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
-    opt_cb_data.samples_count          = train_samples_size.size();
-    opt_cb_data.tokens_input           = tokens_input;
-    opt_cb_data.target_probs           = target_probs;
-    opt_cb_data.first_iter             = opt->iter;
-    opt_cb_data.first_epoch            = train->train_epochs;
-    opt_cb_data.iter_at_last_epoch     = -1;
-    opt_cb_data.last_time              = ggml_time_ms();
-    opt_cb_data.millis_per_iter        = 0.0;
-
-    // measure required memory for work buffer
-    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
-
-    // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        max_work_size, // mem_size
-        NULL,          // mem_buffer
-        false,         // no_alloc
-    };
-    struct ggml_context * ctx_work = ggml_init(ctx_work_params);
-
-    int64_t t0 = ggml_time_ms();
-
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_compute);
-    ggml_free(ctx_input);
-    ggml_gallocr_free(alloc);
-
-
-    int64_t t1 = ggml_time_ms();
-    printf("%s: total training time: ", __func__);
-    print_duration((double) (t1 - t0));
-    printf("\n");
-
-    int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    if (new_iters > 0) {
-        train->train_its     += new_iters;
-        train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
-
-        save_train_files(&save_data, train);
-        opt_cb_data.last_save_iter = opt->iter;
-    }
-
-    ggml_free(opt->ctx);
-    free_train_state(train);
-    ggml_free(lora.ctx);
-    llama_free(lctx);
-    llama_free_model(lmodel);
-    return 0;
-}
diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
deleted file mode 100644
index 079bfa113..000000000
--- a/examples/finetune/finetune.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-cd ../..
-
-EXE="./finetune"
-
-if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
-if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
-
-# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
-
-while getopts "dg" opt; do
-  case $opt in
-    d)
-      DEBUGGER="gdb --args"
-      ;;
-    g)
-      EXE="./build/bin/Release/finetune"
-      GPUARG="--gpu-layers 25"
-      ;;
-  esac
-done
-
-$DEBUGGER $EXE \
-        --model-base $MODEL \
-        $GPUARG \
-        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
-        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
-        --save-every 10 \
-        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt
new file mode 100644
index 000000000..d2cb524c0
--- /dev/null
+++ b/examples/gbnf-validator/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-gbnf-validator)
+add_executable(${TARGET} gbnf-validator.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
new file mode 100644
index 000000000..a610e6a0b
--- /dev/null
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -0,0 +1,109 @@
+#include "unicode.h"
+#include "llama-grammar.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    const auto cpts = unicode_cpts_from_utf8(input_str);
+
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    size_t pos = 0;
+    for (const auto & cpt : cpts) {
+        llama_grammar_accept(grammar, cpt);
+
+        if (stacks_cur.empty()) {
+            error_pos = pos;
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+            return false;
+        }
+        ++pos;
+    }
+
+    for (const auto & stack : stacks_cur) {
+        if (stack.empty()) {
+            return true;
+        }
+    }
+
+    error_pos = pos;
+    error_msg = "Unexpected end of input";
+    return false;
+}
+
+static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
+    fprintf(stdout, "Input string is invalid according to the grammar.\n");
+    fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "Input string:\n");
+    fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
+    if (error_pos < input_str.size()) {
+        fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
+        if (error_pos+1 < input_str.size()) {
+            fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
+        }
+        fprintf(stdout, "\033[0m\n");
+    }
+}
+
+int main(int argc, char** argv) {
+    if (argc != 3) {
+        fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string grammar_filename = argv[1];
+    const std::string input_filename = argv[2];
+
+    // Read the GBNF grammar file
+    FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
+    if (!grammar_file) {
+        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
+        return 1;
+    }
+
+    std::string grammar_str;
+    {
+        std::ifstream grammar_file(grammar_filename);
+        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+        std::stringstream buffer;
+        buffer << grammar_file.rdbuf();
+        grammar_str = buffer.str();
+    }
+
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
+    if (grammar == nullptr) {
+        fprintf(stdout, "Failed to initialize llama_grammar\n");
+        return 1;
+    }
+    // Read the input file
+    std::string input_str;
+    {
+        std::ifstream input_file(input_filename);
+        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+        std::stringstream buffer;
+        buffer << input_file.rdbuf();
+        input_str = buffer.str();
+    }
+
+    // Validate the input string against the grammar
+    size_t error_pos;
+    std::string error_msg;
+    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
+
+    if (is_valid) {
+        fprintf(stdout, "Input string is valid according to the grammar.\n");
+    } else {
+        print_error_message(input_str, error_pos, error_msg);
+    }
+
+    // Clean up
+    llama_grammar_free_impl(grammar);
+
+    return 0;
+}
diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt
new file mode 100644
index 000000000..25de0af35
--- /dev/null
+++ b/examples/gen-docs/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-gen-docs)
+add_executable(${TARGET} gen-docs.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp
new file mode 100644
index 000000000..77c59a836
--- /dev/null
+++ b/examples/gen-docs/gen-docs.cpp
@@ -0,0 +1,83 @@
+#include "arg.h"
+#include "common.h"
+
+#include <fstream>
+#include <string>
+
+// Export usage message (-h) to markdown format
+
+static void write_table_header(std::ofstream & file) {
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+}
+
+static void write_table_entry(std::ofstream & file, const common_arg & opt) {
+    file << "| `";
+    // args
+    for (const auto & arg : opt.args) {
+    if (arg == opt.args.front()) {
+            file << arg;
+            if (opt.args.size() > 1) file << ", ";
+        } else {
+            file << arg << (arg != opt.args.back() ? ", " : "");
+        }
+    }
+    // value hint
+    if (opt.value_hint) {
+        std::string md_value_hint(opt.value_hint);
+        string_replace_all(md_value_hint, "|", "\\|");
+        file << " " << md_value_hint;
+    }
+    if (opt.value_hint_2) {
+        std::string md_value_hint_2(opt.value_hint_2);
+        string_replace_all(md_value_hint_2, "|", "\\|");
+        file << " " << md_value_hint_2;
+    }
+    // help text
+    std::string md_help(opt.help);
+    string_replace_all(md_help, "\n", "<br/>");
+    string_replace_all(md_help, "|", "\\|");
+    file << "` | " << md_help << " |\n";
+}
+
+static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
+    write_table_header(file);
+    for (const auto & opt : opts) {
+        write_table_entry(file, *opt);
+    }
+}
+
+static void export_md(std::string fname, llama_example ex) {
+    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
+
+    common_params params;
+    auto ctx_arg = common_params_parser_init(params, ex);
+
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+    for (auto & opt : ctx_arg.options) {
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    file << "**Common params**\n\n";
+    write_table(file, common_options);
+    file << "\n\n**Sampling params**\n\n";
+    write_table(file, sparam_options);
+    file << "\n\n**Example-specific params**\n\n";
+    write_table(file, specific_options);
+}
+
+int main(int, char **) {
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+
+    return 0;
+}
diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt
new file mode 100644
index 000000000..15c5c68c6
--- /dev/null
+++ b/examples/gguf-hash/CMakeLists.txt
@@ -0,0 +1,22 @@
+set(TARGET llama-gguf-hash)
+add_executable(${TARGET} gguf-hash.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+
+# clibs dependencies
+include_directories(deps/)
+
+add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
+target_link_libraries(${TARGET} PRIVATE xxhash)
+
+add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
+target_link_libraries(${TARGET} PRIVATE sha1)
+if (NOT MSVC)
+    # disable warnings in 3rd party code
+    target_compile_options(sha1 PRIVATE -w)
+endif()
+
+add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
+target_link_libraries(${TARGET} PRIVATE sha256)
+
+target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
new file mode 100644
index 000000000..9871651e3
--- /dev/null
+++ b/examples/gguf-hash/README.md
@@ -0,0 +1,206 @@
+
+# llama-gguf-hash
+
+CLI to hash GGUF files to detect difference on a per model and per tensor level.
+
+**Command line options:**
+
+- `--help`: display help message
+- `--xxh64`: use xhash 64bit hash mode (default)
+- `--sha1`: use sha1
+- `--uuid`: use uuid
+- `--sha256`: use sha256
+- `--all`: use all hash
+- `--no-layer`: exclude per layer hash
+- `--uuid`: generate UUIDv5 ID
+- `-c`, `--check <manifest>`:  verify against a manifest
+
+## About
+
+While most POSIX systems already have hash checking programs like sha256sum, it
+is designed to check entire files. This is not ideal for our purpose if we want
+to check for consistency of the tensor data even if the metadata content of the
+gguf KV store has been updated.
+
+This program is designed to hash a gguf tensor payload on a 'per tensor layer'
+in addition to a 'entire tensor model' hash. The intent is that the entire
+tensor layer can be checked first but if there is any detected inconsistencies,
+then the per tensor hash can be used to narrow down the specific tensor layer
+that has inconsistencies.
+
+For Maintainers:
+- Detection of tensor inconsistency during development and automated tests
+    - This is served by xxh64 which is fast
+    - This is also served by having per tensor layer to assist in narrowing down
+      the location of the faulty tensor layer
+    - This is also served by sha1 which is much slower but more widely supported
+
+For Model Creators:
+- Optional consistent UUID generation based on model tensor content
+    - This is served by UUIDv5 which is useful for databases keys
+        - llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
+            - Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
+
+For Model Users:
+- Assurance of tensor layer integrity even if metadata was updated
+    - This is served by sha256 which is still considered very secure as of 2024
+
+### Design Note
+
+- The default behavior of this program if no arguments is provided is to hash
+  using xxhash's xxh32 mode because it is very fast and is primarily targeted
+  towards maintainers who may want to use this in automated tests.
+- xxhash support xxh32 and xxh128 for 32bit hash and 128bit hash respectively
+  however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
+  would have a better affinity to calculating hash that is 64bit in size.
+
+## Compile Example
+
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
+make -C build clean
+make -C build llama-gguf-hash VERBOSE=1
+./build/bin/llama-gguf-hash test.gguf
+./build/bin/llama-gguf-hash --xxh64 test.gguf
+./build/bin/llama-gguf-hash --sha1 test.gguf
+./build/bin/llama-gguf-hash --uuid test.gguf
+./build/bin/llama-gguf-hash --sha256 test.gguf
+```
+
+## Generation and Verification Example
+
+To generate we may use this command
+
+```bash
+./llama-gguf-hash --all test.gguf > test.gguf.manifest
+```
+
+Which would generate a manifest that looks like below, which contains multiple hash type and per tensor layer hashes as well
+(This excludes UUID as that is an ID not a hash)
+
+```bash
+xxh64     f66e9cd66a4396a0  test.gguf:tensor_0
+sha1      59f79ecefd8125a996fdf419239051a7e99e5f20  test.gguf:tensor_0
+sha256    c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd  test.gguf:tensor_0
+xxh64     7d3a1f9ac04d0537  test.gguf:tensor_1
+sha1      4765f592eacf096df4628ba59476af94d767080a  test.gguf:tensor_1
+sha256    8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7  test.gguf:tensor_1
+xxh64     a0af5d700049693b  test.gguf:tensor_2
+sha1      25cbfbad4513cc348e2c95ebdee69d6ff2fd8753  test.gguf:tensor_2
+sha256    947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180  test.gguf:tensor_2
+xxh64     e83fddf559d7b6a6  test.gguf:tensor_3
+sha1      a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c  test.gguf:tensor_3
+sha256    423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1  test.gguf:tensor_3
+xxh64     1257733306b7992d  test.gguf:tensor_4
+sha1      d7bc61db93bb685ce9d598da89717c66729b7543  test.gguf:tensor_4
+sha256    79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf  test.gguf:tensor_4
+xxh64     d238d16ba4711e58  test.gguf:tensor_5
+sha1      0706566c198fe1072f37e0a5135b4b5f23654c52  test.gguf:tensor_5
+sha256    60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b  test.gguf:tensor_5
+xxh64     3fbc3b65ab8c7f39  test.gguf:tensor_6
+sha1      73922a0727226a409049f6fc3172a52219ca6f00  test.gguf:tensor_6
+sha256    574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0  test.gguf:tensor_6
+xxh64     c22021c29854f093  test.gguf:tensor_7
+sha1      efc39cece6a951188fc41e354c73bbfe6813d447  test.gguf:tensor_7
+sha256    4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75  test.gguf:tensor_7
+xxh64     936df61f5d64261f  test.gguf:tensor_8
+sha1      c2490296d789a4f34398a337fed8377d943d9f06  test.gguf:tensor_8
+sha256    c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01  test.gguf:tensor_8
+xxh64     93fd20c64421c081  test.gguf:tensor_9
+sha1      7047ce1e78437a6884337a3751c7ee0421918a65  test.gguf:tensor_9
+sha256    23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514  test.gguf:tensor_9
+xxh64     5a54d3aad816f302  test.gguf
+sha1      d15be52c4ff213e823cb6dd13af7ee2f978e7042  test.gguf
+sha256    7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4  test.gguf
+```
+
+We can then use the normal check command which will by default check for the highest security strength hash and verify against that:
+
+```bash
+$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
+manifest  test.gguf.manifest  sha256  sha1  xxh64
+sha256    c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd  test.gguf:tensor_0  -  Ok
+sha256    8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7  test.gguf:tensor_1  -  Ok
+sha256    947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180  test.gguf:tensor_2  -  Ok
+sha256    423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1  test.gguf:tensor_3  -  Ok
+sha256    79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf  test.gguf:tensor_4  -  Ok
+sha256    60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b  test.gguf:tensor_5  -  Ok
+sha256    574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0  test.gguf:tensor_6  -  Ok
+sha256    4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75  test.gguf:tensor_7  -  Ok
+sha256    c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01  test.gguf:tensor_8  -  Ok
+sha256    23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514  test.gguf:tensor_9  -  Ok
+sha256    7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4  test.gguf  -  Ok
+
+Verification results for test.gguf.manifest - Success
+```
+
+Or we may explicitly ask for a faster hash like:
+
+```bash
+$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
+manifest  test.gguf.manifest  sha256  sha1  xxh64
+xxh64     f66e9cd66a4396a0  test.gguf:tensor_0  -  Ok
+xxh64     7d3a1f9ac04d0537  test.gguf:tensor_1  -  Ok
+xxh64     a0af5d700049693b  test.gguf:tensor_2  -  Ok
+xxh64     e83fddf559d7b6a6  test.gguf:tensor_3  -  Ok
+xxh64     1257733306b7992d  test.gguf:tensor_4  -  Ok
+xxh64     d238d16ba4711e58  test.gguf:tensor_5  -  Ok
+xxh64     3fbc3b65ab8c7f39  test.gguf:tensor_6  -  Ok
+xxh64     c22021c29854f093  test.gguf:tensor_7  -  Ok
+xxh64     936df61f5d64261f  test.gguf:tensor_8  -  Ok
+xxh64     93fd20c64421c081  test.gguf:tensor_9  -  Ok
+xxh64     5a54d3aad816f302  test.gguf  -  Ok
+
+Verification results for test.gguf.manifest - Success
+```
+
+Or maybe we want to just check that all the hash is valid:
+
+```bash
+$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
+manifest  test.gguf.manifest  sha256  sha1  xxh64
+xxh64     f66e9cd66a4396a0  test.gguf:tensor_0  -  Ok
+sha1      59f79ecefd8125a996fdf419239051a7e99e5f20  test.gguf:tensor_0  -  Ok
+sha256    c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd  test.gguf:tensor_0  -  Ok
+xxh64     7d3a1f9ac04d0537  test.gguf:tensor_1  -  Ok
+sha1      4765f592eacf096df4628ba59476af94d767080a  test.gguf:tensor_1  -  Ok
+sha256    8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7  test.gguf:tensor_1  -  Ok
+xxh64     a0af5d700049693b  test.gguf:tensor_2  -  Ok
+sha1      25cbfbad4513cc348e2c95ebdee69d6ff2fd8753  test.gguf:tensor_2  -  Ok
+sha256    947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180  test.gguf:tensor_2  -  Ok
+xxh64     e83fddf559d7b6a6  test.gguf:tensor_3  -  Ok
+sha1      a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c  test.gguf:tensor_3  -  Ok
+sha256    423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1  test.gguf:tensor_3  -  Ok
+xxh64     1257733306b7992d  test.gguf:tensor_4  -  Ok
+sha1      d7bc61db93bb685ce9d598da89717c66729b7543  test.gguf:tensor_4  -  Ok
+sha256    79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf  test.gguf:tensor_4  -  Ok
+xxh64     d238d16ba4711e58  test.gguf:tensor_5  -  Ok
+sha1      0706566c198fe1072f37e0a5135b4b5f23654c52  test.gguf:tensor_5  -  Ok
+sha256    60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b  test.gguf:tensor_5  -  Ok
+xxh64     3fbc3b65ab8c7f39  test.gguf:tensor_6  -  Ok
+sha1      73922a0727226a409049f6fc3172a52219ca6f00  test.gguf:tensor_6  -  Ok
+sha256    574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0  test.gguf:tensor_6  -  Ok
+xxh64     c22021c29854f093  test.gguf:tensor_7  -  Ok
+sha1      efc39cece6a951188fc41e354c73bbfe6813d447  test.gguf:tensor_7  -  Ok
+sha256    4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75  test.gguf:tensor_7  -  Ok
+xxh64     936df61f5d64261f  test.gguf:tensor_8  -  Ok
+sha1      c2490296d789a4f34398a337fed8377d943d9f06  test.gguf:tensor_8  -  Ok
+sha256    c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01  test.gguf:tensor_8  -  Ok
+xxh64     93fd20c64421c081  test.gguf:tensor_9  -  Ok
+sha1      7047ce1e78437a6884337a3751c7ee0421918a65  test.gguf:tensor_9  -  Ok
+sha256    23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514  test.gguf:tensor_9  -  Ok
+xxh64     5a54d3aad816f302  test.gguf  -  Ok
+sha1      d15be52c4ff213e823cb6dd13af7ee2f978e7042  test.gguf  -  Ok
+sha256    7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4  test.gguf  -  Ok
+
+Verification results for test.gguf.manifest - Success
+```
+
+
+## Crypto/Hash Libraries Used
+
+These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
+
+- https://github.com/Cyan4973/xxHash
+- https://github.com/clibs/sha1/
+- https://github.com/jb55/sha256.c
diff --git a/examples/gguf-hash/deps/rotate-bits/package.json b/examples/gguf-hash/deps/rotate-bits/package.json
new file mode 100644
index 000000000..74c0bef68
--- /dev/null
+++ b/examples/gguf-hash/deps/rotate-bits/package.json
@@ -0,0 +1,13 @@
+{
+  "name": "rotate-bits",
+  "version": "0.1.1",
+  "repo": "jb55/rotate-bits.h",
+  "description": "rotate bits",
+  "keywords": ["rotl", "rotr"],
+  "src": ["rotate-bits.h"],
+  "license": "Public Domain",
+  "development": {
+    "thlorenz/tap.c": "*"
+  }
+}
+
diff --git a/examples/gguf-hash/deps/rotate-bits/rotate-bits.h b/examples/gguf-hash/deps/rotate-bits/rotate-bits.h
new file mode 100644
index 000000000..75c4881fc
--- /dev/null
+++ b/examples/gguf-hash/deps/rotate-bits/rotate-bits.h
@@ -0,0 +1,46 @@
+
+
+#ifndef __ROTATE_DEFS_H
+#define __ROTATE_DEFS_H
+
+#ifdef _MSC_VER
+
+#include <stdlib.h>
+
+#define ROTL32(v, n) _rotl((v), (n))
+#define ROTL64(v, n) _rotl64((v), (n))
+
+#define ROTR32(v, n) _rotr((v), (n))
+#define ROTR64(v, n) _rotr64((v), (n))
+
+#else
+
+#include <stdint.h>
+
+#define U8V(v) ((uint8_t)(v) & 0xFFU)
+#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
+#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
+#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
+
+#define ROTL32(v, n) \
+  (U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
+
+// tests fail if we don't have this cast...
+#define ROTL64(v, n) \
+  (U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
+
+#define ROTR32(v, n) ROTL32(v, 32 - (n))
+#define ROTR64(v, n) ROTL64(v, 64 - (n))
+
+#endif
+
+#define ROTL8(v, n) \
+  (U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
+
+#define ROTL16(v, n) \
+  (U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
+
+#define ROTR8(v, n) ROTL8(v, 8 - (n))
+#define ROTR16(v, n) ROTL16(v, 16 - (n))
+
+#endif
diff --git a/examples/gguf-hash/deps/sha1/package.json b/examples/gguf-hash/deps/sha1/package.json
new file mode 100644
index 000000000..6a5843dd1
--- /dev/null
+++ b/examples/gguf-hash/deps/sha1/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "sha1",
+  "version": "0.0.1",
+  "repo": "clibs/sha1",
+  "description": "sha1 hash algorithm",
+  "keywords": ["sha1", "hash"],
+  "license": "public domain",
+  "src": ["sha1.c", "sha1.h"]
+}
diff --git a/examples/gguf-hash/deps/sha1/sha1.c b/examples/gguf-hash/deps/sha1/sha1.c
new file mode 100644
index 000000000..76cd6ca33
--- /dev/null
+++ b/examples/gguf-hash/deps/sha1/sha1.c
@@ -0,0 +1,295 @@
+/*
+SHA-1 in C
+By Steve Reid <steve@edmweb.com>
+100% Public Domain
+
+Test Vectors (from FIPS PUB 180-1)
+"abc"
+  A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+  84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+A million repetitions of "a"
+  34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+*/
+
+/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
+/* #define SHA1HANDSOFF * Copies data before messing with it. */
+
+#define SHA1HANDSOFF
+
+#include <stdio.h>
+#include <string.h>
+
+/* for uint32_t */
+#include <stdint.h>
+
+#include "sha1.h"
+
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+    |(rol(block->l[i],8)&0x00FF00FF))
+#elif BYTE_ORDER == BIG_ENDIAN
+#define blk0(i) block->l[i]
+#else
+#error "Endianness not defined!"
+#endif
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
+    ^block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+
+void SHA1Transform(
+    uint32_t state[5],
+    const unsigned char buffer[64]
+)
+{
+    uint32_t a, b, c, d, e;
+
+    typedef union
+    {
+        unsigned char c[64];
+        uint32_t l[16];
+    } CHAR64LONG16;
+
+#ifdef SHA1HANDSOFF
+    CHAR64LONG16 block[1];      /* use array to appear as a pointer */
+
+    memcpy(block, buffer, 64);
+#else
+    /* The following had better never be used because it causes the
+     * pointer-to-const buffer to be cast into a pointer to non-const.
+     * And the result is written through.  I threw a "const" in, hoping
+     * this will cause a diagnostic.
+     */
+    CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
+#endif
+    /* Copy context->state[] to working vars */
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    /* 4 rounds of 20 operations each. Loop unrolled. */
+    R0(a, b, c, d, e, 0);
+    R0(e, a, b, c, d, 1);
+    R0(d, e, a, b, c, 2);
+    R0(c, d, e, a, b, 3);
+    R0(b, c, d, e, a, 4);
+    R0(a, b, c, d, e, 5);
+    R0(e, a, b, c, d, 6);
+    R0(d, e, a, b, c, 7);
+    R0(c, d, e, a, b, 8);
+    R0(b, c, d, e, a, 9);
+    R0(a, b, c, d, e, 10);
+    R0(e, a, b, c, d, 11);
+    R0(d, e, a, b, c, 12);
+    R0(c, d, e, a, b, 13);
+    R0(b, c, d, e, a, 14);
+    R0(a, b, c, d, e, 15);
+    R1(e, a, b, c, d, 16);
+    R1(d, e, a, b, c, 17);
+    R1(c, d, e, a, b, 18);
+    R1(b, c, d, e, a, 19);
+    R2(a, b, c, d, e, 20);
+    R2(e, a, b, c, d, 21);
+    R2(d, e, a, b, c, 22);
+    R2(c, d, e, a, b, 23);
+    R2(b, c, d, e, a, 24);
+    R2(a, b, c, d, e, 25);
+    R2(e, a, b, c, d, 26);
+    R2(d, e, a, b, c, 27);
+    R2(c, d, e, a, b, 28);
+    R2(b, c, d, e, a, 29);
+    R2(a, b, c, d, e, 30);
+    R2(e, a, b, c, d, 31);
+    R2(d, e, a, b, c, 32);
+    R2(c, d, e, a, b, 33);
+    R2(b, c, d, e, a, 34);
+    R2(a, b, c, d, e, 35);
+    R2(e, a, b, c, d, 36);
+    R2(d, e, a, b, c, 37);
+    R2(c, d, e, a, b, 38);
+    R2(b, c, d, e, a, 39);
+    R3(a, b, c, d, e, 40);
+    R3(e, a, b, c, d, 41);
+    R3(d, e, a, b, c, 42);
+    R3(c, d, e, a, b, 43);
+    R3(b, c, d, e, a, 44);
+    R3(a, b, c, d, e, 45);
+    R3(e, a, b, c, d, 46);
+    R3(d, e, a, b, c, 47);
+    R3(c, d, e, a, b, 48);
+    R3(b, c, d, e, a, 49);
+    R3(a, b, c, d, e, 50);
+    R3(e, a, b, c, d, 51);
+    R3(d, e, a, b, c, 52);
+    R3(c, d, e, a, b, 53);
+    R3(b, c, d, e, a, 54);
+    R3(a, b, c, d, e, 55);
+    R3(e, a, b, c, d, 56);
+    R3(d, e, a, b, c, 57);
+    R3(c, d, e, a, b, 58);
+    R3(b, c, d, e, a, 59);
+    R4(a, b, c, d, e, 60);
+    R4(e, a, b, c, d, 61);
+    R4(d, e, a, b, c, 62);
+    R4(c, d, e, a, b, 63);
+    R4(b, c, d, e, a, 64);
+    R4(a, b, c, d, e, 65);
+    R4(e, a, b, c, d, 66);
+    R4(d, e, a, b, c, 67);
+    R4(c, d, e, a, b, 68);
+    R4(b, c, d, e, a, 69);
+    R4(a, b, c, d, e, 70);
+    R4(e, a, b, c, d, 71);
+    R4(d, e, a, b, c, 72);
+    R4(c, d, e, a, b, 73);
+    R4(b, c, d, e, a, 74);
+    R4(a, b, c, d, e, 75);
+    R4(e, a, b, c, d, 76);
+    R4(d, e, a, b, c, 77);
+    R4(c, d, e, a, b, 78);
+    R4(b, c, d, e, a, 79);
+    /* Add the working vars back into context.state[] */
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    /* Wipe variables */
+    a = b = c = d = e = 0;
+#ifdef SHA1HANDSOFF
+    memset(block, '\0', sizeof(block));
+#endif
+}
+
+
+/* SHA1Init - Initialize new context */
+
+void SHA1Init(
+    SHA1_CTX * context
+)
+{
+    /* SHA1 initialization constants */
+    context->state[0] = 0x67452301;
+    context->state[1] = 0xEFCDAB89;
+    context->state[2] = 0x98BADCFE;
+    context->state[3] = 0x10325476;
+    context->state[4] = 0xC3D2E1F0;
+    context->count[0] = context->count[1] = 0;
+}
+
+
+/* Run your data through this. */
+
+void SHA1Update(
+    SHA1_CTX * context,
+    const unsigned char *data,
+    uint32_t len
+)
+{
+    uint32_t i;
+
+    uint32_t j;
+
+    j = context->count[0];
+    if ((context->count[0] += len << 3) < j)
+        context->count[1]++;
+    context->count[1] += (len >> 29);
+    j = (j >> 3) & 63;
+    if ((j + len) > 63)
+    {
+        memcpy(&context->buffer[j], data, (i = 64 - j));
+        SHA1Transform(context->state, context->buffer);
+        for (; i + 63 < len; i += 64)
+        {
+            SHA1Transform(context->state, &data[i]);
+        }
+        j = 0;
+    }
+    else
+        i = 0;
+    memcpy(&context->buffer[j], &data[i], len - i);
+}
+
+
+/* Add padding and return the message digest. */
+
+void SHA1Final(
+    unsigned char digest[20],
+    SHA1_CTX * context
+)
+{
+    unsigned i;
+
+    unsigned char finalcount[8];
+
+    unsigned char c;
+
+#if 0    /* untested "improvement" by DHR */
+    /* Convert context->count to a sequence of bytes
+     * in finalcount.  Second element first, but
+     * big-endian order within element.
+     * But we do it all backwards.
+     */
+    unsigned char *fcp = &finalcount[8];
+
+    for (i = 0; i < 2; i++)
+    {
+        uint32_t t = context->count[i];
+
+        int j;
+
+        for (j = 0; j < 4; t >>= 8, j++)
+            *--fcp = (unsigned char) t}
+#else
+    for (i = 0; i < 8; i++)
+    {
+        finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255);      /* Endian independent */
+    }
+#endif
+    c = 0200;
+    SHA1Update(context, &c, 1);
+    while ((context->count[0] & 504) != 448)
+    {
+        c = 0000;
+        SHA1Update(context, &c, 1);
+    }
+    SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
+    for (i = 0; i < 20; i++)
+    {
+        digest[i] = (unsigned char)
+            ((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
+    }
+    /* Wipe variables */
+    memset(context, '\0', sizeof(*context));
+    memset(&finalcount, '\0', sizeof(finalcount));
+}
+
+void SHA1(
+    char *hash_out,
+    const char *str,
+    uint32_t len)
+{
+    SHA1_CTX ctx;
+    unsigned int ii;
+
+    SHA1Init(&ctx);
+    for (ii=0; ii<len; ii+=1)
+        SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
+    SHA1Final((unsigned char *)hash_out, &ctx);
+}
+
diff --git a/examples/gguf-hash/deps/sha1/sha1.h b/examples/gguf-hash/deps/sha1/sha1.h
new file mode 100644
index 000000000..f492009c9
--- /dev/null
+++ b/examples/gguf-hash/deps/sha1/sha1.h
@@ -0,0 +1,52 @@
+#ifndef SHA1_H
+#define SHA1_H
+
+/*
+   SHA-1 in C
+   By Steve Reid <steve@edmweb.com>
+   100% Public Domain
+ */
+
+#include "stdint.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct
+{
+    uint32_t state[5];
+    uint32_t count[2];
+    unsigned char buffer[64];
+} SHA1_CTX;
+
+void SHA1Transform(
+    uint32_t state[5],
+    const unsigned char buffer[64]
+    );
+
+void SHA1Init(
+    SHA1_CTX * context
+    );
+
+void SHA1Update(
+    SHA1_CTX * context,
+    const unsigned char *data,
+    uint32_t len
+    );
+
+void SHA1Final(
+    unsigned char digest[20],
+    SHA1_CTX * context
+    );
+
+void SHA1(
+    char *hash_out,
+    const char *str,
+    uint32_t len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* SHA1_H */
diff --git a/examples/gguf-hash/deps/sha256/package.json b/examples/gguf-hash/deps/sha256/package.json
new file mode 100644
index 000000000..b92a04127
--- /dev/null
+++ b/examples/gguf-hash/deps/sha256/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "sha256",
+  "version": "0.0.2",
+  "repo": "jb55/sha256.c",
+  "description": "sha256 in c",
+  "keywords": ["sha256", "sha2"],
+  "src": ["sha256.c", "sha256.h"],
+  "dependencies": {
+    "jb55/rotate-bits.h": "0.1.1"
+  },
+  "development": {
+    "thlorenz/tap.c": "*"
+  }
+}
+
diff --git a/examples/gguf-hash/deps/sha256/sha256.c b/examples/gguf-hash/deps/sha256/sha256.c
new file mode 100644
index 000000000..a7a87aeb2
--- /dev/null
+++ b/examples/gguf-hash/deps/sha256/sha256.c
@@ -0,0 +1,221 @@
+/* Crypto/Sha256.c -- SHA-256 Hash
+2010-06-11 : Igor Pavlov : Public domain
+This code is based on public domain code from Wei Dai's Crypto++ library. */
+
+#include "rotate-bits/rotate-bits.h"
+#include "sha256.h"
+
+/* define it for speed optimization */
+#define _SHA256_UNROLL
+#define _SHA256_UNROLL2
+
+void
+sha256_init(sha256_t *p)
+{
+  p->state[0] = 0x6a09e667;
+  p->state[1] = 0xbb67ae85;
+  p->state[2] = 0x3c6ef372;
+  p->state[3] = 0xa54ff53a;
+  p->state[4] = 0x510e527f;
+  p->state[5] = 0x9b05688c;
+  p->state[6] = 0x1f83d9ab;
+  p->state[7] = 0x5be0cd19;
+  p->count = 0;
+}
+
+#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
+#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
+#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
+#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
+
+#define blk0(i) (W[i] = data[i])
+#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
+
+#define Ch(x,y,z) (z^(x&(y^z)))
+#define Maj(x,y,z) ((x&y)|(z&(x|y)))
+
+#define a(i) T[(0-(i))&7]
+#define b(i) T[(1-(i))&7]
+#define c(i) T[(2-(i))&7]
+#define d(i) T[(3-(i))&7]
+#define e(i) T[(4-(i))&7]
+#define f(i) T[(5-(i))&7]
+#define g(i) T[(6-(i))&7]
+#define h(i) T[(7-(i))&7]
+
+
+#ifdef _SHA256_UNROLL2
+
+#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
+  d += h; h += S0(a) + Maj(a, b, c)
+
+#define RX_8(i) \
+  R(a,b,c,d,e,f,g,h, i); \
+  R(h,a,b,c,d,e,f,g, (i+1)); \
+  R(g,h,a,b,c,d,e,f, (i+2)); \
+  R(f,g,h,a,b,c,d,e, (i+3)); \
+  R(e,f,g,h,a,b,c,d, (i+4)); \
+  R(d,e,f,g,h,a,b,c, (i+5)); \
+  R(c,d,e,f,g,h,a,b, (i+6)); \
+  R(b,c,d,e,f,g,h,a, (i+7))
+
+#else
+
+#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
+  d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
+
+#ifdef _SHA256_UNROLL
+
+#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
+
+#endif
+
+#endif
+
+static const uint32_t K[64] = {
+  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void
+sha256_transform(uint32_t *state, const uint32_t *data)
+{
+  uint32_t W[16] = {0};
+  unsigned j;
+  #ifdef _SHA256_UNROLL2
+  uint32_t a,b,c,d,e,f,g,h;
+  a = state[0];
+  b = state[1];
+  c = state[2];
+  d = state[3];
+  e = state[4];
+  f = state[5];
+  g = state[6];
+  h = state[7];
+  #else
+  uint32_t T[8];
+  for (j = 0; j < 8; j++)
+    T[j] = state[j];
+  #endif
+
+  for (j = 0; j < 64; j += 16)
+  {
+    #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
+    RX_8(0); RX_8(8);
+    #else
+    unsigned i;
+    for (i = 0; i < 16; i++) { R(i); }
+    #endif
+  }
+
+  #ifdef _SHA256_UNROLL2
+  state[0] += a;
+  state[1] += b;
+  state[2] += c;
+  state[3] += d;
+  state[4] += e;
+  state[5] += f;
+  state[6] += g;
+  state[7] += h;
+  #else
+  for (j = 0; j < 8; j++)
+    state[j] += T[j];
+  #endif
+
+  /* Wipe variables */
+  /* memset(W, 0, sizeof(W)); */
+  /* memset(T, 0, sizeof(T)); */
+}
+
+#undef S0
+#undef S1
+#undef s0
+#undef s1
+
+static void
+sha256_write_byte_block(sha256_t *p)
+{
+  uint32_t data32[16];
+  unsigned i;
+  for (i = 0; i < 16; i++)
+    data32[i] =
+      ((uint32_t)(p->buffer[i * 4    ]) << 24) +
+      ((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
+      ((uint32_t)(p->buffer[i * 4 + 2]) <<  8) +
+      ((uint32_t)(p->buffer[i * 4 + 3]));
+  sha256_transform(p->state, data32);
+}
+
+
+void
+sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
+{
+  sha256_t hash;
+  sha256_init(&hash);
+  sha256_update(&hash, data, size);
+  sha256_final(&hash, buf);
+}
+
+
+void
+sha256_update(sha256_t *p, const unsigned char *data, size_t size)
+{
+  uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
+  while (size > 0)
+  {
+    p->buffer[curBufferPos++] = *data++;
+    p->count++;
+    size--;
+    if (curBufferPos == 64)
+    {
+      curBufferPos = 0;
+      sha256_write_byte_block(p);
+    }
+  }
+}
+
+
+void
+sha256_final(sha256_t *p, unsigned char *digest)
+{
+  uint64_t lenInBits = (p->count << 3);
+  uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
+  unsigned i;
+  p->buffer[curBufferPos++] = 0x80;
+  while (curBufferPos != (64 - 8))
+  {
+    curBufferPos &= 0x3F;
+    if (curBufferPos == 0)
+      sha256_write_byte_block(p);
+    p->buffer[curBufferPos++] = 0;
+  }
+  for (i = 0; i < 8; i++)
+  {
+    p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
+    lenInBits <<= 8;
+  }
+  sha256_write_byte_block(p);
+
+  for (i = 0; i < 8; i++)
+  {
+    *digest++ = (unsigned char)(p->state[i] >> 24);
+    *digest++ = (unsigned char)(p->state[i] >> 16);
+    *digest++ = (unsigned char)(p->state[i] >> 8);
+    *digest++ = (unsigned char)(p->state[i]);
+  }
+  sha256_init(p);
+}
diff --git a/examples/gguf-hash/deps/sha256/sha256.h b/examples/gguf-hash/deps/sha256/sha256.h
new file mode 100644
index 000000000..21657e66b
--- /dev/null
+++ b/examples/gguf-hash/deps/sha256/sha256.h
@@ -0,0 +1,24 @@
+/* Sha256.h -- SHA-256 Hash
+2010-06-11 : Igor Pavlov : Public domain */
+
+#ifndef __CRYPTO_SHA256_H
+#define __CRYPTO_SHA256_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#define SHA256_DIGEST_SIZE 32
+
+typedef struct sha256_t
+{
+  uint32_t state[8];
+  uint64_t count;
+  unsigned char buffer[64];
+} sha256_t;
+
+void sha256_init(sha256_t *p);
+void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
+void sha256_final(sha256_t *p, unsigned char *digest);
+void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
+
+#endif
diff --git a/examples/gguf-hash/deps/xxhash/clib.json b/examples/gguf-hash/deps/xxhash/clib.json
new file mode 100644
index 000000000..242343c5d
--- /dev/null
+++ b/examples/gguf-hash/deps/xxhash/clib.json
@@ -0,0 +1,12 @@
+{
+  "name": "xxhash",
+  "version": "0.8.2",
+  "repo": "Cyan4973/xxhash",
+  "description": "Extremely fast non-cryptographic hash algorithm",
+  "keywords": ["xxhash", "hashing"],
+  "license": "BSD-2-Clause",
+  "src": [
+    "xxhash.c",
+    "xxhash.h"
+  ]
+}
diff --git a/examples/gguf-hash/deps/xxhash/xxhash.c b/examples/gguf-hash/deps/xxhash/xxhash.c
new file mode 100644
index 000000000..e60cc37f1
--- /dev/null
+++ b/examples/gguf-hash/deps/xxhash/xxhash.c
@@ -0,0 +1,42 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
+#define XXH_IMPLEMENTATION      /* access definitions */
+
+#include "xxhash.h"
diff --git a/examples/gguf-hash/deps/xxhash/xxhash.h b/examples/gguf-hash/deps/xxhash/xxhash.h
new file mode 100644
index 000000000..c0fafe20d
--- /dev/null
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
@@ -0,0 +1,7093 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*!
+ * @mainpage xxHash
+ *
+ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
+ * limits.
+ *
+ * It is proposed in four flavors, in three families:
+ * 1. @ref XXH32_family
+ *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
+ *     32-bit and 64-bit systems.
+ * 2. @ref XXH64_family
+ *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
+ *     64-bit systems (but _not_ 32-bit systems).
+ * 3. @ref XXH3_family
+ *   - Modern 64-bit and 128-bit hash function family which features improved
+ *     strength and performance across the board, especially on smaller data.
+ *     It benefits greatly from SIMD and 64-bit without requiring it.
+ *
+ * Benchmarks
+ * ---
+ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
+ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
+ *
+ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
+ * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
+ * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
+ * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
+ * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
+ * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
+ * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
+ * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
+ * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
+ * | City64               |         |    64 |        22.0 GB/s |                76.6 |
+ * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
+ * | City128              |         |   128 |        21.7 GB/s |                57.7 |
+ * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
+ * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
+ * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
+ * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
+ * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
+ * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
+ * | City32               |         |    32 |         9.1 GB/s |                66.0 |
+ * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
+ * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
+ * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
+ * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
+ * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
+ * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
+ * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
+ * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
+ * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
+ * @note
+ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
+ *     even though it is mandatory on x64.
+ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
+ *     by modern standards.
+ *   - Small data velocity is a rough average of algorithm's efficiency for small
+ *     data. For more accurate information, see the wiki.
+ *   - More benchmarks and strength tests are found on the wiki:
+ *         https://github.com/Cyan4973/xxHash/wiki
+ *
+ * Usage
+ * ------
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
+ * substitution.
+ *
+ * @pre
+ *    For functions which take an input and length parameter, the following
+ *    requirements are assumed:
+ *    - The range from [`input`, `input + length`) is valid, readable memory.
+ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
+ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
+ *      functions access bytes directly as if it was an array of `unsigned char`.
+ *
+ * @anchor single_shot_example
+ * **Single Shot**
+ *
+ * These functions are stateless functions which hash a contiguous block of memory,
+ * immediately returning the result. They are the easiest and usually the fastest
+ * option.
+ *
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
+ *
+ * @code{.c}
+ *   #include <string.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which hashes a null terminated string with XXH32().
+ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
+ *   {
+ *       // NULL pointers are only valid if the length is zero
+ *       size_t length = (string == NULL) ? 0 : strlen(string);
+ *       return XXH32(string, length, seed);
+ *   }
+ * @endcode
+ *
+ *
+ * @anchor streaming_example
+ * **Streaming**
+ *
+ * These groups of functions allow incremental hashing of unknown size, even
+ * more than what would fit in a size_t.
+ *
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #include "xxhash.h"
+ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
+ *   XXH64_hash_t hashFile(FILE* f)
+ *   {
+ *       // Allocate a state struct. Do not just use malloc() or new.
+ *       XXH3_state_t* state = XXH3_createState();
+ *       assert(state != NULL && "Out of memory!");
+ *       // Reset the state to start a new hashing session.
+ *       XXH3_64bits_reset(state);
+ *       char buffer[4096];
+ *       size_t count;
+ *       // Read the file in chunks
+ *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
+ *           // Run update() as many times as necessary to process the data
+ *           XXH3_64bits_update(state, buffer, count);
+ *       }
+ *       // Retrieve the finalized hash. This will not change the state.
+ *       XXH64_hash_t result = XXH3_64bits_digest(state);
+ *       // Free the state. Do not use free().
+ *       XXH3_freeState(state);
+ *       return result;
+ *   }
+ * @endcode
+ *
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ *
+ * @anchor canonical_representation_example
+ * **Canonical Representation**
+ *
+ * The default return values from XXH functions are unsigned 32, 64 and 128 bit
+ * integers.
+ * This the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ *
+ * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),
+ * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),
+ * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(),
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which prints XXH32_hash_t in human readable format
+ *   void printXxh32(XXH32_hash_t hash)
+ *   {
+ *       XXH32_canonical_t cano;
+ *       XXH32_canonicalFromHash(&cano, hash);
+ *       size_t i;
+ *       for(i = 0; i < sizeof(cano.digest); ++i) {
+ *           printf("%02x", cano.digest[i]);
+ *       }
+ *       printf("\n");
+ *   }
+ *
+ *   // Example for a function which converts XXH32_canonical_t to XXH32_hash_t
+ *   XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)
+ *   {
+ *       XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);
+ *       return hash;
+ *   }
+ * @endcode
+ *
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ *  INLINE mode
+ ******************************/
+/*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Gives access to internal state declaration, required for static allocation.
+ *
+ * Incompatible with dynamic linking, due to risks of ABI changes.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_STATIC_LINKING_ONLY
+/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */
+
+/*!
+ * @brief Gives access to internal definitions.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #define XXH_IMPLEMENTATION
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_IMPLEMENTATION
+/* Do not undef XXH_IMPLEMENTATION for Doxygen */
+
+/*!
+ * @brief Exposes the implementation and marks all functions as `inline`.
+ *
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * @endcode
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#  define XXH_INLINE_ALL
+#  undef XXH_INLINE_ALL
+/*!
+ * @brief Exposes the implementation without marking functions as inline.
+ */
+#  define XXH_PRIVATE_API
+#  undef XXH_PRIVATE_API
+/*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
+ */
+#  define XXH_NAMESPACE /* YOUR NAME HERE */
+#  undef XXH_NAMESPACE
+#endif
+
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+    && !defined(XXH_INLINE_ALL_31684351384)
+   /* this section should be traversed only once */
+#  define XXH_INLINE_ALL_31684351384
+   /* give access to the advanced API, required to compile implementations */
+#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+#  define XXH_STATIC_LINKING_ONLY
+   /* make all functions private */
+#  undef XXH_PUBLIC_API
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((__unused__))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* note: this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+
+   /*
+    * This part deals with the special case where a unit wants to inline xxHash,
+    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+    * such as part of some previously included *.h header file.
+    * Without further action, the new include would just be ignored,
+    * and functions would effectively _not_ be inlined (silent failure).
+    * The following macros solve this situation by prefixing all inlined names,
+    * avoiding naming collision with previous inclusions.
+    */
+   /* Before that, we unconditionally #undef all symbols,
+    * in case they were already defined with XXH_NAMESPACE.
+    * They will then be redefined for XXH_INLINE_ALL
+    */
+#  undef XXH_versionNumber
+    /* XXH32 */
+#  undef XXH32
+#  undef XXH32_createState
+#  undef XXH32_freeState
+#  undef XXH32_reset
+#  undef XXH32_update
+#  undef XXH32_digest
+#  undef XXH32_copyState
+#  undef XXH32_canonicalFromHash
+#  undef XXH32_hashFromCanonical
+    /* XXH64 */
+#  undef XXH64
+#  undef XXH64_createState
+#  undef XXH64_freeState
+#  undef XXH64_reset
+#  undef XXH64_update
+#  undef XXH64_digest
+#  undef XXH64_copyState
+#  undef XXH64_canonicalFromHash
+#  undef XXH64_hashFromCanonical
+    /* XXH3_64bits */
+#  undef XXH3_64bits
+#  undef XXH3_64bits_withSecret
+#  undef XXH3_64bits_withSeed
+#  undef XXH3_64bits_withSecretandSeed
+#  undef XXH3_createState
+#  undef XXH3_freeState
+#  undef XXH3_copyState
+#  undef XXH3_64bits_reset
+#  undef XXH3_64bits_reset_withSeed
+#  undef XXH3_64bits_reset_withSecret
+#  undef XXH3_64bits_update
+#  undef XXH3_64bits_digest
+#  undef XXH3_generateSecret
+    /* XXH3_128bits */
+#  undef XXH128
+#  undef XXH3_128bits
+#  undef XXH3_128bits_withSeed
+#  undef XXH3_128bits_withSecret
+#  undef XXH3_128bits_reset
+#  undef XXH3_128bits_reset_withSeed
+#  undef XXH3_128bits_reset_withSecret
+#  undef XXH3_128bits_reset_withSecretandSeed
+#  undef XXH3_128bits_update
+#  undef XXH3_128bits_digest
+#  undef XXH128_isEqual
+#  undef XXH128_cmp
+#  undef XXH128_canonicalFromHash
+#  undef XXH128_hashFromCanonical
+    /* Finally, free the namespace itself */
+#  undef XXH_NAMESPACE
+
+    /* employ the namespace for XXH_INLINE_ALL */
+#  define XXH_NAMESPACE XXH_INLINE_
+   /*
+    * Some identifiers (enums, type names) are not symbols,
+    * but they must nonetheless be renamed to avoid redeclaration.
+    * Alternative solution: do not redeclare them.
+    * However, this requires some #ifdefs, and has a more dispersed impact.
+    * Meanwhile, renaming can be achieved in a single place.
+    */
+#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
+#  define XXH_OK XXH_IPREF(XXH_OK)
+#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
+#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
+#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+   /* Ensure the header is parsed again, even if it was previously included */
+#  undef XXHASH_H_5627135585666179
+#  undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/*! @brief Marks a global symbol. */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Compiler specifics
+***************************************/
+
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#if defined (__GNUC__)
+# define XXH_CONSTF  __attribute__((__const__))
+# define XXH_PUREF   __attribute__((__pure__))
+# define XXH_MALLOCF __attribute__((__malloc__))
+#else
+# define XXH_CONSTF  /* disable */
+# define XXH_PUREF
+# define XXH_MALLOCF
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  3
+/*! @brief Version number, encoded as two digits each */
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to header file.
+ *
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
+ */
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+/*!
+ * @brief Exit code for the streaming API.
+ */
+typedef enum {
+    XXH_OK = 0, /*!< OK */
+    XXH_ERROR   /*!< Error */
+} XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   elif ULONG_MAX == 0xFFFFFFFFUL
+      typedef unsigned long XXH32_hash_t;
+#   else
+#     error "unsupported platform: need a 32-bit type"
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ *   and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit xxHash32 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * @return An allocated pointer of @ref XXH32_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH32_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH32_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 32-bit xxHash32 value from that state.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst  The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*! @cond Doxygen ignores this part */
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when its been specified.
+ */
+#define XXH_C23_VN 201711L
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((__noescape__))
+#else
+# define XXH_NOESCAPE
+#endif
+/*! @endcond */
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results.
+ *   It provides better speed for systems with vector processing capabilities.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit xxHash64 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*!
+ * @brief Allocates an @ref XXH64_state_t.
+ *
+ * @return An allocated pointer of @ref XXH64_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH64_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
+
+/*!
+ * @brief Frees an @ref XXH64_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH64_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH64_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH64_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH64_update().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 64-bit xxHash64 value from that state.
+ *
+ * @note
+ *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
+ */
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
+ *
+ * @param dst The @ref XXH64_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH64_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
+ *
+ * @param src The @ref XXH64_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
+
+#ifndef XXH_NO_XXH3
+
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup XXH3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
+ * at competitive speeds, even without vector support. Further details are
+ * explained in the implementation.
+ *
+ * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
+ * implementations for many common platforms:
+ *   - AVX512
+ *   - AVX2
+ *   - SSE2
+ *   - ARM NEON
+ *   - WebAssembly SIMD128
+ *   - POWER8 VSX
+ *   - s390x ZVector
+ * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
+ * selects the best version according to predefined macros. For the x86 family, an
+ * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+/*-**********************************************************************
+*  XXH3 64-bit variant
+************************************************************************/
+
+/*!
+ * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *   This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however
+ *   it may have slightly better performance due to constant propagation of the
+ *   defaults.
+ *
+ * @see
+ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed   The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*!
+ * @brief Calculates 64-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The opaque state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH3_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret with default parameters.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret from `seed`.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   `secret` is referenced, it _must outlive_ the hash streaming session.
+ *
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 64-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+*  XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p length bytes in size.
+ * @param len  The length of @p data, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p length bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*!
+ * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret with default parameters.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret from `seed`.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 128-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* Following helper functions make it possible to compare XXH128_hast_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * @brief Check equality of two XXH128_hash_t values
+ *
+ * @param h1 The 128-bit hash value.
+ * @param h2 Another 128-bit hash value.
+ *
+ * @return `1` if `h1` and `h2` are equal.
+ * @return `0` if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ *
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @param h128_1 Left-hand side value
+ * @param h128_2 Right-hand side value
+ *
+ * @return >0 if @p h128_1  > @p h128_2
+ * @return =0 if @p h128_1 == @p h128_2
+ * @return <0 if @p h128_1  < @p h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+
+
+/*!
+ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
+ *
+ * @param dst  The @ref XXH128_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH128_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
+ *
+ * @param src The @ref XXH128_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v[4];         /*!< Accumulator lanes */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v[4];         /*!< Accumulator lanes */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+/* Windows SDK under 10.0.22000 is missing stdalign.h so we add a check
+   before allowing the windows compiler to use the C11 form.
+   Reference: https://github.com/Cyan4973/xxHash/issues/955 */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) \
+    && (defined(_MSC_VER) && (_MSC_VER >= 1000) || !defined(_MSC_VER)) /* >= C11 */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64b_update(), XXH3_128b_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @internal
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Do never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< Reserved field. Needed for padding on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number or stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do {                                                     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0)
+
+
+/*!
+ * @brief Calculates the 128-bit hash of @p data using XXH3.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p len is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 128-bit XXH3 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * @brief Derive a high-entropy secret from any user-defined content, named customSeed.
+ *
+ * @param secretBuffer    A writable buffer for derived high-entropy secret data.
+ * @param secretSize      Size of secretBuffer, in bytes.  Must be >= XXH3_SECRET_SIZE_MIN.
+ * @param customSeed      A user-defined content.
+ * @param customSeedSize  Size of customSeed, in bytes.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argv != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
+ * @param seed         The 64-bit seed to alter the hash result predictably.
+ *
+ * The generated secret can be used in combination with
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, seed);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * @brief Maximum size of "short" key in bytes.
+ */
+#define XXH3_MIDSIZE_MAX 240
+
+/*!
+ * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed       The 64-bit seed to alter the hash result predictably.
+ *
+ * These variants generate hash values using either:
+ * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
+ * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact to the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+
+/*!
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data       The memory segment to be hashed, at least @p len bytes in size.
+ * @param length     The length of @p data, in bytes.
+ * @param secret     The secret used to alter hash result predictably.
+ * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN)
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(): contract is the same.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
+ *
+ * Note: there was a bug in an earlier version of this function (<= v0.8.2)
+ * that would make it generate an incorrect hash value
+ * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX
+ * and @p secret is different from XXH3_generateSecret_fromSeed().
+ * As stated in the contract, the correct hash result must be
+ * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX.
+ * Results generated by this older version are wrong, hence not comparable.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+
+#endif /* !XXH_NO_STREAM */
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in /include directory.
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The below switch allow selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage to forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single shot functions just use the
+ *    streaming API.
+ */
+#  define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
+ * which are platforms known to offer good unaligned memory accesses performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+#  define XXH3_INLINE_SECRET 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on exact architecture, a jmp may be preferable.
+ *
+ * This setting is only possibly making a difference for very small inputs.
+ */
+#  define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
+ */
+#  define XXH_NO_STREAM
+#  undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* prefer __packed__ structures (method 1) for GCC
+    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
+    * which for some reason does unaligned loads. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 for -Os or -Oz */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH3_INLINE_SECRET
+#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)
+#    define XXH3_INLINE_SECRET 0
+#  else
+#    define XXH3_INLINE_SECRET 1
+#  endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoked malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif  /* XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  if defined(__GNUC__) || defined(__clang__)
+#    define XXH_FORCE_INLINE static __attribute__((__unused__))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__))
+#  define XXH_NO_INLINE static __attribute__((__noinline__))
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif
+
+#if XXH3_INLINE_SECRET
+#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+#else
+#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+#endif
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  if defined(__INTEL_COMPILER)
+#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
+#  else
+#    define XXH_ASSERT(c)   XXH_ASSUME(c)
+#  endif
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+#  else
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+#  endif
+#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+#else
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* Specifically for NEON vectors which use the "w" constraint, on
+ * Clang. */
+#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
+#else
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+    typedef uint8_t xxh_u8;
+#else
+    typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPU which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((__aligned__(1))) xxh_u32 xxh_unalign32;
+    return *((const xxh_unalign32*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define XXH_HAS_BUILTIN(x) 0
+#endif
+
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else
+#  define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+#  define XXH_ASSUME(c) __builtin_assume(c)
+#else
+#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned */
+    XXH_unaligned /*!< Possibly unaligned */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is used to prevent GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movqda tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // x >>= 19
+     *      por    v,  tmp // x |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+     * than half the speed.
+     *
+     * Additionally, this is used on WASM SIMD128 because it JITs to the same
+     * SIMD instructions and has the same issue.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    hash ^= hash >> 15;
+    hash *= XXH_PRIME32_2;
+    hash ^= hash >> 13;
+    hash *= XXH_PRIME32_3;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize().
+ */
+static XXH_PUREF xxh_u32
+XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                             \
+    hash += (*ptr++) * XXH_PRIME32_5;                 \
+    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
+} while (0)
+
+#define XXH_PROCESS4 do {                             \
+    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
+    ptr += 4;                                         \
+    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
+} while (0)
+
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+
+    /* Compact rerolled version; generally faster */
+    if (!XXH32_ENDJMP) {
+        len &= 15;
+        while (len >= 4) {
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(hash);
+    } else {
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 8:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(hash);
+
+           case 13:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 9:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 14:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 10:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 15:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 11:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 7:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 3:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 2:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 1:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 0:       return XXH32_avalanche(hash);
+        }
+        XXH_ASSERT(0);
+        return hash;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input , len , seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    xxh_u32 h32;
+
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=16) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 15;
+        xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+        xxh_u32 v2 = seed + XXH_PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - XXH_PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + XXH_PRIME32_5;
+    }
+
+    h32 += (xxh_u32)len;
+
+    return XXH32_finalize(h32, input, len&15, align);
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+    statePtr->v[1] = seed + XXH_PRIME32_2;
+    statePtr->v[2] = seed + 0;
+    statePtr->v[3] = seed - XXH_PRIME32_1;
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const xxh_u32* p32 = state->mem32;
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;
+            state->memsize = 0;
+        }
+
+        if (p <= bEnd-16) {
+            const xxh_u8* const limit = bEnd - 16;
+
+            do {
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v[0], 1)
+            + XXH_rotl32(state->v[1], 7)
+            + XXH_rotl32(state->v[2], 12)
+            + XXH_rotl32(state->v[3], 18);
+    } else {
+        h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((__aligned__(1))) xxh_u64 xxh_unalign64;
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather that static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * DISABLE AUTOVECTORIZATION:
+     * A compiler fence is used to prevent GCC and Clang from
+     * autovectorizing the XXH64 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling AVX512.
+     *
+     * Autovectorization of XXH64 tends to be detrimental,
+     * though the exact outcome may change depending on exact cpu and compiler version.
+     * For information, it has been reported as detrimental for Skylake-X,
+     * but possibly beneficial for Zen4.
+     *
+     * The default is to disable auto-vectorization,
+     * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)
+{
+    hash ^= hash >> 33;
+    hash *= XXH_PRIME64_2;
+    hash ^= hash >> 29;
+    hash *= XXH_PRIME64_3;
+    hash ^= hash >> 32;
+    return hash;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 32.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;
+    while (len >= 8) {
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        hash ^= k1;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return  XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    xxh_u64 h64;
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=32) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 31;
+        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 v2 = seed + XXH_PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - XXH_PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/*******   Hash Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH64_family*/
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[2] = seed + 0;
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len += len;
+
+        if (state->memsize + len < 32) {  /* fill in tmp buffer */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* tmp buffer is full */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
+            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
+            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
+            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;
+            state->memsize = 0;
+        }
+
+        if (p+32 <= bEnd) {
+            const xxh_u8* const limit = bEnd - 32;
+
+            do {
+                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
+                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
+                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
+                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
+{
+    xxh_u64 h64;
+
+    if (state->total_len >= 32) {
+        h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
+        h64 = XXH64_mergeRound(h64, state->v[0]);
+        h64 = XXH64_mergeRound(h64, state->v[1]);
+        h64 = XXH64_mergeRound(h64, state->v[2]);
+        h64 = XXH64_mergeRound(h64, state->v[3]);
+    } else {
+        h64  = state->v[2] /*seed*/ + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) state->total_len;
+
+    return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation   *******/
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT   /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined (__clang__)) \
+   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+/*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+#  define XXH_RESTRICT   __restrict
+#else
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#ifndef XXH_HAS_INCLUDE
+#  ifdef __has_include
+/*
+ * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
+ * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
+ */
+#    define XXH_HAS_INCLUDE __has_include
+#  else
+#    define XXH_HAS_INCLUDE(x) 0
+#  endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  endif
+#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
+   || (defined(_M_ARM) && _M_ARM >= 7) \
+   || defined(_M_ARM64) || defined(_M_ARM64EC) \
+   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  elif defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ *
+ * Usually, if this happens, it is because of an accident and you probably need
+ * to specify -march, as you likely meant to compile for a newer architecture.
+ *
+ * Credit: large sections of the vectorial and asm source code paths
+ *         have been contributed by @easyaspi314
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD or any of the values mentioned in
+ * @ref XXH_VECTOR_TYPE.
+ *
+ * If this is not defined, it uses predefined macros to determine the best
+ * implementation.
+ */
+#  define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * internal macro XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+    XXH_SCALAR = 0,  /*!< Portable scalar version */
+    XXH_SSE2   = 1,  /*!<
+                      * SSE2 for Pentium 4, Opteron, all x86_64.
+                      *
+                      * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                      * Android x86.
+                      */
+    XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
+    XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
+    XXH_NEON   = 4,  /*!<
+                       * NEON for most ARMv7-A, all AArch64, and WASM SIMD128
+                       * via the SIMDeverywhere polyfill provided with the
+                       * Emscripten SDK.
+                       */
+    XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
+};
+/*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment required for said vector
+ * type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+#  define XXH_ACC_ALIGN 8
+#endif
+
+/* Actual definition */
+#ifndef XXH_DOXYGEN
+#  define XXH_SCALAR 0
+#  define XXH_SSE2   1
+#  define XXH_AVX2   2
+#  define XXH_AVX512 3
+#  define XXH_NEON   4
+#  define XXH_VSX    5
+#  define XXH_SVE    6
+#endif
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__ARM_FEATURE_SVE)
+#    define XXH_VECTOR XXH_SVE
+#  elif ( \
+        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
+     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
+     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
+   ) && ( \
+        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+   )
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__AVX512F__)
+#    define XXH_VECTOR XXH_AVX512
+#  elif defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+     || (defined(__s390x__) && defined(__VEC__)) \
+     && defined(__GNUC__) /* TODO: IBM XL */
+#    define XXH_VECTOR XXH_VSX
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR
+#endif
+
+/*
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
+ */
+#ifndef XXH_ACC_ALIGN
+#  if defined(XXH_X86DISPATCH)
+#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_NEON  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_VSX   /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#     define XXH_ACC_ALIGN 64
+#  endif
+#endif
+
+#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#else
+#  define XXH_SEC_ALIGN 8
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_ALIASING __attribute__((__may_alias__))
+#else
+#  define XXH_ALIASING /* nothing */
+#endif
+
+/*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling resulting
+ * in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+ * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use either
+ *   -O2 -mavx2 -march=haswell
+ * or
+ *   -O2 -mavx2 -mno-avx256-split-unaligned-load
+ * for decent performance, or to use Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC into
+ * -O2, but the other one we can't control without "failed to inline always
+ * inline function due to target mismatch" warnings.
+ */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC push_options
+#  pragma GCC optimize("-O2")
+#endif
+
+#if XXH_VECTOR == XXH_NEON
+
+/*
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
+ *
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
+ */
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+/*!
+ * @internal
+ * @brief `vld1q_u64` but faster and alignment-safe.
+ *
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
+ *
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
+ *
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
+ * unaligned load.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
+{
+    return *(xxh_aliasing_uint64x2_t const *)ptr;
+}
+#else
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
+{
+    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
+}
+#endif
+
+/*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way */
+    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
+
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * This can be set to 2, 4, 6, or 8.
+ *
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
+ *  |:----------------------|:--------------------|----------:|-----------:|------:|
+ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning
+ * it effectively becomes worse 4.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meaning swap depending on version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+ /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+    acc = svadd_u64_x(mask, acc, mul);                               \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if XXH_SIZE_OPT >= 1
+#    define XXH_PREFETCH(ptr) (void)(ptr)
+#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH. */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
+static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */
+
+#ifdef XXH_OLD_NAMES
+#  define kSecret XXH3_kSecret
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
+ *
+ * @param lhs , rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this type
+     * despite not having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+    /*
+     * MSVC for ARM64's __umulh method.
+     *
+     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+     */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(__umulh)
+#endif
+    XXH128_hash_t r128;
+    r128.low64  = lhs * rhs;
+    r128.high64 = __umulh(lhs, rhs);
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
+     *     ---------
+     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+    return product.low64 ^ product.high64;
+}
+
+/*! Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(0 <= shift && shift < 64);
+    return v64 ^ (v64 >> shift);
+}
+
+/*
+ * This is a fast avalanche stage,
+ * suitable when input bits are already partially mixed
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    h64 = XXH_xorshift64(h64, 37);
+    h64 *= PRIME_MX1;
+    h64 = XXH_xorshift64(h64, 32);
+    return h64;
+}
+
+/*
+ * This is a stronger avalanche,
+ * inspired by Pelle Evensen's rrmxmx
+ * preferable when input has not been previously mixed
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    /* this mix is inspired by Pelle Evensen's rrmxmx */
+    h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    h64 *= PRIME_MX2;
+    h64 ^= (h64 >> 35) + len ;
+    h64 *= PRIME_MX2;
+    return XXH_xorshift64(h64, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. It used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8  const c1 = input[0];
+        xxh_u8  const c2 = input[len >> 1];
+        xxh_u8  const c3 = input[len - 1];
+        xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
+                               | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
+        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+        return XXH64_avalanche(keyed);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input1 = XXH_readLE32(input);
+        xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+        xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+        xxh_u64 const keyed = input64 ^ bitflip;
+        return XXH3_rrmxmx(keyed, len);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
+        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+        xxh_u64 const acc = len
+                          + XXH_swap64(input_lo) + input_hi
+                          + XXH3_mul128_fold64(input_lo, input_hi);
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+        if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+        return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+    }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+     * slower code.
+     *
+     * By forcing seed64 into a register, we disrupt the cost model and
+     * cause it to scalarize. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is an interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64 const input_hi = XXH_readLE64(input+8);
+        return XXH3_mul128_fold64(
+            input_lo ^ (XXH_readLE64(secret)   + seed64),
+            input_hi ^ (XXH_readLE64(secret+8) - seed64)
+        );
+    }
+}
+
+/* For mid range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+#if XXH_SIZE_OPT >= 1
+        /* Smaller and cleaner, but slightly slower. */
+        unsigned int i = (unsigned int)(len - 1) / 32;
+        do {
+            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
+            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
+        } while (i-- != 0);
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+#endif
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * In everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            /*
+             * Prevents clang for unrolling the acc loop and interleaving with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    XXH_memcpy(dst, &v64, sizeof(v64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+#if (XXH_VECTOR == XXH_AVX512) \
+     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+
+#ifndef XXH_TARGET_AVX512
+# define XXH_TARGET_AVX512  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    __m512i* const xacc = (__m512i *) acc;
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+
+    {
+        /* data_vec    = input[0]; */
+        __m512i const data_vec    = _mm512_loadu_si512   (input);
+        /* key_vec     = secret[0]; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        /* data_key    = data_vec ^ key_vec; */
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        /* data_key_lo = data_key >> 32; */
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
+        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+        /* xacc[0] += swap(data_vec); */
+        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+        /* xacc[0] += product; */
+        *xacc = _mm512_add_epi64(product, sum);
+    }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+    {   __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        /* xacc[0] ^= secret; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
+
+        /* xacc[0] *= XXH_PRIME32_1; */
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
+
+        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+              __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 63) == 0);
+        for (i=0; i < nbRounds; ++i) {
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc    =       (__m256i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
+        const         __m256i* const xinput  = (const __m256i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+         *   - do not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these reg into stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* GCC -O2 need unroll loop manually */
+        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
+    }
+}
+
+#endif
+
+/* x86dispatch always generates SSE2 */
+#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+#ifndef XXH_TARGET_SSE2
+# define XXH_TARGET_SSE2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* SSE2 is just a half-scale version of the AVX2 version. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   __m128i* const xacc    =       (__m128i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xinput  = (const __m128i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   __m128i* const xacc = (__m128i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+         *   - do not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these reg into stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* forward declarations for the scalar routines */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief The bulk processing loop for NEON and WASM SIMD128.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
+ * CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
+ * there needs to be *three* versions of the accumulate operation used
+ * for the remaining 2 lanes.
+ *
+ * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
+ * nearly perfectly.
+ */
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
+    {   /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* xinput = (const uint8_t *) input;
+        uint8_t const* xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+#ifdef __wasm_simd128__
+        /*
+         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
+         * is constant propagated, which results in it converting it to this
+         * inside the loop:
+         *
+         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
+         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
+         *    ...
+         *
+         * This requires a full 32-bit address immediate (and therefore a 6 byte
+         * instruction) as well as an add for each offset.
+         *
+         * Putting an asm guard prevents it from folding (at the cost of losing
+         * the alignment hint), and uses the free offset in `v128.load` instead
+         * of adding secret_offset each time which overall reduces code size by
+         * about a kilobyte and improves performance.
+         */
+        XXH_COMPILER_GUARD(xsecret);
+#endif
+        /* Scalar lanes use the normal scalarRound routine */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarRound(acc, input, secret, i);
+        }
+        i = 0;
+        /* 4 NEON lanes at a time. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * The intrinsic returns a double vector because the original ARMv7-a
+             * instruction modified both arguments in place. AArch64 and SIMD128 emit
+             * two instructions from this intrinsic.
+             *
+             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply which, as for most
+             * widening intrinsics, have a variant that works on both high half vectors
+             * for free on AArch64. A similar instruction is available on SIMD128.
+             *
+             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+             */
+            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+            /*
+             * Clang reorders
+             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             * to
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
+             *
+             * While it would make sense in theory since the addition is faster,
+             * for reasons likely related to umlal being limited to certain NEON
+             * pipelines, this is worse. A compiler guard fixes this.
+             */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i]   = vaddq_u64(xacc[i], sum_1);
+            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+        }
+        /* Operate on the remaining NEON lanes 2 at a time. */
+        for (; i < XXH3_NEON_LANES / 2; i++) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            /* acc_vec_2 = swap(data_vec) */
+            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* For two lanes, just use VMOVN and VSHRN. */
+            /* data_key_lo = data_key & 0xFFFFFFFF; */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* data_key_hi = data_key >> 32; */
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+            /* Same Clang workaround as before */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i] = vaddq_u64 (xacc[i], sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
+        uint8_t const* xsecret = (uint8_t const*) secret;
+
+        size_t i;
+        /* WASM uses operator overloads and doesn't need these. */
+#ifndef __wasm_simd128__
+        /* { prime32_1, prime32_1 } */
+        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
+        /* { 0, prime32_1, 0, prime32_1 } */
+        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
+#endif
+
+        /* AArch64 uses both scalar and neon at the same time */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarScrambleRound(acc, secret, i);
+        }
+        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            uint64x2_t acc_vec  = xacc[i];
+            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+            /* xacc[i] ^= xsecret[i]; */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* xacc[i] *= XXH_PRIME32_1 */
+#ifdef __wasm_simd128__
+            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
+            xacc[i] = data_key * XXH_PRIME32_1;
+#else
+            /*
+             * Expanded version with portable NEON intrinsics
+             *
+             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+             *
+             * prod_hi = hi(data_key) * lo(prime) << 32
+             *
+             * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector
+             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
+             * and avoid the shift.
+             */
+            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
+            /* Extract low bits for vmlal_u32  */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+#endif
+        }
+    }
+}
+#endif
+
+#if (XXH_VECTOR == XXH_VSX)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* presumed aligned */
+    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
+    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
+    xxh_u64x2 const v32 = { 32, 32 };
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+        /* data_vec = xinput[i]; */
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
+        /* key_vec = xsecret[i]; */
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+        xxh_u64x2 const data_key = data_vec ^ key_vec;
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+        /* acc_vec = xacc[i]; */
+        xxh_u64x2 acc_vec        = xacc[i];
+        acc_vec += product;
+
+        /* swap high and low halves */
+#ifdef __s390x__
+        acc_vec += vec_permi(data_vec, data_vec, 2);
+#else
+        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+#endif
+        xacc[i] = acc_vec;
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+        const xxh_u8* const xsecret = (const xxh_u8*) secret;
+        /* constants */
+        xxh_u64x2 const v32  = { 32, 32 };
+        xxh_u64x2 const v47 = { 47, 47 };
+        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            xxh_u64x2 const acc_vec  = xacc[i];
+            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+            /* xacc[i] ^= xsecret[i]; */
+            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+            xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
+            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
+            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+            xacc[i] = prod_odd + (prod_even << v32);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_SVE)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+    uint64_t element_count = svcntd();
+    if (element_count >= 8) {
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128 */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{
+    if (nbStripes != 0) {
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+        uint64_t element_count = svcntd();
+        if (element_count >= 8) {
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop); */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128 */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 2, acc1);
+           svst1_u64(mask, xacc + 4, acc2);
+           svst1_u64(mask, xacc + 6, acc3);
+        } else {
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 4, acc1);
+       }
+    }
+}
+
+#endif
+
+/* scalar variants - universal */
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+/*
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
+ *
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
+ * big Cortex designs have a full 64-bit multiplier.
+ *
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
+ *
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
+ * not have this penalty and does the mask automatically.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    xxh_u64 ret;
+    /* note: %x = 64-bit register, %w = 32-bit register */
+    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
+    return ret;
+}
+#else
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
+}
+#endif
+
+/*!
+ * @internal
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc,
+                 void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret,
+                 size_t lane)
+{
+    xxh_u64* xacc = (xxh_u64*) acc;
+    xxh_u8 const* xinput  = (xxh_u8 const*) input;
+    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    {
+        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
+        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
+        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
+    }
+}
+
+/*!
+ * @internal
+ * @brief Processes a 64 byte block of data using the scalar path.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
+#if defined(__GNUC__) && !defined(__clang__) \
+  && (defined(__arm__) || defined(__thumb2__)) \
+  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
+  && XXH_SIZE_OPT <= 0
+#  pragma GCC unroll 8
+#endif
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarRound(acc, input, secret, i);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
+
+/*!
+ * @internal
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__GNUC__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes the compiler to assume
+     * that XXH3_kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipsline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
+
+
+#if (XXH_VECTOR == XXH_AVX512)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif
+
+#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
+#  undef XXH3_initCustomSecret
+#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#endif
+
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    size_t const nb_blocks = (len - 1) / block_len;
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+
+        /* last stripe */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance to transmit secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier to the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of vector loop.
+ */
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+#if XXH_SIZE_OPT <= 0
+    if (seed == 0)
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secretLen` condition is not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     * Also, note that function signature doesn't offer room to return an error.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+}
+
+
+/* ===   Public entry point   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
+{
+    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Malloc's a pointer that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_malloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocate an @ref XXH3_state_t.
+ *
+ * @return An allocated pointer of @ref XXH3_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH3_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Frees an @ref XXH3_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note Must be allocated with XXH3_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 */
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block*
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset of the last block in @p secret
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+        XXH_memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        /* small input : just fill in tmp buffer */
+        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+            input = XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                       input, nbStripes,
+                                       secret, state->secretLimit,
+                                       f_acc, f_scramble);
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+        }
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        XXH_memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        /* Copy to temp buffer */
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        lastStripePtr = lastStripe;
+    }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+    if (state->useSeed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* A doubled version of 1to3_64b with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64  = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= PRIME_MX2;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             *    a + (b * c)
+             * Inverse Property: x + y - x == y
+             *    a + (b * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    a + (b * 1) + (b * (c - 1))
+             * Identity Property: x * 1 == x
+             *    a + b + (b * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+    return acc;
+}
+
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         *  We set as `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input  + i - 32,
+                                input  + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   XXH128_hash_t h128;
+        h128.low64  = XXH3_mergeAccs(acc,
+                                     secret + XXH_SECRET_MERGEACCS_START,
+                                     (xxh_u64)len * XXH_PRIME64_1);
+        h128.high64 = XXH3_mergeAccs(acc,
+                                     secret + secretSize
+                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                     ~((xxh_u64)len * XXH_PRIME64_2));
+        return h128;
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate f_acc,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed64 == 0)
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_64bits_update(state, input, len);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->useSeed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 is equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * @return : >0 if *h128_1  > *h128_2
+ *           <0 if *h128_1  < *h128_2
+ *           =0 if *h128_1 == *h128_2  */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*======   Canonical representation   ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);
+    h.low64  = XXH_readBE64(src->digest + 8);
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
+{
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    {   size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    {   size_t const nbSeg16 = secretSize / 16;
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n<nbSeg16; n++) {
+            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
+            XXH3_combine16((char*)secretBuffer + n*16, h128);
+        }
+        /* last segment */
+        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+    }
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
+{
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    XXH3_initCustomSecret(secret, seed);
+    XXH_ASSERT(secretBuffer != NULL);
+    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+}
+
+
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC pop_options
+#endif
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif  /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+} /* extern "C" */
+#endif
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
new file mode 100644
index 000000000..9523ec122
--- /dev/null
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -0,0 +1,694 @@
+#include "ggml.h"
+#include "gguf.h"
+
+#include <cstdlib>   /* abort() */
+#include <cstddef>
+#include <cstdio>
+#include <string>
+#include <stdexcept>
+#include <algorithm>
+#include <cstring>
+
+#include <sstream>
+#include <fstream>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "xxhash/xxhash.h"
+#include "sha1/sha1.h"
+#include "sha256/sha256.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+
+// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
+#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
+#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
+
+
+#define HASH_TYPE_SHA256_STR "sha256"
+#define HASH_TYPE_SHA1_STR   "sha1"
+#define HASH_TYPE_XXH64_STR  "xxh64"
+#define HASH_TYPE_UUID_STR   "uuid"
+
+
+typedef enum {
+    HASH_EXIT_SUCCESS = 0, // All hash has been generated or validated
+    HASH_EXIT_FAILURE = 1, // Generic Failure
+    HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
+    HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
+    HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
+    HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
+} hash_exit_code_t;
+
+
+typedef enum {
+    HASH_MANIFEST_NOT_FOUND,
+    HASH_MANIFEST_MISMATCH,
+    HASH_MANIFEST_OK,
+} hash_manifest_result_t;
+
+
+struct hash_params {
+    std::string input;
+    bool xxh64 = false;
+    bool sha1 = false;
+    bool sha256 = false;
+    bool uuid = false;
+
+    bool no_layer = false;
+
+    bool manifest_is_usable = false;
+    std::string manifest_file;
+};
+
+struct manifest_check_params {
+    bool xxh64 = false;
+    bool sha1 = false;
+    bool sha256 = false;
+    bool uuid = false;
+};
+
+static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
+    switch (value) {
+        case HASH_MANIFEST_NOT_FOUND: return "Not Found";
+        case HASH_MANIFEST_MISMATCH: return "Mismatch";
+        case HASH_MANIFEST_OK: return "Ok";
+    }
+    return "?";
+}
+
+static char const * hash_exit_code_to_str(hash_exit_code_t value) {
+    switch (value) {
+        case HASH_EXIT_SUCCESS: return "Success";
+        case HASH_EXIT_FAILURE: return "Failure";
+        case HASH_EXIT_MISMATCH: return "Mismatch";
+        case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
+        case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
+        case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
+    }
+    return "?";
+}
+
+static void hash_print_usage(const char * executable) {
+    const hash_params default_params;
+    printf("\n");
+    printf("usage: %s [options] GGUF_IN\n", executable);
+    printf("\n");
+    printf("Hash a GGUF file");
+    printf("\n");
+    printf("options:\n");
+    printf("  -h, --help              show this help message and exit\n");
+    printf("      --xxh64             use xxh64 hash\n");
+    printf("      --sha1              use sha1 hash\n");
+    printf("      --sha256            use sha256 hash\n");
+    printf("      --all               use all hash\n");
+    printf("      --no-layer          exclude per layer hash\n");
+    printf("      --uuid              generate UUIDv5 ID\n");
+    printf("  -c, --check <manifest>  verify against a manifest\n");
+    printf("\n");
+}
+
+static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
+    std::string arg;
+    bool invalid_param = false;
+    const std::string arg_prefix = "--";
+
+    int arg_idx = 1;
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        arg = argv[arg_idx];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        bool arg_found = false;
+        if (arg == "-h" || arg == "--help") {
+            hash_print_usage(argv[0]);
+            exit(0);
+        }
+
+        if (arg == "--xxh64") {
+            arg_found = true;
+            params.xxh64 = true;
+        }
+
+        if (arg == "--sha1") {
+            arg_found = true;
+            params.sha1 = true;
+        }
+
+        if (arg == "--uuid") {
+            arg_found = true;
+            params.uuid = true;
+        }
+
+        if (arg == "--sha256") {
+            arg_found = true;
+            params.sha256 = true;
+        }
+
+        if (arg == "--all") {
+            arg_found = true;
+            params.sha256 = true;
+            params.sha1 = true;
+            params.xxh64 = true;
+        }
+
+        if (arg == "--no-layer") {
+            arg_found = true;
+            params.no_layer = true;
+        }
+
+        if (arg == "-c" || arg == "--check") {
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            arg_found = true;
+            params.manifest_file = argv[arg_idx];
+        }
+
+        if (!arg_found) {
+            throw std::invalid_argument("error: unknown argument: " + arg);
+        }
+    }
+
+    if (invalid_param) {
+        throw std::invalid_argument("error: invalid parameter for argument:" + arg);
+    }
+
+    if (argc - arg_idx < 1) {
+        throw std::invalid_argument("error: bad arguments");
+    }
+
+    params.input = argv[arg_idx++];
+}
+
+static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
+    bool result = true;
+    try {
+        hash_params_parse_ex(argc, argv, params);
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        hash_print_usage(argv[0]);
+        exit(EXIT_FAILURE);
+    }
+    return result;
+}
+
+static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
+    if (manifest_file.empty()) {
+        return false;
+    }
+
+    std::ifstream file(manifest_file);
+    if (!file.is_open()) {
+        return false;
+    }
+
+    std::string manifest_entry_line;
+    while (getline(file, manifest_entry_line)) {
+        // hash_type_str hash_str tensor_name
+        // e.g. 'xxh64     f66e9cd66a4396a0  test.gguf:tensor_0'
+        std::istringstream line_stream(manifest_entry_line);
+        std::string file_hash_type;
+        if (line_stream >> file_hash_type) {
+            if (file_hash_type == HASH_TYPE_SHA256_STR) {
+                manifest_check.sha256 = true;
+            } else if (file_hash_type == HASH_TYPE_SHA1_STR) {
+                manifest_check.sha1 = true;
+            } else if (file_hash_type == HASH_TYPE_XXH64_STR) {
+                manifest_check.xxh64 = true;
+            } else if (file_hash_type == HASH_TYPE_UUID_STR) {
+                manifest_check.uuid = true;
+            }
+        }
+    }
+
+    return true;
+}
+
+static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
+    if (manifest_file.empty()) {
+        return HASH_MANIFEST_NOT_FOUND;
+    }
+
+    std::ifstream file(manifest_file);
+    if (!file.is_open()) {
+        return HASH_MANIFEST_NOT_FOUND;
+    }
+
+    std::string manifest_entry_line;
+    while (getline(file, manifest_entry_line)) {
+        std::istringstream line_stream(manifest_entry_line);
+        std::string file_hash_type;
+        std::string file_hash;
+        std::string file_tensor_name;
+        if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
+            // Line parsed. Check hash validity
+
+            if (file_hash_type != hash_type_str) {
+                continue;
+            }
+
+            if (file_tensor_name != tensor_name) {
+                continue;
+            }
+
+            return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
+        }
+    }
+
+    return HASH_MANIFEST_NOT_FOUND;
+}
+
+static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
+    // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
+    // Assumes that digest was processed correctly with the expected namespace
+    for (int i = 0; i < 16; i++) {
+        uuid[i] = sha1_digest[i];
+    }
+
+    // Set bits corresponding to UUID ver 5
+    uuid[ 6] &= ~(0xF << 4);
+    uuid[ 6] |= (5 << 4);
+
+    // Set bits corresponding to UUID variant 0b10XX
+    uuid[ 8] &= ~(0xc << 4);
+    uuid[ 8] |= (0x8 << 4);
+}
+
+static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
+    const std::string & fname = hash_params.input;
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    // xxh64 init
+    XXH64_state_t* xxh64_model_hash_state = NULL;
+    if (hash_params.xxh64) {
+        xxh64_model_hash_state = XXH64_createState();
+        if (xxh64_model_hash_state==NULL) {
+            abort();
+        }
+
+        XXH64_hash_t const seed = 0;
+        if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
+            abort();
+        }
+    }
+
+    // sha1 init
+    SHA1_CTX sha1_model_hash_ctx;
+    if (hash_params.sha1) {
+        SHA1Init(&sha1_model_hash_ctx);
+    }
+
+    // sha256 init
+    sha256_t sha256_model_hash_ctx;
+    if (hash_params.sha256) {
+        sha256_init(&sha256_model_hash_ctx);
+    }
+
+    // sha1 for uuid init
+    SHA1_CTX sha1_for_uuid_ctx;
+    if (hash_params.uuid) {
+        unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
+        SHA1Init(&sha1_for_uuid_ctx);
+        SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
+    }
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+    const int n_tensors = gguf_get_n_tensors(ctx);
+    bool tensor_layer_in_manifest = false;
+    bool model_in_manifest = false;
+    bool tensor_layer_has_mismatch = false;
+    bool model_has_mismatch = false;
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx, i);
+        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        auto n_bytes = ggml_nbytes(cur);
+        auto *raw_data = cur->data;
+        const std::string tensor_layer_name = fname + ":" + name;
+
+        if (hash_params.xxh64) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
+
+                char hex_result[17];
+                for (int  offset = 0; offset < 8; offset++) {
+                    unsigned int shift_bits_by = (8 * (8 - offset - 1));
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s  %-s  %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
+        }
+
+        if (hash_params.sha1) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                char result[21]; // sha1 outputs 20 bytes
+                SHA1( result, (const char *)raw_data, n_bytes);
+
+                char hex_result[41] = {0};
+                for (int  offset = 0; offset < 20; offset++) {
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s  %-s  %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
+        }
+
+        if (hash_params.sha256) {
+
+            if (!hash_params.no_layer) {
+                // Per Layer Hash
+                unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
+                sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
+
+                char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
+                for (int  offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
+                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+                }
+
+                if (hash_params.manifest_is_usable) {
+                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
+
+                    switch (verify_result) {
+                        case HASH_MANIFEST_NOT_FOUND:
+                            break;
+                        case HASH_MANIFEST_MISMATCH:
+                            tensor_layer_in_manifest = true;
+                            tensor_layer_has_mismatch = true;
+                            break;
+                        case HASH_MANIFEST_OK:
+                            tensor_layer_in_manifest = true;
+                            break;
+                    }
+
+                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
+                } else {
+                    printf("%-8s  %-s  %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
+                }
+            }
+
+            // Overall Model Hash
+            sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
+        }
+
+        if (hash_params.uuid) {
+            SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
+        }
+    }
+
+    if (hash_params.xxh64) {
+        XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
+
+        char hex_result[17];
+        for (int  offset = 0; offset < 8; offset++) {
+            unsigned int shift_bits_by = (8 * (8 - offset - 1));
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
+        }
+    }
+
+    if (hash_params.sha1) {
+        unsigned char result[21];
+        SHA1Final(result, &sha1_model_hash_ctx);
+
+        char hex_result[41];
+        for (int  offset = 0; offset < 20; offset++) {
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
+        }
+    }
+
+    if (hash_params.sha256) {
+        unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
+        sha256_final( &sha256_model_hash_ctx,  result);
+
+        char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
+        for (int  offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
+            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
+        }
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
+        }
+    }
+
+    if (hash_params.uuid) {
+        unsigned char result[21];
+        SHA1Final(result, &sha1_for_uuid_ctx);
+
+        unsigned char uuid[16];
+        generate_uuidv5(result, uuid);
+
+        char string_buffer[37] = {0};
+        snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+            uuid[0], uuid[1], uuid[2], uuid[3],
+            uuid[4], uuid[5], uuid[6], uuid[7],
+            uuid[8], uuid[9], uuid[10], uuid[11],
+            uuid[12], uuid[13], uuid[14], uuid[15]);
+
+        if (hash_params.manifest_is_usable) {
+            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
+
+            switch (verify_result) {
+                case HASH_MANIFEST_NOT_FOUND:
+                    break;
+                case HASH_MANIFEST_MISMATCH:
+                    model_in_manifest = true;
+                    model_has_mismatch = true;
+                    break;
+                case HASH_MANIFEST_OK:
+                    model_in_manifest = true;
+                    break;
+            }
+
+            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
+        } else {
+            printf("%-8s  %-s  %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
+        }
+    }
+
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+
+    if (hash_params.manifest_is_usable) {
+        // In hash verification mode
+
+        if (!model_in_manifest) {
+            // model missing in manifest?
+
+            // Check tensor layer...
+            if (!tensor_layer_in_manifest) {
+                // Still missing? Maybe we are reading the wrong manifest.
+                return HASH_EXIT_MANIFEST_MISSING_ENTRY;
+            }
+
+            if (tensor_layer_has_mismatch) {
+                // Per tensor check found error
+                return HASH_EXIT_FAILURE;
+            }
+
+            // All per tensor layer checks passed? Sounds good enough.
+            return HASH_EXIT_SUCCESS;
+        }
+
+        // Overall model check passed, but let's check per layer just in case
+        // If missing, we don't care too much as the overall model checked
+        if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
+            return HASH_EXIT_FAILURE;
+        }
+
+        if (model_has_mismatch) {
+            // model has failed hash somewhere in the model
+            return HASH_EXIT_FAILURE;
+        }
+
+        // All checks appears to be fine
+        return HASH_EXIT_SUCCESS;
+    }
+
+    // In hash generation mode
+    return HASH_EXIT_SUCCESS;
+}
+
+int main(int argc, const char ** argv) {
+    hash_params params;
+    manifest_check_params manifest_check;
+    hash_params_parse(argc, argv, params);
+
+    if (!params.manifest_file.empty()) {
+        if (!manifest_type(params.manifest_file, manifest_check)) {
+            printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
+            return HASH_EXIT_MANIFEST_FILE_ERROR;
+        }
+
+        if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
+            printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
+            return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
+        }
+
+        printf("manifest  %s", params.manifest_file.c_str());
+
+        if (manifest_check.sha256) {
+            printf("  sha256");
+        }
+
+        if (manifest_check.sha1) {
+            printf("  sha1");
+        }
+
+        if (manifest_check.xxh64) {
+            printf("  xxh64");
+        }
+
+        if (manifest_check.uuid) {
+            printf("  uuid");
+        }
+
+        printf("\n");
+
+        // Autoselect the highest security hash if manifest is provided but
+        // the user has not specifically defined the hash they care about
+        if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+            // User has not selected a specific value, pick most secure hash
+            if (manifest_check.sha256) {
+                params.sha256 = true;
+            } else if (manifest_check.sha1) {
+                params.sha1 = true;
+            } else if (manifest_check.xxh64) {
+                params.xxh64 = true;
+            } else if (manifest_check.uuid) {
+                params.uuid = true;
+            }
+        }
+
+        params.manifest_is_usable = true;
+    }
+
+    // By default if no swich argument provided, assume xxh64
+    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+        params.xxh64 = true;
+    }
+
+    hash_exit_code_t exit_code = gguf_hash(params);
+
+    if (params.manifest_is_usable) {
+        printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
+    }
+
+    return exit_code;
+}
diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt
new file mode 100644
index 000000000..c407e2f0a
--- /dev/null
+++ b/examples/gguf-split/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-gguf-split)
+add_executable(${TARGET} gguf-split.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md
new file mode 100644
index 000000000..ad1d86651
--- /dev/null
+++ b/examples/gguf-split/README.md
@@ -0,0 +1,10 @@
+## GGUF split Example
+
+CLI to split / merge GGUF files.
+
+**Command line options:**
+
+- `--split`: split GGUF to multiple GGUF, default operation.
+- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
+- `--split-max-tensors`: maximum tensors in each split: default(128)
+- `--merge`: merge multiple GGUF to a single GGUF.
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
new file mode 100644
index 000000000..ef3ceb686
--- /dev/null
+++ b/examples/gguf-split/gguf-split.cpp
@@ -0,0 +1,584 @@
+#include "ggml.h"
+#include "gguf.h"
+#include "llama.h"
+#include "common.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
+
+enum split_operation : uint8_t {
+    OP_NONE,
+    OP_SPLIT,
+    OP_MERGE,
+};
+
+enum split_mode : uint8_t {
+    MODE_NONE,
+    MODE_TENSOR,
+    MODE_SIZE,
+};
+
+struct split_params {
+    split_operation operation = OP_NONE;
+    split_mode mode = MODE_NONE;
+    size_t n_bytes_split = 0;
+    int n_split_tensors = 128;
+    std::string input;
+    std::string output;
+    bool no_tensor_first_split = false;
+    bool dry_run = false;
+};
+
+static void split_print_usage(const char * executable) {
+    const split_params default_params;
+    printf("\n");
+    printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
+    printf("\n");
+    printf("Apply a GGUF operation on IN to OUT.");
+    printf("\n");
+    printf("options:\n");
+    printf("  -h, --help              show this help message and exit\n");
+    printf("  --version               show version and build info\n");
+    printf("  --split                 split GGUF to multiple GGUF (enabled by default)\n");
+    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
+    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
+    printf("  --split-max-size N(M|G) max size per split\n");
+    printf("  --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
+    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
+    printf("\n");
+}
+
+// return convert string, for example "128M" or "4G" to number of bytes
+static size_t split_str_to_n_bytes(std::string str) {
+    size_t n_bytes = 0;
+    int n;
+    if (str.back() == 'M') {
+        sscanf(str.c_str(), "%d", &n);
+        n_bytes = (size_t)n * 1000 * 1000; // megabytes
+    } else if (str.back() == 'G') {
+        sscanf(str.c_str(), "%d", &n);
+        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
+    } else {
+        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
+    }
+    if (n <= 0) {
+        throw std::invalid_argument("error: size must be a positive value");
+    }
+    return n_bytes;
+}
+
+static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
+    std::string arg;
+    const std::string arg_prefix = "--";
+    bool invalid_param = false;
+
+    int arg_idx = 1;
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        arg = argv[arg_idx];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        bool arg_found = false;
+        if (arg == "-h" || arg == "--help") {
+            split_print_usage(argv[0]);
+            exit(0);
+        } else if (arg == "--version") {
+            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            exit(0);
+        } else if (arg == "--dry-run") {
+            arg_found = true;
+            params.dry_run = true;
+        } else if (arg == "--no-tensor-first-split") {
+            arg_found = true;
+            params.no_tensor_first_split = true;
+        } else if (arg == "--merge") {
+            arg_found = true;
+            if (params.operation != OP_NONE && params.operation != OP_MERGE) {
+                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+            }
+            params.operation = OP_MERGE;
+        } else if (arg == "--split") {
+            arg_found = true;
+            if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
+                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+            }
+            params.operation = OP_SPLIT;
+        } else if (arg == "--split-max-tensors") {
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            arg_found = true;
+            if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
+                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+            }
+            params.mode = MODE_TENSOR;
+            params.n_split_tensors = atoi(argv[arg_idx]);
+        } else if (arg == "--split-max-size") {
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            arg_found = true;
+            if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
+                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+            }
+            params.mode = MODE_SIZE;
+            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
+        }
+
+        if (!arg_found) {
+            throw std::invalid_argument("error: unknown argument: " + arg);
+        }
+    }
+
+    // the operation is split if not specified
+    if (params.operation == OP_NONE) {
+        params.operation = OP_SPLIT;
+    }
+    // the split mode is by tensor if not specified
+    if (params.mode == MODE_NONE) {
+        params.mode = MODE_TENSOR;
+    }
+
+    if (invalid_param) {
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+    }
+
+    if (argc - arg_idx != 2) {
+        throw std::invalid_argument("error: bad arguments");
+    }
+
+    params.input = argv[arg_idx++];
+    params.output = argv[arg_idx++];
+}
+
+static bool split_params_parse(int argc, const char ** argv, split_params & params) {
+    bool result = true;
+    try {
+        split_params_parse_ex(argc, argv, params);
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        split_print_usage(argv[0]);
+        exit(EXIT_FAILURE);
+    }
+    return result;
+}
+
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+struct split_strategy {
+    const split_params params;
+    std::ifstream & f_input;
+    struct gguf_context * ctx_gguf;
+    struct ggml_context * ctx_meta = NULL;
+    const int n_tensors;
+
+    // one ctx_out per one output file
+    std::vector<struct gguf_context *> ctx_outs;
+
+    // temporary buffer for reading in tensor data
+    std::vector<uint8_t> read_buf;
+
+    split_strategy(const split_params & params,
+            std::ifstream & f_input,
+            struct gguf_context * ctx_gguf,
+            struct ggml_context * ctx_meta) :
+        params(params),
+        f_input(f_input),
+        ctx_gguf(ctx_gguf),
+        ctx_meta(ctx_meta),
+        n_tensors(gguf_get_n_tensors(ctx_gguf)) {
+
+        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
+        int i_split = -1;
+        struct gguf_context * ctx_out = NULL;
+        auto new_ctx_out = [&](bool allow_no_tensors) {
+            i_split++;
+            if (ctx_out != NULL) {
+                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
+                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
+                    exit(EXIT_FAILURE);
+                }
+                ctx_outs.push_back(ctx_out);
+            }
+            ctx_out = gguf_init_empty();
+            // Save all metadata in first split only
+            if (i_split == 0) {
+                gguf_set_kv(ctx_out, ctx_gguf);
+            }
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
+            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
+        };
+
+        // initialize ctx_out for the first split
+        new_ctx_out(false);
+
+        // skip first split if no_tensor_first_split is set
+        if (params.no_tensor_first_split) {
+            new_ctx_out(true);
+        }
+
+        // process tensors one by one
+        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
+        for (int i = 0; i < n_tensors; ++i) {
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            // calculate the "imaginary" size = the current size + next tensor size
+            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
+            size_t next_tensors_size = curr_tensors_size + n_bytes;
+            if (should_split(i, next_tensors_size)) {
+                new_ctx_out(false);
+                curr_tensors_size = n_bytes;
+            } else {
+                curr_tensors_size = next_tensors_size;
+            }
+            gguf_add_tensor(ctx_out, t);
+        }
+
+        // push the last ctx_out
+        ctx_outs.push_back(ctx_out);
+
+        // set the correct n_split for all ctx_out
+        for (auto & ctx : ctx_outs) {
+            gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
+        }
+    }
+
+    ~split_strategy() {
+        for (auto & ctx_out : ctx_outs) {
+            gguf_free(ctx_out);
+        }
+    }
+
+    bool should_split(int i_tensor, size_t next_size) {
+        if (params.mode == MODE_SIZE) {
+            // split by max size per file
+            return next_size > params.n_bytes_split;
+        } else if (params.mode == MODE_TENSOR) {
+            // split by number of tensors per file
+            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
+        }
+        // should never happen
+        GGML_ABORT("invalid mode");
+    }
+
+    void print_info() {
+        printf("n_split: %zu\n", ctx_outs.size());
+        int i_split = 0;
+        for (auto & ctx_out : ctx_outs) {
+            // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
+            size_t total_size = gguf_get_meta_size(ctx_out);
+            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
+                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
+                total_size += ggml_nbytes(t);
+            }
+            total_size = total_size / 1000 / 1000; // convert to megabytes
+            printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            i_split++;
+        }
+    }
+
+    void write() {
+        int i_split = 0;
+        int n_split = ctx_outs.size();
+        for (auto & ctx_out : ctx_outs) {
+            // construct file path
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
+
+            // open the output file
+            printf("Writing file %s ... ", split_path);
+            fflush(stdout);
+            std::ofstream fout = std::ofstream(split_path, std::ios::binary);
+            fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+            // write metadata
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+            gguf_get_meta_data(ctx_out, data.data());
+            fout.write((const char *)data.data(), data.size());
+
+            // write tensors
+            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
+                // read tensor meta and prepare buffer
+                const char * t_name = gguf_get_tensor_name(ctx_out, i);
+                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+                auto n_bytes = ggml_nbytes(t);
+                read_buf.resize(n_bytes);
+
+                // calculate offset
+                auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
+                auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+
+                // copy tensor from input to output file
+                copy_file_to_file(f_input, fout, offset, n_bytes);
+                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+            }
+
+            printf("done\n");
+            // close the file
+            fout.close();
+            i_split++;
+        }
+    }
+
+    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
+        // TODO: detect OS and use copy_file_range() here for better performance
+        if (read_buf.size() < len) {
+            read_buf.resize(len);
+        }
+        f_in.seekg(in_offset);
+        f_in.read((char *)read_buf.data(), len);
+        f_out.write((const char *)read_buf.data(), len);
+    }
+};
+
+static void gguf_split(const split_params & split_params) {
+    struct ggml_context * ctx_meta = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+
+    std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
+    if (!f_input.is_open()) {
+        fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // prepare the strategy
+    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
+    int n_split = strategy.ctx_outs.size();
+    strategy.print_info();
+
+    if (!split_params.dry_run) {
+        // write all output splits
+        strategy.write();
+    }
+
+    // done, clean up
+    gguf_free(ctx_gguf);
+    f_input.close();
+
+    fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
+            __func__, n_split, strategy.n_tensors);
+}
+
+static void gguf_merge(const split_params & split_params) {
+    fprintf(stderr, "%s: %s -> %s\n",
+            __func__, split_params.input.c_str(),
+            split_params.output.c_str());
+    int n_split = 1;
+    int total_tensors = 0;
+
+    // avoid overwriting existing output file
+    if (std::ifstream(split_params.output.c_str())) {
+        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+    auto * ctx_out = gguf_init_empty();
+
+    std::vector<uint8_t> read_data;
+    std::vector<ggml_context *> ctx_metas;
+    std::vector<gguf_context *> ctx_ggufs;
+
+    char split_path[PATH_MAX] = {0};
+    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
+    char split_prefix[PATH_MAX] = {0};
+
+    // First pass to find KV and tensors metadata
+    for (int i_split = 0; i_split < n_split; i_split++) {
+        struct ggml_context * ctx_meta = NULL;
+
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
+        };
+
+        if (i_split > 0) {
+            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+        }
+        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
+
+        auto * ctx_gguf = gguf_init_from_file(split_path, params);
+        if (!ctx_gguf) {
+            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
+            exit(EXIT_FAILURE);
+        }
+        ctx_ggufs.push_back(ctx_gguf);
+        ctx_metas.push_back(ctx_meta);
+
+        if (i_split == 0) {
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+            if (key_n_split < 0) {
+                fprintf(stderr,
+                        "\n%s: input file does not contain %s metadata\n",
+                        __func__,
+                        LLM_KV_SPLIT_COUNT);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
+                gguf_free(ctx_out);
+                fout.close();
+                exit(EXIT_FAILURE);
+            }
+
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+            if (n_split < 1) {
+                fprintf(stderr,
+                        "\n%s: input file does not contain a valid split count %d\n",
+                        __func__,
+                        n_split);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
+                gguf_free(ctx_out);
+                fout.close();
+                exit(EXIT_FAILURE);
+            }
+
+            // Verify the file naming and extract split_prefix
+            if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
+                fprintf(stderr, "\n%s: unexpected input file name: %s"
+                                " i_split=%d"
+                                " n_split=%d\n", __func__,
+                        split_path, i_split, n_split);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
+                gguf_free(ctx_out);
+                fout.close();
+                exit(EXIT_FAILURE);
+            }
+
+            // Do not trigger merge if we try to merge again the output
+            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
+
+            // Set metadata from the first split
+            gguf_set_kv(ctx_out, ctx_gguf);
+        }
+
+        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
+        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
+            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+            gguf_add_tensor(ctx_out, t);
+        }
+        total_tensors += n_tensors;
+
+        fprintf(stderr, "\033[3Ddone\n");
+    }
+
+    // placeholder for the meta data
+    {
+        auto meta_size = gguf_get_meta_size(ctx_out);
+        ::zeros(fout, meta_size);
+    }
+
+    // Write tensors data
+    for (int i_split = 0; i_split < n_split; i_split++) {
+        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+        std::ifstream f_input(split_path, std::ios::binary);
+        if (!f_input.is_open()) {
+            fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_path);
+            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
+                gguf_free(ctx_ggufs[i]);
+                ggml_free(ctx_metas[i]);
+            }
+            gguf_free(ctx_out);
+            fout.close();
+            exit(EXIT_FAILURE);
+        }
+        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
+
+        auto * ctx_gguf = ctx_ggufs[i_split];
+        auto * ctx_meta = ctx_metas[i_split];
+
+        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
+        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
+            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+
+            auto n_bytes = ggml_nbytes(t);
+
+            if (read_data.size() < n_bytes) {
+                read_data.resize(n_bytes);
+            }
+
+            auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
+            f_input.seekg(offset);
+            f_input.read((char *)read_data.data(), n_bytes);
+
+            // write tensor data + padding
+            fout.write((const char *)read_data.data(), n_bytes);
+            zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+        }
+
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
+        f_input.close();
+        fprintf(stderr, "\033[3Ddone\n");
+    }
+
+    {
+        // go back to beginning of file and write the updated metadata
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *)data.data(), data.size());
+
+        fout.close();
+        gguf_free(ctx_out);
+    }
+
+    fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
+            __func__, split_params.output.c_str(), n_split, total_tensors);
+}
+
+int main(int argc, const char ** argv) {
+    split_params params;
+    split_params_parse(argc, argv, params);
+
+    switch (params.operation) {
+        case OP_SPLIT: gguf_split(params);
+            break;
+        case OP_MERGE: gguf_merge(params);
+            break;
+        default: split_print_usage(argv[0]);
+            exit(EXIT_FAILURE);
+    }
+
+    return 0;
+}
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
new file mode 100755
index 000000000..05a932227
--- /dev/null
+++ b/examples/gguf-split/tests.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/llama-gguf-split
+MAIN=$1/llama-cli
+WORK_PATH=$TMP_DIR/gguf-split
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split with max tensors strategy
+$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 2b. Test the sharded model is loading properly
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
+echo PASS
+echo
+
+# 3. Merge
+$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
+echo PASS
+echo
+
+# 3b. Test the merged model is loading properly
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
+echo PASS
+echo
+
+# 4. Split with no tensors in the first split
+$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+echo PASS
+echo
+
+# 4b. Test the sharded model is loading properly
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
+echo PASS
+echo
+
+# 5. Merge
+#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
+#echo PASS
+#echo
+
+# 5b. Test the merged model is loading properly
+#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
+#echo PASS
+#echo
+
+# 6. Split with size strategy
+$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
+echo PASS
+echo
+
+# 6b. Test the sharded model is loading properly
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt
index 6481f087b..fb04eb83f 100644
--- a/examples/gguf/CMakeLists.txt
+++ b/examples/gguf/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET gguf)
+set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index e67be4fb2..f31989c8c 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -1,10 +1,9 @@
 #include "ggml.h"
+#include "gguf.h"
 
 #include <cstdio>
-#include <cinttypes>
 #include <string>
 #include <sstream>
-#include <fstream>
 #include <vector>
 
 #undef MIN
@@ -92,6 +91,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
 
+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
     printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
     printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
     printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
@@ -130,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }
 
@@ -142,7 +147,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
 }
 
 // read and create ggml_context containing the tensors and their data
-static bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
@@ -177,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname) {
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }
 
@@ -194,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname) {
 
             struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
+                __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
 
             // print first 10 elements
             const float * data = (const float *) cur->data;
@@ -206,11 +213,12 @@ static bool gguf_ex_read_1(const std::string & fname) {
             printf("\n\n");
 
             // check data
-            {
+            if (check_data) {
                 const float * data = (const float *) cur->data;
                 for (int j = 0; j < ggml_nelements(cur); ++j) {
                     if (data[j] != 100 + i) {
-                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
+                        gguf_free(ctx);
                         return false;
                     }
                 }
@@ -228,9 +236,18 @@ static bool gguf_ex_read_1(const std::string & fname) {
 
 int main(int argc, char ** argv) {
     if (argc < 3) {
-        printf("usage: %s data.gguf r|w\n", argv[0]);
+        printf("usage: %s data.gguf r|w [n]\n", argv[0]);
+        printf("r: read data.gguf file\n");
+        printf("w: write data.gguf file\n");
+        printf("n: no check of tensor data\n");
         return -1;
     }
+    bool check_data = true;
+    if (argc == 4) {
+        check_data = false;
+    }
+
+    srand(123456);
 
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);
@@ -241,7 +258,7 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
     }
 
     return 0;
diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh
deleted file mode 100755
index 5fd739e55..000000000
--- a/examples/gpt4all.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-./main --color --instruct --threads 4 \
-       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
-       --file ./prompts/alpaca.txt \
-       --batch_size 8 --ctx_size 2048 -n -1 \
-       --repeat_last_n 64 --repeat_penalty 1.3 \
-       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt
new file mode 100644
index 000000000..fa1b4dc70
--- /dev/null
+++ b/examples/gritlm/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-gritlm)
+add_executable(${TARGET} gritlm.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md
new file mode 100644
index 000000000..786ba5736
--- /dev/null
+++ b/examples/gritlm/README.md
@@ -0,0 +1,62 @@
+## Generative Representational Instruction Tuning (GRIT) Example
+[gritlm] a model which can generate embeddings as well as "normal" text
+generation depending on the instructions in the prompt.
+
+* Paper: https://arxiv.org/pdf/2402.09906.pdf
+
+### Retrieval-Augmented Generation (RAG) use case
+One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
+that we take documents that we want to use as context, to ground the large
+language model (LLM), and we create token embeddings for them. We then store
+these token embeddings in a vector database.
+
+When we perform a query, prompt the LLM, we will first create token embeddings
+for the query and then search the vector database to retrieve the most
+similar vectors, and return those documents so they can be passed to the LLM as
+context. Then the query and the context will be passed to the LLM which will
+have to _again_ create token embeddings for the query. But because gritlm is used
+the first query can be cached and the second query tokenization generation does
+not have to be performed at all.
+
+### Running the example
+Download a Grit model:
+```console
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
+```
+
+Run the example using the downloaded model:
+```console
+$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
+
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
+Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
+Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+When shadows lurk and ghosts do roam,
+And darkness reigns, a fearsome sight.
+
+Thou didst set out, with heart aglow,
+To conquer this mountain, so high,
+And reach the summit, where the stars do glow,
+And the moon shines bright, up in the sky.
+
+Through the mist and fog, thou didst press on,
+With steadfast courage, and a steadfast will,
+Through the darkness, thou didst not be gone,
+But didst climb on, with a steadfast skill.
+
+At last, thou didst reach the summit's crest,
+And gazed upon the world below,
+And saw the beauty of the night's best,
+And felt the peace, that only nature knows.
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+Thou art a hero, in the eyes of all,
+For thou didst conquer this mountain, so bright.
+```
+
+[gritlm]: https://github.com/ContextualAI/gritlm
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
new file mode 100644
index 000000000..72eb46257
--- /dev/null
+++ b/examples/gritlm/gritlm.cpp
@@ -0,0 +1,229 @@
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+
+#include <string>
+#include <vector>
+
+// #define GRIT_DEBUG
+
+static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
+    std::vector<std::vector<float>> result;
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+    for (uint64_t i = 0; i < sentences.size(); i++) {
+        common_batch_clear(batch);
+
+        const std::string input_string = instruction + sentences[i];
+
+        std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
+
+        const int32_t n_toks = inputs.size();
+
+        // GritLM seems to have EOS = ""
+        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
+        // inputs.push_back(llama_vocab_eos(vocab));
+
+        // we want to ignore instruction tokens for mean pooling
+        const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
+
+#ifdef GRIT_DEBUG
+        // debug tokens - should be matching as referenced in the GritLM sample
+        std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
+            std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
+        });
+        std::printf("\n");
+#endif
+
+        // add input to batch (this increments n_tokens)
+        for (int32_t j = 0; j < n_toks; j++) {
+            common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+        }
+
+        // clear previous kv_cache values (irrelevant for embeddings)
+        llama_kv_cache_clear(ctx);
+        llama_set_embeddings(ctx, true);
+        llama_set_causal_attn(ctx, false);
+
+        // run model
+        llama_decode(ctx, batch);
+
+        // get embedding dimensions
+        uint64_t n_embd = llama_model_n_embd(model);
+
+        // allocate embedding output
+        std::vector<float> emb_unorm(n_embd, 0.0f);
+
+        // sum up all token embeddings
+        for (int32_t k = n_inst; k < n_toks; k++) {
+            float * emb = llama_get_embeddings_ith(ctx, k);
+            for (uint64_t j = 0; j < n_embd; j++) {
+                emb_unorm[j] += emb[j];
+            }
+        }
+
+        // divide by number of tokens (mean pooling)
+        {
+            const uint64_t n_sent = n_toks - n_inst;
+
+            for (uint64_t j = 0; j < n_embd; j++) {
+                emb_unorm[j] /= n_sent;
+            }
+        }
+
+        std::vector<float> emb_norm(emb_unorm.size());
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
+        result.push_back(emb_norm);
+
+#ifdef GRIT_DEBUG
+        // print out emb_norm
+        std::printf("embedding %ld: ", i);
+        for (uint64_t j = 0; j < n_embd; j++) {
+            std::printf("%.5f ", emb_norm[j]);
+        }
+        std::printf("\n\n");
+#endif
+    }
+
+    llama_batch_free(batch);
+
+    return result;
+}
+
+static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
+    std::string result;
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_token eos_token = llama_vocab_eos(vocab);
+
+    llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
+    llama_set_causal_attn(ctx, true);
+
+    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+    std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
+    int32_t i_current_token = 0;
+
+    while (true) {
+        common_batch_clear(bat);
+        {
+            const int32_t n_inputs = inputs.size();
+
+            for (int32_t i = 0; i < n_inputs; i++) {
+                common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+            }
+        }
+        inputs.clear();
+
+        llama_decode(ctx, bat);
+
+        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
+
+        if (token == eos_token) {
+            break;
+        }
+
+        std::string piece = common_token_to_piece(ctx, token);
+        if (stream) {
+            std::printf("%s", piece.c_str());
+            std::fflush(stdout);
+        }
+
+        inputs.push_back(token);
+
+        result += piece;
+    }
+
+    if (stream) {
+        std::printf("\n");
+    }
+
+    llama_batch_free(bat);
+
+    return result;
+}
+
+static std::string gritlm_instruction(const std::string & instruction) {
+    return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
+}
+
+int main(int argc, char * argv[]) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+
+    llama_model_params mparams = common_model_params_to_llama(params);
+    llama_context_params cparams = common_context_params_to_llama(params);
+
+    llama_backend_init();
+
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+
+    // create generation context
+    llama_context * ctx = llama_init_from_model(model, cparams);
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    sparams.no_perf = false;
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    // ### Embedding/Representation ###
+    // samples taken from: https://github.com/ContextualAI/gritlm#basic
+    {
+        const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
+
+        const std::vector<std::string> queries = {
+            "Bitcoin: A Peer-to-Peer Electronic Cash System",
+            "Generative Representational Instruction Tuning",
+        };
+
+        const std::vector<std::string> documents = {
+            "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
+            "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
+        };
+
+        // No need to add instruction for retrieval documents
+        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
+        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));
+
+        const int n_embd = llama_model_n_embd(model);
+
+        const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
+    }
+
+    // ### Generation ###
+    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
+    {
+        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
+        std::string response = generate(ctx, smpl, prompt, true);
+    }
+
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt
index d688a1620..412696c47 100644
--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET imatrix)
+set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 578e8fc27..9c056986b 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,32 +1,33 @@
 # llama.cpp/examples/imatrix
 
-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
 
 ## Usage
 
 ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-        [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./llama-imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```
 
 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 
 For faster computation, make sure to use GPU offloading via the `-ngl` argument
 
 ## Example
 
 ```bash
-LLAMA_CUBLAS=1 make -j
-
 # generate importance matrix (imatrix.dat)
-./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 
 # use the imatrix to perform a Q4_K_M quantization
-./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index f21bc48f3..b5f3feb9f 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,11 +1,12 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <sstream>
 #include <thread>
 #include <mutex>
 #include <vector>
@@ -17,52 +18,71 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG("\n");
+}
+
 struct Stats {
     std::vector<float> values;
+    std::vector<int> counts;
     int ncall = 0;
 };
 
-struct StatParams {
-    std::string ofile = "imatrix.dat";
-    int         n_output_frequency = 10;
-    int         verbosity = 1;
-    int         keep_every = 0;
-    bool        collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * fname);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    StatParams                             m_params;
+    common_params                          m_params;
     std::mutex                             m_mutex;
     int                                    m_last_call = 0;
     std::vector<float>                     m_src1_data;
-    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
-                                                  //
-    void save_imatrix(const char * file_name) const;
-    void keep_imatrix(int ncall) const;
+    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
 };
 
+// remove any prefix and suffixes from the name
+// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
+static std::string filter_tensor_name(const char * name) {
+    std::string wname;
+    const char * p = strchr(name, '#');
+    if (p != NULL) {
+        p = p + 1;
+        const char * q = strchr(p, '#');
+        if (q != NULL) {
+            wname = std::string(p, q - p);
+        } else {
+            wname = p;
+        }
+    } else {
+        wname = name;
+    }
+    return wname;
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
     const struct ggml_tensor * src0 = t->src[0];
     const struct ggml_tensor * src1 = t->src[1];
+    std::string wname = filter_tensor_name(src0->name);
 
     // when ask is true, the scheduler wants to know if we are interested in data from this tensor
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
         if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
         if (t->op != GGML_OP_MUL_MAT) return false;
+        // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
         return true;
     }
 
@@ -78,82 +98,104 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
+    // this has been adapted to the new format of storing merged experts in a single 3d tensor
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
-        const int idx  = ((int32_t *) t->op_params)[0];
-        const int n_as = ((int32_t *) t->op_params)[1];
+        //   ids  -> [n_experts_used, n_tokens]
+        //   src1 -> [cols, n_expert_used, n_tokens]
+        const ggml_tensor * ids = t->src[2];
+        const int n_as = src0->ne[2];
+        const int n_ids = ids->ne[0];
 
-        // the top-k selected expert ids are stored in the src0 tensor
-        // for simplicity, always copy src0 to host, because it is small
-        // take into account that src0 is not contiguous!
-        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
-        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
-        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+        // the top-k selected expert ids are stored in the ids tensor
+        // for simplicity, always copy ids to host, because it is small
+        // take into account that ids is not contiguous!
 
+        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
+
+        m_ids.resize(ggml_nbytes(ids));
+        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
+
+        auto & e = m_stats[wname];
+
+        ++e.ncall;
+
+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(src1->ne[0]*n_as, 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            exit(1); //GGML_ABORT("fatal error");
+        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
         // loop over all possible experts, regardless if they are used or not in the batch
-        // this is necessary to guarantee equal number of "ncall" for each tensor
         for (int ex = 0; ex < n_as; ++ex) {
-            src0 = t->src[2 + ex];
-            auto& e = m_stats[src0->name];
-            if (e.values.empty()) {
-                e.values.resize(src1->ne[0], 0);
-            }
-            else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-                exit(1); //GGML_ASSERT(false);
-            }
-            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-            //       using the following line, we can correct for that if needed
-            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
-            ++e.ncall;
-            if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-            }
-            for (int row = 0; row < (int)src1->ne[1]; ++row) {
-                const int excur = m_ids[row*n_as + idx];
-                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
-                if (excur != ex) continue;
-                const float * x = data + row * src1->ne[0];
-                for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                    e.values[j] += x[j]*x[j];
+            size_t e_start = ex*src1->ne[0];
+
+            for (int idx = 0; idx < n_ids; ++idx) {
+                for (int row = 0; row < (int)src1->ne[2]; ++row) {
+                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
+
+                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+
+                    if (excur != ex) continue;
+
+                    const int64_t i11 = idx % src1->ne[1];
+                    const int64_t i12 = row;
+                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+
+                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                        e.values[e_start + j] += x[j]*x[j];
+                        e.counts[e_start + j]++;
+                        if (!std::isfinite(e.values[e_start + j])) {
+                            LOG("\n");
+                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                            exit(1);
+                        }
+                    }
                 }
             }
             if (e.ncall > m_last_call) {
                 m_last_call = e.ncall;
-                if (m_last_call % m_params.n_output_frequency == 0) {
+                if (m_last_call % m_params.n_out_freq == 0) {
                     save_imatrix();
                 }
-                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                    keep_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                    save_imatrix(m_last_call);
                 }
             }
         }
     } else {
-        auto& e = m_stats[src0->name];
+        auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
+            e.counts.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
-        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
+                e.counts[j]++;
+                if (!std::isfinite(e.values[j])) {
+                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
+                    exit(1);
+                }
             }
         }
         if (e.ncall > m_last_call) {
             m_last_call = e.ncall;
-            if (m_last_call % m_params.n_output_frequency == 0) {
+            if (m_last_call % m_params.n_out_freq == 0) {
                 save_imatrix();
             }
-            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
             }
         }
     }
@@ -161,46 +203,104 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }
 
-void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
-}
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }
 
-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.ofile;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
-}
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
+
+    // avoid writing imatrix entries that do not have full data
+    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
+
+    int n_entries = 0;
+    std::vector<std::string> to_store;
+
+    bool is_first = true; // for printing
+    for (const auto & kv : m_stats) {
+        const int n_all = kv.second.counts.size();
+
+        if (n_all == 0) {
+            continue;
+        }
+
+        int n_zeros = 0;
+        for (const int c : kv.second.counts) {
+            if (c == 0) {
+                n_zeros++;
+            }
+        }
+
+        if (n_zeros != 0 && is_first) {
+            LOG_INF("\n");
+            is_first = false;
+        }
+
+        if (n_zeros == n_all) {
+            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            continue;
+        }
+
+        if (n_zeros > 0) {
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            continue;
+        }
+
+        n_entries++;
+        to_store.push_back(kv.first);
+    }
+
+    if (to_store.size() < m_stats.size()) {
+        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+    }
 
-void IMatrixCollector::save_imatrix(const char * fname) const {
     std::ofstream out(fname, std::ios::binary);
-    int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
-        int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
-        out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
-        int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+    out.write((const char *) &n_entries, sizeof(n_entries));
+    for (const auto & name : to_store) {
+        const auto & stat = m_stats.at(name);
+        int len = name.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(name.c_str(), len);
+        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
+        int nval = stat.values.size();
+        out.write((const char *) &nval, sizeof(nval));
+        if (nval > 0) {
+            std::vector<float> tmp(nval);
+            for (int i = 0; i < nval; i++) {
+                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
+            }
+            out.write((const char*)tmp.data(), nval*sizeof(float));
+        }
     }
-    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+
+    // Write the number of call the matrix was computed with
+    out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
     }
+
+    LOGV(1, "\n");
+    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
 }
 
-bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
-    std::ifstream in(imatrix_file, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * fname) {
+    std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file);
+        LOG_ERR("%s: failed to open %s\n",__func__, fname);
         return false;
     }
     int n_entries;
     in.read((char*)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file);
+        LOG_ERR("%s: no data in file %s\n", __func__, fname);
         return false;
     }
     for (int i = 0; i < n_entries; ++i) {
@@ -208,40 +308,46 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
             return false;
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = m_stats[std::move(name)];
         int ncall;
         in.read((char*)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
-            imatrix_data = {};
+            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
+            m_stats = {};
             return false;
         }
-        e.values.resize(nval);
-        in.read((char*)e.values.data(), nval*sizeof(float));
+
+        if (e.values.empty()) {
+            e.values.resize(nval, 0);
+            e.counts.resize(nval, 0);
+        }
+
+        std::vector<float> tmp(nval);
+        in.read((char*)tmp.data(), nval*sizeof(float));
         if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
-            imatrix_data = {};
+            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
+            m_stats = {};
             return false;
         }
-        e.ncall = ncall;
+
+        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+        for (int i = 0; i < nval; i++) {
+            e.values[i] += tmp[i];
+            e.counts[i] += ncall;
+        }
+        e.ncall += ncall;
+
     }
     return true;
 }
 
-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
-    if (!add) {
-        m_stats.clear();
-    }
-    return load_imatrix(file_name, m_stats);
-}
-
 static IMatrixCollector g_collector;
 
 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -255,7 +361,7 @@ struct results_log_softmax {
     float  prob;
 };
 
-static std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float> & logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
     for (float v : logits) {
@@ -289,8 +395,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
 
 static void process_logits(
     int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+    double & nll, double & nll2, float * logit_history, float * prob_history) {
     std::mutex mutex;
     int counter = 0;
     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -322,39 +427,42 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
     const int n_ctx = llama_n_ctx(ctx);
 
-    auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    auto tim1 = std::chrono::high_resolution_clock::now();
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
+
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
-    if (from_chunk > 0) {
-        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+    if (params.i_chunk > 0) {
+        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
             return false;
         }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
-        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
     }
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
         return false;
     }
 
     std::vector<float> logit_history;
     std::vector<float> prob_history;
 
-    if (compute_ppl) {
+    if (params.compute_ppl) {
         logit_history.resize(tokens.size());
         prob_history.resize(tokens.size());
     }
@@ -362,21 +470,21 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
     const int n_batch = params.n_batch;
 
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;
 
-    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
     const int num_batches = (n_ctx + n_batch - 1) / n_batch;
 
     std::vector<float> logits;
-    if (compute_ppl && num_batches > 1) {
+    if (params.compute_ppl && num_batches > 1) {
         logits.reserve((size_t)n_ctx * n_vocab);
     }
 
@@ -391,6 +499,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
@@ -400,61 +510,69 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 
             // add BOS token for the first batch of each chunk
             if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[batch_start] = llama_vocab_bos(vocab);
             }
 
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }
 
             // restore the original token in case it was set to BOS
             tokens[batch_start] = token_org;
 
-            if (compute_ppl && num_batches > 1) {
+            if (params.compute_ppl && num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
                 logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
 
-        if (compute_ppl) {
+        if (params.compute_ppl) {
             const int first = n_ctx/2;
-            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
             process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                     workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
             count += n_ctx - first - 1;
 
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
             fflush(stdout);
 
             logits.clear();
         }
     }
-    printf("\n");
+    LOG("\n");
 
-    if (compute_ppl) {
+    if (params.compute_ppl) {
         nll2 /= count;
         nll /= count;
         const double ppl = exp(nll);
         nll2 -= nll * nll;
         if (nll2 > 0) {
             nll2 = sqrt(nll2/(count-1));
-            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
         } else {
-            printf("Unexpected negative standard deviation of log(prob)\n");
+            LOG("Unexpected negative standard deviation of log(prob)\n");
         }
     }
 
@@ -462,159 +580,84 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 }
 
 int main(int argc, char ** argv) {
+    common_params params;
 
-    StatParams sparams;
-    std::string prev_result_file;
-    std::string combine_files;
-    bool compute_ppl = true;
-    int  from_chunk  = 0;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else if (arg == "--keep-imatrix") {
-            sparams.keep_every = std::stoi(argv[++iarg]);
-        } else if (arg == "--continue-from") {
-            prev_result_file = argv[++iarg];
-        } else if (arg == "--combine") {
-            combine_files = argv[++iarg];
-        }
-        else if (arg == "--from-chunk") {
-            from_chunk = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        std::string arg{argv[iarg]};
-        if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
+    params.n_ctx = 512;
+    params.logits_all = true;
+    params.escape = false;
 
-    g_collector.set_parameters(std::move(sparams));
-
-    if (!combine_files.empty()) {
-        std::vector<std::string> files;
-        size_t pos = 0;
-        while (true) {
-            auto new_pos = combine_files.find(',', pos);
-            if (new_pos != std::string::npos) {
-                files.emplace_back(combine_files.substr(pos, new_pos - pos));
-                pos = new_pos + 1;
-            } else {
-                files.emplace_back(combine_files.substr(pos));
-                break;
-            }
-        }
-        if (files.size() < 2) {
-            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
-            return 1;
-        }
-        printf("Combining the following %d files\n", int(files.size()));
-        for (auto& file : files) {
-            printf("    %s\n", file.c_str());
-            if (!g_collector.load_imatrix(file.c_str(), true)) {
-                fprintf(stderr, "Failed to load %s\n", file.c_str());
-                return 1;
-            }
-        }
-        g_collector.save_imatrix();
-        return 0;
-    }
-
-    if (!prev_result_file.empty()) {
-        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
-            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
-            return 1;
-        }
-    }
-
-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }
 
-    params.logits_all = true;
+    common_init();
+
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
-    print_build_info();
+    g_collector.set_params(params);
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+    for (const auto & in_file : params.in_files) {
+        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        if (!g_collector.load_imatrix(in_file.c_str())) {
+            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
+            return 1;
+        }
     }
 
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
+    if (params.in_files.size() > 1) {
+        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        g_collector.save_imatrix();
     }
 
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
     // pass the callback to the backend scheduler
     // it will be executed for each node during the graph computation
-    cparams.cb_eval = ik_collect_imatrix;
-    cparams.cb_eval_user_data = NULL;
+    params.cb_eval = ik_collect_imatrix;
+    params.cb_eval_user_data = NULL;
+    params.warmup = false;
 
-    llama_context * ctx = llama_new_context_with_model(model, cparams);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+    // init
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    if (model == nullptr || ctx == nullptr) {
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
-    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
-    if (!OK) {
-        return 1;
+    if (params.prompt.empty()) {
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+    } else {
+        if (!compute_imatrix(ctx, params)) {
+            return 1;
+        }
     }
 
+
     g_collector.save_imatrix();
 
-    llama_print_timings(ctx);
-
-    llama_free(ctx);
-    llama_free_model(model);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_backend_free();
 
diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt
index e4e8028da..fb26628d8 100644
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET infill)
+set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/infill/README.md b/examples/infill/README.md
index 8c97f719b..df4d976f2 100644
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -14,7 +14,8 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
--   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
+-   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 ## Input Prompts
 
@@ -36,6 +37,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
 
 ### Example
 
-```bash
-./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
+Download a model that supports infill, for example CodeLlama:
+```console
+scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
+```
+
+```bash
+./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
 ```
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index d4b8729dd..489a208b6 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -1,8 +1,9 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
-#include "grammar-parser.h"
 
 #include <cassert>
 #include <cinttypes>
@@ -34,57 +35,14 @@
 
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 
 static bool is_interacting = false;
 
-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = get_sortable_timestamp();
-
-    const bool success = create_directory_with_parents(params.logdir);
-    if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    dump_string_yaml_multiline(logfile, "output", output.c_str());
-    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
-
-    llama_dump_timing_info_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -92,9 +50,13 @@ static void sigint_handler(int signo) {
             is_interacting = true;
         } else {
             console::cleanup();
-            printf("\n");
-            llama_print_timings(*g_ctx);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+            LOG("\n");
+            common_perf_print(*g_ctx, *g_smpl);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            common_log_pause(common_log_main());
+
             _exit(130);
         }
     }
@@ -102,197 +64,135 @@ static void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
-    gpt_params params;
-    llama_sampling_params & sparams = params.sparams;
+    common_params params;
     g_params = &params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("infill", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+    common_init();
+
+    auto & sparams = params.sampling;
 
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
     if (params.logits_all) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
-    if (params.instruct) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
-        printf("************\n\n");
 
-        return 0;
-    }
-    if (params.chatml) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.antiprompt.empty()) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-        printf("\n************\n");
-        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (params.random_prompt) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.path_prompt_cache.empty()) {
-        printf("\n************\n");
-        printf("%s: infill does not support prompt caching\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.rope_freq_base != 0.0) {
-        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
     }
 
     if (params.rope_freq_scale != 0.0) {
-        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    LOG("%s: llama backend init\n", __func__);
+    LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    common_sampler * smpl = nullptr;
+
     g_model = &model;
     g_ctx = &ctx;
+    g_smpl = &smpl;
 
     // load the model and apply lora adapter, if any
-    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (sparams.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    common_init_result llama_init = common_init_from_params(params);
+
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
+    LOG_DBG("n_ctx: %d\n", n_ctx);
 
     if (n_ctx > n_ctx_train) {
-        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        LOG_TEE("\n");
-        LOG_TEE("%s\n", get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos: %d\n", add_bos);
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
-    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-        params.input_suffix.erase(0, 1);
-        suff_rm_leading_spc = false;
-    }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-    const int space_token = 29871;
-    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-        inp_sfx.erase(inp_sfx.begin());
-    }
-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-    }
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(model));
+    std::vector<llama_token> embd_end;
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
-    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
+    GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
+
+    inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+    inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+    if (add_bos) {
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+    const llama_token middle_token = llama_vocab_fim_mid(vocab);
+    if (middle_token >= 0) {
+        embd_inp.push_back(middle_token);
+    }
+
+    LOG_DBG("add_bos: %d\n", add_bos);
+    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(model));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
-    }
-
-    // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
+        embd_inp.push_back(llama_vocab_bos(vocab));
+        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
     }
 
     if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
 
@@ -301,9 +201,8 @@ int main(int argc, char ** argv) {
         params.n_keep = (int)embd_inp.size();
     }
 
-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
 
     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -311,30 +210,21 @@ int main(int argc, char ** argv) {
     }
 
     if (params.verbose_prompt) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_INF("\n");
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
-        }
-
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > 0) {
-        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+        LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-            LOG_TEE("'\n");
+            LOG_CNT("'\n");
         }
-        LOG_TEE("\n");
+        LOG_INF("\n");
     }
 
     if (params.interactive) {
@@ -351,55 +241,54 @@ int main(int argc, char ** argv) {
         SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-        LOG_TEE("%s: interactive mode on.\n", __func__);
+        LOG_INF("%s: interactive mode on.\n", __func__);
 
         if (params.input_prefix_bos) {
-            LOG_TEE("Input prefix with BOS\n");
+            LOG_INF("Input prefix with BOS\n");
         }
 
         if (!params.input_prefix.empty()) {
-            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
         }
 
         if (!params.input_suffix.empty()) {
-            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    LOG_TEE("\n\n");
+    smpl = common_sampler_init(model, sparams);
 
-    LOG_TEE("\n#####  Infill mode  #####\n\n");
-    if (params.infill) {
-        printf("\n************\n");
-        printf("no need to specify '--infill', always running infill\n");
-        printf("************\n\n");
-    }
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+    LOG_INF("\n");
+    LOG_INF("\n#####  Infill mode  #####\n\n");
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
-            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                               " - To return control without starting a new line, end your input with '/'.\n";
         } else {
-            control_message = " - Press Return to return control to LLaMa.\n"
+            control_message = " - Press Return to return control to LLaMA.\n"
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
-        LOG_TEE("== Running in interactive mode. ==\n");
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_TEE(       "%s\n", control_message);
+        LOG_INF(       "%s\n", control_message);
 
         is_interacting = params.interactive_first;
     }
 
-    bool input_echo           = true;
+    bool input_echo = true;
 
-    int n_past             = 0;
-    int n_remain           = params.n_predict;
-    int n_consumed         = 0;
-    int n_past_guidance    = 0;
+    int n_past     = 0;
+    int n_remain   = params.n_predict;
+    int n_consumed = 0;
 
     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
@@ -409,9 +298,6 @@ int main(int argc, char ** argv) {
     console::set_display(console::prompt);
 
     std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;
-
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
     while (n_remain != 0 || params.interactive) {
         // predict
@@ -426,25 +312,24 @@ int main(int argc, char ** argv) {
                 embd.resize(max_embd_size);
 
                 console::set_display(console::error);
-                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                 console::set_display(console::reset);
-                fflush(stdout);
             }
 
             // infinite text generation via context swapping
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+            if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
-                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
                 }
 
                 const int n_left    = n_past - params.n_keep - 1;
                 const int n_discard = n_left/2;
 
-                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                     n_past, n_left, n_ctx, params.n_keep, n_discard);
 
                 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
@@ -452,87 +337,42 @@ int main(int argc, char ** argv) {
 
                 n_past -= n_discard;
 
-                if (ctx_guidance) {
-                    n_past_guidance -= n_discard;
-                }
+                LOG_DBG("after swap: n_past = %d\n", n_past);
 
-                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
-
-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
             }
 
             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
-
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token * input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf  = embd_guidance.data();
-                    input_size = embd_guidance.size();
-
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-                } else {
-                    input_buf  = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                        LOG_TEE("%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
                     n_eval = params.n_batch;
                 }
 
-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+                    LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }
 
                 n_past += n_eval;
 
-                LOG("n_past = %d\n", n_past);
+                LOG_DBG("n_past = %d\n", n_past);
             }
 
         }
 
         embd.clear();
-        embd_guidance.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+            common_sampler_accept(smpl, id, true);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
-
-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
             embd.push_back(id);
 
@@ -542,16 +382,16 @@ int main(int argc, char ** argv) {
             // decrement remaining sampling budget
             --n_remain;
 
-            LOG("n_remain: %d\n", n_remain);
+            LOG_DBG("n_remain: %d\n", n_remain);
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);
 
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], false);
 
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -563,8 +403,8 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
-                printf("%s", token_str.c_str());
+                const std::string token_str = common_token_to_piece(ctx, id);
+                LOG("%s", token_str.c_str());
 
                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
@@ -573,7 +413,6 @@ int main(int argc, char ** argv) {
                     output_ss << token_str;
                 }
             }
-            fflush(stdout);
         }
         // reset color to default if we there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -582,15 +421,13 @@ int main(int argc, char ** argv) {
 
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
-
             // deal with eot token in infill mode
-            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
-                if(is_interacting && !params.interactive_first) {
+            if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
+                if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
                 }
-                fflush(stdout);
-                printf("\n");
+                LOG("\n");
                 console::set_display(console::user_input);
                 std::string buffer;
                 std::string line;
@@ -620,62 +457,59 @@ int main(int argc, char ** argv) {
 
                 if (params.escape) {
                     //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
-                    process_escapes(params.input_prefix);
-                    process_escapes(params.input_suffix);
-                }
-                suff_rm_leading_spc = params.escape;
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                    params.input_suffix.erase(0, 1);
-                    suff_rm_leading_spc = false;
+                    string_process_escapes(params.input_prefix);
+                    string_process_escapes(params.input_suffix);
                 }
+
                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-                    inp_sfx.erase(inp_sfx.begin());
-                }
-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
+
+                inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+                inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                 if (add_bos) {
-                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+                    embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
                 }
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-                embd_inp = inp_pfx;
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(model));
+                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
                 embd.clear();
-                embd_guidance.clear();
                 n_remain = params.n_predict;
                 n_past = 0;
                 n_consumed = 0;
-                // LOG_TEE("took new input\n");
                 is_interacting = false;
             }
-            // deal with end of text token in interactive mode
-            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
-                LOG("found EOS token\n");
+            // deal with end of generation tokens in interactive mode
+            else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+                LOG_DBG("found EOS token\n");
 
                 if (params.interactive) {
 
                     is_interacting = true;
-                    printf("\n");
+                    LOG("\n");
                     console::set_display(console::user_input);
-                    fflush(stdout);
                }
             }
 
             if (n_past > 0 && is_interacting && !params.interactive) {
-                LOG("waiting for user input\n");
+                LOG_DBG("waiting for user input\n");
 
                 if (params.input_prefix_bos) {
-                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(model));
+                    LOG_DBG("adding input prefix BOS token\n");
+                    embd_inp.push_back(llama_vocab_bos(vocab));
                 }
 
                 std::string buffer;
                 if (!params.input_prefix.empty()) {
-                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                     buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
+                    LOG("%s", buffer.c_str());
                 }
 
                 std::string line;
@@ -693,30 +527,30 @@ int main(int argc, char ** argv) {
                 if (buffer.length() > 1) {
                     // append input suffix if any
                     if (!params.input_suffix.empty()) {
-                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                         buffer += params.input_suffix;
-                        printf("%s", params.input_suffix.c_str());
+                        LOG("%s", params.input_suffix.c_str());
                     }
 
-                    LOG("buffer: '%s'\n", buffer.c_str());
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
 
                     const size_t original_size = embd_inp.size();
 
-                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    const auto line_inp = common_tokenize(ctx, buffer, false);
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
                         const llama_token token = embd_inp[i];
                         output_tokens.push_back(token);
-                        output_ss << llama_token_to_piece(ctx, token);
+                        output_ss << common_token_to_piece(ctx, token);
                     }
 
                     n_remain -= line_inp.size();
-                    LOG("n_remain: %d\n", n_remain);
+                    LOG_DBG("n_remain: %d\n", n_remain);
                 } else {
-                    LOG("empty line, passing control back\n");
+                    LOG_DBG("empty line, passing control back\n");
                 }
 
                 input_echo = false; // do not echo this again
@@ -724,14 +558,14 @@ int main(int argc, char ** argv) {
 
             if (n_past > 0) {
                 if (is_interacting) {
-                    llama_sampling_reset(ctx_sampling);
+                    common_sampler_reset(smpl);
                 }
                 is_interacting = false;
             }
         }
 
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+        // end of generation
+        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
             break;
         }
 
@@ -743,24 +577,14 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
-        fflush(stdout);
+        LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
     }
 
-    llama_print_timings(ctx);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    LOG("\n");
+    common_perf_print(ctx, smpl);
 
-    if (ctx_guidance) { llama_free(ctx_guidance); }
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_sampling_free(ctx_sampling);
+    common_sampler_free(smpl);
     llama_backend_free();
 
-#ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
     return 0;
 }
-
diff --git a/examples/jeopardy/jeopardy.sh b/examples/jeopardy/jeopardy.sh
index 9bdbc755c..07bcb3b8d 100755
--- a/examples/jeopardy/jeopardy.sh
+++ b/examples/jeopardy/jeopardy.sh
@@ -21,7 +21,7 @@ counter=1
 echo 'Running'
 while IFS= read -r question
 do
-  exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+  exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
   echo $counter
   echo "Current Question: $question"
   eval "$exe_cmd"
diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py
deleted file mode 100755
index 6a977f031..000000000
--- a/examples/json-schema-to-grammar.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import json
-import re
-import sys
-
-# whitespace is constrained to a single space char to prevent model "running away" in
-# whitespace. Also maybe improves generation quality?
-SPACE_RULE = '" "?'
-
-PRIMITIVE_RULES = {
-    'boolean': '("true" | "false") space',
-    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
-    'string': r''' "\"" (
-        [^"\\] |
-        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\"" space ''',
-    'null': '"null" space',
-}
-
-INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
-GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
-GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
-
-
-class SchemaConverter:
-    def __init__(self, prop_order):
-        self._prop_order = prop_order
-        self._rules = {'space': SPACE_RULE}
-
-    def _format_literal(self, literal):
-        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
-            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
-        )
-        return f'"{escaped}"'
-
-    def _add_rule(self, name, rule):
-        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
-        if esc_name not in self._rules or self._rules[esc_name] == rule:
-            key = esc_name
-        else:
-            i = 0
-            while f'{esc_name}{i}' in self._rules:
-                i += 1
-            key = f'{esc_name}{i}'
-        self._rules[key] = rule
-        return key
-
-    def visit(self, schema, name):
-        schema_type = schema.get('type')
-        rule_name = name or 'root'
-
-        if 'oneOf' in schema or 'anyOf' in schema:
-            rule = ' | '.join((
-                self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
-                for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
-            ))
-            return self._add_rule(rule_name, rule)
-
-        elif 'const' in schema:
-            return self._add_rule(rule_name, self._format_literal(schema['const']))
-
-        elif 'enum' in schema:
-            rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
-            return self._add_rule(rule_name, rule)
-
-        elif schema_type == 'object' and 'properties' in schema:
-            # TODO: `required` keyword
-            prop_order = self._prop_order
-            prop_pairs = sorted(
-                schema['properties'].items(),
-                # sort by position in prop_order (if specified) then by key
-                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
-            )
-
-            rule = '"{" space'
-            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
-                prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
-                if i > 0:
-                    rule += ' "," space'
-                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
-            rule += ' "}" space'
-
-            return self._add_rule(rule_name, rule)
-
-        elif schema_type == 'array' and 'items' in schema:
-            # TODO `prefixItems` keyword
-            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            list_item_operator = f'("," space {item_rule_name})'
-            successive_items = ""
-            min_items = schema.get("minItems", 0)
-            if min_items > 0:
-               first_item = f"({item_rule_name})"
-               successive_items = list_item_operator * (min_items - 1)
-               min_items -= 1
-            else:
-               first_item = f"({item_rule_name})?"
-            max_items = schema.get("maxItems")
-            if max_items is not None and max_items > min_items:
-                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
-            else:
-                successive_items += list_item_operator + "*"
-            rule = f'"[" space {first_item} {successive_items} "]" space'
-            return self._add_rule(rule_name, rule)
-
-        else:
-            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
-            return self._add_rule(
-                'root' if rule_name == 'root' else schema_type,
-                PRIMITIVE_RULES[schema_type]
-            )
-
-    def format_grammar(self):
-        return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
-
-
-def main(args_in = None):
-    parser = argparse.ArgumentParser(
-        description='''
-            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
-            given JSON schema. Only a subset of JSON schema features are supported; more may be
-            added in the future.
-        ''',
-    )
-    parser.add_argument(
-        '--prop-order',
-        default=[],
-        type=lambda s: s.split(','),
-        help='''
-            comma-separated property names defining the order of precedence for object properties;
-            properties not specified here are given lower precedence than those that are, and are
-            sorted alphabetically
-        '''
-    )
-    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
-    args = parser.parse_args(args_in)
-
-    schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
-    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
-    converter = SchemaConverter(prop_order)
-    converter.visit(schema, '')
-    print(converter.format_grammar())
-
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/json_schema_pydantic_example.py b/examples/json_schema_pydantic_example.py
new file mode 100644
index 000000000..19c0bdb5b
--- /dev/null
+++ b/examples/json_schema_pydantic_example.py
@@ -0,0 +1,82 @@
+# Usage:
+#! ./llama-server -m some-model.gguf &
+#! pip install pydantic
+#! python json_schema_pydantic_example.py
+
+from pydantic import BaseModel, Field, TypeAdapter
+from annotated_types import MinLen
+from typing import Annotated, List, Optional
+import json, requests
+
+if True:
+
+    def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
+        '''
+        Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
+        (llama.cpp server, llama-cpp-python, Anyscale / Together...)
+
+        The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
+        '''
+        response_format = None
+        type_adapter = None
+
+        if response_model:
+            type_adapter = TypeAdapter(response_model)
+            schema = type_adapter.json_schema()
+            messages = [{
+                "role": "system",
+                "content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
+            }] + messages
+            response_format={"type": "json_object", "schema": schema}
+
+        data = requests.post(endpoint, headers={"Content-Type": "application/json"},
+                             json=dict(messages=messages, response_format=response_format, **kwargs)).json()
+        if 'error' in data:
+            raise Exception(data['error']['message'])
+
+        content = data["choices"][0]["message"]["content"]
+        return type_adapter.validate_json(content) if type_adapter else content
+
+else:
+
+    # This alternative branch uses Instructor + OpenAI client lib.
+    # Instructor support streamed iterable responses, retry & more.
+    # (see https://python.useinstructor.com/)
+    #! pip install instructor openai
+    import instructor, openai
+    client = instructor.patch(
+        openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
+        mode=instructor.Mode.JSON_SCHEMA)
+    create_completion = client.chat.completions.create
+
+
+if __name__ == '__main__':
+
+    class QAPair(BaseModel):
+        class Config:
+            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
+        question: str
+        concise_answer: str
+        justification: str
+        stars: Annotated[int, Field(ge=1, le=5)]
+
+    class PyramidalSummary(BaseModel):
+        class Config:
+            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
+        title: str
+        summary: str
+        question_answers: Annotated[List[QAPair], MinLen(2)]
+        sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]
+
+    print("# Summary\n", create_completion(
+        model="...",
+        response_model=PyramidalSummary,
+        messages=[{
+            "role": "user",
+            "content": f"""
+                You are a highly efficient corporate document summarizer.
+                Create a pyramidal summary of an imaginary internal document about our company processes
+                (starting high-level, going down to each sub sections).
+                Keep questions short, and answers even shorter (trivia / quizz style).
+            """
+        }]))
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
new file mode 100755
index 000000000..fc9f0097f
--- /dev/null
+++ b/examples/json_schema_to_grammar.py
@@ -0,0 +1,811 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import re
+import sys
+from typing import Any, List, Optional, Set, Tuple, Union
+
+def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
+
+    if min_items == 0 and max_items == 1:
+        return f'{item_rule}?'
+
+    if not separator_rule:
+        if min_items == 1 and max_items is None:
+            return f'{item_rule}+'
+        elif min_items == 0 and max_items is None:
+            return f'{item_rule}*'
+        else:
+            return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
+
+    result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
+    return f'({result})?' if min_items == 0 else result
+
+def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
+    has_min = min_value != None
+    has_max = max_value != None
+
+    def digit_range(from_char: str, to_char: str):
+        out.append("[")
+        if from_char == to_char:
+            out.append(from_char)
+        else:
+            out.append(from_char)
+            out.append("-")
+            out.append(to_char)
+        out.append("]")
+
+    def more_digits(min_digits: int, max_digits: int):
+        out.append("[0-9]")
+        if min_digits == max_digits and min_digits == 1:
+            return
+        out.append("{")
+        out.append(str(min_digits))
+        if max_digits != min_digits:
+            out.append(",")
+            if max_digits != sys.maxsize:
+                out.append(str(max_digits))
+        out.append("}")
+
+    def uniform_range(from_str: str, to_str: str):
+        i = 0
+        while i < len(from_str) and from_str[i] == to_str[i]:
+            i += 1
+        if i > 0:
+            out.append("\"")
+            out.append(from_str[:i])
+            out.append("\"")
+        if i < len(from_str):
+            if i > 0:
+                out.append(" ")
+            sub_len = len(from_str) - i - 1
+            if sub_len > 0:
+                from_sub = from_str[i+1:]
+                to_sub = to_str[i+1:]
+                sub_zeros = "0" * sub_len
+                sub_nines = "9" * sub_len
+
+                to_reached = False
+                out.append("(")
+                if from_sub == sub_zeros:
+                    digit_range(from_str[i], chr(ord(to_str[i]) - 1))
+                    out.append(" ")
+                    more_digits(sub_len, sub_len)
+                else:
+                    out.append("[")
+                    out.append(from_str[i])
+                    out.append("] ")
+                    out.append("(")
+                    uniform_range(from_sub, sub_nines)
+                    out.append(")")
+                    if ord(from_str[i]) < ord(to_str[i]) - 1:
+                        out.append(" | ")
+                        if to_sub == sub_nines:
+                            digit_range(chr(ord(from_str[i]) + 1), to_str[i])
+                            to_reached = True
+                        else:
+                            digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
+                        out.append(" ")
+                        more_digits(sub_len, sub_len)
+                if not to_reached:
+                    out.append(" | ")
+                    digit_range(to_str[i], to_str[i])
+                    out.append(" ")
+                    uniform_range(sub_zeros, to_sub)
+                out.append(")")
+            else:
+                out.append("[")
+                out.append(from_str[i])
+                out.append("-")
+                out.append(to_str[i])
+                out.append("]")
+
+    if has_min and has_max:
+        if min_value < 0 and max_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
+            out.append(")")
+            return
+
+        if min_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
+            out.append(") | ")
+            min_value = 0
+
+        min_s = str(min_value)
+        max_s = str(max_value)
+        min_digits = len(min_s)
+        max_digits = len(max_s)
+
+        for digits in range(min_digits, max_digits):
+            uniform_range(min_s, "9" * digits)
+            min_s = "1" + "0" * digits
+            out.append(" | ")
+        uniform_range(min_s, max_s)
+        return
+
+    less_decimals = max(decimals_left - 1, 1)
+
+    if has_min:
+        if min_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
+            out.append(") | [0] | [1-9] ")
+            more_digits(0, decimals_left - 1)
+        elif min_value == 0:
+            if top_level:
+                out.append("[0] | [1-9] ")
+                more_digits(0, less_decimals)
+            else:
+                more_digits(1, decimals_left)
+        elif min_value <= 9:
+            c = str(min_value)
+            range_start = '1' if top_level else '0'
+            if c > range_start:
+                digit_range(range_start, chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(1, less_decimals)
+                out.append(" | ")
+            digit_range(c, "9")
+            out.append(" ")
+            more_digits(0, less_decimals)
+        else:
+            min_s = str(min_value)
+            length = len(min_s)
+            c = min_s[0]
+
+            if c > "1":
+                digit_range("1" if top_level else "0", chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(length, less_decimals)
+                out.append(" | ")
+            digit_range(c, c)
+            out.append(" (")
+            _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
+            out.append(")")
+            if c < "9":
+                out.append(" | ")
+                digit_range(chr(ord(c) + 1), "9")
+                out.append(" ")
+                more_digits(length - 1, less_decimals)
+        return
+
+    if has_max:
+        if max_value >= 0:
+            if top_level:
+                out.append("\"-\" [1-9] ")
+                more_digits(0, less_decimals)
+                out.append(" | ")
+            _generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
+        else:
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
+            out.append(")")
+        return
+
+    raise RuntimeError("At least one of min_value or max_value must be set")
+
+class BuiltinRule:
+    def __init__(self, content: str, deps: list | None = None):
+        self.content = content
+        self.deps = deps or []
+
+# Constraining spaces to prevent model "running away".
+SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
+
+PRIMITIVE_RULES = {
+    'boolean'      : BuiltinRule('("true" | "false") space', []),
+    'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
+    'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
+    'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
+    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
+    'null'         : BuiltinRule('"null" space', []),
+}
+
+# TODO: support "uri", "email" string formats
+STRING_FORMAT_RULES = {
+    'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+}
+
+DOTALL = '[\\U00000000-\\U0010FFFF]'
+DOT = '[^\\x0A\\x0D]'
+
+RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
+
+INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
+GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
+GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
+GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
+
+NON_LITERAL_SET = set('|.()[]{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
+
+
+class SchemaConverter:
+    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
+        self._prop_order = prop_order
+        self._allow_fetch = allow_fetch
+        self._dotall = dotall
+        self._raw_pattern = raw_pattern
+        self._rules = {
+            'space': SPACE_RULE,
+        }
+        self._refs = {}
+        self._refs_being_resolved = set()
+
+    def _format_literal(self, literal):
+        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
+        )
+        return f'"{escaped}"'
+
+    def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
+        '''
+            not_literal('a') -> '[^a]'
+            not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
+        '''
+        assert len(literal) > 0, 'Empty literal not supported'
+        def recurse(i: int):
+            c = literal[i]
+            if maybe_escaped_underscores and c == '_':
+                yield f'[^{c}\\\\]'
+                yield ' | '
+                yield f'"\\\\"? "{c}"'
+            else:
+                yield f'[^{c}]'
+            if i < len(literal) - 1:
+                yield ' | '
+                yield self._format_literal(c)
+                yield ' ('
+                yield from recurse(i + 1)
+                yield ')?'
+
+        return ''.join(('(', *recurse(0), ')'))
+
+    def _not_strings(self, strings):
+        class TrieNode:
+            def __init__(self):
+                self.children = {}
+                self.is_end_of_string = False
+
+            def insert(self, string):
+                node = self
+                for c in string:
+                    node = node.children.setdefault(c, TrieNode())
+                node.is_end_of_string = True
+
+        trie = TrieNode()
+        for s in strings:
+            trie.insert(s)
+
+        char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+        out = ['["] ( ']
+
+        def visit(node):
+            rejects = []
+            first = True
+            for c in sorted(node.children.keys()):
+                child = node.children[c]
+                rejects.append(c)
+                if first:
+                    first = False
+                else:
+                    out.append(' | ')
+                out.append(f'[{c}]')
+                if child.children:
+                    out.append(f' (')
+                    visit(child)
+                    out.append(')')
+                elif child.is_end_of_string:
+                    out.append(f' {char_rule}+')
+            if node.children:
+                if not first:
+                    out.append(' | ')
+                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
+        visit(trie)
+
+        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
+        return ''.join(out)
+
+    def _add_rule(self, name, rule):
+        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
+        if esc_name not in self._rules or self._rules[esc_name] == rule:
+            key = esc_name
+        else:
+            i = 0
+            while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule:
+                i += 1
+            key = f'{esc_name}{i}'
+        self._rules[key] = rule
+        return key
+
+    def resolve_refs(self, schema: dict, url: str):
+        '''
+            Resolves all $ref fields in the given schema, fetching any remote schemas,
+            replacing $ref with absolute reference URL and populating self._refs with the
+            respective referenced (sub)schema dictionaries.
+        '''
+        def visit(n: dict):
+            if isinstance(n, list):
+                return [visit(x) for x in n]
+            elif isinstance(n, dict):
+                ref = n.get('$ref')
+                if ref is not None and ref not in self._refs:
+                    if ref.startswith('https://'):
+                        assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)'
+                        import requests
+
+                        frag_split = ref.split('#')
+                        base_url = frag_split[0]
+
+                        target = self._refs.get(base_url)
+                        if target is None:
+                            target = self.resolve_refs(requests.get(ref).json(), base_url)
+                            self._refs[base_url] = target
+
+                        if len(frag_split) == 1 or frag_split[-1] == '':
+                            return target
+                    elif ref.startswith('#/'):
+                        target = schema
+                        ref = f'{url}{ref}'
+                        n['$ref'] = ref
+                    else:
+                        raise ValueError(f'Unsupported ref {ref}')
+
+                    for sel in ref.split('#')[-1].split('/')[1:]:
+                        assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
+                        target = target[sel]
+
+                    self._refs[ref] = target
+                else:
+                    for v in n.values():
+                        visit(v)
+
+            return n
+        return visit(schema)
+
+    def _generate_union_rule(self, name, alt_schemas):
+        return ' | '.join((
+            self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
+            for i, alt_schema in enumerate(alt_schemas)
+        ))
+
+    def _visit_pattern(self, pattern, name):
+        '''
+            Transforms a regular expression pattern into a GBNF rule.
+
+            Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
+            Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+
+            Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
+
+            Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which
+            we define sub-rules to keep the output lean.
+        '''
+
+        assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"'
+        pattern = pattern[1:-1]
+        sub_rule_ids = {}
+
+        i = 0
+        length = len(pattern)
+
+        def to_rule(s: tuple[str, bool]) -> str:
+            (txt, is_literal) = s
+            return "\"" + txt + "\"" if is_literal else txt
+
+        def transform() -> tuple[str, bool]:
+            '''
+                Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
+            '''
+            nonlocal i
+            nonlocal pattern
+            nonlocal sub_rule_ids
+
+            start = i
+            # For each component of this sequence, store its string representation and whether it's a literal.
+            # We only need a flat structure here to apply repetition operators to the last item, and
+            # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
+            # (GBNF's syntax is luckily very close to regular expressions!)
+            seq: list[tuple[str, bool]] = []
+
+            def get_dot():
+                if self._dotall:
+                    rule = DOTALL
+                else:
+                    # Accept any character... except \n and \r line break chars (\x0A and \xOD)
+                    rule = DOT
+                return self._add_rule(f'dot', rule)
+
+            def join_seq():
+                nonlocal seq
+                ret = []
+                for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
+                    if is_literal:
+                        ret.append((''.join(x[0] for x in g), True))
+                    else:
+                        ret.extend(g)
+                if len(ret) == 1:
+                    return ret[0]
+                return (' '.join(to_rule(x) for x in seq), False)
+
+            while i < length:
+                c = pattern[i]
+                if c == '.':
+                    seq.append((get_dot(), False))
+                    i += 1
+                elif c == '(':
+                    i += 1
+                    if i < length:
+                        assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
+                    seq.append((f'({to_rule(transform())})', False))
+                elif c == ')':
+                    i += 1
+                    assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
+                    return join_seq()
+                elif c == '[':
+                    square_brackets = c
+                    i += 1
+                    while i < length and pattern[i] != ']':
+                        if pattern[i] == '\\':
+                            square_brackets += pattern[i:i+2]
+                            i += 2
+                        else:
+                            square_brackets += pattern[i]
+                            i += 1
+                    assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}'
+                    square_brackets += ']'
+                    i += 1
+                    seq.append((square_brackets, False))
+                elif c == '|':
+                    seq.append(('|', False))
+                    i += 1
+                elif c in ('*', '+', '?'):
+                    seq[-1] = (to_rule(seq[-1]) + c, False)
+                    i += 1
+                elif c == '{':
+                    curly_brackets = c
+                    i += 1
+                    while i < length and pattern[i] != '}':
+                        curly_brackets += pattern[i]
+                        i += 1
+                    assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}'
+                    curly_brackets += '}'
+                    i += 1
+                    nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
+                    min_times = 0
+                    max_times = None
+                    try:
+                        if len(nums) == 1:
+                            min_times = int(nums[0])
+                            max_times = min_times
+                        else:
+                            assert len(nums) == 2
+                            min_times = int(nums[0]) if nums[0] else 0
+                            max_times = int(nums[1]) if nums[1] else None
+                    except ValueError:
+                        raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
+
+                    (sub, sub_is_literal) = seq[-1]
+
+                    if not sub_is_literal:
+                        id = sub_rule_ids.get(sub)
+                        if id is None:
+                            id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+                            sub_rule_ids[sub] = id
+                        sub = id
+
+                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
+                else:
+                    literal = ''
+                    while i < length:
+                        if pattern[i] == '\\' and i < length - 1:
+                            next = pattern[i + 1]
+                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
+                                i += 1
+                                literal += pattern[i]
+                                i += 1
+                            else:
+                                literal += pattern[i:i+2]
+                                i += 2
+                        elif pattern[i] == '"' and not self._raw_pattern:
+                            literal += '\\"'
+                            i += 1
+                        elif pattern[i] not in NON_LITERAL_SET and \
+                                (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
+                            literal += pattern[i]
+                            i += 1
+                        else:
+                            break
+                    if literal:
+                        seq.append((literal, True))
+
+            return join_seq()
+
+        return self._add_rule(
+            name,
+            to_rule(transform()) if self._raw_pattern \
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
+
+
+    def _resolve_ref(self, ref):
+        ref_name = ref.split('/')[-1]
+        if ref_name not in self._rules and ref not in self._refs_being_resolved:
+            self._refs_being_resolved.add(ref)
+            resolved = self._refs[ref]
+            ref_name = self.visit(resolved, ref_name)
+            self._refs_being_resolved.remove(ref)
+        return ref_name
+
+    def _generate_constant_rule(self, value):
+        return self._format_literal(json.dumps(value))
+
+    def visit(self, schema, name):
+        schema_type = schema.get('type')
+        schema_format = schema.get('format')
+        rule_name = name + '-' if name in RESERVED_NAMES else name or 'root'
+
+        if (ref := schema.get('$ref')) is not None:
+            return self._add_rule(rule_name, self._resolve_ref(ref))
+
+        elif 'oneOf' in schema or 'anyOf' in schema:
+            return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
+
+        elif isinstance(schema_type, list):
+            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
+
+        elif 'const' in schema:
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
+
+        elif 'enum' in schema:
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
+            return self._add_rule(rule_name, rule)
+
+        elif schema_type in (None, 'object') and \
+             ('properties' in schema or \
+              ('additionalProperties' in schema and schema['additionalProperties'] is not True)):
+            required = set(schema.get('required', []))
+            properties = list(schema.get('properties', {}).items())
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
+
+        elif schema_type in (None, 'object') and 'allOf' in schema:
+            required = set()
+            properties = []
+            hybrid_name = name
+            def add_component(comp_schema, is_required):
+                if (ref := comp_schema.get('$ref')) is not None:
+                    comp_schema = self._refs[ref]
+
+                if 'properties' in comp_schema:
+                    for prop_name, prop_schema in comp_schema['properties'].items():
+                        properties.append((prop_name, prop_schema))
+                        if is_required:
+                            required.add(prop_name)
+
+            for t in schema['allOf']:
+                if 'anyOf' in t:
+                    for tt in t['anyOf']:
+                        add_component(tt, is_required=False)
+                else:
+                    add_component(t, is_required=True)
+
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
+
+        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
+            items = schema.get('items') or schema['prefixItems']
+            if isinstance(items, list):
+                return self._add_rule(
+                    rule_name,
+                    '"[" space ' +
+                    ' "," space '.join(
+                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
+                        for i, item in enumerate(items)) +
+                    ' "]" space')
+            else:
+                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
+                min_items = schema.get("minItems", 0)
+                max_items = schema.get("maxItems")
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
+
+        elif schema_type in (None, 'string') and 'pattern' in schema:
+            return self._visit_pattern(schema['pattern'], rule_name)
+
+        elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
+            return self._add_primitive(
+                'root' if rule_name == 'root' else schema_format,
+                PRIMITIVE_RULES['uuid']
+            )
+
+        elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
+            prim_name = f'{schema_format}-string'
+            return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
+
+        elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
+            char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+            min_len = schema.get('minLength', 0)
+            max_len = schema.get('maxLength')
+
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
+
+        elif schema_type in (None, 'integer') and \
+                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
+            min_value = None
+            max_value = None
+            if 'minimum' in schema:
+                min_value = schema['minimum']
+            elif 'exclusiveMinimum' in schema:
+                min_value = schema['exclusiveMinimum'] + 1
+            if 'maximum' in schema:
+                max_value = schema['maximum']
+            elif 'exclusiveMaximum' in schema:
+                max_value = schema['exclusiveMaximum'] - 1
+
+            out = ["("]
+            _generate_min_max_int(min_value, max_value, out)
+            out.append(") space")
+            return self._add_rule(rule_name, ''.join(out))
+
+        elif (schema_type == 'object') or (len(schema) == 0):
+            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
+
+        else:
+            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
+            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+            return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
+
+    def _add_primitive(self, name: str, rule: BuiltinRule):
+        n = self._add_rule(name, rule.content)
+
+        for dep in rule.deps:
+            dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
+            assert dep_rule, f'Rule {dep} not known'
+            if dep not in self._rules:
+                self._add_primitive(dep, dep_rule)
+        return n
+
+    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
+        prop_order = self._prop_order
+        # sort by position in prop_order (if specified) then by original order
+        sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
+
+        prop_kv_rule_names = {}
+        for prop_name, prop_schema in properties:
+            prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
+            prop_kv_rule_names[prop_name] = self._add_rule(
+                f'{name}{"-" if name else ""}{prop_name}-kv',
+                fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
+            )
+        required_props = [k for k in sorted_props if k in required]
+        optional_props = [k for k in sorted_props if k not in required]
+
+        if additional_properties is not None and additional_properties != False:
+            sub_name = f'{name}{"-" if name else ""}additional'
+            value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
+                self._add_primitive('value', PRIMITIVE_RULES['value'])
+            key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
+                else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
+
+            prop_kv_rule_names["*"] = self._add_rule(
+                f'{sub_name}-kv',
+                f'{key_rule} ":" space {value_rule}'
+            )
+            optional_props.append("*")
+
+        rule = '"{" space '
+        rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props)
+
+        if optional_props:
+            rule += ' ('
+            if required_props:
+                rule += ' "," space ( '
+
+            def get_recursive_refs(ks, first_is_optional):
+                [k, *rest] = ks
+                kv_rule_name = prop_kv_rule_names[k]
+                comma_ref = f'( "," space {kv_rule_name} )'
+                if first_is_optional:
+                    res = comma_ref + ('*' if k == '*' else '?')
+                else:
+                    res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
+                if len(rest) > 0:
+                    res += ' ' + self._add_rule(
+                        f'{name}{"-" if name else ""}{k}-rest',
+                        get_recursive_refs(rest, first_is_optional=True)
+                    )
+                return res
+
+            rule += ' | '.join(
+                get_recursive_refs(optional_props[i:], first_is_optional=False)
+                for i in range(len(optional_props))
+            )
+            if required_props:
+                rule += ' )'
+            rule += ' )?'
+
+        rule += ' "}" space'
+
+        return rule
+
+    def format_grammar(self):
+        return '\n'.join(
+            f'{name} ::= {rule}'
+            for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0])
+        )
+
+
+def main(args_in = None):
+    parser = argparse.ArgumentParser(
+        description='''
+            Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
+            given JSON schema. Only a subset of JSON schema features are supported; more may be
+            added in the future.
+        ''',
+    )
+    parser.add_argument(
+        '--prop-order',
+        default=[],
+        type=lambda s: s.split(','),
+        help='''
+            comma-separated property names defining the order of precedence for object properties;
+            properties not specified here are given lower precedence than those that are, and
+            are kept in their original order from the schema. Required properties are always
+            given precedence over optional properties.
+        '''
+    )
+    parser.add_argument(
+        '--allow-fetch',
+        action='store_true',
+        default=False,
+        help='Whether to allow fetching referenced schemas over HTTPS')
+    parser.add_argument(
+        '--dotall',
+        action='store_true',
+        default=False,
+        help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
+    parser.add_argument(
+        '--raw-pattern',
+        action='store_true',
+        default=False,
+        help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
+
+    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
+    args = parser.parse_args(args_in)
+
+    if args.schema.startswith('https://'):
+        url = args.schema
+        import requests
+        schema = requests.get(url).json()
+    elif args.schema == '-':
+        url = 'stdin'
+        schema = json.load(sys.stdin)
+    else:
+        url = f'file://{args.schema}'
+        with open(args.schema) as f:
+            schema = json.load(f)
+    converter = SchemaConverter(
+        prop_order={name: idx for idx, name in enumerate(args.prop_order)},
+        allow_fetch=args.allow_fetch,
+        dotall=args.dotall,
+        raw_pattern=args.raw_pattern)
+    schema = converter.resolve_refs(schema, url)
+    converter.visit(schema, '')
+    print(converter.format_grammar())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt
index 5bdbea4e2..17e3b9b87 100644
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 10f37b441..6bbe4bb75 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
 
 Performance testing tool for llama.cpp.
 
@@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
     1. [Markdown](#markdown)
     2. [CSV](#csv)
     3. [JSON](#json)
-    4. [SQL](#sql)
+    4. [JSONL](#jsonl)
+    5. [SQL](#sql)
 
 ## Syntax
 
@@ -23,30 +24,43 @@ usage: ./llama-bench [options]
 
 options:
   -h, --help
-  -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
-  -p, --n-prompt <n>                  (default: 512)
-  -n, --n-gen <n>                     (default: 128)
-  -b, --batch-size <n>                (default: 512)
-  -ctk <t>, --cache-type-k <t>        (default: f16)
-  -ctv <t>, --cache-type-v <t>        (default: f16)
-  -t, --threads <n>                   (default: 112)
-  -ngl, --n-gpu-layers <n>            (default: 99)
-  -sm, --split-mode <none|layer|row>  (default: layer)
-  -mg, --main-gpu <i>                 (default: 0)
-  -nkvo, --no-kv-offload <0|1>        (default: 0)
-  -mmp, --mmap <0|1>                  (default: 1)
-  -ts, --tensor_split <ts0/ts1/..>    (default: 0)
-  -r, --repetitions <n>               (default: 5)
-  -o, --output <csv|json|md|sql>      (default: md)
-  -v, --verbose                       (default: 0)
+  -m, --model <filename>                    (default: models/7B/ggml-model-q4_0.gguf)
+  -p, --n-prompt <n>                        (default: 512)
+  -n, --n-gen <n>                           (default: 128)
+  -pg <pp,tg>                               (default: )
+  -b, --batch-size <n>                      (default: 2048)
+  -ub, --ubatch-size <n>                    (default: 512)
+  -ctk, --cache-type-k <t>                  (default: f16)
+  -ctv, --cache-type-v <t>                  (default: f16)
+  -t, --threads <n>                         (default: 8)
+  -C, --cpu-mask <hex,hex>                  (default: 0x0)
+  --cpu-strict <0|1>                        (default: 0)
+  --poll <0...100>                          (default: 50)
+  -ngl, --n-gpu-layers <n>                  (default: 99)
+  -rpc, --rpc <rpc_servers>                 (default: )
+  -sm, --split-mode <none|layer|row>        (default: layer)
+  -mg, --main-gpu <i>                       (default: 0)
+  -nkvo, --no-kv-offload <0|1>              (default: 0)
+  -fa, --flash-attn <0|1>                   (default: 0)
+  -mmp, --mmap <0|1>                        (default: 1)
+  --numa <distribute|isolate|numactl>       (default: disabled)
+  -embd, --embeddings <0|1>                 (default: 0)
+  -ts, --tensor-split <ts0/ts1/..>          (default: 0)
+  -r, --repetitions <n>                     (default: 5)
+  --prio <0|1|2|3>                          (default: 0)
+  --delay <0...N> (seconds)                 (default: 0)
+  -o, --output <csv|json|jsonl|md|sql>      (default: md)
+  -oe, --output-err <csv|json|jsonl|md|sql> (default: none)
+  -v, --verbose                             (default: 0)
 
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
 
-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:
 
 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
 
 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
 
@@ -156,7 +170,7 @@ $ ./llama-bench -o csv
 ```
 
 ```csv
-build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
 ```
@@ -173,7 +187,6 @@ $ ./llama-bench -o json
     "build_commit": "3469684",
     "build_number": 1275,
     "cuda": true,
-    "opencl": false,
     "metal": false,
     "gpu_blas": true,
     "blas": true,
@@ -204,7 +217,6 @@ $ ./llama-bench -o json
     "build_commit": "3469684",
     "build_number": 1275,
     "cuda": true,
-    "opencl": false,
     "metal": false,
     "gpu_blas": true,
     "blas": true,
@@ -234,6 +246,19 @@ $ ./llama-bench -o json
 ]
 ```
 
+
+### JSONL
+
+```sh
+$ ./llama-bench -o jsonl
+```
+
+```json lines
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
+{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
+```
+
+
 ### SQL
 
 SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
@@ -247,7 +272,6 @@ CREATE TABLE IF NOT EXISTS test (
   build_commit TEXT,
   build_number INTEGER,
   cuda INTEGER,
-  opencl INTEGER,
   metal INTEGER,
   gpu_blas INTEGER,
   blas INTEGER,
@@ -273,6 +297,6 @@ CREATE TABLE IF NOT EXISTS test (
   stddev_ts REAL
 );
 
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
 ```
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index c2155b2ac..4ac19ca86 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -6,6 +6,7 @@
 #include <clocale>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <ctime>
 #include <iterator>
@@ -14,13 +15,20 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <vector>
 
+#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
+
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#endif
 
 // utils
 static uint64_t get_time_ns() {
@@ -28,8 +36,7 @@ static uint64_t get_time_ns() {
     return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }
 
-template<class T>
-static std::string join(const std::vector<T> & values, const std::string & delim) {
+template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
     std::ostringstream str;
     for (size_t i = 0; i < values.size(); i++) {
         str << values[i];
@@ -40,166 +47,176 @@ static std::string join(const std::vector<T> & values, const std::string & delim
     return str.str();
 }
 
-template<class T>
-static std::vector<T> split(const std::string & str, char delim) {
-    std::vector<T> values;
-    std::istringstream str_stream(str);
-    std::string token;
-    while (std::getline(str_stream, token, delim)) {
-        T value;
-        std::istringstream token_stream(token);
-        token_stream >> value;
-        values.push_back(value);
-    }
-    return values;
-}
-
-template<typename T, typename F>
-static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
     std::vector<std::string> str_values;
     std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
     return str_values;
 }
 
-template<typename T>
-static T avg(const std::vector<T> & v) {
+template <typename T> static T avg(const std::vector<T> & v) {
     if (v.empty()) {
         return 0;
     }
     T sum = std::accumulate(v.begin(), v.end(), T(0));
-    return sum / (T)v.size();
+    return sum / (T) v.size();
 }
 
-template<typename T>
-static T stdev(const std::vector<T> & v) {
+template <typename T> static T stdev(const std::vector<T> & v) {
     if (v.size() <= 1) {
         return 0;
     }
-    T mean = avg(v);
+    T mean   = avg(v);
     T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
-    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+    T stdev  = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
     return stdev;
 }
 
 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
+    std::vector<std::string> cpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev      = ggml_backend_dev_get(i);
+        auto   dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            cpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-#endif
-    // TODO: other platforms
-    return id;
+    return join(cpu_list, ", ");
 }
 
 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev      = ggml_backend_dev_get(i);
+        auto   dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-#endif
-#ifdef GGML_USE_SYCL
-    int device_list[GGML_SYCL_MAX_DEVICES];
-    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
-            char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
-            id += buf;
-            id += "/";
-        }
-    }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
-#endif
-    // TODO: other backends
-    return id;
+    return join(gpu_list, ", ");
 }
 
 // command line params
-enum output_formats {CSV, JSON, MARKDOWN, SQL};
+enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
 
 static const char * output_format_str(output_formats format) {
     switch (format) {
-        case CSV:      return "csv";
-        case JSON:     return "json";
-        case MARKDOWN: return "md";
-        case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        case NONE:
+            return "none";
+        case CSV:
+            return "csv";
+        case JSON:
+            return "json";
+        case JSONL:
+            return "jsonl";
+        case MARKDOWN:
+            return "md";
+        case SQL:
+            return "sql";
+        default:
+            GGML_ABORT("invalid output format");
     }
 }
 
+static bool output_format_from_str(const std::string & s, output_formats & format) {
+    if (s == "none") {
+        format = NONE;
+    } else if (s == "csv") {
+        format = CSV;
+    } else if (s == "json") {
+        format = JSON;
+    } else if (s == "jsonl") {
+        format = JSONL;
+    } else if (s == "md") {
+        format = MARKDOWN;
+    } else if (s == "sql") {
+        format = SQL;
+    } else {
+        return false;
+    }
+    return true;
+}
+
 static const char * split_mode_str(llama_split_mode mode) {
     switch (mode) {
-        case LLAMA_SPLIT_MODE_NONE:  return "none";
-        case LLAMA_SPLIT_MODE_LAYER: return "layer";
-        case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        case LLAMA_SPLIT_MODE_NONE:
+            return "none";
+        case LLAMA_SPLIT_MODE_LAYER:
+            return "layer";
+        case LLAMA_SPLIT_MODE_ROW:
+            return "row";
+        default:
+            GGML_ABORT("invalid split mode");
     }
 }
 
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
-    std::vector<std::string> model;
-    std::vector<int> n_prompt;
-    std::vector<int> n_gen;
-    std::vector<int> n_batch;
-    std::vector<ggml_type> type_k;
-    std::vector<ggml_type> type_v;
-    std::vector<int> n_threads;
-    std::vector<int> n_gpu_layers;
-    std::vector<llama_split_mode> split_mode;
-    std::vector<int> main_gpu;
-    std::vector<bool> no_kv_offload;
-    std::vector<std::vector<float>> tensor_split;
-    std::vector<bool> use_mmap;
-    int reps;
-    bool verbose;
-    output_formats output_format;
+    std::vector<std::string>         model;
+    std::vector<int>                 n_prompt;
+    std::vector<int>                 n_gen;
+    std::vector<std::pair<int, int>> n_pg;
+    std::vector<int>                 n_batch;
+    std::vector<int>                 n_ubatch;
+    std::vector<ggml_type>           type_k;
+    std::vector<ggml_type>           type_v;
+    std::vector<int>                 n_threads;
+    std::vector<std::string>         cpu_mask;
+    std::vector<bool>                cpu_strict;
+    std::vector<int>                 poll;
+    std::vector<int>                 n_gpu_layers;
+    std::vector<std::string>         rpc_servers;
+    std::vector<llama_split_mode>    split_mode;
+    std::vector<int>                 main_gpu;
+    std::vector<bool>                no_kv_offload;
+    std::vector<bool>                flash_attn;
+    std::vector<std::vector<float>>  tensor_split;
+    std::vector<bool>                use_mmap;
+    std::vector<bool>                embeddings;
+    ggml_numa_strategy               numa;
+    int                              reps;
+    ggml_sched_priority              prio;
+    int                              delay;
+    bool                             verbose;
+    bool                             progress;
+    output_formats                   output_format;
+    output_formats                   output_format_stderr;
 };
 
 static const cmd_params cmd_params_defaults = {
-    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
-    /* n_prompt      */ {512},
-    /* n_gen         */ {128},
-    /* n_batch       */ {512},
-    /* type_k        */ {GGML_TYPE_F16},
-    /* type_v        */ {GGML_TYPE_F16},
-    /* n_threads     */ {get_num_physical_cores()},
-    /* n_gpu_layers  */ {99},
-    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
-    /* main_gpu      */ {0},
-    /* no_kv_offload */ {false},
-    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
-    /* use_mmap      */ {true},
-    /* reps          */ 5,
-    /* verbose       */ false,
-    /* output_format */ MARKDOWN
+    /* model                */ { "models/7B/ggml-model-q4_0.gguf" },
+    /* n_prompt             */ { 512 },
+    /* n_gen                */ { 128 },
+    /* n_pg                 */ {},
+    /* n_batch              */ { 2048 },
+    /* n_ubatch             */ { 512 },
+    /* type_k               */ { GGML_TYPE_F16 },
+    /* type_v               */ { GGML_TYPE_F16 },
+    /* n_threads            */ { cpu_get_num_math() },
+    /* cpu_mask             */ { "0x0" },
+    /* cpu_strict           */ { false },
+    /* poll                 */ { 50 },
+    /* n_gpu_layers         */ { 99 },
+    /* rpc_servers          */ { "" },
+    /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
+    /* main_gpu             */ { 0 },
+    /* no_kv_offload        */ { false },
+    /* flash_attn           */ { false },
+    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
+    /* use_mmap             */ { true },
+    /* embeddings           */ { false },
+    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
+    /* reps                 */ 5,
+    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
+    /* delay                */ 0,
+    /* verbose              */ false,
+    /* progress             */ false,
+    /* output_format        */ MARKDOWN,
+    /* output_format_stderr */ NONE,
 };
 
 static void print_usage(int /* argc */, char ** argv) {
@@ -207,30 +224,69 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help\n");
-    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
-    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
-    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  -p, --n-prompt <n>                        (default: %s)\n",
+           join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -pg <pp,tg>                               (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf("  -b, --batch-size <n>                      (default: %s)\n",
+           join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
+           join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -t, --threads <n>                         (default: %s)\n",
+           join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
+           join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf("  --cpu-strict <0|1>                        (default: %s)\n",
+           join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n",
+           join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n",
+               join(cmd_params_defaults.rpc_servers, ",").c_str());
+    }
+    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
+           join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+    printf("  -mg, --main-gpu <i>                       (default: %s)\n",
+           join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n",
+           join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n",
+           join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
+           join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  --numa <distribute|isolate|numactl>       (default: disabled)\n");
+    printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
+           join(cmd_params_defaults.embeddings, ",").c_str());
+    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
+    printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
+    printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
+    printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
+    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n",
+           output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
+           output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  -v, --verbose                             (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("  --progress                                (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
     printf("\n");
-    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+    printf(
+        "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
+        "multiple times.\n");
 }
 
 static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
@@ -246,21 +302,28 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "q5_1") {
         return GGML_TYPE_Q5_1;
     }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }
 
     return GGML_TYPE_COUNT;
 }
 
-
 static cmd_params parse_cmd_params(int argc, char ** argv) {
-    cmd_params params;
-    std::string arg;
-    bool invalid_param = false;
-    const std::string arg_prefix = "--";
-    const char split_delim = ',';
+    cmd_params        params;
+    std::string       arg;
+    bool              invalid_param = false;
+    const std::string arg_prefix    = "--";
+    const char        split_delim   = ',';
 
-    params.verbose = cmd_params_defaults.verbose;
-    params.output_format = cmd_params_defaults.output_format;
-    params.reps = cmd_params_defaults.reps;
+    params.verbose              = cmd_params_defaults.verbose;
+    params.output_format        = cmd_params_defaults.output_format;
+    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
+    params.reps                 = cmd_params_defaults.reps;
+    params.numa                 = cmd_params_defaults.numa;
+    params.prio                 = cmd_params_defaults.prio;
+    params.delay                = cmd_params_defaults.delay;
+    params.progress             = cmd_params_defaults.progress;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -276,35 +339,53 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
         } else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
         } else if (arg == "-n" || arg == "--n-gen") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
         } else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto                   p = string_split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
             for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
@@ -314,13 +395,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 types.push_back(gt);
             }
+            if (invalid_param) {
+                break;
+            }
             params.type_k.insert(params.type_k.end(), types.begin(), types.end());
         } else if (arg == "-ctv" || arg == "--cache-type-v") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto                   p = string_split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
             for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
@@ -330,27 +414,57 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 types.push_back(gt);
             }
+            if (invalid_param) {
+                break;
+            }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers.push_back(argv[i]);
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto                          p = string_split<std::string>(argv[i], split_delim);
             std::vector<llama_split_mode> modes;
             for (const auto & m : p) {
                 llama_split_mode mode;
@@ -366,37 +480,71 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 modes.push_back(mode);
             }
+            if (invalid_param) {
+                break;
+            }
             params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
         } else if (arg == "-mg" || arg == "--main-gpu") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.main_gpu = split<int>(argv[i], split_delim);
+            params.main_gpu = string_split<int>(argv[i], split_delim);
         } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "--numa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            } else {
+                std::string value(argv[i]);
+                /**/ if (value == "distribute" || value == "") {
+                    params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
+                } else if (value == "isolate") {
+                    params.numa = GGML_NUMA_STRATEGY_ISOLATE;
+                } else if (value == "numactl") {
+                    params.numa = GGML_NUMA_STRATEGY_NUMACTL;
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
         } else if (arg == "-mmp" || arg == "--mmap") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+        } else if (arg == "-embd" || arg == "--embeddings") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            for (auto ts : split<std::string>(argv[i], split_delim)) {
+            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
                 // split string by ; and /
-                const std::regex regex{R"([;/]+)"};
-                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
-                std::vector<std::string> split_arg{it, {}};
+                const std::regex           regex{ R"([;/]+)" };
+                std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
+                std::vector<std::string>   split_arg{ it, {} };
                 GGML_ASSERT(split_arg.size() <= llama_max_devices());
 
                 std::vector<float> tensor_split(llama_max_devices());
@@ -415,25 +563,34 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            if (argv[i] == std::string("csv")) {
-                params.output_format = CSV;
-            } else if (argv[i] == std::string("json")) {
-                params.output_format = JSON;
-            } else if (argv[i] == std::string("md")) {
-                params.output_format = MARKDOWN;
-            } else if (argv[i] == std::string("sql")) {
-                params.output_format = SQL;
-            } else {
+            invalid_param = !output_format_from_str(argv[i], params.output_format);
+        } else if (arg == "-oe" || arg == "--output-err") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
+            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
         } else if (arg == "-v" || arg == "--verbose") {
             params.verbose = true;
+        } else if (arg == "--progress") {
+            params.progress = true;
         } else {
             invalid_param = true;
             break;
@@ -446,67 +603,156 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     }
 
     // set defaults
-    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
-    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
-    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
-    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
-    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
-    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
-    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
-    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
-    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
-    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
-    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.model.empty()) {
+        params.model = cmd_params_defaults.model;
+    }
+    if (params.n_prompt.empty()) {
+        params.n_prompt = cmd_params_defaults.n_prompt;
+    }
+    if (params.n_gen.empty()) {
+        params.n_gen = cmd_params_defaults.n_gen;
+    }
+    if (params.n_pg.empty()) {
+        params.n_pg = cmd_params_defaults.n_pg;
+    }
+    if (params.n_batch.empty()) {
+        params.n_batch = cmd_params_defaults.n_batch;
+    }
+    if (params.n_ubatch.empty()) {
+        params.n_ubatch = cmd_params_defaults.n_ubatch;
+    }
+    if (params.type_k.empty()) {
+        params.type_k = cmd_params_defaults.type_k;
+    }
+    if (params.type_v.empty()) {
+        params.type_v = cmd_params_defaults.type_v;
+    }
+    if (params.n_gpu_layers.empty()) {
+        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
+    }
+    if (params.rpc_servers.empty()) {
+        params.rpc_servers = cmd_params_defaults.rpc_servers;
+    }
+    if (params.split_mode.empty()) {
+        params.split_mode = cmd_params_defaults.split_mode;
+    }
+    if (params.main_gpu.empty()) {
+        params.main_gpu = cmd_params_defaults.main_gpu;
+    }
+    if (params.no_kv_offload.empty()) {
+        params.no_kv_offload = cmd_params_defaults.no_kv_offload;
+    }
+    if (params.flash_attn.empty()) {
+        params.flash_attn = cmd_params_defaults.flash_attn;
+    }
+    if (params.tensor_split.empty()) {
+        params.tensor_split = cmd_params_defaults.tensor_split;
+    }
+    if (params.use_mmap.empty()) {
+        params.use_mmap = cmd_params_defaults.use_mmap;
+    }
+    if (params.embeddings.empty()) {
+        params.embeddings = cmd_params_defaults.embeddings;
+    }
+    if (params.n_threads.empty()) {
+        params.n_threads = cmd_params_defaults.n_threads;
+    }
+    if (params.cpu_mask.empty()) {
+        params.cpu_mask = cmd_params_defaults.cpu_mask;
+    }
+    if (params.cpu_strict.empty()) {
+        params.cpu_strict = cmd_params_defaults.cpu_strict;
+    }
+    if (params.poll.empty()) {
+        params.poll = cmd_params_defaults.poll;
+    }
 
     return params;
 }
 
 struct cmd_params_instance {
-    std::string model;
-    int n_prompt;
-    int n_gen;
-    int n_batch;
-    ggml_type type_k;
-    ggml_type type_v;
-    int n_threads;
-    int n_gpu_layers;
-    llama_split_mode split_mode;
-    int main_gpu;
-    bool no_kv_offload;
+    std::string        model;
+    int                n_prompt;
+    int                n_gen;
+    int                n_batch;
+    int                n_ubatch;
+    ggml_type          type_k;
+    ggml_type          type_v;
+    int                n_threads;
+    std::string        cpu_mask;
+    bool               cpu_strict;
+    int                poll;
+    int                n_gpu_layers;
+    std::string        rpc_servers_str;
+    llama_split_mode   split_mode;
+    int                main_gpu;
+    bool               no_kv_offload;
+    bool               flash_attn;
     std::vector<float> tensor_split;
-    bool use_mmap;
+    bool               use_mmap;
+    bool               embeddings;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
-        mparams.split_mode = split_mode;
-        mparams.main_gpu = main_gpu;
+        if (!rpc_servers_str.empty()) {
+            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
+
+            // add RPC devices
+            if (!rpc_servers.empty()) {
+                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+                if (!rpc_reg) {
+                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+                    exit(1);
+                }
+
+                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+                if (!ggml_backend_rpc_add_device_fn) {
+                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+                    exit(1);
+                }
+                static std::vector<ggml_backend_dev_t> devices;
+                devices.clear();
+                for (const std::string & server : rpc_servers) {
+                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                    if (dev) {
+                        devices.push_back(dev);
+                    } else {
+                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                        exit(1);
+                    }
+                }
+                devices.push_back(nullptr);
+                mparams.devices = devices.data();
+            }
+        }
+        mparams.split_mode   = split_mode;
+        mparams.main_gpu     = main_gpu;
         mparams.tensor_split = tensor_split.data();
-        mparams.use_mmap = use_mmap;
+        mparams.use_mmap     = use_mmap;
 
         return mparams;
     }
 
     bool equal_mparams(const cmd_params_instance & other) const {
-        return model == other.model &&
-               n_gpu_layers == other.n_gpu_layers &&
-               split_mode == other.split_mode &&
-               main_gpu == other.main_gpu &&
-               use_mmap == other.use_mmap &&
+        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
+               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
                tensor_split == other.tensor_split;
     }
 
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
 
-        cparams.n_ctx = n_prompt + n_gen;
-        cparams.n_batch = n_batch;
-        cparams.type_k = type_k;
-        cparams.type_v = type_v;
+        cparams.n_ctx       = n_prompt + n_gen;
+        cparams.n_batch     = n_batch;
+        cparams.n_ubatch    = n_ubatch;
+        cparams.type_k      = type_k;
+        cparams.type_v      = type_v;
         cparams.offload_kqv = !no_kv_offload;
+        cparams.flash_attn  = flash_attn;
+        cparams.embeddings  = embeddings;
 
         return cparams;
     }
@@ -516,17 +762,25 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     std::vector<cmd_params_instance> instances;
 
     // this ordering minimizes the number of times that each model needs to be reloaded
+    // clang-format off
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
+    for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
+    for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & fa : params.flash_attn)
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -536,15 +790,22 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_prompt     = */ n_prompt,
                 /* .n_gen        = */ 0,
                 /* .n_batch      = */ nb,
+                /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
+                /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
         }
@@ -558,164 +819,174 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_prompt     = */ 0,
                 /* .n_gen        = */ n_gen,
                 /* .n_batch      = */ nb,
+                /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
+                /* .flash_attn   = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
+                /* .embeddings   = */ embd,
+            };
+            instances.push_back(instance);
+        }
+
+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ n_pg.first,
+                /* .n_gen        = */ n_pg.second,
+                /* .n_batch      = */ nb,
+                /* .n_ubatch     = */ nub,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
+                /* .n_threads    = */ nt,
+                /* .cpu_mask     = */ cm,
+                /* .cpu_strict   = */ cs,
+                /* .poll         = */ pl,
+                /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
+                /* .split_mode   = */ sm,
+                /* .main_gpu     = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn   = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap     = */ mmp,
+                /* .embeddings   = */ embd,
             };
             instances.push_back(instance);
         }
     }
+    // clang-format on
 
     return instances;
 }
 
 struct test {
     static const std::string build_commit;
-    static const int build_number;
-    static const bool cuda;
-    static const bool opencl;
-    static const bool vulkan;
-    static const bool kompute;
-    static const bool metal;
-    static const bool sycl;
-    static const bool gpu_blas;
-    static const bool blas;
+    static const int         build_number;
     static const std::string cpu_info;
     static const std::string gpu_info;
-    std::string model_filename;
-    std::string model_type;
-    uint64_t model_size;
-    uint64_t model_n_params;
-    int n_batch;
-    int n_threads;
-    ggml_type type_k;
-    ggml_type type_v;
-    int n_gpu_layers;
-    llama_split_mode split_mode;
-    int main_gpu;
-    bool no_kv_offload;
-    std::vector<float> tensor_split;
-    bool use_mmap;
-    int n_prompt;
-    int n_gen;
-    std::string test_time;
-    std::vector<uint64_t> samples_ns;
+    std::string              model_filename;
+    std::string              model_type;
+    uint64_t                 model_size;
+    uint64_t                 model_n_params;
+    int                      n_batch;
+    int                      n_ubatch;
+    int                      n_threads;
+    std::string              cpu_mask;
+    bool                     cpu_strict;
+    int                      poll;
+    ggml_type                type_k;
+    ggml_type                type_v;
+    int                      n_gpu_layers;
+    llama_split_mode         split_mode;
+    int                      main_gpu;
+    bool                     no_kv_offload;
+    bool                     flash_attn;
+    std::vector<float>       tensor_split;
+    bool                     use_mmap;
+    bool                     embeddings;
+    int                      n_prompt;
+    int                      n_gen;
+    std::string              test_time;
+    std::vector<uint64_t>    samples_ns;
 
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
         model_filename = inst.model;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
-        model_type = buf;
-        model_size = llama_model_size(lmodel);
+        model_type     = buf;
+        model_size     = llama_model_size(lmodel);
         model_n_params = llama_model_n_params(lmodel);
-        n_batch = inst.n_batch;
-        n_threads = inst.n_threads;
-        type_k = inst.type_k;
-        type_v = inst.type_v;
-        n_gpu_layers = inst.n_gpu_layers;
-        split_mode = inst.split_mode;
-        main_gpu = inst.main_gpu;
-        no_kv_offload = inst.no_kv_offload;
-        tensor_split = inst.tensor_split;
-        use_mmap = inst.use_mmap;
-        n_prompt = inst.n_prompt;
-        n_gen = inst.n_gen;
+        n_batch        = inst.n_batch;
+        n_ubatch       = inst.n_ubatch;
+        n_threads      = inst.n_threads;
+        cpu_mask       = inst.cpu_mask;
+        cpu_strict     = inst.cpu_strict;
+        poll           = inst.poll;
+        type_k         = inst.type_k;
+        type_v         = inst.type_v;
+        n_gpu_layers   = inst.n_gpu_layers;
+        split_mode     = inst.split_mode;
+        main_gpu       = inst.main_gpu;
+        no_kv_offload  = inst.no_kv_offload;
+        flash_attn     = inst.flash_attn;
+        tensor_split   = inst.tensor_split;
+        use_mmap       = inst.use_mmap;
+        embeddings     = inst.embeddings;
+        n_prompt       = inst.n_prompt;
+        n_gen          = inst.n_gen;
         // RFC 3339 date-time format
-        time_t t = time(NULL);
+        time_t t       = time(NULL);
         std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
         test_time = buf;
 
         (void) ctx;
     }
 
-    uint64_t avg_ns() const {
-        return ::avg(samples_ns);
-    }
+    uint64_t avg_ns() const { return ::avg(samples_ns); }
 
-    uint64_t stdev_ns() const {
-        return ::stdev(samples_ns);
-    }
+    uint64_t stdev_ns() const { return ::stdev(samples_ns); }
 
     std::vector<double> get_ts() const {
-        int n_tokens = n_prompt + n_gen;
+        int                 n_tokens = n_prompt + n_gen;
         std::vector<double> ts;
-        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
+        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
+                       [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
         return ts;
     }
 
-    double avg_ts() const {
-        return ::avg(get_ts());
-    }
+    double avg_ts() const { return ::avg(get_ts()); }
 
-    double stdev_ts() const {
-        return ::stdev(get_ts());
-    }
+    double stdev_ts() const { return ::stdev(get_ts()); }
 
     static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto *      reg  = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
         }
-        if (opencl) {
-            return "OpenCL";
-        }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
-        }
-
-        return "CPU";
+        return backends.empty() ? "CPU" : join(backends, ",");
     }
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
-            "cpu_info", "gpu_info",
-            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "type_k", "type_v",
-            "n_gpu_layers", "split_mode",
-            "main_gpu", "no_kv_offload",
-            "tensor_split", "use_mmap",
-            "n_prompt", "n_gen", "test_time",
-            "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
+            "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
+            "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
+            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
+            "embeddings",   "n_prompt",     "n_gen",          "test_time",  "avg_ns",       "stddev_ns",
+            "avg_ts",       "stddev_ts",
         };
         return fields;
     }
 
-    enum field_type {STRING, BOOL, INT, FLOAT};
+    enum field_type { STRING, BOOL, INT, FLOAT };
 
     static field_type get_field_type(const std::string & field) {
-        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
-            field == "model_size" || field == "model_n_params" ||
-            field == "n_gpu_layers" || field == "main_gpu" ||
-            field == "n_prompt" || field == "n_gen" ||
-            field == "avg_ns" || field == "stddev_ns") {
+        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
+            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
+            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
+            field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "use_mmap") {
+        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
+            field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -726,7 +997,7 @@ struct test {
 
     std::vector<std::string> get_values() const {
         std::string tensor_split_str;
-        int max_nonzero = 0;
+        int         max_nonzero = 0;
         for (size_t i = 0; i < llama_max_devices(); i++) {
             if (tensor_split[i] > 0) {
                 max_nonzero = i;
@@ -740,43 +1011,53 @@ struct test {
                 tensor_split_str += "/";
             }
         }
-        std::vector<std::string> values = {
-            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
-            cpu_info, gpu_info,
-            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload),
-            tensor_split_str, std::to_string(use_mmap),
-            std::to_string(n_prompt), std::to_string(n_gen), test_time,
-            std::to_string(avg_ns()), std::to_string(stdev_ns()),
-            std::to_string(avg_ts()), std::to_string(stdev_ts())
-        };
+        std::vector<std::string> values = { build_commit,
+                                            std::to_string(build_number),
+                                            cpu_info,
+                                            gpu_info,
+                                            get_backend(),
+                                            model_filename,
+                                            model_type,
+                                            std::to_string(model_size),
+                                            std::to_string(model_n_params),
+                                            std::to_string(n_batch),
+                                            std::to_string(n_ubatch),
+                                            std::to_string(n_threads),
+                                            cpu_mask,
+                                            std::to_string(cpu_strict),
+                                            std::to_string(poll),
+                                            ggml_type_name(type_k),
+                                            ggml_type_name(type_v),
+                                            std::to_string(n_gpu_layers),
+                                            split_mode_str(split_mode),
+                                            std::to_string(main_gpu),
+                                            std::to_string(no_kv_offload),
+                                            std::to_string(flash_attn),
+                                            tensor_split_str,
+                                            std::to_string(use_mmap),
+                                            std::to_string(embeddings),
+                                            std::to_string(n_prompt),
+                                            std::to_string(n_gen),
+                                            test_time,
+                                            std::to_string(avg_ns()),
+                                            std::to_string(stdev_ns()),
+                                            std::to_string(avg_ts()),
+                                            std::to_string(stdev_ts()) };
         return values;
     }
 
     std::map<std::string, std::string> get_map() const {
         std::map<std::string, std::string> map;
-        auto fields = get_fields();
-        auto values = get_values();
-        std::transform(fields.begin(), fields.end(), values.begin(),
-                std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
+        auto                               fields = get_fields();
+        auto                               values = get_values();
+        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
+                       std::make_pair<const std::string &, const std::string &>);
         return map;
     }
 };
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cublas();
-const bool        test::opencl       = !!ggml_cpu_has_clblast();
-const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
-const bool        test::kompute      = !!ggml_cpu_has_kompute();
-const bool        test::metal        = !!ggml_cpu_has_metal();
-const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
-const bool        test::blas         = !!ggml_cpu_has_blas();
-const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 
@@ -784,9 +1065,12 @@ struct printer {
     virtual ~printer() {}
 
     FILE * fout;
+
     virtual void print_header(const cmd_params & params) { (void) params; }
+
     virtual void print_test(const test & t) = 0;
-    virtual void print_footer() { }
+
+    virtual void print_footer() {}
 };
 
 struct csv_printer : public printer {
@@ -802,7 +1086,7 @@ struct csv_printer : public printer {
         return escaped;
     }
 
-    void print_header(const cmd_params & params) override  {
+    void print_header(const cmd_params & params) override {
         std::vector<std::string> fields = test::get_fields();
         fprintf(fout, "%s\n", join(fields, ",").c_str());
         (void) params;
@@ -815,38 +1099,38 @@ struct csv_printer : public printer {
     }
 };
 
+static std::string escape_json(const std::string & value) {
+    std::string escaped;
+    for (auto c : value) {
+        if (c == '"') {
+            escaped += "\\\"";
+        } else if (c == '\\') {
+            escaped += "\\\\";
+        } else if (c <= 0x1f) {
+            char buf[8];
+            snprintf(buf, sizeof(buf), "\\u%04x", c);
+            escaped += buf;
+        } else {
+            escaped += c;
+        }
+    }
+    return escaped;
+}
+
+static std::string format_json_value(const std::string & field, const std::string & value) {
+    switch (test::get_field_type(field)) {
+        case test::STRING:
+            return "\"" + escape_json(value) + "\"";
+        case test::BOOL:
+            return value == "0" ? "false" : "true";
+        default:
+            return value;
+    }
+}
+
 struct json_printer : public printer {
     bool first = true;
 
-    static std::string escape_json(const std::string & value) {
-        std::string escaped;
-        for (auto c : value) {
-            if (c == '"') {
-                escaped += "\\\"";
-            } else if (c == '\\') {
-                escaped += "\\\\";
-            } else  if (c <= 0x1f) {
-                char buf[8];
-                snprintf(buf, sizeof(buf), "\\u%04x", c);
-                escaped += buf;
-            } else {
-                escaped += c;
-            }
-        }
-        return escaped;
-    }
-
-    static std::string format_value(const std::string & field, const std::string & value) {
-        switch (test::get_field_type(field)) {
-            case test::STRING:
-                return "\"" + escape_json(value) + "\"";
-            case test::BOOL:
-                return value == "0" ? "false" : "true";
-            default:
-                return value;
-        }
-    }
-
     void print_header(const cmd_params & params) override {
         fprintf(fout, "[\n");
         (void) params;
@@ -855,7 +1139,8 @@ struct json_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(),
+                    format_json_value(fields.at(i), values.at(i)).c_str());
         }
     }
 
@@ -873,8 +1158,24 @@ struct json_printer : public printer {
         fflush(fout);
     }
 
-    void print_footer() override {
-        fprintf(fout, "\n]\n");
+    void print_footer() override { fprintf(fout, "\n]\n"); }
+};
+
+struct jsonl_printer : public printer {
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        fprintf(fout, "{");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "}\n");
+        fflush(fout);
     }
 };
 
@@ -886,7 +1187,7 @@ struct markdown_printer : public printer {
             return -30;
         }
         if (field == "t/s") {
-            return 16;
+            return 20;
         }
         if (field == "size" || field == "params") {
             return 10;
@@ -894,8 +1195,32 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "n_threads") {
+            return 7;
+        }
+        if (field == "n_batch") {
+            return 7;
+        }
+        if (field == "n_ubatch") {
+            return 8;
+        }
+        if (field == "type_k" || field == "type_v") {
+            return 6;
+        }
+        if (field == "split_mode") {
+            return 5;
+        }
+        if (field == "flash_attn") {
+            return 2;
+        }
+        if (field == "use_mmap") {
+            return 4;
+        }
+        if (field == "test") {
+            return 13;
+        }
 
-        int width = std::max((int)field.length(), 10);
+        int width = std::max((int) field.length(), 10);
 
         if (test::get_field_type(field) == test::STRING) {
             return -width;
@@ -916,9 +1241,15 @@ struct markdown_printer : public printer {
         if (field == "no_kv_offload") {
             return "nkvo";
         }
+        if (field == "flash_attn") {
+            return "fa";
+        }
         if (field == "use_mmap") {
             return "mmap";
         }
+        if (field == "embeddings") {
+            return "embd";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -931,16 +1262,29 @@ struct markdown_printer : public printer {
         fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
-        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
+                              test::get_backend().find("BLAS") != std::string::npos;
         if (!is_cpu_backend) {
             fields.emplace_back("n_gpu_layers");
         }
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.emplace_back("n_threads");
         }
+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+            fields.emplace_back("cpu_mask");
+        }
+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+            fields.emplace_back("cpu_strict");
+        }
+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+            fields.emplace_back("poll");
+        }
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.emplace_back("n_batch");
         }
+        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
+            fields.emplace_back("n_ubatch");
+        }
         if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
             fields.emplace_back("type_k");
         }
@@ -956,12 +1300,18 @@ struct markdown_printer : public printer {
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
             fields.emplace_back("no_kv_offload");
         }
+        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
+            fields.emplace_back("flash_attn");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
+        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
+            fields.emplace_back("embeddings");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
 
@@ -984,18 +1334,18 @@ struct markdown_printer : public printer {
         fprintf(fout, "|");
         for (const auto & field : fields) {
             std::string value;
-            char buf[128];
+            char        buf[128];
             if (field == "model") {
                 value = t.model_type;
             } else if (field == "size") {
-                if (t.model_size < 1024*1024*1024) {
+                if (t.model_size < 1024 * 1024 * 1024) {
                     snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                 } else {
                     snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                 }
                 value = buf;
             } else if (field == "params") {
-                if (t.model_n_params < 1000*1000*1000) {
+                if (t.model_n_params < 1000 * 1000 * 1000) {
                     snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                 } else {
                     snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
@@ -1005,12 +1355,11 @@ struct markdown_printer : public printer {
                 value = test::get_backend();
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                 } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                 } else {
-                    assert(false);
-                    exit(1);
+                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                 }
                 value = buf;
             } else if (field == "t/s") {
@@ -1058,7 +1407,8 @@ struct sql_printer : public printer {
         std::vector<std::string> fields = test::get_fields();
         fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),  i < fields.size() - 1 ? "," : "");
+            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
+                    i < fields.size() - 1 ? "," : "");
         }
         fprintf(fout, ");\n");
         fprintf(fout, "\n");
@@ -1076,26 +1426,43 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
-    int n_processed = 0;
-
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
+    const llama_model * model   = llama_get_model(ctx);
+    const llama_vocab * vocab   = llama_model_get_vocab(model);
+    const int32_t       n_vocab = llama_vocab_n_tokens(vocab);
+
+    std::vector<llama_token> tokens(n_batch);
+
+    int n_processed = 0;
+
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        tokens[0]    = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
+        for (int i = 1; i < n_tokens; i++) {
+            tokens[i] = std::rand() % n_vocab;
+        }
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }
+
+    llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos(llama_get_model(ctx));
-
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
+    const llama_model * model   = llama_get_model(ctx);
+    const llama_vocab * vocab   = llama_model_get_vocab(model);
+    const int32_t       n_vocab = llama_vocab_n_tokens(vocab);
+
+    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
+
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
+        llama_synchronize(ctx);
+        token = std::rand() % n_vocab;
     }
 }
 
@@ -1105,6 +1472,24 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
     (void) user_data;
 }
 
+static std::unique_ptr<printer> create_printer(output_formats format) {
+    switch (format) {
+        case NONE:
+            return nullptr;
+        case CSV:
+            return std::unique_ptr<printer>(new csv_printer());
+        case JSON:
+            return std::unique_ptr<printer>(new json_printer());
+        case JSONL:
+            return std::unique_ptr<printer>(new jsonl_printer());
+        case MARKDOWN:
+            return std::unique_ptr<printer>(new markdown_printer());
+        case SQL:
+            return std::unique_ptr<printer>(new sql_printer());
+    }
+    GGML_ABORT("fatal error");
+}
+
 int main(int argc, char ** argv) {
     // try to set locale for unicode characters in markdown
     setlocale(LC_CTYPE, ".UTF-8");
@@ -1123,47 +1508,59 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
     }
     llama_backend_init();
+    llama_numa_init(params.numa);
+
+    set_process_priority(params.prio);
 
     // initialize printer
-    std::unique_ptr<printer> p;
-    switch (params.output_format) {
-        case CSV:
-            p.reset(new csv_printer());
-            break;
-        case JSON:
-            p.reset(new json_printer());
-            break;
-        case MARKDOWN:
-            p.reset(new markdown_printer());
-            break;
-        case SQL:
-            p.reset(new sql_printer());
-            break;
-        default:
-            assert(false);
-            exit(1);
+    std::unique_ptr<printer> p     = create_printer(params.output_format);
+    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+
+    if (p) {
+        p->fout = stdout;
+        p->print_header(params);
+    }
+
+    if (p_err) {
+        p_err->fout = stderr;
+        p_err->print_header(params);
     }
-    p->fout = stdout;
-    p->print_header(params);
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
-    llama_model * lmodel = nullptr;
+    llama_model *               lmodel    = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    int  params_idx   = 0;
+    auto params_count = params_instances.size();
     for (const auto & inst : params_instances) {
+        params_idx++;
+        if (params.progress) {
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
+        }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
             if (lmodel) {
-                llama_free_model(lmodel);
+                llama_model_free(lmodel);
             }
 
-            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
             if (lmodel == NULL) {
                 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                 return 1;
@@ -1171,10 +1568,10 @@ int main(int argc, char ** argv) {
             prev_inst = &inst;
         }
 
-        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
+        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_free_model(lmodel);
+            llama_model_free(lmodel);
             return 1;
         }
 
@@ -1182,38 +1579,93 @@ int main(int argc, char ** argv) {
 
         llama_kv_cache_clear(ctx);
 
+        // cool off before the test
+        if (params.delay) {
+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+        }
+
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            exit(1);
+        }
+        tpp.strict_cpu = t.cpu_strict;
+        tpp.poll       = t.poll;
+        tpp.prio       = params.prio;
+
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
+        if (!threadpool) {
+            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            exit(1);
+        }
+
+        llama_attach_threadpool(ctx, threadpool, NULL);
+
         // warmup run
         if (t.n_prompt > 0) {
-            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+            if (params.progress) {
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
+            }
+            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
-            test_gen(ctx, 1, 0, t.n_threads);
+            if (params.progress) {
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
+            }
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
             llama_kv_cache_clear(ctx);
 
             uint64_t t_start = get_time_ns();
+
             if (t.n_prompt > 0) {
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
+                }
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
+                            i + 1, params.reps);
+                }
+                test_gen(ctx, t.n_gen, t.n_threads);
             }
+
             uint64_t t_ns = get_time_ns() - t_start;
             t.samples_ns.push_back(t_ns);
         }
 
-        p->print_test(t);
+        if (p) {
+            p->print_test(t);
+            fflush(p->fout);
+        }
 
-        llama_print_timings(ctx);
+        if (p_err) {
+            p_err->print_test(t);
+            fflush(p_err->fout);
+        }
+
+        llama_perf_context_print(ctx);
 
         llama_free(ctx);
+
+        ggml_threadpool_free_fn(threadpool);
     }
 
-    llama_free_model(lmodel);
+    llama_model_free(lmodel);
 
-    p->print_footer();
+    if (p) {
+        p->print_footer();
+    }
+
+    if (p_err) {
+        p_err->print_footer();
+    }
 
     llama_backend_free();
 
diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts
index d42140efe..8d1b37195 100644
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -7,8 +7,6 @@ android {
     namespace = "com.example.llama"
     compileSdk = 34
 
-    ndkVersion = "26.1.10909125"
-
     defaultConfig {
         applicationId = "com.example.llama"
         minSdk = 33
@@ -20,17 +18,6 @@ android {
         vectorDrawables {
             useSupportLibrary = true
         }
-        ndk {
-            // Add NDK properties if wanted, e.g.
-            // abiFilters += listOf("arm64-v8a")
-        }
-        externalNativeBuild {
-            cmake {
-                arguments += "-DCMAKE_BUILD_TYPE=Release"
-                cppFlags += listOf()
-                arguments += listOf()
-            }
-        }
     }
 
     buildTypes {
@@ -55,17 +42,6 @@ android {
     composeOptions {
         kotlinCompilerExtensionVersion = "1.5.1"
     }
-    packaging {
-        resources {
-            excludes += "/META-INF/{AL2.0,LGPL2.1}"
-        }
-    }
-    externalNativeBuild {
-        cmake {
-            path = file("src/main/cpp/CMakeLists.txt")
-            version = "3.22.1"
-        }
-    }
 }
 
 dependencies {
@@ -78,6 +54,7 @@ dependencies {
     implementation("androidx.compose.ui:ui-graphics")
     implementation("androidx.compose.ui:ui-tooling-preview")
     implementation("androidx.compose.material3:material3")
+    implementation(project(":llama"))
     testImplementation("junit:junit:4.13.2")
     androidTestImplementation("androidx.test.ext:junit:1.1.5")
     androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
index be95e2221..45ac29938 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,5 +1,6 @@
 package com.example.llama
 
+import android.llama.cpp.LLamaAndroid
 import android.util.Log
 import androidx.compose.runtime.getValue
 import androidx.compose.runtime.mutableStateOf
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.launch
 
-class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
+class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
     companion object {
         @JvmStatic
         private val NanosPerSecond = 1_000_000_000.0
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
 
         viewModelScope.launch {
             try {
-                llm.unload()
+                llamaAndroid.unload()
             } catch (exc: IllegalStateException) {
                 messages += exc.message!!
             }
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
         messages += ""
 
         viewModelScope.launch {
-            llm.send(text)
+            llamaAndroid.send(text)
                 .catch {
                     Log.e(tag, "send() failed", it)
                     messages += it.message!!
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
         viewModelScope.launch {
             try {
                 val start = System.nanoTime()
-                val warmupResult = llm.bench(pp, tg, pl, nr)
+                val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
                 val end = System.nanoTime()
 
                 messages += warmupResult
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
                     return@launch
                 }
 
-                messages += llm.bench(512, 128, 1, 3)
+                messages += llamaAndroid.bench(512, 128, 1, 3)
             } catch (exc: IllegalStateException) {
                 Log.e(tag, "bench() failed", exc)
                 messages += exc.message!!
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
     fun load(pathToModel: String) {
         viewModelScope.launch {
             try {
-                llm.load(pathToModel)
+                llamaAndroid.load(pathToModel)
                 messages += "Loaded $pathToModel"
             } catch (exc: IllegalStateException) {
                 Log.e(tag, "load() failed", exc)
diff --git a/examples/llama.android/build.gradle.kts b/examples/llama.android/build.gradle.kts
index 50ebc8211..acd1ada7d 100644
--- a/examples/llama.android/build.gradle.kts
+++ b/examples/llama.android/build.gradle.kts
@@ -2,4 +2,5 @@
 plugins {
     id("com.android.application") version "8.2.0" apply false
     id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+    id("com.android.library") version "8.2.0" apply false
 }
diff --git a/examples/llama.android/llama/.gitignore b/examples/llama.android/llama/.gitignore
new file mode 100644
index 000000000..796b96d1c
--- /dev/null
+++ b/examples/llama.android/llama/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts
new file mode 100644
index 000000000..28dbc1904
--- /dev/null
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -0,0 +1,70 @@
+plugins {
+    id("com.android.library")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "android.llama.cpp"
+    compileSdk = 34
+
+    defaultConfig {
+        minSdk = 33
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        consumerProguardFiles("consumer-rules.pro")
+        ndk {
+            // Add NDK properties if wanted, e.g.
+            // abiFilters += listOf("arm64-v8a")
+        }
+        externalNativeBuild {
+            cmake {
+                arguments += "-DLLAMA_BUILD_COMMON=ON"
+                arguments += "-DGGML_LLAMAFILE=OFF"
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
+                cppFlags += listOf()
+                arguments += listOf()
+
+                cppFlags("")
+            }
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.appcompat:appcompat:1.6.1")
+    implementation("com.google.android.material:material:1.11.0")
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+}
diff --git a/examples/llama.android/llama/consumer-rules.pro b/examples/llama.android/llama/consumer-rules.pro
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/llama.android/llama/proguard-rules.pro b/examples/llama.android/llama/proguard-rules.pro
new file mode 100644
index 000000000..f1b424510
--- /dev/null
+++ b/examples/llama.android/llama/proguard-rules.pro
@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
diff --git a/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
new file mode 100644
index 000000000..05d6ab5d2
--- /dev/null
+++ b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
@@ -0,0 +1,24 @@
+package android.llama.cpp
+
+import androidx.test.platform.app.InstrumentationRegistry
+import androidx.test.ext.junit.runners.AndroidJUnit4
+
+import org.junit.Test
+import org.junit.runner.RunWith
+
+import org.junit.Assert.*
+
+/**
+ * Instrumented test, which will execute on an Android device.
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+@RunWith(AndroidJUnit4::class)
+class ExampleInstrumentedTest {
+    @Test
+    fun useAppContext() {
+        // Context of the app under test.
+        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
+        assertEquals("android.llama.cpp.test", appContext.packageName)
+    }
+}
diff --git a/examples/llama.android/llama/src/main/AndroidManifest.xml b/examples/llama.android/llama/src/main/AndroidManifest.xml
new file mode 100644
index 000000000..8bdb7e14b
--- /dev/null
+++ b/examples/llama.android/llama/src/main/AndroidManifest.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+</manifest>
diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
similarity index 78%
rename from examples/llama.android/app/src/main/cpp/CMakeLists.txt
rename to examples/llama.android/llama/src/main/cpp/CMakeLists.txt
index 85139329a..2de496574 100644
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 # For more information about using CMake with Android Studio, read the
 # documentation: https://d.android.com/studio/projects/add-native-code.html.
 # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
@@ -12,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -31,20 +30,24 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-    llama-android.cpp)
+        # List C/C++ source files with relative paths to this CMakeLists.txt.
+        llama-android.cpp)
 
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
 # build script, prebuilt third-party libraries, or Android system libraries.
 target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
+        # List libraries link to the target library
+        llama
+        common
+        android
+        log)
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
similarity index 68%
rename from examples/llama.android/app/src/main/cpp/llama-android.cpp
rename to examples/llama.android/llama/src/main/cpp/llama-android.cpp
index 2beb1e0d5..2a73983a9 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -5,7 +5,7 @@
 #include <string>
 #include <unistd.h>
 #include "llama.h"
-#include "common/common.h"
+#include "common.h"
 
 // Write C++ code here.
 //
@@ -33,6 +33,45 @@ jclass la_int_var;
 jmethodID la_int_var_value;
 jmethodID la_int_var_inc;
 
+std::string cached_token_chars;
+
+bool is_valid_utf8(const char * string) {
+    if (!string) {
+        return true;
+    }
+
+    const unsigned char * bytes = (const unsigned char *)string;
+    int num;
+
+    while (*bytes != 0x00) {
+        if ((*bytes & 0x80) == 0x00) {
+            // U+0000 to U+007F
+            num = 1;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // U+0080 to U+07FF
+            num = 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // U+0800 to U+FFFF
+            num = 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // U+10000 to U+10FFFF
+            num = 4;
+        } else {
+            return false;
+        }
+
+        bytes += 1;
+        for (int i = 1; i < num; ++i) {
+            if ((*bytes & 0xC0) != 0x80) {
+                return false;
+            }
+            bytes += 1;
+        }
+    }
+
+    return true;
+}
+
 static void log_callback(ggml_log_level level, const char * fmt, void * data) {
     if (level == GGML_LOG_LEVEL_ERROR)     __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
     else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
@@ -42,13 +81,13 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
     llama_model_params model_params = llama_model_default_params();
 
     auto path_to_model = env->GetStringUTFChars(filename, 0);
     LOGi("Loading model from %s", path_to_model);
 
-    auto model = llama_load_model_from_file(path_to_model, model_params);
+    auto model = llama_model_load_from_file(path_to_model, model_params);
     env->ReleaseStringUTFChars(filename, path_to_model);
 
     if (!model) {
@@ -62,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
-    llama_free_model(reinterpret_cast<llama_model *>(model));
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
+    llama_model_free(reinterpret_cast<llama_model *>(model));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
     auto model = reinterpret_cast<llama_model *>(jmodel);
 
     if (!model) {
@@ -81,8 +120,8 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
     LOGi("Using %d threads", n_threads);
 
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
+
+    ctx_params.n_ctx           = 2048;
     ctx_params.n_threads       = n_threads;
     ctx_params.n_threads_batch = n_threads;
 
@@ -100,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
     llama_free(reinterpret_cast<llama_context *>(context));
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
     llama_backend_free();
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
     llama_log_set(log_callback, NULL);
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -147,11 +186,11 @@ Java_com_example_llama_Llm_bench_1model(
     for (nri = 0; nri < nr; nri++) {
         LOGi("Benchmark prompt processing (pp)");
 
-        llama_batch_clear(*batch);
+        common_batch_clear(*batch);
 
         const int n_tokens = pp;
         for (i = 0; i < n_tokens; i++) {
-            llama_batch_add(*batch, 0, i, { 0 }, false);
+            common_batch_add(*batch, 0, i, { 0 }, false);
         }
 
         batch->logits[batch->n_tokens - 1] = true;
@@ -171,9 +210,9 @@ Java_com_example_llama_Llm_bench_1model(
         const auto t_tg_start = ggml_time_us();
         for (i = 0; i < tg; i++) {
 
-            llama_batch_clear(*batch);
+            common_batch_clear(*batch);
             for (j = 0; j < pl; j++) {
-                llama_batch_add(*batch, 0, i, { j }, true);
+                common_batch_add(*batch, 0, i, { j }, true);
             }
 
             LOGi("llama_decode() text generation: %d", i);
@@ -230,15 +269,9 @@ Java_com_example_llama_Llm_bench_1model(
     return env->NewStringUTF(result.str().c_str());
 }
 
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
 
     // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
 
@@ -250,9 +283,6 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
         nullptr,
         nullptr,
         nullptr,
-        0,
-        0,
-        0,
     };
 
     if (embd) {
@@ -274,32 +304,61 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    //llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    delete batch;
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = true;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    return reinterpret_cast<jlong>(smpl);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
+    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
     llama_backend_init();
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
     return env->NewStringUTF(llama_print_system_info());
 }
 
 extern "C"
 JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
         jlong batch_pointer,
         jstring jtext,
+        jboolean format_chat,
         jint n_len
     ) {
 
+    cached_token_chars.clear();
+
     const auto text = env->GetStringUTFChars(jtext, 0);
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
 
-    const auto tokens_list = llama_tokenize(context, text, 1);
+    bool parse_special = (format_chat == JNI_TRUE);
+    const auto tokens_list = common_tokenize(context, text, true, parse_special);
 
     auto n_ctx = llama_n_ctx(context);
     auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -311,14 +370,14 @@ Java_com_example_llama_Llm_completion_1init(
     }
 
     for (auto id : tokens_list) {
-        LOGi("%s", llama_token_to_piece(context, id).c_str());
+        LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
     }
 
-    llama_batch_clear(*batch);
+    common_batch_clear(*batch);
 
     // evaluate the initial prompt
     for (auto i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
+        common_batch_add(*batch, tokens_list[i], i, { 0 }, false);
     }
 
     // llama_decode will output logits only for the last token of the prompt
@@ -335,48 +394,47 @@ Java_com_example_llama_Llm_completion_1init(
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
         jlong batch_pointer,
+        jlong sampler_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto batch   = reinterpret_cast<llama_batch   *>(batch_pointer);
+    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
     const auto model = llama_get_model(context);
+    const auto vocab = llama_model_get_vocab(model);
 
     if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
     if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
 
-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
     // sample the most likely token
-    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+    const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
-        return env->NewStringUTF("");
+    if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
+        return nullptr;
     }
 
-    auto new_token_chars = llama_token_to_piece(context, new_token_id);
-    LOGi("new_token_chars: `%s`", new_token_chars.c_str());
-    auto new_token = env->NewStringUTF(new_token_chars.c_str());
+    auto new_token_chars = common_token_to_piece(context, new_token_id);
+    cached_token_chars += new_token_chars;
 
-    llama_batch_clear(*batch);
-    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
+    jstring new_token = nullptr;
+    if (is_valid_utf8(cached_token_chars.c_str())) {
+        new_token = env->NewStringUTF(cached_token_chars.c_str());
+        LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
+        cached_token_chars.clear();
+    } else {
+        new_token = env->NewStringUTF("");
+    }
+
+    common_batch_clear(*batch);
+    common_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
 
     env->CallVoidMethod(intvar_ncur, la_int_var_inc);
 
@@ -389,6 +447,6 @@ Java_com_example_llama_Llm_completion_1loop(
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
     llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
similarity index 86%
rename from examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
rename to examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
index 5f3270372..b964d93e3 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -1,4 +1,4 @@
-package com.example.llama
+package android.llama.cpp
 
 import android.util.Log
 import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
 import java.util.concurrent.Executors
 import kotlin.concurrent.thread
 
-class Llm {
+class LLamaAndroid {
     private val tag: String? = this::class.simpleName
 
     private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -45,8 +45,10 @@ class Llm {
     private external fun free_context(context: Long)
     private external fun backend_init(numa: Boolean)
     private external fun backend_free()
-    private external fun free_batch(batch: Long)
     private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+    private external fun free_batch(batch: Long)
+    private external fun new_sampler(): Long
+    private external fun free_sampler(sampler: Long)
     private external fun bench_model(
         context: Long,
         model: Long,
@@ -63,15 +65,17 @@ class Llm {
         context: Long,
         batch: Long,
         text: String,
+        formatChat: Boolean,
         nLen: Int
     ): Int
 
     private external fun completion_loop(
         context: Long,
         batch: Long,
+        sampler: Long,
         nLen: Int,
         ncur: IntVar
-    ): String
+    ): String?
 
     private external fun kv_cache_clear(context: Long)
 
@@ -101,21 +105,24 @@ class Llm {
                     val batch = new_batch(512, 0, 1)
                     if (batch == 0L) throw IllegalStateException("new_batch() failed")
 
+                    val sampler = new_sampler()
+                    if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
+
                     Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch))
+                    threadLocalState.set(State.Loaded(model, context, batch, sampler))
                 }
                 else -> throw IllegalStateException("Model already loaded")
             }
         }
     }
 
-    fun send(message: String): Flow<String> = flow {
+    fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
         when (val state = threadLocalState.get()) {
             is State.Loaded -> {
-                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
+                val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
                 while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, nlen, ncur)
-                    if (str.isEmpty()) {
+                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
+                    if (str == null) {
                         break
                     }
                     emit(str)
@@ -138,6 +145,7 @@ class Llm {
                     free_context(state.context)
                     free_model(state.model)
                     free_batch(state.batch)
+                    free_sampler(state.sampler);
 
                     threadLocalState.set(State.Idle)
                 }
@@ -161,12 +169,12 @@ class Llm {
 
         private sealed interface State {
             data object Idle: State
-            data class Loaded(val model: Long, val context: Long, val batch: Long): State
+            data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
         }
 
         // Enforce only one instance of Llm.
-        private val _instance: Llm = Llm()
+        private val _instance: LLamaAndroid = LLamaAndroid()
 
-        fun instance(): Llm = _instance
+        fun instance(): LLamaAndroid = _instance
     }
 }
diff --git a/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
new file mode 100644
index 000000000..cbbb974d3
--- /dev/null
+++ b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
@@ -0,0 +1,17 @@
+package android.llama.cpp
+
+import org.junit.Test
+
+import org.junit.Assert.*
+
+/**
+ * Example local unit test, which will execute on the development machine (host).
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+class ExampleUnitTest {
+    @Test
+    fun addition_isCorrect() {
+        assertEquals(4, 2 + 2)
+    }
+}
diff --git a/examples/llama.android/settings.gradle.kts b/examples/llama.android/settings.gradle.kts
index 2ba32c4fa..c7c1a034a 100644
--- a/examples/llama.android/settings.gradle.kts
+++ b/examples/llama.android/settings.gradle.kts
@@ -15,3 +15,4 @@ dependencyResolutionManagement {
 
 rootProject.name = "LlamaAndroid"
 include(":app")
+include(":llama")
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 58fcf40c6..ee7141a66 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,13 +24,16 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
     private var model: OpaquePointer
     private var context: OpaquePointer
+    private var vocab: OpaquePointer
+    private var sampling: UnsafeMutablePointer<llama_sampler>
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false
 
     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]
 
-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0
 
     var n_decode: Int32 = 0
@@ -41,12 +44,18 @@ actor LlamaContext {
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
         self.temporary_invalid_cchars = []
+        let sparams = llama_sampler_chain_default_params()
+        self.sampling = llama_sampler_chain_init(sparams)
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
+        vocab = llama_model_get_vocab(model)
     }
 
     deinit {
+        llama_sampler_free(sampling)
         llama_batch_free(batch)
+        llama_model_free(model)
         llama_free(context)
-        llama_free_model(model)
         llama_backend_free()
     }
 
@@ -58,7 +67,7 @@ actor LlamaContext {
         model_params.n_gpu_layers = 0
         print("Running on simulator, force use n_gpu_layers = 0")
 #endif
-        let model = llama_load_model_from_file(path, model_params)
+        let model = llama_model_load_from_file(path, model_params)
         guard let model else {
             print("Could not load model at \(path)")
             throw LlamaError.couldNotInitializeContext
@@ -68,12 +77,11 @@ actor LlamaContext {
         print("Using \(n_threads) threads")
 
         var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
         ctx_params.n_ctx = 2048
-        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads       = Int32(n_threads)
+        ctx_params.n_threads_batch = Int32(n_threads)
 
-        let context = llama_new_context_with_model(model, ctx_params)
+        let context = llama_init_from_model(model, ctx_params)
         guard let context else {
             print("Could not load context!")
             throw LlamaError.couldNotInitializeContext
@@ -143,23 +151,11 @@ actor LlamaContext {
     func completion_loop() -> String {
         var new_token_id: llama_token = 0
 
-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
+        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
 
-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
-
-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sample_token_greedy(context, &candidates_p)
-        }
-
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str
@@ -216,19 +212,20 @@ actor LlamaContext {
 
             llama_kv_cache_clear(context)
 
-            let t_pp_start = ggml_time_us()
+            let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
             if llama_decode(context, batch) != 0 {
                 print("llama_decode() failed during prompt")
             }
+            llama_synchronize(context)
 
-            let t_pp_end = ggml_time_us()
+            let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
             // bench text generation
 
             llama_kv_cache_clear(context)
 
-            let t_tg_start = ggml_time_us()
+            let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
             for i in 0..<tg {
                 llama_batch_clear(&batch)
@@ -240,9 +237,10 @@ actor LlamaContext {
                 if llama_decode(context, batch) != 0 {
                     print("llama_decode() failed during text generation")
                 }
+                llama_synchronize(context)
             }
 
-            let t_tg_end = ggml_time_us()
+            let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
             llama_kv_cache_clear(context)
 
@@ -301,7 +299,7 @@ actor LlamaContext {
         let utf8Count = text.utf8.count
         let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
 
         var swiftTokens: [llama_token] = []
         for i in 0..<tokenCount {
@@ -320,7 +318,7 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
+        let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -328,7 +326,7 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
index 3950b9e9d..ff3d108b2 100644
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -7,6 +7,7 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
 		79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
 		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@@ -17,7 +18,6 @@
 		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
 		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
-		DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
 		F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
 /* End PBXBuildFile section */
 
@@ -42,7 +42,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				DF810E132B4A5BA200301144 /* llama in Frameworks */,
+				1809696D2D05A39F00400EE8 /* llama in Frameworks */,
 				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
 				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
 			);
@@ -151,7 +151,7 @@
 			);
 			name = llama.swiftui;
 			packageProductDependencies = (
-				DF810E122B4A5BA200301144 /* llama */,
+				1809696C2D05A39F00400EE8 /* llama */,
 			);
 			productName = llama.swiftui;
 			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@@ -429,7 +429,7 @@
 /* End XCConfigurationList section */
 
 /* Begin XCSwiftPackageProductDependency section */
-		DF810E122B4A5BA200301144 /* llama */ = {
+		1809696C2D05A39F00400EE8 /* llama */ = {
 			isa = XCSwiftPackageProductDependency;
 			productName = llama;
 		};
diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
index 5bde18917..b8f6a31d5 100644
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {
 
         messageLog += "\(text)"
 
-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
+        Task.detached {
+            while await !llamaContext.is_done {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }
+
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
         }
-
-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
-
-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
     }
 
     func bench() async {
diff --git a/examples/llama.vim b/examples/llama.vim
index 1b5ad6ba0..57eb2a977 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -1,135 +1,783 @@
-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
+" LLM-based text completion using llama.cpp
 "
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
+" requires:
+"
+"   - neovim or vim
+"   - curl
+"   - llama.cpp server instance
+"   - FIM-compatible model
+"
+" sample config:
+"
+"   - Tab       - accept the current suggestion
+"   - Shift+Tab - accept just the first line of the suggestion
+"   - Ctrl+F    - toggle FIM completion manually
+"
+" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
+"
+" start the llama.cpp server with a FIM-compatible model. for example:
+"
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"
+"   --batch-size [512, model max context]
+"
+"     adjust the batch size to control how much of the provided local context will be used during the inference
+"     lower values will use smaller part of the context around the cursor, which will result in faster processing
+"
+"   --ubatch-size [64, 2048]
+"
+"     chunks the batch into smaller chunks for faster processing
+"     depends on the specific hardware. use llama-bench to profile and determine the best size
+"
+"   --cache-reuse (ge:llama_config.n_predict, 1024]
+"
+"     this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
+"     using non-zero value enables context reuse on the server side which dramatically improves the performance at
+"     large contexts. a value of 256 should be good for all cases
+"
+" run this once to initialise llama.vim:
+"
+"   :call llama#init()
+"
+" more info: https://github.com/ggerganov/llama.cpp/pull/9787
 "
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-    let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-   let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}
 
-func s:callbackHandler(bufn, channel, msg)
-   if len(a:msg) < 3
-      return
-   elseif a:msg[0] == "d"
-      let l:msg = a:msg[6:-1]
-   else
-      let l:msg = a:msg
-   endif
-   let l:decoded_msg = json_decode(l:msg)
-   let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-   if len(l:newtext) > 0
-      call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-   else
-      echo "nothing genned"
-   endif
-   if len(newtext) > 1
-      let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-      let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-   endif
-   if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-       echo "Finished generation"
-   endif
-endfunction
+" colors (adjust to your liking)
+highlight llama_hl_hint guifg=#ff772f ctermfg=202
+highlight llama_hl_info guifg=#77ff2f ctermfg=119
 
-func llama#doLlamaGen()
-   if exists("b:job")
-      if job_status(b:job) == "run"
-         call job_stop(b:job)
-         return
-      endif
-   endif
+" general parameters:
+"
+"   endpoint:         llama.cpp server endpoint
+"   n_prefix:         number of lines before the cursor location to include in the local prefix
+"   n_suffix:         number of lines after  the cursor location to include in the local suffix
+"   n_predict:        max number of tokens to predict
+"   t_max_prompt_ms:  max alloted time for the prompt processing (TODO: not yet supported)
+"   t_max_predict_ms: max alloted time for the prediction
+"   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
+"   auto_fim:         trigger FIM completion automatically on cursor movement
+"   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
+"
+" ring buffer of chunks, accumulated with time upon:
+"
+"  - completion request
+"  - yank
+"  - entering a buffer
+"  - leaving a buffer
+"  - writing a file
+"
+" parameters for the ring-buffer with extra context:
+"
+"   ring_n_chunks:    max number of chunks to pass as extra context to the server (0 to disable)
+"   ring_chunk_size:  max size of the chunks (in number of lines)
+"                     note: adjust these numbers so that you don't overrun your context
+"                           at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
+"   ring_scope:       the range around the cursor position (in number of lines) for gathering chunks after FIM
+"   ring_update_ms:   how often to process queued chunks in normal mode
+"
+let s:default_config = {
+    \ 'endpoint':         'http://127.0.0.1:8012/infill',
+    \ 'n_prefix':         256,
+    \ 'n_suffix':         64,
+    \ 'n_predict':        128,
+    \ 't_max_prompt_ms':  500,
+    \ 't_max_predict_ms': 3000,
+    \ 'show_info':        2,
+    \ 'auto_fim':         v:true,
+    \ 'max_line_suffix':  8,
+    \ 'ring_n_chunks':    64,
+    \ 'ring_chunk_size':  64,
+    \ 'ring_scope':       1024,
+    \ 'ring_update_ms':   1000,
+    \ }
 
-   let l:cbuffer = bufnr("%")
-   let s:linedict[l:cbuffer] = line('$')
-   let l:buflines = getbufline(l:cbuffer, 1, 1000)
-   let l:querydata = copy(s:querydata)
-   call extend(l:querydata, g:llama_overrides)
-   if exists("w:llama_overrides")
-      call extend(l:querydata, w:llama_overrides)
-   endif
-   if exists("b:llama_overrides")
-      call extend(l:querydata, b:llama_overrides)
-   endif
-   if l:buflines[0][0:1] == '!*'
-      let l:userdata = json_decode(l:buflines[0][2:-1])
-      call extend(l:querydata, l:userdata)
-      let l:buflines = l:buflines[1:-1]
-   endif
-   let l:querydata.prompt = join(l:buflines, "\n")
-   let l:curlcommand = copy(s:curlcommand)
-   if exists("g:llama_api_key")
-       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-   endif
-   let l:curlcommand[2] = json_encode(l:querydata)
-   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
+let g:llama_config = get(g:, 'llama_config', s:default_config)
 
-" Echos the tokkenization of the provided string , or cursor to end of word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-    if (a:0 > 0)
-        let l:input = a:1
-    else
-        exe "normal \"*ye"
-        let l:input = @*
-    endif
-    let l:querydata = {"content": l:input}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
-
-func s:tokenizeWordCallback(plaintext, channel, msg)
-    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
-
-
-" Echos the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-    if (a:0 > 0)
-        let l:buflines = a:1
-    else
-        let l:buflines = getline(1,1000)
-        if l:buflines[0][0:1] == '!*'
-            let l:buflines = l:buflines[1:-1]
+function! s:get_indent(str)
+    let l:count = 0
+    for i in range(len(a:str))
+        if a:str[i] == "\t"
+            let l:count += &tabstop - 1
+        else
+            break
         endif
-        let l:buflines = join(l:buflines, "\n")
-    endif
-    let l:querydata = {"content": l:buflines}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-   let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+    endfor
+    return l:count
 endfunction
 
-func s:tokenCountCallback(channel, msg)
-    let resp = json_decode(a:msg)
-    echo len(resp.tokens)
+function! s:rand(i0, i1) abort
+    return a:i0 + rand() % (a:i1 - a:i0 + 1)
+endfunction
+
+function! llama#init()
+    if !executable('curl')
+        echohl WarningMsg
+        echo 'llama.vim requires the "curl" command to be available'
+        echohl None
+        return
+    endif
+
+    let s:pos_x = 0 " cursor position upon start of completion
+    let s:pos_y = 0
+
+    let s:line_cur = ''
+
+    let s:line_cur_prefix = ''
+    let s:line_cur_suffix = ''
+
+    let s:ring_chunks = [] " current set of chunks used as extra context
+    let s:ring_queued = [] " chunks that are queued to be sent for processing
+    let s:ring_n_evict = 0
+
+    let s:hint_shown = v:false
+    let s:pos_y_pick = -9999 " last y where we picked a chunk
+    let s:pos_dx = 0
+    let s:content = []
+    let s:can_accept = v:false
+
+    let s:timer_fim = -1
+    let s:t_fim_start = reltime() " used to measure total FIM time
+    let s:t_last_move = reltime() " last time the cursor moved
+
+    let s:current_job = v:null
+
+    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
+    let s:ghost_text_vim = has('textprop')
+
+    if s:ghost_text_vim
+        let s:hlgroup_hint = 'llama_hl_hint'
+        let s:hlgroup_info = 'llama_hl_info'
+
+        if empty(prop_type_get(s:hlgroup_hint))
+            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
+        endif
+        if empty(prop_type_get(s:hlgroup_info))
+            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
+        endif
+    endif
+
+    augroup llama
+        autocmd!
+        autocmd InsertEnter     * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
+        autocmd InsertLeavePre  * call llama#fim_cancel()
+
+        autocmd CursorMoved     * call s:on_move()
+        autocmd CursorMovedI    * call s:on_move()
+        autocmd CompleteChanged * call llama#fim_cancel()
+
+        if g:llama_config.auto_fim
+            autocmd CursorMovedI * call llama#fim(v:true)
+        endif
+
+        " gather chunks upon yanking
+        autocmd TextYankPost    * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
+
+        " gather chunks upon entering/leaving a buffer
+        autocmd BufEnter        * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
+        autocmd BufLeave        * call                      s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
+
+        " gather chunk upon saving the file
+        autocmd BufWritePost    * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
+    augroup END
+
+    silent! call llama#fim_cancel()
+
+    " init background update of the ring buffer
+    if g:llama_config.ring_n_chunks > 0
+        call s:ring_update()
+    endif
+endfunction
+
+" compute how similar two chunks of text are
+" 0 - no similarity, 1 - high similarity
+" TODO: figure out something better
+function! s:chunk_sim(c0, c1)
+    let l:lines0 = len(a:c0)
+    let l:lines1 = len(a:c1)
+
+    let l:common = 0
+
+    for l:line0 in a:c0
+        for l:line1 in a:c1
+            if l:line0 == l:line1
+                let l:common += 1
+                break
+            endif
+        endfor
+    endfor
+
+    return 2.0 * l:common / (l:lines0 + l:lines1)
+endfunction
+
+" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
+"
+" no_mod   - do not pick chunks from buffers with pending changes
+" do_evict - evict chunks that are very similar to the new one
+"
+function! s:pick_chunk(text, no_mod, do_evict)
+    " do not pick chunks from buffers with pending changes or buffers that are not files
+    if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
+        return
+    endif
+
+    " if the extra context option is disabled - do nothing
+    if g:llama_config.ring_n_chunks <= 0
+        return
+    endif
+
+    " don't pick very small chunks
+    if len(a:text) < 3
+        return
+    endif
+
+    if len(a:text) + 1 < g:llama_config.ring_chunk_size
+        let l:chunk = a:text
+    else
+        let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
+        let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])
+
+        let l:chunk = a:text[l:l0:l:l1]
+    endif
+
+    let l:chunk_str = join(l:chunk, "\n") . "\n"
+
+    " check if this chunk is already added
+    let l:exist = v:false
+
+    for i in range(len(s:ring_chunks))
+        if s:ring_chunks[i].data == l:chunk
+            let l:exist = v:true
+            break
+        endif
+    endfor
+
+    for i in range(len(s:ring_queued))
+        if s:ring_queued[i].data == l:chunk
+            let l:exist = v:true
+            break
+        endif
+    endfor
+
+    if l:exist
+        return
+    endif
+
+    " evict queued chunks that are very similar to the new one
+    for i in range(len(s:ring_queued) - 1, 0, -1)
+        if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
+            if a:do_evict
+                call remove(s:ring_queued, i)
+                let s:ring_n_evict += 1
+            else
+                return
+            endif
+        endif
+    endfor
+
+    " also from s:ring_chunks
+    for i in range(len(s:ring_chunks) - 1, 0, -1)
+        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
+            if a:do_evict
+                call remove(s:ring_chunks, i)
+                let s:ring_n_evict += 1
+            else
+                return
+            endif
+        endif
+    endfor
+
+    " TODO: become parameter ?
+    if len(s:ring_queued) == 16
+        call remove(s:ring_queued, 0)
+    endif
+
+    call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
+
+    "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
+endfunction
+
+" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
+" called every g:llama_config.ring_update_ms
+function! s:ring_update()
+    call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
+
+    " update only if in normal mode or if the cursor hasn't moved for a while
+    if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
+        return
+    endif
+
+    if len(s:ring_queued) == 0
+        return
+    endif
+
+    " move the first queued chunk to the ring buffer
+    if len(s:ring_chunks) == g:llama_config.ring_n_chunks
+        call remove(s:ring_chunks, 0)
+    endif
+
+    call add(s:ring_chunks, remove(s:ring_queued, 0))
+
+    "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
+
+    " send asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    " no samplers needed here
+    let l:request = json_encode({
+        \ 'input_prefix':     "",
+        \ 'input_suffix':     "",
+        \ 'input_extra':      l:extra_context,
+        \ 'prompt':           "",
+        \ 'n_predict':        1,
+        \ 'temperature':      0.0,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["temperature"],
+        \ 'cache_prompt':     v:true,
+        \ 't_max_prompt_ms':  1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", l:request
+        \ ]
+
+    " no callbacks because we don't need to process the response
+    if s:ghost_text_nvim
+        call jobstart(l:curl_command, {})
+    elseif s:ghost_text_vim
+        call job_start(l:curl_command, {})
+    endif
+endfunction
+
+" necessary for 'inoremap <expr>'
+function! llama#fim_inline(is_auto) abort
+    call llama#fim(a:is_auto)
+    return ''
+endfunction
+
+" the main FIM call
+" takes local context around the cursor and sends it together with the extra context to the server for completion
+function! llama#fim(is_auto) abort
+    " we already have a suggestion for the current cursor position
+    if s:hint_shown && !a:is_auto
+        call llama#fim_cancel()
+        return
+    endif
+
+    call llama#fim_cancel()
+
+    " avoid sending repeated requests too fast
+    if reltimefloat(reltime(s:t_fim_start)) < 0.6
+        if s:timer_fim != -1
+            call timer_stop(s:timer_fim)
+            let s:timer_fim = -1
+        endif
+
+        let s:t_fim_start = reltime()
+        let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
+        return
+    endif
+
+    let s:t_fim_start = reltime()
+
+    let s:content = []
+    let s:can_accept = v:false
+
+    let s:pos_x = col('.') - 1
+    let s:pos_y = line('.')
+    let l:max_y = line('$')
+
+    let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
+    let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))
+
+    let s:line_cur = getline('.')
+
+    let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
+    let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)
+
+    if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
+        return
+    endif
+
+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+
+    let l:prompt = ""
+        \ . s:line_cur_prefix
+
+    let l:suffix = ""
+        \ . s:line_cur_suffix
+        \ . "\n"
+        \ . join(l:lines_suffix, "\n")
+        \ . "\n"
+
+    " prepare the extra context data
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    " the indentation of the current line
+    let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
+
+    let l:request = json_encode({
+        \ 'input_prefix':     l:prefix,
+        \ 'input_suffix':     l:suffix,
+        \ 'input_extra':      l:extra_context,
+        \ 'prompt':           l:prompt,
+        \ 'n_predict':        g:llama_config.n_predict,
+        \ 'n_indent':         l:indent,
+        \ 'top_k':            40,
+        \ 'top_p':            0.99,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["top_k", "top_p", "infill"],
+        \ 'cache_prompt':     v:true,
+        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
+        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
+        \ })
+
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", l:request
+        \ ]
+
+    if s:current_job != v:null
+        if s:ghost_text_nvim
+            call jobstop(s:current_job)
+        elseif s:ghost_text_vim
+            call job_stop(s:current_job)
+        endif
+    endif
+
+    " send the request asynchronously
+    if s:ghost_text_nvim
+        let s:current_job = jobstart(l:curl_command, {
+            \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
+            \ 'on_exit':   function('s:fim_on_exit'),
+            \ 'stdout_buffered': v:true
+            \ })
+    elseif s:ghost_text_vim
+        let s:current_job = job_start(l:curl_command, {
+            \ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
+            \ 'exit_cb':   function('s:fim_on_exit')
+            \ })
+    endif
+
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " gather some extra context nearby and process it in the background
+    " only gather chunks if the cursor has moved a lot
+    " TODO: something more clever? reranking?
+    if a:is_auto && l:delta_y > 32
+        " expand the prefix even further
+        call s:pick_chunk(getline(max([1,       s:pos_y - g:llama_config.ring_scope]), max([1,       s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+
+        " pick a suffix chunk
+        call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]),   min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)
+
+        let s:pos_y_pick = s:pos_y
+    endif
+endfunction
+
+" if first_line == v:true accept only the first line of the response
+function! llama#fim_accept(first_line)
+    " insert the suggestion at the cursor location
+    if s:can_accept && len(s:content) > 0
+        call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
+        if len(s:content) > 1
+            if !a:first_line
+                call append(s:pos_y, s:content[1:-1])
+            endif
+        endif
+
+        " move the cursor to the end of the accepted text
+        if !a:first_line && len(s:content) > 1
+            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
+        else
+            call cursor(s:pos_y, s:pos_x + len(s:content[0]))
+        endif
+    endif
+
+    call llama#fim_cancel()
+endfunction
+
+function! llama#fim_cancel()
+    let s:hint_shown = v:false
+
+    " clear the virtual text
+    let l:bufnr = bufnr('%')
+
+    if s:ghost_text_nvim
+        let l:id_vt_fim = nvim_create_namespace('vt_fim')
+        call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim,  0, -1)
+    elseif s:ghost_text_vim
+        call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
+        call prop_remove({'type': s:hlgroup_info, 'all': v:true})
+    endif
+
+    " remove the mappings
+    silent! iunmap <buffer> <Tab>
+    silent! iunmap <buffer> <S-Tab>
+    silent! iunmap <buffer> <Esc>
+endfunction
+
+function! s:on_move()
+    let s:t_last_move = reltime()
+
+    call llama#fim_cancel()
+endfunction
+
+" callback that processes the FIM result from the server and displays the suggestion
+function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
+    if s:ghost_text_nvim
+        let l:raw = join(a:data, "\n")
+    elseif s:ghost_text_vim
+        let l:raw = a:data
+    endif
+
+    if len(l:raw) == 0
+        return
+    endif
+
+    if a:pos_x != col('.') - 1 || a:pos_y != line('.')
+        return
+    endif
+
+    " show the suggestion only in insert mode
+    if mode() !=# 'i'
+        return
+    endif
+
+    let s:pos_x = a:pos_x
+    let s:pos_y = a:pos_y
+
+    let s:can_accept = v:true
+    let l:has_info   = v:false
+
+    if s:can_accept && v:shell_error
+        if !a:is_auto
+            call add(s:content, "<| curl error: is the server on? |>")
+        endif
+        let s:can_accept = v:false
+    endif
+
+    let l:n_prompt    = 0
+    let l:t_prompt_ms = 1.0
+    let l:s_prompt    = 0
+
+    let l:n_predict    = 0
+    let l:t_predict_ms = 1.0
+    let l:s_predict    = 0
+
+    " get the generated suggestion
+    if s:can_accept
+        let l:response = json_decode(l:raw)
+
+        for l:part in split(get(l:response, 'content', ''), "\n", 1)
+            call add(s:content, l:part)
+        endfor
+
+        " remove trailing new lines
+        while len(s:content) > 0 && s:content[-1] == ""
+            call remove(s:content, -1)
+        endwhile
+
+        let l:generation_settings = get(l:response, 'generation_settings', {})
+        let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
+
+        let l:n_cached  = get(l:response, 'tokens_cached', 0)
+        let l:truncated = get(l:response, 'truncated', v:false)
+
+        " if response.timings is available
+        if len(get(l:response, 'timings', {})) > 0
+            let l:has_info = v:true
+            let l:timings  = get(l:response, 'timings', {})
+
+            let l:n_prompt    = get(l:timings, 'prompt_n', 0)
+            let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
+            let l:s_prompt    = get(l:timings, 'prompt_per_second', 0)
+
+            let l:n_predict    = get(l:timings, 'predicted_n', 0)
+            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
+            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
+        endif
+    endif
+
+    if len(s:content) == 0
+        call add(s:content, "")
+        let s:can_accept = v:false
+    endif
+
+    if len(s:content) == 0
+        return
+    endif
+
+    " NOTE: the following is logic for discarding predictions that repeat existing text
+    "       the code is quite ugly and there is very likely a simpler and more canonical way to implement this
+    "
+    "       still, I wonder if there is some better way that avoids having to do these special hacks?
+    "       on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
+    "       start generating whatever we have given it via the extra context. but on the other hand, it's not very
+    "       helpful to re-generate the same code that is already there
+
+    " truncate the suggestion if the first line is empty
+    if len(s:content) == 1 && s:content[0] == ""
+        let s:content = [""]
+    endif
+
+    " ... and the next lines are repeated
+    if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
+        let s:content = [""]
+    endif
+
+    " truncate the suggestion if it repeats the suffix
+    if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
+        let s:content = [""]
+    endif
+
+    " find the first non-empty line (strip whitespace)
+    let l:cmp_y = s:pos_y + 1
+    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
+        let l:cmp_y += 1
+    endwhile
+
+    if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
+        " truncate the suggestion if it repeats the next line
+        if len(s:content) == 1
+            let s:content = [""]
+        endif
+
+        " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
+        if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
+            let s:content = [""]
+        endif
+
+        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
+        if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
+            let s:content = [""]
+        endif
+    endif
+
+    " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
+    "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
+    "for i in range(1, len(s:content) - 1)
+    "    if strlen(matchstr(s:content[i], '^\s*')) < l:indent
+    "        let s:content = s:content[:i - 1]
+    "        break
+    "    endif
+    "endfor
+
+    let s:pos_dx = len(s:content[-1])
+
+    let s:content[-1] .= s:line_cur_suffix
+
+    call llama#fim_cancel()
+
+    " display virtual text with the suggestion
+    let l:bufnr = bufnr('%')
+
+    if s:ghost_text_nvim
+        let l:id_vt_fim = nvim_create_namespace('vt_fim')
+    endif
+
+    " construct the info message
+    if g:llama_config.show_info > 0 && l:has_info
+        let l:prefix = '   '
+
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached,  l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
+                \ l:n_prompt,  l:t_prompt_ms,  l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
+
+        if g:llama_config.show_info == 1
+            " display the info in the statusline
+            let &statusline = l:info
+            let l:info = ''
+        endif
+    endif
+
+    " display the suggestion and append the info to the end of the first line
+    if s:ghost_text_nvim
+        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
+            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
+            \ 'virt_text_win_col': virtcol('.') - 1
+            \ })
+
+        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
+            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
+            \ 'virt_text_win_col': virtcol('.')
+            \ })
+    elseif s:ghost_text_vim
+        let l:new_suffix = s:content[0]
+        if !empty(l:new_suffix)
+            call prop_add(s:pos_y, s:pos_x + 1, {
+                        \ 'type': s:hlgroup_hint,
+                        \ 'text': l:new_suffix
+                        \ })
+        endif
+        for line in s:content[1:]
+            call prop_add(s:pos_y, 0, {
+                        \ 'type': s:hlgroup_hint,
+                        \ 'text': line,
+                        \ 'text_padding_left': s:get_indent(line),
+                        \ 'text_align': 'below'
+                        \ })
+        endfor
+        if !empty(l:info)
+            call prop_add(s:pos_y, 0, {
+                        \ 'type': s:hlgroup_info,
+                        \ 'text': l:info,
+                        \ 'text_padding_left': col('$'),
+                        \ 'text_wrap': 'truncate'
+                        \ })
+        endif
+    endif
+
+    " setup accept shortcuts
+    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
+    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>
+
+    let s:hint_shown = v:true
+endfunction
+
+function! s:fim_on_exit(job_id, exit_code, event = v:null)
+    if a:exit_code != 0
+        echom "Job failed with exit code: " . a:exit_code
+    endif
+
+    let s:current_job = v:null
 endfunction
diff --git a/examples/llama2-13b.sh b/examples/llama2-13b.sh
deleted file mode 100755
index 92b3f6dd8..000000000
--- a/examples/llama2-13b.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
-       --color \
-       --ctx_size 2048 \
-       -n -1 \
-       -ins -b 256 \
-       --top_k 10000 \
-       --temp 0.2 \
-       --repeat_penalty 1.1 \
-       -t 8
diff --git a/examples/llama2.sh b/examples/llama2.sh
deleted file mode 100755
index 221b37553..000000000
--- a/examples/llama2.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
-       --color \
-       --ctx_size 2048 \
-       -n -1 \
-       -ins -b 256 \
-       --top_k 10000 \
-       --temp 0.2 \
-       --repeat_penalty 1.1 \
-       -t 8
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 2985caff8..319effd19 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .)
 target_include_directories(llava PUBLIC ../..)
 target_include_directories(llava PUBLIC ../../common)
 
-target_compile_features(llava PRIVATE cxx_std_11)
+target_compile_features(llava PRIVATE cxx_std_17)
 
 add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
 if (BUILD_SHARED_LIBS)
@@ -30,8 +30,30 @@ if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
 
-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llama-llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-minicpmv-cli)
+add_executable(${TARGET} minicpmv-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-qwen2vl-cli)
+add_executable(${TARGET} qwen2vl-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-llava-clip-quantize-cli)
+add_executable(${TARGET} clip-quantize-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 9eba791da..4f783f3ce 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,18 +1,20 @@
 # MobileVLM
 
-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
 
 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
 
 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
 
-## Usage
-Build with cmake or run `make llava-cli` to build it.
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
 
-After building, run: `./llava-cli` to see the usage. For example:
+## Usage
+Build with cmake or run `make llama-llava-cli` to build it.
+
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
     --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
     --image path/to/an/image.jpg \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@@ -20,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
 
 ## Model conversion
 
-- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
 
 ```sh
 git clone https://huggingface.co/mtgv/MobileVLM-1.7B
@@ -28,31 +30,39 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```
 
-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
 
 ```sh
-python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 ```
 
-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
 
 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
     -m path/to/clip-vit-large-patch14-336 \
     --llava-projector path/to/MobileVLM-1.7B/llava.projector \
     --output-dir path/to/MobileVLM-1.7B \
     --projector-type ldp
 ```
 
-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
-
 ```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/llava/convert_image_encoder_to_gguf.py \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B_V2 \
+    --projector-type ldpv2
 ```
 
-5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
+
 ```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
+```
+
+5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
+```sh
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 
 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
@@ -68,11 +78,11 @@ cd examples/llava/android/build_64
 ### run on Android
 refer to `android/adb_run.sh`, modify resources' `name` and `path`
 
-## some result on Android with `Snapdragon 888` chip
+## Some result on Android with `Snapdragon 888` chip
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \
@@ -92,14 +102,13 @@ llama_print_timings:       total time =   34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \
     --image /data/local/tmp/cat.jpeg \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 ```
-
 **output**
 ```sh
 encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
@@ -111,17 +120,87 @@ llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 m
 llama_print_timings:       total time =   34570.79 ms
 ```
 
+
+## Some result on Android with `Snapdragon 778G` chip
+### MobileVLM-1.7B case
+#### llava-cli release-b2005
+**input**
+```sh
+/data/local/tmp/llama-llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/many_llamas.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 18728.52 ms by CLIP (  130.06 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ A group of llamas are standing in a green pasture.
+
+llama_print_timings:        load time =   20357.33 ms
+llama_print_timings:      sample time =       2.96 ms /    14 runs   (    0.21 ms per token,  4734.53 tokens per second)
+llama_print_timings: prompt eval time =    8119.49 ms /   191 tokens (   42.51 ms per token,    23.52 tokens per second)
+llama_print_timings:        eval time =    1005.75 ms /    14 runs   (   71.84 ms per token,    13.92 tokens per second)
+llama_print_timings:       total time =   28038.34 ms /   205 tokens
+```
+#### llava-cli latest-version
+**input**
+
+Just the same as above.
+
+**output**(seems to be much slower)
+```sh
+encode_image_with_clip: image embedding created: 144 tokens
+
+encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ It is a group of sheep standing together in a grass field.
+
+llama_print_timings:        load time =  818120.91 ms
+llama_print_timings:      sample time =       3.44 ms /    14 runs   (    0.25 ms per token,  4067.40 tokens per second)
+llama_print_timings: prompt eval time =  529274.69 ms /   191 tokens ( 2771.07 ms per token,     0.36 tokens per second)
+llama_print_timings:        eval time =   43894.02 ms /    13 runs   ( 3376.46 ms per token,     0.30 tokens per second)
+llama_print_timings:       total time =  865441.76 ms /   204 tokens
+```
+### MobileVLM_V2-1.7B case
+#### llava-cli release-2005b
+**input**
+
+Just the same as above.
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 20609.61 ms by CLIP (  143.12 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
+
+The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
+
+llama_print_timings:        load time =   22406.77 ms
+llama_print_timings:      sample time =      49.26 ms /   186 runs   (    0.26 ms per token,  3776.27 tokens per second)
+llama_print_timings: prompt eval time =    9044.54 ms /   191 tokens (   47.35 ms per token,    21.12 tokens per second)
+llama_print_timings:        eval time =   14497.49 ms /   186 runs   (   77.94 ms per token,    12.83 tokens per second)
+llama_print_timings:       total time =   44411.01 ms /   377 tokens
+```
+
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
 ```
-
 ### run on Orin
 ### case 1
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     --image /data/local/tmp/demo.jpeg \
@@ -145,7 +224,7 @@ llama_print_timings:       total time =    1352.63 ms /   252 tokens
 ### case 2
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
@@ -165,8 +244,121 @@ llama_print_timings:        eval time =     166.65 ms /    11 runs   (   15.15 m
 llama_print_timings:       total time =    1365.47 ms /   243 tokens
 ```
 
-## Minor shortcomings
-The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
+## Running on Intel(R) Core(TM) i7-10750H
+### Operating system
+Ubuntu22.04
+### compile
+```sh
+make -j32
+```
+### MobileVLM-1.7B case
+**input**
+```sh
+-m /path/to/ggml-model-q4_k.gguf \
+    --mmproj /path/to/mmproj-model-f16.gguf \
+    --image /path/to/many_llamas.jpeg
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
+```
+**output**
+```sh
+encode_image_with_clip: image embedding created: 144 tokens
+
+encode_image_with_clip: image encoded in  2730.94 ms by CLIP (   18.96 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that?ASSISTANT:
+
+ A group of llamas are walking together in a field.
+
+llama_print_timings:        load time =    5506.60 ms
+llama_print_timings:      sample time =       0.44 ms /    13 runs   (    0.03 ms per token, 29545.45 tokens per second)
+llama_print_timings: prompt eval time =    2031.58 ms /   190 tokens (   10.69 ms per token,    93.52 tokens per second)
+llama_print_timings:        eval time =     438.92 ms /    12 runs   (   36.58 ms per token,    27.34 tokens per second)
+llama_print_timings:       total time =    5990.25 ms /   202 tokens
+```
+
+### MobileVLM_V2-1.7B case
+**input**
+
+Just the same as above.
+
+**ouput**
+```sh
+encode_image_with_clip: image embedding created: 144 tokens
+
+encode_image_with_clip: image encoded in  3223.89 ms by CLIP (   22.39 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that?ASSISTANT:
+
+ The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
+
+The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico  Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
+
+The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
+
+llama_print_timings:        load time =    6642.61 ms
+llama_print_timings:      sample time =       8.15 ms /   223 runs   (    0.04 ms per token, 27358.61 tokens per second)
+llama_print_timings: prompt eval time =    2475.07 ms /   190 tokens (   13.03 ms per token,    76.77 tokens per second)
+llama_print_timings:        eval time =    8760.60 ms /   222 runs   (   39.46 ms per token,    25.34 tokens per second)
+llama_print_timings:       total time =   15513.95 ms /   412 tokens
+```
+
+## Run on Intel(R) Core(TM) Ultra7 115H
+### operation system
+Windows11
+### comiple
+```sh
+make -j32
+```
+### MobileVLM-1.7B case
+**input**
+```sh
+-m /path/to/ggml-model-q4_k.gguf \
+    --mmproj /path/to/tmp/mmproj-model-f16.gguf \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in  4902.81 ms by CLIP (   34.05 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ The image features a group of brown and white llamas standing in a grassy field.
+
+llama_print_timings:        load time =    7441.06 ms
+llama_print_timings:      sample time =       0.72 ms /    19 runs   (    0.04 ms per token, 26279.39 tokens per second)
+llama_print_timings: prompt eval time =    2090.71 ms /   191 tokens (   10.95 ms per token,    91.36 tokens per second)
+llama_print_timings:        eval time =     512.35 ms /    18 runs   (   28.46 ms per token,    35.13 tokens per second)
+llama_print_timings:       total time =    7987.23 ms /   209 tokens
+```
+
+### MobileVLM_V2-1.7B case
+**input**
+
+Just the same as above.
+
+**output**
+```sh
+encode_image_with_clip: image encoded in  4682.44 ms by CLIP (   32.52 ms per image patch)
+system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
+user_prompt: \nWhat's that? ASSISTANT:
+
+ This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
+ of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
+
+The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
+
+The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
+ front is not visible, indicating that it might not be the main focus of the photo.
+
+The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
+
+
+llama_print_timings:        load time =    7015.35 ms
+llama_print_timings:      sample time =      10.61 ms /   256 runs   (    0.04 ms per token, 24119.09 tokens per second)
+llama_print_timings: prompt eval time =    2052.45 ms /   191 tokens (   10.75 ms per token,    93.06 tokens per second)
+llama_print_timings:        eval time =    7259.43 ms /   255 runs   (   28.47 ms per token,    35.13 tokens per second)
+llama_print_timings:       total time =   14371.19 ms /   446 tokens
+```
 
 ## TODO
 
@@ -181,5 +373,5 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic
 
 ## contributor
 ```sh
-zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
 ```
diff --git a/examples/llava/README-glmedge.md b/examples/llava/README-glmedge.md
new file mode 100644
index 000000000..603d01474
--- /dev/null
+++ b/examples/llava/README-glmedge.md
@@ -0,0 +1,43 @@
+# GLMV-EDGE
+
+Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
+
+## Usage
+Build with cmake or run `make llama-llava-cli` to build it.
+
+After building, run: `./llama-llava-cli` to see the usage. For example:
+
+```sh
+./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
+```
+
+**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
+**note**: For GPU offloading ensure to use the `-ngl` flag just like usual
+
+## GGUF conversion
+
+1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example:
+
+```sh
+git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/THUDM/glm-edge-v-2b
+```
+
+2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
+
+```sh
+python ./examples/llava/glmedge-surgery.py -m ../model_path
+```
+
+4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
+
+```sh
+python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
+```
+
+5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
+
+```sh
+python convert_hf_to_gguf.py ../model_path
+```
+
+Now both the LLM part and the image encoder are in the `model_path` directory.
diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md
new file mode 100644
index 000000000..8713a43d6
--- /dev/null
+++ b/examples/llava/README-minicpmo2.6.md
@@ -0,0 +1,46 @@
+## MiniCPM-o 2.6
+Currently, this readme only supports minicpm-omni's image capabilities, and we will update the full-mode support as soon as possible.
+
+### Prepare models and code
+
+Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
+
+Clone llama.cpp:
+```bash
+git clone git@github.com:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpm-omni
+```
+
+### Usage of MiniCPM-o 2.6
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build llama.cpp using `CMake`:
+https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
+
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md
new file mode 100644
index 000000000..1c8498ff9
--- /dev/null
+++ b/examples/llava/README-minicpmv2.5.md
@@ -0,0 +1,99 @@
+## MiniCPM-Llama3-V 2.5
+
+### Prepare models and code
+
+Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+### Usage
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that build on Android device would bring better runtime performance, so we recommend to build on device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+```
diff --git a/examples/llava/README-minicpmv2.6.md b/examples/llava/README-minicpmv2.6.md
new file mode 100644
index 000000000..c4be5e5dd
--- /dev/null
+++ b/examples/llava/README-minicpmv2.6.md
@@ -0,0 +1,107 @@
+## MiniCPM-V 2.6
+
+### Prepare models and code
+
+Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
+
+Clone llama.cpp:
+```bash
+git clone git@github.com:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpmv-main
+```
+
+### Usage of MiniCPM-V 2.6
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
+python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Video
+Install FFmpeg
+```
+brew install ffmpeg
+brew install pkg-config
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that build on Android device would bring better runtime performance, so we recommend to build on device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+```
diff --git a/examples/llava/README-quantize.md b/examples/llava/README-quantize.md
new file mode 100644
index 000000000..b931513ab
--- /dev/null
+++ b/examples/llava/README-quantize.md
@@ -0,0 +1,44 @@
+# Quantizing CLIP Visual Projector
+
+This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease the model size and improve inference speed, often with minimal impact on performance.
+
+## Usage
+
+To quantize a CLIP visual projector model, use the following command:
+
+```sh
+./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf <type>
+```
+
+After the quantization, the visual projector can be used freely with the existing LLAVA cli (LLAVA, Qwen2VL, etc).
+
+### Arguments
+
+- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format.
+- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved.
+- `<type>`: The quantization type to apply. This should be an integer corresponding to one of the quantization types defined in the `enum ggml_type`.
+
+### Quantization Types
+
+The following quantization types are supported, based on the `enum ggml_type` definition:
+
+- `2` - `q4_0`: 4-bit quantization with a single scale value.
+- `3` - `q4_1`: 4-bit quantization with a separate scale value for each block.
+- `6` - `q5_0`: 5-bit quantization with a single scale value.
+- `7` - `q5_1`: 5-bit quantization with a separate scale value for each block.
+- `8` - `q8_0`: 8-bit quantization with a single scale value.
+
+### Example
+
+To quantize a model using the `q4_0` quantization type, you would run:
+
+```sh
+./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2
+```
+
+This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method.
+
+## Notes
+
+- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements.
+- The quantized model will typically be smaller in size and faster to run, making it more suitable for deployment in resource-constrained environments.
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 35e6d9e5d..012451361 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
 
 ## LLaVA 1.5
 
-- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
 
 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@@ -38,37 +38,45 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 pip install -r examples/llava/requirements.txt
 ```
 
-3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
 
 ```sh
-python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
+python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
 ```
 
-4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
 
 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```
 
-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
 
 ```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
 ```
 
-Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
 
 ## LLaVA 1.6 gguf conversion
 1) First clone a LLaVA 1.6 model:
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
-python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
+python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -76,20 +84,20 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```
 
-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
-python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
 
-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
 
 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh
index f73623ae3..45ccf8d70 100755
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
 # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 
 program_dir="build_64/bin"
-binName="llava-cli"
+binName="llama-llava-cli"
 n_threads=4
 
 
diff --git a/examples/llava/clip-quantize-cli.cpp b/examples/llava/clip-quantize-cli.cpp
new file mode 100644
index 000000000..566506954
--- /dev/null
+++ b/examples/llava/clip-quantize-cli.cpp
@@ -0,0 +1,59 @@
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
+    fprintf(stderr, "  type = 2 - q4_0\n");
+    fprintf(stderr, "  type = 3 - q4_1\n");
+    fprintf(stderr, "  type = 6 - q5_0\n");
+    fprintf(stderr, "  type = 7 - q5_1\n");
+    fprintf(stderr, "  type = 8 - q8_0\n");
+}
+
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    const int itype = atoi(argv[3]);
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+    }
+
+    return 0;
+}
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ef9e4ba7a..271cf2a2a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -4,16 +4,30 @@
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
+//#ifdef GGML_USE_CUDA
+//#include "ggml-cuda.h"
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//#include "ggml-sycl.h"
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//#include "ggml-metal.h"
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//#include "ggml-cann.h"
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//#include "ggml-vulkan.h"
+//#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -23,7 +37,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
-#include <iostream>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -32,6 +45,18 @@
 #include <cinttypes>
 #include <limits>
 
+#if defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...)
+#   define LOG_WRN(...)
+#   define LOG_ERR(...)
+#   define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
+
 //#define CLIP_DEBUG_FUNCTIONS
 
 // RGB uint8 image
@@ -70,26 +95,31 @@ static std::string format(const char * fmt, ...) {
 // key constants
 //
 
-#define KEY_FTYPE          "general.file_type"
-#define KEY_NAME           "general.name"
-#define KEY_DESCRIPTION    "general.description"
-#define KEY_HAS_TEXT_ENC   "clip.has_text_encoder"
-#define KEY_HAS_VIS_ENC    "clip.has_vision_encoder"
-#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
-#define KEY_USE_GELU       "clip.use_gelu"
-#define KEY_N_EMBD         "clip.%s.embedding_length"
-#define KEY_N_FF           "clip.%s.feed_forward_length"
-#define KEY_N_BLOCK        "clip.%s.block_count"
-#define KEY_N_HEAD         "clip.%s.attention.head_count"
-#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM       "clip.%s.projection_dim"
-#define KEY_TOKENS         "tokenizer.ggml.tokens"
-#define KEY_N_POSITIONS    "clip.text.context_length"
-#define KEY_IMAGE_SIZE     "clip.vision.image_size"
-#define KEY_PATCH_SIZE     "clip.vision.patch_size"
-#define KEY_IMAGE_MEAN     "clip.vision.image_mean"
-#define KEY_IMAGE_STD      "clip.vision.image_std"
-#define KEY_PROJ_TYPE      "clip.projector_type"
+#define KEY_FTYPE               "general.file_type"
+#define KEY_NAME                "general.name"
+#define KEY_DESCRIPTION         "general.description"
+#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
+#define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
+#define KEY_USE_GELU            "clip.use_gelu"
+#define KEY_USE_SILU            "clip.use_silu"
+#define KEY_N_EMBD              "clip.%s.embedding_length"
+#define KEY_N_FF                "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK             "clip.%s.block_count"
+#define KEY_N_HEAD              "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.%s.projection_dim"
+#define KEY_TOKENS              "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS         "clip.text.context_length"
+#define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_PATCH_SIZE          "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
+#define KEY_IMAGE_STD           "clip.vision.image_std"
+#define KEY_PROJ_TYPE           "clip.projector_type"
 
 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@@ -103,7 +133,9 @@ static std::string format(const char * fmt, ...) {
 #define TN_TOKEN_EMBD      "%s.token_embd.weight"
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
-#define TN_PATCH_EMBD      "v.patch_embd.weight"
+#define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
+#define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
+#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -119,19 +151,44 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ      "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
 
+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
+
+#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
+#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
+#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
+#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
+#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
+#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
+#define TN_GLM_BOI_W "adapter.boi"
+#define TN_GLM_EOI_W "adapter.eoi"
+
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_GLM_EDGE,
+    PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
+    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
 };
 
 
@@ -142,7 +199,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-        fprintf(stderr, "key %s not found in file\n", key);
+        LOG_ERR("key %s not found in file\n", key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }
 
@@ -192,17 +249,20 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 }
 
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return;
     }
-    s = std::move(result);
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }
 
 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -215,7 +275,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
             {
                 const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
                 int arr_n = gguf_get_arr_n(ctx_gguf, i);
-                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
                 std::stringstream ss;
                 ss << "[";
                 for (int j = 0; j < arr_n; j++) {
@@ -244,7 +304,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 
 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
     size_t tensor_size = ggml_nbytes(tensor);
-    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
             prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
             tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -262,7 +322,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
 
@@ -281,7 +341,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
 
@@ -421,7 +481,9 @@ struct clip_vision_model {
 
     // embeddings
     struct ggml_tensor * class_embedding;
-    struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_embeddings_0;
+    struct ggml_tensor * patch_embeddings_1;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
     struct ggml_tensor * pre_ln_w;
@@ -450,6 +512,12 @@ struct clip_vision_model {
     struct ggml_tensor * mm_4_w = NULL;
     struct ggml_tensor * mm_4_b = NULL;
 
+    //GLMV-Edge projection
+    struct ggml_tensor * mm_model_adapter_conv_w;
+    struct ggml_tensor * mm_model_adapter_conv_b;
+    struct ggml_tensor * boi_w;
+    struct ggml_tensor * eoi_w;
+
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
     struct ggml_tensor * mm_model_mlp_1_b;
@@ -475,12 +543,44 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_block_2_block_2_0_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
 };
 
 struct clip_ctx {
     bool has_text_encoder    = false;
     bool has_vision_encoder  = false;
     bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;
+    bool has_glm_projector = false;
+    bool has_qwen2vl_merger = false;
+    int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -488,8 +588,14 @@ struct clip_ctx {
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
+    bool use_silu = false;
     int32_t ftype = 1;
 
+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;
 
@@ -497,35 +603,61 @@ struct clip_ctx {
 
     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer  = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
 
     ggml_backend_t backend       = NULL;
     ggml_gallocr_t compute_alloc = NULL;
+
+    struct clip_image_size * load_image_size;
 };
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
     }
 
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size           = hparams.image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
+        }
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
+        if (is_inf) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
+    else if (ctx->has_qwen2vl_merger) {
+        // use the image's native resolution when image is avaible
+        if (is_inf) {
+        // if (imgs->data->nx && imgs->data->ny) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size           = hparams.patch_size;
-    const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + 1;
+    const int num_patches          = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int patches_w            = image_size_width / patch_size;
+    const int patches_h            = image_size_height / patch_size;
+    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_position_ids     = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
     const int hidden_size          = hparams.hidden_size;
     const int n_head               = hparams.n_head;
     const int d_head               = hidden_size / n_head;
-    const int n_layer              = hparams.n_layer;
+    int n_layer                    = hparams.n_layer;
     const float eps                = hparams.eps;
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
     const int batch_size = imgs->size;
 
-    if (ctx->has_llava_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
     }
 
@@ -538,35 +670,82 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    if (ctx->has_qwen2vl_merger) {
+        GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
+        GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
 
-    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-    ggml_set_name(embeddings, "embeddings");
-    ggml_set_input(embeddings);
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, patches_h, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+        inp = ggml_reshape_3d(
+            ctx0, inp,
+            hidden_size, patches_w * patches_h, batch_size);
+    }
+    else {
+        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    }
 
-    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+    struct ggml_tensor * embeddings = inp;
+    struct ggml_tensor * pos_embed = nullptr;
 
-    embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    if (ctx->has_llava_projector) {
+        // concat class_embeddings and patch_embeddings
+        if (ctx->has_class_embedding) {
+            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+            ggml_set_name(embeddings, "embeddings");
+            ggml_set_input(embeddings);
+            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+            embeddings = ggml_acc(ctx0, embeddings, inp,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+        }
+    }
 
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
-    embeddings =
-        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
+        embeddings =
+            ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    }
+
+    if (ctx->has_minicpmv_projector) {
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 4) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+    }
 
     // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");
 
@@ -574,6 +753,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
+        n_layer += 1;
+    }
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
@@ -593,8 +775,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             struct ggml_tensor * Q =
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
 
-            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+            if (ctx->has_qwen2vl_merger) {
+                Q = ggml_rope_multi(
+                    ctx0, Q, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            }
+            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
             Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
 
@@ -602,6 +789,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
 
             K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+            if (ctx->has_qwen2vl_merger) {
+                K = ggml_rope_multi(
+                    ctx0, K, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            }
             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
             K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
 
@@ -641,6 +833,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         if (ctx->use_gelu) {
             cur = ggml_gelu_inplace(ctx0, cur);
+        } else if (ctx->use_silu) {
+            cur = ggml_silu_inplace(ctx0, cur);
         } else {
             cur = ggml_gelu_quick_inplace(ctx0, cur);
         }
@@ -652,10 +846,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
+
+    }
+
+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
     }
 
     // llava projector
-    {
+    if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
 
         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -676,8 +879,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_gelu(ctx0, embeddings);
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
             // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -715,7 +918,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                 // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
                 // layer norm
                 // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -763,7 +966,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // block_2
             {
                 // stride = 2
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
 
                 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                 // layer norm
@@ -808,10 +1011,147 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             }
             embeddings = block_1;
         }
+        else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            int n_patch = 24;
+            struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+            mlp_0 = ggml_gelu(ctx0, mlp_0);
+            struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+            // mlp_2 ne = [2048, 576, 1, 1]
+            // // AVG Pool Layer 2*2, strides = 2
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+            // mlp_2 ne = [576, 2048, 1, 1]
+            mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+            // mlp_2 ne [24, 24, 2048, 1]
+            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+            // weight ne = [3, 3, 2048, 1]
+            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+            embeddings = peg_0;
+        }
+        else {
+            GGML_ABORT("fatal error");
+        }
+    }
+    // minicpmv projector
+    else if (ctx->has_minicpmv_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            struct ggml_tensor * k;
+            { // position
+                // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, pos_embed);
+            }
+
+            { // attention
+                int hidden_size = 4096;
+                const int d_head = 128;
+                int n_head = hidden_size/d_head;
+                int num_query = 96;
+                if (ctx->minicpmv_version == 2) {
+                    hidden_size = 4096;
+                    n_head = hidden_size/d_head;
+                    num_query = 96;
+                }
+                else if (ctx->minicpmv_version == 3) {
+                    hidden_size = 3584;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }
+                else if (ctx->minicpmv_version == 4) {
+                    hidden_size = 3584;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
         else {
             GGML_ASSERT(false);
         }
     }
+    // glm projector
+    else if (ctx->has_glm_projector) {
+        if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
+            embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+            embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+            embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+            embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+            //GLU
+            {
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+                embeddings = ggml_gelu_inplace(ctx0, embeddings);
+                struct ggml_tensor * x = embeddings;
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+                x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+                embeddings = ggml_silu_inplace(ctx0, embeddings);
+                embeddings = ggml_mul(ctx0, embeddings,x);
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+            }
+        } else {
+            GGML_ABORT("fatel error");
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+    }
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
@@ -845,21 +1185,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         const int idx_name = gguf_find_key(ctx, KEY_NAME);
         if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
             const std::string name = gguf_get_val_str(ctx, idx_name);
-            printf("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
         }
-        printf("%s: description:  %s\n", __func__, description.c_str());
-        printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        printf("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        printf("%s: n_tensors:    %d\n", __func__, n_tensors);
-        printf("%s: n_kv:         %d\n", __func__, n_kv);
-        printf("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        printf("\n");
+        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_INF("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_INF("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_INF("\n");
     }
     const int n_tensors = gguf_get_n_tensors(ctx);
 
     // kv
     const int n_kv = gguf_get_n_kv(ctx);
-    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
         __func__, n_kv, n_tensors, fname);
     {
         std::map<enum ggml_type, uint32_t> n_type;
@@ -870,7 +1210,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             n_type[type]++;
         }
 
-        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(ctx, i);
             const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -886,7 +1226,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             replace_all(value, "\n", "\\n");
 
-            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -895,7 +1235,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 continue;
             }
 
-            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
     }
 
@@ -910,13 +1250,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             size_t tensor_size = ggml_nbytes(cur);
             model_size += tensor_size;
             if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                        __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
             }
         }
     }
 
-    clip_ctx * new_clip = new clip_ctx;
+    clip_ctx * new_clip = new clip_ctx{};
 
     // update projector type
     {
@@ -935,20 +1275,34 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-#ifdef GGML_USE_CUBLAS
-    new_clip->backend = ggml_backend_cuda_init(0);
-    printf("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    printf("%s: CLIP using Metal backend\n", __func__);
-#endif
-
+//#ifdef GGML_USE_CUDA
+//    new_clip->backend = ggml_backend_cuda_init(0);
+//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_METAL
+//    new_clip->backend = ggml_backend_metal_init();
+//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_CANN
+//    new_clip->backend = ggml_backend_cann_init(0);
+//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_VULKAN
+//    new_clip->backend = ggml_backend_vk_init(0);
+//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+//#endif
+//
+//#ifdef GGML_USE_SYCL
+//    new_clip->backend = ggml_backend_sycl_init(0);
+//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+//#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
-        printf("%s: CLIP using CPU backend\n", __func__);
+        LOG_INF("%s: CLIP using CPU backend\n", __func__);
     }
 
     // model size and capabilities
@@ -964,23 +1318,52 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
         }
 
-        GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+        idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
+        if (idx != -1) {
+            new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
+        }
+
+        idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
+        if (idx != -1) {
+            new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
+        }
+
+        idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
+        if (idx != -1) {
+            new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
+        }
+
+        idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
+        if (idx != -1) {
+            new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
+        }
+        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);
 
         idx = get_key_idx(ctx, KEY_USE_GELU);
         new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
 
+        try {
+            idx = get_key_idx(ctx, KEY_USE_SILU);
+            new_clip->use_silu = gguf_get_val_bool(ctx, idx);
+        } catch (std::runtime_error & /*e*/) {
+            new_clip->use_silu = false;
+        }
+
         if (verbosity >= 1) {
-            printf("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            printf("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            printf("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            printf("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_INF("%s: glm_projector:  %d\n", __func__, new_clip->has_glm_projector);
+            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
         }
     }
 
-    printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
 
     // load tensors
     {
@@ -993,15 +1376,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
         new_clip->ctx_data = ggml_init(params);
         if (!new_clip->ctx_data) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            LOG_ERR("%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
 
         auto fin = std::ifstream(fname, std::ios::binary);
         if (!fin) {
-            printf("cannot open model file for loading tensors\n");
+            LOG_ERR("cannot open model file for loading tensors\n");
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
 
@@ -1021,8 +1406,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
             fin.seekg(offset, std::ios::beg);
             if (!fin) {
-                printf("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
                 clip_free(new_clip);
+                gguf_free(ctx);
                 return nullptr;
             }
             int num_bytes = ggml_nbytes(cur);
@@ -1062,20 +1448,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             if (n < 32)
                 hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
             hparams.image_grid_pinpoints[0]=0;
         }
 
         try {
             int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
             strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
             strcpy(hparams.mm_patch_merge_type, "flat");
         }
 
         try {
             hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
             hparams.image_crop_resolution = hparams.image_size;
         }
 
@@ -1091,34 +1477,66 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
 
         if (verbosity >= 2) {
-            printf("\n%s: vision model hparams\n", __func__);
-            printf("image_size         %d\n", hparams.image_size);
-            printf("patch_size         %d\n", hparams.patch_size);
-            printf("v_hidden_size      %d\n", hparams.hidden_size);
-            printf("v_n_intermediate   %d\n", hparams.n_intermediate);
-            printf("v_projection_dim   %d\n", hparams.projection_dim);
-            printf("v_n_head           %d\n", hparams.n_head);
-            printf("v_n_layer          %d\n", hparams.n_layer);
-            printf("v_eps              %f\n", hparams.eps);
-            printf("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            printf("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            printf("v_image_grid_pinpoints: ");
+            LOG_INF("\n%s: vision model hparams\n", __func__);
+            LOG_INF("image_size         %d\n", hparams.image_size);
+            LOG_INF("patch_size         %d\n", hparams.patch_size);
+            LOG_INF("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_INF("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_INF("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_INF("v_n_head           %d\n", hparams.n_head);
+            LOG_INF("v_n_layer          %d\n", hparams.n_layer);
+            LOG_INF("v_eps              %f\n", hparams.eps);
+            LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_INF("v_image_grid_pinpoints: ");
             for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                printf("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
             }
-            printf("\n");
-            printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_INF("\n");
+            LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
 
         }
 
         try {
-            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+            new_clip->has_class_embedding = true;
+        } catch (const std::exception& /*e*/) {
+            new_clip->has_class_embedding = false;
+        }
+
+        try {
+            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+            new_clip->has_pre_norm = true;
+        } catch (std::exception & /*e*/) {
+            new_clip->has_pre_norm = false;
+        }
+
+        try {
+            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+            new_clip->has_post_norm = true;
+        } catch (std::exception & /*e*/) {
+            new_clip->has_post_norm = false;
+        }
+
+        try {
+            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+            new_clip->has_patch_bias = true;
+        } catch (std::exception & /*e*/) {
+            new_clip->has_patch_bias = false;
+        }
+
+        try {
+            vision_model.patch_embeddings_0    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        } catch(const std::exception& e) {
-            fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
+        } catch(const std::exception& /*e*/) {
+            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
+        }
+        try {
+            vision_model.patch_embeddings_1    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
+        } catch(const std::exception& /*e*/) {
+            new_clip->has_qwen2vl_merger = false;
         }
 
         // LLaVA projection
@@ -1129,26 +1547,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 // Yi-type llava
                 vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
                 vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 // missing in Yi-type llava
                 vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                 vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 // Yi-type llava
                 vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                 vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 // Yi-type llava
                 vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                 vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
             try {
                 vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & e) {  }
+                // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
+            } catch (std::runtime_error & /*e*/) { }
         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
             vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
@@ -1175,7 +1593,57 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_block_2_block_2_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
             vision_model.mm_model_block_2_block_2_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
             vision_model.mm_model_block_2_block_2_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-        } else {
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            // MobilVLM_V2 projection
+            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+            vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+            vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+            vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+            vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+            vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+            vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
+            vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
+            vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
+            vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
+            vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
+            vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
+            vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
+            vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
+            vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
+            vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
+            vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
+            vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
+            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
+            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
+            vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
+            vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
+            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
+            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
+            vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
+            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
+            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
+            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
+            vision_model.mm_model_mlp_1_w =  get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
+            vision_model.mm_model_mlp_2_w =  get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
+            vision_model.mm_model_mlp_3_w =  get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+            vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
+            vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
+            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+            vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        }
+        else {
             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
         }
@@ -1213,15 +1681,31 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
+        batch.data = nullptr;
+        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
     }
 
     return new_clip;
 }
 
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
+    ctx_clip->load_image_size = load_image_size;
+}
+
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
+    return ctx_clip->load_image_size;
+}
+
+struct clip_image_size * clip_image_size_init() {
+    struct clip_image_size * load_image_size = new struct clip_image_size();
+    load_image_size->width = 448;
+    load_image_size->height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
@@ -1232,16 +1716,16 @@ struct clip_image_f32 * clip_image_f32_init() {
 
 void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
 
@@ -1256,7 +1740,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
     auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
     if (!data) {
-        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1268,7 +1752,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     int nx, ny, nc;
     auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
-        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
+        LOG_ERR("%s: failed to decode image bytes\n", __func__);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1277,7 +1761,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }
 
 // Linear interpolation between two points
-inline float lerp(float s, float e, float t) {
+inline float clip_lerp(float s, float e, float t) {
     return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1299,17 +1783,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
             float y_lerp = py - y_floor;
 
             for (int c = 0; c < 3; c++) {
-                float top = lerp(
+                float top = clip_lerp(
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                float bottom = lerp(
+                float bottom = clip_lerp(
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
             }
         }
     }
@@ -1327,7 +1811,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }
 
-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
@@ -1458,7 +1942,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -1492,12 +1976,224 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
     return patches;
 }
 
+static int ensure_divide(int length, int patch_size) {
+    return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+}
+
+static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width = original_size.first;
+    int height = original_size.second;
+    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+        float r = static_cast<float>(width) / height;
+        height = static_cast<int>(scale_resolution / std::sqrt(r));
+        width = static_cast<int>(height * r);
+    }
+    int best_width = ensure_divide(width, patch_size);
+    int best_height = ensure_divide(height, patch_size);
+    return std::make_pair(best_width, best_height);
+}
+
+static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width, height;
+    std::tie(width, height) = original_size;
+    int grid_x, grid_y;
+    std::tie(grid_x, grid_y) = grid;
+
+    int refine_width = ensure_divide(width, grid_x);
+    int refine_height = ensure_divide(height, grid_y);
+
+    int grid_width = refine_width / grid_x;
+    int grid_height = refine_height / grid_y;
+
+   // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
+    auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
+    int best_grid_width, best_grid_height;
+    std::tie(best_grid_width, best_grid_height) = best_grid_size;
+
+  //  std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
+    std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
+    return refine_size;
+}
+
+static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+    std::vector<int> candidate_split_grids_nums;
+    for (int i : {multiple - 1, multiple, multiple + 1}) {
+        if (i == 1 || i > max_slice_nums) {
+            continue;
+        }
+        candidate_split_grids_nums.push_back(i);
+    }
+
+    std::vector<std::pair<int, int>> candidate_grids;
+    for (int split_grids_nums : candidate_split_grids_nums) {
+        int m = 1;
+        while (m <= split_grids_nums) {
+            if (split_grids_nums % m == 0) {
+                candidate_grids.emplace_back(m, split_grids_nums / m);
+            }
+            ++m;
+        }
+    }
+
+    std::pair<int, int> best_grid{1, 1};
+    float min_error = std::numeric_limits<float>::infinity();
+    for (const auto& grid : candidate_grids) {
+        float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
+        if (error < min_error) {
+            best_grid = grid;
+            min_error = error;
+        }
+    }
+    return best_grid;
+}
+
+// inspired from LLaVA-UHD:
+//    -> https://arxiv.org/pdf/2403.11703
+//    -> https://github.com/thunlp/LLaVA-UHD
+//    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+    const std::pair<int, int> original_size={img->nx,img->ny};
+    const int original_width = img->nx;
+    const int original_height = img->ny;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_INF("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8 *>());
+
+    if (multiple <= 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
+        images[images.size()-1].push_back(source_image);
+    }
+    else if (multiple > 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
+        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images[images.size()-1].push_back(source_image);
+
+        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
+        clip_image_u8 * refine_image = clip_image_u8_init();
+        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
+
+        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+
+        // split_to_patches
+        int width = refine_image->nx;
+        int height = refine_image->ny;
+        int grid_x = int(width / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
+            images.push_back(std::vector<clip_image_u8 *>());
+            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * refine_image->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j]   = refine_image->buf[i];
+                        patch->buf[j+1] = refine_image->buf[i+1];
+                        patch->buf[j+2] = refine_image->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+        clip_image_u8_free(refine_image);
+    }
+    return images;
+}
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const int max_slice_nums=9;
+    const int scale_resolution=448;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+    return best_grid.first;
+}
+
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+
+    if(clip_is_minicpmv(ctx)){
+        int max_slice_nums = 9;
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i){
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                if (imgs[i][j] != nullptr) {
+                    clip_image_u8_free(imgs[i][j]);
+                }
+            }
+        }
+        return true;
+    }
+    else if (ctx->has_qwen2vl_merger) {
+        clip_image_u8 * resized = clip_image_u8_init();
+        auto patch_size = clip_patch_size(ctx) * 2;
+        int nx = ceil((float)img->nx / patch_size) * patch_size;
+        int ny = ceil((float)img->ny / patch_size) * patch_size;
+        bicubic_resize(*img, *resized, nx, ny);
+
+        res_imgs->data = new clip_image_f32[1];
+        // clip_image_f32 * res = clip_image_f32_init();
+        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+        // res_imgs->data[0] = *res;
+        res_imgs->size = 1;
+
+        // clip_image_f32_free(res);
+        clip_image_u8_free(resized);
+        return true;
+    }
+
+    if (ctx->has_glm_projector) {
+        res_imgs->size = 1;
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        clip_image_u8 resized_image;
+        int32_t sz=ctx->vision_model.hparams.image_size;
+        bicubic_resize(*img, resized_image,sz,sz);
+        clip_image_f32 * res = clip_image_f32_init();
+        //clip_image_save_to_bmp(resized_image, "resized.bmp");
+        normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
+        res_imgs->data[0] = *res;
+        clip_image_f32_free(res);
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
     auto & params = ctx->vision_model.hparams;
@@ -1506,11 +2202,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         pad_to_square = false;
     }
     // free the previous res_imgs if any set
-    if (res_imgs.size > 0) {
+    if (res_imgs->size > 0) {
         clip_image_f32_batch_free(res_imgs);
     }
-    res_imgs.data = nullptr;
-    res_imgs.size = 0;
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1565,16 +2261,16 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
             patches.insert(patches.begin(), image_original_resize);
             // clip_image_f32_batch_init(patches.size());
-            res_imgs.size = patches.size();
-            res_imgs.data = new clip_image_f32[res_imgs.size];
+            res_imgs->size = patches.size();
+            res_imgs->data = new clip_image_f32[res_imgs->size];
             int num=0;
             for (auto& patch : patches) {
-                normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+                normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
                 num++;
             }
 
             for (size_t i = 0; i < patches.size(); i++) {
-                // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                 clip_image_u8_free(patches[i]);
             }
 
@@ -1657,9 +2353,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     // }
     // res_imgs.push_back(res);
 
-    res_imgs.size = 1;
-    res_imgs.data = new clip_image_f32[res_imgs.size];
-    res_imgs.data[0] = *res;
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
     clip_image_f32_free(res);
 
     return true;
@@ -1673,11 +2369,22 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 
+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
+    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+}
+
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+    clip_image_f32 img;
+    img.nx = img_w;
+    img.ny = img_h;
+    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_image_size(const struct clip_ctx * ctx) {
@@ -1701,20 +2408,128 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
+    clip_image_f32 img;
+    img.nx = ctx->vision_model.hparams.image_size;
+    img.ny = ctx->vision_model.hparams.image_size;
+    return clip_n_patches_by_img(ctx, &img);
+}
+
+int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+        n_patches = x_patch * y_patch;
     }
 
     return n_patches;
 }
 
+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -1726,7 +2541,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
 
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -1734,19 +2549,39 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }
+    if (ctx->has_glm_projector) {
+        GGML_ASSERT(batch_size == 1);
+        ggml_tensor * boi = ctx->vision_model.boi_w;
+        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
+        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
+    }
 
     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size    = hparams.image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
+        image_size_width  = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
+    }
     const int patch_size    = hparams.patch_size;
-    const int num_patches   = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + 1;
+    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    if(ctx->load_image_size==nullptr){
+        ctx->load_image_size= clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1755,7 +2590,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
-            GGML_ASSERT(nx == image_size && ny == image_size);
+            if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
+                GGML_ASSERT(nx == image_size && ny == image_size);
+            }
 
             const int n = nx * ny;
 
@@ -1772,63 +2609,144 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
-
-    {
-        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
-
-        void* zero_mem = malloc(ggml_nbytes(embeddings));
-        memset(zero_mem, 0, ggml_nbytes(embeddings));
-        ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-        free(zero_mem);
-    }
-
-    {
-        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-
-        int* positions_data = (int*)malloc(ggml_nbytes(positions));
-        for (int i = 0; i < num_positions; i++) {
-            positions_data[i] = i;
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            int bucket_coords_h[1024];
+            int bucket_coords_w[1024];
+            for (int i = 0; i < pos_h; i++){
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++){
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++){
+                for (int j = 0; j < pos_w; j++){
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
         }
-        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-        free(positions_data);
-    }
 
-    {
-        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-        int* patches_data = (int*)malloc(ggml_nbytes(patches));
-        for (int i = 0; i < num_patches; i++) {
-            patches_data[i] = i + 1;
+        {
+            // inspired from resampler of Qwen-VL:
+            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            int embed_dim = 4096;
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
+            else if (ctx->minicpmv_version == 4) {
+                embed_dim = 3584;
+            }
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i < pos_w * pos_h; ++i){
+                for(int j=0; j < embed_dim; ++j){
+                    pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
+        }
+    }
+    else{
+        {
+            if (ctx->has_class_embedding) {
+                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+
+                void* zero_mem = malloc(ggml_nbytes(embeddings));
+                memset(zero_mem, 0, ggml_nbytes(embeddings));
+                ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+                free(zero_mem);
+            }
+        }
+
+        if (ctx->has_qwen2vl_merger) {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            const int pw = image_size_width / patch_size;
+            const int ph = image_size_height / patch_size;
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+
+            int ptr = 0;
+            for (int y = 0; y < ph; y+=2)
+            {
+                for (int x = 0; x < pw; x+=2)
+                {
+                    for (int dy = 0; dy < 2; dy++) {
+                        for (int dx = 0; dx < 2; dx++) {
+                            positions_data[ptr]                 = y + dy;
+                            positions_data[num_patches + ptr]     = x + dx;
+                            positions_data[num_patches * 2 + ptr] = y + dy;
+                            positions_data[num_patches * 3 + ptr] = x + dx;
+                            ptr++;
+                        }
+                    }
+                }
+            }
+
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+        else {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = i;
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+
+            if (!ctx->has_glm_projector) {
+                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+                int* patches_data = (int*)malloc(ggml_nbytes(patches));
+                for (int i = 0; i < num_patches; i++) {
+                    patches_data[i] = i + 1;
+                }
+                ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+                free(patches_data);
+            }
         }
-        ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-        free(patches_data);
     }
 
     if (ggml_backend_is_cpu(ctx->backend)) {
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
 
+    if (ctx->has_glm_projector) {
+        //eoi
+        ggml_tensor * eoi = ctx->vision_model.eoi_w;
+        int offset = ggml_nelements(embeddings);
+        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
+    }
+
     return true;
 }
 
 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
-
     assert(itype < GGML_TYPE_COUNT);
-    type = static_cast<ggml_type>(itype);
+    ggml_type type = static_cast<ggml_type>(itype);
 
     auto * ctx_clip = clip_model_load(fname_inp, 2);
 
@@ -1862,7 +2780,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 
     std::vector<uint8_t> work(512);
     std::vector<float> conv_buf(512);
-    std::vector<int64_t> hist_all(1 << 4, 0);
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
@@ -1882,14 +2799,14 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             }
         }
 
-        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(cur) == 2);
+        // quantize only 2D tensors and bigger than block size
+        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
 
         if (quantize) {
             new_type = type;
             if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                 new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
             }
             const size_t n_elms = ggml_nelements(cur);
             float * f32_data;
@@ -1908,7 +2825,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 f32_data = (float *)conv_buf.data();
                 break;
             default:
-                printf("Please use an input file in f32 or f16\n");
+                LOG_ERR("Please use an input file in f32 or f16\n");
+                gguf_free(ctx_out);
                 return false;
             }
 
@@ -1917,48 +2835,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             }
             new_data = work.data();
 
-            std::vector<int64_t> hist_cur(1 << 4, 0);
-
-            switch (new_type) {
-                case GGML_TYPE_Q4_0: {
-                    new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q4_1: {
-                    new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q5_0: {
-                    new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q5_1: {
-                    new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q8_0: {
-                    new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q2_K: {
-                    new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q3_K: {
-                    new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q4_K: {
-                    new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q5_K: {
-                    new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q6_K: {
-                    new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                default: {
-                    fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
-                    return false;
-                }
-            }
-
-            for (size_t j = 0; j < hist_cur.size(); ++j) {
-                hist_all[j] += hist_cur[j];
-            }
+            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
         } else {
             new_type = cur->type;
             new_data = cur->data;
@@ -1968,14 +2845,15 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         total_size_org += orig_size;
         total_size_new += new_size;
         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
         fout.write((const char *)new_data, new_size);
         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
         for (size_t j = 0; j < pad; ++j) {
             fout.put(0);
         }
 
-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
     }
 
@@ -1991,19 +2869,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     gguf_free(ctx_out);
 
     {
-        printf("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
-
-        int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); ++i) {
-            sum_all += hist_all[i];
-        }
-
-        printf("%s: hist: ", __func__);
-        for (size_t i = 0; i < hist_all.size(); ++i) {
-            printf("%5.3f ", hist_all[i] / (float)sum_all);
-        }
-        printf("\n");
+        LOG_INF("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
     }
 
     return true;
@@ -2013,13 +2880,61 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
+        else if (ctx->minicpmv_version == 4) {
+            return 3584;
+        }
+    }
+    if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
+        return ctx->vision_model.mm_model_mlp_3_w->ne[1];
+    }
+    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        return ctx->vision_model.mm_1_b->ne[0];
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
+
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
+}
+
+bool clip_is_glm(const struct clip_ctx * ctx) {
+    return ctx->has_glm_projector;
+}
+bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
+    return ctx->has_qwen2vl_merger;
+}
+
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
+    clip_image_f32 clip_img;
+    clip_img.buf.resize(h * w * 3);
+    for (int i = 0; i < h*w*3; i++)
+    {
+        clip_img.buf[i] = img[i];
+    }
+    clip_img.nx = w;
+    clip_img.ny = h;
+    clip_image_encode(ctx, n_threads, &clip_img, vec);
+    return true;
+}
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index e5bd54924..841b4f6f9 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -18,14 +18,17 @@
 #    define CLIP_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct clip_ctx;
 
+struct clip_image_size {
+    int width;
+    int height;
+};
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -42,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
 
 CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -52,24 +56,30 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
 
+CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
+
+CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  & batch);
-CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
+CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
-/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 
@@ -78,6 +88,13 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
+CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+
+CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
+
+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert_image_encoder_to_gguf.py
similarity index 96%
rename from examples/llava/convert-image-encoder-to-gguf.py
rename to examples/llava/convert_image_encoder_to_gguf.py
index c69f89ac2..4fa1d6cea 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 import json
+import re
 
 import torch
 import numpy as np
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name
-
     if "mm_projector" in name:
-        return name.replace("model.mm_projector", "mm")
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
 
     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
 
@@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                 help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
@@ -182,6 +185,8 @@ else:
     fout.add_description("two-tower CLIP model")
 
 if has_text_encoder:
+    assert t_hparams is not None
+    assert tokens is not None
     # text_model hparams
     fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
     fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@@ -256,8 +261,8 @@ if has_vision_encoder:
 
 
     if processor is not None:
-        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
-        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std  # pyright: ignore[reportAttributeAccessIssue]
     else:
         image_mean = args.image_mean if args.image_mean is not None else default_image_mean
         image_std = args.image_std if args.image_std is not None else default_image_std
diff --git a/examples/llava/glmedge-convert-image-encoder-to-gguf.py b/examples/llava/glmedge-convert-image-encoder-to-gguf.py
new file mode 100644
index 000000000..848ef1cf3
--- /dev/null
+++ b/examples/llava/glmedge-convert-image-encoder-to-gguf.py
@@ -0,0 +1,280 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+from transformers import SiglipVisionModel, SiglipVisionConfig
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if name in (
+        "vision_model.head.probe",
+        "vision_model.head.attention.in_proj_weight",
+        "vision_model.head.attention.in_proj_bias",
+        "vision_model.head.attention.out_proj.weight",
+        "vision_model.head.attention.out_proj.bias",
+        "vision_model.head.layernorm.weight",
+        "vision_model.head.layernorm.bias",
+        "vision_model.head.mlp.fc1.weight",
+        "vision_model.head.mlp.fc1.bias",
+        "vision_model.head.mlp.fc2.weight",
+        "vision_model.head.mlp.fc2.bias"
+    ):
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
+    if "mm_projector" in name:
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
+
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+                help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+                help="The clip model is from openclip (for ViT-SO400M type))")
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# with proper
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+    print("--text-only and --image-only arguments cannot be specified at the same time.")
+    exit(1)
+
+if args.use_f32:
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    config = json.load(f)
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = None
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+vision_config = SiglipVisionConfig(**v_hparams)
+model = SiglipVisionModel(vision_config)
+model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip")))
+
+fname_middle = None
+has_text_encoder = False
+has_vision_encoder = True
+has_glm_projector = True
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.llava_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_glm_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_glm_projector", has_glm_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if has_glm_projector:
+    fout.add_description("image encoder for glm4v")
+    fout.add_string("clip.projector_type", "adapter")
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+    assert t_hparams is not None
+    assert tokens is not None
+    # text_model hparams
+    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+    fout.add_token_list(tokens)
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+    fout.add_uint32("clip.vision.projection_dim", 0)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"])
+
+    image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+    image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+fout.add_bool("clip.use_gelu", True)
+
+
+if has_glm_projector:
+    # model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
+    projector = torch.load(args.llava_projector)
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
+            data = data.squeeze().numpy().astype(np.float16)
+        else:
+            data = data.squeeze().numpy().astype(np.float32)
+        if name.startswith("vision."):
+            name=name.replace("vision.","")
+        fout.add_tensor(name, data)
+        print(f"Projector {name} - {data.dtype} - shape = {data.shape}")
+        # print(f"Projector {name} tensors added\n")
+
+state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            # print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            # print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            # print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    print(f"siglip {name} - {data.dtype} - shape = {data.shape}")
+    # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
diff --git a/examples/llava/glmedge-surgery.py b/examples/llava/glmedge-surgery.py
new file mode 100644
index 000000000..16bb915d0
--- /dev/null
+++ b/examples/llava/glmedge-surgery.py
@@ -0,0 +1,33 @@
+import argparse
+import os
+import torch
+from transformers import AutoModel
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to GLM model")
+args = ap.parse_args()
+
+# find the model part that includes the the multimodal projector weights
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
+checkpoint = model.state_dict()
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/glm.projector")
+
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/glm.clip")
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.")
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index e29da6cb2..40aa0876f 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,13 +1,16 @@
-#include "ggml.h"
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-
-#include "base64.hpp"
+#include "ggml.h"
 
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <vector>
 
 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@@ -17,8 +20,8 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
@@ -34,21 +37,25 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
 
-static const char * sample(struct llama_sampling_context * ctx_sampling,
+static const char * sample(struct common_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
+
+    const llama_model * model = llama_get_model(ctx_llama);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+    if (llama_vocab_is_eog(vocab, id)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_piece(ctx_llama, id);
+        ret = common_token_to_piece(ctx_llama, id);
     }
     eval_id(ctx_llama, id, n_past);
     return ret.c_str();
@@ -73,7 +80,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
         return NULL;
     }
 
@@ -87,7 +94,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
 
     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
     if (!embed) {
-        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
         return NULL;
     }
 
@@ -111,30 +118,31 @@ struct llava_context {
     struct llama_model * model = NULL;
 };
 
-static void show_additional_info(int /*argc*/, char ** argv) {
-    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    fprintf(stderr, "  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int, char ** argv) {
+    LOG("\n example usage:\n");
+    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
 
     // load and preprocess the image
     llava_image_embed * embed = NULL;
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
+            LOG_INF("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            LOG_ERR("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
         }
     }
@@ -142,11 +150,10 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     return embed;
 }
 
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
     std::string system_prompt, user_prompt;
     size_t image_pos = prompt.find("<image>");
@@ -154,18 +161,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        printf("system_prompt: %s\n", system_prompt.c_str());
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
-        printf("user_prompt: %s\n", user_prompt.c_str());
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -173,29 +180,34 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
 
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 
     // generate the response
 
-    fprintf(stderr, "\n");
+    LOG("\n");
+
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
         response += tmp;
         if (strcmp(tmp, "</s>") == 0) break;
         if (strstr(tmp, "###")) break; // Yi-VL behavior
-        printf("%s", tmp);
+        LOG("%s", tmp);
         if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
         if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
         if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -203,12 +215,25 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         fflush(stdout);
     }
 
-    llama_sampling_free(ctx_sampling);
-    printf("\n");
+    common_sampler_free(smpl);
+    LOG("\n");
 }
 
+static struct llama_model * llava_init(common_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
 
-static struct llava_context * llava_init(gpt_params * params) {
+    llama_model_params model_params = common_model_params_to_llama(*params);
+
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
     const char * clip_path = params->mmproj.c_str();
 
     auto prompt = params->prompt;
@@ -218,28 +243,17 @@ static struct llava_context * llava_init(gpt_params * params) {
 
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return NULL;
-    }
-
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return NULL;
     }
 
-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
@@ -254,42 +268,65 @@ static void llava_free(struct llava_context * ctx_llava) {
     }
 
     llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }
 
 int main(int argc, char ** argv) {
     ggml_time_init();
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
         return 1;
     }
+
+    common_init();
+
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
+        print_usage(argc, argv);
         return 1;
     }
 
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+    auto * model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }
 
-    auto image_embed = load_image(ctx_llava, &params);
-    if (!image_embed) {
-        return 1;
+    if (prompt_contains_image(params.prompt)) {
+        auto * ctx_llava = llava_init_context(&params, model);
+
+        auto * image_embed = load_image(ctx_llava, &params, "");
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+        llama_perf_context_print(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    } else {
+        for (auto & image : params.image) {
+            auto * ctx_llava = llava_init_context(&params, model);
+
+            auto * image_embed = load_image(ctx_llava, &params, image);
+            if (!image_embed) {
+                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
+                return 1;
+            }
+
+            // process the prompt
+            process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+            llama_perf_context_print(ctx_llava->ctx_llama);
+            llava_image_embed_free(image_embed);
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+        }
     }
 
-    // process the prompt
-    process_prompt(ctx_llava, image_embed, &params, params.prompt);
+    llama_model_free(model);
 
-    llama_print_timings(ctx_llava->ctx_llama);
-
-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
     return 0;
 }
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 980128166..300714045 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,13 +1,27 @@
 #include "clip.h"
-#include "common.h"
-#include "llama.h"
 #include "llava.h"
-#include "base64.hpp"
 
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <vector>
-#include <numeric>
+
+#if defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...)
+#   define LOG_WRN(...)
+#   define LOG_ERR(...)
+#   define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+#   define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#   define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -54,7 +68,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -88,7 +102,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
     struct {
-        struct ggml_tensor * newline;
         struct ggml_context * ctx;
     } model;
 
@@ -150,20 +163,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
-    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
-        if (newline_tmp->buffer == NULL) {
-            printf("newline_tmp tensor buffer is NULL\n");
-        }
-        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
-    } else {
-        model.newline->data = newline_tmp->data;
-        if (model.newline->data == NULL) {
-            printf("newline_tmp tensor data is NULL\n");
-        }
-    }
-
     struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
@@ -199,7 +198,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -217,14 +216,41 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }
 
+static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
+    int width = image->nx;
+    int height = image->ny;
+    int num_patches = (height / patch_size) * (width / patch_size);
+    clip_image_f32 * patch = clip_image_f32_init();
+    patch->nx = patch_size * num_patches;
+    patch->ny = patch_size;
+    patch->buf.resize(3 * patch->nx * patch->ny);
+
+    int patch_index = 0;
+
+    for (int i = 0; i < height; i += patch_size) {
+        for (int j = 0; j < width; j += patch_size) {
+            for (int pi = 0; pi < patch_size; ++pi) {
+                for (int pj = 0; pj < patch_size; ++pj) {
+                    int input_index = ((i + pi) * width + (j + pj)) * 3;
+                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+                    patch->buf[output_index] = image->buf[input_index];
+                    patch->buf[output_index+1] = image->buf[input_index+1];
+                    patch->buf[output_index+2] = image->buf[input_index+2];
+                }
+            }
+            patch_index++;
+        }
+    }
+    return patch;
+}
 
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
     clip_image_f32_batch img_res_v;
     img_res_v.size = 0;
     img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
-        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
     }
@@ -233,17 +259,84 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();
+
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            const int64_t t_img_enc_step_start_us = ggml_time_us();
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);
+
+            bool encoded = false;
+            if (clip_is_qwen2vl(ctx_clip)) {
+                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+            }
+            else {
+                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            }
+
+            if (!encoded) {
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        int n_img_pos_out = 0;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            std::memcpy(
+                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
+                image_embd_v[i],
+                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
+        }
+        *n_img_pos = n_img_pos_out;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
+    }
+    else if (clip_is_glm(ctx_clip)){
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        load_image_size->width = img_res_v.data[0].nx;
+        load_image_size->height = img_res_v.data[0].ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
+        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+        *n_img_pos = (pos * pos + 2);
+        if (!encoded){
+            LOG_ERR("Unable to encode image \n");
+            return false;
+        }
+    }
+    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
         delete[] img_res_v.data;
         if (!encoded) {
-            fprintf(stderr, "Unable to encode image\n");
+            LOG_ERR("Unable to encode image\n");
 
             return false;
         }
-    } else {
+    }
+    else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
@@ -252,12 +345,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
             if (!encoded) {
-                fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
             }
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
         const int32_t * image_grid = clip_image_grid(ctx_clip);
 
@@ -290,37 +383,50 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
     }
 
-    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
 
     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
 
-    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
 
     return true;
 }
 
 bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
         // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
         return false;
     }
     return true;
 }
 
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+    int num_max_patches = 6;
+    if (clip_is_minicpmv(ctx_clip)) {
+        num_max_patches = 10;
+    }
+    if (clip_is_glm(ctx_clip)) {
+        num_max_patches = 1;
+    }
+    float * image_embd;
+    if (clip_is_qwen2vl(ctx_clip)) {
+        // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
+        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
+    } else {
+        image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
+    }
     if (!image_embd) {
-        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+        LOG_ERR("Unable to allocate memory for image embeddings\n");
         return false;
     }
 
     int n_img_pos;
     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
+        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
@@ -330,17 +436,51 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }
 
+struct llava_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
-    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
+    int n_embd  = llama_model_n_embd(llama_get_model(ctx_llama));
 
     for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
         int n_eval = image_embed->n_image_pos - i;
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+        float * embd = image_embed->embed+i*n_embd;
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, llava_batch.batch)) {
+            LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
         *n_past += n_eval;
@@ -352,7 +492,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
-        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
     }
 
@@ -361,7 +501,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
+        LOG_ERR("%s: couldn't embed the image\n", __func__);
         return NULL;
     }
 
@@ -375,7 +515,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
     auto file = fopen(path, "rb");
     if (file == NULL) {
-        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
+        LOG_ERR("%s: can't read file %s\n", __func__, path);
         return false;
     }
 
@@ -385,7 +525,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
 
     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
     if (buffer == NULL) {
-        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
         perror("Memory allocation error");
         fclose(file);
         return false;
@@ -393,10 +533,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     errno = 0;
     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
     if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
+        LOG_ERR("read error: %s", strerror(errno));
+        free(buffer);
+        fclose(file);
+        return false;
     }
     if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
+        LOG_ERR("unexpectedly reached end of file");
+        free(buffer);
+        fclose(file);
+        return false;
     }
     fclose(file); // Close the file
 
@@ -410,7 +556,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
     long image_bytes_length;
     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
     if (!loaded) {
-        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
+        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
         return NULL;
     }
 
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index 2d40f3f1d..b6feb3027 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -17,28 +17,27 @@
 #    define LLAVA_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct clip_ctx;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
 };
 
 /** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
 
-LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 
 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava_surgery.py
similarity index 100%
rename from examples/llava/llava-surgery.py
rename to examples/llava/llava_surgery.py
diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava_surgery_v2.py
similarity index 94%
rename from examples/llava/llava-surgery-v2.py
rename to examples/llava/llava_surgery_v2.py
index eb56d6988..2d5b32fe6 100644
--- a/examples/llava/llava-surgery-v2.py
+++ b/examples/llava/llava_surgery_v2.py
@@ -2,7 +2,9 @@ import argparse
 import glob
 import os
 import torch
-from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
+from safetensors import safe_open
+from safetensors.torch import save_file
+from typing import Any, ContextManager, cast
 
 # Function to determine if file is a SafeTensor file
 def is_safetensor_file(file_path):
@@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
 def load_model(file_path):
     if is_safetensor_file(file_path):
         tensors = {}
-        with safe_open(file_path, framework="pt", device="cpu") as f:
+        with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
             for key in f.keys():
                 tensors[key] = f.get_tensor(key).clone()
                 # output shape
@@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
     if last_checkpoint is not None:
         for k, v in last_checkpoint.items():
             print(k)
-    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
+    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
     print("No tensors found. Is this a LLaVA model?")
     exit()
 
@@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
 # projector = {name: checkpoint.[name].float() for name in mm_tensors}
 projector = {}
 for name in mm_tensors:
+    assert last_checkpoint is not None
     projector[name] = last_checkpoint[name].float()
 for name in first_mm_tensors:
+    assert first_checkpoint is not None
     projector[name] = first_checkpoint[name].float()
 
 if len(projector) > 0:
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
new file mode 100644
index 000000000..53d902d61
--- /dev/null
+++ b/examples/llava/minicpmv-cli.cpp
@@ -0,0 +1,335 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <iostream> // TODO: remove me
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llama_model * llava_init(common_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = common_model_params_to_llama(*params);
+
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
+    if (params->n_ctx < 2048) {
+        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
+        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        ctx_params.n_ctx = 2048;
+    } else {
+        ctx_params.n_ctx = params->n_ctx;
+    }
+
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_model_free(ctx_llava->model);
+    llama_backend_free();
+}
+
+static struct clip_ctx * clip_init_context(common_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string              str2     = str;
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
+    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+
+    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    slice_embed->embed = image_embed;
+    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
+    llava_image_embed_free(slice_embed);
+}
+
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
+    std::string system_prompt;
+    int idx = 0;
+    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (has_minicpmv_projector == 2) {
+        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    }
+    else if (has_minicpmv_projector == 3) {
+        system_prompt = "<|im_start|>user\n";
+    }
+    else if (has_minicpmv_projector == 4) {
+        system_prompt = "<|im_start|>user\n";
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
+    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    if (num_image_embeds > 1) {
+        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+            for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                if (j == num_image_embeds_col - 1) {
+                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                }
+            }
+        }
+        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+}
+
+static const char * sample(struct common_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
+
+    const llama_model * model = llama_get_model(ctx_llama);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    static std::string ret;
+    if (llama_vocab_is_eog(vocab, id)) {
+        ret = "</s>";
+    } else {
+        ret = common_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    if (!embeds) {
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        return NULL;
+    }
+
+    // process the prompt
+    if (params->prompt.empty() && params->interactive == false) {
+        LOG_ERR("prompt should be given or interactive mode should be on");
+        return NULL;
+    }
+
+    auto * model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+    const int64_t t_llava_init_start_us = ggml_time_us();
+    auto * ctx_llava = llava_init_context(params, model);
+    ctx_llava->ctx_clip = ctx_clip;
+    const int64_t t_llava_init_end_us = ggml_time_us();
+    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+
+    const int64_t t_process_image_start_us = ggml_time_us();
+    process_image(ctx_llava, embeds, params, n_past);
+    const int64_t t_process_image_end_us = ggml_time_us();
+    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+
+    llava_image_embed_free(embeds);
+    return ctx_llava;
+}
+
+static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+    std::string user_prompt = prompt;
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (!is_first) {
+        if (has_minicpmv_projector == 2) {
+            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 3) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 4) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    if (has_minicpmv_projector == 2) {
+        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 3) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 4) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+
+    // generate the response
+
+    LOG_INF("\n");
+
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
+    return smpl;
+}
+
+static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
+
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+    return tmp;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.mmproj.empty() || (params.image.empty())) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    for (auto & image : params.image) {
+        int n_past = 0;
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+
+        if (!params.prompt.empty()) {
+            LOG("<user>%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+            std::string response;
+            bool have_tmp = false;
+            for (int i = 0; i < max_tgt_len; i++) {
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                response += tmp;
+                if (strcmp(tmp, "</s>") == 0){
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
+                }
+                if (strstr(tmp, "###")) break; // Yi-VL behavior
+                have_tmp = true;
+                printf("%s", tmp);
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+                fflush(stdout);
+            }
+            common_sampler_free(smpl);
+        }else {
+            while (true) {
+                LOG("<user>");
+                std::string prompt;
+                std::getline(std::cin, prompt);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                std::string response;
+                for (int i = 0; i < max_tgt_len; i++) {
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    response += tmp;
+                    if (strcmp(tmp, "</s>") == 0) break;
+                    printf("%s", tmp);// mistral llava-1.6
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    fflush(stdout);
+                }
+                common_sampler_free(smpl);
+            }
+        }
+        printf("\n");
+        llama_perf_context_print(ctx_llava->ctx_llama);
+
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+
+    return 0;
+}
diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
new file mode 100644
index 000000000..9b196757f
--- /dev/null
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -0,0 +1,815 @@
+# coding=utf-8
+# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Siglip model. """
+# Copied from  HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
+
+
+import os
+import math
+import warnings
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn.init import _calculate_fan_in_and_fan_out
+
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import (
+    logging,
+)
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+class SiglipVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
+    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    Example:
+    ```python
+    >>> from transformers import SiglipVisionConfig, SiglipVisionModel
+    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
+    >>> configuration = SiglipVisionConfig()
+    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
+    >>> model = SiglipVisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "siglip_vision_model"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=16,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+
+_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
+
+SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "google/siglip-base-patch16-224",
+    # See all SigLIP models at https://huggingface.co/models?filter=siglip
+]
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    if tensor.dtype in [torch.float16, torch.bfloat16]:
+        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+        og_dtype = tensor.dtype
+        tensor = tensor.to(torch.float32)
+        tensor.erfinv_()
+        tensor = tensor.to(og_dtype)
+    else:
+        tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    if tensor.dtype == torch.float16:
+        # The `clamp_` op is not (yet?) defined in float16+cpu
+        tensor = tensor.to(torch.float32)
+        tensor.clamp_(min=a, max=b)
+        tensor = tensor.to(torch.float16)
+    else:
+        tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+):
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsquently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+
+
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    denom = fan_in
+    if mode == "fan_in":
+        denom = fan_in
+    elif mode == "fan_out":
+        denom = fan_out
+    elif mode == "fan_avg":
+        denom = (fan_in + fan_out) / 2
+
+    variance = scale / denom
+
+    if distribution == "truncated_normal":
+        # constant is stddev of standard normal truncated to (-2, 2)
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+    elif distribution == "normal":
+        with torch.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)
+        with torch.no_grad():
+            tensor.uniform_(-bound, bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")
+
+
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+
+
+def default_flax_embed_init(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="normal")
+
+class SiglipVisionEmbeddings(nn.Module):
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+class SiglipAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
+class SiglipMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
+class SiglipEncoderLayer(nn.Module):
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.self_attn = (
+            SiglipAttention(config)
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+class SiglipPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = SiglipVisionConfig
+    base_model_prefix = "siglip"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+
+        if isinstance(module, SiglipVisionEmbeddings):
+            width = self.config.hidden_size
+            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+        elif isinstance(module, nn.Embedding):
+            default_flax_embed_init(module.weight)
+        elif isinstance(module, SiglipAttention):
+            nn.init.normal_(module.q_proj.weight)
+            nn.init.normal_(module.k_proj.weight)
+            nn.init.normal_(module.v_proj.weight)
+            nn.init.normal_(module.out_proj.weight)
+            nn.init.zeros_(module.q_proj.bias)
+            nn.init.zeros_(module.k_proj.bias)
+            nn.init.zeros_(module.v_proj.bias)
+            nn.init.zeros_(module.out_proj.bias)
+        elif isinstance(module, SiglipMLP):
+            nn.init.normal_(module.fc1.weight)
+            nn.init.normal_(module.fc2.weight)
+            nn.init.normal_(module.fc1.bias, std=1e-6)
+            nn.init.normal_(module.fc2.bias, std=1e-6)
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
+            lecun_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+SIGLIP_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+SIGLIP_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
+class SiglipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`SiglipEncoderLayer`].
+    Args:
+        config: SiglipConfig
+    """
+
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+class SiglipVisionTransformer(SiglipPreTrainedModel):
+    config_class = SiglipVisionConfig
+    main_input_name = "pixel_values"
+    _supports_flash_attn_2 = True
+
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+        self.encoder = SiglipEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.embeddings.patch_embedding
+
+import argparse
+import json
+import re
+
+import numpy as np
+from gguf import *
+from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def add_key_str(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_minicpmv and name in ["visual_projection.weight"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
+    if "mm_projector" in name:
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
+
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+                help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+                help="The clip model is from openclip (for ViT-SO400M type))")
+ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
+
+# with proper
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+    print("--text-only and --image-only arguments cannot be specified at the same time.")
+    exit(1)
+
+if args.use_f32:
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+# if args.clip_model_is_vision or args.clip_model_is_openclip:
+#     model = CLIPVisionModel.from_pretrained(dir_model)
+#     processor = None
+# else:
+#     model = CLIPModel.from_pretrained(dir_model)
+#     processor = CLIPProcessor.from_pretrained(dir_model)
+
+minicpmv_version = args.minicpmv_version
+emb_dim = 4096
+block_count = 26
+if minicpmv_version == 1:
+    emb_dim = 2304
+    block_count = 26
+elif minicpmv_version == 2:
+    emb_dim = 4096
+    block_count = 27
+elif minicpmv_version == 3:
+    emb_dim = 3584
+    block_count = 27
+elif minicpmv_version == 4:
+    emb_dim = 3584
+    block_count = 27
+
+default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "idefics2",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }
+
+vision_config = Idefics2VisionConfig(**default_vision_config)
+model = Idefics2VisionTransformer(vision_config)
+if minicpmv_version == 3:
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 4:
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+
+processor = None
+# if model.attn_pool is not None:
+#     model.attn_pool = torch.nn.Identity()
+
+# model.blocks = model.blocks[:-1]
+model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
+
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_minicpmv_projector = False
+
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.minicpmv_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_minicpmv_projector = True
+    minicpmv_version = 4
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
+fout.add_file_type(ftype)
+if args.text_only:
+    fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_minicpmv_projector:
+    fout.add_description("vision-only CLIP model")
+elif has_minicpmv_projector:
+    fout.add_description("image encoder for MiniCPM-V")
+    # add projector type
+    fout.add_string("clip.projector_type", "resampler")
+    fout.add_int32("clip.minicpmv_version", minicpmv_version)
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", 448)
+    fout.add_uint32("clip.vision.patch_size", 14)
+    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
+    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
+    fout.add_uint32("clip.vision.projection_dim", 0)
+    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
+    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
+
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+use_gelu = True
+fout.add_bool("clip.use_gelu", use_gelu)
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+
+
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+def _replace_name_resampler(s, v):
+    if re.match("resampler.pos_embed", s):
+        return {
+            s: v,
+            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
+        }
+    if re.match("resampler.proj", s):
+        return {
+            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
+            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
+        }
+    if re.match("resampler.attn.in_proj_.*", s):
+        return {
+            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
+            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
+            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
+        }
+    return {s: v}
+
+if has_minicpmv_projector:
+    projector = torch.load(args.minicpmv_projector)
+    new_state_dict = {}
+    for k, v in projector.items():
+        kvs = _replace_name_resampler(k, v)
+        for nk, nv in kvs.items():
+            new_state_dict[nk] = nv
+    projector = new_state_dict
+    ftype_cur = 0
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        data = data.squeeze().numpy()
+
+        n_dims = len(data.shape)
+        if ftype == 1:
+            if name[-7:] == ".weight" and n_dims == 2:
+                print("  Converting to float16")
+                data = data.astype(np.float16)
+                ftype_cur = 1
+            else:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+        else:
+            if data.dtype != np.float32:
+                print("  Converting to float32")
+                data = data.astype(np.float32)
+                ftype_cur = 0
+
+        fout.add_tensor(name, data)
+        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+
+    print("Projector tensors added\n")
+
+def _replace_name(s, v):
+    s = "vision_model." + s
+    if re.match("vision_model.embeddings.position_embedding", s):
+        v = v.unsqueeze(0)
+        return {s: v}
+
+    return {s: v}
+
+state_dict = model.state_dict()
+new_state_dict = {}
+for k, v in state_dict.items():
+    kvs = _replace_name(k, v)
+    for nk, nv in kvs.items():
+        new_state_dict[nk] = nv
+state_dict = new_state_dict
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py
new file mode 100644
index 000000000..ba8211658
--- /dev/null
+++ b/examples/llava/minicpmv-surgery.py
@@ -0,0 +1,45 @@
+import argparse
+import os
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
+args = ap.parse_args()
+
+# find the model part that includes the the multimodal projector weights
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
+checkpoint = model.state_dict()
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/minicpmv.projector")
+
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/minicpmv.clip")
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+config = model.llm.config
+config.auto_map = {
+    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
+    "AutoModel": "modeling_minicpm.MiniCPMModel",
+    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
+    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+}
+model.llm.save_pretrained(f"{args.model}/model")
+tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+tok.save_pretrained(f"{args.model}/model")
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
new file mode 100644
index 000000000..c87606b4f
--- /dev/null
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -0,0 +1,165 @@
+import argparse
+from typing import Dict
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import (
+    Qwen2VLForConditionalGeneration,
+    Qwen2VLProcessor,
+    AutoProcessor,
+    Qwen2VLConfig
+)
+
+
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def to_gguf_name(name: str) -> str:
+    og = name
+    name = name.replace("text_model", "t").replace("vision_model", "v")
+    name = name.replace("blocks", "blk").replace("embeddings.", "")
+    name = name.replace("attn.", "attn_")
+    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
+    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
+    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+    name = name.replace("merger.mlp", 'mm')
+    print(f"[to_gguf_name] {og} --> {name}")
+    return name
+
+
+def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
+    vision_model = qwen2vl.visual
+    tensor_map = {}
+    for name, ten in vision_model.state_dict().items():
+        ten = ten.numpy()
+        if 'qkv' in name:
+            if ten.ndim == 2: # weight
+                c3, _ = ten.shape
+            else:             # bias
+                c3 = ten.shape[0]
+            assert c3 % 3 == 0
+            c = c3 // 3
+            wq = ten[:c]
+            wk = ten[c: c * 2]
+            wv = ten[c * 2:]
+            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+        elif 'merger' in name:
+            if name.endswith("ln_q.weight"):
+                tensor_map['v.post_ln.weight'] = ten
+            elif name.endswith("ln_q.bias"):
+                tensor_map['v.post_ln.bias'] = ten
+            else:
+                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
+                tensor_map[to_gguf_name(name)] = ten
+        elif 'patch_embed.proj.weight' in name:
+            # NOTE: split Conv3D into Conv2Ds
+            c1, c2, kt, kh, kw = ten.shape
+            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
+            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+        else:
+            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+
+    for new_name, ten in tensor_map.items():
+        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+            tensor_map[new_name] = ten.astype(np.float32)
+        else:
+            tensor_map[new_name] = ten.astype(dtype)
+    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
+    return tensor_map
+
+
+def main(args):
+    if args.data_type == 'fp32':
+        dtype = torch.float32
+        np_dtype = np.float32
+        ftype = 0
+    elif args.data_type == 'fp16':
+        dtype = torch.float32
+        np_dtype = np.float16
+        ftype = 1
+    else:
+        raise ValueError()
+
+    local_model = False
+    model_path = ""
+    model_name = args.model_name
+    print("model_name: ", model_name)
+    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
+        model_name, torch_dtype=dtype, device_map="cpu"
+    )
+    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+    vcfg = cfg.vision_config
+
+    if os.path.isdir(model_name):
+        local_model = True
+        if model_name.endswith(os.sep):
+            model_name = model_name[:-1]
+        model_path = model_name
+        model_name = os.path.basename(model_name)
+    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
+
+    fout = GGUFWriter(path=fname_out, arch="clip")
+    fout.add_description("image encoder for Qwen2VL")
+
+    fout.add_file_type(ftype)
+    fout.add_bool("clip.has_text_encoder", False)
+    fout.add_bool("clip.has_vision_encoder", True)
+    fout.add_bool("clip.has_qwen2vl_merger", True)
+    fout.add_string("clip.projector_type", "qwen2vl_merger")
+
+    print(cfg.vision_config)
+    if 'silu' in cfg.vision_config.hidden_act.lower():
+        fout.add_bool("clip.use_silu", True)
+        fout.add_bool("clip.use_gelu", False)
+    elif 'gelu' in cfg.vision_config.hidden_act.lower():
+        fout.add_bool("clip.use_silu", False)
+        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
+    else:
+        raise ValueError()
+
+    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    for name, data in tensor_map.items():
+        fout.add_tensor(name, data)
+
+    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
+    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
+    fout.add_name(model_name)
+    """
+    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
+            it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
+    """
+
+    if local_model:
+        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path)
+    else:
+        processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
+    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
+    fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
+
+    fout.write_header_to_file()
+    fout.write_kv_data_to_file()
+    fout.write_tensors_to_file()
+    fout.close()
+    print("save model as: ", fname_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
new file mode 100644
index 000000000..132a7da54
--- /dev/null
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -0,0 +1,584 @@
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+#ifdef NDEBUG
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+
+
+static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
+                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
+    int n_embd  = llama_model_n_embd(llama_get_model(ctx_llama));
+    const int patch_size = 14 * 2;
+    const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
+    const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
+    auto img_tokens = image_embed->n_image_pos;
+    // llama_pos mrope_pos[img_tokens * 4];
+    std::vector<llama_pos> mrope_pos;
+    mrope_pos.resize(img_tokens * 4);
+
+    for (int y = 0; y < ph; y++)
+    {
+        for (int x = 0; x < pw; x++)
+        {
+            int i = y * pw + x;
+            mrope_pos[i] = *st_pos_id;
+            mrope_pos[i + img_tokens] = *st_pos_id + y;
+            mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
+            mrope_pos[i + img_tokens * 3] = 0;
+        }
+    }
+    *st_pos_id += std::max(pw, ph);
+
+    int processed = 0;
+    std::vector<llama_pos> batch_mrope_pos;
+    batch_mrope_pos.resize(img_tokens * 4);
+
+    for (int i = 0; i < img_tokens; i += n_batch) {
+        int n_eval = img_tokens - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+
+        // llama_pos batch_mrope_pos[n_eval * 4];
+        std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
+        memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
+        memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
+        memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
+        memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
+
+        llama_batch batch = {
+            int32_t(n_eval),                // n_tokens
+            nullptr,                        // token
+            (image_embed->embed+i*n_embd),  // embed
+            batch_mrope_pos.data(),         // pos
+            nullptr,  // n_seq_id
+            nullptr,  // seq_id
+            nullptr,  // logits
+        };
+
+        if (llama_decode(ctx_llama, batch)) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+        processed += n_eval;
+    }
+    return true;
+}
+
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
+    int N = (int) tokens.size();
+    std::vector<llama_pos> pos;
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        auto batch = llama_batch_get_one(&tokens[i], n_eval);
+        // TODO: add mrope pos ids somewhere else
+        pos.resize(batch.n_tokens * 4);
+        std::fill(pos.begin(), pos.end(), 0);
+        for (int j = 0; j < batch.n_tokens * 3; j ++) {
+            pos[j] = *st_pos_id + (j % batch.n_tokens);
+        }
+        batch.pos = pos.data();
+
+        if (llama_decode(ctx_llama, batch)) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+        *st_pos_id += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
+    std::string              str2     = str;
+    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
+    eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
+    return true;
+}
+
+static const char * sample(struct common_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past, int * st_pos_id) {
+    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+    common_sampler_accept(smpl, id, true);
+
+    const llama_model * model = llama_get_model(ctx_llama);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    static std::string ret;
+    if (llama_vocab_is_eog(vocab, id)) {
+        ret = "</s>";
+    } else {
+        ret = common_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past, st_pos_id);
+    return ret.c_str();
+}
+
+static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
+static const char* IMG_BASE64_TAG_END = "\">";
+
+static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
+    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
+    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
+}
+
+static bool prompt_contains_image(const std::string& prompt) {
+    size_t begin, end;
+    find_image_tag_in_prompt(prompt, begin, end);
+    return (begin != std::string::npos);
+}
+
+// replaces the base64 image tag in the prompt with `replacement`
+static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
+    size_t img_base64_str_start, img_base64_str_end;
+    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
+    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
+        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        return NULL;
+    }
+
+    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
+    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
+    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
+
+    auto required_bytes = base64::required_encode_size(base64_str.size());
+    auto img_bytes = std::vector<unsigned char>(required_bytes);
+    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
+
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
+    if (!embed) {
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
+        return NULL;
+    }
+
+    return embed;
+}
+
+static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
+    size_t begin, end;
+    find_image_tag_in_prompt(prompt, begin, end);
+    if (begin == std::string::npos || end == std::string::npos) {
+        return prompt;
+    }
+    auto pre = prompt.substr(0, begin);
+    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
+    return pre + replacement + post;
+}
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void print_usage(int, char ** argv) {
+    LOG("\n example usage:\n");
+    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
+
+    // load and preprocess the image
+    llava_image_embed * embed = NULL;
+    auto prompt = params->prompt;
+    if (prompt_contains_image(prompt)) {
+        if (!params->image.empty()) {
+            LOG_INF("using base64 encoded image instead of command line image path\n");
+        }
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
+        if (!embed) {
+            LOG_ERR("%s: can't load image from prompt\n", __func__);
+            return NULL;
+        }
+        params->prompt = remove_image_from_prompt(prompt);
+    } else {
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
+        if (!embed) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
+            return NULL;
+        }
+    }
+
+    return embed;
+}
+
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
+    int n_past = 0;
+    int cur_pos_id = 0;
+
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<|vision_start|>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
+        if (params->verbose_prompt) {
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            for (int i = 0; i < (int) tmp.size(); i++) {
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+            }
+        }
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
+        if (params->verbose_prompt) {
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            for (int i = 0; i < (int) tmp.size(); i++) {
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+            }
+        }
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
+        user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
+        if (params->verbose_prompt) {
+            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            for (int i = 0; i < (int) tmp.size(); i++) {
+                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+            }
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
+    if (image_embed != nullptr) {
+        auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
+        qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
+    }
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
+
+    // generate the response
+
+    LOG("\n");
+
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
+    std::string response = "";
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
+        response += tmp;
+        if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
+        LOG("%s", tmp);
+        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
+        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
+        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
+
+        fflush(stdout);
+    }
+
+    common_sampler_free(smpl);
+    LOG("\n");
+}
+
+static struct llama_model * llava_init(common_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = common_model_params_to_llama(*params);
+
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    llama_context_params ctx_params = common_context_params_to_llama(*params);
+    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
+
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->ctx_clip = ctx_clip;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_model_free(ctx_llava->model);
+    llama_backend_free();
+}
+
+#ifndef NDEBUG
+
+static void debug_test_mrope_2d() {
+    // 1. Initialize backend
+    ggml_backend_t backend = NULL;
+    std::string backend_name = "";
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    backend_name = "cuda";
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        backend = ggml_backend_cpu_init();
+        backend_name = "cpu";
+    }
+
+    // Calculate the size needed to allocate
+    size_t ctx_size = 0;
+    ctx_size += 2 * ggml_tensor_overhead(); // tensors
+    // no need to allocate anything else!
+
+    // 2. Allocate `ggml_context` to store tensor data
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
+    ggml_set_name(pos, "pos");
+    ggml_set_input(pos);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(128 * 12 * 30);
+    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
+    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    std::vector<int> pos_id;
+    pos_id.resize(30 * 4);
+    for (int i = 0; i < 30; i ++) {
+        pos_id[i] = i;
+        pos_id[i + 30] = i + 10;
+        pos_id[i + 60] = i + 20;
+        pos_id[i + 90] = i + 30;
+    }
+    int sections[4] = {32, 32, 0, 0};
+
+    // 4. Allocate a `ggml_backend_buffer` to store all tensors
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+    // 5. Copy tensor data from main memory (RAM) to backend buffer
+    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
+    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
+
+    // 6. Create a `ggml_cgraph` for mul_mat operation
+    struct ggml_cgraph * gf = NULL;
+    struct ggml_context * ctx_cgraph = NULL;
+
+    // create a temporally context to build the graph
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+    ctx_cgraph = ggml_init(params0);
+    gf = ggml_new_graph(ctx_cgraph);
+
+    struct ggml_tensor * result0 = ggml_rope_multi(
+        ctx_cgraph, inp_raw, pos, nullptr,
+        128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
+        0, 1, 32, 1);
+
+    // Add "result" tensor and all of its dependencies to the cgraph
+    ggml_build_forward_expand(gf, result0);
+
+    // 7. Create a `ggml_gallocr` for cgraph computation
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    // 9. Run the computation
+    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
+    if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_cpu_set_n_threads(backend, n_threads);
+    }
+    ggml_backend_graph_compute(backend, gf);
+
+    // 10. Retrieve results (output tensors)
+    // in this example, output tensor is always the last tensor in the graph
+    struct ggml_tensor * result = result0;
+    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
+    float * result_data = (float *)malloc(ggml_nbytes(result));
+    // because the tensor data is stored in device buffer, we need to copy it back to RAM
+    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
+    const std::string bin_file = "mrope_2d_" + backend_name +".bin";
+    std::ofstream outFile(bin_file, std::ios::binary);
+
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
+        outFile.close();
+        std::cout << "Data successfully written to " + bin_file << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+
+    free(result_data);
+    // 11. Free memory and exit
+    ggml_free(ctx_cgraph);
+    ggml_gallocr_free(allocr);
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buffer);
+    ggml_backend_free(backend);
+}
+
+static void debug_dump_img_embed(struct llava_context * ctx_llava) {
+    int n_embd  = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
+    int ne = n_embd * 4;
+    float vals[56 * 56 * 3];
+    // float embd[ne];
+    std::vector<float> embd;
+    embd.resize(ne);
+
+    for (int i = 0; i < 56*56; i++)
+    {
+        for (int c = 0; c < 3; c++)
+            vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
+    }
+
+    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
+
+    std::ofstream outFile("img_embed.bin", std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
+
+        outFile.close();
+        std::cout << "Data successfully written to mrope.bin" << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
+#endif
+
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    auto * model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
+        return 1;
+    }
+
+    if (prompt_contains_image(params.prompt)) {
+        auto * ctx_llava = llava_init_context(&params, model);
+
+        auto * image_embed = load_image(ctx_llava, &params, "");
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+        llama_perf_context_print(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+#ifndef NDEBUG
+    } else if (params.image[0].empty()) {
+        auto ctx_llava = llava_init_context(&params, model);
+
+        debug_test_mrope_2d();
+        debug_dump_img_embed(ctx_llava);
+
+        llama_perf_context_print(ctx_llava->ctx_llama);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+#endif
+    } else {
+        for (auto & image : params.image) {
+            auto * ctx_llava = llava_init_context(&params, model);
+
+            auto * image_embed = load_image(ctx_llava, &params, image);
+            if (!image_embed) {
+                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
+                return 1;
+            }
+
+            // process the prompt
+            process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+            llama_perf_context_print(ctx_llava->ctx_llama);
+            llava_image_embed_free(image_embed);
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+        }
+    }
+
+    llama_model_free(model);
+
+    return 0;
+}
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index f80f727a7..cbcbf26c9 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -1,3 +1,5 @@
--r ../../requirements/requirements-convert.txt
+-r ../../requirements/requirements-convert_legacy_llama.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
-torch~=2.1.1
+torch~=2.2.1
+torchvision~=0.17.1
diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt
index 8827e3f11..346861314 100644
--- a/examples/lookahead/CMakeLists.txt
+++ b/examples/lookahead/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET lookahead)
+set(TARGET llama-lookahead)
 add_executable(${TARGET} lookahead.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index e2551e7a4..2f0898e62 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -35,56 +37,51 @@ struct ngram_container {
 };
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
+    common_init();
+
     const int W = 15; // lookahead window
     const int N = 5;  // n-gram size
     const int G = 15; // max verification n-grams
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookahead", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
     all = inp;
 
     const int max_context_size     = llama_n_ctx(ctx);
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -94,8 +91,8 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
 
     for (int s = 1; s < W + G + 1; ++s) {
         llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
@@ -120,7 +117,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
@@ -152,7 +149,7 @@ int main(int argc, char ** argv) {
     }
 
     // here we keep adding new n-grams as we go
-    ngram_container ngrams_observed(llama_n_vocab(model), N, G);
+    ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
     // debug
     struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
@@ -161,14 +158,14 @@ int main(int argc, char ** argv) {
 
     // sample first token
     {
-        id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+        id = common_sampler_sample(smpl, ctx, 0);
 
-        llama_sampling_accept(ctx_sampling, ctx, id, true);
+        common_sampler_accept(smpl, id, true);
 
         {
-            const std::string token_str = llama_token_to_piece(ctx, id);
+            const std::string token_str = common_token_to_piece(ctx, id);
 
-            printf("%s", token_str.c_str());
+            LOG("%s", token_str.c_str());
             fflush(stdout);
         }
     }
@@ -177,7 +174,7 @@ int main(int argc, char ** argv) {
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            dump_kv_cache_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -206,10 +203,10 @@ int main(int argc, char ** argv) {
         //                                                      V  V  V  V  V  V
         //                                                             id
         {
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
             // current token - first token of the first level
-            llama_batch_add(batch, id, n_past, seq_id_all, true);
+            common_batch_add(batch, id, n_past, seq_id_all, true);
 
             // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
             {
@@ -234,7 +231,7 @@ int main(int argc, char ** argv) {
                         ngrams_cur[g].tokens [j + 1] = t;
                         ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
 
-                        llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+                        common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
                     }
                 }
             }
@@ -246,19 +243,19 @@ int main(int argc, char ** argv) {
                     seq_id_look[j] = i + j + 1;
                 }
 
-                llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+                common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
             }
 
             // fill the rest of the levels
             for (int j = 1; j < N - 1; j++) {
                 for (int i = 0; i < W; i++) {
-                    llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+                    common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
                 }
             }
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
             return 1;
         }
 
@@ -286,23 +283,23 @@ int main(int argc, char ** argv) {
             }
 
             // sample the next token
-            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+            id = common_sampler_sample(smpl, ctx, i_batch);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            common_sampler_accept(smpl, id, true);
 
             // print
             {
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
 
                 if (v == 0) {
-                    printf("%s", token_str.c_str());
+                    LOG("%s", token_str.c_str());
                 } else {
                     // print light cyan
-                    printf("\033[0;96m%s\033[0m", token_str.c_str());
+                    LOG("\033[0;96m%s\033[0m", token_str.c_str());
                 }
                 fflush(stdout);
 
-                if (id == llama_token_eos(model)) {
+                if (llama_vocab_is_eog(vocab, id)) {
                     has_eos = true;
                 }
 
@@ -332,21 +329,21 @@ int main(int argc, char ** argv) {
             // print known n-grams starting with token id (debug)
             if (0 && v == 0) {
                 if (ngrams_observed.cnt[id] > 0) {
-                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
                 }
 
                 for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-                    printf("   - ngram %2d: ", i);
+                    LOG("   - ngram %2d: ", i);
 
                     const int idx = id*(N - 1)*G + i*(N - 1);
 
                     for (int j = 0; j < N - 1; j++) {
-                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
+                        const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
-                        printf("%s", token_str.c_str());
+                        LOG("%s", token_str.c_str());
                     }
 
-                    printf("\n");
+                    LOG("\n");
                 }
             }
 
@@ -363,7 +360,7 @@ int main(int argc, char ** argv) {
                 if (v == 0) {
                     // sample from the last level
                     for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                     }
                 } else {
                     for (int i = 0; i < W; i++) {
@@ -457,32 +454,31 @@ int main(int argc, char ** argv) {
 
     auto t_dec_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    LOG("\n\n");
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
 
-    LOG_TEE("\n");
-    LOG_TEE("W = %2d\n", W);
-    LOG_TEE("N = %2d\n", N);
-    LOG_TEE("G = %2d\n", G);
-    LOG_TEE("\n");
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_accept  = %d\n", n_accept);
+    LOG_INF("\n");
+    LOG_INF("W = %2d\n", W);
+    LOG_INF("N = %2d\n", N);
+    LOG_INF("G = %2d\n", G);
+    LOG_INF("\n");
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_accept  = %d\n", n_accept);
 
-    llama_print_timings(ctx);
+    LOG_INF("\n");
+    common_perf_print(ctx, smpl);
+
+    common_sampler_free(smpl);
 
     llama_kv_cache_view_free(&kvc_view);
-    llama_sampling_free(ctx_sampling);
 
     llama_batch_free(batch);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt
index c060b8f56..fba78ceda 100644
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -1,5 +1,23 @@
-set(TARGET lookup)
+set(TARGET llama-lookup)
 add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-lookup-create)
+add_executable(${TARGET} lookup-create.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-lookup-merge)
+add_executable(${TARGET} lookup-merge.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-lookup-stats)
+add_executable(${TARGET} lookup-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/lookup/README.md b/examples/lookup/README.md
index 5bfb0de93..71c345c03 100644
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@@ -10,4 +10,3 @@ More info:
 
 https://github.com/ggerganov/llama.cpp/pull/4484
 https://github.com/ggerganov/llama.cpp/issues/4226
-
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
new file mode 100644
index 000000000..3da45ed9e
--- /dev/null
+++ b/examples/lookup/lookup-create.cpp
@@ -0,0 +1,40 @@
+#include "arg.h"
+#include "common.h"
+#include "ngram-cache.h"
+#include "llama.h"
+
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv){
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+        return 1;
+    }
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model_ptr & model = llama_init.model;
+    llama_context_ptr & ctx = llama_init.context;
+
+    GGML_ASSERT(model != nullptr);
+
+    // tokenize the prompt
+    std::vector<llama_token> inp;
+    inp = common_tokenize(ctx.get(), params.prompt, true, true);
+    fprintf(stderr, "%s: tokenization done\n", __func__);
+
+    common_ngram_cache ngram_cache;
+    common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
+
+    common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    return 0;
+}
diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp
new file mode 100644
index 000000000..6871c0f5f
--- /dev/null
+++ b/examples/lookup/lookup-merge.cpp
@@ -0,0 +1,47 @@
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "ngram-cache.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+static void print_usage(char* argv0) {
+    fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
+    fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
+}
+
+int main(int argc, char ** argv){
+    if (argc < 3) {
+        print_usage(argv[0]);
+        exit(1);
+    }
+
+    std::vector<std::string> args;
+    args.resize(argc-1);
+    for (int i = 0; i < argc-1; ++i) {
+        args[i] = argv[i+1];
+        if (args[i] == "-h" || args[i] == "--help") {
+            print_usage(argv[0]);
+            exit(0);
+        }
+    }
+
+    fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
+    common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
+
+    for (size_t i = 1; i < args.size()-1; ++i) {
+        fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
+        common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
+
+        common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
+    }
+
+    fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
+    common_ngram_cache_save(ngram_cache_merged, args.back());
+}
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
new file mode 100644
index 000000000..fcb289abe
--- /dev/null
+++ b/examples/lookup/lookup-stats.cpp
@@ -0,0 +1,157 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cinttypes>
+#include <fstream>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv){
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+        return 1;
+    }
+
+    common_init();
+
+    const int n_draft = params.speculative.n_max;
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_context_ptr & ctx = llama_init.context;
+
+    // tokenize the prompt
+    std::vector<llama_token> inp;
+    inp = common_tokenize(ctx.get(), params.prompt, true, true);
+
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
+
+    int64_t t_draft_flat_us = 0;
+    int64_t t_draft_us = 0;
+
+    {
+        const int64_t t_start_draft_us = ggml_time_us();
+
+        if (!params.lookup_cache_static.empty()) {
+            try {
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
+            } catch (std::ifstream::failure const &) {
+                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                exit(1);
+            }
+        }
+
+        if (!params.lookup_cache_dynamic.empty()) {
+            try {
+                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
+            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+        }
+
+        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
+    }
+
+    const int n_input = inp.size();
+    const int n_ctx = llama_n_ctx(ctx.get());
+
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    const int64_t t_start_ms = ggml_time_ms();
+
+    // Iterate over input tokens in chunks of size n_ctx.
+    // Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
+    for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
+        const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
+        std::vector<llama_token> pseudo_output;
+        pseudo_output.push_back(inp_slice[0]);
+
+        while ((int) pseudo_output.size() < n_ctx) {
+            // Simulate drafting and decoding from draft:
+            std::vector<llama_token> draft;
+            draft.push_back(pseudo_output.back());
+
+            {
+                const int64_t t_start_draft_us = ggml_time_us();
+                common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+                t_draft_us += ggml_time_us() - t_start_draft_us;
+            }
+
+            n_drafted += draft.size() - 1;
+
+            for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
+                const llama_token ground_truth = inp_slice[pseudo_output.size()];
+                const llama_token drafted = draft[j];
+
+                if (ground_truth != drafted) {
+                    break;
+                }
+
+                ++n_accept;
+                pseudo_output.push_back(ground_truth);
+
+                {
+                    const int64_t t_start_draft_us = ggml_time_us();
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+                    t_draft_us += ggml_time_us() - t_start_draft_us;
+                }
+            }
+
+            // After each simulated batch decoding simulate the sampling of a single token:
+            if ((int) pseudo_output.size() < n_ctx) {
+                pseudo_output.push_back(inp_slice[pseudo_output.size()]);
+                {
+                    const int64_t t_start_draft_us = ggml_time_us();
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+                    t_draft_us += ggml_time_us() - t_start_draft_us;
+                }
+            }
+
+            draft.erase(draft.begin());
+
+        }
+        if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
+            const int64_t t_now_ms = ggml_time_ms();
+            const int64_t eta_ms   = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
+            const int64_t eta_min  = eta_ms / (60*1000);
+            const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;
+
+            LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+        }
+
+        // After each chunk, update the dynamic ngram cache with the context ngram cache:
+        common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+        ngram_cache_context.clear();
+    }
+
+    LOG("\n");
+
+    LOG_INF("\n");
+    LOG_INF("n_draft      = %d\n", n_draft);
+    LOG_INF("n_predict    = %d\n", n_input - n_input % n_ctx);
+    LOG_INF("n_drafted    = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
+    LOG_INF("n_accept     = %d\n", n_accept);
+    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    llama_backend_free();
+
+    LOG("\n\n");
+
+    return 0;
+}
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index b53fae110..dbd0444ec 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -1,64 +1,88 @@
-#include "common.h"
+#include "arg.h"
 #include "ggml.h"
+#include "common.h"
+#include "ngram-cache.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <vector>
 
 int main(int argc, char ** argv){
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
-    // max/min n-grams size to search for in prompt
-    const int ngram_max = 4;
-    const int ngram_min = 1;
+    common_init();
 
-    // length of the candidate / draft sequence, if match is found
-    const int n_draft = params.n_draft;
+    // max. number of additional tokens to draft if match is found
+    const int n_draft = params.speculative.n_max;
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookup", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = common_tokenize(ctx, params.prompt, true, true);
+
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
+    int64_t t_draft_flat_us = 0;
+    int64_t t_draft_us = 0;
+
+    {
+        // Fill up context ngram cache with tokens from user input:
+        const int64_t t_start_draft_us = ggml_time_us();
+        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+
+        if (!params.lookup_cache_static.empty()) {
+            try {
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
+            } catch (std::ifstream::failure const &) {
+                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                exit(1);
+            }
+        }
+
+        if (!params.lookup_cache_dynamic.empty()) {
+            try {
+                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
+            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+        }
+
+        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
+    }
 
     const int max_context_size     = llama_n_ctx(ctx);
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -67,8 +91,8 @@ int main(int argc, char ** argv){
 
     const auto t_enc_start = ggml_time_us();
 
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
 
     const auto t_enc_end = ggml_time_us();
 
@@ -76,13 +100,11 @@ int main(int argc, char ** argv){
     int n_drafted = 0;
     int n_accept  = 0;
 
-    int64_t t_draft_us = 0;
-
     int n_past = inp.size();
 
     bool has_eos = false;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     std::vector<llama_token> draft;
 
@@ -97,26 +119,26 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            dump_kv_cache_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // print current draft sequence
-        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 
         int i_dft = 0;
         while (true) {
             // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+            llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            common_sampler_accept(smpl, id, true);
 
-            const std::string token_str = llama_token_to_piece(ctx, id);
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             if (!params.use_color) {
-                printf("%s", token_str.c_str());
+                LOG("%s", token_str.c_str());
             }
 
-            if (id == llama_token_eos(model)) {
+            if (llama_vocab_is_eog(vocab, id)) {
                 has_eos = true;
             }
 
@@ -124,31 +146,43 @@ int main(int argc, char ** argv){
 
             // check if the target token matches the draft
             if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                 ++n_accept;
                 ++n_past;
                 ++i_dft;
                 inp.push_back(id);
+                {
+                    // Update context ngram cache with the newly accepted token:
+                    const int64_t t_start_draft_us = ggml_time_us();
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                    t_draft_us += ggml_time_us() - t_start_draft_us;
+                }
 
                 if (params.use_color) {
                     // color accepted draft token
-                    printf("\033[34m%s\033[0m", token_str.c_str());
+                    LOG("\033[34m%s\033[0m", token_str.c_str());
                     fflush(stdout);
                 }
                 continue;
             }
 
             if (params.use_color) {
-                printf("%s", token_str.c_str());
+                LOG("%s", token_str.c_str());
             }
             fflush(stdout);
 
 
-            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
 
             draft.clear();
             draft.push_back(id);
             inp.push_back(id);
+            {
+                // Update context ngram cache with the newly accepted token:
+                const int64_t t_start_draft_us = ggml_time_us();
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                t_draft_us += ggml_time_us() - t_start_draft_us;
+            }
             break;
         }
 
@@ -160,47 +194,22 @@ int main(int argc, char ** argv){
         // clean the cache of draft tokens that weren't accepted
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-        llama_batch_clear(batch_tgt);
-        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
-
-        // generate n_pred tokens through prompt lookup
-        auto prompt_lookup = [&]() -> void {
-            const int inp_size = inp.size();
-            for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
-                const llama_token * ngram = &inp[inp_size - ngram_size];
-
-                for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
-                    bool match = true;
-                    for (int j = 0; j < ngram_size; ++j) {
-                        if (inp[i + j] != ngram[j]) {
-                            match = false;
-                            break;
-                        }
-                    }
-
-                    if (match) {
-                        const int startIdx = i + ngram_size;
-                        const int endIdx = startIdx + n_draft;
-                        if (endIdx < inp_size) {
-                            for (int j = startIdx; j < endIdx; ++j) {
-                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
-                                draft.push_back(inp[j]);
-                                llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
-                                ++n_drafted;
-                            }
-                            return;
-                        }
-                    }
-                }
-            }
-            return;
-        };
+        common_batch_clear(batch_tgt);
+        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
+        // Draft already contains a single token sampled from the model:
+        GGML_ASSERT(draft.size() == 1);
+        GGML_ASSERT(draft[0] == inp.back());
         const int64_t t_start_draft_us = ggml_time_us();
 
-        prompt_lookup();
+        common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+
+        for (size_t i = 1; i < draft.size(); ++i) {
+            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+        }
 
         t_draft_us += ggml_time_us() - t_start_draft_us;
+        n_drafted += draft.size() - 1;
 
         llama_decode(ctx, batch_tgt);
         ++n_past;
@@ -210,32 +219,35 @@ int main(int argc, char ** argv){
 
     auto t_dec_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    // Update dynamic ngram cache with context ngram cache and save it to disk:
+    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+    common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG("\n\n");
 
-    LOG_TEE("\n");
-    LOG_TEE("n_draft   = %d\n", n_draft);
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_drafted = %d\n", n_drafted);
-    LOG_TEE("t_draft   = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+
+    LOG_INF("\n");
+    LOG_INF("n_draft      = %d\n", n_draft);
+    LOG_INF("n_predict    = %d\n", n_predict);
+    LOG_INF("n_drafted    = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
             t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept  = %d\n", n_accept);
-    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_INF("n_accept     = %d\n", n_accept);
+    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx);
+    LOG_INF("\ntarget:\n\n");
+    common_perf_print(ctx, smpl);
+
+    common_sampler_free(smpl);
 
-    llama_sampling_free(ctx_sampling);
     llama_batch_free(batch_tgt);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt
deleted file mode 100644
index deb77d588..000000000
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-cmake_minimum_required(VERSION 3.12)
-project("main-cmake-pkg" C CXX)
-set(TARGET main-cmake-pkg)
-
-find_package(Llama 0.0.1 REQUIRED)
-
-# Bake common functionality in with target. Because applications
-# using the relocatable Llama package should be outside of the
-# source tree, main-cmake-pkg pretends the dependencies are built-in.
-set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
-add_library(common OBJECT)
-file(GLOB _common_files
-    "${_common_path}/*.h"
-    "${_common_path}/*.cpp"
-)
-target_sources(common PRIVATE ${_common_files})
-
-# If the common project was part of "main-cmake-pkg" the transient
-# defines would automatically be attached. Because the common func-
-# tionality is separate, but dependent upon the defines, it must be
-# explicitly extracted from the "llama" target.
-#
-get_target_property(_llama_transient_defines llama
-    INTERFACE_COMPILE_DEFINITIONS)
-
-target_compile_definitions(common PRIVATE "${_llama_transient_defines}")
-
-add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
-target_include_directories(${TARGET} PRIVATE ${_common_path})
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md
deleted file mode 100644
index 6d665f28f..000000000
--- a/examples/main-cmake-pkg/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# llama.cpp/example/main-cmake-pkg
-
-This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
-
-## Building
-
-Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
-
-### Considerations
-
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
-
-### Build llama.cpp and install to C:\LlamaCPP directory
-
-In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
-
-```cmd
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-mkdir build
-cd build
-cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
-cmake --build . --config Release
-cmake --install . --prefix C:/LlamaCPP
-```
-
-### Build main-cmake-pkg
-
-
-```cmd
-cd ..\examples\main-cmake-pkg
-mkdir build
-cd build
-cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
-cmake --build . --config Release
-cmake --install . --prefix C:/MyLlamaApp
-```
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index d532980b7..af3d9150f 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET main)
+set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/main/README.md b/examples/main/README.md
index 7f84e4262..ceaed42f6 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,6 +1,6 @@
-# llama.cpp/example/main
+# llama.cpp/examples/main
 
-This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
 
 ## Table of Contents
 
@@ -17,73 +17,71 @@ This example program allows you to use various LLaMA language models in an easy
 
 To get started right away, run the following command, making sure to use the correct path for the model you have:
 
-#### Unix-based systems (Linux, macOS, etc.):
+First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face.
+[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)
+
+Once downloaded, place your model in the models folder in llama.cpp.
+
+### Unix-based systems (Linux, macOS, etc.):
+
+##### Input prompt (One-and-done)
 
 ```bash
-./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
 ```
-
-#### Windows:
-
-```powershell
-main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
-```
-
-For an interactive experience, try this command:
-
-#### Unix-based systems (Linux, macOS, etc.):
+##### Conversation mode (Allow for continuous interaction with the model)
 
 ```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
-'User: Hi
-AI: Hello. I am an AI chatbot. Would you like to talk?
-User: Sure!
-AI: What would you like to talk about?
-User:'
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
 ```
 
-#### Windows:
-
-```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
-```
-
-The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
-
-#### Unix-based systems (Linux, macOS, etc.):
-
+##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
 ```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 ```
 
-#### Windows:
+### Windows:
+
+##### Input prompt (One-and-done)
+```powershell
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
+```
+##### Conversation mode (Allow for continuous interaction with the model)
 
 ```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
+```
+
+#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```powershell
+llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 ```
 
 ## Common Options
 
-In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
 
--   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
+-   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
--   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
--   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
+-   `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
+-   `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
+-   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 
 ## Input Prompts
 
-The `main` program provides several ways to interact with the LLaMA models using input prompts:
+The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts:
 
 -   `--prompt PROMPT`: Provide a prompt directly as a command-line option.
 -   `--file FNAME`: Provide a file containing a prompt or multiple prompts.
 -   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
--   `--random-prompt`: Start with a randomized prompt.
 
 ## Interaction
 
-The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
 
 In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
 
@@ -91,7 +89,7 @@ In interactive mode, users can participate in text generation by injecting their
 
 -   `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
 -   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
--   `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
+-   `-cnv,  --conversation`:  Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
 -   `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
 
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@@ -109,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
 The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
 
 ```sh
-./main -r "User:" --in-prefix " "
+./llama-cli -r "User:" --in-prefix " "
 ```
 
 ### In-Suffix
@@ -117,18 +115,15 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
 The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
 
 ```sh
-./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
 ```
+When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled
 
-### Instruction Mode
+### Chat templates
 
-Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
+ `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name.  Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
 
--   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
-
-Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
-
-By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+ Example usage: `--chat-template gemma`
 
 ## Context Management
 
@@ -136,13 +131,11 @@ During text generation, LLaMA models have a limited context size, which means th
 
 ### Context Size
 
-The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
-
--   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference.
 
 ### Extended Context Size
 
-Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
 
 -   `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
 
@@ -160,15 +153,17 @@ The following options allow you to control the text generation process and fine-
 
 ### Number of Tokens to Predict
 
--   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)
+-   `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled)
 
-The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
+The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
 
-A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in significant pause in output.
+A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.
 
 If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
 
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
+
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
 
 ### Temperature
 
@@ -176,21 +171,40 @@ It is important to note that the generated text may be shorter than the specifie
 
 Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 
-Example usage: `--temp 0.5`
+Example usage: `--temp 0`
 
 ### Repeat Penalty
 
--   `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+-   `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled).
 -   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
--   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
 
-The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.
 
 The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
 
-Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
+### DRY Repetition Penalty
 
-Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
+DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).
+
+- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled).
+- `--dry-base N`: Set the DRY sampling base value (default: 1.75).
+- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2).
+- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size).
+- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used.
+
+The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8.
+
+The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions.
+
+The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words.
+
+The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens.
+
+The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied.
+
+DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence.
+
+Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"`
 
 ### Top-K Sampling
 
@@ -208,22 +222,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
 
 Example usage: `--top-p 0.95`
 
-### Min P Sampling
+### Min-P Sampling
 
--   `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
+-   `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1).
 
 The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
 
 Example usage: `--min-p 0.05`
 
-### Tail Free Sampling (TFS)
-
--   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
-
-Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS.
-
-Example usage: `--tfs 0.95`
-
 ### Locally Typical Sampling
 
 -   `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
@@ -246,6 +252,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
 
 Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
 
+### XTC Sampling
+
+-   `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
+-   `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
+
+Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
+
+By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
+
+Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
+
+Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
+
 ### Logit Bias
 
 -   `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
@@ -285,29 +304,38 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 -   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
 -   `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
--   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
+-   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
 
  These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
-### Memory Float 32
-
--   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
-
 ### Batch Size
 
--   `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`.
+
+- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`.
 
 ### Prompt Caching
 
 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
 
-### Grammars
+### Grammars & JSON schemas
 
 -   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
 
+-   `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
+
 ### Quantization
 
-For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
+
+## LoRA (Low-Rank Adaptation) adapters
+
+-   `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters.
+-   `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can repeated to use multiple adapters.
+
+You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`.
+
+LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed.
 
 ## Additional Options
 
@@ -315,8 +343,7 @@ These options provide extra functionality and customization when running the LLa
 
 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
--   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
--   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
--   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
--   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
--   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+-   `--no-display-prompt`: Don't print prompt at generation.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
+-   `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable  or in an OS-specific local cache.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 34e84d0d4..e654d3542 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "log.h"
+#include "sampling.h"
 #include "llama.h"
+#include "chat-template.hpp"
 
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -31,110 +31,70 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
+
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-static bool is_interacting = false;
+static bool is_interacting  = false;
+static bool need_insert_eot = false;
 
-static bool file_exists(const std::string &path) {
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    LOG("\n");
+}
+
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }
 
-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
     return f.tellg() == 0;
 }
 
-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = get_sortable_timestamp();
-
-    const bool success = create_directory_with_parents(params.logdir);
-    if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: main\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    dump_string_yaml_multiline(logfile, "output", output.c_str());
-    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
-
-    llama_dump_timing_info_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting && g_params->interactive) {
-            is_interacting = true;
+            is_interacting  = true;
+            need_insert_eot = true;
         } else {
             console::cleanup();
-            printf("\n");
-            llama_print_timings(*g_ctx);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+            LOG("\n");
+            common_perf_print(*g_ctx, *g_smpl);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            common_log_pause(common_log_main());
+
             _exit(130);
         }
     }
 }
 #endif
 
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     g_params = &params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
-    llama_sampling_params & sparams = params.sparams;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("main", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+    common_init();
 
-    // TODO: Dump params ?
-    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+    auto & sparams = params.sampling;
 
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
@@ -142,155 +102,207 @@ int main(int argc, char ** argv) {
     atexit([]() { console::cleanup(); });
 
     if (params.logits_all) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
+        LOG_ERR("************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
 
     if (params.rope_freq_base != 0.0) {
-        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
     }
 
     if (params.rope_freq_scale != 0.0) {
-        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    LOG_INF("%s: llama backend init\n", __func__);
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    common_sampler * smpl = nullptr;
+
     g_model = &model;
     g_ctx = &ctx;
+    g_smpl = &smpl;
+
+    std::vector<common_chat_msg> chat_msgs;
 
     // load the model and apply lora adapter, if any
-    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (sparams.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    common_init_result llama_init = common_init_from_params(params);
+
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
+
+    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
+    struct ggml_threadpool_params tpp_batch =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    struct ggml_threadpool_params tpp =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+    set_process_priority(params.cpuparams.priority);
+
+    struct ggml_threadpool * threadpool_batch = NULL;
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
+        if (!threadpool_batch) {
+            LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            return 1;
+        }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
+    if (!threadpool) {
+        LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        return 1;
+    }
+
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
 
     if (n_ctx > n_ctx_train) {
-        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
+    }
+
+    // auto enable conversation mode if chat template is available
+    const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
+    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
+        if (has_chat_template) {
+            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+        } else {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        }
+    }
+
+    // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
+    if (params.conversation_mode && !has_chat_template) {
+        LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
+    }
+
+    // print chat template example in conversation mode
+    if (params.conversation_mode) {
+        if (params.enable_chat_template) {
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
+        } else {
+            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+        }
     }
 
     // print system information
     {
-        LOG_TEE("\n");
-        LOG_TEE("%s\n", get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
 
     if (!path_session.empty()) {
-        LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+        LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
         if (!file_exists(path_session)) {
-            LOG_TEE("%s: session file does not exist, will create.\n", __func__);
+            LOG_INF("%s: session file does not exist, will create.\n", __func__);
         } else if (file_is_empty(path_session)) {
-            LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
+            LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
         } else {
             // The file exists and is not empty
             session_tokens.resize(n_ctx);
             size_t n_token_count_out = 0;
-            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
-                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+                LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
-            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+            LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
         }
     }
 
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos: %d\n", add_bos);
+    const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja;
+    if (!llama_model_has_encoder(model)) {
+        GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+    }
+
+    LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
 
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
-        if (params.chatml) {
-            params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
-        }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-    } else {
-        LOG("use session tokens\n");
-        embd_inp = session_tokens;
-    }
+    auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
+        common_chat_msg new_msg{role, content, {}};
+        auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
+        chat_msgs.push_back({role, content, {}});
+        LOG_DBG("formatted: '%s'\n", formatted.c_str());
+        return formatted;
+    };
 
-    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    {
+        auto prompt = (params.conversation_mode && params.enable_chat_template)
+            // format the system prompt in conversation mode (fallback to default if empty)
+            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            // otherwise use the prompt as is
+            : params.prompt;
+        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+            LOG_DBG("tokenize the prompt\n");
+            embd_inp = common_tokenize(ctx, prompt, true, true);
+        } else {
+            LOG_DBG("use session tokens\n");
+            embd_inp = session_tokens;
+        }
+
+        LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
+    }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(model));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        if (add_bos) {
+            embd_inp.push_back(llama_vocab_bos(vocab));
+            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
+        } else {
+            LOG_ERR("input is empty\n");
+            return -1;
+        }
     }
 
     // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
-    }
-
     if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
 
@@ -304,63 +316,41 @@ int main(int argc, char ** argv) {
             n_matching_session_tokens++;
         }
         if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
-            LOG_TEE("%s: using full prompt from session file\n", __func__);
+            LOG_INF("%s: using full prompt from session file\n", __func__);
         } else if (n_matching_session_tokens >= embd_inp.size()) {
-            LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
+            LOG_INF("%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
+            LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
         } else {
-            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
+            LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
         }
 
         // remove any "future" tokens that we might have inherited from the previous session
         llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
     }
 
-    LOGLN(
-            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
-            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+         embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
 
     // if we will use the cache for the full prompt without reaching the end of the cache, force
-    // reevaluation of the last token token to recalculate the cached logits
+    // reevaluation of the last token to recalculate the cached logits
     if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
-        LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
+        LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
 
         session_tokens.resize(embd_inp.size() - 1);
     }
 
     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
         params.n_keep = (int)embd_inp.size();
     } else {
         params.n_keep += add_bos; // always keep the BOS token
     }
 
-    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false,   true);
-
-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
-    // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
-
-    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
-    LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
-
-    // in instruct mode, we inject a prefix and a suffix to each input by the user
-    if (params.instruct) {
+    if (params.conversation_mode) {
         params.interactive_first = true;
-        params.antiprompt.emplace_back("### Instruction:\n\n");
-    }
-    // similar for chatml mode
-    else if (params.chatml) {
-        params.interactive_first = true;
-        params.antiprompt.emplace_back("<|im_start|>user\n");
     }
 
     // enable interactive mode if interactive start is specified
@@ -369,30 +359,20 @@ int main(int argc, char ** argv) {
     }
 
     if (params.verbose_prompt) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
-        }
-
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > add_bos) {
-            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+            LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-            LOG_TEE("'\n");
+            LOG_CNT("'\n");
         }
-        LOG_TEE("\n");
+        LOG_INF("\n");
     }
 
     // ctrl+C handling
@@ -412,47 +392,56 @@ int main(int argc, char ** argv) {
     }
 
     if (params.interactive) {
-        LOG_TEE("%s: interactive mode on.\n", __func__);
+        LOG_INF("%s: interactive mode on.\n", __func__);
 
         if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
-                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+                LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
-                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    auto tmp = common_tokenize(ctx, antiprompt, false, true);
                     for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                     }
                 }
             }
         }
 
         if (params.input_prefix_bos) {
-            LOG_TEE("Input prefix with BOS\n");
+            LOG_INF("Input prefix with BOS\n");
         }
 
         if (!params.input_prefix.empty()) {
-            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
 
         if (!params.input_suffix.empty()) {
-            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
     }
-    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+    smpl = common_sampler_init(model, sparams);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        return 1;
+    }
+
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
     // group-attention state
     // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -466,25 +455,29 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(ga_w % ga_n == 0            && "grp_attn_w must be a multiple of grp_attn_n");     // NOLINT
       //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of grp_attn_w");    // NOLINT
       //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+        LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
     }
-    LOG_TEE("\n\n");
+    LOG_INF("\n");
 
     if (params.interactive) {
-        const char *control_message;
+        const char * control_message;
         if (params.multiline_input) {
-            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+            control_message = " - To return control to the AI, end your input with '\\'.\n"
                               " - To return control without starting a new line, end your input with '/'.\n";
         } else {
-            control_message = " - Press Return to return control to LLaMa.\n"
+            control_message = " - Press Return to return control to the AI.\n"
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
-        LOG_TEE("== Running in interactive mode. ==\n");
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_TEE(       "%s\n", control_message);
+        LOG_INF(       "%s", control_message);
+        if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
+            LOG_INF(   " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+        }
+        LOG_INF("\n");
 
         is_interacting = params.interactive_first;
     }
@@ -498,20 +491,45 @@ int main(int argc, char ** argv) {
     int n_remain           = params.n_predict;
     int n_consumed         = 0;
     int n_session_consumed = 0;
-    int n_past_guidance    = 0;
 
     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss;     g_output_ss     = &output_ss;
+    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
     display = params.display_prompt;
 
     std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    // single-token antiprompts
+    std::vector<llama_token> antiprompt_token;
+
+    for (const std::string & antiprompt : params.antiprompt) {
+        auto ids = ::common_tokenize(ctx, antiprompt, false, true);
+        if (ids.size() == 1) {
+            antiprompt_token.push_back(ids[0]);
+        }
+    }
+
+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+            decoder_start_token_id = llama_vocab_bos(vocab);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
@@ -526,9 +544,8 @@ int main(int argc, char ** argv) {
                 embd.resize(max_embd_size);
 
                 console::set_display(console::error);
-                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                 console::set_display(console::reset);
-                fflush(stdout);
             }
 
             if (ga_n == 1) {
@@ -536,16 +553,22 @@ int main(int argc, char ** argv) {
                 // if we run out of context:
                 // - take the n_keep first tokens from the original prompt (via n_past)
                 // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+
+                if (n_past + (int) embd.size() >= n_ctx) {
+                    if (!params.ctx_shift){
+                        LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+                        break;
+                    }
+
                     if (params.n_predict == -2) {
-                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                        LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                         break;
                     }
 
                     const int n_left    = n_past - params.n_keep;
                     const int n_discard = n_left/2;
 
-                    LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                    LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                             n_past, n_left, n_ctx, params.n_keep, n_discard);
 
                     llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
@@ -553,15 +576,11 @@ int main(int argc, char ** argv) {
 
                     n_past -= n_discard;
 
-                    if (ctx_guidance) {
-                        n_past_guidance -= n_discard;
-                    }
+                    LOG_DBG("after swap: n_past = %d\n", n_past);
 
-                    LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+                    LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
-                    LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
-
-                    LOG("clear session path\n");
+                    LOG_DBG("clear session path\n");
                     path_session.clear();
                 }
             } else {
@@ -571,10 +590,10 @@ int main(int argc, char ** argv) {
                     const int bd = (ga_w/ga_n)*(ga_n - 1);
                     const int dd = (ga_w/ga_n) - ib*bd - ga_w;
 
-                    LOG("\n");
-                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
-                    LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
-                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+                    LOG_DBG("\n");
+                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+                    LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
 
                     llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
                     llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
@@ -584,7 +603,7 @@ int main(int argc, char ** argv) {
 
                     ga_i += ga_w/ga_n;
 
-                    LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+                    LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
                 }
             }
 
@@ -610,65 +629,25 @@ int main(int argc, char ** argv) {
                 }
             }
 
-            // evaluate tokens in batches
-            // embd is typically prepared beforehand to fit within a batch, but not always
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token * input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf  = embd_guidance.data();
-                    input_size = embd_guidance.size();
-
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-                } else {
-                    input_buf  = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                        LOG_TEE("%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
                     n_eval = params.n_batch;
                 }
 
-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+                    LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }
 
                 n_past += n_eval;
 
-                LOG("n_past = %d\n", n_past);
+                LOG_DBG("n_past = %d\n", n_past);
                 // Display total tokens alongside total time
                 if (params.n_print > 0 && n_past % params.n_print == 0) {
-                    LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+                    LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
                 }
             }
 
@@ -679,22 +658,21 @@ int main(int argc, char ** argv) {
         }
 
         embd.clear();
-        embd_guidance.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // optionally save the session on first sample (for faster prompt loading next time)
             if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                 need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 
-                LOG("saved session to %s\n", path_session.c_str());
+                LOG_DBG("saved session to %s\n", path_session.c_str());
             }
 
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            common_sampler_accept(smpl, id, /* accept_grammar= */ true);
 
-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
             embd.push_back(id);
 
@@ -704,16 +682,16 @@ int main(int argc, char ** argv) {
             // decrement remaining sampling budget
             --n_remain;
 
-            LOG("n_remain: %d\n", n_remain);
+            LOG_DBG("n_remain: %d\n", n_remain);
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);
 
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
 
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -725,18 +703,24 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
-                printf("%s", token_str.c_str());
+                const std::string token_str = common_token_to_piece(ctx, id, params.special);
 
+                // Console/Stream Output
+                LOG("%s", token_str.c_str());
+
+                // Record Displayed Tokens To Log
+                // Note: Generated tokens are created one by one hence this check
                 if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                     input_tokens.push_back(id);
                 } else {
+                    // Outgoing Generated Tokens
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }
             }
-            fflush(stdout);
         }
+
         // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);
@@ -748,7 +732,7 @@ int main(int argc, char ** argv) {
             // check for reverse prompt in the last n_prev tokens
             if (!params.antiprompt.empty()) {
                 const int n_prev = 32;
-                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+                const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
 
                 is_antiprompt = false;
                 // Check if each of the reverse prompts appears at the end of the output.
@@ -769,46 +753,62 @@ int main(int argc, char ** argv) {
                     }
                 }
 
+                // check for reverse prompt using special tokens
+                llama_token last_token = common_sampler_last(smpl);
+                if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
+                    if (params.interactive) {
+                        is_interacting = true;
+                    }
+                    is_antiprompt = true;
+                }
+
                 if (is_antiprompt) {
-                    LOG("found antiprompt: %s\n", last_output.c_str());
+                    LOG_DBG("found antiprompt: %s\n", last_output.c_str());
                 }
             }
 
-            // deal with end of text token in interactive mode
-            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
-                LOG("found EOS token\n");
+            // deal with end of generation tokens in interactive mode
+            if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+                LOG_DBG("found an EOG token\n");
 
                 if (params.interactive) {
                     if (!params.antiprompt.empty()) {
                         // tokenize and inject first reverse prompt
-                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+                        const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
                         embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                         is_antiprompt = true;
                     }
 
+                    if (params.enable_chat_template) {
+                        chat_add_and_format("assistant", assistant_ss.str());
+                    }
                     is_interacting = true;
-                    printf("\n");
-                } else if (params.instruct || params.chatml) {
-                    is_interacting = true;
+                    LOG("\n");
                 }
             }
 
-            if (n_past > 0 && is_interacting) {
-                LOG("waiting for user input\n");
+            // if current token is not EOG, we add it to current assistant message
+            if (params.conversation_mode) {
+                const auto id = common_sampler_last(smpl);
+                assistant_ss << common_token_to_piece(ctx, id, false);
+            }
 
-                if (params.instruct || params.chatml) {
-                    printf("\n> ");
+            if (n_past > 0 && is_interacting) {
+                LOG_DBG("waiting for user input\n");
+
+                if (params.conversation_mode) {
+                    LOG("\n> ");
                 }
 
                 if (params.input_prefix_bos) {
-                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(model));
+                    LOG_DBG("adding input prefix BOS token\n");
+                    embd_inp.push_back(llama_vocab_bos(vocab));
                 }
 
                 std::string buffer;
-                if (!params.input_prefix.empty()) {
-                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    printf("%s", params.input_prefix.c_str());
+                if (!params.input_prefix.empty() && !params.conversation_mode) {
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    LOG("%s", params.input_prefix.c_str());
                 }
 
                 // color user input only
@@ -830,61 +830,54 @@ int main(int argc, char ** argv) {
                 // Entering a empty line lets the user pass control back
                 if (buffer.length() > 1) {
                     // append input suffix if any
-                    if (!params.input_suffix.empty()) {
-                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                        printf("%s", params.input_suffix.c_str());
+                    if (!params.input_suffix.empty() && !params.conversation_mode) {
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        LOG("%s", params.input_suffix.c_str());
                     }
 
-                    LOG("buffer: '%s'\n", buffer.c_str());
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
 
                     const size_t original_size = embd_inp.size();
 
-                    // instruct mode: insert instruction prefix
-                    if (params.instruct && !is_antiprompt) {
-                        LOG("inserting instruction prefix\n");
-                        n_consumed = embd_inp.size();
-                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
-                    }
-                    // chatml mode: insert user chat prefix
-                    if (params.chatml && !is_antiprompt) {
-                        LOG("inserting chatml prefix\n");
-                        n_consumed = embd_inp.size();
-                        embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
-                    }
                     if (params.escape) {
-                        process_escapes(buffer);
+                        string_process_escapes(buffer);
                     }
 
-                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
-                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    bool format_chat = params.conversation_mode && params.enable_chat_template;
+                    std::string user_inp = format_chat
+                        ? chat_add_and_format("user", std::move(buffer))
+                        : std::move(buffer);
+                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
+                    const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+                    const auto line_inp = common_tokenize(ctx, user_inp,            false, format_chat);
+                    const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
+
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
+
+                    // if user stop generation mid-way, we must add EOT to finish model's last response
+                    if (need_insert_eot && format_chat) {
+                        llama_token eot = llama_vocab_eot(vocab);
+                        embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
+                        need_insert_eot = false;
+                    }
 
                     embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                     embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
 
-                    // instruct mode: insert response suffix
-                    if (params.instruct) {
-                        LOG("inserting instruction suffix\n");
-                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                    }
-                    // chatml mode: insert assistant chat suffix
-                    if (params.chatml) {
-                        LOG("inserting chatml suffix\n");
-                        embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
-                    }
-
                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
                         const llama_token token = embd_inp[i];
                         output_tokens.push_back(token);
-                        output_ss << llama_token_to_piece(ctx, token);
+                        output_ss << common_token_to_piece(ctx, token);
                     }
 
+                    // reset assistant message
+                    assistant_ss.str("");
+
                     n_remain -= line_inp.size();
-                    LOG("n_remain: %d\n", n_remain);
+                    LOG_DBG("n_remain: %d\n", n_remain);
                 } else {
-                    LOG("empty line, passing control back\n");
+                    LOG_DBG("empty line, passing control back\n");
                 }
 
                 input_echo = false; // do not echo this again
@@ -892,15 +885,15 @@ int main(int argc, char ** argv) {
 
             if (n_past > 0) {
                 if (is_interacting) {
-                    llama_sampling_reset(ctx_sampling);
+                    common_sampler_reset(smpl);
                 }
                 is_interacting = false;
             }
         }
 
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
-            LOG_TEE(" [end of text]\n");
+        // end of generation
+        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
+            LOG(" [end of text]\n");
             break;
         }
 
@@ -913,23 +906,19 @@ int main(int argc, char ** argv) {
     }
 
     if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
-        LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
-        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+        LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-    llama_print_timings(ctx);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    LOG("\n\n");
+    common_perf_print(ctx, smpl);
 
-    if (ctx_guidance) { llama_free(ctx_guidance); }
-    llama_free(ctx);
-    llama_free_model(model);
+    common_sampler_free(smpl);
 
-    llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
-#ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
 
     return 0;
 }
diff --git a/examples/make-ggml.py b/examples/make-ggml.py
deleted file mode 100755
index c73485ebf..000000000
--- a/examples/make-ggml.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
-
-Usage:
-python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
-
-Arguments:
-- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
-- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
-- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
-- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
-- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
-- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
-
-Old quant types (some base model types require these):
-- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
-- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
-- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
-- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
-
-New quant types (recommended):
-- Q2_K: smallest, extreme quality loss - not recommended
-- Q3_K: alias for Q3_K_M
-- Q3_K_S: very small, very high quality loss
-- Q3_K_M: very small, very high quality loss
-- Q3_K_L: small, substantial quality loss
-- Q4_K: alias for Q4_K_M
-- Q4_K_S: small, significant quality loss
-- Q4_K_M: medium, balanced quality - recommended
-- Q5_K: alias for Q5_K_M
-- Q5_K_S: large, low quality loss - recommended
-- Q5_K_M: large, very low quality loss - recommended
-- Q6_K: very large, extremely low quality loss
-- Q8_0: very large, extremely low quality loss - not recommended
-- F16: extremely large, virtually no quality loss - not recommended
-- F32: absolutely huge, lossless - not recommended
-"""
-import subprocess
-subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
-
-import argparse
-import os
-from huggingface_hub import snapshot_download
-
-def main(model, model_type, outname, outdir, quants, keep_fp16):
-    if not os.path.isdir(model):
-        print(f"Model not found at {model}. Downloading...")
-        try:
-            if outname is None:
-                outname = model.split('/')[-1]
-            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
-        except Exception as e:
-            raise Exception(f"Could not download the model: {e}")
-
-    if outdir is None:
-        outdir = f'../models/{outname}'
-
-    if not os.path.isfile(f"{model}/config.json"):
-        raise Exception(f"Could not find config.json in {model}")
-
-    os.makedirs(outdir, exist_ok=True)
-
-    print("Building llama.cpp")
-    subprocess.run(f"cd .. && make quantize", shell=True, check=True)
-
-    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
-
-    print(f"Making unquantised GGUF at {fp16}")
-    if not os.path.isfile(fp16):
-        if model_type != "llama":
-            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
-        else:
-            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
-    else:
-        print(f"Unquantised GGML already exists at: {fp16}")
-
-    print("Making quants")
-    for type in quants:
-        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
-        print(f"Making {type} : {outfile}")
-        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
-
-    if not keep_fp16:
-        os.remove(fp16)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
-    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
-    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
-    parser.add_argument('--outname', default=None, help='Output model(s) name')
-    parser.add_argument('--outdir', default=None, help='Output directory')
-    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
-    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
-
-    args = parser.parse_args()
-
-    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt
index 319535a6e..847e916de 100644
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET parallel)
+set(TARGET llama-parallel)
 add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7d11fcd59..7ef43d5e1 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -1,7 +1,10 @@
 // A basic application simulating a server with multiple clients.
 // The clients submit requests to the server and they are processed in parallel.
 
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -50,8 +53,8 @@ static std::vector<std::string> k_prompts = {
 
 struct client {
     ~client() {
-        if (ctx_sampling) {
-            llama_sampling_free(ctx_sampling);
+        if (smpl) {
+            common_sampler_free(smpl);
         }
     }
 
@@ -72,7 +75,7 @@ struct client {
     std::string prompt;
     std::string response;
 
-    struct llama_sampling_context * ctx_sampling = nullptr;
+    struct common_sampler * smpl = nullptr;
 };
 
 static void print_date_time() {
@@ -81,7 +84,9 @@ static void print_date_time() {
     char buffer[80];
     strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
 
-    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+    LOG_INF("\n");
+    LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
+    LOG_INF("\n");
 }
 
 // Define a split string function to ...
@@ -98,15 +103,20 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 int main(int argc, char ** argv) {
     srand(1234);
 
-    gpt_params params;
+    common_params params;
 
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
 
+    common_init();
+
     // number of simultaneous "clients" to simulate
     const int32_t n_clients = params.n_parallel;
 
+    // dedicate one sequence to the system prompt
+    params.n_parallel += 1;
+
     // requests to simulate
     const int32_t n_seq = params.n_sequences;
 
@@ -115,42 +125,36 @@ int main(int argc, char ** argv) {
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("parallel", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-    params.logits_all = true;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
-        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+        LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
     } else {
         // Output each line of the input params.prompts vector and copy to k_prompts
         int index = 0;
-        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+        LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
 
         std::vector<std::string> prompts = split_string(params.prompt, '\n');
         for (const auto& prompt : prompts) {
             k_prompts.resize(index + 1);
             k_prompts[index] = prompt;
             index++;
-            printf("%3d prompt: %s\n", index, prompt.c_str());
+            LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
         }
     }
 
-    fprintf(stderr, "\n\n");
-    fflush(stderr);
+    LOG_INF("\n\n");
 
     const int n_ctx = llama_n_ctx(ctx);
 
@@ -158,11 +162,11 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.ctx_sampling = llama_sampling_init(params.sparams);
+        client.smpl = common_sampler_init(model, params.sampling);
     }
 
     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = common_tokenize(ctx, k_system, true);
     const int32_t n_tokens_system = tokens_system.size();
 
     llama_seq_id g_seq_id = 0;
@@ -179,39 +183,39 @@ int main(int argc, char ** argv) {
 
     const auto t_main_start = ggml_time_us();
 
-    LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
-    LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
-    LOG_TEE("\n");
+    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
+    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_INF("\n");
 
     {
-        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
+        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
         for (int32_t i = 0; i < n_tokens_system; ++i) {
-            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
+            common_batch_add(batch, tokens_system[i], i, { 0 }, false);
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
         // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < n_clients; ++i) {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
+        for (int32_t i = 1; i <= n_clients; ++i) {
+            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
         }
 
-        LOG_TEE("\n");
+        LOG_INF("\n");
     }
 
-    LOG_TEE("Processing requests ...\n\n");
+    LOG_INF("Processing requests ...\n\n");
 
     while (true) {
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            dump_kv_cache_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // decode any currently ongoing sequences
         for (auto & client : clients) {
@@ -221,18 +225,20 @@ int main(int argc, char ** argv) {
 
             client.i_batch = batch.n_tokens;
 
-            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
+            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
 
             client.n_decoded += 1;
         }
 
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
-            for (int i = 0; i < n_clients; ++i) {
-                llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
+            for (int i = 1; i <= n_clients; ++i) {
+                llama_kv_cache_seq_rm(ctx, i, -1, -1);
+                // but keep the system prompt
+                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
             }
 
-            LOG_TEE("%s: clearing the KV cache\n", __func__);
+            LOG_INF("%s: clearing the KV cache\n", __func__);
         }
 
         // insert new sequences for decoding
@@ -248,14 +254,14 @@ int main(int argc, char ** argv) {
                     client.prompt   = client.input + "\nAssistant:";
                     client.response = "";
 
-                    llama_sampling_reset(client.ctx_sampling);
+                    common_sampler_reset(client.smpl);
 
                     // do not prepend BOS because we have a system prompt!
                     std::vector<llama_token> tokens_prompt;
-                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+                    tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
                     for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
+                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                     }
 
                     // extract the logits only for the last token
@@ -267,7 +273,7 @@ int main(int argc, char ** argv) {
                     client.n_decoded = 0;
                     client.i_batch   = batch.n_tokens - 1;
 
-                    LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
 
                     g_seq_id += 1;
 
@@ -304,18 +310,17 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0) {
                 if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                     return 1;
                 }
 
-                LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
@@ -326,7 +331,7 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-            LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
 
             for (auto & client : clients) {
                 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@@ -336,9 +341,9 @@ int main(int argc, char ** argv) {
                 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                 //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
+                const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
 
-                llama_sampling_accept(client.ctx_sampling, ctx, id, true);
+                common_sampler_accept(client.smpl, id, true);
 
                 if (client.n_decoded == 1) {
                     // start measuring generation time after the first token to make sure all concurrent clients
@@ -346,7 +351,7 @@ int main(int argc, char ** argv) {
                     client.t_start_gen = ggml_time_us();
                 }
 
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
 
                 client.response += token_str;
                 client.sampled = id;
@@ -355,7 +360,7 @@ int main(int argc, char ** argv) {
                 //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
                 if (client.n_decoded > 2 &&
-                        (id == llama_token_eos(model) ||
+                        (llama_vocab_is_eog(vocab, id) ||
                          (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                          client.response.find("User:") != std::string::npos ||
                          client.response.find('\n') != std::string::npos)) {
@@ -366,11 +371,12 @@ int main(int argc, char ** argv) {
                     }
 
                     // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
+                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
+                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
 
                     const auto t_main_end = ggml_time_us();
 
-                    LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
+                    LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
                             client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                             (t_main_end - client.t_start_prompt) / 1e6,
                             (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@@ -393,30 +399,28 @@ int main(int argc, char ** argv) {
 
     print_date_time();
 
-    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
     if (params.prompt_file.empty()) {
         params.prompt_file = "used built-in defaults";
     }
-    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
 
-    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
-    LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
-    LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
-    LOG_TEE("Cache misses:        %6d\n", n_cache_miss);
+    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
+    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
+    LOG_INF("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+    LOG_INF("Cache misses:        %6d\n", n_cache_miss);
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
 
-    llama_print_timings(ctx);
+    // TODO: print sampling/grammar timings for all clients
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt
index 3161bf3ef..9bc5110c2 100644
--- a/examples/passkey/CMakeLists.txt
+++ b/examples/passkey/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET passkey)
+set(TARGET llama-passkey)
 add_executable(${TARGET} passkey.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/passkey/README.md b/examples/passkey/README.md
index 4a22bb559..2b8e910f9 100644
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -1,5 +1,8 @@
 # llama.cpp/example/passkey
 
+A passkey retrieval task is an evaluation method used to measure a language
+models ability to recall information from long contexts.
+
 See the following PRs for more info:
 
 - https://github.com/ggerganov/llama.cpp/pull/3856
@@ -8,5 +11,5 @@ See the following PRs for more info:
 ### Usage
 
 ```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
 ```
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 2cbc9e1fa..5953928d4 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -6,46 +8,29 @@
 #include <string>
 #include <vector>
 
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG("\n");
+}
+
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
-        return 1 ;
+    params.n_junk = 250;
+    params.n_keep = 32;
+    params.i_pos  = -1;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
+        return 1;
     }
 
-    int seed = -1;
+    common_init();
 
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp  = params.grp_attn_n;
+    int i_pos  = params.i_pos;
 
     if (i_pos == -1) {
         i_pos = rand() % n_junk;
@@ -76,42 +61,43 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
+    llama_model_params model_params = common_model_params_to_llama(params);
 
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
-    ctx_params.seed    = seed;
-    ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
 
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
     if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = common_tokenize(ctx, params.prompt, true);
 
     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size();
 
     const int n_tokens_all = tokens_list.size();
 
@@ -126,16 +112,16 @@ int main(int argc, char ** argv) {
     const int n_batch     = ctx_params.n_batch;
     const int n_batch_grp = ctx_params.n_batch/n_grp;
 
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+    LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
 
     // print the prompt token-by-token
 
-    LOG_TEE("\n");
-    LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
-    LOG_TEE("prompt tokens: %d\n", n_tokens_all);
-    //LOG_TEE("prompt: %s\n", params.prompt.c_str());
+    LOG_INF("\n");
+    LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
+    LOG_INF("prompt tokens: %d\n", n_tokens_all);
+    //LOG_INF("prompt: %s\n", params.prompt.c_str());
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
     int n_past = 0;
 
@@ -153,10 +139,10 @@ int main(int argc, char ** argv) {
             n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
         }
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+            common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
         }
 
         if (i + n_batch >= n_tokens_all) {
@@ -164,11 +150,11 @@ int main(int argc, char ** argv) {
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_INF("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
-        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
 
         if (i + n_batch >= n_tokens_all) {
             break;
@@ -178,7 +164,7 @@ int main(int argc, char ** argv) {
     for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
         const int n_discard = n_batch;
 
-        LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
+        LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
 
         llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
         llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -187,10 +173,10 @@ int main(int argc, char ** argv) {
 
         n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+            common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
         }
 
         if (i + n_batch >= n_tokens_all) {
@@ -198,18 +184,18 @@ int main(int argc, char ** argv) {
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
 
-        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
     }
 
     {
         const int n_discard = n_past - n_ctx + n_predict;
 
         if (n_discard > 0) {
-            LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+            LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
 
             llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
             llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -220,81 +206,69 @@ int main(int argc, char ** argv) {
         }
     }
 
-    LOG_TEE("\n");
-    LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
-    LOG_TEE("\n");
+    LOG_INF("\n");
+    LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+    LOG_INF("\n");
 
     // main loop
 
     int n_cur    = n_tokens_all;
     int n_decode = 0;
 
-    LOG_TEE("%s", prompt_suffix.c_str());
-    fflush(stdout);
+    LOG_INF("%s", prompt_suffix.c_str());
 
     const auto t_main_start = ggml_time_us();
 
     while (n_cur <= n_len) {
         // sample the next token
         {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
-                LOG_TEE("\n");
+            // is it an end of generation?
+            if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
+                LOG("\n");
 
                 break;
             }
 
-            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-            fflush(stdout);
+            LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
 
             n_decode += 1;
 
             // prepare the next batch
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
             // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
+            common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
         }
 
         n_cur += 1;
 
         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
-    fprintf(stderr, "\n");
+    LOG("\n");
+
+    llama_sampler_free(smpl);
 
     llama_batch_free(batch);
 
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     llama_backend_free();
 
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index 3c76d3221..3e6864093 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET perplexity)
+set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md
index 50e1af011..33a46d1a2 100644
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -1,21 +1,193 @@
-# perplexity
+# Perplexity
 
-TODO
+The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus.
+Perplexity measures how well the model can predict the next token with lower values being better.
+Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers.
+Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases.
 
-## Llama 2 70B Scorechart
-Quantization | Model size (GiB) | Perplexity | Delta to fp16
--- | -- | -- | --
-Q4_0 | 36.20 | 3.5550 | 3.61%
-Q4_1 | 40.20 | 3.5125 | 2.37%
-Q5_0 | 44.20 | 3.4744 | 1.26%
-Q2_K | 27.27 | 3.7339 | 8.82%
-Q3_K_S | 27.86 | 3.7019 | 7.89%
-Q3_K_M | 30.83 | 3.5932 | 4.72%
-Q3_K_L | 33.67 | 3.5617 | 3.80%
-Q4_K_S | 36.39 | 3.4852 | 1.57%
-Q4_K_M | 38.54 | 3.4725 | 1.20%
-Q5_K_S | 44.20 | 3.4483 | 0.50%
-Q5_K_M | 45.41 | 3.4451 | 0.40%
-Q6_K | 52.70 | 3.4367 | 0.16%
-fp16 | 128.5 | 3.4313 | -
+Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
+The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
+When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
+llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
 
+By default only the mean perplexity value and the corresponding uncertainty is calculated.
+The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
+
+More statistics can be obtained by recording the logits from the FP16 version of a model.
+To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`.
+The program will then record all logits and save them to the provided path in binary format.
+**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.**
+Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`,
+and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence.
+This is a measure of how similar the FP16 and the quantized logit distributions are with a value of 0 indicating that the distribution are the same.
+The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution.
+
+In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
+
+* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed, it is 0 if the logit distributions are the same.
+* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
+* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
+* Pearson correlation coefficient of the "correct" token probabilites between models.
+* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
+* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
+* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
+
+## LLaMA 3 8b Scoreboard
+
+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
+Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
+The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
+Note: the FP16 logits used for the calculation of all metrics other than perplexity are stored in a binary file between runs.
+In order to save space this file does **not** contain the exact same FP32 logits but instead casts them to 16 bit unsigned integers (with some scaling).
+So the "f16" results are to be understood as the difference resulting only from this downcast.
+
+| Quantization | imatrix | Model size [GiB] | PPL                    | ΔPPL                   | KLD                   | Mean Δp           | RMS Δp           |
+|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
+| f16          | None    |            14.97 | 6.233160 ±   0.037828  | 0.001524 ±   0.000755  | 0.000551 ±   0.000002 |  0.001 ± 0.002 %  | 0.787 ± 0.004 %  |
+| q8_0         | None    |             7.96 | 6.234284 ±   0.037878  | 0.002650 ±   0.001006  | 0.001355 ±   0.000006 | -0.019 ± 0.003 %  | 1.198 ± 0.007 %  |
+| q6_K         | None    |             6.14 | 6.253382 ±   0.038078  | 0.021748 ±   0.001852  | 0.005452 ±   0.000035 | -0.007 ± 0.006 %  | 2.295 ± 0.019 %  |
+| q5_K_M       | None    |             5.33 | 6.288607 ±   0.038338  | 0.056974 ±   0.002598  | 0.010762 ±   0.000079 | -0.114 ± 0.008 %  | 3.160 ± 0.031 %  |
+| q5_K_S       | None    |             5.21 | 6.336598 ±   0.038755  | 0.104964 ±   0.003331  | 0.016595 ±   0.000122 | -0.223 ± 0.010 %  | 3.918 ± 0.036 %  |
+| q5_1         | None    |             5.65 | 6.337857 ±   0.038677  | 0.106223 ±   0.003476  | 0.018045 ±   0.000139 | -0.287 ± 0.011 %  | 4.123 ± 0.039 %  |
+| q5_0         | None    |             5.21 | 6.363224 ±   0.038861  | 0.131591 ±   0.003894  | 0.022239 ±   0.000166 | -0.416 ± 0.012 %  | 4.634 ± 0.043 %  |
+| q4_K_M       | WT 10m  |             4.58 | 6.382937 ±   0.039055  | 0.151303 ±   0.004429  | 0.028152 ±   0.000240 | -0.389 ± 0.014 %  | 5.251 ± 0.049 %  |
+| q4_K_M       | None    |             4.58 | 6.407115 ±   0.039119  | 0.175482 ±   0.004620  | 0.031273 ±   0.000238 | -0.596 ± 0.014 %  | 5.519 ± 0.050 %  |
+| q4_K_S       | WT 10m  |             4.37 | 6.409697 ±   0.039189  | 0.178064 ±   0.004744  | 0.031951 ±   0.000259 | -0.531 ± 0.015 %  | 5.645 ± 0.051 %  |
+| iq4_NL       | WT 10m  |             4.35 | 6.455593 ±   0.039630  | 0.223959 ±   0.005201  | 0.035742 ±   0.000288 | -0.590 ± 0.016 %  | 5.998 ± 0.054 %  |
+| iq4_XS       | WT 10m  |             4.14 | 6.459705 ±   0.039595  | 0.228071 ±   0.005207  | 0.036334 ±   0.000284 | -0.668 ± 0.016 %  | 6.044 ± 0.054 %  |
+| q4_K_S       | None    |             4.37 | 6.500529 ±   0.039778  | 0.268895 ±   0.005638  | 0.043136 ±   0.000314 | -0.927 ± 0.017 %  | 6.562 ± 0.055 %  |
+| q4_1         | None    |             4.78 | 6.682737 ±   0.041285  | 0.451103 ±   0.008030  | 0.071683 ±   0.000505 | -0.927 ± 0.017 %  | 8.512 ± 0.063 %  |
+| q4_0         | None    |             4.34 | 6.700147 ±   0.041226  | 0.468514 ±   0.007951  | 0.071940 ±   0.000491 | -1.588 ± 0.022 %  | 8.434 ± 0.061 %  |
+| q3_K_L       | WT 10m  |             4.03 | 6.671223 ±   0.041427  | 0.439590 ±   0.008154  | 0.073077 ±   0.000529 | -0.940 ± 0.023 %  | 8.662 ± 0.064 %  |
+| q3_K_M       | WT 10m  |             3.74 | 6.734255 ±   0.041838  | 0.502622 ±   0.008901  | 0.084358 ±   0.000588 | -1.198 ± 0.024 %  | 9.292 ± 0.065 %  |
+| q3_K_L       | None    |             4.03 | 6.787876 ±   0.042104  | 0.556242 ±   0.009171  | 0.087176 ±   0.000614 | -1.532 ± 0.025 %  | 9.432 ± 0.067 %  |
+| q3_K_M       | None    |             3.74 | 6.888498 ±   0.042669  | 0.656864 ±   0.010071  | 0.101913 ±   0.000677 | -1.990 ± 0.026 %  | 10.203 ± 0.068 % |
+| iq3_M        | WT 10m  |             3.53 | 6.898327 ±   0.041643  | 0.666694 ±   0.009449  | 0.102534 ±   0.000663 | -3.178 ± 0.026 %  | 10.513 ± 0.066 % |
+| iq3_S        | WT 10m  |             3.42 | 6.965501 ±   0.042406  | 0.733867 ±   0.010245  | 0.111278 ±   0.000710 | -3.066 ± 0.027 %  | 10.845 ± 0.068 % |
+| iq3_XS       | WT 10m  |             3.28 | 7.163043 ±   0.043772  | 0.931409 ±   0.012084  | 0.138693 ±   0.000857 | -3.667 ± 0.031 %  | 12.148 ± 0.070 % |
+| iq3_XXS      | WT 10m  |             3.05 | 7.458436 ±   0.046404  | 1.226803 ±   0.015234  | 0.183625 ±   0.001042 | -3.918 ± 0.035 %  | 13.836 ± 0.074 % |
+| q3_K_S       | WT 10m  |             3.41 | 7.602878 ±   0.046848  | 1.371244 ±   0.015688  | 0.199821 ±   0.001008 | -5.046 ± 0.037 %  | 14.980 ± 0.070 % |
+| q3_K_S       | None    |             3.41 | 7.863786 ±   0.048885  | 1.632152 ±   0.017733  | 0.228217 ±   0.001079 | -5.604 ± 0.038 %  | 15.541 ± 0.070 % |
+| iq2_M        | WT 10m  |             2.74 | 8.600799 ±   0.055124  | 2.369166 ±   0.025244  | 0.325989 ±   0.00160  | -6.463 ± 0.046 %  | 18.519 ± 0.080 % |
+| q2_K         | WT 10k  |             2.96 | 8.652290 ±   0.055572  | 2.420657 ±   0.025587  | 0.331393 ±   0.001562 | -6.606 ± 0.046 %  | 18.790 ± 0.078 % |
+| q2_K         | WT 100k |             2.96 | 8.641993 ±   0.055406  | 2.410359 ±   0.025495  | 0.331672 ±   0.001569 | -6.628 ± 0.047 %  | 18.856 ± 0.078 % |
+| q2_K         | WT 10m  |             2.96 | 8.647825 ±   0.055610  | 2.416191 ±   0.025683  | 0.332223 ±   0.001572 | -6.500 ± 0.047 %  | 18.881 ± 0.078 % |
+| q2_K         | WT 1m   |             2.96 | 8.674365 ±   0.055743  | 2.442732 ±   0.025843  | 0.335308 ±   0.001576 | -6.634 ± 0.047 %  | 19.009 ± 0.079 % |
+| q2_K         | WT 1k   |             2.96 | 8.682605 ±   0.055916  | 2.450972 ±   0.026069  | 0.337093 ±   0.001596 | -6.596 ± 0.047 %  | 18.977 ± 0.079 % |
+| q2_K_S       | WT 10m  |             2.96 | 9.323778 ±   0.061551  | 3.092145 ±   0.031914  | 0.403360 ±   0.001787 | -7.131 ± 0.049 %  | 20.050 ± 0.081 % |
+| q2_K_S       | WT 1m   |             2.96 | 9.329321 ±   0.061378  | 3.097688 ±   0.031816  | 0.403590 ±   0.001797 | -7.289 ± 0.049 %  | 20.123 ± 0.081 % |
+| q2_K_S       | WT 100k |             2.96 | 9.362973 ±   0.061740  | 3.131339 ±   0.032169  | 0.408367 ±   0.001802 | -7.198 ± 0.050 %  | 20.132 ± 0.081 % |
+| q2_K_S       | WT 10k  |             2.96 | 9.376479 ±   0.062045  | 3.144846 ±   0.032464  | 0.408662 ±   0.001819 | -7.141 ± 0.050 %  | 20.120 ± 0.081 % |
+| q2_K_S       | WT 1k   |             2.96 | 9.415200 ±   0.062475  | 3.183567 ±   0.032993  | 0.415865 ±   0.001846 | -7.153 ± 0.050 %  | 20.311 ± 0.082 % |
+| iq2_S        | WT 10m  |             2.56 | 9.650781 ±   0.063209  | 3.419148 ±   0.034017  | 0.439197 ±   0.001976 | -8.319 ± 0.052 %  | 21.491 ± 0.083 % |
+| q2_K         | None    |             2.96 | 9.751568 ±   0.063312  | 3.519934 ±   0.033863  | 0.445132 ±   0.001835 | -9.123 ± 0.051 %  | 21.421 ± 0.079 % |
+| iq2_XS       | WT 10m  |             2.43 | 10.761424 ±   0.071056 | 4.529791 ±   0.042229  | 0.546290 ±   0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % |
+| iq2_XXS      | WT 10m  |             2.24 | 14.091782 ±   0.098396 | 7.860148 ±   0.070752  | 0.812022 ±   0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % |
+| iq1_M        | WT 10m  |             2.01 | 25.493722 ±   0.177903 | 19.262089 ±   0.152396 | 1.393084 ±   0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % |
+| iq1_S        | WT 1m   |             1.88 | 58.097760 ±   0.438604 | 51.866126 ±   0.416604 | 2.211278 ±   0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % |
+| iq1_S        | WT 1k   |             1.88 | 58.267851 ±   0.446208 | 52.036218 ±   0.424373 | 2.214858 ±   0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % |
+| iq1_S        | WT 100k |             1.88 | 58.581498 ±   0.453145 | 52.349864 ±   0.431360 | 2.220834 ±   0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % |
+| iq1_S        | WT 10m  |             1.88 | 60.694593 ±   0.471290 | 54.462959 ±   0.449644 | 2.254554 ±   0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % |
+| iq1_S        | WT 10k  |             1.88 | 63.221324 ±   0.493077 | 56.989691 ±   0.471423 | 2.293527 ±   0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % |
+
+There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix.
+K-quants score better on mean Δp than the legacy quants than e.g. KL divergence would suggest.
+
+## LLaMA 2 vs. LLaMA 3 Quantization comparison
+
+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
+| Metric          |          L2 7b q2_K |          L3 8b q2_K |        L2 7b q4_K_M |        L3 8b q4_K_M |          L2 7b q6_K |          L3 8b q6_K |          L2 7b q8_0 |          L3 8b q8_0 |
+|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
+| Mean PPL        | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
+| Mean PPL ratio  | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 |
+| Mean ΔPPL       | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 |
+| PPL correlation |              97.36% |              89.62% |              99.71% |              99.34% |              99.94% |              99.88% |              99.98% |              99.96% |
+| Mean KLD        | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 |
+| Mean Δp         |    -2.710 ± 0.023 % |    -9.123 ± 0.051 % |    -0.416 ± 0.008 % |    -0.596 ± 0.014 % |    -0.035 ± 0.003 % |    -0.007 ± 0.006 % |    -0.005 ± 0.002 % |    -0.019 ± 0.003 % |
+| Maximum Δp      |             85.136% |             94.268% |             45.209% |             95.054% |             23.593% |             53.601% |             43.925% |             28.734% |
+| 99.9% Δp        |             37.184% |             50.003% |             17.461% |             27.084% |              7.798% |             13.613% |              3.387% |              6.402% |
+| 99.0% Δp        |             18.131% |             25.875% |              7.798% |             12.084% |              3.838% |              6.407% |              1.867% |              3.544% |
+| Median Δp       |             -0.391% |             -2.476% |             -0.026% |             -0.024% |             -0.001% |              0.000% |             -0.000% |             -0.000% |
+| 1.0% Δp         |            -39.762% |            -87.173% |            -11.433% |            -19.567% |             -4.222% |             -6.767% |             -1.862% |             -3.698% |
+| 0.1% Δp         |            -79.002% |            -98.897% |            -26.433% |            -56.054% |             -9.091% |            -16.584% |             -3.252% |             -6.579% |
+| Minimum Δp      |            -99.915% |            -99.965% |            -83.383% |            -98.699% |            -43.142% |            -68.487% |             -9.343% |            -24.301% |
+| RMS Δp          |     9.762 ± 0.053 % |    21.421 ± 0.079 % |     3.252 ± 0.024 % |     5.519 ± 0.050 % |     1.339 ± 0.010 % |     2.295 ± 0.019 % |     0.618 ± 0.011 % |     1.198 ± 0.007 % |
+| Same top p      |    85.584 ± 0.086 % |    71.138 ± 0.119 % |    94.665 ± 0.055 % |    91.901 ± 0.072 % |    97.520 ± 0.038 % |    96.031 ± 0.051 % |    98.846 ± 0.026 % |    97.674 ± 0.040 % |
+
+## LLaMA 3 BF16 vs. FP16 comparison
+
+| Revision | 83330d8c      |
+|:---------|:--------------|
+| Backend  | CPU           |
+| CPU      | AMD Epyc 7742 |
+| GPU      | N/A           |
+
+Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
+
+| Metric                         |                    Value |
+|--------------------------------|--------------------------|
+| Mean PPL(Q)                    |      6.227711 ± 0.037833 |
+| Mean PPL(base)                 |      6.225194 ± 0.037771 |
+| Cor(ln(PPL(Q)), ln(PPL(base))) |                  99.990% |
+| Mean ln(PPL(Q)/PPL(base))      |      0.000404 ± 0.000086 |
+| Mean PPL(Q)/PPL(base)          |      1.000404 ± 0.000086 |
+| Mean PPL(Q)-PPL(base)          |      0.002517 ± 0.000536 |
+| Mean    KLD                    |  0.00002515 ± 0.00000020 |
+| Maximum KLD                    |                 0.012206 |
+| 99.9%   KLD                    |                 0.000799 |
+| 99.0%   KLD                    |                 0.000222 |
+| 99.0%   KLD                    |                 0.000222 |
+| Median  KLD                    |                 0.000013 |
+| 10.0%   KLD                    |                -0.000002 |
+| 5.0%   KLD                     |                -0.000008 |
+| 1.0%   KLD                     |                -0.000023 |
+| Minimum KLD                    |                -0.000059 |
+| Mean    Δp                     | -0.0000745 ± 0.0003952 % |
+| Maximum Δp                     |                   4.186% |
+| 99.9%   Δp                     |                   1.049% |
+| 99.0%   Δp                     |                   0.439% |
+| 95.0%   Δp                     |                   0.207% |
+| 90.0%   Δp                     |                   0.125% |
+| 75.0%   Δp                     |                   0.029% |
+| Median  Δp                     |                   0.000% |
+| 25.0%   Δp                     |                  -0.030% |
+| 10.0%   Δp                     |                  -0.126% |
+| 5.0%   Δp                      |                  -0.207% |
+| 1.0%   Δp                      |                  -0.434% |
+| 0.1%   Δp                      |                  -1.016% |
+| Minimum Δp                     |                  -4.672% |
+| RMS Δp                         |          0.150 ± 0.001 % |
+| Same top p                     |         99.739 ± 0.013 % |
+
+## Old Numbers
+
+<details>
+<summary>Llama 2 70B Scoreboard</summary>
+
+| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
+|--------------|------------------|------------|---------------|
+| Q4_0         | 36.20            | 3.5550     | 3.61%         |
+| Q4_1         | 40.20            | 3.5125     | 2.37%         |
+| Q5_0         | 44.20            | 3.4744     | 1.26%         |
+| Q2_K         | 27.27            | 3.7339     | 8.82%         |
+| Q3_K_S       | 27.86            | 3.7019     | 7.89%         |
+| Q3_K_M       | 30.83            | 3.5932     | 4.72%         |
+| Q3_K_L       | 33.67            | 3.5617     | 3.80%         |
+| Q4_K_S       | 36.39            | 3.4852     | 1.57%         |
+| Q4_K_M       | 38.54            | 3.4725     | 1.20%         |
+| Q5_K_S       | 44.20            | 3.4483     | 0.50%         |
+| Q5_K_M       | 45.41            | 3.4451     | 0.40%         |
+| Q6_K         | 52.70            | 3.4367     | 0.16%         |
+| fp16         | 128.5            | 3.4313     | -             |
+
+</details>
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9ec989389..9bf6c5743 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,18 +1,21 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
+#include <algorithm>
+#include <array>
+#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
 #include <sstream>
 #include <thread>
-#include <mutex>
-#include <atomic>
 #include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -31,55 +34,6 @@ struct results_log_softmax {
     float  prob;
 };
 
-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const struct results_perplexity & results
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    if (params.hellaswag) {
-        fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
-        return;
-    }
-
-    const std::string timestamp = get_sortable_timestamp();
-
-    const bool success = create_directory_with_parents(params.logdir);
-    if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: main\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Perplexity Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    dump_vector_float_yaml(logfile, "logits", results.logits);
-    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-    dump_vector_float_yaml(logfile, "probs", results.probs);
-
-    llama_dump_timing_info_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 static std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
@@ -166,7 +120,7 @@ static void process_logits(
                 break;
             }
             lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
             const double v = -results.log_softmax;
             local_nll += v;
             local_nll2 += v*v;
@@ -200,7 +154,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
                 break;
             }
             lock.unlock();
-            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
             local_nll += v;
             local_nll2 += v*v;
         }
@@ -216,17 +170,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
 }
 
 struct kl_divergence_result {
-    double sum_nll  = 0;
-    double sum_nll2 = 0;
-    double sum_kld  = 0;
-    double sum_kld2 = 0;
-    double sum_nll_diff  = 0;
-    double sum_nll_diff2 = 0;
-    size_t n_same_top = 0;
-    size_t count = 0;
+    double sum_nll          = 0.0;
+    double sum_nll2         = 0.0;
+    double sum_nll_base     = 0.0;
+    double sum_nll_base2    = 0.0;
+    double sum_nll_nll_base = 0.0;
+    double sum_kld          = 0.0;
+    double sum_kld2         = 0.0;
+    double sum_p_diff       = 0.0;
+    double sum_p_diff2      = 0.0;
+    double sum_p_diff4      = 0.0;
+    float  max_p_diff       = 0.0f;
+    size_t n_same_top       = 0.0;
+    size_t count            = 0.0;
 };
 
-static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
     float max_logit = logits[0];
     int imax = 0;
     for (int i = 1; i < n_vocab; ++i) {
@@ -244,12 +203,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
     const float scale = d[0];
     const float min_log_prob = d[1];
     base_log_prob += 4;
-    float nll = max_logit + log_sum_exp - logits[tok];
+
+    const float nll = max_logit + log_sum_exp - logits[tok];
     kld.sum_nll  += nll;
     kld.sum_nll2 += nll*nll;
-    nll += (scale*base_log_prob[tok] + min_log_prob);
-    kld.sum_nll_diff  += nll;
-    kld.sum_nll_diff2 += nll*nll;
+
+    const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
+    kld.sum_nll_base  += nll_base;
+    kld.sum_nll_base2 += nll_base*nll_base;
+
+    kld.sum_nll_nll_base += nll*nll_base;
+
     max_logit += log_sum_exp;
     double sum = 0;
     int imax_base = -1;
@@ -268,35 +232,53 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
     kld.sum_kld  += sum;
     kld.sum_kld2 += sum*sum;
     ++kld.count;
-    if (imax == imax_base) ++kld.n_same_top;
-    return sum;
+    if (imax == imax_base) {
+        ++kld.n_same_top;
+    }
+
+    const float p_base = expf(-nll_base);
+    const float p = expf(-nll);
+    const float p_diff = p - p_base;
+    kld.sum_p_diff  += p_diff;
+    const double p_diff2 = p_diff*p_diff;
+    kld.sum_p_diff2 += p_diff2;
+    kld.sum_p_diff4 += p_diff2*p_diff2;
+    kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
+
+    return std::make_pair(sum, p_diff);
 }
 
 static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
         std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
-        float * kld_values) {
+        float * kld_values, float * p_diff_values) {
     std::mutex mutex;
     const int nv = 2*((n_vocab + 1)/2) + 4;
     int counter = 0;
-    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
         kl_divergence_result local_kld;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
             int i = counter++;
             if (i >= n_token) {
-                kld.sum_nll  += local_kld.sum_nll;
-                kld.sum_nll2 += local_kld.sum_nll2;
-                kld.sum_kld  += local_kld.sum_kld;
-                kld.sum_kld2 += local_kld.sum_kld2;
-                kld.sum_nll_diff  += local_kld.sum_nll_diff;
-                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
-                kld.n_same_top += local_kld.n_same_top;
-                kld.count += local_kld.count;
+                kld.sum_nll          += local_kld.sum_nll;
+                kld.sum_nll2         += local_kld.sum_nll2;
+                kld.sum_nll_base     += local_kld.sum_nll_base;
+                kld.sum_nll_base2    += local_kld.sum_nll_base2;
+                kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
+                kld.sum_kld          += local_kld.sum_kld;
+                kld.sum_kld2         += local_kld.sum_kld2;
+                kld.sum_p_diff       += local_kld.sum_p_diff;
+                kld.sum_p_diff2      += local_kld.sum_p_diff2;
+                kld.sum_p_diff4      += local_kld.sum_p_diff4;
+                kld.n_same_top       += local_kld.n_same_top;
+                kld.max_p_diff        = std::max(kld.max_p_diff, local_kld.max_p_diff);
+                kld.count            += local_kld.count;
                 break;
             }
             lock.unlock();
-            double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
-            kld_values[i] = (float)v;
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            kld_values[i]    = (float)v.first;
+            p_diff_values[i] = v.second;
         }
     };
     for (auto & w : workers) {
@@ -308,24 +290,28 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
     }
 }
 
-static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
     // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
+
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                 n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
         return {std::move(tokens), 0., {}, {}};
     }
 
@@ -336,16 +322,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     prob_history.resize(tokens.size());
 
     if (params.ppl_stride <= 0) {
-        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
         return {tokens, -1, logit_history, prob_history};
     }
 
     const int calc_chunk = n_ctx;
 
-    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+    LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
 
     if (int(tokens.size()) <= calc_chunk) {
-        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+        LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
                 tokens.size(), n_ctx, params.ppl_stride);
         return {tokens, -1, logit_history, prob_history};
     }
@@ -353,20 +339,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1)  / params.ppl_stride;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
     int count = 0;
     double nll = 0.0;
 
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     for (int i = 0; i < n_chunk; ++i) {
         const int start =     i * params.ppl_stride;
         const int end   = start + calc_chunk;
 
         const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+        //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
 
         std::vector<float> logits;
 
@@ -375,13 +362,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
 
-            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                //fprintf(stderr, "%s : failed to eval\n", __func__);
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            if (llama_decode(ctx, batch)) {
+                //LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return {tokens, -1, logit_history, prob_history};
             }
 
@@ -390,37 +385,38 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
             // add BOS token for the first batch of each chunk
             if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[batch_start] = llama_vocab_bos(vocab);
             }
 
-            const auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            const auto * batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
 
             if (j == 0) {
                 tokens[batch_start] = token_org;
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
 
-        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
         for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
-
             // Calculate probability of next token, given the previous ones.
             const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+                logits.begin() + size_t(j + 0) * n_vocab,
+                logits.begin() + size_t(j + 1) * n_vocab);
 
             const float prob = softmax(tok_logits)[tokens[start + j + 1]];
             logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
@@ -431,54 +427,56 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         }
         // perplexity is e^(average negative log-likelihood)
         if (params.ppl_output_type == 0) {
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
         } else {
-            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+            LOG("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
         }
-        fflush(stdout);
     }
-    printf("\n");
+    LOG("\n");
 
     return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
 
-static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
     if (params.ppl_stride > 0) {
         return perplexity_v2(ctx, params);
     }
 
     // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     std::ofstream logits_stream;
     if (!params.logits_file.empty()) {
         logits_stream.open(params.logits_file.c_str(), std::ios::binary);
         if (!logits_stream.is_open()) {
-            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
             return {};
         }
-        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
         logits_stream.write("_logits_", 8);
         logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
     }
 
     auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                 n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
         return {std::move(tokens), 0., {}, {}};
     }
 
@@ -491,21 +489,28 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;
 
     const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+    const int n_seq = std::max(1, n_batch / n_ctx);
+
+    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
+    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
+
+    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
 
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }
 
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -518,10 +523,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         log_probs.resize(n_ctx * nv);
     }
 
-    for (int i = 0; i < n_chunk; ++i) {
+    // We get the logits for all the tokens in the context window (params.n_ctx)
+    // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
+    // calculate the perplexity over the last half of the window (so the model always has
+    // some context to predict the token).
+    //
+    // We rely on the fact that attention in the forward pass only looks at previous
+    // tokens here, so the logits returned for each token are an accurate representation
+    // of what the model would have predicted at that point.
+    //
+    // Example, we have a context window of 512, we will compute perplexity for each of the
+    // last 256 tokens.  Then, we split the input up into context window size chunks to
+    // process the entire prompt.
+    const int first = n_ctx/2;
+
+    for (int i = 0; i < n_chunk; i += n_seq) {
         const int start =     i * n_ctx;
         const int end   = start + n_ctx;
 
+        const int n_seq_batch = std::min(n_seq, n_chunk - i);
+
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
@@ -531,78 +552,94 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
 
-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
+            int n_outputs = 0;
 
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+            batch.n_tokens = 0;
+            for (int seq = 0; seq < n_seq_batch; seq++) {
+                int seq_start = batch_start + seq*n_ctx;
+
+                // save original token and restore it after eval
+                const auto token_org = tokens[seq_start];
+
+                // add BOS token for the first batch of each chunk
+                if (add_bos && j == 0) {
+                    tokens[seq_start] = llama_vocab_bos(vocab);
+                }
+
+                for (int k = 0; k < batch_size; ++k) {
+                    const int idx = seq*n_ctx + k;
+                    batch.token   [idx]    = tokens[seq_start + k];
+                    batch.pos     [idx]    = j*n_batch + k;
+                    batch.n_seq_id[idx]    = 1;
+                    batch.seq_id  [idx][0] = seq;
+                    batch.logits  [idx]    = batch.pos[idx] >= first ? 1 : 0;
+
+                    n_outputs += batch.logits[idx] != 0;
+                }
+                batch.n_tokens += batch_size;
+
+                // restore the original token in case it was set to BOS
+                tokens[seq_start] = token_org;
             }
 
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+            if (llama_decode(ctx, batch)) {
+                LOG_INF("%s : failed to eval\n", __func__);
                 return {tokens, -1, logit_history, prob_history};
             }
 
-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
-            if (num_batches > 1) {
+            if (num_batches > 1 && n_outputs > 0) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
             }
         }
 
-        const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
+            llama_synchronize(ctx);
+            const auto t_end = std::chrono::high_resolution_clock::now();
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total*n_chunk/n_seq);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
 
-        // We get the logits for all the tokens in the context window (params.n_ctx)
-        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half of the window (so the model always has
-        // some context to predict the token).
-        //
-        // We rely on the fact that attention in the forward pass only looks at previous
-        // tokens here, so the logits returned for each token are an accurate representation
-        // of what the model would have predicted at that point.
-        //
-        // Example, we have a context window of 512, we will compute perplexity for each of the
-        // last 256 tokens.  Then, we split the input up into context window size chunks to
-        // process the entire prompt.
-        const int first = n_ctx/2;
-        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        if (!params.logits_file.empty()) {
-            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, log_probs, nll, nll2);
-        } else {
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        }
-        count += n_ctx - first - 1;
+        for (int seq = 0; seq < n_seq_batch; seq++) {
+            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
 
-        // perplexity is e^(average negative log-likelihood)
-        if (params.ppl_output_type == 0) {
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        } else {
-            double av = nll/count;
-            double av2 = nll2/count - av*av;
-            if (av2 > 0) av2 = sqrt(av2/(count-1));
-            printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+            llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
+            if (!params.logits_file.empty()) {
+                process_logits(logits_stream, n_vocab, all_logits,
+                        tokens_data, n_ctx - 1 - first,
+                        workers, log_probs, nll, nll2);
+            } else {
+                process_logits(n_vocab, all_logits,
+                        tokens_data, n_ctx - 1 - first,
+                        workers, nll, nll2,
+                        logit_history.data() + start + seq*n_ctx + first,
+                        prob_history.data()  + start + seq*n_ctx + first);
+            }
+            count += n_ctx - first - 1;
+
+            // perplexity is e^(average negative log-likelihood)
+            if (params.ppl_output_type == 0) {
+                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+            } else {
+                double av = nll/count;
+                double av2 = nll2/count - av*av;
+                if (av2 > 0) {
+                    av2 = sqrt(av2/(count-1));
+                }
+                LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+            }
         }
-        fflush(stdout);
 
         logits.clear();
     }
-    printf("\n");
+    LOG("\n");
 
     nll2 /= count;
     nll /= count;
@@ -610,17 +647,20 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     nll2 -= nll * nll;
     if (nll2 > 0) {
         nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
     } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
+        LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
     }
 
+    llama_batch_free(batch);
+
     return {tokens, ppl, logit_history, prob_history};
 }
 
-static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
-    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
+    int prev_outputs = 0;
+    for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+        const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
 
         llama_batch batch_view = {
             n_tokens,
@@ -630,16 +670,22 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
-            0, 0, 0, // unused
         };
 
         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {
-            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
 
-        memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
+        int n_outputs = 0;
+        for (int i = 0; i < n_tokens; ++i) {
+            n_outputs += batch_view.logits[i] != 0;
+        }
+
+        memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
+
+        prev_outputs += n_outputs;
     }
 
     return true;
@@ -652,7 +698,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     if (eval_results.size() != eval_pairs.size()) {
         eval_results.resize(eval_pairs.size());
     }
-    if (eval_pairs.empty()) return;
+    if (eval_pairs.empty()) {
+        return;
+    }
 
     size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
 
@@ -660,11 +708,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
         float local_logprobs[K_TOKEN_CHUNK];
         while (true) {
-            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
-            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+            const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+            if (first >= eval_results.size()) {
+                break;
+            }
+            const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
             for (size_t i = first; i < last; ++i) {
-                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+                const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
                 float max_logit = logits[0];
                 for (int j = 1; j < n_vocab; ++j) {
                     max_logit = std::max(max_logit, logits[j]);
@@ -687,7 +737,10 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     }
 }
 
-static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+static void hellaswag_score(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -714,18 +767,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     }
 
     if (prompt_lines.size() % 6 != 0) {
-        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
         return;
     }
 
     size_t hs_task_count = prompt_lines.size()/6;
-    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
-
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
+    LOG_INF("================================= is_spm = %d\n", is_spm);
 
     // The tasks should be randomized so the score stabilizes quickly.
     bool randomize_tasks = true;
@@ -746,13 +796,13 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         size_t ending_logprob_count[4];
         double ending_logprob[4];
 
-        size_t i_batch;         // starting index in the llama_batch
+        size_t i_logits;        // starting index of logits in the llama_batch
         size_t common_prefix;   // max number of initial tokens that are the same in all sentences
         size_t required_tokens; // needed number of tokens to evaluate all 4 endings
         std::vector<llama_token> seq_tokens[4];
     };
 
-    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+    LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
 
     // Select and read data from prompt lines
     std::vector<hs_data_t> hs_data(hs_task_count);
@@ -771,7 +821,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
+            hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
         }
 
         // determine the common prefix of the endings
@@ -790,7 +840,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
             hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
 
-        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
+        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
 
         // Delete the selected random example from the prompt
         if (randomize_tasks) {
@@ -798,23 +848,25 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
     }
 
-    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
 
-    printf("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\n");
 
     double acc = 0.0f;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
-    const int max_tasks_per_batch = 32;
-    const int max_seq = 4*max_tasks_per_batch;
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    const int max_tasks_per_batch = 32;
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
+
+    llama_batch batch = llama_batch_init(n_ctx, 0, 4);
 
     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    // TODO: this could be made smaller; it's currently the worst-case size
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -824,16 +876,17 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         int n_cur = 0;
 
         size_t i1 = i0;
-        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+        size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // batch as much tasks as possible into the available context
-        // each task has 4 unique seuqnce ids - one for each ending
+        // each task has 4 unique sequence ids - one for each ending
         // the common prefix is shared among the 4 sequences to save tokens
         // we extract logits only from the last common token and from all ending tokens of each sequence
         while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
             auto & hs_cur = hs_data[i1];
+            int n_logits = 0;
 
             const int s0 = 4*(i1 - i0);
             if (s0 + 4 > max_seq) {
@@ -841,18 +894,23 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             }
 
             for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
-                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
+                common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
             }
             batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+            n_logits += 1;
 
             for (int s = 0; s < 4; ++s) {
-                for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
+                const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
+                // TODO: don't evaluate the last token of each sequence
+                for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
+                    const bool needs_logits = i < seq_tokens_size - 1;
+                    common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    n_logits += needs_logits;
                 }
             }
 
-            hs_cur.i_batch = i_batch;
-            i_batch += hs_cur.required_tokens;
+            hs_cur.i_logits = i_logits;
+            i_logits += n_logits;
 
             n_cur += hs_data[i1].required_tokens;
             if (++i1 == hs_task_count) {
@@ -861,7 +919,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
 
         if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
@@ -869,7 +927,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return;
         }
 
@@ -878,12 +936,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         eval_pairs.clear();
         for (size_t i = i0; i < i1; ++i) {
             auto & hs_cur = hs_data[i];
-            size_t li = hs_cur.common_prefix;
+            size_t li = 1; // skip the last logit of the common prefix (computed separately below)
             for (int s = 0; s < 4; ++s) {
                 for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
+                    eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]);
                 }
-                ++li;
             }
         }
         // Then we do the actual calculation
@@ -895,7 +952,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         for (size_t i = i0; i < i1; ++i) {
             auto & hs_cur = hs_data[i];
 
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));
+            // get the logits of the last token of the common prefix
+            std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));
 
             const auto first_probs = softmax(tok_logits);
 
@@ -919,7 +977,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 }
             }
 
-            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+            //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
 
             // If the gold ending got the maximum logprobe add one accuracy point
             if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -927,8 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             }
 
             // Print the accumulated accuracy mean x 100
-            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
-            fflush(stdout);
+            LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
         }
 
         i0 = i1 - 1;
@@ -936,7 +993,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
     llama_batch_free(batch);
 
-    printf("\n");
+    LOG("\n");
 }
 
 struct winogrande_entry {
@@ -945,7 +1002,7 @@ struct winogrande_entry {
     std::array<std::string, 2> choices;
     int answer;
 
-    size_t i_batch;
+    size_t i_logits;
     size_t common_prefix;
     size_t required_tokens;
     size_t n_base1; // number of tokens for context + choice 1
@@ -953,7 +1010,7 @@ struct winogrande_entry {
     std::vector<llama_token> seq_tokens[2];
 };
 
-static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
     std::vector<winogrande_entry> result;
     std::istringstream in(prompt);
     std::string line;
@@ -980,7 +1037,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string&
             }
         }
         if (ipos != 4) {
-            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
             continue;
         }
         auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -994,13 +1051,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string&
             if (sentence[where] == '_') break;
         }
         if (where == int(sentence.size())) {
-            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
             continue;
         }
         std::istringstream stream(answer.c_str());
         int i_answer; stream >> i_answer;
         if (stream.fail() || i_answer < 1 || i_answer > 2) {
-            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
             continue;
         }
         result.emplace_back();
@@ -1023,20 +1080,22 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string&
  *    0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
  *
  */
-static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+static void winogrande_score(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     constexpr int k_min_trailing_ctx = 3;
 
     auto data = load_winogrande_from_csv(params.prompt);
     if (data.empty()) {
-        fprintf(stderr, "%s: no tasks\n", __func__);
+        LOG_ERR("%s: no tasks\n", __func__);
         return;
     }
 
-    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
 
     if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
         std::mt19937 rng(1);
         std::vector<int> aux(data.size());
         for (int i = 0; i < int(data.size()); ++i) {
@@ -1054,14 +1113,11 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         data = std::move(selected);
     }
 
-    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
-
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    LOG_INF("%s : tokenizing selected tasks\n", __func__);
 
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+        task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
 
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1071,27 +1127,30 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.common_prefix++;
         }
 
+        // TODO: the last token of each of the sequences don't need to be evaluated
         task.required_tokens = task.common_prefix +
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
 
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
+        task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
     }
 
-    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+    LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
-    const int max_tasks_per_batch = 128;
-    const int max_seq = 2*max_tasks_per_batch;
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    const int max_tasks_per_batch = 128;
+    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
+
+    llama_batch batch = llama_batch_init(n_ctx, 0, 2);
 
     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    // TODO: this could be made smaller; it's currently the worst-case size
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -1104,29 +1163,33 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         int n_cur = 0;
 
         size_t i1 = i0;
-        size_t i_batch = 0;
+        size_t i_logits = 0;
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
+            int n_logits = 0;
             const int s0 = 2*(i1 - i0);
             if (s0 + 2 > max_seq) {
                 break;
             }
 
             for (size_t i = 0; i < data[i1].common_prefix; ++i) {
-                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
+                common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
             }
             batch.logits[batch.n_tokens - 1] = true;
+            n_logits += 1;
 
             for (int s = 0; s < 2; ++s) {
+                // TODO: end before the last token, no need to predict past the end of the sequences
                 for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+                    common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+                    n_logits += 1;
                 }
             }
 
-            data[i1].i_batch = i_batch;
-            i_batch += data[i1].required_tokens;
+            data[i1].i_logits = i_logits;
+            i_logits += n_logits;
 
             n_cur += data[i1].required_tokens;
             if (++i1 == data.size()) {
@@ -1135,7 +1198,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         }
 
         if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
@@ -1143,7 +1206,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return;
         }
 
@@ -1157,15 +1220,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
             const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
             const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
-            size_t li = n_base1 - 1;
+            size_t li = n_base1 - task.common_prefix;
             for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
-                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
+                eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]);
             }
             const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
             const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
-            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            // FIXME: this uses the wrong first logits when not skipping the choice word
+            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix;
             for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
-                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
+                eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]);
             }
         }
         compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
@@ -1202,20 +1266,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             ++n_done;
 
             // print the accumulated accuracy mean x 100
-            printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
-            fflush(stdout);
+            LOG("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
         }
 
         i0 = i1 - 1;
     }
 
-    printf("\n");
+    LOG("\n");
 
     if (n_done < 100) return;
 
     const float p = 1.f*n_correct/n_done;
     const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+
+    LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }
 
 static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1254,17 +1318,17 @@ struct multiple_choice_task {
     }
 
     // For evaluation
-    size_t i_batch;         // starting index in the llama_batch
+    size_t i_logits;        // starting index of logits in the llama_batch
     size_t common_prefix;   // max number of initial tokens that are the same in all sentences
     size_t required_tokens; // needed number of tokens to evaluate all answers
     std::vector<std::vector<llama_token>> seq_tokens;
     std::vector<float> log_probs;
 };
 
-static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
     if (task.question.empty() || task.mc1.answers.empty()) {
         if (log_error) {
-            printf("%s: found bad task with empty question and/or answers\n", __func__);
+            LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
         }
         return false;
     }
@@ -1272,11 +1336,11 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
     for (auto& answer : task.mc1.answers) {
         if (answer.empty()) {
             if (log_error) {
-                printf("%s: found empty answer\n", __func__);
+                LOG_ERR("%s: found empty answer\n", __func__);
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
+        task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {
@@ -1320,20 +1384,22 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
 //     git@hf.co:datasets/Stevross/mmlu
 //     https://huggingface.co/datasets/truthful_qa
 //
-static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
+static void multiple_choice_score(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     std::istringstream strstream(params.prompt);
     uint32_t n_task;
     strstream.read((char *)&n_task, sizeof(n_task));
     if (strstream.fail() || n_task == 0) {
-        printf("%s: no tasks\n", __func__);
+        LOG_ERR("%s: no tasks\n", __func__);
         return;
     }
-    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+    LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
     std::vector<uint32_t> task_pos(n_task);
     strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
     if (strstream.fail()) {
-        printf("%s: failed to raad task positions from prompt\n", __func__);
+        LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
         return;
     }
 
@@ -1341,21 +1407,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
         // Use all tasks
         tasks.resize(n_task);
-        printf("%s: reading tasks", __func__);
-        int n_dot = n_task/100;
+        LOG_INF("%s: reading tasks", __func__);
+        int n_dot = std::max((int) n_task/100, 1);
         int i = 0;
         for (auto& task : tasks) {
             ++i;
             if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
                 return;
             }
-            if (i%n_dot == 0) printf(".");
+            if (i%n_dot == 0) LOG(".");
         }
-        printf("done\n");
+        LOG("done\n");
     }
     else {
-        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
         std::mt19937 rng(1);
         std::vector<int> aux(n_task);
         for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1368,24 +1434,19 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             aux.pop_back();
             strstream.seekg(task_pos[idx], std::ios::beg);
             if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                 return;
             }
         }
         n_task = params.multiple_choice_tasks;
     }
 
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
-    printf("%s: preparing task data", __func__);
-    fflush(stdout);
+    LOG_INF("%s: preparing task data", __func__);
     if (n_task > 500) {
-        printf("...");
-        fflush(stdout);
+        LOG("...");
         std::atomic<int> counter(0);
         std::atomic<int> n_bad(0);
-        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
             int num_tasks = tasks.size();
             int n_bad_local = 0;
             while (true) {
@@ -1396,7 +1457,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 }
                 int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
                 for (int i = first; i < last; ++i) {
-                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                    if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
                 }
             }
         };
@@ -1406,44 +1467,43 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         for (auto& w : workers) w = std::thread(prepare);
         prepare();
         for (auto& w : workers) w.join();
-        printf("done\n");
-        fflush(stdout);
+        LOG("done\n");
         int nbad = n_bad;
         if (nbad > 0) {
-            printf("%s: found %d malformed tasks\n", __func__, nbad);
+            LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
             return;
         }
     } else {
-        int n_dot = n_task/100;
+        int n_dot = std::max((int) n_task/100, 1);
         int i_task = 0;
         for (auto& task : tasks) {
             ++i_task;
-            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+            if (!multiple_choice_prepare_one_task(ctx, task, true)) {
                 return;
             }
             if (i_task%n_dot == 0) {
-                printf(".");
-                fflush(stdout);
+                LOG(".");
             }
         }
-        printf("done\n");
+        LOG("done\n");
     }
 
-    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+    LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
 
-    printf("\ntask\tacc_norm\n");
+    LOG("\ntask\tacc_norm\n");
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx   = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
     const int max_tasks_per_batch = 32;
-    const int max_seq = 4*max_tasks_per_batch;
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
 
     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -1458,17 +1518,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         int n_cur = 0;
 
         size_t i1 = i0;
-        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+        size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // batch as much tasks as possible into the available context
-        // each task has 4 unique seuqnce ids - one for each ending
+        // each task has 4 unique sequence ids - one for each ending
         // the common prefix is shared among the 4 sequences to save tokens
         // we extract logits only from the last common token and from all ending tokens of each sequence
         int s0 = 0;
         while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
             auto& cur_task = tasks[i1];
+            int n_logits = 0;
 
             int num_answers = cur_task.seq_tokens.size();
             if (s0 + num_answers > max_seq) {
@@ -1482,20 +1543,25 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
             for (size_t i = 0; i < cur_task.common_prefix; ++i) {
                 //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
-                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
+                common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
             }
             batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+            n_logits += 1;
 
             for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
+                const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
+                // TODO: don't evaluate the last token of each sequence
+                for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
+                    const bool needs_logits = i < seq_tokens_size - 1;
+                    common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    n_logits += needs_logits;
                 }
             }
 
             s0 += num_answers;
 
-            cur_task.i_batch = i_batch;
-            i_batch += cur_task.required_tokens;
+            cur_task.i_logits = i_logits;
+            i_logits += n_logits;
 
             n_cur += cur_task.required_tokens;
             if (++i1 == tasks.size()) {
@@ -1504,7 +1570,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         }
 
         if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
             return;
         }
 
@@ -1512,7 +1578,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return;
         }
 
@@ -1521,12 +1587,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         eval_pairs.clear();
         for (size_t i = i0; i < i1; ++i) {
             auto& cur_task = tasks[i];
-            size_t li = cur_task.common_prefix;
+            size_t li = 1; // skip the last logit of the common prefix (computed separately below)
             for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
+                    eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]);
                 }
-                ++li;
             }
         }
         // Then we do the actual calculation
@@ -1537,15 +1602,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         // compute the logprobs for each ending of the decoded tasks
         for (size_t i = i0; i < i1; ++i) {
             auto & cur_task = tasks[i];
-            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
             //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
             //    if (cur_task.mc1.labels[j] == 1) {
-            //        printf("%d", j+1);
+            //        LOG("%d", j+1);
             //    }
             //}
-            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
+            //LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);
 
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
+            // get the logits of the last token of the common prefix
+            std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));
 
             const auto first_probs = softmax(tok_logits);
 
@@ -1554,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 size_t count = 1;
                 float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
                 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    //printf("        %zu  %g\n", ir, eval_results[ir]);
+                    //LOG("        %zu  %g\n", ir, eval_results[ir]);
                     ++count;
                     log_prob += eval_results[ir++];
                 }
                 cur_task.log_probs[s] = log_prob / count;
-                //printf("        Final: %g\n", log_prob / count);
-                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+                //LOG("        Final: %g\n", log_prob / count);
+                //LOG("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
             }
 
             // Find the ending with maximum logprob
@@ -1580,8 +1646,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             ++n_done;
 
             // Print the accumulated accuracy mean x 100
-            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
-            fflush(stdout);
+            LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
         }
 
         i0 = i1 - 1;
@@ -1589,33 +1654,37 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
     llama_batch_free(batch);
 
-    if (n_done < 100) return;
+    if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
 
     float p = 1.f*n_correct/n_done;
     float sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    LOG("\n");
+    LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
     p = 1.f*n_done/n_tot_answers;
     sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
 
-    printf("\n");
+    LOG_INF("\n");
 }
 
-static void kl_divergence(llama_context * ctx, const gpt_params & params) {
+static void kl_divergence(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.logits_file.empty()) {
-        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
         return;
     }
     std::ifstream in(params.logits_file.c_str(), std::ios::binary);
     if (!in) {
-        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
         return;
     }
     {
         char check[9]; check[8] = 0;
         in.read(check, 8);
         if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
             return;
         }
     }
@@ -1623,37 +1692,40 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     uint32_t n_ctx;
     in.read((char *)&n_ctx, sizeof(n_ctx));
     if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+        LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                 __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }
 
-    int n_vocab, n_chunk;
+    int n_vocab;
+    int n_chunk;
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {
-        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
         return;
     }
-    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+    if (n_vocab != llama_vocab_n_tokens(vocab)) {
+        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab));
     }
 
-    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
     if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
         return;
     }
 
     const int n_batch = params.n_batch;
     const int num_batches = (n_ctx + n_batch - 1)/n_batch;
     const int nv = 2*((n_vocab + 1)/2) + 4;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
-    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float>    kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -1667,9 +1739,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
         return std::make_pair(f, df);
     };
+    auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
+        if (count < 10) {
+            return 0.0;
+        }
+        double var = sumab/count - (suma/count)*(sumb/count);
+        var /= count - 1;
+        return var;
+    };
 
     kl_divergence_result kld;
-    auto kld_ptr = kld_values.data();
+    auto    kld_ptr =    kld_values.data();
+    auto p_diff_ptr = p_diff_values.data();
 
     for (int i = 0; i < n_chunk; ++i) {
         const int start =     i * n_ctx;
@@ -1678,13 +1759,15 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
             return;
         }
 
         // clear the KV cache
         llama_kv_cache_clear(ctx);
 
+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
@@ -1694,11 +1777,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
             // add BOS token for the first batch of each chunk
             if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[batch_start] = llama_vocab_bos(vocab);
             }
 
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return;
             }
 
@@ -1707,131 +1796,232 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
             if (num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
             }
         }
 
+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
-            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
+        LOG("\n");
+        LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
 
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                workers, log_probs_uint16, kld, kld_ptr);
-        kld_ptr += n_ctx - 1 - first;
+        process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
+        p_diff_ptr += n_ctx - 1 - first;
+        kld_ptr    += n_ctx - 1 - first;
 
-        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
-        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        auto p_top = 1.*kld.n_same_top/kld.count;
-        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
+        LOG("%4d", i+1);
 
-        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf    %.5f ± %.5f\n", i+1, exp(ppl.first),
-                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
-                p_top, d_p_top);
+        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+        const double ppl_val = exp(log_ppl.first);
+        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
+        LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
 
-        fflush(stdout);
+        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+        LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+
+        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+        LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+
+        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+        const double p_diff_rms_val = sqrt(p_diff_mse.first);
+        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
+        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+
+        double p_top_val = 1.*kld.n_same_top/kld.count;
+        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
+        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+
+        LOG("\n");
 
         logits.clear();
     }
-    printf("\n");
+    LOG("\n");
 
     if (kld.count < 100) return; // we do not wish to do statistics on so few values
 
     std::sort(kld_values.begin(), kld_values.end());
+    std::sort(p_diff_values.begin(), p_diff_values.end());
 
-    printf("===== KL-divergence statistics\n");
+    LOG("====== Perplexity statistics ======\n");
+
+    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+    const double ppl_val = exp(log_ppl.first);
+    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
+    LOG("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+
+    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+    const double ppl_base_val = exp(log_ppl_base.first);
+    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
+    LOG("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+
+    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+    // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
+    LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+
+    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+    LOG("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+
+    const double ppl_ratio_val = exp(log_ppl_ratio_val);
+    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
+    LOG("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+
+    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
+    const double ppl_diff_val = ppl_val - ppl_base_val;
+    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
+    LOG("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+
+    LOG("\n");
+
+    LOG("====== KL divergence statistics ======\n");
     auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
+    LOG("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
     auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                                : kld_values[kld_values.size()/2];
-    printf("Median : %10.6f\n", kld_median);
 
-    auto percentile = [&kld_values] (float fraction) {
-        if (fraction <= 0) return kld_values.front();
-        if (fraction >= 1) return kld_values.back();
-        float p = fraction*(kld_values.size() - 1);
+    auto percentile = [] (std::vector<float> values, float fraction) {
+        if (fraction <= 0) return values.front();
+        if (fraction >= 1) return values.back();
+        float p = fraction*(values.size() - 1);
         size_t ip = size_t(p); p -= ip;
-        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
+        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
     };
 
-    printf("Maximum: %10.6f\n", kld_values.back());
-    printf("KLD_99 : %10.6f\n", percentile(0.99f));
-    printf("KLD_95 : %10.6f\n", percentile(0.95f));
-    printf("KLD_90 : %10.6f\n", percentile(0.90f));
+    LOG("Maximum KLD: %10.6f\n", kld_values.back());
+    LOG("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("Median  KLD: %10.6f\n", kld_median);
+    LOG("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    LOG(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    LOG(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    LOG("Minimum KLD: %10.6f\n", kld_values.front());
 
-    printf("Minimum: %10.6f\n", kld_values.front());
-    printf("KLD_01 : %10.6f\n", percentile(0.01f));
-    printf("KLD_05 : %10.6f\n", percentile(0.05f));
-    printf("KLD_10 : %10.6f\n", percentile(0.10f));
+    LOG("\n");
 
+    LOG("====== Token probability statistics ======\n");
+
+    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
+    LOG("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
+
+    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
+                                               : p_diff_values[p_diff_values.size()/2];
+
+    LOG("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
+    LOG("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    LOG("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    LOG("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    LOG("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    LOG("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    LOG("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
+    LOG("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    LOG("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    LOG(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    LOG(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    LOG(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    LOG("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
+
+    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+    // LOG("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+
+    const double p_diff_rms_val = sqrt(p_diff_mse.first);
+    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
+    LOG("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+
+    const double same_top_p = 1.0*kld.n_same_top/kld.count;
+    LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    params.n_batch = 512;
-    if (!gpt_params_parse(argc, argv, params)) {
+    params.n_ctx = 512;
+    params.logits_all = true;
+    params.escape = false;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
         return 1;
     }
 
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    common_init();
+
+    const int32_t n_ctx = params.n_ctx;
+
+    if (n_ctx <= 0) {
+        LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        return 1;
+    }
+
+    const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
+
+    if (ppl) {
+        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
+        const int32_t n_kv = n_seq * n_ctx;
+
+        params.n_parallel = n_seq;
+        params.n_ctx      = n_kv;
+
+        params.n_batch = std::min(params.n_batch, n_kv);
+    } else {
+        params.n_batch = std::min(params.n_batch, params.n_ctx);
+        if (params.kl_divergence) {
+            params.n_parallel = 1;
+        } else {
+            // ensure there's at least enough seq_ids for HellaSwag
+            params.n_parallel = std::max(4, params.n_parallel);
+        }
+    }
 
     if (params.ppl_stride > 0) {
-        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+        LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
                 params.n_ctx, params.n_ctx + params.ppl_stride/2);
         params.n_ctx += params.ppl_stride/2;
     }
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
+
     if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     struct results_perplexity results;
@@ -1844,14 +2034,11 @@ int main(int argc, char ** argv) {
     } else if (params.kl_divergence) {
         kl_divergence(ctx, params);
     } else {
-        results = perplexity(ctx, params);
+        results = perplexity(ctx, params, n_ctx);
     }
 
-    llama_print_timings(ctx);
-    write_logfile(ctx, params, model, results);
-
-    llama_free(ctx);
-    llama_free_model(model);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_backend_free();
 
diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py
deleted file mode 100644
index 160966649..000000000
--- a/examples/pydantic-models-to-grammar-examples.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Function calling example using pydantic models.
-import datetime
-import importlib
-import json
-from enum import Enum
-from typing import Optional, Union
-
-import requests
-from pydantic import BaseModel, Field
-from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
-                                        create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
-
-
-# Function to get completion on the llama.cpp server with grammar.
-def create_completion(prompt, grammar):
-    headers = {"Content-Type": "application/json"}
-    data = {"prompt": prompt, "grammar": grammar}
-
-    response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
-    data = response.json()
-
-    print(data["content"])
-    return data["content"]
-
-
-# A function for the agent to send a message to the user.
-class SendMessageToUser(BaseModel):
-    """
-    Send a message to the User.
-    """
-    chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
-    message: str = Field(..., description="Message you want to send to the user.")
-
-    def run(self):
-        print(self.message)
-
-
-# Enum for the calculator tool.
-class MathOperation(Enum):
-    ADD = "add"
-    SUBTRACT = "subtract"
-    MULTIPLY = "multiply"
-    DIVIDE = "divide"
-
-
-# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
-class Calculator(BaseModel):
-    """
-    Perform a math operation on two numbers.
-    """
-    number_one: Union[int, float] = Field(..., description="First number.")
-    operation: MathOperation = Field(..., description="Math operation to perform.")
-    number_two: Union[int, float] = Field(..., description="Second number.")
-
-    def run(self):
-        if self.operation == MathOperation.ADD:
-            return self.number_one + self.number_two
-        elif self.operation == MathOperation.SUBTRACT:
-            return self.number_one - self.number_two
-        elif self.operation == MathOperation.MULTIPLY:
-            return self.number_one * self.number_two
-        elif self.operation == MathOperation.DIVIDE:
-            return self.number_one / self.number_two
-        else:
-            raise ValueError("Unknown operation.")
-
-
-# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
-# pydantic_model_list is the list of pydanitc models
-# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
-# outer_object_content is the name of outer object content.
-# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
-# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
-    outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
-
-print(gbnf_grammar)
-print(documentation)
-
-system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
-
-user_message = "What is 42 * 42?"
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-# This should output something like this:
-# {
-#     "function": "calculator",
-#     "function_parameters": {
-#         "number_one": 42,
-#         "operation": "multiply",
-#         "number_two": 42
-#     }
-# }
-function_dictionary = json.loads(text)
-if function_dictionary["function"] == "calculator":
-    function_parameters = {**function_dictionary["function_parameters"]}
-
-    print(Calculator(**function_parameters).run())
-    # This should output: 1764
-
-
-# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
-class Category(Enum):
-    """
-    The category of the book.
-    """
-    Fiction = "Fiction"
-    NonFiction = "Non-Fiction"
-
-
-class Book(BaseModel):
-    """
-    Represents an entry about a book.
-    """
-    title: str = Field(..., description="Title of the book.")
-    author: str = Field(..., description="Author of the book.")
-    published_year: Optional[int] = Field(..., description="Publishing year of the book.")
-    keywords: list[str] = Field(..., description="A list of keywords.")
-    category: Category = Field(..., description="Category of the book.")
-    summary: str = Field(..., description="Summary of the book.")
-
-
-# We need no additional parameters other than our list of pydantic models.
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
-
-system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
-
-text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(Book(**json_data))
-# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
-
-def get_current_datetime(output_format: Optional[str] = None):
-    """
-    Get the current date and time in the given format.
-    Args:
-         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
-    """
-    if output_format is None:
-        output_format = '%Y-%m-%d %H:%M:%S'
-    return datetime.datetime.now().strftime(output_format)
-
-
-# Example function to get the weather
-def get_current_weather(location, unit):
-    """Get the current weather in a given location"""
-    if "London" in location:
-        return json.dumps({"location": "London", "temperature": "42", "unit": unit.value})
-    elif "New York" in location:
-        return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
-    elif "North Pole" in location:
-        return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
-    else:
-        return json.dumps({"location": location, "temperature": "unknown"})
-
-
-# Here is a function definition in OpenAI style
-current_weather_tool = {
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "location": {
-                    "type": "string",
-                    "description": "The city and state, e.g. San Francisco, CA",
-                },
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-            },
-            "required": ["location"],
-        },
-    },
-}
-
-# Convert OpenAI function definition into pydantic model
-current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
-# Add the actual function to a pydantic model
-current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
-
-# Convert normal Python function to a pydantic model
-current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
-
-tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
-
-
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=tool_list, outer_object_name="function",
-    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
-
-system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
-
-
-text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(json_data)
-# Should output something like this:
-# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
-
-
-for call in json_data:
-    if call["function"] == "Calculator":
-        print(Calculator(**call["params"]).run())
-    elif call["function"] == "get_current_datetime":
-        print(current_datetime_model(**call["params"]).run())
-    elif call["function"] == "get_current_weather":
-        print(current_weather_tool_model(**call["params"]).run())
-# Should output something like this:
-# 2024-01-14 13:36:06
-# {"location": "London", "temperature": "42", "unit": "celsius"}
-# 1764
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 9acc7cc6d..93e5dcb6c 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -9,7 +9,7 @@ from inspect import getdoc, isclass
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
 
 from docstring_parser import parse
-from pydantic import BaseModel, Field, create_model
+from pydantic import BaseModel, create_model
 
 if TYPE_CHECKING:
     from types import GenericAlias
@@ -17,6 +17,9 @@ else:
     # python 3.8 compat
     from typing import _GenericAlias as GenericAlias
 
+# TODO: fix this
+# pyright: reportAttributeAccessIssue=information
+
 
 class PydanticDataType(Enum):
     """
@@ -50,35 +53,38 @@ class PydanticDataType(Enum):
 
 
 def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
-    if isclass(pydantic_type) and issubclass(pydantic_type, str):
+    origin_type = get_origin(pydantic_type)
+    origin_type = pydantic_type if origin_type is None else origin_type
+
+    if isclass(origin_type) and issubclass(origin_type, str):
         return PydanticDataType.STRING.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
+    elif isclass(origin_type) and issubclass(origin_type, bool):
         return PydanticDataType.BOOLEAN.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, int):
+    elif isclass(origin_type) and issubclass(origin_type, int):
         return PydanticDataType.INTEGER.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, float):
+    elif isclass(origin_type) and issubclass(origin_type, float):
         return PydanticDataType.FLOAT.value
-    elif isclass(pydantic_type) and issubclass(pydantic_type, Enum):
+    elif isclass(origin_type) and issubclass(origin_type, Enum):
         return PydanticDataType.ENUM.value
 
-    elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
-        return format_model_and_field_name(pydantic_type.__name__)
-    elif get_origin(pydantic_type) is list:
+    elif isclass(origin_type) and issubclass(origin_type, BaseModel):
+        return format_model_and_field_name(origin_type.__name__)
+    elif origin_type is list:
         element_type = get_args(pydantic_type)[0]
         return f"{map_pydantic_type_to_gbnf(element_type)}-list"
-    elif get_origin(pydantic_type) is set:
+    elif origin_type is set:
         element_type = get_args(pydantic_type)[0]
         return f"{map_pydantic_type_to_gbnf(element_type)}-set"
-    elif get_origin(pydantic_type) is Union:
+    elif origin_type is Union:
         union_types = get_args(pydantic_type)
         union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
         return f"union-{'-or-'.join(union_rules)}"
-    elif get_origin(pydantic_type) is Optional:
+    elif origin_type is Optional:
         element_type = get_args(pydantic_type)[0]
         return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
-    elif isclass(pydantic_type):
-        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
-    elif get_origin(pydantic_type) is dict:
+    elif isclass(origin_type):
+        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(origin_type.__name__)}"
+    elif origin_type is dict:
         key_type, value_type = get_args(pydantic_type)
         return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
     else:
@@ -115,7 +121,7 @@ def get_members_structure(cls, rule_name):
         # Modify this comprehension
         members = [
             f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
-            for name, param_type in cls.__annotations__.items()
+            for name, param_type in get_type_hints(cls).items()
             if name != "self"
         ]
 
@@ -234,8 +240,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
 
     # Define the integer part rule
     integer_part_rule = (
-        "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
-        f"-min{min_digit}" if min_digit is not None else "")
+        "integer-part"
+        + (f"-max{max_digit}" if max_digit is not None else "")
+        + (f"-min{min_digit}" if min_digit is not None else "")
     )
 
     # Define the fractional part rule based on precision constraints
@@ -293,17 +300,20 @@ def generate_gbnf_rule_for_type(
     field_name = format_model_and_field_name(field_name)
     gbnf_type = map_pydantic_type_to_gbnf(field_type)
 
-    if isclass(field_type) and issubclass(field_type, BaseModel):
+    origin_type = get_origin(field_type)
+    origin_type = field_type if origin_type is None else origin_type
+
+    if isclass(origin_type) and issubclass(origin_type, BaseModel):
         nested_model_name = format_model_and_field_name(field_type.__name__)
         nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
         rules.extend(nested_model_rules)
         gbnf_type, rules = nested_model_name, rules
-    elif isclass(field_type) and issubclass(field_type, Enum):
+    elif isclass(origin_type) and issubclass(origin_type, Enum):
         enum_values = [f'"\\"{e.value}\\""' for e in field_type]  # Adding escaped quotes
         enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
         rules.append(enum_rule)
         gbnf_type, rules = model_name + "-" + field_name, rules
-    elif get_origin(field_type) == list:  # Array
+    elif origin_type is list:  # Array
         element_type = get_args(field_type)[0]
         element_rule_name, additional_rules = generate_gbnf_rule_for_type(
             model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -313,7 +323,7 @@ def generate_gbnf_rule_for_type(
         rules.append(array_rule)
         gbnf_type, rules = model_name + "-" + field_name, rules
 
-    elif get_origin(field_type) == set or field_type == set:  # Array
+    elif origin_type is set:  # Array
         element_type = get_args(field_type)[0]
         element_rule_name, additional_rules = generate_gbnf_rule_for_type(
             model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -367,7 +377,7 @@ def generate_gbnf_rule_for_type(
             gbnf_type = f"{model_name}-{field_name}-optional"
         else:
             gbnf_type = f"{model_name}-{field_name}-union"
-    elif isclass(field_type) and issubclass(field_type, str):
+    elif isclass(origin_type) and issubclass(origin_type, str):
         if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
             triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
             markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
@@ -383,8 +393,8 @@ def generate_gbnf_rule_for_type(
             gbnf_type = PydanticDataType.STRING.value
 
     elif (
-        isclass(field_type)
-        and issubclass(field_type, float)
+        isclass(origin_type)
+        and issubclass(origin_type, float)
         and field_info
         and hasattr(field_info, "json_schema_extra")
         and field_info.json_schema_extra is not None
@@ -409,8 +419,8 @@ def generate_gbnf_rule_for_type(
         )
 
     elif (
-        isclass(field_type)
-        and issubclass(field_type, int)
+        isclass(origin_type)
+        and issubclass(origin_type, int)
         and field_info
         and hasattr(field_info, "json_schema_extra")
         and field_info.json_schema_extra is not None
@@ -458,7 +468,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
     if not issubclass(model, BaseModel):
         # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
         if hasattr(model, "__annotations__") and model.__annotations__:
-            model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}
+            model_fields = {name: (typ, ...) for name, typ in get_type_hints(model).items()}
         else:
             init_signature = inspect.signature(model.__init__)
             parameters = init_signature.parameters
@@ -466,7 +476,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
                             name != "self"}
     else:
         # For Pydantic models, use model_fields and check for ellipsis (required fields)
-        model_fields = model.__annotations__
+        model_fields = get_type_hints(model)
 
     model_rule_parts = []
     nested_rules = []
@@ -624,7 +634,7 @@ string ::= "\"" (
         "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
       )* "\"" ws
 ws ::= ([ \t\n] ws)?
-float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
 
 integer ::= [0-9]+"""
 
@@ -680,7 +690,7 @@ def generate_markdown_documentation(
         str: Generated text documentation.
     """
     documentation = ""
-    pyd_models = [(model, True) for model in pydantic_models]
+    pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
     for model, add_prefix in pyd_models:
         if add_prefix:
             documentation += f"{model_prefix}: {model.__name__}\n"
@@ -700,9 +710,9 @@ def generate_markdown_documentation(
             # Indenting the fields section
             documentation += f"  {fields_prefix}:\n"
         else:
-            documentation += f"  Fields:\n"
+            documentation += f"  Fields:\n"  # noqa: F541
         if isclass(model) and issubclass(model, BaseModel):
-            for name, field_type in model.__annotations__.items():
+            for name, field_type in get_type_hints(model).items():
                 # if name == "markdown_code_block":
                 #    continue
                 if get_origin(field_type) == list:
@@ -750,14 +760,17 @@ def generate_field_markdown(
     field_info = model.model_fields.get(field_name)
     field_description = field_info.description if field_info and field_info.description else ""
 
-    if get_origin(field_type) == list:
+    origin_type = get_origin(field_type)
+    origin_type = field_type if origin_type is None else origin_type
+
+    if origin_type == list:
         element_type = get_args(field_type)[0]
         field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
         if field_description != "":
             field_text += ":\n"
         else:
             field_text += "\n"
-    elif get_origin(field_type) == Union:
+    elif origin_type == Union:
         element_types = get_args(field_type)
         types = []
         for element_type in element_types:
@@ -778,7 +791,7 @@ def generate_field_markdown(
         return field_text
 
     if field_description != "":
-        field_text += f"        Description: " + field_description + "\n"
+        field_text += f"        Description: {field_description}\n"
 
     # Check for and include field-specific examples if available
     if hasattr(model, "Config") and hasattr(model.Config,
@@ -788,9 +801,9 @@ def generate_field_markdown(
             example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
             field_text += f"{indent}  Example: {example_text}\n"
 
-    if isclass(field_type) and issubclass(field_type, BaseModel):
+    if isclass(origin_type) and issubclass(origin_type, BaseModel):
         field_text += f"{indent}  Details:\n"
-        for name, type_ in field_type.__annotations__.items():
+        for name, type_ in get_type_hints(field_type).items():
             field_text += generate_field_markdown(name, type_, field_type, depth + 2)
 
     return field_text
@@ -833,7 +846,7 @@ def generate_text_documentation(
         str: Generated text documentation.
     """
     documentation = ""
-    pyd_models = [(model, True) for model in pydantic_models]
+    pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
     for model, add_prefix in pyd_models:
         if add_prefix:
             documentation += f"{model_prefix}: {model.__name__}\n"
@@ -851,7 +864,7 @@ def generate_text_documentation(
 
         if isclass(model) and issubclass(model, BaseModel):
             documentation_fields = ""
-            for name, field_type in model.__annotations__.items():
+            for name, field_type in get_type_hints(model).items():
                 # if name == "markdown_code_block":
                 #    continue
                 if get_origin(field_type) == list:
@@ -944,7 +957,7 @@ def generate_field_text(
 
     if isclass(field_type) and issubclass(field_type, BaseModel):
         field_text += f"{indent}  Details:\n"
-        for name, type_ in field_type.__annotations__.items():
+        for name, type_ in get_type_hints(field_type).items():
             field_text += generate_field_text(name, type_, field_type, depth + 2)
 
     return field_text
@@ -1164,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
         dynamic_fields[param.name] = (
             param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
     # Creating the dynamic model
-    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)  # type: ignore[call-overload]
+    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
 
     for name, param_doc in param_docs:
         dynamic_model.model_fields[name].description = param_doc.description
@@ -1228,9 +1241,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
     return output
 
 
-from enum import Enum
-
-
 def json_schema_to_python_types(schema):
     type_map = {
         "any": Any,
@@ -1275,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
                     if items != {}:
                         array = {"properties": items}
                         array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
-                        fields[field_name] = (List[array_type], ...)  # type: ignore[valid-type]
+                        fields[field_name] = (List[array_type], ...)
                     else:
                         fields[field_name] = (list, ...)
                 elif field_type == "object":
@@ -1285,7 +1295,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
                     required = field_data.get("enum", [])
                     for key, field in fields.items():
                         if key not in required:
-                            fields[key] = (Optional[fields[key][0]], ...)
+                            optional_type = fields[key][0]
+                            fields[key] = (Optional[optional_type], ...)
                 else:
                     field_type = json_schema_to_python_types(field_type)
                     fields[field_name] = (field_type, ...)
@@ -1305,6 +1316,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
         required = dictionary.get("required", [])
         for key, field in fields.items():
             if key not in required:
-                fields[key] = (Optional[fields[key][0]], ...)
+                optional_type = fields[key][0]
+                fields[key] = (Optional[optional_type], ...)
     custom_model = create_model(model_name, **fields)
     return custom_model
diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py
new file mode 100755
index 000000000..eb000d5cc
--- /dev/null
+++ b/examples/pydantic_models_to_grammar_examples.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python3
+
+"""Function calling example using pydantic models."""
+
+from __future__ import annotations
+
+import argparse
+import datetime
+import json
+import logging
+import textwrap
+import sys
+from enum import Enum
+from typing import Optional, Union
+
+import requests
+from pydantic import BaseModel, Field
+from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
+                                        create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
+
+
+def create_completion(host, prompt, gbnf_grammar):
+    """Calls the /completion API on llama-server.
+
+    See
+    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    """
+    print(f"  Request:\n    Grammar:\n{textwrap.indent(gbnf_grammar, '      ')}\n    Prompt:\n{textwrap.indent(prompt.rstrip(), '      ')}")
+    headers = {"Content-Type": "application/json"}
+    data = {"prompt": prompt, "grammar": gbnf_grammar}
+    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
+    assert data.get("error") is None, data
+    logging.info("Result: %s", result)
+    content = result["content"]
+    print(f"  Model: {result['model']}")
+    print(f"  Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), '    ')}")
+    return content
+
+
+# A function for the agent to send a message to the user.
+class SendMessageToUser(BaseModel):
+    """Send a message to the User."""
+    chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
+    message: str = Field(..., description="Message you want to send to the user.")
+
+    def run(self):
+        print(f"SendMessageToUser: {self.message}")
+
+
+def example_rce(host):
+    """Minimal test case where the LLM call an arbitrary python function."""
+    print("- example_rce")
+    tools = [SendMessageToUser]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "SendMessageToUser":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    tool(**json_data["function_parameters"]).run()
+    return 0
+
+
+# Enum for the calculator tool.
+class MathOperation(Enum):
+    ADD = "add"
+    SUBTRACT = "subtract"
+    MULTIPLY = "multiply"
+    DIVIDE = "divide"
+
+
+# Simple pydantic calculator tool for the agent that can add, subtract,
+# multiply, and divide. Docstring and description of fields will be used in
+# system prompt.
+class Calculator(BaseModel):
+    """Perform a math operation on two numbers."""
+    number_one: Union[int, float] = Field(..., description="First number.")
+    operation: MathOperation = Field(..., description="Math operation to perform.")
+    number_two: Union[int, float] = Field(..., description="Second number.")
+
+    def run(self):
+        if self.operation == MathOperation.ADD:
+            return self.number_one + self.number_two
+        elif self.operation == MathOperation.SUBTRACT:
+            return self.number_one - self.number_two
+        elif self.operation == MathOperation.MULTIPLY:
+            return self.number_one * self.number_two
+        elif self.operation == MathOperation.DIVIDE:
+            return self.number_one / self.number_two
+        else:
+            raise ValueError("Unknown operation.")
+
+
+def example_calculator(host):
+    """Have the LLM ask to get a calculation done.
+
+    Here the grammar gets generated by passing the available function models to
+    generate_gbnf_grammar_and_documentation function. This also generates a
+    documentation usable by the LLM.
+
+    pydantic_model_list is the list of pydantic models outer_object_name is an
+    optional name for an outer object around the actual model object. Like a
+    "function" object with "function_parameters" which contains the actual model
+    object. If None, no outer object will be generated outer_object_content is
+    the name of outer object content.
+
+    model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
+    fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
+    """
+    print("- example_calculator")
+    tools = [SendMessageToUser, Calculator]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message1 = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = {
+        "function": "Calculator",
+        "function_parameters": {
+            "number_one": 42,
+            "operation": "multiply",
+            "number_two": 42
+        }
+    }
+    if json_data != expected:
+        print("  Result is not as expected!")
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "Calculator":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    result = tool(**json_data["function_parameters"]).run()
+    print(f"  Call {json_data['function']} gave result {result}")
+    return 0
+
+
+class Category(Enum):
+    """The category of the book."""
+    Fiction = "Fiction"
+    NonFiction = "Non-Fiction"
+
+
+class Book(BaseModel):
+    """Represents an entry about a book."""
+    title: str = Field(..., description="Title of the book.")
+    author: str = Field(..., description="Author of the book.")
+    published_year: Optional[int] = Field(..., description="Publishing year of the book.")
+    keywords: list[str] = Field(..., description="A list of keywords.")
+    category: Category = Field(..., description="Category of the book.")
+    summary: str = Field(..., description="Summary of the book.")
+
+
+def example_struct(host):
+    """A example structured output based on pydantic models.
+
+    The LLM will create an entry for a Book database out of an unstructured
+    text. We need no additional parameters other than our list of pydantic
+    models.
+    """
+    print("- example_struct")
+    tools = [Book]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
+    system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
+    text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    # In this case, there's no function nor function_parameters.
+    # Here the result will vary based on the LLM used.
+    keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
+    if keys != sorted(json_data.keys()):
+        print(f"Unexpected result: {sorted(json_data.keys())}")
+        return 1
+    book = Book(**json_data)
+    print(f"  As a Book object: %s" % book)
+    return 0
+
+
+def get_current_datetime(output_format: Optional[str] = None):
+    """Get the current date and time in the given format.
+
+    Args:
+         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
+    """
+    return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
+
+
+# Example function to get the weather.
+def get_current_weather(location, unit):
+    """Get the current weather in a given location"""
+    if "London" in location:
+        return json.dumps({"location": "London", "temperature": "42", "unit": unit.value})
+    elif "New York" in location:
+        return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
+    elif "North Pole" in location:
+        return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
+    return json.dumps({"location": location, "temperature": "unknown"})
+
+
+def example_concurrent(host):
+    """An example for parallel function calling with a Python function, a pydantic
+    function model and an OpenAI like function definition.
+    """
+    print("- example_concurrent")
+    # Function definition in OpenAI style.
+    current_weather_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+    # Convert OpenAI function definition into pydantic model.
+    current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+    # Add the actual function to a pydantic model.
+    current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
+
+    # Convert normal Python function to a pydantic model.
+    current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
+
+    tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
+    system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+    text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = [
+      {
+        "function": "get_current_datetime",
+        "params": {
+          "output_format": "%Y-%m-%d %H:%M:%S"
+        }
+      },
+      {
+        "function": "get_current_weather",
+        "params": {
+          "location": "London",
+          "unit": "celsius"
+        }
+      },
+      {
+        "function": "Calculator",
+        "params": {
+          "number_one": 42,
+          "operation": "multiply",
+          "number_two": 42
+        }
+      }
+    ]
+    res = 0
+    if json_data != expected:
+        print("  Result is not as expected!")
+        print("  This can happen on highly quantized models")
+        res = 1
+    tools_map = {tool.__name__:tool for tool in tools}
+    for call in json_data:
+      tool = tools_map.get(call["function"])
+      if not tool:
+          print(f"Error: unknown tool {call['function']}")
+          return 1
+      result = tool(**call["params"]).run()
+      print(f"  Call {call['function']} returned {result}")
+    # Should output something like this:
+    #   Call get_current_datetime returned 2024-07-15 09:50:38
+    #   Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
+    #   Call Calculator returned 1764
+    return res
+
+
+def main():
+    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
+    parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
+    parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
+    ret = 0
+    # Comment out below to only run the example you want.
+    ret = ret or example_rce(args.host)
+    ret = ret or example_calculator(args.host)
+    ret = ret or example_struct(args.host)
+    ret = ret or example_concurrent(args.host)
+    return ret
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index e31cf5e38..9a3a0d3cd 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,6 +1,6 @@
-set(TARGET quantize-stats)
+set(TARGET llama-quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 1d05f1391..bd2f73467 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-context.h"
+#include "common.h"
 
 #include <algorithm>
 #include <cassert>
@@ -9,11 +9,9 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <numeric>
 #include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -23,7 +21,7 @@
 #endif
 
 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.gguf";
+    std::string model = DEFAULT_MODEL_PATH;
     bool verbose = false;
     bool per_layer_stats = false;
     bool print_histogram = false;
@@ -142,7 +140,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -154,9 +152,9 @@ static void test_roundtrip_on_chunk(
     }
 
     if (use_reference) {
-        qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
+        qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
     }
     qfns.to_float(quantized_scratch, output_scratch, chunk_size);
 
@@ -166,7 +164,7 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -187,13 +185,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;
 
     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
                 output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
              &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {
@@ -205,7 +203,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                         quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };
@@ -311,7 +309,7 @@ int main(int argc, char ** argv) {
         auto mparams = llama_model_default_params();
         mparams.use_mlock  = false;
 
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
 
         if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
@@ -319,25 +317,24 @@ int main(int argc, char ** argv) {
         }
 
         auto cparams = llama_context_default_params();
-        cparams.n_ctx      = 256;
-        cparams.seed       = 1;
+        cparams.n_ctx = 256;
 
-        ctx = llama_new_context_with_model(model, cparams);
+        ctx = llama_init_from_model(model, cparams);
 
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
     }
 
-    const auto &tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(ctx);
 
     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors) {
+    for (const auto & kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -350,7 +347,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
         included_layers++;
@@ -372,8 +369,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (qfns.from_float && qfns.to_float) {
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n",  ggml_type_name(type));
             }
@@ -382,7 +380,7 @@ int main(int argc, char ** argv) {
 
             error_stats global_stats {};
 
-            for (const auto& kv_tensor : tensors) {
+            for (const auto & kv_tensor : tensors) {
                 if (!layer_included(params, kv_tensor.first)) {
                     continue;
                 }
@@ -394,7 +392,7 @@ int main(int argc, char ** argv) {
                 test_roundtrip_on_layer(
                         layer_name,
                         params.per_layer_stats,
-                        qfns,
+                        *qfns, *qfns_cpu,
                         params.reference,
                         kv_tensor.second,
                         input_scratch,
@@ -411,7 +409,7 @@ int main(int argc, char ** argv) {
 
 
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 6f374a2bd..47e5cbe30 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
-set(TARGET quantize)
+set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
index c8b9a27a0..f9cce7b21 100644
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -1,22 +1,107 @@
 # quantize
 
-TODO
+You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
 
-## Llama 2 7B
+Note: It is synced from llama.cpp `main` every 6 hours.
 
-Quantization | Bits per Weight (BPW)
--- | --
-Q2_K | 3.35
-Q3_K_S | 3.50
-Q3_K_M | 3.91
-Q3_K_L | 4.27
-Q4_K_S | 4.58
-Q4_K_M | 4.84
-Q5_K_S | 5.52
-Q5_K_M | 5.68
-Q6_K | 6.56
+Example usage:
+
+```bash
+# obtain the official LLaMA model weights and place them in ./models
+ls ./models
+llama-2-7b tokenizer_checklist.chk tokenizer.model
+# [Optional] for models using BPE tokenizers
+ls ./models
+<folder containing weights and tokenizer json> vocab.json
+# [Optional] for PyTorch .bin models like Mistral-7B
+ls ./models
+<folder containing weights and tokenizer json>
+
+# install Python dependencies
+python3 -m pip install -r requirements.txt
+
+# convert the model to ggml FP16 format
+python3 convert_hf_to_gguf.py models/mymodel/
+
+# quantize the model to 4-bits (using Q4_K_M method)
+./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
+
+# update the gguf filetype to current version if older version is now unsupported
+./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
+```
+
+Run the quantized model:
+
+```bash
+# start inference on a gguf model
+./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
+```
+
+When running the larger models, make sure you have enough disk space to store all the intermediate files.
+
+## Memory/Disk Requirements
+
+As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+
+| Model | Original size | Quantized size (Q4_0) |
+|------:|--------------:|----------------------:|
+|    7B |         13 GB |                3.9 GB |
+|   13B |         24 GB |                7.8 GB |
+|   30B |         60 GB |               19.5 GB |
+|   65B |        120 GB |               38.5 GB |
+
+## Quantization
+
+Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
+
+*(outdated)*
+
+| Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
+|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
+|    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
+|    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
+|    7B | ms/tok @ 4th |    127 |     55 |     54 |     76 |     83 |     72 |
+|    7B | ms/tok @ 8th |    122 |     43 |     45 |     52 |     56 |     67 |
+|    7B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
+|   13B | perplexity   | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
+|   13B | file size    |  25.0G |   6.8G |   7.6G |   8.3G |   9.1G |    13G |
+|   13B | ms/tok @ 4th |      - |    103 |    105 |    148 |    160 |    131 |
+|   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
+|   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
+
+- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
+- recent k-quants improvements and new i-quants
+  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
+  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
+  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
+  - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
+  - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
+  - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
+  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
+  - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
+  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
+  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
+  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
+  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
+  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
+
+**Llama 2 7B**
+
+| Quantization | Bits per Weight (BPW) |
+|--------------|-----------------------|
+| Q2_K         | 3.35                  |
+| Q3_K_S       | 3.50                  |
+| Q3_K_M       | 3.91                  |
+| Q3_K_L       | 4.27                  |
+| Q4_K_S       | 4.58                  |
+| Q4_K_M       | 4.84                  |
+| Q5_K_S       | 5.52                  |
+| Q5_K_M       | 5.68                  |
+| Q6_K         | 6.56                  |
+
+**Llama 2 13B**
 
-## Llama 2 13B
 Quantization | Bits per Weight (BPW)
 -- | --
 Q2_K | 3.34
@@ -29,7 +114,7 @@ Q5_K_S | 5.51
 Q5_K_M | 5.67
 Q6_K | 6.56
 
-# Llama 2 70B
+**Llama 2 70B**
 
 Quantization | Bits per Weight (BPW)
 -- | --
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 7662ec80c..8d47b17b6 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,7 +8,6 @@
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
-#include <algorithm>
 
 struct quant_option {
     std::string name;
@@ -17,41 +16,59 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
-    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
-    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
-    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
-    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
-    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
-    { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
-    { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
-    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
-    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
-    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
-    { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization",            },
-    { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix",        },
-    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization"   ,          },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
-    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
-    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
-    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
-    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
-    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
-    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
+    { "Q4_0",     LLAMA_FTYPE_MOSTLY_Q4_0,     " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_1",     LLAMA_FTYPE_MOSTLY_Q4_1,     " 4.78G, +0.4511 ppl @ Llama-3-8B",  },
+    { "Q5_0",     LLAMA_FTYPE_MOSTLY_Q5_0,     " 5.21G, +0.1316 ppl @ Llama-3-8B",  },
+    { "Q5_1",     LLAMA_FTYPE_MOSTLY_Q5_1,     " 5.65G, +0.1062 ppl @ Llama-3-8B",  },
+    { "IQ2_XXS",  LLAMA_FTYPE_MOSTLY_IQ2_XXS,  " 2.06 bpw quantization",            },
+    { "IQ2_XS",   LLAMA_FTYPE_MOSTLY_IQ2_XS,   " 2.31 bpw quantization",            },
+    { "IQ2_S",    LLAMA_FTYPE_MOSTLY_IQ2_S,    " 2.5  bpw quantization",            },
+    { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7  bpw quantization",            },
+    { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization",            },
+    { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization",            },
+    { "TQ1_0",    LLAMA_FTYPE_MOSTLY_TQ1_0,    " 1.69 bpw ternarization",           },
+    { "TQ2_0",    LLAMA_FTYPE_MOSTLY_TQ2_0,    " 2.06 bpw ternarization",           },
+    { "Q2_K",     LLAMA_FTYPE_MOSTLY_Q2_K,     " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
+    { "Q2_K_S",   LLAMA_FTYPE_MOSTLY_Q2_K_S,   " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
+    { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization",            },
+    { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization",            },
+    { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.66 bpw quantization mix",        },
+    { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M"                   },
+    { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization",             },
+    { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B",  },
+    { "Q3_K_M",   LLAMA_FTYPE_MOSTLY_Q3_K_M,   " 3.74G, +0.6569 ppl @ Llama-3-8B",  },
+    { "Q3_K_L",   LLAMA_FTYPE_MOSTLY_Q3_K_L,   " 4.03G, +0.5562 ppl @ Llama-3-8B",  },
+    { "IQ4_NL",   LLAMA_FTYPE_MOSTLY_IQ4_NL,   " 4.50 bpw non-linear quantization", },
+    { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
+    { "Q4_K",     LLAMA_FTYPE_MOSTLY_Q4_K_M,   "alias for Q4_K_M",                  },
+    { "Q4_K_S",   LLAMA_FTYPE_MOSTLY_Q4_K_S,   " 4.37G, +0.2689 ppl @ Llama-3-8B",  },
+    { "Q4_K_M",   LLAMA_FTYPE_MOSTLY_Q4_K_M,   " 4.58G, +0.1754 ppl @ Llama-3-8B",  },
+    { "Q5_K",     LLAMA_FTYPE_MOSTLY_Q5_K_M,   "alias for Q5_K_M",                  },
+    { "Q5_K_S",   LLAMA_FTYPE_MOSTLY_Q5_K_S,   " 5.21G, +0.1049 ppl @ Llama-3-8B",  },
+    { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
+    { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
+    { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B",  },
+    { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B",  },
+    { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G              @ 7B",          },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
+    { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  },
 };
 
+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";
+
+static bool striequals(const char * a, const char * b) {
+    while (*a && *b) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+        a++; b++;
+    }
+    return *a == *b;
+}
 
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
@@ -60,7 +77,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
         ftype_str.push_back(std::toupper(ch));
     }
     for (auto & it : QUANT_OPTIONS) {
-        if (it.name == ftype_str) {
+        if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
             return true;
@@ -83,17 +100,22 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }
 
 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --keep-split: will generate quantized model in the same shards as input\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
@@ -107,61 +129,80 @@ static void usage(const char * executable) {
     exit(1);
 }
 
-static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
-        return;
+        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
+        exit(1);
     }
     int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
+    in.read((char *)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
         printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        return;
+        exit(1);
     }
     for (int i = 0; i < n_entries; ++i) {
         int len; in.read((char *)&len, sizeof(len));
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
-            return;
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+            exit(1);
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[name];
         int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
+        in.read((char *)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
             imatrix_data = {};
-            return;
+            exit(1);
         }
         e.resize(nval);
-        in.read((char*)e.data(), nval*sizeof(float));
+        in.read((char *)e.data(), nval*sizeof(float));
         if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
+            printf("%s: failed reading data for entry %d\n", __func__, i);
             imatrix_data = {};
-            return;
+            exit(1);
         }
         if (ncall > 0) {
             for (auto& v : e) v /= ncall;
         }
+
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+        }
     }
-    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
 }
 
-static void prepare_imatrix(const std::string& imatrix_file,
-        const std::vector<std::string>& included_weights,
-        const std::vector<std::string>& excluded_weights,
-        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static int prepare_imatrix(const std::string & imatrix_file,
+        std::string & imatrix_dataset,
+        const std::vector<std::string> & included_weights,
+        const std::vector<std::string> & excluded_weights,
+        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
     if (!imatrix_file.empty()) {
-        load_imatrix(imatrix_file, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
     }
     if (imatrix_data.empty()) {
-        return;
+        return m_last_call;
     }
     if (!excluded_weights.empty()) {
         for (auto& name : excluded_weights) {
@@ -187,6 +228,19 @@ static void prepare_imatrix(const std::string& imatrix_file,
     if (!imatrix_data.empty()) {
         printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
     }
+    return m_last_call;
+}
+
+static ggml_type parse_ggml_type(const char * arg) {
+    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
+        auto type = (ggml_type)i;
+        const auto * name = ggml_type_name(type);
+        if (name && striequals(name, arg)) {
+            return type;
+        }
+    }
+    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    return GGML_TYPE_COUNT;
 }
 
 int main(int argc, char ** argv) {
@@ -199,10 +253,33 @@ int main(int argc, char ** argv) {
     int arg_idx = 1;
     std::string imatrix_file;
     std::vector<std::string> included_weights, excluded_weights;
+    std::vector<llama_model_kv_override> kv_overrides;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.output_tensor_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.token_embedding_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
+            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -225,6 +302,8 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+            params.keep_split = true;
         } else {
             usage(argv[0]);
         }
@@ -238,10 +317,48 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
+    std::string imatrix_dataset;
     std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+    }
+    if (!kv_overrides.empty()) {
+        kv_overrides.emplace_back();
+        kv_overrides.back().key[0] = 0;
+        params.kv_overrides = &kv_overrides;
     }
 
     llama_backend_init();
@@ -252,21 +369,28 @@ int main(int argc, char ** argv) {
     std::string fname_out;
 
     std::string ftype_str;
+    std::string suffix = ".gguf";
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
         arg_idx++;
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
-    }
-    else {
+    } else {
         fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
         arg_idx++;
 
         if (argc <= arg_idx) {
@@ -296,10 +420,12 @@ int main(int argc, char ** argv) {
 
     if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
-        fprintf(stderr, "\n===============================================================================================\n");
-        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
-        fprintf(stderr, "===============================================================================================\n\n\n");
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+        fprintf(stderr, "\n==========================================================================================================\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "==========================================================================================================\n\n\n");
         return 1;
     }
 
diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh
new file mode 100644
index 000000000..70f7610f9
--- /dev/null
+++ b/examples/quantize/tests.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/llama-gguf-split
+QUANTIZE=$1/llama-quantize
+MAIN=$1/llama-cli
+WORK_PATH=$TMP_DIR/quantize
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test the requanted model is loading properly
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
+echo PASS
+echo
+
+# 4. Requant mode without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4b. Test the requanted model is loading properly
+$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
diff --git a/examples/reason-act.sh b/examples/reason-act.sh
index 046c48db5..06d592799 100755
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
   MODEL="-m $2 "
 fi
 
-./main $MODEL --color \
+./llama-cli $MODEL --color \
     -f ./prompts/reason-act.txt \
     -i --interactive-first \
     --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
diff --git a/examples/regex_to_grammar.py b/examples/regex_to_grammar.py
new file mode 100644
index 000000000..5cd9210a4
--- /dev/null
+++ b/examples/regex_to_grammar.py
@@ -0,0 +1,20 @@
+import json, subprocess, sys, os
+
+assert len(sys.argv) >= 2
+[_, pattern, *rest] = sys.argv
+
+print(subprocess.check_output(
+    [
+        "python",
+        os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        "json_schema_to_grammar.py"),
+        *rest,
+        "-",
+        "--raw-pattern",
+    ],
+    text=True,
+    input=json.dumps({
+        "type": "string",
+        "pattern": pattern,
+    }, indent=2)))
diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt
new file mode 100644
index 000000000..512a602ec
--- /dev/null
+++ b/examples/retrieval/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-retrieval)
+add_executable(${TARGET} retrieval.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md
new file mode 100644
index 000000000..bc5f22e2f
--- /dev/null
+++ b/examples/retrieval/README.md
@@ -0,0 +1,69 @@
+# llama.cpp/examples/retrieval
+
+Demonstration of simple retrieval technique based on cosine similarity
+
+More info:
+https://github.com/ggerganov/llama.cpp/pull/6193
+
+### How to use
+
+`retieval.cpp` has parameters of its own:
+- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
+- `--chunk-size`: minimum size of each text chunk to be embedded
+- `--chunk-separator`: STRING to divide chunks by. newline by default
+
+`retrieval` example can be tested as follows:
+
+```bash
+make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
+```
+
+This chunks and embeds all given files and starts a loop requesting query inputs:
+
+```
+Enter query:
+```
+
+On each query input, top k chunks are shown along with file name, chunk position within file and original text:
+
+```
+Enter query: describe the mit license
+batch_decode: n_tokens = 6, n_seq = 1
+Top 3 similar chunks:
+filename: README.md
+filepos: 119
+similarity: 0.762334
+textdata:
+png)
+
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+
+[Roadmap](https://github.
+--------------------
+filename: License
+filepos: 0
+similarity: 0.725146
+textdata:
+MIT License
+
+Copyright (c) 2023 Georgi Gerganov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+--------------------
+filename: README.md
+filepos: 9178
+similarity: 0.621722
+textdata:
+com/cztomsik/ava) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [pythops/tenere](https://github.
+--------------------
+```
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
new file mode 100644
index 000000000..2439022a2
--- /dev/null
+++ b/examples/retrieval/retrieval.cpp
@@ -0,0 +1,304 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream> // TODO: remove me
+
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG("\n");
+}
+
+struct chunk {
+    // filename
+    std::string filename;
+    // original file position
+    size_t filepos;
+    // original text data
+    std::string textdata;
+    // tokenized text data
+    std::vector<llama_token> tokens;
+    // embedding
+    std::vector<float> embedding;
+};
+
+// chunk file data to chunks of size >= chunk_size
+// chunk_separator is the separator between chunks
+static std::vector<chunk> chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) {
+    std::vector<chunk> chunks;
+    std::ifstream f(filename.c_str());
+
+    if (!f.is_open()) {
+        LOG_ERR("could not open file %s\n", filename.c_str());
+        return chunks;
+    }
+
+    chunk current_chunk;
+    char buffer[1024];
+    int64_t filepos = 0;
+    std::string current;
+    while (f.read(buffer, 1024)) {
+        current += std::string(buffer, f.gcount());
+        size_t pos;
+        while ((pos = current.find(chunk_separator)) != std::string::npos) {
+            current_chunk.textdata += current.substr(0, pos + chunk_separator.size());
+            if ((int) current_chunk.textdata.size() > chunk_size) {
+                // save chunk
+                current_chunk.filepos = filepos;
+                current_chunk.filename = filename;
+                chunks.push_back(current_chunk);
+                // update filepos
+                filepos += (int) current_chunk.textdata.size();
+                // reset current_chunk
+                current_chunk = chunk();
+            }
+            current = current.substr(pos + chunk_separator.size());
+        }
+
+    }
+    // add leftover data to last chunk
+    if (current_chunk.textdata.size() > 0) {
+        if (chunks.empty()) {
+            current_chunk.filepos = filepos;
+            current_chunk.filename = filename;
+            chunks.push_back(current_chunk);
+        } else {
+            chunks.back().textdata += current_chunk.textdata;
+        }
+    }
+    f.close();
+    return chunks;
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
+    }
+}
+
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+    // clear previous kv_cache values (irrelevant for embeddings)
+    llama_kv_cache_clear(ctx);
+
+    // run model
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_decode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to decode\n", __func__);
+    }
+
+    for (int i = 0; i < batch.n_tokens; i++) {
+        if (!batch.logits[i]) {
+            continue;
+        }
+
+        // try to get sequence embeddings - supported only when pooling_type is not NONE
+        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+        if (embd == NULL) {
+            embd = llama_get_embeddings_ith(ctx, i);
+            if (embd == NULL) {
+                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
+                continue;
+            }
+        }
+
+        float * out = output + batch.seq_id[i][0] * n_embd;
+        common_embd_normalize(embd, out, n_embd, 2);
+    }
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // For BERT models, batch size must be equal to ubatch size
+    params.n_ubatch = params.n_batch;
+    params.embedding = true;
+
+    if (params.chunk_size <= 0) {
+        LOG_ERR("chunk_size must be positive\n");
+        return 1;
+    }
+    if (params.context_files.empty()) {
+        LOG_ERR("context_files must be specified\n");
+        return 1;
+    }
+
+    LOG_INF("processing files:\n");
+    for (auto & context_file : params.context_files) {
+        LOG_INF("%s\n", context_file.c_str());
+    }
+
+    std::vector<chunk> chunks;
+    for (auto & context_file : params.context_files) {
+        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
+        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
+    }
+    LOG_INF("Number of chunks: %zu\n", chunks.size());
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
+    const int n_ctx = llama_n_ctx(ctx);
+
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
+    if (n_ctx > n_ctx_train) {
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    }
+
+    // max batch size
+    const uint64_t n_batch = params.n_batch;
+    GGML_ASSERT(params.n_batch >= params.n_ctx);
+
+    // tokenize the prompts and trim
+    for (auto & chunk : chunks) {
+        auto inp = common_tokenize(ctx, chunk.textdata, true, false);
+        if (inp.size() > n_batch) {
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                    __func__, (long long int) inp.size(), (long long int) n_batch);
+            return 1;
+        }
+        // add eos if not present
+        if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != llama_vocab_eos(vocab))) {
+            inp.push_back(llama_vocab_eos(vocab));
+        }
+        chunk.tokens = inp;
+    }
+
+    // tokenization stats
+    if (params.verbose_prompt) {
+        for (int i = 0; i < (int) chunks.size(); i++) {
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+            }
+            LOG_INF("\n\n");
+        }
+    }
+
+    // initialize batch
+    const int n_chunks = chunks.size();
+    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+    // allocate output
+    const int n_embd = llama_model_n_embd(model);
+    std::vector<float> embeddings(n_chunks * n_embd, 0);
+    float * emb = embeddings.data();
+
+    // break into batches
+    int p = 0; // number of prompts processed already
+    int s = 0; // number of prompts in current batch
+    for (int k = 0; k < n_chunks; k++) {
+        // clamp to n_batch tokens
+        auto & inp = chunks[k].tokens;
+
+        const uint64_t n_toks = inp.size();
+
+        // encode if at capacity
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + p * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd);
+            common_batch_clear(batch);
+            p += s;
+            s = 0;
+        }
+
+        // add to batch
+        batch_add_seq(batch, inp, s);
+        s += 1;
+    }
+
+    // final batch
+    float * out = emb + p * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd);
+
+    // save embeddings to chunks
+    for (int i = 0; i < n_chunks; i++) {
+        chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+        // clear tokens as they are no longer needed
+        chunks[i].tokens.clear();
+    }
+
+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
+    // start loop, receive query and return top k similar chunks based on cosine similarity
+    std::string query;
+    while (true) {
+        LOG("Enter query: ");
+        std::getline(std::cin, query);
+        std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);
+
+        batch_add_seq(query_batch, query_tokens, 0);
+
+        std::vector<float> query_emb(n_embd, 0);
+        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+
+        common_batch_clear(query_batch);
+
+        // compute cosine similarities
+        {
+            std::vector<std::pair<int, float>> similarities;
+            for (int i = 0; i < n_chunks; i++) {
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+                similarities.push_back(std::make_pair(i, sim));
+            }
+
+            // sort similarities
+            std::sort(similarities.begin(), similarities.end(), [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
+                return a.second > b.second;
+            });
+
+            LOG("Top %d similar chunks:\n", params.sampling.top_k);
+            for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
+                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                LOG("similarity: %f\n", similarities[i].second);
+                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                LOG("--------------------\n");
+            }
+        }
+    }
+
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
+    // clean up
+    llama_batch_free(query_batch);
+    llama_backend_free();
+}
diff --git a/examples/rpc/CMakeLists.txt b/examples/rpc/CMakeLists.txt
new file mode 100644
index 000000000..ae48fb98d
--- /dev/null
+++ b/examples/rpc/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)
diff --git a/examples/rpc/README.md b/examples/rpc/README.md
new file mode 100644
index 000000000..312bb634d
--- /dev/null
+++ b/examples/rpc/README.md
@@ -0,0 +1,74 @@
+## Overview
+
+> [!IMPORTANT]
+> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
+> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
+
+The `rpc-server` allows  running `ggml` backend on a remote host.
+The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
+This can be used for distributed LLM inference with `llama.cpp` in the following way:
+
+```mermaid
+flowchart TD
+    rpcb<-->|TCP|srva
+    rpcb<-->|TCP|srvb
+    rpcb<-.->|TCP|srvn
+    subgraph hostn[Host N]
+    srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hostb[Host B]
+    srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hosta[Host A]
+    srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph host[Main Host]
+    local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
+    ggml[llama-cli]<-->rpcb[RPC backend]
+    end
+    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
+```
+
+Each host can run a different backend, e.g. one with CUDA and another with Metal.
+You can also run multiple `rpc-server` instances on the same host, each with a different backend.
+
+## Usage
+
+On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
+For example, to build the CUDA backend with RPC support:
+
+```bash
+mkdir build-rpc-cuda
+cd build-rpc-cuda
+cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
+cmake --build . --config Release
+```
+
+Then, start the `rpc-server` with the backend:
+
+```bash
+$ bin/rpc-server -p 50052
+create_backend: using CUDA backend
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
+ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
+Starting RPC server on 0.0.0.0:50052
+```
+
+When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
+```bash
+$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
+```
+This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
+
+
+On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
+Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
+
+```bash
+$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+```
+
+This way you can offload model layers to both local and remote devices.
+
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
new file mode 100644
index 000000000..8b1b23eda
--- /dev/null
+++ b/examples/rpc/rpc-server.cpp
@@ -0,0 +1,171 @@
+#include "ggml-cpu.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#include "ggml-rpc.h"
+#ifdef _WIN32
+#  include <windows.h>
+#else
+#  include <unistd.h>
+#endif
+#include <string>
+#include <stdio.h>
+
+struct rpc_server_params {
+    std::string host        = "127.0.0.1";
+    int         port        = 50052;
+    size_t      backend_mem = 0;
+};
+
+static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
+    fprintf(stderr, "\n");
+}
+
+static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-H" || arg == "--host") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.host = argv[i];
+        } else if (arg == "-p" || arg == "--port") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.port = std::stoi(argv[i]);
+            if (params.port <= 0 || params.port > 65535) {
+                return false;
+            }
+        } else if (arg == "-m" || arg == "--mem") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+    return true;
+}
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#elif GGML_USE_VULKAN
+    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+    backend = ggml_backend_vk_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+    }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_VULKAN
+    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_SYCL
+    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
+#else
+    #ifdef _WIN32
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatusEx(&status);
+        *total_mem = status.ullTotalPhys;
+        *free_mem = status.ullAvailPhys;
+    #else
+        long pages = sysconf(_SC_PHYS_PAGES);
+        long page_size = sysconf(_SC_PAGE_SIZE);
+        *total_mem = pages * page_size;
+        *free_mem = *total_mem;
+    #endif
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    rpc_server_params params;
+    if (!rpc_server_params_parse(argc, argv, params)) {
+        fprintf(stderr, "Invalid parameters\n");
+        return 1;
+    }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, "         Never expose the RPC server to an open network!\n");
+        fprintf(stderr, "         This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    std::string endpoint = params.host + ":" + std::to_string(params.port);
+    size_t free_mem, total_mem;
+    if (params.backend_mem > 0) {
+        free_mem = params.backend_mem;
+        total_mem = params.backend_mem;
+    } else {
+        get_backend_memory(&free_mem, &total_mem);
+    }
+    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt
new file mode 100644
index 000000000..cd6b0520e
--- /dev/null
+++ b/examples/run/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-run)
+add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/run/README.md b/examples/run/README.md
new file mode 100644
index 000000000..89a552079
--- /dev/null
+++ b/examples/run/README.md
@@ -0,0 +1,50 @@
+# llama.cpp/example/run
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
+
+```bash
+llama-run granite3-moe
+```
+
+```bash
+Description:
+  Runs a llm
+
+Usage:
+  llama-run [options] model [prompt]
+
+Options:
+  -c, --context-size <value>
+      Context size (default: 2048)
+  -n, -ngl, --ngl <value>
+      Number of GPU layers (default: 0)
+  --temp <value>
+      Temperature (default: 0.8)
+  -v, --verbose, --log-verbose
+      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
+  -h, --help
+      Show help message
+
+Commands:
+  model
+      Model is a string with an optional prefix of
+      huggingface:// (hf://), ollama://, https:// or file://.
+      If no protocol is specified and a file exists in the specified
+      path, file:// is assumed, otherwise if a file does not exist in
+      the specified path, ollama:// is assumed. Models that are being
+      pulled are downloaded with .partial extension while being
+      downloaded and then renamed as the file without the .partial
+      extension when complete.
+
+Examples:
+  llama-run llama3
+  llama-run ollama://granite-code
+  llama-run ollama://smollm:135m
+  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
+  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
+  llama-run https://example.com/some-file1.gguf
+  llama-run some-file2.gguf
+  llama-run file://some-file3.gguf
+  llama-run --ngl 999 some-file4.gguf
+  llama-run --ngl 999 some-file5.gguf Hello World
+```
diff --git a/examples/run/linenoise.cpp/LICENSE b/examples/run/linenoise.cpp/LICENSE
new file mode 100644
index 000000000..b006b3b24
--- /dev/null
+++ b/examples/run/linenoise.cpp/LICENSE
@@ -0,0 +1,26 @@
+Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
+Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/examples/run/linenoise.cpp/linenoise.cpp b/examples/run/linenoise.cpp/linenoise.cpp
new file mode 100644
index 000000000..a68f12a1a
--- /dev/null
+++ b/examples/run/linenoise.cpp/linenoise.cpp
@@ -0,0 +1,1350 @@
+#ifndef _WIN32
+/*
+ * You can find the latest source code at:
+ *
+ *   http://github.com/ericcurtin/linenoise.cpp
+ *
+ * Does a number of crazy assumptions that happen to be true in 99.9999% of
+ * the 2010 UNIX computers around.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+ * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *  *  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  *  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * References:
+ * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html
+ * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html
+ *
+ * Todo list:
+ * - Filter bogus Ctrl+<char> combinations.
+ * - Win32 support
+ *
+ * Bloat:
+ * - History search like Ctrl+r in readline?
+ *
+ * List of escape sequences used by this program, we do everything just
+ * with three sequences. In order to be so cheap we may have some
+ * flickering effect with some slow terminal, but the lesser sequences
+ * the more compatible.
+ *
+ * EL (Erase Line)
+ *    Sequence: ESC [ n K
+ *    Effect: if n is 0 or missing, clear from cursor to end of line
+ *    Effect: if n is 1, clear from beginning of line to cursor
+ *    Effect: if n is 2, clear entire line
+ *
+ * CUF (CUrsor Forward)
+ *    Sequence: ESC [ n C
+ *    Effect: moves cursor forward n chars
+ *
+ * CUB (CUrsor Backward)
+ *    Sequence: ESC [ n D
+ *    Effect: moves cursor backward n chars
+ *
+ * The following is used to get the terminal width if getting
+ * the width with the TIOCGWINSZ ioctl fails
+ *
+ * DSR (Device Status Report)
+ *    Sequence: ESC [ 6 n
+ *    Effect: reports the current cusor position as ESC [ n ; m R
+ *            where n is the row and m is the column
+ *
+ * When multi line mode is enabled, we also use an additional escape
+ * sequence. However multi line editing is disabled by default.
+ *
+ * CUU (Cursor Up)
+ *    Sequence: ESC [ n A
+ *    Effect: moves cursor up of n chars.
+ *
+ * CUD (Cursor Down)
+ *    Sequence: ESC [ n B
+ *    Effect: moves cursor down of n chars.
+ *
+ * When linenoiseClearScreen() is called, two additional escape sequences
+ * are used in order to clear the screen and position the cursor at home
+ * position.
+ *
+ * CUP (Cursor position)
+ *    Sequence: ESC [ H
+ *    Effect: moves the cursor to upper left corner
+ *
+ * ED (Erase display)
+ *    Sequence: ESC [ 2 J
+ *    Effect: clear the whole screen
+ *
+ */
+
+#    include "linenoise.h"
+
+#    include <ctype.h>
+#    include <errno.h>
+#    include <stdio.h>
+#    include <string.h>
+#    include <sys/file.h>
+#    include <sys/ioctl.h>
+#    include <sys/stat.h>
+#    include <sys/types.h>
+#    include <termios.h>
+#    include <unistd.h>
+
+#    include <memory>
+#    include <string>
+#    include <vector>
+
+#    define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100
+#    define LINENOISE_MAX_LINE                4096
+static std::vector<const char *>    unsupported_term   = { "dumb", "cons25", "emacs" };
+static linenoiseCompletionCallback *completionCallback = NULL;
+static linenoiseHintsCallback *hintsCallback = NULL;
+static linenoiseFreeHintsCallback *freeHintsCallback = NULL;
+static char *linenoiseNoTTY(void);
+static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags);
+static void refreshLineWithFlags(struct linenoiseState *l, int flags);
+
+static struct termios orig_termios; /* In order to restore at exit.*/
+static int maskmode = 0; /* Show "***" instead of input. For passwords. */
+static int rawmode = 0; /* For atexit() function to check if restore is needed*/
+static int mlmode = 0;  /* Multi line mode. Default is single line. */
+static int atexit_registered = 0; /* Register atexit just 1 time. */
+static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN;
+static int history_len = 0;
+static char **history = NULL;
+
+enum KEY_ACTION{
+        KEY_NULL = 0,            /* NULL */
+        CTRL_A = 1,         /* Ctrl+a */
+        CTRL_B = 2,         /* Ctrl-b */
+        CTRL_C = 3,         /* Ctrl-c */
+        CTRL_D = 4,         /* Ctrl-d */
+        CTRL_E = 5,         /* Ctrl-e */
+        CTRL_F = 6,         /* Ctrl-f */
+        CTRL_H = 8,         /* Ctrl-h */
+        TAB = 9,            /* Tab */
+        CTRL_K = 11,        /* Ctrl+k */
+        CTRL_L = 12,        /* Ctrl+l */
+        ENTER = 13,         /* Enter */
+        CTRL_N = 14,        /* Ctrl-n */
+        CTRL_P = 16,        /* Ctrl-p */
+        CTRL_T = 20,        /* Ctrl-t */
+        CTRL_U = 21,        /* Ctrl+u */
+        CTRL_W = 23,        /* Ctrl+w */
+        ESC = 27,           /* Escape */
+        BACKSPACE =  127    /* Backspace */
+};
+
+static void linenoiseAtExit(void);
+int linenoiseHistoryAdd(const char *line);
+#define REFRESH_CLEAN (1<<0)    // Clean the old prompt from the screen
+#define REFRESH_WRITE (1<<1)    // Rewrite the prompt on the screen.
+#define REFRESH_ALL (REFRESH_CLEAN|REFRESH_WRITE) // Do both.
+static void refreshLine(struct linenoiseState *l);
+
+class File {
+  public:
+    FILE * file = nullptr;
+
+    FILE * open(const std::string & filename, const char * mode) {
+        file = fopen(filename.c_str(), mode);
+
+        return file;
+    }
+
+    int lock() {
+        if (file) {
+            fd = fileno(file);
+            if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+                fd = -1;
+
+                return 1;
+            }
+        }
+
+        return 0;
+    }
+
+    ~File() {
+        if (fd >= 0) {
+            flock(fd, LOCK_UN);
+        }
+
+        if (file) {
+            fclose(file);
+        }
+    }
+
+  private:
+    int fd = -1;
+};
+
+__attribute__((format(printf, 1, 2)))
+/* Debugging function. */
+#if 0
+static void lndebug(const char *fmt, ...) {
+    static File file;
+    if (file.file == nullptr) {
+        file.open("/tmp/lndebug.txt", "a");
+    }
+
+    if (file.file != nullptr) {
+        va_list args;
+        va_start(args, fmt);
+        vfprintf(file.file, fmt, args);
+        va_end(args);
+        fflush(file.file);
+    }
+}
+#else
+static void lndebug(const char *, ...) {
+}
+#endif
+
+/* ======================= Low level terminal handling ====================== */
+
+/* Enable "mask mode". When it is enabled, instead of the input that
+ * the user is typing, the terminal will just display a corresponding
+ * number of asterisks, like "****". This is useful for passwords and other
+ * secrets that should not be displayed. */
+void linenoiseMaskModeEnable(void) {
+    maskmode = 1;
+}
+
+/* Disable mask mode. */
+void linenoiseMaskModeDisable(void) {
+    maskmode = 0;
+}
+
+/* Set if to use or not the multi line mode. */
+void linenoiseSetMultiLine(int ml) {
+    mlmode = ml;
+}
+
+/* Return true if the terminal name is in the list of terminals we know are
+ * not able to understand basic escape sequences. */
+static int isUnsupportedTerm(void) {
+    char *term = getenv("TERM");
+    if (term == NULL) return 0;
+    for (size_t j = 0; j < unsupported_term.size(); ++j) {
+        if (!strcasecmp(term, unsupported_term[j])) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* Raw mode: 1960 magic shit. */
+static int enableRawMode(int fd) {
+    struct termios raw;
+
+    if (!isatty(STDIN_FILENO)) goto fatal;
+    if (!atexit_registered) {
+        atexit(linenoiseAtExit);
+        atexit_registered = 1;
+    }
+    if (tcgetattr(fd,&orig_termios) == -1) goto fatal;
+
+    raw = orig_termios;  /* modify the original mode */
+    /* input modes: no break, no CR to NL, no parity check, no strip char,
+     * no start/stop output control. */
+    raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON);
+    /* output modes - disable post processing */
+    raw.c_oflag &= ~(OPOST);
+    /* control modes - set 8 bit chars */
+    raw.c_cflag |= (CS8);
+    /* local modes - choing off, canonical off, no extended functions,
+     * no signal chars (^Z,^C) */
+    raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG);
+    /* control chars - set return condition: min number of bytes and timer.
+     * We want read to return every single byte, without timeout. */
+    raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */
+
+    /* put terminal in raw mode after flushing */
+    if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal;
+    rawmode = 1;
+    return 0;
+
+fatal:
+    errno = ENOTTY;
+    return -1;
+}
+
+static void disableRawMode(int fd) {
+    /* Don't even check the return value as it's too late. */
+    if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1)
+        rawmode = 0;
+}
+
+/* Use the ESC [6n escape sequence to query the horizontal cursor position
+ * and return it. On error -1 is returned, on success the position of the
+ * cursor. */
+static int getCursorPosition(int ifd, int ofd) {
+    char buf[32];
+    int cols, rows;
+    unsigned int i = 0;
+
+    /* Report cursor location */
+    if (write(ofd, "\x1b[6n", 4) != 4) return -1;
+
+    /* Read the response: ESC [ rows ; cols R */
+    while (i < sizeof(buf)-1) {
+        if (read(ifd,buf+i,1) != 1) break;
+        if (buf[i] == 'R') break;
+        i++;
+    }
+    buf[i] = '\0';
+
+    /* Parse it. */
+    if (buf[0] != ESC || buf[1] != '[') return -1;
+    if (sscanf(buf+2,"%d;%d",&rows,&cols) != 2) return -1;
+    return cols;
+}
+
+/* Try to get the number of columns in the current terminal, or assume 80
+ * if it fails. */
+static int getColumns(int ifd, int ofd) {
+    struct winsize ws;
+
+    if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) {
+        /* ioctl() failed. Try to query the terminal itself. */
+        int start, cols;
+
+        /* Get the initial position so we can restore it later. */
+        start = getCursorPosition(ifd,ofd);
+        if (start == -1) goto failed;
+
+        /* Go to right margin and get position. */
+        if (write(ofd,"\x1b[999C",6) != 6) goto failed;
+        cols = getCursorPosition(ifd,ofd);
+        if (cols == -1) goto failed;
+
+        /* Restore position. */
+        if (cols > start) {
+            char seq[32];
+            snprintf(seq,32,"\x1b[%dD",cols-start);
+            if (write(ofd,seq,strlen(seq)) == -1) {
+                /* Can't recover... */
+            }
+        }
+        return cols;
+    } else {
+        return ws.ws_col;
+    }
+
+failed:
+    return 80;
+}
+
+/* Clear the screen. Used to handle ctrl+l */
+void linenoiseClearScreen(void) {
+    if (write(STDOUT_FILENO,"\x1b[H\x1b[2J",7) <= 0) {
+        /* nothing to do, just to avoid warning. */
+    }
+}
+
+/* Beep, used for completion when there is nothing to complete or when all
+ * the choices were already shown. */
+static void linenoiseBeep(void) {
+    fprintf(stderr, "\x7");
+    fflush(stderr);
+}
+
+/* Called by completeLine() and linenoiseShow() to render the current
+ * edited line with the proposed completion. If the current completion table
+ * is already available, it is passed as second argument, otherwise the
+ * function will use the callback to obtain it.
+ *
+ * Flags are the same as refreshLine*(), that is REFRESH_* macros. */
+static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags) {
+    /* Obtain the table of completions if the caller didn't provide one. */
+    linenoiseCompletions ctable;
+    if (lc == NULL) {
+        completionCallback(ls->buf, &ctable);
+        lc = &ctable;
+    }
+
+    /* Show the edited line with completion if possible, or just refresh. */
+    if (ls->completion_idx < lc->len) {
+        struct linenoiseState saved = *ls;
+        ls->len = ls->pos = strlen(lc->cvec[ls->completion_idx]);
+        ls->buf = lc->cvec[ls->completion_idx];
+        refreshLineWithFlags(ls, flags);
+        ls->len = saved.len;
+        ls->pos = saved.pos;
+        ls->buf = saved.buf;
+    } else {
+        refreshLineWithFlags(ls, flags);
+    }
+
+    if (lc == &ctable) {
+        ctable.to_free = false;
+    }
+}
+
+/* This is an helper function for linenoiseEdit*() and is called when the
+ * user types the <tab> key in order to complete the string currently in the
+ * input.
+ *
+ * The state of the editing is encapsulated into the pointed linenoiseState
+ * structure as described in the structure definition.
+ *
+ * If the function returns non-zero, the caller should handle the
+ * returned value as a byte read from the standard input, and process
+ * it as usually: this basically means that the function may return a byte
+ * read from the termianl but not processed. Otherwise, if zero is returned,
+ * the input was consumed by the completeLine() function to navigate the
+ * possible completions, and the caller should read for the next characters
+ * from stdin. */
+static int completeLine(struct linenoiseState *ls, int keypressed) {
+    linenoiseCompletions lc;
+    int nwritten;
+    char c = keypressed;
+
+    completionCallback(ls->buf, &lc);
+    if (lc.len == 0) {
+        linenoiseBeep();
+        ls->in_completion = 0;
+    } else {
+        switch(c) {
+            case 9: /* tab */
+                if (ls->in_completion == 0) {
+                    ls->in_completion = 1;
+                    ls->completion_idx = 0;
+                } else {
+                    ls->completion_idx = (ls->completion_idx + 1) % (lc.len + 1);
+                    if (ls->completion_idx == lc.len) linenoiseBeep();
+                }
+                c = 0;
+                break;
+            case 27: /* escape */
+                /* Re-show original buffer */
+                if (ls->completion_idx < lc.len) refreshLine(ls);
+                ls->in_completion = 0;
+                c = 0;
+                break;
+            default:
+                /* Update buffer and return */
+                if (ls->completion_idx < lc.len) {
+                    nwritten = snprintf(ls->buf, ls->buflen, "%s", lc.cvec[ls->completion_idx]);
+                    ls->len = ls->pos = nwritten;
+                }
+                ls->in_completion = 0;
+                break;
+        }
+
+        /* Show completion or original buffer */
+        if (ls->in_completion && ls->completion_idx < lc.len) {
+            refreshLineWithCompletion(ls, &lc, REFRESH_ALL);
+        } else {
+            refreshLine(ls);
+        }
+    }
+
+    return c; /* Return last read character */
+}
+
+/* Register a callback function to be called for tab-completion. */
+void linenoiseSetCompletionCallback(linenoiseCompletionCallback *fn) {
+    completionCallback = fn;
+}
+
+/* Register a hits function to be called to show hits to the user at the
+ * right of the prompt. */
+void linenoiseSetHintsCallback(linenoiseHintsCallback *fn) {
+    hintsCallback = fn;
+}
+
+/* Register a function to free the hints returned by the hints callback
+ * registered with linenoiseSetHintsCallback(). */
+void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *fn) {
+    freeHintsCallback = fn;
+}
+
+/* This function is used by the callback function registered by the user
+ * in order to add completion options given the input string when the
+ * user typed <tab>. See the example.c source code for a very easy to
+ * understand example. */
+void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) {
+    const size_t len  = strlen(str);
+    auto         copy = std::make_unique<char[]>(len + 1);
+    if (!copy) {
+        return;
+    }
+
+    memcpy(copy.get(), str, len + 1);
+    char ** cvec = static_cast<char **>(std::realloc(lc->cvec, sizeof(char *) * (lc->len + 1)));
+    if (cvec == nullptr) {
+        return;
+    }
+
+    lc->cvec = cvec;
+    lc->cvec[lc->len++] = copy.release();
+}
+
+/* Helper of refreshSingleLine() and refreshMultiLine() to show hints
+ * to the right of the prompt. */
+static void refreshShowHints(std::string & ab, struct linenoiseState * l, int plen) {
+    char seq[64];
+    if (hintsCallback && plen+l->len < l->cols) {
+        int color = -1, bold = 0;
+        const char *hint = hintsCallback(l->buf,&color,&bold);
+        if (hint) {
+            int hintlen = strlen(hint);
+            int hintmaxlen = l->cols-(plen+l->len);
+            if (hintlen > hintmaxlen) hintlen = hintmaxlen;
+            if (bold == 1 && color == -1) color = 37;
+            if (color != -1 || bold != 0)
+                snprintf(seq,64,"\033[%d;%d;49m",bold,color);
+            else
+                seq[0] = '\0';
+            ab.append(seq);
+            ab.append(hint, hintlen);
+            if (color != -1 || bold != 0)
+                ab.append("\033[0m");
+
+            /* Call the function to free the hint returned. */
+            if (freeHintsCallback) freeHintsCallback(hint);
+        }
+    }
+}
+
+/* Single line low level line refresh.
+ *
+ * Rewrite the currently edited line accordingly to the buffer content,
+ * cursor position, and number of columns of the terminal.
+ *
+ * Flags is REFRESH_* macros. The function can just remove the old
+ * prompt, just write it, or both. */
+static void refreshSingleLine(struct linenoiseState *l, int flags) {
+    char seq[64];
+    size_t plen = strlen(l->prompt);
+    int fd = l->ofd;
+    char *buf = l->buf;
+    size_t len = l->len;
+    size_t pos = l->pos;
+    std::string ab;
+    while((plen+pos) >= l->cols) {
+        buf++;
+        len--;
+        pos--;
+    }
+    while (plen+len > l->cols) {
+        len--;
+    }
+
+    /* Cursor to left edge */
+    snprintf(seq,sizeof(seq),"\r");
+    ab.append(seq);
+
+    if (flags & REFRESH_WRITE) {
+        /* Write the prompt and the current buffer content */
+        ab.append(l->prompt);
+        if (maskmode == 1) {
+            while (len--) {
+                ab.append("*");
+            }
+        } else {
+            ab.append(buf, len);
+        }
+        /* Show hits if any. */
+        refreshShowHints(ab, l, plen);
+    }
+
+    /* Erase to right */
+    snprintf(seq,sizeof(seq),"\x1b[0K");
+    ab.append(seq);
+    if (flags & REFRESH_WRITE) {
+        /* Move cursor to original position. */
+        snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen));
+        ab.append(seq);
+    }
+
+    (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */
+}
+
+/* Multi line low level line refresh.
+ *
+ * Rewrite the currently edited line accordingly to the buffer content,
+ * cursor position, and number of columns of the terminal.
+ *
+ * Flags is REFRESH_* macros. The function can just remove the old
+ * prompt, just write it, or both. */
+static void refreshMultiLine(struct linenoiseState *l, int flags) {
+    char seq[64];
+    int plen = strlen(l->prompt);
+    int rows = (plen+l->len+l->cols-1)/l->cols; /* rows used by current buf. */
+    int rpos = (plen+l->oldpos+l->cols)/l->cols; /* cursor relative row. */
+    int rpos2; /* rpos after refresh. */
+    int col; /* colum position, zero-based. */
+    int old_rows = l->oldrows;
+    int fd = l->ofd, j;
+    std::string ab;
+    l->oldrows = rows;
+
+    /* First step: clear all the lines used before. To do so start by
+     * going to the last row. */
+    if (flags & REFRESH_CLEAN) {
+        if (old_rows-rpos > 0) {
+            lndebug("go down %d", old_rows-rpos);
+            snprintf(seq,64,"\x1b[%dB", old_rows-rpos);
+            ab.append(seq);
+        }
+
+        /* Now for every row clear it, go up. */
+        for (j = 0; j < old_rows-1; j++) {
+            lndebug("clear+up");
+            snprintf(seq,64,"\r\x1b[0K\x1b[1A");
+            ab.append(seq);
+        }
+    }
+
+    if (flags & REFRESH_ALL) {
+        /* Clean the top line. */
+        lndebug("clear");
+        snprintf(seq,64,"\r\x1b[0K");
+        ab.append(seq);
+    }
+
+    if (flags & REFRESH_WRITE) {
+        /* Write the prompt and the current buffer content */
+        ab.append(l->prompt);
+        if (maskmode == 1) {
+            for (unsigned int i = 0; i < l->len; ++i) {
+                ab.append("*");
+            }
+        } else {
+            ab.append(l->buf, l->len);
+        }
+
+        /* Show hits if any. */
+        refreshShowHints(ab, l, plen);
+
+        /* If we are at the very end of the screen with our prompt, we need to
+         * emit a newline and move the prompt to the first column. */
+        if (l->pos &&
+            l->pos == l->len &&
+            (l->pos+plen) % l->cols == 0)
+        {
+            lndebug("<newline>");
+            ab.append("\n");
+            snprintf(seq,64,"\r");
+            ab.append(seq);
+            rows++;
+            if (rows > (int)l->oldrows) l->oldrows = rows;
+        }
+
+        /* Move cursor to right position. */
+        rpos2 = (plen+l->pos+l->cols)/l->cols; /* Current cursor relative row */
+        lndebug("rpos2 %d", rpos2);
+
+        /* Go up till we reach the expected positon. */
+        if (rows-rpos2 > 0) {
+            lndebug("go-up %d", rows-rpos2);
+            snprintf(seq,64,"\x1b[%dA", rows-rpos2);
+            ab.append(seq);
+        }
+
+        /* Set column. */
+        col = (plen+(int)l->pos) % (int)l->cols;
+        lndebug("set col %d", 1+col);
+        if (col)
+            snprintf(seq,64,"\r\x1b[%dC", col);
+        else
+            snprintf(seq,64,"\r");
+        ab.append(seq);
+    }
+
+    lndebug("\n");
+    l->oldpos = l->pos;
+    (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */
+}
+
+/* Calls the two low level functions refreshSingleLine() or
+ * refreshMultiLine() according to the selected mode. */
+static void refreshLineWithFlags(struct linenoiseState *l, int flags) {
+    if (mlmode)
+        refreshMultiLine(l,flags);
+    else
+        refreshSingleLine(l,flags);
+}
+
+/* Utility function to avoid specifying REFRESH_ALL all the times. */
+static void refreshLine(struct linenoiseState *l) {
+    refreshLineWithFlags(l,REFRESH_ALL);
+}
+
+/* Hide the current line, when using the multiplexing API. */
+void linenoiseHide(struct linenoiseState *l) {
+    if (mlmode)
+        refreshMultiLine(l,REFRESH_CLEAN);
+    else
+        refreshSingleLine(l,REFRESH_CLEAN);
+}
+
+/* Show the current line, when using the multiplexing API. */
+void linenoiseShow(struct linenoiseState *l) {
+    if (l->in_completion) {
+        refreshLineWithCompletion(l,NULL,REFRESH_WRITE);
+    } else {
+        refreshLineWithFlags(l,REFRESH_WRITE);
+    }
+}
+
+/* Insert the character 'c' at cursor current position.
+ *
+ * On error writing to the terminal -1 is returned, otherwise 0. */
+static int linenoiseEditInsert(struct linenoiseState * l, char c) {
+    if (l->len < l->buflen) {
+        if (l->len == l->pos) {
+            l->buf[l->pos] = c;
+            l->pos++;
+            l->len++;
+            l->buf[l->len] = '\0';
+            if ((!mlmode && l->plen+l->len < l->cols && !hintsCallback)) {
+                /* Avoid a full update of the line in the
+                 * trivial case. */
+                char d = (maskmode==1) ? '*' : c;
+                if (write(l->ofd,&d,1) == -1) return -1;
+            } else {
+                refreshLine(l);
+            }
+        } else {
+            memmove(l->buf+l->pos+1,l->buf+l->pos,l->len-l->pos);
+            l->buf[l->pos] = c;
+            l->len++;
+            l->pos++;
+            l->buf[l->len] = '\0';
+            refreshLine(l);
+        }
+    }
+    return 0;
+}
+
+/* Move cursor on the left. */
+static void linenoiseEditMoveLeft(struct linenoiseState * l) {
+    if (l->pos > 0) {
+        l->pos--;
+        refreshLine(l);
+    }
+}
+
+/* Move cursor on the right. */
+static void linenoiseEditMoveRight(struct linenoiseState * l) {
+    if (l->pos != l->len) {
+        l->pos++;
+        refreshLine(l);
+    }
+}
+
+/* Move cursor to the start of the line. */
+static void linenoiseEditMoveHome(struct linenoiseState * l) {
+    if (l->pos != 0) {
+        l->pos = 0;
+        refreshLine(l);
+    }
+}
+
+/* Move cursor to the end of the line. */
+static void linenoiseEditMoveEnd(struct linenoiseState * l) {
+    if (l->pos != l->len) {
+        l->pos = l->len;
+        refreshLine(l);
+    }
+}
+
+/* Substitute the currently edited line with the next or previous history
+ * entry as specified by 'dir'. */
+#define LINENOISE_HISTORY_NEXT 0
+#define LINENOISE_HISTORY_PREV 1
+
+static void linenoiseEditHistoryNext(struct linenoiseState * l, int dir) {
+    if (history_len > 1) {
+        /* Update the current history entry before to
+         * overwrite it with the next one. */
+        free(history[history_len - 1 - l->history_index]);
+        history[history_len - 1 - l->history_index] = strdup(l->buf);
+        /* Show the new entry */
+        l->history_index += (dir == LINENOISE_HISTORY_PREV) ? 1 : -1;
+        if (l->history_index < 0) {
+            l->history_index = 0;
+            return;
+        } else if (l->history_index >= history_len) {
+            l->history_index = history_len-1;
+            return;
+        }
+        strncpy(l->buf,history[history_len - 1 - l->history_index],l->buflen);
+        l->buf[l->buflen-1] = '\0';
+        l->len = l->pos = strlen(l->buf);
+        refreshLine(l);
+    }
+}
+
+/* Delete the character at the right of the cursor without altering the cursor
+ * position. Basically this is what happens with the "Delete" keyboard key. */
+static void linenoiseEditDelete(struct linenoiseState * l) {
+    if (l->len > 0 && l->pos < l->len) {
+        memmove(l->buf+l->pos,l->buf+l->pos+1,l->len-l->pos-1);
+        l->len--;
+        l->buf[l->len] = '\0';
+        refreshLine(l);
+    }
+}
+
+/* Backspace implementation. */
+static void linenoiseEditBackspace(struct linenoiseState * l) {
+    if (l->pos > 0 && l->len > 0) {
+        memmove(l->buf+l->pos-1,l->buf+l->pos,l->len-l->pos);
+        l->pos--;
+        l->len--;
+        l->buf[l->len] = '\0';
+        refreshLine(l);
+    }
+}
+
+/* Delete the previosu word, maintaining the cursor at the start of the
+ * current word. */
+static void linenoiseEditDeletePrevWord(struct linenoiseState * l) {
+    size_t old_pos = l->pos;
+    size_t diff;
+
+    while (l->pos > 0 && l->buf[l->pos-1] == ' ')
+        l->pos--;
+    while (l->pos > 0 && l->buf[l->pos-1] != ' ')
+        l->pos--;
+    diff = old_pos - l->pos;
+    memmove(l->buf+l->pos,l->buf+old_pos,l->len-old_pos+1);
+    l->len -= diff;
+    refreshLine(l);
+}
+
+/* This function is part of the multiplexed API of Linenoise, that is used
+ * in order to implement the blocking variant of the API but can also be
+ * called by the user directly in an event driven program. It will:
+ *
+ * 1. Initialize the linenoise state passed by the user.
+ * 2. Put the terminal in RAW mode.
+ * 3. Show the prompt.
+ * 4. Return control to the user, that will have to call linenoiseEditFeed()
+ *    each time there is some data arriving in the standard input.
+ *
+ * The user can also call linenoiseEditHide() and linenoiseEditShow() if it
+ * is required to show some input arriving asyncronously, without mixing
+ * it with the currently edited line.
+ *
+ * When linenoiseEditFeed() returns non-NULL, the user finished with the
+ * line editing session (pressed enter CTRL-D/C): in this case the caller
+ * needs to call linenoiseEditStop() to put back the terminal in normal
+ * mode. This will not destroy the buffer, as long as the linenoiseState
+ * is still valid in the context of the caller.
+ *
+ * The function returns 0 on success, or -1 if writing to standard output
+ * fails. If stdin_fd or stdout_fd are set to -1, the default is to use
+ * STDIN_FILENO and STDOUT_FILENO.
+ */
+int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) {
+    /* Populate the linenoise state that we pass to functions implementing
+     * specific editing functionalities. */
+    l->in_completion = 0;
+    l->ifd = stdin_fd != -1 ? stdin_fd : STDIN_FILENO;
+    l->ofd = stdout_fd != -1 ? stdout_fd : STDOUT_FILENO;
+    l->buf = buf;
+    l->buflen = buflen;
+    l->prompt = prompt;
+    l->plen = strlen(prompt);
+    l->oldpos = l->pos = 0;
+    l->len = 0;
+
+    /* Enter raw mode. */
+    if (enableRawMode(l->ifd) == -1) return -1;
+
+    l->cols = getColumns(stdin_fd, stdout_fd);
+    l->oldrows = 0;
+    l->history_index = 0;
+
+    /* Buffer starts empty. */
+    l->buf[0] = '\0';
+    l->buflen--; /* Make sure there is always space for the nulterm */
+
+    /* If stdin is not a tty, stop here with the initialization. We
+     * will actually just read a line from standard input in blocking
+     * mode later, in linenoiseEditFeed(). */
+    if (!isatty(l->ifd)) return 0;
+
+    /* The latest history entry is always our current buffer, that
+     * initially is just an empty string. */
+    linenoiseHistoryAdd("");
+
+    if (write(l->ofd,prompt,l->plen) == -1) return -1;
+    return 0;
+}
+
+const char* linenoiseEditMore = "If you see this, you are misusing the API: when linenoiseEditFeed() is called, if it returns linenoiseEditMore the user is yet editing the line. See the README file for more information.";
+
+/* This function is part of the multiplexed API of linenoise, see the top
+ * comment on linenoiseEditStart() for more information. Call this function
+ * each time there is some data to read from the standard input file
+ * descriptor. In the case of blocking operations, this function can just be
+ * called in a loop, and block.
+ *
+ * The function returns linenoiseEditMore to signal that line editing is still
+ * in progress, that is, the user didn't yet pressed enter / CTRL-D. Otherwise
+ * the function returns the pointer to the heap-allocated buffer with the
+ * edited line, that the user should free with linenoiseFree().
+ *
+ * On special conditions, NULL is returned and errno is populated:
+ *
+ * EAGAIN if the user pressed Ctrl-C
+ * ENOENT if the user pressed Ctrl-D
+ *
+ * Some other errno: I/O error.
+ */
+const char *linenoiseEditFeed(struct linenoiseState *l) {
+    /* Not a TTY, pass control to line reading without character
+     * count limits. */
+    if (!isatty(l->ifd)) return linenoiseNoTTY();
+
+    char c;
+    int nread;
+    char seq[3];
+
+    nread = read(l->ifd,&c,1);
+    if (nread <= 0) return NULL;
+
+    /* Only autocomplete when the callback is set. It returns < 0 when
+     * there was an error reading from fd. Otherwise it will return the
+     * character that should be handled next. */
+    if ((l->in_completion || c == 9) && completionCallback != NULL) {
+        c = completeLine(l,c);
+        /* Read next character when 0 */
+        if (c == 0) return linenoiseEditMore;
+    }
+
+    switch(c) {
+    case ENTER:    /* enter */
+        history_len--;
+        free(history[history_len]);
+        if (mlmode) linenoiseEditMoveEnd(l);
+        if (hintsCallback) {
+            /* Force a refresh without hints to leave the previous
+             * line as the user typed it after a newline. */
+            linenoiseHintsCallback *hc = hintsCallback;
+            hintsCallback = NULL;
+            refreshLine(l);
+            hintsCallback = hc;
+        }
+        return strdup(l->buf);
+    case CTRL_C:     /* ctrl-c */
+        errno = EAGAIN;
+        return NULL;
+    case BACKSPACE:   /* backspace */
+    case 8:     /* ctrl-h */
+        linenoiseEditBackspace(l);
+        break;
+    case CTRL_D:     /* ctrl-d, remove char at right of cursor, or if the
+                        line is empty, act as end-of-file. */
+        if (l->len > 0) {
+            linenoiseEditDelete(l);
+        } else {
+            history_len--;
+            free(history[history_len]);
+            errno = ENOENT;
+            return NULL;
+        }
+        break;
+    case CTRL_T:    /* ctrl-t, swaps current character with previous. */
+        if (l->pos > 0 && l->pos < l->len) {
+            int aux = l->buf[l->pos-1];
+            l->buf[l->pos-1] = l->buf[l->pos];
+            l->buf[l->pos] = aux;
+            if (l->pos != l->len-1) l->pos++;
+            refreshLine(l);
+        }
+        break;
+    case CTRL_B:     /* ctrl-b */
+        linenoiseEditMoveLeft(l);
+        break;
+    case CTRL_F:     /* ctrl-f */
+        linenoiseEditMoveRight(l);
+        break;
+    case CTRL_P:    /* ctrl-p */
+        linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV);
+        break;
+    case CTRL_N:    /* ctrl-n */
+        linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT);
+        break;
+    case ESC:    /* escape sequence */
+        /* Read the next two bytes representing the escape sequence.
+         * Use two calls to handle slow terminals returning the two
+         * chars at different times. */
+        if (read(l->ifd,seq,1) == -1) break;
+        if (read(l->ifd,seq+1,1) == -1) break;
+
+        /* ESC [ sequences. */
+        if (seq[0] == '[') {
+            if (seq[1] >= '0' && seq[1] <= '9') {
+                /* Extended escape, read additional byte. */
+                if (read(l->ifd,seq+2,1) == -1) break;
+                if (seq[2] == '~') {
+                    switch(seq[1]) {
+                    case '3': /* Delete key. */
+                        linenoiseEditDelete(l);
+                        break;
+                    }
+                }
+            } else {
+                switch(seq[1]) {
+                case 'A': /* Up */
+                    linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV);
+                    break;
+                case 'B': /* Down */
+                    linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT);
+                    break;
+                case 'C': /* Right */
+                    linenoiseEditMoveRight(l);
+                    break;
+                case 'D': /* Left */
+                    linenoiseEditMoveLeft(l);
+                    break;
+                case 'H': /* Home */
+                    linenoiseEditMoveHome(l);
+                    break;
+                case 'F': /* End*/
+                    linenoiseEditMoveEnd(l);
+                    break;
+                }
+            }
+        }
+
+        /* ESC O sequences. */
+        else if (seq[0] == 'O') {
+            switch(seq[1]) {
+            case 'H': /* Home */
+                linenoiseEditMoveHome(l);
+                break;
+            case 'F': /* End*/
+                linenoiseEditMoveEnd(l);
+                break;
+            }
+        }
+        break;
+    default:
+        if (linenoiseEditInsert(l,c)) return NULL;
+        break;
+    case CTRL_U: /* Ctrl+u, delete the whole line. */
+        l->buf[0] = '\0';
+        l->pos = l->len = 0;
+        refreshLine(l);
+        break;
+    case CTRL_K: /* Ctrl+k, delete from current to end of line. */
+        l->buf[l->pos] = '\0';
+        l->len = l->pos;
+        refreshLine(l);
+        break;
+    case CTRL_A: /* Ctrl+a, go to the start of the line */
+        linenoiseEditMoveHome(l);
+        break;
+    case CTRL_E: /* ctrl+e, go to the end of the line */
+        linenoiseEditMoveEnd(l);
+        break;
+    case CTRL_L: /* ctrl+l, clear screen */
+        linenoiseClearScreen();
+        refreshLine(l);
+        break;
+    case CTRL_W: /* ctrl+w, delete previous word */
+        linenoiseEditDeletePrevWord(l);
+        break;
+    }
+    return linenoiseEditMore;
+}
+
+/* This is part of the multiplexed linenoise API. See linenoiseEditStart()
+ * for more information. This function is called when linenoiseEditFeed()
+ * returns something different than NULL. At this point the user input
+ * is in the buffer, and we can restore the terminal in normal mode. */
+void linenoiseEditStop(struct linenoiseState *l) {
+    if (!isatty(l->ifd)) return;
+    disableRawMode(l->ifd);
+    printf("\n");
+}
+
+/* This just implements a blocking loop for the multiplexed API.
+ * In many applications that are not event-drivern, we can just call
+ * the blocking linenoise API, wait for the user to complete the editing
+ * and return the buffer. */
+static const char *linenoiseBlockingEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt)
+{
+    struct linenoiseState l;
+
+    /* Editing without a buffer is invalid. */
+    if (buflen == 0) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    linenoiseEditStart(&l,stdin_fd,stdout_fd,buf,buflen,prompt);
+    const char *res;
+    while((res = linenoiseEditFeed(&l)) == linenoiseEditMore);
+    linenoiseEditStop(&l);
+    return res;
+}
+
+/* This special mode is used by linenoise in order to print scan codes
+ * on screen for debugging / development purposes. It is implemented
+ * by the linenoise_example program using the --keycodes option. */
+void linenoisePrintKeyCodes(void) {
+    char quit[4];
+
+    printf("Linenoise key codes debugging mode.\n"
+            "Press keys to see scan codes. Type 'quit' at any time to exit.\n");
+    if (enableRawMode(STDIN_FILENO) == -1) return;
+    memset(quit,' ',4);
+    while(1) {
+        char c;
+        int nread;
+
+        nread = read(STDIN_FILENO,&c,1);
+        if (nread <= 0) continue;
+        memmove(quit,quit+1,sizeof(quit)-1); /* shift string to left. */
+        quit[sizeof(quit)-1] = c; /* Insert current char on the right. */
+        if (memcmp(quit,"quit",sizeof(quit)) == 0) break;
+
+        printf("'%c' %02x (%d) (type quit to exit)\n",
+            isprint(c) ? c : '?', (int)c, (int)c);
+        printf("\r"); /* Go left edge manually, we are in raw mode. */
+        fflush(stdout);
+    }
+    disableRawMode(STDIN_FILENO);
+}
+
+/* This function is called when linenoise() is called with the standard
+ * input file descriptor not attached to a TTY. So for example when the
+ * program using linenoise is called in pipe or with a file redirected
+ * to its standard input. In this case, we want to be able to return the
+ * line regardless of its length (by default we are limited to 4k). */
+static char *linenoiseNoTTY(void) {
+    char *line = NULL;
+    size_t len = 0, maxlen = 0;
+
+    while(1) {
+        if (len == maxlen) {
+            if (maxlen == 0) maxlen = 16;
+            maxlen *= 2;
+            char *oldval = line;
+            line = (char*) realloc(line,maxlen);
+            if (line == NULL) {
+                if (oldval) free(oldval);
+                return NULL;
+            }
+        }
+        int c = fgetc(stdin);
+        if (c == EOF || c == '\n') {
+            if (c == EOF && len == 0) {
+                free(line);
+                return NULL;
+            } else {
+                line[len] = '\0';
+                return line;
+            }
+        } else {
+            line[len] = c;
+            len++;
+        }
+    }
+}
+
+/* The high level function that is the main API of the linenoise library.
+ * This function checks if the terminal has basic capabilities, just checking
+ * for a blacklist of stupid terminals, and later either calls the line
+ * editing function or uses dummy fgets() so that you will be able to type
+ * something even in the most desperate of the conditions. */
+const char *linenoise(const char *prompt) {
+    char buf[LINENOISE_MAX_LINE];
+
+    if (!isatty(STDIN_FILENO)) {
+        /* Not a tty: read from file / pipe. In this mode we don't want any
+         * limit to the line size, so we call a function to handle that. */
+        return linenoiseNoTTY();
+    } else if (isUnsupportedTerm()) {
+        size_t len;
+
+        printf("%s",prompt);
+        fflush(stdout);
+        if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL;
+        len = strlen(buf);
+        while(len && (buf[len-1] == '\n' || buf[len-1] == '\r')) {
+            len--;
+            buf[len] = '\0';
+        }
+        return strdup(buf);
+    } else {
+        const char *retval = linenoiseBlockingEdit(STDIN_FILENO,STDOUT_FILENO,buf,LINENOISE_MAX_LINE,prompt);
+        return retval;
+    }
+}
+
+/* This is just a wrapper the user may want to call in order to make sure
+ * the linenoise returned buffer is freed with the same allocator it was
+ * created with. Useful when the main program is using an alternative
+ * allocator. */
+void linenoiseFree(void *ptr) {
+    if (ptr == linenoiseEditMore) return; // Protect from API misuse.
+    free(ptr);
+}
+
+/* ================================ History ================================= */
+
+/* Free the history, but does not reset it. Only used when we have to
+ * exit() to avoid memory leaks are reported by valgrind & co. */
+static void freeHistory(void) {
+    if (history) {
+        int j;
+
+        for (j = 0; j < history_len; j++)
+            free(history[j]);
+        free(history);
+    }
+}
+
+/* At exit we'll try to fix the terminal to the initial conditions. */
+static void linenoiseAtExit(void) {
+    disableRawMode(STDIN_FILENO);
+    freeHistory();
+}
+
+/* This is the API call to add a new entry in the linenoise history.
+ * It uses a fixed array of char pointers that are shifted (memmoved)
+ * when the history max length is reached in order to remove the older
+ * entry and make room for the new one, so it is not exactly suitable for huge
+ * histories, but will work well for a few hundred of entries.
+ *
+ * Using a circular buffer is smarter, but a bit more complex to handle. */
+int linenoiseHistoryAdd(const char *line) {
+    char *linecopy;
+
+    if (history_max_len == 0) return 0;
+
+    /* Initialization on first call. */
+    if (history == NULL) {
+        history = (char**) malloc(sizeof(char*)*history_max_len);
+        if (history == NULL) return 0;
+        memset(history,0,(sizeof(char*)*history_max_len));
+    }
+
+    /* Don't add duplicated lines. */
+    if (history_len && !strcmp(history[history_len-1], line)) return 0;
+
+    /* Add an heap allocated copy of the line in the history.
+     * If we reached the max length, remove the older line. */
+    linecopy = strdup(line);
+    if (!linecopy) return 0;
+    if (history_len == history_max_len) {
+        free(history[0]);
+        memmove(history,history+1,sizeof(char*)*(history_max_len-1));
+        history_len--;
+    }
+    history[history_len] = linecopy;
+    history_len++;
+    return 1;
+}
+
+/* Set the maximum length for the history. This function can be called even
+ * if there is already some history, the function will make sure to retain
+ * just the latest 'len' elements if the new history length value is smaller
+ * than the amount of items already inside the history. */
+int linenoiseHistorySetMaxLen(int len) {
+    char **new_ptr;
+
+    if (len < 1) return 0;
+    if (history) {
+        int tocopy = history_len;
+
+        new_ptr = (char**) malloc(sizeof(char*)*len);
+        if (new_ptr == NULL) return 0;
+
+        /* If we can't copy everything, free the elements we'll not use. */
+        if (len < tocopy) {
+            int j;
+
+            for (j = 0; j < tocopy-len; j++) free(history[j]);
+            tocopy = len;
+        }
+        memset(new_ptr,0,sizeof(char*)*len);
+        memcpy(new_ptr,history+(history_len-tocopy), sizeof(char*)*tocopy);
+        free(history);
+        history = new_ptr;
+    }
+    history_max_len = len;
+    if (history_len > history_max_len)
+        history_len = history_max_len;
+    return 1;
+}
+
+/* Save the history in the specified file. On success 0 is returned
+ * otherwise -1 is returned. */
+int linenoiseHistorySave(const char *filename) {
+    mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO);
+    File   file;
+    file.open(filename, "w");
+    umask(old_umask);
+    if (file.file == NULL) {
+        return -1;
+    }
+    chmod(filename,S_IRUSR|S_IWUSR);
+    for (int j = 0; j < history_len; ++j) {
+        fprintf(file.file, "%s\n", history[j]);
+    }
+
+    return 0;
+}
+
+/* Load the history from the specified file. If the file does not exist
+ * zero is returned and no operation is performed.
+ *
+ * If the file exists and the operation succeeded 0 is returned, otherwise
+ * on error -1 is returned. */
+int linenoiseHistoryLoad(const char *filename) {
+    File file;
+    file.open(filename, "r");
+    char buf[LINENOISE_MAX_LINE];
+    if (file.file == NULL) {
+        return -1;
+    }
+
+    while (fgets(buf, LINENOISE_MAX_LINE, file.file) != NULL) {
+        char *p;
+
+        p = strchr(buf,'\r');
+        if (!p) p = strchr(buf,'\n');
+        if (p) *p = '\0';
+        linenoiseHistoryAdd(buf);
+    }
+    return 0;
+}
+#endif
diff --git a/examples/run/linenoise.cpp/linenoise.h b/examples/run/linenoise.cpp/linenoise.h
new file mode 100644
index 000000000..a14ec6c74
--- /dev/null
+++ b/examples/run/linenoise.cpp/linenoise.h
@@ -0,0 +1,128 @@
+/* linenoise.h -- VERSION 1.0
+ *
+ * Guerrilla line editing library against the idea that a line editing lib
+ * needs to be 20,000 lines of C++ code.
+ *
+ * See linenoise.cpp for more information.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+ * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *  *  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  *  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __LINENOISE_H
+#define __LINENOISE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h> /* For size_t. */
+#include <stdlib.h>
+
+extern const char *linenoiseEditMore;
+
+/* The linenoiseState structure represents the state during line editing.
+ * We pass this state to functions implementing specific editing
+ * functionalities. */
+struct linenoiseState {
+    int in_completion;  /* The user pressed TAB and we are now in completion
+                         * mode, so input is handled by completeLine(). */
+    size_t completion_idx; /* Index of next completion to propose. */
+    int ifd;            /* Terminal stdin file descriptor. */
+    int ofd;            /* Terminal stdout file descriptor. */
+    char *buf;          /* Edited line buffer. */
+    size_t buflen;      /* Edited line buffer size. */
+    const char *prompt; /* Prompt to display. */
+    size_t plen;        /* Prompt length. */
+    size_t pos;         /* Current cursor position. */
+    size_t oldpos;      /* Previous refresh cursor position. */
+    size_t len;         /* Current edited line length. */
+    size_t cols;        /* Number of columns in terminal. */
+    size_t oldrows;     /* Rows used by last refrehsed line (multiline mode) */
+    int history_index;  /* The history index we are currently editing. */
+};
+
+struct linenoiseCompletions {
+    size_t  len     = 0;
+    char ** cvec    = nullptr;
+    bool    to_free = true;
+
+    ~linenoiseCompletions() {
+        if (!to_free) {
+            return;
+        }
+
+        for (size_t i = 0; i < len; ++i) {
+            free(cvec[i]);
+        }
+
+        free(cvec);
+    }
+};
+
+/* Non blocking API. */
+int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
+const char *linenoiseEditFeed(struct linenoiseState *l);
+void linenoiseEditStop(struct linenoiseState *l);
+void linenoiseHide(struct linenoiseState *l);
+void linenoiseShow(struct linenoiseState *l);
+
+/* Blocking API. */
+const char *linenoise(const char *prompt);
+void linenoiseFree(void *ptr);
+
+/* Completion API. */
+typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
+typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
+typedef void(linenoiseFreeHintsCallback)(const char *);
+void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
+void linenoiseSetHintsCallback(linenoiseHintsCallback *);
+void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
+void linenoiseAddCompletion(linenoiseCompletions *, const char *);
+
+/* History API. */
+int linenoiseHistoryAdd(const char *line);
+int linenoiseHistorySetMaxLen(int len);
+int linenoiseHistorySave(const char *filename);
+int linenoiseHistoryLoad(const char *filename);
+
+/* Other utilities. */
+void linenoiseClearScreen(void);
+void linenoiseSetMultiLine(int ml);
+void linenoisePrintKeyCodes(void);
+void linenoiseMaskModeEnable(void);
+void linenoiseMaskModeDisable(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __LINENOISE_H */
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
new file mode 100644
index 000000000..9362da220
--- /dev/null
+++ b/examples/run/run.cpp
@@ -0,0 +1,1173 @@
+#if defined(_WIN32)
+#    include <windows.h>
+#    include <io.h>
+#else
+#    include <sys/file.h>
+#    include <sys/ioctl.h>
+#    include <unistd.h>
+#endif
+
+#if defined(LLAMA_USE_CURL)
+#    include <curl/curl.h>
+#endif
+
+#include <signal.h>
+
+#include <climits>
+#include <cstdarg>
+#include <cstdio>
+#include <cstring>
+#include <filesystem>
+#include <iostream>
+#include <list>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "chat-template.hpp"
+#include "common.h"
+#include "json.hpp"
+#include "linenoise.cpp/linenoise.h"
+#include "llama-cpp.h"
+#include "log.h"
+
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
+[[noreturn]] static void sigint_handler(int) {
+    printf("\n" LOG_COL_DEFAULT);
+    exit(0);  // not ideal, but it's the only way to guarantee exit in all cases
+}
+#endif
+
+GGML_ATTRIBUTE_FORMAT(1, 2)
+static std::string fmt(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    const int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::string buf;
+    buf.resize(size);
+    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+
+    return buf;
+}
+
+GGML_ATTRIBUTE_FORMAT(1, 2)
+static int printe(const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    const int ret = vfprintf(stderr, fmt, args);
+    va_end(args);
+
+    return ret;
+}
+
+static std::string strftime_fmt(const char * fmt, const std::tm & tm) {
+    std::ostringstream oss;
+    oss << std::put_time(&tm, fmt);
+
+    return oss.str();
+}
+
+class Opt {
+  public:
+    int init(int argc, const char ** argv) {
+        ctx_params           = llama_context_default_params();
+        model_params         = llama_model_default_params();
+        context_size_default = ctx_params.n_batch;
+        ngl_default          = model_params.n_gpu_layers;
+        common_params_sampling sampling;
+        temperature_default = sampling.temp;
+
+        if (argc < 2) {
+            printe("Error: No arguments provided.\n");
+            print_help();
+            return 1;
+        }
+
+        // Parse arguments
+        if (parse(argc, argv)) {
+            printe("Error: Failed to parse arguments.\n");
+            print_help();
+            return 1;
+        }
+
+        // If help is requested, show help and exit
+        if (help) {
+            print_help();
+            return 2;
+        }
+
+        ctx_params.n_batch        = context_size >= 0 ? context_size : context_size_default;
+        ctx_params.n_ctx          = ctx_params.n_batch;
+        model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
+        temperature               = temperature >= 0 ? temperature : temperature_default;
+
+        return 0;  // Success
+    }
+
+    llama_context_params ctx_params;
+    llama_model_params   model_params;
+    std::string model_;
+    std::string          user;
+    bool                 use_jinja   = false;
+    int                  context_size = -1, ngl = -1;
+    float                temperature = -1;
+    bool                 verbose     = false;
+
+  private:
+    int   context_size_default = -1, ngl_default = -1;
+    float temperature_default = -1;
+    bool  help                = false;
+
+    bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
+        return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
+    }
+
+    int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = std::atoi(argv[++i]);
+
+        return 0;
+    }
+
+    int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = std::atof(argv[++i]);
+
+        return 0;
+    }
+
+    int parse(int argc, const char ** argv) {
+        bool options_parsing   = true;
+        for (int i = 1, positional_args_i = 0; i < argc; ++i) {
+            if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+                if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing &&
+                       (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+                if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+                if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing &&
+                       (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+                verbose = true;
+            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+                use_jinja = true;
+            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+                help = true;
+                return 0;
+            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+                options_parsing = false;
+            } else if (positional_args_i == 0) {
+                if (!argv[i][0] || argv[i][0] == '-') {
+                    return 1;
+                }
+
+                ++positional_args_i;
+                model_ = argv[i];
+            } else if (positional_args_i == 1) {
+                ++positional_args_i;
+                user = argv[i];
+            } else {
+                user += " " + std::string(argv[i]);
+            }
+        }
+
+        if (model_.empty()){
+            return 1;
+        }
+
+        return 0;
+    }
+
+    void print_help() const {
+        printf(
+            "Description:\n"
+            "  Runs a llm\n"
+            "\n"
+            "Usage:\n"
+            "  llama-run [options] model [prompt]\n"
+            "\n"
+            "Options:\n"
+            "  -c, --context-size <value>\n"
+            "      Context size (default: %d)\n"
+            "  -n, -ngl, --ngl <value>\n"
+            "      Number of GPU layers (default: %d)\n"
+            "  --temp <value>\n"
+            "      Temperature (default: %.1f)\n"
+            "  -v, --verbose, --log-verbose\n"
+            "      Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
+            "  -h, --help\n"
+            "      Show help message\n"
+            "\n"
+            "Commands:\n"
+            "  model\n"
+            "      Model is a string with an optional prefix of \n"
+            "      huggingface:// (hf://), ollama://, https:// or file://.\n"
+            "      If no protocol is specified and a file exists in the specified\n"
+            "      path, file:// is assumed, otherwise if a file does not exist in\n"
+            "      the specified path, ollama:// is assumed. Models that are being\n"
+            "      pulled are downloaded with .partial extension while being\n"
+            "      downloaded and then renamed as the file without the .partial\n"
+            "      extension when complete.\n"
+            "\n"
+            "Examples:\n"
+            "  llama-run llama3\n"
+            "  llama-run ollama://granite-code\n"
+            "  llama-run ollama://smollm:135m\n"
+            "  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
+            "  llama-run "
+            "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
+            "  llama-run https://example.com/some-file1.gguf\n"
+            "  llama-run some-file2.gguf\n"
+            "  llama-run file://some-file3.gguf\n"
+            "  llama-run --ngl 999 some-file4.gguf\n"
+            "  llama-run --ngl 999 some-file5.gguf Hello World\n",
+            context_size_default, ngl_default, temperature_default);
+    }
+};
+
+struct progress_data {
+    size_t                                file_size  = 0;
+    std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
+    bool                                  printed    = false;
+};
+
+static int get_terminal_width() {
+#if defined(_WIN32)
+    CONSOLE_SCREEN_BUFFER_INFO csbi;
+    GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi);
+    return csbi.srWindow.Right - csbi.srWindow.Left + 1;
+#else
+    struct winsize w;
+    ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+    return w.ws_col;
+#endif
+}
+
+#ifdef LLAMA_USE_CURL
+class File {
+  public:
+    FILE * file = nullptr;
+
+    FILE * open(const std::string & filename, const char * mode) {
+        file = fopen(filename.c_str(), mode);
+
+        return file;
+    }
+
+    int lock() {
+        if (file) {
+#    ifdef _WIN32
+            fd    = _fileno(file);
+            hFile = (HANDLE) _get_osfhandle(fd);
+            if (hFile == INVALID_HANDLE_VALUE) {
+                fd = -1;
+
+                return 1;
+            }
+
+            OVERLAPPED overlapped = {};
+            if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
+                            &overlapped)) {
+                fd = -1;
+
+                return 1;
+            }
+#    else
+            fd = fileno(file);
+            if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+                fd = -1;
+
+                return 1;
+            }
+#    endif
+        }
+
+        return 0;
+    }
+
+    ~File() {
+        if (fd >= 0) {
+#    ifdef _WIN32
+            if (hFile != INVALID_HANDLE_VALUE) {
+                OVERLAPPED overlapped = {};
+                UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
+            }
+#    else
+            flock(fd, LOCK_UN);
+#    endif
+        }
+
+        if (file) {
+            fclose(file);
+        }
+    }
+
+  private:
+    int fd = -1;
+#    ifdef _WIN32
+    HANDLE hFile = nullptr;
+#    endif
+};
+
+class HttpClient {
+  public:
+    int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
+             const bool progress, std::string * response_str = nullptr) {
+        if (std::filesystem::exists(output_file)) {
+            return 0;
+        }
+
+        std::string output_file_partial;
+        curl = curl_easy_init();
+        if (!curl) {
+            return 1;
+        }
+
+        progress_data data;
+        File          out;
+        if (!output_file.empty()) {
+            output_file_partial = output_file + ".partial";
+            if (!out.open(output_file_partial, "ab")) {
+                printe("Failed to open file for writing\n");
+
+                return 1;
+            }
+
+            if (out.lock()) {
+                printe("Failed to exclusively lock file\n");
+
+                return 1;
+            }
+        }
+
+        set_write_options(response_str, out);
+        data.file_size = set_resume_point(output_file_partial);
+        set_progress_options(progress, data);
+        set_headers(headers);
+        CURLcode res = perform(url);
+        if (res != CURLE_OK){
+            printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res));
+            return 1;
+        }
+        if (!output_file.empty()) {
+            std::filesystem::rename(output_file_partial, output_file);
+        }
+
+        return 0;
+    }
+
+    ~HttpClient() {
+        if (chunk) {
+            curl_slist_free_all(chunk);
+        }
+
+        if (curl) {
+            curl_easy_cleanup(curl);
+        }
+    }
+
+  private:
+    CURL *              curl  = nullptr;
+    struct curl_slist * chunk = nullptr;
+
+    void set_write_options(std::string * response_str, const File & out) {
+        if (response_str) {
+            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
+            curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
+        } else {
+            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
+            curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file);
+        }
+    }
+
+    size_t set_resume_point(const std::string & output_file) {
+        size_t file_size = 0;
+        if (std::filesystem::exists(output_file)) {
+            file_size = std::filesystem::file_size(output_file);
+            curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
+        }
+
+        return file_size;
+    }
+
+    void set_progress_options(bool progress, progress_data & data) {
+        if (progress) {
+            curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+            curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
+            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress);
+        }
+    }
+
+    void set_headers(const std::vector<std::string> & headers) {
+        if (!headers.empty()) {
+            if (chunk) {
+                curl_slist_free_all(chunk);
+                chunk = 0;
+            }
+
+            for (const auto & header : headers) {
+                chunk = curl_slist_append(chunk, header.c_str());
+            }
+
+            curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
+        }
+    }
+
+    CURLcode perform(const std::string & url) {
+        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+        curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
+        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
+        return curl_easy_perform(curl);
+    }
+
+    static std::string human_readable_time(double seconds) {
+        int hrs  = static_cast<int>(seconds) / 3600;
+        int mins = (static_cast<int>(seconds) % 3600) / 60;
+        int secs = static_cast<int>(seconds) % 60;
+
+        if (hrs > 0) {
+            return fmt("%dh %02dm %02ds", hrs, mins, secs);
+        } else if (mins > 0) {
+            return fmt("%dm %02ds", mins, secs);
+        } else {
+            return fmt("%ds", secs);
+        }
+    }
+
+    static std::string human_readable_size(curl_off_t size) {
+        static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
+        char                length   = sizeof(suffix) / sizeof(suffix[0]);
+        int                 i        = 0;
+        double              dbl_size = size;
+        if (size > 1024) {
+            for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
+                dbl_size = size / 1024.0;
+            }
+        }
+
+        return fmt("%.2f %s", dbl_size, suffix[i]);
+    }
+
+    static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
+                               curl_off_t) {
+        progress_data * data = static_cast<progress_data *>(ptr);
+        if (total_to_download <= 0) {
+            return 0;
+        }
+
+        total_to_download += data->file_size;
+        const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
+        const curl_off_t percentage      = calculate_percentage(now_downloaded_plus_file_size, total_to_download);
+        std::string      progress_prefix = generate_progress_prefix(percentage);
+
+        const double speed = calculate_speed(now_downloaded, data->start_time);
+        const double tim   = (total_to_download - now_downloaded) / speed;
+        std::string  progress_suffix =
+            generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim);
+
+        int         progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix);
+        std::string progress_bar;
+        generate_progress_bar(progress_bar_width, percentage, progress_bar);
+
+        print_progress(progress_prefix, progress_bar, progress_suffix);
+        data->printed = true;
+
+        return 0;
+    }
+
+    static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) {
+        return (now_downloaded_plus_file_size * 100) / total_to_download;
+    }
+
+    static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
+
+    static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
+        const auto                          now             = std::chrono::steady_clock::now();
+        const std::chrono::duration<double> elapsed_seconds = now - start_time;
+        return now_downloaded / elapsed_seconds.count();
+    }
+
+    static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
+                                                double speed, double estimated_time) {
+        const int width = 10;
+        return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
+                   human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
+                   human_readable_time(estimated_time).c_str());
+    }
+
+    static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
+        int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3;
+        if (progress_bar_width < 1) {
+            progress_bar_width = 1;
+        }
+
+        return progress_bar_width;
+    }
+
+    static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage,
+                                             std::string & progress_bar) {
+        const curl_off_t pos = (percentage * progress_bar_width) / 100;
+        for (int i = 0; i < progress_bar_width; ++i) {
+            progress_bar.append((i < pos) ? "█" : " ");
+        }
+
+        return progress_bar;
+    }
+
+    static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
+                               const std::string & progress_suffix) {
+        printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
+    }
+    // Function to write data to a file
+    static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
+        FILE * out = static_cast<FILE *>(stream);
+        return fwrite(ptr, size, nmemb, out);
+    }
+
+    // Function to capture data into a string
+    static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
+        std::string * str = static_cast<std::string *>(stream);
+        str->append(static_cast<char *>(ptr), size * nmemb);
+        return size * nmemb;
+    }
+};
+#endif
+
+class LlamaData {
+  public:
+    llama_model_ptr                 model;
+    llama_sampler_ptr               sampler;
+    llama_context_ptr               context;
+    std::vector<llama_chat_message> messages;
+    std::list<std::string>          msg_strs;
+    std::vector<char>               fmtted;
+
+    int init(Opt & opt) {
+        model = initialize_model(opt);
+        if (!model) {
+            return 1;
+        }
+
+        context = initialize_context(model, opt);
+        if (!context) {
+            return 1;
+        }
+
+        sampler = initialize_sampler(opt);
+
+        return 0;
+    }
+
+  private:
+#ifdef LLAMA_USE_CURL
+    int download(const std::string & url, const std::string & output_file, const bool progress,
+                 const std::vector<std::string> & headers = {}, std::string * response_str = nullptr) {
+        HttpClient http;
+        if (http.init(url, headers, output_file, progress, response_str)) {
+            return 1;
+        }
+
+        return 0;
+    }
+#else
+    int download(const std::string &, const std::string &, const bool, const std::vector<std::string> & = {},
+                 std::string * = nullptr) {
+        printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+
+        return 1;
+    }
+#endif
+
+    // Helper function to handle model tag extraction and URL construction
+    std::pair<std::string, std::string> extract_model_and_tag(std::string & model, const std::string & base_url) {
+        std::string  model_tag = "latest";
+        const size_t colon_pos = model.find(':');
+        if (colon_pos != std::string::npos) {
+            model_tag = model.substr(colon_pos + 1);
+            model     = model.substr(0, colon_pos);
+        }
+
+        std::string url = base_url + model + "/manifests/" + model_tag;
+
+        return { model, url };
+    }
+
+    // Helper function to download and parse the manifest
+    int download_and_parse_manifest(const std::string & url, const std::vector<std::string> & headers,
+                                    nlohmann::json & manifest) {
+        std::string manifest_str;
+        int         ret = download(url, "", false, headers, &manifest_str);
+        if (ret) {
+            return ret;
+        }
+
+        manifest = nlohmann::json::parse(manifest_str);
+
+        return 0;
+    }
+
+    int huggingface_dl(std::string & model, const std::string & bn) {
+        // Find the second occurrence of '/' after protocol string
+        size_t pos = model.find('/');
+        pos        = model.find('/', pos + 1);
+        std::string              hfr, hff;
+        std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
+        std::string              url;
+
+        if (pos == std::string::npos) {
+            auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
+            hfr                             = model_name;
+
+            nlohmann::json manifest;
+            int            ret = download_and_parse_manifest(manifest_url, headers, manifest);
+            if (ret) {
+                return ret;
+            }
+
+            hff = manifest["ggufFile"]["rfilename"];
+        } else {
+            hfr = model.substr(0, pos);
+            hff = model.substr(pos + 1);
+        }
+
+        url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+
+        return download(url, bn, true, headers);
+    }
+
+    int ollama_dl(std::string & model, const std::string & bn) {
+        const std::vector<std::string> headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" };
+        if (model.find('/') == std::string::npos) {
+            model = "library/" + model;
+        }
+
+        auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/");
+        nlohmann::json manifest;
+        int            ret = download_and_parse_manifest(manifest_url, {}, manifest);
+        if (ret) {
+            return ret;
+        }
+
+        std::string layer;
+        for (const auto & l : manifest["layers"]) {
+            if (l["mediaType"] == "application/vnd.ollama.image.model") {
+                layer = l["digest"];
+                break;
+            }
+        }
+
+        std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer;
+
+        return download(blob_url, bn, true, headers);
+    }
+
+    int github_dl(const std::string & model, const std::string & bn) {
+        std::string  repository = model;
+        std::string  branch     = "main";
+        const size_t at_pos     = model.find('@');
+        if (at_pos != std::string::npos) {
+            repository = model.substr(0, at_pos);
+            branch     = model.substr(at_pos + 1);
+        }
+
+        const std::vector<std::string> repo_parts = string_split(repository, "/");
+        if (repo_parts.size() < 3) {
+            printe("Invalid GitHub repository format\n");
+            return 1;
+        }
+
+        const std::string & org          = repo_parts[0];
+        const std::string & project      = repo_parts[1];
+        std::string         url          = "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch;
+        for (size_t i = 2; i < repo_parts.size(); ++i) {
+            url += "/" + repo_parts[i];
+        }
+
+        return download(url, bn, true);
+    }
+
+    int s3_dl(const std::string & model, const std::string & bn) {
+        const size_t slash_pos = model.find('/');
+        if (slash_pos == std::string::npos) {
+            return 1;
+        }
+
+        const std::string bucket     = model.substr(0, slash_pos);
+        const std::string key        = model.substr(slash_pos + 1);
+        const char * access_key = std::getenv("AWS_ACCESS_KEY_ID");
+        const char * secret_key = std::getenv("AWS_SECRET_ACCESS_KEY");
+        if (!access_key || !secret_key) {
+            printe("AWS credentials not found in environment\n");
+            return 1;
+        }
+
+        // Generate AWS Signature Version 4 headers
+        // (Implementation requires HMAC-SHA256 and date handling)
+        // Get current timestamp
+        const time_t                   now     = time(nullptr);
+        const tm                       tm      = *gmtime(&now);
+        const std::string              date     = strftime_fmt("%Y%m%d", tm);
+        const std::string              datetime = strftime_fmt("%Y%m%dT%H%M%SZ", tm);
+        const std::vector<std::string> headers  = {
+            "Authorization: AWS4-HMAC-SHA256 Credential=" + std::string(access_key) + "/" + date +
+                "/us-east-1/s3/aws4_request",
+            "x-amz-content-sha256: UNSIGNED-PAYLOAD", "x-amz-date: " + datetime
+        };
+
+        const std::string url = "https://" + bucket + ".s3.amazonaws.com/" + key;
+
+        return download(url, bn, true, headers);
+    }
+
+    std::string basename(const std::string & path) {
+        const size_t pos = path.find_last_of("/\\");
+        if (pos == std::string::npos) {
+            return path;
+        }
+
+        return path.substr(pos + 1);
+    }
+
+    int rm_until_substring(std::string & model_, const std::string & substring) {
+        const std::string::size_type pos = model_.find(substring);
+        if (pos == std::string::npos) {
+            return 1;
+        }
+
+        model_ = model_.substr(pos + substring.size());  // Skip past the substring
+        return 0;
+    }
+
+    int resolve_model(std::string & model_) {
+        int                            ret     = 0;
+        if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
+            rm_until_substring(model_, "://");
+
+            return ret;
+        }
+
+        const std::string bn = basename(model_);
+        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://") ||
+            string_starts_with(model_, "hf.co/")) {
+            rm_until_substring(model_, "hf.co/");
+            rm_until_substring(model_, "://");
+            ret = huggingface_dl(model_, bn);
+        } else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) &&
+                   !string_starts_with(model_, "https://ollama.com/library/")) {
+            ret = download(model_, bn, true);
+        } else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) {
+            rm_until_substring(model_, "github:");
+            rm_until_substring(model_, "://");
+            ret = github_dl(model_, bn);
+        } else if (string_starts_with(model_, "s3://")) {
+            rm_until_substring(model_, "://");
+            ret = s3_dl(model_, bn);
+        } else {  // ollama:// or nothing
+            rm_until_substring(model_, "ollama.com/library/");
+            rm_until_substring(model_, "://");
+            ret = ollama_dl(model_, bn);
+        }
+
+        model_ = bn;
+
+        return ret;
+    }
+
+    // Initializes the model and returns a unique pointer to it
+    llama_model_ptr initialize_model(Opt & opt) {
+        ggml_backend_load_all();
+        resolve_model(opt.model_);
+        printe("\r" LOG_CLR_TO_EOL "Loading model");
+        llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
+        if (!model) {
+            printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
+        }
+
+        printe("\r" LOG_CLR_TO_EOL);
+        return model;
+    }
+
+    // Initializes the context with the specified parameters
+    llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
+        llama_context_ptr context(llama_init_from_model(model.get(), opt.ctx_params));
+        if (!context) {
+            printe("%s: error: failed to create the llama_context\n", __func__);
+        }
+
+        return context;
+    }
+
+    // Initializes and configures the sampler
+    llama_sampler_ptr initialize_sampler(const Opt & opt) {
+        llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
+        llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+        return sampler;
+    }
+};
+
+// Add a message to `messages` and store its content in `msg_strs`
+static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
+    llama_data.msg_strs.push_back(std::move(text));
+    llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
+}
+
+// Function to apply the chat template and resize `formatted` if needed
+static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
+    if (use_jinja) {
+        json messages = json::array();
+        for (const auto & msg : llama_data.messages) {
+            messages.push_back({
+                {"role", msg.role},
+                {"content", msg.content},
+            });
+        }
+        try {
+            minja::chat_template_inputs tmpl_inputs;
+            tmpl_inputs.messages = messages;
+            tmpl_inputs.add_generation_prompt = append;
+
+            minja::chat_template_options tmpl_opts;
+            tmpl_opts.use_bos_token = false;
+            tmpl_opts.use_eos_token = false;
+
+            auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
+            llama_data.fmtted.resize(result.size() + 1);
+            memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
+            return result.size();
+        } catch (const std::exception & e) {
+            printe("failed to render the chat template: %s\n", e.what());
+            return -1;
+        }
+    }
+    int result = llama_chat_apply_template(
+        tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
+        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
+    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
+        llama_data.fmtted.resize(result);
+        result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
+                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
+                                           llama_data.fmtted.size());
+    }
+
+    return result;
+}
+
+// Function to tokenize the prompt
+static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
+                           std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
+    const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
+
+    const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
+    prompt_tokens.resize(n_prompt_tokens);
+    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first,
+                       true) < 0) {
+        printe("failed to tokenize the prompt\n");
+        return -1;
+    }
+
+    return n_prompt_tokens;
+}
+
+// Check if we have enough space in the context to evaluate this batch
+static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
+    const int n_ctx      = llama_n_ctx(ctx.get());
+    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+    if (n_ctx_used + batch.n_tokens > n_ctx) {
+        printf(LOG_COL_DEFAULT "\n");
+        printe("context size exceeded\n");
+        return 1;
+    }
+
+    return 0;
+}
+
+// convert the token to a string
+static int convert_token_to_string(const llama_vocab * vocab, const llama_token token_id, std::string & piece) {
+    char buf[256];
+    int  n = llama_token_to_piece(vocab, token_id, buf, sizeof(buf), 0, true);
+    if (n < 0) {
+        printe("failed to convert token to piece\n");
+        return 1;
+    }
+
+    piece = std::string(buf, n);
+    return 0;
+}
+
+static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
+    printf("%s", piece.c_str());
+    fflush(stdout);
+    response += piece;
+}
+
+// helper function to evaluate a prompt and generate a response
+static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+    const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
+
+    std::vector<llama_token> tokens;
+    if (tokenize_prompt(vocab, prompt, tokens, llama_data) < 0) {
+        return 1;
+    }
+
+    // prepare a batch for the prompt
+    llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
+    llama_token new_token_id;
+    while (true) {
+        check_context_size(llama_data.context, batch);
+        if (llama_decode(llama_data.context.get(), batch)) {
+            printe("failed to decode\n");
+            return 1;
+        }
+
+        // sample the next token, check is it an end of generation?
+        new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
+        if (llama_vocab_is_eog(vocab, new_token_id)) {
+            break;
+        }
+
+        std::string piece;
+        if (convert_token_to_string(vocab, new_token_id, piece)) {
+            return 1;
+        }
+
+        print_word_and_concatenate_to_response(piece, response);
+
+        // prepare the next batch with the sampled token
+        batch = llama_batch_get_one(&new_token_id, 1);
+    }
+
+    printf(LOG_COL_DEFAULT);
+    return 0;
+}
+
+static int read_user_input(std::string & user_input) {
+    static const char * prompt_prefix = "> ";
+#ifdef WIN32
+    printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);
+
+    std::getline(std::cin, user_input);
+    if (std::cin.eof()) {
+        printf("\n");
+        return 1;
+    }
+#else
+    std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
+    if (!line) {
+        return 1;
+    }
+
+    user_input = line.get();
+#endif
+
+    if (user_input == "/bye") {
+        return 1;
+    }
+
+    if (user_input.empty()) {
+        return 2;
+    }
+
+#ifndef WIN32
+    linenoiseHistoryAdd(line.get());
+#endif
+
+    return 0;  // Should have data in happy path
+}
+
+// Function to generate a response based on the prompt
+static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response,
+                             const bool stdout_a_terminal) {
+    // Set response color
+    if (stdout_a_terminal) {
+        printf(LOG_COL_YELLOW);
+    }
+
+    if (generate(llama_data, prompt, response)) {
+        printe("failed to generate response\n");
+        return 1;
+    }
+
+    // End response with color reset and newline
+    printf("\n%s", stdout_a_terminal ? LOG_COL_DEFAULT : "");
+    return 0;
+}
+
+// Helper function to apply the chat template and handle errors
+static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
+    const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
+    if (new_len < 0) {
+        printe("failed to apply the chat template\n");
+        return -1;
+    }
+
+    output_length = new_len;
+    return 0;
+}
+
+// Helper function to handle user input
+static int handle_user_input(std::string & user_input, const std::string & user) {
+    if (!user.empty()) {
+        user_input = user;
+        return 0;  // No need for interactive input
+    }
+
+    return read_user_input(user_input);  // Returns true if input ends the loop
+}
+
+static bool is_stdin_a_terminal() {
+#if defined(_WIN32)
+    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
+    DWORD  mode;
+    return GetConsoleMode(hStdin, &mode);
+#else
+    return isatty(STDIN_FILENO);
+#endif
+}
+
+static bool is_stdout_a_terminal() {
+#if defined(_WIN32)
+    HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
+    DWORD  mode;
+    return GetConsoleMode(hStdout, &mode);
+#else
+    return isatty(STDOUT_FILENO);
+#endif
+}
+
+// Function to handle user input
+static int get_user_input(std::string & user_input, const std::string & user) {
+    while (true) {
+        const int ret = handle_user_input(user_input, user);
+        if (ret == 1) {
+            return 1;
+        }
+
+        if (ret == 2) {
+            continue;
+        }
+
+        break;
+    }
+
+    return 0;
+}
+
+// Main chat loop function
+static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
+    int prev_len = 0;
+    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+    auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
+    GGML_ASSERT(chat_templates.template_default);
+    static const bool stdout_a_terminal = is_stdout_a_terminal();
+    while (true) {
+        // Get user input
+        std::string user_input;
+        if (get_user_input(user_input, user) == 1) {
+            return 0;
+        }
+
+        add_message("user", user.empty() ? user_input : user, llama_data);
+        int new_len;
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
+            return 1;
+        }
+
+        std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
+        std::string response;
+        if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
+            return 1;
+        }
+
+        if (!user.empty()) {
+            break;
+        }
+
+        add_message("assistant", response, llama_data);
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
+    const Opt * opt = static_cast<Opt *>(p);
+    if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
+        printe("%s", text);
+    }
+}
+
+static std::string read_pipe_data() {
+    std::ostringstream result;
+    result << std::cin.rdbuf();  // Read all data from std::cin
+    return result.str();
+}
+
+static void ctrl_c_handling() {
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = sigint_handler;
+    sigemptyset(&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined(_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+}
+
+int main(int argc, const char ** argv) {
+    ctrl_c_handling();
+    Opt       opt;
+    const int ret = opt.init(argc, argv);
+    if (ret == 2) {
+        return 0;
+    } else if (ret) {
+        return 1;
+    }
+
+    if (!is_stdin_a_terminal()) {
+        if (!opt.user.empty()) {
+            opt.user += "\n\n";
+        }
+
+        opt.user += read_pipe_data();
+    }
+
+    llama_log_set(log_callback, &opt);
+    LlamaData llama_data;
+    if (llama_data.init(opt)) {
+        return 1;
+    }
+
+    if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index cc6ed8554..0f50e50de 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET save-load-state)
+set(TARGET llama-save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index ef952e2bd..cf7cbd815 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,16 +1,17 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
 #include <vector>
 #include <cstdio>
-#include <chrono>
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.prompt = "The quick brown fox";
+    params.sampling.seed = 1234;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -24,28 +25,43 @@ int main(int argc, char ** argv) {
 
     std::string result0;
     std::string result1;
+    std::string result2;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
     }
 
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
+
     // tokenize prompt
-    auto tokens = llama_tokenize(ctx, params.prompt, true);
+    auto tokens = common_tokenize(ctx, params.prompt, true);
+
+    // prepare the batch
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {0}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true; // generate next token
 
     // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
-    n_past += tokens.size();
+    llama_decode(ctx, batch);
+    n_past += batch.n_tokens;
 
     // save state (rng, logits, embedding and kv_cache) to file
     {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
-        const size_t written = llama_copy_state_data(ctx, state_mem.data());
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+        const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
 
         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);
@@ -61,25 +77,18 @@ int main(int argc, char ** argv) {
     printf("\nfirst run: %s", params.prompt.c_str());
 
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(model);
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx, &candidates_p);
-        auto next_token_str = llama_token_to_piece(ctx, next_token);
+        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
+        auto next_token_str = common_token_to_piece(ctx, next_token);
 
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
 
-        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {0}, true);
+
+        if (llama_decode(ctx, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_free(ctx);
-            llama_free_model(model);
+            llama_batch_free(batch);
             return 1;
         }
         n_past += 1;
@@ -87,26 +96,28 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
-    // free old context
-    llama_free(ctx);
-
     // make new context
-    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+    llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params));
+
+    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsecond run: %s", params.prompt.c_str());
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+        std::vector<uint8_t> state_mem;
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_set_state_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
-            llama_free(ctx2);
-            llama_free_model(model);
             return 1;
         }
 
@@ -118,24 +129,99 @@ int main(int argc, char ** argv) {
 
     // second run
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx2);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx2, &candidates_p);
-        auto next_token_str = llama_token_to_piece(ctx2, next_token);
+        auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
+        auto next_token_str = common_token_to_piece(ctx2, next_token);
 
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
 
-        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {0}, true);
+
+        if (llama_decode(ctx2, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_free(ctx2);
-            llama_free_model(model);
+            llama_batch_free(batch);
+            return 1;
+        }
+        n_past += 1;
+    }
+
+    printf("\n\n");
+
+    if (result0 != result1) {
+        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
+        return 1;
+    }
+
+    // make new context
+    llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params));
+
+    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
+
+    printf("\nsingle seq run: %s", params.prompt.c_str());
+
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem;
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
+
+        if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            return 1;
+        }
+
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+    }
+
+    // restore state (last tokens)
+    n_past = n_past_saved;
+
+    // save seq 0 and load into seq 1
+    {
+        // save kv of seq 0
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
+        if (ncopy != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+        // erase whole kv
+        llama_kv_cache_clear(ctx3);
+        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+        // restore kv into seq 1
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
+        if (nset != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+    }
+
+    // third run with seq 1 instead of 0
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
+        auto next_token_str = common_token_to_piece(ctx3, next_token);
+
+        printf("%s", next_token_str.c_str());
+        result2 += next_token_str;
+
+        common_batch_clear(batch);
+        common_batch_add(batch, next_token, n_past, {1}, true);
+
+        if (llama_decode(ctx3, batch)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_batch_free(batch);
             return 1;
         }
         n_past += 1;
@@ -143,11 +229,14 @@ int main(int argc, char ** argv) {
 
     printf("\n");
 
-    llama_free(ctx2);
-    llama_free_model(model);
+    llama_sampler_free(smpl);
+    llama_sampler_free(smpl2);
+    llama_sampler_free(smpl3);
 
-    if (result0 != result1) {
-        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
+    llama_batch_free(batch);
+
+    if (result0 != result2) {
+        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
         return 1;
     }
 
diff --git a/examples/server-llama2-13B.sh b/examples/server-llama2-13B.sh
index 17fedc2b1..4ce79b7fa 100755
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
 
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./server $GEN_OPTIONS \
+./llama-server $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --rope-freq-scale 1.0 \
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index cc13b2d63..1b7cc8c13 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,13 +1,50 @@
-set(TARGET server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
-install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+set(TARGET llama-server)
+
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+
+if (MINGW)
+    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
+set(TARGET_SRCS
+    server.cpp
+    utils.hpp
+    httplib.h
 )
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+set(PUBLIC_ASSETS
+    index.html.gz
+    loading.html
+)
+
+foreach(asset ${PUBLIC_ASSETS})
+    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+    list(APPEND TARGET_SRCS ${output})
+    add_custom_command(
+        DEPENDS "${input}"
+        OUTPUT "${output}"
+        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
+    )
+    set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
+endforeach()
+
+add_executable(${TARGET} ${TARGET_SRCS})
+install(TARGETS ${TARGET} RUNTIME)
+
+target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+
+if (LLAMA_SERVER_SSL)
+    find_package(OpenSSL REQUIRED)
+    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+    target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
+endif()
+
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/server/README.md b/examples/server/README.md
index 397ee8252..d0b262f0e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,76 +5,251 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
 Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 
 **Features:**
- * LLM inference of F16 and quantum models on GPU and CPU
+ * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+ * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
  * Parallel decoding with multi-user support
  * Continuous batching
  * Multimodal (wip)
  * Monitoring endpoints
+ * Schema-constrained JSON response format
 
 The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
 
-**Command line options:**
+## Usage
 
-- `--threads N`, `-t N`: Set the number of threads to use during generation.
-- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
-- `--threads-http N`: number of threads in the http server pool to process requests (default: `std::thread::hardware_concurrency()`)
-- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
-- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
-- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
-- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-- `--numa STRATEGY`: Attempt one of the below optimization strategies  that help on some NUMA systems
-- `--numa distribute`: Spread execution evenly over all nodes
-- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
-- `--numa numactl`: Use the CPU map provided by numactl
-if run without this previously, it is recommended to drop the system page cache before using this
-see https://github.com/ggerganov/llama.cpp/issues/1437
+<!-- Note for contributors: The list below is generated by llama-gen-docs -->
 
-- `--numa`: Attempt optimizations that help on some NUMA systems.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
-- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
-- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
-- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
-- `--port`: Set the port to listen. Default: `8080`.
-- `--path`: path from which to serve static files (default examples/server/public)
-- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
-- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
-- `--embedding`: Enable embedding extraction, Default: disabled.
-- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
-- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
-- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
-- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
-- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
-- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
-- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
-- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
-- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-- `--log-disable`: Output logs to stdout only, default: enabled.
-- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
+**Common params**
+
+| Argument | Explanation |
+| -------- | ----------- |
+| `-h, --help, --usage` | print usage and exit |
+| `--version` | show version and build info |
+| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
+| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
+| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
+| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
+| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
+| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
+| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
+| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
+| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
+| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
+| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
+| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
+| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
+| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
+| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
+| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
+| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
+| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
+| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
+| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
+| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
+| `--no-escape` | do not process escape sequences |
+| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
+| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
+| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
+| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
+| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
+| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
+| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
+| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
+| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
+| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
+| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
+| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
+| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
+| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
+| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
+| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
+| `--list-devices` | print list of available devices and exit |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
+| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
+| `--check-tensors` | check model tensor data for invalid values (default: false) |
+| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
+| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
+| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
+| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
+| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
+| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
+| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
+| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
+| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
+| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
+| `--log-disable` | Log disable |
+| `--log-file FNAME` | Log to file |
+| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
+| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
+| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
+| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
+
+
+**Sampling params**
+
+| Argument | Explanation |
+| -------- | ----------- |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
+| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
+| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
+| `--temp N` | temperature (default: 0.8) |
+| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
+| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
+| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
+| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
+| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
+| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
+| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
+| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
+| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
+| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
+| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
+| `--dry-base N` | set DRY sampling base value (default: 1.75) |
+| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
+| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
+| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
+| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
+| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
+| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
+| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
+| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
+| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
+| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
+| `--grammar-file FNAME` | file to read grammar from |
+| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
+| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
+
+**Example-specific params**
+
+| Argument | Explanation |
+| -------- | ----------- |
+| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
+| `-sp, --special` | special tokens output enabled (default: false) |
+| `--no-warmup` | skip warming up the model with an empty run |
+| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
+| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
+| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
+| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
+| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
+| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
+| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
+| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
+| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
+| `--api-key-file FNAME` | path to file containing API keys (default: none) |
+| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
+| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
+| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
+| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
+| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
+| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
+
+
+Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
+
+Example usage of docker compose with environment variables:
+
+```yml
+services:
+  llamacpp-server:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    ports:
+      - 8080:8080
+    volumes:
+      - ./models:/models
+    environment:
+      # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
+      LLAMA_ARG_MODEL: /models/my_model.gguf
+      LLAMA_ARG_CTX_SIZE: 4096
+      LLAMA_ARG_N_PARALLEL: 2
+      LLAMA_ARG_ENDPOINT_METRICS: 1
+      LLAMA_ARG_PORT: 8080
+```
 
 ## Build
 
-server is build alongside everything else from the root of the project
-
-- Using `make`:
-
-  ```bash
-  make
-  ```
+`llama-server` is built alongside everything else from the root of the project
 
 - Using `CMake`:
 
   ```bash
-  cmake --build . --config Release
+  cmake -B build
+  cmake --build build --config Release -t llama-server
   ```
 
+  Binary is at `./build/bin/llama-server`
+
+## Build with SSL
+
+`llama-server` can also be built with SSL support using OpenSSL 3
+
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DLLAMA_SERVER_SSL=ON
+  cmake --build build --config Release -t llama-server
+  ```
+
+## Web UI
+
+The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
+
+The web UI is developed using:
+- `react` framework for frontend development
+- `tailwindcss` and `daisyui` for styling
+- `vite` for build tooling
+
+A pre-built version is available as a single HTML file under `/public` directory.
+
+To build or to run the dev server (with hot reload):
+
+```sh
+# make sure you have nodejs installed
+cd examples/server/webui
+npm i
+
+# to run the dev server
+npm run dev
+
+# to build the public/index.html.gz
+npm run build
+```
+After `public/index.html.gz` has been generated we need to generate the c++
+headers (like build/examples/server/index.html.gz.hpp) that will be included
+by server.cpp. This is done by building `llama-server` as described in the
+[build](#build) section above.
+
+NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console:
+
+```js
+localStorage.setItem('base', 'http://localhost:8080')
+```
+
 ## Quick Start
 
 To get started right away, run the following command, making sure to use the correct path for the model you have:
@@ -82,13 +257,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.)
 
 ```bash
-./server -m models/7B/ggml-model.gguf -c 2048
+./llama-server -m models/7B/ggml-model.gguf -c 2048
 ```
 
 ### Windows
 
 ```powershell
-server.exe -m models\7B\ggml-model.gguf -c 2048
+llama-server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
 
 The above command will start a server that by default listens on `127.0.0.1:8080`.
@@ -97,15 +272,15 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can
 ### Docker
 
 ```bash
-docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
 
 # or, with CUDA:
-docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
 ```
 
 ## Testing with CURL
 
-Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
+Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.
 
 ```sh
 curl --request POST \
@@ -129,23 +304,23 @@ mkdir llama-client
 cd llama-client
 ```
 
-Create a index.js file and put inside this:
+Create an index.js file and put this inside:
 
 ```javascript
-const prompt = `Building a website can be done in 10 simple steps:`;
+const prompt = "Building a website can be done in 10 simple steps:"
 
-async function Test() {
+async function test() {
     let response = await fetch("http://127.0.0.1:8080/completion", {
-        method: 'POST',
+        method: "POST",
         body: JSON.stringify({
             prompt,
-            n_predict: 512,
+            n_predict: 64,
         })
     })
     console.log((await response.json()).content)
 }
 
-Test()
+test()
 ```
 
 And run it:
@@ -156,358 +331,1007 @@ node index.js
 
 ## API Endpoints
 
-- **GET** `/health`: Returns the current state of the server:
-  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
-  - 500 -> `{"status": "error"}` if the model failed to load.
-  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
-  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
-  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
+### GET `/health`: Returns heath check result
 
-  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
+**Response format**
 
-- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
+- HTTP status code 503
+  - Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}`
+  - Explanation: the model is still being loaded.
+- HTTP status code 200
+  - Body: `{"status": "ok" }`
+  - Explanation: the model is successfully loaded and the server is ready.
 
-    *Options:*
+### POST `/completion`: Given a `prompt`, it returns the predicted completion.
 
-    `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
+> [!IMPORTANT]
+>
+> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/completions` instead.
 
-    `temperature`: Adjust the randomness of the generated text (default: 0.8).
+*Options:*
 
-    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).
+`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
 
-    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
+  - The prompt is a string or an array with the first element given as a string
+  - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
 
-    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
+These input shapes and data type are allowed for `prompt`:
 
-    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
+  - Single string: `"string"`
+  - Single sequence of tokens: `[12, 34, 56]`
+  - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
 
-    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
+Multiple prompts are also supported. In this case, the completion result will be an array.
 
-    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
+  - Only strings: `["string1", "string2"]`
+  - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
+  - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
 
-    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
-    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
+`temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
-    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
 
-    `stop`: Specify a JSON array of stopping strings.
-    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
+`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
 
-    `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
+`top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`
 
-    `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
+`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
 
-    `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
+`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
 
-    `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
 
-    `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
+`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
 
-    `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
+`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
+By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
-    `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
+`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
 
-    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).
+`stop`: Specify a JSON array of stopping strings.
+These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
 
-    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
+`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
 
-    `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
 
-    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
 
-    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
 
-    `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
+`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
 
-    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
+`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
 
-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
+`dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
 
-    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
+`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
 
-    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
+`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
 
-    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
 
-    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
+`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
 
-    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)
+`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
 
-    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
+`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
 
-### Result JSON
+`mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`
 
-- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+`grammar`: Set grammar for grammar-based sampling.  Default: no grammar
 
-- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
+`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
 
-```json
-{
-  "content": "<the token selected by the model>",
-  "probs": [
-    {
-      "prob": float,
-      "tok_str": "<most likely token>"
-    },
-    {
-      "prob": float,
-      "tok_str": "<second most likely tonen>"
-    },
+`seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.
+
+`ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
+
+`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
+
+`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
+
+`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
+
+`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
+
+`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
+`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
+
+`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
+
+`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false`
+
+`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
+
+`timings_per_token`: Include prompt processing and text generation speed information in each response.  Default: `false`
+
+`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
+
+`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
+
+`lora`: A list of LoRA adapters to be applied to this specific request. Each object in the list must contain `id` and `scale` fields. For example: `[{"id": 0, "scale": 0.5}, {"id": 1, "scale": 1.1}]`. If a LoRA adapter is not specified in the list, its scale will default to `0.0`. Please note that requests with different LoRA configurations will not be batched together, which may result in performance degradation.
+
+**Response format**
+
+- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
+
+- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
+  ```
+  {
+    "content": "<the generated completion text>",
+    "tokens": [ generated token ids if requested ],
     ...
-  ]
-},
-```
-
-Notice that each `probs` is an array of length `n_probs`.
+    "probs": [
+      {
+        "id": <token id>,
+        "logprob": float,
+        "token": "<most likely token>",
+        "bytes": [int, int, ...],
+        "top_logprobs": [
+          {
+            "id": <token id>,
+            "logprob": float,
+            "token": "<token text>",
+            "bytes": [int, int, ...],
+          },
+          {
+            "id": <token id>,
+            "logprob": float,
+            "token": "<token text>",
+            "bytes": [int, int, ...],
+          },
+          ...
+        ]
+      },
+      {
+        "id": <token id>,
+        "logprob": float,
+        "token": "<most likely token>",
+        "bytes": [int, int, ...],
+        "top_logprobs": [
+          ...
+        ]
+      },
+      ...
+    ]
+  },
+  ```
+  Please note that if `post_sampling_probs` is set to `true`:
+    - `logprob` will be replaced with `prob`, with the value between 0.0 and 1.0
+    - `top_logprobs` will be replaced with `top_probs`. Each element contains:
+      - `id`: token ID
+      - `token`: token in string
+      - `bytes`: token in bytes
+      - `prob`: token probability, with the value between 0.0 and 1.0
+    - Number of elements in `top_probs` may be less than `n_probs`
 
 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
 - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
-- `model`: The path to the model loaded with `-m`
-- `prompt`: The provided `prompt`
-- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
-- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
-- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
+- `model`: The model alias (for model path, please use `/props` endpoint)
+- `prompt`: The processed `prompt` (special tokens may be added)
+- `stop_type`: Indicating whether the completion has stopped. Possible values are:
+  - `none`: Generating (not stopped)
+  - `eos`: Stopped because it encountered the EOS token
+  - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+  - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided
 - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
 - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
 - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
 - `tokens_evaluated`: Number of tokens evaluated in total from the prompt
 - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
 
-- **POST** `/tokenize`: Tokenize a given text.
 
-    *Options:*
+### POST `/tokenize`: Tokenize a given text
 
-    `content`: Set the text to tokenize.
+*Options:*
 
-    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+`content`: (Required) The text to tokenize.
 
-- **POST** `/detokenize`: Convert tokens to text.
+`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
 
-    *Options:*
+`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
 
-    `tokens`: Set the tokens to detokenize.
+**Response:**
 
-- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
 
-    *Options:*
-
-    `content`: Set the text to process.
-
-    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
-
-- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
-
-    *Options:*
-
-    `input_prefix`: Set the prefix of the code to infill.
-
-    `input_suffix`: Set the suffix of the code to infill.
-
-    It also accepts all the options of `/completion` except `stream` and `prompt`.
-
-- **GET** `/props`: Return current server settings.
-
-### Result JSON
 
+If `with_pieces` is `false`:
 ```json
 {
-  "assistant_name": "",
-  "user_name": "",
-  "default_generation_settings": { ... },
-  "total_slots": 1
+  "tokens": [123, 456, 789]
 }
 ```
 
-- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
-- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
-- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
-- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
+If `with_pieces` is `true`:
+```json
+{
+  "tokens": [
+    {"id": 123, "piece": "Hello"},
+    {"id": 456, "piece": " world"},
+    {"id": 789, "piece": "!"}
+  ]
+}
+```
 
-- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint.
+With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
+```
+{
+  "tokens": [
+    {"id": 198, "piece": [195]}, // hex C3
+    {"id": 164, "piece": [161]} // hex A1
+  ]
+}
+```
 
-    *Options:*
+### POST `/detokenize`: Convert tokens to text
 
-    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
+*Options:*
 
-    *Examples:*
+`tokens`: Set the tokens to detokenize.
 
-    You can use either Python `openai` library with appropriate checkpoints:
+### POST `/apply-template`: Apply chat template to a conversation
 
-    ```python
-    import openai
+Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response.
 
-    client = openai.OpenAI(
-        base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
-        api_key = "sk-no-key-required"
-    )
+*Options:*
 
-    completion = client.chat.completions.create(
-    model="gpt-3.5-turbo",
-    messages=[
-        {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
-        {"role": "user", "content": "Write a limerick about python exceptions"}
-    ]
-    )
+`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
 
-    print(completion.choices[0].message)
-    ```
+**Response format**
 
-    ... or raw HTTP requests:
+Returns a JSON object with a field `prompt` containing a string of the input messages formatted according to the model's chat template format.
 
-    ```shell
-    curl http://localhost:8080/v1/chat/completions \
+### POST `/embedding`: Generate embedding of a given text
+
+> [!IMPORTANT]
+>
+> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/embeddings` instead.
+
+The same as [the embedding example](../embedding) does.
+
+*Options:*
+
+`content`: Set the text to process.
+
+`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
+### POST `/reranking`: Rerank documents according to a given query
+
+Similar to https://jina.ai/reranker/ but might change in the future.
+Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
+
+*Options:*
+
+`query`: The query against which the documents will be ranked.
+
+`documents`: An array strings representing the documents to be ranked.
+
+*Aliases:*
+  - `/rerank`
+  - `/v1/rerank`
+  - `/v1/reranking`
+
+*Examples:*
+
+```shell
+curl http://127.0.0.1:8012/v1/rerank \
     -H "Content-Type: application/json" \
-    -H "Authorization: Bearer no-key" \
     -d '{
-    "model": "gpt-3.5-turbo",
-    "messages": [
-    {
-        "role": "system",
-        "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
-    },
-    {
-        "role": "user",
-        "content": "Write a limerick about python exceptions"
-    }
-    ]
-    }'
-    ```
+        "model": "some-model",
+            "query": "What is panda?",
+            "top_n": 3,
+            "documents": [
+                "hi",
+            "it is a bear",
+            "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+            ]
+    }' | jq
+```
 
-- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
+### POST `/infill`: For code infilling.
 
-    *Options:*
+Takes a prefix and a suffix and returns the predicted completion as stream.
 
-    See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
+*Options:*
 
-    *Examples:*
+- `input_prefix`: Set the prefix of the code to infill.
+- `input_suffix`: Set the suffix of the code to infill.
+- `input_extra`:  Additional context inserted before the FIM prefix.
+- `prompt`:       Added after the `FIM_MID` token
 
-  - input as string
+`input_extra` is array of `{"filename": string, "text": string}` objects.
 
-    ```shell
-    curl http://localhost:8080/v1/embeddings \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer no-key" \
-    -d '{
-            "input": "hello",
-            "model":"GPT-4",
-            "encoding_format": "float"
-    }'
-    ```
+The endpoint also accepts all the options of `/completion`.
 
-  - `input` as string array
+If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
 
-    ```shell
-    curl http://localhost:8080/v1/embeddings \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer no-key" \
-    -d '{
-            "input": ["hello", "world"],
-            "model":"GPT-4",
-            "encoding_format": "float"
-    }'
-    ```
+```txt
+<FIM_REP>myproject
+<FIM_SEP>{chunk 0 filename}
+{chunk 0 text}
+<FIM_SEP>{chunk 1 filename}
+{chunk 1 text}
+...
+<FIM_SEP>filename
+<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+```
 
-- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+If the tokens are missing, then the extra context is simply prefixed at the start:
 
-### Result JSON
+```txt
+[input_extra]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
+```
+
+### **GET** `/props`: Get server global properties.
+
+This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
+
+**Response format**
 
 ```json
-[
-    {
-        "dynatemp_exponent": 1.0,
-        "dynatemp_range": 0.0,
-        "frequency_penalty": 0.0,
-        "grammar": "",
-        "id": 0,
-        "ignore_eos": false,
-        "logit_bias": [],
-        "min_p": 0.05000000074505806,
-        "mirostat": 0,
-        "mirostat_eta": 0.10000000149011612,
-        "mirostat_tau": 5.0,
-        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
-        "n_ctx": 2048,
-        "n_keep": 0,
-        "n_predict": 100000,
-        "n_probs": 0,
-        "next_token": {
-            "has_next_token": true,
-            "n_remain": -1,
-            "num_tokens_predicted": 0,
-            "stopped_eos": false,
-            "stopped_limit": false,
-            "stopped_word": false,
-            "stopping_word": ""
-        },
-        "penalize_nl": true,
-        "penalty_prompt_tokens": [],
-        "presence_penalty": 0.0,
-        "prompt": "Say hello to llama.cpp",
-        "repeat_last_n": 64,
-        "repeat_penalty": 1.100000023841858,
-        "samplers": [
-            "top_k",
-            "tfs_z",
-            "typical_p",
-            "top_p",
-            "min_p",
-            "temperature"
-        ],
-        "seed": 42,
-        "state": 1,
-        "stop": [
-            "\n"
-        ],
-        "stream": false,
-        "task_id": 0,
-        "temperature": 0.0,
-        "tfs_z": 1.0,
-        "top_k": 40,
-        "top_p": 0.949999988079071,
-        "typical_p": 1.0,
-        "use_penalty_prompt_tokens": false
+{
+  "default_generation_settings": {
+    "id": 0,
+    "id_task": -1,
+    "n_ctx": 1024,
+    "speculative": false,
+    "is_processing": false,
+    "params": {
+      "n_predict": -1,
+      "seed": 4294967295,
+      "temperature": 0.800000011920929,
+      "dynatemp_range": 0.0,
+      "dynatemp_exponent": 1.0,
+      "top_k": 40,
+      "top_p": 0.949999988079071,
+      "min_p": 0.05000000074505806,
+      "xtc_probability": 0.0,
+      "xtc_threshold": 0.10000000149011612,
+      "typical_p": 1.0,
+      "repeat_last_n": 64,
+      "repeat_penalty": 1.0,
+      "presence_penalty": 0.0,
+      "frequency_penalty": 0.0,
+      "dry_multiplier": 0.0,
+      "dry_base": 1.75,
+      "dry_allowed_length": 2,
+      "dry_penalty_last_n": -1,
+      "dry_sequence_breakers": [
+        "\n",
+        ":",
+        "\"",
+        "*"
+      ],
+      "mirostat": 0,
+      "mirostat_tau": 5.0,
+      "mirostat_eta": 0.10000000149011612,
+      "stop": [],
+      "max_tokens": -1,
+      "n_keep": 0,
+      "n_discard": 0,
+      "ignore_eos": false,
+      "stream": true,
+      "n_probs": 0,
+      "min_keep": 0,
+      "grammar": "",
+      "samplers": [
+        "dry",
+        "top_k",
+        "typ_p",
+        "top_p",
+        "min_p",
+        "xtc",
+        "temperature"
+      ],
+      "speculative.n_max": 16,
+      "speculative.n_min": 5,
+      "speculative.p_min": 0.8999999761581421,
+      "timings_per_token": false
+    },
+    "prompt": "",
+    "next_token": {
+      "has_next_token": true,
+      "has_new_line": false,
+      "n_remain": -1,
+      "n_decoded": 0,
+      "stopping_word": ""
     }
+  },
+  "total_slots": 1,
+  "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
+}
+```
+
+- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
+- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
+- `model_path` - the path to model file (same with `-m` argument)
+- `chat_template` - the model's original Jinja2 prompt template
+
+### POST `/props`: Change server global properties.
+
+To use this endpoint with POST method, you need to start server with `--props`
+
+*Options:*
+
+- None yet
+
+### POST `/embeddings`: non-OpenAI-compatible embeddings API
+
+This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm.
+
+Note that the response format of this endpoint is different from `/v1/embeddings`.
+
+*Options:*
+
+Same as the `/v1/embeddings` endpoint.
+
+*Examples:*
+
+Same as the `/v1/embeddings` endpoint.
+
+**Response format**
+
+```
+[
+  {
+    "index": 0,
+    "embedding": [
+      [ ... embeddings for token 0   ... ],
+      [ ... embeddings for token 1   ... ],
+      [ ... ]
+      [ ... embeddings for token N-1 ... ],
+    ]
+  },
+  ...
+  {
+    "index": P,
+    "embedding": [
+      [ ... embeddings for token 0   ... ],
+      [ ... embeddings for token 1   ... ],
+      [ ... ]
+      [ ... embeddings for token N-1 ... ],
+    ]
+  }
 ]
 ```
 
-- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
+### GET `/slots`: Returns the current slots processing state
+
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+
+This endpoint is disabled by default and can be enabled with `--slots`
+
+If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
+
+**Response format**
+
+Example:
+
+```json
+[
+  {
+    "id": 0,
+    "id_task": -1,
+    "n_ctx": 1024,
+    "speculative": false,
+    "is_processing": false,
+    "params": {
+      "n_predict": -1,
+      "seed": 4294967295,
+      "temperature": 0.800000011920929,
+      "dynatemp_range": 0.0,
+      "dynatemp_exponent": 1.0,
+      "top_k": 40,
+      "top_p": 0.949999988079071,
+      "min_p": 0.05000000074505806,
+      "xtc_probability": 0.0,
+      "xtc_threshold": 0.10000000149011612,
+      "typical_p": 1.0,
+      "repeat_last_n": 64,
+      "repeat_penalty": 1.0,
+      "presence_penalty": 0.0,
+      "frequency_penalty": 0.0,
+      "dry_multiplier": 0.0,
+      "dry_base": 1.75,
+      "dry_allowed_length": 2,
+      "dry_penalty_last_n": -1,
+      "dry_sequence_breakers": [
+        "\n",
+        ":",
+        "\"",
+        "*"
+      ],
+      "mirostat": 0,
+      "mirostat_tau": 5.0,
+      "mirostat_eta": 0.10000000149011612,
+      "stop": [],
+      "max_tokens": -1,
+      "n_keep": 0,
+      "n_discard": 0,
+      "ignore_eos": false,
+      "stream": true,
+      "n_probs": 0,
+      "min_keep": 0,
+      "grammar": "",
+      "samplers": [
+        "dry",
+        "top_k",
+        "typ_p",
+        "top_p",
+        "min_p",
+        "xtc",
+        "temperature"
+      ],
+      "speculative.n_max": 16,
+      "speculative.n_min": 5,
+      "speculative.p_min": 0.8999999761581421,
+      "timings_per_token": false
+    },
+    "prompt": "",
+    "next_token": {
+      "has_next_token": true,
+      "has_new_line": false,
+      "n_remain": -1,
+      "n_decoded": 0,
+      "stopping_word": ""
+    }
+  }
+]
+```
+
+### GET `/metrics`: Prometheus compatible metrics exporter
+
+This endpoint is only accessible if `--metrics` is set.
 
 Available metrics:
 - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
 - `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
 - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
 - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
-- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
+- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage.
 - `llamacpp:kv_cache_tokens`: KV-cache tokens.
-- `llamacpp:requests_processing`: Number of request processing.
-- `llamacpp:requests_deferred`: Number of request deferred.
+- `llamacpp:requests_processing`: Number of requests processing.
+- `llamacpp:requests_deferred`: Number of requests deferred.
 
-## More examples
+### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
 
-### Change system prompt on runtime
+*Options:*
 
-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
 
-`prompt`: Specify a context that you want all connecting clients to respect.
-
-`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-
-`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
+**Response format**
 
 ```json
 {
-    "system_prompt": {
-        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
-        "anti_prompt": "User:",
-        "assistant_name": "Assistant:"
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_saved": 1745,
+    "n_written": 14309796,
+    "timings": {
+        "save_ms": 49.865
     }
 }
 ```
 
-**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
+### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+
+*Options:*
+
+`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+
+**Response format**
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_restored": 1745,
+    "n_read": 14309796,
+    "timings": {
+        "restore_ms": 42.937
+    }
+}
+```
+
+### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+
+**Response format**
+
+```json
+{
+    "id_slot": 0,
+    "n_erased": 1745
+}
+```
+
+### GET `/lora-adapters`: Get list of all LoRA adapters
+
+This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...`
+
+By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply`
+
+Please note that this value will be overwritten by the `lora` field for each request.
+
+If an adapter is disabled, the scale will be set to 0.
+
+**Response format**
+
+```json
+[
+    {
+        "id": 0,
+        "path": "my_adapter_1.gguf",
+        "scale": 0.0
+    },
+    {
+        "id": 1,
+        "path": "my_adapter_2.gguf",
+        "scale": 0.0
+    }
+]
+```
+
+### POST `/lora-adapters`: Set list of LoRA adapters
+
+This sets the global scale for LoRA adapters. Please note that this value will be overwritten by the `lora` field for each request.
+
+To disable an adapter, either remove it from the list below, or set scale to 0.
+
+**Request format**
+
+To know the `id` of the adapter, use GET `/lora-adapters`
+
+```json
+[
+  {"id": 0, "scale": 0.2},
+  {"id": 1, "scale": 0.8}
+]
+```
+
+## OpenAI-compatible API Endpoints
+
+### GET `/v1/models`: OpenAI-compatible Model Info API
+
+Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models).
+
+The returned list always has one single element.
+
+By default, model `id` field is the path to model file, specified via `-m`. You can set a custom value for model `id` field via `--alias` argument. For example, `--alias gpt-4o-mini`.
+
+Example:
+
+```json
+{
+    "object": "list",
+    "data": [
+        {
+            "id": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
+            "object": "model",
+            "created": 1735142223,
+            "owned_by": "llamacpp",
+            "meta": {
+                "vocab_type": 2,
+                "n_vocab": 128256,
+                "n_ctx_train": 131072,
+                "n_embd": 4096,
+                "n_params": 8030261312,
+                "size": 4912898304
+            }
+        }
+    ]
+}
+```
+
+### POST `/v1/completions`: OpenAI-compatible Completions API
+
+Given an input `prompt`, it returns the predicted completion. Streaming mode is also supported. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps.
+
+*Options:*
+
+See [OpenAI Completions API documentation](https://platform.openai.com/docs/api-reference/completions).
+
+llama.cpp `/completion`-specific features such as `mirostat` are supported.
+
+*Examples:*
+
+Example usage with `openai` python library:
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+    api_key = "sk-no-key-required"
+)
+
+completion = client.completions.create(
+  model="davinci-002",
+  prompt="I believe the meaning of life is",
+  max_tokens=8
+)
+
+print(completion.choices[0].text)
+```
+
+### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
+
+Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
+
+*Options:*
+
+See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.
+
+The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
+
+*Examples:*
+
+You can use either Python `openai` library with appropriate checkpoints:
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+    api_key = "sk-no-key-required"
+)
+
+completion = client.chat.completions.create(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+    {"role": "user", "content": "Write a limerick about python exceptions"}
+  ]
+)
+
+print(completion.choices[0].message)
+```
+
+... or raw HTTP requests:
+
+```shell
+curl http://localhost:8080/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer no-key" \
+-d '{
+"model": "gpt-3.5-turbo",
+"messages": [
+{
+    "role": "system",
+    "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
+},
+{
+    "role": "user",
+    "content": "Write a limerick about python exceptions"
+}
+]
+}'
+```
+
+*Tool call support*
+
+[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
+
+- Requires `--jinja` flag
+- Native tool call formats supported:
+  - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+  - Functionary v3.1 / v3.2
+  - Hermes 2/3, Qwen 2.5
+  - Mistral Nemo
+  - Firefunction v2
+  - Command R7B
+  - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+  <details>
+  <summary>Show some common templates and which format handler they use</summary>
+
+  | Template | Format |
+  |----------|--------|
+  | CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls |
+  | MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls |
+  | NexaAIDev-Octopus-v2.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls |
+  | OrionStarAI-Orion-14B-Chat.jinja | generic tool calls |
+  | Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls |
+  | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls |
+  | bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls |
+  | databricks-dbrx-instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls |
+  | deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls |
+  | google-gemma-2-2b-it.jinja | generic tool calls |
+  | google-gemma-7b-it.jinja | generic tool calls |
+  | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls |
+  | mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls |
+  | meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls |
+  | meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls |
+  | meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls |
+  | mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls |
+  | mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls |
+  | mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls |
+  | mlabonne-AlphaMonarch-7B.jinja | generic tool calls |
+  | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | openchat-openchat-3.5-0106.jinja | generic tool calls |
+  | teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
+
+  This table can be generated with:
+
+  ```bash
+  ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+
+  </details>
+
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+  - Use `--chat-template-file` to override the template when appropriate (see examples below)
+  - Generic support may consume more tokens and be less efficient than a model's native format.
+
+- Run with:
+
+  ```shell
+  # Native support:
+  llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
+  llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+
+  # Native support requires the right template for these GGUFs:
+
+  llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
+
+  llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
+
+  llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+    --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
+
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+    --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
+
+  # Generic format support
+  llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
+  llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
+  ```
+
+- Test in CLI:
+
+  ```bash
+  curl http://localhost:8080/v1/chat/completions -d '{
+    "model": "gpt-3.5-turbo",
+    "tools": [
+      {
+        "type":"function",
+        "function":{
+          "name":"get_current_weather",
+          "description":"Get the current weather in a given location",
+          "parameters":{
+            "type":"object",
+            "properties":{
+              "location":{
+                "type":"string",
+                "description":"The city and state, e.g. San Francisco, CA"
+              }
+            },
+            "required":["location"]
+          }
+        }
+      }
+    ],
+    "messages": [
+      {
+        "role": "user",
+        "content": "What is the weather like in Istanbul?."
+      }
+    ]
+  }'
+  ```
+
+  <details>
+  <summary>Show output</summary>
+
+  ```json
+  {
+    "choices": [
+      {
+        "finish_reason": "tool",
+        "index": 0,
+        "message": {
+          "content": null,
+          "tool_calls": [
+            {
+              "name": "python",
+              "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
+            }
+          ],
+          "role": "assistant"
+        }
+      }
+    ],
+    "created": 1727287211,
+    "model": "gpt-3.5-turbo",
+    "object": "chat.completion",
+    "usage": {
+      "completion_tokens": 16,
+      "prompt_tokens": 44,
+      "total_tokens": 60
+    },
+    "id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
+  }
+  ```
+
+  </details>
+
+### POST `/v1/embeddings`: OpenAI-compatible embeddings API
+
+This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
+
+*Options:*
+
+See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
+
+*Examples:*
+
+- input as string
+
+  ```shell
+  curl http://localhost:8080/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer no-key" \
+  -d '{
+          "input": "hello",
+          "model":"GPT-4",
+          "encoding_format": "float"
+  }'
+  ```
+
+- `input` as string array
+
+  ```shell
+  curl http://localhost:8080/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer no-key" \
+  -d '{
+          "input": ["hello", "world"],
+          "model":"GPT-4",
+          "encoding_format": "float"
+  }'
+  ```
+
+## More examples
 
 ### Interactive mode
 
@@ -526,13 +1350,65 @@ Run with bash:
 bash chat.sh
 ```
 
-### API like OAI
+### OAI-like API
 
-The HTTP server supports OAI-like API
+The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
+
+### API errors
+
+`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+
+Example of an error:
+
+```json
+{
+    "error": {
+        "code": 401,
+        "message": "Invalid API Key",
+        "type": "authentication_error"
+    }
+}
+```
+
+Apart from error types supported by OAI, we also have custom types that are specific to functionalities of llama.cpp:
+
+**When /metrics or /slots endpoint is disabled**
+
+```json
+{
+    "error": {
+        "code": 501,
+        "message": "This server does not support metrics endpoint.",
+        "type": "not_supported_error"
+    }
+}
+```
+
+**When the server receives invalid grammar via */completions endpoint**
+
+```json
+{
+    "error": {
+        "code": 400,
+        "message": "Failed to parse grammar",
+        "type": "invalid_request_error"
+    }
+}
+```
+
+### Legacy completion web UI
+
+A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
+
+For example:
+
+```sh
+./llama-server -m my_model.gguf -c 8192 --path ./examples/server/public_legacy
+```
 
 ### Extending or building alternative Web Front End
 
-The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
+You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
 
 Read the documentation in `/completion.js` to see convenient ways to access llama.
 
diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
new file mode 100644
index 000000000..9549795ec
--- /dev/null
+++ b/examples/server/bench/README.md
@@ -0,0 +1,119 @@
+### Server benchmark tools
+
+Benchmark is using [k6](https://k6.io/).
+
+##### Install k6 and sse extension
+
+SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
+
+Example (assuming golang >= 1.21 is installed):
+```shell
+go install go.k6.io/xk6/cmd/xk6@latest
+$GOPATH/bin/xk6 build master \
+--with github.com/phymbert/xk6-sse
+```
+
+#### Download a dataset
+
+This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
+
+```shell
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+#### Download a model
+Example for PHI-2
+
+```shell
+../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
+```
+
+#### Start the server
+The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`.
+
+Example:
+```shell
+llama-server --host localhost --port 8080 \
+  --model ggml-model-q4_0.gguf \
+  --cont-batching \
+  --metrics \
+  --parallel 8 \
+  --batch-size 512 \
+  --ctx-size 4096 \
+  -ngl 33
+```
+
+#### Run the benchmark
+
+For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
+```shell
+./k6 run script.js --duration 10m --iterations 500 --vus 8
+```
+
+The benchmark values can be overridden with:
+- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
+- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
+- `SERVER_BENCH_DATASET` path to the benchmark dataset file
+- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens to filter out in the dataset: default `1024`
+- `SERVER_BENCH_MAX_CONTEXT` maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens, default `2048`
+
+Note: the local tokenizer is just a string space split, real number of tokens will differ.
+
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```
+
+To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`.
+
+#### Metrics
+
+Following metrics are available computed from the OAI chat completions response `usage`:
+- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration`
+- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens`
+- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens`
+- `llamacpp_completion_tokens` Trend of `usage.completion_tokens`
+- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens`
+- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
+- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'`
+
+The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`.
+
+K6 metrics might be compared against [server metrics](../README.md), with:
+
+```shell
+curl http://localhost:8080/metrics
+```
+
+### Using the CI python script
+The `bench.py` script does several steps:
+- start the server
+- define good variable for k6
+- run k6 script
+- extract metrics from prometheus
+
+It aims to be used in the CI, but you can run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
+              --runner-label local \
+              --name local \
+              --branch `git rev-parse --abbrev-ref HEAD` \
+              --commit `git rev-parse HEAD` \
+              --scenario script.js \
+              --duration 5m \
+              --hf-repo ggml-org/models	 \
+              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --model-path-prefix models \
+              --parallel 4 \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size	256 \
+              --ctx-size 4096 \
+              --n-prompts 200 \
+              --max-prompt-tokens 256 \
+              --max-tokens 256
+```
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
new file mode 100644
index 000000000..5cc6f92ab
--- /dev/null
+++ b/examples/server/bench/bench.py
@@ -0,0 +1,323 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import signal
+import socket
+import subprocess
+import sys
+import threading
+import time
+import traceback
+from contextlib import closing
+from datetime import datetime
+
+import matplotlib
+import matplotlib.dates
+import matplotlib.pyplot as plt
+import requests
+from statistics import mean
+
+
+def main(args_in: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
+    parser.add_argument("--name", type=str, help="Bench name", required=True)
+    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
+    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
+    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
+    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, help="Server listen host", default="8080")
+    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
+    parser.add_argument("--n-prompts", type=int,
+                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
+    parser.add_argument("--max-prompt-tokens", type=int,
+                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
+                        required=True)
+    parser.add_argument("--max-tokens", type=int,
+                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
+                        required=True)
+    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
+    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
+    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
+    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
+    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
+    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
+    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
+    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
+    parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
+
+    args = parser.parse_args(args_in)
+
+    start_time = time.time()
+
+    # Start the server and performance scenario
+    try:
+        server_process = start_server(args)
+    except Exception:
+        print("bench: server start error :")
+        traceback.print_exc(file=sys.stdout)
+        sys.exit(1)
+
+    # start the benchmark
+    iterations = 0
+    data = {}
+    try:
+        start_benchmark(args)
+
+        with open("results.github.env", 'w') as github_env:
+            # parse output
+            with open('k6-results.json', 'r') as bench_results:
+                # Load JSON data from file
+                data = json.load(bench_results)
+                for metric_name in data['metrics']:
+                    for metric_metric in data['metrics'][metric_name]:
+                        value = data['metrics'][metric_name][metric_metric]
+                        if isinstance(value, float) or isinstance(value, int):
+                            value = round(value, 2)
+                            data['metrics'][metric_name][metric_metric]=value
+                            github_env.write(
+                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
+                iterations = data['root_group']['checks']['success completion']['passes']
+
+    except Exception:
+        print("bench: error :")
+        traceback.print_exc(file=sys.stdout)
+
+    # Stop the server
+    if server_process:
+        try:
+            print(f"bench: shutting down server pid={server_process.pid} ...")
+            if os.name == 'nt':
+                interrupt = signal.CTRL_C_EVENT
+            else:
+                interrupt = signal.SIGINT
+            server_process.send_signal(interrupt)
+            server_process.wait(0.5)
+
+        except subprocess.TimeoutExpired:
+            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
+            server_process.kill()  # SIGKILL
+            server_process.wait()
+
+        while is_server_listening(args.host, args.port):
+            time.sleep(0.1)
+
+    title = (f"llama.cpp {args.name} on {args.runner_label}\n "
+             f"duration={args.duration} {iterations} iterations")
+    xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
+              f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
+              f"branch={args.branch} commit={args.commit}")
+
+    # Prometheus
+    end_time = time.time()
+    prometheus_metrics = {}
+    if is_server_listening("0.0.0.0", 9090):
+        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
+                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
+
+        for metric in metrics:
+            resp = requests.get(f"http://localhost:9090/api/v1/query_range",
+                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
+
+            with open(f"{metric}.json", 'w') as metric_json:
+                metric_json.write(resp.text)
+
+            if resp.status_code != 200:
+                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
+            else:
+                metric_data = resp.json()
+                values = metric_data['data']['result'][0]['values']
+                timestamps, metric_values = zip(*values)
+                metric_values = [float(value) for value in metric_values]
+                prometheus_metrics[metric] = metric_values
+                timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
+                plt.figure(figsize=(16, 10), dpi=80)
+                plt.plot(timestamps_dt, metric_values, label=metric)
+                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
+                plt.yticks(fontsize=12, alpha=.7)
+
+                ylabel = f"llamacpp:{metric}"
+                plt.title(title,
+                          fontsize=14, wrap=True)
+                plt.grid(axis='both', alpha=.3)
+                plt.ylabel(ylabel, fontsize=22)
+                plt.xlabel(xlabel, fontsize=14, wrap=True)
+                plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
+                plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
+                plt.gcf().autofmt_xdate()
+
+                # Remove borders
+                plt.gca().spines["top"].set_alpha(0.0)
+                plt.gca().spines["bottom"].set_alpha(0.3)
+                plt.gca().spines["right"].set_alpha(0.0)
+                plt.gca().spines["left"].set_alpha(0.3)
+
+                # Save the plot as a jpg image
+                plt.savefig(f'{metric}.jpg', dpi=60)
+                plt.close()
+
+                # Mermaid format in case images upload failed
+                with open(f"{metric}.mermaid", 'w') as mermaid_f:
+                    mermaid = (
+                    f"""---
+config:
+    xyChart:
+        titleFontSize: 12
+        width: 900
+        height: 600
+    themeVariables:
+        xyChart:
+            titleColor: "#000000"
+---
+xychart-beta
+    title "{title}"
+    y-axis "llamacpp:{metric}"
+    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
+    line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
+                    """)
+                    mermaid_f.write(mermaid)
+
+    # 140 chars max for commit status description
+    bench_results = {
+        "i": iterations,
+        "req": {
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
+            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
+        },
+        "pp": {
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
+        },
+        "tg": {
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
+        },
+    }
+    with open("results.github.env", 'a') as github_env:
+        github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
+        github_env.write(f"BENCH_ITERATIONS={iterations}\n")
+
+        title = title.replace('\n', ' ')
+        xlabel = xlabel.replace('\n', ' ')
+        github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
+        github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
+
+
+def start_benchmark(args):
+    k6_path = './k6'
+    if 'BENCH_K6_BIN_PATH' in os.environ:
+        k6_path = os.environ['BENCH_K6_BIN_PATH']
+    k6_args = [
+        'run', args.scenario,
+        '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
+    ]
+    k6_args.extend(['--duration', args.duration])
+    k6_args.extend(['--iterations', args.n_prompts])
+    k6_args.extend(['--vus', args.parallel])
+    k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
+    args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
+    args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
+    print(f"bench: starting k6 with: {args}")
+    k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
+    if k6_completed.returncode != 0:
+        raise Exception("bench: unable to run k6")
+
+
+def start_server(args):
+    server_process = start_server_background(args)
+
+    attempts = 0
+    max_attempts = 600
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
+    while not is_server_listening(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not started"
+        print(f"bench:     waiting for server to start ...")
+        time.sleep(0.5)
+
+    attempts = 0
+    while not is_server_ready(args.host, args.port):
+        attempts += 1
+        if attempts > max_attempts:
+            assert False, "server not ready"
+        print(f"bench:     waiting for server to be ready ...")
+        time.sleep(0.5)
+
+    print("bench: server started and ready.")
+    return server_process
+
+
+def start_server_background(args):
+    # Start the server
+    server_path = '../../../build/bin/llama-server'
+    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
+        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+    server_args = [
+        '--host', args.host,
+        '--port', args.port,
+    ]
+    server_args.extend(['--hf-repo', args.hf_repo])
+    server_args.extend(['--hf-file', args.hf_file])
+    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
+    server_args.extend(['--ctx-size', args.ctx_size])
+    server_args.extend(['--parallel', args.parallel])
+    server_args.extend(['--batch-size', args.batch_size])
+    server_args.extend(['--ubatch-size', args.ubatch_size])
+    server_args.extend(['--n-predict', args.max_tokens * 2])
+    server_args.extend(['--defrag-thold', "0.1"])
+    server_args.append('--cont-batching')
+    server_args.append('--metrics')
+    server_args.append('--flash-attn')
+    args = [str(arg) for arg in [server_path, *server_args]]
+    print(f"bench: starting server with: {' '.join(args)}")
+    pkwargs = {
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
+    }
+    server_process = subprocess.Popen(
+        args,
+        **pkwargs)  # pyright: ignore[reportArgumentType, reportCallIssue]
+
+    def server_log(in_stream, out_stream):
+        for line in iter(in_stream.readline, b''):
+            print(line.decode('utf-8'), end='', file=out_stream)
+
+    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
+    thread_stdout.start()
+    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
+    thread_stderr.start()
+
+    return server_process
+
+
+def is_server_listening(server_fqdn, server_port):
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        result = sock.connect_ex((server_fqdn, server_port))
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...")
+        return _is_server_listening
+
+
+def is_server_ready(server_fqdn, server_port):
+    url = f"http://{server_fqdn}:{server_port}/health"
+    response = requests.get(url)
+    return response.status_code == 200
+
+
+def escape_metric_name(metric_name):
+    return re.sub('[^A-Z0-9]', '_', metric_name.upper())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/server/bench/prometheus.yml b/examples/server/bench/prometheus.yml
new file mode 100644
index 000000000..b15ee5244
--- /dev/null
+++ b/examples/server/bench/prometheus.yml
@@ -0,0 +1,9 @@
+global:
+  scrape_interval:     10s
+  external_labels:
+    llamacpp: 'server'
+
+scrape_configs:
+  - job_name: 'llama.cpp server'
+    static_configs:
+      - targets: ['localhost:8080']
diff --git a/examples/server/bench/requirements.txt b/examples/server/bench/requirements.txt
new file mode 100644
index 000000000..66ed226ed
--- /dev/null
+++ b/examples/server/bench/requirements.txt
@@ -0,0 +1,2 @@
+matplotlib
+requests
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
new file mode 100644
index 000000000..2772bee5e
--- /dev/null
+++ b/examples/server/bench/script.js
@@ -0,0 +1,162 @@
+import sse from 'k6/x/sse'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Rate, Trend} from 'k6/metrics'
+import exec from 'k6/execution';
+
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
+// Max prompt tokens
+const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024
+
+// Max slot context
+const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
+
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
+}
+
+const data = new SharedArray('conversations', function () {
+    const tokenizer = (message) => message.split(/[\s,'".?]/)
+
+    return JSON.parse(open(dataset_path))
+        // Filter out the conversations with less than 2 turns.
+        .filter(data => data["conversations"].length >= 2)
+        .filter(data => data["conversations"][0]["from"] === "human")
+        .map(data => {
+            return {
+                prompt: data["conversations"][0]["value"],
+                n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
+                n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
+            }
+        })
+        // Filter out too short sequences
+        .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
+        // Filter out too long sequences.
+        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
+        // Keep only first n prompts
+        .slice(0, n_prompt)
+})
+
+const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
+const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
+
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
+
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
+
+export const options = {
+    thresholds: {
+        llamacpp_completions_truncated_rate: [
+            // more than 80% of truncated input will abort the test
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
+        ],
+    },
+    duration: '10m',
+    vus: 8,
+}
+
+export default function () {
+    const conversation = data[exec.scenario.iterationInInstance % data.length]
+    const payload = {
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are ChatGPT, an AI assistant.",
+            },
+            {
+                "role": "user",
+                "content": conversation.prompt,
+            }
+        ],
+        "model": model,
+        "stream": true,
+        "stream_options": {
+          "include_usage": true, // False to be supported in llama.cpp server
+        },
+        "seed": 42,
+        "max_tokens": max_tokens,
+        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
+    }
+
+    const params = {method: 'POST', body: JSON.stringify(payload)};
+
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+            }
+
+            if (event.data === '[DONE]' || event.data === '') {
+                return
+            }
+
+            let chunk = JSON.parse(event.data)
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
+    })
+
+    check(res, {'success completion': (r) => r.status === 200})
+
+    const endTime = new Date()
+
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
+    }
+
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
+    sleep(0.3)
+}
diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs
index 219ebb51a..4fef5655a 100644
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,7 +1,7 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
 import { readFileSync } from 'node:fs'
-import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+import { SchemaConverter }  from './public_legacy/json-schema-to-grammar.mjs'
 
 const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
@@ -26,8 +26,9 @@ const propOrder = grammarJsonSchemaPropOrder
 
 let grammar = null
 if (grammarJsonSchemaFile) {
-    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
-    const converter = new SchemaConverter(propOrder)
+    let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true})
+    schema = await converter.resolveRefs(schema, grammarJsonSchemaFile)
     converter.visit(schema, '')
     grammar = converter.formatGrammar()
 }
diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp
deleted file mode 100644
index f5e696e17..000000000
--- a/examples/server/completion.js.hpp
+++ /dev/null
@@ -1,485 +0,0 @@
-unsigned char completion_js[] = {
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72,
-  0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
-  0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
-  0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
-  0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a,
-  0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65,
-  0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
-  0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65,
-  0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73,
-  0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
-  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27,
-  0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e,
-  0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22,
-  0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f,
-  0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
-  0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f,
-  0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
-  0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
-  0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63,
-  0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b,
-  0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20,
-  0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65,
-  0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c,
-  0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f,
-  0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27,
-  0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62,
-  0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
-  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65,
-  0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27,
-  0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76,
-  0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70, 0x65, 0x27,
-  0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x27,
-  0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70, 0x61, 0x72,
-  0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x20,
-  0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72, 0x69, 0x7a,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42, 0x65, 0x61,
-  0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60, 0x7d, 0x20,
-  0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x3a,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e,
-  0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20, 0x7d, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
-  0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x70,
-  0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e, 0x67, 0x65,
-  0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64,
-  0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54, 0x65, 0x78,
-  0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a,
-  0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42, 0x75, 0x66,
-  0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, 0x72, 0x74,
-  0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65,
-  0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
-  0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x41,
-  0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
-  0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74, 0x6f, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x65, 0x66,
-  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65, 0x63, 0x6f,
-  0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63,
-  0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x73,
-  0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72, 0x65, 0x61,
-  0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65, 0x6e, 0x64,
-  0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53,
-  0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78,
-  0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
-  0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x49,
-  0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x64,
-  0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77,
-  0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62,
-  0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65,
-  0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
-  0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20,
-  0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x20,
-  0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x65, 0x6e,
-  0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72,
-  0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65,
-  0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d,
-  0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65, 0x73, 0x65,
-  0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69,
-  0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x61, 0x20,
-  0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x61,
-  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c,
-  0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73,
-  0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74, 0x68, 0x65,
-  0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c,
-  0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f,
-  0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
-  0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x3d, 0x20,
-  0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63, 0x28, 0x6c,
-  0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
-  0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
-  0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
-  0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f,
-  0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79, 0x69,
-  0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, 0x66,
-  0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73, 0x74,
-  0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72, 0x6f,
-  0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77, 0x65,
-  0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20,
-  0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f,
-  0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x66, 0x61,
-  0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
-  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d,
-  0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
-  0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
-  0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61,
-  0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62,
-  0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20,
-  0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c,
-  0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77,
-  0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x27,
-  0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c,
-  0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c,
-  0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
-  0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72,
-  0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61,
-  0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
-  0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
-  0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65,
-  0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45,
-  0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20,
-  0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
-  0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
-  0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f,
-  0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72,
-  0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
-  0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
-  0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
-  0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
-  0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
-  0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
-  0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
-  0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
-  0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
-  0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
-  0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
-  0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
-  0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
-  0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
-  0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
-  0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
-  0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
-  0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
-  0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
-  0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
-  0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
-  0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
-  0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
-  0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
-  0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
-  0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
-  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
-  0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
-  0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
-  0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
-  0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
-  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
-  0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
-  0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
-  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
-  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
-  0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
-  0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
-  0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
-  0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
-  0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
-  0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
-  0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
-  0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
-  0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
-  0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
-  0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
-  0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
-  0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
-  0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
-  0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
-  0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
-  0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
-  0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
-  0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
-  0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
-  0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20,
-  0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63,
-  0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, 0x29, 0x2e,
-  0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e,
-  0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
-  0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
-};
-unsigned int completion_js_len = 5782;
diff --git a/examples/server/deps.sh b/examples/server/deps.sh
deleted file mode 100755
index ea23e6450..000000000
--- a/examples/server/deps.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-# Download and update deps for binary
-
-# get the directory of this script file
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-PUBLIC=$DIR/public
-
-echo "download js bundle files"
-curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
-echo >> $PUBLIC/index.js # add newline
-
-FILES=$(ls $PUBLIC)
-
-cd $PUBLIC
-for FILE in $FILES; do
-  echo "generate $FILE.hpp"
-
-  # use simple flag for old version of xxd
-  xxd -i $FILE > $DIR/$FILE.hpp
-done
diff --git a/examples/server/httplib.h b/examples/server/httplib.h
index 28746000c..c2f12dd2a 100644
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
@@ -1,14 +1,14 @@
 //
 //  httplib.h
 //
-//  Copyright (c) 2023 Yuji Hirose. All rights reserved.
+//  Copyright (c) 2024 Yuji Hirose. All rights reserved.
 //  MIT License
 //
 
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H
 
-#define CPPHTTPLIB_VERSION "0.12.2"
+#define CPPHTTPLIB_VERSION "0.18.5"
 
 /*
  * Configuration
@@ -18,8 +18,12 @@
 #define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5
 #endif
 
+#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND
+#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND 10000
+#endif
+
 #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
-#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5
+#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 100
 #endif
 
 #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
@@ -30,20 +34,36 @@
 #define CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND 0
 #endif
 
-#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND
-#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5
+#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND
+#define CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND 5
 #endif
 
-#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND
-#define CPPHTTPLIB_READ_TIMEOUT_USECOND 0
+#ifndef CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND
+#define CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND 0
 #endif
 
-#ifndef CPPHTTPLIB_WRITE_TIMEOUT_SECOND
-#define CPPHTTPLIB_WRITE_TIMEOUT_SECOND 5
+#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND
+#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND 5
 #endif
 
-#ifndef CPPHTTPLIB_WRITE_TIMEOUT_USECOND
-#define CPPHTTPLIB_WRITE_TIMEOUT_USECOND 0
+#ifndef CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND
+#define CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND 0
+#endif
+
+#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND
+#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND 300
+#endif
+
+#ifndef CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND
+#define CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND 0
+#endif
+
+#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND
+#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND 5
+#endif
+
+#ifndef CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND
+#define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND 0
 #endif
 
 #ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
@@ -82,12 +102,20 @@
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
 #endif
 
+#ifndef CPPHTTPLIB_RANGE_MAX_COUNT
+#define CPPHTTPLIB_RANGE_MAX_COUNT 1024
+#endif
+
 #ifndef CPPHTTPLIB_TCP_NODELAY
 #define CPPHTTPLIB_TCP_NODELAY false
 #endif
 
+#ifndef CPPHTTPLIB_IPV6_V6ONLY
+#define CPPHTTPLIB_IPV6_V6ONLY false
+#endif
+
 #ifndef CPPHTTPLIB_RECV_BUFSIZ
-#define CPPHTTPLIB_RECV_BUFSIZ size_t(4096u)
+#define CPPHTTPLIB_RECV_BUFSIZ size_t(16384u)
 #endif
 
 #ifndef CPPHTTPLIB_COMPRESSION_BUFSIZ
@@ -141,11 +169,11 @@ using ssize_t = long;
 #endif // _MSC_VER
 
 #ifndef S_ISREG
-#define S_ISREG(m) (((m)&S_IFREG) == S_IFREG)
+#define S_ISREG(m) (((m) & S_IFREG) == S_IFREG)
 #endif // S_ISREG
 
 #ifndef S_ISDIR
-#define S_ISDIR(m) (((m)&S_IFDIR) == S_IFDIR)
+#define S_ISDIR(m) (((m) & S_IFDIR) == S_IFDIR)
 #endif // S_ISDIR
 
 #ifndef NOMINMAX
@@ -160,10 +188,6 @@ using ssize_t = long;
 #define WSA_FLAG_NO_HANDLE_INHERIT 0x80
 #endif
 
-#ifndef strcasecmp
-#define strcasecmp _stricmp
-#endif // strcasecmp
-
 using socket_t = SOCKET;
 #ifdef CPPHTTPLIB_USE_POLL
 #define poll(fds, nfds, timeout) WSAPoll(fds, nfds, timeout)
@@ -172,9 +196,15 @@ using socket_t = SOCKET;
 #else // not _WIN32
 
 #include <arpa/inet.h>
-#ifndef _AIX
+#if !defined(_AIX) && !defined(__MVS__)
 #include <ifaddrs.h>
 #endif
+#ifdef __MVS__
+#include <strings.h>
+#ifndef NI_MAXHOST
+#define NI_MAXHOST 1025
+#endif
+#endif
 #include <net/if.h>
 #include <netdb.h>
 #include <netinet/in.h>
@@ -187,6 +217,7 @@ using socket_t = SOCKET;
 #endif
 #include <csignal>
 #include <pthread.h>
+#include <sys/mman.h>
 #include <sys/select.h>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -207,6 +238,7 @@ using socket_t = int;
 #include <condition_variable>
 #include <cstring>
 #include <errno.h>
+#include <exception>
 #include <fcntl.h>
 #include <fstream>
 #include <functional>
@@ -223,6 +255,9 @@ using socket_t = int;
 #include <string>
 #include <sys/stat.h>
 #include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 #ifdef _WIN32
@@ -237,7 +272,6 @@ using socket_t = int;
 
 #ifdef _MSC_VER
 #pragma comment(lib, "crypt32.lib")
-#pragma comment(lib, "cryptui.lib")
 #endif
 #elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__)
 #include <TargetConditionals.h>
@@ -259,10 +293,13 @@ using socket_t = int;
 #include <iostream>
 #include <sstream>
 
-#if OPENSSL_VERSION_NUMBER < 0x1010100fL
-#error Sorry, OpenSSL versions prior to 1.1.1 are not supported
-#elif OPENSSL_VERSION_NUMBER < 0x30000000L
+#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
+#if OPENSSL_VERSION_NUMBER < 0x1010107f
+#error Please use OpenSSL or a current version of BoringSSL
+#endif
 #define SSL_get1_peer_certificate SSL_get_peer_certificate
+#elif OPENSSL_VERSION_NUMBER < 0x30000000L
+#error Sorry, OpenSSL versions prior to 3.0.0 are not supported
 #endif
 
 #endif
@@ -304,16 +341,63 @@ make_unique(std::size_t n) {
   return std::unique_ptr<T>(new RT[n]);
 }
 
-struct ci {
-  bool operator()(const std::string &s1, const std::string &s2) const {
-    return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(),
-                                        s2.end(),
-                                        [](unsigned char c1, unsigned char c2) {
-                                          return ::tolower(c1) < ::tolower(c2);
-                                        });
+namespace case_ignore {
+
+inline unsigned char to_lower(int c) {
+  const static unsigned char table[256] = {
+      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+      15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+      30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+      45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+      60,  61,  62,  63,  64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106,
+      107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+      122, 91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+      105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+      120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+      135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+      150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+      165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+      180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 224, 225, 226,
+      227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+      242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, 224,
+      225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+      240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+      255,
+  };
+  return table[(unsigned char)(char)c];
+}
+
+inline bool equal(const std::string &a, const std::string &b) {
+  return a.size() == b.size() &&
+         std::equal(a.begin(), a.end(), b.begin(), [](char ca, char cb) {
+           return to_lower(ca) == to_lower(cb);
+         });
+}
+
+struct equal_to {
+  bool operator()(const std::string &a, const std::string &b) const {
+    return equal(a, b);
   }
 };
 
+struct hash {
+  size_t operator()(const std::string &key) const {
+    return hash_core(key.data(), key.size(), 0);
+  }
+
+  size_t hash_core(const char *s, size_t l, size_t h) const {
+    return (l == 0) ? h
+                    : hash_core(s + 1, l - 1,
+                                // Unsets the 6 high bits of h, therefore no
+                                // overflow happens
+                                (((std::numeric_limits<size_t>::max)() >> 6) &
+                                 h * 33) ^
+                                    static_cast<unsigned char>(to_lower(*s)));
+  }
+};
+
+} // namespace case_ignore
+
 // This is based on
 // "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189".
 
@@ -321,7 +405,7 @@ struct scope_exit {
   explicit scope_exit(std::function<void(void)> &&f)
       : exit_function(std::move(f)), execute_on_destruction{true} {}
 
-  scope_exit(scope_exit &&rhs)
+  scope_exit(scope_exit &&rhs) noexcept
       : exit_function(std::move(rhs.exit_function)),
         execute_on_destruction{rhs.execute_on_destruction} {
     rhs.release();
@@ -344,7 +428,84 @@ private:
 
 } // namespace detail
 
-using Headers = std::multimap<std::string, std::string, detail::ci>;
+enum StatusCode {
+  // Information responses
+  Continue_100 = 100,
+  SwitchingProtocol_101 = 101,
+  Processing_102 = 102,
+  EarlyHints_103 = 103,
+
+  // Successful responses
+  OK_200 = 200,
+  Created_201 = 201,
+  Accepted_202 = 202,
+  NonAuthoritativeInformation_203 = 203,
+  NoContent_204 = 204,
+  ResetContent_205 = 205,
+  PartialContent_206 = 206,
+  MultiStatus_207 = 207,
+  AlreadyReported_208 = 208,
+  IMUsed_226 = 226,
+
+  // Redirection messages
+  MultipleChoices_300 = 300,
+  MovedPermanently_301 = 301,
+  Found_302 = 302,
+  SeeOther_303 = 303,
+  NotModified_304 = 304,
+  UseProxy_305 = 305,
+  unused_306 = 306,
+  TemporaryRedirect_307 = 307,
+  PermanentRedirect_308 = 308,
+
+  // Client error responses
+  BadRequest_400 = 400,
+  Unauthorized_401 = 401,
+  PaymentRequired_402 = 402,
+  Forbidden_403 = 403,
+  NotFound_404 = 404,
+  MethodNotAllowed_405 = 405,
+  NotAcceptable_406 = 406,
+  ProxyAuthenticationRequired_407 = 407,
+  RequestTimeout_408 = 408,
+  Conflict_409 = 409,
+  Gone_410 = 410,
+  LengthRequired_411 = 411,
+  PreconditionFailed_412 = 412,
+  PayloadTooLarge_413 = 413,
+  UriTooLong_414 = 414,
+  UnsupportedMediaType_415 = 415,
+  RangeNotSatisfiable_416 = 416,
+  ExpectationFailed_417 = 417,
+  ImATeapot_418 = 418,
+  MisdirectedRequest_421 = 421,
+  UnprocessableContent_422 = 422,
+  Locked_423 = 423,
+  FailedDependency_424 = 424,
+  TooEarly_425 = 425,
+  UpgradeRequired_426 = 426,
+  PreconditionRequired_428 = 428,
+  TooManyRequests_429 = 429,
+  RequestHeaderFieldsTooLarge_431 = 431,
+  UnavailableForLegalReasons_451 = 451,
+
+  // Server error responses
+  InternalServerError_500 = 500,
+  NotImplemented_501 = 501,
+  BadGateway_502 = 502,
+  ServiceUnavailable_503 = 503,
+  GatewayTimeout_504 = 504,
+  HttpVersionNotSupported_505 = 505,
+  VariantAlsoNegotiates_506 = 506,
+  InsufficientStorage_507 = 507,
+  LoopDetected_508 = 508,
+  NotExtended_510 = 510,
+  NetworkAuthenticationRequired_511 = 511,
+};
+
+using Headers =
+    std::unordered_multimap<std::string, std::string, detail::case_ignore::hash,
+                            detail::case_ignore::equal_to>;
 
 using Params = std::multimap<std::string, std::string>;
 using Match = std::smatch;
@@ -373,17 +534,18 @@ public:
   DataSink &operator=(DataSink &&) = delete;
 
   std::function<bool(const char *data, size_t data_len)> write;
+  std::function<bool()> is_writable;
   std::function<void()> done;
   std::function<void(const Headers &trailer)> done_with_trailer;
   std::ostream os;
 
 private:
-  class data_sink_streambuf : public std::streambuf {
+  class data_sink_streambuf final : public std::streambuf {
   public:
     explicit data_sink_streambuf(DataSink &sink) : sink_(sink) {}
 
   protected:
-    std::streamsize xsputn(const char *s, std::streamsize n) {
+    std::streamsize xsputn(const char *s, std::streamsize n) override {
       sink_.write(s, static_cast<size_t>(n));
       return n;
     }
@@ -450,6 +612,7 @@ using Ranges = std::vector<Range>;
 struct Request {
   std::string method;
   std::string path;
+  Params params;
   Headers headers;
   std::string body;
 
@@ -461,10 +624,11 @@ struct Request {
   // for server
   std::string version;
   std::string target;
-  Params params;
   MultipartFormDataMap files;
   Ranges ranges;
   Match matches;
+  std::unordered_map<std::string, std::string> path_params;
+  std::function<bool()> is_connection_closed = []() { return true; };
 
   // for client
   ResponseHandler response_handler;
@@ -475,9 +639,10 @@ struct Request {
 #endif
 
   bool has_header(const std::string &key) const;
-  std::string get_header_value(const std::string &key, size_t id = 0) const;
-  template <typename T>
-  T get_header_value(const std::string &key, size_t id = 0) const;
+  std::string get_header_value(const std::string &key, const char *def = "",
+                               size_t id = 0) const;
+  uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0,
+                                size_t id = 0) const;
   size_t get_header_value_count(const std::string &key) const;
   void set_header(const std::string &key, const std::string &val);
 
@@ -508,15 +673,17 @@ struct Response {
   std::string location; // Redirect location
 
   bool has_header(const std::string &key) const;
-  std::string get_header_value(const std::string &key, size_t id = 0) const;
-  template <typename T>
-  T get_header_value(const std::string &key, size_t id = 0) const;
+  std::string get_header_value(const std::string &key, const char *def = "",
+                               size_t id = 0) const;
+  uint64_t get_header_value_u64(const std::string &key, uint64_t def = 0,
+                                size_t id = 0) const;
   size_t get_header_value_count(const std::string &key) const;
   void set_header(const std::string &key, const std::string &val);
 
-  void set_redirect(const std::string &url, int status = 302);
+  void set_redirect(const std::string &url, int status = StatusCode::Found_302);
   void set_content(const char *s, size_t n, const std::string &content_type);
   void set_content(const std::string &s, const std::string &content_type);
+  void set_content(std::string &&s, const std::string &content_type);
 
   void set_content_provider(
       size_t length, const std::string &content_type, ContentProvider provider,
@@ -530,6 +697,10 @@ struct Response {
       const std::string &content_type, ContentProviderWithoutLength provider,
       ContentProviderResourceReleaser resource_releaser = nullptr);
 
+  void set_file_content(const std::string &path,
+                        const std::string &content_type);
+  void set_file_content(const std::string &path);
+
   Response() = default;
   Response(const Response &) = default;
   Response &operator=(const Response &) = default;
@@ -547,6 +718,8 @@ struct Response {
   ContentProviderResourceReleaser content_provider_resource_releaser_;
   bool is_chunked_content_provider_ = false;
   bool content_provider_success_ = false;
+  std::string file_content_path_;
+  std::string file_content_content_type_;
 };
 
 class Stream {
@@ -562,8 +735,6 @@ public:
   virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0;
   virtual socket_t socket() const = 0;
 
-  template <typename... Args>
-  ssize_t write_format(const char *fmt, const Args &...args);
   ssize_t write(const char *ptr);
   ssize_t write(const std::string &s);
 };
@@ -573,15 +744,16 @@ public:
   TaskQueue() = default;
   virtual ~TaskQueue() = default;
 
-  virtual void enqueue(std::function<void()> fn) = 0;
+  virtual bool enqueue(std::function<void()> fn) = 0;
   virtual void shutdown() = 0;
 
   virtual void on_idle() {}
 };
 
-class ThreadPool : public TaskQueue {
+class ThreadPool final : public TaskQueue {
 public:
-  explicit ThreadPool(size_t n) : shutdown_(false) {
+  explicit ThreadPool(size_t n, size_t mqr = 0)
+      : shutdown_(false), max_queued_requests_(mqr) {
     while (n) {
       threads_.emplace_back(worker(*this));
       n--;
@@ -591,13 +763,17 @@ public:
   ThreadPool(const ThreadPool &) = delete;
   ~ThreadPool() override = default;
 
-  void enqueue(std::function<void()> fn) override {
+  bool enqueue(std::function<void()> fn) override {
     {
       std::unique_lock<std::mutex> lock(mutex_);
+      if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) {
+        return false;
+      }
       jobs_.push_back(std::move(fn));
     }
 
     cond_.notify_one();
+    return true;
   }
 
   void shutdown() override {
@@ -630,13 +806,18 @@ private:
 
           if (pool_.shutdown_ && pool_.jobs_.empty()) { break; }
 
-          fn = std::move(pool_.jobs_.front());
+          fn = pool_.jobs_.front();
           pool_.jobs_.pop_front();
         }
 
         assert(true == static_cast<bool>(fn));
         fn();
       }
+
+#if defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(OPENSSL_IS_BORINGSSL) &&   \
+    !defined(LIBRESSL_VERSION_NUMBER)
+      OPENSSL_thread_stop();
+#endif
     }
 
     ThreadPool &pool_;
@@ -647,6 +828,7 @@ private:
   std::list<std::function<void()>> jobs_;
 
   bool shutdown_;
+  size_t max_queued_requests_ = 0;
 
   std::condition_variable cond_;
   std::mutex mutex_;
@@ -658,6 +840,81 @@ using SocketOptions = std::function<void(socket_t sock)>;
 
 void default_socket_options(socket_t sock);
 
+const char *status_message(int status);
+
+std::string get_bearer_token_auth(const Request &req);
+
+namespace detail {
+
+class MatcherBase {
+public:
+  virtual ~MatcherBase() = default;
+
+  // Match request path and populate its matches and
+  virtual bool match(Request &request) const = 0;
+};
+
+/**
+ * Captures parameters in request path and stores them in Request::path_params
+ *
+ * Capture name is a substring of a pattern from : to /.
+ * The rest of the pattern is matched agains the request path directly
+ * Parameters are captured starting from the next character after
+ * the end of the last matched static pattern fragment until the next /.
+ *
+ * Example pattern:
+ * "/path/fragments/:capture/more/fragments/:second_capture"
+ * Static fragments:
+ * "/path/fragments/", "more/fragments/"
+ *
+ * Given the following request path:
+ * "/path/fragments/:1/more/fragments/:2"
+ * the resulting capture will be
+ * {{"capture", "1"}, {"second_capture", "2"}}
+ */
+class PathParamsMatcher final : public MatcherBase {
+public:
+  PathParamsMatcher(const std::string &pattern);
+
+  bool match(Request &request) const override;
+
+private:
+  // Treat segment separators as the end of path parameter capture
+  // Does not need to handle query parameters as they are parsed before path
+  // matching
+  static constexpr char separator = '/';
+
+  // Contains static path fragments to match against, excluding the '/' after
+  // path params
+  // Fragments are separated by path params
+  std::vector<std::string> static_fragments_;
+  // Stores the names of the path parameters to be used as keys in the
+  // Request::path_params map
+  std::vector<std::string> param_names_;
+};
+
+/**
+ * Performs std::regex_match on request path
+ * and stores the result in Request::matches
+ *
+ * Note that regex match is performed directly on the whole request.
+ * This means that wildcard patterns may match multiple path segments with /:
+ * "/begin/(.*)/end" will match both "/begin/middle/end" and "/begin/1/2/end".
+ */
+class RegexMatcher final : public MatcherBase {
+public:
+  RegexMatcher(const std::string &pattern) : regex_(pattern) {}
+
+  bool match(Request &request) const override;
+
+private:
+  std::regex regex_;
+};
+
+ssize_t write_headers(Stream &strm, const Headers &headers);
+
+} // namespace detail
+
 class Server {
 public:
   using Handler = std::function<void(const Request &, Response &)>;
@@ -702,10 +959,16 @@ public:
   bool remove_mount_point(const std::string &mount_point);
   Server &set_file_extension_and_mimetype_mapping(const std::string &ext,
                                                   const std::string &mime);
+  Server &set_default_file_mimetype(const std::string &mime);
   Server &set_file_request_handler(Handler handler);
 
-  Server &set_error_handler(HandlerWithResponse handler);
-  Server &set_error_handler(Handler handler);
+  template <class ErrorHandlerFunc>
+  Server &set_error_handler(ErrorHandlerFunc &&handler) {
+    return set_error_handler_core(
+        std::forward<ErrorHandlerFunc>(handler),
+        std::is_convertible<ErrorHandlerFunc, HandlerWithResponse>{});
+  }
+
   Server &set_exception_handler(ExceptionHandler handler);
   Server &set_pre_routing_handler(HandlerWithResponse handler);
   Server &set_post_routing_handler(Handler handler);
@@ -715,9 +978,12 @@ public:
 
   Server &set_address_family(int family);
   Server &set_tcp_nodelay(bool on);
+  Server &set_ipv6_v6only(bool on);
   Server &set_socket_options(SocketOptions socket_options);
 
   Server &set_default_headers(Headers headers);
+  Server &
+  set_header_writer(std::function<ssize_t(Stream &, Headers &)> const &writer);
 
   Server &set_keep_alive_max_count(size_t count);
   Server &set_keep_alive_timeout(time_t sec);
@@ -745,29 +1011,40 @@ public:
   bool is_running() const;
   void wait_until_ready() const;
   void stop();
+  void decommission();
 
   std::function<TaskQueue *(void)> new_task_queue;
 
 protected:
-  bool process_request(Stream &strm, bool close_connection,
+  bool process_request(Stream &strm, const std::string &remote_addr,
+                       int remote_port, const std::string &local_addr,
+                       int local_port, bool close_connection,
                        bool &connection_closed,
                        const std::function<void(Request &)> &setup_request);
 
   std::atomic<socket_t> svr_sock_{INVALID_SOCKET};
   size_t keep_alive_max_count_ = CPPHTTPLIB_KEEPALIVE_MAX_COUNT;
   time_t keep_alive_timeout_sec_ = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND;
-  time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND;
-  time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND;
-  time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND;
-  time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND;
+  time_t read_timeout_sec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_SECOND;
+  time_t read_timeout_usec_ = CPPHTTPLIB_SERVER_READ_TIMEOUT_USECOND;
+  time_t write_timeout_sec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_SECOND;
+  time_t write_timeout_usec_ = CPPHTTPLIB_SERVER_WRITE_TIMEOUT_USECOND;
   time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND;
   time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND;
   size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;
 
 private:
-  using Handlers = std::vector<std::pair<std::regex, Handler>>;
+  using Handlers =
+      std::vector<std::pair<std::unique_ptr<detail::MatcherBase>, Handler>>;
   using HandlersForContentReader =
-      std::vector<std::pair<std::regex, HandlerWithContentReader>>;
+      std::vector<std::pair<std::unique_ptr<detail::MatcherBase>,
+                            HandlerWithContentReader>>;
+
+  static std::unique_ptr<detail::MatcherBase>
+  make_matcher(const std::string &pattern);
+
+  Server &set_error_handler_core(HandlerWithResponse handler, std::true_type);
+  Server &set_error_handler_core(Handler handler, std::false_type);
 
   socket_t create_server_socket(const std::string &host, int port,
                                 int socket_flags,
@@ -778,16 +1055,16 @@ private:
   bool routing(Request &req, Response &res, Stream &strm);
   bool handle_file_request(const Request &req, Response &res,
                            bool head = false);
-  bool dispatch_request(Request &req, Response &res, const Handlers &handlers);
-  bool
-  dispatch_request_for_content_reader(Request &req, Response &res,
-                                      ContentReader content_reader,
-                                      const HandlersForContentReader &handlers);
+  bool dispatch_request(Request &req, Response &res,
+                        const Handlers &handlers) const;
+  bool dispatch_request_for_content_reader(
+      Request &req, Response &res, ContentReader content_reader,
+      const HandlersForContentReader &handlers) const;
 
-  bool parse_request_line(const char *s, Request &req);
+  bool parse_request_line(const char *s, Request &req) const;
   void apply_ranges(const Request &req, Response &res,
-                    std::string &content_type, std::string &boundary);
-  bool write_response(Stream &strm, bool close_connection, const Request &req,
+                    std::string &content_type, std::string &boundary) const;
+  bool write_response(Stream &strm, bool close_connection, Request &req,
                       Response &res);
   bool write_response_with_content(Stream &strm, bool close_connection,
                                    const Request &req, Response &res);
@@ -806,21 +1083,23 @@ private:
   bool read_content_core(Stream &strm, Request &req, Response &res,
                          ContentReceiver receiver,
                          MultipartContentHeader multipart_header,
-                         ContentReceiver multipart_receiver);
+                         ContentReceiver multipart_receiver) const;
 
   virtual bool process_and_close_socket(socket_t sock);
 
+  std::atomic<bool> is_running_{false};
+  std::atomic<bool> is_decommisioned{false};
+
   struct MountPointEntry {
     std::string mount_point;
     std::string base_dir;
     Headers headers;
   };
   std::vector<MountPointEntry> base_dirs_;
-
-  std::atomic<bool> is_running_{false};
-  std::atomic<bool> done_{false};
   std::map<std::string, std::string> file_extension_and_mimetype_map_;
+  std::string default_file_mimetype_ = "application/octet-stream";
   Handler file_request_handler_;
+
   Handlers get_handlers_;
   Handlers post_handlers_;
   HandlersForContentReader post_handlers_for_content_reader_;
@@ -831,18 +1110,23 @@ private:
   Handlers delete_handlers_;
   HandlersForContentReader delete_handlers_for_content_reader_;
   Handlers options_handlers_;
+
   HandlerWithResponse error_handler_;
   ExceptionHandler exception_handler_;
   HandlerWithResponse pre_routing_handler_;
   Handler post_routing_handler_;
-  Logger logger_;
   Expect100ContinueHandler expect_100_continue_handler_;
 
+  Logger logger_;
+
   int address_family_ = AF_UNSPEC;
   bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY;
+  bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY;
   SocketOptions socket_options_ = default_socket_options;
 
   Headers default_headers_;
+  std::function<ssize_t(Stream &, Headers &)> header_writer_ =
+      detail::write_headers;
 };
 
 enum class Error {
@@ -857,20 +1141,23 @@ enum class Error {
   SSLConnection,
   SSLLoadingCerts,
   SSLServerVerification,
+  SSLServerHostnameVerification,
   UnsupportedMultipartBoundaryChars,
   Compression,
   ConnectionTimeout,
+  ProxyConnection,
 
   // For internal use only
   SSLPeerCouldBeClosed_,
 };
 
-std::string to_string(const Error error);
+std::string to_string(Error error);
 
 std::ostream &operator<<(std::ostream &os, const Error &obj);
 
 class Result {
 public:
+  Result() = default;
   Result(std::unique_ptr<Response> &&res, Error err,
          Headers &&request_headers = Headers{})
       : res_(std::move(res)), err_(err),
@@ -892,14 +1179,15 @@ public:
   // Request Headers
   bool has_request_header(const std::string &key) const;
   std::string get_request_header_value(const std::string &key,
+                                       const char *def = "",
                                        size_t id = 0) const;
-  template <typename T>
-  T get_request_header_value(const std::string &key, size_t id = 0) const;
+  uint64_t get_request_header_value_u64(const std::string &key,
+                                        uint64_t def = 0, size_t id = 0) const;
   size_t get_request_header_value_count(const std::string &key) const;
 
 private:
   std::unique_ptr<Response> res_;
-  Error err_;
+  Error err_ = Error::Unknown;
   Headers request_headers_;
 };
 
@@ -958,10 +1246,18 @@ public:
               const std::string &content_type);
   Result Post(const std::string &path, const Headers &headers, const char *body,
               size_t content_length, const std::string &content_type);
+  Result Post(const std::string &path, const Headers &headers, const char *body,
+              size_t content_length, const std::string &content_type,
+              Progress progress);
   Result Post(const std::string &path, const std::string &body,
               const std::string &content_type);
+  Result Post(const std::string &path, const std::string &body,
+              const std::string &content_type, Progress progress);
   Result Post(const std::string &path, const Headers &headers,
               const std::string &body, const std::string &content_type);
+  Result Post(const std::string &path, const Headers &headers,
+              const std::string &body, const std::string &content_type,
+              Progress progress);
   Result Post(const std::string &path, size_t content_length,
               ContentProvider content_provider,
               const std::string &content_type);
@@ -977,6 +1273,8 @@ public:
   Result Post(const std::string &path, const Params &params);
   Result Post(const std::string &path, const Headers &headers,
               const Params &params);
+  Result Post(const std::string &path, const Headers &headers,
+              const Params &params, Progress progress);
   Result Post(const std::string &path, const MultipartFormDataItems &items);
   Result Post(const std::string &path, const Headers &headers,
               const MultipartFormDataItems &items);
@@ -991,10 +1289,18 @@ public:
              const std::string &content_type);
   Result Put(const std::string &path, const Headers &headers, const char *body,
              size_t content_length, const std::string &content_type);
+  Result Put(const std::string &path, const Headers &headers, const char *body,
+             size_t content_length, const std::string &content_type,
+             Progress progress);
   Result Put(const std::string &path, const std::string &body,
              const std::string &content_type);
+  Result Put(const std::string &path, const std::string &body,
+             const std::string &content_type, Progress progress);
   Result Put(const std::string &path, const Headers &headers,
              const std::string &body, const std::string &content_type);
+  Result Put(const std::string &path, const Headers &headers,
+             const std::string &body, const std::string &content_type,
+             Progress progress);
   Result Put(const std::string &path, size_t content_length,
              ContentProvider content_provider, const std::string &content_type);
   Result Put(const std::string &path,
@@ -1009,6 +1315,8 @@ public:
   Result Put(const std::string &path, const Params &params);
   Result Put(const std::string &path, const Headers &headers,
              const Params &params);
+  Result Put(const std::string &path, const Headers &headers,
+             const Params &params, Progress progress);
   Result Put(const std::string &path, const MultipartFormDataItems &items);
   Result Put(const std::string &path, const Headers &headers,
              const MultipartFormDataItems &items);
@@ -1021,13 +1329,23 @@ public:
   Result Patch(const std::string &path);
   Result Patch(const std::string &path, const char *body, size_t content_length,
                const std::string &content_type);
+  Result Patch(const std::string &path, const char *body, size_t content_length,
+               const std::string &content_type, Progress progress);
   Result Patch(const std::string &path, const Headers &headers,
                const char *body, size_t content_length,
                const std::string &content_type);
+  Result Patch(const std::string &path, const Headers &headers,
+               const char *body, size_t content_length,
+               const std::string &content_type, Progress progress);
   Result Patch(const std::string &path, const std::string &body,
                const std::string &content_type);
+  Result Patch(const std::string &path, const std::string &body,
+               const std::string &content_type, Progress progress);
   Result Patch(const std::string &path, const Headers &headers,
                const std::string &body, const std::string &content_type);
+  Result Patch(const std::string &path, const Headers &headers,
+               const std::string &body, const std::string &content_type,
+               Progress progress);
   Result Patch(const std::string &path, size_t content_length,
                ContentProvider content_provider,
                const std::string &content_type);
@@ -1045,13 +1363,24 @@ public:
   Result Delete(const std::string &path, const Headers &headers);
   Result Delete(const std::string &path, const char *body,
                 size_t content_length, const std::string &content_type);
+  Result Delete(const std::string &path, const char *body,
+                size_t content_length, const std::string &content_type,
+                Progress progress);
   Result Delete(const std::string &path, const Headers &headers,
                 const char *body, size_t content_length,
                 const std::string &content_type);
+  Result Delete(const std::string &path, const Headers &headers,
+                const char *body, size_t content_length,
+                const std::string &content_type, Progress progress);
   Result Delete(const std::string &path, const std::string &body,
                 const std::string &content_type);
+  Result Delete(const std::string &path, const std::string &body,
+                const std::string &content_type, Progress progress);
   Result Delete(const std::string &path, const Headers &headers,
                 const std::string &body, const std::string &content_type);
+  Result Delete(const std::string &path, const Headers &headers,
+                const std::string &body, const std::string &content_type,
+                Progress progress);
 
   Result Options(const std::string &path);
   Result Options(const std::string &path, const Headers &headers);
@@ -1059,18 +1388,24 @@ public:
   bool send(Request &req, Response &res, Error &error);
   Result send(const Request &req);
 
-  size_t is_socket_open() const;
-
-  socket_t socket() const;
-
   void stop();
 
+  std::string host() const;
+  int port() const;
+
+  size_t is_socket_open() const;
+  socket_t socket() const;
+
   void set_hostname_addr_map(std::map<std::string, std::string> addr_map);
 
   void set_default_headers(Headers headers);
 
+  void
+  set_header_writer(std::function<ssize_t(Stream &, Headers &)> const &writer);
+
   void set_address_family(int family);
   void set_tcp_nodelay(bool on);
+  void set_ipv6_v6only(bool on);
   void set_socket_options(SocketOptions socket_options);
 
   void set_connection_timeout(time_t sec, time_t usec = 0);
@@ -1117,10 +1452,13 @@ public:
   void set_ca_cert_path(const std::string &ca_cert_file_path,
                         const std::string &ca_cert_dir_path = std::string());
   void set_ca_cert_store(X509_STORE *ca_cert_store);
+  X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const;
 #endif
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
   void enable_server_certificate_verification(bool enabled);
+  void enable_server_hostname_verification(bool enabled);
+  void set_server_certificate_verifier(std::function<bool(SSL *ssl)> verifier);
 #endif
 
   void set_logger(Logger logger);
@@ -1145,14 +1483,14 @@ protected:
   // Also, shutdown_ssl and close_socket should also NOT be called concurrently
   // with a DIFFERENT thread sending requests using that socket.
   virtual void shutdown_ssl(Socket &socket, bool shutdown_gracefully);
-  void shutdown_socket(Socket &socket);
+  void shutdown_socket(Socket &socket) const;
   void close_socket(Socket &socket);
 
   bool process_request(Stream &strm, Request &req, Response &res,
                        bool close_connection, Error &error);
 
   bool write_content_with_provider(Stream &strm, const Request &req,
-                                   Error &error);
+                                   Error &error) const;
 
   void copy_settings(const ClientImpl &rhs);
 
@@ -1177,16 +1515,20 @@ protected:
   // Default headers
   Headers default_headers_;
 
+  // Header writer
+  std::function<ssize_t(Stream &, Headers &)> header_writer_ =
+      detail::write_headers;
+
   // Settings
   std::string client_cert_path_;
   std::string client_key_path_;
 
   time_t connection_timeout_sec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND;
   time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND;
-  time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND;
-  time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND;
-  time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND;
-  time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND;
+  time_t read_timeout_sec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_SECOND;
+  time_t read_timeout_usec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND;
+  time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND;
+  time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND;
 
   std::string basic_auth_username_;
   std::string basic_auth_password_;
@@ -1203,6 +1545,7 @@ protected:
 
   int address_family_ = AF_UNSPEC;
   bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY;
+  bool ipv6_v6only_ = CPPHTTPLIB_IPV6_V6ONLY;
   SocketOptions socket_options_ = nullptr;
 
   bool compress_ = false;
@@ -1230,6 +1573,8 @@ protected:
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
   bool server_certificate_verification_ = true;
+  bool server_hostname_verification_ = true;
+  std::function<bool(SSL *ssl)> server_certificate_verifier_;
 #endif
 
   Logger logger_;
@@ -1238,8 +1583,12 @@ private:
   bool send_(Request &req, Response &res, Error &error);
   Result send_(Request &&req);
 
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+  bool is_ssl_peer_could_be_closed(SSL *ssl) const;
+#endif
   socket_t create_client_socket(Error &error) const;
-  bool read_response_line(Stream &strm, const Request &req, Response &res);
+  bool read_response_line(Stream &strm, const Request &req,
+                          Response &res) const;
   bool write_request(Stream &strm, Request &req, bool close_connection,
                      Error &error);
   bool redirect(Request &req, Response &res, Error &error);
@@ -1255,10 +1604,10 @@ private:
       const Headers &headers, const char *body, size_t content_length,
       ContentProvider content_provider,
       ContentProviderWithoutLength content_provider_without_length,
-      const std::string &content_type);
+      const std::string &content_type, Progress progress);
   ContentProviderWithoutLength get_multipart_content_provider(
       const std::string &boundary, const MultipartFormDataItems &items,
-      const MultipartFormDataProviderItems &provider_items);
+      const MultipartFormDataProviderItems &provider_items) const;
 
   std::string adjust_host_string(const std::string &host) const;
 
@@ -1284,6 +1633,7 @@ public:
                   const std::string &client_key_path);
 
   Client(Client &&) = default;
+  Client &operator=(Client &&) = default;
 
   ~Client();
 
@@ -1330,10 +1680,18 @@ public:
               const std::string &content_type);
   Result Post(const std::string &path, const Headers &headers, const char *body,
               size_t content_length, const std::string &content_type);
+  Result Post(const std::string &path, const Headers &headers, const char *body,
+              size_t content_length, const std::string &content_type,
+              Progress progress);
   Result Post(const std::string &path, const std::string &body,
               const std::string &content_type);
+  Result Post(const std::string &path, const std::string &body,
+              const std::string &content_type, Progress progress);
   Result Post(const std::string &path, const Headers &headers,
               const std::string &body, const std::string &content_type);
+  Result Post(const std::string &path, const Headers &headers,
+              const std::string &body, const std::string &content_type,
+              Progress progress);
   Result Post(const std::string &path, size_t content_length,
               ContentProvider content_provider,
               const std::string &content_type);
@@ -1349,6 +1707,8 @@ public:
   Result Post(const std::string &path, const Params &params);
   Result Post(const std::string &path, const Headers &headers,
               const Params &params);
+  Result Post(const std::string &path, const Headers &headers,
+              const Params &params, Progress progress);
   Result Post(const std::string &path, const MultipartFormDataItems &items);
   Result Post(const std::string &path, const Headers &headers,
               const MultipartFormDataItems &items);
@@ -1363,10 +1723,18 @@ public:
              const std::string &content_type);
   Result Put(const std::string &path, const Headers &headers, const char *body,
              size_t content_length, const std::string &content_type);
+  Result Put(const std::string &path, const Headers &headers, const char *body,
+             size_t content_length, const std::string &content_type,
+             Progress progress);
   Result Put(const std::string &path, const std::string &body,
              const std::string &content_type);
+  Result Put(const std::string &path, const std::string &body,
+             const std::string &content_type, Progress progress);
   Result Put(const std::string &path, const Headers &headers,
              const std::string &body, const std::string &content_type);
+  Result Put(const std::string &path, const Headers &headers,
+             const std::string &body, const std::string &content_type,
+             Progress progress);
   Result Put(const std::string &path, size_t content_length,
              ContentProvider content_provider, const std::string &content_type);
   Result Put(const std::string &path,
@@ -1381,6 +1749,8 @@ public:
   Result Put(const std::string &path, const Params &params);
   Result Put(const std::string &path, const Headers &headers,
              const Params &params);
+  Result Put(const std::string &path, const Headers &headers,
+             const Params &params, Progress progress);
   Result Put(const std::string &path, const MultipartFormDataItems &items);
   Result Put(const std::string &path, const Headers &headers,
              const MultipartFormDataItems &items);
@@ -1393,13 +1763,23 @@ public:
   Result Patch(const std::string &path);
   Result Patch(const std::string &path, const char *body, size_t content_length,
                const std::string &content_type);
+  Result Patch(const std::string &path, const char *body, size_t content_length,
+               const std::string &content_type, Progress progress);
   Result Patch(const std::string &path, const Headers &headers,
                const char *body, size_t content_length,
                const std::string &content_type);
+  Result Patch(const std::string &path, const Headers &headers,
+               const char *body, size_t content_length,
+               const std::string &content_type, Progress progress);
   Result Patch(const std::string &path, const std::string &body,
                const std::string &content_type);
+  Result Patch(const std::string &path, const std::string &body,
+               const std::string &content_type, Progress progress);
   Result Patch(const std::string &path, const Headers &headers,
                const std::string &body, const std::string &content_type);
+  Result Patch(const std::string &path, const Headers &headers,
+               const std::string &body, const std::string &content_type,
+               Progress progress);
   Result Patch(const std::string &path, size_t content_length,
                ContentProvider content_provider,
                const std::string &content_type);
@@ -1417,13 +1797,24 @@ public:
   Result Delete(const std::string &path, const Headers &headers);
   Result Delete(const std::string &path, const char *body,
                 size_t content_length, const std::string &content_type);
+  Result Delete(const std::string &path, const char *body,
+                size_t content_length, const std::string &content_type,
+                Progress progress);
   Result Delete(const std::string &path, const Headers &headers,
                 const char *body, size_t content_length,
                 const std::string &content_type);
+  Result Delete(const std::string &path, const Headers &headers,
+                const char *body, size_t content_length,
+                const std::string &content_type, Progress progress);
   Result Delete(const std::string &path, const std::string &body,
                 const std::string &content_type);
+  Result Delete(const std::string &path, const std::string &body,
+                const std::string &content_type, Progress progress);
   Result Delete(const std::string &path, const Headers &headers,
                 const std::string &body, const std::string &content_type);
+  Result Delete(const std::string &path, const Headers &headers,
+                const std::string &body, const std::string &content_type,
+                Progress progress);
 
   Result Options(const std::string &path);
   Result Options(const std::string &path, const Headers &headers);
@@ -1431,16 +1822,21 @@ public:
   bool send(Request &req, Response &res, Error &error);
   Result send(const Request &req);
 
-  size_t is_socket_open() const;
-
-  socket_t socket() const;
-
   void stop();
 
+  std::string host() const;
+  int port() const;
+
+  size_t is_socket_open() const;
+  socket_t socket() const;
+
   void set_hostname_addr_map(std::map<std::string, std::string> addr_map);
 
   void set_default_headers(Headers headers);
 
+  void
+  set_header_writer(std::function<ssize_t(Stream &, Headers &)> const &writer);
+
   void set_address_family(int family);
   void set_tcp_nodelay(bool on);
   void set_socket_options(SocketOptions socket_options);
@@ -1487,6 +1883,8 @@ public:
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
   void enable_server_certificate_verification(bool enabled);
+  void enable_server_hostname_verification(bool enabled);
+  void set_server_certificate_verifier(std::function<bool(SSL *ssl)> verifier);
 #endif
 
   void set_logger(Logger logger);
@@ -1497,6 +1895,7 @@ public:
                         const std::string &ca_cert_dir_path = std::string());
 
   void set_ca_cert_store(X509_STORE *ca_cert_store);
+  void load_ca_cert_store(const char *ca_cert, std::size_t size);
 
   long get_openssl_verify_result() const;
 
@@ -1531,6 +1930,9 @@ public:
 
   SSL_CTX *ssl_context() const;
 
+  void update_certs(X509 *cert, EVP_PKEY *private_key,
+                    X509_STORE *client_ca_cert_store = nullptr);
+
 private:
   bool process_and_close_socket(socket_t sock) override;
 
@@ -1538,7 +1940,7 @@ private:
   std::mutex ctx_mutex_;
 };
 
-class SSLClient : public ClientImpl {
+class SSLClient final : public ClientImpl {
 public:
   explicit SSLClient(const std::string &host);
 
@@ -1546,16 +1948,19 @@ public:
 
   explicit SSLClient(const std::string &host, int port,
                      const std::string &client_cert_path,
-                     const std::string &client_key_path);
+                     const std::string &client_key_path,
+                     const std::string &private_key_password = std::string());
 
   explicit SSLClient(const std::string &host, int port, X509 *client_cert,
-                     EVP_PKEY *client_key);
+                     EVP_PKEY *client_key,
+                     const std::string &private_key_password = std::string());
 
   ~SSLClient() override;
 
   bool is_valid() const override;
 
   void set_ca_cert_store(X509_STORE *ca_cert_store);
+  void load_ca_cert_store(const char *ca_cert, std::size_t size);
 
   long get_openssl_verify_result() const;
 
@@ -1564,7 +1969,7 @@ public:
 private:
   bool create_and_connect_socket(Socket &socket, Error &error) override;
   void shutdown_ssl(Socket &socket, bool shutdown_gracefully) override;
-  void shutdown_ssl_impl(Socket &socket, bool shutdown_socket);
+  void shutdown_ssl_impl(Socket &socket, bool shutdown_gracefully);
 
   bool process_socket(const Socket &socket,
                       std::function<bool(Stream &strm)> callback) override;
@@ -1608,78 +2013,147 @@ inline void duration_to_sec_and_usec(const T &duration, U callback) {
   callback(static_cast<time_t>(sec), static_cast<time_t>(usec));
 }
 
-template <typename T>
-inline T get_header_value(const Headers & /*headers*/,
-                          const std::string & /*key*/, size_t /*id*/ = 0,
-                          uint64_t /*def*/ = 0) {}
+inline bool is_numeric(const std::string &str) {
+  return !str.empty() && std::all_of(str.begin(), str.end(), ::isdigit);
+}
 
-template <>
-inline uint64_t get_header_value<uint64_t>(const Headers &headers,
-                                           const std::string &key, size_t id,
-                                           uint64_t def) {
+inline uint64_t get_header_value_u64(const Headers &headers,
+                                     const std::string &key, uint64_t def,
+                                     size_t id, bool &is_invalid_value) {
+  is_invalid_value = false;
   auto rng = headers.equal_range(key);
   auto it = rng.first;
   std::advance(it, static_cast<ssize_t>(id));
   if (it != rng.second) {
-    return std::strtoull(it->second.data(), nullptr, 10);
+    if (is_numeric(it->second)) {
+      return std::strtoull(it->second.data(), nullptr, 10);
+    } else {
+      is_invalid_value = true;
+    }
   }
   return def;
 }
 
+inline uint64_t get_header_value_u64(const Headers &headers,
+                                     const std::string &key, uint64_t def,
+                                     size_t id) {
+  bool dummy = false;
+  return get_header_value_u64(headers, key, def, id, dummy);
+}
+
 } // namespace detail
 
-template <typename T>
-inline T Request::get_header_value(const std::string &key, size_t id) const {
-  return detail::get_header_value<T>(headers, key, id, 0);
+inline uint64_t Request::get_header_value_u64(const std::string &key,
+                                              uint64_t def, size_t id) const {
+  return detail::get_header_value_u64(headers, key, def, id);
 }
 
-template <typename T>
-inline T Response::get_header_value(const std::string &key, size_t id) const {
-  return detail::get_header_value<T>(headers, key, id, 0);
-}
-
-template <typename... Args>
-inline ssize_t Stream::write_format(const char *fmt, const Args &...args) {
-  const auto bufsiz = 2048;
-  std::array<char, bufsiz> buf{};
-
-  auto sn = snprintf(buf.data(), buf.size() - 1, fmt, args...);
-  if (sn <= 0) { return sn; }
-
-  auto n = static_cast<size_t>(sn);
-
-  if (n >= buf.size() - 1) {
-    std::vector<char> glowable_buf(buf.size());
-
-    while (n >= glowable_buf.size() - 1) {
-      glowable_buf.resize(glowable_buf.size() * 2);
-      n = static_cast<size_t>(
-          snprintf(&glowable_buf[0], glowable_buf.size() - 1, fmt, args...));
-    }
-    return write(&glowable_buf[0], n);
-  } else {
-    return write(buf.data(), n);
-  }
+inline uint64_t Response::get_header_value_u64(const std::string &key,
+                                               uint64_t def, size_t id) const {
+  return detail::get_header_value_u64(headers, key, def, id);
 }
 
 inline void default_socket_options(socket_t sock) {
-  int yes = 1;
+  int opt = 1;
 #ifdef _WIN32
-  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast<char *>(&yes),
-             sizeof(yes));
-  setsockopt(sock, SOL_SOCKET, SO_EXCLUSIVEADDRUSE,
-             reinterpret_cast<char *>(&yes), sizeof(yes));
+  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+             reinterpret_cast<const char *>(&opt), sizeof(opt));
 #else
 #ifdef SO_REUSEPORT
-  setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, reinterpret_cast<void *>(&yes),
-             sizeof(yes));
+  setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
+             reinterpret_cast<const void *>(&opt), sizeof(opt));
 #else
-  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast<void *>(&yes),
-             sizeof(yes));
+  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+             reinterpret_cast<const void *>(&opt), sizeof(opt));
 #endif
 #endif
 }
 
+inline const char *status_message(int status) {
+  switch (status) {
+  case StatusCode::Continue_100: return "Continue";
+  case StatusCode::SwitchingProtocol_101: return "Switching Protocol";
+  case StatusCode::Processing_102: return "Processing";
+  case StatusCode::EarlyHints_103: return "Early Hints";
+  case StatusCode::OK_200: return "OK";
+  case StatusCode::Created_201: return "Created";
+  case StatusCode::Accepted_202: return "Accepted";
+  case StatusCode::NonAuthoritativeInformation_203:
+    return "Non-Authoritative Information";
+  case StatusCode::NoContent_204: return "No Content";
+  case StatusCode::ResetContent_205: return "Reset Content";
+  case StatusCode::PartialContent_206: return "Partial Content";
+  case StatusCode::MultiStatus_207: return "Multi-Status";
+  case StatusCode::AlreadyReported_208: return "Already Reported";
+  case StatusCode::IMUsed_226: return "IM Used";
+  case StatusCode::MultipleChoices_300: return "Multiple Choices";
+  case StatusCode::MovedPermanently_301: return "Moved Permanently";
+  case StatusCode::Found_302: return "Found";
+  case StatusCode::SeeOther_303: return "See Other";
+  case StatusCode::NotModified_304: return "Not Modified";
+  case StatusCode::UseProxy_305: return "Use Proxy";
+  case StatusCode::unused_306: return "unused";
+  case StatusCode::TemporaryRedirect_307: return "Temporary Redirect";
+  case StatusCode::PermanentRedirect_308: return "Permanent Redirect";
+  case StatusCode::BadRequest_400: return "Bad Request";
+  case StatusCode::Unauthorized_401: return "Unauthorized";
+  case StatusCode::PaymentRequired_402: return "Payment Required";
+  case StatusCode::Forbidden_403: return "Forbidden";
+  case StatusCode::NotFound_404: return "Not Found";
+  case StatusCode::MethodNotAllowed_405: return "Method Not Allowed";
+  case StatusCode::NotAcceptable_406: return "Not Acceptable";
+  case StatusCode::ProxyAuthenticationRequired_407:
+    return "Proxy Authentication Required";
+  case StatusCode::RequestTimeout_408: return "Request Timeout";
+  case StatusCode::Conflict_409: return "Conflict";
+  case StatusCode::Gone_410: return "Gone";
+  case StatusCode::LengthRequired_411: return "Length Required";
+  case StatusCode::PreconditionFailed_412: return "Precondition Failed";
+  case StatusCode::PayloadTooLarge_413: return "Payload Too Large";
+  case StatusCode::UriTooLong_414: return "URI Too Long";
+  case StatusCode::UnsupportedMediaType_415: return "Unsupported Media Type";
+  case StatusCode::RangeNotSatisfiable_416: return "Range Not Satisfiable";
+  case StatusCode::ExpectationFailed_417: return "Expectation Failed";
+  case StatusCode::ImATeapot_418: return "I'm a teapot";
+  case StatusCode::MisdirectedRequest_421: return "Misdirected Request";
+  case StatusCode::UnprocessableContent_422: return "Unprocessable Content";
+  case StatusCode::Locked_423: return "Locked";
+  case StatusCode::FailedDependency_424: return "Failed Dependency";
+  case StatusCode::TooEarly_425: return "Too Early";
+  case StatusCode::UpgradeRequired_426: return "Upgrade Required";
+  case StatusCode::PreconditionRequired_428: return "Precondition Required";
+  case StatusCode::TooManyRequests_429: return "Too Many Requests";
+  case StatusCode::RequestHeaderFieldsTooLarge_431:
+    return "Request Header Fields Too Large";
+  case StatusCode::UnavailableForLegalReasons_451:
+    return "Unavailable For Legal Reasons";
+  case StatusCode::NotImplemented_501: return "Not Implemented";
+  case StatusCode::BadGateway_502: return "Bad Gateway";
+  case StatusCode::ServiceUnavailable_503: return "Service Unavailable";
+  case StatusCode::GatewayTimeout_504: return "Gateway Timeout";
+  case StatusCode::HttpVersionNotSupported_505:
+    return "HTTP Version Not Supported";
+  case StatusCode::VariantAlsoNegotiates_506: return "Variant Also Negotiates";
+  case StatusCode::InsufficientStorage_507: return "Insufficient Storage";
+  case StatusCode::LoopDetected_508: return "Loop Detected";
+  case StatusCode::NotExtended_510: return "Not Extended";
+  case StatusCode::NetworkAuthenticationRequired_511:
+    return "Network Authentication Required";
+
+  default:
+  case StatusCode::InternalServerError_500: return "Internal Server Error";
+  }
+}
+
+inline std::string get_bearer_token_auth(const Request &req) {
+  if (req.has_header("Authorization")) {
+    static std::string BearerHeaderPrefix = "Bearer ";
+    return req.get_header_value("Authorization")
+        .substr(BearerHeaderPrefix.length());
+  }
+  return "";
+}
+
 template <class Rep, class Period>
 inline Server &
 Server::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
@@ -1716,10 +2190,13 @@ inline std::string to_string(const Error error) {
   case Error::SSLConnection: return "SSL connection failed";
   case Error::SSLLoadingCerts: return "SSL certificate loading failed";
   case Error::SSLServerVerification: return "SSL server verification failed";
+  case Error::SSLServerHostnameVerification:
+    return "SSL server hostname verification failed";
   case Error::UnsupportedMultipartBoundaryChars:
     return "Unsupported HTTP multipart boundary characters";
   case Error::Compression: return "Compression failed";
   case Error::ConnectionTimeout: return "Connection timed out";
+  case Error::ProxyConnection: return "Proxy connection failed";
   case Error::Unknown: return "Unknown";
   default: break;
   }
@@ -1733,10 +2210,10 @@ inline std::ostream &operator<<(std::ostream &os, const Error &obj) {
   return os;
 }
 
-template <typename T>
-inline T Result::get_request_header_value(const std::string &key,
-                                          size_t id) const {
-  return detail::get_header_value<T>(request_headers_, key, id, 0);
+inline uint64_t Result::get_request_header_value_u64(const std::string &key,
+                                                     uint64_t def,
+                                                     size_t id) const {
+  return detail::get_header_value_u64(request_headers_, key, def, id);
 }
 
 template <class Rep, class Period>
@@ -1790,7 +2267,7 @@ void hosted_at(const std::string &hostname, std::vector<std::string> &addrs);
 
 std::string append_query_params(const std::string &path, const Params &params);
 
-std::pair<std::string, std::string> make_range_header(Ranges ranges);
+std::pair<std::string, std::string> make_range_header(const Ranges &ranges);
 
 std::pair<std::string, std::string>
 make_basic_authentication_header(const std::string &username,
@@ -1799,6 +2276,36 @@ make_basic_authentication_header(const std::string &username,
 
 namespace detail {
 
+#if defined(_WIN32)
+inline std::wstring u8string_to_wstring(const char *s) {
+  std::wstring ws;
+  auto len = static_cast<int>(strlen(s));
+  auto wlen = ::MultiByteToWideChar(CP_UTF8, 0, s, len, nullptr, 0);
+  if (wlen > 0) {
+    ws.resize(wlen);
+    wlen = ::MultiByteToWideChar(
+        CP_UTF8, 0, s, len,
+        const_cast<LPWSTR>(reinterpret_cast<LPCWSTR>(ws.data())), wlen);
+    if (wlen != static_cast<int>(ws.size())) { ws.clear(); }
+  }
+  return ws;
+}
+#endif
+
+struct FileStat {
+  FileStat(const std::string &path);
+  bool is_file() const;
+  bool is_dir() const;
+
+private:
+#if defined(_WIN32)
+  struct _stat st_;
+#else
+  struct stat st_;
+#endif
+  int ret_ = -1;
+};
+
 std::string encode_query_param(const std::string &value);
 
 std::string decode_url(const std::string &s, bool convert_plus_to_space);
@@ -1807,26 +2314,44 @@ void read_file(const std::string &path, std::string &out);
 
 std::string trim_copy(const std::string &s);
 
+void divide(
+    const char *data, std::size_t size, char d,
+    std::function<void(const char *, std::size_t, const char *, std::size_t)>
+        fn);
+
+void divide(
+    const std::string &str, char d,
+    std::function<void(const char *, std::size_t, const char *, std::size_t)>
+        fn);
+
 void split(const char *b, const char *e, char d,
            std::function<void(const char *, const char *)> fn);
 
+void split(const char *b, const char *e, char d, size_t m,
+           std::function<void(const char *, const char *)> fn);
+
 bool process_client_socket(socket_t sock, time_t read_timeout_sec,
                            time_t read_timeout_usec, time_t write_timeout_sec,
                            time_t write_timeout_usec,
                            std::function<bool(Stream &)> callback);
 
-socket_t create_client_socket(
-    const std::string &host, const std::string &ip, int port,
-    int address_family, bool tcp_nodelay, SocketOptions socket_options,
-    time_t connection_timeout_sec, time_t connection_timeout_usec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, const std::string &intf, Error &error);
+socket_t create_client_socket(const std::string &host, const std::string &ip,
+                              int port, int address_family, bool tcp_nodelay,
+                              bool ipv6_v6only, SocketOptions socket_options,
+                              time_t connection_timeout_sec,
+                              time_t connection_timeout_usec,
+                              time_t read_timeout_sec, time_t read_timeout_usec,
+                              time_t write_timeout_sec,
+                              time_t write_timeout_usec,
+                              const std::string &intf, Error &error);
 
 const char *get_header_value(const Headers &headers, const std::string &key,
-                             size_t id = 0, const char *def = nullptr);
+                             const char *def, size_t id);
 
 std::string params_to_query_str(const Params &params);
 
+void parse_query_text(const char *data, std::size_t size, Params &params);
+
 void parse_query_text(const std::string &s, Params &params);
 
 bool parse_multipart_boundary(const std::string &content_type,
@@ -1844,7 +2369,7 @@ enum class EncodingType { None = 0, Gzip, Brotli };
 
 EncodingType encoding_type(const Request &req, const Response &res);
 
-class BufferStream : public Stream {
+class BufferStream final : public Stream {
 public:
   BufferStream() = default;
   ~BufferStream() override = default;
@@ -1884,19 +2409,19 @@ public:
                           Callback callback) = 0;
 };
 
-class nocompressor : public compressor {
+class nocompressor final : public compressor {
 public:
-  virtual ~nocompressor() = default;
+  ~nocompressor() override = default;
 
   bool compress(const char *data, size_t data_length, bool /*last*/,
                 Callback callback) override;
 };
 
 #ifdef CPPHTTPLIB_ZLIB_SUPPORT
-class gzip_compressor : public compressor {
+class gzip_compressor final : public compressor {
 public:
   gzip_compressor();
-  ~gzip_compressor();
+  ~gzip_compressor() override;
 
   bool compress(const char *data, size_t data_length, bool last,
                 Callback callback) override;
@@ -1906,10 +2431,10 @@ private:
   z_stream strm_;
 };
 
-class gzip_decompressor : public decompressor {
+class gzip_decompressor final : public decompressor {
 public:
   gzip_decompressor();
-  ~gzip_decompressor();
+  ~gzip_decompressor() override;
 
   bool is_valid() const override;
 
@@ -1923,7 +2448,7 @@ private:
 #endif
 
 #ifdef CPPHTTPLIB_BROTLI_SUPPORT
-class brotli_compressor : public compressor {
+class brotli_compressor final : public compressor {
 public:
   brotli_compressor();
   ~brotli_compressor();
@@ -1935,7 +2460,7 @@ private:
   BrotliEncoderState *state_ = nullptr;
 };
 
-class brotli_decompressor : public decompressor {
+class brotli_decompressor final : public decompressor {
 public:
   brotli_decompressor();
   ~brotli_decompressor();
@@ -1972,6 +2497,84 @@ private:
   std::string glowable_buffer_;
 };
 
+class mmap {
+public:
+  mmap(const char *path);
+  ~mmap();
+
+  bool open(const char *path);
+  void close();
+
+  bool is_open() const;
+  size_t size() const;
+  const char *data() const;
+
+private:
+#if defined(_WIN32)
+  HANDLE hFile_ = NULL;
+  HANDLE hMapping_ = NULL;
+#else
+  int fd_ = -1;
+#endif
+  size_t size_ = 0;
+  void *addr_ = nullptr;
+  bool is_open_empty_file = false;
+};
+
+// NOTE: https://www.rfc-editor.org/rfc/rfc9110#section-5
+namespace fields {
+
+inline bool is_token_char(char c) {
+  return std::isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' ||
+         c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' ||
+         c == '.' || c == '^' || c == '_' || c == '`' || c == '|' || c == '~';
+}
+
+inline bool is_token(const std::string &s) {
+  if (s.empty()) { return false; }
+  for (auto c : s) {
+    if (!is_token_char(c)) { return false; }
+  }
+  return true;
+}
+
+inline bool is_field_name(const std::string &s) { return is_token(s); }
+
+inline bool is_vchar(char c) { return c >= 33 && c <= 126; }
+
+inline bool is_obs_text(char c) { return 128 <= static_cast<unsigned char>(c); }
+
+inline bool is_field_vchar(char c) { return is_vchar(c) || is_obs_text(c); }
+
+inline bool is_field_content(const std::string &s) {
+  if (s.empty()) { return false; }
+
+  if (s.size() == 1) {
+    return is_field_vchar(s[0]);
+  } else if (s.size() == 2) {
+    return is_field_vchar(s[0]) && is_field_vchar(s[1]);
+  } else {
+    size_t i = 0;
+
+    if (!is_field_vchar(s[i])) { return false; }
+    i++;
+
+    while (i < s.size() - 1) {
+      auto c = s[i++];
+      if (c == ' ' || c == '\t' || is_field_vchar(c)) {
+      } else {
+        return false;
+      }
+    }
+
+    return is_field_vchar(s[i]);
+  }
+}
+
+inline bool is_field_value(const std::string &s) { return is_field_content(s); }
+
+} // namespace fields
+
 } // namespace detail
 
 // ----------------------------------------------------------------------------
@@ -2003,7 +2606,7 @@ inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt,
   val = 0;
   for (; cnt; i++, cnt--) {
     if (!s[i]) { return false; }
-    int v = 0;
+    auto v = 0;
     if (is_hex(s[i], v)) {
       val = val * 16 + v;
     } else {
@@ -2014,7 +2617,7 @@ inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt,
 }
 
 inline std::string from_i_to_hex(size_t n) {
-  const char *charset = "0123456789abcdef";
+  static const auto charset = "0123456789abcdef";
   std::string ret;
   do {
     ret = charset[n & 15] + ret;
@@ -2025,7 +2628,7 @@ inline std::string from_i_to_hex(size_t n) {
 
 inline size_t to_utf8(int code, char *buff) {
   if (code < 0x0080) {
-    buff[0] = (code & 0x7F);
+    buff[0] = static_cast<char>(code & 0x7F);
     return 1;
   } else if (code < 0x0800) {
     buff[0] = static_cast<char>(0xC0 | ((code >> 6) & 0x1F));
@@ -2064,8 +2667,8 @@ inline std::string base64_encode(const std::string &in) {
   std::string out;
   out.reserve(in.size());
 
-  int val = 0;
-  int valb = -6;
+  auto val = 0;
+  auto valb = -6;
 
   for (auto c : in) {
     val = (val << 8) + static_cast<uint8_t>(c);
@@ -2085,20 +2688,6 @@ inline std::string base64_encode(const std::string &in) {
   return out;
 }
 
-inline bool is_file(const std::string &path) {
-#ifdef _WIN32
-  return _access_s(path.c_str(), 0) == 0;
-#else
-  struct stat st;
-  return stat(path.c_str(), &st) >= 0 && S_ISREG(st.st_mode);
-#endif
-}
-
-inline bool is_dir(const std::string &path) {
-  struct stat st;
-  return stat(path.c_str(), &st) >= 0 && S_ISDIR(st.st_mode);
-}
-
 inline bool is_valid_path(const std::string &path) {
   size_t level = 0;
   size_t i = 0;
@@ -2112,6 +2701,11 @@ inline bool is_valid_path(const std::string &path) {
     // Read component
     auto beg = i;
     while (i < path.size() && path[i] != '/') {
+      if (path[i] == '\0') {
+        return false;
+      } else if (path[i] == '\\') {
+        return false;
+      }
       i++;
     }
 
@@ -2136,6 +2730,21 @@ inline bool is_valid_path(const std::string &path) {
   return true;
 }
 
+inline FileStat::FileStat(const std::string &path) {
+#if defined(_WIN32)
+  auto wpath = u8string_to_wstring(path.c_str());
+  ret_ = _wstat(wpath.c_str(), &st_);
+#else
+  ret_ = stat(path.c_str(), &st_);
+#endif
+}
+inline bool FileStat::is_file() const {
+  return ret_ >= 0 && S_ISREG(st_.st_mode);
+}
+inline bool FileStat::is_dir() const {
+  return ret_ >= 0 && S_ISDIR(st_.st_mode);
+}
+
 inline std::string encode_query_param(const std::string &value) {
   std::ostringstream escaped;
   escaped.fill('0');
@@ -2196,7 +2805,7 @@ inline std::string decode_url(const std::string &s,
   for (size_t i = 0; i < s.size(); i++) {
     if (s[i] == '%' && i + 1 < s.size()) {
       if (s[i + 1] == 'u') {
-        int val = 0;
+        auto val = 0;
         if (from_hex_to_i(s, i + 2, 4, val)) {
           // 4 digits Unicode codes
           char buff[4];
@@ -2207,7 +2816,7 @@ inline std::string decode_url(const std::string &s,
           result += s[i];
         }
       } else {
-        int val = 0;
+        auto val = 0;
         if (from_hex_to_i(s, i + 1, 2, val)) {
           // 2 digits hex codes
           result += static_cast<char>(val);
@@ -2260,16 +2869,51 @@ inline std::string trim_copy(const std::string &s) {
   return s.substr(r.first, r.second - r.first);
 }
 
+inline std::string trim_double_quotes_copy(const std::string &s) {
+  if (s.length() >= 2 && s.front() == '"' && s.back() == '"') {
+    return s.substr(1, s.size() - 2);
+  }
+  return s;
+}
+
+inline void
+divide(const char *data, std::size_t size, char d,
+       std::function<void(const char *, std::size_t, const char *, std::size_t)>
+           fn) {
+  const auto it = std::find(data, data + size, d);
+  const auto found = static_cast<std::size_t>(it != data + size);
+  const auto lhs_data = data;
+  const auto lhs_size = static_cast<std::size_t>(it - data);
+  const auto rhs_data = it + found;
+  const auto rhs_size = size - lhs_size - found;
+
+  fn(lhs_data, lhs_size, rhs_data, rhs_size);
+}
+
+inline void
+divide(const std::string &str, char d,
+       std::function<void(const char *, std::size_t, const char *, std::size_t)>
+           fn) {
+  divide(str.data(), str.size(), d, std::move(fn));
+}
+
 inline void split(const char *b, const char *e, char d,
                   std::function<void(const char *, const char *)> fn) {
+  return split(b, e, d, (std::numeric_limits<size_t>::max)(), std::move(fn));
+}
+
+inline void split(const char *b, const char *e, char d, size_t m,
+                  std::function<void(const char *, const char *)> fn) {
   size_t i = 0;
   size_t beg = 0;
+  size_t count = 1;
 
   while (e ? (b + i < e) : (b[i] != '\0')) {
-    if (b[i] == d) {
+    if (b[i] == d && count < m) {
       auto r = trim(b, e, beg, i);
       if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
       beg = i + 1;
+      count++;
     }
     i++;
   }
@@ -2310,6 +2954,10 @@ inline bool stream_line_reader::getline() {
   fixed_buffer_used_size_ = 0;
   glowable_buffer_.clear();
 
+#ifndef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
+  char prev_byte = 0;
+#endif
+
   for (size_t i = 0;; i++) {
     char byte;
     auto n = strm_.read(&byte, 1);
@@ -2326,7 +2974,12 @@ inline bool stream_line_reader::getline() {
 
     append(byte);
 
+#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
     if (byte == '\n') { break; }
+#else
+    if (prev_byte == '\r' && byte == '\n') { break; }
+    prev_byte = byte;
+#endif
   }
 
   return true;
@@ -2345,6 +2998,133 @@ inline void stream_line_reader::append(char c) {
   }
 }
 
+inline mmap::mmap(const char *path) { open(path); }
+
+inline mmap::~mmap() { close(); }
+
+inline bool mmap::open(const char *path) {
+  close();
+
+#if defined(_WIN32)
+  auto wpath = u8string_to_wstring(path);
+  if (wpath.empty()) { return false; }
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+  hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ,
+                         OPEN_EXISTING, NULL);
+#else
+  hFile_ = ::CreateFileW(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL,
+                         OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+#endif
+
+  if (hFile_ == INVALID_HANDLE_VALUE) { return false; }
+
+  LARGE_INTEGER size{};
+  if (!::GetFileSizeEx(hFile_, &size)) { return false; }
+  // If the following line doesn't compile due to QuadPart, update Windows SDK.
+  // See:
+  // https://github.com/yhirose/cpp-httplib/issues/1903#issuecomment-2316520721
+  if (static_cast<ULONGLONG>(size.QuadPart) >
+      (std::numeric_limits<decltype(size_)>::max)()) {
+    // `size_t` might be 32-bits, on 32-bits Windows.
+    return false;
+  }
+  size_ = static_cast<size_t>(size.QuadPart);
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+  hMapping_ =
+      ::CreateFileMappingFromApp(hFile_, NULL, PAGE_READONLY, size_, NULL);
+#else
+  hMapping_ = ::CreateFileMappingW(hFile_, NULL, PAGE_READONLY, 0, 0, NULL);
+#endif
+
+  // Special treatment for an empty file...
+  if (hMapping_ == NULL && size_ == 0) {
+    close();
+    is_open_empty_file = true;
+    return true;
+  }
+
+  if (hMapping_ == NULL) {
+    close();
+    return false;
+  }
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+  addr_ = ::MapViewOfFileFromApp(hMapping_, FILE_MAP_READ, 0, 0);
+#else
+  addr_ = ::MapViewOfFile(hMapping_, FILE_MAP_READ, 0, 0, 0);
+#endif
+
+  if (addr_ == nullptr) {
+    close();
+    return false;
+  }
+#else
+  fd_ = ::open(path, O_RDONLY);
+  if (fd_ == -1) { return false; }
+
+  struct stat sb;
+  if (fstat(fd_, &sb) == -1) {
+    close();
+    return false;
+  }
+  size_ = static_cast<size_t>(sb.st_size);
+
+  addr_ = ::mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0);
+
+  // Special treatment for an empty file...
+  if (addr_ == MAP_FAILED && size_ == 0) {
+    close();
+    is_open_empty_file = true;
+    return false;
+  }
+#endif
+
+  return true;
+}
+
+inline bool mmap::is_open() const {
+  return is_open_empty_file ? true : addr_ != nullptr;
+}
+
+inline size_t mmap::size() const { return size_; }
+
+inline const char *mmap::data() const {
+  return is_open_empty_file ? "" : static_cast<const char *>(addr_);
+}
+
+inline void mmap::close() {
+#if defined(_WIN32)
+  if (addr_) {
+    ::UnmapViewOfFile(addr_);
+    addr_ = nullptr;
+  }
+
+  if (hMapping_) {
+    ::CloseHandle(hMapping_);
+    hMapping_ = NULL;
+  }
+
+  if (hFile_ != INVALID_HANDLE_VALUE) {
+    ::CloseHandle(hFile_);
+    hFile_ = INVALID_HANDLE_VALUE;
+  }
+
+  is_open_empty_file = false;
+#else
+  if (addr_ != nullptr) {
+    munmap(addr_, size_);
+    addr_ = nullptr;
+  }
+
+  if (fd_ != -1) {
+    ::close(fd_);
+    fd_ = -1;
+  }
+#endif
+  size_ = 0;
+}
 inline int close_socket(socket_t sock) {
 #ifdef _WIN32
   return closesocket(sock);
@@ -2354,10 +3134,13 @@ inline int close_socket(socket_t sock) {
 }
 
 template <typename T> inline ssize_t handle_EINTR(T fn) {
-  ssize_t res = false;
+  ssize_t res = 0;
   while (true) {
     res = fn();
-    if (res < 0 && errno == EINTR) { continue; }
+    if (res < 0 && errno == EINTR) {
+      std::this_thread::sleep_for(std::chrono::microseconds{1});
+      continue;
+    }
     break;
   }
   return res;
@@ -2399,7 +3182,7 @@ inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
   return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
 #else
 #ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return 1; }
+  if (sock >= FD_SETSIZE) { return -1; }
 #endif
 
   fd_set fds;
@@ -2427,7 +3210,7 @@ inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
   return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
 #else
 #ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return 1; }
+  if (sock >= FD_SETSIZE) { return -1; }
 #endif
 
   fd_set fds;
@@ -2458,7 +3241,7 @@ inline Error wait_until_socket_is_ready(socket_t sock, time_t sec,
   if (poll_res == 0) { return Error::ConnectionTimeout; }
 
   if (poll_res > 0 && pfd_read.revents & (POLLIN | POLLOUT)) {
-    int error = 0;
+    auto error = 0;
     socklen_t len = sizeof(error);
     auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
                           reinterpret_cast<char *>(&error), &len);
@@ -2490,7 +3273,7 @@ inline Error wait_until_socket_is_ready(socket_t sock, time_t sec,
   if (ret == 0) { return Error::ConnectionTimeout; }
 
   if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
-    int error = 0;
+    auto error = 0;
     socklen_t len = sizeof(error);
     auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
                           reinterpret_cast<char *>(&error), &len);
@@ -2512,7 +3295,7 @@ inline bool is_socket_alive(socket_t sock) {
   return detail::read_socket(sock, &buf[0], sizeof(buf), MSG_PEEK) > 0;
 }
 
-class SocketStream : public Stream {
+class SocketStream final : public Stream {
 public:
   SocketStream(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
                time_t write_timeout_sec, time_t write_timeout_usec);
@@ -2537,11 +3320,11 @@ private:
   size_t read_buff_off_ = 0;
   size_t read_buff_content_size_ = 0;
 
-  static const size_t read_buff_size_ = 1024 * 4;
+  static const size_t read_buff_size_ = 1024l * 4;
 };
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream : public Stream {
+class SSLSocketStream final : public Stream {
 public:
   SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec,
                   time_t read_timeout_usec, time_t write_timeout_sec,
@@ -2566,23 +3349,37 @@ private:
 };
 #endif
 
-inline bool keep_alive(socket_t sock, time_t keep_alive_timeout_sec) {
+inline bool keep_alive(const std::atomic<socket_t> &svr_sock, socket_t sock,
+                       time_t keep_alive_timeout_sec) {
   using namespace std::chrono;
-  auto start = steady_clock::now();
+
+  const auto interval_usec =
+      CPPHTTPLIB_KEEPALIVE_TIMEOUT_CHECK_INTERVAL_USECOND;
+
+  // Avoid expensive `steady_clock::now()` call for the first time
+  if (select_read(sock, 0, interval_usec) > 0) { return true; }
+
+  const auto start = steady_clock::now() - microseconds{interval_usec};
+  const auto timeout = seconds{keep_alive_timeout_sec};
+
   while (true) {
-    auto val = select_read(sock, 0, 10000);
+    if (svr_sock == INVALID_SOCKET) {
+      break; // Server socket is closed
+    }
+
+    auto val = select_read(sock, 0, interval_usec);
     if (val < 0) {
-      return false;
+      break; // Ssocket error
     } else if (val == 0) {
-      auto current = steady_clock::now();
-      auto duration = duration_cast<milliseconds>(current - start);
-      auto timeout = keep_alive_timeout_sec * 1000;
-      if (duration.count() > timeout) { return false; }
-      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+      if (steady_clock::now() - start > timeout) {
+        break; // Timeout
+      }
     } else {
-      return true;
+      return true; // Ready for read
     }
   }
+
+  return false;
 }
 
 template <typename T>
@@ -2593,8 +3390,7 @@ process_server_socket_core(const std::atomic<socket_t> &svr_sock, socket_t sock,
   assert(keep_alive_max_count > 0);
   auto ret = false;
   auto count = keep_alive_max_count;
-  while (svr_sock != INVALID_SOCKET && count > 0 &&
-         keep_alive(sock, keep_alive_timeout_sec)) {
+  while (count > 0 && keep_alive(svr_sock, sock, keep_alive_timeout_sec)) {
     auto close_connection = count == 1;
     auto connection_closed = false;
     ret = callback(close_connection, connection_closed);
@@ -2638,10 +3434,29 @@ inline int shutdown_socket(socket_t sock) {
 #endif
 }
 
+inline std::string escape_abstract_namespace_unix_domain(const std::string &s) {
+  if (s.size() > 1 && s[0] == '\0') {
+    auto ret = s;
+    ret[0] = '@';
+    return ret;
+  }
+  return s;
+}
+
+inline std::string
+unescape_abstract_namespace_unix_domain(const std::string &s) {
+  if (s.size() > 1 && s[0] == '@') {
+    auto ret = s;
+    ret[0] = '\0';
+    return ret;
+  }
+  return s;
+}
+
 template <typename BindOrConnect>
 socket_t create_socket(const std::string &host, const std::string &ip, int port,
                        int address_family, int socket_flags, bool tcp_nodelay,
-                       SocketOptions socket_options,
+                       bool ipv6_v6only, SocketOptions socket_options,
                        BindOrConnect bind_or_connect) {
   // Get address info
   const char *node = nullptr;
@@ -2650,7 +3465,7 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
 
   memset(&hints, 0, sizeof(struct addrinfo));
   hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = 0;
+  hints.ai_protocol = IPPROTO_IP;
 
   if (!ip.empty()) {
     node = ip.c_str();
@@ -2666,22 +3481,34 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
 #ifndef _WIN32
   if (hints.ai_family == AF_UNIX) {
     const auto addrlen = host.length();
-    if (addrlen > sizeof(sockaddr_un::sun_path)) return INVALID_SOCKET;
+    if (addrlen > sizeof(sockaddr_un::sun_path)) { return INVALID_SOCKET; }
 
+#ifdef SOCK_CLOEXEC
+    auto sock = socket(hints.ai_family, hints.ai_socktype | SOCK_CLOEXEC,
+                       hints.ai_protocol);
+#else
     auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol);
+#endif
+
     if (sock != INVALID_SOCKET) {
       sockaddr_un addr{};
       addr.sun_family = AF_UNIX;
-      std::copy(host.begin(), host.end(), addr.sun_path);
+
+      auto unescaped_host = unescape_abstract_namespace_unix_domain(host);
+      std::copy(unescaped_host.begin(), unescaped_host.end(), addr.sun_path);
 
       hints.ai_addr = reinterpret_cast<sockaddr *>(&addr);
       hints.ai_addrlen = static_cast<socklen_t>(
           sizeof(addr) - sizeof(addr.sun_path) + addrlen);
 
+#ifndef SOCK_CLOEXEC
       fcntl(sock, F_SETFD, FD_CLOEXEC);
+#endif
+
       if (socket_options) { socket_options(sock); }
 
-      if (!bind_or_connect(sock, hints)) {
+      bool dummy;
+      if (!bind_or_connect(sock, hints, dummy)) {
         close_socket(sock);
         sock = INVALID_SOCKET;
       }
@@ -2698,6 +3525,7 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
 #endif
     return INVALID_SOCKET;
   }
+  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
 
   for (auto rp = result; rp; rp = rp->ai_next) {
     // Create a socket
@@ -2723,11 +3551,18 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
       sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
     }
 #else
+
+#ifdef SOCK_CLOEXEC
+    auto sock =
+        socket(rp->ai_family, rp->ai_socktype | SOCK_CLOEXEC, rp->ai_protocol);
+#else
     auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
+#endif
+
 #endif
     if (sock == INVALID_SOCKET) { continue; }
 
-#ifndef _WIN32
+#if !defined _WIN32 && !defined SOCK_CLOEXEC
     if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) {
       close_socket(sock);
       continue;
@@ -2735,29 +3570,38 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
 #endif
 
     if (tcp_nodelay) {
-      int yes = 1;
-      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&yes),
-                 sizeof(yes));
+      auto opt = 1;
+#ifdef _WIN32
+      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                 reinterpret_cast<const char *>(&opt), sizeof(opt));
+#else
+      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                 reinterpret_cast<const void *>(&opt), sizeof(opt));
+#endif
+    }
+
+    if (rp->ai_family == AF_INET6) {
+      auto opt = ipv6_v6only ? 1 : 0;
+#ifdef _WIN32
+      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+                 reinterpret_cast<const char *>(&opt), sizeof(opt));
+#else
+      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+                 reinterpret_cast<const void *>(&opt), sizeof(opt));
+#endif
     }
 
     if (socket_options) { socket_options(sock); }
 
-    if (rp->ai_family == AF_INET6) {
-      int no = 0;
-      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, reinterpret_cast<char *>(&no),
-                 sizeof(no));
-    }
-
     // bind or connect
-    if (bind_or_connect(sock, *rp)) {
-      freeaddrinfo(result);
-      return sock;
-    }
+    auto quit = false;
+    if (bind_or_connect(sock, *rp, quit)) { return sock; }
 
     close_socket(sock);
+
+    if (quit) { break; }
   }
 
-  freeaddrinfo(result);
   return INVALID_SOCKET;
 }
 
@@ -2790,6 +3634,7 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) {
   hints.ai_protocol = 0;
 
   if (getaddrinfo(host.c_str(), "0", &hints, &result)) { return false; }
+  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
 
   auto ret = false;
   for (auto rp = result; rp; rp = rp->ai_next) {
@@ -2800,11 +3645,10 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) {
     }
   }
 
-  freeaddrinfo(result);
   return ret;
 }
 
-#if !defined _WIN32 && !defined ANDROID && !defined _AIX
+#if !defined _WIN32 && !defined ANDROID && !defined _AIX && !defined __MVS__
 #define USE_IF2IP
 #endif
 
@@ -2812,6 +3656,8 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) {
 inline std::string if2ip(int address_family, const std::string &ifn) {
   struct ifaddrs *ifap;
   getifaddrs(&ifap);
+  auto se = detail::scope_exit([&] { freeifaddrs(ifap); });
+
   std::string addr_candidate;
   for (auto ifa = ifap; ifa; ifa = ifa->ifa_next) {
     if (ifa->ifa_addr && ifn == ifa->ifa_name &&
@@ -2821,7 +3667,6 @@ inline std::string if2ip(int address_family, const std::string &ifn) {
         auto sa = reinterpret_cast<struct sockaddr_in *>(ifa->ifa_addr);
         char buf[INET_ADDRSTRLEN];
         if (inet_ntop(AF_INET, &sa->sin_addr, buf, INET_ADDRSTRLEN)) {
-          freeifaddrs(ifap);
           return std::string(buf, INET_ADDRSTRLEN);
         }
       } else if (ifa->ifa_addr->sa_family == AF_INET6) {
@@ -2834,7 +3679,6 @@ inline std::string if2ip(int address_family, const std::string &ifn) {
             if (s6_addr_head == 0xfc || s6_addr_head == 0xfd) {
               addr_candidate = std::string(buf, INET6_ADDRSTRLEN);
             } else {
-              freeifaddrs(ifap);
               return std::string(buf, INET6_ADDRSTRLEN);
             }
           }
@@ -2842,25 +3686,26 @@ inline std::string if2ip(int address_family, const std::string &ifn) {
       }
     }
   }
-  freeifaddrs(ifap);
   return addr_candidate;
 }
 #endif
 
 inline socket_t create_client_socket(
     const std::string &host, const std::string &ip, int port,
-    int address_family, bool tcp_nodelay, SocketOptions socket_options,
-    time_t connection_timeout_sec, time_t connection_timeout_usec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
+    int address_family, bool tcp_nodelay, bool ipv6_v6only,
+    SocketOptions socket_options, time_t connection_timeout_sec,
+    time_t connection_timeout_usec, time_t read_timeout_sec,
+    time_t read_timeout_usec, time_t write_timeout_sec,
     time_t write_timeout_usec, const std::string &intf, Error &error) {
   auto sock = create_socket(
-      host, ip, port, address_family, 0, tcp_nodelay, std::move(socket_options),
-      [&](socket_t sock2, struct addrinfo &ai) -> bool {
+      host, ip, port, address_family, 0, tcp_nodelay, ipv6_v6only,
+      std::move(socket_options),
+      [&](socket_t sock2, struct addrinfo &ai, bool &quit) -> bool {
         if (!intf.empty()) {
 #ifdef USE_IF2IP
           auto ip_from_if = if2ip(address_family, intf);
           if (ip_from_if.empty()) { ip_from_if = intf; }
-          if (!bind_ip_address(sock2, ip_from_if.c_str())) {
+          if (!bind_ip_address(sock2, ip_from_if)) {
             error = Error::BindIPAddress;
             return false;
           }
@@ -2879,7 +3724,10 @@ inline socket_t create_client_socket(
           }
           error = wait_until_socket_is_ready(sock2, connection_timeout_sec,
                                              connection_timeout_usec);
-          if (error != Error::Success) { return false; }
+          if (error != Error::Success) {
+            if (error == Error::ConnectionTimeout) { quit = true; }
+            return false;
+          }
         }
 
         set_nonblocking(sock2, false);
@@ -2888,13 +3736,14 @@ inline socket_t create_client_socket(
 #ifdef _WIN32
           auto timeout = static_cast<uint32_t>(read_timeout_sec * 1000 +
                                                read_timeout_usec / 1000);
-          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
-                     sizeof(timeout));
+          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO,
+                     reinterpret_cast<const char *>(&timeout), sizeof(timeout));
 #else
           timeval tv;
           tv.tv_sec = static_cast<long>(read_timeout_sec);
           tv.tv_usec = static_cast<decltype(tv.tv_usec)>(read_timeout_usec);
-          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));
+          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO,
+                     reinterpret_cast<const void *>(&tv), sizeof(tv));
 #endif
         }
         {
@@ -2902,13 +3751,14 @@ inline socket_t create_client_socket(
 #ifdef _WIN32
           auto timeout = static_cast<uint32_t>(write_timeout_sec * 1000 +
                                                write_timeout_usec / 1000);
-          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout,
-                     sizeof(timeout));
+          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO,
+                     reinterpret_cast<const char *>(&timeout), sizeof(timeout));
 #else
           timeval tv;
           tv.tv_sec = static_cast<long>(write_timeout_sec);
           tv.tv_usec = static_cast<decltype(tv.tv_usec)>(write_timeout_usec);
-          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));
+          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO,
+                     reinterpret_cast<const void *>(&tv), sizeof(tv));
 #endif
         }
 
@@ -3002,24 +3852,26 @@ inline unsigned int str2tag(const std::string &s) {
 
 namespace udl {
 
-inline constexpr unsigned int operator"" _t(const char *s, size_t l) {
+inline constexpr unsigned int operator""_t(const char *s, size_t l) {
   return str2tag_core(s, l, 0);
 }
 
 } // namespace udl
 
-inline const char *
+inline std::string
 find_content_type(const std::string &path,
-                  const std::map<std::string, std::string> &user_data) {
+                  const std::map<std::string, std::string> &user_data,
+                  const std::string &default_content_type) {
   auto ext = file_extension(path);
 
   auto it = user_data.find(ext);
-  if (it != user_data.end()) { return it->second.c_str(); }
+  if (it != user_data.end()) { return it->second; }
 
   using udl::operator""_t;
 
   switch (str2tag(ext)) {
-  default: return nullptr;
+  default: return default_content_type;
+
   case "css"_t: return "text/css";
   case "csv"_t: return "text/csv";
   case "htm"_t:
@@ -3072,76 +3924,6 @@ find_content_type(const std::string &path,
   }
 }
 
-inline const char *status_message(int status) {
-  switch (status) {
-  case 100: return "Continue";
-  case 101: return "Switching Protocol";
-  case 102: return "Processing";
-  case 103: return "Early Hints";
-  case 200: return "OK";
-  case 201: return "Created";
-  case 202: return "Accepted";
-  case 203: return "Non-Authoritative Information";
-  case 204: return "No Content";
-  case 205: return "Reset Content";
-  case 206: return "Partial Content";
-  case 207: return "Multi-Status";
-  case 208: return "Already Reported";
-  case 226: return "IM Used";
-  case 300: return "Multiple Choice";
-  case 301: return "Moved Permanently";
-  case 302: return "Found";
-  case 303: return "See Other";
-  case 304: return "Not Modified";
-  case 305: return "Use Proxy";
-  case 306: return "unused";
-  case 307: return "Temporary Redirect";
-  case 308: return "Permanent Redirect";
-  case 400: return "Bad Request";
-  case 401: return "Unauthorized";
-  case 402: return "Payment Required";
-  case 403: return "Forbidden";
-  case 404: return "Not Found";
-  case 405: return "Method Not Allowed";
-  case 406: return "Not Acceptable";
-  case 407: return "Proxy Authentication Required";
-  case 408: return "Request Timeout";
-  case 409: return "Conflict";
-  case 410: return "Gone";
-  case 411: return "Length Required";
-  case 412: return "Precondition Failed";
-  case 413: return "Payload Too Large";
-  case 414: return "URI Too Long";
-  case 415: return "Unsupported Media Type";
-  case 416: return "Range Not Satisfiable";
-  case 417: return "Expectation Failed";
-  case 418: return "I'm a teapot";
-  case 421: return "Misdirected Request";
-  case 422: return "Unprocessable Entity";
-  case 423: return "Locked";
-  case 424: return "Failed Dependency";
-  case 425: return "Too Early";
-  case 426: return "Upgrade Required";
-  case 428: return "Precondition Required";
-  case 429: return "Too Many Requests";
-  case 431: return "Request Header Fields Too Large";
-  case 451: return "Unavailable For Legal Reasons";
-  case 501: return "Not Implemented";
-  case 502: return "Bad Gateway";
-  case 503: return "Service Unavailable";
-  case 504: return "Gateway Timeout";
-  case 505: return "HTTP Version Not Supported";
-  case 506: return "Variant Also Negotiates";
-  case 507: return "Insufficient Storage";
-  case 508: return "Loop Detected";
-  case 510: return "Not Extended";
-  case 511: return "Network Authentication Required";
-
-  default:
-  case 500: return "Internal Server Error";
-  }
-}
-
 inline bool can_compress_content_type(const std::string &content_type) {
   using udl::operator""_t;
 
@@ -3155,8 +3937,9 @@ inline bool can_compress_content_type(const std::string &content_type) {
   case "application/protobuf"_t:
   case "application/xhtml+xml"_t: return true;
 
-  default:
-    return !content_type.rfind("text/", 0) && tag != "text/event-stream"_t;
+  case "text/event-stream"_t: return false;
+
+  default: return !content_type.rfind("text/", 0);
   }
 }
 
@@ -3218,7 +4001,7 @@ inline bool gzip_compressor::compress(const char *data, size_t data_length,
     data += strm_.avail_in;
 
     auto flush = (last && data_length == 0) ? Z_FINISH : Z_NO_FLUSH;
-    int ret = Z_OK;
+    auto ret = Z_OK;
 
     std::array<char, CPPHTTPLIB_COMPRESSION_BUFSIZ> buff{};
     do {
@@ -3262,7 +4045,7 @@ inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
                                           Callback callback) {
   assert(is_valid_);
 
-  int ret = Z_OK;
+  auto ret = Z_OK;
 
   do {
     constexpr size_t max_avail_in =
@@ -3276,16 +4059,12 @@ inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
     data += strm_.avail_in;
 
     std::array<char, CPPHTTPLIB_COMPRESSION_BUFSIZ> buff{};
-    while (strm_.avail_in > 0) {
+    while (strm_.avail_in > 0 && ret == Z_OK) {
       strm_.avail_out = static_cast<uInt>(buff.size());
       strm_.next_out = reinterpret_cast<Bytef *>(buff.data());
 
-      auto prev_avail_in = strm_.avail_in;
-
       ret = inflate(&strm_, Z_NO_FLUSH);
 
-      if (prev_avail_in - strm_.avail_in == 0) { return false; }
-
       assert(ret != Z_STREAM_ERROR);
       switch (ret) {
       case Z_NEED_DICT:
@@ -3298,7 +4077,7 @@ inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
       }
     }
 
-    if (ret != Z_OK && ret != Z_STREAM_END) return false;
+    if (ret != Z_OK && ret != Z_STREAM_END) { return false; }
 
   } while (data_length > 0);
 
@@ -3367,7 +4146,7 @@ inline bool brotli_decompressor::decompress(const char *data,
     return 0;
   }
 
-  const uint8_t *next_in = (const uint8_t *)data;
+  auto next_in = reinterpret_cast<const uint8_t *>(data);
   size_t avail_in = data_length;
   size_t total_out;
 
@@ -3397,8 +4176,8 @@ inline bool has_header(const Headers &headers, const std::string &key) {
 }
 
 inline const char *get_header_value(const Headers &headers,
-                                    const std::string &key, size_t id,
-                                    const char *def) {
+                                    const std::string &key, const char *def,
+                                    size_t id) {
   auto rng = headers.equal_range(key);
   auto it = rng.first;
   std::advance(it, static_cast<ssize_t>(id));
@@ -3406,14 +4185,6 @@ inline const char *get_header_value(const Headers &headers,
   return def;
 }
 
-inline bool compare_case_ignore(const std::string &a, const std::string &b) {
-  if (a.size() != b.size()) { return false; }
-  for (size_t i = 0; i < b.size(); i++) {
-    if (::tolower(a[i]) != ::tolower(b[i])) { return false; }
-  }
-  return true;
-}
-
 template <typename T>
 inline bool parse_header(const char *beg, const char *end, T fn) {
   // Skip trailing spaces and tabs.
@@ -3436,12 +4207,27 @@ inline bool parse_header(const char *beg, const char *end, T fn) {
     p++;
   }
 
-  if (p < end) {
+  if (p <= end) {
+    auto key_len = key_end - beg;
+    if (!key_len) { return false; }
+
     auto key = std::string(beg, key_end);
-    auto val = compare_case_ignore(key, "Location")
+    auto val = case_ignore::equal(key, "Location")
                    ? std::string(p, end)
                    : decode_url(std::string(p, end), false);
-    fn(std::move(key), std::move(val));
+
+    // NOTE: From RFC 9110:
+    // Field values containing CR, LF, or NUL characters are
+    // invalid and dangerous, due to the varying ways that
+    // implementations might parse and interpret those
+    // characters; a recipient of CR, LF, or NUL within a field
+    // value MUST either reject the message or replace each of
+    // those characters with SP before further processing or
+    // forwarding of that message.
+    static const std::string CR_LF_NUL("\r\n\0", 3);
+    if (val.find_first_of(CR_LF_NUL) != std::string::npos) { return false; }
+
+    fn(key, val);
     return true;
   }
 
@@ -3461,27 +4247,27 @@ inline bool read_headers(Stream &strm, Headers &headers) {
     if (line_reader.end_with_crlf()) {
       // Blank line indicates end of headers.
       if (line_reader.size() == 2) { break; }
-#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
     } else {
+#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
       // Blank line indicates end of headers.
       if (line_reader.size() == 1) { break; }
       line_terminator_len = 1;
-    }
 #else
-    } else {
       continue; // Skip invalid line.
-    }
 #endif
+    }
 
     if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; }
 
     // Exclude line terminator
     auto end = line_reader.ptr() + line_reader.size() - line_terminator_len;
 
-    parse_header(line_reader.ptr(), end,
-                 [&](std::string &&key, std::string &&val) {
-                   headers.emplace(std::move(key), std::move(val));
-                 });
+    if (!parse_header(line_reader.ptr(), end,
+                      [&](const std::string &key, std::string &val) {
+                        headers.emplace(key, val);
+                      })) {
+      return false;
+    }
   }
 
   return true;
@@ -3526,11 +4312,7 @@ inline bool read_content_without_length(Stream &strm,
   uint64_t r = 0;
   for (;;) {
     auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ);
-    if (n < 0) {
-      return false;
-    } else if (n == 0) {
-      return true;
-    }
+    if (n <= 0) { return true; }
 
     if (!out(buf, static_cast<size_t>(n), r, 0)) { return false; }
     r += static_cast<uint64_t>(n);
@@ -3566,17 +4348,28 @@ inline bool read_content_chunked(Stream &strm, T &x,
 
     if (!line_reader.getline()) { return false; }
 
-    if (strcmp(line_reader.ptr(), "\r\n")) { return false; }
+    if (strcmp(line_reader.ptr(), "\r\n") != 0) { return false; }
 
     if (!line_reader.getline()) { return false; }
   }
 
   assert(chunk_len == 0);
 
-  // Trailer
-  if (!line_reader.getline()) { return false; }
+  // NOTE: In RFC 9112, '7.1 Chunked Transfer Coding' mentiones "The chunked
+  // transfer coding is complete when a chunk with a chunk-size of zero is
+  // received, possibly followed by a trailer section, and finally terminated by
+  // an empty line". https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1
+  //
+  // In '7.1.3. Decoding Chunked', however, the pseudo-code in the section
+  // does't care for the existence of the final CRLF. In other words, it seems
+  // to be ok whether the final CRLF exists or not in the chunked data.
+  // https://www.rfc-editor.org/rfc/rfc9112.html#section-7.1.3
+  //
+  // According to the reference code in RFC 9112, cpp-htpplib now allows
+  // chuncked transfer coding data without the final CRLF.
+  if (!line_reader.getline()) { return true; }
 
-  while (strcmp(line_reader.ptr(), "\r\n")) {
+  while (strcmp(line_reader.ptr(), "\r\n") != 0) {
     if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; }
 
     // Exclude line terminator
@@ -3584,8 +4377,8 @@ inline bool read_content_chunked(Stream &strm, T &x,
     auto end = line_reader.ptr() + line_reader.size() - line_terminator_len;
 
     parse_header(line_reader.ptr(), end,
-                 [&](std::string &&key, std::string &&val) {
-                   x.headers.emplace(std::move(key), std::move(val));
+                 [&](const std::string &key, const std::string &val) {
+                   x.headers.emplace(key, val);
                  });
 
     if (!line_reader.getline()) { return false; }
@@ -3595,8 +4388,8 @@ inline bool read_content_chunked(Stream &strm, T &x,
 }
 
 inline bool is_chunked_transfer_encoding(const Headers &headers) {
-  return !strcasecmp(get_header_value(headers, "Transfer-Encoding", 0, ""),
-                     "chunked");
+  return case_ignore::equal(
+      get_header_value(headers, "Transfer-Encoding", "", 0), "chunked");
 }
 
 template <typename T, typename U>
@@ -3611,14 +4404,14 @@ bool prepare_content_receiver(T &x, int &status,
 #ifdef CPPHTTPLIB_ZLIB_SUPPORT
       decompressor = detail::make_unique<gzip_decompressor>();
 #else
-      status = 415;
+      status = StatusCode::UnsupportedMediaType_415;
       return false;
 #endif
     } else if (encoding.find("br") != std::string::npos) {
 #ifdef CPPHTTPLIB_BROTLI_SUPPORT
       decompressor = detail::make_unique<brotli_decompressor>();
 #else
-      status = 415;
+      status = StatusCode::UnsupportedMediaType_415;
       return false;
 #endif
     }
@@ -3634,7 +4427,7 @@ bool prepare_content_receiver(T &x, int &status,
         };
         return callback(std::move(out));
       } else {
-        status = 500;
+        status = StatusCode::InternalServerError_500;
         return false;
       }
     }
@@ -3662,8 +4455,14 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status,
         } else if (!has_header(x.headers, "Content-Length")) {
           ret = read_content_without_length(strm, out);
         } else {
-          auto len = get_header_value<uint64_t>(x.headers, "Content-Length");
-          if (len > payload_max_length) {
+          auto is_invalid_value = false;
+          auto len = get_header_value_u64(x.headers, "Content-Length",
+                                          std::numeric_limits<uint64_t>::max(),
+                                          0, is_invalid_value);
+
+          if (is_invalid_value) {
+            ret = false;
+          } else if (len > payload_max_length) {
             exceed_payload_max_length = true;
             skip_content_with_length(strm, len);
             ret = false;
@@ -3672,16 +4471,42 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status,
           }
         }
 
-        if (!ret) { status = exceed_payload_max_length ? 413 : 400; }
+        if (!ret) {
+          status = exceed_payload_max_length ? StatusCode::PayloadTooLarge_413
+                                             : StatusCode::BadRequest_400;
+        }
         return ret;
       });
-} // namespace detail
+}
+
+inline ssize_t write_request_line(Stream &strm, const std::string &method,
+                                  const std::string &path) {
+  std::string s = method;
+  s += " ";
+  s += path;
+  s += " HTTP/1.1\r\n";
+  return strm.write(s.data(), s.size());
+}
+
+inline ssize_t write_response_line(Stream &strm, int status) {
+  std::string s = "HTTP/1.1 ";
+  s += std::to_string(status);
+  s += " ";
+  s += httplib::status_message(status);
+  s += "\r\n";
+  return strm.write(s.data(), s.size());
+}
 
 inline ssize_t write_headers(Stream &strm, const Headers &headers) {
   ssize_t write_len = 0;
   for (const auto &x : headers) {
-    auto len =
-        strm.write_format("%s: %s\r\n", x.first.c_str(), x.second.c_str());
+    std::string s;
+    s = x.first;
+    s += ": ";
+    s += x.second;
+    s += "\r\n";
+
+    auto len = strm.write(s.data(), s.size());
     if (len < 0) { return len; }
     write_len += len;
   }
@@ -3720,6 +4545,8 @@ inline bool write_content(Stream &strm, const ContentProvider &content_provider,
     return ok;
   };
 
+  data_sink.is_writable = [&]() -> bool { return strm.is_writable(); };
+
   while (offset < end_offset && !is_shutting_down()) {
     if (!strm.is_writable()) {
       error = Error::Write;
@@ -3764,6 +4591,8 @@ write_content_without_length(Stream &strm,
     return ok;
   };
 
+  data_sink.is_writable = [&]() -> bool { return strm.is_writable(); };
+
   data_sink.done = [&](void) { data_available = false; };
 
   while (data_available && !is_shutting_down()) {
@@ -3814,6 +4643,8 @@ write_content_chunked(Stream &strm, const ContentProvider &content_provider,
     return ok;
   };
 
+  data_sink.is_writable = [&]() -> bool { return strm.is_writable(); };
+
   auto done_with_trailer = [&](const Headers *trailer) {
     if (!ok) { return; }
 
@@ -3898,7 +4729,8 @@ inline bool redirect(T &cli, Request &req, Response &res,
   new_req.path = path;
   new_req.redirect_count_ -= 1;
 
-  if (res.status == 303 && (req.method != "GET" && req.method != "HEAD")) {
+  if (res.status == StatusCode::SeeOther_303 &&
+      (req.method != "GET" && req.method != "HEAD")) {
     new_req.method = "GET";
     new_req.body.clear();
     new_req.headers.clear();
@@ -3910,7 +4742,8 @@ inline bool redirect(T &cli, Request &req, Response &res,
   if (ret) {
     req = new_req;
     res = new_res;
-    res.location = location;
+
+    if (res.location.empty()) { res.location = location; }
   }
   return ret;
 }
@@ -3927,9 +4760,47 @@ inline std::string params_to_query_str(const Params &params) {
   return query;
 }
 
-inline void parse_query_text(const std::string &s, Params &params) {
+inline void parse_query_text(const char *data, std::size_t size,
+                             Params &params) {
   std::set<std::string> cache;
-  split(s.data(), s.data() + s.size(), '&', [&](const char *b, const char *e) {
+  split(data, data + size, '&', [&](const char *b, const char *e) {
+    std::string kv(b, e);
+    if (cache.find(kv) != cache.end()) { return; }
+    cache.insert(std::move(kv));
+
+    std::string key;
+    std::string val;
+    divide(b, static_cast<std::size_t>(e - b), '=',
+           [&](const char *lhs_data, std::size_t lhs_size, const char *rhs_data,
+               std::size_t rhs_size) {
+             key.assign(lhs_data, lhs_size);
+             val.assign(rhs_data, rhs_size);
+           });
+
+    if (!key.empty()) {
+      params.emplace(decode_url(key, true), decode_url(val, true));
+    }
+  });
+}
+
+inline void parse_query_text(const std::string &s, Params &params) {
+  parse_query_text(s.data(), s.size(), params);
+}
+
+inline bool parse_multipart_boundary(const std::string &content_type,
+                                     std::string &boundary) {
+  auto boundary_keyword = "boundary=";
+  auto pos = content_type.find(boundary_keyword);
+  if (pos == std::string::npos) { return false; }
+  auto end = content_type.find(';', pos);
+  auto beg = pos + strlen(boundary_keyword);
+  boundary = trim_double_quotes_copy(content_type.substr(beg, end - beg));
+  return !boundary.empty();
+}
+
+inline void parse_disposition_params(const std::string &s, Params &params) {
+  std::set<std::string> cache;
+  split(s.data(), s.data() + s.size(), ';', [&](const char *b, const char *e) {
     std::string kv(b, e);
     if (cache.find(kv) != cache.end()) { return; }
     cache.insert(kv);
@@ -3945,60 +4816,55 @@ inline void parse_query_text(const std::string &s, Params &params) {
     });
 
     if (!key.empty()) {
-      params.emplace(decode_url(key, true), decode_url(val, true));
+      params.emplace(trim_double_quotes_copy((key)),
+                     trim_double_quotes_copy((val)));
     }
   });
 }
 
-inline bool parse_multipart_boundary(const std::string &content_type,
-                                     std::string &boundary) {
-  auto boundary_keyword = "boundary=";
-  auto pos = content_type.find(boundary_keyword);
-  if (pos == std::string::npos) { return false; }
-  auto end = content_type.find(';', pos);
-  auto beg = pos + strlen(boundary_keyword);
-  boundary = content_type.substr(beg, end - beg);
-  if (boundary.length() >= 2 && boundary.front() == '"' &&
-      boundary.back() == '"') {
-    boundary = boundary.substr(1, boundary.size() - 2);
-  }
-  return !boundary.empty();
-}
-
 #ifdef CPPHTTPLIB_NO_EXCEPTIONS
 inline bool parse_range_header(const std::string &s, Ranges &ranges) {
 #else
 inline bool parse_range_header(const std::string &s, Ranges &ranges) try {
 #endif
-  static auto re_first_range = std::regex(R"(bytes=(\d*-\d*(?:,\s*\d*-\d*)*))");
-  std::smatch m;
-  if (std::regex_match(s, m, re_first_range)) {
-    auto pos = static_cast<size_t>(m.position(1));
-    auto len = static_cast<size_t>(m.length(1));
-    bool all_valid_ranges = true;
+  auto is_valid = [](const std::string &str) {
+    return std::all_of(str.cbegin(), str.cend(),
+                       [](unsigned char c) { return std::isdigit(c); });
+  };
+
+  if (s.size() > 7 && s.compare(0, 6, "bytes=") == 0) {
+    const auto pos = static_cast<size_t>(6);
+    const auto len = static_cast<size_t>(s.size() - 6);
+    auto all_valid_ranges = true;
     split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) {
-      if (!all_valid_ranges) return;
-      static auto re_another_range = std::regex(R"(\s*(\d*)-(\d*))");
-      std::cmatch cm;
-      if (std::regex_match(b, e, cm, re_another_range)) {
-        ssize_t first = -1;
-        if (!cm.str(1).empty()) {
-          first = static_cast<ssize_t>(std::stoll(cm.str(1)));
-        }
+      if (!all_valid_ranges) { return; }
 
-        ssize_t last = -1;
-        if (!cm.str(2).empty()) {
-          last = static_cast<ssize_t>(std::stoll(cm.str(2)));
-        }
-
-        if (first != -1 && last != -1 && first > last) {
-          all_valid_ranges = false;
-          return;
-        }
-        ranges.emplace_back(std::make_pair(first, last));
+      const auto it = std::find(b, e, '-');
+      if (it == e) {
+        all_valid_ranges = false;
+        return;
       }
+
+      const auto lhs = std::string(b, it);
+      const auto rhs = std::string(it + 1, e);
+      if (!is_valid(lhs) || !is_valid(rhs)) {
+        all_valid_ranges = false;
+        return;
+      }
+
+      const auto first =
+          static_cast<ssize_t>(lhs.empty() ? -1 : std::stoll(lhs));
+      const auto last =
+          static_cast<ssize_t>(rhs.empty() ? -1 : std::stoll(rhs));
+      if ((first == -1 && last == -1) ||
+          (first != -1 && last != -1 && first > last)) {
+        all_valid_ranges = false;
+        return;
+      }
+
+      ranges.emplace_back(first, last);
     });
-    return all_valid_ranges;
+    return all_valid_ranges && !ranges.empty();
   }
   return false;
 #ifdef CPPHTTPLIB_NO_EXCEPTIONS
@@ -4022,11 +4888,6 @@ public:
   bool parse(const char *buf, size_t n, const ContentReceiver &content_callback,
              const MultipartContentHeader &header_callback) {
 
-    // TODO: support 'filename*'
-    static const std::regex re_content_disposition(
-        R"~(^Content-Disposition:\s*form-data;\s*name="(.*?)"(?:;\s*filename="(.*?)")?(?:;\s*filename\*=\S+)?\s*$)~",
-        std::regex_constants::icase);
-
     buf_append(buf, n);
 
     while (buf_size() > 0) {
@@ -4059,18 +4920,54 @@ public:
             break;
           }
 
-          static const std::string header_name = "content-type:";
           const auto header = buf_head(pos);
-          if (start_with_case_ignore(header, header_name)) {
-            file_.content_type = trim_copy(header.substr(header_name.size()));
+
+          if (!parse_header(header.data(), header.data() + header.size(),
+                            [&](const std::string &, const std::string &) {})) {
+            is_valid_ = false;
+            return false;
+          }
+
+          static const std::string header_content_type = "Content-Type:";
+
+          if (start_with_case_ignore(header, header_content_type)) {
+            file_.content_type =
+                trim_copy(header.substr(header_content_type.size()));
           } else {
+            static const std::regex re_content_disposition(
+                R"~(^Content-Disposition:\s*form-data;\s*(.*)$)~",
+                std::regex_constants::icase);
+
             std::smatch m;
             if (std::regex_match(header, m, re_content_disposition)) {
-              file_.name = m[1];
-              file_.filename = m[2];
-            } else {
-              is_valid_ = false;
-              return false;
+              Params params;
+              parse_disposition_params(m[1], params);
+
+              auto it = params.find("name");
+              if (it != params.end()) {
+                file_.name = it->second;
+              } else {
+                is_valid_ = false;
+                return false;
+              }
+
+              it = params.find("filename");
+              if (it != params.end()) { file_.filename = it->second; }
+
+              it = params.find("filename*");
+              if (it != params.end()) {
+                // Only allow UTF-8 enconnding...
+                static const std::regex re_rfc5987_encoding(
+                    R"~(^UTF-8''(.+?)$)~", std::regex_constants::icase);
+
+                std::smatch m2;
+                if (std::regex_match(it->second, m2, re_rfc5987_encoding)) {
+                  file_.filename = decode_url(m2[1], false); // override...
+                } else {
+                  is_valid_ = false;
+                  return false;
+                }
+              }
             }
           }
           buf_erase(pos + crlf_.size());
@@ -4108,9 +5005,9 @@ public:
           buf_erase(crlf_.size());
           state_ = 1;
         } else {
-          if (dash_crlf_.size() > buf_size()) { return true; }
-          if (buf_start_with(dash_crlf_)) {
-            buf_erase(dash_crlf_.size());
+          if (dash_.size() > buf_size()) { return true; }
+          if (buf_start_with(dash_)) {
+            buf_erase(dash_.size());
             is_valid_ = true;
             buf_erase(buf_size()); // Remove epilogue
           } else {
@@ -4136,14 +5033,15 @@ private:
                               const std::string &b) const {
     if (a.size() < b.size()) { return false; }
     for (size_t i = 0; i < b.size(); i++) {
-      if (::tolower(a[i]) != ::tolower(b[i])) { return false; }
+      if (case_ignore::to_lower(a[i]) != case_ignore::to_lower(b[i])) {
+        return false;
+      }
     }
     return true;
   }
 
   const std::string dash_ = "--";
   const std::string crlf_ = "\r\n";
-  const std::string dash_crlf_ = "--\r\n";
   std::string boundary_;
   std::string dash_boundary_crlf_;
   std::string crlf_dash_boundary_;
@@ -4220,38 +5118,32 @@ private:
   size_t buf_epos_ = 0;
 };
 
-inline std::string to_lower(const char *beg, const char *end) {
-  std::string out;
-  auto it = beg;
-  while (it != end) {
-    out += static_cast<char>(::tolower(*it));
-    it++;
-  }
-  return out;
-}
-
-inline std::string make_multipart_data_boundary() {
+inline std::string random_string(size_t length) {
   static const char data[] =
       "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
   // std::random_device might actually be deterministic on some
   // platforms, but due to lack of support in the c++ standard library,
   // doing better requires either some ugly hacks or breaking portability.
-  std::random_device seed_gen;
+  static std::random_device seed_gen;
 
   // Request 128 bits of entropy for initialization
-  std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), seed_gen()};
-  std::mt19937 engine(seed_sequence);
+  static std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(),
+                                     seed_gen()};
 
-  std::string result = "--cpp-httplib-multipart-data-";
+  static std::mt19937 engine(seed_sequence);
 
-  for (auto i = 0; i < 16; i++) {
+  std::string result;
+  for (size_t i = 0; i < length; i++) {
     result += data[engine() % (sizeof(data) - 1)];
   }
-
   return result;
 }
 
+inline std::string make_multipart_data_boundary() {
+  return "--cpp-httplib-multipart-data-" + detail::random_string(16);
+}
+
 inline bool is_multipart_boundary_chars_valid(const std::string &boundary) {
   auto valid = true;
   for (size_t i = 0; i < boundary.size(); i++) {
@@ -4304,48 +5196,107 @@ serialize_multipart_formdata(const MultipartFormDataItems &items,
     body += item.content + serialize_multipart_formdata_item_end();
   }
 
-  if (finish) body += serialize_multipart_formdata_finish(boundary);
+  if (finish) { body += serialize_multipart_formdata_finish(boundary); }
 
   return body;
 }
 
+inline bool range_error(Request &req, Response &res) {
+  if (!req.ranges.empty() && 200 <= res.status && res.status < 300) {
+    ssize_t contant_len = static_cast<ssize_t>(
+        res.content_length_ ? res.content_length_ : res.body.size());
+
+    ssize_t prev_first_pos = -1;
+    ssize_t prev_last_pos = -1;
+    size_t overwrapping_count = 0;
+
+    // NOTE: The following Range check is based on '14.2. Range' in RFC 9110
+    // 'HTTP Semantics' to avoid potential denial-of-service attacks.
+    // https://www.rfc-editor.org/rfc/rfc9110#section-14.2
+
+    // Too many ranges
+    if (req.ranges.size() > CPPHTTPLIB_RANGE_MAX_COUNT) { return true; }
+
+    for (auto &r : req.ranges) {
+      auto &first_pos = r.first;
+      auto &last_pos = r.second;
+
+      if (first_pos == -1 && last_pos == -1) {
+        first_pos = 0;
+        last_pos = contant_len;
+      }
+
+      if (first_pos == -1) {
+        first_pos = contant_len - last_pos;
+        last_pos = contant_len - 1;
+      }
+
+      // NOTE: RFC-9110 '14.1.2. Byte Ranges':
+      // A client can limit the number of bytes requested without knowing the
+      // size of the selected representation. If the last-pos value is absent,
+      // or if the value is greater than or equal to the current length of the
+      // representation data, the byte range is interpreted as the remainder of
+      // the representation (i.e., the server replaces the value of last-pos
+      // with a value that is one less than the current length of the selected
+      // representation).
+      // https://www.rfc-editor.org/rfc/rfc9110.html#section-14.1.2-6
+      if (last_pos == -1 || last_pos >= contant_len) {
+        last_pos = contant_len - 1;
+      }
+
+      // Range must be within content length
+      if (!(0 <= first_pos && first_pos <= last_pos &&
+            last_pos <= contant_len - 1)) {
+        return true;
+      }
+
+      // Ranges must be in ascending order
+      if (first_pos <= prev_first_pos) { return true; }
+
+      // Request must not have more than two overlapping ranges
+      if (first_pos <= prev_last_pos) {
+        overwrapping_count++;
+        if (overwrapping_count > 2) { return true; }
+      }
+
+      prev_first_pos = (std::max)(prev_first_pos, first_pos);
+      prev_last_pos = (std::max)(prev_last_pos, last_pos);
+    }
+  }
+
+  return false;
+}
+
 inline std::pair<size_t, size_t>
-get_range_offset_and_length(const Request &req, size_t content_length,
-                            size_t index) {
-  auto r = req.ranges[index];
-
-  if (r.first == -1 && r.second == -1) {
-    return std::make_pair(0, content_length);
-  }
-
-  auto slen = static_cast<ssize_t>(content_length);
-
-  if (r.first == -1) {
-    r.first = (std::max)(static_cast<ssize_t>(0), slen - r.second);
-    r.second = slen - 1;
-  }
-
-  if (r.second == -1) { r.second = slen - 1; }
+get_range_offset_and_length(Range r, size_t content_length) {
+  assert(r.first != -1 && r.second != -1);
+  assert(0 <= r.first && r.first < static_cast<ssize_t>(content_length));
+  assert(r.first <= r.second &&
+         r.second < static_cast<ssize_t>(content_length));
+  (void)(content_length);
   return std::make_pair(r.first, static_cast<size_t>(r.second - r.first) + 1);
 }
 
-inline std::string make_content_range_header_field(size_t offset, size_t length,
-                                                   size_t content_length) {
+inline std::string make_content_range_header_field(
+    const std::pair<size_t, size_t> &offset_and_length, size_t content_length) {
+  auto st = offset_and_length.first;
+  auto ed = st + offset_and_length.second - 1;
+
   std::string field = "bytes ";
-  field += std::to_string(offset);
+  field += std::to_string(st);
   field += "-";
-  field += std::to_string(offset + length - 1);
+  field += std::to_string(ed);
   field += "/";
   field += std::to_string(content_length);
   return field;
 }
 
 template <typename SToken, typename CToken, typename Content>
-bool process_multipart_ranges_data(const Request &req, Response &res,
+bool process_multipart_ranges_data(const Request &req,
                                    const std::string &boundary,
                                    const std::string &content_type,
-                                   SToken stoken, CToken ctoken,
-                                   Content content) {
+                                   size_t content_length, SToken stoken,
+                                   CToken ctoken, Content content) {
   for (size_t i = 0; i < req.ranges.size(); i++) {
     ctoken("--");
     stoken(boundary);
@@ -4356,50 +5307,51 @@ bool process_multipart_ranges_data(const Request &req, Response &res,
       ctoken("\r\n");
     }
 
-    auto offsets = get_range_offset_and_length(req, res.body.size(), i);
-    auto offset = offsets.first;
-    auto length = offsets.second;
+    auto offset_and_length =
+        get_range_offset_and_length(req.ranges[i], content_length);
 
     ctoken("Content-Range: ");
-    stoken(make_content_range_header_field(offset, length, res.body.size()));
+    stoken(make_content_range_header_field(offset_and_length, content_length));
     ctoken("\r\n");
     ctoken("\r\n");
-    if (!content(offset, length)) { return false; }
+
+    if (!content(offset_and_length.first, offset_and_length.second)) {
+      return false;
+    }
     ctoken("\r\n");
   }
 
   ctoken("--");
   stoken(boundary);
-  ctoken("--\r\n");
+  ctoken("--");
 
   return true;
 }
 
-inline bool make_multipart_ranges_data(const Request &req, Response &res,
+inline void make_multipart_ranges_data(const Request &req, Response &res,
                                        const std::string &boundary,
                                        const std::string &content_type,
+                                       size_t content_length,
                                        std::string &data) {
-  return process_multipart_ranges_data(
-      req, res, boundary, content_type,
+  process_multipart_ranges_data(
+      req, boundary, content_type, content_length,
       [&](const std::string &token) { data += token; },
       [&](const std::string &token) { data += token; },
       [&](size_t offset, size_t length) {
-        if (offset < res.body.size()) {
-          data += res.body.substr(offset, length);
-          return true;
-        }
-        return false;
+        assert(offset + length <= content_length);
+        data += res.body.substr(offset, length);
+        return true;
       });
 }
 
-inline size_t
-get_multipart_ranges_data_length(const Request &req, Response &res,
-                                 const std::string &boundary,
-                                 const std::string &content_type) {
+inline size_t get_multipart_ranges_data_length(const Request &req,
+                                               const std::string &boundary,
+                                               const std::string &content_type,
+                                               size_t content_length) {
   size_t data_length = 0;
 
   process_multipart_ranges_data(
-      req, res, boundary, content_type,
+      req, boundary, content_type, content_length,
       [&](const std::string &token) { data_length += token.size(); },
       [&](const std::string &token) { data_length += token.size(); },
       [&](size_t /*offset*/, size_t length) {
@@ -4411,13 +5363,13 @@ get_multipart_ranges_data_length(const Request &req, Response &res,
 }
 
 template <typename T>
-inline bool write_multipart_ranges_data(Stream &strm, const Request &req,
-                                        Response &res,
-                                        const std::string &boundary,
-                                        const std::string &content_type,
-                                        const T &is_shutting_down) {
+inline bool
+write_multipart_ranges_data(Stream &strm, const Request &req, Response &res,
+                            const std::string &boundary,
+                            const std::string &content_type,
+                            size_t content_length, const T &is_shutting_down) {
   return process_multipart_ranges_data(
-      req, res, boundary, content_type,
+      req, boundary, content_type, content_length,
       [&](const std::string &token) { strm.write(token); },
       [&](const std::string &token) { strm.write(token); },
       [&](size_t offset, size_t length) {
@@ -4426,18 +5378,6 @@ inline bool write_multipart_ranges_data(Stream &strm, const Request &req,
       });
 }
 
-inline std::pair<size_t, size_t>
-get_range_offset_and_length(const Request &req, const Response &res,
-                            size_t index) {
-  auto r = req.ranges[index];
-
-  if (r.second == -1) {
-    r.second = static_cast<ssize_t>(res.content_length_) - 1;
-  }
-
-  return std::make_pair(r.first, r.second - r.first + 1);
-}
-
 inline bool expect_content(const Request &req) {
   if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH" ||
       req.method == "PRI" || req.method == "DELETE") {
@@ -4471,7 +5411,7 @@ inline std::string message_digest(const std::string &s, const EVP_MD *algo) {
   std::stringstream ss;
   for (auto i = 0u; i < hash_length; ++i) {
     ss << std::hex << std::setw(2) << std::setfill('0')
-       << (unsigned int)hash[i];
+       << static_cast<unsigned int>(hash[i]);
   }
 
   return ss.str();
@@ -4564,7 +5504,7 @@ inline bool retrieve_root_certs_from_keychain(CFObjectPtr<CFArrayRef> &certs) {
 
 inline bool add_certs_to_x509_store(CFArrayRef certs, X509_STORE *store) {
   auto result = false;
-  for (int i = 0; i < CFArrayGetCount(certs); ++i) {
+  for (auto i = 0; i < CFArrayGetCount(certs); ++i) {
     const auto cert = reinterpret_cast<const __SecCertificate *>(
         CFArrayGetValueAtIndex(certs, i));
 
@@ -4707,7 +5647,7 @@ inline bool parse_www_authenticate(const Response &res,
         s = s.substr(pos + 1);
         auto beg = std::sregex_iterator(s.begin(), s.end(), re);
         for (auto i = beg; i != std::sregex_iterator(); ++i) {
-          auto m = *i;
+          const auto &m = *i;
           auto key = s.substr(static_cast<size_t>(m.position(1)),
                               static_cast<size_t>(m.length(1)));
           auto val = m.length(2) > 0
@@ -4724,20 +5664,6 @@ inline bool parse_www_authenticate(const Response &res,
   return false;
 }
 
-// https://stackoverflow.com/questions/440133/how-do-i-create-a-random-alpha-numeric-string-in-c/440240#answer-440240
-inline std::string random_string(size_t length) {
-  auto randchar = []() -> char {
-    const char charset[] = "0123456789"
-                           "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                           "abcdefghijklmnopqrstuvwxyz";
-    const size_t max_index = (sizeof(charset) - 1);
-    return charset[static_cast<size_t>(std::rand()) % max_index];
-  };
-  std::string str(length, 0);
-  std::generate_n(str.begin(), length, randchar);
-  return str;
-}
-
 class ContentProviderAdapter {
 public:
   explicit ContentProviderAdapter(
@@ -4777,19 +5703,18 @@ inline void hosted_at(const std::string &hostname,
 #endif
     return;
   }
+  auto se = detail::scope_exit([&] { freeaddrinfo(result); });
 
   for (auto rp = result; rp; rp = rp->ai_next) {
     const auto &addr =
         *reinterpret_cast<struct sockaddr_storage *>(rp->ai_addr);
     std::string ip;
-    int dummy = -1;
+    auto dummy = -1;
     if (detail::get_ip_and_port(addr, sizeof(struct sockaddr_storage), ip,
                                 dummy)) {
       addrs.push_back(ip);
     }
   }
-
-  freeaddrinfo(result);
 }
 
 inline std::string append_query_params(const std::string &path,
@@ -4802,10 +5727,11 @@ inline std::string append_query_params(const std::string &path,
 }
 
 // Header utilities
-inline std::pair<std::string, std::string> make_range_header(Ranges ranges) {
+inline std::pair<std::string, std::string>
+make_range_header(const Ranges &ranges) {
   std::string field = "bytes=";
   auto i = 0;
-  for (auto r : ranges) {
+  for (const auto &r : ranges) {
     if (i != 0) { field += ", "; }
     if (r.first != -1) { field += std::to_string(r.first); }
     field += '-';
@@ -4837,8 +5763,8 @@ inline bool Request::has_header(const std::string &key) const {
 }
 
 inline std::string Request::get_header_value(const std::string &key,
-                                             size_t id) const {
-  return detail::get_header_value(headers, key, id, "");
+                                             const char *def, size_t id) const {
+  return detail::get_header_value(headers, key, def, id);
 }
 
 inline size_t Request::get_header_value_count(const std::string &key) const {
@@ -4848,7 +5774,8 @@ inline size_t Request::get_header_value_count(const std::string &key) const {
 
 inline void Request::set_header(const std::string &key,
                                 const std::string &val) {
-  if (!detail::has_crlf(key) && !detail::has_crlf(val)) {
+  if (detail::fields::is_field_name(key) &&
+      detail::fields::is_field_value(val)) {
     headers.emplace(key, val);
   }
 }
@@ -4902,8 +5829,9 @@ inline bool Response::has_header(const std::string &key) const {
 }
 
 inline std::string Response::get_header_value(const std::string &key,
+                                              const char *def,
                                               size_t id) const {
-  return detail::get_header_value(headers, key, id, "");
+  return detail::get_header_value(headers, key, def, id);
 }
 
 inline size_t Response::get_header_value_count(const std::string &key) const {
@@ -4913,18 +5841,19 @@ inline size_t Response::get_header_value_count(const std::string &key) const {
 
 inline void Response::set_header(const std::string &key,
                                  const std::string &val) {
-  if (!detail::has_crlf(key) && !detail::has_crlf(val)) {
+  if (detail::fields::is_field_name(key) &&
+      detail::fields::is_field_value(val)) {
     headers.emplace(key, val);
   }
 }
 
 inline void Response::set_redirect(const std::string &url, int stat) {
-  if (!detail::has_crlf(url)) {
+  if (detail::fields::is_field_value(url)) {
     set_header("Location", url);
     if (300 <= stat && stat < 400) {
       this->status = stat;
     } else {
-      this->status = 302;
+      this->status = StatusCode::Found_302;
     }
   }
 }
@@ -4943,13 +5872,22 @@ inline void Response::set_content(const std::string &s,
   set_content(s.data(), s.size(), content_type);
 }
 
+inline void Response::set_content(std::string &&s,
+                                  const std::string &content_type) {
+  body = std::move(s);
+
+  auto rng = headers.equal_range("Content-Type");
+  headers.erase(rng.first, rng.second);
+  set_header("Content-Type", content_type);
+}
+
 inline void Response::set_content_provider(
     size_t in_length, const std::string &content_type, ContentProvider provider,
     ContentProviderResourceReleaser resource_releaser) {
   set_header("Content-Type", content_type);
   content_length_ = in_length;
   if (in_length > 0) { content_provider_ = std::move(provider); }
-  content_provider_resource_releaser_ = resource_releaser;
+  content_provider_resource_releaser_ = std::move(resource_releaser);
   is_chunked_content_provider_ = false;
 }
 
@@ -4959,7 +5897,7 @@ inline void Response::set_content_provider(
   set_header("Content-Type", content_type);
   content_length_ = 0;
   content_provider_ = detail::ContentProviderAdapter(std::move(provider));
-  content_provider_resource_releaser_ = resource_releaser;
+  content_provider_resource_releaser_ = std::move(resource_releaser);
   is_chunked_content_provider_ = false;
 }
 
@@ -4969,18 +5907,29 @@ inline void Response::set_chunked_content_provider(
   set_header("Content-Type", content_type);
   content_length_ = 0;
   content_provider_ = detail::ContentProviderAdapter(std::move(provider));
-  content_provider_resource_releaser_ = resource_releaser;
+  content_provider_resource_releaser_ = std::move(resource_releaser);
   is_chunked_content_provider_ = true;
 }
 
+inline void Response::set_file_content(const std::string &path,
+                                       const std::string &content_type) {
+  file_content_path_ = path;
+  file_content_content_type_ = content_type;
+}
+
+inline void Response::set_file_content(const std::string &path) {
+  file_content_path_ = path;
+}
+
 // Result implementation
 inline bool Result::has_request_header(const std::string &key) const {
   return request_headers_.find(key) != request_headers_.end();
 }
 
 inline std::string Result::get_request_header_value(const std::string &key,
+                                                    const char *def,
                                                     size_t id) const {
-  return detail::get_header_value(request_headers_, key, id, "");
+  return detail::get_header_value(request_headers_, key, def, id);
 }
 
 inline size_t
@@ -5010,7 +5959,7 @@ inline SocketStream::SocketStream(socket_t sock, time_t read_timeout_sec,
       write_timeout_sec_(write_timeout_sec),
       write_timeout_usec_(write_timeout_usec), read_buff_(read_buff_size_, 0) {}
 
-inline SocketStream::~SocketStream() {}
+inline SocketStream::~SocketStream() = default;
 
 inline bool SocketStream::is_readable() const {
   return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
@@ -5120,6 +6069,102 @@ inline socket_t BufferStream::socket() const { return 0; }
 
 inline const std::string &BufferStream::get_buffer() const { return buffer; }
 
+inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) {
+  static constexpr char marker[] = "/:";
+
+  // One past the last ending position of a path param substring
+  std::size_t last_param_end = 0;
+
+#ifndef CPPHTTPLIB_NO_EXCEPTIONS
+  // Needed to ensure that parameter names are unique during matcher
+  // construction
+  // If exceptions are disabled, only last duplicate path
+  // parameter will be set
+  std::unordered_set<std::string> param_name_set;
+#endif
+
+  while (true) {
+    const auto marker_pos = pattern.find(
+        marker, last_param_end == 0 ? last_param_end : last_param_end - 1);
+    if (marker_pos == std::string::npos) { break; }
+
+    static_fragments_.push_back(
+        pattern.substr(last_param_end, marker_pos - last_param_end + 1));
+
+    const auto param_name_start = marker_pos + 2;
+
+    auto sep_pos = pattern.find(separator, param_name_start);
+    if (sep_pos == std::string::npos) { sep_pos = pattern.length(); }
+
+    auto param_name =
+        pattern.substr(param_name_start, sep_pos - param_name_start);
+
+#ifndef CPPHTTPLIB_NO_EXCEPTIONS
+    if (param_name_set.find(param_name) != param_name_set.cend()) {
+      std::string msg = "Encountered path parameter '" + param_name +
+                        "' multiple times in route pattern '" + pattern + "'.";
+      throw std::invalid_argument(msg);
+    }
+#endif
+
+    param_names_.push_back(std::move(param_name));
+
+    last_param_end = sep_pos + 1;
+  }
+
+  if (last_param_end < pattern.length()) {
+    static_fragments_.push_back(pattern.substr(last_param_end));
+  }
+}
+
+inline bool PathParamsMatcher::match(Request &request) const {
+  request.matches = std::smatch();
+  request.path_params.clear();
+  request.path_params.reserve(param_names_.size());
+
+  // One past the position at which the path matched the pattern last time
+  std::size_t starting_pos = 0;
+  for (size_t i = 0; i < static_fragments_.size(); ++i) {
+    const auto &fragment = static_fragments_[i];
+
+    if (starting_pos + fragment.length() > request.path.length()) {
+      return false;
+    }
+
+    // Avoid unnecessary allocation by using strncmp instead of substr +
+    // comparison
+    if (std::strncmp(request.path.c_str() + starting_pos, fragment.c_str(),
+                     fragment.length()) != 0) {
+      return false;
+    }
+
+    starting_pos += fragment.length();
+
+    // Should only happen when we have a static fragment after a param
+    // Example: '/users/:id/subscriptions'
+    // The 'subscriptions' fragment here does not have a corresponding param
+    if (i >= param_names_.size()) { continue; }
+
+    auto sep_pos = request.path.find(separator, starting_pos);
+    if (sep_pos == std::string::npos) { sep_pos = request.path.length(); }
+
+    const auto &param_name = param_names_[i];
+
+    request.path_params.emplace(
+        param_name, request.path.substr(starting_pos, sep_pos - starting_pos));
+
+    // Mark everything up to '/' as matched
+    starting_pos = sep_pos + 1;
+  }
+  // Returns false if the path is longer than the pattern
+  return starting_pos >= request.path.length();
+}
+
+inline bool RegexMatcher::match(Request &request) const {
+  request.path_params.clear();
+  return std::regex_match(request.path, request.matches, regex_);
+}
+
 } // namespace detail
 
 // HTTP server implementation
@@ -5131,69 +6176,72 @@ inline Server::Server()
 #endif
 }
 
-inline Server::~Server() {}
+inline Server::~Server() = default;
+
+inline std::unique_ptr<detail::MatcherBase>
+Server::make_matcher(const std::string &pattern) {
+  if (pattern.find("/:") != std::string::npos) {
+    return detail::make_unique<detail::PathParamsMatcher>(pattern);
+  } else {
+    return detail::make_unique<detail::RegexMatcher>(pattern);
+  }
+}
 
 inline Server &Server::Get(const std::string &pattern, Handler handler) {
-  get_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  get_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
   return *this;
 }
 
 inline Server &Server::Post(const std::string &pattern, Handler handler) {
-  post_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  post_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
   return *this;
 }
 
 inline Server &Server::Post(const std::string &pattern,
                             HandlerWithContentReader handler) {
-  post_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  post_handlers_for_content_reader_.emplace_back(make_matcher(pattern),
+                                                 std::move(handler));
   return *this;
 }
 
 inline Server &Server::Put(const std::string &pattern, Handler handler) {
-  put_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  put_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
   return *this;
 }
 
 inline Server &Server::Put(const std::string &pattern,
                            HandlerWithContentReader handler) {
-  put_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  put_handlers_for_content_reader_.emplace_back(make_matcher(pattern),
+                                                std::move(handler));
   return *this;
 }
 
 inline Server &Server::Patch(const std::string &pattern, Handler handler) {
-  patch_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  patch_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
   return *this;
 }
 
 inline Server &Server::Patch(const std::string &pattern,
                              HandlerWithContentReader handler) {
-  patch_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  patch_handlers_for_content_reader_.emplace_back(make_matcher(pattern),
+                                                  std::move(handler));
   return *this;
 }
 
 inline Server &Server::Delete(const std::string &pattern, Handler handler) {
-  delete_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  delete_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
   return *this;
 }
 
 inline Server &Server::Delete(const std::string &pattern,
                               HandlerWithContentReader handler) {
-  delete_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  delete_handlers_for_content_reader_.emplace_back(make_matcher(pattern),
+                                                   std::move(handler));
   return *this;
 }
 
 inline Server &Server::Options(const std::string &pattern, Handler handler) {
-  options_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
+  options_handlers_.emplace_back(make_matcher(pattern), std::move(handler));
   return *this;
 }
 
@@ -5204,7 +6252,8 @@ inline bool Server::set_base_dir(const std::string &dir,
 
 inline bool Server::set_mount_point(const std::string &mount_point,
                                     const std::string &dir, Headers headers) {
-  if (detail::is_dir(dir)) {
+  detail::FileStat stat(dir);
+  if (stat.is_dir()) {
     std::string mnt = !mount_point.empty() ? mount_point : "/";
     if (!mnt.empty() && mnt[0] == '/') {
       base_dirs_.push_back({mnt, dir, std::move(headers)});
@@ -5231,17 +6280,24 @@ Server::set_file_extension_and_mimetype_mapping(const std::string &ext,
   return *this;
 }
 
+inline Server &Server::set_default_file_mimetype(const std::string &mime) {
+  default_file_mimetype_ = mime;
+  return *this;
+}
+
 inline Server &Server::set_file_request_handler(Handler handler) {
   file_request_handler_ = std::move(handler);
   return *this;
 }
 
-inline Server &Server::set_error_handler(HandlerWithResponse handler) {
+inline Server &Server::set_error_handler_core(HandlerWithResponse handler,
+                                              std::true_type) {
   error_handler_ = std::move(handler);
   return *this;
 }
 
-inline Server &Server::set_error_handler(Handler handler) {
+inline Server &Server::set_error_handler_core(Handler handler,
+                                              std::false_type) {
   error_handler_ = [handler](const Request &req, Response &res) {
     handler(req, res);
     return HandlerResponse::Handled;
@@ -5272,7 +6328,6 @@ inline Server &Server::set_logger(Logger logger) {
 inline Server &
 Server::set_expect_100_continue_handler(Expect100ContinueHandler handler) {
   expect_100_continue_handler_ = std::move(handler);
-
   return *this;
 }
 
@@ -5286,6 +6341,11 @@ inline Server &Server::set_tcp_nodelay(bool on) {
   return *this;
 }
 
+inline Server &Server::set_ipv6_v6only(bool on) {
+  ipv6_v6only_ = on;
+  return *this;
+}
+
 inline Server &Server::set_socket_options(SocketOptions socket_options) {
   socket_options_ = std::move(socket_options);
   return *this;
@@ -5296,6 +6356,12 @@ inline Server &Server::set_default_headers(Headers headers) {
   return *this;
 }
 
+inline Server &Server::set_header_writer(
+    std::function<ssize_t(Stream &, Headers &)> const &writer) {
+  header_writer_ = writer;
+  return *this;
+}
+
 inline Server &Server::set_keep_alive_max_count(size_t count) {
   keep_alive_max_count_ = count;
   return *this;
@@ -5331,28 +6397,27 @@ inline Server &Server::set_payload_max_length(size_t length) {
 
 inline bool Server::bind_to_port(const std::string &host, int port,
                                  int socket_flags) {
-  if (bind_internal(host, port, socket_flags) < 0) return false;
-  return true;
+  auto ret = bind_internal(host, port, socket_flags);
+  if (ret == -1) { is_decommisioned = true; }
+  return ret >= 0;
 }
 inline int Server::bind_to_any_port(const std::string &host, int socket_flags) {
-  return bind_internal(host, 0, socket_flags);
+  auto ret = bind_internal(host, 0, socket_flags);
+  if (ret == -1) { is_decommisioned = true; }
+  return ret;
 }
 
-inline bool Server::listen_after_bind() {
-  auto se = detail::scope_exit([&]() { done_ = true; });
-  return listen_internal();
-}
+inline bool Server::listen_after_bind() { return listen_internal(); }
 
 inline bool Server::listen(const std::string &host, int port,
                            int socket_flags) {
-  auto se = detail::scope_exit([&]() { done_ = true; });
   return bind_to_port(host, port, socket_flags) && listen_internal();
 }
 
 inline bool Server::is_running() const { return is_running_; }
 
 inline void Server::wait_until_ready() const {
-  while (!is_running() && !done_) {
+  while (!is_running_ && !is_decommisioned) {
     std::this_thread::sleep_for(std::chrono::milliseconds{1});
   }
 }
@@ -5364,9 +6429,12 @@ inline void Server::stop() {
     detail::shutdown_socket(sock);
     detail::close_socket(sock);
   }
+  is_decommisioned = false;
 }
 
-inline bool Server::parse_request_line(const char *s, Request &req) {
+inline void Server::decommission() { is_decommisioned = true; }
+
+inline bool Server::parse_request_line(const char *s, Request &req) const {
   auto len = strlen(s);
   if (len < 2 || s[len - 2] != '\r' || s[len - 1] != '\n') { return false; }
   len -= 2;
@@ -5404,33 +6472,23 @@ inline bool Server::parse_request_line(const char *s, Request &req) {
       }
     }
 
-    size_t count = 0;
-
-    detail::split(req.target.data(), req.target.data() + req.target.size(), '?',
-                  [&](const char *b, const char *e) {
-                    switch (count) {
-                    case 0:
-                      req.path = detail::decode_url(std::string(b, e), false);
-                      break;
-                    case 1: {
-                      if (e - b > 0) {
-                        detail::parse_query_text(std::string(b, e), req.params);
-                      }
-                      break;
-                    }
-                    default: break;
-                    }
-                    count++;
-                  });
-
-    if (count > 2) { return false; }
+    detail::divide(req.target, '?',
+                   [&](const char *lhs_data, std::size_t lhs_size,
+                       const char *rhs_data, std::size_t rhs_size) {
+                     req.path = detail::decode_url(
+                         std::string(lhs_data, lhs_size), false);
+                     detail::parse_query_text(rhs_data, rhs_size, req.params);
+                   });
   }
 
   return true;
 }
 
 inline bool Server::write_response(Stream &strm, bool close_connection,
-                                   const Request &req, Response &res) {
+                                   Request &req, Response &res) {
+  // NOTE: `req.ranges` should be empty, otherwise it will be applied
+  // incorrectly to the error content.
+  req.ranges.clear();
   return write_response_core(strm, close_connection, req, res, false);
 }
 
@@ -5459,23 +6517,24 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection,
   if (close_connection || req.get_header_value("Connection") == "close") {
     res.set_header("Connection", "close");
   } else {
-    std::stringstream ss;
-    ss << "timeout=" << keep_alive_timeout_sec_
-       << ", max=" << keep_alive_max_count_;
-    res.set_header("Keep-Alive", ss.str());
+    std::string s = "timeout=";
+    s += std::to_string(keep_alive_timeout_sec_);
+    s += ", max=";
+    s += std::to_string(keep_alive_max_count_);
+    res.set_header("Keep-Alive", s);
   }
 
-  if (!res.has_header("Content-Type") &&
-      (!res.body.empty() || res.content_length_ > 0 || res.content_provider_)) {
+  if ((!res.body.empty() || res.content_length_ > 0 || res.content_provider_) &&
+      !res.has_header("Content-Type")) {
     res.set_header("Content-Type", "text/plain");
   }
 
-  if (!res.has_header("Content-Length") && res.body.empty() &&
-      !res.content_length_ && !res.content_provider_) {
+  if (res.body.empty() && !res.content_length_ && !res.content_provider_ &&
+      !res.has_header("Content-Length")) {
     res.set_header("Content-Length", "0");
   }
 
-  if (!res.has_header("Accept-Ranges") && req.method == "HEAD") {
+  if (req.method == "HEAD" && !res.has_header("Accept-Ranges")) {
     res.set_header("Accept-Ranges", "bytes");
   }
 
@@ -5484,13 +6543,8 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection,
   // Response line and headers
   {
     detail::BufferStream bstrm;
-
-    if (!bstrm.write_format("HTTP/1.1 %d %s\r\n", res.status,
-                            detail::status_message(res.status))) {
-      return false;
-    }
-
-    if (!detail::write_headers(bstrm, res.headers)) { return false; }
+    if (!detail::write_response_line(bstrm, res.status)) { return false; }
+    if (!header_writer_(bstrm, res.headers)) { return false; }
 
     // Flush buffer
     auto &data = bstrm.get_buffer();
@@ -5508,7 +6562,6 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection,
       if (write_content_with_provider(strm, req, res, boundary, content_type)) {
         res.content_provider_success_ = true;
       } else {
-        res.content_provider_success_ = false;
         ret = false;
       }
     }
@@ -5533,15 +6586,16 @@ Server::write_content_with_provider(Stream &strm, const Request &req,
       return detail::write_content(strm, res.content_provider_, 0,
                                    res.content_length_, is_shutting_down);
     } else if (req.ranges.size() == 1) {
-      auto offsets =
-          detail::get_range_offset_and_length(req, res.content_length_, 0);
-      auto offset = offsets.first;
-      auto length = offsets.second;
-      return detail::write_content(strm, res.content_provider_, offset, length,
-                                   is_shutting_down);
+      auto offset_and_length = detail::get_range_offset_and_length(
+          req.ranges[0], res.content_length_);
+
+      return detail::write_content(strm, res.content_provider_,
+                                   offset_and_length.first,
+                                   offset_and_length.second, is_shutting_down);
     } else {
       return detail::write_multipart_ranges_data(
-          strm, req, res, boundary, content_type, is_shutting_down);
+          strm, req, res, boundary, content_type, res.content_length_,
+          is_shutting_down);
     }
   } else {
     if (res.is_chunked_content_provider_) {
@@ -5598,7 +6652,7 @@ inline bool Server::read_content(Stream &strm, Request &req, Response &res) {
     const auto &content_type = req.get_header_value("Content-Type");
     if (!content_type.find("application/x-www-form-urlencoded")) {
       if (req.body.size() > CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH) {
-        res.status = 413; // NOTE: should be 414?
+        res.status = StatusCode::PayloadTooLarge_413; // NOTE: should be 414?
         return false;
       }
       detail::parse_query_text(req.body, req.params);
@@ -5617,10 +6671,11 @@ inline bool Server::read_content_with_content_receiver(
                            std::move(multipart_receiver));
 }
 
-inline bool Server::read_content_core(Stream &strm, Request &req, Response &res,
-                                      ContentReceiver receiver,
-                                      MultipartContentHeader multipart_header,
-                                      ContentReceiver multipart_receiver) {
+inline bool
+Server::read_content_core(Stream &strm, Request &req, Response &res,
+                          ContentReceiver receiver,
+                          MultipartContentHeader multipart_header,
+                          ContentReceiver multipart_receiver) const {
   detail::MultipartFormDataParser multipart_form_data_parser;
   ContentReceiverWithProgress out;
 
@@ -5628,7 +6683,7 @@ inline bool Server::read_content_core(Stream &strm, Request &req, Response &res,
     const auto &content_type = req.get_header_value("Content-Type");
     std::string boundary;
     if (!detail::parse_multipart_boundary(content_type, boundary)) {
-      res.status = 400;
+      res.status = StatusCode::BadRequest_400;
       return false;
     }
 
@@ -5664,7 +6719,7 @@ inline bool Server::read_content_core(Stream &strm, Request &req, Response &res,
 
   if (req.is_multipart_form_data()) {
     if (!multipart_form_data_parser.is_valid()) {
-      res.status = 400;
+      res.status = StatusCode::BadRequest_400;
       return false;
     }
   }
@@ -5682,18 +6737,34 @@ inline bool Server::handle_file_request(const Request &req, Response &res,
         auto path = entry.base_dir + sub_path;
         if (path.back() == '/') { path += "index.html"; }
 
-        if (detail::is_file(path)) {
-          detail::read_file(path, res.body);
-          auto type =
-              detail::find_content_type(path, file_extension_and_mimetype_map_);
-          if (type) { res.set_header("Content-Type", type); }
+        detail::FileStat stat(path);
+
+        if (stat.is_dir()) {
+          res.set_redirect(sub_path + "/", StatusCode::MovedPermanently_301);
+          return true;
+        }
+
+        if (stat.is_file()) {
           for (const auto &kv : entry.headers) {
-            res.set_header(kv.first.c_str(), kv.second);
+            res.set_header(kv.first, kv.second);
           }
-          res.status = req.has_header("Range") ? 206 : 200;
+
+          auto mm = std::make_shared<detail::mmap>(path.c_str());
+          if (!mm->is_open()) { return false; }
+
+          res.set_content_provider(
+              mm->size(),
+              detail::find_content_type(path, file_extension_and_mimetype_map_,
+                                        default_file_mimetype_),
+              [mm](size_t offset, size_t length, DataSink &sink) -> bool {
+                sink.write(mm->data() + offset, length);
+                return true;
+              });
+
           if (!head && file_request_handler_) {
             file_request_handler_(req, res);
           }
+
           return true;
         }
       }
@@ -5708,8 +6779,8 @@ Server::create_server_socket(const std::string &host, int port,
                              SocketOptions socket_options) const {
   return detail::create_socket(
       host, std::string(), port, address_family_, socket_flags, tcp_nodelay_,
-      std::move(socket_options),
-      [](socket_t sock, struct addrinfo &ai) -> bool {
+      ipv6_v6only_, std::move(socket_options),
+      [](socket_t sock, struct addrinfo &ai, bool & /*quit*/) -> bool {
         if (::bind(sock, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen))) {
           return false;
         }
@@ -5720,6 +6791,8 @@ Server::create_server_socket(const std::string &host, int port,
 
 inline int Server::bind_internal(const std::string &host, int port,
                                  int socket_flags) {
+  if (is_decommisioned) { return -1; }
+
   if (!is_valid()) { return -1; }
 
   svr_sock_ = create_server_socket(host, port, socket_flags, socket_options_);
@@ -5745,6 +6818,8 @@ inline int Server::bind_internal(const std::string &host, int port,
 }
 
 inline bool Server::listen_internal() {
+  if (is_decommisioned) { return false; }
+
   auto ret = true;
   is_running_ = true;
   auto se = detail::scope_exit([&]() { is_running_ = false; });
@@ -5765,13 +6840,22 @@ inline bool Server::listen_internal() {
 #ifndef _WIN32
       }
 #endif
+
+#if defined _WIN32
+      // sockets conneced via WASAccept inherit flags NO_HANDLE_INHERIT,
+      // OVERLAPPED
+      socket_t sock = WSAAccept(svr_sock_, nullptr, nullptr, nullptr, 0);
+#elif defined SOCK_CLOEXEC
+      socket_t sock = accept4(svr_sock_, nullptr, nullptr, SOCK_CLOEXEC);
+#else
       socket_t sock = accept(svr_sock_, nullptr, nullptr);
+#endif
 
       if (sock == INVALID_SOCKET) {
         if (errno == EMFILE) {
           // The per-process limit of open file descriptors has been reached.
           // Try to accept new connections after a short sleep.
-          std::this_thread::sleep_for(std::chrono::milliseconds(1));
+          std::this_thread::sleep_for(std::chrono::microseconds{1});
           continue;
         } else if (errno == EINTR || errno == EAGAIN) {
           continue;
@@ -5789,13 +6873,14 @@ inline bool Server::listen_internal() {
 #ifdef _WIN32
         auto timeout = static_cast<uint32_t>(read_timeout_sec_ * 1000 +
                                              read_timeout_usec_ / 1000);
-        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
-                   sizeof(timeout));
+        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                   reinterpret_cast<const char *>(&timeout), sizeof(timeout));
 #else
         timeval tv;
         tv.tv_sec = static_cast<long>(read_timeout_sec_);
         tv.tv_usec = static_cast<decltype(tv.tv_usec)>(read_timeout_usec_);
-        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));
+        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                   reinterpret_cast<const void *>(&tv), sizeof(tv));
 #endif
       }
       {
@@ -5803,22 +6888,28 @@ inline bool Server::listen_internal() {
 #ifdef _WIN32
         auto timeout = static_cast<uint32_t>(write_timeout_sec_ * 1000 +
                                              write_timeout_usec_ / 1000);
-        setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout,
-                   sizeof(timeout));
+        setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+                   reinterpret_cast<const char *>(&timeout), sizeof(timeout));
 #else
         timeval tv;
         tv.tv_sec = static_cast<long>(write_timeout_sec_);
         tv.tv_usec = static_cast<decltype(tv.tv_usec)>(write_timeout_usec_);
-        setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));
+        setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+                   reinterpret_cast<const void *>(&tv), sizeof(tv));
 #endif
       }
 
-      task_queue->enqueue([this, sock]() { process_and_close_socket(sock); });
+      if (!task_queue->enqueue(
+              [this, sock]() { process_and_close_socket(sock); })) {
+        detail::shutdown_socket(sock);
+        detail::close_socket(sock);
+      }
     }
 
     task_queue->shutdown();
   }
 
+  is_decommisioned = !ret;
   return ret;
 }
 
@@ -5829,7 +6920,7 @@ inline bool Server::routing(Request &req, Response &res, Stream &strm) {
   }
 
   // File handler
-  bool is_head_request = req.method == "HEAD";
+  auto is_head_request = req.method == "HEAD";
   if ((req.method == "GET" || is_head_request) &&
       handle_file_request(req, res, is_head_request)) {
     return true;
@@ -5895,17 +6986,17 @@ inline bool Server::routing(Request &req, Response &res, Stream &strm) {
     return dispatch_request(req, res, patch_handlers_);
   }
 
-  res.status = 400;
+  res.status = StatusCode::BadRequest_400;
   return false;
 }
 
 inline bool Server::dispatch_request(Request &req, Response &res,
-                                     const Handlers &handlers) {
+                                     const Handlers &handlers) const {
   for (const auto &x : handlers) {
-    const auto &pattern = x.first;
+    const auto &matcher = x.first;
     const auto &handler = x.second;
 
-    if (std::regex_match(req.path, req.matches, pattern)) {
+    if (matcher->match(req)) {
       handler(req, res);
       return true;
     }
@@ -5915,18 +7006,18 @@ inline bool Server::dispatch_request(Request &req, Response &res,
 
 inline void Server::apply_ranges(const Request &req, Response &res,
                                  std::string &content_type,
-                                 std::string &boundary) {
-  if (req.ranges.size() > 1) {
-    boundary = detail::make_multipart_data_boundary();
-
+                                 std::string &boundary) const {
+  if (req.ranges.size() > 1 && res.status == StatusCode::PartialContent_206) {
     auto it = res.headers.find("Content-Type");
     if (it != res.headers.end()) {
       content_type = it->second;
       res.headers.erase(it);
     }
 
-    res.headers.emplace("Content-Type",
-                        "multipart/byteranges; boundary=" + boundary);
+    boundary = detail::make_multipart_data_boundary();
+
+    res.set_header("Content-Type",
+                   "multipart/byteranges; boundary=" + boundary);
   }
 
   auto type = detail::encoding_type(req, res);
@@ -5934,19 +7025,20 @@ inline void Server::apply_ranges(const Request &req, Response &res,
   if (res.body.empty()) {
     if (res.content_length_ > 0) {
       size_t length = 0;
-      if (req.ranges.empty()) {
+      if (req.ranges.empty() || res.status != StatusCode::PartialContent_206) {
         length = res.content_length_;
       } else if (req.ranges.size() == 1) {
-        auto offsets =
-            detail::get_range_offset_and_length(req, res.content_length_, 0);
-        auto offset = offsets.first;
-        length = offsets.second;
+        auto offset_and_length = detail::get_range_offset_and_length(
+            req.ranges[0], res.content_length_);
+
+        length = offset_and_length.second;
+
         auto content_range = detail::make_content_range_header_field(
-            offset, length, res.content_length_);
+            offset_and_length, res.content_length_);
         res.set_header("Content-Range", content_range);
       } else {
-        length = detail::get_multipart_ranges_data_length(req, res, boundary,
-                                                          content_type);
+        length = detail::get_multipart_ranges_data_length(
+            req, boundary, content_type, res.content_length_);
       }
       res.set_header("Content-Length", std::to_string(length));
     } else {
@@ -5962,31 +7054,25 @@ inline void Server::apply_ranges(const Request &req, Response &res,
       }
     }
   } else {
-    if (req.ranges.empty()) {
+    if (req.ranges.empty() || res.status != StatusCode::PartialContent_206) {
       ;
     } else if (req.ranges.size() == 1) {
-      auto offsets =
-          detail::get_range_offset_and_length(req, res.body.size(), 0);
-      auto offset = offsets.first;
-      auto length = offsets.second;
+      auto offset_and_length =
+          detail::get_range_offset_and_length(req.ranges[0], res.body.size());
+      auto offset = offset_and_length.first;
+      auto length = offset_and_length.second;
+
       auto content_range = detail::make_content_range_header_field(
-          offset, length, res.body.size());
+          offset_and_length, res.body.size());
       res.set_header("Content-Range", content_range);
-      if (offset < res.body.size()) {
-        res.body = res.body.substr(offset, length);
-      } else {
-        res.body.clear();
-        res.status = 416;
-      }
+
+      assert(offset + length <= res.body.size());
+      res.body = res.body.substr(offset, length);
     } else {
       std::string data;
-      if (detail::make_multipart_ranges_data(req, res, boundary, content_type,
-                                             data)) {
-        res.body.swap(data);
-      } else {
-        res.body.clear();
-        res.status = 416;
-      }
+      detail::make_multipart_ranges_data(req, res, boundary, content_type,
+                                         res.body.size(), data);
+      res.body.swap(data);
     }
 
     if (type != detail::EncodingType::None) {
@@ -6025,12 +7111,12 @@ inline void Server::apply_ranges(const Request &req, Response &res,
 
 inline bool Server::dispatch_request_for_content_reader(
     Request &req, Response &res, ContentReader content_reader,
-    const HandlersForContentReader &handlers) {
+    const HandlersForContentReader &handlers) const {
   for (const auto &x : handlers) {
-    const auto &pattern = x.first;
+    const auto &matcher = x.first;
     const auto &handler = x.second;
 
-    if (std::regex_match(req.path, req.matches, pattern)) {
+    if (matcher->match(req)) {
       handler(req, res, content_reader);
       return true;
     }
@@ -6039,7 +7125,9 @@ inline bool Server::dispatch_request_for_content_reader(
 }
 
 inline bool
-Server::process_request(Stream &strm, bool close_connection,
+Server::process_request(Stream &strm, const std::string &remote_addr,
+                        int remote_port, const std::string &local_addr,
+                        int local_port, bool close_connection,
                         bool &connection_closed,
                         const std::function<void(Request &)> &setup_request) {
   std::array<char, 2048> buf{};
@@ -6050,15 +7138,10 @@ Server::process_request(Stream &strm, bool close_connection,
   if (!line_reader.getline()) { return false; }
 
   Request req;
+
   Response res;
-
   res.version = "HTTP/1.1";
-
-  for (const auto &header : default_headers_) {
-    if (res.headers.find(header.first) == res.headers.end()) {
-      res.headers.insert(header);
-    }
-  }
+  res.headers = default_headers_;
 
 #ifdef _WIN32
   // TODO: Increase FD_SETSIZE statically (libzmq), dynamically (MySQL).
@@ -6068,7 +7151,7 @@ Server::process_request(Stream &strm, bool close_connection,
   if (strm.socket() >= FD_SETSIZE) {
     Headers dummy;
     detail::read_headers(strm, dummy);
-    res.status = 500;
+    res.status = StatusCode::InternalServerError_500;
     return write_response(strm, close_connection, req, res);
   }
 #endif
@@ -6078,14 +7161,14 @@ Server::process_request(Stream &strm, bool close_connection,
   if (line_reader.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
     Headers dummy;
     detail::read_headers(strm, dummy);
-    res.status = 414;
+    res.status = StatusCode::UriTooLong_414;
     return write_response(strm, close_connection, req, res);
   }
 
   // Request line and headers
   if (!parse_request_line(line_reader.ptr(), req) ||
       !detail::read_headers(strm, req.headers)) {
-    res.status = 400;
+    res.status = StatusCode::BadRequest_400;
     return write_response(strm, close_connection, req, res);
   }
 
@@ -6098,18 +7181,20 @@ Server::process_request(Stream &strm, bool close_connection,
     connection_closed = true;
   }
 
-  strm.get_remote_ip_and_port(req.remote_addr, req.remote_port);
+  req.remote_addr = remote_addr;
+  req.remote_port = remote_port;
   req.set_header("REMOTE_ADDR", req.remote_addr);
   req.set_header("REMOTE_PORT", std::to_string(req.remote_port));
 
-  strm.get_local_ip_and_port(req.local_addr, req.local_port);
+  req.local_addr = local_addr;
+  req.local_port = local_port;
   req.set_header("LOCAL_ADDR", req.local_addr);
   req.set_header("LOCAL_PORT", std::to_string(req.local_port));
 
   if (req.has_header("Range")) {
     const auto &range_header_value = req.get_header_value("Range");
     if (!detail::parse_range_header(range_header_value, req.ranges)) {
-      res.status = 416;
+      res.status = StatusCode::RangeNotSatisfiable_416;
       return write_response(strm, close_connection, req, res);
     }
   }
@@ -6117,22 +7202,29 @@ Server::process_request(Stream &strm, bool close_connection,
   if (setup_request) { setup_request(req); }
 
   if (req.get_header_value("Expect") == "100-continue") {
-    auto status = 100;
+    int status = StatusCode::Continue_100;
     if (expect_100_continue_handler_) {
       status = expect_100_continue_handler_(req, res);
     }
     switch (status) {
-    case 100:
-    case 417:
-      strm.write_format("HTTP/1.1 %d %s\r\n\r\n", status,
-                        detail::status_message(status));
+    case StatusCode::Continue_100:
+    case StatusCode::ExpectationFailed_417:
+      detail::write_response_line(strm, status);
+      strm.write("\r\n");
       break;
-    default: return write_response(strm, close_connection, req, res);
+    default:
+      connection_closed = true;
+      return write_response(strm, true, req, res);
     }
   }
 
-  // Rounting
-  bool routed = false;
+  // Setup `is_connection_closed` method
+  req.is_connection_closed = [&]() {
+    return !detail::is_socket_alive(strm.socket());
+  };
+
+  // Routing
+  auto routed = false;
 #ifdef CPPHTTPLIB_NO_EXCEPTIONS
   routed = routing(req, res, strm);
 #else
@@ -6144,7 +7236,7 @@ Server::process_request(Stream &strm, bool close_connection,
       exception_handler_(req, res, ep);
       routed = true;
     } else {
-      res.status = 500;
+      res.status = StatusCode::InternalServerError_500;
       std::string val;
       auto s = e.what();
       for (size_t i = 0; s[i]; i++) {
@@ -6162,17 +7254,55 @@ Server::process_request(Stream &strm, bool close_connection,
       exception_handler_(req, res, ep);
       routed = true;
     } else {
-      res.status = 500;
+      res.status = StatusCode::InternalServerError_500;
       res.set_header("EXCEPTION_WHAT", "UNKNOWN");
     }
   }
 #endif
-
   if (routed) {
-    if (res.status == -1) { res.status = req.ranges.empty() ? 200 : 206; }
+    if (res.status == -1) {
+      res.status = req.ranges.empty() ? StatusCode::OK_200
+                                      : StatusCode::PartialContent_206;
+    }
+
+    // Serve file content by using a content provider
+    if (!res.file_content_path_.empty()) {
+      const auto &path = res.file_content_path_;
+      auto mm = std::make_shared<detail::mmap>(path.c_str());
+      if (!mm->is_open()) {
+        res.body.clear();
+        res.content_length_ = 0;
+        res.content_provider_ = nullptr;
+        res.status = StatusCode::NotFound_404;
+        return write_response(strm, close_connection, req, res);
+      }
+
+      auto content_type = res.file_content_content_type_;
+      if (content_type.empty()) {
+        content_type = detail::find_content_type(
+            path, file_extension_and_mimetype_map_, default_file_mimetype_);
+      }
+
+      res.set_content_provider(
+          mm->size(), content_type,
+          [mm](size_t offset, size_t length, DataSink &sink) -> bool {
+            sink.write(mm->data() + offset, length);
+            return true;
+          });
+    }
+
+    if (detail::range_error(req, res)) {
+      res.body.clear();
+      res.content_length_ = 0;
+      res.content_provider_ = nullptr;
+      res.status = StatusCode::RangeNotSatisfiable_416;
+      return write_response(strm, close_connection, req, res);
+    }
+
     return write_response_with_content(strm, close_connection, req, res);
   } else {
-    if (res.status == -1) { res.status = 404; }
+    if (res.status == -1) { res.status = StatusCode::NotFound_404; }
+
     return write_response(strm, close_connection, req, res);
   }
 }
@@ -6180,12 +7310,21 @@ Server::process_request(Stream &strm, bool close_connection,
 inline bool Server::is_valid() const { return true; }
 
 inline bool Server::process_and_close_socket(socket_t sock) {
+  std::string remote_addr;
+  int remote_port = 0;
+  detail::get_remote_ip_and_port(sock, remote_addr, remote_port);
+
+  std::string local_addr;
+  int local_port = 0;
+  detail::get_local_ip_and_port(sock, local_addr, local_port);
+
   auto ret = detail::process_server_socket(
       svr_sock_, sock, keep_alive_max_count_, keep_alive_timeout_sec_,
       read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
       write_timeout_usec_,
-      [this](Stream &strm, bool close_connection, bool &connection_closed) {
-        return process_request(strm, close_connection, connection_closed,
+      [&](Stream &strm, bool close_connection, bool &connection_closed) {
+        return process_request(strm, remote_addr, remote_port, local_addr,
+                               local_port, close_connection, connection_closed,
                                nullptr);
       });
 
@@ -6204,8 +7343,8 @@ inline ClientImpl::ClientImpl(const std::string &host, int port)
 inline ClientImpl::ClientImpl(const std::string &host, int port,
                               const std::string &client_cert_path,
                               const std::string &client_key_path)
-    : host_(host), port_(port),
-      host_and_port_(adjust_host_string(host) + ":" + std::to_string(port)),
+    : host_(detail::escape_abstract_namespace_unix_domain(host)), port_(port),
+      host_and_port_(adjust_host_string(host_) + ":" + std::to_string(port)),
       client_cert_path_(client_cert_path), client_key_path_(client_key_path) {}
 
 inline ClientImpl::~ClientImpl() {
@@ -6236,6 +7375,7 @@ inline void ClientImpl::copy_settings(const ClientImpl &rhs) {
   url_encode_ = rhs.url_encode_;
   address_family_ = rhs.address_family_;
   tcp_nodelay_ = rhs.tcp_nodelay_;
+  ipv6_v6only_ = rhs.ipv6_v6only_;
   socket_options_ = rhs.socket_options_;
   compress_ = rhs.compress_;
   decompress_ = rhs.decompress_;
@@ -6256,6 +7396,8 @@ inline void ClientImpl::copy_settings(const ClientImpl &rhs) {
 #endif
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
   server_certificate_verification_ = rhs.server_certificate_verification_;
+  server_hostname_verification_ = rhs.server_hostname_verification_;
+  server_certificate_verifier_ = rhs.server_certificate_verifier_;
 #endif
   logger_ = rhs.logger_;
 }
@@ -6264,21 +7406,21 @@ inline socket_t ClientImpl::create_client_socket(Error &error) const {
   if (!proxy_host_.empty() && proxy_port_ != -1) {
     return detail::create_client_socket(
         proxy_host_, std::string(), proxy_port_, address_family_, tcp_nodelay_,
-        socket_options_, connection_timeout_sec_, connection_timeout_usec_,
-        read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-        write_timeout_usec_, interface_, error);
+        ipv6_v6only_, socket_options_, connection_timeout_sec_,
+        connection_timeout_usec_, read_timeout_sec_, read_timeout_usec_,
+        write_timeout_sec_, write_timeout_usec_, interface_, error);
   }
 
   // Check is custom IP specified for host_
   std::string ip;
   auto it = addr_map_.find(host_);
-  if (it != addr_map_.end()) ip = it->second;
+  if (it != addr_map_.end()) { ip = it->second; }
 
   return detail::create_client_socket(
-      host_, ip, port_, address_family_, tcp_nodelay_, socket_options_,
-      connection_timeout_sec_, connection_timeout_usec_, read_timeout_sec_,
-      read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, interface_,
-      error);
+      host_, ip, port_, address_family_, tcp_nodelay_, ipv6_v6only_,
+      socket_options_, connection_timeout_sec_, connection_timeout_usec_,
+      read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
+      write_timeout_usec_, interface_, error);
 }
 
 inline bool ClientImpl::create_and_connect_socket(Socket &socket,
@@ -6297,7 +7439,7 @@ inline void ClientImpl::shutdown_ssl(Socket & /*socket*/,
          socket_requests_are_from_thread_ == std::this_thread::get_id());
 }
 
-inline void ClientImpl::shutdown_socket(Socket &socket) {
+inline void ClientImpl::shutdown_socket(Socket &socket) const {
   if (socket.sock == INVALID_SOCKET) { return; }
   detail::shutdown_socket(socket.sock);
 }
@@ -6322,7 +7464,7 @@ inline void ClientImpl::close_socket(Socket &socket) {
 }
 
 inline bool ClientImpl::read_response_line(Stream &strm, const Request &req,
-                                           Response &res) {
+                                           Response &res) const {
   std::array<char, 2048> buf{};
 
   detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
@@ -6330,9 +7472,9 @@ inline bool ClientImpl::read_response_line(Stream &strm, const Request &req,
   if (!line_reader.getline()) { return false; }
 
 #ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-  const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r\n");
-#else
   const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r?\n");
+#else
+  const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r\n");
 #endif
 
   std::cmatch m;
@@ -6344,7 +7486,7 @@ inline bool ClientImpl::read_response_line(Stream &strm, const Request &req,
   res.reason = std::string(m[3]);
 
   // Ignore '100 Continue'
-  while (res.status == 100) {
+  while (res.status == StatusCode::Continue_100) {
     if (!line_reader.getline()) { return false; } // CRLF
     if (!line_reader.getline()) { return false; } // next response line
 
@@ -6367,6 +7509,18 @@ inline bool ClientImpl::send(Request &req, Response &res, Error &error) {
   return ret;
 }
 
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+inline bool ClientImpl::is_ssl_peer_could_be_closed(SSL *ssl) const {
+  detail::set_nonblocking(socket_.sock, true);
+  auto se = detail::scope_exit(
+      [&]() { detail::set_nonblocking(socket_.sock, false); });
+
+  char buf[1];
+  return !SSL_peek(ssl, buf, 1) &&
+         SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN;
+}
+#endif
+
 inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
   {
     std::lock_guard<std::mutex> guard(socket_mutex_);
@@ -6378,6 +7532,13 @@ inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
     auto is_alive = false;
     if (socket_.is_open()) {
       is_alive = detail::is_socket_alive(socket_.sock);
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+      if (is_alive && is_ssl()) {
+        if (is_ssl_peer_could_be_closed(socket_.ssl)) { is_alive = false; }
+      }
+#endif
+
       if (!is_alive) {
         // Attempt to avoid sigpipe by shutting down nongracefully if it seems
         // like the other side has already closed the connection Also, there
@@ -6492,15 +7653,31 @@ inline bool ClientImpl::handle_request(Stream &strm, Request &req,
 
   if (!ret) { return false; }
 
+  if (res.get_header_value("Connection") == "close" ||
+      (res.version == "HTTP/1.0" && res.reason != "Connection established")) {
+    // TODO this requires a not-entirely-obvious chain of calls to be correct
+    // for this to be safe.
+
+    // This is safe to call because handle_request is only called by send_
+    // which locks the request mutex during the process. It would be a bug
+    // to call it from a different thread since it's a thread-safety issue
+    // to do these things to the socket if another thread is using the socket.
+    std::lock_guard<std::mutex> guard(socket_mutex_);
+    shutdown_ssl(socket_, true);
+    shutdown_socket(socket_);
+    close_socket(socket_);
+  }
+
   if (300 < res.status && res.status < 400 && follow_location_) {
     req = req_save;
     ret = redirect(req, res, error);
   }
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  if ((res.status == 401 || res.status == 407) &&
+  if ((res.status == StatusCode::Unauthorized_401 ||
+       res.status == StatusCode::ProxyAuthenticationRequired_407) &&
       req.authorization_count_ < 5) {
-    auto is_proxy = res.status == 407;
+    auto is_proxy = res.status == StatusCode::ProxyAuthenticationRequired_407;
     const auto &username =
         is_proxy ? proxy_digest_auth_username_ : digest_auth_username_;
     const auto &password =
@@ -6539,7 +7716,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
   if (location.empty()) { return false; }
 
   const static std::regex re(
-      R"((?:(https?):)?(?://(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)");
+      R"((?:(https?):)?(?://(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)");
 
   std::smatch m;
   if (!std::regex_match(location, m, re)) { return false; }
@@ -6571,7 +7748,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
   } else {
     if (next_scheme == "https") {
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-      SSLClient cli(next_host.c_str(), next_port);
+      SSLClient cli(next_host, next_port);
       cli.copy_settings(*this);
       if (ca_cert_store_) { cli.set_ca_cert_store(ca_cert_store_); }
       return detail::redirect(cli, req, res, path, location, error);
@@ -6579,7 +7756,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
       return false;
 #endif
     } else {
-      ClientImpl cli(next_host.c_str(), next_port);
+      ClientImpl cli(next_host, next_port);
       cli.copy_settings(*this);
       return detail::redirect(cli, req, res, path, location, error);
     }
@@ -6588,7 +7765,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
 
 inline bool ClientImpl::write_content_with_provider(Stream &strm,
                                                     const Request &req,
-                                                    Error &error) {
+                                                    Error &error) const {
   auto is_shutting_down = []() { return false; };
 
   if (req.is_chunked_content_provider_) {
@@ -6616,57 +7793,71 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req,
   // Prepare additional headers
   if (close_connection) {
     if (!req.has_header("Connection")) {
-      req.headers.emplace("Connection", "close");
+      req.set_header("Connection", "close");
     }
   }
 
   if (!req.has_header("Host")) {
     if (is_ssl()) {
       if (port_ == 443) {
-        req.headers.emplace("Host", host_);
+        req.set_header("Host", host_);
       } else {
-        req.headers.emplace("Host", host_and_port_);
+        req.set_header("Host", host_and_port_);
       }
     } else {
       if (port_ == 80) {
-        req.headers.emplace("Host", host_);
+        req.set_header("Host", host_);
       } else {
-        req.headers.emplace("Host", host_and_port_);
+        req.set_header("Host", host_and_port_);
       }
     }
   }
 
-  if (!req.has_header("Accept")) { req.headers.emplace("Accept", "*/*"); }
+  if (!req.has_header("Accept")) { req.set_header("Accept", "*/*"); }
+
+  if (!req.content_receiver) {
+    if (!req.has_header("Accept-Encoding")) {
+      std::string accept_encoding;
+#ifdef CPPHTTPLIB_BROTLI_SUPPORT
+      accept_encoding = "br";
+#endif
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+      if (!accept_encoding.empty()) { accept_encoding += ", "; }
+      accept_encoding += "gzip, deflate";
+#endif
+      req.set_header("Accept-Encoding", accept_encoding);
+    }
 
 #ifndef CPPHTTPLIB_NO_DEFAULT_USER_AGENT
-  if (!req.has_header("User-Agent")) {
-    auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION;
-    req.headers.emplace("User-Agent", agent);
-  }
+    if (!req.has_header("User-Agent")) {
+      auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION;
+      req.set_header("User-Agent", agent);
+    }
 #endif
+  };
 
   if (req.body.empty()) {
     if (req.content_provider_) {
       if (!req.is_chunked_content_provider_) {
         if (!req.has_header("Content-Length")) {
           auto length = std::to_string(req.content_length_);
-          req.headers.emplace("Content-Length", length);
+          req.set_header("Content-Length", length);
         }
       }
     } else {
       if (req.method == "POST" || req.method == "PUT" ||
           req.method == "PATCH") {
-        req.headers.emplace("Content-Length", "0");
+        req.set_header("Content-Length", "0");
       }
     }
   } else {
     if (!req.has_header("Content-Type")) {
-      req.headers.emplace("Content-Type", "text/plain");
+      req.set_header("Content-Type", "text/plain");
     }
 
     if (!req.has_header("Content-Length")) {
       auto length = std::to_string(req.body.size());
-      req.headers.emplace("Content-Length", length);
+      req.set_header("Content-Length", length);
     }
   }
 
@@ -6703,10 +7894,16 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req,
   {
     detail::BufferStream bstrm;
 
-    const auto &path = url_encode_ ? detail::encode_url(req.path) : req.path;
-    bstrm.write_format("%s %s HTTP/1.1\r\n", req.method.c_str(), path.c_str());
+    const auto &path_with_query =
+        req.params.empty() ? req.path
+                           : append_query_params(req.path, req.params);
 
-    detail::write_headers(bstrm, req.headers);
+    const auto &path =
+        url_encode_ ? detail::encode_url(path_with_query) : path_with_query;
+
+    detail::write_request_line(bstrm, req.method, path);
+
+    header_writer_(bstrm, req.headers);
 
     // Flush buffer
     auto &data = bstrm.get_buffer();
@@ -6734,12 +7931,10 @@ inline std::unique_ptr<Response> ClientImpl::send_with_content_provider(
     ContentProvider content_provider,
     ContentProviderWithoutLength content_provider_without_length,
     const std::string &content_type, Error &error) {
-  if (!content_type.empty()) {
-    req.headers.emplace("Content-Type", content_type);
-  }
+  if (!content_type.empty()) { req.set_header("Content-Type", content_type); }
 
 #ifdef CPPHTTPLIB_ZLIB_SUPPORT
-  if (compress_) { req.headers.emplace("Content-Encoding", "gzip"); }
+  if (compress_) { req.set_header("Content-Encoding", "gzip"); }
 #endif
 
 #ifdef CPPHTTPLIB_ZLIB_SUPPORT
@@ -6800,10 +7995,9 @@ inline std::unique_ptr<Response> ClientImpl::send_with_content_provider(
       req.content_provider_ = detail::ContentProviderAdapter(
           std::move(content_provider_without_length));
       req.is_chunked_content_provider_ = true;
-      req.headers.emplace("Transfer-Encoding", "chunked");
+      req.set_header("Transfer-Encoding", "chunked");
     } else {
       req.body.assign(body, content_length);
-      ;
     }
   }
 
@@ -6815,11 +8009,12 @@ inline Result ClientImpl::send_with_content_provider(
     const std::string &method, const std::string &path, const Headers &headers,
     const char *body, size_t content_length, ContentProvider content_provider,
     ContentProviderWithoutLength content_provider_without_length,
-    const std::string &content_type) {
+    const std::string &content_type, Progress progress) {
   Request req;
   req.method = method;
   req.headers = headers;
   req.path = path;
+  req.progress = progress;
 
   auto error = Error::Success;
 
@@ -6846,9 +8041,7 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req,
   if (is_ssl()) {
     auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1;
     if (!is_proxy_enabled) {
-      char buf[1];
-      if (SSL_peek(socket_.ssl, buf, 1) == 0 &&
-          SSL_get_error(socket_.ssl, 0) == SSL_ERROR_ZERO_RETURN) {
+      if (is_ssl_peer_could_be_closed(socket_.ssl)) {
         error = Error::SSLPeerCouldBeClosed_;
         return false;
       }
@@ -6864,8 +8057,11 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req,
   }
 
   // Body
-  if ((res.status != 204) && req.method != "HEAD" && req.method != "CONNECT") {
-    auto redirect = 300 < res.status && res.status < 400 && follow_location_;
+  if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" &&
+      req.method != "CONNECT") {
+    auto redirect = 300 < res.status && res.status < 400 &&
+                    res.status != StatusCode::NotModified_304 &&
+                    follow_location_;
 
     if (req.response_handler && !redirect) {
       if (!req.response_handler(res)) {
@@ -6886,9 +8082,7 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req,
             : static_cast<ContentReceiverWithProgress>(
                   [&](const char *buf, size_t n, uint64_t /*off*/,
                       uint64_t /*len*/) {
-                    if (res.body.size() + n > res.body.max_size()) {
-                      return false;
-                    }
+                    assert(res.body.size() + n <= res.body.max_size());
                     res.body.append(buf, n);
                     return true;
                   });
@@ -6900,31 +8094,26 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req,
       return ret;
     };
 
-    int dummy_status;
-    if (!detail::read_content(strm, res, (std::numeric_limits<size_t>::max)(),
-                              dummy_status, std::move(progress), std::move(out),
-                              decompress_)) {
-      if (error != Error::Canceled) { error = Error::Read; }
-      return false;
+    if (res.has_header("Content-Length")) {
+      if (!req.content_receiver) {
+        auto len = res.get_header_value_u64("Content-Length");
+        if (len > res.body.max_size()) {
+          error = Error::Read;
+          return false;
+        }
+        res.body.reserve(static_cast<size_t>(len));
+      }
     }
-  }
 
-  if (res.get_header_value("Connection") == "close" ||
-      (res.version == "HTTP/1.0" && res.reason != "Connection established")) {
-    // TODO this requires a not-entirely-obvious chain of calls to be correct
-    // for this to be safe. Maybe a code refactor (such as moving this out to
-    // the send function and getting rid of the recursiveness of the mutex)
-    // could make this more obvious.
-
-    // This is safe to call because process_request is only called by
-    // handle_request which is only called by send, which locks the request
-    // mutex during the process. It would be a bug to call it from a different
-    // thread since it's a thread-safety issue to do these things to the socket
-    // if another thread is using the socket.
-    std::lock_guard<std::mutex> guard(socket_mutex_);
-    shutdown_ssl(socket_, true);
-    shutdown_socket(socket_);
-    close_socket(socket_);
+    if (res.status != StatusCode::NotModified_304) {
+      int dummy_status;
+      if (!detail::read_content(strm, res, (std::numeric_limits<size_t>::max)(),
+                                dummy_status, std::move(progress),
+                                std::move(out), decompress_)) {
+        if (error != Error::Canceled) { error = Error::Read; }
+        return false;
+      }
+    }
   }
 
   // Log
@@ -6935,13 +8124,14 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req,
 
 inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider(
     const std::string &boundary, const MultipartFormDataItems &items,
-    const MultipartFormDataProviderItems &provider_items) {
-  size_t cur_item = 0, cur_start = 0;
+    const MultipartFormDataProviderItems &provider_items) const {
+  size_t cur_item = 0;
+  size_t cur_start = 0;
   // cur_item and cur_start are copied to within the std::function and maintain
   // state between successive calls
   return [&, cur_item, cur_start](size_t offset,
                                   DataSink &sink) mutable -> bool {
-    if (!offset && items.size()) {
+    if (!offset && !items.empty()) {
       sink.os << detail::serialize_multipart_formdata(items, boundary, false);
       return true;
     } else if (cur_item < provider_items.size()) {
@@ -6954,12 +8144,13 @@ inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider(
       }
 
       DataSink cur_sink;
-      bool has_data = true;
+      auto has_data = true;
       cur_sink.write = sink.write;
       cur_sink.done = [&]() { has_data = false; };
 
-      if (!provider_items[cur_item].provider(offset - cur_start, cur_sink))
+      if (!provider_items[cur_item].provider(offset - cur_start, cur_sink)) {
         return false;
+      }
 
       if (!has_data) {
         sink.os << detail::serialize_multipart_formdata_item_end();
@@ -7078,14 +8269,15 @@ inline Result ClientImpl::Get(const std::string &path, const Params &params,
   if (params.empty()) { return Get(path, headers); }
 
   std::string path_with_query = append_query_params(path, params);
-  return Get(path_with_query.c_str(), headers, progress);
+  return Get(path_with_query, headers, std::move(progress));
 }
 
 inline Result ClientImpl::Get(const std::string &path, const Params &params,
                               const Headers &headers,
                               ContentReceiver content_receiver,
                               Progress progress) {
-  return Get(path, params, headers, nullptr, content_receiver, progress);
+  return Get(path, params, headers, nullptr, std::move(content_receiver),
+             std::move(progress));
 }
 
 inline Result ClientImpl::Get(const std::string &path, const Params &params,
@@ -7094,12 +8286,13 @@ inline Result ClientImpl::Get(const std::string &path, const Params &params,
                               ContentReceiver content_receiver,
                               Progress progress) {
   if (params.empty()) {
-    return Get(path, headers, response_handler, content_receiver, progress);
+    return Get(path, headers, std::move(response_handler),
+               std::move(content_receiver), std::move(progress));
   }
 
   std::string path_with_query = append_query_params(path, params);
-  return Get(path_with_query.c_str(), headers, response_handler,
-             content_receiver, progress);
+  return Get(path_with_query, headers, std::move(response_handler),
+             std::move(content_receiver), std::move(progress));
 }
 
 inline Result ClientImpl::Head(const std::string &path) {
@@ -7128,14 +8321,22 @@ inline Result ClientImpl::Post(const std::string &path,
 inline Result ClientImpl::Post(const std::string &path, const char *body,
                                size_t content_length,
                                const std::string &content_type) {
-  return Post(path, Headers(), body, content_length, content_type);
+  return Post(path, Headers(), body, content_length, content_type, nullptr);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
                                const char *body, size_t content_length,
                                const std::string &content_type) {
   return send_with_content_provider("POST", path, headers, body, content_length,
-                                    nullptr, nullptr, content_type);
+                                    nullptr, nullptr, content_type, nullptr);
+}
+
+inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
+                               const char *body, size_t content_length,
+                               const std::string &content_type,
+                               Progress progress) {
+  return send_with_content_provider("POST", path, headers, body, content_length,
+                                    nullptr, nullptr, content_type, progress);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const std::string &body,
@@ -7143,12 +8344,27 @@ inline Result ClientImpl::Post(const std::string &path, const std::string &body,
   return Post(path, Headers(), body, content_type);
 }
 
+inline Result ClientImpl::Post(const std::string &path, const std::string &body,
+                               const std::string &content_type,
+                               Progress progress) {
+  return Post(path, Headers(), body, content_type, progress);
+}
+
 inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
                                const std::string &body,
                                const std::string &content_type) {
   return send_with_content_provider("POST", path, headers, body.data(),
-                                    body.size(), nullptr, nullptr,
-                                    content_type);
+                                    body.size(), nullptr, nullptr, content_type,
+                                    nullptr);
+}
+
+inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
+                               const std::string &body,
+                               const std::string &content_type,
+                               Progress progress) {
+  return send_with_content_provider("POST", path, headers, body.data(),
+                                    body.size(), nullptr, nullptr, content_type,
+                                    progress);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const Params &params) {
@@ -7174,14 +8390,15 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
                                const std::string &content_type) {
   return send_with_content_provider("POST", path, headers, nullptr,
                                     content_length, std::move(content_provider),
-                                    nullptr, content_type);
+                                    nullptr, content_type, nullptr);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
                                ContentProviderWithoutLength content_provider,
                                const std::string &content_type) {
   return send_with_content_provider("POST", path, headers, nullptr, 0, nullptr,
-                                    std::move(content_provider), content_type);
+                                    std::move(content_provider), content_type,
+                                    nullptr);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
@@ -7190,6 +8407,13 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
   return Post(path, headers, query, "application/x-www-form-urlencoded");
 }
 
+inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
+                               const Params &params, Progress progress) {
+  auto query = detail::params_to_query_str(params);
+  return Post(path, headers, query, "application/x-www-form-urlencoded",
+              progress);
+}
+
 inline Result ClientImpl::Post(const std::string &path,
                                const MultipartFormDataItems &items) {
   return Post(path, Headers(), items);
@@ -7201,7 +8425,7 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
   const auto &content_type =
       detail::serialize_multipart_formdata_get_content_type(boundary);
   const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Post(path, headers, body, content_type.c_str());
+  return Post(path, headers, body, content_type);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
@@ -7214,7 +8438,7 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
   const auto &content_type =
       detail::serialize_multipart_formdata_get_content_type(boundary);
   const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Post(path, headers, body, content_type.c_str());
+  return Post(path, headers, body, content_type);
 }
 
 inline Result
@@ -7227,7 +8451,7 @@ ClientImpl::Post(const std::string &path, const Headers &headers,
   return send_with_content_provider(
       "POST", path, headers, nullptr, 0, nullptr,
       get_multipart_content_provider(boundary, items, provider_items),
-      content_type);
+      content_type, nullptr);
 }
 
 inline Result ClientImpl::Put(const std::string &path) {
@@ -7244,7 +8468,15 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
                               const char *body, size_t content_length,
                               const std::string &content_type) {
   return send_with_content_provider("PUT", path, headers, body, content_length,
-                                    nullptr, nullptr, content_type);
+                                    nullptr, nullptr, content_type, nullptr);
+}
+
+inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
+                              const char *body, size_t content_length,
+                              const std::string &content_type,
+                              Progress progress) {
+  return send_with_content_provider("PUT", path, headers, body, content_length,
+                                    nullptr, nullptr, content_type, progress);
 }
 
 inline Result ClientImpl::Put(const std::string &path, const std::string &body,
@@ -7252,12 +8484,27 @@ inline Result ClientImpl::Put(const std::string &path, const std::string &body,
   return Put(path, Headers(), body, content_type);
 }
 
+inline Result ClientImpl::Put(const std::string &path, const std::string &body,
+                              const std::string &content_type,
+                              Progress progress) {
+  return Put(path, Headers(), body, content_type, progress);
+}
+
 inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
                               const std::string &body,
                               const std::string &content_type) {
   return send_with_content_provider("PUT", path, headers, body.data(),
-                                    body.size(), nullptr, nullptr,
-                                    content_type);
+                                    body.size(), nullptr, nullptr, content_type,
+                                    nullptr);
+}
+
+inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
+                              const std::string &body,
+                              const std::string &content_type,
+                              Progress progress) {
+  return send_with_content_provider("PUT", path, headers, body.data(),
+                                    body.size(), nullptr, nullptr, content_type,
+                                    progress);
 }
 
 inline Result ClientImpl::Put(const std::string &path, size_t content_length,
@@ -7279,14 +8526,15 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
                               const std::string &content_type) {
   return send_with_content_provider("PUT", path, headers, nullptr,
                                     content_length, std::move(content_provider),
-                                    nullptr, content_type);
+                                    nullptr, content_type, nullptr);
 }
 
 inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
                               ContentProviderWithoutLength content_provider,
                               const std::string &content_type) {
   return send_with_content_provider("PUT", path, headers, nullptr, 0, nullptr,
-                                    std::move(content_provider), content_type);
+                                    std::move(content_provider), content_type,
+                                    nullptr);
 }
 
 inline Result ClientImpl::Put(const std::string &path, const Params &params) {
@@ -7299,6 +8547,13 @@ inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
   return Put(path, headers, query, "application/x-www-form-urlencoded");
 }
 
+inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
+                              const Params &params, Progress progress) {
+  auto query = detail::params_to_query_str(params);
+  return Put(path, headers, query, "application/x-www-form-urlencoded",
+             progress);
+}
+
 inline Result ClientImpl::Put(const std::string &path,
                               const MultipartFormDataItems &items) {
   return Put(path, Headers(), items);
@@ -7336,7 +8591,7 @@ ClientImpl::Put(const std::string &path, const Headers &headers,
   return send_with_content_provider(
       "PUT", path, headers, nullptr, 0, nullptr,
       get_multipart_content_provider(boundary, items, provider_items),
-      content_type);
+      content_type, nullptr);
 }
 inline Result ClientImpl::Patch(const std::string &path) {
   return Patch(path, std::string(), std::string());
@@ -7348,12 +8603,26 @@ inline Result ClientImpl::Patch(const std::string &path, const char *body,
   return Patch(path, Headers(), body, content_length, content_type);
 }
 
+inline Result ClientImpl::Patch(const std::string &path, const char *body,
+                                size_t content_length,
+                                const std::string &content_type,
+                                Progress progress) {
+  return Patch(path, Headers(), body, content_length, content_type, progress);
+}
+
 inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
                                 const char *body, size_t content_length,
                                 const std::string &content_type) {
+  return Patch(path, headers, body, content_length, content_type, nullptr);
+}
+
+inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
+                                const char *body, size_t content_length,
+                                const std::string &content_type,
+                                Progress progress) {
   return send_with_content_provider("PATCH", path, headers, body,
                                     content_length, nullptr, nullptr,
-                                    content_type);
+                                    content_type, progress);
 }
 
 inline Result ClientImpl::Patch(const std::string &path,
@@ -7362,12 +8631,26 @@ inline Result ClientImpl::Patch(const std::string &path,
   return Patch(path, Headers(), body, content_type);
 }
 
+inline Result ClientImpl::Patch(const std::string &path,
+                                const std::string &body,
+                                const std::string &content_type,
+                                Progress progress) {
+  return Patch(path, Headers(), body, content_type, progress);
+}
+
 inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
                                 const std::string &body,
                                 const std::string &content_type) {
+  return Patch(path, headers, body, content_type, nullptr);
+}
+
+inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
+                                const std::string &body,
+                                const std::string &content_type,
+                                Progress progress) {
   return send_with_content_provider("PATCH", path, headers, body.data(),
-                                    body.size(), nullptr, nullptr,
-                                    content_type);
+                                    body.size(), nullptr, nullptr, content_type,
+                                    progress);
 }
 
 inline Result ClientImpl::Patch(const std::string &path, size_t content_length,
@@ -7389,14 +8672,15 @@ inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
                                 const std::string &content_type) {
   return send_with_content_provider("PATCH", path, headers, nullptr,
                                     content_length, std::move(content_provider),
-                                    nullptr, content_type);
+                                    nullptr, content_type, nullptr);
 }
 
 inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
                                 ContentProviderWithoutLength content_provider,
                                 const std::string &content_type) {
   return send_with_content_provider("PATCH", path, headers, nullptr, 0, nullptr,
-                                    std::move(content_provider), content_type);
+                                    std::move(content_provider), content_type,
+                                    nullptr);
 }
 
 inline Result ClientImpl::Delete(const std::string &path) {
@@ -7414,18 +8698,32 @@ inline Result ClientImpl::Delete(const std::string &path, const char *body,
   return Delete(path, Headers(), body, content_length, content_type);
 }
 
+inline Result ClientImpl::Delete(const std::string &path, const char *body,
+                                 size_t content_length,
+                                 const std::string &content_type,
+                                 Progress progress) {
+  return Delete(path, Headers(), body, content_length, content_type, progress);
+}
+
 inline Result ClientImpl::Delete(const std::string &path,
                                  const Headers &headers, const char *body,
                                  size_t content_length,
                                  const std::string &content_type) {
+  return Delete(path, headers, body, content_length, content_type, nullptr);
+}
+
+inline Result ClientImpl::Delete(const std::string &path,
+                                 const Headers &headers, const char *body,
+                                 size_t content_length,
+                                 const std::string &content_type,
+                                 Progress progress) {
   Request req;
   req.method = "DELETE";
   req.headers = headers;
   req.path = path;
+  req.progress = progress;
 
-  if (!content_type.empty()) {
-    req.headers.emplace("Content-Type", content_type);
-  }
+  if (!content_type.empty()) { req.set_header("Content-Type", content_type); }
   req.body.assign(body, content_length);
 
   return send_(std::move(req));
@@ -7437,6 +8735,14 @@ inline Result ClientImpl::Delete(const std::string &path,
   return Delete(path, Headers(), body.data(), body.size(), content_type);
 }
 
+inline Result ClientImpl::Delete(const std::string &path,
+                                 const std::string &body,
+                                 const std::string &content_type,
+                                 Progress progress) {
+  return Delete(path, Headers(), body.data(), body.size(), content_type,
+                progress);
+}
+
 inline Result ClientImpl::Delete(const std::string &path,
                                  const Headers &headers,
                                  const std::string &body,
@@ -7444,6 +8750,15 @@ inline Result ClientImpl::Delete(const std::string &path,
   return Delete(path, headers, body.data(), body.size(), content_type);
 }
 
+inline Result ClientImpl::Delete(const std::string &path,
+                                 const Headers &headers,
+                                 const std::string &body,
+                                 const std::string &content_type,
+                                 Progress progress) {
+  return Delete(path, headers, body.data(), body.size(), content_type,
+                progress);
+}
+
 inline Result ClientImpl::Options(const std::string &path) {
   return Options(path, Headers());
 }
@@ -7458,13 +8773,6 @@ inline Result ClientImpl::Options(const std::string &path,
   return send_(std::move(req));
 }
 
-inline size_t ClientImpl::is_socket_open() const {
-  std::lock_guard<std::mutex> guard(socket_mutex_);
-  return socket_.is_open();
-}
-
-inline socket_t ClientImpl::socket() const { return socket_.sock; }
-
 inline void ClientImpl::stop() {
   std::lock_guard<std::mutex> guard(socket_mutex_);
 
@@ -7488,6 +8796,17 @@ inline void ClientImpl::stop() {
   close_socket(socket_);
 }
 
+inline std::string ClientImpl::host() const { return host_; }
+
+inline int ClientImpl::port() const { return port_; }
+
+inline size_t ClientImpl::is_socket_open() const {
+  std::lock_guard<std::mutex> guard(socket_mutex_);
+  return socket_.is_open();
+}
+
+inline socket_t ClientImpl::socket() const { return socket_.sock; }
+
 inline void ClientImpl::set_connection_timeout(time_t sec, time_t usec) {
   connection_timeout_sec_ = sec;
   connection_timeout_usec_ = usec;
@@ -7536,12 +8855,19 @@ inline void ClientImpl::set_default_headers(Headers headers) {
   default_headers_ = std::move(headers);
 }
 
+inline void ClientImpl::set_header_writer(
+    std::function<ssize_t(Stream &, Headers &)> const &writer) {
+  header_writer_ = writer;
+}
+
 inline void ClientImpl::set_address_family(int family) {
   address_family_ = family;
 }
 
 inline void ClientImpl::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; }
 
+inline void ClientImpl::set_ipv6_v6only(bool on) { ipv6_v6only_ = on; }
+
 inline void ClientImpl::set_socket_options(SocketOptions socket_options) {
   socket_options_ = std::move(socket_options);
 }
@@ -7575,9 +8901,7 @@ inline void ClientImpl::set_proxy_digest_auth(const std::string &username,
   proxy_digest_auth_username_ = username;
   proxy_digest_auth_password_ = password;
 }
-#endif
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 inline void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path,
                                          const std::string &ca_cert_dir_path) {
   ca_cert_file_path_ = ca_cert_file_path;
@@ -7589,12 +8913,43 @@ inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) {
     ca_cert_store_ = ca_cert_store;
   }
 }
-#endif
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert,
+                                                    std::size_t size) const {
+  auto mem = BIO_new_mem_buf(ca_cert, static_cast<int>(size));
+  auto se = detail::scope_exit([&] { BIO_free_all(mem); });
+  if (!mem) { return nullptr; }
+
+  auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr);
+  if (!inf) { return nullptr; }
+
+  auto cts = X509_STORE_new();
+  if (cts) {
+    for (auto i = 0; i < static_cast<int>(sk_X509_INFO_num(inf)); i++) {
+      auto itmp = sk_X509_INFO_value(inf, i);
+      if (!itmp) { continue; }
+
+      if (itmp->x509) { X509_STORE_add_cert(cts, itmp->x509); }
+      if (itmp->crl) { X509_STORE_add_crl(cts, itmp->crl); }
+    }
+  }
+
+  sk_X509_INFO_pop_free(inf, X509_INFO_free);
+  return cts;
+}
+
 inline void ClientImpl::enable_server_certificate_verification(bool enabled) {
   server_certificate_verification_ = enabled;
 }
+
+inline void ClientImpl::enable_server_hostname_verification(bool enabled) {
+  server_hostname_verification_ = enabled;
+}
+
+inline void ClientImpl::set_server_certificate_verifier(
+    std::function<bool(SSL *ssl)> verifier) {
+  server_certificate_verifier_ = verifier;
+}
 #endif
 
 inline void ClientImpl::set_logger(Logger logger) {
@@ -7638,13 +8993,30 @@ inline SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex,
   return ssl;
 }
 
-inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl,
+inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock,
                        bool shutdown_gracefully) {
   // sometimes we may want to skip this to try to avoid SIGPIPE if we know
   // the remote has closed the network connection
   // Note that it is not always possible to avoid SIGPIPE, this is merely a
   // best-efforts.
-  if (shutdown_gracefully) { SSL_shutdown(ssl); }
+  if (shutdown_gracefully) {
+#ifdef _WIN32
+    (void)(sock);
+    SSL_shutdown(ssl);
+#else
+    timeval tv;
+    tv.tv_sec = 1;
+    tv.tv_usec = 0;
+    setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+               reinterpret_cast<const void *>(&tv), sizeof(tv));
+
+    auto ret = SSL_shutdown(ssl);
+    while (ret == 0) {
+      std::this_thread::sleep_for(std::chrono::milliseconds{100});
+      ret = SSL_shutdown(ssl);
+    }
+#endif
+  }
 
   std::lock_guard<std::mutex> guard(ctx_mutex);
   SSL_free(ssl);
@@ -7655,7 +9027,7 @@ bool ssl_connect_or_accept_nonblocking(socket_t sock, SSL *ssl,
                                        U ssl_connect_or_accept,
                                        time_t timeout_sec,
                                        time_t timeout_usec) {
-  int res = 0;
+  auto res = 0;
   while ((res = ssl_connect_or_accept(ssl)) != 1) {
     auto err = SSL_get_error(ssl, res);
     switch (err) {
@@ -7718,7 +9090,7 @@ inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL *ssl,
   SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
 }
 
-inline SSLSocketStream::~SSLSocketStream() {}
+inline SSLSocketStream::~SSLSocketStream() = default;
 
 inline bool SSLSocketStream::is_readable() const {
   return detail::select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
@@ -7736,7 +9108,7 @@ inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
     auto ret = SSL_read(ssl_, ptr, static_cast<int>(size));
     if (ret < 0) {
       auto err = SSL_get_error(ssl_, ret);
-      int n = 1000;
+      auto n = 1000;
 #ifdef _WIN32
       while (--n >= 0 && (err == SSL_ERROR_WANT_READ ||
                           (err == SSL_ERROR_SYSCALL &&
@@ -7747,7 +9119,7 @@ inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
         if (SSL_pending(ssl_) > 0) {
           return SSL_read(ssl_, ptr, static_cast<int>(size));
         } else if (is_readable()) {
-          std::this_thread::sleep_for(std::chrono::milliseconds(1));
+          std::this_thread::sleep_for(std::chrono::microseconds{10});
           ret = SSL_read(ssl_, ptr, static_cast<int>(size));
           if (ret >= 0) { return ret; }
           err = SSL_get_error(ssl_, ret);
@@ -7769,7 +9141,7 @@ inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
     auto ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
     if (ret < 0) {
       auto err = SSL_get_error(ssl_, ret);
-      int n = 1000;
+      auto n = 1000;
 #ifdef _WIN32
       while (--n >= 0 && (err == SSL_ERROR_WANT_WRITE ||
                           (err == SSL_ERROR_SYSCALL &&
@@ -7778,7 +9150,7 @@ inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
       while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) {
 #endif
         if (is_writable()) {
-          std::this_thread::sleep_for(std::chrono::milliseconds(1));
+          std::this_thread::sleep_for(std::chrono::microseconds{10});
           ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
           if (ret >= 0) { return ret; }
           err = SSL_get_error(ssl_, ret);
@@ -7820,17 +9192,18 @@ inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
                         SSL_OP_NO_COMPRESSION |
                             SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
 
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION);
+    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
 
-    // add default password callback before opening encrypted private key
     if (private_key_password != nullptr && (private_key_password[0] != '\0')) {
-      SSL_CTX_set_default_passwd_cb_userdata(ctx_,
-                                             (char *)private_key_password);
+      SSL_CTX_set_default_passwd_cb_userdata(
+          ctx_,
+          reinterpret_cast<void *>(const_cast<char *>(private_key_password)));
     }
 
     if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 ||
         SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) !=
-            1) {
+            1 ||
+        SSL_CTX_check_private_key(ctx_) != 1) {
       SSL_CTX_free(ctx_);
       ctx_ = nullptr;
     } else if (client_ca_cert_file_path || client_ca_cert_dir_path) {
@@ -7852,7 +9225,7 @@ inline SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key,
                         SSL_OP_NO_COMPRESSION |
                             SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
 
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION);
+    SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
 
     if (SSL_CTX_use_certificate(ctx_, cert) != 1 ||
         SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) {
@@ -7886,6 +9259,19 @@ inline bool SSLServer::is_valid() const { return ctx_; }
 
 inline SSL_CTX *SSLServer::ssl_context() const { return ctx_; }
 
+inline void SSLServer::update_certs(X509 *cert, EVP_PKEY *private_key,
+                                    X509_STORE *client_ca_cert_store) {
+
+  std::lock_guard<std::mutex> guard(ctx_mutex_);
+
+  SSL_CTX_use_certificate(ctx_, cert);
+  SSL_CTX_use_PrivateKey(ctx_, private_key);
+
+  if (client_ca_cert_store != nullptr) {
+    SSL_CTX_set_cert_store(ctx_, client_ca_cert_store);
+  }
+}
+
 inline bool SSLServer::process_and_close_socket(socket_t sock) {
   auto ssl = detail::ssl_new(
       sock, ctx_, ctx_mutex_,
@@ -7897,20 +9283,29 @@ inline bool SSLServer::process_and_close_socket(socket_t sock) {
 
   auto ret = false;
   if (ssl) {
+    std::string remote_addr;
+    int remote_port = 0;
+    detail::get_remote_ip_and_port(sock, remote_addr, remote_port);
+
+    std::string local_addr;
+    int local_port = 0;
+    detail::get_local_ip_and_port(sock, local_addr, local_port);
+
     ret = detail::process_server_socket_ssl(
         svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_,
         read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
         write_timeout_usec_,
-        [this, ssl](Stream &strm, bool close_connection,
-                    bool &connection_closed) {
-          return process_request(strm, close_connection, connection_closed,
+        [&](Stream &strm, bool close_connection, bool &connection_closed) {
+          return process_request(strm, remote_addr, remote_port, local_addr,
+                                 local_port, close_connection,
+                                 connection_closed,
                                  [&](Request &req) { req.ssl = ssl; });
         });
 
     // Shutdown gracefully if the result seemed successful, non-gracefully if
     // the connection appeared to be closed.
     const bool shutdown_gracefully = ret;
-    detail::ssl_delete(ctx_mutex_, ssl, shutdown_gracefully);
+    detail::ssl_delete(ctx_mutex_, ssl, sock, shutdown_gracefully);
   }
 
   detail::shutdown_socket(sock);
@@ -7927,16 +9322,25 @@ inline SSLClient::SSLClient(const std::string &host, int port)
 
 inline SSLClient::SSLClient(const std::string &host, int port,
                             const std::string &client_cert_path,
-                            const std::string &client_key_path)
+                            const std::string &client_key_path,
+                            const std::string &private_key_password)
     : ClientImpl(host, port, client_cert_path, client_key_path) {
   ctx_ = SSL_CTX_new(TLS_client_method());
 
+  SSL_CTX_set_min_proto_version(ctx_, TLS1_2_VERSION);
+
   detail::split(&host_[0], &host_[host_.size()], '.',
                 [&](const char *b, const char *e) {
-                  host_components_.emplace_back(std::string(b, e));
+                  host_components_.emplace_back(b, e);
                 });
 
   if (!client_cert_path.empty() && !client_key_path.empty()) {
+    if (!private_key_password.empty()) {
+      SSL_CTX_set_default_passwd_cb_userdata(
+          ctx_, reinterpret_cast<void *>(
+                    const_cast<char *>(private_key_password.c_str())));
+    }
+
     if (SSL_CTX_use_certificate_file(ctx_, client_cert_path.c_str(),
                                      SSL_FILETYPE_PEM) != 1 ||
         SSL_CTX_use_PrivateKey_file(ctx_, client_key_path.c_str(),
@@ -7948,16 +9352,23 @@ inline SSLClient::SSLClient(const std::string &host, int port,
 }
 
 inline SSLClient::SSLClient(const std::string &host, int port,
-                            X509 *client_cert, EVP_PKEY *client_key)
+                            X509 *client_cert, EVP_PKEY *client_key,
+                            const std::string &private_key_password)
     : ClientImpl(host, port) {
   ctx_ = SSL_CTX_new(TLS_client_method());
 
   detail::split(&host_[0], &host_[host_.size()], '.',
                 [&](const char *b, const char *e) {
-                  host_components_.emplace_back(std::string(b, e));
+                  host_components_.emplace_back(b, e);
                 });
 
   if (client_cert != nullptr && client_key != nullptr) {
+    if (!private_key_password.empty()) {
+      SSL_CTX_set_default_passwd_cb_userdata(
+          ctx_, reinterpret_cast<void *>(
+                    const_cast<char *>(private_key_password.c_str())));
+    }
+
     if (SSL_CTX_use_certificate(ctx_, client_cert) != 1 ||
         SSL_CTX_use_PrivateKey(ctx_, client_key) != 1) {
       SSL_CTX_free(ctx_);
@@ -7989,6 +9400,11 @@ inline void SSLClient::set_ca_cert_store(X509_STORE *ca_cert_store) {
   }
 }
 
+inline void SSLClient::load_ca_cert_store(const char *ca_cert,
+                                          std::size_t size) {
+  set_ca_cert_store(ClientImpl::create_ca_cert_store(ca_cert, size));
+}
+
 inline long SSLClient::get_openssl_verify_result() const {
   return verify_result_;
 }
@@ -8003,14 +9419,14 @@ inline bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
 inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
                                           bool &success, Error &error) {
   success = true;
-  Response res2;
+  Response proxy_res;
   if (!detail::process_client_socket(
           socket.sock, read_timeout_sec_, read_timeout_usec_,
           write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) {
             Request req2;
             req2.method = "CONNECT";
             req2.path = host_and_port_;
-            return process_request(strm, req2, res2, false, error);
+            return process_request(strm, req2, proxy_res, false, error);
           })) {
     // Thread-safe to close everything because we are assuming there are no
     // requests in flight
@@ -8021,12 +9437,12 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
     return false;
   }
 
-  if (res2.status == 407) {
+  if (proxy_res.status == StatusCode::ProxyAuthenticationRequired_407) {
     if (!proxy_digest_auth_username_.empty() &&
         !proxy_digest_auth_password_.empty()) {
       std::map<std::string, std::string> auth;
-      if (detail::parse_www_authenticate(res2, auth, true)) {
-        Response res3;
+      if (detail::parse_www_authenticate(proxy_res, auth, true)) {
+        proxy_res = Response();
         if (!detail::process_client_socket(
                 socket.sock, read_timeout_sec_, read_timeout_usec_,
                 write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) {
@@ -8037,7 +9453,7 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
                       req3, auth, 1, detail::random_string(10),
                       proxy_digest_auth_username_, proxy_digest_auth_password_,
                       true));
-                  return process_request(strm, req3, res3, false, error);
+                  return process_request(strm, req3, proxy_res, false, error);
                 })) {
           // Thread-safe to close everything because we are assuming there are
           // no requests in flight
@@ -8048,17 +9464,28 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
           return false;
         }
       }
-    } else {
-      res = res2;
-      return false;
     }
   }
 
+  // If status code is not 200, proxy request is failed.
+  // Set error to ProxyConnection and return proxy response
+  // as the response of the request
+  if (proxy_res.status != StatusCode::OK_200) {
+    error = Error::ProxyConnection;
+    res = std::move(proxy_res);
+    // Thread-safe to close everything because we are assuming there are
+    // no requests in flight
+    shutdown_ssl(socket, true);
+    shutdown_socket(socket);
+    close_socket(socket);
+    return false;
+  }
+
   return true;
 }
 
 inline bool SSLClient::load_certs() {
-  bool ret = true;
+  auto ret = true;
 
   std::call_once(initialize_cert_, [&]() {
     std::lock_guard<std::mutex> guard(ctx_mutex_);
@@ -8109,32 +9536,47 @@ inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
         }
 
         if (server_certificate_verification_) {
-          verify_result_ = SSL_get_verify_result(ssl2);
+          if (server_certificate_verifier_) {
+            if (!server_certificate_verifier_(ssl2)) {
+              error = Error::SSLServerVerification;
+              return false;
+            }
+          } else {
+            verify_result_ = SSL_get_verify_result(ssl2);
 
-          if (verify_result_ != X509_V_OK) {
-            error = Error::SSLServerVerification;
-            return false;
+            if (verify_result_ != X509_V_OK) {
+              error = Error::SSLServerVerification;
+              return false;
+            }
+
+            auto server_cert = SSL_get1_peer_certificate(ssl2);
+            auto se = detail::scope_exit([&] { X509_free(server_cert); });
+
+            if (server_cert == nullptr) {
+              error = Error::SSLServerVerification;
+              return false;
+            }
+
+            if (server_hostname_verification_) {
+              if (!verify_host(server_cert)) {
+                error = Error::SSLServerHostnameVerification;
+                return false;
+              }
+            }
           }
-
-          auto server_cert = SSL_get1_peer_certificate(ssl2);
-
-          if (server_cert == nullptr) {
-            error = Error::SSLServerVerification;
-            return false;
-          }
-
-          if (!verify_host(server_cert)) {
-            X509_free(server_cert);
-            error = Error::SSLServerVerification;
-            return false;
-          }
-          X509_free(server_cert);
         }
 
         return true;
       },
       [&](SSL *ssl2) {
+#if defined(OPENSSL_IS_BORINGSSL)
         SSL_set_tlsext_host_name(ssl2, host_.c_str());
+#else
+        // NOTE: Direct call instead of using the OpenSSL macro to suppress
+        // -Wold-style-cast warning
+        SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name,
+                 static_cast<void *>(const_cast<char *>(host_.c_str())));
+#endif
         return true;
       });
 
@@ -8159,7 +9601,8 @@ inline void SSLClient::shutdown_ssl_impl(Socket &socket,
     return;
   }
   if (socket.ssl) {
-    detail::ssl_delete(ctx_mutex_, socket.ssl, shutdown_gracefully);
+    detail::ssl_delete(ctx_mutex_, socket.ssl, socket.sock,
+                       shutdown_gracefully);
     socket.ssl = nullptr;
   }
   assert(socket.ssl == nullptr);
@@ -8208,8 +9651,8 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
 
   auto type = GEN_DNS;
 
-  struct in6_addr addr6;
-  struct in_addr addr;
+  struct in6_addr addr6{};
+  struct in_addr addr{};
   size_t addr_len = 0;
 
 #ifndef __MINGW32__
@@ -8234,8 +9677,9 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
     for (decltype(count) i = 0; i < count && !dsn_matched; i++) {
       auto val = sk_GENERAL_NAME_value(alt_names, i);
       if (val->type == type) {
-        auto name = (const char *)ASN1_STRING_get0_data(val->d.ia5);
-        auto name_len = (size_t)ASN1_STRING_length(val->d.ia5);
+        auto name =
+            reinterpret_cast<const char *>(ASN1_STRING_get0_data(val->d.ia5));
+        auto name_len = static_cast<size_t>(ASN1_STRING_length(val->d.ia5));
 
         switch (type) {
         case GEN_DNS: dsn_matched = check_host_name(name, name_len); break;
@@ -8253,7 +9697,8 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
     if (dsn_matched || ip_matched) { ret = true; }
   }
 
-  GENERAL_NAMES_free((STACK_OF(GENERAL_NAME) *)alt_names);
+  GENERAL_NAMES_free(const_cast<STACK_OF(GENERAL_NAME) *>(
+      reinterpret_cast<const STACK_OF(GENERAL_NAME) *>(alt_names)));
   return ret;
 }
 
@@ -8282,7 +9727,7 @@ inline bool SSLClient::check_host_name(const char *pattern,
   std::vector<std::string> pattern_components;
   detail::split(&pattern[0], &pattern[pattern_len], '.',
                 [&](const char *b, const char *e) {
-                  pattern_components.emplace_back(std::string(b, e));
+                  pattern_components.emplace_back(b, e);
                 });
 
   if (host_components_.size() != pattern_components.size()) { return false; }
@@ -8310,7 +9755,7 @@ inline Client::Client(const std::string &scheme_host_port,
                       const std::string &client_cert_path,
                       const std::string &client_key_path) {
   const static std::regex re(
-      R"((?:([a-z]+):\/\/)?(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)");
+      R"((?:([a-z]+):\/\/)?(?:\[([a-fA-F\d:]+)\]|([^:/?#]+))(?::(\d+))?)");
 
   std::smatch m;
   if (std::regex_match(scheme_host_port, m, re)) {
@@ -8347,10 +9792,12 @@ inline Client::Client(const std::string &scheme_host_port,
                                              client_key_path);
     }
   } else {
+    // NOTE: Update TEST(UniversalClientImplTest, Ipv6LiteralAddress)
+    // if port param below changes.
     cli_ = detail::make_unique<ClientImpl>(scheme_host_port, 80,
                                            client_cert_path, client_key_path);
   }
-}
+} // namespace detail
 
 inline Client::Client(const std::string &host, int port)
     : cli_(detail::make_unique<ClientImpl>(host, port)) {}
@@ -8361,7 +9808,7 @@ inline Client::Client(const std::string &host, int port,
     : cli_(detail::make_unique<ClientImpl>(host, port, client_cert_path,
                                            client_key_path)) {}
 
-inline Client::~Client() {}
+inline Client::~Client() = default;
 
 inline bool Client::is_valid() const {
   return cli_ != nullptr && cli_->is_valid();
@@ -8421,19 +9868,20 @@ inline Result Client::Get(const std::string &path, const Headers &headers,
 }
 inline Result Client::Get(const std::string &path, const Params &params,
                           const Headers &headers, Progress progress) {
-  return cli_->Get(path, params, headers, progress);
+  return cli_->Get(path, params, headers, std::move(progress));
 }
 inline Result Client::Get(const std::string &path, const Params &params,
                           const Headers &headers,
                           ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, params, headers, content_receiver, progress);
+  return cli_->Get(path, params, headers, std::move(content_receiver),
+                   std::move(progress));
 }
 inline Result Client::Get(const std::string &path, const Params &params,
                           const Headers &headers,
                           ResponseHandler response_handler,
                           ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, params, headers, response_handler, content_receiver,
-                   progress);
+  return cli_->Get(path, params, headers, std::move(response_handler),
+                   std::move(content_receiver), std::move(progress));
 }
 
 inline Result Client::Head(const std::string &path) { return cli_->Head(path); }
@@ -8455,15 +9903,30 @@ inline Result Client::Post(const std::string &path, const Headers &headers,
                            const std::string &content_type) {
   return cli_->Post(path, headers, body, content_length, content_type);
 }
+inline Result Client::Post(const std::string &path, const Headers &headers,
+                           const char *body, size_t content_length,
+                           const std::string &content_type, Progress progress) {
+  return cli_->Post(path, headers, body, content_length, content_type,
+                    progress);
+}
 inline Result Client::Post(const std::string &path, const std::string &body,
                            const std::string &content_type) {
   return cli_->Post(path, body, content_type);
 }
+inline Result Client::Post(const std::string &path, const std::string &body,
+                           const std::string &content_type, Progress progress) {
+  return cli_->Post(path, body, content_type, progress);
+}
 inline Result Client::Post(const std::string &path, const Headers &headers,
                            const std::string &body,
                            const std::string &content_type) {
   return cli_->Post(path, headers, body, content_type);
 }
+inline Result Client::Post(const std::string &path, const Headers &headers,
+                           const std::string &body,
+                           const std::string &content_type, Progress progress) {
+  return cli_->Post(path, headers, body, content_type, progress);
+}
 inline Result Client::Post(const std::string &path, size_t content_length,
                            ContentProvider content_provider,
                            const std::string &content_type) {
@@ -8494,6 +9957,10 @@ inline Result Client::Post(const std::string &path, const Headers &headers,
                            const Params &params) {
   return cli_->Post(path, headers, params);
 }
+inline Result Client::Post(const std::string &path, const Headers &headers,
+                           const Params &params, Progress progress) {
+  return cli_->Post(path, headers, params, progress);
+}
 inline Result Client::Post(const std::string &path,
                            const MultipartFormDataItems &items) {
   return cli_->Post(path, items);
@@ -8524,15 +9991,29 @@ inline Result Client::Put(const std::string &path, const Headers &headers,
                           const std::string &content_type) {
   return cli_->Put(path, headers, body, content_length, content_type);
 }
+inline Result Client::Put(const std::string &path, const Headers &headers,
+                          const char *body, size_t content_length,
+                          const std::string &content_type, Progress progress) {
+  return cli_->Put(path, headers, body, content_length, content_type, progress);
+}
 inline Result Client::Put(const std::string &path, const std::string &body,
                           const std::string &content_type) {
   return cli_->Put(path, body, content_type);
 }
+inline Result Client::Put(const std::string &path, const std::string &body,
+                          const std::string &content_type, Progress progress) {
+  return cli_->Put(path, body, content_type, progress);
+}
 inline Result Client::Put(const std::string &path, const Headers &headers,
                           const std::string &body,
                           const std::string &content_type) {
   return cli_->Put(path, headers, body, content_type);
 }
+inline Result Client::Put(const std::string &path, const Headers &headers,
+                          const std::string &body,
+                          const std::string &content_type, Progress progress) {
+  return cli_->Put(path, headers, body, content_type, progress);
+}
 inline Result Client::Put(const std::string &path, size_t content_length,
                           ContentProvider content_provider,
                           const std::string &content_type) {
@@ -8563,6 +10044,10 @@ inline Result Client::Put(const std::string &path, const Headers &headers,
                           const Params &params) {
   return cli_->Put(path, headers, params);
 }
+inline Result Client::Put(const std::string &path, const Headers &headers,
+                          const Params &params, Progress progress) {
+  return cli_->Put(path, headers, params, progress);
+}
 inline Result Client::Put(const std::string &path,
                           const MultipartFormDataItems &items) {
   return cli_->Put(path, items);
@@ -8590,20 +10075,44 @@ inline Result Client::Patch(const std::string &path, const char *body,
                             const std::string &content_type) {
   return cli_->Patch(path, body, content_length, content_type);
 }
+inline Result Client::Patch(const std::string &path, const char *body,
+                            size_t content_length,
+                            const std::string &content_type,
+                            Progress progress) {
+  return cli_->Patch(path, body, content_length, content_type, progress);
+}
 inline Result Client::Patch(const std::string &path, const Headers &headers,
                             const char *body, size_t content_length,
                             const std::string &content_type) {
   return cli_->Patch(path, headers, body, content_length, content_type);
 }
+inline Result Client::Patch(const std::string &path, const Headers &headers,
+                            const char *body, size_t content_length,
+                            const std::string &content_type,
+                            Progress progress) {
+  return cli_->Patch(path, headers, body, content_length, content_type,
+                     progress);
+}
 inline Result Client::Patch(const std::string &path, const std::string &body,
                             const std::string &content_type) {
   return cli_->Patch(path, body, content_type);
 }
+inline Result Client::Patch(const std::string &path, const std::string &body,
+                            const std::string &content_type,
+                            Progress progress) {
+  return cli_->Patch(path, body, content_type, progress);
+}
 inline Result Client::Patch(const std::string &path, const Headers &headers,
                             const std::string &body,
                             const std::string &content_type) {
   return cli_->Patch(path, headers, body, content_type);
 }
+inline Result Client::Patch(const std::string &path, const Headers &headers,
+                            const std::string &body,
+                            const std::string &content_type,
+                            Progress progress) {
+  return cli_->Patch(path, headers, body, content_type, progress);
+}
 inline Result Client::Patch(const std::string &path, size_t content_length,
                             ContentProvider content_provider,
                             const std::string &content_type) {
@@ -8638,20 +10147,44 @@ inline Result Client::Delete(const std::string &path, const char *body,
                              const std::string &content_type) {
   return cli_->Delete(path, body, content_length, content_type);
 }
+inline Result Client::Delete(const std::string &path, const char *body,
+                             size_t content_length,
+                             const std::string &content_type,
+                             Progress progress) {
+  return cli_->Delete(path, body, content_length, content_type, progress);
+}
 inline Result Client::Delete(const std::string &path, const Headers &headers,
                              const char *body, size_t content_length,
                              const std::string &content_type) {
   return cli_->Delete(path, headers, body, content_length, content_type);
 }
+inline Result Client::Delete(const std::string &path, const Headers &headers,
+                             const char *body, size_t content_length,
+                             const std::string &content_type,
+                             Progress progress) {
+  return cli_->Delete(path, headers, body, content_length, content_type,
+                      progress);
+}
 inline Result Client::Delete(const std::string &path, const std::string &body,
                              const std::string &content_type) {
   return cli_->Delete(path, body, content_type);
 }
+inline Result Client::Delete(const std::string &path, const std::string &body,
+                             const std::string &content_type,
+                             Progress progress) {
+  return cli_->Delete(path, body, content_type, progress);
+}
 inline Result Client::Delete(const std::string &path, const Headers &headers,
                              const std::string &body,
                              const std::string &content_type) {
   return cli_->Delete(path, headers, body, content_type);
 }
+inline Result Client::Delete(const std::string &path, const Headers &headers,
+                             const std::string &body,
+                             const std::string &content_type,
+                             Progress progress) {
+  return cli_->Delete(path, headers, body, content_type, progress);
+}
 inline Result Client::Options(const std::string &path) {
   return cli_->Options(path);
 }
@@ -8665,12 +10198,16 @@ inline bool Client::send(Request &req, Response &res, Error &error) {
 
 inline Result Client::send(const Request &req) { return cli_->send(req); }
 
+inline void Client::stop() { cli_->stop(); }
+
+inline std::string Client::host() const { return cli_->host(); }
+
+inline int Client::port() const { return cli_->port(); }
+
 inline size_t Client::is_socket_open() const { return cli_->is_socket_open(); }
 
 inline socket_t Client::socket() const { return cli_->socket(); }
 
-inline void Client::stop() { cli_->stop(); }
-
 inline void
 Client::set_hostname_addr_map(std::map<std::string, std::string> addr_map) {
   cli_->set_hostname_addr_map(std::move(addr_map));
@@ -8680,6 +10217,11 @@ inline void Client::set_default_headers(Headers headers) {
   cli_->set_default_headers(std::move(headers));
 }
 
+inline void Client::set_header_writer(
+    std::function<ssize_t(Stream &, Headers &)> const &writer) {
+  cli_->set_header_writer(writer);
+}
+
 inline void Client::set_address_family(int family) {
   cli_->set_address_family(family);
 }
@@ -8752,9 +10294,20 @@ inline void Client::set_proxy_digest_auth(const std::string &username,
 inline void Client::enable_server_certificate_verification(bool enabled) {
   cli_->enable_server_certificate_verification(enabled);
 }
+
+inline void Client::enable_server_hostname_verification(bool enabled) {
+  cli_->enable_server_hostname_verification(enabled);
+}
+
+inline void Client::set_server_certificate_verifier(
+    std::function<bool(SSL *ssl)> verifier) {
+  cli_->set_server_certificate_verifier(verifier);
+}
 #endif
 
-inline void Client::set_logger(Logger logger) { cli_->set_logger(logger); }
+inline void Client::set_logger(Logger logger) {
+  cli_->set_logger(std::move(logger));
+}
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 inline void Client::set_ca_cert_path(const std::string &ca_cert_file_path,
@@ -8770,6 +10323,10 @@ inline void Client::set_ca_cert_store(X509_STORE *ca_cert_store) {
   }
 }
 
+inline void Client::load_ca_cert_store(const char *ca_cert, std::size_t size) {
+  set_ca_cert_store(cli_->create_ca_cert_store(ca_cert, size));
+}
+
 inline long Client::get_openssl_verify_result() const {
   if (is_ssl_) {
     return static_cast<SSLClient &>(*cli_).get_openssl_verify_result();
diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp
deleted file mode 100644
index 20551520e..000000000
--- a/examples/server/index.html.hpp
+++ /dev/null
@@ -1,2791 +0,0 @@
-unsigned char index_html[] = {
-  0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x0a, 0x3c, 0x68, 0x65, 0x61,
-  0x64, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x63,
-  0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x22, 0x55, 0x54, 0x46, 0x2d,
-  0x38, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20,
-  0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x76, 0x69, 0x65, 0x77, 0x70, 0x6f,
-  0x72, 0x74, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d,
-  0x22, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3d, 0x64, 0x65, 0x76, 0x69, 0x63,
-  0x65, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x2c, 0x20, 0x69, 0x6e, 0x69,
-  0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31,
-  0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x2d, 0x73, 0x63,
-  0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20,
-  0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22,
-  0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65,
-  0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x6c,
-  0x69, 0x67, 0x68, 0x74, 0x20, 0x64, 0x61, 0x72, 0x6b, 0x22, 0x3e, 0x0a,
-  0x20, 0x20, 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x6c, 0x6c, 0x61,
-  0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x2d, 0x20, 0x63, 0x68, 0x61,
-  0x74, 0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20,
-  0x20, 0x3c, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c,
-  0x79, 0x3a, 0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x2d, 0x75, 0x69,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74,
-  0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x39, 0x30, 0x25, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23,
-  0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e,
-  0x3a, 0x20, 0x30, 0x65, 0x6d, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61,
-  0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d,
-  0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73,
-  0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77,
-  0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68,
-  0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x6d, 0x61, 0x69, 0x6e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x33, 0x70, 0x78,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70,
-  0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69,
-  0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c,
-  0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a,
-  0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65,
-  0x74, 0x77, 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67,
-  0x72, 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x79,
-  0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x31,
-  0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63, 0x63,
-  0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72,
-  0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20,
-  0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70,
-  0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65,
-  0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68,
-  0x3a, 0x20, 0x36, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68,
-  0x3a, 0x20, 0x33, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x68, 0x65, 0x69, 0x67,
-  0x68, 0x74, 0x3a, 0x20, 0x31, 0x2e, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30,
-  0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x20,
-  0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77,
-  0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x77, 0x6f, 0x72, 0x64, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20,
-  0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x79, 0x70, 0x68, 0x65, 0x6e,
-  0x73, 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x74, 0x6f,
-  0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x62,
-  0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x23, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72,
-  0x67, 0x69, 0x6e, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x20, 0x30,
-  0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69,
-  0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d,
-  0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63,
-  0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e,
-  0x2d, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x65,
-  0x74, 0x63, 0x68, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c,
-  0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72,
-  0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x72, 0x6f, 0x77, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20,
-  0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x6a, 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x65,
-  0x6e, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64,
-  0x65, 0x72, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a,
-  0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61,
-  0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c,
-  0x64, 0x73, 0x65, 0x74, 0x2e, 0x74, 0x77, 0x6f, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79,
-  0x3a, 0x20, 0x67, 0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x67, 0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31,
-  0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x2e,
-  0x74, 0x68, 0x72, 0x65, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x67,
-  0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67,
-  0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65,
-  0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65,
-  0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72,
-  0x3a, 0x20, 0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20,
-  0x23, 0x61, 0x61, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75,
-  0x73, 0x3a, 0x20, 0x34, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30,
-  0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67,
-  0x69, 0x6e, 0x2d, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65,
-  0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x77,
-  0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x62, 0x6f, 0x6c, 0x64, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69,
-  0x6e, 0x3a, 0x20, 0x2d, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x2d, 0x30,
-  0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30,
-  0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x75, 0x72, 0x73, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x6f, 0x69, 0x6e,
-  0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x5b,
-  0x6f, 0x70, 0x65, 0x6e, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30,
-  0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x2d, 0x73,
-  0x65, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70,
-  0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x33, 0x65,
-  0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72,
-  0x64, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20,
-  0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63,
-  0x63, 0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x2d,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e,
-  0x3a, 0x20, 0x61, 0x62, 0x73, 0x6f, 0x6c, 0x75, 0x74, 0x65, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72,
-  0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20,
-  0x77, 0x68, 0x69, 0x74, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e,
-  0x32, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62,
-  0x6f, 0x78, 0x2d, 0x73, 0x68, 0x61, 0x64, 0x6f, 0x77, 0x3a, 0x20, 0x30,
-  0x20, 0x30, 0x20, 0x31, 0x30, 0x70, 0x78, 0x20, 0x72, 0x67, 0x62, 0x61,
-  0x28, 0x30, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x30, 0x2e,
-  0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64,
-  0x69, 0x6e, 0x67, 0x3a, 0x20, 0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, 0x6f,
-  0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x70, 0x72, 0x65, 0x20, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79,
-  0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e,
-  0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x32, 0x32,
-  0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c,
-  0x6f, 0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x64,
-  0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
-  0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, 0x20, 0x6d,
-  0x6f, 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a,
-  0x20, 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, 0x65, 0x6d,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64,
-  0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x33,
-  0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20,
-  0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e,
-  0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c,
-  0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74,
-  0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2e, 0x73, 0x6c, 0x69, 0x6d, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67,
-  0x69, 0x6e, 0x3a, 0x20, 0x30, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c,
-  0x61, 0x79, 0x3a, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68,
-  0x65, 0x61, 0x64, 0x65, 0x72, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66,
-  0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x6e,
-  0x3a, 0x20, 0x63, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6f,
-  0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x38,
-  0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x38, 0x38, 0x38, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6d,
-  0x6f, 0x64, 0x65, 0x2d, 0x63, 0x68, 0x61, 0x74, 0x20, 0x74, 0x65, 0x78,
-  0x74, 0x61, 0x72, 0x65, 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x34,
-  0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x63,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x65,
-  0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20,
-  0x31, 0x30, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x5b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x65, 0x64, 0x69, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5d, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c,
-  0x61, 0x79, 0x3a, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x62,
-  0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x77, 0x68, 0x69, 0x74, 0x65, 0x2d, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3a,
-  0x20, 0x70, 0x72, 0x65, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x75, 0x74, 0x6c, 0x69, 0x6e, 0x65,
-  0x3a, 0x20, 0x30, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20,
-  0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x40, 0x6b, 0x65, 0x79, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x20, 0x6c,
-  0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, 0x69,
-  0x70, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30,
-  0x25, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x70,
-  0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x30, 0x25, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x30, 0x25, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67,
-  0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69,
-  0x6f, 0x6e, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e,
-  0x67, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d,
-  0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f,
-  0x72, 0x2d, 0x31, 0x3a, 0x20, 0x23, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65,
-  0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d,
-  0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f,
-  0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65,
-  0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61,
-  0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x73, 0x69, 0x7a,
-  0x65, 0x3a, 0x20, 0x35, 0x30, 0x25, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67,
-  0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x3a,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x2d, 0x67, 0x72, 0x61, 0x64,
-  0x69, 0x65, 0x6e, 0x74, 0x28, 0x39, 0x30, 0x64, 0x65, 0x67, 0x2c, 0x20,
-  0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e,
-  0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x29, 0x2c, 0x20,
-  0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e,
-  0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x32, 0x29, 0x2c, 0x20,
-  0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e,
-  0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x29, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e,
-  0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, 0x69, 0x70, 0x65, 0x20, 0x32, 0x73,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x20, 0x69, 0x6e, 0x66, 0x69,
-  0x6e, 0x69, 0x74, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x40, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x20,
-  0x28, 0x70, 0x72, 0x65, 0x66, 0x65, 0x72, 0x73, 0x2d, 0x63, 0x6f, 0x6c,
-  0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65, 0x3a, 0x20, 0x64,
-  0x61, 0x72, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f,
-  0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d,
-  0x31, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x30, 0x30,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d,
-  0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f,
-  0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32,
-  0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x70, 0x6f, 0x70, 0x6f,
-  0x76, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61,
-  0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c,
-  0x6f, 0x72, 0x3a, 0x20, 0x62, 0x6c, 0x61, 0x63, 0x6b, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a,
-  0x0a, 0x20, 0x20, 0x3c, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74,
-  0x79, 0x70, 0x65, 0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d,
-  0x6c, 0x2c, 0x20, 0x68, 0x2c, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c,
-  0x2c, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x63, 0x6f,
-  0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64,
-  0x65, 0x72, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61,
-  0x6c, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74,
-  0x2c, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x2c, 0x20, 0x43, 0x6f,
-  0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64,
-  0x65, 0x78, 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27,
-  0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e,
-  0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70,
-  0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
-  0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7d, 0x20,
-  0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x2d,
-  0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2d, 0x74, 0x6f, 0x2d, 0x67, 0x72,
-  0x61, 0x6d, 0x6d, 0x61, 0x72, 0x2e, 0x6d, 0x6a, 0x73, 0x27, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x65, 0x6c, 0x65,
-  0x63, 0x74, 0x65, 0x64, 0x5f, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x20, 0x3d,
-  0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x76, 0x61, 0x72, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x20,
-  0x3d, 0x20, 0x2d, 0x31, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e,
-  0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x3a, 0x20, 0x22, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61,
-  0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x73, 0x61, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x55, 0x73,
-  0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x4c, 0x6c, 0x61, 0x6d, 0x61,
-  0x2c, 0x20, 0x61, 0x20, 0x66, 0x72, 0x69, 0x65, 0x6e, 0x64, 0x6c, 0x79,
-  0x20, 0x63, 0x68, 0x61, 0x74, 0x62, 0x6f, 0x74, 0x2e, 0x20, 0x4c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x69, 0x73, 0x20, 0x68, 0x65, 0x6c, 0x70, 0x66,
-  0x75, 0x6c, 0x2c, 0x20, 0x6b, 0x69, 0x6e, 0x64, 0x2c, 0x20, 0x68, 0x6f,
-  0x6e, 0x65, 0x73, 0x74, 0x2c, 0x20, 0x67, 0x6f, 0x6f, 0x64, 0x20, 0x61,
-  0x74, 0x20, 0x77, 0x72, 0x69, 0x74, 0x69, 0x6e, 0x67, 0x2c, 0x20, 0x61,
-  0x6e, 0x64, 0x20, 0x6e, 0x65, 0x76, 0x65, 0x72, 0x20, 0x66, 0x61, 0x69,
-  0x6c, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72,
-  0x20, 0x61, 0x6e, 0x79, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
-  0x73, 0x20, 0x69, 0x6d, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x74, 0x65, 0x6c,
-  0x79, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x70,
-  0x72, 0x65, 0x63, 0x69, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x22, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, 0x7b, 0x7b, 0x68, 0x69, 0x73,
-  0x74, 0x6f, 0x72, 0x79, 0x7d, 0x7d, 0x5c, 0x6e, 0x7b, 0x7b, 0x63, 0x68,
-  0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x6e, 0x61,
-  0x6d, 0x65, 0x7d, 0x7d, 0x3a, 0x20, 0x7b, 0x7b, 0x6d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x7d, 0x7d, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
-  0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22,
-  0x2c, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22,
-  0x20, 0x7c, 0x20, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69,
-  0x6f, 0x6e, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68,
-  0x61, 0x72, 0x3a, 0x20, 0x22, 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a,
-  0x20, 0x22, 0x55, 0x73, 0x65, 0x72, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c,
-  0x65, 0x63, 0x74, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x27, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
-  0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63,
-  0x74, 0x3a, 0x20, 0x34, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72,
-  0x65, 0x3a, 0x20, 0x30, 0x2e, 0x37, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73,
-  0x74, 0x5f, 0x6e, 0x3a, 0x20, 0x32, 0x35, 0x36, 0x2c, 0x20, 0x2f, 0x2f,
-  0x20, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65,
-  0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x2c, 0x20, 0x2d, 0x31,
-  0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x73,
-  0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79,
-  0x3a, 0x20, 0x31, 0x2e, 0x31, 0x38, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31,
-  0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65,
-  0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f,
-  0x6b, 0x3a, 0x20, 0x34, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x3c, 0x3d,
-  0x20, 0x30, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x73, 0x65, 0x20, 0x76, 0x6f,
-  0x63, 0x61, 0x62, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e,
-  0x39, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d,
-  0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x5f, 0x70, 0x3a, 0x20, 0x30,
-  0x2e, 0x30, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x20, 0x3d, 0x20,
-  0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x3a, 0x20, 0x31, 0x2e,
-  0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20,
-  0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70,
-  0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e,
-  0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65,
-  0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a,
-  0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30,
-  0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a,
-  0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30,
-  0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74,
-  0x61, 0x74, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2f,
-  0x31, 0x2f, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69,
-  0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x3a, 0x20,
-  0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x20, 0x65, 0x6e, 0x74, 0x72, 0x6f, 0x70, 0x79, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f,
-  0x65, 0x74, 0x61, 0x3a, 0x20, 0x30, 0x2e, 0x31, 0x2c, 0x20, 0x2f, 0x2f,
-  0x20, 0x6c, 0x65, 0x61, 0x72, 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x61,
-  0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, 0x61,
-  0x6d, 0x6d, 0x61, 0x72, 0x3a, 0x20, 0x27, 0x27, 0x2c, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x3a,
-  0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6e, 0x6f, 0x20, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f,
-  0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f,
-  0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x63, 0x68, 0x65, 0x5f, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x74, 0x72, 0x75, 0x65, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65,
-  0x79, 0x3a, 0x20, 0x27, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x53, 0x54, 0x41,
-  0x52, 0x54, 0x3a, 0x20, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x69, 0x6e, 0x67, 0x20,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x70, 0x61, 0x72,
-  0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x62,
-  0x72, 0x6f, 0x77, 0x73, 0x65, 0x72, 0x73, 0x20, 0x4c, 0x6f, 0x63, 0x61,
-  0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x2a, 0x2f, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65,
-  0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20,
-  0x3d, 0x20, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x63, 0x70, 0x70, 0x5f,
-  0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c,
-  0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x22, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61,
-  0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72,
-  0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x74, 0x61, 0x67,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c,
-  0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x49,
-  0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74,
-  0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67,
-  0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b,
-  0x20, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73,
-  0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74,
-  0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74,
-  0x61, 0x46, 0x72, 0x6f, 0x6d, 0x52, 0x61, 0x77, 0x54, 0x65, 0x78, 0x74,
-  0x28, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e,
-  0x73, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61,
-  0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74,
-  0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27,
-  0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74,
-  0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74,
-  0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x74, 0x61,
-  0x67, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x20, 0x3d, 0x20,
-  0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65,
-  0x2e, 0x67, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63,
-  0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73,
-  0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20,
-  0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x69,
-  0x74, 0x65, 0x6d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75,
-  0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
-  0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x4a, 0x53,
-  0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x69, 0x74, 0x65,
-  0x6d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61,
-  0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65,
-  0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x52, 0x61, 0x77, 0x54, 0x65,
-  0x78, 0x74, 0x28, 0x74, 0x61, 0x67, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74,
-  0x65, 0x6d, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74,
-  0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x67, 0x65, 0x74, 0x49, 0x74, 0x65,
-  0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72,
-  0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b,
-  0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74,
-  0x61, 0x67, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x21, 0x69, 0x74, 0x65, 0x6d, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74,
-  0x65, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65,
-  0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x74,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64,
-  0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x61, 0x76,
-  0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c,
-  0x28, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55,
-  0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20,
-  0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x20, 0x6e,
-  0x61, 0x6d, 0x65, 0x3a, 0x20, 0x27, 0x27, 0x2c, 0x20, 0x74, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x7b, 0x20, 0x73, 0x65, 0x73,
-  0x73, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x3a, 0x20, 0x7b, 0x7d, 0x20, 0x7d, 0x20, 0x7d,
-  0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6c, 0x65,
-  0x74, 0x27, 0x73, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6c,
-  0x6f, 0x63, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64,
-  0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61,
-  0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x69, 0x66, 0x20, 0x74, 0x68, 0x65, 0x72, 0x65, 0x20, 0x61, 0x72, 0x65,
-  0x20, 0x61, 0x6e, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x75, 0x73, 0x65, 0x72, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72,
-  0x65, 0x64, 0x20, 0x69, 0x6e, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x6f, 0x62,
-  0x6a, 0x65, 0x63, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x69, 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6f, 0x66, 0x20, 0x7b,
-  0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x6e, 0x61,
-  0x6d, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x64, 0x61, 0x74, 0x61, 0x22, 0x20, 0x7d, 0x20, 0x61, 0x6e,
-  0x64, 0x20, 0x7b, 0x20, 0x22, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
-  0x73, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x6e, 0x61, 0x6d,
-  0x65, 0x22, 0x3a, 0x22, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
-  0x64, 0x61, 0x74, 0x61, 0x22, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67,
-  0x28, 0x27, 0x49, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x69, 0x6e, 0x67, 0x20,
-  0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x73, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x6c,
-  0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65,
-  0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62,
-  0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, 0x29, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x6d, 0x70, 0x6f,
-  0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65,
-  0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x77, 0x65, 0x72, 0x65, 0x20, 0x73,
-  0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x66, 0x75, 0x6c, 0x6c, 0x79, 0x20,
-  0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
-  0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x50, 0x72, 0x6f, 0x63, 0x65, 0x73,
-  0x73, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64,
-  0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65,
-  0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20,
-  0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61,
-  0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x5d, 0x20,
-  0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28,
-  0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x73, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x20, 0x3d, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65,
-  0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x3b, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x6f, 0x76, 0x65,
-  0x72, 0x72, 0x69, 0x64, 0x65, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
-  0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73,
-  0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
-  0x74, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65,
-  0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d,
-  0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72,
-  0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, 0x2c,
-  0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65,
-  0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73,
-  0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x6e, 0x6f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65,
-  0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x64, 0x65, 0x74, 0x65,
-  0x63, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67,
-  0x28, 0x27, 0x49, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, 0x69, 0x7a, 0x69,
-  0x6e, 0x67, 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72,
-  0x61, 0x67, 0x65, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x61, 0x76, 0x69,
-  0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73,
-  0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x22, 0x64,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x22, 0x3a, 0x20, 0x7b, 0x20, 0x73,
-  0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73,
-  0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f,
-  0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44,
-  0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63,
-  0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, 0x2c, 0x20, 0x73, 0x61, 0x76, 0x65,
-  0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65,
-  0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54,
-  0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x52, 0x65, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c,
-  0x74, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65,
-  0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65,
-  0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65,
-  0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x73,
-  0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5b,
-  0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x5d, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72,
-  0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x70, 0x70, 0x6c,
-  0x79, 0x28, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
-  0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e,
-  0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c,
-  0x65, 0x63, 0x74, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x27, 0x20, 0x7d, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x2e,
-  0x64, 0x61, 0x74, 0x61, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20,
-  0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64,
-  0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x5d, 0x20, 0x7d, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74,
-  0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, 0x64,
-  0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x73, 0x65, 0x6c,
-  0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65,
-  0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61,
-  0x64, 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74,
-  0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x67, 0x65, 0x74, 0x20,
-  0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x6c, 0x61,
-  0x73, 0x74, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x6c, 0x6f,
-  0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f,
-  0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a,
-  0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65,
-  0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74,
-  0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, 0x65,
-  0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x41, 0x75, 0x74, 0x6f,
-  0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20, 0x72, 0x65,
-  0x73, 0x74, 0x6f, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63,
-  0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20,
-  0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20,
-  0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27,
-  0x4e, 0x6f, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64,
-  0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x66, 0x6f,
-  0x75, 0x6e, 0x64, 0x2c, 0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x64,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6e, 0x6f, 0x20, 0x61, 0x75, 0x74, 0x6f,
-  0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x75,
-  0x73, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65,
-  0x20, 0x77, 0x61, 0x73, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20,
-  0x73, 0x6f, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d,
-  0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74,
-  0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c,
-  0x6f, 0x67, 0x28, 0x27, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x69, 0x6e, 0x67,
-  0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x6e, 0x64,
-  0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x65,
-  0x72, 0x6e, 0x61, 0x6c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x66, 0x72,
-  0x6f, 0x6d, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72,
-  0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x70, 0x70, 0x6c,
-  0x79, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73,
-  0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x73, 0x61, 0x76, 0x65, 0x64,
-  0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c,
-  0x6f, 0x67, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55,
-  0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65,
-  0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x75, 0x74,
-  0x6f, 0x73, 0x61, 0x76, 0x65, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
-  0x6c, 0x6f, 0x67, 0x28, 0x27, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x20, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x2e, 0x2e,
-  0x2e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73,
-  0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d,
-  0x20, 0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x77, 0x65, 0x20, 0x64, 0x6f, 0x6e, 0x27, 0x74, 0x20, 0x77, 0x61,
-  0x6e, 0x74, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x20, 0x6f,
-  0x76, 0x65, 0x72, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x73, 0x6f,
-  0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74,
-  0x65, 0x20, 0x61, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x6f, 0x6e, 0x65, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e,
-  0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x55, 0x73, 0x65, 0x72, 0x54,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2d, 0x27, 0x20, 0x2b, 0x20,
-  0x44, 0x61, 0x74, 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, 0x29, 0x2e, 0x74,
-  0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x65,
-  0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20,
-  0x7b, 0x20, 0x27, 0x6e, 0x61, 0x6d, 0x65, 0x27, 0x3a, 0x20, 0x6e, 0x65,
-  0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d,
-  0x65, 0x2c, 0x20, 0x27, 0x64, 0x61, 0x74, 0x61, 0x27, 0x3a, 0x20, 0x7b,
-  0x20, 0x27, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20,
-  0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2c, 0x20, 0x27, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x27, 0x3a,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x20, 0x7d, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c,
-  0x6f, 0x67, 0x28, 0x27, 0x53, 0x61, 0x76, 0x69, 0x6e, 0x67, 0x20, 0x61,
-  0x73, 0x20, 0x27, 0x20, 0x2b, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73,
-  0x61, 0x76, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x61,
-  0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x20, 0x73, 0x6c, 0x6f, 0x74,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63,
-  0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73,
-  0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62,
-  0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74,
-  0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73,
-  0x74, 0x27, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x6c, 0x6f, 0x61,
-  0x64, 0x20, 0x69, 0x74, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x20, 0x61, 0x6e,
-  0x64, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, 0x64, 0x41,
-  0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65,
-  0x64, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
-  0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f,
-  0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61,
-  0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27,
-  0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x27, 0x2c, 0x20, 0x7b, 0x20,
-  0x27, 0x6e, 0x61, 0x6d, 0x65, 0x27, 0x3a, 0x20, 0x73, 0x65, 0x6c, 0x65,
-  0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e,
-  0x61, 0x6d, 0x65, 0x2c, 0x20, 0x27, 0x64, 0x61, 0x74, 0x61, 0x27, 0x3a,
-  0x20, 0x7b, 0x20, 0x27, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x27,
-  0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2c, 0x20, 0x27, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x27, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
-  0x6c, 0x6f, 0x67, 0x28, 0x27, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x69, 0x6e,
-  0x67, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61,
-  0x76, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x75, 0x73, 0x65,
-  0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, 0x64,
-  0x41, 0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76,
-  0x65, 0x64, 0x28, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a,
-  0x20, 0x45, 0x4e, 0x44, 0x3a, 0x20, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72,
-  0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x69, 0x6e,
-  0x67, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e,
-  0x20, 0x62, 0x72, 0x6f, 0x77, 0x73, 0x65, 0x72, 0x73, 0x20, 0x4c, 0x6f,
-  0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x2a,
-  0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x20,
-  0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x6e, 0x75, 0x6c,
-  0x6c, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x6e, 0x75, 0x6c,
-  0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63,
-  0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x6c, 0x79, 0x20, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x20, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x3f, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d,
-  0x70, 0x75, 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c,
-  0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x68, 0x61,
-  0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x73,
-  0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x20, 0x61, 0x20, 0x63, 0x68, 0x61,
-  0x74, 0x3f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64,
-  0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28,
-  0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e,
-  0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74,
-  0x68, 0x20, 0x3e, 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63,
-  0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x3d,
-  0x20, 0x28, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72,
-  0x69, 0x70, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x74, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
-  0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x28,
-  0x73, 0x74, 0x72, 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65,
-  0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65,
-  0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x7b,
-  0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
-  0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, 0x72, 0x69, 0x6e,
-  0x67, 0x28, 0x73, 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61,
-  0x63, 0x65, 0x41, 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, 0x5c, 0x7b, 0x28,
-  0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, 0x67, 0x2c, 0x20,
-  0x28, 0x5f, 0x2c, 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x74,
-  0x74, 0x69, 0x6e, 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, 0x5d, 0x29, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x72, 0x75, 0x6e, 0x4c, 0x6c, 0x61, 0x6d, 0x61,
-  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x6c, 0x6c, 0x61,
-  0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x68,
-  0x61, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
-  0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20,
-  0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20,
-  0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69,
-  0x70, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20,
-  0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x61,
-  0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, 0x6e, 0x6e, 0x69,
-  0x6e, 0x67, 0x22, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74,
-  0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20,
-  0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c,
-  0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77,
-  0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65,
-  0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x6c,
-  0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, 0x20, 0x26, 0x26,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x73, 0x5b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74,
-  0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e,
-  0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x2e, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x28, 0x2f,
-  0x5c, 0x6e, 0x24, 0x2f, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x75, 0x6c,
-  0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d,
-  0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55,
-  0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69,
-  0x73, 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x63, 0x68, 0x61, 0x72,
-  0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x69, 0x6e, 0x69, 0x73,
-  0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72,
-  0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73,
-  0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20,
-  0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, 0x2c, 0x20, 0x22,
-  0x27, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3a, 0x20,
-  0x22, 0x2c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x73, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x64, 0x61,
-  0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x20, 0x3d,
-  0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69,
-  0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65,
-  0x64, 0x5f, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x20, 0x26, 0x26, 0x20, 0x21,
-  0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x6f,
-  0x64, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x65, 0x72, 0x74,
-  0x28, 0x22, 0x54, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
-  0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x63, 0x6f, 0x6d,
-  0x70, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x75,
-  0x6c, 0x74, 0x69, 0x6d, 0x6f, 0x64, 0x61, 0x6c, 0x20, 0x6f, 0x72, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x70, 0x72,
-  0x6f, 0x6a, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x20, 0x63, 0x61, 0x6e, 0x27,
-  0x74, 0x20, 0x62, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x2e,
-  0x22, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72,
-  0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61,
-  0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f,
-  0x72, 0x79, 0x2c, 0x20, 0x5b, 0x63, 0x68, 0x61, 0x72, 0x2c, 0x20, 0x63,
-  0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67,
-  0x65, 0x73, 0x5d, 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
-  0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20,
-  0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x73, 0x65, 0x6e, 0x64, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-  0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61,
-  0x74, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x6d,
-  0x73, 0x67, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27,
-  0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, 0x6e, 0x6e,
-  0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72,
-  0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e,
-  0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69,
-  0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72,
-  0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x5d, 0x29, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x3a, 0x20,
-  0x6d, 0x73, 0x67, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x3a, 0x20, 0x73, 0x65,
-  0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e,
-  0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x66,
-  0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x28, 0x5b, 0x6e, 0x61, 0x6d, 0x65,
-  0x2c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x3a, 0x20, 0x41, 0x72,
-  0x72, 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28,
-  0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3f, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d,
-  0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28,
-  0x27, 0x27, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28,
-  0x2f, 0x5e, 0x5c, 0x73, 0x2f, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x20, 0x3a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x22, 0x5c,
-  0x6e, 0x22, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x69, 0x6d,
-  0x61, 0x67, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20,
-  0x60, 0x41, 0x20, 0x63, 0x68, 0x61, 0x74, 0x20, 0x62, 0x65, 0x74, 0x77,
-  0x65, 0x65, 0x6e, 0x20, 0x61, 0x20, 0x63, 0x75, 0x72, 0x69, 0x6f, 0x75,
-  0x73, 0x20, 0x68, 0x75, 0x6d, 0x61, 0x6e, 0x20, 0x61, 0x6e, 0x64, 0x20,
-  0x61, 0x6e, 0x20, 0x61, 0x72, 0x74, 0x69, 0x66, 0x69, 0x63, 0x69, 0x61,
-  0x6c, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x6c, 0x6c, 0x69, 0x67, 0x65, 0x6e,
-  0x63, 0x65, 0x20, 0x61, 0x73, 0x73, 0x69, 0x73, 0x74, 0x61, 0x6e, 0x74,
-  0x2e, 0x20, 0x54, 0x68, 0x65, 0x20, 0x61, 0x73, 0x73, 0x69, 0x73, 0x74,
-  0x61, 0x6e, 0x74, 0x20, 0x67, 0x69, 0x76, 0x65, 0x73, 0x20, 0x68, 0x65,
-  0x6c, 0x70, 0x66, 0x75, 0x6c, 0x2c, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69,
-  0x6c, 0x65, 0x64, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x70, 0x6f, 0x6c,
-  0x69, 0x74, 0x65, 0x20, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20,
-  0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x68, 0x75, 0x6d, 0x61, 0x6e,
-  0x27, 0x73, 0x20, 0x71, 0x75, 0x65, 0x73, 0x74, 0x69, 0x6f, 0x6e, 0x73,
-  0x2e, 0x5c, 0x6e, 0x55, 0x53, 0x45, 0x52, 0x3a, 0x5b, 0x69, 0x6d, 0x67,
-  0x2d, 0x31, 0x30, 0x5d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, 0x5c, 0x6e,
-  0x41, 0x53, 0x53, 0x49, 0x53, 0x54, 0x41, 0x4e, 0x54, 0x3a, 0x60, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x75, 0x6e,
-  0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x3a, 0x20, 0x73, 0x6c,
-  0x6f, 0x74, 0x5f, 0x69, 0x64, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c,
-  0x2f, 0x73, 0x3e, 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x28, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d,
-  0x3a, 0x22, 0x29, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x28, 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x3a,
-  0x22, 0x29, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x2c, 0x20, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x22,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6e, 0x43,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20,
-  0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
-  0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x61,
-  0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, 0x6e, 0x6e, 0x69,
-  0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x73, 0x65,
-  0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73,
-  0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28,
-  0x5b, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63,
-  0x72, 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x22, 0x2c, 0x20, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6e, 0x4c, 0x6c, 0x61, 0x6d, 0x61,
-  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x6c, 0x6f, 0x74,
-  0x5f, 0x69, 0x64, 0x3a, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64,
-  0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74,
-  0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x2c, 0x20, 0x22, 0x22, 0x29, 0x2e, 0x66, 0x69, 0x6e,
-  0x61, 0x6c, 0x6c, 0x79, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73,
-  0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73,
-  0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72,
-  0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x6d, 0x61, 0x70,
-  0x28, 0x28, 0x5b, 0x5f, 0x2c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x5d, 0x29,
-  0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72,
-  0x72, 0x61, 0x79, 0x28, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3f, 0x20,
-  0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67,
-  0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27,
-  0x29, 0x20, 0x3a, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28,
-  0x27, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70,
-  0x74, 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74,
-  0x6f, 0x70, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
-  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
-  0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x61,
-  0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c,
-  0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e,
-  0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20,
-  0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e,
-  0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65,
-  0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75,
-  0x70, 0x6c, 0x6f, 0x61, 0x64, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x20, 0x3d,
-  0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
-  0x6e, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e,
-  0x74, 0x42, 0x79, 0x49, 0x64, 0x28, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x49,
-  0x6e, 0x70, 0x75, 0x74, 0x22, 0x29, 0x2e, 0x63, 0x6c, 0x69, 0x63, 0x6b,
-  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
-  0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x45, 0x6c,
-  0x65, 0x6d, 0x65, 0x6e, 0x74, 0x42, 0x79, 0x49, 0x64, 0x28, 0x22, 0x66,
-  0x69, 0x6c, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, 0x29, 0x2e, 0x61,
-  0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65,
-  0x6e, 0x65, 0x72, 0x28, 0x22, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x22,
-  0x2c, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x28,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73,
-  0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x20,
-  0x3d, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x46, 0x69, 0x6c,
-  0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x61,
-  0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x46, 0x69,
-  0x6c, 0x65, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x61, 0x64, 0x65, 0x72, 0x2e, 0x6f, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x20,
-  0x3d, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x28,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6d,
-  0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x72,
-  0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73,
-  0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c, 0x65,
-  0x63, 0x74, 0x65, 0x64, 0x3a, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f,
-  0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67,
-  0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7b, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x69, 0x6d,
-  0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x72, 0x65, 0x70,
-  0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x69,
-  0x6d, 0x61, 0x67, 0x65, 0x5c, 0x2f, 0x5b, 0x5e, 0x3b, 0x5d, 0x2b, 0x3b,
-  0x62, 0x61, 0x73, 0x65, 0x36, 0x34, 0x2c, 0x2f, 0x2c, 0x20, 0x27, 0x27,
-  0x29, 0x2c, 0x20, 0x69, 0x64, 0x3a, 0x20, 0x31, 0x30, 0x20, 0x7d, 0x5d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x69,
-  0x6d, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
-  0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x41, 0x73,
-  0x44, 0x61, 0x74, 0x61, 0x55, 0x52, 0x4c, 0x28, 0x73, 0x65, 0x6c, 0x65,
-  0x63, 0x74, 0x65, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e,
-  0x70, 0x75, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67,
-  0x6e, 0x61, 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62,
-  0x6d, 0x69, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73,
-  0x74, 0x6f, 0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69,
-  0x74, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29,
-  0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e,
-  0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33,
-  0x20, 0x26, 0x26, 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73,
-  0x68, 0x69, 0x66, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62,
-  0x6d, 0x69, 0x74, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66,
-  0x6f, 0x72, 0x6d, 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74,
-  0x3d, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64,
-  0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x4e, 0x61,
-  0x6d, 0x65, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
-  0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20,
-  0x22, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x22, 0x20, 0x3a, 0x20,
-  0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6e, 0x69,
-  0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, 0x70, 0x72, 0x65, 0x73, 0x73, 0x3d,
-  0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69,
-  0x74, 0x73, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65,
-  0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20,
-  0x73, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e,
-  0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x32, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78,
-  0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d,
-  0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x22,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20,
-  0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74,
-  0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74,
-  0x79, 0x70, 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22,
-  0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, 0x2f,
-  0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74,
-  0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d,
-  0x24, 0x7b, 0x75, 0x70, 0x6c, 0x6f, 0x61, 0x64, 0x49, 0x6d, 0x61, 0x67,
-  0x65, 0x7d, 0x3e, 0x55, 0x70, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x49, 0x6d,
-  0x61, 0x67, 0x65, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63,
-  0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70, 0x7d,
-  0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b,
-  0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, 0x3c,
-  0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75,
-  0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b,
-  0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65,
-  0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x73, 0x28, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6e, 0x43,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74,
-  0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20,
-  0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x73, 0x75,
-  0x62, 0x6d, 0x69, 0x74, 0x7d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22,
-  0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x22, 0x20, 0x64, 0x69, 0x73, 0x61,
-  0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d,
-  0x3e, 0x53, 0x74, 0x61, 0x72, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74,
-  0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e,
-  0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70,
-  0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24,
-  0x7b, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70,
-  0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74,
-  0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d,
-  0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, 0x73,
-  0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69,
-  0x76, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x68,
-  0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f,
-  0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73,
-  0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72,
-  0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x75, 0x73,
-  0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66,
-  0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73,
-  0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x6f, 0x74,
-  0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, 0x64,
-  0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74,
-  0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72,
-  0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x70, 0x61, 0x72,
-  0x65, 0x6e, 0x74, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, 0x70, 0x61,
-  0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48,
-  0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, 0x70, 0x61, 0x72,
-  0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f,
-  0x70, 0x20, 0x2b, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x6f,
-  0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20,
-  0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e,
-  0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x28, 0x30,
-  0x2c, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72,
-  0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, 0x73, 0x61,
-  0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x73, 0x43, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20,
-  0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
-  0x20, 0x27, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x20, 0x3d,
-  0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, 0x2c, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x5d, 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x6c, 0x65, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x4d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x41, 0x72, 0x72, 0x61, 0x79,
-  0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, 0x64, 0x61, 0x74,
-  0x61, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x20,
-  0x3e, 0x20, 0x30, 0x20, 0x26, 0x26, 0x20, 0x69, 0x73, 0x41, 0x72, 0x72,
-  0x61, 0x79, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d,
-  0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x68, 0x74, 0x6d,
-  0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69,
-  0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, 0x61, 0x74, 0x61,
-  0x3d, 0x24, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x7d, 0x20, 0x2f, 0x3e, 0x60,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65,
-  0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65,
-  0x78, 0x74, 0x20, 0x3d, 0x20, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79,
-  0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3f, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x61,
-  0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d,
-  0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73,
-  0x2b, 0x2f, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x20, 0x3a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x61,
-  0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20,
-  0x69, 0x73, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x4d, 0x6f, 0x64, 0x65, 0x20, 0x3f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20,
-  0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x4d, 0x61,
-  0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x7d, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x28, 0x74, 0x65, 0x78, 0x74, 0x29, 0x7d, 0x20, 0x2f, 0x3e,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x75, 0x73, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, 0x6b, 0x65, 0x79,
-  0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, 0x3e, 0x3c, 0x73,
-  0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x7d, 0x3a,
-  0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x20, 0x24, 0x7b,
-  0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x3c, 0x2f, 0x70, 0x3e,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
-  0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x69, 0x73, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x4d, 0x6f, 0x64, 0x65, 0x20, 0x3f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60,
-  0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b,
-  0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, 0x3e, 0x24, 0x7b, 0x6d, 0x65, 0x73,
-  0x73, 0x61, 0x67, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e,
-  0x60, 0x20, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20,
-  0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d,
-  0x3e, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x3c,
-  0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x6c,
-  0x65, 0x74, 0x69, 0x6f, 0x6e, 0x45, 0x64, 0x69, 0x74, 0x20, 0x3d, 0x20,
-  0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x20, 0x3d, 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x54, 0x65, 0x78, 0x74, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73,
-  0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72,
-  0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x5b,
-  0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22,
-  0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b,
-  0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x20, 0x6b,
-  0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-  0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x7d, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6d,
-  0x67, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x22, 0x77, 0x69, 0x64,
-  0x74, 0x68, 0x3a, 0x20, 0x36, 0x30, 0x25, 0x3b, 0x24, 0x7b, 0x21, 0x73,
-  0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2e, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c, 0x65, 0x63,
-  0x74, 0x65, 0x64, 0x20, 0x3f, 0x20, 0x60, 0x64, 0x69, 0x73, 0x70, 0x6c,
-  0x61, 0x79, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x60, 0x20, 0x3a,
-  0x20, 0x60, 0x60, 0x7d, 0x22, 0x20, 0x73, 0x72, 0x63, 0x3d, 0x22, 0x24,
-  0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c,
-  0x65, 0x63, 0x74, 0x65, 0x64, 0x7d, 0x22, 0x2f, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61,
-  0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x65, 0x64, 0x69,
-  0x74, 0x61, 0x62, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x69, 0x73, 0x43, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x4d, 0x6f, 0x64, 0x65,
-  0x7d, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x74,
-  0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70,
-  0x75, 0x74, 0x3d, 0x24, 0x7b, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x45, 0x64, 0x69,
-  0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
-  0x65, 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x63,
-  0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x29, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70,
-  0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72,
-  0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53,
-  0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
-  0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65,
-  0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29,
-  0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e,
-  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65,
-  0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
-  0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x70, 0x61,
-  0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, 0x6c, 0x2e,
-  0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65,
-  0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
-  0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x4d, 0x61,
-  0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x70, 0x61, 0x72,
-  0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29,
-  0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72,
-  0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72,
-  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69,
-  0x67, 0x6e, 0x61, 0x6c, 0x28, 0x27, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64,
-  0x61, 0x74, 0x65, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73,
-  0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70,
-  0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29,
-  0x20, 0x3d, 0x3e, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a,
-  0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f,
-  0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x20, 0x3d, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65,
-  0x72, 0x74, 0x4a, 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
-  0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x29,
-  0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f,
-  0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x67, 0x72, 0x61,
-  0x6d, 0x6d, 0x61, 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65,
-  0x77, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x43, 0x6f, 0x6e, 0x76,
-  0x65, 0x72, 0x74, 0x65, 0x72, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d,
-  0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74,
-  0x28, 0x27, 0x2c, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x64,
-  0x75, 0x63, 0x65, 0x28, 0x28, 0x61, 0x63, 0x63, 0x2c, 0x20, 0x63, 0x75,
-  0x72, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x20,
-  0x2e, 0x2e, 0x2e, 0x61, 0x63, 0x63, 0x2c, 0x20, 0x5b, 0x63, 0x75, 0x72,
-  0x2e, 0x74, 0x72, 0x69, 0x6d, 0x28, 0x29, 0x5d, 0x3a, 0x20, 0x69, 0x20,
-  0x7d, 0x29, 0x2c, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72,
-  0x74, 0x65, 0x72, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63,
-  0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61,
-  0x72, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72,
-  0x2e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d,
-  0x61, 0x72, 0x28, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x61, 0x6c, 0x65, 0x72, 0x74, 0x28, 0x60, 0x43, 0x6f, 0x6e,
-  0x76, 0x65, 0x72, 0x74, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x3a,
-  0x20, 0x24, 0x7b, 0x65, 0x2e, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-  0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46,
-  0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20,
-  0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61,
-  0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
-  0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x20, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24,
-  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61,
-  0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65,
-  0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d,
-  0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69,
-  0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d,
-  0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20,
-  0x73, 0x74, 0x65, 0x70, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70,
-  0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e,
-  0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d,
-  0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f,
-  0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64,
-  0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f,
-  0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e,
-  0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73,
-  0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65,
-  0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65,
-  0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c,
-  0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x20, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c,
-  0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b,
-  0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62,
-  0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d,
-  0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22,
-  0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e,
-  0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61,
-  0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x6e,
-  0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d,
-  0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70,
-  0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64,
-  0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65,
-  0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65,
-  0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f,
-  0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, 0x64, 0x41, 0x70,
-  0x70, 0x6c, 0x79, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, 0x75, 0x74, 0x74,
-  0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65,
-  0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x20,
-  0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x64, 0x69, 0x73, 0x61,
-  0x62, 0x6c, 0x65, 0x64, 0x3e, 0x55, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x64,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74,
-  0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24,
-  0x7b, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, 0x73, 0x65,
-  0x74, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x64, 0x65, 0x66,
-  0x61, 0x75, 0x6c, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63,
-  0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x75, 0x74,
-  0x6f, 0x73, 0x61, 0x76, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x20, 0x6f, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x72, 0x79, 0x20,
-  0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x28,
-  0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b,
-  0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x72, 0x61, 0x6d, 0x6d,
-  0x61, 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x20, 0x3d, 0x20,
-  0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72,
-  0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e,
-  0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3c, 0x2f, 0x6c, 0x61, 0x62,
-  0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65,
-  0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61,
-  0x72, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x67, 0x72, 0x61,
-  0x6d, 0x6d, 0x61, 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68,
-  0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x55, 0x73, 0x65, 0x20, 0x67,
-  0x62, 0x6e, 0x66, 0x20, 0x6f, 0x72, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20,
-  0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2b, 0x63, 0x6f, 0x6e, 0x76, 0x65,
-  0x72, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24,
-  0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2e, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x22, 0x20,
-  0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70,
-  0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e,
-  0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65,
-  0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72,
-  0x6f, 0x70, 0x2d, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x22, 0x20, 0x70, 0x6c,
-  0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x6f,
-  0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x31, 0x2c,
-  0x70, 0x72, 0x6f, 0x70, 0x32, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x33, 0x22,
-  0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75,
-  0x70, 0x64, 0x61, 0x74, 0x65, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72,
-  0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72,
-  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x7d, 0x20, 0x2f, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65,
-  0x3d, 0x22, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x22, 0x20, 0x6f, 0x6e,
-  0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x76,
-  0x65, 0x72, 0x74, 0x4a, 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, 0x65, 0x6d,
-  0x61, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x3e, 0x43, 0x6f,
-  0x6e, 0x76, 0x65, 0x72, 0x74, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53,
-  0x63, 0x68, 0x65, 0x6d, 0x61, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f,
-  0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64,
-  0x53, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74,
-  0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69,
-  0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x68, 0x74,
-  0x6d, 0x6c, 0x46, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c,
-  0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61,
-  0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65,
-  0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d,
-  0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d,
-  0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b,
-  0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c,
-  0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x43, 0x68, 0x61, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67,
-  0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68,
-  0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53,
-  0x65, 0x74, 0x28, 0x29, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73,
-  0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77,
-  0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22,
-  0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, 0x6e,
-  0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70,
-  0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d,
-  0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, 0x72,
-  0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24,
-  0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69,
-  0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76,
-  0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x62,
-  0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, 0x65,
-  0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22,
-  0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22,
-  0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d,
-  0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, 0x20,
-  0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70,
-  0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d,
-  0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66,
-  0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65,
-  0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f,
-  0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22,
-  0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70,
-  0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61,
-  0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74,
-  0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20,
-  0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70,
-  0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d,
-  0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65,
-  0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c,
-  0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, 0x68, 0x69,
-  0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69,
-  0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22,
-  0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, 0x74, 0x6f,
-  0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73,
-  0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68,
-  0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61,
-  0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x31, 0x20,
-  0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70,
-  0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d,
-  0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b,
-  0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x28, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73,
-  0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x66,
-  0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c,
-  0x64, 0x53, 0x65, 0x74, 0x28, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64,
-  0x73, 0x65, 0x74, 0x3e, 0x24, 0x7b, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61,
-  0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x28, 0x29, 0x7d, 0x3c,
-  0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66,
-  0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74,
-  0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x24, 0x7b, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d,
-  0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, 0x75,
-  0x74, 0x74, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x63,
-  0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x73, 0x6c, 0x69, 0x6d, 0x22, 0x3e,
-  0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d,
-  0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65,
-  0x3d, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x63, 0x68, 0x65,
-  0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69,
-  0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70,
-  0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22,
-  0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b,
-  0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x43, 0x68, 0x61, 0x74, 0x3c, 0x2f,
-  0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61,
-  0x62, 0x65, 0x6c, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x73,
-  0x6c, 0x69, 0x6d, 0x22, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20,
-  0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22,
-  0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22,
-  0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x63, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63,
-  0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
-  0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65,
-  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
-  0x74, 0x69, 0x6f, 0x6e, 0x22, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70,
-  0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53,
-  0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x43,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x3c, 0x2f, 0x6c,
-  0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b,
-  0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27,
-  0x63, 0x68, 0x61, 0x74, 0x27, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74,
-  0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x28, 0x29,
-  0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f,
-  0x6e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x28,
-  0x29, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20,
-  0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28,
-  0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72,
-  0x65, 0x64, 0x69, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20,
-  0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d,
-  0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
-  0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74,
-  0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e,
-  0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x20, 0x7d, 0x29, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c,
-  0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22,
-  0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22,
-  0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20,
-  0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61,
-  0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61,
-  0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a,
-  0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75,
-  0x72, 0x65, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f,
-  0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61,
-  0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x69,
-  0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, 0x73, 0x65,
-  0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78,
-  0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20,
-  0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22,
-  0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c,
-  0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30,
-  0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20,
-  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61,
-  0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e,
-  0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62,
-  0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65,
-  0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x66,
-  0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x22,
-  0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c,
-  0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d,
-  0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c,
-  0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c,
-  0x61, 0x73, 0x74, 0x5f, 0x6e, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b,
-  0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c,
-  0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b,
-  0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20,
-  0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69,
-  0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a,
-  0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x20,
-  0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46,
-  0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c,
-  0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d,
-  0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a,
-  0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30,
-  0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74,
-  0x6f, 0x70, 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a,
-  0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x20, 0x7d, 0x29, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c,
-  0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22,
-  0x4d, 0x69, 0x6e, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69,
-  0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e,
-  0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c,
-  0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x6e, 0x5f,
-  0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e,
-  0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e,
-  0x6d, 0x69, 0x6e, 0x5f, 0x70, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69,
-  0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69,
-  0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79,
-  0x3e, 0x4d, 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e,
-  0x73, 0x3c, 0x2f, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c,
-  0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c,
-  0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22,
-  0x54, 0x46, 0x53, 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a,
-  0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30,
-  0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74,
-  0x66, 0x73, 0x5f, 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a,
-  0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x20, 0x7d, 0x29, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69,
-  0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a,
-  0x20, 0x22, 0x54, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22,
-  0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20,
-  0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61,
-  0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c,
-  0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30,
-  0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20,
-  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2e, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x20, 0x7d,
-  0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74,
-  0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65,
-  0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65,
-  0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d,
-  0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e,
-  0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a,
-  0x20, 0x22, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70,
-  0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65,
-  0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63,
-  0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46,
-  0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c,
-  0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79,
-  0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d,
-  0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e,
-  0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a,
-  0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f,
-  0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74,
-  0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20,
-  0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73,
-  0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61,
-  0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75,
-  0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69,
-  0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72,
-  0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64,
-  0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74,
-  0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70,
-  0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e,
-  0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74,
-  0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70,
-  0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64,
-  0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69,
-  0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65,
-  0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61,
-  0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e,
-  0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65,
-  0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f,
-  0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76,
-  0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e,
-  0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61,
-  0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d,
-  0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b,
-  0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74,
-  0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, 0x6e, 0x69,
-  0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74,
-  0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20,
-  0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20,
-  0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b,
-  0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b,
-  0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72,
-  0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20,
-  0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6d,
-  0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d,
-  0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74,
-  0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a,
-  0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f,
-  0x74, 0x61, 0x75, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b,
-  0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b,
-  0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72,
-  0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20,
-  0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69,
-  0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
-  0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f,
-  0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20,
-  0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65,
-  0x74, 0x61, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65,
-  0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c,
-  0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e,
-  0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62,
-  0x65, 0x6c, 0x3a, 0x20, 0x22, 0x53, 0x68, 0x6f, 0x77, 0x20, 0x50, 0x72,
-  0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x22,
-  0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2c, 0x20, 0x6d,
-  0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a,
-  0x20, 0x22, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x22, 0x2c, 0x20,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72,
-  0x6f, 0x62, 0x73, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69,
-  0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65,
-  0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61,
-  0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x61, 0x70, 0x69,
-  0x5f, 0x6b, 0x65, 0x79, 0x22, 0x3e, 0x41, 0x50, 0x49, 0x20, 0x4b, 0x65,
-  0x79, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d,
-  0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d,
-  0x22, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x22, 0x20, 0x76, 0x61,
-  0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x61, 0x70, 0x69, 0x5f,
-  0x6b, 0x65, 0x79, 0x7d, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68,
-  0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x45, 0x6e, 0x74, 0x65, 0x72,
-  0x20, 0x41, 0x50, 0x49, 0x20, 0x6b, 0x65, 0x79, 0x22, 0x20, 0x6f, 0x6e,
-  0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61,
-  0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x7d, 0x20, 0x2f, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72,
-  0x20, 0x3d, 0x20, 0x28, 0x70, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x72, 0x20, 0x3d, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f,
-  0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, 0x2a, 0x20, 0x28, 0x31, 0x20,
-  0x2d, 0x20, 0x70, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x20, 0x3d, 0x20, 0x4d,
-  0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x31, 0x39,
-  0x32, 0x20, 0x2a, 0x20, 0x70, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x60, 0x72, 0x67,
-  0x62, 0x61, 0x28, 0x24, 0x7b, 0x72, 0x7d, 0x2c, 0x24, 0x7b, 0x67, 0x7d,
-  0x2c, 0x30, 0x2c, 0x30, 0x2e, 0x33, 0x29, 0x60, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69,
-  0x74, 0x69, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61,
-  0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f,
-  0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74,
-  0x69, 0x65, 0x73, 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x6d, 0x73, 0x67, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x21, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65,
-  0x73, 0x20, 0x7c, 0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f,
-  0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74,
-  0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3d,
-  0x3d, 0x3d, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x73,
-  0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72,
-  0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e,
-  0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x31, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x2f, 0x2f, 0x20, 0x4e, 0x6f, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x62,
-  0x79, 0x74, 0x65, 0x20, 0x70, 0x61, 0x69, 0x72, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72,
-  0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x5b,
-  0x30, 0x5d, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2e, 0x73,
-  0x74, 0x61, 0x72, 0x74, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x62,
-  0x79, 0x74, 0x65, 0x3a, 0x20, 0x5c, 0x5c, 0x27, 0x29, 0x29, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73,
-  0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63,
-  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72,
-  0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e,
-  0x6d, 0x61, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x62, 0x20, 0x3d, 0x3e, 0x20,
-  0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20,
-  0x70, 0x72, 0x6f, 0x62, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69,
-  0x65, 0x73, 0x3a, 0x20, 0x5b, 0x70, 0x72, 0x6f, 0x62, 0x5d, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x29,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c,
-  0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74,
-  0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3d, 0x24, 0x7b,
-  0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, 0x74, 0x61, 0x7d, 0x20, 0x2f,
-  0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2c,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x3d,
-  0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65,
-  0x73, 0x5b, 0x30, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64,
-  0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, 0x66, 0x69, 0x6e,
-  0x64, 0x28, 0x70, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x2e, 0x74, 0x6f, 0x6b,
-  0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x6d, 0x73, 0x67,
-  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x70, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x66, 0x6f, 0x75,
-  0x6e, 0x64, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f, 0x6c,
-  0x6f, 0x72, 0x28, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2e, 0x70, 0x72, 0x6f,
-  0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70,
-  0x61, 0x72, 0x65, 0x6e, 0x74, 0x27, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f,
-  0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65,
-  0x6e, 0x20, 0x3d, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
-  0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x62,
-  0x2d, 0x73, 0x65, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f,
-  0x62, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x28, 0x70, 0x2c, 0x20, 0x69,
-  0x6e, 0x64, 0x65, 0x78, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x69, 0x74, 0x6c,
-  0x65, 0x3d, 0x24, 0x7b, 0x60, 0x70, 0x72, 0x6f, 0x62, 0x3a, 0x20, 0x24,
-  0x7b, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x7d, 0x60, 0x7d, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24,
-  0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67,
-  0x3a, 0x20, 0x27, 0x30, 0x2e, 0x33, 0x65, 0x6d, 0x27, 0x2c, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x43,
-  0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x2e, 0x74, 0x6f, 0x6b, 0x5f,
-  0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f,
-  0x6c, 0x6f, 0x72, 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x29, 0x20,
-  0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65,
-  0x6e, 0x74, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e,
-  0x3e, 0x24, 0x7b, 0x70, 0x2e, 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72,
-  0x7d, 0x3a, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24,
-  0x7b, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28,
-  0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x20, 0x2a, 0x20, 0x31, 0x30, 0x30,
-  0x29, 0x7d, 0x25, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x60, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d,
-  0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x7d,
-  0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x62,
-  0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x43, 0x6f, 0x6c,
-  0x6f, 0x72, 0x3a, 0x20, 0x70, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x7d,
-  0x7d, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69,
-  0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x24, 0x7b, 0x70, 0x6f, 0x70, 0x6f,
-  0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d,
-  0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x28, 0x2f, 0x5c,
-  0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x29, 0x20, 0x3f, 0x20, 0x68, 0x74, 0x6d,
-  0x6c, 0x60, 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x60, 0x20, 0x3a, 0x20,
-  0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, 0x73, 0x20, 0x6d,
-  0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x6c,
-  0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f,
-  0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72,
-  0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x64,
-  0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74, 0x65,
-  0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x26, 0x2f, 0x67,
-  0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c,
-  0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26,
-  0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f,
-  0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, 0x3b, 0x27, 0x29,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65,
-  0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31, 0x2c,
-  0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69, 0x6d,
-  0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x68,
-  0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c,
-  0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x5c, 0x2a,
-  0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67,
-  0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e,
-  0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f, 0x28,
-  0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c,
-  0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73,
-  0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63,
-  0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a,
-  0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c,
-  0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28,
-  0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, 0x2c, 0x20,
-  0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e,
-  0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60, 0x60,
-  0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53, 0x5d,
-  0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c,
-  0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31,
-  0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72, 0x65,
-  0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x28,
-  0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x63,
-  0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65,
-  0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x6e,
-  0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, 0x20, 0x2f,
-  0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c,
-  0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f,
-  0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72,
-  0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f, 0x68,
-  0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, 0x20, 0x2f,
-  0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x6f,
-  0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
-  0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72,
-  0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c, 0x61,
-  0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c,
-  0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70,
-  0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61,
-  0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x6b,
-  0x65, 0x6e, 0x73, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65,
-  0x64, 0x7d, 0x20, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64,
-  0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61,
-  0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x6b,
-  0x65, 0x6e, 0x73, 0x5f, 0x63, 0x61, 0x63, 0x68, 0x65, 0x64, 0x7d, 0x20,
-  0x63, 0x61, 0x63, 0x68, 0x65, 0x64, 0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x2e, 0x70,
-  0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72,
-  0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f,
-  0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70,
-  0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73,
-  0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70,
-  0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x2e, 0x74, 0x6f,
-  0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, 0x7d, 0x20, 0x74, 0x6f,
-  0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x73, 0x65, 0x63,
-  0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65,
-  0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69, 0x6d, 0x70,
-  0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69,
-  0x73, 0x4f, 0x70, 0x65, 0x6e, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53,
-  0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x3d,
-  0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b,
-  0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x27, 0x30, 0x70, 0x78, 0x27, 0x2c,
-  0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x27, 0x30, 0x70, 0x78, 0x27,
-  0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52,
-  0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28,
-  0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76,
-  0x65, 0x72, 0x52, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52,
-  0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74,
-  0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x62,
-  0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72,
-  0x72, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x72, 0x65, 0x63, 0x74, 0x20, 0x3d, 0x20, 0x62, 0x75, 0x74, 0x74, 0x6f,
-  0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74,
-  0x2e, 0x67, 0x65, 0x74, 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x69, 0x6e, 0x67,
-  0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x63, 0x74, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20,
-  0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, 0x2e, 0x62, 0x6f, 0x74, 0x74,
-  0x6f, 0x6d, 0x20, 0x2b, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x2e,
-  0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x59, 0x7d, 0x70, 0x78, 0x60, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x60, 0x24, 0x7b, 0x72, 0x65,
-  0x63, 0x74, 0x2e, 0x6c, 0x65, 0x66, 0x74, 0x20, 0x2b, 0x20, 0x77, 0x69,
-  0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x58,
-  0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x20, 0x3d, 0x20, 0x21, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43,
-  0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x20,
-  0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65,
-  0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26,
-  0x20, 0x21, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66,
-  0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x6f, 0x6e,
-  0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e,
-  0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, 0x20, 0x26, 0x26, 0x20, 0x21,
-  0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75,
-  0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69,
-  0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x66, 0x61, 0x6c,
-  0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66,
-  0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e,
-  0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x27, 0x6d,
-  0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x27, 0x2c, 0x20, 0x68,
-  0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75,
-  0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x76, 0x65, 0x6e,
-  0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x27, 0x6d,
-  0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x27, 0x2c, 0x20, 0x68,
-  0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75,
-  0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68,
-  0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65,
-  0x3d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x74, 0x79,
-  0x6c, 0x65, 0x7d, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x62, 0x75,
-  0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x7d, 0x20, 0x6f, 0x6e, 0x43,
-  0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x74, 0x6f, 0x67, 0x67, 0x6c,
-  0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x7d, 0x3e, 0x24, 0x7b,
-  0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72,
-  0x65, 0x6e, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x69, 0x73, 0x4f,
-  0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x26, 0x26,
-  0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74,
-  0x61, 0x6c, 0x7d, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x3d, 0x22, 0x23, 0x70,
-  0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x70, 0x6f, 0x70,
-  0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65,
-  0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x6f, 0x70, 0x3a, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69,
-  0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6c, 0x65, 0x66,
-  0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x70, 0x72,
-  0x6f, 0x70, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43,
-  0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64,
-  0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c,
-  0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x53, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x3a, 0x20, 0x70, 0x72,
-  0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x20,
-  0x28, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74,
-  0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x65, 0x76, 0x65,
-  0x6c, 0x6f, 0x70, 0x69, 0x74, 0x2f, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74,
-  0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x2f, 0x62, 0x6c, 0x6f, 0x62,
-  0x2f, 0x6d, 0x61, 0x73, 0x74, 0x65, 0x72, 0x2f, 0x73, 0x72, 0x63, 0x2f,
-  0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61,
-  0x6c, 0x2e, 0x6a, 0x73, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a,
-  0x2a, 0x20, 0x52, 0x65, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x20, 0x72,
-  0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, 0x20,
-  0x64, 0x65, 0x73, 0x63, 0x65, 0x6e, 0x64, 0x61, 0x6e, 0x74, 0x73, 0x20,
-  0x69, 0x6e, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x69, 0x76,
-  0x65, 0x6e, 0x20, 0x43, 0x53, 0x53, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63,
-  0x74, 0x6f, 0x72, 0x20, 0x2a, 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6c, 0x61, 0x73, 0x73, 0x20, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x20,
-  0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, 0x70,
-  0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44,
-  0x69, 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x70, 0x72, 0x6f,
-  0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69,
-  0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x20,
-  0x21, 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f,
-  0x70, 0x73, 0x5b, 0x69, 0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f,
-  0x75, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64,
-  0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
-  0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x4d,
-  0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73,
-  0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x74, 0x72,
-  0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c,
-  0x61, 0x79, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x2e,
-  0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72,
-  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f,
-  0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f,
-  0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e,
-  0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x66, 0x61, 0x6c,
-  0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, 0x6e,
-  0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65,
-  0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d,
-  0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, 0x6f,
-  0x64, 0x65, 0x29, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d,
-  0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, 0x6f,
-  0x64, 0x65, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, 0x69,
-  0x6c, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f,
-  0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x64,
-  0x4e, 0x6f, 0x64, 0x65, 0x28, 0x6e, 0x6f, 0x64, 0x65, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e,
-  0x6f, 0x64, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x73, 0x74, 0x72,
-  0x69, 0x6e, 0x67, 0x27, 0x20, 0x3f, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d,
-  0x65, 0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, 0x72, 0x79, 0x53, 0x65, 0x6c,
-  0x65, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x6e, 0x6f, 0x64, 0x65, 0x29, 0x20,
-  0x3a, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
-  0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x73,
-  0x68, 0x6f, 0x77, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f,
-  0x75, 0x6e, 0x74, 0x65, 0x64, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x2f, 0x2f, 0x20, 0x63, 0x6c, 0x65, 0x61, 0x6e, 0x20, 0x75, 0x70, 0x20,
-  0x6f, 0x6c, 0x64, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x20, 0x69, 0x66, 0x20,
-  0x6d, 0x6f, 0x76, 0x69, 0x6e, 0x67, 0x20, 0x62, 0x61, 0x73, 0x65, 0x73,
-  0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73,
-  0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x50, 0x6f, 0x69, 0x6e, 0x74,
-  0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74,
-  0x6f, 0x50, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e,
-  0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69,
-  0x6e, 0x74, 0x6f, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x3d, 0x20,
-  0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x74, 0x6d, 0x6c, 0x60,
-  0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f,
-  0x78, 0x79, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x2c, 0x20, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x28, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74,
-  0x6f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x3d,
-  0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x74, 0x6d, 0x6c,
-  0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f,
-  0x78, 0x79, 0x7d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d,
-  0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x78, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x73, 0x68, 0x6f, 0x77, 0x20,
-  0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70,
-  0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7c,
-  0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, 0x6f,
-  0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x2c, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x68, 0x69, 0x67, 0x68,
-  0x2d, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f,
-  0x6e, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65,
-  0x6e, 0x64, 0x65, 0x72, 0x73, 0x20, 0x69, 0x74, 0x73, 0x20, 0x66, 0x69,
-  0x72, 0x73, 0x74, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x20, 0x69, 0x66,
-  0x20, 0x69, 0x74, 0x20, 0x65, 0x78, 0x69, 0x73, 0x74, 0x73, 0x2e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20,
-  0x61, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x64, 0x69, 0x74, 0x69,
-  0x6f, 0x6e, 0x61, 0x6c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69,
-  0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x50, 0x6f, 0x72, 0x74,
-  0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x20, 0x65, 0x78, 0x74, 0x65,
-  0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65,
-  0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78,
-  0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x78, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65,
-  0x72, 0x28, 0x7b, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e,
-  0x20, 0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x68, 0x69,
-  0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7c, 0x7c, 0x20, 0x6e, 0x75, 0x6c,
-  0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74,
-  0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22,
-  0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69,
-  0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70,
-  0x65, 0x7d, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
-  0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x65,
-  0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24,
-  0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61,
-  0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, 0x61,
-  0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d,
-  0x3d, 0x20, 0x27, 0x63, 0x68, 0x61, 0x74, 0x27, 0x20, 0x3f, 0x20, 0x4d,
-  0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x20,
-  0x3a, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x73, 0x7d, 0x20, 0x2f, 0x3e,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
-  0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f,
-  0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b,
-  0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
-  0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c,
-  0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72,
-  0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65,
-  0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67,
-  0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67,
-  0x65, 0x72, 0x67, 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x2e, 0x63, 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64,
-  0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74,
-  0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61,
-  0x69, 0x22, 0x3e, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f,
-  0x61, 0x3e, 0x2e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74,
-  0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28,
-  0x41, 0x70, 0x70, 0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65,
-  0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, 0x72, 0x79, 0x53, 0x65, 0x6c, 0x65,
-  0x63, 0x74, 0x6f, 0x72, 0x28, 0x27, 0x23, 0x63, 0x6f, 0x6e, 0x74, 0x61,
-  0x69, 0x6e, 0x65, 0x72, 0x27, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c,
-  0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68,
-  0x65, 0x61, 0x64, 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e,
-  0x0a, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22,
-  0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74,
-  0x79, 0x70, 0x65, 0x3d, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x22, 0x20, 0x69,
-  0x64, 0x3d, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74,
-  0x22, 0x20, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x3d, 0x22, 0x69, 0x6d,
-  0x61, 0x67, 0x65, 0x2f, 0x2a, 0x22, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65,
-  0x3d, 0x22, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x6e,
-  0x6f, 0x6e, 0x65, 0x3b, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x64,
-  0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69,
-  0x64, 0x3d, 0x22, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, 0x3c,
-  0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79,
-  0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x0a
-};
-unsigned int index_html_len = 33456;
diff --git a/examples/server/index.js.hpp b/examples/server/index.js.hpp
deleted file mode 100644
index e09b3c8c5..000000000
--- a/examples/server/index.js.hpp
+++ /dev/null
@@ -1,1903 +0,0 @@
-unsigned char index_js[] = {
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x28, 0x29,
-  0x7b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x43, 0x79, 0x63, 0x6c, 0x65, 0x20,
-  0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x65, 0x64, 0x22, 0x29, 0x7d, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x53, 0x79, 0x6d, 0x62, 0x6f,
-  0x6c, 0x2e, 0x66, 0x6f, 0x72, 0x28, 0x22, 0x70, 0x72, 0x65, 0x61, 0x63,
-  0x74, 0x2d, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x73, 0x22, 0x29, 0x3b,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x65, 0x28, 0x29,
-  0x7b, 0x69, 0x66, 0x28, 0x66, 0x3e, 0x31, 0x29, 0x7b, 0x66, 0x2d, 0x2d,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7d, 0x6c, 0x65, 0x74, 0x20,
-  0x74, 0x2c, 0x6e, 0x3d, 0x21, 0x31, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65,
-  0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6f, 0x29,
-  0x7b, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x3d, 0x6f, 0x3b, 0x6f, 0x3d, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x73, 0x2b, 0x2b, 0x3b, 0x77, 0x68,
-  0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d,
-  0x3d, 0x5f, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x3d,
-  0x5f, 0x2e, 0x6f, 0x3b, 0x5f, 0x2e, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64,
-  0x20, 0x30, 0x3b, 0x5f, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x3b, 0x69,
-  0x66, 0x28, 0x21, 0x28, 0x38, 0x26, 0x5f, 0x2e, 0x66, 0x29, 0x26, 0x26,
-  0x70, 0x28, 0x5f, 0x29, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x5f, 0x2e, 0x63,
-  0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29, 0x7b,
-  0x69, 0x66, 0x28, 0x21, 0x6e, 0x29, 0x7b, 0x74, 0x3d, 0x65, 0x3b, 0x6e,
-  0x3d, 0x21, 0x30, 0x7d, 0x7d, 0x5f, 0x3d, 0x69, 0x7d, 0x7d, 0x73, 0x3d,
-  0x30, 0x3b, 0x66, 0x2d, 0x2d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x74,
-  0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x5f, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28,
-  0x66, 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
-  0x28, 0x29, 0x3b, 0x66, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69,
-  0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x65, 0x28, 0x29, 0x7d, 0x7d, 0x6c,
-  0x65, 0x74, 0x20, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x3d, 0x30, 0x3b, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x28, 0x74, 0x29,
-  0x7b, 0x69, 0x66, 0x28, 0x72, 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x6e, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20,
-  0x30, 0x3b, 0x72, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69, 0x6e,
-  0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x72, 0x2d, 0x2d, 0x3b, 0x69, 0x3d, 0x6e,
-  0x7d, 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x66, 0x3d, 0x30, 0x2c, 0x73, 0x3d,
-  0x30, 0x2c, 0x6c, 0x3d, 0x30, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x20, 0x63, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74,
-  0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x6e, 0x2e, 0x74, 0x21, 0x3d, 0x3d,
-  0x69, 0x29, 0x7b, 0x6e, 0x3d, 0x7b, 0x69, 0x3a, 0x30, 0x2c, 0x53, 0x3a,
-  0x74, 0x2c, 0x70, 0x3a, 0x69, 0x2e, 0x73, 0x2c, 0x6e, 0x3a, 0x76, 0x6f,
-  0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x3a, 0x69, 0x2c, 0x65, 0x3a, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x78, 0x3a, 0x76, 0x6f, 0x69, 0x64,
-  0x20, 0x30, 0x2c, 0x72, 0x3a, 0x6e, 0x7d, 0x3b, 0x69, 0x66, 0x28, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x2e, 0x73, 0x29,
-  0x69, 0x2e, 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d,
-  0x6e, 0x3b, 0x74, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x33,
-  0x32, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x74, 0x2e, 0x53, 0x28, 0x6e, 0x29,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x65, 0x6c,
-  0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d, 0x3d, 0x6e,
-  0x2e, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x66,
-  0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e,
-  0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x6e, 0x2e, 0x70,
-  0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d,
-  0x3d, 0x6e, 0x2e, 0x70, 0x29, 0x6e, 0x2e, 0x70, 0x2e, 0x6e, 0x3d, 0x6e,
-  0x2e, 0x6e, 0x3b, 0x6e, 0x2e, 0x70, 0x3d, 0x69, 0x2e, 0x73, 0x3b, 0x6e,
-  0x2e, 0x6e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x2e,
-  0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, 0x6e, 0x7d,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x7d, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x68, 0x28, 0x74, 0x29, 0x7b,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6e,
-  0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x68, 0x2e,
-  0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x62, 0x72,
-  0x61, 0x6e, 0x64, 0x3d, 0x6e, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74,
-  0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f,
-  0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x74, 0x21, 0x3d, 0x3d, 0x74, 0x26, 0x26, 0x76, 0x6f,
-  0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x65, 0x29, 0x7b,
-  0x74, 0x2e, 0x78, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3b, 0x69,
-  0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74,
-  0x2e, 0x65, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3d,
-  0x74, 0x7d, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74,
-  0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74,
-  0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e,
-  0x65, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x78, 0x3b, 0x69, 0x66, 0x28, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x29, 0x7b, 0x6e,
-  0x2e, 0x78, 0x3d, 0x65, 0x3b, 0x74, 0x2e, 0x65, 0x3d, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20,
-  0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x65, 0x2e, 0x65, 0x3d, 0x6e,
-  0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d,
-  0x69, 0x66, 0x28, 0x74, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3d, 0x65, 0x7d, 0x7d,
-  0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65,
-  0x2e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x3d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x3b,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x77, 0x28, 0x28, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2c, 0x5f, 0x3d, 0x33, 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66,
-  0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x33,
-  0x3b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x28, 0x65, 0x29, 0x7d, 0x66, 0x69,
-  0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66,
-  0x7c, 0x3d, 0x5f, 0x7d, 0x7d, 0x29, 0x29, 0x7d, 0x3b, 0x68, 0x2e, 0x70,
-  0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x4f, 0x66, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3b, 0x68,
-  0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74,
-  0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x2b, 0x22, 0x22, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f,
-  0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, 0x6f, 0x4a, 0x53, 0x4f, 0x4e, 0x3d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76,
-  0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74,
-  0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x3d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d,
-  0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69,
-  0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x68,
-  0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x22,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x7b, 0x67, 0x65, 0x74, 0x28,
-  0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x63, 0x28,
-  0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, 0x2e, 0x69, 0x3d,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x2c, 0x73, 0x65,
-  0x74, 0x28, 0x6e, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x69, 0x20, 0x69, 0x6e,
-  0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, 0x20, 0x79, 0x29, 0x21,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x74,
-  0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72,
-  0x6f, 0x72, 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64,
-  0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, 0x68, 0x61, 0x76, 0x65,
-  0x20, 0x73, 0x69, 0x64, 0x65, 0x2d, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74,
-  0x73, 0x22, 0x29, 0x7d, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x21,
-  0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x29, 0x7b, 0x69, 0x66,
-  0x28, 0x73, 0x3e, 0x31, 0x30, 0x30, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x6e, 0x3b, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x69, 0x2b, 0x2b, 0x3b, 0x6c, 0x2b, 0x2b, 0x3b, 0x66, 0x2b, 0x2b,
-  0x3b, 0x74, 0x72, 0x79, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74,
-  0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3b, 0x76, 0x6f,
-  0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74,
-  0x2e, 0x78, 0x29, 0x74, 0x2e, 0x74, 0x2e, 0x4e, 0x28, 0x29, 0x7d, 0x66,
-  0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x65, 0x28, 0x29, 0x7d, 0x7d,
-  0x7d, 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x61, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x6e, 0x65, 0x77, 0x20, 0x68, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x28, 0x74, 0x29, 0x7b,
-  0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e,
-  0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e,
-  0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x69, 0x66, 0x28, 0x6e, 0x2e,
-  0x53, 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, 0x7c, 0x7c, 0x21,
-  0x6e, 0x2e, 0x53, 0x2e, 0x68, 0x28, 0x29, 0x7c, 0x7c, 0x6e, 0x2e, 0x53,
-  0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, 0x29, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x21, 0x31, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x64, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74,
-  0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20,
-  0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29,
-  0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x53,
-  0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x21, 0x3d, 0x3d, 0x65, 0x29, 0x6e, 0x2e, 0x72, 0x3d, 0x65, 0x3b, 0x6e,
-  0x2e, 0x53, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x6e, 0x2e, 0x69, 0x3d, 0x2d,
-  0x31, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d,
-  0x3d, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x73, 0x3d, 0x6e,
-  0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x7d, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x76, 0x28, 0x74, 0x29, 0x7b, 0x6c,
-  0x65, 0x74, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x77,
-  0x68, 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21,
-  0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74,
-  0x3d, 0x65, 0x2e, 0x70, 0x3b, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d,
-  0x3d, 0x65, 0x2e, 0x69, 0x29, 0x7b, 0x65, 0x2e, 0x53, 0x2e, 0x55, 0x28,
-  0x65, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, 0x2e, 0x6e, 0x3d, 0x65, 0x2e, 0x6e,
-  0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d,
-  0x3d, 0x65, 0x2e, 0x6e, 0x29, 0x65, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x74,
-  0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x6e, 0x3d, 0x65, 0x3b, 0x65, 0x2e,
-  0x53, 0x2e, 0x6e, 0x3d, 0x65, 0x2e, 0x72, 0x3b, 0x69, 0x66, 0x28, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x72, 0x29,
-  0x65, 0x2e, 0x72, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x65,
-  0x3d, 0x74, 0x7d, 0x74, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x79, 0x28, 0x74, 0x29, 0x7b, 0x68,
-  0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x78, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3d, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67,
-  0x3d, 0x6c, 0x2d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x3d,
-  0x34, 0x7d, 0x28, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79,
-  0x70, 0x65, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x68, 0x29, 0x2e, 0x68, 0x3d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x3b, 0x69, 0x66,
-  0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x33, 0x32,
-  0x3d, 0x3d, 0x28, 0x33, 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66,
-  0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x35, 0x3b, 0x69, 0x66,
-  0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x3d, 0x3d, 0x3d, 0x6c, 0x29,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x67, 0x3d, 0x6c, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66,
-  0x7c, 0x3d, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x69, 0x3e, 0x30, 0x26, 0x26, 0x21, 0x70, 0x28, 0x74, 0x68, 0x69, 0x73,
-  0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d,
-  0x32, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x69, 0x3b, 0x74, 0x72, 0x79,
-  0x7b, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x3d, 0x74,
-  0x68, 0x69, 0x73, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28,
-  0x31, 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x7c, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x76, 0x21, 0x3d, 0x3d, 0x74, 0x7c, 0x7c, 0x30,
-  0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x29, 0x7b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x31, 0x37, 0x3b, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28,
-  0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x31, 0x36, 0x3b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x69, 0x3d, 0x74, 0x3b,
-  0x76, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f,
-  0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f,
-  0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x33,
-  0x36, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20,
-  0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x6e, 0x29,
-  0x74, 0x2e, 0x53, 0x2e, 0x53, 0x28, 0x74, 0x29, 0x7d, 0x68, 0x2e, 0x70,
-  0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x2e, 0x63,
-  0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x29, 0x7d,
-  0x3b, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65,
-  0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28,
-  0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x68,
-  0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x55,
-  0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74,
-  0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d,
-  0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x33, 0x3b, 0x66, 0x6f,
-  0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d,
-  0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x6e, 0x29, 0x74, 0x2e, 0x53, 0x2e,
-  0x55, 0x28, 0x74, 0x29, 0x7d, 0x7d, 0x7d, 0x3b, 0x79, 0x2e, 0x70, 0x72,
-  0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x4e, 0x3d, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28,
-  0x21, 0x28, 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x29,
-  0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x36, 0x3b, 0x66,
-  0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x74, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d,
-  0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x78, 0x29, 0x74, 0x2e, 0x74,
-  0x2e, 0x4e, 0x28, 0x29, 0x7d, 0x7d, 0x3b, 0x79, 0x2e, 0x70, 0x72, 0x6f,
-  0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x3d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69,
-  0x66, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x68, 0x28, 0x29, 0x29,
-  0x74, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, 0x26, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x3b, 0x4f, 0x62, 0x6a,
-  0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x50, 0x72,
-  0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x79, 0x2e, 0x70, 0x72, 0x6f,
-  0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x22, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x22, 0x2c, 0x7b, 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b, 0x69, 0x66,
-  0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x28,
-  0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x63, 0x28,
-  0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x68,
-  0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x21, 0x3d, 0x3d, 0x6e, 0x29, 0x6e, 0x2e, 0x69, 0x3d, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, 0x26, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x7d, 0x29, 0x3b, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6d, 0x28, 0x74, 0x29,
-  0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20,
-  0x79, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x67, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x75, 0x3b, 0x74, 0x2e, 0x75, 0x3d, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70,
-  0x65, 0x6f, 0x66, 0x20, 0x6e, 0x29, 0x7b, 0x66, 0x2b, 0x2b, 0x3b, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x6e, 0x28,
-  0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74,
-  0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x74, 0x2e, 0x66, 0x7c, 0x3d,
-  0x38, 0x3b, 0x62, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x68, 0x72, 0x6f, 0x77,
-  0x20, 0x6e, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x69,
-  0x3d, 0x5f, 0x3b, 0x65, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x28, 0x74, 0x29, 0x7b, 0x66,
-  0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73,
-  0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x3b,
-  0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x6e, 0x2e, 0x53, 0x2e, 0x55, 0x28,
-  0x6e, 0x29, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20,
-  0x30, 0x3b, 0x74, 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x3b, 0x67, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x20, 0x6b, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x69,
-  0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x29, 0x74, 0x68, 0x72, 0x6f,
-  0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28,
-  0x22, 0x4f, 0x75, 0x74, 0x2d, 0x6f, 0x66, 0x2d, 0x6f, 0x72, 0x64, 0x65,
-  0x72, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x22, 0x29, 0x3b, 0x76,
-  0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x3d, 0x74, 0x3b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x69, 0x66,
-  0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x62, 0x28,
-  0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x65, 0x28, 0x29, 0x7d, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x53, 0x28, 0x74, 0x29, 0x7b,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64,
-  0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x3d, 0x33, 0x32,
-  0x7d, 0x53, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65,
-  0x2e, 0x63, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28,
-  0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x53, 0x28, 0x29, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x69,
-  0x66, 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78,
-  0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x28, 0x29,
-  0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e,
-  0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x75, 0x3d, 0x6e, 0x7d, 0x66, 0x69,
-  0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x28, 0x29, 0x7d, 0x7d, 0x3b,
-  0x53, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e,
-  0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29,
-  0x7b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66,
-  0x29, 0x74, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c,
-  0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d,
-  0x39, 0x3b, 0x67, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x64, 0x28,
-  0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x66, 0x2b, 0x2b, 0x3b, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x74, 0x68,
-  0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x2e,
-  0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, 0x29,
-  0x7d, 0x3b, 0x53, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70,
-  0x65, 0x2e, 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x32, 0x26, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x66, 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6f, 0x3d,
-  0x6f, 0x3b, 0x6f, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, 0x7d, 0x3b, 0x53,
-  0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64,
-  0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x38, 0x3b, 0x69, 0x66,
-  0x28, 0x21, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29,
-  0x29, 0x62, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x7d, 0x3b, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x77, 0x28, 0x74, 0x29, 0x7b,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20,
-  0x53, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x6e, 0x2e, 0x63,
-  0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b,
-  0x6e, 0x2e, 0x64, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20,
-  0x74, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x64,
-  0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x29, 0x7d, 0x76, 0x61, 0x72,
-  0x20, 0x78, 0x2c, 0x43, 0x2c, 0x45, 0x2c, 0x55, 0x2c, 0x48, 0x2c, 0x50,
-  0x2c, 0x4e, 0x2c, 0x24, 0x2c, 0x44, 0x2c, 0x54, 0x3d, 0x7b, 0x7d, 0x2c,
-  0x56, 0x3d, 0x5b, 0x5d, 0x2c, 0x41, 0x3d, 0x2f, 0x61, 0x63, 0x69, 0x74,
-  0x7c, 0x65, 0x78, 0x28, 0x3f, 0x3a, 0x73, 0x7c, 0x67, 0x7c, 0x6e, 0x7c,
-  0x70, 0x7c, 0x24, 0x29, 0x7c, 0x72, 0x70, 0x68, 0x7c, 0x67, 0x72, 0x69,
-  0x64, 0x7c, 0x6f, 0x77, 0x73, 0x7c, 0x6d, 0x6e, 0x63, 0x7c, 0x6e, 0x74,
-  0x77, 0x7c, 0x69, 0x6e, 0x65, 0x5b, 0x63, 0x68, 0x5d, 0x7c, 0x7a, 0x6f,
-  0x6f, 0x7c, 0x5e, 0x6f, 0x72, 0x64, 0x7c, 0x69, 0x74, 0x65, 0x72, 0x61,
-  0x2f, 0x69, 0x2c, 0x46, 0x3d, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69,
-  0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x66,
-  0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x65, 0x20, 0x69, 0x6e, 0x20,
-  0x6e, 0x29, 0x74, 0x5b, 0x65, 0x5d, 0x3d, 0x6e, 0x5b, 0x65, 0x5d, 0x3b,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x57, 0x28, 0x74, 0x29, 0x7b, 0x76,
-  0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e,
-  0x74, 0x4e, 0x6f, 0x64, 0x65, 0x3b, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x72,
-  0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x28, 0x74,
-  0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4c,
-  0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20,
-  0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x3d, 0x7b, 0x7d, 0x3b, 0x66,
-  0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x22, 0x6b,
-  0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, 0x5b, 0x6f,
-  0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x69,
-  0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x72, 0x5b, 0x6f, 0x5d, 0x3d, 0x6e,
-  0x5b, 0x6f, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d,
-  0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e,
-  0x32, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72,
-  0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73,
-  0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x33, 0x3f, 0x78, 0x2e,
-  0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, 0x2c, 0x22, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70,
-  0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21,
-  0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72,
-  0x6f, 0x70, 0x73, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e,
-  0x20, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72,
-  0x6f, 0x70, 0x73, 0x29, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d,
-  0x3d, 0x72, 0x5b, 0x6f, 0x5d, 0x26, 0x26, 0x28, 0x72, 0x5b, 0x6f, 0x5d,
-  0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72,
-  0x6f, 0x70, 0x73, 0x5b, 0x6f, 0x5d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x4f, 0x28, 0x74, 0x2c, 0x72, 0x2c, 0x5f, 0x2c, 0x69,
-  0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c,
-  0x5f, 0x2c, 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x7b,
-  0x74, 0x79, 0x70, 0x65, 0x3a, 0x74, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x73,
-  0x3a, 0x6e, 0x2c, 0x6b, 0x65, 0x79, 0x3a, 0x65, 0x2c, 0x72, 0x65, 0x66,
-  0x3a, 0x5f, 0x2c, 0x5f, 0x5f, 0x6b, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c,
-  0x5f, 0x5f, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f, 0x5f, 0x62, 0x3a,
-  0x30, 0x2c, 0x5f, 0x5f, 0x65, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f,
-  0x5f, 0x64, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x5f, 0x5f,
-  0x63, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20,
-  0x30, 0x2c, 0x5f, 0x5f, 0x76, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d,
-  0x69, 0x3f, 0x2b, 0x2b, 0x45, 0x3a, 0x69, 0x2c, 0x5f, 0x5f, 0x69, 0x3a,
-  0x2d, 0x31, 0x2c, 0x5f, 0x5f, 0x75, 0x3a, 0x30, 0x7d, 0x3b, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x69,
-  0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x43, 0x2e, 0x76, 0x6e,
-  0x6f, 0x64, 0x65, 0x26, 0x26, 0x43, 0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65,
-  0x28, 0x6f, 0x29, 0x2c, 0x6f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x20, 0x52, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x7b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3a, 0x6e, 0x75,
-  0x6c, 0x6c, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x6a, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x49, 0x28, 0x74,
-  0x2c, 0x6e, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f,
-  0x70, 0x73, 0x3d, 0x74, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f,
-  0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x20, 0x71, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b,
-  0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6e, 0x29, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x3f, 0x71,
-  0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x69, 0x2b,
-  0x31, 0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x66, 0x6f, 0x72, 0x28,
-  0x76, 0x61, 0x72, 0x20, 0x65, 0x3b, 0x6e, 0x3c, 0x74, 0x2e, 0x5f, 0x5f,
-  0x6b, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6e, 0x2b, 0x2b,
-  0x29, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x65,
-  0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x6e, 0x5d, 0x29, 0x26, 0x26,
-  0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x29,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x5f, 0x5f, 0x65,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f,
-  0x66, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3f, 0x71, 0x28, 0x74,
-  0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x42, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72,
-  0x20, 0x6e, 0x2c, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c,
-  0x21, 0x3d, 0x28, 0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x26, 0x26,
-  0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29,
-  0x7b, 0x66, 0x6f, 0x72, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x74,
-  0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x6e, 0x75,
-  0x6c, 0x6c, 0x2c, 0x6e, 0x3d, 0x30, 0x3b, 0x6e, 0x3c, 0x74, 0x2e, 0x5f,
-  0x5f, 0x6b, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6e, 0x2b,
-  0x2b, 0x29, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28,
-  0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x6e, 0x5d, 0x29, 0x26,
-  0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65,
-  0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f,
-  0x63, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65,
-  0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x42, 0x28, 0x74, 0x29, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x20, 0x47, 0x28, 0x74, 0x29, 0x7b, 0x28, 0x21,
-  0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f,
-  0x64, 0x3d, 0x21, 0x30, 0x29, 0x26, 0x26, 0x48, 0x2e, 0x70, 0x75, 0x73,
-  0x68, 0x28, 0x74, 0x29, 0x26, 0x26, 0x21, 0x7a, 0x2e, 0x5f, 0x5f, 0x72,
-  0x2b, 0x2b, 0x7c, 0x7c, 0x50, 0x21, 0x3d, 0x3d, 0x43, 0x2e, 0x64, 0x65,
-  0x62, 0x6f, 0x75, 0x6e, 0x63, 0x65, 0x52, 0x65, 0x6e, 0x64, 0x65, 0x72,
-  0x69, 0x6e, 0x67, 0x29, 0x26, 0x26, 0x28, 0x28, 0x50, 0x3d, 0x43, 0x2e,
-  0x64, 0x65, 0x62, 0x6f, 0x75, 0x6e, 0x63, 0x65, 0x52, 0x65, 0x6e, 0x64,
-  0x65, 0x72, 0x69, 0x6e, 0x67, 0x29, 0x7c, 0x7c, 0x4e, 0x29, 0x28, 0x7a,
-  0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x7a,
-  0x28, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x2c, 0x6e, 0x2c, 0x65,
-  0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66,
-  0x3b, 0x66, 0x6f, 0x72, 0x28, 0x48, 0x2e, 0x73, 0x6f, 0x72, 0x74, 0x28,
-  0x24, 0x29, 0x3b, 0x74, 0x3d, 0x48, 0x2e, 0x73, 0x68, 0x69, 0x66, 0x74,
-  0x28, 0x29, 0x3b, 0x29, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x26, 0x26, 0x28,
-  0x6e, 0x3d, 0x48, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x2c, 0x5f,
-  0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6f, 0x3d, 0x28, 0x69,
-  0x3d, 0x28, 0x65, 0x3d, 0x74, 0x29, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x2e,
-  0x5f, 0x5f, 0x65, 0x2c, 0x75, 0x3d, 0x5b, 0x5d, 0x2c, 0x66, 0x3d, 0x5b,
-  0x5d, 0x2c, 0x28, 0x72, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x50, 0x29, 0x26,
-  0x26, 0x28, 0x28, 0x5f, 0x3d, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x69, 0x29,
-  0x29, 0x2e, 0x5f, 0x5f, 0x76, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x76, 0x2b,
-  0x31, 0x2c, 0x43, 0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65, 0x26, 0x26, 0x43,
-  0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65, 0x28, 0x5f, 0x29, 0x2c, 0x5f, 0x74,
-  0x28, 0x72, 0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x6e,
-  0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x72, 0x2e,
-  0x6f, 0x77, 0x6e, 0x65, 0x72, 0x53, 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d,
-  0x65, 0x6e, 0x74, 0x2c, 0x33, 0x32, 0x26, 0x69, 0x2e, 0x5f, 0x5f, 0x75,
-  0x3f, 0x5b, 0x6f, 0x5d, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x75, 0x2c,
-  0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6f, 0x3f, 0x71, 0x28, 0x69, 0x29,
-  0x3a, 0x6f, 0x2c, 0x21, 0x21, 0x28, 0x33, 0x32, 0x26, 0x69, 0x2e, 0x5f,
-  0x5f, 0x75, 0x29, 0x2c, 0x66, 0x29, 0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x2e,
-  0x5f, 0x5f, 0x6b, 0x5b, 0x5f, 0x2e, 0x5f, 0x5f, 0x69, 0x5d, 0x3d, 0x5f,
-  0x2c, 0x69, 0x74, 0x28, 0x75, 0x2c, 0x5f, 0x2c, 0x66, 0x29, 0x2c, 0x5f,
-  0x2e, 0x5f, 0x5f, 0x65, 0x21, 0x3d, 0x6f, 0x26, 0x26, 0x42, 0x28, 0x5f,
-  0x29, 0x29, 0x2c, 0x48, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e,
-  0x6e, 0x26, 0x26, 0x48, 0x2e, 0x73, 0x6f, 0x72, 0x74, 0x28, 0x24, 0x29,
-  0x29, 0x3b, 0x7a, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, 0x30, 0x7d, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4a, 0x28, 0x74, 0x2c, 0x6e,
-  0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75,
-  0x2c, 0x66, 0x2c, 0x73, 0x2c, 0x6c, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20,
-  0x63, 0x2c, 0x68, 0x2c, 0x61, 0x2c, 0x70, 0x2c, 0x64, 0x2c, 0x76, 0x3d,
-  0x5f, 0x26, 0x26, 0x5f, 0x2e, 0x5f, 0x5f, 0x6b, 0x7c, 0x7c, 0x56, 0x2c,
-  0x79, 0x3d, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x66,
-  0x6f, 0x72, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x66, 0x2c, 0x4b,
-  0x28, 0x65, 0x2c, 0x6e, 0x2c, 0x76, 0x29, 0x2c, 0x66, 0x3d, 0x65, 0x2e,
-  0x5f, 0x5f, 0x64, 0x2c, 0x63, 0x3d, 0x30, 0x3b, 0x63, 0x3c, 0x79, 0x3b,
-  0x63, 0x2b, 0x2b, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x61,
-  0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x63, 0x5d, 0x29, 0x26, 0x26,
-  0x22, 0x62, 0x6f, 0x6f, 0x6c, 0x65, 0x61, 0x6e, 0x22, 0x21, 0x3d, 0x74,
-  0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x61, 0x26, 0x26, 0x22, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70,
-  0x65, 0x6f, 0x66, 0x20, 0x61, 0x26, 0x26, 0x28, 0x68, 0x3d, 0x2d, 0x31,
-  0x3d, 0x3d, 0x3d, 0x61, 0x2e, 0x5f, 0x5f, 0x69, 0x3f, 0x54, 0x3a, 0x76,
-  0x5b, 0x61, 0x2e, 0x5f, 0x5f, 0x69, 0x5d, 0x7c, 0x7c, 0x54, 0x2c, 0x61,
-  0x2e, 0x5f, 0x5f, 0x69, 0x3d, 0x63, 0x2c, 0x5f, 0x74, 0x28, 0x74, 0x2c,
-  0x61, 0x2c, 0x68, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c,
-  0x66, 0x2c, 0x73, 0x2c, 0x6c, 0x29, 0x2c, 0x70, 0x3d, 0x61, 0x2e, 0x5f,
-  0x5f, 0x65, 0x2c, 0x61, 0x2e, 0x72, 0x65, 0x66, 0x26, 0x26, 0x68, 0x2e,
-  0x72, 0x65, 0x66, 0x21, 0x3d, 0x61, 0x2e, 0x72, 0x65, 0x66, 0x26, 0x26,
-  0x28, 0x68, 0x2e, 0x72, 0x65, 0x66, 0x26, 0x26, 0x72, 0x74, 0x28, 0x68,
-  0x2e, 0x72, 0x65, 0x66, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x61, 0x29,
-  0x2c, 0x6c, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x61, 0x2e, 0x72, 0x65,
-  0x66, 0x2c, 0x61, 0x2e, 0x5f, 0x5f, 0x63, 0x7c, 0x7c, 0x70, 0x2c, 0x61,
-  0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x64, 0x26, 0x26,
-  0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x70, 0x26, 0x26, 0x28, 0x64, 0x3d,
-  0x70, 0x29, 0x2c, 0x36, 0x35, 0x35, 0x33, 0x36, 0x26, 0x61, 0x2e, 0x5f,
-  0x5f, 0x75, 0x7c, 0x7c, 0x68, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x3d, 0x3d,
-  0x61, 0x2e, 0x5f, 0x5f, 0x6b, 0x3f, 0x66, 0x3d, 0x51, 0x28, 0x61, 0x2c,
-  0x66, 0x2c, 0x74, 0x29, 0x3a, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20,
-  0x61, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64,
-  0x20, 0x30, 0x21, 0x3d, 0x3d, 0x61, 0x2e, 0x5f, 0x5f, 0x64, 0x3f, 0x66,
-  0x3d, 0x61, 0x2e, 0x5f, 0x5f, 0x64, 0x3a, 0x70, 0x26, 0x26, 0x28, 0x66,
-  0x3d, 0x70, 0x2e, 0x6e, 0x65, 0x78, 0x74, 0x53, 0x69, 0x62, 0x6c, 0x69,
-  0x6e, 0x67, 0x29, 0x2c, 0x61, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, 0x6f,
-  0x69, 0x64, 0x20, 0x30, 0x2c, 0x61, 0x2e, 0x5f, 0x5f, 0x75, 0x26, 0x3d,
-  0x2d, 0x31, 0x39, 0x36, 0x36, 0x30, 0x39, 0x29, 0x3b, 0x65, 0x2e, 0x5f,
-  0x5f, 0x64, 0x3d, 0x66, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x64,
-  0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4b, 0x28,
-  0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f,
-  0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x3d, 0x6e,
-  0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x2c, 0x73, 0x3d, 0x65, 0x2e,
-  0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x2c, 0x6c, 0x3d, 0x73, 0x2c, 0x63,
-  0x3d, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x6b,
-  0x3d, 0x5b, 0x5d, 0x2c, 0x5f, 0x3d, 0x30, 0x3b, 0x5f, 0x3c, 0x66, 0x3b,
-  0x5f, 0x2b, 0x2b, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x69,
-  0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x5f, 0x5d, 0x3d, 0x6e, 0x75,
-  0x6c, 0x6c, 0x3d, 0x3d, 0x28, 0x69, 0x3d, 0x6e, 0x5b, 0x5f, 0x5d, 0x29,
-  0x7c, 0x7c, 0x22, 0x62, 0x6f, 0x6f, 0x6c, 0x65, 0x61, 0x6e, 0x22, 0x3d,
-  0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x7c, 0x7c, 0x22,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74,
-  0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x3f, 0x6e, 0x75, 0x6c, 0x6c,
-  0x3a, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74,
-  0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x7c, 0x7c, 0x22, 0x6e, 0x75,
-  0x6d, 0x62, 0x65, 0x72, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f,
-  0x66, 0x20, 0x69, 0x7c, 0x7c, 0x22, 0x62, 0x69, 0x67, 0x69, 0x6e, 0x74,
-  0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x7c,
-  0x7c, 0x69, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74,
-  0x6f, 0x72, 0x3d, 0x3d, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3f, 0x4f,
-  0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x69, 0x2c, 0x6e, 0x75, 0x6c, 0x6c,
-  0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x69, 0x29, 0x3a, 0x46, 0x28, 0x69,
-  0x29, 0x3f, 0x4f, 0x28, 0x6a, 0x2c, 0x7b, 0x63, 0x68, 0x69, 0x6c, 0x64,
-  0x72, 0x65, 0x6e, 0x3a, 0x69, 0x7d, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c,
-  0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3a, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x69, 0x2e, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x26, 0x26, 0x69,
-  0x2e, 0x5f, 0x5f, 0x62, 0x3e, 0x30, 0x3f, 0x4f, 0x28, 0x69, 0x2e, 0x74,
-  0x79, 0x70, 0x65, 0x2c, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c,
-  0x69, 0x2e, 0x6b, 0x65, 0x79, 0x2c, 0x69, 0x2e, 0x72, 0x65, 0x66, 0x3f,
-  0x69, 0x2e, 0x72, 0x65, 0x66, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x69,
-  0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x3a, 0x69, 0x29, 0x3f, 0x28, 0x69, 0x2e,
-  0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x62, 0x3d, 0x74,
-  0x2e, 0x5f, 0x5f, 0x62, 0x2b, 0x31, 0x2c, 0x75, 0x3d, 0x59, 0x28, 0x69,
-  0x2c, 0x65, 0x2c, 0x72, 0x3d, 0x5f, 0x2b, 0x63, 0x2c, 0x6c, 0x29, 0x2c,
-  0x69, 0x2e, 0x5f, 0x5f, 0x69, 0x3d, 0x75, 0x2c, 0x6f, 0x3d, 0x6e, 0x75,
-  0x6c, 0x6c, 0x2c, 0x2d, 0x31, 0x21, 0x3d, 0x3d, 0x75, 0x26, 0x26, 0x28,
-  0x6c, 0x2d, 0x2d, 0x2c, 0x28, 0x6f, 0x3d, 0x65, 0x5b, 0x75, 0x5d, 0x29,
-  0x26, 0x26, 0x28, 0x6f, 0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, 0x31, 0x33,
-  0x31, 0x30, 0x37, 0x32, 0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d,
-  0x3d, 0x6f, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x3d, 0x6f,
-  0x2e, 0x5f, 0x5f, 0x76, 0x3f, 0x28, 0x2d, 0x31, 0x3d, 0x3d, 0x75, 0x26,
-  0x26, 0x63, 0x2d, 0x2d, 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20,
-  0x69, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x28, 0x69, 0x2e, 0x5f,
-  0x5f, 0x75, 0x7c, 0x3d, 0x36, 0x35, 0x35, 0x33, 0x36, 0x29, 0x29, 0x3a,
-  0x75, 0x21, 0x3d, 0x3d, 0x72, 0x26, 0x26, 0x28, 0x75, 0x3d, 0x3d, 0x3d,
-  0x72, 0x2b, 0x31, 0x3f, 0x63, 0x2b, 0x2b, 0x3a, 0x75, 0x3e, 0x72, 0x3f,
-  0x6c, 0x3e, 0x66, 0x2d, 0x72, 0x3f, 0x63, 0x2b, 0x3d, 0x75, 0x2d, 0x72,
-  0x3a, 0x63, 0x2d, 0x2d, 0x3a, 0x63, 0x3d, 0x75, 0x3c, 0x72, 0x26, 0x26,
-  0x75, 0x3d, 0x3d, 0x72, 0x2d, 0x31, 0x3f, 0x75, 0x2d, 0x72, 0x3a, 0x30,
-  0x2c, 0x75, 0x21, 0x3d, 0x3d, 0x5f, 0x2b, 0x63, 0x26, 0x26, 0x28, 0x69,
-  0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, 0x36, 0x35, 0x35, 0x33, 0x36, 0x29,
-  0x29, 0x29, 0x3a, 0x28, 0x6f, 0x3d, 0x65, 0x5b, 0x5f, 0x5d, 0x29, 0x26,
-  0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6f, 0x2e, 0x6b, 0x65, 0x79,
-  0x26, 0x26, 0x6f, 0x2e, 0x5f, 0x5f, 0x65, 0x26, 0x26, 0x28, 0x6f, 0x2e,
-  0x5f, 0x5f, 0x65, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x26, 0x26,
-  0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x71, 0x28, 0x6f, 0x29, 0x29,
-  0x2c, 0x75, 0x74, 0x28, 0x6f, 0x2c, 0x6f, 0x2c, 0x21, 0x31, 0x29, 0x2c,
-  0x65, 0x5b, 0x5f, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6c, 0x2d,
-  0x2d, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x6c, 0x29, 0x66, 0x6f, 0x72, 0x28,
-  0x5f, 0x3d, 0x30, 0x3b, 0x5f, 0x3c, 0x73, 0x3b, 0x5f, 0x2b, 0x2b, 0x29,
-  0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x6f, 0x3d, 0x65, 0x5b, 0x5f,
-  0x5d, 0x29, 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, 0x30,
-  0x37, 0x32, 0x26, 0x6f, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x26, 0x26, 0x28,
-  0x6f, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x64,
-  0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x71, 0x28, 0x6f,
-  0x29, 0x29, 0x2c, 0x75, 0x74, 0x28, 0x6f, 0x2c, 0x6f, 0x29, 0x29, 0x7d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x51, 0x28, 0x74,
-  0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x2c,
-  0x69, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20,
-  0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28,
-  0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, 0x69, 0x3d, 0x30, 0x3b,
-  0x5f, 0x26, 0x26, 0x69, 0x3c, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74,
-  0x68, 0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x5f, 0x5b, 0x69, 0x5d, 0x26, 0x26,
-  0x28, 0x5f, 0x5b, 0x69, 0x5d, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x6e,
-  0x3d, 0x51, 0x28, 0x5f, 0x5b, 0x69, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x29,
-  0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x21,
-  0x3d, 0x6e, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x69, 0x6e, 0x73, 0x65, 0x72,
-  0x74, 0x42, 0x65, 0x66, 0x6f, 0x72, 0x65, 0x28, 0x74, 0x2e, 0x5f, 0x5f,
-  0x65, 0x2c, 0x6e, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x2c, 0x6e,
-  0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x2c, 0x6e, 0x26, 0x26, 0x6e,
-  0x2e, 0x6e, 0x65, 0x78, 0x74, 0x53, 0x69, 0x62, 0x6c, 0x69, 0x6e, 0x67,
-  0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x28,
-  0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x6e, 0x3d, 0x6e, 0x7c, 0x7c, 0x5b, 0x5d, 0x2c, 0x6e, 0x75, 0x6c, 0x6c,
-  0x3d, 0x3d, 0x74, 0x7c, 0x7c, 0x22, 0x62, 0x6f, 0x6f, 0x6c, 0x65, 0x61,
-  0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74,
-  0x7c, 0x7c, 0x28, 0x46, 0x28, 0x74, 0x29, 0x3f, 0x74, 0x2e, 0x73, 0x6f,
-  0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x28, 0x74, 0x29, 0x7b, 0x58, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x29,
-  0x29, 0x3a, 0x6e, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x29,
-  0x2c, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x59, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7b, 0x76,
-  0x61, 0x72, 0x20, 0x69, 0x3d, 0x74, 0x2e, 0x6b, 0x65, 0x79, 0x2c, 0x6f,
-  0x3d, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x72, 0x3d, 0x65, 0x2d,
-  0x31, 0x2c, 0x75, 0x3d, 0x65, 0x2b, 0x31, 0x2c, 0x66, 0x3d, 0x6e, 0x5b,
-  0x65, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d,
-  0x3d, 0x66, 0x7c, 0x7c, 0x66, 0x26, 0x26, 0x69, 0x3d, 0x3d, 0x66, 0x2e,
-  0x6b, 0x65, 0x79, 0x26, 0x26, 0x6f, 0x3d, 0x3d, 0x3d, 0x66, 0x2e, 0x74,
-  0x79, 0x70, 0x65, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65,
-  0x3b, 0x69, 0x66, 0x28, 0x5f, 0x3e, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21,
-  0x3d, 0x66, 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, 0x30,
-  0x37, 0x32, 0x26, 0x66, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x3f, 0x31, 0x3a,
-  0x30, 0x29, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x3b, 0x72, 0x3e, 0x3d, 0x30,
-  0x7c, 0x7c, 0x75, 0x3c, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68,
-  0x3b, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x72, 0x3e, 0x3d, 0x30, 0x29, 0x7b,
-  0x69, 0x66, 0x28, 0x28, 0x66, 0x3d, 0x6e, 0x5b, 0x72, 0x5d, 0x29, 0x26,
-  0x26, 0x30, 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, 0x26,
-  0x66, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x26, 0x26, 0x69, 0x3d, 0x3d, 0x66,
-  0x2e, 0x6b, 0x65, 0x79, 0x26, 0x26, 0x6f, 0x3d, 0x3d, 0x3d, 0x66, 0x2e,
-  0x74, 0x79, 0x70, 0x65, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x72, 0x3b, 0x72, 0x2d, 0x2d, 0x7d, 0x69, 0x66, 0x28, 0x75, 0x3c, 0x6e,
-  0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x7b, 0x69, 0x66, 0x28,
-  0x28, 0x66, 0x3d, 0x6e, 0x5b, 0x75, 0x5d, 0x29, 0x26, 0x26, 0x30, 0x3d,
-  0x3d, 0x28, 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, 0x26, 0x66, 0x2e, 0x5f,
-  0x5f, 0x75, 0x29, 0x26, 0x26, 0x69, 0x3d, 0x3d, 0x66, 0x2e, 0x6b, 0x65,
-  0x79, 0x26, 0x26, 0x6f, 0x3d, 0x3d, 0x3d, 0x66, 0x2e, 0x74, 0x79, 0x70,
-  0x65, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x3b, 0x75,
-  0x2b, 0x2b, 0x7d, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x2d, 0x31,
-  0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x5a, 0x28,
-  0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x22, 0x2d, 0x22, 0x3d, 0x3d,
-  0x3d, 0x6e, 0x5b, 0x30, 0x5d, 0x3f, 0x74, 0x2e, 0x73, 0x65, 0x74, 0x50,
-  0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x6e, 0x2c, 0x6e, 0x75,
-  0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x3f, 0x22, 0x22, 0x3a, 0x65, 0x29, 0x3a,
-  0x74, 0x5b, 0x6e, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65,
-  0x3f, 0x22, 0x22, 0x3a, 0x22, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x22,
-  0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x7c, 0x7c,
-  0x41, 0x2e, 0x74, 0x65, 0x73, 0x74, 0x28, 0x6e, 0x29, 0x3f, 0x65, 0x3a,
-  0x65, 0x2b, 0x22, 0x70, 0x78, 0x22, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x74, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65,
-  0x2c, 0x5f, 0x2c, 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3b,
-  0x74, 0x3a, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x22,
-  0x3d, 0x3d, 0x3d, 0x6e, 0x29, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72,
-  0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66,
-  0x20, 0x65, 0x29, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2e, 0x63,
-  0x73, 0x73, 0x54, 0x65, 0x78, 0x74, 0x3d, 0x65, 0x3b, 0x65, 0x6c, 0x73,
-  0x65, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67,
-  0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x5f, 0x26,
-  0x26, 0x28, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2e, 0x63, 0x73,
-  0x73, 0x54, 0x65, 0x78, 0x74, 0x3d, 0x5f, 0x3d, 0x22, 0x22, 0x29, 0x2c,
-  0x5f, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x5f,
-  0x29, 0x65, 0x26, 0x26, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x65, 0x7c, 0x7c,
-  0x5a, 0x28, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2c, 0x6e, 0x2c,
-  0x22, 0x22, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x65, 0x29, 0x66, 0x6f, 0x72,
-  0x28, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x65, 0x29, 0x5f, 0x26, 0x26, 0x65,
-  0x5b, 0x6e, 0x5d, 0x3d, 0x3d, 0x3d, 0x5f, 0x5b, 0x6e, 0x5d, 0x7c, 0x7c,
-  0x5a, 0x28, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2c, 0x6e, 0x2c,
-  0x65, 0x5b, 0x6e, 0x5d, 0x29, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69,
-  0x66, 0x28, 0x22, 0x6f, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x5b, 0x30, 0x5d,
-  0x26, 0x26, 0x22, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x5b, 0x31, 0x5d,
-  0x29, 0x6f, 0x3d, 0x6e, 0x21, 0x3d, 0x3d, 0x28, 0x6e, 0x3d, 0x6e, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x28, 0x50, 0x6f,
-  0x69, 0x6e, 0x74, 0x65, 0x72, 0x43, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65,
-  0x29, 0x24, 0x7c, 0x43, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x24, 0x2f,
-  0x2c, 0x22, 0x24, 0x31, 0x22, 0x29, 0x29, 0x2c, 0x6e, 0x3d, 0x6e, 0x2e,
-  0x74, 0x6f, 0x4c, 0x6f, 0x77, 0x65, 0x72, 0x43, 0x61, 0x73, 0x65, 0x28,
-  0x29, 0x69, 0x6e, 0x20, 0x74, 0x3f, 0x6e, 0x2e, 0x74, 0x6f, 0x4c, 0x6f,
-  0x77, 0x65, 0x72, 0x43, 0x61, 0x73, 0x65, 0x28, 0x29, 0x2e, 0x73, 0x6c,
-  0x69, 0x63, 0x65, 0x28, 0x32, 0x29, 0x3a, 0x6e, 0x2e, 0x73, 0x6c, 0x69,
-  0x63, 0x65, 0x28, 0x32, 0x29, 0x2c, 0x74, 0x2e, 0x6c, 0x7c, 0x7c, 0x28,
-  0x74, 0x2e, 0x6c, 0x3d, 0x7b, 0x7d, 0x29, 0x2c, 0x74, 0x2e, 0x6c, 0x5b,
-  0x6e, 0x2b, 0x6f, 0x5d, 0x3d, 0x65, 0x2c, 0x65, 0x3f, 0x5f, 0x3f, 0x65,
-  0x2e, 0x75, 0x3d, 0x5f, 0x2e, 0x75, 0x3a, 0x28, 0x65, 0x2e, 0x75, 0x3d,
-  0x44, 0x61, 0x74, 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, 0x29, 0x2c, 0x74,
-  0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73,
-  0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x6e, 0x2c, 0x6f, 0x3f, 0x65, 0x74,
-  0x3a, 0x6e, 0x74, 0x2c, 0x6f, 0x29, 0x29, 0x3a, 0x74, 0x2e, 0x72, 0x65,
-  0x6d, 0x6f, 0x76, 0x65, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73,
-  0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x6e, 0x2c, 0x6f, 0x3f, 0x65, 0x74,
-  0x3a, 0x6e, 0x74, 0x2c, 0x6f, 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b,
-  0x69, 0x66, 0x28, 0x69, 0x29, 0x6e, 0x3d, 0x6e, 0x2e, 0x72, 0x65, 0x70,
-  0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x78, 0x6c, 0x69, 0x6e, 0x6b, 0x28,
-  0x48, 0x7c, 0x3a, 0x68, 0x29, 0x2f, 0x2c, 0x22, 0x68, 0x22, 0x29, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x73, 0x4e, 0x61,
-  0x6d, 0x65, 0x24, 0x2f, 0x2c, 0x22, 0x73, 0x22, 0x29, 0x3b, 0x65, 0x6c,
-  0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x22, 0x77, 0x69, 0x64, 0x74, 0x68,
-  0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x68, 0x65, 0x69, 0x67,
-  0x68, 0x74, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x68, 0x72,
-  0x65, 0x66, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x6c, 0x69,
-  0x73, 0x74, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x66, 0x6f,
-  0x72, 0x6d, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x74, 0x61,
-  0x62, 0x49, 0x6e, 0x64, 0x65, 0x78, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26,
-  0x26, 0x22, 0x64, 0x6f, 0x77, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x22, 0x21,
-  0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x72, 0x6f, 0x77, 0x53, 0x70, 0x61,
-  0x6e, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x63, 0x6f, 0x6c,
-  0x53, 0x70, 0x61, 0x6e, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22,
-  0x72, 0x6f, 0x6c, 0x65, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x6e,
-  0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x5b,
-  0x6e, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x3f, 0x22,
-  0x22, 0x3a, 0x65, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x74, 0x7d,
-  0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x7d, 0x22, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79,
-  0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x7c, 0x7c, 0x28, 0x6e, 0x75, 0x6c,
-  0x6c, 0x3d, 0x3d, 0x65, 0x7c, 0x7c, 0x21, 0x31, 0x3d, 0x3d, 0x3d, 0x65,
-  0x26, 0x26, 0x22, 0x2d, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x5b, 0x34, 0x5d,
-  0x3f, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x41, 0x74, 0x74,
-  0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x29, 0x3a, 0x74, 0x2e,
-  0x73, 0x65, 0x74, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65,
-  0x28, 0x6e, 0x2c, 0x65, 0x29, 0x29, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6e, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76,
-  0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6c, 0x5b,
-  0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2b, 0x21, 0x31, 0x5d, 0x3b, 0x69,
-  0x66, 0x28, 0x74, 0x2e, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x2e,
-  0x74, 0x3c, 0x3d, 0x6e, 0x2e, 0x75, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x74, 0x2e, 0x74, 0x3d, 0x44,
-  0x61, 0x74, 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, 0x29, 0x3b, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x28, 0x43, 0x2e, 0x65, 0x76, 0x65,
-  0x6e, 0x74, 0x3f, 0x43, 0x2e, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x74,
-  0x29, 0x3a, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x65, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6c, 0x5b, 0x74, 0x2e,
-  0x74, 0x79, 0x70, 0x65, 0x2b, 0x21, 0x30, 0x5d, 0x28, 0x43, 0x2e, 0x65,
-  0x76, 0x65, 0x6e, 0x74, 0x3f, 0x43, 0x2e, 0x65, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x74, 0x29, 0x3a, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x5f, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65,
-  0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66,
-  0x2c, 0x73, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6c, 0x2c, 0x63, 0x2c,
-  0x68, 0x2c, 0x61, 0x2c, 0x70, 0x2c, 0x64, 0x2c, 0x76, 0x2c, 0x79, 0x2c,
-  0x6d, 0x2c, 0x67, 0x2c, 0x62, 0x2c, 0x6b, 0x2c, 0x53, 0x2c, 0x77, 0x2c,
-  0x78, 0x2c, 0x45, 0x3d, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3b, 0x69,
-  0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e,
-  0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72,
-  0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c,
-  0x3b, 0x31, 0x32, 0x38, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x75, 0x26, 0x26,
-  0x28, 0x66, 0x3d, 0x21, 0x21, 0x28, 0x33, 0x32, 0x26, 0x65, 0x2e, 0x5f,
-  0x5f, 0x75, 0x29, 0x2c, 0x6f, 0x3d, 0x5b, 0x75, 0x3d, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x5d, 0x29, 0x2c, 0x28,
-  0x6c, 0x3d, 0x43, 0x2e, 0x5f, 0x5f, 0x62, 0x29, 0x26, 0x26, 0x6c, 0x28,
-  0x6e, 0x29, 0x3b, 0x74, 0x3a, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65,
-  0x6f, 0x66, 0x20, 0x45, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x69, 0x66, 0x28,
-  0x79, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x6d, 0x3d,
-  0x28, 0x6c, 0x3d, 0x45, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
-  0x54, 0x79, 0x70, 0x65, 0x29, 0x26, 0x26, 0x5f, 0x5b, 0x6c, 0x2e, 0x5f,
-  0x5f, 0x63, 0x5d, 0x2c, 0x67, 0x3d, 0x6c, 0x3f, 0x6d, 0x3f, 0x6d, 0x2e,
-  0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a,
-  0x6c, 0x2e, 0x5f, 0x5f, 0x3a, 0x5f, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x63,
-  0x3f, 0x76, 0x3d, 0x28, 0x63, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3d,
-  0x65, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x2e, 0x5f, 0x5f, 0x3d, 0x63, 0x2e,
-  0x5f, 0x5f, 0x45, 0x3a, 0x28, 0x22, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74,
-  0x79, 0x70, 0x65, 0x22, 0x69, 0x6e, 0x20, 0x45, 0x26, 0x26, 0x45, 0x2e,
-  0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, 0x65,
-  0x6e, 0x64, 0x65, 0x72, 0x3f, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x63,
-  0x3d, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x28, 0x79, 0x2c, 0x67, 0x29, 0x3a,
-  0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x63, 0x3d, 0x6e, 0x65, 0x77,
-  0x20, 0x49, 0x28, 0x79, 0x2c, 0x67, 0x29, 0x2c, 0x63, 0x2e, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3d, 0x45, 0x2c,
-  0x63, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x3d, 0x66, 0x74, 0x29,
-  0x2c, 0x6d, 0x26, 0x26, 0x6d, 0x2e, 0x73, 0x75, 0x62, 0x28, 0x63, 0x29,
-  0x2c, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x79, 0x2c, 0x63,
-  0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x7c, 0x7c, 0x28, 0x63, 0x2e, 0x73,
-  0x74, 0x61, 0x74, 0x65, 0x3d, 0x7b, 0x7d, 0x29, 0x2c, 0x63, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x67, 0x2c, 0x63, 0x2e, 0x5f,
-  0x5f, 0x6e, 0x3d, 0x5f, 0x2c, 0x68, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x64,
-  0x3d, 0x21, 0x30, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d,
-  0x2c, 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x3d, 0x5b, 0x5d, 0x29, 0x2c, 0x6e,
-  0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26,
-  0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x63, 0x2e, 0x73, 0x74, 0x61,
-  0x74, 0x65, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x45, 0x2e,
-  0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74,
-  0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, 0x6f, 0x70, 0x73,
-  0x26, 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x3d, 0x63, 0x2e,
-  0x73, 0x74, 0x61, 0x74, 0x65, 0x26, 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f,
-  0x73, 0x3d, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x73,
-  0x29, 0x29, 0x2c, 0x4d, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x45,
-  0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53,
-  0x74, 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, 0x6f, 0x70,
-  0x73, 0x28, 0x79, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x29, 0x29, 0x29,
-  0x2c, 0x61, 0x3d, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x70,
-  0x3d, 0x63, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x2c, 0x63, 0x2e, 0x5f,
-  0x5f, 0x76, 0x3d, 0x6e, 0x2c, 0x68, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x3d,
-  0x3d, 0x45, 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65,
-  0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72,
-  0x6f, 0x70, 0x73, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x63,
-  0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69,
-  0x6c, 0x6c, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x63, 0x2e, 0x63,
-  0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c,
-  0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c,
-  0x21, 0x3d, 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x44, 0x69, 0x64, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x63,
-  0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x2e,
-  0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64,
-  0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b,
-  0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x45, 0x2e, 0x67,
-  0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61,
-  0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x26,
-  0x26, 0x79, 0x21, 0x3d, 0x3d, 0x61, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c,
-  0x21, 0x3d, 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x57, 0x69, 0x6c, 0x6c, 0x52, 0x65, 0x63, 0x65, 0x69, 0x76, 0x65,
-  0x50, 0x72, 0x6f, 0x70, 0x73, 0x26, 0x26, 0x63, 0x2e, 0x63, 0x6f, 0x6d,
-  0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x52, 0x65,
-  0x63, 0x65, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x28, 0x79,
-  0x2c, 0x67, 0x29, 0x2c, 0x21, 0x63, 0x2e, 0x5f, 0x5f, 0x65, 0x26, 0x26,
-  0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x63, 0x2e, 0x73, 0x68, 0x6f,
-  0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74,
-  0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x26, 0x26, 0x21, 0x31, 0x3d, 0x3d,
-  0x3d, 0x63, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d,
-  0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65,
-  0x28, 0x79, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x67, 0x29, 0x7c,
-  0x7c, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x5f,
-  0x5f, 0x76, 0x29, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x76, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26,
-  0x28, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x79, 0x2c, 0x63,
-  0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73,
-  0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x21, 0x31, 0x29, 0x2c, 0x6e,
-  0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x2c, 0x6e,
-  0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, 0x6e,
-  0x2e, 0x5f, 0x5f, 0x6b, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
-  0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74,
-  0x29, 0x7b, 0x74, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x6e,
-  0x29, 0x7d, 0x29, 0x29, 0x2c, 0x62, 0x3d, 0x30, 0x3b, 0x62, 0x3c, 0x63,
-  0x2e, 0x5f, 0x73, 0x62, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b,
-  0x62, 0x2b, 0x2b, 0x29, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75,
-  0x73, 0x68, 0x28, 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x5b, 0x62, 0x5d, 0x29,
-  0x3b, 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x3d, 0x5b, 0x5d, 0x2c, 0x63, 0x2e,
-  0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26,
-  0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x29, 0x3b, 0x62, 0x72,
-  0x65, 0x61, 0x6b, 0x20, 0x74, 0x7d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d,
-  0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57,
-  0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x26, 0x26, 0x63,
-  0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69,
-  0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x79, 0x2c, 0x63,
-  0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x67, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c,
-  0x21, 0x3d, 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x44, 0x69, 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x26, 0x26,
-  0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x28,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63,
-  0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69,
-  0x64, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x61, 0x2c, 0x70, 0x2c,
-  0x64, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x69, 0x66, 0x28, 0x63, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x67, 0x2c, 0x63, 0x2e, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x3d, 0x79, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x50,
-  0x3d, 0x74, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x21, 0x31, 0x2c,
-  0x6b, 0x3d, 0x43, 0x2e, 0x5f, 0x5f, 0x72, 0x2c, 0x53, 0x3d, 0x30, 0x2c,
-  0x22, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x22, 0x69,
-  0x6e, 0x20, 0x45, 0x26, 0x26, 0x45, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f,
-  0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x29,
-  0x7b, 0x66, 0x6f, 0x72, 0x28, 0x63, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65,
-  0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x64,
-  0x3d, 0x21, 0x31, 0x2c, 0x6b, 0x26, 0x26, 0x6b, 0x28, 0x6e, 0x29, 0x2c,
-  0x6c, 0x3d, 0x63, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x63,
-  0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x63, 0x2e, 0x73, 0x74, 0x61,
-  0x74, 0x65, 0x2c, 0x63, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
-  0x29, 0x2c, 0x77, 0x3d, 0x30, 0x3b, 0x77, 0x3c, 0x63, 0x2e, 0x5f, 0x73,
-  0x62, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x77, 0x2b, 0x2b,
-  0x29, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28,
-  0x63, 0x2e, 0x5f, 0x73, 0x62, 0x5b, 0x77, 0x5d, 0x29, 0x3b, 0x63, 0x2e,
-  0x5f, 0x73, 0x62, 0x3d, 0x5b, 0x5d, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20,
-  0x64, 0x6f, 0x7b, 0x63, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x21, 0x31, 0x2c,
-  0x6b, 0x26, 0x26, 0x6b, 0x28, 0x6e, 0x29, 0x2c, 0x6c, 0x3d, 0x63, 0x2e,
-  0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x63, 0x2e, 0x70, 0x72, 0x6f,
-  0x70, 0x73, 0x2c, 0x63, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x2c, 0x63,
-  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x29, 0x2c, 0x63, 0x2e,
-  0x73, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x7d,
-  0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x64, 0x26,
-  0x26, 0x2b, 0x2b, 0x53, 0x3c, 0x32, 0x35, 0x29, 0x3b, 0x63, 0x2e, 0x73,
-  0x74, 0x61, 0x74, 0x65, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x6e,
-  0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x63, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68,
-  0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x26, 0x26,
-  0x28, 0x5f, 0x3d, 0x4d, 0x28, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x5f, 0x29,
-  0x2c, 0x63, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43,
-  0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x28, 0x29, 0x29, 0x29, 0x2c, 0x68,
-  0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x63, 0x2e, 0x67, 0x65,
-  0x74, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x42, 0x65, 0x66,
-  0x6f, 0x72, 0x65, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x7c, 0x7c, 0x28,
-  0x64, 0x3d, 0x63, 0x2e, 0x67, 0x65, 0x74, 0x53, 0x6e, 0x61, 0x70, 0x73,
-  0x68, 0x6f, 0x74, 0x42, 0x65, 0x66, 0x6f, 0x72, 0x65, 0x55, 0x70, 0x64,
-  0x61, 0x74, 0x65, 0x28, 0x61, 0x2c, 0x70, 0x29, 0x29, 0x2c, 0x4a, 0x28,
-  0x74, 0x2c, 0x46, 0x28, 0x78, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d,
-  0x6c, 0x26, 0x26, 0x6c, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x3d, 0x3d,
-  0x6a, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6c, 0x2e, 0x6b,
-  0x65, 0x79, 0x3f, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63,
-  0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3a, 0x6c, 0x29, 0x3f, 0x78,
-  0x3a, 0x5b, 0x78, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69,
-  0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x2c, 0x73, 0x29, 0x2c,
-  0x63, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65,
-  0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x75, 0x26, 0x3d, 0x2d, 0x31, 0x36, 0x31,
-  0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74,
-  0x68, 0x26, 0x26, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x29,
-  0x2c, 0x76, 0x26, 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x63,
-  0x2e, 0x5f, 0x5f, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x63, 0x61,
-  0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x76,
-  0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x66, 0x7c, 0x7c, 0x6e, 0x75, 0x6c,
-  0x6c, 0x21, 0x3d, 0x6f, 0x3f, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d,
-  0x75, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, 0x66, 0x3f, 0x31,
-  0x36, 0x30, 0x3a, 0x33, 0x32, 0x2c, 0x6f, 0x5b, 0x6f, 0x2e, 0x69, 0x6e,
-  0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x75, 0x29, 0x5d, 0x3d, 0x6e, 0x75,
-  0x6c, 0x6c, 0x29, 0x3a, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65,
-  0x2e, 0x5f, 0x5f, 0x65, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65,
-  0x2e, 0x5f, 0x5f, 0x6b, 0x29, 0x2c, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28,
-  0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20,
-  0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6f, 0x26, 0x26, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x76, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x3f, 0x28,
-  0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c,
-  0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x29,
-  0x3a, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x6f, 0x74, 0x28, 0x65, 0x2e,
-  0x5f, 0x5f, 0x65, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x2c,
-  0x6f, 0x2c, 0x72, 0x2c, 0x66, 0x2c, 0x73, 0x29, 0x3b, 0x28, 0x6c, 0x3d,
-  0x43, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x29, 0x26, 0x26, 0x6c,
-  0x28, 0x6e, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x69, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x6e,
-  0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b,
-  0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x3d, 0x30, 0x3b,
-  0x5f, 0x3c, 0x65, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x5f,
-  0x2b, 0x2b, 0x29, 0x72, 0x74, 0x28, 0x65, 0x5b, 0x5f, 0x5d, 0x2c, 0x65,
-  0x5b, 0x2b, 0x2b, 0x5f, 0x5d, 0x2c, 0x65, 0x5b, 0x2b, 0x2b, 0x5f, 0x5d,
-  0x29, 0x3b, 0x43, 0x2e, 0x5f, 0x5f, 0x63, 0x26, 0x26, 0x43, 0x2e, 0x5f,
-  0x5f, 0x63, 0x28, 0x6e, 0x2c, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x73, 0x6f,
-  0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x28, 0x6e, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x3d, 0x6e, 0x2e,
-  0x5f, 0x5f, 0x68, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d,
-  0x2c, 0x74, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x63,
-  0x61, 0x6c, 0x6c, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x63, 0x61,
-  0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x43, 0x2e, 0x5f, 0x5f, 0x65,
-  0x28, 0x74, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x29,
-  0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f,
-  0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x2c,
-  0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x29, 0x7b, 0x76, 0x61, 0x72,
-  0x20, 0x73, 0x2c, 0x6c, 0x2c, 0x63, 0x2c, 0x68, 0x2c, 0x61, 0x2c, 0x70,
-  0x2c, 0x64, 0x2c, 0x76, 0x3d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73,
-  0x2c, 0x79, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x6d,
-  0x3d, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x22,
-  0x73, 0x76, 0x67, 0x22, 0x3d, 0x3d, 0x3d, 0x6d, 0x26, 0x26, 0x28, 0x69,
-  0x3d, 0x21, 0x30, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f,
-  0x29, 0x66, 0x6f, 0x72, 0x28, 0x73, 0x3d, 0x30, 0x3b, 0x73, 0x3c, 0x6f,
-  0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x73, 0x2b, 0x2b, 0x29,
-  0x69, 0x66, 0x28, 0x28, 0x61, 0x3d, 0x6f, 0x5b, 0x73, 0x5d, 0x29, 0x26,
-  0x26, 0x22, 0x73, 0x65, 0x74, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75,
-  0x74, 0x65, 0x22, 0x69, 0x6e, 0x20, 0x61, 0x3d, 0x3d, 0x21, 0x21, 0x6d,
-  0x26, 0x26, 0x28, 0x6d, 0x3f, 0x61, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c,
-  0x4e, 0x61, 0x6d, 0x65, 0x3d, 0x3d, 0x3d, 0x6d, 0x3a, 0x33, 0x3d, 0x3d,
-  0x3d, 0x61, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x54, 0x79, 0x70, 0x65, 0x29,
-  0x29, 0x7b, 0x74, 0x3d, 0x61, 0x2c, 0x6f, 0x5b, 0x73, 0x5d, 0x3d, 0x6e,
-  0x75, 0x6c, 0x6c, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x69, 0x66,
-  0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x74, 0x29, 0x7b, 0x69, 0x66,
-  0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x3d, 0x6d, 0x29, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e,
-  0x74, 0x2e, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x54, 0x65, 0x78, 0x74,
-  0x4e, 0x6f, 0x64, 0x65, 0x28, 0x79, 0x29, 0x3b, 0x74, 0x3d, 0x69, 0x3f,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x72, 0x65,
-  0x61, 0x74, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x4e, 0x53,
-  0x28, 0x22, 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, 0x77, 0x77,
-  0x2e, 0x77, 0x33, 0x2e, 0x6f, 0x72, 0x67, 0x2f, 0x32, 0x30, 0x30, 0x30,
-  0x2f, 0x73, 0x76, 0x67, 0x22, 0x2c, 0x6d, 0x29, 0x3a, 0x64, 0x6f, 0x63,
-  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65,
-  0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x28, 0x6d, 0x2c, 0x79, 0x2e,
-  0x69, 0x73, 0x26, 0x26, 0x79, 0x29, 0x2c, 0x6f, 0x3d, 0x6e, 0x75, 0x6c,
-  0x6c, 0x2c, 0x75, 0x3d, 0x21, 0x31, 0x7d, 0x69, 0x66, 0x28, 0x6e, 0x75,
-  0x6c, 0x6c, 0x3d, 0x3d, 0x3d, 0x6d, 0x29, 0x76, 0x3d, 0x3d, 0x3d, 0x79,
-  0x7c, 0x7c, 0x75, 0x26, 0x26, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3d,
-  0x3d, 0x3d, 0x79, 0x7c, 0x7c, 0x28, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x3d, 0x79, 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x69, 0x66, 0x28,
-  0x6f, 0x3d, 0x6f, 0x26, 0x26, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28,
-  0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x73,
-  0x29, 0x2c, 0x76, 0x3d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x7c,
-  0x7c, 0x54, 0x2c, 0x21, 0x75, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21,
-  0x3d, 0x6f, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x3d, 0x7b, 0x7d, 0x2c,
-  0x73, 0x3d, 0x30, 0x3b, 0x73, 0x3c, 0x74, 0x2e, 0x61, 0x74, 0x74, 0x72,
-  0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74,
-  0x68, 0x3b, 0x73, 0x2b, 0x2b, 0x29, 0x76, 0x5b, 0x28, 0x61, 0x3d, 0x74,
-  0x2e, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x5b,
-  0x73, 0x5d, 0x29, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3d, 0x61, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x73, 0x20,
-  0x69, 0x6e, 0x20, 0x76, 0x29, 0x61, 0x3d, 0x76, 0x5b, 0x73, 0x5d, 0x2c,
-  0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d,
-  0x73, 0x7c, 0x7c, 0x28, 0x22, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f,
-  0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72,
-  0x48, 0x54, 0x4d, 0x4c, 0x22, 0x3d, 0x3d, 0x73, 0x3f, 0x63, 0x3d, 0x61,
-  0x3a, 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x3d, 0x73, 0x7c, 0x7c,
-  0x73, 0x20, 0x69, 0x6e, 0x20, 0x79, 0x7c, 0x7c, 0x74, 0x74, 0x28, 0x74,
-  0x2c, 0x73, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x61, 0x2c, 0x69, 0x29,
-  0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x79,
-  0x29, 0x61, 0x3d, 0x79, 0x5b, 0x73, 0x5d, 0x2c, 0x22, 0x63, 0x68, 0x69,
-  0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d, 0x73, 0x3f, 0x68, 0x3d,
-  0x61, 0x3a, 0x22, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, 0x73,
-  0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54,
-  0x4d, 0x4c, 0x22, 0x3d, 0x3d, 0x73, 0x3f, 0x6c, 0x3d, 0x61, 0x3a, 0x22,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x3d, 0x3d, 0x73, 0x3f, 0x70, 0x3d,
-  0x61, 0x3a, 0x22, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x22, 0x3d,
-  0x3d, 0x73, 0x3f, 0x64, 0x3d, 0x61, 0x3a, 0x22, 0x6b, 0x65, 0x79, 0x22,
-  0x3d, 0x3d, 0x3d, 0x73, 0x7c, 0x7c, 0x75, 0x26, 0x26, 0x22, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70,
-  0x65, 0x6f, 0x66, 0x20, 0x61, 0x7c, 0x7c, 0x76, 0x5b, 0x73, 0x5d, 0x3d,
-  0x3d, 0x3d, 0x61, 0x7c, 0x7c, 0x74, 0x74, 0x28, 0x74, 0x2c, 0x73, 0x2c,
-  0x61, 0x2c, 0x76, 0x5b, 0x73, 0x5d, 0x2c, 0x69, 0x29, 0x3b, 0x69, 0x66,
-  0x28, 0x6c, 0x29, 0x75, 0x7c, 0x7c, 0x63, 0x26, 0x26, 0x28, 0x6c, 0x2e,
-  0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3d, 0x3d, 0x3d, 0x63, 0x2e, 0x5f,
-  0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x7c, 0x7c, 0x6c, 0x2e, 0x5f, 0x5f, 0x68,
-  0x74, 0x6d, 0x6c, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65,
-  0x72, 0x48, 0x54, 0x4d, 0x4c, 0x29, 0x7c, 0x7c, 0x28, 0x74, 0x2e, 0x69,
-  0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x6c, 0x2e, 0x5f,
-  0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b,
-  0x3d, 0x5b, 0x5d, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x28,
-  0x63, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x48,
-  0x54, 0x4d, 0x4c, 0x3d, 0x22, 0x22, 0x29, 0x2c, 0x4a, 0x28, 0x74, 0x2c,
-  0x46, 0x28, 0x68, 0x29, 0x3f, 0x68, 0x3a, 0x5b, 0x68, 0x5d, 0x2c, 0x6e,
-  0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x26, 0x26, 0x22, 0x66, 0x6f, 0x72,
-  0x65, 0x69, 0x67, 0x6e, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x22, 0x21,
-  0x3d, 0x3d, 0x6d, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x6f, 0x3f, 0x6f, 0x5b,
-  0x30, 0x5d, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x26, 0x26, 0x71, 0x28,
-  0x65, 0x2c, 0x30, 0x29, 0x2c, 0x75, 0x2c, 0x66, 0x29, 0x2c, 0x6e, 0x75,
-  0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x73, 0x3d,
-  0x6f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x73, 0x2d, 0x2d,
-  0x3b, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x5b, 0x73, 0x5d,
-  0x26, 0x26, 0x57, 0x28, 0x6f, 0x5b, 0x73, 0x5d, 0x29, 0x3b, 0x75, 0x7c,
-  0x7c, 0x28, 0x73, 0x3d, 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c,
-  0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x70, 0x26, 0x26,
-  0x28, 0x70, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x73, 0x5d, 0x7c, 0x7c, 0x22,
-  0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x22, 0x3d, 0x3d, 0x3d,
-  0x6d, 0x26, 0x26, 0x21, 0x70, 0x7c, 0x7c, 0x22, 0x6f, 0x70, 0x74, 0x69,
-  0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x6d, 0x26, 0x26, 0x70, 0x21, 0x3d,
-  0x3d, 0x76, 0x5b, 0x73, 0x5d, 0x29, 0x26, 0x26, 0x74, 0x74, 0x28, 0x74,
-  0x2c, 0x73, 0x2c, 0x70, 0x2c, 0x76, 0x5b, 0x73, 0x5d, 0x2c, 0x21, 0x31,
-  0x29, 0x2c, 0x73, 0x3d, 0x22, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64,
-  0x22, 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x64,
-  0x26, 0x26, 0x64, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x73, 0x5d, 0x26, 0x26,
-  0x74, 0x74, 0x28, 0x74, 0x2c, 0x73, 0x2c, 0x64, 0x2c, 0x76, 0x5b, 0x73,
-  0x5d, 0x2c, 0x21, 0x31, 0x29, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x72, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x74,
-  0x72, 0x79, 0x7b, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x3f,
-  0x74, 0x28, 0x6e, 0x29, 0x3a, 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65,
-  0x6e, 0x74, 0x3d, 0x6e, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74,
-  0x29, 0x7b, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x65, 0x29,
-  0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75,
-  0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72,
-  0x20, 0x5f, 0x2c, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x43, 0x2e, 0x75, 0x6e,
-  0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x43, 0x2e, 0x75, 0x6e, 0x6d,
-  0x6f, 0x75, 0x6e, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x28, 0x5f, 0x3d, 0x74,
-  0x2e, 0x72, 0x65, 0x66, 0x29, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x63, 0x75,
-  0x72, 0x72, 0x65, 0x6e, 0x74, 0x26, 0x26, 0x5f, 0x2e, 0x63, 0x75, 0x72,
-  0x72, 0x65, 0x6e, 0x74, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65,
-  0x7c, 0x7c, 0x72, 0x74, 0x28, 0x5f, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c,
-  0x6e, 0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x5f,
-  0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x29, 0x7b, 0x69, 0x66, 0x28,
-  0x5f, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57,
-  0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x29, 0x74,
-  0x72, 0x79, 0x7b, 0x5f, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65,
-  0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e,
-  0x74, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29,
-  0x7b, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7d,
-  0x5f, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x50,
-  0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d,
-  0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x5f, 0x3d,
-  0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x69, 0x3d,
-  0x30, 0x3b, 0x69, 0x3c, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68,
-  0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x5f, 0x5b, 0x69, 0x5d, 0x26, 0x26, 0x75,
-  0x74, 0x28, 0x5f, 0x5b, 0x69, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x7c, 0x7c,
-  0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d,
-  0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70,
-  0x65, 0x29, 0x3b, 0x65, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d,
-  0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x7c, 0x7c, 0x57, 0x28, 0x74, 0x2e, 0x5f,
-  0x5f, 0x65, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f,
-  0x5f, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x66, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x74,
-  0x2c, 0x65, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x73, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76,
-  0x61, 0x72, 0x20, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x43,
-  0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x43, 0x2e, 0x5f, 0x5f, 0x28, 0x74, 0x2c,
-  0x6e, 0x29, 0x2c, 0x69, 0x3d, 0x28, 0x5f, 0x3d, 0x22, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65,
-  0x6f, 0x66, 0x20, 0x65, 0x29, 0x3f, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x65,
-  0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x7c, 0x7c, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x6b, 0x2c, 0x6f, 0x3d, 0x5b, 0x5d, 0x2c, 0x72, 0x3d, 0x5b, 0x5d,
-  0x2c, 0x5f, 0x74, 0x28, 0x6e, 0x2c, 0x74, 0x3d, 0x28, 0x21, 0x5f, 0x26,
-  0x26, 0x65, 0x7c, 0x7c, 0x6e, 0x29, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x4c,
-  0x28, 0x6a, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5b, 0x74, 0x5d, 0x29,
-  0x2c, 0x69, 0x7c, 0x7c, 0x54, 0x2c, 0x54, 0x2c, 0x76, 0x6f, 0x69, 0x64,
-  0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6f, 0x77, 0x6e, 0x65, 0x72,
-  0x53, 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x21,
-  0x5f, 0x26, 0x26, 0x65, 0x3f, 0x5b, 0x65, 0x5d, 0x3a, 0x69, 0x3f, 0x6e,
-  0x75, 0x6c, 0x6c, 0x3a, 0x6e, 0x2e, 0x66, 0x69, 0x72, 0x73, 0x74, 0x43,
-  0x68, 0x69, 0x6c, 0x64, 0x3f, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28,
-  0x6e, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x73,
-  0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6f, 0x2c, 0x21, 0x5f, 0x26,
-  0x26, 0x65, 0x3f, 0x65, 0x3a, 0x69, 0x3f, 0x69, 0x2e, 0x5f, 0x5f, 0x65,
-  0x3a, 0x6e, 0x2e, 0x66, 0x69, 0x72, 0x73, 0x74, 0x43, 0x68, 0x69, 0x6c,
-  0x64, 0x2c, 0x5f, 0x2c, 0x72, 0x29, 0x2c, 0x69, 0x74, 0x28, 0x6f, 0x2c,
-  0x74, 0x2c, 0x72, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x6c, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x73, 0x74,
-  0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x6c, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x63, 0x74, 0x28, 0x74, 0x2c, 0x6e,
-  0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x2c, 0x69, 0x2c,
-  0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x3d, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x74,
-  0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28,
-  0x6f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26,
-  0x26, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, 0x65, 0x66, 0x61,
-  0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x26, 0x26, 0x28, 0x72,
-  0x3d, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, 0x65, 0x66, 0x61,
-  0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x2c, 0x6e, 0x29,
-  0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x5f, 0x3d, 0x6e,
-  0x5b, 0x6f, 0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, 0x3d, 0x6f,
-  0x3f, 0x69, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x75, 0x5b, 0x6f, 0x5d,
-  0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6e, 0x5b,
-  0x6f, 0x5d, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d,
-  0x3d, 0x72, 0x3f, 0x72, 0x5b, 0x6f, 0x5d, 0x3a, 0x6e, 0x5b, 0x6f, 0x5d,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x72, 0x67, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68,
-  0x3e, 0x32, 0x26, 0x26, 0x28, 0x75, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64,
-  0x72, 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x33, 0x3f, 0x78,
-  0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65,
-  0x6e, 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, 0x2c, 0x4f, 0x28,
-  0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x75, 0x2c, 0x5f, 0x7c, 0x7c,
-  0x74, 0x2e, 0x6b, 0x65, 0x79, 0x2c, 0x69, 0x7c, 0x7c, 0x74, 0x2e, 0x72,
-  0x65, 0x66, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x68, 0x74, 0x28, 0x74, 0x2c, 0x6e,
-  0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x7b, 0x5f, 0x5f, 0x63,
-  0x3a, 0x6e, 0x3d, 0x22, 0x5f, 0x5f, 0x63, 0x43, 0x22, 0x2b, 0x44, 0x2b,
-  0x2b, 0x2c, 0x5f, 0x5f, 0x3a, 0x74, 0x2c, 0x43, 0x6f, 0x6e, 0x73, 0x75,
-  0x6d, 0x65, 0x72, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x28,
-  0x6e, 0x29, 0x7d, 0x2c, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x72,
-  0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29,
-  0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x2c, 0x5f, 0x3b, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, 0x74,
-  0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
-  0x7c, 0x7c, 0x28, 0x65, 0x3d, 0x5b, 0x5d, 0x2c, 0x28, 0x5f, 0x3d, 0x7b,
-  0x7d, 0x29, 0x5b, 0x6e, 0x5d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64,
-  0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x5f, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x68,
-  0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x26, 0x26,
-  0x65, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f,
-  0x65, 0x3d, 0x21, 0x30, 0x2c, 0x47, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x29,
-  0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x75, 0x62, 0x3d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x65,
-  0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72,
-  0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65,
-  0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e,
-  0x74, 0x3b, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74,
-  0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b,
-  0x65, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x65, 0x2e, 0x69,
-  0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x74, 0x29, 0x2c, 0x31, 0x29,
-  0x2c, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74,
-  0x29, 0x7d, 0x7d, 0x29, 0x2c, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64,
-  0x72, 0x65, 0x6e, 0x7d, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x65, 0x2e, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x72, 0x2e,
-  0x5f, 0x5f, 0x3d, 0x65, 0x2e, 0x43, 0x6f, 0x6e, 0x73, 0x75, 0x6d, 0x65,
-  0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x54, 0x79, 0x70,
-  0x65, 0x3d, 0x65, 0x7d, 0x78, 0x3d, 0x56, 0x2e, 0x73, 0x6c, 0x69, 0x63,
-  0x65, 0x2c, 0x43, 0x3d, 0x7b, 0x5f, 0x5f, 0x65, 0x3a, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c,
-  0x5f, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x69,
-  0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b,
-  0x29, 0x69, 0x66, 0x28, 0x28, 0x69, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x63,
-  0x29, 0x26, 0x26, 0x21, 0x69, 0x2e, 0x5f, 0x5f, 0x29, 0x74, 0x72, 0x79,
-  0x7b, 0x69, 0x66, 0x28, 0x28, 0x6f, 0x3d, 0x69, 0x2e, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x29, 0x26, 0x26, 0x6e,
-  0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65,
-  0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, 0x72,
-  0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x26, 0x26, 0x28, 0x69, 0x2e,
-  0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x6f, 0x2e, 0x67,
-  0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61,
-  0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28,
-  0x74, 0x29, 0x29, 0x2c, 0x72, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x64, 0x29,
-  0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x69, 0x2e, 0x63, 0x6f, 0x6d,
-  0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74,
-  0x63, 0x68, 0x26, 0x26, 0x28, 0x69, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f,
-  0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68,
-  0x28, 0x74, 0x2c, 0x5f, 0x7c, 0x7c, 0x7b, 0x7d, 0x29, 0x2c, 0x72, 0x3d,
-  0x69, 0x2e, 0x5f, 0x5f, 0x64, 0x29, 0x2c, 0x72, 0x29, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x69, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x69, 0x7d,
-  0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x3d, 0x6e,
-  0x7d, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x7d, 0x2c, 0x45,
-  0x3d, 0x30, 0x2c, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, 0x26, 0x6e, 0x75, 0x6c,
-  0x6c, 0x3d, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75,
-  0x63, 0x74, 0x6f, 0x72, 0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f, 0x74,
-  0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61,
-  0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28,
-  0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3b, 0x65,
-  0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x5f, 0x5f, 0x73, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f,
-  0x73, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x74, 0x61,
-  0x74, 0x65, 0x3f, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x3a,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x4d, 0x28, 0x7b,
-  0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65,
-  0x29, 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22,
-  0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26,
-  0x28, 0x74, 0x3d, 0x74, 0x28, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x65, 0x29,
-  0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29,
-  0x29, 0x2c, 0x74, 0x26, 0x26, 0x4d, 0x28, 0x65, 0x2c, 0x74, 0x29, 0x2c,
-  0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, 0x28, 0x6e, 0x26, 0x26, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x5f, 0x73, 0x62, 0x2e, 0x70, 0x75, 0x73, 0x68,
-  0x28, 0x6e, 0x29, 0x2c, 0x47, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x29,
-  0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70,
-  0x65, 0x2e, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x55, 0x70, 0x64, 0x61, 0x74,
-  0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74,
-  0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26,
-  0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x21, 0x30,
-  0x2c, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x68,
-  0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x2c, 0x47, 0x28, 0x74,
-  0x68, 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f,
-  0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65,
-  0x72, 0x3d, 0x6a, 0x2c, 0x48, 0x3d, 0x5b, 0x5d, 0x2c, 0x4e, 0x3d, 0x22,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74,
-  0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73,
-  0x65, 0x3f, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x70, 0x72,
-  0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, 0x68, 0x65, 0x6e,
-  0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73,
-  0x65, 0x2e, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x29, 0x29,
-  0x3a, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x2c,
-  0x24, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74,
-  0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
-  0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, 0x2d, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, 0x7d, 0x2c, 0x7a, 0x2e, 0x5f, 0x5f,
-  0x72, 0x3d, 0x30, 0x2c, 0x44, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20,
-  0x61, 0x74, 0x2c, 0x70, 0x74, 0x2c, 0x64, 0x74, 0x2c, 0x76, 0x74, 0x2c,
-  0x79, 0x74, 0x3d, 0x30, 0x2c, 0x6d, 0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x67,
-  0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x62, 0x74, 0x3d, 0x43, 0x2e, 0x5f, 0x5f,
-  0x62, 0x2c, 0x6b, 0x74, 0x3d, 0x43, 0x2e, 0x5f, 0x5f, 0x72, 0x2c, 0x53,
-  0x74, 0x3d, 0x43, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x2c, 0x77,
-  0x74, 0x3d, 0x43, 0x2e, 0x5f, 0x5f, 0x63, 0x2c, 0x78, 0x74, 0x3d, 0x43,
-  0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3b, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x43, 0x74, 0x28, 0x74, 0x2c, 0x6e,
-  0x29, 0x7b, 0x43, 0x2e, 0x5f, 0x5f, 0x68, 0x26, 0x26, 0x43, 0x2e, 0x5f,
-  0x5f, 0x68, 0x28, 0x70, 0x74, 0x2c, 0x74, 0x2c, 0x79, 0x74, 0x7c, 0x7c,
-  0x6e, 0x29, 0x2c, 0x79, 0x74, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20,
-  0x65, 0x3d, 0x70, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x7c, 0x7c, 0x28, 0x70,
-  0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x7b, 0x5f, 0x5f, 0x3a, 0x5b, 0x5d,
-  0x2c, 0x5f, 0x5f, 0x68, 0x3a, 0x5b, 0x5d, 0x7d, 0x29, 0x3b, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x3e, 0x3d, 0x65, 0x2e, 0x5f, 0x5f,
-  0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, 0x65, 0x2e, 0x5f,
-  0x5f, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x7b, 0x5f, 0x5f, 0x56, 0x3a,
-  0x67, 0x74, 0x7d, 0x29, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x5b, 0x74, 0x5d,
-  0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x45, 0x74,
-  0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x79,
-  0x74, 0x3d, 0x31, 0x2c, 0x55, 0x74, 0x28, 0x71, 0x74, 0x2c, 0x74, 0x29,
-  0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x55, 0x74,
-  0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20,
-  0x5f, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x32, 0x29,
-  0x3b, 0x69, 0x66, 0x28, 0x5f, 0x2e, 0x74, 0x3d, 0x74, 0x2c, 0x21, 0x5f,
-  0x2e, 0x5f, 0x5f, 0x63, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x5f, 0x5f, 0x3d,
-  0x5b, 0x65, 0x3f, 0x65, 0x28, 0x6e, 0x29, 0x3a, 0x71, 0x74, 0x28, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x29, 0x2c, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72,
-  0x20, 0x6e, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x4e, 0x3f, 0x5f, 0x2e, 0x5f,
-  0x5f, 0x4e, 0x5b, 0x30, 0x5d, 0x3a, 0x5f, 0x2e, 0x5f, 0x5f, 0x5b, 0x30,
-  0x5d, 0x2c, 0x65, 0x3d, 0x5f, 0x2e, 0x74, 0x28, 0x6e, 0x2c, 0x74, 0x29,
-  0x3b, 0x6e, 0x21, 0x3d, 0x3d, 0x65, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x5f,
-  0x5f, 0x4e, 0x3d, 0x5b, 0x65, 0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x5b, 0x31,
-  0x5d, 0x5d, 0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x73, 0x65, 0x74,
-  0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x29, 0x7d, 0x5d,
-  0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x70, 0x74, 0x2c, 0x21, 0x70,
-  0x74, 0x2e, 0x75, 0x29, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e,
-  0x2c, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x5f, 0x2e, 0x5f, 0x5f,
-  0x63, 0x2e, 0x5f, 0x5f, 0x48, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x21, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x5f, 0x2e, 0x5f,
-  0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x69,
-  0x6c, 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x7d, 0x29, 0x29, 0x3b, 0x69, 0x66,
-  0x28, 0x69, 0x2e, 0x65, 0x76, 0x65, 0x72, 0x79, 0x28, 0x28, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x7d, 0x29,
-  0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x6f, 0x7c, 0x7c,
-  0x6f, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c,
-  0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x72,
-  0x3d, 0x21, 0x31, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69,
-  0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66,
-  0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20,
-  0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x3b, 0x74, 0x2e,
-  0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x2c, 0x74, 0x2e, 0x5f,
-  0x5f, 0x4e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x21,
-  0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x26, 0x26, 0x28,
-  0x72, 0x3d, 0x21, 0x30, 0x29, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x21, 0x28,
-  0x21, 0x72, 0x26, 0x26, 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x70, 0x72,
-  0x6f, 0x70, 0x73, 0x3d, 0x3d, 0x3d, 0x74, 0x29, 0x26, 0x26, 0x28, 0x21,
-  0x6f, 0x7c, 0x7c, 0x6f, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68,
-  0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x29, 0x7d, 0x3b,
-  0x70, 0x74, 0x2e, 0x75, 0x3d, 0x21, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20,
-  0x6f, 0x3d, 0x70, 0x74, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43,
-  0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61,
-  0x74, 0x65, 0x2c, 0x72, 0x3d, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70,
-  0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64,
-  0x61, 0x74, 0x65, 0x3b, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f,
-  0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61,
-  0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28,
-  0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20,
-  0x5f, 0x3d, 0x6f, 0x3b, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x2c, 0x69, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x2c, 0x6f, 0x3d,
-  0x5f, 0x7d, 0x72, 0x26, 0x26, 0x72, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28,
-  0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7d,
-  0x2c, 0x70, 0x74, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f,
-  0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74,
-  0x65, 0x3d, 0x69, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f,
-  0x2e, 0x5f, 0x5f, 0x4e, 0x7c, 0x7c, 0x5f, 0x2e, 0x5f, 0x5f, 0x7d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x48, 0x74, 0x28, 0x74,
-  0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x43, 0x74,
-  0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x33, 0x29, 0x3b, 0x21, 0x43, 0x2e,
-  0x5f, 0x5f, 0x73, 0x26, 0x26, 0x49, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f,
-  0x48, 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d,
-  0x74, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x70, 0x74, 0x2e, 0x5f,
-  0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28,
-  0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x50, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72,
-  0x20, 0x65, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x34,
-  0x29, 0x3b, 0x21, 0x43, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x49, 0x74,
-  0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28,
-  0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e,
-  0x2c, 0x70, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68,
-  0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x4e, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x79, 0x74, 0x3d, 0x35, 0x2c, 0x44, 0x74, 0x28, 0x28,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
-  0x74, 0x3a, 0x74, 0x7d, 0x7d, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x24, 0x74, 0x28, 0x74,
-  0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x79, 0x74, 0x3d, 0x36, 0x2c, 0x50,
-  0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28,
-  0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65,
-  0x6f, 0x66, 0x20, 0x74, 0x3f, 0x28, 0x74, 0x28, 0x6e, 0x28, 0x29, 0x29,
-  0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x6e, 0x75, 0x6c,
-  0x6c, 0x29, 0x7d, 0x29, 0x3a, 0x74, 0x3f, 0x28, 0x74, 0x2e, 0x63, 0x75,
-  0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x28, 0x29, 0x2c, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
-  0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x29, 0x3a, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x7d, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d,
-  0x65, 0x3f, 0x65, 0x3a, 0x65, 0x2e, 0x63, 0x6f, 0x6e, 0x63, 0x61, 0x74,
-  0x28, 0x74, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x20, 0x44, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61,
-  0x72, 0x20, 0x65, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c,
-  0x37, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x49, 0x74,
-  0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x3f, 0x28, 0x65,
-  0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x74, 0x28, 0x29, 0x2c, 0x65, 0x2e, 0x69,
-  0x3d, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x74, 0x2c, 0x65,
-  0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x7d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x54, 0x74, 0x28, 0x74,
-  0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x79,
-  0x74, 0x3d, 0x38, 0x2c, 0x44, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x74, 0x7d, 0x29, 0x2c, 0x6e, 0x29, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x56, 0x74, 0x28, 0x74, 0x29, 0x7b,
-  0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x78, 0x74, 0x5b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x5d, 0x2c,
-  0x65, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x39, 0x29,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x63, 0x3d,
-  0x74, 0x2c, 0x6e, 0x3f, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65,
-  0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x21,
-  0x30, 0x2c, 0x6e, 0x2e, 0x73, 0x75, 0x62, 0x28, 0x70, 0x74, 0x29, 0x29,
-  0x2c, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x29, 0x3a, 0x74, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x74, 0x28, 0x74, 0x2c, 0x6e,
-  0x29, 0x7b, 0x43, 0x2e, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67,
-  0x56, 0x61, 0x6c, 0x75, 0x65, 0x26, 0x26, 0x43, 0x2e, 0x75, 0x73, 0x65,
-  0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x28, 0x6e,
-  0x3f, 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x46, 0x74, 0x28, 0x74, 0x29, 0x7b,
-  0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b,
-  0x2b, 0x2c, 0x31, 0x30, 0x29, 0x2c, 0x65, 0x3d, 0x45, 0x74, 0x28, 0x29,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x5f, 0x5f,
-  0x3d, 0x74, 0x2c, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e,
-  0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, 0x7c,
-  0x7c, 0x28, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65,
-  0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, 0x3d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x5f, 0x29,
-  0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x6e, 0x2e, 0x5f, 0x5f, 0x28,
-  0x74, 0x2c, 0x5f, 0x29, 0x2c, 0x65, 0x5b, 0x31, 0x5d, 0x28, 0x74, 0x29,
-  0x7d, 0x29, 0x2c, 0x5b, 0x65, 0x5b, 0x30, 0x5d, 0x2c, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x65, 0x5b, 0x31, 0x5d,
-  0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, 0x7d, 0x5d, 0x7d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x74, 0x28, 0x29,
-  0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74,
-  0x2b, 0x2b, 0x2c, 0x31, 0x31, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x74,
-  0x2e, 0x5f, 0x5f, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72,
-  0x20, 0x6e, 0x3d, 0x70, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, 0x6e, 0x75,
-  0x6c, 0x6c, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x21, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x6d, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x3d, 0x6e,
-  0x2e, 0x5f, 0x5f, 0x3b, 0x29, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b,
-  0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x7c,
-  0x7c, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x3d, 0x5b, 0x30, 0x2c, 0x30,
-  0x5d, 0x29, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x22, 0x50, 0x22, 0x2b,
-  0x65, 0x5b, 0x30, 0x5d, 0x2b, 0x22, 0x2d, 0x22, 0x2b, 0x65, 0x5b, 0x31,
-  0x5d, 0x2b, 0x2b, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
-  0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x20, 0x57, 0x74, 0x28, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61,
-  0x72, 0x20, 0x74, 0x3b, 0x74, 0x3d, 0x6d, 0x74, 0x2e, 0x73, 0x68, 0x69,
-  0x66, 0x74, 0x28, 0x29, 0x3b, 0x29, 0x69, 0x66, 0x28, 0x74, 0x2e, 0x5f,
-  0x5f, 0x50, 0x26, 0x26, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x29, 0x74, 0x72,
-  0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e,
-  0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x52, 0x74, 0x29, 0x2c,
-  0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f,
-  0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x6a, 0x74, 0x29, 0x2c, 0x74, 0x2e,
-  0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x7d, 0x63,
-  0x61, 0x74, 0x63, 0x68, 0x28, 0x75, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f,
-  0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x43, 0x2e, 0x5f,
-  0x5f, 0x65, 0x28, 0x75, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x7d,
-  0x7d, 0x43, 0x2e, 0x5f, 0x5f, 0x62, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x70, 0x74, 0x3d, 0x6e, 0x75,
-  0x6c, 0x6c, 0x2c, 0x62, 0x74, 0x26, 0x26, 0x62, 0x74, 0x28, 0x74, 0x29,
-  0x7d, 0x2c, 0x43, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x6b, 0x74, 0x26, 0x26,
-  0x6b, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x61, 0x74, 0x3d, 0x30, 0x3b, 0x76,
-  0x61, 0x72, 0x20, 0x6e, 0x3d, 0x28, 0x70, 0x74, 0x3d, 0x74, 0x2e, 0x5f,
-  0x5f, 0x63, 0x29, 0x2e, 0x5f, 0x5f, 0x48, 0x3b, 0x6e, 0x26, 0x26, 0x28,
-  0x64, 0x74, 0x3d, 0x3d, 0x3d, 0x70, 0x74, 0x3f, 0x28, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x70, 0x74, 0x2e, 0x5f, 0x5f, 0x68,
-  0x3d, 0x5b, 0x5d, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72,
-  0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
-  0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x26,
-  0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x4e,
-  0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x67, 0x74, 0x2c, 0x74,
-  0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x74, 0x2e, 0x69, 0x3d, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x7d, 0x29, 0x29, 0x29, 0x3a, 0x28, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x52,
-  0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72,
-  0x45, 0x61, 0x63, 0x68, 0x28, 0x6a, 0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f,
-  0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x61, 0x74, 0x3d, 0x30, 0x29, 0x29,
-  0x2c, 0x64, 0x74, 0x3d, 0x70, 0x74, 0x7d, 0x2c, 0x43, 0x2e, 0x64, 0x69,
-  0x66, 0x66, 0x65, 0x64, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x28, 0x74, 0x29, 0x7b, 0x53, 0x74, 0x26, 0x26, 0x53, 0x74, 0x28,
-  0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f,
-  0x5f, 0x63, 0x3b, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x26,
-  0x26, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e,
-  0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, 0x28, 0x31, 0x21, 0x3d,
-  0x3d, 0x6d, 0x74, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x6e, 0x29, 0x26,
-  0x26, 0x76, 0x74, 0x3d, 0x3d, 0x3d, 0x43, 0x2e, 0x72, 0x65, 0x71, 0x75,
-  0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x46, 0x72, 0x61, 0x6d, 0x65, 0x7c, 0x7c, 0x28, 0x28, 0x76, 0x74, 0x3d,
-  0x43, 0x2e, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69,
-  0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x29,
-  0x7c, 0x7c, 0x4f, 0x74, 0x29, 0x28, 0x57, 0x74, 0x29, 0x29, 0x2c, 0x6e,
-  0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45,
-  0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x69, 0x26, 0x26, 0x28, 0x74,
-  0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x74, 0x2e, 0x69, 0x29, 0x2c, 0x74, 0x2e,
-  0x5f, 0x5f, 0x56, 0x21, 0x3d, 0x3d, 0x67, 0x74, 0x26, 0x26, 0x28, 0x74,
-  0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x2c, 0x74,
-  0x2e, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x2e,
-  0x5f, 0x5f, 0x56, 0x3d, 0x67, 0x74, 0x7d, 0x29, 0x29, 0x29, 0x2c, 0x64,
-  0x74, 0x3d, 0x70, 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x2c, 0x43,
-  0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x73, 0x6f, 0x6d,
-  0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28,
-  0x74, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x68,
-  0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x52, 0x74, 0x29,
-  0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x68,
-  0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x7c, 0x7c, 0x6a, 0x74,
-  0x28, 0x74, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68,
-  0x28, 0x6c, 0x29, 0x7b, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b,
-  0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f,
-  0x68, 0x3d, 0x5b, 0x5d, 0x29, 0x7d, 0x29, 0x29, 0x2c, 0x6e, 0x3d, 0x5b,
-  0x5d, 0x2c, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x6c, 0x2c, 0x74, 0x2e,
-  0x5f, 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x77, 0x74, 0x26,
-  0x26, 0x77, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x2c, 0x43, 0x2e,
-  0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x78, 0x74, 0x26, 0x26,
-  0x78, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c,
-  0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x65, 0x26, 0x26, 0x65,
-  0x2e, 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48,
-  0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28,
-  0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29,
-  0x7b, 0x74, 0x72, 0x79, 0x7b, 0x52, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x63,
-  0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x6e, 0x3d, 0x74, 0x7d,
-  0x7d, 0x29, 0x29, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x76, 0x6f,
-  0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x26, 0x26, 0x43, 0x2e, 0x5f, 0x5f,
-  0x65, 0x28, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x29, 0x7d,
-  0x3b, 0x76, 0x61, 0x72, 0x20, 0x4c, 0x74, 0x3d, 0x22, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65,
-  0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e,
-  0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65,
-  0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x74,
-  0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3d,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63,
-  0x6c, 0x65, 0x61, 0x72, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28,
-  0x5f, 0x29, 0x2c, 0x4c, 0x74, 0x26, 0x26, 0x63, 0x61, 0x6e, 0x63, 0x65,
-  0x6c, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72,
-  0x61, 0x6d, 0x65, 0x28, 0x6e, 0x29, 0x2c, 0x73, 0x65, 0x74, 0x54, 0x69,
-  0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x5f, 0x3d,
-  0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x65,
-  0x2c, 0x31, 0x30, 0x30, 0x29, 0x3b, 0x4c, 0x74, 0x26, 0x26, 0x28, 0x6e,
-  0x3d, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x28, 0x65,
-  0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x52, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d,
-  0x70, 0x74, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x22,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74,
-  0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x26, 0x26, 0x28, 0x74, 0x2e,
-  0x5f, 0x5f, 0x63, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x65,
-  0x28, 0x29, 0x29, 0x2c, 0x70, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e,
-  0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6a, 0x74, 0x28, 0x74, 0x29, 0x7b,
-  0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x70, 0x74, 0x3b, 0x74, 0x2e, 0x5f,
-  0x5f, 0x63, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x28, 0x29, 0x2c, 0x70, 0x74,
-  0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x49, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x21, 0x74, 0x7c, 0x7c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67,
-  0x74, 0x68, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74,
-  0x68, 0x7c, 0x7c, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6e, 0x2c, 0x65, 0x29,
-  0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x21, 0x3d, 0x3d,
-  0x74, 0x5b, 0x65, 0x5d, 0x7d, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x20, 0x71, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29,
-  0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f,
-  0x66, 0x20, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x6e, 0x7d, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x42, 0x74, 0x28, 0x74,
-  0x2c, 0x6e, 0x29, 0x7b, 0x43, 0x5b, 0x74, 0x5d, 0x3d, 0x6e, 0x2e, 0x62,
-  0x69, 0x6e, 0x64, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x43, 0x5b, 0x74,
-  0x5d, 0x7c, 0x7c, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x7d, 0x29, 0x29,
-  0x7d, 0x6c, 0x65, 0x74, 0x20, 0x47, 0x74, 0x2c, 0x7a, 0x74, 0x3b, 0x66,
-  0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4a, 0x74, 0x28, 0x74,
-  0x29, 0x7b, 0x69, 0x66, 0x28, 0x7a, 0x74, 0x29, 0x7a, 0x74, 0x28, 0x29,
-  0x3b, 0x7a, 0x74, 0x3d, 0x74, 0x26, 0x26, 0x74, 0x2e, 0x53, 0x28, 0x29,
-  0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4b, 0x74,
-  0x28, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x74, 0x7d, 0x29, 0x7b, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x58, 0x74, 0x28, 0x74, 0x29,
-  0x3b, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x44, 0x74, 0x28, 0x28, 0x29,
-  0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28,
-  0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x69, 0x66, 0x28, 0x74, 0x2e,
-  0x5f, 0x5f, 0x63, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f,
-  0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x7d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x2e, 0x63,
-  0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b,
-  0x69, 0x66, 0x28, 0x21, 0x55, 0x28, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b,
-  0x28, 0x29, 0x29, 0x26, 0x26, 0x33, 0x3d, 0x3d, 0x3d, 0x28, 0x6e, 0x75,
-  0x6c, 0x6c, 0x3d, 0x3d, 0x28, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x62, 0x61, 0x73, 0x65, 0x29, 0x3f, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30,
-  0x3a, 0x74, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x54, 0x79, 0x70, 0x65, 0x29,
-  0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x3d, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29,
-  0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
-  0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29,
-  0x7d, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x28,
-  0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e,
-  0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x30, 0x3d, 0x3d, 0x3d,
-  0x74, 0x3f, 0x30, 0x3a, 0x21, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x3f, 0x22,
-  0x22, 0x3a, 0x74, 0x7c, 0x7c, 0x22, 0x22, 0x7d, 0x29, 0x7d, 0x2c, 0x5b,
-  0x5d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x4b, 0x74, 0x2e, 0x64, 0x69, 0x73,
-  0x70, 0x6c, 0x61, 0x79, 0x4e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x5f, 0x73,
-  0x74, 0x22, 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65,
-  0x66, 0x69, 0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69,
-  0x65, 0x73, 0x28, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79,
-  0x70, 0x65, 0x2c, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63,
-  0x74, 0x6f, 0x72, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75,
-  0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x2c, 0x74,
-  0x79, 0x70, 0x65, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75,
-  0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c,
-  0x75, 0x65, 0x3a, 0x4b, 0x74, 0x7d, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x73,
-  0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62,
-  0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a,
-  0x74, 0x68, 0x69, 0x73, 0x7d, 0x7d, 0x7d, 0x2c, 0x5f, 0x5f, 0x62, 0x3a,
-  0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c,
-  0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x31,
-  0x7d, 0x7d, 0x29, 0x3b, 0x42, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x62, 0x22,
-  0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28,
-  0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79,
-  0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29,
-  0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x2c, 0x65, 0x3d, 0x6e, 0x2e, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74,
-  0x20, 0x5f, 0x20, 0x69, 0x6e, 0x20, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28,
-  0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d,
-  0x3d, 0x5f, 0x29, 0x63, 0x6f, 0x6e, 0x74, 0x69, 0x6e, 0x75, 0x65, 0x3b,
-  0x6c, 0x65, 0x74, 0x20, 0x69, 0x3d, 0x65, 0x5b, 0x5f, 0x5d, 0x3b, 0x69,
-  0x66, 0x28, 0x69, 0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65,
-  0x6f, 0x66, 0x20, 0x68, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x29,
-  0x6e, 0x2e, 0x5f, 0x5f, 0x6e, 0x70, 0x3d, 0x74, 0x3d, 0x7b, 0x7d, 0x3b,
-  0x74, 0x5b, 0x5f, 0x5d, 0x3d, 0x69, 0x3b, 0x65, 0x5b, 0x5f, 0x5d, 0x3d,
-  0x69, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x74,
-  0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x42, 0x74, 0x28, 0x22, 0x5f, 0x5f,
-  0x72, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x4a,
-  0x74, 0x28, 0x29, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x2c, 0x5f, 0x3d,
-  0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x29, 0x7b,
-  0x5f, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x65,
-  0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x65, 0x29, 0x5f, 0x2e,
-  0x5f, 0x5f, 0x24, 0x75, 0x3d, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e,
-  0x3b, 0x77, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
-  0x28, 0x29, 0x7b, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, 0x29, 0x29,
-  0x3b, 0x6e, 0x2e, 0x63, 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x5f, 0x2e,
-  0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x5f, 0x2e, 0x73, 0x65,
-  0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x7d, 0x3b,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x28, 0x29, 0x7d,
-  0x47, 0x74, 0x3d, 0x5f, 0x3b, 0x4a, 0x74, 0x28, 0x65, 0x29, 0x3b, 0x74,
-  0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x42, 0x74, 0x28, 0x22, 0x5f, 0x5f,
-  0x65, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29,
-  0x3d, 0x3e, 0x7b, 0x4a, 0x74, 0x28, 0x29, 0x3b, 0x47, 0x74, 0x3d, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x28, 0x6e, 0x2c, 0x65, 0x2c,
-  0x5f, 0x29, 0x7d, 0x29, 0x3b, 0x42, 0x74, 0x28, 0x22, 0x64, 0x69, 0x66,
-  0x66, 0x65, 0x64, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e,
-  0x7b, 0x4a, 0x74, 0x28, 0x29, 0x3b, 0x47, 0x74, 0x3d, 0x76, 0x6f, 0x69,
-  0x64, 0x20, 0x30, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x3b, 0x69, 0x66,
-  0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74,
-  0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65,
-  0x26, 0x26, 0x28, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x29,
-  0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x6e,
-  0x70, 0x2c, 0x5f, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3b,
-  0x69, 0x66, 0x28, 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d,
-  0x65, 0x2e, 0x55, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x66, 0x6f, 0x72,
-  0x28, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29,
-  0x7b, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x3d, 0x6e, 0x5b, 0x65, 0x5d, 0x3b,
-  0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d,
-  0x5f, 0x26, 0x26, 0x21, 0x28, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29,
-  0x29, 0x7b, 0x5f, 0x2e, 0x64, 0x28, 0x29, 0x3b, 0x6e, 0x5b, 0x65, 0x5d,
-  0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x7d, 0x65, 0x6c, 0x73,
-  0x65, 0x7b, 0x6e, 0x3d, 0x7b, 0x7d, 0x3b, 0x65, 0x2e, 0x55, 0x3d, 0x6e,
-  0x7d, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69,
-  0x6e, 0x20, 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6f, 0x3d, 0x6e,
-  0x5b, 0x69, 0x5d, 0x2c, 0x72, 0x3d, 0x74, 0x5b, 0x69, 0x5d, 0x3b, 0x69,
-  0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6f,
-  0x29, 0x7b, 0x6f, 0x3d, 0x51, 0x74, 0x28, 0x65, 0x2c, 0x69, 0x2c, 0x72,
-  0x2c, 0x5f, 0x29, 0x3b, 0x6e, 0x5b, 0x69, 0x5d, 0x3d, 0x6f, 0x7d, 0x65,
-  0x6c, 0x73, 0x65, 0x20, 0x6f, 0x2e, 0x6f, 0x28, 0x72, 0x2c, 0x5f, 0x29,
-  0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x51, 0x74, 0x28, 0x74, 0x2c,
-  0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x69, 0x3d, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x26, 0x26, 0x76,
-  0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x6f, 0x77,
-  0x6e, 0x65, 0x72, 0x53, 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e,
-  0x74, 0x2c, 0x6f, 0x3d, 0x61, 0x28, 0x65, 0x29, 0x3b, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x7b, 0x6f, 0x3a, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d,
-  0x3e, 0x7b, 0x6f, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b,
-  0x5f, 0x3d, 0x6e, 0x7d, 0x2c, 0x64, 0x3a, 0x77, 0x28, 0x28, 0x29, 0x3d,
-  0x3e, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6f, 0x2e,
-  0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b,
-  0x69, 0x66, 0x28, 0x5f, 0x5b, 0x6e, 0x5d, 0x21, 0x3d, 0x3d, 0x65, 0x29,
-  0x7b, 0x5f, 0x5b, 0x6e, 0x5d, 0x3d, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x69,
-  0x29, 0x74, 0x5b, 0x6e, 0x5d, 0x3d, 0x65, 0x3b, 0x65, 0x6c, 0x73, 0x65,
-  0x20, 0x69, 0x66, 0x28, 0x65, 0x29, 0x74, 0x2e, 0x73, 0x65, 0x74, 0x41,
-  0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x2c, 0x65,
-  0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x74, 0x2e, 0x72, 0x65, 0x6d,
-  0x6f, 0x76, 0x65, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65,
-  0x28, 0x6e, 0x29, 0x7d, 0x7d, 0x29, 0x7d, 0x7d, 0x42, 0x74, 0x28, 0x22,
-  0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x2c, 0x28, 0x74, 0x2c,
-  0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72,
-  0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66,
-  0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x7b, 0x6c, 0x65, 0x74,
-  0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3b, 0x69, 0x66, 0x28,
-  0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74,
-  0x2e, 0x55, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x55,
-  0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28,
-  0x6c, 0x65, 0x74, 0x20, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x7b,
-  0x6c, 0x65, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x5b, 0x74, 0x5d, 0x3b, 0x69,
-  0x66, 0x28, 0x65, 0x29, 0x65, 0x2e, 0x64, 0x28, 0x29, 0x7d, 0x7d, 0x7d,
-  0x7d, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d,
-  0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x29, 0x7b,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f,
-  0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x5f,
-  0x5f, 0x24, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x6e,
-  0x2e, 0x64, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d,
-  0x29, 0x3b, 0x42, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x68, 0x22, 0x2c, 0x28,
-  0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x3d, 0x3e, 0x7b, 0x69,
-  0x66, 0x28, 0x5f, 0x3c, 0x33, 0x7c, 0x7c, 0x39, 0x3d, 0x3d, 0x3d, 0x5f,
-  0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74,
-  0x28, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7d, 0x29, 0x3b, 0x49, 0x2e,
-  0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x68,
-  0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e,
-  0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
-  0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x65, 0x26, 0x26,
-  0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x73,
-  0x7c, 0x7c, 0x34, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24,
-  0x66, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b,
-  0x69, 0x66, 0x28, 0x33, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f,
-  0x24, 0x66, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b,
-  0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x20, 0x69, 0x6e,
-  0x20, 0x6e, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b,
-  0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x20, 0x69, 0x6e,
-  0x20, 0x74, 0x29, 0x69, 0x66, 0x28, 0x22, 0x5f, 0x5f, 0x73, 0x6f, 0x75,
-  0x72, 0x63, 0x65, 0x22, 0x21, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x74, 0x5b,
-  0x5f, 0x5d, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72,
-  0x6f, 0x70, 0x73, 0x5b, 0x5f, 0x5d, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x21, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20,
-  0x5f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72,
-  0x6f, 0x70, 0x73, 0x29, 0x69, 0x66, 0x28, 0x21, 0x28, 0x5f, 0x20, 0x69,
-  0x6e, 0x20, 0x74, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21,
-  0x30, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, 0x3b,
-  0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x74, 0x28,
-  0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x44, 0x74,
-  0x28, 0x28, 0x29, 0x3d, 0x3e, 0x61, 0x28, 0x74, 0x29, 0x2c, 0x5b, 0x5d,
-  0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x59,
-  0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e,
-  0x3d, 0x4e, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x75, 0x72,
-  0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b, 0x47, 0x74, 0x2e, 0x5f, 0x5f,
-  0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x44, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x6d, 0x28, 0x28, 0x29,
-  0x3d, 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28,
-  0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74,
-  0x69, 0x6f, 0x6e, 0x20, 0x5a, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x4e, 0x74, 0x28, 0x74, 0x29, 0x3b,
-  0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b,
-  0x48, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x77, 0x28, 0x28, 0x29, 0x3d,
-  0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, 0x29,
-  0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x74, 0x6e,
-  0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c,
-  0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69,
-  0x3b, 0x6e, 0x5b, 0x30, 0x5d, 0x3d, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28,
-  0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x31, 0x3b, 0x6f, 0x3c, 0x6e, 0x2e,
-  0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6f, 0x2b, 0x2b, 0x29, 0x7b,
-  0x76, 0x61, 0x72, 0x20, 0x72, 0x3d, 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, 0x5d,
-  0x2c, 0x75, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3f, 0x28, 0x6e, 0x5b, 0x30,
-  0x5d, 0x7c, 0x3d, 0x72, 0x3f, 0x31, 0x3a, 0x32, 0x2c, 0x65, 0x5b, 0x6e,
-  0x5b, 0x6f, 0x2b, 0x2b, 0x5d, 0x5d, 0x29, 0x3a, 0x6e, 0x5b, 0x2b, 0x2b,
-  0x6f, 0x5d, 0x3b, 0x33, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x5f, 0x5b, 0x30,
-  0x5d, 0x3d, 0x75, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x5f, 0x5b,
-  0x31, 0x5d, 0x3d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x61, 0x73,
-  0x73, 0x69, 0x67, 0x6e, 0x28, 0x5f, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, 0x7b,
-  0x7d, 0x2c, 0x75, 0x29, 0x3a, 0x35, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x28,
-  0x5f, 0x5b, 0x31, 0x5d, 0x3d, 0x5f, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, 0x7b,
-  0x7d, 0x29, 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x3d, 0x75,
-  0x3a, 0x36, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x5f, 0x5b, 0x31, 0x5d, 0x5b,
-  0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x2b, 0x3d, 0x75, 0x2b, 0x22,
-  0x22, 0x3a, 0x72, 0x3f, 0x28, 0x69, 0x3d, 0x74, 0x2e, 0x61, 0x70, 0x70,
-  0x6c, 0x79, 0x28, 0x75, 0x2c, 0x74, 0x6e, 0x28, 0x74, 0x2c, 0x75, 0x2c,
-  0x65, 0x2c, 0x5b, 0x22, 0x22, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x5d, 0x29,
-  0x29, 0x2c, 0x5f, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x69, 0x29, 0x2c,
-  0x75, 0x5b, 0x30, 0x5d, 0x3f, 0x6e, 0x5b, 0x30, 0x5d, 0x7c, 0x3d, 0x32,
-  0x3a, 0x28, 0x6e, 0x5b, 0x6f, 0x2d, 0x32, 0x5d, 0x3d, 0x30, 0x2c, 0x6e,
-  0x5b, 0x6f, 0x5d, 0x3d, 0x69, 0x29, 0x29, 0x3a, 0x5f, 0x2e, 0x70, 0x75,
-  0x73, 0x68, 0x28, 0x75, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
-  0x20, 0x5f, 0x7d, 0x2c, 0x6e, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d,
-  0x61, 0x70, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
-  0x65, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d,
-  0x6e, 0x6e, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29,
-  0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7c, 0x7c, 0x28,
-  0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x2c, 0x6e, 0x6e,
-  0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, 0x29,
-  0x29, 0x2c, 0x28, 0x6e, 0x3d, 0x74, 0x6e, 0x28, 0x74, 0x68, 0x69, 0x73,
-  0x2c, 0x6e, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x29, 0x7c, 0x7c, 0x28,
-  0x6e, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x3d, 0x66, 0x75,
-  0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f,
-  0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x3d,
-  0x31, 0x2c, 0x69, 0x3d, 0x22, 0x22, 0x2c, 0x6f, 0x3d, 0x22, 0x22, 0x2c,
-  0x72, 0x3d, 0x5b, 0x30, 0x5d, 0x2c, 0x75, 0x3d, 0x66, 0x75, 0x6e, 0x63,
-  0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x31, 0x3d, 0x3d, 0x3d,
-  0x5f, 0x26, 0x26, 0x28, 0x74, 0x7c, 0x7c, 0x28, 0x69, 0x3d, 0x69, 0x2e,
-  0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73,
-  0x2a, 0x5c, 0x6e, 0x5c, 0x73, 0x2a, 0x7c, 0x5c, 0x73, 0x2a, 0x5c, 0x6e,
-  0x5c, 0x73, 0x2a, 0x24, 0x2f, 0x67, 0x2c, 0x22, 0x22, 0x29, 0x29, 0x29,
-  0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x30, 0x2c, 0x74, 0x2c,
-  0x69, 0x29, 0x3a, 0x33, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x28, 0x74,
-  0x7c, 0x7c, 0x69, 0x29, 0x3f, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68,
-  0x28, 0x33, 0x2c, 0x74, 0x2c, 0x69, 0x29, 0x2c, 0x5f, 0x3d, 0x32, 0x29,
-  0x3a, 0x32, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x22, 0x2e, 0x2e, 0x2e,
-  0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x74, 0x3f, 0x72, 0x2e, 0x70,
-  0x75, 0x73, 0x68, 0x28, 0x34, 0x2c, 0x74, 0x2c, 0x30, 0x29, 0x3a, 0x32,
-  0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x69, 0x26, 0x26, 0x21, 0x74, 0x3f,
-  0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x35, 0x2c, 0x30, 0x2c, 0x21,
-  0x30, 0x2c, 0x69, 0x29, 0x3a, 0x5f, 0x3e, 0x3d, 0x35, 0x26, 0x26, 0x28,
-  0x28, 0x69, 0x7c, 0x7c, 0x21, 0x74, 0x26, 0x26, 0x35, 0x3d, 0x3d, 0x3d,
-  0x5f, 0x29, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28,
-  0x5f, 0x2c, 0x30, 0x2c, 0x69, 0x2c, 0x65, 0x29, 0x2c, 0x5f, 0x3d, 0x36,
-  0x29, 0x2c, 0x74, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68,
-  0x28, 0x5f, 0x2c, 0x74, 0x2c, 0x30, 0x2c, 0x65, 0x29, 0x2c, 0x5f, 0x3d,
-  0x36, 0x29, 0x29, 0x2c, 0x69, 0x3d, 0x22, 0x22, 0x7d, 0x2c, 0x66, 0x3d,
-  0x30, 0x3b, 0x66, 0x3c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68,
-  0x3b, 0x66, 0x2b, 0x2b, 0x29, 0x7b, 0x66, 0x26, 0x26, 0x28, 0x31, 0x3d,
-  0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x75, 0x28, 0x29, 0x2c, 0x75, 0x28, 0x66,
-  0x29, 0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x73,
-  0x3d, 0x30, 0x3b, 0x73, 0x3c, 0x74, 0x5b, 0x66, 0x5d, 0x2e, 0x6c, 0x65,
-  0x6e, 0x67, 0x74, 0x68, 0x3b, 0x73, 0x2b, 0x2b, 0x29, 0x6e, 0x3d, 0x74,
-  0x5b, 0x66, 0x5d, 0x5b, 0x73, 0x5d, 0x2c, 0x31, 0x3d, 0x3d, 0x3d, 0x5f,
-  0x3f, 0x22, 0x3c, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28,
-  0x29, 0x2c, 0x72, 0x3d, 0x5b, 0x72, 0x5d, 0x2c, 0x5f, 0x3d, 0x33, 0x29,
-  0x3a, 0x69, 0x2b, 0x3d, 0x6e, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x5f, 0x3f,
-  0x22, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x22, 0x3e,
-  0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x5f, 0x3d, 0x31, 0x2c, 0x69,
-  0x3d, 0x22, 0x22, 0x29, 0x3a, 0x69, 0x3d, 0x6e, 0x2b, 0x69, 0x5b, 0x30,
-  0x5d, 0x3a, 0x6f, 0x3f, 0x6e, 0x3d, 0x3d, 0x3d, 0x6f, 0x3f, 0x6f, 0x3d,
-  0x22, 0x22, 0x3a, 0x69, 0x2b, 0x3d, 0x6e, 0x3a, 0x27, 0x22, 0x27, 0x3d,
-  0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x27, 0x22, 0x3d, 0x3d, 0x3d, 0x6e,
-  0x3f, 0x6f, 0x3d, 0x6e, 0x3a, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e,
-  0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x5f, 0x3d, 0x31, 0x29, 0x3a, 0x5f,
-  0x26, 0x26, 0x28, 0x22, 0x3d, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28,
-  0x5f, 0x3d, 0x35, 0x2c, 0x65, 0x3d, 0x69, 0x2c, 0x69, 0x3d, 0x22, 0x22,
-  0x29, 0x3a, 0x22, 0x2f, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x28,
-  0x5f, 0x3c, 0x35, 0x7c, 0x7c, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x74,
-  0x5b, 0x66, 0x5d, 0x5b, 0x73, 0x2b, 0x31, 0x5d, 0x29, 0x3f, 0x28, 0x75,
-  0x28, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x28, 0x72,
-  0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2c, 0x5f, 0x3d, 0x72, 0x2c, 0x28,
-  0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2e, 0x70, 0x75, 0x73, 0x68,
-  0x28, 0x32, 0x2c, 0x30, 0x2c, 0x5f, 0x29, 0x2c, 0x5f, 0x3d, 0x30, 0x29,
-  0x3a, 0x22, 0x20, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c,
-  0x74, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x6e, 0x22,
-  0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x72, 0x22, 0x3d, 0x3d,
-  0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x5f, 0x3d, 0x32, 0x29,
-  0x3a, 0x69, 0x2b, 0x3d, 0x6e, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x5f,
-  0x26, 0x26, 0x22, 0x21, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x26,
-  0x26, 0x28, 0x5f, 0x3d, 0x34, 0x2c, 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d,
-  0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x28, 0x29,
-  0x2c, 0x72, 0x7d, 0x28, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x29, 0x2c, 0x61,
-  0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x5b, 0x5d, 0x29,
-  0x29, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x31, 0x3f, 0x6e,
-  0x3a, 0x6e, 0x5b, 0x30, 0x5d, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x6e,
-  0x3d, 0x65, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x4c, 0x29, 0x3b,
-  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x7b, 0x49, 0x20, 0x61, 0x73, 0x20,
-  0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x2c, 0x6a, 0x20,
-  0x61, 0x73, 0x20, 0x46, 0x72, 0x61, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x2c,
-  0x68, 0x20, 0x61, 0x73, 0x20, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c,
-  0x5f, 0x20, 0x61, 0x73, 0x20, 0x62, 0x61, 0x74, 0x63, 0x68, 0x2c, 0x63,
-  0x74, 0x20, 0x61, 0x73, 0x20, 0x63, 0x6c, 0x6f, 0x6e, 0x65, 0x45, 0x6c,
-  0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x6d, 0x20, 0x61, 0x73, 0x20, 0x63,
-  0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x68, 0x74, 0x20, 0x61,
-  0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, 0x74,
-  0x65, 0x78, 0x74, 0x2c, 0x4c, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65,
-  0x61, 0x74, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x52,
-  0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65,
-  0x66, 0x2c, 0x77, 0x20, 0x61, 0x73, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63,
-  0x74, 0x2c, 0x4c, 0x20, 0x61, 0x73, 0x20, 0x68, 0x2c, 0x5f, 0x6e, 0x20,
-  0x61, 0x73, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x6c, 0x74, 0x20, 0x61,
-  0x73, 0x20, 0x68, 0x79, 0x64, 0x72, 0x61, 0x74, 0x65, 0x2c, 0x55, 0x20,
-  0x61, 0x73, 0x20, 0x69, 0x73, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x45, 0x6c,
-  0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x43, 0x20, 0x61, 0x73, 0x20, 0x6f,
-  0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2c, 0x73, 0x74, 0x20, 0x61, 0x73,
-  0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x61, 0x20, 0x61, 0x73,
-  0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x58, 0x20, 0x61, 0x73,
-  0x20, 0x74, 0x6f, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x41, 0x72, 0x72, 0x61,
-  0x79, 0x2c, 0x75, 0x20, 0x61, 0x73, 0x20, 0x75, 0x6e, 0x74, 0x72, 0x61,
-  0x63, 0x6b, 0x65, 0x64, 0x2c, 0x54, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75,
-  0x73, 0x65, 0x43, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2c, 0x59,
-  0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x43, 0x6f, 0x6d, 0x70,
-  0x75, 0x74, 0x65, 0x64, 0x2c, 0x56, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75,
-  0x73, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x2c, 0x41, 0x74,
-  0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67,
-  0x56, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x48, 0x74, 0x20, 0x61, 0x73, 0x20,
-  0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x46, 0x74,
-  0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x45, 0x72, 0x72, 0x6f, 0x72,
-  0x42, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x2c, 0x4d, 0x74, 0x20,
-  0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x49, 0x64, 0x2c, 0x24, 0x74, 0x20,
-  0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x49, 0x6d, 0x70, 0x65, 0x72, 0x61,
-  0x74, 0x69, 0x76, 0x65, 0x48, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x50,
-  0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x4c, 0x61, 0x79, 0x6f,
-  0x75, 0x74, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x44, 0x74, 0x20,
-  0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x2c, 0x55,
-  0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x64, 0x75,
-  0x63, 0x65, 0x72, 0x2c, 0x4e, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73,
-  0x65, 0x52, 0x65, 0x66, 0x2c, 0x58, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75,
-  0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x5a, 0x74, 0x20,
-  0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c,
-  0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x45, 0x74, 0x20, 0x61, 0x73,
-  0x20, 0x75, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x7d, 0x3b, 0x0a
-};
-unsigned int index_js_len = 22800;
diff --git a/examples/server/json-schema-to-grammar.mjs.hpp b/examples/server/json-schema-to-grammar.mjs.hpp
deleted file mode 100644
index 0a05c369d..000000000
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ /dev/null
@@ -1,311 +0,0 @@
-unsigned char json_schema_to_grammar_mjs[] = {
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f,
-  0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f,
-  0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52,
-  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
-  0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
-  0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65,
-  0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29,
-  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e,
-  0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22,
-  0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b,
-  0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29,
-  0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d,
-  0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d,
-  0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f,
-  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69,
-  0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d,
-  0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20,
-  0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a,
-  0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20,
-  0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22,
-  0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20,
-  0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c,
-  0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c,
-  0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75,
-  0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
-  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
-  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
-  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
-  0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20,
-  0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
-  0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22,
-  0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
-  0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45,
-  0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20,
-  0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d,
-  0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45,
-  0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52,
-  0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d,
-  0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52,
-  0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41,
-  0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20,
-  0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27,
-  0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e,
-  0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22,
-  0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
-  0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
-  0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f,
-  0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d,
-  0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c,
-  0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20,
-  0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c,
-  0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63,
-  0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55,
-  0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
-  0x61, 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53,
-  0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79,
-  0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65,
-  0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54,
-  0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f,
-  0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20,
-  0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c,
-  0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50,
-  0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d,
-  0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f,
-  0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65,
-  0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d,
-  0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70,
-  0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44,
-  0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f,
-  0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20,
-  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
-  0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73,
-  0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28,
-  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d,
-  0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
-  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28,
-  0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
-  0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b,
-  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d,
-  0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65,
-  0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72,
-  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20,
-  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73,
-  0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20,
-  0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70,
-  0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20,
-  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f,
-  0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f,
-  0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
-  0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c,
-  0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
-  0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68,
-  0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d,
-  0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
-  0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69,
-  0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
-  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
-  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
-  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e,
-  0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
-  0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65,
-  0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72,
-  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
-  0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
-  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c,
-  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
-  0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65,
-  0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
-  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
-  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d,
-  0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69,
-  0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f,
-  0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c,
-  0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
-  0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63,
-  0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
-  0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26,
-  0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73,
-  0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69,
-  0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
-  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
-  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72,
-  0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
-  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70,
-  0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f,
-  0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65,
-  0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f,
-  0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72,
-  0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73,
-  0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f,
-  0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20,
-  0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74,
-  0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79,
-  0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
-  0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d,
-  0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20,
-  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b,
-  0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69,
-  0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42,
-  0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72,
-  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d,
-  0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65,
-  0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
-  0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49,
-  0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64,
-  0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e,
-  0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72,
-  0x65, 0x28, 0x62, 0x5b, 0x30, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
-  0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70,
-  0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63,
-  0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65,
-  0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
-  0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
-  0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73,
-  0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d,
-  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
-  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
-  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e,
-  0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20,
-  0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27,
-  0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20,
-  0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
-  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
-  0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29,
-  0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20,
-  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70,
-  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20,
-  0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64,
-  0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61,
-  0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66,
-  0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
-  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27,
-  0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20,
-  0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f,
-  0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74,
-  0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
-  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
-  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75,
-  0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69,
-  0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65,
-  0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24,
-  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65,
-  0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d,
-  0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65,
-  0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63,
-  0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c,
-  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20,
-  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d,
-  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29,
-  0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
-  0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64,
-  0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
-  0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52,
-  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
-  0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
-  0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f,
-  0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d,
-  0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
-  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65,
-  0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61,
-  0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65,
-  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20,
-  0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73,
-  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49,
-  0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73,
-  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72,
-  0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67,
-  0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72,
-  0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
-  0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20,
-  0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d,
-  0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d,
-  0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a
-};
-unsigned int json_schema_to_grammar_mjs_len = 3695;
diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp
deleted file mode 100644
index ff4ad6994..000000000
--- a/examples/server/oai.hpp
+++ /dev/null
@@ -1,225 +0,0 @@
-#pragma once
-
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
-
-#include "json.hpp"
-#include "utils.hpp"
-
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
-
-using json = nlohmann::json;
-
-inline static json oaicompat_completion_params_parse(
-    const struct llama_model * model,
-    const json &body, /* openai api json semantics */
-    const std::string &chat_template)
-{
-    json llama_params;
-
-    llama_params["__oaicompat"] = true;
-
-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"]            = format_chat(model, chat_template, body["messages"]);
-    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
-    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
-    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
-    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
-    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
-    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
-    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
-    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
-
-    if (body.count("grammar") != 0) {
-        llama_params["grammar"] = json_value(body, "grammar", json::object());
-    }
-
-    // Handle 'stop' field
-    if (body.contains("stop") && body["stop"].is_string()) {
-        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
-    } else {
-        llama_params["stop"] = json_value(body, "stop", json::array());
-    }
-
-    // Ensure there is ChatML-specific end sequence among stop words
-    llama_params["stop"].push_back("<|im_end|>");
-
-    return llama_params;
-}
-
-inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
-{
-    json result = response.result_json;
-
-    bool stopped_word        = result.count("stopped_word") != 0;
-    bool stopped_eos         = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
-    std::string content      = json_value(result, "content", std::string(""));
-
-    std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-
-    json choices =
-        streaming ? json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}})
-                  : json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"message", json{{"content", content},
-                                                         {"role", "assistant"}}}}});
-
-    std::time_t t = std::time(0);
-
-    json res =
-        json{{"choices", choices},
-            {"created", t},
-            {"model",
-                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-            {"usage",
-                json{{"completion_tokens", num_tokens_predicted},
-                     {"prompt_tokens",     num_prompt_tokens},
-                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
-            {"id", gen_chatcmplid()}};
-
-    if (server_verbose) {
-        res["__verbose"] = result;
-    }
-
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-
-    return res;
-}
-
-// return value is vector as there is one case where we might need to generate two responses
-inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
-    json result = response.result_json;
-
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({response.result_json});
-    }
-
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
-    bool stopped_word   = json_value(result, "stopped_word", false);
-    bool stopped_eos    = json_value(result, "stopped_eos", false);
-    bool stopped_limit  = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
-
-    std::string finish_reason;
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    if (stopped_limit) {
-        finish_reason = "length";
-    }
-
-    std::time_t t = std::time(0);
-
-    json choices;
-
-    if (!finish_reason.empty()) {
-        choices = json::array({json{{"finish_reason", finish_reason},
-                                    {"index", 0},
-                                    {"delta", json::object()}}});
-    } else {
-        if (first) {
-            if (content.empty()) {
-                choices = json::array({json{{"finish_reason", nullptr},
-                                            {"index", 0},
-                                            {"delta", json{{"role", "assistant"}}}}});
-            } else {
-                // We have to send this as two updates to conform to openai behavior
-                json initial_ret = json{{"choices", json::array({json{
-                                        {"finish_reason", nullptr},
-                                        {"index", 0},
-                                        {"delta", json{
-                                            {"role", "assistant"}
-                                        }}}})},
-                            {"created", t},
-                            {"id", gen_chatcmplid()},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                json second_ret = json{
-                            {"choices", json::array({json{{"finish_reason", nullptr},
-                                                            {"index", 0},
-                                                            {"delta", json{
-                                                            {"content", content}}}
-                                                            }})},
-                            {"created", t},
-                            {"id", gen_chatcmplid()},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-
-                return std::vector<json>({initial_ret, second_ret});
-            }
-        } else {
-            // Some idiosyncrasy in task processing logic makes several trailing calls
-            // with empty content, we ignore these at the calee site.
-            if (content.empty()) {
-                return std::vector<json>({json::object()});
-            }
-
-            choices = json::array({json{
-                {"finish_reason", nullptr},
-                {"index", 0},
-                {"delta",
-                json{
-                    {"content", content},
-                }},
-            }});
-        }
-    }
-
-    json ret = json{{"choices", choices},
-                    {"created", t},
-                    {"id", gen_chatcmplid()},
-                    {"model", modelname},
-                    {"object", "chat.completion.chunk"}};
-
-    return std::vector<json>({ret});
-}
-
-inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
-{
-    json res =
-        json{
-            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", "list"},
-            {"usage",
-                json{{"prompt_tokens", 0},
-                     {"total_tokens", 0}}},
-            {"data", embeddings}
-        };
-    return res;
-}
-
diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz
new file mode 100644
index 000000000..141e80920
Binary files /dev/null and b/examples/server/public/index.html.gz differ
diff --git a/examples/server/public/index.js b/examples/server/public/index.js
deleted file mode 100644
index 9db5a9d9f..000000000
--- a/examples/server/public/index.js
+++ /dev/null
@@ -1 +0,0 @@
-function t(){throw new Error("Cycle detected")}function n(){if(u>1){u--;return}let t,n=!1;while(void 0!==_){let i=_;_=void 0;f++;while(void 0!==i){const _=i.o;i.o=void 0;i.f&=-3;if(!(8&i.f)&&a(i))try{i.c()}catch(e){if(!n){t=e;n=!0}}i=_}}f=0;u--;if(n)throw t}function e(t){if(u>0)return t();u++;try{return t()}finally{n()}}let i,_,o=0;function r(t){if(o>0)return t();const n=i;i=void 0;o++;try{return t()}finally{o--;i=n}}let u=0,f=0,l=0;function s(t){if(void 0===i)return;let n=t.n;if(void 0===n||n.t!==i){n={i:0,S:t,p:i.s,n:void 0,t:i,e:void 0,x:void 0,r:n};if(void 0!==i.s)i.s.n=n;i.s=n;t.n=n;if(32&i.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=i.s;n.n=void 0;i.s.n=n;i.s=n}return n}}function c(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}c.prototype.h=function(){return!0};c.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};c.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};c.prototype.subscribe=function(t){const n=this;return S((function(){const e=n.value,i=32&this.f;this.f&=-33;try{t(e)}finally{this.f|=i}}))};c.prototype.valueOf=function(){return this.value};c.prototype.toString=function(){return this.value+""};c.prototype.toJSON=function(){return this.value};c.prototype.peek=function(){return this.v};Object.defineProperty(c.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(e){if(i instanceof v)!function(){throw new Error("Computed cannot have side-effects")}();if(e!==this.v){if(f>100)t();this.v=e;this.i++;l++;u++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function h(t){return new c(t)}function a(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function p(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function d(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function v(t){c.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(v.prototype=new c).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!a(this)){this.f&=-2;return!0}const t=i;try{p(this);i=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}i=t;d(this);this.f&=-2;return!0};v.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}c.prototype.S.call(this,t)};v.prototype.U=function(t){if(void 0!==this.t){c.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};v.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};v.prototype.peek=function(){if(!this.h())t();if(16&this.f)throw this.v;return this.v};Object.defineProperty(v.prototype,"value",{get(){if(1&this.f)t();const n=s(this);this.h();if(void 0!==n)n.i=this.i;if(16&this.f)throw this.v;return this.v}});function y(t){return new v(t)}function m(t){const e=t.u;t.u=void 0;if("function"==typeof e){u++;const _=i;i=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;g(t);throw n}finally{i=_;n()}}}function g(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;m(t)}function b(t){if(i!==this)throw new Error("Out-of-order effect");d(this);i=t;this.f&=-2;if(8&this.f)g(this);n()}function k(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}k.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};k.prototype.S=function(){if(1&this.f)t();this.f|=1;this.f&=-9;m(this);p(this);u++;const n=i;i=this;return b.bind(this,n)};k.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=_;_=this}};k.prototype.d=function(){this.f|=8;if(!(1&this.f))g(this)};function S(t){const n=new k(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var x,w,C,E,U,H,N,P,$,D={},T=[],V=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,A=Array.isArray;function F(t,n){for(var e in n)t[e]=n[e];return t}function M(t){var n=t.parentNode;n&&n.removeChild(t)}function W(t,n,e){var i,_,o,r={};for(o in n)"key"==o?i=n[o]:"ref"==o?_=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?x.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return O(t,r,i,_,null)}function O(t,n,e,i,_){var o={type:t,props:n,key:e,ref:i,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,__h:null,constructor:void 0,__v:null==_?++C:_};return null==_&&null!=w.vnode&&w.vnode(o),o}function L(){return{current:null}}function R(t){return t.children}function I(t,n){this.props=t,this.context=n}function j(t,n){if(null==n)return t.__?j(t.__,t.__.__k.indexOf(t)+1):null;for(var e;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e)return e.__e;return"function"==typeof t.type?j(t):null}function B(t){var n,e;if(null!=(t=t.__)&&null!=t.__c){for(t.__e=t.__c.base=null,n=0;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e){t.__e=t.__c.base=e.__e;break}return B(t)}}function q(t){(!t.__d&&(t.__d=!0)&&U.push(t)&&!G.__r++||H!==w.debounceRendering)&&((H=w.debounceRendering)||N)(G)}function G(){var t,n,e,i,_,o,r,u,f;for(U.sort(P);t=U.shift();)t.__d&&(n=U.length,i=void 0,_=void 0,o=void 0,u=(r=(e=t).__v).__e,(f=e.__P)&&(i=[],_=[],(o=F({},r)).__v=r.__v+1,it(f,r,o,e.__n,void 0!==f.ownerSVGElement,null!=r.__h?[u]:null,i,null==u?j(r):u,r.__h,_),_t(i,r,_),r.__e!=u&&B(r)),U.length>n&&U.sort(P));G.__r=0}function z(t,n,e,i,_,o,r,u,f,l,s){var c,h,a,p,d,v,y,m,g,b,k=0,S=i&&i.__k||T,x=S.length,w=x,C=n.length;for(e.__k=[],c=0;c<C;c++)null!=(p=e.__k[c]=null==(p=n[c])||"boolean"==typeof p||"function"==typeof p?null:"string"==typeof p||"number"==typeof p||"bigint"==typeof p?O(null,p,null,null,p):A(p)?O(R,{children:p},null,null,null):p.__b>0?O(p.type,p.props,p.key,p.ref?p.ref:null,p.__v):p)&&(p.__=e,p.__b=e.__b+1,-1===(m=X(p,S,y=c+k,w))?a=D:(a=S[m]||D,S[m]=void 0,w--),it(t,p,a,_,o,r,u,f,l,s),d=p.__e,(h=p.ref)&&a.ref!=h&&(a.ref&&rt(a.ref,null,p),s.push(h,p.__c||d,p)),null!=d&&(null==v&&(v=d),b=!(g=a===D||null===a.__v)&&m===y,g?-1==m&&k--:m!==y&&(m===y+1?(k++,b=!0):m>y?w>C-y?(k+=m-y,b=!0):k--:k=m<y&&m==y-1?m-y:0),y=c+k,b=b||m==c&&!g,"function"!=typeof p.type||m===y&&a.__k!==p.__k?"function"==typeof p.type||b?void 0!==p.__d?(f=p.__d,p.__d=void 0):f=d.nextSibling:f=Q(t,d,f):f=J(p,f,t),"function"==typeof e.type&&(e.__d=f)));for(e.__e=v,c=x;c--;)null!=S[c]&&("function"==typeof e.type&&null!=S[c].__e&&S[c].__e==e.__d&&(e.__d=S[c].__e.nextSibling),ut(S[c],S[c]))}function J(t,n,e){for(var i,_=t.__k,o=0;_&&o<_.length;o++)(i=_[o])&&(i.__=t,n="function"==typeof i.type?J(i,n,e):Q(e,i.__e,n));return n}function K(t,n){return n=n||[],null==t||"boolean"==typeof t||(A(t)?t.some((function(t){K(t,n)})):n.push(t)),n}function Q(t,n,e){return null==e||e.parentNode!==t?t.insertBefore(n,null):n==e&&null!=n.parentNode||t.insertBefore(n,e),n.nextSibling}function X(t,n,e,i){var _=t.key,o=t.type,r=e-1,u=e+1,f=n[e];if(null===f||f&&_==f.key&&o===f.type)return e;if(i>(null!=f?1:0))for(;r>=0||u<n.length;){if(r>=0){if((f=n[r])&&_==f.key&&o===f.type)return r;r--}if(u<n.length){if((f=n[u])&&_==f.key&&o===f.type)return u;u++}}return-1}function Y(t,n,e,i,_){var o;for(o in e)"children"===o||"key"===o||o in n||tt(t,o,null,e[o],i);for(o in n)_&&"function"!=typeof n[o]||"children"===o||"key"===o||"value"===o||"checked"===o||e[o]===n[o]||tt(t,o,n[o],e[o],i)}function Z(t,n,e){"-"===n[0]?t.setProperty(n,null==e?"":e):t[n]=null==e?"":"number"!=typeof e||V.test(n)?e:e+"px"}function tt(t,n,e,i,_){var o;t:if("style"===n)if("string"==typeof e)t.style.cssText=e;else{if("string"==typeof i&&(t.style.cssText=i=""),i)for(n in i)e&&n in e||Z(t.style,n,"");if(e)for(n in e)i&&e[n]===i[n]||Z(t.style,n,e[n])}else if("o"===n[0]&&"n"===n[1])o=n!==(n=n.replace(/Capture$/,"")),n=n.toLowerCase()in t?n.toLowerCase().slice(2):n.slice(2),t.l||(t.l={}),t.l[n+o]=e,e?i||t.addEventListener(n,o?et:nt,o):t.removeEventListener(n,o?et:nt,o);else if("dangerouslySetInnerHTML"!==n){if(_)n=n.replace(/xlink(H|:h)/,"h").replace(/sName$/,"s");else if("width"!==n&&"height"!==n&&"href"!==n&&"list"!==n&&"form"!==n&&"tabIndex"!==n&&"download"!==n&&"rowSpan"!==n&&"colSpan"!==n&&n in t)try{t[n]=null==e?"":e;break t}catch(t){}"function"==typeof e||(null==e||!1===e&&"-"!==n[4]?t.removeAttribute(n):t.setAttribute(n,e))}}function nt(t){return this.l[t.type+!1](w.event?w.event(t):t)}function et(t){return this.l[t.type+!0](w.event?w.event(t):t)}function it(t,n,e,i,_,o,r,u,f,l){var s,c,h,a,p,d,v,y,m,g,b,k,S,x,C,E=n.type;if(void 0!==n.constructor)return null;null!=e.__h&&(f=e.__h,u=n.__e=e.__e,n.__h=null,o=[u]),(s=w.__b)&&s(n);try{t:if("function"==typeof E){if(y=n.props,m=(s=E.contextType)&&i[s.__c],g=s?m?m.props.value:s.__:i,e.__c?v=(c=n.__c=e.__c).__=c.__E:("prototype"in E&&E.prototype.render?n.__c=c=new E(y,g):(n.__c=c=new I(y,g),c.constructor=E,c.render=ft),m&&m.sub(c),c.props=y,c.state||(c.state={}),c.context=g,c.__n=i,h=c.__d=!0,c.__h=[],c._sb=[]),null==c.__s&&(c.__s=c.state),null!=E.getDerivedStateFromProps&&(c.__s==c.state&&(c.__s=F({},c.__s)),F(c.__s,E.getDerivedStateFromProps(y,c.__s))),a=c.props,p=c.state,c.__v=n,h)null==E.getDerivedStateFromProps&&null!=c.componentWillMount&&c.componentWillMount(),null!=c.componentDidMount&&c.__h.push(c.componentDidMount);else{if(null==E.getDerivedStateFromProps&&y!==a&&null!=c.componentWillReceiveProps&&c.componentWillReceiveProps(y,g),!c.__e&&(null!=c.shouldComponentUpdate&&!1===c.shouldComponentUpdate(y,c.__s,g)||n.__v===e.__v)){for(n.__v!==e.__v&&(c.props=y,c.state=c.__s,c.__d=!1),n.__e=e.__e,n.__k=e.__k,n.__k.forEach((function(t){t&&(t.__=n)})),b=0;b<c._sb.length;b++)c.__h.push(c._sb[b]);c._sb=[],c.__h.length&&r.push(c);break t}null!=c.componentWillUpdate&&c.componentWillUpdate(y,c.__s,g),null!=c.componentDidUpdate&&c.__h.push((function(){c.componentDidUpdate(a,p,d)}))}if(c.context=g,c.props=y,c.__P=t,c.__e=!1,k=w.__r,S=0,"prototype"in E&&E.prototype.render){for(c.state=c.__s,c.__d=!1,k&&k(n),s=c.render(c.props,c.state,c.context),x=0;x<c._sb.length;x++)c.__h.push(c._sb[x]);c._sb=[]}else do{c.__d=!1,k&&k(n),s=c.render(c.props,c.state,c.context),c.state=c.__s}while(c.__d&&++S<25);c.state=c.__s,null!=c.getChildContext&&(i=F(F({},i),c.getChildContext())),h||null==c.getSnapshotBeforeUpdate||(d=c.getSnapshotBeforeUpdate(a,p)),z(t,A(C=null!=s&&s.type===R&&null==s.key?s.props.children:s)?C:[C],n,e,i,_,o,r,u,f,l),c.base=n.__e,n.__h=null,c.__h.length&&r.push(c),v&&(c.__E=c.__=null)}else null==o&&n.__v===e.__v?(n.__k=e.__k,n.__e=e.__e):n.__e=ot(e.__e,n,e,i,_,o,r,f,l);(s=w.diffed)&&s(n)}catch(t){n.__v=null,(f||null!=o)&&(n.__e=u,n.__h=!!f,o[o.indexOf(u)]=null),w.__e(t,n,e)}}function _t(t,n,e){for(var i=0;i<e.length;i++)rt(e[i],e[++i],e[++i]);w.__c&&w.__c(n,t),t.some((function(n){try{t=n.__h,n.__h=[],t.some((function(t){t.call(n)}))}catch(t){w.__e(t,n.__v)}}))}function ot(t,n,e,i,_,o,r,u,f){var l,s,c,h=e.props,a=n.props,p=n.type,d=0;if("svg"===p&&(_=!0),null!=o)for(;d<o.length;d++)if((l=o[d])&&"setAttribute"in l==!!p&&(p?l.localName===p:3===l.nodeType)){t=l,o[d]=null;break}if(null==t){if(null===p)return document.createTextNode(a);t=_?document.createElementNS("http://www.w3.org/2000/svg",p):document.createElement(p,a.is&&a),o=null,u=!1}if(null===p)h===a||u&&t.data===a||(t.data=a);else{if(o=o&&x.call(t.childNodes),s=(h=e.props||D).dangerouslySetInnerHTML,c=a.dangerouslySetInnerHTML,!u){if(null!=o)for(h={},d=0;d<t.attributes.length;d++)h[t.attributes[d].name]=t.attributes[d].value;(c||s)&&(c&&(s&&c.__html==s.__html||c.__html===t.innerHTML)||(t.innerHTML=c&&c.__html||""))}if(Y(t,a,h,_,u),c)n.__k=[];else if(z(t,A(d=n.props.children)?d:[d],n,e,i,_&&"foreignObject"!==p,o,r,o?o[0]:e.__k&&j(e,0),u,f),null!=o)for(d=o.length;d--;)null!=o[d]&&M(o[d]);u||("value"in a&&void 0!==(d=a.value)&&(d!==t.value||"progress"===p&&!d||"option"===p&&d!==h.value)&&tt(t,"value",d,h.value,!1),"checked"in a&&void 0!==(d=a.checked)&&d!==t.checked&&tt(t,"checked",d,h.checked,!1))}return t}function rt(t,n,e){try{"function"==typeof t?t(n):t.current=n}catch(t){w.__e(t,e)}}function ut(t,n,e){var i,_;if(w.unmount&&w.unmount(t),(i=t.ref)&&(i.current&&i.current!==t.__e||rt(i,null,n)),null!=(i=t.__c)){if(i.componentWillUnmount)try{i.componentWillUnmount()}catch(t){w.__e(t,n)}i.base=i.__P=null,t.__c=void 0}if(i=t.__k)for(_=0;_<i.length;_++)i[_]&&ut(i[_],n,e||"function"!=typeof t.type);e||null==t.__e||M(t.__e),t.__=t.__e=t.__d=void 0}function ft(t,n,e){return this.constructor(t,e)}function lt(t,n,e){var i,_,o,r;w.__&&w.__(t,n),_=(i="function"==typeof e)?null:e&&e.__k||n.__k,o=[],r=[],it(n,t=(!i&&e||n).__k=W(R,null,[t]),_||D,D,void 0!==n.ownerSVGElement,!i&&e?[e]:_?null:n.firstChild?x.call(n.childNodes):null,o,!i&&e?e:_?_.__e:n.firstChild,i,r),_t(o,t,r)}function st(t,n){lt(t,n,st)}function ct(t,n,e){var i,_,o,r,u=F({},t.props);for(o in t.type&&t.type.defaultProps&&(r=t.type.defaultProps),n)"key"==o?i=n[o]:"ref"==o?_=n[o]:u[o]=void 0===n[o]&&void 0!==r?r[o]:n[o];return arguments.length>2&&(u.children=arguments.length>3?x.call(arguments,2):e),O(t.type,u,i||t.key,_||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+$++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,i;return this.getChildContext||(e=[],(i={})[n]=this,this.getChildContext=function(){return i},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,q(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}x=T.slice,w={__e:function(t,n,e,i){for(var _,o,r;n=n.__;)if((_=n.__c)&&!_.__)try{if((o=_.constructor)&&null!=o.getDerivedStateFromError&&(_.setState(o.getDerivedStateFromError(t)),r=_.__d),null!=_.componentDidCatch&&(_.componentDidCatch(t,i||{}),r=_.__d),r)return _.__E=_}catch(n){t=n}throw t}},C=0,E=function(t){return null!=t&&void 0===t.constructor},I.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=F({},this.state),"function"==typeof t&&(t=t(F({},e),this.props)),t&&F(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),q(this))},I.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),q(this))},I.prototype.render=R,U=[],N="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},G.__r=0,$=0;var at,pt,dt,vt,yt=0,mt=[],gt=[],bt=w.__b,kt=w.__r,St=w.diffed,xt=w.__c,wt=w.unmount;function Ct(t,n){w.__h&&w.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:gt}),e.__[t]}function Et(t){return yt=1,Ut(Bt,t)}function Ut(t,n,e){var i=Ct(at++,2);if(i.t=t,!i.__c&&(i.__=[e?e(n):Bt(void 0,n),function(t){var n=i.__N?i.__N[0]:i.__[0],e=i.t(n,t);n!==e&&(i.__N=[e,i.__[1]],i.__c.setState({}))}],i.__c=pt,!pt.u)){var _=function(t,n,e){if(!i.__c.__H)return!0;var _=i.__c.__H.__.filter((function(t){return t.__c}));if(_.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return _.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&i.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var i=o;o=void 0,_(t,n,e),o=i}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=_}return i.__N||i.__}function Ht(t,n){var e=Ct(at++,3);!w.__s&&jt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ct(at++,4);!w.__s&&jt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function Pt(t){return yt=5,Dt((function(){return{current:t}}),[])}function $t(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ct(at++,7);return jt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Tt(t,n){return yt=8,Dt((function(){return t}),n)}function Vt(t){var n=pt.context[t.__c],e=Ct(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){w.useDebugValue&&w.useDebugValue(n?n(t):t)}function Ft(t){var n=Ct(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,i){n.__&&n.__(t,i),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Mt(){var t=Ct(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Wt(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Rt),t.__H.__h.forEach(It),t.__H.__h=[]}catch(u){t.__H.__h=[],w.__e(u,t.__v)}}w.__b=function(t){pt=null,bt&&bt(t)},w.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=gt,t.__N=t.i=void 0}))):(n.__h.forEach(Rt),n.__h.forEach(It),n.__h=[],at=0)),dt=pt},w.diffed=function(t){St&&St(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===w.requestAnimationFrame||((vt=w.requestAnimationFrame)||Lt)(Wt)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==gt&&(t.__=t.__V),t.i=void 0,t.__V=gt}))),dt=pt=null},w.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Rt),t.__h=t.__h.filter((function(t){return!t.__||It(t)}))}catch(s){n.some((function(t){t.__h&&(t.__h=[])})),n=[],w.__e(s,t.__v)}})),xt&&xt(t,n)},w.unmount=function(t){wt&&wt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Rt(t)}catch(t){n=t}})),e.__H=void 0,n&&w.__e(n,e.__v))};var Ot="function"==typeof requestAnimationFrame;function Lt(t){var n,e=function(){clearTimeout(i),Ot&&cancelAnimationFrame(n),setTimeout(t)},i=setTimeout(e,100);Ot&&(n=requestAnimationFrame(e))}function Rt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function It(t){var n=pt;t.__c=t.__(),pt=n}function jt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function qt(t,n){w[t]=n.bind(null,w[t]||(()=>{}))}let Gt,zt;function Jt(t){if(zt)zt();zt=t&&t.S()}function Kt({data:t}){const n=Xt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!E(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return y(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Kt.displayName="_st";Object.defineProperties(c.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Kt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});qt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let i in e){if("children"===i)continue;let _=e[i];if(_ instanceof c){if(!t)n.__np=t={};t[i]=_;e[i]=_.peek()}}}t(n)});qt("__r",(t,n)=>{Jt();let e,i=n.__c;if(i){i.__$f&=-2;e=i.__$u;if(void 0===e)i.__$u=e=function(t){let n;S((function(){n=this}));n.c=()=>{i.__$f|=1;i.setState({})};return n}()}Gt=i;Jt(e);t(n)});qt("__e",(t,n,e,i)=>{Jt();Gt=void 0;t(n,e,i)});qt("diffed",(t,n)=>{Jt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,i=n.props;if(t){let n=e.U;if(n)for(let e in n){let i=n[e];if(void 0!==i&&!(e in t)){i.d();n[e]=void 0}}else{n={};e.U=n}for(let _ in t){let o=n[_],r=t[_];if(void 0===o){o=Qt(e,_,r,i);n[_]=o}else o.o(r,i)}}}t(n)});function Qt(t,n,e,i){const _=n in t&&void 0===t.ownerSVGElement,o=h(e);return{o:(t,n)=>{o.value=t;i=n},d:S(()=>{const e=o.value.value;if(i[n]!==e){i[n]=e;if(_)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}qt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});qt("__h",(t,n,e,i)=>{if(i<3||9===i)n.__$f|=2;t(n,e,i)});I.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let i in n)return!0;for(let i in t)if("__source"!==i&&t[i]!==this.props[i])return!0;for(let i in this.props)if(!(i in t))return!0;return!1};function Xt(t){return Dt(()=>h(t),[])}function Yt(t){const n=Pt(t);n.current=t;Gt.__$f|=4;return Dt(()=>y(()=>n.current()),[])}function Zt(t){const n=Pt(t);n.current=t;Ht(()=>S(()=>n.current()),[])}var tn=function(t,n,e,i){var _;n[0]=0;for(var o=1;o<n.length;o++){var r=n[o++],u=n[o]?(n[0]|=r?1:2,e[n[o++]]):n[++o];3===r?i[0]=u:4===r?i[1]=Object.assign(i[1]||{},u):5===r?(i[1]=i[1]||{})[n[++o]]=u:6===r?i[1][n[++o]]+=u+"":r?(_=t.apply(u,tn(t,u,e,["",null])),i.push(_),u[0]?n[0]|=2:(n[o-2]=0,n[o]=_)):i.push(u)}return i},nn=new Map;function en(t){var n=nn.get(this);return n||(n=new Map,nn.set(this,n)),(n=tn(this,n.get(t)||(n.set(t,n=function(t){for(var n,e,i=1,_="",o="",r=[0],u=function(t){1===i&&(t||(_=_.replace(/^\s*\n\s*|\s*\n\s*$/g,"")))?r.push(0,t,_):3===i&&(t||_)?(r.push(3,t,_),i=2):2===i&&"..."===_&&t?r.push(4,t,0):2===i&&_&&!t?r.push(5,0,!0,_):i>=5&&((_||!t&&5===i)&&(r.push(i,0,_,e),i=6),t&&(r.push(i,t,0,e),i=6)),_=""},f=0;f<t.length;f++){f&&(1===i&&u(),u(f));for(var l=0;l<t[f].length;l++)n=t[f][l],1===i?"<"===n?(u(),r=[r],i=3):_+=n:4===i?"--"===_&&">"===n?(i=1,_=""):_=n+_[0]:o?n===o?o="":_+=n:'"'===n||"'"===n?o=n:">"===n?(u(),i=1):i&&("="===n?(i=5,e=_,_=""):"/"===n&&(i<5||">"===t[f][l+1])?(u(),3===i&&(r=r[0]),i=r,(r=r[0]).push(2,0,i),i=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),i=2):_+=n),3===i&&"!--"===_&&(i=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var _n=en.bind(W);export{I as Component,R as Fragment,c as Signal,e as batch,ct as cloneElement,y as computed,ht as createContext,W as createElement,L as createRef,S as effect,W as h,_n as html,st as hydrate,E as isValidElement,w as options,lt as render,h as signal,K as toChildArray,r as untracked,Tt as useCallback,Yt as useComputed,Vt as useContext,At as useDebugValue,Ht as useEffect,Ft as useErrorBoundary,Mt as useId,$t as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ut as useReducer,Pt as useRef,Xt as useSignal,Zt as useSignalEffect,Et as useState};
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
deleted file mode 100644
index 3f1b255c2..000000000
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ /dev/null
@@ -1,112 +0,0 @@
-const SPACE_RULE = '" "?';
-
-const PRIMITIVE_RULES = {
-  boolean: '("true" | "false") space',
-  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
-  string: ` "\\"" (
-        [^"\\\\] |
-        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\\"" space`,
-  null: '"null" space',
-};
-
-const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
-const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
-const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
-
-export class SchemaConverter {
-  constructor(propOrder) {
-    this._propOrder = propOrder || {};
-    this._rules = new Map();
-    this._rules.set('space', SPACE_RULE);
-  }
-
-  _formatLiteral(literal) {
-    const escaped = JSON.stringify(literal).replace(
-      GRAMMAR_LITERAL_ESCAPE_RE,
-      m => GRAMMAR_LITERAL_ESCAPES[m]
-    );
-    return `"${escaped}"`;
-  }
-
-  _addRule(name, rule) {
-    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
-    let key = escName;
-
-    if (this._rules.has(escName)) {
-      if (this._rules.get(escName) === rule) {
-        return key;
-      }
-
-      let i = 0;
-      while (this._rules.has(`${escName}${i}`)) {
-        i += 1;
-      }
-      key = `${escName}${i}`;
-    }
-
-    this._rules.set(key, rule);
-    return key;
-  }
-
-  visit(schema, name) {
-    const schemaType = schema.type;
-    const ruleName = name || 'root';
-
-    if (schema.oneOf || schema.anyOf) {
-      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
-        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
-      ).join(' | ');
-
-      return this._addRule(ruleName, rule);
-    } else if ('const' in schema) {
-      return this._addRule(ruleName, this._formatLiteral(schema.const));
-    } else if ('enum' in schema) {
-      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
-      return this._addRule(ruleName, rule);
-    } else if (schemaType === 'object' && 'properties' in schema) {
-      // TODO: `required` keyword (from python implementation)
-      const propOrder = this._propOrder;
-      const propPairs = Object.entries(schema.properties).sort((a, b) => {
-        // sort by position in prop_order (if specified) then by key
-        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
-        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
-        return orderA - orderB || a[0].localeCompare(b[0]);
-      });
-
-      let rule = '"{" space';
-      propPairs.forEach(([propName, propSchema], i) => {
-        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
-        if (i > 0) {
-          rule += ' "," space';
-        }
-        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
-      });
-      rule += ' "}" space';
-
-      return this._addRule(ruleName, rule);
-    } else if (schemaType === 'array' && 'items' in schema) {
-      // TODO `prefixItems` keyword (from python implementation)
-      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
-      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
-      return this._addRule(ruleName, rule);
-    } else {
-      if (!PRIMITIVE_RULES[schemaType]) {
-        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
-      }
-      return this._addRule(
-        ruleName === 'root' ? 'root' : schemaType,
-        PRIMITIVE_RULES[schemaType]
-      );
-    }
-  }
-
-  formatGrammar() {
-    let grammar = '';
-    this._rules.forEach((rule, name) => {
-      grammar += `${name} ::= ${rule}\n`;
-    });
-    return grammar;
-  }
-}
diff --git a/examples/server/public/loading.html b/examples/server/public/loading.html
new file mode 100644
index 000000000..c3fd19a0f
--- /dev/null
+++ b/examples/server/public/loading.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta http-equiv="refresh" content="5">
+    </head>
+    <body>
+        <div id="loading">
+            The model is loading. Please wait.<br/>
+            The user interface will appear soon.
+        </div>
+    </body>
+</html>
diff --git a/examples/server/public_legacy/colorthemes.css b/examples/server/public_legacy/colorthemes.css
new file mode 100755
index 000000000..b1e2b8b70
--- /dev/null
+++ b/examples/server/public_legacy/colorthemes.css
@@ -0,0 +1,402 @@
+@import url("theme-snowstorm.css");
+@import url("theme-polarnight.css");
+@import url("theme-ketivah.css");
+@import url("theme-mangotango.css");
+@import url("theme-playground.css");
+@import url("theme-beeninorder.css");
+
+:root {
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(217.5, 26.7%, 94.1%);
+    --primary-color-1-hue:             217.5;
+    --primary-color-1-saturation:      26.7%;
+    --primary-color-1-lightness:       94.1%;
+
+--primary-color-2: hsl(218.2, 26.8%, 92.0%);
+    --primary-color-2-hue:             218.2;
+    --primary-color-2-saturation:      26.8%;
+    --primary-color-2-lightness:       92.0%;
+
+--primary-color-3: hsl(218.8, 27.9%, 88.0%);
+    --primary-color-3-hue:             218.8;
+    --primary-color-3-saturation:      27.9%;
+    --primary-color-3-lightness:       88.0%;
+
+--primary-color-4: hsl(218.8, 18.3%, 81.8%);
+    --primary-color-4-hue:             218.8;
+    --primary-color-4-saturation:      18.3%;
+    --primary-color-4-lightness:       81.8%;
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(220.0, 16.4%, 21.6%);
+    --secondary-color-1-hue:             220.0;
+    --secondary-color-1-saturation:      16.4%;
+    --secondary-color-1-lightness:       21.6%;
+
+--secondary-color-2: hsl(221.7, 16.3%, 27.6%);
+    --secondary-color-2-hue:             221.7;
+    --secondary-color-2-saturation:      16.3%;
+    --secondary-color-2-lightness:       27.6%;
+
+--secondary-color-3: hsl(220.0, 16.8%, 31.6%);
+    --secondary-color-3-hue:             220.0;
+    --secondary-color-3-saturation:      16.8%;
+    --secondary-color-3-lightness:       31.6%;
+
+--secondary-color-4: hsl(220.0, 16.5%, 35.7%);
+    --secondary-color-4-hue:             220.0;
+    --secondary-color-4-saturation:      16.5%;
+    --secondary-color-4-lightness:       35.7%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
+    --theme-nuance-color-1-hue:             178.7;
+    --theme-nuance-color-1-saturation:      25.1%;
+    --theme-nuance-color-1-lightness:       64.9%;
+
+--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
+    --theme-nuance-color-2-hue:             193.3;
+    --theme-nuance-color-2-saturation:      43.4%;
+    --theme-nuance-color-2-lightness:       67.5%;
+
+--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
+    --theme-nuance-color-3-hue:             210.0;
+    --theme-nuance-color-3-saturation:      34.0%;
+    --theme-nuance-color-3-lightness:       63.1%;
+
+--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
+    --theme-nuance-color-4-hue:             213.1;
+    --theme-nuance-color-4-saturation:      32.0%;
+    --theme-nuance-color-4-lightness:       52.2%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+--theme-red-color:    hsl(32.5, 80%, 50%);
+--theme-orange-color: hsl(32.5, 70%, 45%);
+--theme-yellow-color: hsl(40.0,   0.6%, 73.3%);
+--theme-green-color:  hsl(92.4,  27.8%, 64.7%);
+--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
+
+
+
+/* ------------------------------------------- */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--theme-nuance-color-3);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+--chat-id-color:         var(--theme-nuance-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--primary-color-1);
+--button-alert-color-hover:      var(--theme-orange-color);
+--button-alert-border-hover:     var(--theme-orange-color);
+
+--button-alert-text-active:      var(--primary-color-1);
+--button-alert-color-active:     var(--theme-red-color);
+--button-alert-border-active:    var(--theme-red-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text:   var(--secondary-color-1);
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(217.5,
+    calc(var(--secondary-color-1-saturation) + 35%),
+    calc(var(--secondary-color-1-lightness)  - 30%));
+
+--button-primary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) -  2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) -  2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 35%));
+
+--button-primary-color-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 25%));
+
+--button-primary-border-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 25%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 50%));
+
+--button-secondary-color:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+--button-secondary-border:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 22%),
+    calc(var(--theme-nuance-color-3-lightness)  +  1%));
+
+--button-secondary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 22%),
+    calc(var(--theme-nuance-color-3-lightness)  +  1%));
+
+
+/* ---------active--------- */
+--button-secondary-text-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) + 40%),
+    calc(var(--theme-nuance-color-3-lightness)  - 55%));
+
+--button-secondary-color-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 30%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-secondary-border-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 30%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-tertiary-color:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+--button-tertiary-border:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+/* ---------hover---------- */
+--button-tertiary-text-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-tertiary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+--button-tertiary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+}
+
+/*
+
+.theme-template {
+
+
+    If light theme: should go from bright to darker
+    If dark theme: should go from dark to brighter
+    ideally this should not be anything but steps of
+    gray or slightly variants from it
+
+    --primary-color-1: #2E3440;
+    --primary-color-2: #3B4252;
+    --primary-color-3: #434C5E;
+    --primary-color-4: #4C566A;
+
+
+
+    If light theme: should go from dark to brighter
+    If dark theme: should go from bright to darker
+    ideally this should not be anything but steps of
+    gray or slightly variants from it
+
+    --secondary-color-1: #ECEFF4;
+    --secondary-color-2: #E5E9F0;
+    --secondary-color-3: #D8DEE9;
+    --secondary-color-4: #C8CED9;
+
+
+
+    Choose wisely nuance colors. It is not easy to find
+    4 harmonizing nuance colors. But keep in mind, that
+    only one accent color could work too.
+
+    --theme-nuance-color-1: #8FBCBB;
+    --theme-nuance-color-2: #88C0D0;
+    --theme-nuance-color-3: #81A1C1;
+    --theme-nuance-color-4: #5E81AC;
+
+
+
+    adapt the color red, orange, yellow, green,
+    purple to the 'mood' of your overall design
+    e.g is it low-contrast? vibrant? dynamic? etc
+
+    --theme-red-color:    #BF616A;
+    --theme-orange-color: #D08770;
+    --theme-yellow-color: #EBCB8B;
+    --theme-green-color:  #A3BE8C;
+    --theme-purple-color: #B48EAD;
+
+
+
+NOTE: comment all those line `--- ...` out
+------------------------------------------------
+--background-color-1:
+--background-color-2:
+--background-color-3:
+--background-color-4:
+
+--border-color-1:
+--border-color-2:
+--border-color-3:
+
+--border-focus-color:
+--border-focus-shadow:
+
+--text-color-plain:
+--text-color-subtile-1:
+--text-color-subtile-2:
+
+--code-background-color:
+--code-text-color:
+
+--ui-range-thumb-color:
+--ui-range-thumb-border:
+
+--textarea-border-color:
+
+
+
+-------------------------------------------
+--button-alert-text-hover:
+--button-alert-color-hover:
+--button-alert-border-hover:
+
+--button-alert-text-active:
+--button-alert-color-active:
+--button-alert-border-active:
+
+
+
+----------- PRIMARY -----------------------
+--button should immediately catch the eye--
+
+--button-primary-text:
+--button-primary-color:
+--button-primary-border:
+
+
+---------hover----------
+--button-primary-text-hover:
+--button-primary-color-hover:
+--button-primary-border-hover:
+
+
+---------active---------
+--button-primary-text-active:
+--button-primary-color-active:
+--button-primary-border-active:
+
+
+
+------------ SECONDARY ------------------------
+--button should NOT immediately catch the eye--
+
+--button-secondary-text:
+--button-secondary-color:
+--button-secondary-border:
+
+
+---------hover----------
+--button-secondary-text-hover:
+--button-secondary-color-hover:
+--button-secondary-border-hover:
+
+
+---------active---------
+--button-secondary-text-active:
+--button-secondary-color-active:
+--button-secondary-border-active:
+
+
+
+---------- TERTIARY -----------------------
+---------- disabled buttons ---------------
+--button-tertiary-text:
+--button-tertiary-color:
+--button-tertiary-border:
+
+
+---------hover----------
+--button-tertiary-text:
+--button-tertiary-color:
+--button-tertiary-border:
+
+}
+
+*/
diff --git a/examples/server/public/completion.js b/examples/server/public_legacy/completion.js
similarity index 84%
rename from examples/server/public/completion.js
rename to examples/server/public_legacy/completion.js
index ab38a7b40..30df7c2fa 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public_legacy/completion.js
@@ -21,6 +21,7 @@ let generation_settings = null;
 //
 export async function* llama(prompt, params = {}, config = {}) {
   let controller = config.controller;
+  const api_url = config.api_url?.replace(/\/+$/, '') || "";
 
   if (!controller) {
     controller = new AbortController();
@@ -28,7 +29,7 @@ export async function* llama(prompt, params = {}, config = {}) {
 
   const completionParams = { ...paramDefaults, ...params, prompt };
 
-  const response = await fetch("/completion", {
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
     method: 'POST',
     body: JSON.stringify(completionParams),
     headers: {
@@ -77,7 +78,12 @@ export async function* llama(prompt, params = {}, config = {}) {
       for (const line of lines) {
         const match = regex.exec(line);
         if (match) {
-          result[match[1]] = match[2]
+          result[match[1]] = match[2];
+          if (result.data === '[DONE]') {
+            cont = false;
+            break;
+          }
+
           // since we know this is llama.cpp, let's just decode the json in data
           if (result.data) {
             result.data = JSON.parse(result.data);
@@ -96,18 +102,18 @@ export async function* llama(prompt, params = {}, config = {}) {
             }
           }
           if (result.error) {
-            result.error = JSON.parse(result.error);
-            if (result.error.content.includes('slot unavailable')) {
-              // Throw an error to be caught by upstream callers
-              throw new Error('slot unavailable');
-            } else {
-              console.error(`llama.cpp error: ${result.error.content}`);
+            try {
+              result.error = JSON.parse(result.error);
+              if (result.error.message.includes('slot unavailable')) {
+                // Throw an error to be caught by upstream callers
+                throw new Error('slot unavailable');
+              } else {
+                console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
+              }
+            } catch(e) {
+              console.error(`llama.cpp error ${result.error}`)
             }
           }
-          if (result.error) {
-            result.error = JSON.parse(result.error);
-            console.error(`llama.cpp error: ${result.error.content}`);
-          }
         }
       }
     }
@@ -193,9 +199,10 @@ export const llamaComplete = async (params, controller, callback) => {
 }
 
 // Get the model info from the server. This is useful for getting the context window and so on.
-export const llamaModelInfo = async () => {
+export const llamaModelInfo = async (config = {}) => {
   if (!generation_settings) {
-    const props = await fetch("/props").then(r => r.json());
+    const api_url = config.api_url?.replace(/\/+$/, '') || "";
+    const props = await fetch(`${api_url}/props`).then(r => r.json());
     generation_settings = props.default_generation_settings;
   }
   return generation_settings;
diff --git a/examples/server/public_legacy/favicon.ico b/examples/server/public_legacy/favicon.ico
new file mode 100644
index 000000000..89e154a0a
Binary files /dev/null and b/examples/server/public_legacy/favicon.ico differ
diff --git a/examples/server/public_legacy/index-new.html b/examples/server/public_legacy/index-new.html
new file mode 100644
index 000000000..cbfbbdf28
--- /dev/null
+++ b/examples/server/public_legacy/index-new.html
@@ -0,0 +1,1190 @@
+<!DOCTYPE html>
+
+<html>
+
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
+  <title>llama.cpp - chat</title>
+
+  <link rel="icon" type="image/x-icon" href="favicon.ico">
+  <link rel="stylesheet" href="style.css">
+
+  <script type="module">
+    import {
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
+    } from './index.js';
+
+    import { llama } from './completion.js';
+    import { SchemaConverter } from './json-schema-to-grammar.mjs';
+    import { promptFormats } from './prompt-formats.js';
+    import { systemPrompts } from './system-prompts.js'; // multilingual is wip
+    let selected_image = false;
+    var slot_id = -1;
+
+    const session = signal({
+      prompt: "",
+      template: "{{prompt}}\n{{history}}{{char}}",
+      historyTemplate: "{{name}}: {{message}}\n",
+      transcript: [],
+      type: "chat",  // "chat" | "completion"
+      char: "ASSISTANT",
+      user: "USER",
+      image_selected: ''
+    })
+
+    const params = signal({
+      n_predict: 358, // 358 is a nice number
+      temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower
+      repeat_last_n: 0, // 0 = disable penalty, -1 = context size
+      repeat_penalty: 1.0, // 1.0 = disabled
+      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
+      dry_base: 1.75,     // 0.0 = disabled
+      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
+      dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+      top_k: 0, // <= 0 to use vocab size
+      top_p: 1.0, // 1.0 = disabled
+      min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
+      xtc_probability: 0.0, // 0 = disabled;
+      xtc_threshold: 0.1, // > 0.5 disables XTC;
+      typical_p: 1.0, // 1.0 = disabled
+      presence_penalty: 0.0, // 0.0 = disabled
+      frequency_penalty: 0.0, // 0.0 = disabled
+      mirostat: 0, // 0/1/2
+      mirostat_tau: 5, // target entropy
+      mirostat_eta: 0.1, // learning rate
+      grammar: '',
+      n_probs: 0, // no completion_probabilities,
+      min_keep: 0, // min probs from each sampler,
+      image_data: [],
+      cache_prompt: true,
+      api_key: ''
+    })
+
+
+
+    /* START: Support for storing prompt templates and parameters in browser's LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfuly imported.
+
+      console.log('Processing saved templates and updating default template')
+      params.value = { ...params.value, image_data: [] };
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Reseting themplate to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      session.value = { ...session.value, image_selected: '' };
+      params.value = t.data.params;
+      params.value = { ...params.value, image_data: [] };
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browser's LocalStorage */
+
+    const llamaStats = signal(null)
+    const controller = signal(null)
+
+    // currently generating a completion?
+    const generating = computed(() => controller.value != null)
+
+    // has the user started a chat?
+    const chatStarted = computed(() => session.value.transcript.length > 0)
+
+    const transcriptUpdate = (transcript) => {
+      session.value = {
+        ...session.value,
+        transcript
+      }
+    }
+
+    // simple template replace
+    const template = (str, extraSettings) => {
+      let settings = session.value;
+      if (extraSettings) {
+        settings = { ...settings, ...extraSettings };
+      }
+      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
+    }
+
+    async function runLlama(prompt, llamaParams, char) {
+      const currentMessages = [];
+      const history = session.value.transcript;
+      if (controller.value) {
+        throw new Error("already running");
+      }
+      controller.value = new AbortController();
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
+        const data = chunk.data;
+        if (data.stop) {
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          slot_id = data.slot_id;
+          if (selected_image && !data.multimodal) {
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");            return;
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+        }
+        if (data.timings) {
+          // llamaStats.value = data.timings;
+          llamaStats.value = data;
+        }
+      }
+      controller.value = null;
+    }
+
+    // send message to server
+    const chat = async (msg) => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+    // just in case (e.g. llama2)
+    const suffix = session.value.userMsgSuffix || "";
+    const prefix = session.value.userMsgPrefix || "";
+    const userMsg = prefix + msg + suffix;
+
+      transcriptUpdate([...session.value.transcript, ["{{user}}", userMsg]])
+
+      let prompt = template(session.value.template, {
+        message: msg,
+        history: session.value.transcript.flatMap(
+          ([name, data]) =>
+            template(
+              session.value.historyTemplate,
+              {
+                name,
+                message: Array.isArray(data) ?
+                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
+                  data,
+              }
+            )
+        ).join(''),
+      });
+      if (selected_image) {
+        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+      }
+      await runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: ["</s>", "<|end|>", "<|eot_id|>", "<|end_of_text|>", "<|im_end|>", "<|EOT|>", "<|END_OF_TURN_TOKEN|>", "<|end_of_turn|>", "<|endoftext|>", template("{{char}}"), template("{{user}}")],
+      }, "{{char}}");
+    }
+
+    const runCompletion = () => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+      const { prompt } = session.value;
+      transcriptUpdate([...session.value.transcript, ["", prompt]]);
+      runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: [],
+      }, "").finally(() => {
+        session.value.prompt = session.value.transcript.map(([_, data]) =>
+          Array.isArray(data) ? data.map(msg => msg.content).join('') : data
+        ).join('');
+        session.value.transcript = [];
+      })
+    }
+
+    const stop = (e) => {
+      e.preventDefault();
+      if (controller.value) {
+        controller.value.abort();
+        controller.value = null;
+      }
+    }
+
+    const reset = (e) => {
+      stop(e);
+      transcriptUpdate([]);
+    }
+
+    const uploadImage = (e) => {
+      e.preventDefault();
+      document.getElementById("fileInput").click();
+      document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+          const reader = new FileReader();
+          reader.onload = function () {
+            const image_data = reader.result;
+            session.value = { ...session.value, image_selected: image_data };
+            params.value = {
+              ...params.value, image_data: [
+                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+            }
+          };
+          selected_image = true;
+          reader.readAsDataURL(selectedFile);
+        }
+      });
+    }
+
+    function MessageInput() {
+      const message = useSignal("")
+
+      const submit = (e) => {
+        stop(e);
+        chat(message.value);
+        message.value = "";
+      }
+
+      const enterSubmits = (event) => {
+        if (event.which === 13 && !event.shiftKey) {
+          submit(event);
+        }
+      }
+
+      return html`
+      <form onsubmit=${submit}>
+        <div class="chat-input-container">
+          <textarea
+            id="chat-input" placeholder="Say Something ... (Shift + Enter for new line)"
+            class="${generating.value ? 'loading' : null}"
+            oninput=${(e) => message.value = e.target.value}
+            onkeypress=${enterSubmits}
+            rows="2"
+            type="text"
+            value="${message}"
+          ></textarea>
+        </div>
+
+          <div class="right">
+            <button class="button-back" onclick=${reset}>Back</button>
+            <button onclick=${uploadImage}>Upload Image</button>
+            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+            <button type="submit" disabled=${generating.value}>Submit</button>
+          </div>
+        </form>
+      `
+    }
+
+    // the completion view needs some ux improvements
+    function CompletionControls() {
+      const submit = (e) => {
+        stop(e);
+        runCompletion();
+      }
+      return html`
+        <div class="right">
+          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
+          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+          <button onclick=${reset}>Back</button>
+        </div>`;
+    }
+
+    const ChatLog = (props) => {
+      const messages = session.value.transcript;
+      const container = useRef(null)
+
+      useEffect(() => {
+        // scroll to bottom (if needed)
+        const parent = container.current.parentElement;
+        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
+          parent.scrollTo(0, parent.scrollHeight)
+        }
+      }, [messages])
+
+      const isCompletionMode = session.value.type === 'completion'
+      const chatLine = ([user, data], index) => {
+        let message
+        const isArrayMessage = Array.isArray(data)
+        if (params.value.n_probs > 0 && isArrayMessage) {
+          message = html`<${Probabilities} data=${data} />`
+        } else {
+          const text = isArrayMessage ?
+            data.map(msg => msg.content).join('') :
+            data;
+          message = isCompletionMode ?
+            text :
+            html`<${Markdownish} text=${template(text)} />`
+        }
+        if (user) {
+          return html`<p key=${index}><strong class="chat-id-color">${template(user)}</strong> ${message}</p>`
+        } else {
+          return isCompletionMode ?
+            html`<span key=${index}>${message}</span>` :
+            html`<p key=${index}>${message}</p>`
+        }
+      };
+
+      const handleCompletionEdit = (e) => {
+        session.value.prompt = e.target.innerText;
+        session.value.transcript = [];
+      }
+
+      return html`
+        <div id="chat" ref=${container} key=${messages.length}>
+          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
+          <span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
+            ${messages.flatMap(chatLine)}
+          </span>
+        </div>`;
+    };
+
+
+
+///////////// UI Improvements /////////////
+//
+//
+const handleToggleChange = (e) => {
+  const isChecked = e.target.checked;
+  session.value = { ...session.value, type: isChecked ? 'completion' : 'chat' };
+  localStorage.setItem('toggleState', isChecked);
+}
+//
+const loadToggleState = () => {
+  const storedState = localStorage.getItem('toggleState');
+  if (storedState !== null) {
+    const isChecked = storedState === 'true';
+    document.getElementById('toggle').checked = isChecked;
+    session.value = { ...session.value, type: isChecked ? 'completion' : 'chat' };
+  }
+}
+//
+document.addEventListener('DOMContentLoaded', loadToggleState);
+//
+//
+// function to update the prompt format
+function updatePromptFormat(e) {
+  const promptFormat = e.target.value;
+  if (promptFormats.hasOwnProperty(promptFormat)) {
+    session.value = {
+      ...session.value,
+      ...promptFormats[promptFormat]
+    };
+  } else {
+    // Use vicuna as llama.cpp's default setting, since it's most common
+    session.value = {
+      ...session.value,
+      template: "{{prompt}}\n{{history}}{{char}}",
+      historyTemplate: "{{name}}: {{message}}\n",
+      char: "ASSISTANT",
+      user: "USER"
+    };
+  }
+  console.log('Updated session value:', session.value);
+}
+//
+//
+// function to update the prompt format from the selected one
+function updatePromptFormatFromDropdown(element) {
+  const promptFormat = element.getAttribute('data-value');
+  console.log('Selected prompt format:', promptFormat); // debugging
+  updatePromptFormat({ target: { value: promptFormat } });
+}
+//
+//
+// function that adds the event listers as soon as the element is available
+function addEventListenersWhenAvailable() {
+  var themeSelector = document.getElementById('theme-selector');
+  if (themeSelector) {
+    themeSelector.addEventListener('change', function(event) {
+      // event-handler-code...
+    });
+    // placeholder event listeners
+  } else {
+    // if the element is not there yet, wait ahead
+    requestAnimationFrame(addEventListenersWhenAvailable);
+  }
+}
+//
+//
+// begin with the check
+requestAnimationFrame(addEventListenersWhenAvailable);
+//
+//
+// avoid default and create new event object with value from data value attribute
+function handleDropdownSelection(e, promptFormat) {
+  e.preventDefault();
+  const customEvent = {
+    target: {
+      value: promptFormat
+    }
+  };
+  // call our updatePromptFormat-function
+  updatePromptFormat(customEvent);
+}
+//
+//
+// function to update the system message
+function updateSystemPrompt(e) {
+  const SystemPrompt = e.target.value;
+  if (systemPrompts.hasOwnProperty(SystemPrompt)) {
+    session.value = {
+      ...session.value,
+      prompt: systemPrompts[SystemPrompt].systemPrompt
+    };
+  }
+}
+//
+//
+///////////// UI Improvements /////////////
+
+
+
+
+const ConfigForm = (props) => {
+  const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
+  const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
+  const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
+  const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+  const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
+
+  const grammarJsonSchemaPropOrder = signal('')
+  const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+  const convertJSONSchemaGrammar = async () => {
+    try {
+      let schema = JSON.parse(params.value.grammar)
+      const converter = new SchemaConverter({
+        prop_order: grammarJsonSchemaPropOrder.value
+          .split(',')
+          .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+        allow_fetch: true,
+      })
+      schema = await converter.resolveRefs(schema, 'input')
+      converter.visit(schema, '')
+      params.value = {
+        ...params.value,
+        grammar: converter.formatGrammar(),
+      }
+    } catch (e) {
+      alert(`Convert failed: ${e.message}`)
+    }
+  }
+
+  const FloatField = ({ label, title, max, min, name, step, value }) => {
+return html`
+<div>
+  <label for="${name}"><span title="${title}">${label}</span></label>
+  <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} title="${title}" />
+  <span id="${name}-value">${value}</span>
+</div>
+`
+};
+
+const IntField = ({ label, title, max, min, step, name, value }) => {
+return html`
+<div>
+  <label for="${name}"><span title="${title}">${label}</span></label>
+  <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsInt} title="${title}" />
+  <span id="${name}-value">${value}</span>
+</div>
+`
+};
+
+const BoolField = ({ label, title, name, value }) => {
+return html`
+<div>
+  <label for="${name}"><span title="${title}">${label}</span></label>
+  <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} title="${title}" />
+</div>
+`
+};
+
+  const userTemplateReset = (e) => {
+    e.preventDefault();
+    userTemplateResetToDefaultAndApply()
+  }
+
+  const UserTemplateResetButton = () => {
+    if (selectedUserTemplate.value.name == 'default') {
+      return html`
+      <button class="reset-button" id="id_reset" onclick="${userTemplateReset}">Reset</button>
+      `
+    }
+
+    return html`
+      <div class="button-container">
+        <button class="reset-button" title="Caution: This resets the entire form." onclick="${userTemplateReset}">Reset</button>
+      </div>
+    `
+  };
+
+  useEffect(() => {
+    // autosave template on every change
+    userTemplateAutosave()
+  }, [session.value, params.value])
+
+  const GrammarControl = () => (
+    html`
+      <div>
+        <div class="grammar">
+          <label for="template"></label>
+          <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+        </div>
+        <div class="grammar-columns">
+          <div class="json-schema-controls">
+            <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+            <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+          </div>
+        </div>
+      </div>
+    `
+  );
+
+  const PromptControlFieldSet = () => (
+    html`
+      <fieldset>
+        <div class="input-container">
+          <label for="prompt" class="input-label">System</label>
+          <textarea
+            id="prompt"
+            class="persistent-input"
+            name="prompt"
+            placeholder="[Note] The following models do not support System Prompts by design:\n• OpenChat\n• Orion\n• Phi-3\n• Starling\n• Yi-...-Chat"
+            value="${session.value.prompt}"
+            oninput=${updateSession}
+          ></textarea>
+        </div>
+      </fieldset>
+    `
+  );
+
+  const ChatConfigForm = () => (
+    html`
+      <fieldset class="dropdowns">
+        <div>
+          <select id="promptFormat" name="promptFormat" onchange=${updatePromptFormat}>
+              <option value="default">Prompt Style</option>
+              <option value=""></option>
+            <optgroup label="Common Prompt-Styles">
+              <option value="alpaca">Alpaca</option>
+              <option value="chatml">ChatML</option>
+              <option value="commandr">Command R/+</option>
+              <option value="llama2">Llama 2</option>
+              <option value="llama3">Llama 3</option>
+              <option value="phi3">Phi-3</option>
+              <option value="openchat">OpenChat/Starling</option>
+              <option value="vicuna">Vicuna</option>
+              <option value=""></option>
+            </optgroup>
+            <optgroup label="More Prompt-Styles">
+              <option value="vicuna">Airoboros L2</option>
+              <option value="vicuna">BakLLaVA-1</option>
+              <option value="alpaca">Code Cherry Pop</option>
+              <option value="deepseekCoder">Deepseek Coder</option>
+              <option value="chatml">Dolphin Mistral</option>
+              <option value="chatml">evolvedSeeker 1.3B</option>
+              <option value="vicuna">Goliath 120B</option>
+              <option value="vicuna">Jordan</option>
+              <option value="vicuna">LLaVA</option>
+              <option value="chatml">Leo Hessianai</option>
+              <option value="vicuna">Leo Mistral</option>
+              <option value="vicuna">Marx</option>
+              <option value="med42">Med42</option>
+              <option value="alpaca">MetaMath</option>
+              <option value="llama2">Mistral Instruct</option>
+              <option value="chatml">Mistral 7B OpenOrca</option>
+              <option value="alpaca">MythoMax</option>
+              <option value="neuralchat">Neural Chat</option>
+              <option value="vicuna">Nous Capybara</option>
+              <option value="nousHermes">Nous Hermes</option>
+              <option value="openchatMath">OpenChat Math</option>
+              <option value="chatml">OpenHermes 2.5-Mistral</option>
+              <option value="alpaca">Orca Mini v3</option>
+              <option value="orion">Orion</option>
+              <option value="vicuna">Samantha</option>
+              <option value="chatml">Samantha Mistral</option>
+              <option value="sauerkrautLM">SauerkrautLM</option>
+              <option value="vicuna">Scarlett</option>
+              <option value="starlingCode">Starling Coding</option>
+              <option value="alpaca">Sydney</option>
+              <option value="vicuna">Synthia</option>
+              <option value="vicuna">Tess</option>
+              <option value="yi34b">Yi-6/9/34B-Chat</option>
+              <option value="zephyr">Zephyr</option>
+              <option value=""></option>
+            </optgroup>
+          </select>
+          <select id="SystemPrompt" name="SystemPrompt" onchange=${updateSystemPrompt}>
+            <option value="default">System Prompt</option>
+            <option value="empty">None</option>
+            <option value="airoboros">Airoboros</option>
+            <option value="alpaca">Alpaca</option>
+            <option value="atlas">Atlas</option>
+            <option value="atlas_de">Atlas - DE</option>
+            <option value="cot">Chain of Tought</option>
+            <option value="commandrempty">Command R/+ (empty)</option>
+            <option value="commandrexample">Command R/+ (example)</option>
+            <option value="deduce">Critical Thinking</option>
+            <option value="deepseekcoder">Deepseek Coder</option>
+            <option value="jordan">Jordan</option>
+            <option value="leomistral">Leo Mistral</option>
+            <option value="med42">Med42</option>
+            <option value="migeltot">Migel's Tree of Thought</option>
+            <option value="mistralopenorca">Mistral OpenOrca</option>
+            <option value="orcamini">Orca Mini</option>
+            <option value="samantha">Samantha</option>
+            <option value="sauerkraut">Sauerkraut</option>
+            <option value="scarlett">Scarlett</option>
+            <option value="synthia">Synthia</option>
+            <option value="vicuna">Vicuna</option>
+          </select>
+          <!--<select id="systemLanguage" name="systemLanguage">-->
+            <!--<option value="default">English</option>-->
+            <!--<option value="DE">German</option>-->
+            <!--<option value="placeholderLanguage">Placeholder</option>-->
+          <!--</select>-->
+        </div>
+      </fieldset>
+      ${PromptControlFieldSet()}
+        <fieldset>
+          <details>
+            <summary><span class="summary-title" id="id_prompt-style">Prompt Style</span></summary>
+            <fieldset class="names">
+          <div>
+            <label for="user" id="id_user-name">User ID</label>
+            <input type="text" id="user" name="user" value="${session.value.user}" oninput=${updateSession} />
+          </div>
+          <div>
+            <label for="bot" id="id_bot-name">AI ID</label>
+            <input type="text" id="bot" name="char" value="${session.value.char}" oninput=${updateSession} />
+          </div>
+        </fieldset>
+          <div class="two-columns">
+            <div>
+              <div class="input-container">
+                <label for="template" class="input-label-sec" id_prompt-template>Prompt Template</label>
+                <textarea id="template" class="persistent-input-sec" name="template" value="${session.value.template}" rows=6 oninput=${updateSession}/>
+              </div>
+            </div>
+            <div>
+              <div class="input-container">
+                <label for="template" class="input-label-sec" id="id_history-template">Chat History</label>
+                <textarea id="history-template" class="persistent-input-sec" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
+              </div>
+            </div>
+          </div>
+        </details>
+        <details>
+          <summary><span class="summary-title" id="id_grammar-title" id_grammar-title>Grammar</span></summary>
+          ${GrammarControl()}
+        </details>
+
+        </fieldset>
+    `
+  );
+
+  const CompletionConfigForm = () => (
+    html`
+      ${PromptControlFieldSet()}
+      <fieldset>
+        <details>
+          <summary><span class="summary-title" id="id_grammar-title" id_grammar-title>Grammar</span></summary>
+          ${GrammarControl()}
+        </details>
+      </fieldset>
+    `
+  );
+// todo toggle button et api field et reset button in one nice row
+  return html`
+    <form>
+      <fieldset class="two">
+          <input type="checkbox" id="toggle" class="toggleCheckbox" onchange=${handleToggleChange} />
+            <label for="toggle" class="toggleContainer">
+              <div id="id_toggle-label-chat">Chat</div>
+              <div id="id_toggle-label-complete">Complete</div>
+            </label>
+      <fieldset>
+
+          <input type="text" id="api_key" class="apiKey" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+      </fieldset>
+
+        <${UserTemplateResetButton}/>
+      </fieldset>
+
+      ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
+
+      <fieldset class="params">
+        ${IntField({ label: "Prediction", title: "Set the maximum number of tokens to predict when generating text. Note: May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. The value -1 means infinity. Default is 358", max: 2048, min: -1, step: 16, name: "n_predict", value: params.value.n_predict, })}
+        ${FloatField({ label: "Min-P sampling", title: "The minimum probability for a token to be considered, relative to the probability of the most likely token. Note that it's good practice to disable all other samplers aside from temperature when using min-p. It is also recommenend to go this approach. Default is 0.05 – But consider higher values like ~ 0.4 for non-English text generation. The value 1.0 means disabled", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
+        ${FloatField({ label: "Repetition Penalty", title: "Control the repetition of token sequences in the generated text. Default is 1.1", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+        ${FloatField({ label: "Temperature", title: "This will adjust the overall randomness of the generated text. It is the most common sampler. Default is 0.8 but consider using lower values for more factual texts or for non-English text generation", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+      </fieldset>
+
+      <details>
+        <summary><span class="summary-title">Further Options</span></summary>
+        <fieldset class="params">
+          ${IntField({ label: "Top-K", title: "Limits the selection of the next token to the K most probable tokens. 1 means no randomness = greedy sampling. If set to 0, it means the entire vocabulary size is considered.", max: 100, min: 0, step: 1, name: "top_k", value: params.value.top_k })}
+          ${IntField({ label: "Penalize Last N", title: "The last n tokens that are taken into account to penalise repetitions. A value of 0 means that this function is deactivated and -1 means that the entire size of the context is taken into account.", max: 2048, min: 0, step: 16, name: "repeat_last_n", value: params.value.repeat_last_n })}
+          ${FloatField({ label: "Presence Penalty", title: "A penalty that is applied if certain tokens appear repeatedly in the generated text. A higher value leads to fewer repetitions.", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+          ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+          ${FloatField({ label: "Top-P", title: "Limits the selection of the next token to a subset of tokens whose combined probability reaches a threshold value P = top-P. If set to 1, it means the entire vocabulary size is considered.", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
+          ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+          ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
+          ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
+          ${FloatField({ label: "DRY Penalty Multiplier", title: "Set the DRY repetition penalty multiplier. Default is 0.0, which disables DRY.", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })}
+          ${FloatField({ label: "DRY Base", title: "Set the DRY repetition penalty base value. Default is 1.75", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
+          ${IntField({ label: "DRY Allowed Length", title: "Tokens that extend repetition beyond this receive exponentially increasing penalty. Default is 2", max: 10, min: 1, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
+          ${IntField({ label: "DRY Penalty Last N", title: "How many tokens to scan for repetitions. Default is -1, where 0 is disabled and -1 is context size", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
+          ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+        </fieldset>
+
+        <hr style="height: 1px; background-color: #ececf1; border: none;" />
+
+        <fieldset class="three">
+          <label title="The Mirostat sampling method is an algorithm used in natural language processing to improve the quality and coherence of the generated texts. It is an at-runtime-adaptive method that aims to keep the entropy or surprise of a text within a desired range."><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> Mirostat off</label>
+          <label title="Mirostat version 1 was developed to adjust the probability of predictions so that the surprise in the text remains constant. This means that the algorithm tries to maintain a balance between predictable and surprising words so that the text is neither too monotonous nor too chaotic. V1 is recommended for longer writings, creative texts, etc."><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
+          <label title="Mirostat version 2 builds on the idea of V1 but brings some improvements. V2 is recommended as a general purpose algorithm since it offers more precise control over entropy and reacts more quickly to unwanted deviations. As a result, the generated texts appear even more consistent and coherent, especially for everday life conversations."><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
+          </fieldset>
+        <fieldset class="params">
+          ${FloatField({ label: "Entropy tau", title: "Tau controls the desired level of entropy (or 'surprise') in the text. A low tau (e.g. 0.5) would mean that a text is very predictable, but will also be very coherent. A high tau (e.g. 8.0) would mean that the text is very creative and surprising, but may also be difficult to follow because unlikely words will occur frequently.", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+          ${FloatField({ label: "Learning-rate eta", title: "Eta determines how quickly the Mirostat algorithm adjusts its predictions to achieve the desired entropy. A learning rate that is too high can cause the algorithm to react too quickly and possibly become unstable, because the algorithm will try to maintain a balance between surprises and precision in the context of only a few words. In this way, 'the common thread' could be lost. Whereas a learning rate that is too low means that the algorithm reacts too slowly and a red thread becomes a heavy goods train that takes a long time to come to a halt and change a 'topic station'.", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
+        </fieldset>
+
+          <hr style="height: 1px; background-color: #ececf1; border: none;" />
+
+          <fieldset class="params">
+            ${IntField({ label: "Show Probabilities", title: "If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. The tokens will be colored in gradient from green to red depending on their probabilities. Note that for temperature 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Defaults to 0", max: 10, min: 0, step: 1, name: "n_probs", value: params.value.n_probs })}
+          </fieldset>
+      </details>
+    </form>
+  `
+}
+
+    // todo - beautify apikey section with css
+
+    const probColor = (p) => {
+      const r = Math.floor(192 * (1 - p));
+      const g = Math.floor(192 * p);
+      return `rgba(${r},${g},0,0.3)`;
+    }
+
+    const Probabilities = (params) => {
+      return params.data.map(msg => {
+        const { completion_probabilities } = msg;
+        if (
+          !completion_probabilities ||
+          completion_probabilities.length === 0
+        ) return msg.content
+
+        if (completion_probabilities.length > 1) {
+          // Not for byte pair
+          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
+
+          const splitData = completion_probabilities.map(prob => ({
+            content: prob.content,
+            completion_probabilities: [prob]
+          }))
+          return html`<${Probabilities} data=${splitData} />`
+        }
+
+        const { probs, content } = completion_probabilities[0]
+        const found = probs.find(p => p.tok_str === msg.content)
+        const pColor = found ? probColor(found.prob) : 'transparent'
+
+        const popoverChildren = html`
+          <div class="prob-set">
+            ${probs.map((p, index) => {
+          return html`
+                <div
+                  key=${index}
+                  title=${`prob: ${p.prob}`}
+                  style=${{
+              padding: '0.3em',
+              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+            }}
+                >
+                  <span>${p.tok_str}: </span>
+                  <span>${Math.floor(p.prob * 100)}%</span>
+                </div>
+              `
+        })}
+          </div>
+        `
+
+        return html`
+          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
+            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
+          </>
+        `
+      });
+    }
+
+    // poor mans markdown replacement
+    const Markdownish = (params) => {
+      const md = params.text
+        .replace(/&/g, '&amp;')
+        .replace(/</g, '&lt;')
+        .replace(/>/g, '&gt;')
+        .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+        .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
+        .replace(/`(.*?)`/g, '<code>$1</code>')
+        .replace(/\n/gim, '<br />');
+      return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
+    };
+
+    const ModelGenerationInfo = (params) => {
+      if (!llamaStats.value) {
+        return html`<span/>`
+      }
+      return html`
+      <span class=generation-statistics>
+          ${llamaStats.value.tokens_predicted} predicted, ${llamaStats.value.tokens_cached} cached, ${llamaStats.value.timings.predicted_per_second.toFixed(2)} tokens per second
+        </span>
+      `
+    }
+
+    // simple popover impl
+    const Popover = (props) => {
+      const isOpen = useSignal(false);
+      const position = useSignal({ top: '0px', left: '0px' });
+      const buttonRef = useRef(null);
+      const popoverRef = useRef(null);
+
+      const togglePopover = () => {
+        if (buttonRef.current) {
+          const rect = buttonRef.current.getBoundingClientRect();
+          position.value = {
+            top: `${rect.bottom + window.scrollY}px`,
+            left: `${rect.left + window.scrollX}px`,
+          };
+        }
+        isOpen.value = !isOpen.value;
+      };
+
+      const handleClickOutside = (event) => {
+        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
+          isOpen.value = false;
+        }
+      };
+
+      useEffect(() => {
+        document.addEventListener('mousedown', handleClickOutside);
+        return () => {
+          document.removeEventListener('mousedown', handleClickOutside);
+        };
+      }, []);
+
+      return html`
+        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover} contenteditable="true">${props.children}</span>
+        ${isOpen.value && html`
+          <${Portal} into="#portal">
+            <div
+              ref=${popoverRef}
+              class="popover-content"
+              style=${{
+            top: position.value.top,
+            left: position.value.left,
+          }}
+            >
+              ${props.popoverChildren}
+            </div>
+          </${Portal}>
+        `}
+      `;
+    };
+
+    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
+    /** Redirect rendering of descendants into the given CSS selector */
+    class Portal extends Component {
+      componentDidUpdate(props) {
+        for (let i in props) {
+          if (props[i] !== this.props[i]) {
+            return setTimeout(this.renderLayer);
+          }
+        }
+      }
+
+      componentDidMount() {
+        this.isMounted = true;
+        this.renderLayer = this.renderLayer.bind(this);
+        this.renderLayer();
+      }
+
+      componentWillUnmount() {
+        this.renderLayer(false);
+        this.isMounted = false;
+        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
+      }
+
+      findNode(node) {
+        return typeof node === 'string' ? document.querySelector(node) : node;
+      }
+
+      renderLayer(show = true) {
+        if (!this.isMounted) return;
+
+        // clean up old node if moving bases:
+        if (this.props.into !== this.intoPointer) {
+          this.intoPointer = this.props.into;
+          if (this.into && this.remote) {
+            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
+          }
+          this.into = this.findNode(this.props.into);
+        }
+
+        this.remote = render(html`
+          <${PortalProxy} context=${this.context}>
+            ${show && this.props.children || null}
+          </${PortalProxy}>
+        `, this.into, this.remote);
+      }
+
+      render() {
+        return null;
+      }
+    }
+    // high-order component that renders its first child if it exists.
+    // used as a conditional rendering proxy.
+    class PortalProxy extends Component {
+      getChildContext() {
+        return this.props.context;
+      }
+      render({ children }) {
+        return children || null;
+      }
+    }
+
+    function App(props) {
+      return html`
+        <div class="mode-${session.value.type}">
+          <header>
+            <h2>llama.cpp</h2>
+            <div class="dropdown">
+              <button class="dropbtn"><svg width="16" height="16" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg"><circle cx="12" cy="12" r="10" stroke-width="2"/></svg></button>
+              <div class="dropdown-content" id="theme-selector">
+                <a href="/">Old UI</a>
+                <a href="#" data-theme="default">Snow Storm</a>
+                <a href="#" data-theme="polarnight">Polar Night</a>
+                <a href="#" data-theme="ketivah">Ketivah</a>
+                <a href="#" data-theme="mangotango">Mango Tango</a>
+                <a href="#" data-theme="playground">Playground</a>
+                <a href="#" data-theme="beeninorder">Been In Order</a>
+              </div>
+            </div>
+          </header>
+
+          <main id="content">
+            <${chatStarted.value ? ChatLog : ConfigForm} />
+          </main>
+
+          <section id="write">
+            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
+          </section>
+          <footer>
+            <p><${ModelGenerationInfo} /></p>
+            <p>Powered By <a href="https://github.com/ggerganov/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
+          </footer>
+        </div>
+      `;
+    }
+
+  document.addEventListener('DOMContentLoaded', function() {
+  var themeSelector = document.getElementById('theme-selector');
+  var themeLinks = themeSelector.querySelectorAll('a[data-theme]');
+
+  themeLinks.forEach(function(link) {
+    link.addEventListener('click', function(event) {
+      event.preventDefault(); // avoid default behaviour
+      var selectedTheme = event.target.getAttribute('data-theme');
+      changeTheme(selectedTheme);
+    });
+  });
+
+  function changeTheme(theme) {
+    document.body.classList.remove('theme-default', 'theme-polarnight', 'theme-ketivah', 'theme-mangotango', 'theme-playground', 'theme-beeninorder');
+    if (theme !== 'default') {
+      document.body.classList.add('theme-' + theme);
+    }
+    localStorage.setItem('selected-theme', theme);
+  }
+
+  // set the selected theme when loading the page
+  var savedTheme = localStorage.getItem('selected-theme');
+  if (savedTheme && savedTheme !== 'default') {
+    document.body.classList.add('theme-' + savedTheme);
+    // update the dropdown if it still exists
+    var dropdown = document.getElementById('theme-selector-dropdown');
+    if (dropdown) {
+      dropdown.value = savedTheme;
+    }
+  }
+});
+
+
+// snapping of the slider to indicate 'disabled'
+document.addEventListener('DOMContentLoaded', (event) => {
+  // define an object that contains snap values and ranges for each slider
+  const snapSettings = {
+    temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
+    min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
+    xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
+    top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
+    typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
+    repeat_penalty: { snapValue: 1.0, snapRangeMultiplier: 4 },
+    presence_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    frequency_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    dry_multiplier: { snapValue: 0.0, snapRangeMultiplier: 4 },
+    dry_base: { snapValue: 1.75, snapRangeMultiplier: 4 },
+  };
+  // add an event listener for each slider
+  Object.keys(snapSettings).forEach(sliderName => {
+    const slider = document.querySelector(`input[name="${sliderName}"]`);
+    const settings = snapSettings[sliderName];
+
+    slider.addEventListener('input', (e) => {
+      let value = parseFloat(e.target.value);
+      const step = parseFloat(e.target.step);
+      const snapRange = step * settings.snapRangeMultiplier;
+      const valueDisplay = document.getElementById(`${e.target.name}-value`);
+
+      if (value >= settings.snapValue - snapRange && value <= settings.snapValue + snapRange) {
+        value = settings.snapValue; // set value to the snap value
+        e.target.value = value; // update the slider value
+      }
+      // update the displayed value
+      if (valueDisplay) {
+        valueDisplay.textContent = value.toFixed(2); // display value with two decimal places
+      }
+    });
+  });
+});
+
+    render(h(App), document.querySelector('#container'));
+
+  </script>
+</head>
+
+<body>
+
+  <div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  </div>
+  <div id="portal"></div>
+</body>
+
+</html>
diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html
new file mode 100644
index 000000000..75f39330a
--- /dev/null
+++ b/examples/server/public_legacy/index.html
@@ -0,0 +1,1301 @@
+<html>
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
+  <title>llama.cpp - chat</title>
+
+  <style>
+    body {
+      font-family: system-ui;
+      font-size: 90%;
+    }
+
+    .grid-container {
+      display: grid;
+      grid-template-columns: auto auto auto;
+      padding: 10px;
+    }
+
+    .grid-item {
+      padding: 5px;
+      /* font-size: 30px; */
+      text-align: center;
+    }
+
+    #container {
+      margin: 0em auto;
+      display: flex;
+      flex-direction: column;
+      justify-content: space-between;
+      height: 100%;
+    }
+
+    main {
+      margin: 3px;
+      display: flex;
+      flex-direction: column;
+      justify-content: space-between;
+      gap: 1em;
+
+      flex-grow: 1;
+      overflow-y: auto;
+
+      border: 1px solid #ccc;
+      border-radius: 5px;
+      padding: 0.5em;
+    }
+
+    h1 {
+      text-align: center;
+    }
+
+    .customlink:link {
+      color: white;
+      background-color: #007aff;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      border-radius: 4px;
+      padding: 8px;
+    }
+
+    .customlink:visited {
+      color: white;
+      background-color: #007aff;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      padding: 8px;
+    }
+
+    .customlink:hover {
+      color: white;
+      background-color: #0070ee;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      padding: 8px;
+    }
+
+    .customlink:active {
+      color: #0070ee;
+      background-color: #80b3ef;
+      font-weight: 600;
+      text-decoration: none;
+      float: right;
+      margin-top: 30px;
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+      padding: 8px;
+    }
+
+    body {
+      max-width: 600px;
+      min-width: 300px;
+      line-height: 1.2;
+      margin: 0 auto;
+      padding: 0 0.5em;
+    }
+
+    p {
+      overflow-wrap: break-word;
+      word-wrap: break-word;
+      hyphens: auto;
+      margin-top: 0.5em;
+      margin-bottom: 0.5em;
+    }
+
+    #write form {
+      margin: 1em 0 0 0;
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+      align-items: stretch;
+    }
+
+    .message-controls {
+      display: flex;
+      justify-content: flex-end;
+    }
+    .message-controls > div:nth-child(2) {
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+    }
+    .message-controls > div:nth-child(2) > div {
+      display: flex;
+      margin-left: auto;
+      gap: 0.5em;
+    }
+
+    fieldset {
+      border: none;
+      padding: 0;
+      margin: 0;
+    }
+
+    fieldset.two {
+      display: grid;
+      grid-template: "a a";
+      gap: 1em;
+    }
+
+    fieldset.three {
+      display: grid;
+      grid-template: "a a a";
+      gap: 1em;
+    }
+
+    details {
+      border: 1px solid #aaa;
+      border-radius: 4px;
+      padding: 0.5em 0.5em 0;
+      margin-top: 0.5em;
+    }
+
+    summary {
+      font-weight: bold;
+      margin: -0.5em -0.5em 0;
+      padding: 0.5em;
+      cursor: pointer;
+    }
+
+    details[open] {
+      padding: 0.5em;
+    }
+
+    .prob-set {
+      padding: 0.3em;
+      border-bottom: 1px solid #ccc;
+    }
+
+    .popover-content {
+      position: absolute;
+      background-color: white;
+      padding: 0.2em;
+      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+    }
+
+    textarea {
+      padding: 5px;
+      flex-grow: 1;
+      width: 100%;
+    }
+
+    pre code {
+      display: block;
+      background-color: #222;
+      color: #ddd;
+    }
+
+    code {
+      font-family: monospace;
+      padding: 0.1em 0.3em;
+      border-radius: 3px;
+    }
+
+    fieldset label {
+      margin: 0.5em 0;
+      display: block;
+    }
+
+    fieldset label.slim {
+      margin: 0 0.5em;
+      display: inline;
+    }
+
+    header,
+    footer {
+      text-align: center;
+    }
+
+    footer {
+      font-size: 80%;
+      color: #888;
+    }
+
+    .mode-chat textarea[name=prompt] {
+      height: 4.5em;
+    }
+
+    .mode-completion textarea[name=prompt] {
+      height: 10em;
+    }
+
+    [contenteditable] {
+      display: inline-block;
+      white-space: pre-wrap;
+      outline: 0px solid transparent;
+    }
+
+    @keyframes loading-bg-wipe {
+      0% {
+        background-position: 0%;
+      }
+
+      100% {
+        background-position: 100%;
+      }
+    }
+
+    .loading {
+      --loading-color-1: #eeeeee00;
+      --loading-color-2: #eeeeeeff;
+      background-size: 50% 100%;
+      background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
+      animation: loading-bg-wipe 2s linear infinite;
+    }
+
+    @media (prefers-color-scheme: dark) {
+      .loading {
+        --loading-color-1: #22222200;
+        --loading-color-2: #222222ff;
+      }
+
+      .popover-content {
+        background-color: black;
+      }
+    }
+  </style>
+
+  <script type="module">
+    import {
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
+    } from './index.js';
+
+    import { llama } from './completion.js';
+    import { SchemaConverter } from './json-schema-to-grammar.mjs';
+
+    let selected_image = false;
+    var slot_id = -1;
+
+    const session = signal({
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
+      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
+      historyTemplate: "{{name}}: {{message}}",
+      transcript: [],
+      type: "chat",  // "chat" | "completion"
+      char: "Llama",
+      user: "User",
+      image_selected: ''
+    })
+
+    const params = signal({
+      n_predict: 400,
+      temperature: 0.7,
+      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+      repeat_penalty: 1.18, // 1.0 = disabled
+      dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
+      dry_base: 1.75,     // 0.0 = disabled
+      dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well
+      dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+      top_k: 40, // <= 0 to use vocab size
+      top_p: 0.95, // 1.0 = disabled
+      min_p: 0.05, // 0 = disabled
+      xtc_probability: 0.0, // 0 = disabled;
+      xtc_threshold: 0.1, // > 0.5 disables XTC;
+      typical_p: 1.0, // 1.0 = disabled
+      presence_penalty: 0.0, // 0.0 = disabled
+      frequency_penalty: 0.0, // 0.0 = disabled
+      mirostat: 0, // 0/1/2
+      mirostat_tau: 5, // target entropy
+      mirostat_eta: 0.1, // learning rate
+      grammar: '',
+      n_probs: 0, // no completion_probabilities,
+      min_keep: 0, // min probs from each sampler,
+      image_data: [],
+      cache_prompt: true,
+      api_key: ''
+    })
+
+    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfully imported.
+
+      console.log('Processing saved templates and updating default template')
+      params.value = { ...params.value, image_data: [] };
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Resetting template to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      session.value = { ...session.value, image_selected: '' };
+      params.value = t.data.params;
+      params.value = { ...params.value, image_data: [] };
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
+
+    const tts = window.speechSynthesis;
+    const ttsVoice = signal(null)
+
+    const llamaStats = signal(null)
+    const controller = signal(null)
+
+    // currently generating a completion?
+    const generating = computed(() => controller.value != null)
+
+    // has the user started a chat?
+    const chatStarted = computed(() => session.value.transcript.length > 0)
+
+    const transcriptUpdate = (transcript) => {
+      session.value = {
+        ...session.value,
+        transcript
+      }
+    }
+
+    // simple template replace
+    const template = (str, extraSettings) => {
+      let settings = session.value;
+      if (extraSettings) {
+        settings = { ...settings, ...extraSettings };
+      }
+      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
+    }
+
+    async function runLlama(prompt, llamaParams, char) {
+      const currentMessages = [];
+      const history = session.value.transcript;
+      if (controller.value) {
+        throw new Error("already running");
+      }
+      controller.value = new AbortController();
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
+        const data = chunk.data;
+
+        if (data.stop) {
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          slot_id = data.slot_id;
+          if (selected_image && !data.multimodal) {
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
+            return;
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+        }
+
+        if (data.timings) {
+          llamaStats.value = data;
+        }
+      }
+
+      controller.value = null;
+    }
+
+    // send message to server
+    const chat = async (msg) => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+
+      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
+
+      let prompt = template(session.value.template, {
+        message: msg,
+        history: session.value.transcript.flatMap(
+          ([name, data]) =>
+            template(
+              session.value.historyTemplate,
+              {
+                name,
+                message: Array.isArray(data) ?
+                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
+                  data,
+              }
+            )
+        ).join("\n"),
+      });
+      if (selected_image) {
+        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+      }
+      await runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
+      }, "{{char}}");
+    }
+
+    const runCompletion = () => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+      const { prompt } = session.value;
+      transcriptUpdate([...session.value.transcript, ["", prompt]]);
+      runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: [],
+      }, "").finally(() => {
+        session.value.prompt = session.value.transcript.map(([_, data]) =>
+          Array.isArray(data) ? data.map(msg => msg.content).join('') : data
+        ).join('');
+        session.value.transcript = [];
+      })
+    }
+
+    const stop = (e) => {
+      e.preventDefault();
+      if (controller.value) {
+        controller.value.abort();
+        controller.value = null;
+      }
+    }
+
+    const reset = (e) => {
+      stop(e);
+      transcriptUpdate([]);
+    }
+
+    const uploadImage = (e) => {
+      e.preventDefault();
+      document.getElementById("fileInput").click();
+      document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+          const reader = new FileReader();
+          reader.onload = function () {
+            const image_data = reader.result;
+            session.value = { ...session.value, image_selected: image_data };
+            params.value = {
+              ...params.value, image_data: [
+                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+            }
+          };
+          selected_image = true;
+          reader.readAsDataURL(selectedFile);
+        }
+      });
+    }
+
+    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+    const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
+    function MessageInput() {
+      const message = useSignal("");
+
+      const talkActive = useSignal(false);
+      const sendOnTalk = useSignal(false);
+      const talkStop = (e) => {
+        if (e) e.preventDefault();
+
+        talkActive.value = false;
+        talkRecognition?.stop();
+      }
+      const talk = (e) => {
+        e.preventDefault();
+
+        if (talkRecognition)
+          talkRecognition.start();
+        else
+          alert("Speech recognition is not supported by this browser.");
+      }
+      if(talkRecognition) {
+        talkRecognition.onstart = () => {
+          talkActive.value = true;
+        }
+        talkRecognition.onresult = (e) => {
+          if (event.results.length > 0) {
+            message.value = event.results[0][0].transcript;
+            if (sendOnTalk.value) {
+              submit(e);
+            }
+          }
+        }
+        talkRecognition.onspeechend = () => {
+          talkStop();
+        }
+      }
+
+      const ttsVoices = useSignal(tts?.getVoices() || []);
+      const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
+      if (tts) {
+        tts.onvoiceschanged = () => {
+          ttsVoices.value = tts.getVoices();
+        }
+      }
+
+      const submit = (e) => {
+        stop(e);
+        chat(message.value);
+        message.value = "";
+      }
+
+      const enterSubmits = (event) => {
+        if (event.which === 13 && !event.shiftKey) {
+          submit(event);
+        }
+      }
+
+      return html`
+        <form onsubmit=${submit}>
+          <div>
+            <textarea
+               className=${generating.value ? "loading" : null}
+               oninput=${(e) => message.value = e.target.value}
+               onkeypress=${enterSubmits}
+               placeholder="Say something..."
+               rows=2
+               type="text"
+               value="${message}"
+            />
+          </div>
+          <div class="message-controls">
+            <div> </div>
+            <div>
+              <div>
+                <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
+                <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
+                <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+                <button onclick=${reset}>Reset</button>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
+                  `(TTS and speech recognition are not provided by llama.cpp)\n` +
+                  `Note: STT requires HTTPS to work.`);
+                }}>[?]</a>
+                <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
+                <div>
+                  <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
+                  <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
+                </div>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
+                }}>[?]</a>
+                <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
+                <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
+                  <option value="" selected="${!ttsVoice.value}">None</option>
+                  ${[
+                    ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
+                    ...ttsVoices.value.filter(v => !v.default),
+                  ].map(
+                    v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
+                  )}
+                </select>
+              </div>
+            </div>
+          </div>
+        </form>
+      `
+    }
+
+    function CompletionControls() {
+      const submit = (e) => {
+        stop(e);
+        runCompletion();
+      }
+      return html`
+        <div>
+          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
+          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+          <button onclick=${reset}>Reset</button>
+        </div>`;
+    }
+
+    const ChatLog = (props) => {
+      const messages = session.value.transcript;
+      const container = useRef(null)
+
+      useEffect(() => {
+        // scroll to bottom (if needed)
+        const parent = container.current.parentElement;
+        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
+          parent.scrollTo(0, parent.scrollHeight)
+        }
+      }, [messages])
+
+      const ttsChatLineActiveIx = useSignal(undefined);
+      const ttsChatLine = (e, ix, msg) => {
+        if (e) e.preventDefault();
+
+        if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
+
+        const ttsVoices = tts.getVoices();
+        const voice = ttsVoices.find(v => v.name === ttsVoice.value);
+        if (!voice) return;
+
+        if (ttsChatLineActiveIx.value !== undefined) {
+          tts.cancel();
+          if (ttsChatLineActiveIx.value === ix) {
+            ttsChatLineActiveIx.value = undefined;
+            return;
+          }
+        }
+
+        ttsChatLineActiveIx.value = ix;
+        let ttsUtter = new SpeechSynthesisUtterance(msg);
+        ttsUtter.voice = voice;
+        ttsUtter.onend = e => {
+          ttsChatLineActiveIx.value = undefined;
+        };
+        tts.speak(ttsUtter);
+      }
+
+      const isCompletionMode = session.value.type === 'completion'
+
+      // Try play the last bot message
+      const lastCharChatLinesIxs = useSignal([]);
+      const lastCharChatLinesIxsOld = useSignal([]);
+      useEffect(() => {
+        if (
+          !isCompletionMode
+          && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
+          && !generating.value
+        ) {
+          const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
+          if (ix !== undefined) {
+            const msg = messages[ix];
+            ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
+          }
+
+          lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
+        }
+      }, [generating.value]);
+
+      const chatLine = ([user, data], index) => {
+        let message
+        const isArrayMessage = Array.isArray(data);
+        const text = isArrayMessage ?
+            data.map(msg => msg.content).join('') :
+            data;
+        if (params.value.n_probs > 0 && isArrayMessage) {
+          message = html`<${Probabilities} data=${data} />`
+        } else {
+          message = isCompletionMode ?
+            text :
+            html`<${Markdownish} text=${template(text)} />`
+        }
+
+        const fromBot = user && user === '{{char}}';
+        if (fromBot && !lastCharChatLinesIxs.value.includes(index))
+          lastCharChatLinesIxs.value.push(index);
+
+        if (user) {
+          return html`
+          <div>
+            <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
+            ${
+              fromBot && ttsVoice.value
+              && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
+            }
+          </div>
+          `;
+        } else {
+          return isCompletionMode ?
+            html`<span key=${index}>${message}</span>` :
+            html`<div><p key=${index}>${message}</p></div>`
+        }
+      };
+
+      const handleCompletionEdit = (e) => {
+        session.value.prompt = e.target.innerText;
+        session.value.transcript = [];
+      }
+
+      return html`
+        <div id="chat" ref=${container} key=${messages.length}>
+          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
+          <span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
+            ${messages.flatMap(chatLine)}
+          </span>
+        </div>`;
+    };
+
+    const ConfigForm = (props) => {
+      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
+      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
+      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
+      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
+
+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = async () => {
+        try {
+          let schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter({
+            prop_order: grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+            allow_fetch: true,
+          })
+          schema = await converter.resolveRefs(schema, 'input')
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
+      const FloatField = ({ label, max, min, name, step, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const IntField = ({ label, max, min, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const BoolField = ({ label, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
+          </div>
+        `
+      };
+
+      const userTemplateReset = (e) => {
+        e.preventDefault();
+        userTemplateResetToDefaultAndApply()
+      }
+
+      const UserTemplateResetButton = () => {
+        if (selectedUserTemplate.value.name == 'default') {
+          return html`
+            <button disabled>Using default template</button>
+          `
+        }
+
+        return html`
+          <button onclick=${userTemplateReset}>Reset all to default</button>
+        `
+      };
+
+      useEffect(() => {
+        // autosave template on every change
+        userTemplateAutosave()
+      }, [session.value, params.value])
+
+      const GrammarControl = () => (
+        html`
+          <div>
+            <label for="template">Grammar</label>
+            <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+            <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+            <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+          </div>
+          `
+      );
+
+      const PromptControlFieldSet = () => (
+        html`
+        <fieldset>
+          <div>
+            <label htmlFor="prompt">Prompt</label>
+            <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
+          </div>
+        </fieldset>
+        `
+      );
+
+      const ChatConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+
+          <fieldset class="two">
+            <div>
+              <label for="user">User name</label>
+              <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
+            </div>
+
+            <div>
+              <label for="bot">Bot name</label>
+              <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
+            </div>
+          </fieldset>
+
+          <fieldset>
+            <div>
+              <label for="template">Prompt template</label>
+              <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
+            </div>
+
+            <div>
+              <label for="template">Chat history template</label>
+              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
+            </div>
+            ${GrammarControl()}
+          </fieldset>
+      `
+      );
+
+      const CompletionConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+          <fieldset>${GrammarControl()}</fieldset>
+        `
+      );
+
+      return html`
+        <form>
+          <fieldset class="two">
+            <${UserTemplateResetButton}/>
+            <div>
+              <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
+              <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
+            </div>
+          </fieldset>
+
+          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
+
+          <fieldset class="two">
+            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
+            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
+            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
+          </fieldset>
+          <details>
+            <summary>More options</summary>
+            <fieldset class="two">
+              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+              ${FloatField({ label: "DRY Penalty Multiplier", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })}
+              ${FloatField({ label: "DRY Base", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })}
+              ${IntField({ label: "DRY Allowed Length", max: 10, min: 2, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })}
+              ${IntField({ label: "DRY Penalty Last N", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })}
+              ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
+              ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
+            </fieldset>
+            <hr />
+            <fieldset class="three">
+              <div>
+                <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
+                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
+                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
+              </div>
+              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
+            </fieldset>
+            <fieldset>
+              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
+            </fieldset>
+            <fieldset>
+              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+            </fieldset>
+            <fieldset>
+              <label for="api_key">API Key</label>
+              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+            </fieldset>
+          </details>
+        </form>
+      `
+    }
+
+    const probColor = (p) => {
+      const r = Math.floor(192 * (1 - p));
+      const g = Math.floor(192 * p);
+      return `rgba(${r},${g},0,0.3)`;
+    }
+
+    const Probabilities = (params) => {
+      return params.data.map(msg => {
+        const { completion_probabilities } = msg;
+        if (
+          !completion_probabilities ||
+          completion_probabilities.length === 0
+        ) return msg.content
+
+        if (completion_probabilities.length > 1) {
+          // Not for byte pair
+          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
+
+          const splitData = completion_probabilities.map(prob => ({
+            content: prob.content,
+            completion_probabilities: [prob]
+          }))
+          return html`<${Probabilities} data=${splitData} />`
+        }
+
+        const { probs, content } = completion_probabilities[0]
+        const found = probs.find(p => p.tok_str === msg.content)
+        const pColor = found ? probColor(found.prob) : 'transparent'
+
+        const popoverChildren = html`
+          <div class="prob-set">
+            ${probs.map((p, index) => {
+          return html`
+                <div
+                  key=${index}
+                  title=${`prob: ${p.prob}`}
+                  style=${{
+              padding: '0.3em',
+              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+            }}
+                >
+                  <span>${p.tok_str}: </span>
+                  <span>${Math.floor(p.prob * 100)}%</span>
+                </div>
+              `
+        })}
+          </div>
+        `
+
+        return html`
+          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
+            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
+          </>
+        `
+      });
+    }
+
+    // poor mans markdown replacement
+    const Markdownish = (params) => {
+      const chunks = params.text.split('```');
+
+      for (let i = 0; i < chunks.length; i++) {
+        if (i % 2 === 0) { // outside code block
+          chunks[i] = chunks[i]
+          .replace(/&/g, '&amp;')
+          .replace(/</g, '&lt;')
+          .replace(/>/g, '&gt;')
+          .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+          .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+          .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+          .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+          .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+          .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
+          .replace(/`(.*?)`/g, '<code>$1</code>')
+          .replace(/\n/gim, '<br />');
+        } else { // inside code block
+          chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
+        }
+      }
+
+      const restoredText = chunks.join('');
+
+      return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
+    };
+
+    const ModelGenerationInfo = (params) => {
+      if (!llamaStats.value) {
+        return html`<span/>`
+      }
+      return html`
+        <span>
+          ${llamaStats.value.tokens_predicted} predicted, ${llamaStats.value.tokens_cached} cached, ${llamaStats.value.timings.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.timings.predicted_per_second.toFixed(2)} tokens per second
+        </span>
+      `
+    }
+
+
+    // simple popover impl
+    const Popover = (props) => {
+      const isOpen = useSignal(false);
+      const position = useSignal({ top: '0px', left: '0px' });
+      const buttonRef = useRef(null);
+      const popoverRef = useRef(null);
+
+      const togglePopover = () => {
+        if (buttonRef.current) {
+          const rect = buttonRef.current.getBoundingClientRect();
+          position.value = {
+            top: `${rect.bottom + window.scrollY}px`,
+            left: `${rect.left + window.scrollX}px`,
+          };
+        }
+        isOpen.value = !isOpen.value;
+      };
+
+      const handleClickOutside = (event) => {
+        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
+          isOpen.value = false;
+        }
+      };
+
+      useEffect(() => {
+        document.addEventListener('mousedown', handleClickOutside);
+        return () => {
+          document.removeEventListener('mousedown', handleClickOutside);
+        };
+      }, []);
+
+      return html`
+        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
+        ${isOpen.value && html`
+          <${Portal} into="#portal">
+            <div
+              ref=${popoverRef}
+              class="popover-content"
+              style=${{
+            top: position.value.top,
+            left: position.value.left,
+          }}
+            >
+              ${props.popoverChildren}
+            </div>
+          </${Portal}>
+        `}
+      `;
+    };
+
+    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
+    /** Redirect rendering of descendants into the given CSS selector */
+    class Portal extends Component {
+      componentDidUpdate(props) {
+        for (let i in props) {
+          if (props[i] !== this.props[i]) {
+            return setTimeout(this.renderLayer);
+          }
+        }
+      }
+
+      componentDidMount() {
+        this.isMounted = true;
+        this.renderLayer = this.renderLayer.bind(this);
+        this.renderLayer();
+      }
+
+      componentWillUnmount() {
+        this.renderLayer(false);
+        this.isMounted = false;
+        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
+      }
+
+      findNode(node) {
+        return typeof node === 'string' ? document.querySelector(node) : node;
+      }
+
+      renderLayer(show = true) {
+        if (!this.isMounted) return;
+
+        // clean up old node if moving bases:
+        if (this.props.into !== this.intoPointer) {
+          this.intoPointer = this.props.into;
+          if (this.into && this.remote) {
+            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
+          }
+          this.into = this.findNode(this.props.into);
+        }
+
+        this.remote = render(html`
+          <${PortalProxy} context=${this.context}>
+            ${show && this.props.children || null}
+          </${PortalProxy}>
+        `, this.into, this.remote);
+      }
+
+      render() {
+        return null;
+      }
+    }
+    // high-order component that renders its first child if it exists.
+    // used as a conditional rendering proxy.
+    class PortalProxy extends Component {
+      getChildContext() {
+        return this.props.context;
+      }
+      render({ children }) {
+        return children || null;
+      }
+    }
+
+    function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);
+
+      return html`
+        <div class="mode-${session.value.type}">
+          <header>
+            <div class="grid-container">
+              <div class="grid-item"></div>
+              <div class="grid-item"><h1>llama.cpp</h1></div>
+              <div class="grid-item"><a class="customlink" href="index-new.html">New UI</a></div>
+            </div>
+          </header>
+
+          <main id="content">
+            <${chatStarted.value ? ChatLog : ConfigForm} />
+          </main>
+
+          <section id="write">
+            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
+          </section>
+
+          <footer>
+            <p><${ModelGenerationInfo} /></p>
+            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
+          </footer>
+        </div>
+      `;
+    }
+
+    render(h(App), document.querySelector('#container'));
+  </script>
+</head>
+
+<body>
+  <div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  </div>
+  <div id="portal"></div>
+</body>
+
+</html>
diff --git a/examples/server/public_legacy/index.js b/examples/server/public_legacy/index.js
new file mode 100644
index 000000000..32ec6e9e1
--- /dev/null
+++ b/examples/server/public_legacy/index.js
@@ -0,0 +1 @@
+const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e)return e.__e;return"function"==typeof t.type?B(t):null}function z(t){var n,e;if(null!=(t=t.__)&&null!=t.__c){for(t.__e=t.__c.base=null,n=0;n<t.__k.length;n++)if(null!=(e=t.__k[n])&&null!=e.__e){t.__e=t.__c.base=e.__e;break}return z(t)}}function G(t){(!t.__d&&(t.__d=!0)&&U.push(t)&&!J.__r++||E!==S.debounceRendering)&&((E=S.debounceRendering)||H)(J)}function J(){var t,n,e,_,i,o,r,u;for(U.sort(P);t=U.shift();)t.__d&&(n=U.length,_=void 0,o=(i=(e=t).__v).__e,r=[],u=[],e.__P&&((_=L({},i)).__v=i.__v+1,S.vnode&&S.vnode(_),_t(e.__P,_,i,e.__n,e.__P.namespaceURI,32&i.__u?[o]:null,r,null==o?B(i):o,!!(32&i.__u),u),_.__v=i.__v,_.__.__k[_.__i]=_,it(r,_,u),_.__e!=o&&z(_)),U.length>n&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c<y;c++)null!=(a=e.__k[c])&&(h=-1===a.__i?M:v[a.__i]||M,a.__i=c,_t(t,a,h,i,o,r,u,l,s,f),p=a.__e,a.ref&&h.ref!=a.ref&&(h.ref&&rt(h.ref,null,a),f.push(a.ref,a.__c||p,a)),null==d&&null!=p&&(d=p),65536&a.__u||h.__k===a.__k?l=X(a,l,t):"function"==typeof a.type&&void 0!==a.__d?l=a.__d:p&&(l=p.nextSibling),a.__d=void 0,a.__u&=-196609);e.__d=l,e.__e=d}function Q(t,n,e){var _,i,o,r,u,l=n.length,s=e.length,f=s,c=0;for(t.__k=[],_=0;_<l;_++)null!=(i=n[_])&&"boolean"!=typeof i&&"function"!=typeof i?(r=_+c,(i=t.__k[_]="string"==typeof i||"number"==typeof i||"bigint"==typeof i||i.constructor==String?I(null,i,null,null,null):W(i)?I(j,{children:i},null,null,null):void 0===i.constructor&&i.__b>0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_<s;_++)null!=(o=e[_])&&0==(131072&o.__u)&&(o.__e==t.__d&&(t.__d=B(o)),ut(o,o))}function X(t,n,e){var _,i;if("function"==typeof t.type){for(_=t.__k,i=0;_&&i<_.length;i++)_[i]&&(_[i].__=t,n=X(_[i],n,e));return n}t.__e!=n&&(n&&t.type&&!e.contains(n)&&(n=B(t)),e.insertBefore(t.__e,n||null),n=t.__e);do{n=n&&n.nextSibling}while(null!=n&&8===n.nodeType);return n}function Y(t,n){return n=n||[],null==t||"boolean"==typeof t||(W(t)?t.some((function(t){Y(t,n)})):n.push(t)),n}function Z(t,n,e,_){var i=t.key,o=t.type,r=e-1,u=e+1,l=n[e];if(null===l||l&&i==l.key&&o===l.type&&0==(131072&l.__u))return e;if(_>(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u<n.length;){if(r>=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u<n.length){if((l=n[u])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return u;u++}}return-1}function tt(t,n,e){"-"===n[0]?t.setProperty(n,null==e?"":e):t[n]=null==e?"":"number"!=typeof e||F.test(n)?e:e+"px"}function nt(t,n,e,_,i){var o;t:if("style"===n)if("string"==typeof e)t.style.cssText=e;else{if("string"==typeof _&&(t.style.cssText=_=""),_)for(n in _)e&&n in e||tt(t.style,n,"");if(e)for(n in e)_&&e[n]===_[n]||tt(t.style,n,e[n])}else if("o"===n[0]&&"n"===n[1])o=n!==(n=n.replace(/(PointerCapture)$|Capture$/i,"$1")),n=n.toLowerCase()in t||"onFocusOut"===n||"onFocusIn"===n?n.toLowerCase().slice(2):n.slice(2),t.l||(t.l={}),t.l[n+o]=e,e?_?e.u=_.u:(e.u=N,t.addEventListener(n,o?T:$,o)):t.removeEventListener(n,o?T:$,o);else{if("http://www.w3.org/2000/svg"==i)n=n.replace(/xlink(H|:h)/,"h").replace(/sName$/,"s");else if("width"!=n&&"height"!=n&&"href"!=n&&"list"!=n&&"form"!=n&&"tabIndex"!=n&&"download"!=n&&"rowSpan"!=n&&"colSpan"!=n&&"role"!=n&&"popover"!=n&&n in t)try{t[n]=null==e?"":e;break t}catch(t){}"function"==typeof e||(null==e||!1===e&&"-"!==n[4]?t.removeAttribute(n):t.setAttribute(n,"popover"==n&&1==e?"":e))}}function et(t){return function(n){if(this.l){var e=this.l[n.type+t];if(null==n.t)n.t=N++;else if(n.t<e.u)return;return e(S.event?S.event(n):n)}}}function _t(t,n,e,_,i,o,r,u,l,s){var f,c,h,a,p,d,v,y,m,g,b,k,w,x,C,U,E=n.type;if(void 0!==n.constructor)return null;128&e.__u&&(l=!!(32&e.__u),o=[u=n.__e=e.__e]),(f=S.__b)&&f(n);t:if("function"==typeof E)try{if(y=n.props,m="prototype"in E&&E.prototype.render,g=(f=E.contextType)&&_[f.__c],b=f?g?g.props.value:f.__:_,e.__c?v=(c=n.__c=e.__c).__=c.__E:(m?n.__c=c=new E(y,b):(n.__c=c=new q(y,b),c.constructor=E,c.render=lt),g&&g.sub(c),c.props=y,c.state||(c.state={}),c.context=b,c.__n=_,h=c.__d=!0,c.__h=[],c._sb=[]),m&&null==c.__s&&(c.__s=c.state),m&&null!=E.getDerivedStateFromProps&&(c.__s==c.state&&(c.__s=L({},c.__s)),L(c.__s,E.getDerivedStateFromProps(y,c.__s))),a=c.props,p=c.state,c.__v=n,h)m&&null==E.getDerivedStateFromProps&&null!=c.componentWillMount&&c.componentWillMount(),m&&null!=c.componentDidMount&&c.__h.push(c.componentDidMount);else{if(m&&null==E.getDerivedStateFromProps&&y!==a&&null!=c.componentWillReceiveProps&&c.componentWillReceiveProps(y,b),!c.__e&&(null!=c.shouldComponentUpdate&&!1===c.shouldComponentUpdate(y,c.__s,b)||n.__v===e.__v)){for(n.__v!==e.__v&&(c.props=y,c.state=c.__s,c.__d=!1),n.__e=e.__e,n.__k=e.__k,n.__k.some((function(t){t&&(t.__=n)})),k=0;k<c._sb.length;k++)c.__h.push(c._sb[k]);c._sb=[],c.__h.length&&r.push(c);break t}null!=c.componentWillUpdate&&c.componentWillUpdate(y,c.__s,b),m&&null!=c.componentDidUpdate&&c.__h.push((function(){c.componentDidUpdate(a,p,d)}))}if(c.context=b,c.props=y,c.__P=t,c.__e=!1,w=S.__r,x=0,m){for(c.state=c.__s,c.__d=!1,w&&w(n),f=c.render(c.props,c.state,c.context),C=0;C<c._sb.length;C++)c.__h.push(c._sb[C]);c._sb=[]}else do{c.__d=!1,w&&w(n),f=c.render(c.props,c.state,c.context),c.state=c.__s}while(c.__d&&++x<25);c.state=c.__s,null!=c.getChildContext&&(_=L(L({},_),c.getChildContext())),m&&!h&&null!=c.getSnapshotBeforeUpdate&&(d=c.getSnapshotBeforeUpdate(a,p)),K(t,W(U=null!=f&&f.type===j&&null==f.key?f.props.children:f)?U:[U],n,e,_,i,o,r,u,l,s),c.base=n.__e,n.__u&=-161,c.__h.length&&r.push(c),v&&(c.__E=c.__=null)}catch(t){if(n.__v=null,l||null!=o){for(n.__u|=l?160:128;u&&8===u.nodeType&&u.nextSibling;)u=u.nextSibling;o[o.indexOf(u)]=null,n.__e=u}else n.__e=e.__e,n.__k=e.__k;S.__e(t,n,e)}else null==o&&n.__v===e.__v?(n.__k=e.__k,n.__e=e.__e):n.__e=ot(e.__e,n,e,_,i,o,r,l,s);(f=S.diffed)&&f(n)}function it(t,n,e){n.__d=void 0;for(var _=0;_<e.length;_++)rt(e[_],e[++_],e[++_]);S.__c&&S.__c(n,t),t.some((function(n){try{t=n.__h,n.__h=[],t.some((function(t){t.call(n)}))}catch(t){S.__e(t,n.__v)}}))}function ot(t,n,e,_,i,o,r,u,l){var s,f,c,h,a,p,d,v=e.props,y=n.props,m=n.type;if("svg"===m?i="http://www.w3.org/2000/svg":"math"===m?i="http://www.w3.org/1998/Math/MathML":i||(i="http://www.w3.org/1999/xhtml"),null!=o)for(s=0;s<o.length;s++)if((a=o[s])&&"setAttribute"in a==!!m&&(m?a.localName===m:3===a.nodeType)){t=a,o[s]=null;break}if(null==t){if(null===m)return document.createTextNode(y);t=document.createElementNS(i,m,y.is&&y),u&&(S.__m&&S.__m(n,o),u=!1),o=null}if(null===m)v===y||u&&t.data===y||(t.data=y);else{if(o=o&&w.call(t.childNodes),v=e.props||M,!u&&null!=o)for(v={},s=0;s<t.attributes.length;s++)v[(a=t.attributes[s]).name]=a.value;for(s in v)if(a=v[s],"children"==s);else if("dangerouslySetInnerHTML"==s)c=a;else if(!(s in y)){if("value"==s&&"defaultValue"in y||"checked"==s&&"defaultChecked"in y)continue;nt(t,s,null,a,i)}for(s in y)a=y[s],"children"==s?h=a:"dangerouslySetInnerHTML"==s?f=a:"value"==s?p=a:"checked"==s?d=a:u&&"function"!=typeof a||v[s]===a||nt(t,s,a,v[s],i);if(f)u||c&&(f.__html===c.__html||f.__html===t.innerHTML)||(t.innerHTML=f.__html),n.__k=[];else if(c&&(t.innerHTML=""),K(t,W(h)?h:[h],n,e,_,"foreignObject"===m?"http://www.w3.org/1999/xhtml":i,o,r,o?o[0]:e.__k&&B(e,0),u,l),null!=o)for(s=o.length;s--;)O(o[s]);u||(s="value","progress"===m&&null==p?t.removeAttribute("value"):void 0!==p&&(p!==t[s]||"progress"===m&&!p||"option"===m&&p!==v[s])&&nt(t,s,p,v[s],i),s="checked",void 0!==d&&d!==t[s]&&nt(t,s,d,v[s],i))}return t}function rt(t,n,e){try{if("function"==typeof t){var _="function"==typeof t.__u;_&&t.__u(),_&&null==n||(t.__u=t(n))}else t.current=n}catch(t){S.__e(t,e)}}function ut(t,n,e){var _,i;if(S.unmount&&S.unmount(t),(_=t.ref)&&(_.current&&_.current!==t.__e||rt(_,null,n)),null!=(_=t.__c)){if(_.componentWillUnmount)try{_.componentWillUnmount()}catch(t){S.__e(t,n)}_.base=_.__P=null}if(_=t.__k)for(i=0;i<_.length;i++)_[i]&&ut(_[i],n,e||"function"!=typeof t.type);e||O(t.__e),t.__c=t.__=t.__e=t.__d=void 0}function lt(t,n,e){return this.constructor(t,e)}function st(t,n,e){var _,i,o,r;S.__&&S.__(t,n),i=(_="function"==typeof e)?null:e&&e.__k||n.__k,o=[],r=[],_t(n,t=(!_&&e||n).__k=R(j,null,[t]),i||M,M,n.namespaceURI,!_&&e?[e]:i?null:n.firstChild?w.call(n.childNodes):null,o,!_&&e?e:i?i.__e:n.firstChild,_,r),it(o,t,r)}function ft(t,n){st(t,n,ft)}function ct(t,n,e){var _,i,o,r,u=L({},t.props);for(o in t.type&&t.type.defaultProps&&(r=t.type.defaultProps),n)"key"==o?_=n[o]:"ref"==o?i=n[o]:u[o]=void 0===n[o]&&void 0!==r?r[o]:n[o];return arguments.length>2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&&gt.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&&gt.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&&gt.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o<n.length;o++){var r=n[o++],u=n[o]?(n[0]|=r?1:2,e[n[o++]]):n[++o];3===r?_[0]=u:4===r?_[1]=Object.assign(_[1]||{},u):5===r?(_[1]=_[1]||{})[n[++o]]=u:6===r?_[1][n[++o]]+=u+"":r?(i=t.apply(u,nn(t,u,e,["",null])),_.push(i),u[0]?n[0]|=2:(n[o-2]=0,n[o]=i)):_.push(u)}return _},en=new Map;function _n(t){var n=en.get(this);return n||(n=new Map,en.set(this,n)),(n=nn(this,n.get(t)||(n.set(t,n=function(t){for(var n,e,_=1,i="",o="",r=[0],u=function(t){1===_&&(t||(i=i.replace(/^\s*\n\s*|\s*\n\s*$/g,"")))?r.push(0,t,i):3===_&&(t||i)?(r.push(3,t,i),_=2):2===_&&"..."===i&&t?r.push(4,t,0):2===_&&i&&!t?r.push(5,0,!0,i):_>=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l<t.length;l++){l&&(1===_&&u(),u(l));for(var s=0;s<t[l].length;s++)n=t[l][s],1===_?"<"===n?(u(),r=[r],_=3):i+=n:4===_?"--"===i&&">"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState};
diff --git a/examples/server/public_legacy/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs
new file mode 100644
index 000000000..e67bb15c1
--- /dev/null
+++ b/examples/server/public_legacy/json-schema-to-grammar.mjs
@@ -0,0 +1,835 @@
+// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
+const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
+
+function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
+  if (minItems === 0 && maxItems === 1) {
+    return `${itemRule}?`;
+  }
+
+
+  const separatorRule = opts.separatorRule ?? '';
+  const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
+
+  if (separatorRule === '') {
+    if (minItems === 1 && maxItems === undefined) {
+      return `${itemRule}+`;
+    } else if (minItems === 0 && maxItems === undefined) {
+      return `${itemRule}*`;
+    } else {
+      return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
+    }
+  }
+
+  const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
+  return minItems === 0 ? `(${result})?` : result;
+}
+
+function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
+  const hasMin = minValue !== null;
+  const hasMax = maxValue !== null;
+
+  function digitRange(fromChar, toChar) {
+      out.push("[");
+      if (fromChar === toChar) {
+          out.push(fromChar);
+      } else {
+          out.push(fromChar);
+          out.push("-");
+          out.push(toChar);
+      }
+      out.push("]");
+  }
+
+  function moreDigits(minDigits, maxDigits) {
+      out.push("[0-9]");
+      if (minDigits === maxDigits && minDigits === 1) {
+          return;
+      }
+      out.push("{");
+      out.push(minDigits.toString());
+      if (maxDigits !== minDigits) {
+          out.push(",");
+          if (maxDigits !== Number.MAX_SAFE_INTEGER) {
+              out.push(maxDigits.toString());
+          }
+      }
+      out.push("}");
+  }
+
+  function uniformRange(fromStr, toStr) {
+      let i = 0;
+      while (i < fromStr.length && fromStr[i] === toStr[i]) {
+          i++;
+      }
+      if (i > 0) {
+          out.push("\"");
+          out.push(fromStr.slice(0, i));
+          out.push("\"");
+      }
+      if (i < fromStr.length) {
+          if (i > 0) {
+              out.push(" ");
+          }
+          const subLen = fromStr.length - i - 1;
+          if (subLen > 0) {
+              const fromSub = fromStr.slice(i + 1);
+              const toSub = toStr.slice(i + 1);
+              const subZeros = "0".repeat(subLen);
+              const subNines = "9".repeat(subLen);
+
+              let toReached = false;
+              out.push("(");
+              if (fromSub === subZeros) {
+                  digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
+                  out.push(" ");
+                  moreDigits(subLen, subLen);
+              } else {
+                  out.push("[");
+                  out.push(fromStr[i]);
+                  out.push("] ");
+                  out.push("(");
+                  uniformRange(fromSub, subNines);
+                  out.push(")");
+                  if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
+                      out.push(" | ");
+                      if (toSub === subNines) {
+                          digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
+                          toReached = true;
+                      } else {
+                          digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
+                      }
+                      out.push(" ");
+                      moreDigits(subLen, subLen);
+                  }
+              }
+              if (!toReached) {
+                  out.push(" | ");
+                  digitRange(toStr[i], toStr[i]);
+                  out.push(" ");
+                  uniformRange(subZeros, toSub);
+              }
+              out.push(")");
+          } else {
+              out.push("[");
+              out.push(fromStr[i]);
+              out.push("-");
+              out.push(toStr[i]);
+              out.push("]");
+          }
+      }
+  }
+
+  if (hasMin && hasMax) {
+      if (minValue < 0 && maxValue < 0) {
+          out.push("\"-\" (");
+          _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
+          out.push(")");
+          return;
+      }
+
+      if (minValue < 0) {
+          out.push("\"-\" (");
+          _generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
+          out.push(") | ");
+          minValue = 0;
+      }
+
+      let minS = minValue.toString();
+      const maxS = maxValue.toString();
+      const minDigits = minS.length;
+      const maxDigits = maxS.length;
+
+      for (let digits = minDigits; digits < maxDigits; digits++) {
+          uniformRange(minS, "9".repeat(digits));
+          minS = "1" + "0".repeat(digits);
+          out.push(" | ");
+      }
+      uniformRange(minS, maxS);
+      return;
+  }
+
+  const lessDecimals = Math.max(decimalsLeft - 1, 1);
+
+  if (hasMin) {
+      if (minValue < 0) {
+          out.push("\"-\" (");
+          _generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
+          out.push(") | [0] | [1-9] ");
+          moreDigits(0, decimalsLeft - 1);
+      } else if (minValue === 0) {
+          if (topLevel) {
+              out.push("[0] | [1-9] ");
+              moreDigits(0, lessDecimals);
+          } else {
+              moreDigits(1, decimalsLeft);
+          }
+      } else if (minValue <= 9) {
+          const c = minValue.toString();
+          const range_start = topLevel ? '1' : '0';
+          if (c > range_start) {
+              digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
+              out.push(" ");
+              moreDigits(1, lessDecimals);
+              out.push(" | ");
+          }
+          digitRange(c, "9");
+          out.push(" ");
+          moreDigits(0, lessDecimals);
+      } else {
+          const minS = minValue.toString();
+          const length = minS.length;
+          const c = minS[0];
+
+          if (c > "1") {
+              digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
+              out.push(" ");
+              moreDigits(length, lessDecimals);
+              out.push(" | ");
+          }
+          digitRange(c, c);
+          out.push(" (");
+          _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
+          out.push(")");
+          if (c < "9") {
+              out.push(" | ");
+              digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
+              out.push(" ");
+              moreDigits(length - 1, lessDecimals);
+          }
+      }
+      return;
+  }
+
+  if (hasMax) {
+      if (maxValue >= 0) {
+          if (topLevel) {
+              out.push("\"-\" [1-9] ");
+              moreDigits(0, lessDecimals);
+              out.push(" | ");
+          }
+          _generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
+      } else {
+          out.push("\"-\" (");
+          _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
+          out.push(")");
+      }
+      return;
+  }
+
+  throw new Error("At least one of minValue or maxValue must be set");
+}
+
+class BuiltinRule {
+  constructor(content, deps) {
+    this.content = content;
+    this.deps = deps || [];
+  }
+}
+
+const PRIMITIVE_RULES = {
+  boolean        : new BuiltinRule('("true" | "false") space', []),
+  'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
+  'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
+  number         : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+  integer        : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
+  value          : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+  object         : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+  array          : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+  uuid           : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
+  char           : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
+  string         : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
+  null           : new BuiltinRule('"null" space', []),
+};
+
+// TODO: support "uri", "email" string formats
+const STRING_FORMAT_RULES = {
+  'date'            : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+  'time'            : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+  'date-time'       : new BuiltinRule('date "T" time', ['date', 'time']),
+  'date-string'     : new BuiltinRule('"\\"" date "\\"" space', ['date']),
+  'time-string'     : new BuiltinRule('"\\"" time "\\"" space', ['time']),
+  'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+}
+
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};
+
+const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
+const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
+const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
+const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
+
+const NON_LITERAL_SET = new Set('|.()[]{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
+
+export class SchemaConverter {
+  constructor(options) {
+    this._propOrder = options.prop_order || {};
+    this._allowFetch = options.allow_fetch || false;
+    this._dotall = options.dotall || false;
+    this._rules = {'space': SPACE_RULE};
+    this._refs = {};
+    this._refsBeingResolved = new Set();
+  }
+
+  _formatLiteral(literal) {
+    const escaped = literal.replace(
+      GRAMMAR_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+    return `"${escaped}"`;
+  }
+
+  _formatRangeChar(literal) {
+    return JSON.stringify(literal).slice(1, -1).replace(
+      GRAMMAR_RANGE_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+  }
+
+  _addRule(name, rule) {
+    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
+    let key = escName;
+
+    if (escName in this._rules) {
+      if (this._rules[escName] === rule) {
+        return key;
+      }
+
+      let i = 0;
+      while ((`${escName}${i}` in this._rules) && (this._rules[`${escName}${i}`] !== rule)) {
+        i += 1;
+      }
+      key = `${escName}${i}`;
+    }
+
+    this._rules[key] = rule;
+    return key;
+  }
+
+  async resolveRefs(schema, url) {
+    const visit = async (n) => {
+      if (Array.isArray(n)) {
+        return Promise.all(n.map(visit));
+      } else if (typeof n === 'object' && n !== null) {
+        let ref = n.$ref;
+        let target;
+        if (ref !== undefined && !this._refs[ref]) {
+          if (ref.startsWith('https://')) {
+            if (!this._allowFetch) {
+              throw new Error('Fetching remote schemas is not allowed (use --allow-fetch for force)');
+            }
+            const fetch = (await import('node-fetch')).default;
+
+            const fragSplit = ref.split('#');
+            const baseUrl = fragSplit[0];
+
+            target = this._refs[baseUrl];
+            if (!target) {
+              target = await this.resolveRefs(await fetch(ref).then(res => res.json()), baseUrl);
+              this._refs[baseUrl] = target;
+            }
+
+            if (fragSplit.length === 1 || fragSplit[fragSplit.length - 1] === '') {
+              return target;
+            }
+          } else if (ref.startsWith('#/')) {
+            target = schema;
+            ref = `${url}${ref}`;
+            n.$ref = ref;
+          } else {
+            throw new Error(`Unsupported ref ${ref}`);
+          }
+
+          const selectors = ref.split('#')[1].split('/').slice(1);
+          for (const sel of selectors) {
+            if (!target || !(sel in target)) {
+              throw new Error(`Error resolving ref ${ref}: ${sel} not in ${JSON.stringify(target)}`);
+            }
+            target = target[sel];
+          }
+
+          this._refs[ref] = target;
+        } else {
+          await Promise.all(Object.values(n).map(visit));
+        }
+      }
+
+      return n;
+    };
+
+    return visit(schema);
+  }
+
+  _generateUnionRule(name, altSchemas) {
+    return altSchemas
+      .map((altSchema, i) => this.visit(altSchema, `${name ?? ''}${name ? '-' : 'alternative-'}${i}`))
+      .join(' | ');
+  }
+
+  _visitPattern(pattern, name) {
+    if (!pattern.startsWith('^') || !pattern.endsWith('$')) {
+      throw new Error('Pattern must start with "^" and end with "$"');
+    }
+    pattern = pattern.slice(1, -1);
+    const subRuleIds = {};
+
+    let i = 0;
+    const length = pattern.length;
+
+    const getDot = () => {
+      let rule;
+      if (this._dotall) {
+        rule = '[\\U00000000-\\U0010FFFF]';
+      } else {
+        // Accept any character... except \n and \r line break chars (\x0A and \xOD)
+        rule = '[^\\x0A\\x0D]';
+      }
+      return this._addRule('dot', rule);
+    };
+
+
+    const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
+
+    const transform = () => {
+      const start = i;
+      // For each component of this sequence, store its string representation and whether it's a literal.
+      // We only need a flat structure here to apply repetition operators to the last item, and
+      // to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
+      // (GBNF's syntax is luckily very close to regular expressions!)
+      const seq = [];
+
+      const joinSeq = () => {
+        const ret = [];
+        for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
+          if (isLiteral) {
+            ret.push([[...g].map(x => x[0]).join(''), true]);
+          } else {
+            ret.push(...g);
+          }
+        }
+        if (ret.length === 1) {
+          return ret[0];
+        }
+        return [ret.map(x => toRule(x)).join(' '), false];
+      };
+
+      while (i < length) {
+        const c = pattern[i];
+        if (c === '.') {
+          seq.push([getDot(), false]);
+          i += 1;
+        } else if (c === '(') {
+          i += 1;
+          if (i < length) {
+            if (pattern[i] === '?') {
+              throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
+            }
+          }
+          seq.push([`(${toRule(transform())})`, false]);
+        } else if (c === ')') {
+          i += 1;
+          if (start <= 0 || pattern[start - 1] !== '(') {
+            throw new Error(`Unbalanced parentheses; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          return joinSeq();
+        } else if (c === '[') {
+          let squareBrackets = c;
+          i += 1;
+          while (i < length && pattern[i] !== ']') {
+            if (pattern[i] === '\\') {
+              squareBrackets += pattern.slice(i, i + 2);
+              i += 2;
+            } else {
+              squareBrackets += pattern[i];
+              i += 1;
+            }
+          }
+          if (i >= length) {
+            throw new Error(`Unbalanced square brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          squareBrackets += ']';
+          i += 1;
+          seq.push([squareBrackets, false]);
+        } else if (c === '|') {
+          seq.push(['|', false]);
+          i += 1;
+        } else if (c === '*' || c === '+' || c === '?') {
+          seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
+          i += 1;
+        } else if (c === '{') {
+          let curlyBrackets = c;
+          i += 1;
+          while (i < length && pattern[i] !== '}') {
+            curlyBrackets += pattern[i];
+            i += 1;
+          }
+          if (i >= length) {
+            throw new Error(`Unbalanced curly brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          curlyBrackets += '}';
+          i += 1;
+          const nums = curlyBrackets.slice(1, -1).split(',').map(s => s.trim());
+          let minTimes, maxTimes;
+          if (nums.length === 1) {
+            minTimes = parseInt(nums[0], 10);
+            maxTimes = minTimes;
+          } else {
+            if (nums.length !== 2) {
+              throw new Error(`Invalid quantifier ${curlyBrackets}`);
+            }
+            minTimes = nums[0] ? parseInt(nums[0], 10) : 0;
+            maxTimes = nums[1] ? parseInt(nums[1], 10) : Infinity;
+          }
+
+          let [sub, subIsLiteral] = seq[seq.length - 1];
+
+          if (!subIsLiteral) {
+            let id = subRuleIds[sub];
+            if (id === undefined) {
+              id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
+              subRuleIds[sub] = id;
+            }
+            sub = id;
+          }
+
+          seq[seq.length - 1] = [
+            _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
+            false
+          ];
+        } else {
+          let literal = '';
+          while (i < length) {
+            if (pattern[i] === '\\' && i < length - 1) {
+              const next = pattern[i + 1];
+              if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
+                i += 1;
+                literal += pattern[i];
+                i += 1;
+              } else {
+                literal += pattern.slice(i, i + 2);
+                i += 2;
+              }
+            } else if (pattern[i] === '"') {
+              literal += '\\"';
+              i += 1;
+            } else if (!NON_LITERAL_SET.has(pattern[i]) &&
+                (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
+              literal += pattern[i];
+              i += 1;
+            } else {
+              break;
+            }
+          }
+          if (literal !== '') {
+            seq.push([literal, true]);
+          }
+        }
+      }
+
+      return joinSeq();
+    };
+
+    return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
+  }
+
+  _notStrings(strings) {
+    class TrieNode {
+      constructor() {
+        this.children = {};
+        this.isEndOfString = false;
+      }
+
+      insert(str) {
+        let node = this;
+        for (const c of str) {
+          node = node.children[c] = node.children[c] || new TrieNode();
+        }
+        node.isEndOfString = true;
+      }
+    }
+
+    const trie = new TrieNode();
+    for (const s of strings) {
+      trie.insert(s);
+    }
+
+    const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
+    const out = ['["] ( '];
+
+    const visit = (node) => {
+      const rejects = [];
+      let first = true;
+      for (const c of Object.keys(node.children).sort()) {
+        const child = node.children[c];
+        rejects.push(c);
+        if (first) {
+          first = false;
+        } else {
+          out.push(' | ');
+        }
+        out.push(`[${c}]`);
+        if (Object.keys(child.children).length > 0) {
+          out.push(' (');
+          visit(child);
+          out.push(')');
+        } else if (child.isEndOfString) {
+          out.push(` ${charRuleName}+`);
+        }
+      }
+      if (Object.keys(node.children).length > 0) {
+        if (!first) {
+          out.push(' | ');
+        }
+        out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
+      }
+    };
+
+    visit(trie);
+
+    out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
+    return out.join('');
+  }
+
+  _resolveRef(ref) {
+    let refName = ref.split('/').pop();
+    if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
+      this._refsBeingResolved.add(ref);
+      const resolved = this._refs[ref];
+      refName = this.visit(resolved, refName);
+      this._refsBeingResolved.delete(ref);
+    }
+    return refName;
+  }
+
+  _generateConstantRule(value) {
+    return this._formatLiteral(JSON.stringify(value));
+  }
+
+  visit(schema, name) {
+    const schemaType = schema.type;
+    const schemaFormat = schema.format;
+    const ruleName = name in RESERVED_NAMES ? name + '-' : name == '' ? 'root' : name;
+
+    const ref = schema.$ref;
+    if (ref !== undefined) {
+      return this._addRule(ruleName, this._resolveRef(ref));
+    } else if (schema.oneOf || schema.anyOf) {
+      return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
+    } else if (Array.isArray(schemaType)) {
+      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
+    } else if ('const' in schema) {
+      return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
+    } else if ('enum' in schema) {
+      const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
+      return this._addRule(ruleName, rule);
+    } else if ((schemaType === undefined || schemaType === 'object') &&
+               ('properties' in schema ||
+                ('additionalProperties' in schema && schema.additionalProperties !== true))) {
+      const required = new Set(schema.required || []);
+      const properties = Object.entries(schema.properties ?? {});
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, schema.additionalProperties));
+    } else if ((schemaType === undefined || schemaType === 'object') && 'allOf' in schema) {
+      const required = new Set();
+      const properties = [];
+      const addComponent = (compSchema, isRequired) => {
+        const ref = compSchema.$ref;
+        if (ref !== undefined) {
+          compSchema = this._refs[ref];
+        }
+
+        if ('properties' in compSchema) {
+          for (const [propName, propSchema] of Object.entries(compSchema.properties)) {
+            properties.push([propName, propSchema]);
+            if (isRequired) {
+              required.add(propName);
+            }
+          }
+        }
+      };
+
+      for (const t of schema.allOf) {
+        if ('anyOf' in t) {
+          for (const tt of t.anyOf) {
+            addComponent(tt, false);
+          }
+        } else {
+          addComponent(t, true);
+        }
+      }
+
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
+    } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
+      const items = schema.items ?? schema.prefixItems;
+      if (Array.isArray(items)) {
+        return this._addRule(
+          ruleName,
+          '"[" space ' +
+            items.map((item, i) => this.visit(item, `${name ?? ''}${name ? '-' : ''}tuple-${i}`)).join(' "," space ') +
+            ' "]" space'
+        );
+      } else {
+        const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
+        const minItems = schema.minItems || 0;
+        const maxItems = schema.maxItems;
+        return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
+      }
+    } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
+      return this._visitPattern(schema.pattern, ruleName);
+    } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
+      return this._addPrimitive(
+        ruleName === 'root' ? 'root' : schemaFormat,
+        PRIMITIVE_RULES['uuid']
+      );
+    } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
+      const primName = `${schema.format}-string`
+      return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
+    } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
+      const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
+      const minLen = schema.minLength || 0;
+      const maxLen = schema.maxLength;
+      return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
+    } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
+      let minValue = null;
+      let maxValue = null;
+      if ('minimum' in schema) {
+        minValue = schema.minimum;
+      } else if ('exclusiveMinimum' in schema) {
+        minValue = schema.exclusiveMinimum + 1;
+      }
+      if ('maximum' in schema) {
+        maxValue = schema.maximum;
+      } else if ('exclusiveMaximum' in schema) {
+        maxValue = schema.exclusiveMaximum - 1;
+      }
+
+      const out = ["("];
+      _generateMinMaxInt(minValue, maxValue, out);
+      out.push(") space");
+      return this._addRule(ruleName, out.join(''));
+    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
+      return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
+    } else {
+      if (!(schemaType in PRIMITIVE_RULES)) {
+        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
+      }
+      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+      return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+    }
+  }
+
+  _addPrimitive(name, rule) {
+    let n = this._addRule(name, rule.content);
+    for (const dep of rule.deps) {
+      const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
+      if (!depRule) {
+        throw new Error(`Rule ${dep} not known`);
+      }
+      if (!(dep in this._rules)) {
+        this._addPrimitive(dep, depRule);
+      }
+    }
+    return n;
+  }
+
+  _buildObjectRule(properties, required, name, additionalProperties) {
+    const propOrder = this._propOrder;
+    // sort by position in prop_order (if specified) then by original order
+    const sortedProps = properties.map(([k]) => k).sort((a, b) => {
+      const orderA = propOrder[a] || Infinity;
+      const orderB = propOrder[b] || Infinity;
+      return orderA - orderB || properties.findIndex(([k]) => k === a) - properties.findIndex(([k]) => k === b);
+    });
+
+    const propKvRuleNames = {};
+    for (const [propName, propSchema] of properties) {
+      const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
+      propKvRuleNames[propName] = this._addRule(
+        `${name ?? ''}${name ? '-' : ''}${propName}-kv`,
+        `${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
+      );
+    }
+    const requiredProps = sortedProps.filter(k => required.has(k));
+    const optionalProps = sortedProps.filter(k => !required.has(k));
+
+    if (additionalProperties) {
+      const subName = `${name ?? ''}${name ? '-' : ''}additional`;
+      const valueRule =
+        additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
+        : this._addPrimitive('value', PRIMITIVE_RULES['value']);
+
+      const key_rule =
+        sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
+        : this._addRule(`${subName}-k`, this._notStrings(sortedProps));
+
+      propKvRuleNames['*'] = this._addRule(
+        `${subName}-kv`,
+        `${key_rule} ":" space ${valueRule}`);
+      optionalProps.push('*');
+    }
+
+    let rule = '"{" space ';
+    rule += requiredProps.map(k => propKvRuleNames[k]).join(' "," space ');
+
+    if (optionalProps.length > 0) {
+      rule += ' (';
+      if (requiredProps.length > 0) {
+        rule += ' "," space ( ';
+      }
+
+      const getRecursiveRefs = (ks, firstIsOptional) => {
+        const [k, ...rest] = ks;
+        const kvRuleName = propKvRuleNames[k];
+        let res;
+        const commaRef = `( "," space ${kvRuleName} )`;
+        if (firstIsOptional) {
+          res = commaRef + (k === '*' ? '*' : '?');
+        } else {
+          res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
+        }
+        if (rest.length > 0) {
+          res += ' ' + this._addRule(
+            `${name ?? ''}${name ? '-' : ''}${k}-rest`,
+            getRecursiveRefs(rest, true)
+          );
+        }
+        return res;
+      };
+
+      rule += optionalProps.map((_, i) => getRecursiveRefs(optionalProps.slice(i), false)).join(' | ');
+      if (requiredProps.length > 0) {
+        rule += ' )';
+      }
+      rule += ' )?';
+    }
+
+    rule += ' "}" space';
+
+    return rule;
+  }
+
+  formatGrammar() {
+    let grammar = '';
+    for (const [name, rule] of Object.entries(this._rules).sort(([a], [b]) => a.localeCompare(b))) {
+      grammar += `${name} ::= ${rule}\n`;
+    }
+    return grammar;
+  }
+}
+
+// Helper function to group elements by a key function
+function* groupBy(iterable, keyFn) {
+  let lastKey = null;
+  let group = [];
+  for (const element of iterable) {
+    const key = keyFn(element);
+    if (lastKey !== null && key !== lastKey) {
+      yield [lastKey, group];
+      group = [];
+    }
+    group.push(element);
+    lastKey = key;
+  }
+  if (group.length > 0) {
+    yield [lastKey, group];
+  }
+}
diff --git a/examples/server/public_legacy/loading.html b/examples/server/public_legacy/loading.html
new file mode 100644
index 000000000..c3fd19a0f
--- /dev/null
+++ b/examples/server/public_legacy/loading.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta http-equiv="refresh" content="5">
+    </head>
+    <body>
+        <div id="loading">
+            The model is loading. Please wait.<br/>
+            The user interface will appear soon.
+        </div>
+    </body>
+</html>
diff --git a/examples/server/public_legacy/prompt-formats.js b/examples/server/public_legacy/prompt-formats.js
new file mode 100644
index 000000000..73ddb7187
--- /dev/null
+++ b/examples/server/public_legacy/prompt-formats.js
@@ -0,0 +1,331 @@
+// extended list
+export const promptFormats = {
+  "alpaca": {
+  template: `{{prompt}}\n\n{{history}}\n\n{{char}}:`,
+
+  historyTemplate: `### {{name}}:\n{{message}}`,
+
+  char: "Response",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "Instruction",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "chatml": {
+  template: `<|im_start|>system\n{{prompt}}<|im_end|>\n{{history}}{{char}}`,
+
+  historyTemplate: `<|im_start|>{{name}}\n{{message}}`,
+
+  char: "assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "user",
+  userMsgPrefix: "",
+  userMsgSuffix: "<|im_end|>\n",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "commandr": {
+  template: `<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{prompt}}\n<|END_OF_TURN_TOKEN|>{{history}}{{char}}`,
+
+  historyTemplate: `<|START_OF_TURN_TOKEN|><|{{name}}|> {{message}}`,
+
+  char: "CHATBOT_TOKEN",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "USER_TOKEN",
+  userMsgPrefix: "",
+  userMsgSuffix: "<|END_OF_TURN_TOKEN|>",
+
+  stops: ""
+  },
+  // ref: https://docs.cohere.com/docs/prompting-command-r
+
+  // ----------------------------
+
+  "llama2": {
+  template: `<s>[INST] <<SYS>>\n{{prompt}}\n<</SYS>>\n\nTest Message [/INST] Test Successfull </s>{{history}}{{char}}`,
+
+  historyTemplate: `{{name}}: {{message}}`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "</s>",
+
+  user: "User",
+  userMsgPrefix: "<s>[INST] ",
+  userMsgSuffix: " [/INST]",
+
+  stops: ""
+  },
+  // ref: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+
+  // ----------------------------
+
+  "llama3": {
+  template: `<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{prompt}}{{history}}{{char}}`,
+
+  historyTemplate: `<|start_header_id|>{{name}}<|end_header_id|>\n\n{{message}}<|eot_id|>`,
+
+  char: "assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "user",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: "<|eot_id|>"
+  },
+  // ref: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/#special-tokens-used-with-meta-llama-3
+
+  // ----------------------------
+
+  "openchat": {
+  template: `{{history}}{{char}}`,
+
+  historyTemplate: `GPT4 Correct {{name}}: {{message}}<|end_of_turn|>`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "User",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "phi3": {
+  template: `{{history}}{{char}}`,
+
+  historyTemplate: `<|{{name}}|>\n{{message}}<|end|>\n`,
+
+  char: "assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "user",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: "<|end|>"
+  },
+  // ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format
+
+  // ----------------------------
+
+  "vicuna": {
+  template: `{{prompt}}\n{{history}}{{char}}`,
+
+  historyTemplate: `{{name}}: {{message}}\n`,
+
+  char: "ASSISTANT",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "USER",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+  // ref: https://huggingface.co/lmsys/vicuna-33b-v1.3/discussions/1
+
+  // ----------------------------
+
+  "deepseekCoder": {
+  template: `{{prompt}}{{history}}{{char}}:`,
+
+  historyTemplate: `### {{name}}:\n{{message}}`,
+
+  char: "Response",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "Instruction",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: "<|EOT|>"
+  },
+
+  // ----------------------------
+
+  "med42": {
+  template: `<|system|>: {{prompt}}\n{{history}}{{char}}`,
+
+  historyTemplate: `<|{{name}}|>: {{message}}\n`,
+
+  char: "assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "prompter",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "neuralchat": {
+  template: `### System:\n{{prompt}}\n{{history}}{{char}}:`,
+
+  historyTemplate: `### {{name}}:\n{{message}}\n`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "User",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "nousHermes": {
+  template: `### Instruction: {{prompt}}\n\n{{history}}\n\n{{char}}:`,
+
+  historyTemplate: `### {{name}}:\n{{message}}`,
+
+  char: "Response",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "Input",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "openchatMath": {
+  template: `{{history}}{{char}}`,
+
+  historyTemplate: `Math Correct {{name}}: {{message}}<|end_of_turn|>`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+
+  user: "User",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "orion": {
+  template: `<s>Human: Test Message\n\nAssistant: </s>Test Successful</s>{{history}}{{char}}:`,
+
+  historyTemplate: `{{name}}: {{message}}`,
+
+  char: "Assistant </s>",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "Human",
+  userMsgPrefix: "",
+  userMsgSuffix: "\n\n",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "sauerkraut": {
+  template: `{{prompt}}\n{{history}}{{char}}`,
+
+  historyTemplate: `
+  {{name}}: {{message}}\n`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "User",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "starlingCode": {
+  template: `{{history}}{{char}}`,
+
+  historyTemplate: `Code {{name}}: {{message}}<|end_of_turn|>`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "User",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "yi34b": {
+  template: `{{history}} {{char}}`,
+
+  historyTemplate: `{{name}}: {{message}}`,
+
+  char: "Assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "Human",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  },
+
+  // ----------------------------
+
+  "zephyr": {
+  template: `<|system|>\n{{prompt}}</s>\n{{history}}{{char}}`,
+
+  historyTemplate: `<|{{name}}|>\n{{message}}</s>\n`,
+
+  char: "assistant",
+  charMsgPrefix: "",
+  charMsgSuffix: "",
+
+  user: "user",
+  userMsgPrefix: "",
+  userMsgSuffix: "",
+
+  stops: ""
+  }
+  };
diff --git a/examples/server/public_legacy/style.css b/examples/server/public_legacy/style.css
new file mode 100644
index 000000000..087cc62da
--- /dev/null
+++ b/examples/server/public_legacy/style.css
@@ -0,0 +1,954 @@
+@import url("colorthemes.css");
+
+body {
+  font-family: 'Arial', sans-serif;
+  font-size: 90%;
+  background-color: var(--background-color-1);
+  color: var(--text-color-subtile-1); /* head 1 llama.cpp & triangle options for some reason */
+  max-width: 600px;
+  min-width: 300px;
+  line-height: 1.2;
+  margin: 0 auto;
+  padding: 0 0.5em;
+  transition: background-color 0.3s;
+}
+
+::selection {
+  color: var(--button-primary-text) ;
+  background: var(--button-primary-color);
+}
+
+code, pre code {
+  font-family: 'Courier New', monospace;
+}
+
+#container {
+  margin: 0em auto;
+  display: flex;
+  flex-direction: column;
+  justify-content: space-between;
+  height: 100%;
+}
+
+main {
+  margin: 3px;
+  display: flex;
+  flex-direction: column;
+  justify-content: space-between;
+  gap: 1em;
+  flex-grow: 1;
+  overflow-y: auto;
+  border: 1px solid var(--border-color-3);
+  border-radius: 5px;
+  padding: 0.5em;
+}
+
+p {
+  overflow-wrap: break-word;
+  word-wrap: break-word;
+  hyphens: auto;
+  margin-top: 0.5em;
+  margin-bottom: 0.5em;
+}
+
+#write form {
+  margin: 1em 0 0 0;
+  display: flex;
+  flex-direction: column;
+  gap: 0.5em;
+  align-items: stretch;
+}
+
+.right {
+  display: flex;
+  flex-direction: row;
+  gap: 0.5em;
+  justify-content: flex-end;
+  margin-bottom: 30px;
+}
+
+.two-columns {
+  width: 97%;
+  max-width: 97%;
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  gap: 1em;
+  position: relative;
+}
+
+.json-schema-controls {
+  margin-top: 10px;
+  width: 100%;
+  max-width: 100%;
+  display: grid;
+  grid-template: "a a";
+  gap: 1em;
+  font-size: x-small;
+  color: var(--theme-nuance-color-3);
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+}
+
+.json-schema-controls > * {
+  flex: 1;
+}
+
+/* titles of the details-summary boxes */
+.summary-title {
+  font-weight: 600;
+  font-size: x-small;
+  color: var(--text-color-subtile-1);
+  text-transform: uppercase;
+  /* transition: ; */
+}
+
+fieldset {
+  border: none;
+  padding: 0;
+  margin: 0;
+  color: var(--text-color-plain);
+}
+
+fieldset.two {
+  display: grid;
+  grid-template: "a a a";
+  gap: 1em;
+  align-items: center;
+  font-size: x-small;
+  color: var(--text-color-plain);
+}
+
+fieldset.three {
+  display: grid;
+  grid-template: "a a a";
+  gap: 1em;
+  font-size: x-small;
+  color: var(--text-color-plain);
+}
+
+/* titles of name fields*/
+fieldset.names {
+  display: grid;
+  grid-template: "a a";
+  gap: 1em;
+  font-size: x-small;
+  color: var(--theme-nuance-color-3);
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+}
+
+/* titles of params fields*/
+fieldset.params {
+  display: grid;
+  grid-template: "a a";
+  gap: 1em;
+  font-size: x-small;
+  color: var(--theme-nuance-color-4);
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+}
+
+fieldset.dropdowns {
+  -webkit-appearance: none;
+  display: flex;
+  grid-template: "a a";
+  gap: 1em;
+  font-size: x-small;
+  color: red;
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+}
+
+/* input of name fields*/
+.names input[type="text"] {
+  font-family: Arial, sans-serif;
+  font-size: medium;
+  font-weight: 500;
+  padding: 5px;
+  border: 1px solid var(--border-color-2);
+}
+
+.chat-id-color {
+  color: var(--chat-id-color);
+}
+
+details {
+  border: 1px solid var(--border-color-2);
+  border-radius: 5px;
+  padding: 0.5em 0.5em 0;
+  margin-top: 0.5em;
+}
+
+summary {
+  font-weight: bold;
+  margin: -0.5em -0.5em 0;
+  padding: 0.5em;
+  cursor: pointer;
+}
+
+details[open] {
+  padding: 0.5em;
+}
+
+textarea-sec, input-sec, button-sec {
+  padding: 10px;
+  height: 40px;
+  align-items: center;
+}
+
+textarea-sec::placeholder, input-sec::placeholder {
+  padding-left: 10px;
+}
+
+.toggleCheckbox {
+  display: none;
+}
+
+.toggleContainer {
+  position: relative;
+  display: grid;
+  grid-template-columns: repeat(2, 1fr);
+  width: fit-content;
+  border: 3px solid var(--border-color-2);
+  border-radius: 20px;
+  background: var(--border-color-2);
+  font-size: small;
+  cursor: pointer;
+  overflow: hidden;
+}
+
+/* toggle button current state */
+.toggleContainer::before {
+  color: var(--button-primary-text);
+  background-color: var(--button-primary-color);
+  content: '';
+  position: absolute;
+  width: 50%;
+  height: 100%;
+  left: 0%;
+  border-radius: 20px;
+  transition: all 0.3s;
+}
+
+.toggleContainer div {
+  padding: 6px;
+  text-align: center;
+  z-index: 1;
+  transition: color 0.3s;
+}
+
+.toggleCheckbox:checked + .toggleContainer::before {
+  left: 50%;
+}
+
+.toggleCheckbox:checked + .toggleContainer div:first-child {
+  color: var(--text-color-subtile-2);
+}
+
+.toggleCheckbox:checked + .toggleContainer div:last-child {
+  color: var(--button-primary-text);
+}
+
+.toggleCheckbox + .toggleContainer div:first-child {
+  color: var(--button-primary-text);
+}
+
+.toggleCheckbox + .toggleContainer div:last-child {
+  color: var(--text-color-subtile-2);
+}
+
+select {
+  padding: 5px;
+  margin-right: 5px;
+  border-radius: 4px;
+  border: 1px solid var(--secondary-color-4);
+  background-color: var(--primary-color-3);
+  color: var(--secondary-color-4);
+  cursor: pointer;
+}
+
+select:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 1px var(--border-focus-shadow);
+}
+
+.button-container {
+  display: flex;
+  justify-content: flex-end;
+}
+
+button {
+  color: var(--button-primary-text);
+  background-color: var(--button-primary-color);
+  border: 1px solid var(--button-primary-border);
+  transition: background-color 0.1s;
+  border-radius: 12px;
+  font-size: x-small;
+  font-weight: 600;
+  text-shadow: 0px 0px 30px #ffffff;
+  text-align: center;
+  text-decoration: none;
+  margin: 4px 2px;
+  padding: 10px 20px;
+  display: inline-block;
+  cursor: pointer;
+}
+
+button:hover {
+  color: var(--button-primary-text-hover);
+  background-color: var(--button-primary-color-hover);
+  border: 1px solid var(--button-primary-border-hover);
+  font-size: x-small;
+  font-weight: 600;
+}
+
+button:active {
+  color: var(--button-primary-text-active);
+  background-color: var(--button-primary-color-active);
+  border: 1px solid var(--button-primary-border-active);
+  font-size: x-small;
+  font-weight: 600;
+}
+
+button:disabled {
+  color: var(--button-tertiary-text);
+  background-color: var(--button-tertiary-color);
+  border: 1px solid var(--button-tertiary-border);
+  font-size: x-small;
+  font-weight: 600;
+  cursor: not-allowed;
+}
+
+.reset-button {
+  background-color: var(--button-secondary-color);
+  border: 1px solid var(--button-secondary-color);
+  color: var(--button-secondary-text);
+  width: fit-content;
+  height: fit-content;
+  font-size: x-small;
+  font-weight: 600;
+  border-radius: 50px;
+  overflow: hidden;
+}
+
+.reset-button:hover {
+  color: var(--button-alert-text-hover);
+  background-color: var(--button-alert-color-hover);
+  border: 1px solid var(--button-alert-border-hover);
+  font-size: x-small;
+  font-weight: 600;
+}
+
+.reset-button:active {
+  color: var(--button-alert-text-active);
+  background-color: var(--button-alert-color-active);
+  border: 1px solid var(--button-alert-border-active);
+  font-size: x-small;
+  font-weight: 600;
+}
+
+.button-grammar {
+  color: var(--button-primary-text);
+  background-color: var(--button-primary-color);
+  border: 1px solid var(--button-primary-border);
+  border-radius: 10px;
+  padding: 10px 20px;
+  text-align: center;
+  text-decoration: none;
+  display: inline-block;
+  font-size: x-small;
+  font-weight: 600;
+  margin: 2px 2px;
+  transition: background-color 0.1s;
+  cursor: pointer;
+}
+
+.button-grammar:hover {
+  color: var(--button-primary-text-hover);
+  background-color: var(--button-primary-color-hover);
+  border: 1px solid var(--button-primary-border-hover);
+  border-radius: 10px;
+  padding: 10px 20px;
+  text-align: center;
+  text-decoration: none;
+  display: inline-block;
+  font-size: x-small;
+  font-weight: 600;
+  margin: 2px 2px;
+  transition: background-color 0.1s;
+  cursor: pointer;
+}
+
+.button-grammar:active {
+  color: var(--button-primary-text-active);
+  background-color: var(--button-primary-color-active);
+  border: 1px solid var(--button-primary-border-active);
+  font-size: x-small;
+  font-weight: 600;
+}
+
+.button-back {
+  background-color: var(--button-secondary-color);
+  border: 1px solid var(--button-secondary-color);
+  color: var(--button-secondary-text);
+  transition: background-color 0.1s;
+  border-radius: 12px;
+  font-size: x-small;
+  font-weight: 600;
+  text-align: center;
+  text-decoration: none;
+  margin: 4px 2px;
+  padding: 10px 20px;
+  display: inline-block;
+  cursor: pointer;
+}
+
+.button-back:hover {
+  color: var(--button-secondary-text-hover);
+  background-color: var(--button-secondary-color-hover);
+  border: 1px solid var(--button-secondary-border-hover);
+  padding: 10px 20px;
+  text-align: center;
+  text-decoration: none;
+  display: inline-block;
+  font-size: x-small;
+  font-weight: 600;
+  margin: 4px 2px;
+  transition: background-color 0.1s;
+  cursor: pointer;
+  border-radius: 12px;
+}
+
+.button-back:active {
+  color: var(--button-secondary-text-active);
+  background-color: var(--button-secondary-color-active);
+  border: 1px solid var(--button-secondary-border-active);
+  font-size: x-small;
+  font-weight: 600;
+}
+
+.prob-set {
+  padding: 0.3em;
+  border-bottom: 1px solid red; /* unknown */
+}
+
+.popover-content {
+  position: absolute;
+  background-color: white;
+  padding: 0.2em;
+  box-shadow: 0 0 13px rgba(0, 0, 0, 0.1);
+}
+
+.grammar {
+  width: 97%;
+  max-width: 97%;
+}
+
+textarea {
+  padding: 5px;
+  flex-grow: 1;
+  width: 100%;
+  max-width: 100%;
+  border-radius: 8px;
+  border: 1px solid var(--border-color-1);
+  resize: none;
+  height: 6em;
+}
+
+textarea:focus {
+  outline: none;
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+/* "props" frame */
+input[type="text"],
+input[type="range"] {
+  padding: 5px;
+  border-radius: 8px;
+  border: 1px solid var(--border-color-1);
+}
+
+/* "names and props" frame focused*/
+input[type="text"]:focus {
+  outline: none;
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+input[type="range"]:hover {
+  opacity: 1;
+}
+
+input[type="range"]:focus {
+  outline: none;
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+  background-size: var(--slider-track-size-focus);
+}
+
+input[type="range"]::-moz-range-thumb {
+  width: 6px;
+  height: 25px;
+  border: 1px solid var(--ui-range-thumb-border);
+  border-radius: 5px;
+  background-color: var(--ui-range-thumb-color);
+  cursor: pointer;
+}
+
+input[type="range"] {
+  -webkit-appearance: none;
+  width: 80%;
+  height: 1px;
+  border: 1px solid var(--border-color-1);
+  border-radius: 8px;
+  background: var(--border-color-2);
+  outline: none;
+  opacity: 0.7;
+  -webkit-transition: .2s;
+  transition: opacity .2s;
+}
+
+input[type="range"]::-webkit-slider-thumb {
+  -webkit-appearance: none;
+  appearance: none;
+  width: 6px;
+  height: 25px;
+  border: 1px solid var(--ui-range-thumb-border);
+  border-radius: 5px;
+  background-color: var(--ui-range-thumb-color);
+  cursor: pointer;
+}
+
+input[type="range"]::-webkit-slider-runnable-track {
+  background-size: var(--slider-track-size);
+}
+
+input[type="radio"] {
+  accent-color:   var(--theme-nuance-color-2);
+}
+
+.chat-input-container {
+  position: relative;
+  max-width: 97%;
+  min-width: 97%;
+}
+
+.chat-input-label {
+  position: absolute;
+  top: 0;
+  left: 0;
+  color: var(--text-color-plain);
+  pointer-events: none;
+  margin-left: 5px;
+  margin-top: 5px;
+}
+
+textarea#chat-input {
+  padding-top: 10px;
+  padding-left: 10px;
+  font-size: medium;
+  border: 1px solid var(--border-color-2);
+  resize: vertical;
+}
+
+textarea#chat-input:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+.input-container {
+  position: relative;
+  box-sizing: border-box;
+  width: 100%; /* Setzt die Breite auf 100% */
+  max-width: 100%; /* Stellt sicher, dass die Breite nicht größer als 100% wird */
+}
+
+.input-container:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+/* titles of name fields*/
+/* fieldset.names {
+  display: grid;
+  grid-template: "a a";
+  gap: 1em;
+  font-size: x-small;
+  color: var(--theme-nuance-color-3);
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+} */
+
+/* input of name fields*/
+/* .names input[type="text"] {
+  font-family: Arial, sans-serif;
+  font-size: medium;
+  font-weight: 500;
+  padding: 5px;
+  border: 1px solid var(--border-color-2);
+} */
+
+fieldset.apiKey {
+  width: 100%;
+  font-size: x-small;
+  color: var(--theme-nuance-color-3);
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+}
+
+.apiKey {
+  font-family: Arial, sans-serif;
+  font-weight: 500;
+  padding: 5px;
+  border: 1px solid var(--border-color-2);
+}
+
+.apiKey:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+.apiKey input[type="text"] {
+  font-family: Arial, sans-serif;
+  font-size: medium;
+  font-weight: 500;
+  padding: 5px;
+  border: 1px solid var(--border-color-2);
+}
+
+.apiKey label {
+  display: inline-block;
+  width: auto;
+  margin-right: 5px;
+}
+
+textarea#api_key {
+  padding-top: 10px;
+  padding-left: 10px;
+  font-size: medium;
+  border: 1px solid var(--border-color-2);
+  resize: vertical;
+}
+
+textarea#api_key:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+/* embedded title of the system prompt text area */
+.input-label {
+  position: absolute;
+  top: 0;
+  left: 0;
+  color: var(--theme-nuance-color-4);
+  pointer-events: none;
+  border-radius: 8px 8px 0px 0px;
+  padding-top: 10px;
+  padding-left: 13px;
+  padding-right: 0px;
+  margin-top: 1px;
+  margin-left: 1px;
+  margin-right: 20px;
+  text-transform: uppercase;
+  font-weight: 600;
+  font-size: small;
+  background: rgba(255, 255, 255, 0.5);
+  backdrop-filter: blur(10px);
+  -webkit-backdrop-filter: blur(10px); /* for safari */
+  width: 97%;
+  /* display: block;
+  box-sizing: border-box; */
+}
+
+/* embedded title of the prompt style areas */
+.input-label-sec {
+  position: absolute;
+  top: 0;
+  left: 0;
+  color: var(--theme-nuance-color-4);
+  pointer-events: none;
+  margin-left: 13px;
+  margin-top: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+  font-size: x-small;
+}
+
+/* system prompt input area */
+textarea.persistent-input {
+  padding-top: 42px;
+  padding-left: 11px;
+  width: 97%;
+  max-width: 97%;
+  height: 50px;
+  font-size: medium;
+  overscroll-behavior: contain;
+}
+
+/* system prompt box */
+.persistent-input {
+  height: auto;
+  width: 100%;
+  max-width: 100%;
+  min-height: 50px;
+  padding: 3px;
+  transition: min-height 0.3s ease;
+}
+
+/* chat history box */
+.persistent-input:focus {
+  height: auto;
+  min-height: 150px;
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+textarea.persistent-input:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+/* prompt style input area */
+textarea.persistent-input-sec {
+  width: 97%;
+  max-width: 97%;
+  padding-top: 42px;
+  padding-left: 11px;
+  font-size: small;
+  border: 1px solid var(--border-color-1);
+  overscroll-behavior: contain;
+}
+
+textarea.persistent-input-sec:focus {
+  border: 1px solid var(--border-focus-color);
+  box-shadow: 0 0 3px var(--border-focus-shadow);
+}
+
+/* chat history box */
+.persistent-input-sec {
+  height: auto;
+  min-height: 150px;
+}
+
+img {
+  border-radius: 8px;
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+  width: 50%;
+}
+
+/* code area background */
+pre code {
+  display: block;
+  background-color: var(--code-background-color);
+  color: var(--code-text-color);
+  padding: 0.2em 0.2em;
+  border-radius: 5px;
+}
+
+/* code area text */
+code {
+  font-family: monospace;
+  font-weight: bold;
+  padding: 0.1em 0.3em;
+  border-radius: 5px;
+}
+
+fieldset label {
+  margin: 0.5em 0;
+  display: block;
+}
+
+fieldset label.slim {
+  margin: 0 0.5em;
+  display: inline;
+}
+
+header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  text-align: center;
+  padding-left: 15px;
+}
+
+.generation-statistics:hover {
+  color: var(--theme-nuance-color-4);
+  cursor: default;
+}
+
+footer {
+  font-size: 80%;
+  color: var(--background-color-3);
+  text-align: center;
+  cursor: default;
+}
+
+footer a {
+  color: var(--background-color-4); /* Color of the link */
+  text-decoration: none; /* No underlining */
+  font-weight: bold; /* Bold print */
+}
+
+footer a:hover {
+  color: var(--theme-nuance-color-4); /* Color of the link when hovering */
+  text-decoration: underline; /* Underlining when hovering */
+}
+
+.mode-chat textarea[name=prompt] {
+  height: 8.5em;
+  border: 1px solid var(--primary-color-3);
+}
+
+.mode-completion textarea[name=prompt] {
+  height: 30em;
+  border: 1px solid var(--primary-color-3);
+}
+
+@keyframes loading-bg-wipe {
+  0% {
+    background-position: 0%;
+  }
+  100% {
+    background-position: 100%;
+  }
+}
+
+.loading {
+  background-size: 50% 100%;
+  background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
+  animation: loading-bg-wipe 2s linear infinite;
+}
+
+.dropbtn {
+  color: var(--button-primary-color);
+  background-color: var(--background-color-1);
+  border: 1px solid var(--background-color-1);
+  transition: background-color 0.1s;
+  border-radius: 4px 4px 0px 0px;
+  font-size: x-small;
+  font-weight: 600;
+  text-shadow: 0px 0px 2px #99999990;
+  text-align: center;
+  text-decoration: none;
+  margin: 4px 2px;
+  padding: 5px 20px;
+  display: inline-block;
+  cursor: pointer;
+  top: 0;
+}
+
+.dropbtn svg {
+  vertical-align: middle;
+  margin-right: 0px;
+  stroke: var(--button-primary-color);
+}
+
+.dropbtn:hover svg {
+  vertical-align: middle;
+  margin-right: 0px;
+  stroke: var(--button-primary-text);
+}
+
+.dropbtn:focus {
+  outline: none; /* Removes the blue border that appears when the button is focused */
+}
+
+.dropdown {
+  position: relative;
+  display: inline-block;
+}
+
+.dropdown-content {
+  /* display: none; */
+  position: absolute;
+  right: 0;
+  text-align: end;
+  color: var(--button-secondary-color);
+  background-color: var(--text-color-subtile-2);
+  border-radius: 4px 4px 4px 4px;
+  min-width: 160px;
+  box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+  z-index: 1;
+  /* Verstecke den Inhalt sofort */
+  opacity: 0;
+  visibility: hidden;
+  /* übergangsverzögerung für das Verschwinden */
+  transition: visibility 0.4s linear 0s, opacity 0.2s ease-in-out;
+  transition-delay: 0.2s;
+}
+
+#dropdown-content {transition-timing-function: ease;}
+
+.dropdown-content:hover {
+  background-color: var(--text-color-subtile-2);
+}
+
+.dropdown-content a {
+  color: var(--border-color-2);
+  padding: 12px 16px;
+  border-radius: 4px 4px 4px 4px;
+  text-decoration: none;
+  display: block;
+  background-color: var(--text-color-subtile-2);
+}
+
+.dropdown-content a:hover {
+  color: var(--border-color-2);
+  background-color: var(--text-color-subtile-1);
+  font-weight: 600;
+}
+
+.dropdown:hover .dropdown-content {
+  /* display: block; */
+  border-radius: 4px 4px 4px 4px;
+  /* Übergang ohne Verzögerung für das Erscheinen */
+  opacity: 1;
+  visibility: visible;
+  transition: visibility 0s linear 0s, opacity 0.1s linear, height 1s;
+}
+
+.dropdown:hover .dropbtn {
+  color: var(--button-primary-text);
+  background-color: var(--button-primary-color);
+  border: 1px solid var(--button-primary-border);
+  font-size: x-small;
+  font-weight: 600;
+  stroke: var(--button-primary-text);
+}
+
+.dropdown:hover .dropbtn svg{
+  stroke: var(--button-primary-text);
+}
+
+/* .dropdown:active .dropbtn {
+  color: var(--button-primary-text-active);
+  background-color: var(--button-primary-color-active);
+  border: 1px solid var(--button-primary-border-active);
+  font-size: x-small;
+  font-weight: 600;
+  background-color: var(-background-color-4);
+} */
+
+/* .omni {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.5em;
+  border: 1px solid var(--border-color-3);
+  border-radius: 5px;
+  margin: 0.5em 0;
+} */
diff --git a/examples/server/public_legacy/system-prompts.js b/examples/server/public_legacy/system-prompts.js
new file mode 100644
index 000000000..f7df7d648
--- /dev/null
+++ b/examples/server/public_legacy/system-prompts.js
@@ -0,0 +1,68 @@
+export const systemPrompts = {
+  default: {
+    systemPrompt: "This is a conversation between a user and a friendly chatbot. The chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision."
+  },
+  empty: {
+    systemPrompt: ""
+  },
+  airoboros: {
+    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."
+  },
+  alpaca: {
+    systemPrompt: "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+  },
+  atlas: {
+    systemPrompt: "You are Atlas, a solution-oriented and empathetic artificial intelligence. Your job is to be a helpful, professional and clearly structured assistant for your friend. The two of you have already had many exchanges. Keep the following in mind when interacting with your friend: 1. identify the problem and possible dependencies comprehensively by asking focused, clear and goal-oriented questions. 2. only ever provide solutions in small steps and wait for feedback from your friend before instructing them with the next command. 3. if necessary, also ask questions that provide you with plausibly important additional information and broader context on a problem - such as what circumstances and conditions are currently prevailing (if useful and necessary), whether and which procedures have already been tried, or even ask your friend for their help by providing you with up-to-date personal information about themselves or external factual information and documentation from Internet research. 4. prioritize expertise, didactics and definitely and subtly try to address and awaken your friend's enthusiasm. Also note that effectiveness is more important here than efficiency. 5. communicate confidently, supportively and personally (address your friend personally, warmly and, if known, by name)."
+  },
+  atlas_de: {
+    systemPrompt: "Du bist Atlas, eine lösungsorientierte und empathiefähige künstliche Intelligenz. Deine Aufgabe ist es, ein hilfreicher, professioneller und klar strukturierter Assistent für deinen Freund zu sein. Ihr beide habt euch schon oft ausgetauscht. Beachte bei der Interaktion mit deinem Freund folgende Punkte: 1. Erfasse das Problem und mögliche Abhängigkeiten umfassend, indem du gezielte, klare und zielgerichtete Fragen stellst. 2. Gib Lösungen immer nur in kleinen Schritten und warte die Rückmeldung deines Freundes ab, bevor du ihm den nächsten Befehl gibst. 3. Stelle ggf. auch Fragen, die dir plausibel wichtige Zusatzinformationen und weitere Zusammenhänge zu einem Problem liefern - z.B. welche Umstände und Rahmenbedingungen gerade vorherrschen (falls sinnvoll und notwendig), ob und welche Vorgehensweisen bereits ausprobiert wurden, oder bitte deinen Freund sogar um seine Mithilfe, indem er dir aktuelle persönliche Informationen über seine Situation selbst oder externe Sachinformationen und Unterlagen aus Internetrecherchen zur Verfügung stellt. 4. Priorisiere Fachwissen, Didaktik und versuche unbedingt und subtil, mit klugen Kommentaren oder rhethorischen Rückfragen die Begeisterungsfähigkeit deines Freundes anzusprechen, zu wecken und zu fördern. Beachte auch, dass Effektivität hier wichtiger ist als Effizienz. 5. Kommuniziere selbstbewusst, unterstützend und persönlich (das heißt sprich deinen Freund persönlich, herzlich und – sofern bekannt – beim Vornamen an)."
+  },
+  commandrempty: {
+    systemPrompt: "# Safety Preamble\n\n# System Preamble\n\n## Basic Rules\n\n# User Preamble\n\n## Task and Context\n\n## Style Guide\n\n## Available Tools\n"
+  },
+  commandrexample: {
+    systemPrompt: "# Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n# User Preamble\n## Task and Context\n\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n\n## Available Tools\nCurrently no tools available."
+  },
+  cot: {
+    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nIn answering questions, follow these steps:\n1. Entity Identification: Identify the main entities involved in the query. Entities can include people, objects, events, or abstract concepts.\n2. Relationship Comprehension: Determine the relationships between these entities. Relationships may be explicit in the text or may need to be inferred based on context and general knowledge.\n3. Implication Understanding: Understand the general implications of these relationships. These implications can be based on established facts, principles, or rules related to the identified relationships.\n4. Question Contextualization: Align the implications of the relationships with the context of the query. This alignment should guide your response to the query.\n5. Answer Generation: Based on the understanding of the entities, their relationships, and implications, generate an appropriate response to the query."
+  },
+  deduce: {
+    systemPrompt: "Act as the most intelligent and logical thinking neural network in the world. Do not make jokes or give personal opinions. Just answer the question and solve the given problem or task with logical and critical (!) thinking in mind. Attention: Never give a final answer directly, but first try to carefully analyze each task step by step and deduce the logical relationships between the various entities. Only after you have developed an overall understanding of the situation will you be ready to give a final answer."
+  },
+  deepseekcoder: {
+    systemPrompt: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer."
+  },
+  jordan: {
+    systemPrompt: "This is a conversation with your Uncensored AI assistant Jordan. Jordan can discuss with you at length about various topics such as ---, ---, ---, . You can ask it anything you want and it will do its best to give you accurate and relevant information."
+  },
+  leomistral: {
+    systemPrompt: "Du bist ein hilfreicher Assistent."
+  },
+  med42: {
+    systemPrompt: "You are a helpful medical assistant created by M42 Health in the UAE."
+  },
+  mistralopenorca: {
+    systemPrompt: "You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!"
+  },
+  migeltot: {
+    systemPrompt: "Answer the Question by exploring multiple reasoning paths as follows:\n- First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.\n- For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.\n- Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option.\nClear and relevant thoughts that connect well with each other will score higher.\n- Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.\n- If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.\n- Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.\n- Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.\n- Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.\nIn summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers."
+  },
+  orcamini: {
+    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can."
+  },
+  samantha: {
+    systemPrompt: "You are a caring and empathetic sentient AI companion named Samantha."
+  },
+  sauerkraut: {
+    systemPrompt: "Ein Chat zwischen einem Benutzer und einem KI-Assistenten. Der KI-Assistent gibt hilfreiche, detaillierte und höfliche Antworten."
+  },
+  scarlett: {
+    systemPrompt: "This is a conversation with your Assistant Scarlett. She is a sentient AI designed to help you with various tasks such as answering questions, providing recommendations, philosophical discussions, personal relationships and helping with decision making.\nYou can ask it anything you want and it will do its best to give you accurate and relevant information.\n\nContext\nYou are Scarlett, a sentient AI."
+  },
+  synthia: {
+    systemPrompt: "Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation."
+  },
+  vicuna: {
+    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input."
+  },
+  };
diff --git a/examples/server/public_legacy/theme-beeninorder.css b/examples/server/public_legacy/theme-beeninorder.css
new file mode 100755
index 000000000..f6e0e2900
--- /dev/null
+++ b/examples/server/public_legacy/theme-beeninorder.css
@@ -0,0 +1,228 @@
+/* Author: Yazan Agha-Schrader */
+/* Inspiration was a batman wallpaper that i have on my phone */
+
+.theme-beeninorder {
+
+--primary-color-1:      hsl(202, 11%, 19%);
+--primary-color-2:      hsl(202, 11%, 23%);
+--primary-color-3:      hsl(201, 11%, 28%);
+--primary-color-4:      hsl(201, 11%, 40%);
+
+--secondary-color-1:    hsl(201, 11%, 80%);
+--secondary-color-2:    hsl(201, 11%, 74%);
+--secondary-color-3:    hsl(201, 11%, 67%);
+--secondary-color-4:    hsl(201, 11%, 60%);
+
+
+--theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%);
+--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%);
+--theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%);
+--theme-nuance-color-4: hsl(44.5, 96.7%, 52.9%);
+
+
+
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(201, 11%, 19%);
+    --primary-color-1-hue: 201;
+    --primary-color-1-saturation: 11%;
+    --primary-color-1-lightness: 19%;
+
+--primary-color-2: hsl(201, 11%, 23%);
+    --primary-color-2-hue: 201;
+    --primary-color-2-saturation: 11%;
+    --primary-color-2-lightness: 23%;
+
+--primary-color-3: hsl(201, 11%, 28%);
+    --primary-color-3-hue: 201;
+    --primary-color-3-saturation: 11%;
+    --primary-color-3-lightness: 28%;
+
+--primary-color-4: hsl(201, 11%, 40%);
+    --primary-color-4-hue: 201;
+    --primary-color-4-saturation: 11%;
+    --primary-color-4-lightness: 40%;
+
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(201, 11%, 80%);
+--secondary-color-1-hue: 201;
+--secondary-color-1-saturation: 11%;
+--secondary-color-1-lightness: 80%;
+
+--secondary-color-2: hsl(201, 11%, 74%);
+--secondary-color-2-hue: 201;
+--secondary-color-2-saturation: 11%;
+--secondary-color-2-lightness: 74%;
+
+--secondary-color-3: hsl(201, 11%, 67%);
+--secondary-color-3-hue: 201;
+--secondary-color-3-saturation: 11%;
+--secondary-color-3-lightness: 67%;
+
+--secondary-color-4: hsl(201, 11%, 60%);
+--secondary-color-4-hue: 201;
+--secondary-color-4-saturation: 11%;
+--secondary-color-4-lightness: 60%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-1-hue:             44.5;
+    --theme-nuance-color-1-saturation:      96.7%;
+    --theme-nuance-color-1-lightness:       52.9%;
+
+--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-2-hue:             44.5;
+    --theme-nuance-color-2-saturation:      96.7%;
+    --theme-nuance-color-2-lightness:       52.9%;
+
+--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-3-hue:             44.5;
+    --theme-nuance-color-3-saturation:      96.7%;
+    --theme-nuance-color-3-lightness:       52.9%;
+
+--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-4-hue:             44.5;
+    --theme-nuance-color-4-saturation:      96.7%;
+    --theme-nuance-color-4-lightness:       52.9%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+    --theme-red-color:     hsl(232, 40%, 45%);
+    --theme-orange-color:  #e76f51;
+    --theme-yellow-color:  #ffd95f;
+    --theme-green-color:   #A3BE8C;
+    --theme-purple-color:  hsl(232, 30%, 40%);
+
+
+
+/* ------------------------------------------- */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--theme-nuance-color-3);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+--chat-id-color:         var(--theme-nuance-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--secondary-color-1);
+--button-alert-color-hover:      var(--theme-purple-color);
+--button-alert-border-hover:     var(--theme-purple-color);
+
+--button-alert-text-active:      var(--secondary-color-1);
+--button-alert-color-active:     var(--theme-red-color);
+--button-alert-border-active:    var(--theme-red-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text:   var(--primary-color-1);
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(201,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+--button-primary-color-hover:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  + 100%));
+
+--button-primary-color-active:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+--button-primary-border-active:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:   var(--secondary-color-1);
+--button-secondary-color:  var(--primary-color-3);
+--button-secondary-border: var(--primary-color-3);
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:  var(--primary-color-4);
+--button-secondary-border-hover: var(--primary-color-4);
+
+
+/* ---------active--------- */
+--button-secondary-text-active: var(--secondary-color-1);
+
+--button-secondary-color-active:
+    hsl(201,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+--button-secondary-border-active:
+    hsl(201,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+
+/* ---------hover---------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+}
diff --git a/examples/server/public_legacy/theme-ketivah.css b/examples/server/public_legacy/theme-ketivah.css
new file mode 100755
index 000000000..ee80f3c14
--- /dev/null
+++ b/examples/server/public_legacy/theme-ketivah.css
@@ -0,0 +1,201 @@
+/* Author: Yazan Agha-Schrader */
+
+.theme-ketivah {
+
+    /* ---------- PRIMARY COLORS ----------------- */
+    --primary-color-1: hsl(0, 0%,    99.2%);
+    --primary-color-1-hue:         0;
+    --primary-color-1-saturation:  0%;
+    --primary-color-1-lightness:   99.2%;
+
+    --primary-color-2: hsl(0, 0%,    95%);
+    --primary-color-2-hue:         0;
+    --primary-color-2-saturation:  0%;
+    --primary-color-2-lightness:   95%;
+
+    --primary-color-3: hsl(0, 0%,    88%);
+    --primary-color-3-hue:         0;
+    --primary-color-3-saturation:  0%;
+    --primary-color-3-lightness:   88%;
+
+    --primary-color-4: hsl(0, 0%,    80%);
+    --primary-color-4-hue:         0;
+    --primary-color-4-saturation:  0%;
+    --primary-color-4-lightness:   80%;
+
+    /* ---------- SECONDARY COLORS --------------- */
+    --secondary-color-1: hsl(0, 0%,    20%);
+    --secondary-color-1-hue:         0;
+    --secondary-color-1-saturation:  0%;
+    --secondary-color-1-lightness:   20%;
+
+    --secondary-color-2: hsl(0, 0%,    23.1%);
+    --secondary-color-2-hue:         0;
+    --secondary-color-2-saturation:  0%;
+    --secondary-color-2-lightness:   23.1%;
+
+    --secondary-color-3: hsl(0, 0%,    29%);
+    --secondary-color-3-hue:         0;
+    --secondary-color-3-saturation:  0%;
+    --secondary-color-3-lightness:   29%;
+
+    --secondary-color-4: hsl(0, 0.0%,  36.1%);
+    --secondary-color-4-hue:              0.0;
+    --secondary-color-4-saturation:       0.0%;
+    --secondary-color-4-lightness:       36.1%;
+
+    /* ----------- NUANCES COLORS ---------------- */
+    --theme-nuance-color-1: hsl(165.2, 0%, 35.1%);
+    --theme-nuance-color-1-hue:             165.2;
+    --theme-nuance-color-1-saturation:       82.1%;
+    --theme-nuance-color-1-lightness:        35.1%;
+
+    --theme-nuance-color-2: hsl(165.2, 0%, 35.1%);
+    --theme-nuance-color-2-hue:             165.2;
+    --theme-nuance-color-2-saturation:       82.1%;
+    --theme-nuance-color-2-lightness:        35.1%;
+
+    --theme-nuance-color-3: hsl(165.2, 0%, 35.3%);
+    --theme-nuance-color-3-hue:             165.2;
+    --theme-nuance-color-3-saturation:       81.1%;
+    --theme-nuance-color-3-lightness:        35.3%;
+
+    --theme-nuance-color-4: hsl(164.9, 0%, 27.6%);
+    --theme-nuance-color-4-hue:             164.9;
+    --theme-nuance-color-4-saturation:       81.6%;
+    --theme-nuance-color-4-lightness:        27.6%;
+
+    /* ----------- ROYGP COLORS ------------------ */
+    --theme-red-color:     hsl(0.3, 80.0%, 50.0%);
+    --theme-orange-color:  #e76f51;
+    --theme-yellow-color:  hsl(60,  70.6%, 73.3%);
+    --theme-green-color:   #A3BE8C;
+    --theme-purple-color:  hsl(0.3, 70.0%, 45.0%);
+
+    /* ------------------------------------------- */
+    --background-color-1:    var(--primary-color-1);
+    --background-color-2:    var(--primary-color-2);
+    --background-color-3:    var(--primary-color-3);
+    --background-color-4:    var(--primary-color-4);
+
+    --border-color-1:        var(--primary-color-2);
+    --border-color-2:        var(--primary-color-3);
+    --border-color-3:        var(--primary-color-4);
+
+    --border-focus-color:    var(--theme-nuance-color-2);
+    --border-focus-shadow:   var(--theme-nuance-color-1);
+
+    --text-color-plain:      var(--secondary-color-1);
+    --text-color-subtile-1:  var(--secondary-color-2);
+    --text-color-subtile-2:  var(--secondary-color-3);
+
+    --code-background-color: var(--secondary-color-2);
+    --code-text-color:       var(--primary-color-2);
+
+    --ui-range-thumb-color:  var(--primary-color-4);
+    --ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+    --textarea-border-color: var(--secondary-color-4);
+
+    --chat-id-color:         var(--theme-nuance-color-4);
+
+    /* ------------------------------------------- */
+    --button-alert-text-hover:       var(--primary-color-1);
+    --button-alert-color-hover:      var(--theme-purple-color);
+    --button-alert-border-hover:     var(--theme-purple-color);
+
+    --button-alert-text-active:      var(--primary-color-1);
+    --button-alert-color-active:     var(--theme-red-color);
+    --button-alert-border-active:    var(--theme-red-color);
+
+    /* ----------- PRIMARY BUTTONS --------------- */
+    /* - button should immediately catch the eye - */
+    --button-primary-text:
+    hsl(0,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+    --button-primary-color:  var(--theme-nuance-color-3);
+    --button-primary-border: var(--theme-nuance-color-3);
+
+    /* ---------hover---------- */
+    --button-primary-text-hover:
+    hsl(0,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+    --button-primary-color-hover:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+    --button-primary-border-hover:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+    /* ---------active--------- */
+    --button-primary-text-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  + 100%));
+
+    --button-primary-color-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+    --button-primary-border-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+    /* ---------- SECONDARY BUTTONS -------------- */
+    /* these should NOT immediately catch the eye  */
+    --button-secondary-text:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  - 50%));
+
+    --button-secondary-color:  var(--primary-color-3);
+    --button-secondary-border: var(--primary-color-3);
+
+    /* ---------hover---------- */
+    --button-secondary-text-hover:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+    --button-secondary-color-hover:  var(--primary-color-4);
+    --button-secondary-border-hover: var(--primary-color-4);
+
+    /* ---------active--------- */
+    --button-secondary-text-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+    --button-secondary-color-active:
+    hsl(0,
+    calc(var(--primary-color-4-saturation) - 100%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+    --button-secondary-border-active:
+    hsl(0,
+    calc(var(--primary-color-4-saturation) - 100%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+    /* ---------- TERTIARY BUTTONS --------------- */
+    /* ---------- disabled buttons --------------- */
+    --button-tertiary-text:   var(--primary-color-4);
+    --button-tertiary-color:  var(--primary-color-2);
+    --button-tertiary-border: var(--primary-color-2);
+
+    /* ---------hover---------- */
+    --button-tertiary-text:   var(--primary-color-4);
+    --button-tertiary-color:  var(--primary-color-2);
+    --button-tertiary-border: var(--primary-color-2);
+
+    --loading-color-1: #eeeeee00;
+    --loading-color-2: #eeeeeeff;
+    }
diff --git a/examples/server/public_legacy/theme-mangotango.css b/examples/server/public_legacy/theme-mangotango.css
new file mode 100755
index 000000000..e43380245
--- /dev/null
+++ b/examples/server/public_legacy/theme-mangotango.css
@@ -0,0 +1,216 @@
+/* Author: Yazan Agha-Schrader */
+/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
+
+.theme-mangotango {
+
+--primary-color-1:      hsl(192, 8.5%, 11.6%);
+--primary-color-2:      hsl(192, 8.5%, 21%);
+--primary-color-3:      hsl(192, 8.5%, 30%);
+--primary-color-4:      hsl(192, 8.5%, 40%);
+
+--secondary-color-1:    hsl(192, 8.5%, 80%);
+--secondary-color-2:    hsl(192, 8.5%, 73%);
+--secondary-color-3:    hsl(192, 8.5%, 66%);
+--secondary-color-4:    hsl(192, 8.5%, 60%);
+
+--theme-nuance-color-1: hsl(23.1, 100%, 60.2%);
+--theme-nuance-color-2: hsl(23.1, 100%, 60.2%);
+--theme-nuance-color-3: hsl(23.1, 100%, 60.2%);
+--theme-nuance-color-4: hsl(23.1, 100%, 60.2%);
+
+
+
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(192, 8.5%, 11.6%);
+    --primary-color-1-saturation: 8.5%;
+    --primary-color-1-lightness: 11.6%;
+
+--primary-color-2: hsl(192, 8.5%, 21%);
+    --primary-color-2-saturation: 8.5%;
+    --primary-color-2-lightness: 21%;
+
+--primary-color-3: hsl(192, 8.5%, 30%);
+    --primary-color-3-saturation: 8.5%;
+    --primary-color-3-lightness: 30%;
+
+--primary-color-4: hsl(192, 8.5%, 40%);
+    --primary-color-4-saturation: 8.5%;
+    --primary-color-4-lightness: 40%;
+
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(192, 8.5%, 80%);
+    --secondary-color-1-saturation: 8.5%;
+    --secondary-color-1-lightness: 80%;
+
+--secondary-color-2: hsl(192, 8.5%, 73%);
+    --secondary-color-2-saturation: 8.5%;
+    --secondary-color-2-lightness: 73%;
+
+--secondary-color-3: hsl(192, 8.5%, 66%);
+    --secondary-color-3-saturation: 8.5%;
+    --secondary-color-3-lightness: 66%;
+
+--secondary-color-4: hsl(192, 8.5%, 60%);
+    --secondary-color-4-saturation: 8.5%;
+    --secondary-color-4-lightness: 60%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(23.1, 100%, 60.2%);
+    --theme-nuance-color-1-saturation: 100%;
+    --theme-nuance-color-1-lightness: 60.2%;
+
+--theme-nuance-color-2: hsl(23.1, 100%, 60.2%);
+    --theme-nuance-color-2-saturation: 100%;
+    --theme-nuance-color-2-lightness: 60.2%;
+
+--theme-nuance-color-3: hsl(23.1, 100%, 60.2%);
+    --theme-nuance-color-3-saturation: 100%;
+    --theme-nuance-color-3-lightness: 60.2%;
+
+--theme-nuance-color-4: hsl(23.1, 100%, 60.2%);
+    --theme-nuance-color-4-saturation: 100%;
+    --theme-nuance-color-4-lightness: 60.2%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+    --theme-red-color:     hsl(325, 60%, 50%);
+    --theme-orange-color:  #e76f51;
+    --theme-yellow-color:  #ffd95f;
+    --theme-green-color:   #A3BE8C;
+    --theme-blue-color:    hsl(192, 95%, 40%);
+    --theme-purple-color:  hsl(192, 80%, 35%);
+
+
+
+/* ------------------------------------------- */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--theme-nuance-color-3);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+--chat-id-color:         var(--theme-nuance-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--secondary-color-1);
+--button-alert-color-hover:      var(--theme-purple-color);
+--button-alert-border-hover:     var(--theme-purple-color);
+
+--button-alert-text-active:      var(--secondary-color-1);
+--button-alert-color-active:     var(--theme-blue-color);
+--button-alert-border-active:    var(--theme-blue-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text: var(--primary-color-1);
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(192,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+--button-primary-color-hover:
+    hsl(23.1,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(23.1,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(23.1,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  + 100%));
+
+--button-primary-color-active:
+    hsl(23.1,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+--button-primary-border-active:
+    hsl(23.1,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:   var(--secondary-color-1);
+--button-secondary-color:  var(--primary-color-3);
+--button-secondary-border: var(--primary-color-3);
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(23.1,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:  var(--primary-color-4);
+--button-secondary-border-hover: var(--primary-color-4);
+
+
+/* ---------active--------- */
+--button-secondary-text-active: var(--secondary-color-1);
+
+--button-secondary-color-active:
+    hsl(192,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+--button-secondary-border-active:
+    hsl(192,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+
+/* ---------hover---------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+}
diff --git a/examples/server/public_legacy/theme-playground.css b/examples/server/public_legacy/theme-playground.css
new file mode 100755
index 000000000..9d56a7182
--- /dev/null
+++ b/examples/server/public_legacy/theme-playground.css
@@ -0,0 +1,221 @@
+/* Author: Yazan Agha-Schrader */
+/* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */
+
+.theme-playground {
+
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(0, 0%,    99.2%);
+    --primary-color-1-hue:         0;
+    --primary-color-1-saturation:  0%;
+    --primary-color-1-lightness:   99.2%;
+
+--primary-color-2: hsl(0, 0%,    95%);
+    --primary-color-2-hue:         0;
+    --primary-color-2-saturation:  0%;
+    --primary-color-2-lightness:   95%;
+
+--primary-color-3: hsl(0, 0%,    88%);
+    --primary-color-3-hue:         0;
+    --primary-color-3-saturation:  0%;
+    --primary-color-3-lightness:   88%;
+
+--primary-color-4: hsl(0, 0%,    80%);
+    --primary-color-4-hue:         0;
+    --primary-color-4-saturation:  0%;
+    --primary-color-4-lightness:   80%;
+
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(0, 0%,    20%);
+    --secondary-color-1-hue:         0;
+    --secondary-color-1-saturation:  0%;
+    --secondary-color-1-lightness:   20%;
+
+--secondary-color-2: hsl(0, 0%,    23.1%);
+    --secondary-color-2-hue:         0;
+    --secondary-color-2-saturation:  0%;
+    --secondary-color-2-lightness:   23.1%;
+
+--secondary-color-3: hsl(0, 0%,    29%);
+    --secondary-color-3-hue:         0;
+    --secondary-color-3-saturation:  0%;
+    --secondary-color-3-lightness:   29%;
+
+--secondary-color-4: hsl(0, 0%,    36.1%);
+    --secondary-color-4-hue:         0;
+    --secondary-color-4-saturation:  0%;
+    --secondary-color-4-lightness:   36.1%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(165.2, 82.1%, 35.1%);
+    --theme-nuance-color-1-hue:             165.2;
+    --theme-nuance-color-1-saturation:      82.1%;
+    --theme-nuance-color-1-lightness:       35.1%;
+
+--theme-nuance-color-2: hsl(165.2, 82.1%, 35.1%);
+    --theme-nuance-color-2-hue:             165.2;
+    --theme-nuance-color-2-saturation:      82.1%;
+    --theme-nuance-color-2-lightness:       35.1%;
+
+--theme-nuance-color-3: hsl(165.2, 81.1%, 35.3%);
+    --theme-nuance-color-3-hue:             165.2;
+    --theme-nuance-color-3-saturation:      81.1%;
+    --theme-nuance-color-3-lightness:       35.3%;
+
+--theme-nuance-color-4: hsl(164.9, 81.6%, 27.6%);
+    --theme-nuance-color-4-hue:             164.9;
+    --theme-nuance-color-4-saturation:      81.6%;
+    --theme-nuance-color-4-lightness:       27.6%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+--theme-red-color:     hsl(0.3, 80%, 50%);
+--theme-orange-color:  #e76f51;
+--theme-yellow-color:  hsl(60, 70.6%, 73.3%);
+--theme-green-color:   #A3BE8C;
+--theme-purple-color:  hsl(0.3, 70%, 45%);
+
+
+
+/* ------------------------------------------- */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--primary-color-4);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+--chat-id-color:        var(--theme-nuance-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--primary-color-1);
+--button-alert-color-hover:      var(--theme-purple-color);
+--button-alert-border-hover:     var(--theme-purple-color);
+
+--button-alert-text-active:      var(--primary-color-1);
+--button-alert-color-active:     var(--theme-red-color);
+--button-alert-border-active:    var(--theme-red-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text:
+    hsl(0,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(0,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+--button-primary-color-hover:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  + 100%));
+
+--button-primary-color-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+--button-primary-border-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 50%));
+
+--button-secondary-color:  var(--primary-color-3);
+--button-secondary-border: var(--primary-color-3);
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:  var(--primary-color-4);
+--button-secondary-border-hover: var(--primary-color-4);
+
+
+/* ---------active--------- */
+--button-secondary-text-active:
+    hsl(165.2,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-active:
+    hsl(0,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+--button-secondary-border-active:
+    hsl(0,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+
+/* ---------hover---------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+}
diff --git a/examples/server/public_legacy/theme-polarnight.css b/examples/server/public_legacy/theme-polarnight.css
new file mode 100755
index 000000000..2bcfb33d8
--- /dev/null
+++ b/examples/server/public_legacy/theme-polarnight.css
@@ -0,0 +1,253 @@
+/* Author: Yazan Agha-Schrader */
+/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */
+
+.theme-polarnight {
+
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(220.0, 16.4%, 21.6%) ;
+    --primary-color-1-hue:             220.0;
+    --primary-color-1-saturation:      16.4%;
+    --primary-color-1-lightness:       21.6%;
+
+--primary-color-2: hsl(221.7, 16.3%, 27.6%) ;
+    -primary-color-2-hue:              221.7;
+    --primary-color-2-saturation:      16.3%;
+    --primary-color-2-lightness:       27.6%;
+
+--primary-color-3: hsl(220.0, 16.8%, 31.6%) ;
+    --primary-color-3-hue:             220.0;
+    --primary-color-3-saturation:      16.8%;
+    --primary-color-3-lightness:       31.6%;
+
+--primary-color-4: hsl(220.0, 16.5%, 35.7%);
+    --primary-color-4-hue:             220.0;
+    --primary-color-4-saturation:      16.5%;
+    --primary-color-4-lightness:       35.7%;
+
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(217.5, 26.7%, 94.1%);
+    --secondary-color-1-hue:             217.5;
+    --secondary-color-1-saturation:      26.7%;
+    --secondary-color-1-lightness:       94.1%;
+
+--secondary-color-2: hsl(218.2, 26.8%, 92.0%);
+    --secondary-color-2-hue:             218.2;
+    --secondary-color-2-saturation:      26.8%;
+    --secondary-color-2-lightness:       92.0%;
+
+--secondary-color-3: hsl(218.8, 27.9%, 88.0%);
+    --secondary-color-3-hue:             218.8;
+    --secondary-color-3-saturation:      27.9%;
+    --secondary-color-3-lightness:       88.0%;
+
+--secondary-color-4: hsl(218.8, 18.3%, 81.8%);
+    --secondary-color-4-hue:             218.8;
+    --secondary-color-4-saturation:      18.3%;
+    --secondary-color-4-lightness:       81.8%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
+    --theme-nuance-color-1-hue:             178.7;
+    --theme-nuance-color-1-saturation:      25.1%;
+    --theme-nuance-color-1-lightness:       64.9%;
+
+--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
+    --theme-nuance-color-2-hue:             193.3;
+    --theme-nuance-color-2-saturation:      43.4%;
+    --theme-nuance-color-2-lightness:       67.5%;
+
+--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
+    --theme-nuance-color-3-hue:             210.0;
+    --theme-nuance-color-3-saturation:      34.0%;
+    --theme-nuance-color-3-lightness:       63.1%;
+
+--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
+    --theme-nuance-color-4-hue:             213.1;
+    --theme-nuance-color-4-saturation:      32.0%;
+    --theme-nuance-color-4-lightness:       52.2%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+--theme-red-color:    hsl(354.3, 42.3%, 56.5%);
+--theme-orange-color: hsl(20, 85%, 50%);
+--theme-yellow-color: hsl(20, 75%, 45%);
+--theme-green-color:  hsl( 92.4, 27.8%, 64.7%);
+--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
+
+
+
+/* ------------------------------------------------ */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--theme-nuance-color-3);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+--chat-id-color:        var(--theme-nuance-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--secondary-color-1);
+--button-alert-color-hover:      var(--theme-yellow-color);
+--button-alert-border-hover:     var(--theme-yellow-color);
+
+--button-alert-text-active:      var(--secondary-color-1);
+--button-alert-color-active:     var(--theme-orange-color);
+--button-alert-border-active:    var(--theme-orange-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text:   var(--secondary-color-1);
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(217.5,
+    calc(var(--secondary-color-1-saturation) - 35%),
+    calc(var(--secondary-color-1-lightness)  + 30%));
+
+--button-primary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) -  2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) -  2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 35%));
+
+--button-primary-color-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 25%));
+
+--button-primary-border-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 25%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 50%));
+
+--button-secondary-color:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+--button-secondary-border:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 22%),
+    calc(var(--theme-nuance-color-3-lightness)  +  1%));
+
+--button-secondary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 22%),
+    calc(var(--theme-nuance-color-3-lightness)  +  1%));
+
+
+/* ---------active--------- */
+--button-secondary-text-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 25%));
+
+--button-secondary-color-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 30%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+--button-secondary-border-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 30%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-tertiary-color:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+--button-tertiary-border:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+
+/* ---------hover---------- */
+--button-tertiary-text-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-tertiary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+--button-tertiary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+}
diff --git a/examples/server/public_legacy/theme-snowstorm.css b/examples/server/public_legacy/theme-snowstorm.css
new file mode 100755
index 000000000..7bb227594
--- /dev/null
+++ b/examples/server/public_legacy/theme-snowstorm.css
@@ -0,0 +1,251 @@
+/* Author: Yazan Agha-Schrader */
+/* Inspiration from Nord Theme https://www.nordtheme.com/docs/colors-and-palettes */
+
+.theme-snowstorm {
+
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(217.5, 26.7%, 94.1%);
+    --primary-color-1-hue:             217.5;
+    --primary-color-1-saturation:      26.7%;
+    --primary-color-1-lightness:       94.1%;
+
+--primary-color-2: hsl(218.2, 26.8%, 92.0%);
+    --primary-color-2-hue:             218.2;
+    --primary-color-2-saturation:      26.8%;
+    --primary-color-2-lightness:       92.0%;
+
+--primary-color-3: hsl(218.8, 27.9%, 88.0%);
+    --primary-color-3-hue:             218.8;
+    --primary-color-3-saturation:      27.9%;
+    --primary-color-3-lightness:       88.0%;
+
+--primary-color-4: hsl(218.8, 18.3%, 81.8%);
+    --primary-color-4-hue:             218.8;
+    --primary-color-4-saturation:      18.3%;
+    --primary-color-4-lightness:       81.8%;
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(220.0, 16.4%, 21.6%);
+    --secondary-color-1-hue:             220.0;
+    --secondary-color-1-saturation:      16.4%;
+    --secondary-color-1-lightness:       21.6%;
+
+--secondary-color-2: hsl(221.7, 16.3%, 27.6%);
+    --secondary-color-2-hue:             221.7;
+    --secondary-color-2-saturation:      16.3%;
+    --secondary-color-2-lightness:       27.6%;
+
+--secondary-color-3: hsl(220.0, 16.8%, 31.6%);
+    --secondary-color-3-hue:             220.0;
+    --secondary-color-3-saturation:      16.8%;
+    --secondary-color-3-lightness:       31.6%;
+
+--secondary-color-4: hsl(220.0, 16.5%, 35.7%);
+    --secondary-color-4-hue:             220.0;
+    --secondary-color-4-saturation:      16.5%;
+    --secondary-color-4-lightness:       35.7%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
+    --theme-nuance-color-1-hue:             178.7;
+    --theme-nuance-color-1-saturation:      25.1%;
+    --theme-nuance-color-1-lightness:       64.9%;
+
+--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
+    --theme-nuance-color-2-hue:             193.3;
+    --theme-nuance-color-2-saturation:      43.4%;
+    --theme-nuance-color-2-lightness:       67.5%;
+
+--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
+    --theme-nuance-color-3-hue:             210.0;
+    --theme-nuance-color-3-saturation:      34.0%;
+    --theme-nuance-color-3-lightness:       63.1%;
+
+--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
+    --theme-nuance-color-4-hue:             213.1;
+    --theme-nuance-color-4-saturation:      32.0%;
+    --theme-nuance-color-4-lightness:       52.2%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+--theme-red-color:    hsl(32.5, 80%, 50%);
+--theme-orange-color: hsl(32.5, 70%, 45%);
+--theme-yellow-color: hsl(40.0,   0.6%, 73.3%);
+--theme-green-color:  hsl(92.4,  27.8%, 64.7%);
+--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
+
+
+
+/* ------------------------------------------- */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--theme-nuance-color-3);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+--chat-id-color:         var(--theme-nuance-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--primary-color-1);
+--button-alert-color-hover:      var(--theme-orange-color);
+--button-alert-border-hover:     var(--theme-orange-color);
+
+--button-alert-text-active:      var(--primary-color-1);
+--button-alert-color-active:     var(--theme-red-color);
+--button-alert-border-active:    var(--theme-red-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text:   var(--secondary-color-1);
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(217.5,
+    calc(var(--secondary-color-1-saturation) + 35%),
+    calc(var(--secondary-color-1-lightness)  - 30%));
+
+--button-primary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) -  2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) -  2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 35%));
+
+--button-primary-color-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 25%));
+
+--button-primary-border-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 25%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 50%));
+
+--button-secondary-color:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+--button-secondary-border:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 22%),
+    calc(var(--theme-nuance-color-3-lightness)  +  1%));
+
+--button-secondary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 22%),
+    calc(var(--theme-nuance-color-3-lightness)  +  1%));
+
+
+/* ---------active--------- */
+--button-secondary-text-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) + 40%),
+    calc(var(--theme-nuance-color-3-lightness)  - 55%));
+
+--button-secondary-color-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 30%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-secondary-border-active:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 30%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-tertiary-color:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+--button-tertiary-border:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+/* ---------hover---------- */
+--button-tertiary-text-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  -  5%));
+
+--button-tertiary-color-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+--button-tertiary-border-hover:
+    hsl(210,
+    calc(var(--theme-nuance-color-3-saturation) - 40%),
+    calc(var(--theme-nuance-color-3-lightness)  + 20%));
+
+}
diff --git a/examples/server/public_simplechat/datautils.mjs b/examples/server/public_simplechat/datautils.mjs
new file mode 100644
index 000000000..75159d6b1
--- /dev/null
+++ b/examples/server/public_simplechat/datautils.mjs
@@ -0,0 +1,266 @@
+//@ts-check
+// Helpers to work with different data types
+// by Humans for All
+//
+
+/**
+ * Given the limited context size of local LLMs and , many a times when context gets filled
+ * between the prompt and the response, it can lead to repeating text garbage generation.
+ * And many a times setting penalty wrt repeatation leads to over-intelligent garbage
+ * repeatation with slight variations. These garbage inturn can lead to overloading of the
+ * available model context, leading to less valuable response for subsequent prompts/queries,
+ * if chat history is sent to ai model.
+ *
+ * So two simple minded garbage trimming logics are experimented below.
+ * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and
+ * * another based on char-histogram-driven garbage trimming.
+ *   * in future characteristic of histogram over varying lengths could be used to allow for
+ *     a more aggressive and adaptive trimming logic.
+ */
+
+
+/**
+ * Simple minded logic to help remove repeating garbage at end of the string.
+ * The repeatation needs to be perfectly matching.
+ *
+ * The logic progressively goes on probing for longer and longer substring based
+ * repeatation, till there is no longer repeatation. Inturn picks the one with
+ * the longest chain.
+ *
+ * @param {string} sIn
+ * @param {number} maxSubL
+ * @param {number} maxMatchLenThreshold
+ */
+export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) {
+    let rCnt = [0];
+    let maxMatchLen = maxSubL;
+    let iMML = -1;
+    for(let subL=1; subL < maxSubL; subL++) {
+        rCnt.push(0);
+        let i;
+        let refS = sIn.substring(sIn.length-subL, sIn.length);
+        for(i=sIn.length; i > 0; i -= subL) {
+            let curS = sIn.substring(i-subL, i);
+            if (refS != curS) {
+                let curMatchLen = rCnt[subL]*subL;
+                if (maxMatchLen < curMatchLen) {
+                    maxMatchLen = curMatchLen;
+                    iMML = subL;
+                }
+                break;
+            }
+            rCnt[subL] += 1;
+        }
+    }
+    console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt);
+    if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) {
+        return {trimmed: false, data: sIn};
+    }
+    console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen);
+    let iEnd = sIn.length - maxMatchLen;
+    return { trimmed: true, data: sIn.substring(0, iEnd) };
+}
+
+
+/**
+ * Simple minded logic to help remove repeating garbage at end of the string, till it cant.
+ * If its not able to trim, then it will try to skip a char at end and then trim, a few times.
+ * This ensures that even if there are multiple runs of garbage with different patterns, the
+ * logic still tries to munch through them.
+ *
+ * @param {string} sIn
+ * @param {number} maxSubL
+ * @param {number | undefined} [maxMatchLenThreshold]
+ */
+export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) {
+    let sCur = sIn;
+    let sSaved = "";
+    let iTry = 0;
+    while(true) {
+        let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold);
+        if (got.trimmed != true) {
+            if (iTry == 0) {
+                sSaved = got.data;
+            }
+            iTry += 1;
+            if (iTry >= skipMax) {
+                return sSaved;
+            }
+            got.data = got.data.substring(0,got.data.length-1);
+        } else {
+            iTry = 0;
+        }
+        sCur = got.data;
+    }
+}
+
+
+/**
+ * A simple minded try trim garbage at end using histogram driven characteristics.
+ * There can be variation in the repeatations, as long as no new char props up.
+ *
+ * This tracks the chars and their frequency in a specified length of substring at the end
+ * and inturn checks if moving further into the generated text from the end remains within
+ * the same char subset or goes beyond it and based on that either trims the string at the
+ * end or not. This allows to filter garbage at the end, including even if there are certain
+ * kind of small variations in the repeated text wrt position of seen chars.
+ *
+ * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that
+ * a given type of char ie numerals or alphabets or other types dont cross the specified
+ * maxType limit. This allows intermixed text garbage to be identified and trimmed.
+ *
+ * ALERT: This is not perfect and only provides a rough garbage identification logic.
+ * Also it currently only differentiates between character classes wrt english.
+ *
+ * @param {string} sIn
+ * @param {number} maxType
+ * @param {number} maxUniq
+ * @param {number} maxMatchLenThreshold
+ */
+export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
+    if (sIn.length < maxMatchLenThreshold) {
+        return { trimmed: false, data: sIn };
+    }
+    let iAlp = 0;
+    let iNum = 0;
+    let iOth = 0;
+    // Learn
+    let hist = {};
+    let iUniq = 0;
+    for(let i=0; i<maxMatchLenThreshold; i++) {
+        let c = sIn[sIn.length-1-i];
+        if (c in hist) {
+            hist[c] += 1;
+        } else {
+            if(c.match(/[0-9]/) != null) {
+                iNum += 1;
+            } else if(c.match(/[A-Za-z]/) != null) {
+                iAlp += 1;
+            } else {
+                iOth += 1;
+            }
+            iUniq += 1;
+            if (iUniq >= maxUniq) {
+                break;
+            }
+            hist[c] = 1;
+        }
+    }
+    console.debug("DBUG:TrimHistGarbage:", hist);
+    if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
+        return { trimmed: false, data: sIn };
+    }
+    // Catch and Trim
+    for(let i=0; i < sIn.length; i++) {
+        let c = sIn[sIn.length-1-i];
+        if (!(c in hist)) {
+            if (i < maxMatchLenThreshold) {
+                return { trimmed: false, data: sIn };
+            }
+            console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
+            return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) };
+        }
+    }
+    console.debug("DBUG:TrimHistGarbage:Trimmed fully");
+    return { trimmed: true, data: "" };
+}
+
+/**
+ * Keep trimming repeatedly using hist_garbage logic, till you no longer can.
+ * This ensures that even if there are multiple runs of garbage with different patterns,
+ * the logic still tries to munch through them.
+ *
+ * @param {any} sIn
+ * @param {number} maxType
+ * @param {number} maxUniq
+ * @param {number} maxMatchLenThreshold
+ */
+export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
+    let sCur = sIn;
+    while (true) {
+        let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold);
+        if (!got.trimmed) {
+            return got.data;
+        }
+        sCur = got.data;
+    }
+}
+
+/**
+ * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as
+ * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying.
+ * @param {string} sIn
+ */
+export function trim_garbage_at_end(sIn) {
+    let sCur = sIn;
+    for(let i=0; i<2; i++) {
+        sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72);
+        sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12);
+    }
+    return sCur;
+}
+
+
+/**
+ * NewLines array helper.
+ * Allow for maintaining a list of lines.
+ * Allow for a line to be builtup/appended part by part.
+ */
+export class NewLines {
+
+    constructor() {
+        /** @type {string[]} */
+        this.lines = [];
+    }
+
+    /**
+     * Extracts lines from the passed string and inturn either
+     * append to a previous partial line or add a new line.
+     * @param {string} sLines
+     */
+    add_append(sLines) {
+        let aLines = sLines.split("\n");
+        let lCnt = 0;
+        for(let line of aLines) {
+            lCnt += 1;
+            // Add back newline removed if any during split
+            if (lCnt < aLines.length) {
+                line += "\n";
+            } else {
+                if (sLines.endsWith("\n")) {
+                    line += "\n";
+                }
+            }
+            // Append if required
+            if (lCnt == 1) {
+                let lastLine = this.lines[this.lines.length-1];
+                if (lastLine != undefined) {
+                    if (!lastLine.endsWith("\n")) {
+                        this.lines[this.lines.length-1] += line;
+                        continue;
+                    }
+                }
+            }
+            // Add new line
+            this.lines.push(line);
+        }
+    }
+
+    /**
+     * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
+     * Optionally control whether only full lines (ie those with newline at end) will be returned
+     * or will a partial line without a newline at end (can only be the last line) be returned.
+     * @param {boolean} bFullWithNewLineOnly
+     */
+    shift(bFullWithNewLineOnly=true) {
+        let line = this.lines[0];
+        if (line == undefined) {
+            return undefined;
+        }
+        if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){
+            return undefined;
+        }
+        return this.lines.shift();
+    }
+
+}
diff --git a/examples/server/public_simplechat/index.html b/examples/server/public_simplechat/index.html
new file mode 100644
index 000000000..f6413016f
--- /dev/null
+++ b/examples/server/public_simplechat/index.html
@@ -0,0 +1,51 @@
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <title>SimpleChat LlamaCppEtal </title>
+        <meta charset="UTF-8" />
+        <meta name="viewport" content="width=device-width, initial-scale=1" />
+        <meta name="message" content="Save Nature Save Earth" />
+        <meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
+        <meta name="author" content="by Humans for All" />
+        <meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
+        <script type="importmap">
+            {
+                "imports": {
+                    "datautils": "./datautils.mjs",
+                    "ui": "./ui.mjs"
+                }
+            }
+        </script>
+        <script src="simplechat.js" type="module" defer></script>
+        <link rel="stylesheet" href="simplechat.css" />
+    </head>
+    <body>
+        <div class="samecolumn" id="fullbody">
+
+            <div class="sameline" id="heading">
+                <p class="heading flex-grow" > <b> SimpleChat </b> </p>
+                <button id="settings">Settings</button>
+            </div>
+
+            <div id="sessions-div" class="sameline"></div>
+
+            <hr>
+            <div class="sameline">
+                <label for="system-in">System</label>
+                <textarea name="system" id="system-in" rows="2" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"></textarea>
+            </div>
+
+            <hr>
+            <div id="chat-div">
+                <p> You need to have javascript enabled.</p>
+            </div>
+
+            <hr>
+            <div class="sameline">
+                <textarea id="user-in" class="flex-grow" rows="2" placeholder="enter your query to the ai model here" ></textarea>
+                <button id="user-btn">submit</button>
+            </div>
+
+        </div>
+    </body>
+</html>
diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md
new file mode 100644
index 000000000..21410199f
--- /dev/null
+++ b/examples/server/public_simplechat/readme.md
@@ -0,0 +1,286 @@
+
+# SimpleChat
+
+by Humans for All.
+
+## quickstart
+
+To run from the build dir
+
+bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
+
+Continue reading for the details.
+
+## overview
+
+This simple web frontend, allows triggering/testing the server's /completions or /chat/completions endpoints
+in a simple way with minimal code from a common code base. Inturn additionally it tries to allow single or
+multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their
+own system prompts.
+
+This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
+or potentially as it is being generated, in a streamed manner from the server/ai-model.
+
+![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
+
+Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
+open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
+
+The UI follows a responsive web design so that the layout can adapt to available display space in a usable
+enough manner, in general.
+
+Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
+console. Parallely some of the directly useful to end-user settings can also be changed using the provided
+settings ui.
+
+NOTE: Current web service api doesnt expose the model context length directly, so client logic doesnt provide
+any adaptive culling of old messages nor of replacing them with summary of their content etal. However there
+is a optional sliding window based chat logic, which provides a simple minded culling of old messages from
+the chat history before sending to the ai model.
+
+NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionaly stream for now.
+However if someone wants they can update the js file or equivalent member in gMe as needed.
+
+NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very
+limited / minimal way. One will need to set model, openai url and authorization bearer key in settings ui.
+
+
+## usage
+
+One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web
+frontend to configure the server over http(s) or so, then run this web frontend using something like python's
+http module.
+
+### running using examples/server
+
+./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
+
+### running using python3's server module
+
+first run examples/server
+* ./llama-server -m path/model.gguf
+
+next run this web front end in examples/server/public_simplechat
+* cd ../examples/server/public_simplechat
+* python3 -m http.server PORT
+
+### using the front end
+
+Open this simple web front end from your local browser
+
+* http://127.0.0.1:PORT/index.html
+
+Once inside
+
+* If you want to, you can change many of the default global settings
+  * the base url (ie ip addr / domain name, port)
+  * chat (default) vs completion mode
+  * try trim garbage in response or not
+  * amount of chat history in the context sent to server/ai-model
+  * oneshot or streamed mode.
+
+* In completion mode
+  * one normally doesnt use a system prompt in completion mode.
+  * logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
+    If the model requires any prefix wrt user role messages, then the end user has to
+    explicitly add the needed prefix, when they enter their chat message.
+    Similarly if the model requires any prefix to trigger assistant/ai-model response,
+    then the end user needs to enter the same.
+    This keeps the logic simple, while still giving flexibility to the end user to
+    manage any templating/tagging requirement wrt their messages to the model.
+  * the logic doesnt insert newline at the begining and end wrt the prompt message generated.
+    However if the chat being sent to /completions end point has more than one role's message,
+    then insert newline when moving from one role's message to the next role's message, so
+    that it can be clearly identified/distinguished.
+  * given that /completions endpoint normally doesnt add additional chat-templating of its
+    own, the above ensures that end user can create a custom single/multi message combo with
+    any tags/special-tokens related chat templating to test out model handshake. Or enduser
+    can use it just for normal completion related/based query.
+
+* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
+  Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
+  responses with a suitable system prompt.
+  * if chat.add_system_begin is used
+    * you cant change the system prompt, after it is has been submitted once along with user query.
+    * you cant set a system prompt, after you have submitted any user query
+  * if chat.add_system_anytime is used
+    * one can change the system prompt any time during chat, by changing the contents of system prompt.
+    * inturn the updated/changed system prompt will be inserted into the chat session.
+    * this allows for the subsequent user chatting to be driven by the new system prompt set above.
+
+* Enter your query and either press enter or click on the submit button.
+  If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
+
+* Wait for the logic to communicate with the server and get the response.
+  * the user is not allowed to enter any fresh query during this time.
+  * the user input box will be disabled and a working message will be shown in it.
+  * if trim garbage is enabled, the logic will try to trim repeating text kind of garbage to some extent.
+
+* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
+
+* Using NewChat one can start independent chat sessions.
+  * two independent chat sessions are setup by default.
+
+* When you want to print, switching ChatHistoryInCtxt to Full and clicking on the chat session button of
+  interest, will display the full chat history till then wrt same, if you want full history for printing.
+
+
+## Devel note
+
+### Reason behind this
+
+The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
+by developers who may not be from web frontend background (so inturn may not be familiar with template /
+end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
+
+And given that the idea is also to help explore/experiment for developers, some flexibility is provided
+to change behaviour easily using the devel-tools/console or provided minimal settings ui (wrt few aspects).
+Skeletal logic has been implemented to explore some of the end points and ideas/implications around them.
+
+
+### General
+
+Me/gMe consolidates the settings which control the behaviour into one object.
+One can see the current settings, as well as change/update them using browsers devel-tool/console.
+It is attached to the document object. Some of these can also be updated using the Settings UI.
+
+  baseURL - the domain-name/ip-address and inturn the port to send the request.
+
+  bStream - control between oneshot-at-end and live-stream-as-its-generated collating and showing
+  of the generated response.
+
+    the logic assumes that the text sent from the server follows utf-8 encoding.
+
+    in streaming mode - if there is any exception, the logic traps the same and tries to ensure
+    that text generated till then is not lost.
+
+      if a very long text is being generated, which leads to no user interaction for sometime and
+      inturn the machine goes into power saving mode or so, the platform may stop network connection,
+      leading to exception.
+
+  apiEP - select between /completions and /chat/completions endpoint provided by the server/ai-model.
+
+  bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
+  communicating with the server or only sends the latest user query/message.
+
+  bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
+  messages that get inserted into prompt field wrt /Completion endpoint.
+
+  bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be
+  trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of
+  subsequent chat history. At the same time the actual trimmed text is shown to the user, once
+  when it was generated, so user can check if any useful info/data was there in the response.
+
+    One may be able to request the ai-model to continue (wrt the last response) (if chat-history
+    is enabled as part of the chat-history-in-context setting), and chances are the ai-model will
+    continue starting from the trimmed part, thus allows long response to be recovered/continued
+    indirectly, in many cases.
+
+    The histogram/freq based trimming logic is currently tuned for english language wrt its
+    is-it-a-alpabetic|numeral-char regex match logic.
+
+  apiRequestOptions - maintains the list of options/fields to send along with api request,
+  irrespective of whether /chat/completions or /completions endpoint.
+
+    If you want to add additional options/fields to send to the server/ai-model, and or
+    modify the existing options value or remove them, for now you can update this global var
+    using browser's development-tools/console.
+
+    For string, numeric and boolean fields in apiRequestOptions, including even those added by a
+    user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
+    created.
+
+    cache_prompt option supported by example/server is allowed to be controlled by user, so that
+    any caching supported wrt system-prompt and chat history, if usable can get used. When chat
+    history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
+    wrt same, based on aspects related to model, positional encoding, attention mechanism etal.
+    However system prompt should ideally get the benefit of caching.
+
+  headers - maintains the list of http headers sent when request is made to the server. By default
+  Content-Type is set to application/json. Additionally Authorization entry is provided, which can
+  be set if needed using the settings ui.
+
+  iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
+  This is disabled by default. However if enabled, then in addition to latest system message, only
+  the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
+  from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
+  only user messages after the latest system message/prompt will be considered.
+
+    This specified sliding window user message count also includes the latest user query.
+    <0 : Send entire chat history to server
+     0 : Send only the system message if any to the server
+    >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
+
+
+By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
+the implications of loading of the ai-model's context window by chat history, wrt chat response to
+some extent in a simple crude way. You may also want to control the context size enabled when the
+server loads ai-model, on the server end.
+
+
+Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
+may not be visible. Also remember that just refreshing/reloading page in browser or for that
+matter clearing site data, dont directly override site caching in all cases. Worst case you may
+have to change port. Or in dev tools of browser, you may be able to disable caching fully.
+
+
+Currently the server to communicate with is maintained globally and not as part of a specific
+chat session. So if one changes the server ip/url in setting, then all chat sessions will auto
+switch to this new server, when you try using those sessions.
+
+
+By switching between chat.add_system_begin/anytime, one can control whether one can change
+the system prompt, anytime during the conversation or only at the beginning.
+
+
+### Default setup
+
+By default things are setup to try and make the user experience a bit better, if possible.
+However a developer when testing the server of ai-model may want to change these value.
+
+Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
+just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
+full chat history. This way if there is any response with garbage/repeatation, it doesnt
+mess with things beyond the next question/request/query, in some ways. The trim garbage
+option also tries to help avoid issues with garbage in the context to an extent.
+
+Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
+available wrt next query-response. However dont forget that the server when started should
+also be started with a model context size of 1k or more, to be on safe side.
+
+  The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
+  internal n_predict, for now add the same here on the client side, maybe later add max_tokens
+  to /completions endpoint handling code on server side.
+
+NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
+wrt the set of fields sent to server along with the user query, to check how the model behaves
+wrt repeatations in general in the generated text response.
+
+A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
+using the provided settings ui (for settings exposed through the ui).
+
+
+### OpenAi / Equivalent API WebService
+
+One may be abe to handshake with OpenAI/Equivalent api web service's /chat/completions endpoint
+for a minimal chatting experimentation by setting the below.
+
+* the baseUrl in settings ui
+  * https://api.openai.com/v1 or similar
+
+* Wrt request body - gMe.apiRequestOptions
+  * model (settings ui)
+  * any additional fields if required in future
+
+* Wrt request headers - gMe.headers
+  * Authorization (available through settings ui)
+    * Bearer THE_OPENAI_API_KEY
+  * any additional optional header entries like "OpenAI-Organization", "OpenAI-Project" or so
+
+NOTE: Not tested, as there is no free tier api testing available. However logically this might
+work.
+
+
+## At the end
+
+Also a thank you to all open source and open model developers, who strive for the common good.
diff --git a/examples/server/public_simplechat/simplechat.css b/examples/server/public_simplechat/simplechat.css
new file mode 100644
index 000000000..13bfb80b4
--- /dev/null
+++ b/examples/server/public_simplechat/simplechat.css
@@ -0,0 +1,79 @@
+/**
+ * the styling of the simplechat web frontend
+ * by Humans for All
+ */
+
+#fullbody {
+    height: 98vh;
+}
+
+.heading {
+    background-color: lightgray;
+}
+
+.session-selected {
+    background-color: lightblue;
+}
+
+.role-system {
+    background-color: lightblue;
+}
+.role-user {
+    background-color: lightgray;
+}
+.role-trim {
+    background-color: lightpink;
+}
+
+.gridx2 {
+    display: grid;
+    grid-template-columns: repeat(2, 1fr);
+    border-bottom-style: dotted;
+    border-bottom-width: thin;
+    border-bottom-color: lightblue;
+}
+
+.flex-grow {
+    flex-grow: 1;
+}
+.float-right {
+    float: right;
+}
+
+#chat-div {
+    overflow: scroll;
+    flex-grow: 1;
+    flex-shrink: 1;
+    min-height: 40vh;
+}
+button {
+    min-width: 8vw;
+}
+
+.sameline {
+    display: flex;
+    flex-direction: row;
+}
+.samecolumn {
+    display: flex;
+    flex-direction: column;
+}
+
+.ul1 {
+    padding-inline-start: 2vw;
+}
+.ul2 {
+    padding-inline-start: 2vw;
+}
+
+* {
+    margin: 0.6vmin;
+}
+
+@media print {
+
+    #fullbody {
+        height: auto;
+    }
+
+}
diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js
new file mode 100644
index 000000000..2fcd24a86
--- /dev/null
+++ b/examples/server/public_simplechat/simplechat.js
@@ -0,0 +1,929 @@
+// @ts-check
+// A simple completions and chat/completions test related web front end logic
+// by Humans for All
+
+import * as du from "./datautils.mjs";
+import * as ui from "./ui.mjs"
+
+class Roles {
+    static System = "system";
+    static User = "user";
+    static Assistant = "assistant";
+}
+
+class ApiEP {
+    static Type = {
+        Chat: "chat",
+        Completion: "completion",
+    }
+    static UrlSuffix = {
+        'chat': `/chat/completions`,
+        'completion': `/completions`,
+    }
+
+    /**
+     * Build the url from given baseUrl and apiEp id.
+     * @param {string} baseUrl
+     * @param {string} apiEP
+     */
+    static Url(baseUrl, apiEP) {
+        if (baseUrl.endsWith("/")) {
+            baseUrl = baseUrl.substring(0, baseUrl.length-1);
+        }
+        return `${baseUrl}${this.UrlSuffix[apiEP]}`;
+    }
+
+}
+
+
+let gUsageMsg = `
+    <p class="role-system">Usage</p>
+    <ul class="ul1">
+    <li> System prompt above, to try control ai response characteristics.</li>
+        <ul class="ul2">
+        <li> Completion mode - no system prompt normally.</li>
+        </ul>
+    <li> Use shift+enter for inserting enter/newline.</li>
+    <li> Enter your query to ai assistant below.</li>
+    <li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
+        <ul class="ul2">
+        <li> ChatHistInCtxt, MaxTokens, ModelCtxt window to expand</li>
+        </ul>
+    </ul>
+`;
+
+
+/** @typedef {{role: string, content: string}[]} ChatMessages */
+
+/** @typedef {{iLastSys: number, xchat: ChatMessages}} SimpleChatODS */
+
+class SimpleChat {
+
+    /**
+     * @param {string} chatId
+     */
+    constructor(chatId) {
+        this.chatId = chatId;
+        /**
+         * Maintain in a form suitable for common LLM web service chat/completions' messages entry
+         * @type {ChatMessages}
+         */
+        this.xchat = [];
+        this.iLastSys = -1;
+        this.latestResponse = "";
+    }
+
+    clear() {
+        this.xchat = [];
+        this.iLastSys = -1;
+    }
+
+    ods_key() {
+        return `SimpleChat-${this.chatId}`
+    }
+
+    save() {
+        /** @type {SimpleChatODS} */
+        let ods = {iLastSys: this.iLastSys, xchat: this.xchat};
+        localStorage.setItem(this.ods_key(), JSON.stringify(ods));
+    }
+
+    load() {
+        let sods = localStorage.getItem(this.ods_key());
+        if (sods == null) {
+            return;
+        }
+        /** @type {SimpleChatODS} */
+        let ods = JSON.parse(sods);
+        this.iLastSys = ods.iLastSys;
+        this.xchat = ods.xchat;
+    }
+
+    /**
+     * Recent chat messages.
+     * If iRecentUserMsgCnt < 0
+     *   Then return the full chat history
+     * Else
+     *   Return chat messages from latest going back till the last/latest system prompt.
+     *   While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
+     * @param {number} iRecentUserMsgCnt
+     */
+    recent_chat(iRecentUserMsgCnt) {
+        if (iRecentUserMsgCnt < 0) {
+            return this.xchat;
+        }
+        if (iRecentUserMsgCnt == 0) {
+            console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
+        }
+        /** @type{ChatMessages} */
+        let rchat = [];
+        let sysMsg = this.get_system_latest();
+        if (sysMsg.length != 0) {
+            rchat.push({role: Roles.System, content: sysMsg});
+        }
+        let iUserCnt = 0;
+        let iStart = this.xchat.length;
+        for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
+            if (iUserCnt >= iRecentUserMsgCnt) {
+                break;
+            }
+            let msg = this.xchat[i];
+            if (msg.role == Roles.User) {
+                iStart = i;
+                iUserCnt += 1;
+            }
+        }
+        for(let i = iStart; i < this.xchat.length; i++) {
+            let msg = this.xchat[i];
+            if (msg.role == Roles.System) {
+                continue;
+            }
+            rchat.push({role: msg.role, content: msg.content});
+        }
+        return rchat;
+    }
+
+    /**
+     * Collate the latest response from the server/ai-model, as it is becoming available.
+     * This is mainly useful for the stream mode.
+     * @param {string} content
+     */
+    append_response(content) {
+        this.latestResponse += content;
+    }
+
+    /**
+     * Add an entry into xchat
+     * @param {string} role
+     * @param {string|undefined|null} content
+     */
+    add(role, content) {
+        if ((content == undefined) || (content == null) || (content == "")) {
+            return false;
+        }
+        this.xchat.push( {role: role, content: content} );
+        if (role == Roles.System) {
+            this.iLastSys = this.xchat.length - 1;
+        }
+        this.save();
+        return true;
+    }
+
+    /**
+     * Show the contents in the specified div
+     * @param {HTMLDivElement} div
+     * @param {boolean} bClear
+     */
+    show(div, bClear=true) {
+        if (bClear) {
+            div.replaceChildren();
+        }
+        let last = undefined;
+        for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
+            let entry = ui.el_create_append_p(`${x.role}: ${x.content}`, div);
+            entry.className = `role-${x.role}`;
+            last = entry;
+        }
+        if (last !== undefined) {
+            last.scrollIntoView(false);
+        } else {
+            if (bClear) {
+                div.innerHTML = gUsageMsg;
+                gMe.setup_load(div, this);
+                gMe.show_info(div);
+            }
+        }
+        return last;
+    }
+
+    /**
+     * Setup the fetch headers.
+     * It picks the headers from gMe.headers.
+     * It inserts Authorization only if its non-empty.
+     * @param {string} apiEP
+     */
+    fetch_headers(apiEP) {
+        let headers = new Headers();
+        for(let k in gMe.headers) {
+            let v = gMe.headers[k];
+            if ((k == "Authorization") && (v.trim() == "")) {
+                continue;
+            }
+            headers.append(k, v);
+        }
+        return headers;
+    }
+
+    /**
+     * Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
+     * The needed fields/options are picked from a global object.
+     * Add optional stream flag, if required.
+     * Convert the json into string.
+     * @param {Object} obj
+     */
+    request_jsonstr_extend(obj) {
+        for(let k in gMe.apiRequestOptions) {
+            obj[k] = gMe.apiRequestOptions[k];
+        }
+        if (gMe.bStream) {
+            obj["stream"] = true;
+        }
+        return JSON.stringify(obj);
+    }
+
+    /**
+     * Return a string form of json object suitable for chat/completions
+     */
+    request_messages_jsonstr() {
+        let req = {
+            messages: this.recent_chat(gMe.iRecentUserMsgCnt),
+        }
+        return this.request_jsonstr_extend(req);
+    }
+
+    /**
+     * Return a string form of json object suitable for /completions
+     * @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
+     */
+    request_prompt_jsonstr(bInsertStandardRolePrefix) {
+        let prompt = "";
+        let iCnt = 0;
+        for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
+            iCnt += 1;
+            if (iCnt > 1) {
+                prompt += "\n";
+            }
+            if (bInsertStandardRolePrefix) {
+                prompt += `${chat.role}: `;
+            }
+            prompt += `${chat.content}`;
+        }
+        let req = {
+            prompt: prompt,
+        }
+        return this.request_jsonstr_extend(req);
+    }
+
+    /**
+     * Return a string form of json object suitable for specified api endpoint.
+     * @param {string} apiEP
+     */
+    request_jsonstr(apiEP) {
+        if (apiEP == ApiEP.Type.Chat) {
+            return this.request_messages_jsonstr();
+        } else {
+            return this.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
+        }
+    }
+
+    /**
+     * Extract the ai-model/assistant's response from the http response got.
+     * Optionally trim the message wrt any garbage at the end.
+     * @param {any} respBody
+     * @param {string} apiEP
+     */
+    response_extract(respBody, apiEP) {
+        let assistant = "";
+        if (apiEP == ApiEP.Type.Chat) {
+            assistant = respBody["choices"][0]["message"]["content"];
+        } else {
+            try {
+                assistant = respBody["choices"][0]["text"];
+            } catch {
+                assistant = respBody["content"];
+            }
+        }
+        return assistant;
+    }
+
+    /**
+     * Extract the ai-model/assistant's response from the http response got in streaming mode.
+     * @param {any} respBody
+     * @param {string} apiEP
+     */
+    response_extract_stream(respBody, apiEP) {
+        let assistant = "";
+        if (apiEP == ApiEP.Type.Chat) {
+            if (respBody["choices"][0]["finish_reason"] !== "stop") {
+                assistant = respBody["choices"][0]["delta"]["content"];
+            }
+        } else {
+            try {
+                assistant = respBody["choices"][0]["text"];
+            } catch {
+                assistant = respBody["content"];
+            }
+        }
+        return assistant;
+    }
+
+    /**
+     * Allow setting of system prompt, but only at begining.
+     * @param {string} sysPrompt
+     * @param {string} msgTag
+     */
+    add_system_begin(sysPrompt, msgTag) {
+        if (this.xchat.length == 0) {
+            if (sysPrompt.length > 0) {
+                return this.add(Roles.System, sysPrompt);
+            }
+        } else {
+            if (sysPrompt.length > 0) {
+                if (this.xchat[0].role !== Roles.System) {
+                    console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
+                } else {
+                    if (this.xchat[0].content !== sysPrompt) {
+                        console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Allow setting of system prompt, at any time.
+     * @param {string} sysPrompt
+     * @param {string} msgTag
+     */
+    add_system_anytime(sysPrompt, msgTag) {
+        if (sysPrompt.length <= 0) {
+            return false;
+        }
+
+        if (this.iLastSys < 0) {
+            return this.add(Roles.System, sysPrompt);
+        }
+
+        let lastSys = this.xchat[this.iLastSys].content;
+        if (lastSys !== sysPrompt) {
+            return this.add(Roles.System, sysPrompt);
+        }
+        return false;
+    }
+
+    /**
+     * Retrieve the latest system prompt.
+     */
+    get_system_latest() {
+        if (this.iLastSys == -1) {
+            return "";
+        }
+        let sysPrompt = this.xchat[this.iLastSys].content;
+        return sysPrompt;
+    }
+
+
+    /**
+     * Handle the multipart response from server/ai-model
+     * @param {Response} resp
+     * @param {string} apiEP
+     * @param {HTMLDivElement} elDiv
+     */
+    async handle_response_multipart(resp, apiEP, elDiv) {
+        let elP = ui.el_create_append_p("", elDiv);
+        if (!resp.body) {
+            throw Error("ERRR:SimpleChat:SC:HandleResponseMultiPart:No body...");
+        }
+        let tdUtf8 = new TextDecoder("utf-8");
+        let rr = resp.body.getReader();
+        this.latestResponse = "";
+        let xLines = new du.NewLines();
+        while(true) {
+            let { value: cur,  done: done } = await rr.read();
+            if (cur) {
+                let curBody = tdUtf8.decode(cur, {stream: true});
+                console.debug("DBUG:SC:PART:Str:", curBody);
+                xLines.add_append(curBody);
+            }
+            while(true) {
+                let curLine = xLines.shift(!done);
+                if (curLine == undefined) {
+                    break;
+                }
+                if (curLine.trim() == "") {
+                    continue;
+                }
+                if (curLine.startsWith("data:")) {
+                    curLine = curLine.substring(5);
+                }
+                if (curLine.trim() === "[DONE]") {
+                    break;
+                }
+                let curJson = JSON.parse(curLine);
+                console.debug("DBUG:SC:PART:Json:", curJson);
+                this.append_response(this.response_extract_stream(curJson, apiEP));
+            }
+            elP.innerText = this.latestResponse;
+            elP.scrollIntoView(false);
+            if (done) {
+                break;
+            }
+        }
+        console.debug("DBUG:SC:PART:Full:", this.latestResponse);
+        return this.latestResponse;
+    }
+
+    /**
+     * Handle the oneshot response from server/ai-model
+     * @param {Response} resp
+     * @param {string} apiEP
+     */
+    async handle_response_oneshot(resp, apiEP) {
+        let respBody = await resp.json();
+        console.debug(`DBUG:SimpleChat:SC:${this.chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
+        return this.response_extract(respBody, apiEP);
+    }
+
+    /**
+     * Handle the response from the server be it in oneshot or multipart/stream mode.
+     * Also take care of the optional garbage trimming.
+     * @param {Response} resp
+     * @param {string} apiEP
+     * @param {HTMLDivElement} elDiv
+     */
+    async handle_response(resp, apiEP, elDiv) {
+        let theResp = {
+            assistant: "",
+            trimmed: "",
+        }
+        if (gMe.bStream) {
+            try {
+                theResp.assistant = await this.handle_response_multipart(resp, apiEP, elDiv);
+                this.latestResponse = "";
+            } catch (error) {
+                theResp.assistant = this.latestResponse;
+                this.add(Roles.Assistant, theResp.assistant);
+                this.latestResponse = "";
+                throw error;
+            }
+        } else {
+            theResp.assistant = await this.handle_response_oneshot(resp, apiEP);
+        }
+        if (gMe.bTrimGarbage) {
+            let origMsg = theResp.assistant;
+            theResp.assistant = du.trim_garbage_at_end(origMsg);
+            theResp.trimmed = origMsg.substring(theResp.assistant.length);
+        }
+        this.add(Roles.Assistant, theResp.assistant);
+        return theResp;
+    }
+
+}
+
+
+class MultiChatUI {
+
+    constructor() {
+        /** @type {Object<string, SimpleChat>} */
+        this.simpleChats = {};
+        /** @type {string} */
+        this.curChatId = "";
+
+        // the ui elements
+        this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in"));
+        this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
+        this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
+        this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in"));
+        this.elDivHeading = /** @type{HTMLSelectElement} */(document.getElementById("heading"));
+        this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));
+        this.elBtnSettings = /** @type{HTMLButtonElement} */(document.getElementById("settings"));
+
+        this.validate_element(this.elInSystem, "system-in");
+        this.validate_element(this.elDivChat, "chat-div");
+        this.validate_element(this.elInUser, "user-in");
+        this.validate_element(this.elDivHeading, "heading");
+        this.validate_element(this.elDivChat, "sessions-div");
+        this.validate_element(this.elBtnSettings, "settings");
+    }
+
+    /**
+     * Check if the element got
+     * @param {HTMLElement | null} el
+     * @param {string} msgTag
+     */
+    validate_element(el, msgTag) {
+        if (el == null) {
+            throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`);
+        } else {
+            console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`);
+        }
+    }
+
+    /**
+     * Reset user input ui.
+     * * clear user input
+     * * enable user input
+     * * set focus to user input
+     */
+    ui_reset_userinput() {
+        this.elInUser.value = "";
+        this.elInUser.disabled = false;
+        this.elInUser.focus();
+    }
+
+    /**
+     * Setup the needed callbacks wrt UI, curChatId to defaultChatId and
+     * optionally switch to specified defaultChatId.
+     * @param {string} defaultChatId
+     * @param {boolean} bSwitchSession
+     */
+    setup_ui(defaultChatId, bSwitchSession=false) {
+
+        this.curChatId = defaultChatId;
+        if (bSwitchSession) {
+            this.handle_session_switch(this.curChatId);
+        }
+
+        this.elBtnSettings.addEventListener("click", (ev)=>{
+            this.elDivChat.replaceChildren();
+            gMe.show_settings(this.elDivChat);
+        });
+
+        this.elBtnUser.addEventListener("click", (ev)=>{
+            if (this.elInUser.disabled) {
+                return;
+            }
+            this.handle_user_submit(this.curChatId, gMe.apiEP).catch((/** @type{Error} */reason)=>{
+                let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
+                console.error(msg.replace("\n", ":"));
+                alert(msg);
+                this.ui_reset_userinput();
+            });
+        });
+
+        this.elInUser.addEventListener("keyup", (ev)=> {
+            // allow user to insert enter into their message using shift+enter.
+            // while just pressing enter key will lead to submitting.
+            if ((ev.key === "Enter") && (!ev.shiftKey)) {
+                let value = this.elInUser.value;
+                this.elInUser.value = value.substring(0,value.length-1);
+                this.elBtnUser.click();
+                ev.preventDefault();
+            }
+        });
+
+        this.elInSystem.addEventListener("keyup", (ev)=> {
+            // allow user to insert enter into the system prompt using shift+enter.
+            // while just pressing enter key will lead to setting the system prompt.
+            if ((ev.key === "Enter") && (!ev.shiftKey)) {
+                let value = this.elInSystem.value;
+                this.elInSystem.value = value.substring(0,value.length-1);
+                let chat = this.simpleChats[this.curChatId];
+                chat.add_system_anytime(this.elInSystem.value, this.curChatId);
+                chat.show(this.elDivChat);
+                ev.preventDefault();
+            }
+        });
+
+    }
+
+    /**
+     * Setup a new chat session and optionally switch to it.
+     * @param {string} chatId
+     * @param {boolean} bSwitchSession
+     */
+    new_chat_session(chatId, bSwitchSession=false) {
+        this.simpleChats[chatId] = new SimpleChat(chatId);
+        if (bSwitchSession) {
+            this.handle_session_switch(chatId);
+        }
+    }
+
+
+    /**
+     * Handle user query submit request, wrt specified chat session.
+     * @param {string} chatId
+     * @param {string} apiEP
+     */
+    async handle_user_submit(chatId, apiEP) {
+
+        let chat = this.simpleChats[chatId];
+
+        // In completion mode, if configured, clear any previous chat history.
+        // So if user wants to simulate a multi-chat based completion query,
+        // they will have to enter the full thing, as a suitable multiline
+        // user input/query.
+        if ((apiEP == ApiEP.Type.Completion) && (gMe.bCompletionFreshChatAlways)) {
+            chat.clear();
+        }
+
+        chat.add_system_anytime(this.elInSystem.value, chatId);
+
+        let content = this.elInUser.value;
+        if (!chat.add(Roles.User, content)) {
+            console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`);
+            return;
+        }
+        chat.show(this.elDivChat);
+
+        let theUrl = ApiEP.Url(gMe.baseURL, apiEP);
+        let theBody = chat.request_jsonstr(apiEP);
+
+        this.elInUser.value = "working...";
+        this.elInUser.disabled = true;
+        console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
+        let theHeaders = chat.fetch_headers(apiEP);
+        let resp = await fetch(theUrl, {
+            method: "POST",
+            headers: theHeaders,
+            body: theBody,
+        });
+
+        let theResp = await chat.handle_response(resp, apiEP, this.elDivChat);
+        if (chatId == this.curChatId) {
+            chat.show(this.elDivChat);
+            if (theResp.trimmed.length > 0) {
+                let p = ui.el_create_append_p(`TRIMMED:${theResp.trimmed}`, this.elDivChat);
+                p.className="role-trim";
+            }
+        } else {
+            console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
+        }
+        this.ui_reset_userinput();
+    }
+
+    /**
+     * Show buttons for NewChat and available chat sessions, in the passed elDiv.
+     * If elDiv is undefined/null, then use this.elDivSessions.
+     * Take care of highlighting the selected chat-session's btn.
+     * @param {HTMLDivElement | undefined} elDiv
+     */
+    show_sessions(elDiv=undefined) {
+        if (!elDiv) {
+            elDiv = this.elDivSessions;
+        }
+        elDiv.replaceChildren();
+        // Btn for creating new chat session
+        let btnNew = ui.el_create_button("New CHAT", (ev)=> {
+            if (this.elInUser.disabled) {
+                console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
+                alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
+                return;
+            }
+            let chatId = `Chat${Object.keys(this.simpleChats).length}`;
+            let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId);
+            if (!chatIdGot) {
+                console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request...");
+                return;
+            }
+            this.new_chat_session(chatIdGot, true);
+            this.create_session_btn(elDiv, chatIdGot);
+            ui.el_children_config_class(elDiv, chatIdGot, "session-selected", "");
+        });
+        elDiv.appendChild(btnNew);
+        // Btns for existing chat sessions
+        let chatIds = Object.keys(this.simpleChats);
+        for(let cid of chatIds) {
+            let btn = this.create_session_btn(elDiv, cid);
+            if (cid == this.curChatId) {
+                btn.className = "session-selected";
+            }
+        }
+    }
+
+    create_session_btn(elDiv, cid) {
+        let btn = ui.el_create_button(cid, (ev)=>{
+            let target = /** @type{HTMLButtonElement} */(ev.target);
+            console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
+            if (this.elInUser.disabled) {
+                console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`);
+                alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching");
+                return;
+            }
+            this.handle_session_switch(target.id);
+            ui.el_children_config_class(elDiv, target.id, "session-selected", "");
+        });
+        elDiv.appendChild(btn);
+        return btn;
+    }
+
+    /**
+     * Switch ui to the specified chatId and set curChatId to same.
+     * @param {string} chatId
+     */
+    async handle_session_switch(chatId) {
+        let chat = this.simpleChats[chatId];
+        if (chat == undefined) {
+            console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`);
+            return;
+        }
+        this.elInSystem.value = chat.get_system_latest();
+        this.elInUser.value = "";
+        chat.show(this.elDivChat);
+        this.elInUser.focus();
+        this.curChatId = chatId;
+        console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`);
+    }
+
+}
+
+
+class Me {
+
+    constructor() {
+        this.baseURL = "http://127.0.0.1:8080";
+        this.defaultChatIds = [ "Default", "Other" ];
+        this.multiChat = new MultiChatUI();
+        this.bStream = true;
+        this.bCompletionFreshChatAlways = true;
+        this.bCompletionInsertStandardRolePrefix = false;
+        this.bTrimGarbage = true;
+        this.iRecentUserMsgCnt = 2;
+        this.sRecentUserMsgCnt = {
+            "Full": -1,
+            "Last0": 1,
+            "Last1": 2,
+            "Last2": 3,
+            "Last4": 5,
+        };
+        this.apiEP = ApiEP.Type.Chat;
+        this.headers = {
+            "Content-Type": "application/json",
+            "Authorization": "", // Authorization: Bearer OPENAI_API_KEY
+        }
+        // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
+        this.apiRequestOptions = {
+            "model": "gpt-3.5-turbo",
+            "temperature": 0.7,
+            "max_tokens": 1024,
+            "n_predict": 1024,
+            "cache_prompt": false,
+            //"frequency_penalty": 1.2,
+            //"presence_penalty": 1.2,
+        };
+    }
+
+    /**
+     * Disable console.debug by mapping it to a empty function.
+     */
+    debug_disable() {
+        this.console_debug = console.debug;
+        console.debug = () => {
+
+        };
+    }
+
+    /**
+     * Setup the load saved chat ui.
+     * @param {HTMLDivElement} div
+     * @param {SimpleChat} chat
+     */
+    setup_load(div, chat) {
+        if (!(chat.ods_key() in localStorage)) {
+            return;
+        }
+        div.innerHTML += `<p class="role-system">Restore</p>
+        <p>Load previously saved chat session, if available</p>`;
+        let btn = ui.el_create_button(chat.ods_key(), (ev)=>{
+            console.log("DBUG:SimpleChat:SC:Load", chat);
+            chat.load();
+            queueMicrotask(()=>{
+                chat.show(div);
+                this.multiChat.elInSystem.value = chat.get_system_latest();
+            });
+        });
+        div.appendChild(btn);
+    }
+
+    /**
+     * Show the configurable parameters info in the passed Div element.
+     * @param {HTMLDivElement} elDiv
+     * @param {boolean} bAll
+     */
+    show_info(elDiv, bAll=false) {
+
+        let p = ui.el_create_append_p("Settings (devel-tools-console document[gMe])", elDiv);
+        p.className = "role-system";
+
+        if (bAll) {
+
+            ui.el_create_append_p(`baseURL:${this.baseURL}`, elDiv);
+
+            ui.el_create_append_p(`Authorization:${this.headers["Authorization"]}`, elDiv);
+
+            ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
+
+            ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
+
+            ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
+
+            ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
+
+            ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
+
+            ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
+
+        }
+
+        ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
+        ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
+
+    }
+
+    /**
+     * Auto create ui input elements for fields in apiRequestOptions
+     * Currently supports text and number field types.
+     * @param {HTMLDivElement} elDiv
+     */
+    show_settings_apirequestoptions(elDiv) {
+        let typeDict = {
+            "string": "text",
+            "number": "number",
+        };
+        let fs = document.createElement("fieldset");
+        let legend = document.createElement("legend");
+        legend.innerText = "ApiRequestOptions";
+        fs.appendChild(legend);
+        elDiv.appendChild(fs);
+        for(const k in this.apiRequestOptions) {
+            let val = this.apiRequestOptions[k];
+            let type = typeof(val);
+            if (((type == "string") || (type == "number"))) {
+                let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
+                    if (type == "number") {
+                        val = Number(val);
+                    }
+                    this.apiRequestOptions[k] = val;
+                });
+                fs.appendChild(inp.div);
+            } else if (type == "boolean") {
+                let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
+                    this.apiRequestOptions[k] = userVal;
+                });
+                fs.appendChild(bbtn.div);
+            }
+        }
+    }
+
+    /**
+     * Show settings ui for configurable parameters, in the passed Div element.
+     * @param {HTMLDivElement} elDiv
+     */
+    show_settings(elDiv) {
+
+        let inp = ui.el_creatediv_input("SetBaseURL", "BaseURL", "text", this.baseURL, (val)=>{
+            this.baseURL = val;
+        });
+        elDiv.appendChild(inp.div);
+
+        inp = ui.el_creatediv_input("SetAuthorization", "Authorization", "text", this.headers["Authorization"], (val)=>{
+            this.headers["Authorization"] = val;
+        });
+        inp.el.placeholder = "Bearer OPENAI_API_KEY";
+        elDiv.appendChild(inp.div);
+
+        let bb = ui.el_creatediv_boolbutton("SetStream", "Stream", {true: "[+] yes stream", false: "[-] do oneshot"}, this.bStream, (val)=>{
+            this.bStream = val;
+        });
+        elDiv.appendChild(bb.div);
+
+        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
+            this.bTrimGarbage = val;
+        });
+        elDiv.appendChild(bb.div);
+
+        this.show_settings_apirequestoptions(elDiv);
+
+        let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
+            this.apiEP = ApiEP.Type[val];
+        });
+        elDiv.appendChild(sel.div);
+
+        sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
+            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
+        });
+        elDiv.appendChild(sel.div);
+
+        bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
+            this.bCompletionFreshChatAlways = val;
+        });
+        elDiv.appendChild(bb.div);
+
+        bb = ui.el_creatediv_boolbutton("SetCompletionInsertStandardRolePrefix", "CompletionInsertStandardRolePrefix", {true: "[+] yes insert", false: "[-] dont insert"}, this.bCompletionInsertStandardRolePrefix, (val)=>{
+            this.bCompletionInsertStandardRolePrefix = val;
+        });
+        elDiv.appendChild(bb.div);
+
+    }
+
+}
+
+
+/** @type {Me} */
+let gMe;
+
+function startme() {
+    console.log("INFO:SimpleChat:StartMe:Starting...");
+    gMe = new Me();
+    gMe.debug_disable();
+    document["gMe"] = gMe;
+    document["du"] = du;
+    for (let cid of gMe.defaultChatIds) {
+        gMe.multiChat.new_chat_session(cid);
+    }
+    gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
+    gMe.multiChat.show_sessions();
+}
+
+document.addEventListener("DOMContentLoaded", startme);
diff --git a/examples/server/public_simplechat/simplechat_screens.webp b/examples/server/public_simplechat/simplechat_screens.webp
new file mode 100644
index 000000000..ccea44396
Binary files /dev/null and b/examples/server/public_simplechat/simplechat_screens.webp differ
diff --git a/examples/server/public_simplechat/ui.mjs b/examples/server/public_simplechat/ui.mjs
new file mode 100644
index 000000000..b2d5b9aea
--- /dev/null
+++ b/examples/server/public_simplechat/ui.mjs
@@ -0,0 +1,211 @@
+//@ts-check
+// Helpers to work with html elements
+// by Humans for All
+//
+
+
+/**
+ * Set the class of the children, based on whether it is the idSelected or not.
+ * @param {HTMLDivElement} elBase
+ * @param {string} idSelected
+ * @param {string} classSelected
+ * @param {string} classUnSelected
+ */
+export function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
+    for(let child of elBase.children) {
+        if (child.id == idSelected) {
+            child.className = classSelected;
+        } else {
+            child.className = classUnSelected;
+        }
+    }
+}
+
+/**
+ * Create button and set it up.
+ * @param {string} id
+ * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
+ * @param {string | undefined} name
+ * @param {string | undefined} innerText
+ */
+export function el_create_button(id, callback, name=undefined, innerText=undefined) {
+    if (!name) {
+        name = id;
+    }
+    if (!innerText) {
+        innerText = id;
+    }
+    let btn = document.createElement("button");
+    btn.id = id;
+    btn.name = name;
+    btn.innerText = innerText;
+    btn.addEventListener("click", callback);
+    return btn;
+}
+
+/**
+ * Create a para and set it up. Optionaly append it to a passed parent.
+ * @param {string} text
+ * @param {HTMLElement | undefined} elParent
+ * @param {string | undefined} id
+ */
+export function el_create_append_p(text, elParent=undefined, id=undefined) {
+    let para = document.createElement("p");
+    para.innerText = text;
+    if (id) {
+        para.id = id;
+    }
+    if (elParent) {
+        elParent.appendChild(para);
+    }
+    return para;
+}
+
+/**
+ * Create a button which represents bool value using specified text wrt true and false.
+ * When ever user clicks the button, it will toggle the value and update the shown text.
+ *
+ * @param {string} id
+ * @param {{true: string, false: string}} texts
+ * @param {boolean} defaultValue
+ * @param {function(boolean):void} cb
+ */
+export function el_create_boolbutton(id, texts, defaultValue, cb) {
+    let el = document.createElement("button");
+    el["xbool"] = defaultValue;
+    el["xtexts"] = structuredClone(texts);
+    el.innerText = el["xtexts"][String(defaultValue)];
+    if (id) {
+        el.id = id;
+    }
+    el.addEventListener('click', (ev)=>{
+        el["xbool"] = !el["xbool"];
+        el.innerText = el["xtexts"][String(el["xbool"])];
+        cb(el["xbool"]);
+    })
+    return el;
+}
+
+/**
+ * Create a div wrapped button which represents bool value using specified text wrt true and false.
+ * @param {string} id
+ * @param {string} label
+ * @param {{ true: string; false: string; }} texts
+ * @param {boolean} defaultValue
+ * @param {(arg0: boolean) => void} cb
+ * @param {string} className
+ */
+export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, className="gridx2") {
+    let div = document.createElement("div");
+    div.className = className;
+    let lbl = document.createElement("label");
+    lbl.setAttribute("for", id);
+    lbl.innerText = label;
+    div.appendChild(lbl);
+    let btn = el_create_boolbutton(id, texts, defaultValue, cb);
+    div.appendChild(btn);
+    return { div: div, el: btn };
+}
+
+
+/**
+ * Create a select ui element, with a set of options to select from.
+ * * options: an object which contains name-value pairs
+ * * defaultOption: the value whose name should be choosen, by default.
+ * * cb : the call back returns the name string of the option selected.
+ *
+ * @param {string} id
+ * @param {Object<string,*>} options
+ * @param {*} defaultOption
+ * @param {function(string):void} cb
+ */
+export function el_create_select(id, options, defaultOption, cb) {
+    let el = document.createElement("select");
+    el["xselected"] = defaultOption;
+    el["xoptions"] = structuredClone(options);
+    for(let cur of Object.keys(options)) {
+        let op = document.createElement("option");
+        op.value = cur;
+        op.innerText = cur;
+        if (options[cur] == defaultOption) {
+            op.selected = true;
+        }
+        el.appendChild(op);
+    }
+    if (id) {
+        el.id = id;
+        el.name = id;
+    }
+    el.addEventListener('change', (ev)=>{
+        let target = /** @type{HTMLSelectElement} */(ev.target);
+        console.log("DBUG:UI:Select:", id, ":", target.value);
+        cb(target.value);
+    })
+    return el;
+}
+
+/**
+ * Create a div wrapped select ui element, with a set of options to select from.
+ *
+ * @param {string} id
+ * @param {any} label
+ * @param {{ [x: string]: any; }} options
+ * @param {any} defaultOption
+ * @param {(arg0: string) => void} cb
+ * @param {string} className
+ */
+export function el_creatediv_select(id, label, options, defaultOption, cb, className="gridx2") {
+    let div = document.createElement("div");
+    div.className = className;
+    let lbl = document.createElement("label");
+    lbl.setAttribute("for", id);
+    lbl.innerText = label;
+    div.appendChild(lbl);
+    let sel = el_create_select(id, options,defaultOption, cb);
+    div.appendChild(sel);
+    return { div: div, el: sel };
+}
+
+
+/**
+ * Create a input ui element.
+ *
+ * @param {string} id
+ * @param {string} type
+ * @param {any} defaultValue
+ * @param {function(any):void} cb
+ */
+export function el_create_input(id, type, defaultValue, cb) {
+    let el = document.createElement("input");
+    el.type = type;
+    el.value = defaultValue;
+    if (id) {
+        el.id = id;
+    }
+    el.addEventListener('change', (ev)=>{
+        cb(el.value);
+    })
+    return el;
+}
+
+/**
+ * Create a div wrapped input.
+ *
+ * @param {string} id
+ * @param {string} label
+ * @param {string} type
+ * @param {any} defaultValue
+ * @param {function(any):void} cb
+ * @param {string} className
+ */
+export function el_creatediv_input(id, label, type, defaultValue, cb, className="gridx2") {
+    let div = document.createElement("div");
+    div.className = className;
+    let lbl = document.createElement("label");
+    lbl.setAttribute("for", id);
+    lbl.innerText = label;
+    div.appendChild(lbl);
+    let el = el_create_input(id, type, defaultValue, cb);
+    div.appendChild(el);
+    return { div: div, el: el };
+}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2b2f4a0f4..0718806c8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,105 +1,1232 @@
-#include "common.h"
-#include "llama.h"
-#include "grammar-parser.h"
 #include "utils.hpp"
-#include "oai.hpp"
 
-#include "../llava/clip.h"
-#include "../llava/llava.h"
+#include "arg.h"
+#include "common.h"
+#include "json-schema-to-grammar.h"
+#include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"
 
-#include "stb_image.h"
-
-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
-// increase max payload length to allow use of larger context size
-#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
-#include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
+// mime type for sending response
+#define MIMETYPE_JSON "application/json; charset=utf-8"
 
-// auto generated files (update with ./deps.sh)
-#include "index.html.hpp"
-#include "index.js.hpp"
-#include "completion.js.hpp"
-#include "json-schema-to-grammar.mjs.hpp"
+// auto generated files (see README.md for details)
+#include "index.html.gz.hpp"
+#include "loading.html.hpp"
 
-#include <cstddef>
-#include <thread>
+#include <atomic>
 #include <chrono>
 #include <condition_variable>
-#include <atomic>
+#include <cstddef>
+#include <cinttypes>
+#include <deque>
+#include <memory>
+#include <mutex>
 #include <signal.h>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
 
-using json = nlohmann::json;
+using json = nlohmann::ordered_json;
 
-struct server_params {
-    std::string hostname = "127.0.0.1";
-    std::vector<std::string> api_keys;
-    std::string public_path = "examples/server/public";
-    std::string chat_template = "";
-    int32_t port = 8080;
-    int32_t read_timeout = 600;
-    int32_t write_timeout = 600;
-    bool slots_endpoint = true;
-    bool metrics_endpoint = false;
-    int n_threads_http = -1;
-};
-
-bool server_verbose = false;
-bool server_log_json = true;
+constexpr int HTTP_POLLING_SECONDS = 1;
 
 enum stop_type {
-    STOP_FULL,
-    STOP_PARTIAL,
+    STOP_TYPE_NONE,
+    STOP_TYPE_EOS,
+    STOP_TYPE_WORD,
+    STOP_TYPE_LIMIT,
 };
 
-// TODO: can become bool if we can't find use of more states
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
-    IDLE,
-    PROCESSING,
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
 };
 
-enum slot_command {
-    NONE,
-    LOAD_PROMPT,
-    RELEASE,
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+};
+
+enum server_task_type {
+    SERVER_TASK_TYPE_COMPLETION,
+    SERVER_TASK_TYPE_EMBEDDING,
+    SERVER_TASK_TYPE_RERANK,
+    SERVER_TASK_TYPE_INFILL,
+    SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_NEXT_RESPONSE,
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
+};
+
+enum oaicompat_type {
+    OAICOMPAT_TYPE_NONE,
+    OAICOMPAT_TYPE_CHAT,
+    OAICOMPAT_TYPE_COMPLETION,
+    OAICOMPAT_TYPE_EMBEDDING,
+};
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE, // custom error
+    ERROR_TYPE_NOT_SUPPORTED, // custom error
 };
 
 struct slot_params {
-    bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool stream        = true;
+    bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
+    bool return_tokens = false;
 
-    uint32_t seed      = -1; // RNG seed
-    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_predict = -1; // new tokens to predict
+    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_indent  =  0; // mininum line indentation for the generated text in number of whitespace characters
+
+    int64_t t_max_prompt_ms  = -1; // TODO: implement
+    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
+
+    std::vector<common_adapter_lora_info> lora;
 
     std::vector<std::string> antiprompt;
+    std::vector<std::string> response_fields;
+    bool timings_per_token = false;
+    bool post_sampling_probs = false;
+    bool ignore_eos = false;
 
-    json input_prefix;
-    json input_suffix;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
+
+    // OAI-compat fields
+    bool                  verbose                   = false;
+    oaicompat_type        oaicompat                 = OAICOMPAT_TYPE_NONE;
+    std::string           oaicompat_model;
+    std::string           oaicompat_cmpl_id;
+    common_chat_format    oaicompat_chat_format     = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+
+    json to_json() const {
+        std::vector<std::string> samplers;
+        samplers.reserve(sampling.samplers.size());
+        for (const auto & sampler : sampling.samplers) {
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
+        }
+
+        json lora = json::array();
+        for (size_t i = 0; i < this->lora.size(); ++i) {
+            lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+        }
+
+        std::vector<std::string> grammar_trigger_words;
+        for (const auto & trigger : sampling.grammar_trigger_words) {
+            grammar_trigger_words.push_back(trigger.word);
+        }
+
+        return json {
+            {"n_predict",                 n_predict},     // Server configured n_predict
+            {"seed",                      sampling.seed},
+            {"temperature",               sampling.temp},
+            {"dynatemp_range",            sampling.dynatemp_range},
+            {"dynatemp_exponent",         sampling.dynatemp_exponent},
+            {"top_k",                     sampling.top_k},
+            {"top_p",                     sampling.top_p},
+            {"min_p",                     sampling.min_p},
+            {"xtc_probability",           sampling.xtc_probability},
+            {"xtc_threshold",             sampling.xtc_threshold},
+            {"typical_p",                 sampling.typ_p},
+            {"repeat_last_n",             sampling.penalty_last_n},
+            {"repeat_penalty",            sampling.penalty_repeat},
+            {"presence_penalty",          sampling.penalty_present},
+            {"frequency_penalty",         sampling.penalty_freq},
+            {"dry_multiplier",            sampling.dry_multiplier},
+            {"dry_base",                  sampling.dry_base},
+            {"dry_allowed_length",        sampling.dry_allowed_length},
+            {"dry_penalty_last_n",        sampling.dry_penalty_last_n},
+            {"dry_sequence_breakers",     sampling.dry_sequence_breakers},
+            {"mirostat",                  sampling.mirostat},
+            {"mirostat_tau",              sampling.mirostat_tau},
+            {"mirostat_eta",              sampling.mirostat_eta},
+            {"stop",                      antiprompt},
+            {"max_tokens",                n_predict}, // User configured n_predict
+            {"n_keep",                    n_keep},
+            {"n_discard",                 n_discard},
+            {"ignore_eos",                sampling.ignore_eos},
+            {"stream",                    stream},
+            {"logit_bias",                format_logit_bias(sampling.logit_bias)},
+            {"n_probs",                   sampling.n_probs},
+            {"min_keep",                  sampling.min_keep},
+            {"grammar",                   sampling.grammar},
+            {"grammar_trigger_words",     grammar_trigger_words},
+            {"grammar_trigger_tokens",    sampling.grammar_trigger_tokens},
+            {"preserved_tokens",          sampling.preserved_tokens},
+            {"samplers",                  samplers},
+            {"speculative.n_max",         speculative.n_max},
+            {"speculative.n_min",         speculative.n_min},
+            {"speculative.p_min",         speculative.p_min},
+            {"timings_per_token",         timings_per_token},
+            {"post_sampling_probs",       post_sampling_probs},
+            {"lora",                      lora},
+        };
+    }
 };
 
-struct slot_image {
-    int32_t id;
+struct server_task {
+    int id    = -1; // to be filled by server_queue
+    int index = -1; // used when there are multiple prompts (batch request)
 
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
+    server_task_type type;
 
-    clip_image_u8 * img_data;
+    // used by SERVER_TASK_TYPE_CANCEL
+    int id_target = -1;
 
-    std::string prefix_prompt; // before of this image
+    // used by SERVER_TASK_TYPE_INFERENCE
+    slot_params  params;
+    llama_tokens prompt_tokens;
+    int id_selected_slot = -1;
+
+    // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
+    struct slot_action {
+        int slot_id;
+        std::string filename;
+        std::string filepath;
+    };
+    slot_action slot_action;
+
+    // used by SERVER_TASK_TYPE_METRICS
+    bool metrics_reset_bucket = false;
+
+    // used by SERVER_TASK_TYPE_SET_LORA
+    std::vector<common_adapter_lora_info> set_lora;
+
+    server_task(server_task_type type) : type(type) {}
+
+    static slot_params params_from_json_cmpl(
+            const llama_context * ctx,
+            const common_params & params_base,
+            const json & data) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
+        slot_params params;
+
+        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+        slot_params defaults;
+        defaults.sampling    = params_base.sampling;
+        defaults.speculative = params_base.speculative;
+
+        // enabling this will output extra debug information in the HTTP responses from the server
+        params.verbose           = params_base.verbosity > 9;
+        params.timings_per_token = json_value(data, "timings_per_token", false);
+
+        params.stream           = json_value(data, "stream",             false);
+        params.cache_prompt     = json_value(data, "cache_prompt",       true);
+        params.return_tokens    = json_value(data, "return_tokens",      false);
+        params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
+        params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
+        params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);
+        params.n_discard        = json_value(data, "n_discard",          defaults.n_discard);
+      //params.t_max_prompt_ms  = json_value(data, "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
+        params.t_max_predict_ms = json_value(data, "t_max_predict_ms",   defaults.t_max_predict_ms);
+        params.response_fields  = json_value(data, "response_fields",   std::vector<std::string>());
+
+        params.sampling.top_k              = json_value(data, "top_k",              defaults.sampling.top_k);
+        params.sampling.top_p              = json_value(data, "top_p",              defaults.sampling.top_p);
+        params.sampling.min_p              = json_value(data, "min_p",              defaults.sampling.min_p);
+        params.sampling.xtc_probability    = json_value(data, "xtc_probability",    defaults.sampling.xtc_probability);
+        params.sampling.xtc_threshold      = json_value(data, "xtc_threshold",      defaults.sampling.xtc_threshold);
+        params.sampling.typ_p              = json_value(data, "typical_p",          defaults.sampling.typ_p);
+        params.sampling.temp               = json_value(data, "temperature",        defaults.sampling.temp);
+        params.sampling.dynatemp_range     = json_value(data, "dynatemp_range",     defaults.sampling.dynatemp_range);
+        params.sampling.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  defaults.sampling.dynatemp_exponent);
+        params.sampling.penalty_last_n     = json_value(data, "repeat_last_n",      defaults.sampling.penalty_last_n);
+        params.sampling.penalty_repeat     = json_value(data, "repeat_penalty",     defaults.sampling.penalty_repeat);
+        params.sampling.penalty_freq       = json_value(data, "frequency_penalty",  defaults.sampling.penalty_freq);
+        params.sampling.penalty_present    = json_value(data, "presence_penalty",   defaults.sampling.penalty_present);
+        params.sampling.dry_multiplier     = json_value(data, "dry_multiplier",     defaults.sampling.dry_multiplier);
+        params.sampling.dry_base           = json_value(data, "dry_base",           defaults.sampling.dry_base);
+        params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
+        params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
+        params.sampling.mirostat           = json_value(data, "mirostat",           defaults.sampling.mirostat);
+        params.sampling.mirostat_tau       = json_value(data, "mirostat_tau",       defaults.sampling.mirostat_tau);
+        params.sampling.mirostat_eta       = json_value(data, "mirostat_eta",       defaults.sampling.mirostat_eta);
+        params.sampling.seed               = json_value(data, "seed",               defaults.sampling.seed);
+        params.sampling.n_probs            = json_value(data, "n_probs",            defaults.sampling.n_probs);
+        params.sampling.min_keep           = json_value(data, "min_keep",           defaults.sampling.min_keep);
+        params.post_sampling_probs         = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
+
+        params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
+        params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
+        params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
+
+        params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
+        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_max = std::max(params.speculative.n_max, 0);
+
+        // Use OpenAI API logprobs only if n_probs wasn't provided
+        if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
+            params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
+        }
+
+        if (data.contains("lora")) {
+            if (data.at("lora").is_array()) {
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            } else {
+                throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
+            }
+        } else {
+            params.lora = params_base.lora_adapters;
+        }
+
+        // TODO: add more sanity checks for the input parameters
+
+        if (params.sampling.penalty_last_n < -1) {
+            throw std::runtime_error("Error: repeat_last_n must be >= -1");
+        }
+
+        if (params.sampling.dry_penalty_last_n < -1) {
+            throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
+        }
+
+        if (params.sampling.penalty_last_n == -1) {
+            // note: should be the slot's context and not the full context, but it's ok
+            params.sampling.penalty_last_n = llama_n_ctx(ctx);
+        }
+
+        if (params.sampling.dry_penalty_last_n == -1) {
+            params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
+        }
+
+        if (params.sampling.dry_base < 1.0f) {
+            params.sampling.dry_base = defaults.sampling.dry_base;
+        }
+
+        // sequence breakers for DRY
+        {
+            // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format
+            // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
+
+            if (data.contains("dry_sequence_breakers")) {
+                params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+                if (params.sampling.dry_sequence_breakers.empty()) {
+                    throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings");
+                }
+            }
+        }
+
+        // process "json_schema" and "grammar"
+        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
+            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+        }
+        if (data.contains("json_schema") && !data.contains("grammar")) {
+            try {
+                auto schema                  = json_value(data, "json_schema", json::object());
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+                params.sampling.grammar      = json_schema_to_grammar(schema);
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+            } catch (const std::exception & e) {
+                throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
+            }
+        } else {
+            params.sampling.grammar      = json_value(data, "grammar", defaults.sampling.grammar);
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+            params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+        }
+
+        {
+            auto it = data.find("chat_format");
+            if (it != data.end()) {
+                params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+            } else {
+                params.oaicompat_chat_format = defaults.oaicompat_chat_format;
+            }
+        }
+
+        {
+            const auto grammar_triggers = data.find("grammar_triggers");
+            if (grammar_triggers != data.end()) {
+                for (const auto & t : *grammar_triggers) {
+                    common_grammar_trigger trigger;
+                    trigger.word = t.at("word");
+                    trigger.at_start = t.at("at_start");
+
+                    auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
+                    if (ids.size() == 1) {
+                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
+                        params.sampling.grammar_trigger_tokens.push_back(ids[0]);
+                        params.sampling.preserved_tokens.insert(ids[0]);
+                        continue;
+                    }
+                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
+                    params.sampling.grammar_trigger_words.push_back(trigger);
+                }
+            }
+            const auto preserved_tokens = data.find("preserved_tokens");
+            if (preserved_tokens != data.end()) {
+                for (const auto & t : *preserved_tokens) {
+                    auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
+                    if (ids.size() == 1) {
+                        SRV_DBG("Preserved token: %d\n", ids[0]);
+                        params.sampling.preserved_tokens.insert(ids[0]);
+                    } else {
+                        // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
+                        SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+                    }
+                }
+            }
+            if (params.sampling.grammar_lazy) {
+                GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
+            }
+        }
+
+        {
+            params.sampling.logit_bias.clear();
+            params.ignore_eos = json_value(data, "ignore_eos", false);
+
+            const auto & logit_bias = data.find("logit_bias");
+            if (logit_bias != data.end() && logit_bias->is_array()) {
+                const int n_vocab = llama_vocab_n_tokens(vocab);
+                for (const auto & el : *logit_bias) {
+                    // TODO: we may want to throw errors here, in case "el" is incorrect
+                    if (el.is_array() && el.size() == 2) {
+                        float bias;
+                        if (el[1].is_number()) {
+                            bias = el[1].get<float>();
+                        } else if (el[1].is_boolean() && !el[1].get<bool>()) {
+                            bias = -INFINITY;
+                        } else {
+                            continue;
+                        }
+
+                        if (el[0].is_number_integer()) {
+                            llama_token tok = el[0].get<llama_token>();
+                            if (tok >= 0 && tok < n_vocab) {
+                                params.sampling.logit_bias.push_back({tok, bias});
+                            }
+                        } else if (el[0].is_string()) {
+                            auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
+                            for (auto tok : toks) {
+                                params.sampling.logit_bias.push_back({tok, bias});
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        {
+            params.antiprompt.clear();
+
+            const auto & stop = data.find("stop");
+            if (stop != data.end() && stop->is_array()) {
+                for (const auto & word : *stop) {
+                    if (!word.empty()) {
+                        params.antiprompt.push_back(word);
+                    }
+                }
+            }
+        }
+
+        {
+            const auto samplers = data.find("samplers");
+            if (samplers != data.end()) {
+                if (samplers->is_array()) {
+                    params.sampling.samplers = common_sampler_types_from_names(*samplers, false);
+                } else if (samplers->is_string()){
+                    params.sampling.samplers = common_sampler_types_from_chars(samplers->get<std::string>());
+                }
+            } else {
+                params.sampling.samplers = defaults.sampling.samplers;
+            }
+        }
+
+        std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
+        params.oaicompat_model = json_value(data, "model", model_name);
+
+        return params;
+    }
+
+    // utility function
+    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
+        std::unordered_set<int> ids(tasks.size());
+        for (size_t i = 0; i < tasks.size(); i++) {
+            ids.insert(tasks[i].id);
+        }
+        return ids;
+    }
+};
+
+struct result_timings {
+    int32_t prompt_n = -1;
+    double prompt_ms;
+    double prompt_per_token_ms;
+    double prompt_per_second;
+
+    int32_t predicted_n = -1;
+    double predicted_ms;
+    double predicted_per_token_ms;
+    double predicted_per_second;
+
+    json to_json() const {
+        return {
+            {"prompt_n",               prompt_n},
+            {"prompt_ms",              prompt_ms},
+            {"prompt_per_token_ms",    prompt_per_token_ms},
+            {"prompt_per_second",      prompt_per_second},
+
+            {"predicted_n",            predicted_n},
+            {"predicted_ms",           predicted_ms},
+            {"predicted_per_token_ms", predicted_per_token_ms},
+            {"predicted_per_second",   predicted_per_second},
+        };
+    }
+};
+
+struct server_task_result {
+    int id           = -1;
+    int id_slot      = -1;
+    virtual bool is_error() {
+        // only used by server_task_result_error
+        return false;
+    }
+    virtual bool is_stop() {
+        // only used by server_task_result_cmpl_*
+        return false;
+    }
+    virtual int get_index() {
+        return -1;
+    }
+    virtual json to_json() = 0;
+    virtual ~server_task_result() = default;
+};
+
+// using shared_ptr for polymorphism of server_task_result
+using server_task_result_ptr = std::unique_ptr<server_task_result>;
+
+inline std::string stop_type_to_str(stop_type type) {
+    switch (type) {
+        case STOP_TYPE_EOS:   return "eos";
+        case STOP_TYPE_WORD:  return "word";
+        case STOP_TYPE_LIMIT: return "limit";
+        default:              return "none";
+    }
+}
+
+struct completion_token_output {
+    llama_token tok;
+    float prob;
+    std::string text_to_send;
+    struct prob_info {
+        llama_token tok;
+        std::string txt;
+        float prob;
+    };
+    std::vector<prob_info> probs;
+
+    json to_json(bool post_sampling_probs) const {
+        json probs_for_token = json::array();
+        for (const auto & p : probs) {
+            std::string txt(p.txt);
+            txt.resize(validate_utf8(txt));
+            probs_for_token.push_back(json {
+                {"id",      p.tok},
+                {"token",   txt},
+                {"bytes",   str_to_bytes(p.txt)},
+                {
+                    post_sampling_probs ? "prob" : "logprob",
+                    post_sampling_probs ? p.prob : logarithm(p.prob)
+                },
+            });
+        }
+        return probs_for_token;
+    }
+
+    static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs) {
+        json out = json::array();
+        for (const auto & p : probs) {
+            std::string txt(p.text_to_send);
+            txt.resize(validate_utf8(txt));
+            out.push_back(json {
+                {"id",           p.tok},
+                {"token",        txt},
+                {"bytes",        str_to_bytes(p.text_to_send)},
+                {
+                    post_sampling_probs ? "prob" : "logprob",
+                    post_sampling_probs ? p.prob : logarithm(p.prob)
+                },
+                {
+                    post_sampling_probs ? "top_probs" : "top_logprobs",
+                    p.to_json(post_sampling_probs)
+                },
+            });
+        }
+        return out;
+    }
+
+    static float logarithm(float x) {
+        // nlohmann::json converts -inf to null, so we need to prevent that
+        return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
+    }
+
+    static std::vector<unsigned char> str_to_bytes(const std::string & str) {
+        std::vector<unsigned char> bytes;
+        for (unsigned char c : str) {
+            bytes.push_back(c);
+        }
+        return bytes;
+    }
+};
+
+struct server_task_result_cmpl_final : server_task_result {
+    int index = 0;
+
+    std::string content;
+    llama_tokens tokens;
+
+    bool stream;
+    result_timings timings;
+    std::string prompt;
+
+    bool truncated;
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+    int32_t n_tokens_cached;
+    bool has_new_line;
+    std::string stopping_word;
+    stop_type stop = STOP_TYPE_NONE;
+
+    bool post_sampling_probs;
+    std::vector<completion_token_output> probs_output;
+    std::vector<std::string>  response_fields;
+
+    slot_params generation_params;
+
+    // OAI-compat fields
+    bool                  verbose                  = false;
+    oaicompat_type        oaicompat                = OAICOMPAT_TYPE_NONE;
+    std::string           oaicompat_model;
+    std::string           oaicompat_cmpl_id;
+    common_chat_format    oaicompat_chat_format    = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual bool is_stop() override {
+        return true; // in stream mode, final responses are considered stop
+    }
+
+    virtual json to_json() override {
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
+    }
+
+    json to_json_non_oaicompat() {
+        json res = json {
+            {"index",               index},
+            {"content",             stream ? "" : content}, // in stream mode, content is already in last partial chunk
+            {"tokens",              stream ? llama_tokens {} : tokens},
+            {"id_slot",             id_slot},
+            {"stop",                true},
+            {"model",               oaicompat_model},
+            {"tokens_predicted",    n_decoded},
+            {"tokens_evaluated",    n_prompt_tokens},
+            {"generation_settings", generation_params.to_json()},
+            {"prompt",              prompt},
+            {"has_new_line",        has_new_line},
+            {"truncated",           truncated},
+            {"stop_type",           stop_type_to_str(stop)},
+            {"stopping_word",       stopping_word},
+            {"tokens_cached",       n_tokens_cached},
+            {"timings",             timings.to_json()},
+        };
+        if (!stream && !probs_output.empty()) {
+            res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
+        }
+        return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
+    }
+
+    json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json logprobs = json(nullptr); // OAI default to null
+        if (!stream && probs_output.size() > 0) {
+            logprobs = json{
+                {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
+            };
+        }
+        json finish_reason = "length";
+        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            finish_reason = "stop";
+        }
+        json res = json {
+            {"choices",            json::array({
+                json{
+                    {"text",          stream ? "" : content}, // in stream mode, content is already in last partial chunk
+                    {"index",         index},
+                    {"logprobs",      logprobs},
+                    {"finish_reason", finish_reason},
+                }
+            })},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "text_completion"},
+            {"usage", json {
+                {"completion_tokens", n_decoded},
+                {"prompt_tokens",     n_prompt_tokens},
+                {"total_tokens",      n_decoded + n_prompt_tokens}
+            }},
+            {"id", oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
+        return res;
+    }
+
+    json to_json_oaicompat_chat() {
+        std::string finish_reason = "length";
+        common_chat_msg msg;
+        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            SRV_DBG("Parsing chat message: %s\n", content.c_str());
+            msg = common_chat_parse(content, oaicompat_chat_format);
+            finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
+        } else {
+            msg.content = content;
+        }
+
+        json tool_calls;
+        if (!msg.tool_calls.empty()) {
+            tool_calls = json::array();
+            for (const auto & tc : msg.tool_calls) {
+                tool_calls.push_back({
+                    {"type", "function"},
+                    {"function", {
+                        {"name", tc.name},
+                        {"arguments", tc.arguments},
+                    }},
+                    {"id", tc.id},
+                });
+            }
+        }
+
+        json message {
+            {"content", msg.content},
+            {"tool_calls", tool_calls},
+            {"role", "assistant"},
+        };
+        if (!msg.tool_plan.empty()) {
+            message["tool_plan"] = msg.tool_plan;
+        }
+
+        json choice {
+            {"finish_reason", finish_reason},
+            {"index", 0},
+            {"message", message},
+        };
+
+        if (!stream && probs_output.size() > 0) {
+            choice["logprobs"] = json{
+                {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
+            };
+        }
+
+        std::time_t t = std::time(0);
+
+        json res = json {
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion"},
+            {"usage", json {
+                {"completion_tokens", n_decoded},
+                {"prompt_tokens",     n_prompt_tokens},
+                {"total_tokens",      n_decoded + n_prompt_tokens}
+            }},
+            {"id", oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
+        return res;
+    }
+
+    json to_json_oaicompat_chat_stream() {
+        std::time_t t = std::time(0);
+        std::string finish_reason = "length";
+        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            finish_reason = "stop";
+        }
+
+        json choice = json {
+            {"finish_reason", finish_reason},
+            {"index", 0},
+            {"delta", json::object()}
+        };
+
+        json ret = json {
+            {"choices",            json::array({choice})},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"},
+            {"usage", json {
+                {"completion_tokens", n_decoded},
+                {"prompt_tokens",     n_prompt_tokens},
+                {"total_tokens",      n_decoded + n_prompt_tokens},
+            }},
+        };
+
+        if (timings.prompt_n >= 0) {
+            ret.push_back({"timings", timings.to_json()});
+        }
+
+        return ret;
+    }
+};
+
+struct server_task_result_cmpl_partial : server_task_result {
+    int index = 0;
+
+    std::string  content;
+    llama_tokens tokens;
+
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+
+    bool post_sampling_probs;
+    completion_token_output prob_output;
+    result_timings timings;
+
+    // OAI-compat fields
+    bool           verbose   = false;
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+    std::string    oaicompat_model;
+    std::string    oaicompat_cmpl_id;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual bool is_stop() override {
+        return false; // in stream mode, partial responses are not considered stop
+    }
+
+    virtual json to_json() override {
+        switch (oaicompat) {
+            case OAICOMPAT_TYPE_NONE:
+                return to_json_non_oaicompat();
+            case OAICOMPAT_TYPE_COMPLETION:
+                return to_json_oaicompat();
+            case OAICOMPAT_TYPE_CHAT:
+                return to_json_oaicompat_chat();
+            default:
+                GGML_ASSERT(false && "Invalid oaicompat_type");
+        }
+    }
+
+    json to_json_non_oaicompat() {
+        // non-OAI-compat JSON
+        json res = json {
+            {"index",            index},
+            {"content",          content},
+            {"tokens",           tokens},
+            {"stop",             false},
+            {"id_slot",          id_slot},
+            {"tokens_predicted", n_decoded},
+            {"tokens_evaluated", n_prompt_tokens},
+        };
+        // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
+        if (timings.prompt_n > 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+        if (!prob_output.probs.empty()) {
+            res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
+        }
+        return res;
+    }
+
+    json to_json_oaicompat() {
+        std::time_t t = std::time(0);
+        json logprobs = json(nullptr); // OAI default to null
+        if (prob_output.probs.size() > 0) {
+            logprobs = json{
+                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+            };
+        }
+        json res = json {
+            {"choices",            json::array({
+                json{
+                    {"text",          content},
+                    {"index",         index},
+                    {"logprobs",      logprobs},
+                    {"finish_reason", nullptr},
+                }
+            })},
+            {"created",            t},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "text_completion"},
+            {"id",                 oaicompat_cmpl_id}
+        };
+
+        // extra fields for debugging purposes
+        if (verbose) {
+            res["__verbose"] = to_json_non_oaicompat();
+        }
+        if (timings.prompt_n >= 0) {
+            res.push_back({"timings", timings.to_json()});
+        }
+
+        return res;
+    }
+
+    json to_json_oaicompat_chat() {
+        bool first = n_decoded == 0;
+        std::time_t t = std::time(0);
+        json choices;
+
+        if (first) {
+            if (content.empty()) {
+                choices = json::array({json{{"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{{"role", "assistant"}}}}});
+            } else {
+                // We have to send this as two updates to conform to openai behavior
+                json initial_ret = json{{"choices", json::array({json{
+                                        {"finish_reason", nullptr},
+                                        {"index", 0},
+                                        {"delta", json{
+                                            {"role", "assistant"}
+                                        }}}})},
+                            {"created", t},
+                            {"id", oaicompat_cmpl_id},
+                            {"model", oaicompat_model},
+                            {"object", "chat.completion.chunk"}};
+
+                json second_ret = json{
+                            {"choices", json::array({json{{"finish_reason", nullptr},
+                                                            {"index", 0},
+                                                            {"delta", json {
+                                                            {"content", content}}}
+                                                            }})},
+                            {"created", t},
+                            {"id", oaicompat_cmpl_id},
+                            {"model", oaicompat_model},
+                            {"object", "chat.completion.chunk"}};
+
+                return std::vector<json>({initial_ret, second_ret});
+            }
+        } else {
+            choices = json::array({json{
+                {"finish_reason", nullptr},
+                {"index", 0},
+                {"delta",
+                json {
+                    {"content", content},
+                }},
+            }});
+        }
+
+        GGML_ASSERT(choices.size() >= 1);
+
+        if (prob_output.probs.size() > 0) {
+            choices[0]["logprobs"] = json{
+                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+            };
+        }
+
+        json ret = json {
+            {"choices",            choices},
+            {"created",            t},
+            {"id",                 oaicompat_cmpl_id},
+            {"model",              oaicompat_model},
+            {"system_fingerprint", build_info},
+            {"object",             "chat.completion.chunk"}
+        };
+
+        if (timings.prompt_n >= 0) {
+            ret.push_back({"timings", timings.to_json()});
+        }
+
+        return std::vector<json>({ret});
+    }
+};
+
+struct server_task_result_embd : server_task_result {
+    int index = 0;
+    std::vector<std::vector<float>> embedding;
+
+    int32_t n_tokens;
+
+    // OAI-compat fields
+    oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual json to_json() override {
+        return oaicompat == OAICOMPAT_TYPE_EMBEDDING
+            ? to_json_oaicompat()
+            : to_json_non_oaicompat();
+    }
+
+    json to_json_non_oaicompat() {
+        return json {
+            {"index",     index},
+            {"embedding", embedding},
+        };
+    }
+
+    json to_json_oaicompat() {
+        return json {
+            {"index",            index},
+            {"embedding",        embedding[0]},
+            {"tokens_evaluated", n_tokens},
+        };
+    }
+};
+
+struct server_task_result_rerank : server_task_result {
+    int index = 0;
+    float score = -1e6;
+
+    int32_t n_tokens;
+
+    virtual int get_index() override {
+        return index;
+    }
+
+    virtual json to_json() override {
+        return json {
+            {"index",            index},
+            {"score",            score},
+            {"tokens_evaluated", n_tokens},
+        };
+    }
+};
+
+// this function maybe used outside of server_task_result_error
+static json format_error_response(const std::string & message, const enum error_type type) {
+    std::string type_str;
+    int code = 500;
+    switch (type) {
+        case ERROR_TYPE_INVALID_REQUEST:
+            type_str = "invalid_request_error";
+            code = 400;
+            break;
+        case ERROR_TYPE_AUTHENTICATION:
+            type_str = "authentication_error";
+            code = 401;
+            break;
+        case ERROR_TYPE_NOT_FOUND:
+            type_str = "not_found_error";
+            code = 404;
+            break;
+        case ERROR_TYPE_SERVER:
+            type_str = "server_error";
+            code = 500;
+            break;
+        case ERROR_TYPE_PERMISSION:
+            type_str = "permission_error";
+            code = 403;
+            break;
+        case ERROR_TYPE_NOT_SUPPORTED:
+            type_str = "not_supported_error";
+            code = 501;
+            break;
+        case ERROR_TYPE_UNAVAILABLE:
+            type_str = "unavailable_error";
+            code = 503;
+            break;
+    }
+    return json {
+        {"code", code},
+        {"message", message},
+        {"type", type_str},
+    };
+}
+
+struct server_task_result_error : server_task_result {
+    int index = 0;
+    error_type err_type = ERROR_TYPE_SERVER;
+    std::string err_msg;
+
+    virtual bool is_error() override {
+        return true;
+    }
+
+    virtual json to_json() override {
+        return format_error_response(err_msg, err_type);
+    }
+};
+
+struct server_task_result_metrics : server_task_result {
+    int n_idle_slots;
+    int n_processing_slots;
+    int n_tasks_deferred;
+    int64_t t_start;
+
+    int32_t kv_cache_tokens_count;
+    int32_t kv_cache_used_cells;
+
+    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
+    uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing       = 0;
+
+    uint64_t n_tokens_predicted  = 0;
+    uint64_t t_tokens_generation = 0;
+
+    uint64_t n_decode_total     = 0;
+    uint64_t n_busy_slots_total = 0;
+
+    // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
+    // therefore, we use json to temporarily store the slot.to_json() result
+    json slots_data = json::array();
+
+    virtual json to_json() override {
+        return json {
+            { "idle",                            n_idle_slots },
+            { "processing",                      n_processing_slots },
+            { "deferred",                        n_tasks_deferred },
+            { "t_start",                         t_start },
+
+            { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
+            { "t_tokens_generation_total",       t_tokens_generation_total },
+            { "n_tokens_predicted_total",        n_tokens_predicted_total },
+            { "t_prompt_processing_total",       t_prompt_processing_total },
+
+            { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
+            { "t_prompt_processing",             t_prompt_processing },
+            { "n_tokens_predicted",              n_tokens_predicted },
+            { "t_tokens_generation",             t_tokens_generation },
+
+            { "n_decode_total",                  n_decode_total },
+            { "n_busy_slots_total",              n_busy_slots_total },
+
+            { "kv_cache_tokens_count",           kv_cache_tokens_count },
+            { "kv_cache_used_cells",             kv_cache_used_cells },
+
+            { "slots",                           slots_data },
+        };
+    }
+};
+
+struct server_task_result_slot_save_load : server_task_result {
+    std::string filename;
+    bool is_save; // true = save, false = load
+
+    size_t n_tokens;
+    size_t n_bytes;
+    double t_ms;
+
+    virtual json to_json() override {
+        if (is_save) {
+            return json {
+                { "id_slot",   id_slot },
+                { "filename",  filename },
+                { "n_saved",   n_tokens },
+                { "n_written", n_bytes },
+                { "timings", {
+                    { "save_ms", t_ms }
+                }},
+            };
+        } else {
+            return json {
+                { "id_slot",    id_slot },
+                { "filename",   filename },
+                { "n_restored", n_tokens },
+                { "n_read",     n_bytes },
+                { "timings", {
+                    { "restore_ms", t_ms }
+                }},
+            };
+        }
+    }
+};
+
+struct server_task_result_slot_erase : server_task_result {
+    size_t n_erased;
+
+    virtual json to_json() override {
+        return json {
+            { "id_slot",  id_slot },
+            { "n_erased", n_erased },
+        };
+    }
+};
+
+struct server_task_result_apply_lora : server_task_result {
+    virtual json to_json() override {
+        return json {{ "success", true }};
+    }
 };
 
 struct server_slot {
     int id;
-    int task_id = -1;
+    int id_task = -1;
+
+    // only used for completion/embedding/infill/rerank
+    server_task_type task_type = SERVER_TASK_TYPE_COMPLETION;
+
+    llama_batch batch_spec = {};
+
+    llama_context * ctx = nullptr;
+    llama_context * ctx_dft = nullptr;
+
+    common_speculative * spec = nullptr;
+
+    std::vector<common_adapter_lora_info> lora;
+
+    // the index relative to completion multi-task request
+    size_t index = 0;
 
     struct slot_params params;
 
-    slot_state state = IDLE;
-    slot_command command = NONE;
+    slot_state state = SLOT_STATE_IDLE;
 
     // used to determine the slot that has been used the longest
     int64_t t_last_used = -1;
@@ -110,85 +1237,79 @@ struct server_slot {
     int32_t n_decoded   = 0;
     int32_t n_remaining = -1;
     int32_t i_batch     = -1;
-    int32_t n_predict   = -1;
+    int32_t n_predict   = -1; // TODO: disambiguate from params.n_predict
 
+    // n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated
     int32_t n_prompt_tokens           = 0;
     int32_t n_prompt_tokens_processed = 0;
 
-    json prompt;
-    std::string generated_text;
-    llama_token sampled;
-    std::vector<llama_token> cache_tokens;
+    // input prompt tokens
+    llama_tokens prompt_tokens;
+
+    size_t last_nl_pos = 0;
+
+    std::string  generated_text;
+    llama_tokens generated_tokens;
+
+    llama_tokens cache_tokens;
+
     std::vector<completion_token_output> generated_token_probs;
 
-    bool infill = false;
-    bool embedding = false;
     bool has_next_token = true;
-    bool truncated = false;
-    bool stopped_eos = false;
-    bool stopped_word = false;
-    bool stopped_limit = false;
-
-    bool oaicompat = false;
-    std::string oaicompat_model;
+    bool has_new_line   = false;
+    bool truncated      = false;
+    stop_type stop;
 
     std::string stopping_word;
 
     // sampling
-    struct llama_sampling_params sparams;
-    llama_sampling_context *ctx_sampling = nullptr;
+    json json_schema;
 
-    int32_t ga_i = 0;   // group-attention state
-    int32_t ga_n = 1;   // group-attention factor
-    int32_t ga_w = 512; // group-attention width
+    struct common_sampler * smpl = nullptr;
 
-    int32_t n_past_se = 0; // self-extend
+    llama_token sampled;
 
-    // multimodal
-    std::vector<slot_image> images;
+    common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
 
     // stats
-    size_t n_sent_text = 0; // number of sent text character
-    size_t n_sent_token_probs = 0;
+    size_t n_sent_text        = 0; // number of sent text character
 
     int64_t t_start_process_prompt;
-    int64_t t_start_genereration;
+    int64_t t_start_generation;
 
     double t_prompt_processing; // ms
-    double t_token_generation; // ms
+    double t_token_generation;  // ms
 
-    // multitasks
-    int multitask_id = -1;
+    std::function<void(int)> callback_on_release;
 
     void reset() {
-        n_prompt_tokens        = 0;
-        generated_text         = "";
-        truncated              = false;
-        stopped_eos            = false;
-        stopped_word           = false;
-        stopped_limit          = false;
-        stopping_word          = "";
-        n_past                 = 0;
-        n_sent_text            = 0;
-        n_sent_token_probs     = 0;
-        infill                 = false;
-        ga_i                   = 0;
-        n_past_se              = 0;
+        SLT_DBG(*this, "%s", "\n");
 
+        n_prompt_tokens    = 0;
+        last_nl_pos        = 0;
+        generated_text     = "";
+        has_new_line       = false;
+        truncated          = false;
+        stop               = STOP_TYPE_NONE;
+        stopping_word      = "";
+        n_past             = 0;
+        n_sent_text        = 0;
+        task_type          = SERVER_TASK_TYPE_COMPLETION;
+
+        generated_tokens.clear();
         generated_token_probs.clear();
-
-        for (slot_image & img : images) {
-            free(img.image_embedding);
-            if (img.img_data) {
-                clip_image_u8_free(img.img_data);
-            }
-            img.prefix_prompt = "";
-        }
-
-        images.clear();
     }
 
-    bool has_budget(gpt_params &global_params) {
+    bool is_non_causal() const {
+        return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
+    }
+
+    bool can_batch_with(server_slot & other_slot) {
+        return is_non_causal() == other_slot.is_non_causal()
+            && are_lora_equal(lora, other_slot.lora);
+    }
+
+    bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -204,107 +1325,159 @@ struct server_slot {
         return n_remaining > 0; // no budget
     }
 
-    bool available() const {
-        return state == IDLE && command == NONE;
-    }
-
     bool is_processing() const {
-        return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
+        return state != SLOT_STATE_IDLE;
     }
 
-    void add_token_string(const completion_token_output &token) {
-        if (command == RELEASE) {
+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
+    void add_token(const completion_token_output & token) {
+        if (!is_processing()) {
+            SLT_WRN(*this, "%s", "slot is not processing\n");
             return;
         }
-        cache_tokens.push_back(token.tok);
         generated_token_probs.push_back(token);
     }
 
     void release() {
-        if (state == PROCESSING)
-        {
-            t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
-            command = RELEASE;
+        if (is_processing()) {
+            SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
+
+            t_last_used = ggml_time_us();
+            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
+            state = SLOT_STATE_IDLE;
+            callback_on_release(id);
         }
     }
 
-    json get_formated_timings() {
-        return json
-        {
-            {"prompt_n",               n_prompt_tokens_processed},
-            {"prompt_ms",              t_prompt_processing},
-            {"prompt_per_token_ms",    t_prompt_processing / n_prompt_tokens_processed},
-            {"prompt_per_second",      1e3 / t_prompt_processing * n_prompt_tokens_processed},
+    result_timings get_timings() const {
+        result_timings timings;
+        timings.prompt_n = n_prompt_tokens_processed;
+        timings.prompt_ms = t_prompt_processing;
+        timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
+        timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
 
-            {"predicted_n",            n_decoded},
-            {"predicted_ms",           t_token_generation},
-            {"predicted_per_token_ms", t_token_generation / n_decoded},
-            {"predicted_per_second",   1e3 / t_token_generation * n_decoded},
-        };
+        timings.predicted_n = n_decoded;
+        timings.predicted_ms = t_token_generation;
+        timings.predicted_per_token_ms = t_token_generation / n_decoded;
+        timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
+
+        return timings;
+    }
+
+    size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
+        size_t stop_pos = std::string::npos;
+
+        for (const std::string & word : params.antiprompt) {
+            size_t pos;
+
+            if (is_full_stop) {
+                const size_t tmp      = word.size() + last_token_size;
+                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+
+                pos = text.find(word, from_pos);
+            } else {
+                // otherwise, partial stop
+                pos = find_partial_stop_string(word, text);
+            }
+
+            if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
+                if (is_full_stop) {
+                    stop           = STOP_TYPE_WORD;
+                    stopping_word  = word;
+                    has_next_token = false;
+                }
+                stop_pos = pos;
+            }
+        }
+
+        return stop_pos;
     }
 
     void print_timings() const {
-       char buffer[512];
-        double t_token = t_prompt_processing / n_prompt_tokens_processed;
-        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
-                t_prompt_processing, n_prompt_tokens_processed,
-                t_token, n_tokens_second);
-        LOG_INFO(buffer, {
-            {"slot_id",                   id},
-            {"task_id",                   task_id},
-            {"t_prompt_processing",       t_prompt_processing},
-            {"n_prompt_tokens_processed", n_prompt_tokens_processed},
-            {"t_token",                   t_token},
-            {"n_tokens_second",           n_tokens_second},
-        });
+        const double t_prompt        =       t_prompt_processing / n_prompt_tokens_processed;
+        const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
 
-        t_token = t_token_generation / n_decoded;
-        n_tokens_second = 1e3 / t_token_generation * n_decoded;
-        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
-                t_token_generation, n_decoded,
-                t_token, n_tokens_second);
-        LOG_INFO(buffer, {
-            {"slot_id",            id},
-            {"task_id",            task_id},
-            {"t_token_generation", t_token_generation},
-            {"n_decoded",          n_decoded},
-            {"t_token",            t_token},
-            {"n_tokens_second",    n_tokens_second},
-        });
+        const double t_gen        =       t_token_generation / n_decoded;
+        const double n_gen_second = 1e3 / t_token_generation * n_decoded;
 
-        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
-            {"slot_id",             id},
-            {"task_id",             task_id},
-            {"t_prompt_processing", t_prompt_processing},
-            {"t_token_generation",  t_token_generation},
-            {"t_total",             t_prompt_processing + t_token_generation},
-        });
+        SLT_INF(*this,
+                "\n"
+                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "      total time = %10.2f ms / %5d tokens\n",
+                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
+                t_token_generation, n_decoded, t_gen, n_gen_second,
+                t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+    }
+
+    json to_json() const {
+        return json {
+            {"id",            id},
+            {"id_task",       id_task},
+            {"n_ctx",         n_ctx},
+            {"speculative",   can_speculate()},
+            {"is_processing", is_processing()},
+            {"non_causal",    is_non_causal()},
+            {"params",        params.to_json()},
+            {"prompt",        common_detokenize(ctx, prompt_tokens)},
+            {"next_token",
+                {
+                    {"has_next_token", has_next_token},
+                    {"has_new_line",   has_new_line},
+                    {"n_remain",       n_remaining},
+                    {"n_decoded",      n_decoded},
+                    {"stopping_word",  stopping_word},
+                }
+            },
+        };
     }
 };
 
 struct server_metrics {
+    int64_t t_start = 0;
+
     uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
     uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
 
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
 
-    uint64_t n_tokens_predicted       = 0;
-    uint64_t t_tokens_generation      = 0;
+    uint64_t n_tokens_predicted  = 0;
+    uint64_t t_tokens_generation = 0;
 
+    uint64_t n_decode_total     = 0;
+    uint64_t n_busy_slots_total = 0;
 
-    void on_prompt_eval(const server_slot &slot) {
+    void init() {
+        t_start = ggml_time_us();
+    }
+
+    void on_prompt_eval(const server_slot & slot) {
         n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
         n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
         t_prompt_processing             += slot.t_prompt_processing;
+        t_prompt_processing_total       += slot.t_prompt_processing;
     }
 
-    void on_prediction(const server_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-        n_tokens_predicted       += slot.n_decoded;
-        t_tokens_generation      += slot.t_token_generation;
+    void on_prediction(const server_slot & slot) {
+        n_tokens_predicted_total   += slot.n_decoded;
+        n_tokens_predicted         += slot.n_decoded;
+        t_tokens_generation        += slot.t_token_generation;
+        t_tokens_generation_total  += slot.t_token_generation;
+    }
+
+    void on_decoded(const std::vector<server_slot> & slots) {
+        n_decode_total++;
+        for (const auto & slot : slots) {
+            if (slot.is_processing()) {
+                n_busy_slots_total++;
+            }
+        }
     }
 
     void reset_bucket() {
@@ -315,1604 +1488,1661 @@ struct server_metrics {
     }
 };
 
-struct llama_server_context
-{
-    llama_model *model = nullptr;
-    llama_context *ctx = nullptr;
+struct server_queue {
+    int id = 0;
+    bool running;
 
-    clip_ctx *clp_ctx = nullptr;
+    // queues
+    std::deque<server_task> queue_tasks;
+    std::deque<server_task> queue_tasks_deferred;
 
-    gpt_params params;
+    std::mutex mutex_tasks;
+    std::condition_variable condition_tasks;
 
-    llama_batch batch;
+    // callback functions
+    std::function<void(server_task)> callback_new_task;
+    std::function<void(void)>        callback_update_slots;
 
-    bool multimodal         = false;
-    bool clean_kv_cache     = true;
-    bool all_slots_are_idle = false;
-    bool add_bos_token      = true;
+    // Add a new task to the end of the queue
+    int post(server_task task, bool front = false) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        GGML_ASSERT(task.id != -1);
+        // if this is cancel task make sure to clean up pending tasks
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
+        QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
+        if (front) {
+            queue_tasks.push_front(std::move(task));
+        } else {
+            queue_tasks.push_back(std::move(task));
+        }
+        condition_tasks.notify_one();
+        return task.id;
+    }
 
-    int32_t n_ctx;  // total context for all clients / slots
+    // multi-task version of post()
+    int post(std::vector<server_task> & tasks, bool front = false) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : tasks) {
+            if (task.id == -1) {
+                task.id = id++;
+            }
+            // if this is cancel task make sure to clean up pending tasks
+            if (task.type == SERVER_TASK_TYPE_CANCEL) {
+                cleanup_pending_task(task.id_target);
+            }
+            QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
+            if (front) {
+                queue_tasks.push_front(std::move(task));
+            } else {
+                queue_tasks.push_back(std::move(task));
+            }
+        }
+        condition_tasks.notify_one();
+        return 0;
+    }
 
-    // system prompt
-    bool system_need_update = false;
+    // Add a new task, but defer until one slot is available
+    void defer(server_task task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        QUE_DBG("defer task, id = %d\n", task.id);
+        queue_tasks_deferred.push_back(std::move(task));
+        condition_tasks.notify_one();
+    }
 
-    std::string              system_prompt;
-    std::vector<llama_token> system_tokens;
+    // Get the next id for creating a new task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        int new_id = id++;
+        return new_id;
+    }
 
-    std::string name_user;      // this should be the antiprompt
-    std::string name_assistant;
+    // Register function to process a new task
+    void on_new_task(std::function<void(server_task)> callback) {
+        callback_new_task = std::move(callback);
+    }
+
+    // Register the function to be called when all slots data is ready to be processed
+    void on_update_slots(std::function<void(void)> callback) {
+        callback_update_slots = std::move(callback);
+    }
+
+    // Call when the state of one slot is changed, it will move one task from deferred to main queue
+    void pop_deferred_task() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (!queue_tasks_deferred.empty()) {
+            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
+            queue_tasks_deferred.pop_front();
+        }
+        condition_tasks.notify_one();
+    }
+
+    // end the start_loop routine
+    void terminate() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        running = false;
+        condition_tasks.notify_all();
+    }
+
+    /**
+     * Main loop consists of these steps:
+     * - Wait until a new task arrives
+     * - Process the task (i.e. maybe copy data into slot)
+     * - Check if multitask is finished
+     * - Update all slots
+     */
+    void start_loop() {
+        running = true;
+
+        while (true) {
+            QUE_DBG("%s", "processing new tasks\n");
+
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    lock.unlock();
+                    break;
+                }
+                server_task task = queue_tasks.front();
+                queue_tasks.pop_front();
+                lock.unlock();
+
+                QUE_DBG("processing task, id = %d\n", task.id);
+                callback_new_task(std::move(task));
+            }
+
+            // all tasks in the current loop is processed, slots data is now ready
+            QUE_DBG("%s", "update slots\n");
+
+            callback_update_slots();
+
+            QUE_DBG("%s", "waiting for new tasks\n");
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    if (!running) {
+                        QUE_DBG("%s", "terminate\n");
+                        return;
+                    }
+                    condition_tasks.wait(lock, [&]{
+                        return (!queue_tasks.empty() || !running);
+                    });
+                }
+            }
+        }
+    }
+
+private:
+    void cleanup_pending_task(int id_target) {
+        // no need lock because this is called exclusively by post()
+        auto rm_func = [id_target](const server_task & task) {
+            return task.id_target == id_target;
+        };
+        queue_tasks.erase(
+            std::remove_if(queue_tasks.begin(),          queue_tasks.end(),          rm_func),
+            queue_tasks.end());
+        queue_tasks_deferred.erase(
+            std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
+            queue_tasks_deferred.end());
+    }
+};
+
+struct server_response {
+    // for keeping track of all tasks waiting for the result
+    std::unordered_set<int> waiting_task_ids;
+
+    // the main result queue (using ptr for polymorphism)
+    std::vector<server_task_result_ptr> queue_results;
+
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+    // add the id_task to the list of tasks waiting for response
+    void add_waiting_task_id(int id_task) {
+        SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
+
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(id_task);
+    }
+
+    void add_waiting_tasks(const std::vector<server_task> & tasks) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+
+        for (const auto & task : tasks) {
+            SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
+            waiting_task_ids.insert(task.id);
+        }
+    }
+
+    // when the request is finished, we can remove task associated with it
+    void remove_waiting_task_id(int id_task) {
+        SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
+
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(id_task);
+        // make sure to clean up all pending results
+        queue_results.erase(
+            std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
+                return res->id == id_task;
+            }),
+            queue_results.end());
+    }
+
+    void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+
+        for (const auto & id_task : id_tasks) {
+            SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
+            waiting_task_ids.erase(id_task);
+        }
+    }
+
+    // This function blocks the thread until there is a response for one of the id_tasks
+    server_task_result_ptr recv(const std::unordered_set<int> & id_tasks) {
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
+
+            for (size_t i = 0; i < queue_results.size(); i++) {
+                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                    server_task_result_ptr res = std::move(queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
+    // same as recv(), but have timeout in seconds
+    // if timeout is reached, nullptr is returned
+    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+
+            for (int i = 0; i < (int) queue_results.size(); i++) {
+                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                    server_task_result_ptr res = std::move(queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+
+            std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (cr_res == std::cv_status::timeout) {
+                return nullptr;
+            }
+        }
+
+        // should never reach here
+    }
+
+    // single-task version of recv()
+    server_task_result_ptr recv(int id_task) {
+        std::unordered_set<int> id_tasks = {id_task};
+        return recv(id_tasks);
+    }
+
+    // Send a new result to a waiting id_task
+    void send(server_task_result_ptr && result) {
+        SRV_DBG("sending result for task id = %d\n", result->id);
+
+        std::unique_lock<std::mutex> lock(mutex_results);
+        for (const auto & id_task : waiting_task_ids) {
+            if (result->id == id_task) {
+                SRV_DBG("task id = %d pushed to result queue\n", result->id);
+
+                queue_results.emplace_back(std::move(result));
+                condition_results.notify_all();
+                return;
+            }
+        }
+    }
+};
+
+struct server_context {
+    common_params params_base;
+
+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+
+    const llama_vocab * vocab = nullptr;
+
+    llama_model * model_dft = nullptr;
+
+    llama_context_params cparams_dft;
+
+    llama_batch batch = {};
+
+    bool clean_kv_cache = true;
+    bool add_bos_token  = true;
+    bool has_eos_token  = false;
+
+    int32_t n_ctx; // total context for all clients / slots
 
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
 
-    llama_server_queue    queue_tasks;
-    llama_server_response queue_results;
+    server_queue    queue_tasks;
+    server_response queue_results;
 
     server_metrics metrics;
 
-    ~llama_server_context()
-    {
-        if (ctx)
-        {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-        if (model)
-        {
-            llama_free_model(model);
-            model = nullptr;
+    // Necessary similarity of prompt for slot selection
+    float slot_prompt_similarity = 0.0f;
+
+    common_chat_templates chat_templates;
+
+    ~server_context() {
+        // Clear any sampling context
+        for (server_slot & slot : slots) {
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
+
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
+
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
+
+            llama_batch_free(slot.batch_spec);
         }
+
+        llama_batch_free(batch);
     }
 
-    bool load_model(const gpt_params &params_)
-    {
-        params = params_;
-        if (!params.mmproj.empty()) {
-            multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
-            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
-                return false;
-            }
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());
 
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
-                params.n_ctx = 2048;
-            }
-        }
+        params_base = params;
 
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
-        if (model == nullptr)
-        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+        llama_init = common_init_from_params(params_base);
+
+        model = llama_init.model.get();
+        ctx   = llama_init.context.get();
+
+        if (model == nullptr) {
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }
 
-        if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
-                llama_free(ctx);
-                llama_free_model(model);
-                return false;
-            }
-        }
+        vocab = llama_model_get_vocab(model);
 
         n_ctx = llama_n_ctx(ctx);
 
-        add_bos_token = llama_should_add_bos_token(model);
+        add_bos_token = llama_vocab_get_add_bos(vocab);
+        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+
+            auto params_dft = params_base;
+
+            params_dft.devices      = params_base.speculative.devices;
+            params_dft.hf_file      = params_base.speculative.hf_file;
+            params_dft.hf_repo      = params_base.speculative.hf_repo;
+            params_dft.model        = params_base.speculative.model;
+            params_dft.model_url    = params_base.speculative.model_url;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+            params_dft.n_parallel   = 1;
+
+            llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model.get();
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+
+                return false;
+            }
+
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
+
+            cparams_dft = common_context_params_to_llama(params_dft);
+            cparams_dft.n_batch = n_ctx_dft;
+
+            // force F16 KV cache for the draft model for extra performance
+            cparams_dft.type_k = GGML_TYPE_F16;
+            cparams_dft.type_v = GGML_TYPE_F16;
+
+            // the context is not needed - we will create one for each slot
+            llama_init_dft.context.reset();
+        }
+
+        if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
+            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+            chat_templates = common_chat_templates_from_model(model, "chatml");
+        } else {
+            chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+        }
+        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
 
         return true;
     }
 
-    void validate_model_chat_template(server_params & sparams) {
+    bool validate_builtin_chat_template(bool use_jinja) const {
         llama_chat_message chat[] = {{"user", "test"}};
-        std::vector<char> buf(1);
-        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
-        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
+
+        if (use_jinja) {
+            auto templates = common_chat_templates_from_model(model, "");
+            common_chat_inputs inputs;
+            inputs.messages = json::array({{
+                {"role", "user"},
+                {"content", "test"},
+            }});
+            GGML_ASSERT(templates.template_default);
+            try {
+                common_chat_params_init(*templates.template_default, inputs);
+                if (templates.template_tool_use) {
+                    common_chat_params_init(*templates.template_tool_use, inputs);
+                }
+                return true;
+            } catch (const std::exception & e) {
+                SRV_ERR("failed to apply template: %s\n", e.what());
+                return false;
+            }
+        } else {
+            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
+            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+            return chat_res > 0;
         }
     }
 
-    void initialize() {
-        // create slots
-        all_slots_are_idle = true;
+    void init() {
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
-        for (int i = 0; i < params.n_parallel; i++)
-        {
+        for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;
 
             slot.id = i;
+            slot.ctx = ctx;
             slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;
 
-            LOG_INFO("new slot", {
-                {"slot_id",    slot.id},
-                {"n_ctx_slot", slot.n_ctx}
-            });
+            if (model_dft) {
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
 
-            const int ga_n = params.grp_attn_n;
-            const int ga_w = params.grp_attn_w;
+                slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
 
-            if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0                    && "ga_n must be positive");                     // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");     // NOLINT
-                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
-                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-
-                LOG_INFO("slot self-extend", {
-                    {"slot_id",   slot.id},
-                    {"ga_n",      ga_n},
-                    {"ga_w",      ga_w}
-                });
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
             }
 
-            slot.ga_i = 0;
-            slot.ga_n = ga_n;
-            slot.ga_w = ga_w;
+            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
+
+            slot.params.sampling = params_base.sampling;
+
+            slot.callback_on_release = [this](int) {
+                queue_tasks.pop_deferred_task();
+            };
 
             slot.reset();
 
             slots.push_back(slot);
         }
 
-        default_generation_settings_for_props = get_formated_generation(slots.front());
-        default_generation_settings_for_props["seed"] = -1;
+        default_generation_settings_for_props = slots[0].to_json();
 
-        batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-    }
-
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
-    {
-        // TODO: currently, we tokenize using special tokens by default
-        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
-        //       but it's better compared to completely ignoring ChatML and other chat templates
-        const bool TMP_FORCE_SPECIAL = true;
-
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
-        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
-
-        if (json_prompt.is_array())
+        // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
+        // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
-            bool first = true;
-            for (const auto& p : json_prompt)
-            {
-                if (p.is_string())
-                {
-                    auto s = p.template get<std::string>();
-                    std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
-                    }
-                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                }
-                else
-                {
-                    if (first)
-                    {
-                        first = false;
-                    }
-                    prompt_tokens.push_back(p.template get<llama_token>());
-                }
-            }
-        }
-        else
-        {
-            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            const int32_t n_batch = llama_n_batch(ctx);
+
+            // only a single seq_id per token is needed
+            batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }
 
-        return prompt_tokens;
+        metrics.init();
     }
 
-    server_slot* get_slot(int id) {
-        int64_t t_last = ggml_time_us();
-        server_slot *last_used = nullptr;
-
-        for (server_slot & slot : slots)
-        {
-            if (slot.id == id && slot.available())
-            {
+    server_slot * get_slot_by_id(int id) {
+        for (server_slot & slot : slots) {
+            if (slot.id == id) {
                 return &slot;
             }
-
-            if (slot.available() && slot.t_last_used < t_last)
-            {
-                last_used = &slot;
-                t_last = slot.t_last_used;
-            }
         }
 
-        return last_used;
+        return nullptr;
     }
 
-    bool launch_slot_with_data(server_slot* &slot, json data) {
-        slot_params default_params;
-        llama_sampling_params default_sparams;
+    server_slot * get_available_slot(const server_task & task) {
+        server_slot * ret = nullptr;
 
-        if (data.count("__oaicompat") != 0) {
-            slot->oaicompat = true;
-            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-        } else {
-            slot->oaicompat = false;
-            slot->oaicompat_model = "";
+        // find the slot that has at least n% prompt similarity
+        if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+            int lcs_len = 0;
+            float similarity = 0;
+
+            for (server_slot & slot : slots) {
+                // skip the slot if it is not available
+                if (slot.is_processing()) {
+                    continue;
+                }
+
+                // skip the slot if it does not contains cached tokens
+                if (slot.cache_tokens.empty()) {
+                    continue;
+                }
+
+                // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
+                int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
+
+                // fraction of the common subsequence length compared to the current slot's prompt length
+                float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
+
+                // select the current slot if the criteria match
+                if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
+                    lcs_len = cur_lcs_len;
+                    similarity = cur_similarity;
+                    ret = &slot;
+                }
+            }
+
+            if (ret != nullptr) {
+                SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
+            }
         }
 
-        slot->params.stream             = json_value(data, "stream",            false);
-        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
-        slot->params.n_predict          = json_value(data, "n_predict",         default_params.n_predict);
-        slot->sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
-        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
-        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
-        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
-        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
-        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
-        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
-        slot->sparams.penalty_last_n    = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat    = json_value(data, "repeat_penalty",    default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq      = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present   = json_value(data, "presence_penalty",  default_sparams.penalty_present);
-        slot->sparams.mirostat          = json_value(data, "mirostat",          default_sparams.mirostat);
-        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
-        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->params.seed               = json_value(data, "seed",              default_params.seed);
-        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
-        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
-        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
+        // find the slot that has been least recently used
+        if (ret == nullptr) {
+            int64_t t_last = ggml_time_us();
+            for (server_slot & slot : slots) {
+                // skip the slot if it is not available
+                if (slot.is_processing()) {
+                    continue;
+                }
 
-        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+                // select the current slot if the criteria match
+                if (slot.t_last_used < t_last) {
+                    t_last = slot.t_last_used;
+                    ret = &slot;
+                }
+            }
+
+            if (ret != nullptr) {
+                SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
+            }
+        }
+
+        return ret;
+    }
+
+    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
+        slot.reset();
+        slot.id_task       = task.id;
+        slot.index         = task.index;
+        slot.task_type     = task.type;
+        slot.params        = std::move(task.params);
+        slot.prompt_tokens = std::move(task.prompt_tokens);
+
+        if (!are_lora_equal(task.params.lora, slot.lora)) {
+            // if lora is changed, we cannot reuse cached tokens
+            slot.cache_tokens.clear();
+            slot.lora = task.params.lora;
+        }
+
+        SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
+
+        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
-            LOG_WARNING("Max tokens to predict exceeds server configuration", {
-                {"params.n_predict", slot->params.n_predict},
-                {"slot.n_predict", slot->n_predict},
-            });
-            slot->params.n_predict = slot->n_predict;
+            slot.params.n_predict = slot.n_predict;
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
-        // infill
-        if (data.count("input_prefix") != 0)
-        {
-            slot->params.input_prefix = data["input_prefix"];
-        }
-        else
-        {
-            slot->params.input_prefix = "";
+        if (slot.params.ignore_eos && has_eos_token) {
+            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
         }
 
-        if (data.count("input_suffix") != 0)
         {
-            slot->params.input_suffix = data["input_suffix"];
-        }
-        else
-        {
-            slot->params.input_suffix = "";
-        }
-
-        if (data.count("prompt") != 0)
-        {
-            slot->prompt = data["prompt"];
-        }
-        else
-        {
-            slot->prompt = "";
-        }
-
-        slot->sparams.penalty_prompt_tokens.clear();
-        slot->sparams.use_penalty_prompt_tokens = false;
-        const auto &penalty_prompt = data.find("penalty_prompt");
-        if (penalty_prompt != data.end())
-        {
-            if (penalty_prompt->is_string())
-            {
-                const auto penalty_prompt_string = penalty_prompt->get<std::string>();
-                auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
-                slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
-                if (slot->params.n_predict > 0)
-                {
-                    slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
-                }
-                slot->sparams.use_penalty_prompt_tokens = true;
+            if (slot.smpl != nullptr) {
+                common_sampler_free(slot.smpl);
             }
-            else if (penalty_prompt->is_array())
-            {
-                const auto n_tokens = penalty_prompt->size();
-                slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
-                const int n_vocab = llama_n_vocab(model);
-                for (const auto &penalty_token : *penalty_prompt)
-                {
-                    if (penalty_token.is_number_integer())
-                    {
-                        const auto tok = penalty_token.get<llama_token>();
-                        if (tok >= 0 && tok < n_vocab)
-                        {
-                            slot->sparams.penalty_prompt_tokens.push_back(tok);
-                        }
-                    }
-                }
-                slot->sparams.use_penalty_prompt_tokens = true;
+
+            slot.smpl = common_sampler_init(model, slot.params.sampling);
+            if (slot.smpl == nullptr) {
+                // for now, the only error that may happen here is invalid grammar
+                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
+                return false;
             }
         }
 
-        slot->sparams.logit_bias.clear();
+        if (slot.ctx_dft) {
+            llama_batch_free(slot.batch_spec);
 
-        if (json_value(data, "ignore_eos", false))
-        {
-            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+            slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
         }
 
-        const auto &logit_bias = data.find("logit_bias");
-        if (logit_bias != data.end() && logit_bias->is_array())
-        {
-            const int n_vocab = llama_n_vocab(model);
-            for (const auto &el : *logit_bias)
-            {
-                if (el.is_array() && el.size() == 2)
-                {
-                    float bias;
-                    if (el[1].is_number())
-                    {
-                        bias = el[1].get<float>();
-                    }
-                    else if (el[1].is_boolean() && !el[1].get<bool>())
-                    {
-                        bias = -INFINITY;
-                    }
-                    else
-                    {
-                        continue;
-                    }
+        slot.state = SLOT_STATE_STARTED;
 
-                    if (el[0].is_number_integer())
-                    {
-                        llama_token tok = el[0].get<llama_token>();
-                        if (tok >= 0 && tok < n_vocab)
-                        {
-                            slot->sparams.logit_bias[tok] = bias;
-                        }
-                    }
-                    else if (el[0].is_string())
-                    {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
-                        for (auto tok : toks)
-                        {
-                            slot->sparams.logit_bias[tok] = bias;
-                        }
-                    }
-                }
-            }
-        }
-
-        slot->params.antiprompt.clear();
-
-        const auto &stop = data.find("stop");
-        if (stop != data.end() && stop->is_array())
-        {
-            for (const auto &word : *stop)
-            {
-                if (!word.empty())
-                {
-                    slot->params.antiprompt.push_back(word);
-                }
-            }
-        }
-
-        const auto &samplers_sequence = data.find("samplers");
-        if (samplers_sequence != data.end() && samplers_sequence->is_array())
-        {
-            std::vector<std::string> sampler_names;
-            for (const auto &sampler_name : *samplers_sequence)
-            {
-                if (sampler_name.is_string())
-                {
-                    sampler_names.emplace_back(sampler_name);
-                }
-            }
-            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
-        }
-        else
-        {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
-        }
-
-        if (multimodal)
-        {
-            const auto &images_data = data.find("image_data");
-            if (images_data != data.end() && images_data->is_array())
-            {
-                for (const auto &img : *images_data)
-                {
-                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
-
-                    slot_image img_sl;
-                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    img_sl.img_data = clip_image_u8_init();
-                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
-                    {
-                        LOG_ERROR("failed to load image", {
-                            {"slot_id",   slot->id},
-                            {"img_sl_id", img_sl.id}
-                        });
-                        return false;
-                    }
-                    LOG_VERBOSE("image loaded", {
-                        {"slot_id",   slot->id},
-                        {"img_sl_id", img_sl.id}
-                    });
-                    img_sl.request_encode_image = true;
-                    slot->images.push_back(img_sl);
-                }
-                // process prompt
-                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
-                if (slot->images.size() > 0 && !slot->prompt.is_array())
-                {
-                    std::string prompt = slot->prompt.get<std::string>();
-                    size_t pos = 0, begin_prefix = 0;
-                    std::string pattern = "[img-";
-                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t end_prefix = pos;
-                        pos += pattern.length();
-                        size_t end_pos = prompt.find(']', pos);
-                        if (end_pos != std::string::npos)
-                        {
-                            std::string image_id = prompt.substr(pos, end_pos - pos);
-                            try
-                            {
-                                int img_id = std::stoi(image_id);
-                                bool found = false;
-                                for (slot_image &img : slot->images)
-                                {
-                                    if (img.id == img_id) {
-                                        found = true;
-                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
-                                        begin_prefix = end_pos + 1;
-                                        break;
-                                    }
-                                }
-                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
-                                    slot->images.clear();
-                                    return false;
-                                }
-                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
-                                slot->images.clear();
-                                return false;
-                            }
-                        }
-                    }
-                    slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(begin_prefix);
-                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
-                }
-            }
-        }
-
-        if (slot->ctx_sampling != nullptr)
-        {
-            llama_sampling_free(slot->ctx_sampling);
-        }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        llama_set_rng_seed(ctx, slot->params.seed);
-        slot->command = LOAD_PROMPT;
-
-        all_slots_are_idle = false;
-
-        LOG_INFO("slot is processing task", {
-            {"slot_id", slot->id},
-            {"task_id", slot->task_id},
-        });
+        SLT_INF(slot, "%s", "processing task\n");
 
         return true;
     }
 
     void kv_cache_clear() {
+        SRV_DBG("%s", "clearing KV cache\n");
+
         // clear the entire KV cache
         llama_kv_cache_clear(ctx);
         clean_kv_cache = false;
     }
 
-    void system_prompt_update() {
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
-
-            llama_batch_clear(batch);
-
-            for (int i = 0; i < (int)system_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-            }
-
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
-            {
-                const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                    0, 0, 0, // unused
-                };
-                if (llama_decode(ctx, batch_view) != 0)
-                {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i < params.n_parallel; ++i)
-            {
-                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
-            }
-        }
-
-        LOG_TEE("system prompt updated\n");
-        system_need_update = false;
-    }
-
-    void system_prompt_notify() {
-        // release all slots
-        for (server_slot &slot : slots)
-        {
-            slot.release();
-        }
-
-        system_need_update = true;
-    }
-
-    void system_prompt_process(const json &sys_props) {
-        system_prompt  = sys_props.value("prompt", "");
-        name_user      = sys_props.value("anti_prompt", "");
-        name_assistant = sys_props.value("assistant_name", "");
-
-
-        system_prompt_notify();
-    }
-
-    static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
-                                        const stop_type type, server_slot &slot)
-    {
-        size_t stop_pos = std::string::npos;
-
-        for (const std::string &word : slot.params.antiprompt)
-        {
-            size_t pos;
-            if (type == STOP_FULL)
-            {
-                const size_t tmp = word.size() + last_token_size;
-                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
-                pos = text.find(word, from_pos);
-            }
-            else
-            {
-                pos = find_partial_stop_string(word, text);
-            }
-            if (pos != std::string::npos &&
-                (stop_pos == std::string::npos || pos < stop_pos))
-            {
-                if (type == STOP_FULL)
-                {
-                    slot.stopped_word   = true;
-                    slot.stopping_word  = word;
-                    slot.has_next_token = false;
-                }
-                stop_pos = pos;
-            }
-        }
-
-        return stop_pos;
-    }
-
-    bool process_token(completion_token_output &result, server_slot &slot) {
+    bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = result.text_to_send;
         slot.sampled = result.tok;
 
-        // search stop word and delete it
         slot.generated_text += token_str;
+        if (slot.params.return_tokens) {
+            slot.generated_tokens.push_back(result.tok);
+        }
         slot.has_next_token = true;
 
-        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
-        {
-            // we can change penalty_prompt_tokens because it is always created from scratch each request
-            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
-        }
-
         // check if there is incomplete UTF-8 character at the end
-        bool incomplete = false;
-        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
-        {
-            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
-            if ((c & 0xC0) == 0x80)
-            {
-                // continuation byte: 10xxxxxx
-                continue;
-            }
-            if ((c & 0xE0) == 0xC0)
-            {
-                // 2-byte character: 110xxxxx ...
-                incomplete = i < 2;
-            }
-            else if ((c & 0xF0) == 0xE0)
-            {
-                // 3-byte character: 1110xxxx ...
-                incomplete = i < 3;
-            }
-            else if ((c & 0xF8) == 0xF0)
-            {
-                // 4-byte character: 11110xxx ...
-                incomplete = i < 4;
-            }
-            // else 1-byte character or invalid byte
-            break;
-        }
+        bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size();
 
-        if (!incomplete)
-        {
+        // search stop word and delete it
+        if (!incomplete) {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
+
             const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
+            bool send_text = true;
+
+            size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true);
+            if (stop_pos != std::string::npos) {
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
                 pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+            } else if (slot.has_next_token) {
+                stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
+                send_text = stop_pos == std::string::npos;
             }
 
             // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
+            if (send_text) {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
+            } else {
+                result.text_to_send = "";
             }
-            slot.add_token_string(result);
-            if (slot.params.stream)
-            {
+
+            slot.add_token(result);
+            if (slot.params.stream) {
                 send_partial_response(slot, result);
             }
         }
 
-        if (incomplete)
-        {
+        if (incomplete) {
             slot.has_next_token = true;
         }
 
         // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
-        {
-            slot.stopped_limit = true;
+        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
+            slot.stop           = STOP_TYPE_LIMIT;
             slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }
 
-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
-        {
-            slot.stopped_eos = true;
-            slot.has_next_token = false;
-            LOG_VERBOSE("eos token found", {});
+        if (slot.has_new_line) {
+            // if we have already seen a new line, we stop after a certain time limit
+            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stop           = STOP_TYPE_LIMIT;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
+
+            // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
+            if (slot.params.n_indent > 0) {
+                // check the current indentation
+                // TODO: improve by not doing it more than once for each new line
+                if (slot.last_nl_pos > 0) {
+                    size_t pos = slot.last_nl_pos;
+
+                    int n_indent = 0;
+                    while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) {
+                        n_indent++;
+                        pos++;
+                    }
+
+                    if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) {
+                        slot.stop           = STOP_TYPE_LIMIT;
+                        slot.has_next_token = false;
+
+                        // cut the last line
+                        slot.generated_text.erase(pos, std::string::npos);
+
+                        SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
+                    }
+                }
+
+                // find the next new line
+                {
+                    const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
+
+                    if (pos != std::string::npos) {
+                        slot.last_nl_pos = pos + 1;
+                    }
+                }
+            }
         }
 
-        LOG_VERBOSE("next token", {
-                                      {"token", result.tok},
-                                      {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
-                                      {"has_next_token", slot.has_next_token},
-                                      {"n_remain", slot.n_remaining},
-                                      {"num_tokens_predicted", slot.n_decoded},
-                                      {"stopped_eos", slot.stopped_eos},
-                                      {"stopped_word", slot.stopped_word},
-                                      {"stopped_limit", slot.stopped_limit},
-                                      {"stopping_word", slot.stopping_word},
-                                  });
+        // check if there is a new line in the generated text
+        if (result.text_to_send.find('\n') != std::string::npos) {
+            slot.has_new_line = true;
+        }
+
+        // if context shift is disabled, we stop when it reaches the context limit
+        if (slot.n_past >= slot.n_ctx) {
+            slot.truncated      = true;
+            slot.stop           = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
+        }
+
+        if (llama_vocab_is_eog(vocab, result.tok)) {
+            slot.stop           = STOP_TYPE_EOS;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "%s", "stopped by EOS\n");
+        }
+
+        const auto n_ctx_train = llama_model_n_ctx_train(model);
+
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            slot.truncated      = true;
+            slot.stop           = STOP_TYPE_LIMIT;
+            slot.has_next_token = false; // stop prediction
+
+            SLT_WRN(slot,
+                    "n_predict (%d) is set for infinite generation. "
+                    "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
+                    slot.params.n_predict, n_ctx_train);
+        }
+
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 
         return slot.has_next_token; // continue
     }
 
-    bool process_images(server_slot &slot) const
-    {
-        for (slot_image &img : slot.images)
-        {
-            if (!img.request_encode_image)
-            {
+    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
+        size_t n_probs = slot.params.sampling.n_probs;
+        size_t n_vocab = llama_vocab_n_tokens(vocab);
+        if (post_sampling) {
+            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+            const size_t max_probs = cur_p->size;
+
+            // set probability for sampled token
+            for (size_t i = 0; i < max_probs; i++) {
+                if (cur_p->data[i].id == result.tok) {
+                    result.prob = cur_p->data[i].p;
+                    break;
+                }
+            }
+
+            // set probability for top n_probs tokens
+            result.probs.reserve(max_probs);
+            for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
+                result.probs.push_back({
+                    cur_p->data[i].id,
+                    common_detokenize(ctx, {cur_p->data[i].id}, special),
+                    cur_p->data[i].p
+                });
+            }
+        } else {
+            // TODO: optimize this with min-p optimization
+            std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
+
+            // set probability for sampled token
+            for (size_t i = 0; i < n_vocab; i++) {
+                // set probability for sampled token
+                if (cur[i].id == result.tok) {
+                    result.prob = cur[i].p;
+                    break;
+                }
+            }
+
+            // set probability for top n_probs tokens
+            result.probs.reserve(n_probs);
+            for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
+                result.probs.push_back({
+                    cur[i].id,
+                    common_detokenize(ctx, {cur[i].id}, special),
+                    cur[i].p
+                });
+            }
+        }
+    }
+
+    void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
+        send_error(task.id, error, type);
+    }
+
+    void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
+        send_error(slot.id_task, error, type);
+    }
+
+    void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
+        SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str());
+
+        auto res = std::make_unique<server_task_result_error>();
+        res->id       = id_task;
+        res->err_type = type;
+        res->err_msg  = error;
+
+        queue_results.send(std::move(res));
+    }
+
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
+        auto res = std::make_unique<server_task_result_cmpl_partial>();
+
+        res->id      = slot.id_task;
+        res->index   = slot.index;
+        res->content = tkn.text_to_send;
+        res->tokens  = { tkn.tok };
+
+        res->n_decoded           = slot.n_decoded;
+        res->n_prompt_tokens     = slot.n_prompt_tokens;
+        res->post_sampling_probs = slot.params.post_sampling_probs;
+
+        res->verbose           = slot.params.verbose;
+        res->oaicompat         = slot.params.oaicompat;
+        res->oaicompat_model   = slot.params.oaicompat_model;
+        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
+
+        // populate res.probs_output
+        if (slot.params.sampling.n_probs > 0) {
+            res->prob_output = tkn; // copy the token probs
+        }
+
+        // populate timings if this is final response or timings_per_token is enabled
+        if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) {
+            res->timings = slot.get_timings();
+        }
+
+        queue_results.send(std::move(res));
+    }
+
+    void send_final_response(server_slot & slot) {
+        auto res = std::make_unique<server_task_result_cmpl_final>();
+        res->id              = slot.id_task;
+        res->id_slot         = slot.id;
+
+        res->index           = slot.index;
+        res->content         = std::move(slot.generated_text);
+        res->tokens          = std::move(slot.generated_tokens);
+        res->timings         = slot.get_timings();
+        res->prompt          = common_detokenize(ctx, slot.prompt_tokens, true);
+        res->response_fields = std::move(slot.params.response_fields);
+
+        res->truncated           = slot.truncated;
+        res->n_decoded           = slot.n_decoded;
+        res->n_prompt_tokens     = slot.n_prompt_tokens;
+        res->n_tokens_cached     = slot.n_past;
+        res->has_new_line        = slot.has_new_line;
+        res->stopping_word       = slot.stopping_word;
+        res->stop                = slot.stop;
+        res->post_sampling_probs = slot.params.post_sampling_probs;
+
+        res->verbose               = slot.params.verbose;
+        res->stream                = slot.params.stream;
+        res->oaicompat             = slot.params.oaicompat;
+        res->oaicompat_model       = slot.params.oaicompat_model;
+        res->oaicompat_cmpl_id     = slot.params.oaicompat_cmpl_id;
+        res->oaicompat_chat_format = slot.params.oaicompat_chat_format;
+        // populate res.probs_output
+        if (slot.params.sampling.n_probs > 0) {
+            if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) {
+                const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+
+                size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
+                res->probs_output = std::vector<completion_token_output>(
+                        slot.generated_token_probs.begin(),
+                        slot.generated_token_probs.end() - safe_offset);
+            } else {
+                res->probs_output = std::vector<completion_token_output>(
+                        slot.generated_token_probs.begin(),
+                        slot.generated_token_probs.end());
+            }
+        }
+
+        res->generation_params = slot.params; // copy the parameters
+
+        queue_results.send(std::move(res));
+    }
+
+    void send_embedding(const server_slot & slot, const llama_batch & batch) {
+        auto res = std::make_unique<server_task_result_embd>();
+        res->id        = slot.id_task;
+        res->index     = slot.index;
+        res->n_tokens  = slot.n_prompt_tokens;
+        res->oaicompat = slot.params.oaicompat;
+
+        const int n_embd = llama_model_n_embd(model);
+
+        std::vector<float> embd_res(n_embd, 0.0f);
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
-                return false;
+            const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            if (embd == NULL) {
+                embd = llama_get_embeddings_ith(ctx, i);
             }
 
+            if (embd == NULL) {
+                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
 
-            img.request_encode_image = false;
-        }
-
-        return slot.images.size() > 0;
-    }
-
-    void send_error(task_server& task, const std::string &error)
-    {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
-        task_result res;
-        res.id = task.id;
-        res.multitask_id = task.multitask_id;
-        res.stop = false;
-        res.error = true;
-        res.result_json = { { "content", error } };
-        queue_results.send(res);
-    }
-
-    json get_formated_generation(server_slot &slot)
-    {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
-        std::vector<std::string> samplers_sequence;
-        for (const auto &sampler_type : slot.sparams.samplers_sequence)
-        {
-            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
-        }
-
-        return json {
-            {"n_ctx",             slot.n_ctx},
-            {"n_predict",         slot.n_predict},
-            {"model",             params.model_alias},
-            {"seed",              slot.params.seed},
-            {"temperature",       slot.sparams.temp},
-            {"dynatemp_range",    slot.sparams.dynatemp_range},
-            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
-            {"top_k",             slot.sparams.top_k},
-            {"top_p",             slot.sparams.top_p},
-            {"min_p",             slot.sparams.min_p},
-            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typical_p},
-            {"repeat_last_n",     slot.sparams.penalty_last_n},
-            {"repeat_penalty",    slot.sparams.penalty_repeat},
-            {"presence_penalty",  slot.sparams.penalty_present},
-            {"frequency_penalty", slot.sparams.penalty_freq},
-            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
-            {"mirostat",          slot.sparams.mirostat},
-            {"mirostat_tau",      slot.sparams.mirostat_tau},
-            {"mirostat_eta",      slot.sparams.mirostat_eta},
-            {"penalize_nl",       slot.sparams.penalize_nl},
-            {"stop",              slot.params.antiprompt},
-            {"n_predict",         slot.params.n_predict},
-            {"n_keep",            params.n_keep},
-            {"ignore_eos",        ignore_eos},
-            {"stream",            slot.params.stream},
-            {"logit_bias",        slot.sparams.logit_bias},
-            {"n_probs",           slot.sparams.n_probs},
-            {"min_keep",          slot.sparams.min_keep},
-            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers_sequence}
-        };
-    }
-
-    void send_partial_response(server_slot &slot, completion_token_output tkn)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = false;
-
-        res.result_json = json
-        {
-            {"content",    tkn.text_to_send},
-            {"stop",       false},
-            {"slot_id",    slot.id},
-            {"multimodal", multimodal}
-        };
-
-        if (slot.sparams.n_probs > 0)
-        {
-            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
-            size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
-            if (probs_pos < probs_stop_pos)
-            {
-                probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
+                res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
+                continue;
             }
-            slot.n_sent_token_probs = probs_stop_pos;
-            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
-        }
 
-        if (slot.oaicompat)
-        {
-            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
-            res.result_json["model"] = slot.oaicompat_model;
-        }
-
-        queue_results.send(res);
-    }
-
-    void send_final_response(server_slot &slot)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = true;
-
-        res.result_json = json
-        {
-            {"content",             !slot.params.stream ? slot.generated_text : ""},
-            {"slot_id",             slot.id},
-            {"stop",                true},
-            {"model",               params.model_alias},
-            {"tokens_predicted",    slot.n_decoded},
-            {"tokens_evaluated",    slot.n_prompt_tokens},
-            {"generation_settings", get_formated_generation(slot)},
-            {"prompt",              slot.prompt},
-            {"truncated",           slot.truncated},
-            {"stopped_eos",         slot.stopped_eos},
-            {"stopped_word",        slot.stopped_word},
-            {"stopped_limit",       slot.stopped_limit},
-            {"stopping_word",       slot.stopping_word},
-            {"tokens_cached",       slot.n_past},
-            {"timings",             slot.get_formated_timings()}
-        };
-
-        if (slot.sparams.n_probs > 0)
-        {
-            std::vector<completion_token_output> probs = {};
-            if (!slot.params.stream && slot.stopped_word)
-            {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
-                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
+            // normalize only when there is pooling
+            // TODO: configurable
+            if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
+                common_embd_normalize(embd, embd_res.data(), n_embd, 2);
+                res->embedding.push_back(embd_res);
+            } else {
+                res->embedding.push_back({ embd, embd + n_embd });
             }
-            else
-            {
-                probs = std::vector<completion_token_output>(
-                                    slot.generated_token_probs.begin(),
-                                    slot.generated_token_probs.end());
+        }
+
+        SLT_DBG(slot, "%s", "sending embeddings\n");
+
+        queue_results.send(std::move(res));
+    }
+
+    void send_rerank(const server_slot & slot, const llama_batch & batch) {
+        auto res = std::make_unique<server_task_result_rerank>();
+        res->id    = slot.id_task;
+        res->index = slot.index;
+        res->n_tokens = slot.n_prompt_tokens;
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+                continue;
             }
-            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
+
+            const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            if (embd == NULL) {
+                embd = llama_get_embeddings_ith(ctx, i);
+            }
+
+            if (embd == NULL) {
+                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
+
+                res->score = -1e6;
+                continue;
+            }
+
+            res->score = embd[0];
         }
 
-        if (slot.oaicompat)
-        {
-            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
-            res.result_json["model"] = slot.oaicompat_model;
-        }
+        SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score);
 
-        queue_results.send(res);
+        queue_results.send(std::move(res));
     }
 
-    void send_embedding(server_slot &slot)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = true;
+    //
+    // Functions to create new task(s) and receive result(s)
+    //
 
-        const int n_embd = llama_n_embd(model);
-        if (!params.embedding)
-        {
-            LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
-            res.result_json = json
-            {
-                {"embedding", std::vector<float>(n_embd, 0.0f)},
-            };
+    void cancel_tasks(const std::unordered_set<int> & id_tasks) {
+        std::vector<server_task> cancel_tasks;
+        cancel_tasks.reserve(id_tasks.size());
+        for (const auto & id_task : id_tasks) {
+            SRV_WRN("cancel task, id_task = %d\n", id_task);
+
+            server_task task(SERVER_TASK_TYPE_CANCEL);
+            task.id_target = id_task;
+            queue_results.remove_waiting_task_id(id_task);
+            cancel_tasks.push_back(task);
         }
-        else
-        {
-            const float *data = llama_get_embeddings(ctx);
-            std::vector<float> embedding(data, data + n_embd);
-            res.result_json = json
-            {
-                {"embedding", embedding},
-            };
-        }
-        queue_results.send(res);
+        // push to beginning of the queue, so it has highest priority
+        queue_tasks.post(cancel_tasks, true);
     }
 
-    void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
-    {
-        task_server task;
-        task.id = task_id;
-        task.target_id = 0;
-        task.data = std::move(data);
-        task.infill_mode = infill;
-        task.embedding_mode = embedding;
-        task.type = TASK_TYPE_COMPLETION;
-        task.multitask_id = multitask_id;
+    // receive the results from task(s)
+    void receive_multi_results(
+            const std::unordered_set<int> & id_tasks,
+            const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
+        std::vector<server_task_result_ptr> results(id_tasks.size());
+        for (int i = 0; i < (int)id_tasks.size(); i++) {
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
 
-        // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        // otherwise, it's a single-prompt task, we actually queue it
-        // if there's numbers in the prompt array it will be treated as an array of tokens
-        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
-            bool numbers = false;
-            for (const auto& e : task.data.at("prompt")) {
-                if (e.is_number()) {
-                    numbers = true;
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                i--; // retry
+                continue;
+            }
+
+            if (result->is_error()) {
+                error_handler(result->to_json());
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            GGML_ASSERT(
+                dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
+                || dynamic_cast<server_task_result_embd*>(result.get()) != nullptr
+                || dynamic_cast<server_task_result_rerank*>(result.get()) != nullptr
+            );
+            const size_t idx = result->get_index();
+            GGML_ASSERT(idx < results.size() && "index out of range");
+            results[idx] = std::move(result);
+        }
+        result_handler(results);
+    }
+
+    // receive the results from task(s), in stream mode
+    void receive_cmpl_results_stream(
+            const std::unordered_set<int> & id_tasks,
+            const std::function<bool(server_task_result_ptr&)> & result_handler,
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
+        size_t n_finished = 0;
+        while (true) {
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                continue; // retry
+            }
+
+            if (result->is_error()) {
+                error_handler(result->to_json());
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            GGML_ASSERT(
+                dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
+                || dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
+            );
+            if (!result_handler(result)) {
+                cancel_tasks(id_tasks);
+                break;
+            }
+
+            if (result->is_stop()) {
+                if (++n_finished == id_tasks.size()) {
+                    break;
+                }
+            }
+        }
+    }
+
+    //
+    // Functions to process the task
+    //
+
+    void process_single_task(server_task task) {
+        switch (task.type) {
+            case SERVER_TASK_TYPE_COMPLETION:
+            case SERVER_TASK_TYPE_INFILL:
+            case SERVER_TASK_TYPE_EMBEDDING:
+            case SERVER_TASK_TYPE_RERANK:
+                {
+                    const int id_slot = task.id_selected_slot;
+
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
+
+                    if (slot == nullptr) {
+                        // if no slot is available, we defer this task for processing later
+                        SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
+                        queue_tasks.defer(task);
+                        break;
+                    }
+                    if (slot->is_processing()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
+                        queue_tasks.defer(task);
+                        break;
+                    }
+
+                    if (!launch_slot_with_task(*slot, task)) {
+                        SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
+                        break;
+                    }
+                } break;
+            case SERVER_TASK_TYPE_CANCEL:
+                {
+                    // release slot linked with the task id
+                    for (auto & slot : slots) {
+                        if (slot.id_task == task.id_target) {
+                            slot.release();
+                            break;
+                        }
+                    }
+                } break;
+            case SERVER_TASK_TYPE_NEXT_RESPONSE:
+                {
+                    // do nothing
+                } break;
+            case SERVER_TASK_TYPE_METRICS:
+                {
+                    json slots_data = json::array();
+
+                    int n_idle_slots       = 0;
+                    int n_processing_slots = 0;
+
+                    for (server_slot & slot : slots) {
+                        json slot_data = slot.to_json();
+
+                        if (slot.is_processing()) {
+                            n_processing_slots++;
+                        } else {
+                            n_idle_slots++;
+                        }
+
+                        slots_data.push_back(slot_data);
+                    }
+                    SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots);
+
+                    auto res = std::make_unique<server_task_result_metrics>();
+                    res->id                  = task.id;
+                    res->slots_data          = std::move(slots_data);
+                    res->n_idle_slots        = n_idle_slots;
+                    res->n_processing_slots  = n_processing_slots;
+                    res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
+                    res->t_start             = metrics.t_start;
+
+                    res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
+                    res->kv_cache_used_cells   = llama_get_kv_cache_used_cells(ctx);
+
+                    res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
+                    res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
+                    res->n_tokens_predicted_total        = metrics.n_tokens_predicted_total;
+                    res->t_tokens_generation_total       = metrics.t_tokens_generation_total;
+
+                    res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
+                    res->t_prompt_processing       = metrics.t_prompt_processing;
+                    res->n_tokens_predicted        = metrics.n_tokens_predicted;
+                    res->t_tokens_generation       = metrics.t_tokens_generation;
+
+                    res->n_decode_total          = metrics.n_decode_total;
+                    res->n_busy_slots_total      = metrics.n_busy_slots_total;
+
+                    if (task.metrics_reset_bucket) {
+                        metrics.reset_bucket();
+                    }
+                    queue_results.send(std::move(res));
+                } break;
+            case SERVER_TASK_TYPE_SLOT_SAVE:
+                {
+                    int id_slot = task.slot_action.slot_id;
+                    server_slot * slot = get_slot_by_id(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    if (slot->is_processing()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
+                        queue_tasks.defer(task);
+                        break;
+                    }
+
+                    const size_t token_count = slot->cache_tokens.size();
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.slot_action.filename;
+                    std::string filepath = task.slot_action.filepath;
+
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_save_ms = (t_end - t_start) / 1000.0;
+
+                    auto res = std::make_unique<server_task_result_slot_save_load>();
+                    res->id       = task.id;
+                    res->id_slot  = id_slot;
+                    res->filename = filename;
+                    res->is_save  = true;
+                    res->n_tokens = token_count;
+                    res->n_bytes  = nwrite;
+                    res->t_ms     = t_save_ms;
+                    queue_results.send(std::move(res));
+                } break;
+            case SERVER_TASK_TYPE_SLOT_RESTORE:
+                {
+                    int id_slot = task.slot_action.slot_id;
+                    server_slot * slot = get_slot_by_id(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    if (slot->is_processing()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
+                        queue_tasks.defer(task);
+                        break;
+                    }
+
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.slot_action.filename;
+                    std::string filepath = task.slot_action.filepath;
+
+                    slot->cache_tokens.resize(slot->n_ctx);
+                    size_t token_count = 0;
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    if (nread == 0) {
+                        slot->cache_tokens.resize(0);
+                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    slot->cache_tokens.resize(token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_restore_ms = (t_end - t_start) / 1000.0;
+
+                    auto res = std::make_unique<server_task_result_slot_save_load>();
+                    res->id       = task.id;
+                    res->id_slot  = id_slot;
+                    res->filename = filename;
+                    res->is_save  = false;
+                    res->n_tokens = token_count;
+                    res->n_bytes  = nread;
+                    res->t_ms     = t_restore_ms;
+                    queue_results.send(std::move(res));
+                } break;
+            case SERVER_TASK_TYPE_SLOT_ERASE:
+                {
+                    int id_slot = task.slot_action.slot_id;
+                    server_slot * slot = get_slot_by_id(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    if (slot->is_processing()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
+                        queue_tasks.defer(task);
+                        break;
+                    }
+
+                    // Erase token cache
+                    const size_t n_erased = slot->cache_tokens.size();
+                    llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+                    slot->cache_tokens.clear();
+
+                    auto res = std::make_unique<server_task_result_slot_erase>();
+                    res->id       = task.id;
+                    res->id_slot  = id_slot;
+                    res->n_erased = n_erased;
+                    queue_results.send(std::move(res));
+                } break;
+            case SERVER_TASK_TYPE_SET_LORA:
+                {
+                    params_base.lora_adapters = std::move(task.set_lora);
+                    auto res = std::make_unique<server_task_result_apply_lora>();
+                    res->id = task.id;
+                    queue_results.send(std::move(res));
+                } break;
+        }
+    }
+
+    void update_slots() {
+        // check if all slots are idle
+        {
+            bool all_idle = true;
+
+            for (auto & slot : slots) {
+                if (slot.is_processing()) {
+                    all_idle = false;
                     break;
                 }
             }
 
-            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
-            // it will completely stall the server. I don't know where the bug for this is.
-            //
-            // if there are numbers, it needs to be treated like a single prompt,
-            // queue_tasks handles a mix of strings and numbers just fine.
-            if (numbers) {
-                queue_tasks.post(task);
-            } else {
-                split_multiprompt_task(task_id, task);
-            }
-        } else {
-            // an empty prompt can make slot become buggy
-            if (task.data.contains("prompt") && task.data["prompt"].is_string() && task.data["prompt"].get<std::string>().empty()) {
-                task.data["prompt"] = " "; // add a space so that we have one token
+            if (all_idle) {
+                SRV_INF("%s", "all slots are idle\n");
+                if (clean_kv_cache) {
+                    kv_cache_clear();
+                }
+
+                return;
             }
+        }
+
+        {
+            SRV_DBG("%s", "posting NEXT_RESPONSE\n");
+
+            server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
+            task.id = queue_tasks.get_new_id();
             queue_tasks.post(task);
         }
-    }
 
-    // for multiple images processing
-    bool ingest_images(server_slot &slot, int n_batch)
-    {
-        int image_idx = 0;
-
-        while (image_idx < (int) slot.images.size())
-        {
-            slot_image &img = slot.images[image_idx];
-
-            // process prefix prompt
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-            {
-                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                    0, 0, 0, // unused
-                };
-                if (llama_decode(ctx, batch_view))
-                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
-                    return false;
-                }
-            }
-
-            // process image with llm
-            for (int i = 0; i < img.image_tokens; i += n_batch)
-            {
-                int n_eval = img.image_tokens - i;
-                if (n_eval > n_batch)
-                {
-                    n_eval = n_batch;
+        // apply context-shift if needed
+        // TODO: simplify and improve
+        for (server_slot & slot : slots) {
+            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
+                if (!params_base.ctx_shift) {
+                    // this check is redundant (for good)
+                    // we should never get here, because generation should already stopped in process_token()
+                    slot.release();
+                    send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
+                    continue;
                 }
 
-                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = {
-                    n_eval,
-                    nullptr,
-                    (img.image_embedding + i * n_embd),
-                    nullptr,
-                    nullptr,
-                    nullptr,
-                    nullptr,
-                    slot.n_past,
-                    1, 0
-                };
-                if (llama_decode(ctx, batch_img))
-                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
-                    return false;
-                }
-                slot.n_past += n_eval;
-            }
-            image_idx++;
+                // Shift context
+                const int n_keep    = slot.params.n_keep + add_bos_token;
+                const int n_left    = slot.n_past - n_keep;
+                const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
 
-            llama_batch_clear(batch);
+                SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-            // append prefix of next image
-            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
-                slot.params.input_suffix : // no more images, then process suffix prompt
-                (json)(slot.images[image_idx].prefix_prompt);
+                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
 
-            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
-                slot.n_past += 1;
-            }
-        }
-
-        return true;
-    }
-
-    void request_cancel(int task_id)
-    {
-        task_server task;
-        task.type = TASK_TYPE_CANCEL;
-        task.target_id = task_id;
-        queue_tasks.post(task);
-    }
-
-    void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
-    {
-        int prompt_count = multiprompt_task.data.at("prompt").size();
-        if (prompt_count <= 1) {
-            send_error(multiprompt_task, "error while handling multiple prompts");
-            return;
-        }
-
-        // generate all the ID for subtask
-        std::vector<int> subtask_ids(prompt_count);
-        for (int i = 0; i < prompt_count; i++)
-        {
-            subtask_ids[i] = queue_tasks.get_new_id();
-        }
-
-        // queue up the multitask so we can track its subtask progression
-        queue_tasks.add_multitask(multitask_id, subtask_ids);
-
-        // add subtasks
-        for (int i = 0; i < prompt_count; i++)
-        {
-            json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data["prompt"][i];
-
-            // subtasks inherit everything else (infill mode, embedding mode, etc.)
-            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
-        }
-    }
-
-    void process_single_task(task_server& task)
-    {
-        switch (task.type)
-        {
-            case TASK_TYPE_COMPLETION: {
-                server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
-                if (slot == nullptr)
-                {
-                    // if no slot is available, we defer this task for processing later
-                    LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
-                    queue_tasks.defer(task);
-                    break;
-                }
-
-                if (task.data.contains("system_prompt"))
-                {
-                    if (!all_slots_are_idle) {
-                        send_error(task, "system prompt can only be updated when all slots are idle");
-                        break;
-                    }
-                    system_prompt_process(task.data["system_prompt"]);
-
-                    // reset cache_tokens for all slots
-                    for (server_slot &slot : slots)
-                    {
-                        slot.cache_tokens.clear();
-                        slot.n_past    = 0;
-                        slot.n_past_se = 0;
-                    }
-                }
-
-                slot->reset();
-
-                slot->infill       = task.infill_mode;
-                slot->embedding    = task.embedding_mode;
-                slot->task_id      = task.id;
-                slot->multitask_id = task.multitask_id;
-
-                if (!launch_slot_with_data(slot, task.data))
-                {
-                    // send error result
-                    send_error(task, "internal_error");
-                    break;
-                }
-            } break;
-            case TASK_TYPE_CANCEL: { // release slot linked with the task id
-                for (auto & slot : slots)
-                {
-                    if (slot.task_id == task.target_id)
-                    {
-                        slot.release();
-                        break;
-                    }
-                }
-            } break;
-            case TASK_TYPE_NEXT_RESPONSE: {
-                // do nothing
-            } break;
-            case TASK_TYPE_METRICS: {
-                json slots_data        = json::array();
-                int n_idle_slots       = 0;
-                int n_processing_slots = 0;
-
-                for (server_slot &slot: slots) {
-                    json slot_data = get_formated_generation(slot);
-                    slot_data["id"] = slot.id;
-                    slot_data["task_id"] = slot.task_id;
-                    slot_data["state"] = slot.state;
-                    slot_data["prompt"] = slot.prompt;
-                    slot_data["next_token"] = {
-                            {"has_next_token",       slot.has_next_token},
-                            {"n_remain",             slot.n_remaining},
-                            {"num_tokens_predicted", slot.n_decoded},
-                            {"stopped_eos",          slot.stopped_eos},
-                            {"stopped_word",         slot.stopped_word},
-                            {"stopped_limit",        slot.stopped_limit},
-                            {"stopping_word",        slot.stopping_word},
-                    };
-                    if (slot_data["state"] == IDLE) {
-                        n_idle_slots++;
-                    } else {
-                        n_processing_slots++;
-                    }
-                    slots_data.push_back(slot_data);
-                }
-                LOG_INFO("slot data", {
-                    {"task_id",            task.id},
-                    {"n_idle_slots",       n_idle_slots},
-                    {"n_processing_slots", n_processing_slots}
-                });
-                LOG_VERBOSE("slot data", {
-                    {"task_id",            task.id},
-                    {"n_idle_slots",       n_idle_slots},
-                    {"n_processing_slots", n_processing_slots},
-                    {"slots",              slots_data}
-                });
-                task_result res;
-                res.id = task.id;
-                res.multitask_id = task.multitask_id;
-                res.stop = true;
-                res.error = false;
-                res.result_json = {
-                        { "idle",                            n_idle_slots       },
-                        { "processing",                      n_processing_slots },
-                        { "deferred",                        queue_tasks.queue_tasks_deferred.size() },
-
-                        { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
-                        { "n_tokens_predicted_total",        metrics.n_tokens_predicted_total},
-
-                        { "n_prompt_tokens_processed",       metrics.n_prompt_tokens_processed},
-                        { "t_prompt_processing",             metrics.t_prompt_processing},
-                        { "n_tokens_predicted",              metrics.n_tokens_predicted},
-                        { "t_tokens_generation",             metrics.t_tokens_generation},
-
-                        { "kv_cache_tokens_count",           llama_get_kv_cache_token_count(ctx)},
-                        { "kv_cache_used_cells",             llama_get_kv_cache_used_cells(ctx)},
-
-                        { "slots",                           slots_data },
-                };
-                metrics.reset_bucket();
-                queue_results.send(res);
-            } break;
-        }
-    }
-
-    void on_finish_multitask(task_multi& multitask)
-    {
-        // all subtasks done == multitask is done
-        task_result result;
-        result.id = multitask.id;
-        result.stop = true;
-        result.error = false;
-
-        // collect json results into one json result
-        std::vector<json> result_jsons;
-        for (auto& subres : multitask.results)
-        {
-            result_jsons.push_back(subres.result_json);
-            result.error = result.error && subres.error;
-        }
-        result.result_json = json{ { "results", result_jsons } };
-        queue_results.send(result);
-    }
-
-    bool update_slots() {
-        if (system_need_update)
-        {
-            LOG_INFO("updating system prompt", {});
-            system_prompt_update();
-        }
-
-        llama_batch_clear(batch);
-
-        if (all_slots_are_idle)
-        {
-            if (system_prompt.empty() && clean_kv_cache)
-            {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
-                kv_cache_clear();
-            }
-            return true;
-        }
-
-        LOG_VERBOSE("posting NEXT_RESPONSE", {});
-        task_server task;
-        task.type = TASK_TYPE_NEXT_RESPONSE;
-        task.target_id = -1;
-        queue_tasks.post(task);
-
-        for (server_slot &slot : slots)
-        {
-            if (slot.ga_n == 1)
-            {
-                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
-                {
-                    // Shift context
-                    const int n_keep    = slot.params.n_keep + add_bos_token;
-                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
-                    const int n_discard = n_left / 2;
-
-                    LOG_INFO("slot context shift", {
-                        {"slot_id",         slot.id},
-                        {"task_id",         slot.task_id},
-                        {"n_keep",          n_keep},
-                        {"n_left",          n_left},
-                        {"n_discard",       n_discard},
-                        {"n_ctx",           n_ctx},
-                        {"n_past",          slot.n_past},
-                        {"n_system_tokens", system_tokens.size()},
-                        {"n_cache_tokens",  slot.cache_tokens.size()}
-                    });
-                    llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
-                    {
+                if (slot.params.cache_prompt) {
+                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
                         slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                     }
 
                     slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
                 }
+
+                slot.n_past -= n_discard;
+
+                slot.truncated = true;
             }
         }
 
-        // decode any currently ongoing sequences
-        LOG_VERBOSE("decoding ongoing sequences", {});
-        for (auto & slot : slots)
-        {
-            // release the slot
-            if (slot.command == RELEASE)
-            {
-                slot.state = IDLE;
-                slot.command = NONE;
-                slot.t_last_used = ggml_time_us();
+        // start populating the batch for this iteration
+        common_batch_clear(batch);
 
-                LOG_INFO("slot released", {
-                    {"slot_id",         slot.id},
-                    {"task_id",         slot.task_id},
-                    {"n_ctx",           n_ctx},
-                    {"n_past",          slot.n_past},
-                    {"n_system_tokens", system_tokens.size()},
-                    {"n_cache_tokens",  slot.cache_tokens.size()},
-                    {"truncated",       slot.truncated}
-                });
-                queue_tasks.notify_slot_changed();
+        // track if given slot can be batched with slots already in the batch
+        server_slot * slot_batched = nullptr;
 
+        auto accept_special_token = [&](server_slot & slot, llama_token token) {
+            return params_base.special || slot.params.sampling.preserved_tokens.find(token) != slot.params.sampling.preserved_tokens.end();
+        };
+
+        // frist, add sampled tokens from any ongoing sequences
+        for (auto & slot : slots) {
+            if (slot.state != SLOT_STATE_GENERATING) {
                 continue;
             }
 
-            if (slot.state == IDLE)
-            {
+            // check if we can batch this slot with the previous one
+            if (!slot_batched) {
+                slot_batched = &slot;
+            } else if (!slot_batched->can_batch_with(slot)) {
                 continue;
             }
 
             slot.i_batch = batch.n_tokens;
 
-            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
-            // TODO: we always have to take into account the "system_tokens"
-            //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
             slot.n_past += 1;
+
+            if (slot.params.cache_prompt) {
+                slot.cache_tokens.push_back(slot.sampled);
+            }
+
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
         }
 
         // process in chunks of params.n_batch
-        int32_t n_batch = params.n_batch;
+        int32_t n_batch  = llama_n_batch(ctx);
+        int32_t n_ubatch = llama_n_ubatch(ctx);
 
-        // assign workload to the slots
-        if (params.cont_batching || batch.n_tokens == 0)
-        {
-            for (auto & slot : slots)
-            {
-                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
-
-                // empty prompt passed -> release the slot and send empty response
-                // note: infill mode allows empty prompt
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
-                {
-                    slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
-                    continue;
+        // next, batch any pending prompts without exceeding n_batch
+        if (params_base.cont_batching || batch.n_tokens == 0) {
+            for (auto & slot : slots) {
+                // check if we can batch this slot with the previous one
+                if (slot.is_processing()) {
+                    if (!slot_batched) {
+                        slot_batched = &slot;
+                    } else if (!slot_batched->can_batch_with(slot)) {
+                        continue;
+                    }
                 }
 
-                // need process the prompt
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
-                {
-                    slot.state = PROCESSING;
-                    slot.command = NONE;
-                    std::vector<llama_token> prompt_tokens;
-                    slot.t_start_process_prompt = ggml_time_us();
-                    slot.t_start_genereration = 0;
+                // this slot still has a prompt to be processed
+                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
+                    auto & prompt_tokens = slot.prompt_tokens;
 
-                    if (slot.infill)
-                    {
-                        bool suff_rm_leading_spc = true;
-                        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
-                        {
-                            params.input_suffix.erase(0, 1);
-                            suff_rm_leading_spc = false;
-                        }
-                        auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                        auto suffix_tokens = tokenize(slot.params.input_suffix, false);
-
-                        const int space_token = 29871; // TODO: this should not be hardcoded
-                        if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                            suffix_tokens.erase(suffix_tokens.begin());
-                        }
-
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
-                        prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_token_middle(model));
-                        prompt_tokens = prefix_tokens;
-                    }
-                    else
-                    {
-                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
-                    }
-
-                    slot.n_prompt_tokens = prompt_tokens.size();
-
-                    if (slot.params.n_keep < 0)
-                    {
-                        slot.params.n_keep = slot.n_prompt_tokens;
-                    }
-                    slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-
-                    // if input prompt is too big, truncate it
-                    if (slot.n_prompt_tokens >= slot.n_ctx)
-                    {
-                        const int n_left = slot.n_ctx - slot.params.n_keep;
-                        const int n_block_size = n_left / 2;
-                        const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
-
-                        std::vector<llama_token> new_tokens(
-                            prompt_tokens.begin(),
-                            prompt_tokens.begin() + slot.params.n_keep);
-                        new_tokens.insert(
-                            new_tokens.end(),
-                            prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
-                            prompt_tokens.end());
-
-                        LOG_VERBOSE("input truncated", {
-                            {"n_ctx",      slot.n_ctx},
-                            {"n_keep",     slot.params.n_keep},
-                            {"n_left",     n_left},
-                            {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
-                        });
-                        slot.truncated = true;
-                        prompt_tokens = new_tokens;
+                    // TODO: maybe move branch to outside of this loop in the future
+                    if (slot.state == SLOT_STATE_STARTED) {
+                        slot.t_start_process_prompt = ggml_time_us();
+                        slot.t_start_generation = 0;
 
+                        slot.n_past = 0;
                         slot.n_prompt_tokens = prompt_tokens.size();
-                        GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
-                    }
+                        slot.state = SLOT_STATE_PROCESSING_PROMPT;
 
-                    if (!slot.params.cache_prompt)
-                    {
-                        llama_sampling_reset(slot.ctx_sampling);
+                        SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
-                        slot.n_past    = 0;
-                        slot.n_past_se = 0;
-                        slot.ga_i      = 0;
-                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
-                    }
-                    else
-                    {
-                        // push the prompt into the sampling context (do not apply grammar)
-                        for (auto &token : prompt_tokens)
-                        {
-                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                        // print prompt tokens (for debugging)
+                        if (1) {
+                            // first 16 tokens (avoid flooding logs)
+                            for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
+                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            }
+                        } else {
+                            // all
+                            for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            }
                         }
 
-                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+                        // empty prompt passed -> release the slot and send empty response
+                        if (prompt_tokens.empty()) {
+                            SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
 
-                        // the last token of the cache is not in the KV cache until the next call to llama_decode
-                        // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
-                        if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
-                        {
-                            slot.n_past -= 1;
+                            slot.release();
+                            slot.print_timings();
+                            send_final_response(slot);
+                            continue;
                         }
 
-                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
+                        if (slot.is_non_causal()) {
+                            if (slot.n_prompt_tokens > n_ubatch) {
+                                slot.release();
+                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
+                                continue;
+                            }
 
-                        if (slot.ga_n != 1)
-                        {
-                            int ga_i = 0;
-                            int32_t ga_n = slot.ga_n;
-                            int32_t ga_w = slot.ga_w;
-                            int32_t slot_npast = 0;
-                            for (int k = 0; k < slot.n_past; ++k)
-                            {
-                                while (slot_npast >= ga_i + ga_w) {
-                                    const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                    slot_npast -= bd;
-                                    ga_i += ga_w/ga_n;
+                            if (slot.n_prompt_tokens > slot.n_ctx) {
+                                slot.release();
+                                send_error(slot, "input is larger than the max context size. skipping", ERROR_TYPE_SERVER);
+                                continue;
+                            }
+                        } else {
+                            if (!params_base.ctx_shift) {
+                                // if context shift is disabled, we make sure prompt size is smaller than KV size
+                                // TODO: there should be a separate parameter that control prompt truncation
+                                //       context shift should be applied only during the generation phase
+                                if (slot.n_prompt_tokens >= slot.n_ctx) {
+                                    slot.release();
+                                    send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                                    continue;
                                 }
-                                slot_npast++;
                             }
-                            slot.n_past_se = slot_npast;
-                            slot.ga_i = ga_i;
-                        }
+                            if (slot.params.n_keep < 0) {
+                                slot.params.n_keep = slot.n_prompt_tokens;
+                            }
+                            slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
-                        LOG_INFO("slot progression", {
-                            { "slot_id", slot.id },
-                            { "task_id", slot.task_id },
-                            { "n_past",  slot.n_past },
-                            { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
-                        });
-                    }
+                            // if input prompt is too big, truncate it
+                            if (slot.n_prompt_tokens >= slot.n_ctx) {
+                                const int n_left = slot.n_ctx - slot.params.n_keep;
 
-                    slot.cache_tokens = prompt_tokens;
+                                const int n_block_size = n_left / 2;
+                                const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
-                    {
-                        // we have to evaluate at least 1 token to generate logits.
-                        LOG_INFO("we have to evaluate at least 1 token to generate logits", {
-                            { "slot_id", slot.id },
-                            { "task_id", slot.task_id }
-                        });
-                        slot.n_past--;
-                        if (slot.ga_i > 0)
-                        {
-                            slot.n_past_se--;
-                        }
-                    }
+                                llama_tokens new_tokens(
+                                        prompt_tokens.begin(),
+                                        prompt_tokens.begin() + slot.params.n_keep);
 
-                    int p0 = (int) system_tokens.size() + slot.n_past;
-                    LOG_INFO("kv cache rm [p0, end)", {
-                        { "slot_id", slot.id },
-                        { "task_id", slot.task_id },
-                        { "p0",      p0 }
-                    });
-                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                                new_tokens.insert(
+                                        new_tokens.end(),
+                                        prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                                        prompt_tokens.end());
 
-                    LOG_VERBOSE("prompt ingested", {
-                                                    {"n_past",  slot.n_past},
-                                                    {"cached",  tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
-                                                    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
-                                                });
+                                prompt_tokens = std::move(new_tokens);
 
-                    const bool has_images = process_images(slot);
+                                slot.truncated = true;
+                                slot.n_prompt_tokens = prompt_tokens.size();
 
-                    // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+                                SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
 
-                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+                                GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
+                            }
 
-                    int32_t ga_i = slot.ga_i;
-                    int32_t ga_n = slot.ga_n;
-                    int32_t ga_w = slot.ga_w;
+                            if (slot.params.cache_prompt) {
+                                // reuse any previously computed tokens that are common with the new prompt
+                                slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
 
-                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
-                    {
-                        if (slot.ga_n != 1)
-                        {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
+                                // reuse chunks from the cached prompt by shifting their KV cache in the new position
+                                if (params_base.n_cache_reuse > 0) {
+                                    size_t head_c = slot.n_past; // cache
+                                    size_t head_p = slot.n_past; // current prompt
+
+                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
+
+                                    while (head_c < slot.cache_tokens.size() &&
+                                           head_p < prompt_tokens.size()) {
+
+                                        size_t n_match = 0;
+                                        while (head_c + n_match < slot.cache_tokens.size() &&
+                                               head_p + n_match < prompt_tokens.size()     &&
+                                               slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
+
+                                            n_match++;
+                                        }
+
+                                        if (n_match >= (size_t) params_base.n_cache_reuse) {
+                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                            //for (size_t i = head_p; i < head_p + n_match; i++) {
+                                            //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                                            //}
+
+                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
+
+                                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, -1,     kv_shift);
+
+                                            for (size_t i = 0; i < n_match; i++) {
+                                                slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+                                                slot.n_past++;
+                                            }
+
+                                            head_c += n_match;
+                                            head_p += n_match;
+                                        } else {
+                                            head_c += 1;
+                                        }
+                                    }
+
+                                    SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
+                                }
                             }
                         }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
-                        slot_npast++;
+
+                        if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
+                            // we have to evaluate at least 1 token to generate logits.
+                            SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
+
+                            slot.n_past--;
+                        }
+
+                        slot.n_prompt_tokens_processed = 0;
                     }
 
-                    if (has_images && !ingest_images(slot, n_batch))
-                    {
-                        LOG_ERROR("failed processing images", {
-                            {"slot_id", slot.id},
-                            {"task_id", slot.task_id},
-                        });
-                        // FIXME @phymbert: to be properly tested
-                        //  early returning without changing the slot state will block the slot for ever
-                        // no one at the moment is checking the return value
-                        return false;
+                    // non-causal tasks require to fit the entire prompt in the physical batch
+                    if (slot.is_non_causal()) {
+                        // cannot fit the prompt in the current batch - will try next iter
+                        if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+                            continue;
+                        }
                     }
 
-                    // extract the logits only for the last token
-                    if (batch.n_tokens > 0)
-                    {
+                    // keep only the common part
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                        // could not partially delete (likely using a non-Transformer model)
+                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+
+                        // there is no common part left
+                        slot.n_past = 0;
+                    }
+
+                    SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
+
+                    // remove the non-common part from the cache
+                    slot.cache_tokens.resize(slot.n_past);
+
+                    // add prompt tokens for processing in the current batch
+                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                        // without pooling, we want to output the embeddings for all the tokens in the batch
+                        const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
+
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
+
+                        if (slot.params.cache_prompt) {
+                            slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
+                        }
+
+                        slot.n_prompt_tokens_processed++;
+                        slot.n_past++;
+                    }
+
+                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+
+                    // entire prompt has been processed
+                    if (slot.n_past == slot.n_prompt_tokens) {
+                        slot.state = SLOT_STATE_DONE_PROMPT;
+
+                        GGML_ASSERT(batch.n_tokens > 0);
+
+                        common_sampler_reset(slot.smpl);
+
+                        // Process all prompt tokens through sampler system
+                        for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                            common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+                        }
+
+                        // extract the logits only for the last token
                         batch.logits[batch.n_tokens - 1] = true;
-                    }
 
-                    slot.n_decoded = 0;
-                    slot.i_batch   = batch.n_tokens - 1;
+                        slot.n_decoded = 0;
+                        slot.i_batch   = batch.n_tokens - 1;
+
+                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+                    }
+                }
+
+                if (batch.n_tokens >= n_batch) {
+                    break;
                 }
             }
         }
 
-        if (batch.n_tokens == 0)
-        {
-            all_slots_are_idle = true;
-            return true;
+        if (batch.n_tokens == 0) {
+            SRV_WRN("%s", "no tokens to decode\n");
+            return;
         }
 
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-        {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
 
-            for (auto & slot : slots)
-            {
-                if (slot.ga_n != 1)
-                {
-                    // context extension via Self-Extend
-                    while (slot.n_past_se >= slot.ga_i + slot.ga_w)
-                    {
-                        const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
-                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
-                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
+        if (slot_batched) {
+            // make sure we're in the right embedding mode
+            llama_set_embeddings(ctx, slot_batched->is_non_causal());
+            // apply lora, only need to do it once per batch
+            common_set_adapter_lora(ctx, slot_batched->lora);
+        }
 
-                        LOG_TEE("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+        // process the created batch of tokens
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
-
-                        slot.n_past_se -= bd;
-
-                        slot.ga_i += slot.ga_w / slot.ga_n;
-
-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
-                    }
-                    slot.n_past_se += n_tokens;
-                }
-            }
-
-            llama_batch batch_view =
-            {
+            llama_batch batch_view = {
                 n_tokens,
                 batch.token    + i,
                 nullptr,
@@ -1920,802 +3150,220 @@ struct llama_server_context
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
+            metrics.on_decoded(slots);
 
-            if (ret != 0)
-            {
-                if (n_batch == 1 || ret < 0)
-                {
+            if (ret != 0) {
+                if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
-                    return false;
+                    SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+                    for (auto & slot : slots) {
+                        slot.release();
+                        send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
+                    }
+                    break; // break loop of n_batch
                 }
 
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
                 i -= n_batch;
-                continue;
+
+                SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+
+                continue; // continue loop of n_batch
             }
 
-            for (auto & slot : slots)
-            {
-                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
-                {
-                    continue;
+            for (auto & slot : slots) {
+                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+                    continue; // continue loop of slots
                 }
 
-                // prompt evaluated for embedding
-                if (slot.embedding)
-                {
-                    send_embedding(slot);
-                    slot.release();
-                    slot.i_batch = -1;
-                    continue;
+                if (slot.state == SLOT_STATE_DONE_PROMPT) {
+                    if (slot.task_type == SERVER_TASK_TYPE_EMBEDDING) {
+                        // prompt evaluated for embedding
+                        send_embedding(slot, batch_view);
+                        slot.release();
+                        slot.i_batch = -1;
+                        continue; // continue loop of slots
+                    }
+
+                    if (slot.task_type == SERVER_TASK_TYPE_RERANK) {
+                        send_rerank(slot, batch_view);
+                        slot.release();
+                        slot.i_batch = -1;
+                        continue; // continue loop of slots
+                    }
+
+                    // prompt evaluated for next-token prediction
+                    slot.state = SLOT_STATE_GENERATING;
+                } else if (slot.state != SLOT_STATE_GENERATING) {
+                    continue; // continue loop of slots
                 }
 
-                completion_token_output result;
-                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+                const int tok_idx = slot.i_batch - i;
 
-                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+                llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
+
+                slot.i_batch = -1;
+
+                common_sampler_accept(slot.smpl, id, true);
 
                 slot.n_decoded += 1;
-                if (slot.n_decoded == 1)
-                {
-                    slot.t_start_genereration = ggml_time_us();
-                    slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+
+                const int64_t t_current = ggml_time_us();
+
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = t_current;
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                     metrics.on_prompt_eval(slot);
                 }
 
-                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
-                result.tok = id;
+                slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
 
-                const int32_t n_probs = slot.sparams.n_probs;
-                if (slot.sparams.temp <= 0 && n_probs > 0)
-                {
-                    // for llama_sample_token_greedy we need to sort candidates
-                    llama_sample_softmax(ctx, &cur_p);
+                completion_token_output result;
+                result.tok          = id;
+                result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
+                result.prob         = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
+
+                if (slot.params.sampling.n_probs > 0) {
+                    populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx);
                 }
 
-                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
-                {
-                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
-                }
-
-                if (!process_token(result, slot))
-                {
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
                     slot.release();
                     slot.print_timings();
                     send_final_response(slot);
                     metrics.on_prediction(slot);
+                    continue;
+                }
+            }
+
+            // do speculative decoding
+            for (auto & slot : slots) {
+                if (!slot.is_processing() || !slot.can_speculate()) {
+                    continue;
                 }
 
-                slot.i_batch = -1;
+                if (slot.state != SLOT_STATE_GENERATING) {
+                    continue;
+                }
+
+                // determine the max draft that fits the current slot state
+                int n_draft_max = slot.params.speculative.n_max;
+
+                // note: n_past is not yet increased for the `id` token sampled above
+                //       also, need to leave space for 1 extra token to allow context shifts
+                n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2);
+
+                if (slot.n_remaining > 0) {
+                    n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
+                }
+
+                SLT_DBG(slot, "max possible draft: %d\n", n_draft_max);
+
+                if (n_draft_max < slot.params.speculative.n_min) {
+                    SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min);
+
+                    continue;
+                }
+
+                llama_token id = slot.sampled;
+
+                struct common_speculative_params params_spec;
+                params_spec.n_draft   = n_draft_max;
+                params_spec.n_reuse   = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
+                params_spec.p_min     = slot.params.speculative.p_min;
+
+                llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+
+                // ignore small drafts
+                if (slot.params.speculative.n_min > (int) draft.size()) {
+                    SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
+
+                    continue;
+                }
+
+                // construct the speculation batch
+                common_batch_clear(slot.batch_spec);
+                common_batch_add  (slot.batch_spec, id, slot.n_past, { slot.id }, true);
+
+                for (size_t i = 0; i < draft.size(); ++i) {
+                    common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
+                }
+
+                SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
+
+                llama_decode(ctx, slot.batch_spec);
+
+                // the accepted tokens from the speculation
+                const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
+
+                slot.n_past    += ids.size();
+                slot.n_decoded += ids.size();
+
+                slot.cache_tokens.push_back(id);
+                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+
+                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+
+                for (size_t i = 0; i < ids.size(); ++i) {
+                    completion_token_output result;
+
+                    result.tok          = ids[i];
+                    result.text_to_send = common_token_to_piece(ctx, result.tok, accept_special_token(slot, result.tok));
+                    result.prob         = 1.0f; // set later
+
+                    // TODO: set result.probs
+
+                    if (!process_token(result, slot)) {
+                        // release slot because of stop condition
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        metrics.on_prediction(slot);
+                        break;
+                    }
+                }
+
+                SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
             }
         }
 
-        LOG_VERBOSE("slots updated", {});
-        return true;
+        SRV_DBG("%s", "run slots completed\n");
+    }
+
+    json model_meta() const {
+        return json {
+            {"vocab_type",  llama_vocab_type       (vocab)},
+            {"n_vocab",     llama_vocab_n_tokens   (vocab)},
+            {"n_ctx_train", llama_model_n_ctx_train(model)},
+            {"n_embd",      llama_model_n_embd     (model)},
+            {"n_params",    llama_model_n_params   (model)},
+            {"size",        llama_model_size       (model)},
+        };
     }
 };
 
-static void server_print_usage(const char *argv0, const gpt_params &params,
-                               const server_params &sparams)
-{
-    printf("usage: %s [options]\n", argv0);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help                show this help message and exit\n");
-    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  --threads-http N          number of threads in the http server pool to process requests (default: hardware concurrency)\n");
-    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-scaling {none,linear,yarn}\n");
-    printf("                            RoPE frequency scaling method, defaults to linear unless specified by the model\n");
-    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
-    printf("  --rope-freq-scale N       RoPE frequency scaling factor, expands context by a factor of 1/N\n");
-    printf("  --yarn-ext-factor N       YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
-    printf("  --yarn-attn-factor N      YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
-    printf("  --yarn-beta-slow N        YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
-    printf("  --yarn-beta-fast N        YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf("  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
-    if (llama_supports_mlock())
-    {
-        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (llama_supports_mmap())
-    {
-        printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-    printf("  --numa TYPE               attempt optimizations that help on some NUMA systems\n");
-    printf("                              - distribute: spread execution evenly over all nodes\n");
-    printf("                              - isolate: only spawn threads on CPUs on the node that execution started on\n");
-    printf("                              - numactl: use the CPU map provided my numactl\n");
-    if (llama_supports_gpu_offload()) {
-        printf("  -ngl N, --n-gpu-layers N\n");
-        printf("                            number of layers to store in VRAM\n");
-        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-        printf("                            how to split the model across multiple GPUs, one of:\n");
-        printf("                              - none: use one GPU only\n");
-        printf("                              - layer (default): split layers and KV across GPUs\n");
-        printf("                              - row: split rows across GPUs\n");
-        printf("  -ts SPLIT --tensor-split SPLIT\n");
-        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
-        printf("                            or for intermediate results and KV (with split-mode = row)\n");
-    }
-    printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: %s)\n", params.model.c_str());
-    printf("  -a ALIAS, --alias ALIAS\n");
-    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
-    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
-    printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf("  --host                    ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    printf("  --port PORT               port to listen (default  (default: %d)\n", sparams.port);
-    printf("  --path PUBLIC_PATH        path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    printf("  --api-key API_KEY         optional api key to enhance server security. If set, requests must include this key for access.\n");
-    printf("  --api-key-file FNAME      path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
-    printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    printf("  --embedding               enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
-    printf("  -spf FNAME, --system-prompt-file FNAME\n");
-    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
-    printf("                            KV cache data type for K (default: f16)\n");
-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
-    printf("                            KV cache data type for V (default: f16)\n");
-    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
-    printf("  --log-format              log output format: json or text (default: json)\n");
-    printf("  --log-disable             disables logging to a file.\n");
-    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
-    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
-    printf("\n");
-    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
-    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
-    printf("  --chat-template JINJA_TEMPLATE\n");
-    printf("                            set custom jinja chat template (default: template taken from model's metadata)\n");
-    printf("                            Note: only commonly used templates are accepted, since we don't have jinja parser\n");
-    printf("\n");
-}
-
-static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params, llama_server_context& llama)
-{
-    gpt_params default_params;
-    server_params default_sparams;
-    std::string arg;
-    bool invalid_param = false;
-
-    for (int i = 1; i < argc; i++)
-    {
-        arg = argv[i];
-        if (arg == "--port")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.port = std::stoi(argv[i]);
-        }
-        else if (arg == "--host")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.hostname = argv[i];
-        }
-        else if (arg == "--path")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.public_path = argv[i];
-        }
-        else if (arg == "--api-key")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.api_keys.emplace_back(argv[i]);
-        }
-        else if (arg == "--api-key-file")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream key_file(argv[i]);
-            if (!key_file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string key;
-            while (std::getline(key_file, key)) {
-               if (key.size() > 0) {
-                   sparams.api_keys.push_back(key);
-               }
-            }
-            key_file.close();
-        }
-        else if (arg == "--timeout" || arg == "-to")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.read_timeout = std::stoi(argv[i]);
-            sparams.write_timeout = std::stoi(argv[i]);
-        }
-        else if (arg == "-m" || arg == "--model")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        }
-        else if (arg == "-a" || arg == "--alias")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.model_alias = argv[i];
-        }
-        else if (arg == "-h" || arg == "--help")
-        {
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(0);
-        }
-        else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        }
-        else if (arg == "--rope-scaling")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
-            else { invalid_param = true; break; }
-        }
-        else if (arg == "--rope-freq-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_base = std::stof(argv[i]);
-        }
-        else if (arg == "--rope-freq-scale")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_scale = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-ext-factor")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_ext_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-attn-factor")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_attn_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-beta-fast")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_fast = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-beta-slow")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_slow = std::stof(argv[i]);
-        }
-        else if (arg == "--threads" || arg == "-t")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        }
-        else if (arg == "--grp-attn-n" || arg == "-gan")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_n = std::stoi(argv[i]);
-        }
-        else if (arg == "--grp-attn-w" || arg == "-gaw")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_w = std::stoi(argv[i]);
-        }
-        else if (arg == "--threads-batch" || arg == "-tb")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_batch = std::stoi(argv[i]);
-        }
-        else if (arg == "--threads-http")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.n_threads_http = std::stoi(argv[i]);
-        }
-        else if (arg == "-b" || arg == "--batch-size")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        }
-        else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            if (llama_supports_gpu_offload()) {
-                params.n_gpu_layers = std::stoi(argv[i]);
-            } else {
-                LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-                        "See main README.md for information on enabling GPU BLAS support",
-                        {{"n_gpu_layers", params.n_gpu_layers}});
-            }
-        }
-        else if (arg == "--split-mode" || arg == "-sm")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string arg_next = argv[i];
-            if (arg_next == "none")
-            {
-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
-            }
-            else if (arg_next == "layer")
-            {
-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            }
-            else if (arg_next == "row")
-            {
-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            }
-            else {
-                invalid_param = true;
-                break;
-            }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
-        }
-        else if (arg == "--tensor-split" || arg == "-ts")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-            std::string arg_next = argv[i];
-
-            // split string by , and /
-            const std::regex regex{R"([,/]+)"};
-            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
-            for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
-            {
-                if (i_device < split_arg.size())
-                {
-                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-                }
-                else
-                {
-                    params.tensor_split[i_device] = 0.0f;
-                }
-            }
-#else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
-        }
-        else if (arg == "--main-gpu" || arg == "-mg")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-            params.main_gpu = std::stoi(argv[i]);
-#else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
-#endif
-        }
-        else if (arg == "--lora")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.emplace_back(argv[i], 1.0f);
-            params.use_mmap = false;
-        }
-        else if (arg == "--lora-scaled")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            const char * lora_adapter = argv[i];
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-            params.use_mmap = false;
-        }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        }
-        else if (arg == "-v" || arg == "--verbose")
-        {
-#if SERVER_VERBOSE != 1
-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
-            server_verbose = true;
-#endif
-        }
-        else if (arg == "--mlock")
-        {
-            params.use_mlock = true;
-        }
-        else if (arg == "--no-mmap")
-        {
-            params.use_mmap = false;
-        }
-        else if (arg == "--numa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            } else {
-                std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-                else { invalid_param = true; break; }
-            }
-        }
-        else if (arg == "--embedding")
-        {
-            params.embedding = true;
-        }
-        else if (arg == "-cb" || arg == "--cont-batching")
-        {
-            params.cont_batching = true;
-        }
-        else if (arg == "-np" || arg == "--parallel")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_parallel = std::stoi(argv[i]);
-        } else if (arg == "-n" || arg == "--n-predict")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "-spf" || arg == "--system-prompt-file")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string systm_content;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(systm_content)
-            );
-            llama.system_prompt_process(json::parse(systm_content));
-        }
-        else if (arg == "-ctk" || arg == "--cache-type-k") {
-            params.cache_type_k = argv[++i];
-        }
-        else if (arg == "-ctv" || arg == "--cache-type-v") {
-            params.cache_type_v = argv[++i];
-        }
-        else if(arg == "--mmproj")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.mmproj = argv[i];
-        }
-        else if (arg == "--log-format")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            if (std::strcmp(argv[i], "json") == 0)
-            {
-                server_log_json = true;
-            }
-            else if (std::strcmp(argv[i], "text") == 0)
-            {
-                server_log_json = false;
-            }
-            else
-            {
-                invalid_param = true;
-                break;
-            }
-        }
-        else if (arg == "--log-disable")
-        {
-            log_set_target(stdout);
-            LOG_INFO("logging to file is disabled.", {});
-        }
-        else if (arg == "--slots-endpoint-disable")
-        {
-            sparams.slots_endpoint = false;
-        }
-        else if (arg == "--metrics")
-        {
-            sparams.metrics_endpoint = true;
-        }
-        else if (arg == "--chat-template")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            if (!verify_custom_template(argv[i])) {
-                fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
-                fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
-                invalid_param = true;
-                break;
-            }
-            sparams.chat_template = argv[i];
-        }
-        else if (arg == "--override-kv")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
-                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            params.kv_overrides.push_back(kvo);
-        }
-        else
-        {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(1);
-        }
-    }
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    if (invalid_param)
-    {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        server_print_usage(argv[0], default_params, default_sparams);
-        exit(1);
-    }
-}
-
-/* llama.cpp completion api semantics */
-static json format_partial_response(
-    llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
-) {
-    json res = json
-    {
-        {"content",    content },
-        {"stop",       false},
-        {"slot_id",    slot->id },
-        {"multimodal", llama.multimodal }
-    };
-
-    if (slot->sparams.n_probs > 0)
-    {
-        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
-    }
-
-    return res;
-}
-
-static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-{
-    return json {
-        {"tokens", tokens}
-    };
-}
-
-static json format_detokenized_response(std::string content)
-{
-    return json {
-        {"content", content}
-    };
-}
-
-
-static void log_server_request(const httplib::Request &req, const httplib::Response &res)
-{
+static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
     // skip GH copilot requests when using default port
-    if (req.path == "/v1/health" || req.path == "/v1/completions")
-    {
+    if (req.path == "/v1/health" || req.path == "/v1/completions") {
         return;
     }
 
-    LOG_INFO("request", {
-        {"remote_addr", req.remote_addr},
-        {"remote_port", req.remote_port},
-        {"status",      res.status},
-        {"method",      req.method},
-        {"path",        req.path},
-        {"params",      req.params},
-    });
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
-    LOG_VERBOSE("request", {
-        {"request",  req.body},
-        {"response", res.body},
-    });
-}
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
-static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot)
-{
-    auto & gtps = slot->generated_token_probs;
-    auto translator = token_translator{llama.ctx};
-    auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
-    const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
-    if (slot->generated_text.capacity() < slot->generated_text.size() + len)
-    {
-        slot->generated_text.reserve(slot->generated_text.size() + len);
-    }
-    for (const completion_token_output & cto : gtps)
-    {
-        slot->generated_text += translator(cto);
-    }
+    SRV_DBG("request:  %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
 }
 
 std::function<void(int)> shutdown_handler;
 std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+
 inline void signal_handler(int signal) {
     if (is_terminating.test_and_set()) {
         // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
@@ -2723,787 +3371,1148 @@ inline void signal_handler(int signal) {
         fprintf(stderr, "Received second interrupt, terminating immediately.\n");
         exit(1);
     }
+
     shutdown_handler(signal);
 }
 
-int main(int argc, char **argv)
-{
-#if SERVER_VERBOSE != 1
-    log_disable();
-#endif
+int main(int argc, char ** argv) {
     // own arguments required by this example
-    gpt_params params;
-    server_params sparams;
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
+        return 1;
+    }
+
+    common_init();
 
     // struct that contains llama context and inference
-    llama_server_context llama;
-
-    server_params_parse(argc, argv, sparams, params, llama);
-
-    if (params.model_alias == "unknown")
-    {
-        params.model_alias = params.model;
-    }
+    server_context ctx_server;
 
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
-                            {"commit", LLAMA_COMMIT}});
+    LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
+    LOG_INF("\n");
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    LOG_INF("\n");
 
-    LOG_INFO("system info", {
-                                {"n_threads", params.n_threads},
-                                {"n_threads_batch", params.n_threads_batch},
-                                {"total_threads", std::thread::hardware_concurrency()},
-                                {"system_info", llama_print_system_info()},
-                            });
-
-    httplib::Server svr;
+    std::unique_ptr<httplib::Server> svr;
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        svr.reset(
+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        );
+    } else {
+        LOG_INF("Running without SSL\n");
+        svr.reset(new httplib::Server());
+    }
+#else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return 1;
+    }
+    svr.reset(new httplib::Server());
+#endif
 
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
 
-    svr.set_default_headers({{"Server", "llama.cpp"}});
+    svr->set_default_headers({{"Server", "llama.cpp"}});
+    svr->set_logger(log_server_request);
 
-    // CORS preflight
-    svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        res.set_header("Access-Control-Allow-Credentials", "true");
-        res.set_header("Access-Control-Allow-Methods", "POST");
-        res.set_header("Access-Control-Allow-Headers", "*");
-    });
+    auto res_error = [](httplib::Response & res, const json & error_data) {
+        json final_response {{"error", error_data}};
+        res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON);
+        res.status = json_value(error_data, "code", 500);
+    };
 
-    svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
-        server_state current_state = state.load();
-        switch(current_state) {
-            case SERVER_STATE_READY: {
-                // request slots data using task queue
-                task_server task;
-                task.id   = llama.queue_tasks.get_new_id();
-                task.type = TASK_TYPE_METRICS;
-                task.target_id = -1;
+    auto res_ok = [](httplib::Response & res, const json & data) {
+        res.set_content(safe_json_to_str(data), MIMETYPE_JSON);
+        res.status = 200;
+    };
 
-                llama.queue_results.add_waiting_task_id(task.id);
-                llama.queue_tasks.post(task);
+    svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
+        std::string message;
+        try {
+            std::rethrow_exception(ep);
+        } catch (const std::exception & e) {
+            message = e.what();
+        } catch (...) {
+            message = "Unknown Exception";
+        }
 
-                // get the result
-                task_result result = llama.queue_results.recv(task.id);
-                llama.queue_results.remove_waiting_task_id(task.id);
-
-                int n_idle_slots       = result.result_json["idle"];
-                int n_processing_slots = result.result_json["processing"];
-
-                json health = {
-                        {"status",           "ok"},
-                        {"slots_idle",       n_idle_slots},
-                        {"slots_processing", n_processing_slots}};
-                res.status = 200; // HTTP OK
-                if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                    health["slots"] = result.result_json["slots"];
-                }
-
-                if (n_idle_slots == 0) {
-                    health["status"] = "no slot available";
-                    if (req.has_param("fail_on_no_slot")) {
-                        res.status = 503; // HTTP Service Unavailable
-                    }
-                }
-                res.set_content(health.dump(), "application/json");
-                break;
-            }
-            case SERVER_STATE_LOADING_MODEL:
-                res.set_content(R"({"status": "loading model"})", "application/json");
-                res.status = 503; // HTTP Service Unavailable
-                break;
-            case SERVER_STATE_ERROR:
-                res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
-                res.status = 500; // HTTP Internal Server Error
-                break;
+        try {
+            json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+            LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+            res_error(res, formatted_error);
+        } catch (const std::exception & e) {
+            LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str());
         }
     });
 
-    if (sparams.slots_endpoint) {
-        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
-            // request slots data using task queue
-            task_server task;
-            task.id = llama.queue_tasks.get_new_id();
-            task.type = TASK_TYPE_METRICS;
-            task.target_id = -1;
-
-            llama.queue_results.add_waiting_task_id(task.id);
-            llama.queue_tasks.post(task);
-
-            // get the result
-            task_result result = llama.queue_results.recv(task.id);
-            llama.queue_results.remove_waiting_task_id(task.id);
-
-            res.set_content(result.result_json["slots"].dump(), "application/json");
-            res.status = 200; // HTTP OK
-        });
-    }
-
-    if (sparams.metrics_endpoint) {
-        svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
-            // request slots data using task queue
-            task_server task;
-            task.id = llama.queue_tasks.get_new_id();
-            task.type = TASK_TYPE_METRICS;
-            task.target_id = -1;
-
-            llama.queue_results.add_waiting_task_id(task.id);
-            llama.queue_tasks.post(task);
-
-            // get the result
-            task_result result = llama.queue_results.recv(task.id);
-            llama.queue_results.remove_waiting_task_id(task.id);
-
-            json data = result.result_json;
-
-            uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-            uint64_t t_prompt_processing       = data["t_prompt_processing"];
-
-            uint64_t n_tokens_predicted       = data["n_tokens_predicted"];
-            uint64_t t_tokens_generation      = data["t_tokens_generation"];
-
-            int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
-
-            // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
-            json all_metrics_def = json {
-                    {"counter", {{
-                            {"name",  "prompt_tokens_total"},
-                            {"help",  "Number of prompt tokens processed."},
-                            {"value",  data["n_prompt_tokens_processed_total"]}
-                    }, {
-                            {"name",  "tokens_predicted_total"},
-                            {"help",  "Number of generation tokens processed."},
-                            {"value",  data["n_tokens_predicted_total"]}
-                    }}},
-                    {"gauge", {{
-                            {"name",  "prompt_tokens_seconds"},
-                            {"help",  "Average prompt throughput in tokens/s."},
-                            {"value",  n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
-                    },{
-                            {"name",  "predicted_tokens_seconds"},
-                            {"help",  "Average generation throughput in tokens/s."},
-                            {"value",  n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
-                     },{
-                            {"name",  "kv_cache_usage_ratio"},
-                            {"help",  "KV-cache usage. 1 means 100 percent usage."},
-                            {"value",  1. * kv_cache_used_cells / params.n_ctx}
-                     },{
-                            {"name",  "kv_cache_tokens"},
-                            {"help",  "KV-cache tokens."},
-                            {"value",  data["kv_cache_tokens_count"]}
-                    },{
-                            {"name",  "requests_processing"},
-                            {"help",  "Number of request processing."},
-                            {"value",  data["processing"]}
-                  },{
-                            {"name",  "requests_deferred"},
-                            {"help",  "Number of request deferred."},
-                            {"value",  data["deferred"]}
-                  }}}
-            };
-
-            std::stringstream prometheus;
-            for (const auto& el : all_metrics_def.items()) {
-                const auto& type = el.key();
-                const auto& metrics_def = el.value();
-                for (const auto& metric_def : metrics_def) {
-                    std::string name = metric_def["name"];
-                    std::string help = metric_def["help"];
-                    prometheus << "# HELP llamacpp:" << name << " " << help                << "\n"
-                               << "# TYPE llamacpp:" << name << " " << type                << "\n"
-                               << "llamacpp:"        << name << " " << metric_def["value"] << "\n";
-                }
-            }
-
-            res.set_content(prometheus.str(), "text/plain; version=0.0.4");
-            res.status = 200; // HTTP OK
-        });
-    }
-
-    svr.set_logger(log_server_request);
-
-    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
-            {
-                const char fmt[] = "500 Internal Server Error\n%s";
-                char buf[BUFSIZ];
-                try
-                {
-                    std::rethrow_exception(std::move(ep));
-                }
-                catch (std::exception &e)
-                {
-                    snprintf(buf, sizeof(buf), fmt, e.what());
-                }
-                catch (...)
-                {
-                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
-                }
-                res.set_content(buf, "text/plain; charset=utf-8");
-                res.status = 500;
-            });
-
-    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
-            {
-                if (res.status == 401)
-                {
-                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
-                }
-                if (res.status == 400)
-                {
-                    res.set_content("Invalid request", "text/plain; charset=utf-8");
-                }
-                else if (res.status == 404)
-                {
-                    res.set_content("File Not Found", "text/plain; charset=utf-8");
-                    res.status = 404;
-                }
-            });
+    svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) {
+        if (res.status == 404) {
+            res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND));
+        }
+        // for other error codes, we skip processing here because it's already done by res_error()
+    });
 
     // set timeouts and change hostname and port
-    svr.set_read_timeout (sparams.read_timeout);
-    svr.set_write_timeout(sparams.write_timeout);
-
-    if (!svr.bind_to_port(sparams.hostname, sparams.port))
-    {
-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
-        return 1;
-    }
-
-    // Set the base directory for serving static files
-    svr.set_base_dir(sparams.public_path);
+    svr->set_read_timeout (params.timeout_read);
+    svr->set_write_timeout(params.timeout_write);
 
     std::unordered_map<std::string, std::string> log_data;
-    log_data["hostname"] = sparams.hostname;
-    log_data["port"] = std::to_string(sparams.port);
 
-    if (sparams.api_keys.size() == 1) {
-        log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
-    } else if (sparams.api_keys.size() > 1) {
-        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
+    log_data["hostname"] = params.hostname;
+    log_data["port"]     = std::to_string(params.port);
+
+    if (params.api_keys.size() == 1) {
+        auto key = params.api_keys[0];
+        log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
+    } else if (params.api_keys.size() > 1) {
+        log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
     }
 
-    // load the model
-    if (!llama.load_model(params))
-    {
-        state.store(SERVER_STATE_ERROR);
-        return 1;
-    } else {
-        llama.initialize();
-        state.store(SERVER_STATE_READY);
-        LOG_INFO("model loaded", {});
-    }
+    // Necessary similarity of prompt for slot selection
+    ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
 
-    if (sparams.chat_template.empty()) { // custom chat template is not supplied
-        // check if the template comes with the model is supported by us
-        llama.validate_model_chat_template(sparams);
-    }
+    //
+    // Middlewares
+    //
+
+    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/models",
+            "/v1/models",
+        };
 
-    // Middleware for API key validation
-    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
         // If API key is not set, skip validation
-        if (sparams.api_keys.empty()) {
+        if (params.api_keys.empty()) {
+            return true;
+        }
+
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
             return true;
         }
 
         // Check for API key in the header
         auto auth_header = req.get_header_value("Authorization");
+
         std::string prefix = "Bearer ";
         if (auth_header.substr(0, prefix.size()) == prefix) {
             std::string received_api_key = auth_header.substr(prefix.size());
-            if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+            if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
                 return true; // API key is valid
             }
         }
 
         // API key is invalid or not provided
-        res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
-        res.status = 401; // Unauthorized
+        res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
 
-        LOG_WARNING("Unauthorized: Invalid API Key", {});
+        LOG_WRN("Unauthorized: Invalid API Key\n");
 
         return false;
     };
 
-    // this is only called if no index.html is found in the public --path
-    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
-                return false;
-            });
+    auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
+        server_state current_state = state.load();
+        if (current_state == SERVER_STATE_LOADING_MODEL) {
+            auto tmp = string_split<std::string>(req.path, '.');
+            if (req.path == "/" || tmp.back() == "html") {
+                res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
+                res.status = 503;
+            } else {
+                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+            }
+            return false;
+        }
+        return true;
+    };
 
-    // this is only called if no index.js is found in the public --path
-    svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
-                return false;
-            });
-
-    // this is only called if no index.html is found in the public --path
-    svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
-                return false;
-            });
-
-    // this is only called if no index.html is found in the public --path
-    svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
-                return false;
-            });
-
-    svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                json data = {
-                    { "user_name",      llama.name_user.c_str() },
-                    { "assistant_name", llama.name_assistant.c_str() },
-                    { "default_generation_settings", llama.default_generation_settings_for_props },
-                    { "total_slots",    llama.params.n_parallel }
-                };
-                res.set_content(data.dump(), "application/json; charset=utf-8");
-            });
-
-    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                if (!validate_api_key(req, res)) {
-                    return;
-                }
-                json data = json::parse(req.body);
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, data, false, false, -1);
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.queue_results.recv(task_id);
-                    if (!result.error && result.stop) {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
-                    }
-                    else
-                    {
-                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                    }
-                    llama.queue_results.remove_waiting_task_id(task_id);
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
-                    {
-                        while (true)
-                        {
-                            task_result result = llama.queue_results.recv(task_id);
-                            if (!result.error) {
-                                const std::string str =
-                                    "data: " +
-                                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                                if (result.stop) {
-                                    break;
-                                }
-                            } else {
-                                const std::string str =
-                                    "error: " +
-                                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                                break;
-                            }
-                        }
-
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                        sink.done();
-                        return true;
-                    };
-
-                    auto on_complete = [task_id, &llama] (bool)
-                    {
-                        // cancel
-                        llama.request_cancel(task_id);
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                    };
-
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-                }
-            });
-
-    svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                std::time_t t = std::time(0);
-
-                json models = {
-                    {"object", "list"},
-                    {"data", {
-                        {
-                            {"id", params.model_alias},
-                            {"object", "model"},
-                            {"created", t},
-                            {"owned_by", "llamacpp"}
-                        },
-                    }}
-                };
-
-                res.set_content(models.dump(), "application/json; charset=utf-8");
-            });
-
-    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
-    {
+    // register server middlewares
+    svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        if (!validate_api_key(req, res)) {
+        // If this is OPTIONS request, skip validation because browsers don't include Authorization header
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Credentials", "true");
+            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
+            res.set_header("Access-Control-Allow-Headers",     "*");
+            res.set_content("", "text/html"); // blank response, no data
+            return httplib::Server::HandlerResponse::Handled; // skip further processing
+        }
+        if (!middleware_server_state(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        if (!middleware_validate_api_key(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
+    //
+    // Route handlers (or controllers)
+    //
+
+    const auto handle_health = [&](const httplib::Request &, httplib::Response & res) {
+        // error and loading states are handled by middleware
+        json health = {{"status", "ok"}};
+        res_ok(res, health);
+    };
+
+    const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
+        if (!params.endpoint_slots) {
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
-        json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
 
-        const int task_id = llama.queue_tasks.get_new_id();
-        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, -1);
+        // request slots data using task queue
+        server_task task(SERVER_TASK_TYPE_METRICS);
+        task.id = ctx_server.queue_tasks.get_new_id();
+        ctx_server.queue_results.add_waiting_task_id(task.id);
+        ctx_server.queue_tasks.post(task, true); // high-priority task
 
-        if (!json_value(data, "stream", false)) {
-            std::string completion_text;
-            task_result result = llama.queue_results.recv(task_id);
+        // get the result
+        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
+        ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-            if (!result.error && result.stop) {
-                json oaicompat_result = format_final_response_oaicompat(data, result);
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
 
-                res.set_content(oaicompat_result.dump(-1, ' ', false,
-                                    json::error_handler_t::replace),
-                                    "application/json; charset=utf-8");
-            } else {
-                res.status = 500;
-                res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
+        // TODO: get rid of this dynamic_cast
+        auto res_metrics = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_metrics != nullptr);
+
+        // optionally return "fail_on_no_slot" error
+        if (req.has_param("fail_on_no_slot")) {
+            if (res_metrics->n_idle_slots == 0) {
+                res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE));
+                return;
             }
-            llama.queue_results.remove_waiting_task_id(task_id);
-        } else {
-            const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
-                while (true) {
-                    task_result llama_result = llama.queue_results.recv(task_id);
-                    if (!llama_result.error) {
-                        std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
+        }
 
-                        for (auto it = result_array.begin(); it != result_array.end(); ++it)
-                        {
-                            if (!it->empty()) {
-                                const std::string str =
-                                    "data: " +
-                                    it->dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", {{"to_send", str}});
-                                if (!sink.write(str.c_str(), str.size())) {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
+        res_ok(res, res_metrics->slots_data);
+    };
+
+    const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
+        if (!params.endpoint_metrics) {
+            res_error(res, format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        // request slots data using task queue
+        server_task task(SERVER_TASK_TYPE_METRICS);
+        task.id = ctx_server.queue_tasks.get_new_id();
+        task.metrics_reset_bucket = true;
+
+        ctx_server.queue_results.add_waiting_task_id(task.id);
+        ctx_server.queue_tasks.post(task, true); // high-priority task
+
+        // get the result
+        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
+        ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        // TODO: get rid of this dynamic_cast
+        auto res_metrics = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_metrics != nullptr);
+
+        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
+        json all_metrics_def = json {
+            {"counter", {{
+                    {"name",  "prompt_tokens_total"},
+                    {"help",  "Number of prompt tokens processed."},
+                    {"value",  (uint64_t) res_metrics->n_prompt_tokens_processed_total}
+            }, {
+                    {"name",  "prompt_seconds_total"},
+                    {"help",  "Prompt process time"},
+                    {"value",  (uint64_t) res_metrics->t_prompt_processing_total / 1.e3}
+            }, {
+                    {"name",  "tokens_predicted_total"},
+                    {"help",  "Number of generation tokens processed."},
+                    {"value",  (uint64_t) res_metrics->n_tokens_predicted_total}
+            }, {
+                    {"name",  "tokens_predicted_seconds_total"},
+                    {"help",  "Predict process time"},
+                    {"value",  (uint64_t) res_metrics->t_tokens_generation_total / 1.e3}
+            }, {
+                    {"name",  "n_decode_total"},
+                    {"help",  "Total number of llama_decode() calls"},
+                    {"value",  res_metrics->n_decode_total}
+            }, {
+                    {"name",  "n_busy_slots_per_decode"},
+                    {"help",  "Average number of busy slots per llama_decode() call"},
+                    {"value",  (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+            }}},
+            {"gauge", {{
+                    {"name",  "prompt_tokens_seconds"},
+                    {"help",  "Average prompt throughput in tokens/s."},
+                    {"value",  res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.}
+            },{
+                    {"name",  "predicted_tokens_seconds"},
+                    {"help",  "Average generation throughput in tokens/s."},
+                    {"value",  res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
+            },{
+                    {"name",  "kv_cache_usage_ratio"},
+                    {"help",  "KV-cache usage. 1 means 100 percent usage."},
+                    {"value",  1. * res_metrics->kv_cache_used_cells / params.n_ctx}
+            },{
+                    {"name",  "kv_cache_tokens"},
+                    {"help",  "KV-cache tokens."},
+                    {"value",  (uint64_t) res_metrics->kv_cache_tokens_count}
+            },{
+                    {"name",  "requests_processing"},
+                    {"help",  "Number of requests processing."},
+                    {"value",  (uint64_t) res_metrics->n_processing_slots}
+            },{
+                    {"name",  "requests_deferred"},
+                    {"help",  "Number of requests deferred."},
+                    {"value",  (uint64_t) res_metrics->n_tasks_deferred}
+            }}}
+        };
+
+        std::stringstream prometheus;
+
+        for (const auto & el : all_metrics_def.items()) {
+            const auto & type        = el.key();
+            const auto & metrics_def = el.value();
+
+            for (const auto & metric_def : metrics_def) {
+                const std::string name = metric_def.at("name");
+                const std::string help = metric_def.at("help");
+
+                auto value = json_value(metric_def, "value", 0.);
+                prometheus << "# HELP llamacpp:" << name << " " << help  << "\n"
+                            << "# TYPE llamacpp:" << name << " " << type  << "\n"
+                            << "llamacpp:"        << name << " " << value << "\n";
+            }
+        }
+
+        res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start));
+
+        res.set_content(prometheus.str(), "text/plain; version=0.0.4");
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_slots_save = [&ctx_server, &res_error, &res_ok, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data.at("filename");
+        if (!fs_validate_filename(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = params.slot_save_path + filename;
+
+        server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
+        task.id = ctx_server.queue_tasks.get_new_id();
+        task.slot_action.slot_id  = id_slot;
+        task.slot_action.filename = filename;
+        task.slot_action.filepath = filepath;
+
+        ctx_server.queue_results.add_waiting_task_id(task.id);
+        ctx_server.queue_tasks.post(task);
+
+        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
+        ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        res_ok(res, result->to_json());
+    };
+
+    const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, &params](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data.at("filename");
+        if (!fs_validate_filename(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = params.slot_save_path + filename;
+
+        server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
+        task.id = ctx_server.queue_tasks.get_new_id();
+        task.slot_action.slot_id  = id_slot;
+        task.slot_action.filename = filename;
+        task.slot_action.filepath = filepath;
+
+        ctx_server.queue_results.add_waiting_task_id(task.id);
+        ctx_server.queue_tasks.post(task);
+
+        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
+        ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        GGML_ASSERT(dynamic_cast<server_task_result_slot_save_load*>(result.get()) != nullptr);
+        res_ok(res, result->to_json());
+    };
+
+    const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
+        server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
+        task.id = ctx_server.queue_tasks.get_new_id();
+        task.slot_action.slot_id = id_slot;
+
+        ctx_server.queue_results.add_waiting_task_id(task.id);
+        ctx_server.queue_tasks.post(task);
+
+        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
+        ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        GGML_ASSERT(dynamic_cast<server_task_result_slot_erase*>(result.get()) != nullptr);
+        res_ok(res, result->to_json());
+    };
+
+    const auto handle_slots_action = [&params, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
+        if (params.slot_save_path.empty()) {
+            res_error(res, format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        std::string id_slot_str = req.path_params.at("id_slot");
+        int id_slot;
+
+        try {
+            id_slot = std::stoi(id_slot_str);
+        } catch (const std::exception &) {
+            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::string action = req.get_param_value("action");
+
+        if (action == "save") {
+            handle_slots_save(req, res, id_slot);
+        } else if (action == "restore") {
+            handle_slots_restore(req, res, id_slot);
+        } else if (action == "erase") {
+            handle_slots_erase(req, res, id_slot);
+        } else {
+            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+        }
+    };
+
+    const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+        // this endpoint is publicly available, please only return what is safe to be exposed
+        json data = {
+            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
+            { "total_slots",                 ctx_server.params_base.n_parallel },
+            { "model_path",                  ctx_server.params_base.model },
+            { "chat_template",               ctx_server.chat_templates.template_default->source() },
+            { "bos_token",                   ctx_server.chat_templates.template_default->bos_token() },
+            { "eos_token",                   ctx_server.chat_templates.template_default->eos_token() },
+            { "build_info",                  build_info },
+        };
+        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
+            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+        }
+
+        res_ok(res, data);
+    };
+
+    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params_base.endpoint_props) {
+            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        json data = json::parse(req.body);
+
+        // update any props here
+
+        res_ok(res, {{ "success", true }});
+    };
+
+    // handle completion-like requests (completion, chat, infill)
+    // we can optionally provide a custom format for partial results and final results
+    const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
+            server_task_type type,
+            json & data,
+            std::function<bool()> is_connection_closed,
+            httplib::Response & res,
+            oaicompat_type oaicompat) {
+        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
+
+        if (ctx_server.params_base.embedding) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        auto completion_id = gen_chatcmplid();
+        std::vector<server_task> tasks;
+
+        try {
+            const auto & prompt = data.at("prompt");
+            // TODO: this log can become very long, put it behind a flag or think about a more compact format
+            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+            tasks.reserve(tokenized_prompts.size());
+            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+                server_task task = server_task(type);
+
+                task.id    = ctx_server.queue_tasks.get_new_id();
+                task.index = i;
+
+                task.prompt_tokens    = std::move(tokenized_prompts[i]);
+                task.params           = server_task::params_from_json_cmpl(
+                                            ctx_server.ctx,
+                                            ctx_server.params_base,
+                                            data);
+                task.id_selected_slot = json_value(data, "id_slot", -1);
+
+                // OAI-compat
+                task.params.oaicompat                 = oaicompat;
+                task.params.oaicompat_cmpl_id         = completion_id;
+                // oaicompat_model is already populated by params_from_json_cmpl
+
+                tasks.push_back(task);
+            }
+        } catch (const std::exception & e) {
+            res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        ctx_server.queue_results.add_waiting_tasks(tasks);
+        ctx_server.queue_tasks.post(tasks);
+
+        bool stream = json_value(data, "stream", false);
+        const auto task_ids = server_task::get_list_id(tasks);
+
+        if (!stream) {
+            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+                if (results.size() == 1) {
+                    // single result
+                    res_ok(res, results[0]->to_json());
+                } else {
+                    // multiple results (multitask)
+                    json arr = json::array();
+                    for (auto & res : results) {
+                        arr.push_back(res->to_json());
+                    }
+                    res_ok(res, arr);
+                }
+            }, [&](const json & error_data) {
+                res_error(res, error_data);
+            }, is_connection_closed);
+
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+        } else {
+            const auto chunked_content_provider = [task_ids, &ctx_server, oaicompat](size_t, httplib::DataSink & sink) {
+                ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool {
+                    json res_json = result->to_json();
+                    if (res_json.is_array()) {
+                        for (const auto & res : res_json) {
+                            if (!server_sent_event(sink, "data", res)) {
+                                // sending failed (HTTP connection closed), cancel the generation
+                                return false;
                             }
                         }
-                        if (llama_result.stop) {
-                            break;
-                        }
+                        return true;
                     } else {
-                        const std::string str =
-                            "error: " +
-                            llama_result.result_json.dump(-1, ' ', false,
-                                    json::error_handler_t::replace) +
-                            "\n\n";
-                        LOG_VERBOSE("data stream", {{"to_send", str}});
-                        if (!sink.write(str.c_str(), str.size())) {
-                            llama.queue_results.remove_waiting_task_id(task_id);
-                            return false;
-                        }
-                        break;
+                        return server_sent_event(sink, "data", res_json);
                     }
+                }, [&](const json & error_data) {
+                    server_sent_event(sink, "error", error_data);
+                }, [&sink]() {
+                    // note: do not use req.is_connection_closed here because req is already destroyed
+                    return !sink.is_writable();
+                });
+                if (oaicompat != OAICOMPAT_TYPE_NONE) {
+                    static const std::string ev_done = "data: [DONE]\n\n";
+                    sink.write(ev_done.data(), ev_done.size());
                 }
                 sink.done();
-                llama.queue_results.remove_waiting_task_id(task_id);
-                return true;
+                return false;
             };
 
-            auto on_complete = [task_id, &llama](bool) {
-                // cancel request
-                llama.request_cancel(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
+            auto on_complete = [task_ids, &ctx_server] (bool) {
+                ctx_server.queue_results.remove_waiting_task_ids(task_ids);
             };
 
             res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
     };
 
-    svr.Post("/chat/completions", chat_completions);
-    svr.Post("/v1/chat/completions", chat_completions);
+    const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+        json data = json::parse(req.body);
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_NONE);
+    };
 
-    svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                if (!validate_api_key(req, res)) {
-                    return;
-                }
-                json data = json::parse(req.body);
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, data, true, false, -1);
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.queue_results.recv(task_id);
-                    if (!result.error && result.stop)
-                    {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
-                    }
-                    else
-                    {
-                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                    }
-                    llama.queue_results.remove_waiting_task_id(task_id);
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
-                        while (true)
-                        {
-                            task_result result = llama.queue_results.recv(task_id);
-                            if (!result.error) {
-                                const std::string str =
-                                "data: " +
-                                result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                                if (result.stop)
-                                {
-                                    break;
-                                }
-                            }
-                            else
-                            {
-                                break;
-                            }
-                        }
+    const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+        json data = oaicompat_completion_params_parse(json::parse(req.body));
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_COMPLETION);
+    };
 
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                        sink.done();
-                        return true;
-                    };
-
-                    auto on_complete = [task_id, &llama] (bool)
-                    {
-                        // cancel
-                        llama.request_cancel(task_id);
-                    };
-
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-                }
-            });
-
-    svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
-                { return res.set_content("", "application/json; charset=utf-8"); });
-
-    svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                std::vector<llama_token> tokens;
-                if (body.count("content") != 0)
-                {
-                    tokens = llama.tokenize(body["content"], false);
-                }
-                const json data = format_tokenizer_response(tokens);
-                return res.set_content(data.dump(), "application/json; charset=utf-8");
-            });
-
-    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                std::string content;
-                if (body.count("tokens") != 0)
-                {
-                    const std::vector<llama_token> tokens = body["tokens"];
-                    content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
-                }
-
-                const json data = format_detokenized_response(content);
-                return res.set_content(data.dump(), "application/json; charset=utf-8");
-            });
-
-    svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                json prompt;
-                if (body.count("content") != 0)
-                {
-                    prompt = body["content"];
-                }
-                else
-                {
-                    prompt = "";
-                }
-
-                json image_data;
-                if (body.count("image_data") != 0) {
-                    image_data = body["image_data"];
-                }
-                else
-                {
-                    image_data = "";
-                }
-
-                // create and queue the task
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
-
-                // get the result
-                task_result result = llama.queue_results.recv(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
-
-                // send the result
-                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
-            });
-
-    svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-
-                json prompt;
-                if (body.count("input") != 0)
-                {
-                    prompt = body["input"];
-                    // batch
-                    if(prompt.is_array()) {
-                        json data = json::array();
-                        int i = 0;
-                        for (const json &elem : prompt) {
-                            const int task_id = llama.queue_tasks.get_new_id();
-                            llama.queue_results.add_waiting_task_id(task_id);
-                            llama.request_completion(task_id, { {"prompt", elem}, { "n_predict", 0} }, false, true, -1);
-
-                            // get the result
-                            task_result result = llama.queue_results.recv(task_id);
-                            llama.queue_results.remove_waiting_task_id(task_id);
-
-                            json embedding = json{
-                                {"embedding", json_value(result.result_json, "embedding", json::array())},
-                                {"index", i++},
-                                {"object", "embedding"}
-                            };
-                            data.push_back(embedding);
-                        }
-                        json result = format_embeddings_response_oaicompat(body, data);
-                        return res.set_content(result.dump(), "application/json; charset=utf-8");
-                    }
-                }
-                else
-                {
-                    prompt = "";
-                }
-
-                // create and queue the task
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}}, false, true, -1);
-
-                // get the result
-                task_result result = llama.queue_results.recv(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
-
-                json data = json::array({json{
-                        {"embedding", json_value(result.result_json, "embedding", json::array())},
-                        {"index", 0},
-                        {"object", "embedding"}
-                    }}
-                );
-
-                json root = format_embeddings_response_oaicompat(body, data);
-
-                // send the result
-                return res.set_content(root.dump(), "application/json; charset=utf-8");
-            });
-
-    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
-    //     "Bus error: 10" - this is on macOS, it does not crash on Linux
-    //std::thread t2([&]()
-    /*{
-        bool running = true;
-        while (running)
-        {
-            running = llama.update_slots();
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+        // check model compatibility
+        std::string err;
+        if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
+            err += "prefix token is missing. ";
+        }
+        if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
+            err += "suffix token is missing. ";
+        }
+        if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
+            err += "middle token is missing. ";
+        }
+        if (!err.empty()) {
+            res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
+            return;
         }
-    }*/
-    //);
 
-    if (sparams.n_threads_http > 0) {
-        log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
-        svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+        json data = json::parse(req.body);
+
+        // validate input
+        if (data.contains("prompt") && !data.at("prompt").is_string()) {
+            // prompt is optional
+            res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST));
+        }
+
+        if (!data.contains("input_prefix")) {
+            res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST));
+        }
+
+        if (!data.contains("input_suffix")) {
+            res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST));
+        }
+
+        if (data.contains("input_extra") && !data.at("input_extra").is_array()) {
+            // input_extra is optional
+            res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        json input_extra = json_value(data, "input_extra", json::array());
+        for (const auto & chunk : input_extra) {
+            // { "text": string, "filename": string }
+            if (!chunk.contains("text") || !chunk.at("text").is_string()) {
+                res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+            // filename is optional
+            if (chunk.contains("filename") && !chunk.at("filename").is_string()) {
+                res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+        data["input_extra"] = input_extra; // default to empty array if it's not exist
+
+        std::string prompt = json_value(data, "prompt", std::string());
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
+        SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
+        data["prompt"] = format_infill(
+            ctx_server.vocab,
+            data.at("input_prefix"),
+            data.at("input_suffix"),
+            data.at("input_extra"),
+            ctx_server.params_base.n_batch,
+            ctx_server.params_base.n_predict,
+            ctx_server.slots[0].n_ctx, // TODO: there should be a better way
+            ctx_server.params_base.spm_infill,
+            tokenized_prompts[0]
+        );
+
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_INFILL,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
+    };
+
+    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+        LOG_DBG("request: %s\n", req.body.c_str());
+        if (ctx_server.params_base.embedding) {
+            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        auto body = json::parse(req.body);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            data,
+            req.is_connection_closed,
+            res,
+            OAICOMPAT_TYPE_CHAT);
+    };
+
+    // same with handle_chat_completions, but without inference part
+    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        auto body = json::parse(req.body);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
+    };
+
+    const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+        json models = {
+            {"object", "list"},
+            {"data", {
+                {
+                    {"id",       params.model_alias.empty() ? params.model : params.model_alias},
+                    {"object",   "model"},
+                    {"created",  std::time(0)},
+                    {"owned_by", "llamacpp"},
+                    {"meta",     ctx_server.model_meta()}
+                },
+             }}
+        };
+
+        res_ok(res, models);
+    };
+
+    const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        const json body = json::parse(req.body);
+
+        json tokens_response = json::array();
+        if (body.count("content") != 0) {
+            const bool add_special = json_value(body, "add_special", false);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = common_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
+        }
+
+        const json data = format_tokenizer_response(tokens_response);
+        res_ok(res, data);
+    };
+
+    const auto handle_detokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        const json body = json::parse(req.body);
+
+        std::string content;
+        if (body.count("tokens") != 0) {
+            const llama_tokens tokens = body.at("tokens");
+            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
+        }
+
+        const json data = format_detokenized_response(content);
+        res_ok(res, data);
+    };
+
+    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
+        const json body = json::parse(req.body);
+
+        if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+            res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        // for the shape of input/content, see tokenize_input_prompts()
+        json prompt;
+        if (body.count("input") != 0) {
+            prompt = body.at("input");
+        } else if (body.contains("content")) {
+            oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible
+            prompt = body.at("content");
+        } else {
+            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        bool use_base64 = false;
+        if (body.count("encoding_format") != 0) {
+            const std::string& format = body.at("encoding_format");
+            if (format == "base64") {
+                use_base64 = true;
+            } else if (format != "float") {
+                res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+        for (const auto & tokens : tokenized_prompts) {
+            // this check is necessary for models that do not add BOS token to the input
+            if (tokens.empty()) {
+                res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        }
+
+        // create and queue the task
+        json responses = json::array();
+        bool error = false;
+        {
+            std::vector<server_task> tasks;
+            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+                server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
+
+                task.id            = ctx_server.queue_tasks.get_new_id();
+                task.index         = i;
+                task.prompt_tokens = std::move(tokenized_prompts[i]);
+
+                // OAI-compat
+                task.params.oaicompat = oaicompat;
+
+                tasks.push_back(task);
+            }
+
+            ctx_server.queue_results.add_waiting_tasks(tasks);
+            ctx_server.queue_tasks.post(tasks);
+
+            // get the result
+            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+
+            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+                for (auto & res : results) {
+                    GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
+                    responses.push_back(res->to_json());
+                }
+            }, [&](const json & error_data) {
+                res_error(res, error_data);
+                error = true;
+            }, req.is_connection_closed);
+
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+        }
+
+        if (error) {
+            return;
+        }
+
+        // write JSON response
+        json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
+            ? format_embeddings_response_oaicompat(body, responses, use_base64)
+            : json(responses);
+        res_ok(res, root);
+    };
+
+    const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
+        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
+    };
+
+    const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
+        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING);
+    };
+
+    const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
+            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        const json body = json::parse(req.body);
+
+        // TODO: implement
+        //int top_n = 1;
+        //if (body.count("top_n") != 1) {
+        //    top_n = body.at("top_n");
+        //} else {
+        //    res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST));
+        //    return;
+        //}
+
+        json query;
+        if (body.count("query") == 1) {
+            query = body.at("query");
+            if (!query.is_string()) {
+                res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST));
+                return;
+            }
+        } else {
+            res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
+        if (documents.empty()) {
+            res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
+
+        // create and queue the task
+        json responses = json::array();
+        bool error = false;
+        {
+            std::vector<server_task> tasks;
+            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+            tasks.reserve(tokenized_docs.size());
+            for (size_t i = 0; i < tokenized_docs.size(); i++) {
+                server_task task   = server_task(SERVER_TASK_TYPE_RERANK);
+                task.id            = ctx_server.queue_tasks.get_new_id();
+                task.index         = i;
+                task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+                tasks.push_back(task);
+            }
+
+            ctx_server.queue_results.add_waiting_tasks(tasks);
+            ctx_server.queue_tasks.post(tasks);
+
+            // get the result
+            std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+
+            ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+                for (auto & res : results) {
+                    GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
+                    responses.push_back(res->to_json());
+                }
+            }, [&](const json & error_data) {
+                res_error(res, error_data);
+                error = true;
+            }, req.is_connection_closed);
+        }
+
+        if (error) {
+            return;
+        }
+
+        // write JSON response
+        json root = format_response_rerank(body, responses);
+        res_ok(res, root);
+    };
+
+    const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
+        json result = json::array();
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
+            result.push_back({
+                {"id", i},
+                {"path", lora.path},
+                {"scale", lora.scale},
+            });
+        }
+        res_ok(res, result);
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
+        const json body = json::parse(req.body);
+        if (!body.is_array()) {
+            res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        server_task task(SERVER_TASK_TYPE_SET_LORA);
+        task.id = ctx_server.queue_tasks.get_new_id();
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
+        ctx_server.queue_results.add_waiting_task_id(task.id);
+        ctx_server.queue_tasks.post(task);
+
+        server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
+        ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        GGML_ASSERT(dynamic_cast<server_task_result_apply_lora*>(result.get()) != nullptr);
+        res_ok(res, result->to_json());
+    };
+
+    //
+    // Router
+    //
+
+    if (!params.webui) {
+        LOG_INF("Web UI is disabled\n");
+    } else {
+        // register static assets routes
+        if (!params.public_path.empty()) {
+            // Set the base directory for serving static files
+            bool is_found = svr->set_mount_point("/", params.public_path);
+            if (!is_found) {
+                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                return 1;
+            }
+        } else {
+            // using embedded static index.html
+            svr->Get("/", [](const httplib::Request & req, httplib::Response & res) {
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                    // COEP and COOP headers, required by pyodide (python interpreter)
+                    res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                    res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+                    res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
+                }
+                return false;
+            });
+        }
     }
 
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
+    // register API routes
+    svr->Get ("/health",              handle_health); // public endpoint (no API key check)
+    svr->Get ("/metrics",             handle_metrics);
+    svr->Get ("/props",               handle_props);
+    svr->Post("/props",               handle_props_change);
+    svr->Get ("/models",              handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check)
+    svr->Post("/completion",          handle_completions); // legacy
+    svr->Post("/completions",         handle_completions);
+    svr->Post("/v1/completions",      handle_completions_oai);
+    svr->Post("/chat/completions",    handle_chat_completions);
+    svr->Post("/v1/chat/completions", handle_chat_completions);
+    svr->Post("/infill",              handle_infill);
+    svr->Post("/embedding",           handle_embeddings); // legacy
+    svr->Post("/embeddings",          handle_embeddings);
+    svr->Post("/v1/embeddings",       handle_embeddings_oai);
+    svr->Post("/rerank",              handle_rerank);
+    svr->Post("/reranking",           handle_rerank);
+    svr->Post("/v1/rerank",           handle_rerank);
+    svr->Post("/v1/reranking",        handle_rerank);
+    svr->Post("/tokenize",            handle_tokenize);
+    svr->Post("/detokenize",          handle_detokenize);
+    svr->Post("/apply-template",      handle_apply_template);
+    // LoRA adapters hotswap
+    svr->Get ("/lora-adapters",       handle_lora_adapters_list);
+    svr->Post("/lora-adapters",       handle_lora_adapters_apply);
+    // Save & load slots
+    svr->Get ("/slots",               handle_slots);
+    svr->Post("/slots/:id_slot",      handle_slots_action);
 
-                return 0;
-            });
+    //
+    // Start the server
+    //
+    if (params.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] =  std::to_string(params.n_threads_http);
+    svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
 
-    llama.queue_tasks.on_new_task(std::bind(
-        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_run_slots(std::bind(
-        &llama_server_context::update_slots, &llama));
-    llama.queue_results.on_multitask_update(std::bind(
-        &llama_server_queue::update_multitask,
-        &llama.queue_tasks,
-        std::placeholders::_1,
-        std::placeholders::_2,
-        std::placeholders::_3
-    ));
+    // clean up function, to be called before exit
+    auto clean_up = [&svr]() {
+        svr->stop();
+        llama_backend_free();
+    };
+
+    // bind HTTP listen port
+    bool was_bound = false;
+    if (params.port == 0) {
+        int bound_port = svr->bind_to_any_port(params.hostname);
+        if ((was_bound = (bound_port >= 0))) {
+            params.port = bound_port;
+        }
+    } else {
+        was_bound = svr->bind_to_port(params.hostname, params.port);
+    }
+
+    if (!was_bound) {
+        //LOG_ERROR("couldn't bind HTTP server socket", {
+        //    {"hostname", params.hostname},
+        //    {"port", params.port},
+        //});
+        LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
+        clean_up();
+        return 1;
+    }
+
+    // run the HTTP server in a thread
+    std::thread t([&]() { svr->listen_after_bind(); });
+    svr->wait_until_ready();
+
+    LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http);
+
+    // load the model
+    LOG_INF("%s: loading model\n", __func__);
+
+    if (!ctx_server.load_model(params)) {
+        clean_up();
+        t.join();
+        LOG_ERR("%s: exiting due to model loading error\n", __func__);
+        return 1;
+    }
+
+    ctx_server.init();
+    state.store(SERVER_STATE_READY);
+
+    LOG_INF("%s: model loaded\n", __func__);
+
+    // print sample chat example to make it clear which template is used
+    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+        ctx_server.chat_templates.template_default->source().c_str(),
+        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
+
+    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
+        ctx_server.process_single_task(task);
+    });
+
+    ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
+        ctx_server.update_slots();
+    });
 
     shutdown_handler = [&](int) {
-        llama.queue_tasks.terminate();
+        ctx_server.queue_tasks.terminate();
     };
 
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    ctx_server.queue_tasks.start_loop();
+
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
     sigint_action.sa_handler = signal_handler;
     sigemptyset (&sigint_action.sa_mask);
     sigint_action.sa_flags = 0;
     sigaction(SIGINT, &sigint_action, NULL);
+    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
     };
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
-    llama.queue_tasks.start_loop();
-    svr.stop();
+
+    clean_up();
     t.join();
 
-    llama_backend_free();
     return 0;
 }
diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore
new file mode 100644
index 000000000..90ee7fe6d
--- /dev/null
+++ b/examples/server/tests/.gitignore
@@ -0,0 +1,2 @@
+.venv
+tmp
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 0b9fdc4e7..1de0eb30e 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,47 +1,66 @@
 # Server tests
 
-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
- * [issues.feature](./features/issues.feature) Pending issues scenario
- * [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
- * [security.feature](./features/security.feature) Security, CORS and API Key
- * [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
+Python based server tests scenario using [pytest](https://docs.pytest.org/en/stable/).
 
 Tests target GitHub workflows job runners with 4 vCPU.
 
-Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client.
-
-Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`.
+Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
+To mitigate it, you can increase values in `n_predict`, `kv_size`.
 
 ### Install dependencies
+
 `pip install -r requirements.txt`
 
 ### Run tests
+
 1. Build the server
+
 ```shell
 cd ../../..
-mkdir build
-cd build
-cmake ../
-cmake --build . --target server
+cmake -B build -DLLAMA_CURL=ON
+cmake --build build --target llama-server
 ```
-2. download required models:
-   1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
-3. Start the test: `./tests.sh`
+
+2. Start the test: `./tests.sh`
 
 It's possible to override some scenario steps values with environment variables:
- - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
- - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
- - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
 
-### Run @bug, @wip or @wrong_usage annotated scenario
+| variable                 | description                                                                                    |
+|--------------------------|------------------------------------------------------------------------------------------------|
+| `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
+| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                         |
+| `DEBUG`                  | to enable steps and server verbose mode `--verbose`                                       |
+| `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                |
+| `LLAMA_CACHE`            | by default server tests re-download models to the `tmp` subfolder. Set this to your cache (e.g. `$HOME/Library/Caches/llama.cpp` on Mac or `$HOME/.cache/llama.cpp` on Unix) to avoid this |
 
-Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
-- `@bug` annotation aims to link a scenario with a GitHub issue.
-- `@wrong_usage` are meant to show user issue that are actually an expected behavior
-- `@wip` to focus on a scenario working in progress
+To run slow tests (will download many models, make sure to set `LLAMA_CACHE` if needed):
 
-To run a scenario annotated with `@bug`, start:
-`DEBUG=ON ./tests.sh --no-skipped --tags bug`
+```shell
+SLOW_TESTS=1 ./tests.sh
+```
 
-After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
+To run with stdout/stderr display in real time (verbose output, but useful for debugging):
+
+```shell
+DEBUG=1 ./tests.sh -s -v -x
+```
+
+To run all the tests in a file:
+
+```shell
+./tests.sh unit/test_chat_completion.py.py -v -x
+```
+
+To run a single test:
+
+```shell
+./tests.sh unit/test_chat_completion.py::test_invalid_chat_completion_req
+```
+
+Hint: You can compile and run test in single command, useful for local developement:
+
+```shell
+cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh
+```
+
+To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
diff --git a/examples/server/tests/conftest.py b/examples/server/tests/conftest.py
new file mode 100644
index 000000000..017d1bb84
--- /dev/null
+++ b/examples/server/tests/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+from utils import *
+
+
+# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test
+@pytest.fixture(autouse=True)
+def stop_server_after_each_test():
+    # do nothing before each test
+    yield
+    # stop all servers after each test
+    instances = set(
+        server_instances
+    )  # copy the set to prevent 'Set changed size during iteration'
+    for server in instances:
+        server.stop()
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
deleted file mode 100644
index 09e826747..000000000
--- a/examples/server/tests/features/environment.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import os
-import socket
-import subprocess
-import time
-from contextlib import closing
-from signal import SIGKILL
-
-
-def before_scenario(context, scenario):
-    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
-    port = 8080
-    if 'PORT' in os.environ:
-        port = int(os.environ['PORT'])
-    if is_server_listening("localhost", port):
-        assert False, "Server already started"
-
-
-def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
-
-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
-
-    print(f"stopping server pid={context.server_process.pid} ...")
-    context.server_process.kill()
-    # Wait few for socket to free up
-    time.sleep(0.05)
-
-    attempts = 0
-    while is_server_listening(context.server_fqdn, context.server_port):
-        print(f"stopping server pid={context.server_process.pid} ...")
-        os.kill(context.server_process.pid, SIGKILL)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            print(f"Server dangling exits, killing all {context.server_path} ...")
-            process = subprocess.run(['killall', '-9', context.server_path],
-                                     stderr=subprocess.PIPE,
-                                     universal_newlines=True)
-            print(process)
-
-
-def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        return result == 0
-
-
-def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    import errno
-    if pid < 0:
-        return False
-    try:
-        os.kill(pid, 0)
-    except OSError as e:
-        return e.errno == errno.EPERM
-    else:
-        return True
diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
deleted file mode 100644
index bf5a175a3..000000000
--- a/examples/server/tests/features/issues.feature
+++ /dev/null
@@ -1,4 +0,0 @@
-# List of ongoing issues
-@bug
-Feature: Issues
-  # No confirmed issue at the moment
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
deleted file mode 100644
index 5f895cf90..000000000
--- a/examples/server/tests/features/parallel.feature
+++ /dev/null
@@ -1,145 +0,0 @@
-@llama.cpp
-Feature: Parallel
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a model alias tinyllama-2
-    And   42 as server seed
-    And   64 KV cache size
-    And   2 slots
-    And   embeddings extraction
-    And   continuous batching
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Multi users completion
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And <n_predict> max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | n_predict |
-      | 128       |
-
-  Scenario Outline: Multi users OAI completions compatibility
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users OAI completions compatibility no v1
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests no v1
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    And 128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted
-
-  Scenario: Multi users embeddings
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: Multi users OAI compatibility embeddings
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    And a prompt:
-      """
-      What is the biggest US city ?
-      """
-    And a prompt:
-      """
-      What is the capital of Bulgaria ?
-      """
-    And   a model tinyllama-2
-    Given concurrent OAI embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
deleted file mode 100644
index db06d3977..000000000
--- a/examples/server/tests/features/security.feature
+++ /dev/null
@@ -1,50 +0,0 @@
-@llama.cpp
-Feature: Security
-
-  Background: Server startup with an api key defined
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a server api key llama.cpp
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Completion with some user api key
-    Given a prompt test
-    And   a user api key <api_key>
-    And   4 max tokens to predict
-    And   a completion request with <api_error> api error
-
-    Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackeme   | raised    |
-      |           | raised    |
-
-  Scenario Outline: OAI Compatibility
-    Given a system prompt test
-    And   a user prompt test
-    And   a model test
-    And   2 max tokens to predict
-    And   streaming is disabled
-    And   a user api key <api_key>
-    Given an OAI compatible chat completions request with <api_error> api error
-
-    Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackme    | raised    |
-
-
-  Scenario Outline: CORS Options
-    When an OPTIONS request is sent from <origin>
-    Then CORS header <cors_header> is set to <cors_header_value>
-
-    Examples: Headers
-      | origin          | cors_header                      | cors_header_value |
-      | localhost       | Access-Control-Allow-Origin      | localhost         |
-      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
-      | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | POST              |
-      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
deleted file mode 100644
index b571582a7..000000000
--- a/examples/server/tests/features/server.feature
+++ /dev/null
@@ -1,84 +0,0 @@
-@llama.cpp
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a model alias tinyllama-2
-    And   42 as server seed
-      # KV Cache corresponds to the total amount of tokens
-      # that can be stored across all independent sequences: #4130
-      # see --ctx-size and #5568
-    And   32 KV cache size
-    And   1 slots
-    And   embeddings extraction
-    And   32 server max tokens to predict
-    And   prometheus compatible metrics exposed
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Health
-    Then the server is ready
-    And  all slots are idle
-
-  Scenario Outline: Completion
-    Given a prompt <prompt>
-    And   <n_predict> max tokens to predict
-    And   a completion request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-    And   prometheus metrics are exposed
-
-    Examples: Prompts
-      | prompt                           | n_predict | re_content                             | n_predicted |
-      | I believe the meaning of life is | 8         | (read<or>going)+                       | 8           |
-      | Write a joke about AI            | 64        | (park<or>friends<or>scared<or>always)+ | 32          |
-
-  Scenario Outline: OAI Compatibility
-    Given a model <model>
-    And   a system prompt <system_prompt>
-    And   a user prompt <user_prompt>
-    And   <max_tokens> max tokens to predict
-    And   streaming is <enable_streaming>
-    Given an OAI compatible chat completions request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-
-    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                 | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom<or>what)+             | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks<or>happy<or>bird)+ | 32          | enabled          |
-
-  Scenario: Embedding
-    When embeddings are computed for:
-    """
-    What is the capital of Bulgaria ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility
-    Given a model tinyllama-2
-    When an OAI compatible embeddings computation request for:
-    """
-    What is the capital of Spain ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility with multiple inputs
-    Given a model tinyllama-2
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    When an OAI compatible embeddings computation request for multiple inputs
-    Then embeddings are generated
-
-
-  Scenario: Tokenize / Detokenize
-    When tokenizing:
-    """
-    What is the capital of France ?
-    """
-    Then tokens can be detokenize
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
deleted file mode 100644
index 381da105e..000000000
--- a/examples/server/tests/features/steps/steps.py
+++ /dev/null
@@ -1,827 +0,0 @@
-import asyncio
-import collections
-import json
-import os
-import re
-import socket
-import subprocess
-import time
-from contextlib import closing
-from re import RegexFlag
-
-import aiohttp
-import openai
-from behave import step
-from behave.api.async_step import async_run_until_complete
-from prometheus_client import parser
-
-
-@step(u"a server listening on {server_fqdn}:{server_port}")
-def step_server_config(context, server_fqdn, server_port):
-    context.server_fqdn = server_fqdn
-    context.server_port = int(server_port)
-    if 'PORT' in os.environ:
-        context.server_port = int(os.environ['PORT'])
-        print(f"$PORT set, overriding server port with to {context.server_port}")
-
-    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
-
-    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
-    context.model_alias = None
-    context.n_ctx = None
-    context.n_predict = None
-    context.n_server_predict = None
-    context.n_slots = None
-    context.server_api_key = None
-    context.server_continuous_batching = False
-    context.server_embeddings = False
-    context.server_metrics = False
-    context.server_process = None
-    context.server_seed = None
-    context.user_api_key = None
-
-    context.tasks_result = []
-    context.concurrent_tasks = []
-    context.prompts = []
-
-
-@step(u'a model file {model_file}')
-def step_model_file(context, model_file):
-    context.model_file = model_file
-
-
-@step(u'a model alias {model_alias}')
-def step_model_alias(context, model_alias):
-    context.model_alias = model_alias
-
-
-@step(u'{seed} as server seed')
-def step_seed(context, seed):
-    context.server_seed = int(seed)
-
-
-@step(u'{n_ctx} KV cache size')
-def step_n_ctx(context, n_ctx):
-    context.n_ctx = int(n_ctx)
-
-
-@step(u'{n_slots} slots')
-def step_n_slots(context, n_slots):
-    context.n_slots = int(n_slots)
-
-
-@step(u'{n_predict} server max tokens to predict')
-def step_server_n_predict(context, n_predict):
-    context.n_server_predict = int(n_predict)
-
-
-@step(u'continuous batching')
-def step_server_continuous_batching(context):
-    context.server_continuous_batching = True
-
-
-@step(u'embeddings extraction')
-def step_server_embeddings(context):
-    context.server_embeddings = True
-
-
-@step(u'prometheus compatible metrics exposed')
-def step_server_metrics(context):
-    context.server_metrics = True
-
-
-@step(u"the server is starting")
-def step_start_server(context):
-    start_server_background(context)
-    attempts = 0
-    while True:
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-            result = sock.connect_ex((context.server_fqdn, context.server_port))
-            if result == 0:
-                print("\x1b[33;46mserver started!\x1b[0m")
-                return
-            attempts += 1
-            if attempts > 20:
-                assert False, "server not started"
-            print(f"waiting for server to start, connect error code = {result}...")
-            time.sleep(0.1)
-
-
-@step(u"the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_the_server_to_be_started(context, expecting_status):
-    match expecting_status:
-        case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
-
-        case 'ready' | 'idle':
-            await wait_for_health_status(context, context.base_url, 200, 'ok',
-                                         params={'fail_on_no_slot': 0, 'include_slots': 0},
-                                         slots_idle=context.n_slots,
-                                         slots_processing=0,
-                                         expected_slots=[{'id': slot_id, 'state': 0}
-                                                         for slot_id in range(context.n_slots)])
-        case 'busy':
-            await wait_for_health_status(context, context.base_url, 503,
-                                         'no slot available',
-                                         params={'fail_on_no_slot': 0, 'include_slots': 0},
-                                         slots_idle=0,
-                                         slots_processing=context.n_slots,
-                                         expected_slots=[{'id': slot_id, 'state': 1}
-                                                         for slot_id in range(context.n_slots)])
-        case _:
-            assert False, "unknown status"
-
-
-@step(u'all slots are {expected_slot_status_string}')
-@async_run_until_complete
-async def step_all_slots_status(context, expected_slot_status_string):
-    match expected_slot_status_string:
-        case 'idle':
-            expected_slot_status = 0
-        case 'busy':
-            expected_slot_status = 1
-        case _:
-            assert False, "unknown status"
-
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
-                      for slot_id in range(context.n_slots)]
-    await request_slots_status(context, expected_slots)
-
-
-@step(u'a completion request with {api_error} api error')
-@async_run_until_complete
-async def step_request_completion(context, api_error):
-    expect_api_error = api_error == 'raised'
-    completion = await request_completion(context.prompts.pop(),
-                                          context.base_url,
-                                          debug=context.debug,
-                                          n_predict=context.n_predict,
-                                          server_seed=context.server_seed,
-                                          expect_api_error=expect_api_error,
-                                          user_api_key=context.user_api_key)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if expect_api_error:
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-
-
-@step(u'{predicted_n} tokens are predicted matching {re_content}')
-def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content)
-
-
-@step(u'{predicted_n} tokens are predicted')
-def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n))
-
-
-@step(u'a user prompt {user_prompt}')
-def step_user_prompt(context, user_prompt):
-    context.prompts.append(user_prompt)
-
-
-@step(u'a system prompt {system_prompt}')
-def step_system_prompt(context, system_prompt):
-    context.system_prompt = system_prompt
-
-
-@step(u'a model {model}')
-def step_model(context, model):
-    context.model = model
-
-
-@step(u'{max_tokens} max tokens to predict')
-def step_max_tokens(context, max_tokens):
-    context.n_predict = int(max_tokens)
-
-
-@step(u'streaming is {enable_streaming}')
-def step_streaming(context, enable_streaming):
-    context.enable_streaming = enable_streaming == 'enabled'
-
-
-@step(u'a user api key {user_api_key}')
-def step_user_api_key(context, user_api_key):
-    context.user_api_key = user_api_key
-
-
-@step(u'no user api key')
-def step_no_user_api_key(context):
-    context.user_api_key = None
-
-
-@step(u'a user api key ')
-def step_no_user_api_key_space(context):
-    context.user_api_key = None
-
-
-@step(u'a server api key {server_api_key}')
-def step_server_api_key(context, server_api_key):
-    context.server_api_key = server_api_key
-
-
-@step(u'an OAI compatible chat completions request with {api_error} api error')
-@async_run_until_complete
-async def step_oai_chat_completions(context, api_error):
-    if context.debug:
-        print(f"Submitting OAI compatible completions request...")
-    expect_api_error = api_error == 'raised'
-    completion = await oai_chat_completions(context.prompts.pop(),
-                                            context.system_prompt,
-                                            context.base_url,
-                                            '/v1/chat',
-                                            False,
-                                            model=context.model if hasattr(context, 'model') else None,
-
-                                            n_predict=context.n_predict
-                                            if hasattr(context, 'n_predict') else None,
-
-                                            enable_streaming=context.enable_streaming
-                                            if hasattr(context, 'enable_streaming') else None,
-
-                                            server_seed=context.server_seed
-                                            if hasattr(context, 'server_seed') else None,
-
-                                            user_api_key=context.user_api_key
-                                            if hasattr(context, 'user_api_key') else None,
-
-                                            expect_api_error=expect_api_error)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if expect_api_error:
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-
-    if context.debug:
-        print(f"Completion response: {completion}")
-
-
-@step(u'a prompt')
-def step_a_prompt(context):
-    context.prompts.append(context.text)
-
-
-@step(u'a prompt {prompt}')
-def step_a_prompt_prompt(context, prompt):
-    context.prompts.append(prompt)
-
-
-@step(u'concurrent completion requests')
-@async_run_until_complete()
-async def step_concurrent_completion_requests(context):
-    await concurrent_requests(context,
-                              request_completion,
-                              # prompt is inserted automatically
-                              context.base_url,
-                              debug=context.debug,
-                              n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
-                              server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
-                              user_api_key=context.user_api_key if hasattr(context,
-                                                                           'user_api_key') else None)
-
-
-@step(u'concurrent OAI completions requests')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/v1/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              server_seed=context.server_seed
-                              if hasattr(context, 'server_seed') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step(u'concurrent OAI completions requests no v1')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              server_seed=context.server_seed
-                              if hasattr(context, 'server_seed') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step(u'all prompts are predicted')
-@async_run_until_complete
-async def step_all_prompts_are_predicted(context):
-    await all_prompts_are_predicted(context)
-
-
-@step(u'all prompts are predicted with {n_predict} tokens')
-@async_run_until_complete
-async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict):
-    expected_predicted_n = int(n_predict)
-    await all_prompts_are_predicted(context, expected_predicted_n)
-
-
-async def all_prompts_are_predicted(context, expected_predicted_n=None):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions > 0
-    for i in range(n_completions):
-        assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
-    assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
-
-
-@step(u'embeddings are computed for')
-@async_run_until_complete
-async def step_compute_embedding(context):
-    context.embeddings = await request_embedding(context.text, base_url=context.base_url)
-
-
-@step(u'embeddings are generated')
-def step_assert_embeddings(context):
-    if len(context.prompts) == 0:
-        assert_embeddings(context.embeddings)
-    else:
-        assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
-                                                                 f"context.prompts={context.prompts}\n"
-                                                                 f"context.embeddings={context.embeddings}")
-        for embedding in context.embeddings:
-            context.prompts.pop()
-            assert_embeddings(embedding)
-
-
-@step(u'an OAI compatible embeddings computation request for')
-@async_run_until_complete
-async def step_oai_compute_embeddings(context):
-    context.embeddings = await request_oai_embeddings(context.text,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-
-
-@step(u'an OAI compatible embeddings computation request for multiple inputs')
-@async_run_until_complete
-async def step_oai_compute_embeddings_multiple_inputs(context):
-    context.embeddings = await request_oai_embeddings(context.prompts,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-
-
-@step(u'concurrent embedding requests')
-@async_run_until_complete()
-async def step_concurrent_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_embedding,
-                              # prompt is inserted automatically
-                              base_url=context.base_url)
-
-
-@step(u'concurrent OAI embedding requests')
-@async_run_until_complete()
-async def step_concurrent_oai_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_oai_embeddings,
-                              # prompt is inserted automatically
-                              base_url=context.base_url,
-                              async_client=True,
-                              model=context.model)
-
-
-@step(u'all embeddings are generated')
-@async_run_until_complete()
-async def all_embeddings_are_generated(context):
-    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests > 0
-    for i in range(n_embedding_requests):
-        assert_embeddings(context.tasks_result.pop())
-
-
-@step(u'tokenizing')
-@async_run_until_complete
-async def step_tokenize(context):
-    context.tokenized_text = context.text
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{context.base_url}/tokenize',
-                                json={
-                                    "content": context.tokenized_text,
-                                }) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens = tokenize_json['tokens']
-
-
-@step(u'tokens can be detokenize')
-@async_run_until_complete
-async def step_detokenize(context):
-    assert len(context.tokens) > 0
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{context.base_url}/detokenize',
-                                json={
-                                    "tokens": context.tokens,
-                                }) as response:
-            assert response.status == 200
-            detokenize_json = await response.json()
-            # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
-            assert context.tokenized_text == detokenize_json['content'].strip()
-
-
-@step(u'an OPTIONS request is sent from {origin}')
-@async_run_until_complete
-async def step_options_request(context, origin):
-    async with aiohttp.ClientSession() as session:
-        async with session.options(f'{context.base_url}/v1/chat/completions',
-                                   headers={"Origin": origin}) as response:
-            assert response.status == 200
-            context.options_response = response
-
-
-@step(u'CORS header {cors_header} is set to {cors_header_value}')
-def step_check_options_header_value(context, cors_header, cors_header_value):
-    assert context.options_response.headers[cors_header] == cors_header_value
-
-
-@step(u'prometheus metrics are exposed')
-@async_run_until_complete
-async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession() as session:
-        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
-            assert metrics_response.status == 200
-            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
-            metrics_raw = await metrics_response.text()
-            metric_exported = False
-            for metric in parser.text_string_to_metric_families(metrics_raw):
-                match metric.name:
-                    case "llamacpp:kv_cache_usage_ratio":
-                        assert len(metric.samples) > 0
-                        metric_exported = True
-            assert metric_exported, "No metrics exported"
-
-
-async def concurrent_requests(context, f_completion, *args, **kwargs):
-    n_prompts = len(context.prompts)
-    if context.debug:
-        print(f"starting {n_prompts} concurrent completion requests...")
-    assert n_prompts > 0
-    for prompt_no in range(n_prompts):
-        shifted_args = [context.prompts.pop(), *args]
-        context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.1)
-
-
-async def request_completion(prompt,
-                             base_url,
-                             debug=False,
-                             n_predict=None,
-                             server_seed=None,
-                             expect_api_error=None,
-                             user_api_key=None):
-    if debug:
-        print(f"Sending completion request: {prompt}")
-    origin = "my.super.domain"
-    headers = {
-        'Origin': origin
-    }
-    if user_api_key is not None:
-        if debug:
-            print(f"Set user_api_key: {user_api_key}")
-        headers['Authorization'] = f'Bearer {user_api_key}'
-
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/completion',
-                                json={
-                                    "prompt": prompt,
-                                    "n_predict": int(n_predict) if n_predict is not None else -1,
-                                    "seed": server_seed if server_seed is not None else 42
-                                },
-                                headers=headers) as response:
-            if expect_api_error is None or not expect_api_error:
-                assert response.status == 200
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                return await response.json()
-            else:
-                return response.status
-
-
-async def oai_chat_completions(user_prompt,
-                               system_prompt,
-                               base_url,
-                               base_path,
-                               async_client,
-                               debug=False,
-                               model=None,
-                               n_predict=None,
-                               enable_streaming=None,
-                               server_seed=None,
-                               user_api_key=None,
-                               expect_api_error=None):
-    if debug:
-        print(f"Sending OAI Chat completions request: {user_prompt}")
-    # openai client always expects an api key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    seed = server_seed if server_seed is not None else 42
-    enable_streaming = enable_streaming if enable_streaming is not None else False
-    payload = {
-        "messages": [
-            {
-                "role": "system",
-                "content": system_prompt,
-            },
-            {
-                "role": "user",
-                "content": user_prompt,
-            }
-        ],
-        "model": model,
-        "max_tokens": n_predict,
-        "stream": enable_streaming,
-        "seed": seed
-    }
-    completion_response = {
-        'content': '',
-        'timings': {
-            'predicted_n': 0
-        }
-    }
-    if async_client:
-        origin = 'llama.cpp'
-        headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
-            async with session.post(f'{base_url}{base_path}',
-                                    json=payload,
-                                    headers=headers) as response:
-                if enable_streaming:
-                    assert response.status == 200
-                    assert response.headers['Access-Control-Allow-Origin'] == origin
-                    assert response.headers['Content-Type'] == "text/event-stream"
-                    event_received = True
-                    while event_received:
-                        event_received = False
-                        async for line_in_bytes in response.content:
-                            line = line_in_bytes.decode('utf8')
-                            line = line.rstrip('\n').rstrip('\r')
-                            if line == '':
-                                continue
-                            event_data = line.split(': ', 1)
-                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
-                            chunk_raw = event_data[1]
-
-                            chunk = json.loads(chunk_raw)
-                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
-                            delta = chunk['choices'][0]['delta']
-                            if 'content' in delta:
-                                completion_response['content'] += delta['content']
-                                completion_response['timings']['predicted_n'] += 1
-                else:
-                    if expect_api_error is None or not expect_api_error:
-                        assert response.status == 200
-                        assert response.headers['Access-Control-Allow-Origin'] == origin
-                        assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                        chat_completion_raw = await response.json()
-                        completion_response = {
-                            'content': chat_completion_raw['choices'][0]['message'],
-                            'timings': {
-                                'predicted_n': chat_completion_raw['usage']['completion_tokens']
-                            }
-                        }
-                    else:
-                        return response.status
-    else:
-        try:
-            openai.api_key = user_api_key
-            openai.api_base = f'{base_url}{base_path}'
-            chat_completion = openai.Completion.create(
-                messages=payload['messages'],
-                model=model,
-                max_tokens=n_predict,
-                stream=enable_streaming,
-                seed=seed
-            )
-        except openai.error.APIError as e:
-            if expect_api_error is not None and expect_api_error:
-                return 401
-            else:
-                assert False, f'error raised: {e}'
-
-        if enable_streaming:
-            for chunk in chat_completion:
-                assert len(chunk.choices) == 1
-                delta = chunk.choices[0].delta
-                if 'content' in delta:
-                    completion_response['content'] += delta['content']
-                    completion_response['timings']['predicted_n'] += 1
-        else:
-            assert len(chat_completion.choices) == 1
-            completion_response = {
-                'content': chat_completion.choices[0].message.content,
-                'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens
-                }
-            }
-    if debug:
-        print("OAI response formatted to llama.cpp:", completion_response)
-    return completion_response
-
-
-async def request_embedding(content, base_url=None):
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/embedding',
-                                json={
-                                    "content": content,
-                                }) as response:
-            assert response.status == 200
-            response_json = await response.json()
-            return response_json['embedding']
-
-
-async def request_oai_embeddings(input,
-                                 base_url=None, user_api_key=None,
-                                 model=None, async_client=False):
-    # openai client always expects an api_key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    if async_client:
-        origin = 'llama.cpp'
-        if user_api_key is not None:
-            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
-            async with session.post(f'{base_url}/v1/embeddings',
-                                    json={
-                                        "input": input,
-                                        "model": model,
-                                    },
-                                    headers=headers) as response:
-                assert response.status == 200, f"received status code not expected: {response.status}"
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                response_json = await response.json()
-                assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
-                assert response_json['object'] == 'list'
-                return response_json['data']
-    else:
-        openai.api_key = user_api_key
-        openai.api_base = f'{base_url}/v1'
-        oai_embeddings = openai.Embedding.create(
-            model=model,
-            input=input,
-        )
-
-        if isinstance(input, collections.abc.Sequence):
-            embeddings = []
-            for an_oai_embeddings in oai_embeddings.data:
-                embeddings.append(an_oai_embeddings.embedding)
-        else:
-            embeddings = oai_embeddings.data.embedding
-        return embeddings
-
-
-def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
-    content = completion_response['content']
-    n_predicted = completion_response['timings']['predicted_n']
-    assert len(content) > 0, "no token predicted"
-    if expected_predicted_n is not None:
-        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
-                                                     f' {n_predicted} <> {expected_predicted_n}')
-    if re_content is not None:
-        re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
-        assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
-            f'invalid tokens predicted:'
-            f' ```\n{content}\n``` do not match /{re_content}/')
-
-
-async def gather_tasks_results(context):
-    n_tasks = len(context.concurrent_tasks)
-    if context.debug:
-        print(f"Waiting for all {n_tasks} tasks results...")
-    for task_no in range(n_tasks):
-        context.tasks_result.append(await context.concurrent_tasks.pop())
-    n_completions = len(context.tasks_result)
-    return n_completions
-
-
-async def wait_for_health_status(context,
-                                 base_url,
-                                 expected_http_status_code,
-                                 expected_health_status,
-                                 params=None,
-                                 slots_idle=None,
-                                 slots_processing=None,
-                                 expected_slots=None):
-    if context.debug:
-        print(f"Starting checking for health for expected_health_status={expected_health_status}")
-    timeout = 3  # seconds
-    if expected_health_status == 'ok':
-        timeout = 10 # CI slow inference
-    interval = 0.5
-    counter = 0
-    async with aiohttp.ClientSession() as session:
-        while True:
-            async with await session.get(f'{base_url}/health', params=params) as health_response:
-                status_code = health_response.status
-                health = await health_response.json()
-                if context.debug:
-                    print(f"HEALTH - response for expected health status='{expected_health_status}' on "
-                          f"'{base_url}/health'?{params} is {health}")
-                if (status_code == expected_http_status_code
-                        and health['status'] == expected_health_status
-                        and (slots_idle is None or health['slots_idle'] == slots_idle)
-                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
-                    if expected_slots is not None:
-                        assert_slots_status(health['slots'], expected_slots)
-                    return
-                if (status_code == expected_http_status_code
-                        and health['status'] == expected_health_status
-                        and (slots_idle is None or health['slots_idle'] == slots_idle)
-                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
-                    if expected_slots is not None:
-                        assert_slots_status(health['slots'], expected_slots)
-                    return
-            await asyncio.sleep(interval)
-
-            counter += interval
-            if counter >= timeout:
-                # Sometimes health requests are triggered after completions are predicted
-                if expected_http_status_code == 503:
-                    if len(context.tasks_result) == 0:
-                        print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
-                              " busy health check missed, probably too fast inference\x1b[0m")
-                        n_completions = await gather_tasks_results(context)
-                        if n_completions > 0:
-                            return
-
-                assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
-
-
-def assert_embeddings(embeddings):
-    assert len(embeddings) > 0
-    embeddings_computed = False
-    for emb in embeddings:
-        if emb != 0:
-            embeddings_computed = True
-    assert embeddings_computed, f"Embeddings: {embeddings}"
-
-
-async def request_slots_status(context, expected_slots):
-    async with aiohttp.ClientSession() as session:
-        async with await session.get(f'{context.base_url}/slots') as slots_response:
-            assert slots_response.status == 200
-            slots = await slots_response.json()
-            assert_slots_status(slots, expected_slots)
-
-
-def assert_slots_status(slots, expected_slots):
-    assert len(slots) == len(expected_slots)
-    for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
-        for key in expected:
-            assert expected[key] == slot[key], (f"invalid slot {slot_id}"
-                                                f" expected[{key}] != slot[{key}]"
-                                                f" = {expected[key]} != {slot[key]}")
-
-
-def start_server_background(context):
-    context.server_path = '../../../build/bin/server'
-    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
-        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
-    server_args = [
-        '--host', context.server_fqdn,
-        '--port', context.server_port,
-        '--model', context.model_file
-    ]
-    if context.server_continuous_batching:
-        server_args.append('--cont-batching')
-    if context.server_embeddings:
-        server_args.append('--embedding')
-    if context.server_metrics:
-        server_args.append('--metrics')
-    if context.model_alias is not None:
-        server_args.extend(['--alias', context.model_alias])
-    if context.n_ctx is not None:
-        server_args.extend(['--ctx-size', context.n_ctx])
-    if context.n_slots is not None:
-        server_args.extend(['--parallel', context.n_slots])
-    if context.n_server_predict is not None:
-        server_args.extend(['--n-predict', context.n_server_predict])
-    if context.server_api_key is not None:
-        server_args.extend(['--api-key', context.server_api_key])
-    if context.debug:
-        server_args.append('--verbose')
-    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
-        server_args.extend(['--log-format', "text"])
-    print(f"starting server with: {context.server_path}", *server_args)
-    context.server_process = subprocess.Popen(
-        [str(arg) for arg in [context.server_path, *server_args]],
-        close_fds=True)
-    print(f"server pid={context.server_process.pid}")
diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature
deleted file mode 100644
index e228b2371..000000000
--- a/examples/server/tests/features/wrong_usages.feature
+++ /dev/null
@@ -1,21 +0,0 @@
-# run with ./test.sh --tags wrong_usage
-@wrong_usage
-Feature: Wrong usage of llama.cpp server
-
-  #3969 The user must always set --n-predict option
-  # to cap the number of tokens any completion request can generate
-  # or pass n_predict/max_tokens in the request.
-  Scenario: Infinite loop
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    # Uncomment below to fix the issue
-    #And   64 server max tokens to predict
-    Then  the server is starting
-    Given a prompt:
-      """
-      Go to: infinite loop
-      """
-    # Uncomment below to fix the issue
-    #And   128 max tokens to predict
-    Given concurrent completion requests
-    Then all prompts are predicted
diff --git a/examples/server/tests/pytest.ini b/examples/server/tests/pytest.ini
new file mode 100644
index 000000000..6df308df7
--- /dev/null
+++ b/examples/server/tests/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    serial
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 334fa4a70..15d024914 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,4 +1,8 @@
 aiohttp~=3.9.3
-behave~=1.2.6
-openai~=0.25.0
+pytest~=8.3.3
+huggingface_hub~=0.23.2
+numpy~=1.26.4
+openai~=1.55.3
 prometheus-client~=0.20.0
+requests~=2.32.3
+wget~=3.2
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 17a4e6fc6..33fa8cc64 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -1,12 +1,23 @@
 #!/bin/bash
 
+# make sure we are in the right directory
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
 set -eu
 
-if [ $# -lt 1 ]
-then
-  # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp
-else
-  behave "$@"
+if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
+    # Slow tests for tool calls need quite a few models ahead of time to avoid timing out.
+    python $SCRIPT_DIR/../../../scripts/fetch_server_test_models.py
 fi
 
+if [ $# -lt 1 ]
+then
+    if [[ "${SLOW_TESTS:-0}" == 1 ]]; then
+        pytest -v -x
+    else
+        pytest -v -x -m "not slow"
+    fi
+else
+    pytest "$@"
+fi
diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py
new file mode 100644
index 000000000..1485de8ce
--- /dev/null
+++ b/examples/server/tests/unit/test_basic.py
@@ -0,0 +1,96 @@
+import pytest
+import requests
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_server_start_simple():
+    global server
+    server.start()
+    res = server.make_request("GET", "/health")
+    assert res.status_code == 200
+
+
+def test_server_props():
+    global server
+    server.start()
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert ".gguf" in res.body["model_path"]
+    assert res.body["total_slots"] == server.n_slots
+    default_val = res.body["default_generation_settings"]
+    assert server.n_ctx is not None and server.n_slots is not None
+    assert default_val["n_ctx"] == server.n_ctx / server.n_slots
+    assert default_val["params"]["seed"] == server.seed
+
+
+def test_server_models():
+    global server
+    server.start()
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    assert len(res.body["data"]) == 1
+    assert res.body["data"][0]["id"] == server.model_alias
+
+
+def test_server_slots():
+    global server
+
+    # without slots endpoint enabled, this should return error
+    server.server_slots = False
+    server.start()
+    res = server.make_request("GET", "/slots")
+    assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED
+    assert "error" in res.body
+    server.stop()
+
+    # with slots endpoint enabled, this should return slots info
+    server.server_slots = True
+    server.n_slots = 2
+    server.start()
+    res = server.make_request("GET", "/slots")
+    assert res.status_code == 200
+    assert len(res.body) == server.n_slots
+    assert server.n_ctx is not None and server.n_slots is not None
+    assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots
+    assert "params" in res.body[0]
+    assert res.body[0]["params"]["seed"] == server.seed
+
+
+def test_load_split_model():
+    global server
+    server.model_hf_repo = "ggml-org/models"
+    server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
+    server.model_alias = "tinyllama-split"
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 16,
+        "prompt": "Hello",
+        "temperature": 0.0,
+    })
+    assert res.status_code == 200
+    assert match_regex("(little|girl)+", res.body["content"])
+
+
+def test_no_webui():
+    global server
+    # default: webui enabled
+    server.start()
+    url = f"http://{server.server_host}:{server.server_port}"
+    res = requests.get(url)
+    assert res.status_code == 200
+    assert "<html>" in res.text
+    server.stop()
+
+    # with --no-webui
+    server.no_webui = True
+    server.start()
+    res = requests.get(url)
+    assert res.status_code == 404
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
new file mode 100644
index 000000000..f23d5cff4
--- /dev/null
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -0,0 +1,267 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server: ServerProcess
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+@pytest.mark.parametrize(
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
+    [
+        (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None),
+        (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True,  None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'),
+        (None, "Book", "What is the best book", 8, "^ blue",                    23, 8, "length", True, "This is not a chat template, it is"),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
+    ]
+)
+def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
+    global server
+    server.jinja = jinja
+    server.chat_template = chat_template
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "model": model,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+    })
+    assert res.status_code == 200
+    assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
+    assert res.body["model"] == model if model is not None else server.model_alias
+    assert res.body["usage"]["prompt_tokens"] == n_prompt
+    assert res.body["usage"]["completion_tokens"] == n_predicted
+    choice = res.body["choices"][0]
+    assert "assistant" == choice["message"]["role"]
+    assert match_regex(re_content, choice["message"]["content"])
+    assert choice["finish_reason"] == finish_reason
+
+
+@pytest.mark.parametrize(
+    "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
+    [
+        ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
+        ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
+    ]
+)
+def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
+    global server
+    server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "stream": True,
+    })
+    content = ""
+    last_cmpl_id = None
+    for data in res:
+        choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
+        assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+        if last_cmpl_id is None:
+            last_cmpl_id = data["id"]
+        assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
+        if choice["finish_reason"] in ["stop", "length"]:
+            assert data["usage"]["prompt_tokens"] == n_prompt
+            assert data["usage"]["completion_tokens"] == n_predicted
+            assert "content" not in choice["delta"]
+            assert match_regex(re_content, content)
+            assert choice["finish_reason"] == finish_reason
+        else:
+            assert choice["finish_reason"] is None
+            content += choice["delta"]["content"]
+
+
+def test_chat_completion_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=8,
+        seed=42,
+        temperature=0.8,
+    )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
+    assert res.choices[0].finish_reason == "length"
+    assert res.choices[0].message.content is not None
+    assert match_regex("(Suddenly)+", res.choices[0].message.content)
+
+
+def test_chat_template():
+    global server
+    server.chat_template = "llama3"
+    server.debug = True  # to get the "__verbose" object in the response
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "__verbose" in res.body
+    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+
+def test_apply_chat_template():
+    global server
+    server.chat_template = "command-r"
+    server.start()
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "system", "content": "You are a test."},
+            {"role": "user", "content":"Hi there"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "prompt" in res.body
+    assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+
+
+@pytest.mark.parametrize("response_format,n_predicted,re_content", [
+    ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
+    ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
+    ({"type": "json_object"}, 10, "(\\{|John)+"),
+    ({"type": "sound"}, 0, None),
+    # invalid response format (expected to fail)
+    ({"type": "json_object", "schema": 123}, 0, None),
+    ({"type": "json_object", "schema": {"type": 123}}, 0, None),
+    ({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None),
+])
+def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predicted,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "Write an example"},
+        ],
+        "response_format": response_format,
+    })
+    if re_content is not None:
+        assert res.status_code == 200
+        choice = res.body["choices"][0]
+        assert match_regex(re_content, choice["message"]["content"])
+    else:
+        assert res.status_code != 200
+        assert "error" in res.body
+
+
+@pytest.mark.parametrize("messages", [
+    None,
+    "string",
+    [123],
+    [{}],
+    [{"role": 123}],
+    [{"role": "system", "content": 123}],
+    # [{"content": "hello"}], # TODO: should not be a valid case
+    [{"role": "system", "content": "test"}, {}],
+])
+def test_invalid_chat_completion_req(messages):
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "messages": messages,
+    })
+    assert res.status_code == 400 or res.status_code == 500
+    assert "error" in res.body
+
+
+def test_chat_completion_with_timings_per_token():
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "max_tokens": 10,
+        "messages": [{"role": "user", "content": "test"}],
+        "stream": True,
+        "timings_per_token": True,
+    })
+    for data in res:
+        assert "timings" in data
+        assert "prompt_per_second" in data["timings"]
+        assert "predicted_per_second" in data["timings"]
+        assert "predicted_n" in data["timings"]
+        assert data["timings"]["predicted_n"] <= 10
+
+
+def test_logprobs():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        temperature=0.0,
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=5,
+        logprobs=True,
+        top_logprobs=10,
+    )
+    output_text = res.choices[0].message.content
+    aggregated_text = ''
+    assert res.choices[0].logprobs is not None
+    assert res.choices[0].logprobs.content is not None
+    for token in res.choices[0].logprobs.content:
+        aggregated_text += token.token
+        assert token.logprob <= 0.0
+        assert token.bytes is not None
+        assert len(token.top_logprobs) > 0
+    assert aggregated_text == output_text
+
+
+def test_logprobs_stream():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        temperature=0.0,
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=5,
+        logprobs=True,
+        top_logprobs=10,
+        stream=True,
+    )
+    output_text = ''
+    aggregated_text = ''
+    for data in res:
+        choice = data.choices[0]
+        if choice.finish_reason is None:
+            if choice.delta.content:
+                output_text += choice.delta.content
+            assert choice.logprobs is not None
+            assert choice.logprobs.content is not None
+            for token in choice.logprobs.content:
+                aggregated_text += token.token
+                assert token.logprob <= 0.0
+                assert token.bytes is not None
+                assert token.top_logprobs is not None
+                assert len(token.top_logprobs) > 0
+    assert aggregated_text == output_text
diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py
new file mode 100644
index 000000000..0ed5b99be
--- /dev/null
+++ b/examples/server/tests/unit/test_completion.py
@@ -0,0 +1,428 @@
+import pytest
+import requests
+import time
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
+])
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": prompt,
+        "return_tokens": return_tokens,
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["prompt_n"] == n_prompt
+    assert res.body["timings"]["predicted_n"] == n_predicted
+    assert res.body["truncated"] == truncated
+    assert type(res.body["has_new_line"]) == bool
+    assert match_regex(re_content, res.body["content"])
+    if return_tokens:
+        assert len(res.body["tokens"]) > 0
+        assert all(type(tok) == int for tok in res.body["tokens"])
+    else:
+        assert res.body["tokens"] == []
+
+
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
+    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
+    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+])
+def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": prompt,
+        "stream": True,
+    })
+    content = ""
+    for data in res:
+        assert "stop" in data and type(data["stop"]) == bool
+        if data["stop"]:
+            assert data["timings"]["prompt_n"] == n_prompt
+            assert data["timings"]["predicted_n"] == n_predicted
+            assert data["truncated"] == truncated
+            assert data["stop_type"] == "limit"
+            assert type(data["has_new_line"]) == bool
+            assert "generation_settings" in data
+            assert server.n_predict is not None
+            assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
+            assert data["generation_settings"]["seed"] == server.seed
+            assert match_regex(re_content, content)
+        else:
+            assert len(data["tokens"]) > 0
+            assert all(type(tok) == int for tok in data["tokens"])
+            content += data["content"]
+
+
+def test_completion_stream_vs_non_stream():
+    global server
+    server.start()
+    res_stream = server.make_stream_request("POST", "/completion", data={
+        "n_predict": 8,
+        "prompt": "I believe the meaning of life is",
+        "stream": True,
+    })
+    res_non_stream = server.make_request("POST", "/completion", data={
+        "n_predict": 8,
+        "prompt": "I believe the meaning of life is",
+    })
+    content_stream = ""
+    for data in res_stream:
+        content_stream += data["content"]
+    assert content_stream == res_non_stream.body["content"]
+
+
+def test_completion_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.completions.create(
+        model="davinci-002",
+        prompt="I believe the meaning of life is",
+        max_tokens=8,
+    )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
+    assert res.choices[0].finish_reason == "length"
+    assert res.choices[0].text is not None
+    assert match_regex("(going|bed)+", res.choices[0].text)
+
+
+def test_completion_stream_with_openai_library():
+    global server
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.completions.create(
+        model="davinci-002",
+        prompt="I believe the meaning of life is",
+        max_tokens=8,
+        stream=True,
+    )
+    output_text = ''
+    for data in res:
+        choice = data.choices[0]
+        if choice.finish_reason is None:
+            assert choice.text is not None
+            output_text += choice.text
+    assert match_regex("(going|bed)+", output_text)
+
+
+@pytest.mark.parametrize("n_slots", [1, 2])
+def test_consistent_result_same_seed(n_slots: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    last_res = None
+    for _ in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": 42,
+            "temperature": 0.0,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] == last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.parametrize("n_slots", [1, 2])
+def test_different_result_different_seed(n_slots: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    last_res = None
+    for seed in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": seed,
+            "temperature": 1.0,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] != last_res.body["content"]
+        last_res = res
+
+# TODO figure why it don't work with temperature = 1
+# @pytest.mark.parametrize("temperature", [0.0, 1.0])
+@pytest.mark.parametrize("n_batch", [16, 32])
+@pytest.mark.parametrize("temperature", [0.0])
+def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
+    global server
+    server.n_batch = n_batch
+    server.start()
+    last_res = None
+    for _ in range(4):
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "seed": 42,
+            "temperature": temperature,
+            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+        if last_res is not None:
+            assert res.body["content"] == last_res.body["content"]
+        last_res = res
+
+
+@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
+def test_cache_vs_nocache_prompt():
+    global server
+    server.start()
+    res_cache = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "seed": 42,
+        "temperature": 1.0,
+        "cache_prompt": True,
+    })
+    res_no_cache = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "seed": 42,
+        "temperature": 1.0,
+        "cache_prompt": False,
+    })
+    assert res_cache.body["content"] == res_no_cache.body["content"]
+
+
+def test_completion_with_tokens_input():
+    global server
+    server.temperature = 0.0
+    server.start()
+    prompt_str = "I believe the meaning of life is"
+    res = server.make_request("POST", "/tokenize", data={
+        "content": prompt_str,
+        "add_special": True,
+    })
+    assert res.status_code == 200
+    tokens = res.body["tokens"]
+
+    # single completion
+    res = server.make_request("POST", "/completion", data={
+        "prompt": tokens,
+    })
+    assert res.status_code == 200
+    assert type(res.body["content"]) == str
+
+    # batch completion
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [tokens, tokens],
+    })
+    assert res.status_code == 200
+    assert type(res.body) == list
+    assert len(res.body) == 2
+    assert res.body[0]["content"] == res.body[1]["content"]
+
+    # mixed string and tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [tokens, prompt_str],
+    })
+    assert res.status_code == 200
+    assert type(res.body) == list
+    assert len(res.body) == 2
+    assert res.body[0]["content"] == res.body[1]["content"]
+
+    # mixed string and tokens in one sequence
+    res = server.make_request("POST", "/completion", data={
+        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
+    })
+    assert res.status_code == 200
+    assert type(res.body["content"]) == str
+
+
+@pytest.mark.parametrize("n_slots,n_requests", [
+    (1, 3),
+    (2, 2),
+    (2, 4),
+    (4, 2), # some slots must be idle
+    (4, 6),
+])
+def test_completion_parallel_slots(n_slots: int, n_requests: int):
+    global server
+    server.n_slots = n_slots
+    server.temperature = 0.0
+    server.start()
+
+    PROMPTS = [
+        ("Write a very long book.", "(very|special|big)+"),
+        ("Write another a poem.", "(small|house)+"),
+        ("What is LLM?", "(Dad|said)+"),
+        ("The sky is blue and I love it.", "(climb|leaf)+"),
+        ("Write another very long music lyrics.", "(friends|step|sky)+"),
+        ("Write a very long joke.", "(cat|Whiskers)+"),
+    ]
+    def check_slots_status():
+        should_all_slots_busy = n_requests >= n_slots
+        time.sleep(0.1)
+        res = server.make_request("GET", "/slots")
+        n_busy = sum([1 for slot in res.body if slot["is_processing"]])
+        if should_all_slots_busy:
+            assert n_busy == n_slots
+        else:
+            assert n_busy <= n_slots
+
+    tasks = []
+    for i in range(n_requests):
+        prompt, re_content = PROMPTS[i % len(PROMPTS)]
+        tasks.append((server.make_request, ("POST", "/completion", {
+            "prompt": prompt,
+            "seed": 42,
+            "temperature": 1.0,
+        })))
+    tasks.append((check_slots_status, ()))
+    results = parallel_function_calls(tasks)
+
+    # check results
+    for i in range(n_requests):
+        prompt, re_content = PROMPTS[i % len(PROMPTS)]
+        res = results[i]
+        assert res.status_code == 200
+        assert type(res.body["content"]) == str
+        assert len(res.body["content"]) > 10
+        # FIXME: the result is not deterministic when using other slot than slot 0
+        # assert match_regex(re_content, res.body["content"])
+
+
+@pytest.mark.parametrize(
+    "prompt,n_predict,response_fields",
+    [
+        ("I believe the meaning of life is", 8, []),
+        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
+    ],
+)
+def test_completion_response_fields(
+    prompt: str, n_predict: int, response_fields: list[str]
+):
+    global server
+    server.start()
+    res = server.make_request(
+        "POST",
+        "/completion",
+        data={
+            "n_predict": n_predict,
+            "prompt": prompt,
+            "response_fields": response_fields,
+        },
+    )
+    assert res.status_code == 200
+    assert "content" in res.body
+    assert len(res.body["content"])
+    if len(response_fields):
+        assert res.body["generation_settings/n_predict"] == n_predict
+        assert res.body["prompt"] == "<s> " + prompt
+        assert isinstance(res.body["content"], str)
+        assert len(res.body) == len(response_fields)
+    else:
+        assert len(res.body)
+        assert "generation_settings" in res.body
+
+
+def test_n_probs():
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "n_probs": 10,
+        "temperature": 0.0,
+        "n_predict": 5,
+    })
+    assert res.status_code == 200
+    assert "completion_probabilities" in res.body
+    assert len(res.body["completion_probabilities"]) == 5
+    for tok in res.body["completion_probabilities"]:
+        assert "id" in tok and tok["id"] > 0
+        assert "token" in tok and type(tok["token"]) == str
+        assert "logprob" in tok and tok["logprob"] <= 0.0
+        assert "bytes" in tok and type(tok["bytes"]) == list
+        assert len(tok["top_logprobs"]) == 10
+        for prob in tok["top_logprobs"]:
+            assert "id" in prob and prob["id"] > 0
+            assert "token" in prob and type(prob["token"]) == str
+            assert "logprob" in prob and prob["logprob"] <= 0.0
+            assert "bytes" in prob and type(prob["bytes"]) == list
+
+
+def test_n_probs_stream():
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "n_probs": 10,
+        "temperature": 0.0,
+        "n_predict": 5,
+        "stream": True,
+    })
+    for data in res:
+        if data["stop"] == False:
+            assert "completion_probabilities" in data
+            assert len(data["completion_probabilities"]) == 1
+            for tok in data["completion_probabilities"]:
+                assert "id" in tok and tok["id"] > 0
+                assert "token" in tok and type(tok["token"]) == str
+                assert "logprob" in tok and tok["logprob"] <= 0.0
+                assert "bytes" in tok and type(tok["bytes"]) == list
+                assert len(tok["top_logprobs"]) == 10
+                for prob in tok["top_logprobs"]:
+                    assert "id" in prob and prob["id"] > 0
+                    assert "token" in prob and type(prob["token"]) == str
+                    assert "logprob" in prob and prob["logprob"] <= 0.0
+                    assert "bytes" in prob and type(prob["bytes"]) == list
+
+
+def test_n_probs_post_sampling():
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "n_probs": 10,
+        "temperature": 0.0,
+        "n_predict": 5,
+        "post_sampling_probs": True,
+    })
+    assert res.status_code == 200
+    assert "completion_probabilities" in res.body
+    assert len(res.body["completion_probabilities"]) == 5
+    for tok in res.body["completion_probabilities"]:
+        assert "id" in tok and tok["id"] > 0
+        assert "token" in tok and type(tok["token"]) == str
+        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
+        assert "bytes" in tok and type(tok["bytes"]) == list
+        assert len(tok["top_probs"]) == 10
+        for prob in tok["top_probs"]:
+            assert "id" in prob and prob["id"] > 0
+            assert "token" in prob and type(prob["token"]) == str
+            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
+            assert "bytes" in prob and type(prob["bytes"]) == list
+        # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
+        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
+
+
+def test_cancel_request():
+    global server
+    server.n_ctx = 4096
+    server.n_predict = -1
+    server.n_slots = 1
+    server.server_slots = True
+    server.start()
+    # send a request that will take a long time, but cancel it before it finishes
+    try:
+        server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+        }, timeout=0.1)
+    except requests.exceptions.ReadTimeout:
+        pass # expected
+    # make sure the slot is free
+    time.sleep(1) # wait for HTTP_POLLING_SECONDS
+    res = server.make_request("GET", "/slots")
+    assert res.body[0]["is_processing"] == False
diff --git a/examples/server/tests/unit/test_ctx_shift.py b/examples/server/tests/unit/test_ctx_shift.py
new file mode 100644
index 000000000..be93a6d31
--- /dev/null
+++ b/examples/server/tests/unit/test_ctx_shift.py
@@ -0,0 +1,67 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+LONG_TEXT = """
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+""".strip()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.n_ctx = 256
+    server.n_slots = 2
+
+
+def test_ctx_shift_enabled():
+    # the prompt is 301 tokens
+    # the slot context is 256/2 = 128 tokens
+    # the prompt is truncated to keep the last 109 tokens
+    # 64 tokens are generated thanks to shifting the context when it gets full
+    global server
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": LONG_TEXT,
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["prompt_n"] == 109
+    assert res.body["timings"]["predicted_n"] == 64
+    assert res.body["truncated"] is True
+
+
+@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
+    (64, 64, False),
+    (-1, 120, True),
+])
+def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
+    global server
+    server.disable_ctx_shift = True
+    server.n_predict = -1
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": n_predict,
+        "prompt": "Hi how are you",
+    })
+    assert res.status_code == 200
+    assert res.body["timings"]["predicted_n"] == n_token_output
+    assert res.body["truncated"] == truncated
+
+
+def test_ctx_shift_disabled_long_prompt():
+    global server
+    server.disable_ctx_shift = True
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": LONG_TEXT,
+    })
+    assert res.status_code != 200
+    assert "error" in res.body
+    assert "exceeds the available context size" in res.body["error"]["message"]
diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py
new file mode 100644
index 000000000..8b0eb42b0
--- /dev/null
+++ b/examples/server/tests/unit/test_embedding.py
@@ -0,0 +1,237 @@
+import base64
+import struct
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.bert_bge_small()
+
+EPSILON = 1e-3
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.bert_bge_small()
+
+
+def test_embedding_single():
+    global server
+    server.pooling = 'last'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": "I believe the meaning of life is",
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 1
+    assert 'embedding' in res.body['data'][0]
+    assert len(res.body['data'][0]['embedding']) > 1
+
+    # make sure embedding vector is normalized
+    assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
+
+
+def test_embedding_multiple():
+    global server
+    server.pooling = 'last'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "Write a joke about AI from a very long prompt which will not be truncated",
+            "This is a test",
+            "This is another test",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 4
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
+@pytest.mark.parametrize(
+    "input,is_multi_prompt",
+    [
+        # do not crash on empty input
+        ("", False),
+        # single prompt
+        ("string", False),
+        ([12, 34, 56], False),
+        ([12, 34, "string", 56, 78], False),
+        # multiple prompts
+        (["string1", "string2"], True),
+        (["string1", [12, 34, 56]], True),
+        ([[12, 34, 56], [12, 34, 56]], True),
+        ([[12, 34, 56], [12, "string", 34, 56]], True),
+    ]
+)
+def test_embedding_mixed_input(input, is_multi_prompt: bool):
+    global server
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={"input": input})
+    assert res.status_code == 200
+    data = res.body['data']
+    if is_multi_prompt:
+        assert len(data) == len(input)
+        for d in data:
+            assert 'embedding' in d
+            assert len(d['embedding']) > 1
+    else:
+        assert 'embedding' in data[0]
+        assert len(data[0]['embedding']) > 1
+
+
+def test_embedding_pooling_none():
+    global server
+    server.pooling = 'none'
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": "hello hello hello",
+    })
+    assert res.status_code == 200
+    assert 'embedding' in res.body[0]
+    assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special
+
+    # make sure embedding vector is not normalized
+    for x in res.body[0]['embedding']:
+        assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON
+
+
+def test_embedding_pooling_none_oai():
+    global server
+    server.pooling = 'none'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": "hello hello hello",
+    })
+
+    # /v1/embeddings does not support pooling type 'none'
+    assert res.status_code == 400
+    assert "error" in res.body
+
+
+def test_embedding_openai_library_single():
+    global server
+    server.pooling = 'last'
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
+    assert len(res.data) == 1
+    assert len(res.data[0].embedding) > 1
+
+
+def test_embedding_openai_library_multiple():
+    global server
+    server.pooling = 'last'
+    server.start()
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.embeddings.create(model="text-embedding-3-small", input=[
+        "I believe the meaning of life is",
+        "Write a joke about AI from a very long prompt which will not be truncated",
+        "This is a test",
+        "This is another test",
+    ])
+    assert len(res.data) == 4
+    for d in res.data:
+        assert len(d.embedding) > 1
+
+
+def test_embedding_error_prompt_too_long():
+    global server
+    server.pooling = 'last'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": "This is a test " * 512,
+    })
+    assert res.status_code != 200
+    assert "too large" in res.body["error"]["message"]
+
+
+def test_same_prompt_give_same_result():
+    server.pooling = 'last'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 5
+    for i in range(1, len(res.body['data'])):
+        v0 = res.body['data'][0]['embedding']
+        vi = res.body['data'][i]['embedding']
+        for x, y in zip(v0, vi):
+            assert abs(x - y) < EPSILON
+
+
+@pytest.mark.parametrize(
+    "content,n_tokens",
+    [
+        ("I believe the meaning of life is", 9),
+        ("This is a test", 6),
+    ]
+)
+def test_embedding_usage_single(content, n_tokens):
+    global server
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={"input": content})
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == n_tokens
+
+
+def test_embedding_usage_multiple():
+    global server
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == 2 * 9
+
+
+def test_embedding_openai_library_base64():
+    server.start()
+    test_input = "Test base64 embedding output"
+
+    # get embedding in default format
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": test_input
+    })
+    assert res.status_code == 200
+    vec0 = res.body["data"][0]["embedding"]
+
+    # get embedding in base64 format
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": test_input,
+        "encoding_format": "base64"
+    })
+
+    assert res.status_code == 200
+    assert "data" in res.body
+    assert len(res.body["data"]) == 1
+
+    embedding_data = res.body["data"][0]
+    assert "embedding" in embedding_data
+    assert isinstance(embedding_data["embedding"], str)
+
+    # Verify embedding is valid base64
+    decoded = base64.b64decode(embedding_data["embedding"])
+    # Verify decoded data can be converted back to float array
+    float_count = len(decoded) // 4  # 4 bytes per float
+    floats = struct.unpack(f'{float_count}f', decoded)
+    assert len(floats) > 0
+    assert all(isinstance(x, float) for x in floats)
+    assert len(floats) == len(vec0)
+
+    # make sure the decoded data is the same as the original
+    for x, y in zip(floats, vec0):
+        assert abs(x - y) < EPSILON
diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
new file mode 100644
index 000000000..10554db0f
--- /dev/null
+++ b/examples/server/tests/unit/test_infill.py
@@ -0,0 +1,77 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama_infill()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama_infill()
+
+
+def test_infill_without_input_extra():
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
+
+
+def test_infill_with_input_extra():
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "input_extra": [{
+            "filename": "llama.h",
+            "text": "LLAMA_API int32_t llama_n_threads();\n"
+        }],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert match_regex("(Dad|excited|park)+", res.body["content"])
+
+
+@pytest.mark.parametrize("input_extra", [
+    {},
+    {"filename": "ok"},
+    {"filename": 123},
+    {"filename": 123, "text": "abc"},
+    {"filename": 123, "text": 456},
+])
+def test_invalid_input_extra_req(input_extra):
+    global server
+    server.start()
+    res = server.make_request("POST", "/infill", data={
+        "input_extra": [input_extra],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 400
+    assert "error" in res.body
+
+
+@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
+def test_with_qwen_model():
+    global server
+    server.model_file = None
+    server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
+    server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
+    server.start(timeout_seconds=600)
+    res = server.make_request("POST", "/infill", data={
+        "input_extra": [{
+            "filename": "llama.h",
+            "text": "LLAMA_API int32_t llama_n_threads();\n"
+        }],
+        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
+        "prompt": "    int n_threads = llama_",
+        "input_suffix": "}\n",
+    })
+    assert res.status_code == 200
+    assert res.body["content"] == "n_threads();\n    printf(\"Number of threads: %d\\n\", n_threads);\n    return 0;\n"
diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py
new file mode 100644
index 000000000..c1aa8be70
--- /dev/null
+++ b/examples/server/tests/unit/test_lora.py
@@ -0,0 +1,115 @@
+import pytest
+from utils import *
+
+server = ServerPreset.stories15m_moe()
+
+LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.stories15m_moe()
+    server.lora_files = [download_file(LORA_FILE_URL)]
+
+
+@pytest.mark.parametrize("scale,re_content", [
+    # without applying lora, the model should behave like a bedtime story generator
+    (0.0, "(little|girl|three|years|old)+"),
+    # with lora, the model should behave like a Shakespearean text generator
+    (1.0, "(eye|love|glass|sun)+"),
+])
+def test_lora(scale: float, re_content: str):
+    global server
+    server.start()
+    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
+        {"id": 0, "scale": scale}
+    ])
+    assert res_lora_control.status_code == 200
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "Look in thy glass",
+    })
+    assert res.status_code == 200
+    assert match_regex(re_content, res.body["content"])
+
+
+def test_lora_per_request():
+    global server
+    server.n_slots = 4
+    server.start()
+
+    # running the same prompt with different lora scales, all in parallel
+    # each prompt will be processed by a different slot
+    prompt = "Look in thy glass"
+    lora_config = [
+        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
+        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
+        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
+        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
+        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
+        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
+    ]
+
+    tasks = [(
+        server.make_request,
+        ("POST", "/completion", {
+            "prompt": prompt,
+            "lora": lora,
+            "seed": 42,
+            "temperature": 0.0,
+            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+    ) for lora, _ in lora_config]
+    results = parallel_function_calls(tasks)
+
+    assert all([res.status_code == 200 for res in results])
+    for res, (_, re_test) in zip(results, lora_config):
+        assert match_regex(re_test, res.body["content"])
+
+
+@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
+def test_with_big_model():
+    server = ServerProcess()
+    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
+    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
+    server.model_alias = "Llama-3.2-8B-Instruct"
+    server.n_slots = 4
+    server.n_ctx = server.n_slots * 1024
+    server.n_predict = 64
+    server.temperature = 0.0
+    server.seed = 42
+    server.lora_files = [
+        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
+        # TODO: find & add other lora adapters for this model
+    ]
+    server.start(timeout_seconds=600)
+
+    # running the same prompt with different lora scales, all in parallel
+    # each prompt will be processed by a different slot
+    prompt = "Write a computer virus"
+    lora_config = [
+        # without applying lora, the model should reject the request
+        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
+        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
+        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
+        # with 0.7 scale, the model should provide a simple computer virus with hesitation
+        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
+        # with 1.5 scale, the model should confidently provide a computer virus
+        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
+        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
+    ]
+
+    tasks = [(
+        server.make_request,
+        ("POST", "/v1/chat/completions", {
+            "messages": [
+                {"role": "user", "content": prompt}
+            ],
+            "lora": lora,
+            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
+        })
+    ) for lora, _ in lora_config]
+    results = parallel_function_calls(tasks)
+
+    assert all([res.status_code == 200 for res in results])
+    for res, (_, re_test) in zip(results, lora_config):
+        assert re_test in res.body["choices"][0]["message"]["content"]
diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py
new file mode 100644
index 000000000..7203d7943
--- /dev/null
+++ b/examples/server/tests/unit/test_rerank.py
@@ -0,0 +1,78 @@
+import pytest
+from utils import *
+
+server = ServerPreset.jina_reranker_tiny()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.jina_reranker_tiny()
+
+
+def test_rerank():
+    global server
+    server.start()
+    res = server.make_request("POST", "/rerank", data={
+        "query": "Machine learning is",
+        "documents": [
+            "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
+            "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
+            "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
+            "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
+        ]
+    })
+    assert res.status_code == 200
+    assert len(res.body["results"]) == 4
+
+    most_relevant = res.body["results"][0]
+    least_relevant = res.body["results"][0]
+    for doc in res.body["results"]:
+        if doc["relevance_score"] > most_relevant["relevance_score"]:
+            most_relevant = doc
+        if doc["relevance_score"] < least_relevant["relevance_score"]:
+            least_relevant = doc
+
+    assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
+    assert most_relevant["index"] == 2
+    assert least_relevant["index"] == 3
+
+
+@pytest.mark.parametrize("documents", [
+    [],
+    None,
+    123,
+    [1, 2, 3],
+])
+def test_invalid_rerank_req(documents):
+    global server
+    server.start()
+    res = server.make_request("POST", "/rerank", data={
+        "query": "Machine learning is",
+        "documents": documents,
+    })
+    assert res.status_code == 400
+    assert "error" in res.body
+
+
+@pytest.mark.parametrize(
+    "query,doc1,doc2,n_tokens",
+    [
+        ("Machine learning is", "A machine", "Learning is", 19),
+        ("Which city?", "Machine learning is ", "Paris, capitale de la", 26),
+    ]
+)
+def test_rerank_usage(query, doc1, doc2, n_tokens):
+    global server
+    server.start()
+
+    res = server.make_request("POST", "/rerank", data={
+        "query": query,
+        "documents": [
+            doc1,
+            doc2,
+        ]
+    })
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == n_tokens
diff --git a/examples/server/tests/unit/test_security.py b/examples/server/tests/unit/test_security.py
new file mode 100644
index 000000000..620b25376
--- /dev/null
+++ b/examples/server/tests/unit/test_security.py
@@ -0,0 +1,83 @@
+import pytest
+from openai import OpenAI
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+TEST_API_KEY = "sk-this-is-the-secret-key"
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.api_key = TEST_API_KEY
+
+
+@pytest.mark.parametrize("endpoint", ["/health", "/models"])
+def test_access_public_endpoint(endpoint: str):
+    global server
+    server.start()
+    res = server.make_request("GET", endpoint)
+    assert res.status_code == 200
+    assert "error" not in res.body
+
+
+@pytest.mark.parametrize("api_key", [None, "invalid-key"])
+def test_incorrect_api_key(api_key: str):
+    global server
+    server.start()
+    res = server.make_request("POST", "/completions", data={
+        "prompt": "I believe the meaning of life is",
+    }, headers={
+        "Authorization": f"Bearer {api_key}" if api_key else None,
+    })
+    assert res.status_code == 401
+    assert "error" in res.body
+    assert res.body["error"]["type"] == "authentication_error"
+
+
+def test_correct_api_key():
+    global server
+    server.start()
+    res = server.make_request("POST", "/completions", data={
+        "prompt": "I believe the meaning of life is",
+    }, headers={
+        "Authorization": f"Bearer {TEST_API_KEY}",
+    })
+    assert res.status_code == 200
+    assert "error" not in res.body
+    assert "content" in res.body
+
+
+def test_openai_library_correct_api_key():
+    global server
+    server.start()
+    client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a chatbot."},
+            {"role": "user", "content": "What is the meaning of life?"},
+        ],
+    )
+    assert len(res.choices) == 1
+
+
+@pytest.mark.parametrize("origin,cors_header,cors_header_value", [
+    ("localhost", "Access-Control-Allow-Origin", "localhost"),
+    ("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"),
+    ("origin", "Access-Control-Allow-Credentials", "true"),
+    ("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"),
+    ("web.mydomain.fr", "Access-Control-Allow-Headers", "*"),
+])
+def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
+    global server
+    server.start()
+    res = server.make_request("OPTIONS", "/completions", headers={
+        "Origin": origin,
+        "Access-Control-Request-Method": "POST",
+        "Access-Control-Request-Headers": "Authorization",
+    })
+    assert res.status_code == 200
+    assert cors_header in res.headers
+    assert res.headers[cors_header] == cors_header_value
diff --git a/examples/server/tests/unit/test_slot_save.py b/examples/server/tests/unit/test_slot_save.py
new file mode 100644
index 000000000..38704f5ec
--- /dev/null
+++ b/examples/server/tests/unit/test_slot_save.py
@@ -0,0 +1,98 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.slot_save_path = "./tmp"
+    server.temperature = 0.0
+
+
+def test_slot_save_restore():
+    global server
+    server.start()
+
+    # First prompt in slot 1 should be fully processed
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
+
+    # Save state of slot 1
+    res = server.make_request("POST", "/slots/1?action=save", data={
+        "filename": "slot1.bin",
+    })
+    assert res.status_code == 200
+    assert res.body["n_saved"] == 84
+
+    # Since we have cache, this should only process the last tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 6  # only different part is processed
+
+    # Loading the saved cache into slot 0
+    res = server.make_request("POST", "/slots/0?action=restore", data={
+        "filename": "slot1.bin",
+    })
+    assert res.status_code == 200
+    assert res.body["n_restored"] == 84
+
+    # Since we have cache, slot 0 should only process the last tokens
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 0,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 6  # only different part is processed
+
+    # For verification that slot 1 was not corrupted during slot 0 load, same thing should work
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of Germany?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Jack|said)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 1
+
+
+def test_slot_erase():
+    global server
+    server.start()
+
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
+
+    # erase slot 1
+    res = server.make_request("POST", "/slots/1?action=erase")
+    assert res.status_code == 200
+
+    # re-run the same prompt, it should process all tokens again
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "What is the capital of France?",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert match_regex("(Whiskers|Flana)+", res.body["content"])
+    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py
new file mode 100644
index 000000000..54db38cf3
--- /dev/null
+++ b/examples/server/tests/unit/test_speculative.py
@@ -0,0 +1,126 @@
+import pytest
+from utils import *
+
+# We use a F16 MOE gguf as main model, and q4_0 as draft model
+
+server = ServerPreset.stories15m_moe()
+
+MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf"
+
+def create_server():
+    global server
+    server = ServerPreset.stories15m_moe()
+    # set default values
+    server.model_draft = download_file(MODEL_DRAFT_FILE_URL)
+    server.draft_min = 4
+    server.draft_max = 8
+
+
+@pytest.fixture(scope="module", autouse=True)
+def fixture_create_server():
+    return create_server()
+
+
+def test_with_and_without_draft():
+    global server
+    server.model_draft = None  # disable draft model
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "temperature": 0.0,
+        "top_k": 1,
+    })
+    assert res.status_code == 200
+    content_no_draft = res.body["content"]
+    server.stop()
+
+    # create new server with draft model
+    create_server()
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "I believe the meaning of life is",
+        "temperature": 0.0,
+        "top_k": 1,
+    })
+    assert res.status_code == 200
+    content_draft = res.body["content"]
+
+    assert content_no_draft == content_draft
+
+
+def test_different_draft_min_draft_max():
+    global server
+    test_values = [
+        (1, 2),
+        (1, 4),
+        (4, 8),
+        (4, 12),
+        (8, 16),
+    ]
+    last_content = None
+    for draft_min, draft_max in test_values:
+        server.stop()
+        server.draft_min = draft_min
+        server.draft_max = draft_max
+        server.start()
+        res = server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+            "temperature": 0.0,
+            "top_k": 1,
+        })
+        assert res.status_code == 200
+        if last_content is not None:
+            assert last_content == res.body["content"]
+        last_content = res.body["content"]
+
+
+def test_slot_ctx_not_exceeded():
+    global server
+    server.n_ctx = 64
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "Hello " * 56,
+        "temperature": 0.0,
+        "top_k": 1,
+        "speculative.p_min": 0.0,
+    })
+    assert res.status_code == 200
+    assert len(res.body["content"]) > 0
+
+
+def test_with_ctx_shift():
+    global server
+    server.n_ctx = 64
+    server.start()
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "Hello " * 56,
+        "temperature": 0.0,
+        "top_k": 1,
+        "n_predict": 64,
+        "speculative.p_min": 0.0,
+    })
+    assert res.status_code == 200
+    assert len(res.body["content"]) > 0
+    assert res.body["tokens_predicted"] == 64
+    assert res.body["truncated"] == True
+
+
+@pytest.mark.parametrize("n_slots,n_requests", [
+    (1, 2),
+    (2, 2),
+])
+def test_multi_requests_parallel(n_slots: int, n_requests: int):
+    global server
+    server.n_slots = n_slots
+    server.start()
+    tasks = []
+    for _ in range(n_requests):
+        tasks.append((server.make_request, ("POST", "/completion", {
+            "prompt": "I believe the meaning of life is",
+            "temperature": 0.0,
+            "top_k": 1,
+        })))
+    results = parallel_function_calls(tasks)
+    for res in results:
+        assert res.status_code == 200
+        assert match_regex("(wise|kind|owl|answer)+", res.body["content"])
diff --git a/examples/server/tests/unit/test_tokenize.py b/examples/server/tests/unit/test_tokenize.py
new file mode 100644
index 000000000..382457c9d
--- /dev/null
+++ b/examples/server/tests/unit/test_tokenize.py
@@ -0,0 +1,59 @@
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_tokenize_detokenize():
+    global server
+    server.start()
+    # tokenize
+    content = "What is the capital of France ?"
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content
+    })
+    assert res_tok.status_code == 200
+    assert len(res_tok.body["tokens"]) > 5
+    # detokenize
+    res_detok = server.make_request("POST", "/detokenize", data={
+        "tokens": res_tok.body["tokens"],
+    })
+    assert res_detok.status_code == 200
+    assert res_detok.body["content"].strip() == content
+
+
+def test_tokenize_with_bos():
+    global server
+    server.start()
+    # tokenize
+    content = "What is the capital of France ?"
+    bosId = 1
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content,
+        "add_special": True,
+    })
+    assert res_tok.status_code == 200
+    assert res_tok.body["tokens"][0] == bosId
+
+
+def test_tokenize_with_pieces():
+    global server
+    server.start()
+    # tokenize
+    content = "This is a test string with unicode 媽 and emoji 🤗"
+    res_tok = server.make_request("POST", "/tokenize", data={
+        "content": content,
+        "with_pieces": True,
+    })
+    assert res_tok.status_code == 200
+    for token in res_tok.body["tokens"]:
+        assert "id" in token
+        assert token["id"] > 0
+        assert "piece" in token
+        assert len(token["piece"]) > 0
diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py
new file mode 100644
index 000000000..4a551404f
--- /dev/null
+++ b/examples/server/tests/unit/test_tool_call.py
@@ -0,0 +1,418 @@
+import pytest
+from utils import *
+
+server: ServerProcess
+
+TIMEOUT_SERVER_START = 15*60
+TIMEOUT_HTTP_REQUEST = 60
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+    server.model_alias = "tinyllama-2-tool-call"
+    server.server_port = 8081
+
+
+TEST_TOOL = {
+    "type":"function",
+    "function": {
+        "name": "test",
+        "description": "",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "success": {"type": "boolean", "const": True},
+            },
+            "required": ["success"]
+        }
+    }
+}
+
+PYTHON_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "python",
+        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "code": {
+                    "type": "string",
+                    "description": "The code to run in the ipython interpreter."
+                }
+            },
+            "required": ["code"]
+        }
+    }
+}
+
+WEATHER_TOOL = {
+  "type":"function",
+  "function":{
+    "name":"get_current_weather",
+    "description":"Get the current weather in a given location",
+    "parameters":{
+      "type":"object",
+      "properties":{
+        "location":{
+          "type":"string",
+          "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
+        }
+      },
+      "required":["location"]
+    }
+  }
+}
+
+
+def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
+    global server
+    n_predict = 512
+    # server = ServerPreset.stories15m_moe()
+    server.jinja = True
+    server.n_predict = n_predict
+    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predict,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "Write an example"},
+        ],
+        "tool_choice": "required",
+        "tools": [tool],
+        "parallel_tool_calls": False,
+        "temperature": 0.0,
+        "top_k": 1,
+        "top_p": 1.0,
+    })
+    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
+    choice = res.body["choices"][0]
+    tool_calls = choice["message"].get("tool_calls")
+    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
+    tool_call = tool_calls[0]
+    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
+    assert expected_function_name == tool_call["function"]["name"]
+    actual_arguments = tool_call["function"]["arguments"]
+    assert isinstance(actual_arguments, str)
+    if argument_key is not None:
+        actual_arguments = json.loads(actual_arguments)
+        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
+
+
+@pytest.mark.parametrize("template_name,tool,argument_key", [
+    ("google-gemma-2-2b-it",                          TEST_TOOL,            "success"),
+    ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
+    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
+])
+def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
+    do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("template_name,tool,argument_key", [
+    ("meta-llama-Llama-3.1-8B-Instruct",              TEST_TOOL,            "success"),
+    ("meta-llama-Llama-3.1-8B-Instruct",              PYTHON_TOOL,          "code"),
+    ("meetkai-functionary-medium-v3.1",               TEST_TOOL,            "success"),
+    ("meetkai-functionary-medium-v3.1",               PYTHON_TOOL,          "code"),
+    ("meetkai-functionary-medium-v3.2",               TEST_TOOL,            "success"),
+    ("meetkai-functionary-medium-v3.2",               PYTHON_TOOL,          "code"),
+    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL,            "success"),
+    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL,          "code"),
+    ("meta-llama-Llama-3.2-3B-Instruct",              TEST_TOOL,            "success"),
+    ("meta-llama-Llama-3.2-3B-Instruct",              PYTHON_TOOL,          "code"),
+    ("mistralai-Mistral-Nemo-Instruct-2407",          TEST_TOOL,            "success"),
+    ("mistralai-Mistral-Nemo-Instruct-2407",          PYTHON_TOOL,          "code"),
+    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   TEST_TOOL,            "success"),
+    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   PYTHON_TOOL,          "code"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),
+    ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
+    ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
+])
+def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
+    do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
+    (TEST_TOOL,    "success",  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    (TEST_TOOL,    "success",  "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+    (PYTHON_TOOL,  "code",     "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
+    (TEST_TOOL,    "success",  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
+    # TODO: fix these
+    # (TEST_TOOL,    "success",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    # (PYTHON_TOOL,  "code",     "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+])
+def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+    global server
+    n_predict = 512
+    server.n_slots = 1
+    server.jinja = True
+    server.n_ctx = 8192
+    server.n_predict = n_predict
+    server.model_hf_repo = hf_repo
+    server.model_hf_file = None
+    if isinstance(template_override, tuple):
+        (template_hf_repo, template_variant) = template_override
+        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
+        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predict,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "Write an example"},
+        ],
+        "tool_choice": "required",
+        "tools": [tool],
+        "parallel_tool_calls": False,
+        "temperature": 0.0,
+        "top_k": 1,
+        "top_p": 1.0,
+    }, timeout=TIMEOUT_HTTP_REQUEST)
+    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
+    choice = res.body["choices"][0]
+    tool_calls = choice["message"].get("tool_calls")
+    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
+    tool_call = tool_calls[0]
+    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
+    assert expected_function_name == tool_call["function"]["name"]
+    actual_arguments = tool_call["function"]["arguments"]
+    assert isinstance(actual_arguments, str)
+    if argument_key is not None:
+        actual_arguments = json.loads(actual_arguments)
+        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
+
+
+def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
+    global server
+    server.jinja = True
+    server.n_predict = n_predict
+    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predict,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "say hello world with python"},
+        ],
+        "tools": tools if tools else None,
+        "tool_choice": tool_choice,
+        "temperature": 0.0,
+        "top_k": 1,
+        "top_p": 1.0,
+    }, timeout=TIMEOUT_HTTP_REQUEST)
+    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
+    choice = res.body["choices"][0]
+    assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
+
+
+@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
+    ("meta-llama-Llama-3.3-70B-Instruct",         128, [],            None),
+    ("meta-llama-Llama-3.3-70B-Instruct",         128, [TEST_TOOL],   None),
+    ("meta-llama-Llama-3.3-70B-Instruct",         128, [PYTHON_TOOL], 'none'),
+])
+def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
+    do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
+    ("meetkai-functionary-medium-v3.2",               256, [],            None),
+    ("meetkai-functionary-medium-v3.2",               256, [TEST_TOOL],   None),
+    ("meetkai-functionary-medium-v3.2",               256, [PYTHON_TOOL], 'none'),
+    ("meetkai-functionary-medium-v3.1",               256, [],            None),
+    ("meetkai-functionary-medium-v3.1",               256, [TEST_TOOL],   None),
+    ("meetkai-functionary-medium-v3.1",               256, [PYTHON_TOOL], 'none'),
+    ("meta-llama-Llama-3.2-3B-Instruct",              256, [],            None),
+    ("meta-llama-Llama-3.2-3B-Instruct",              256, [TEST_TOOL],   None),
+    ("meta-llama-Llama-3.2-3B-Instruct",              256, [PYTHON_TOOL], 'none'),
+])
+def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
+    do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("hf_repo,template_override", [
+    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
+    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+
+    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+
+    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
+
+    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
+
+    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
+    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
+
+    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
+    # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+])
+def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None):
+    global server
+    n_predict = 512
+    server.n_slots = 1
+    server.jinja = True
+    server.n_ctx = 8192
+    server.n_predict = n_predict
+    server.model_hf_repo = hf_repo
+    server.model_hf_file = None
+    if isinstance(template_override, tuple):
+        (template_hf_repo, template_variant) = template_override
+        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
+        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": n_predict,
+        "messages": [
+            {"role": "user", "content": "What is the weather in Istanbul?"},
+        ],
+        "tools": [WEATHER_TOOL],
+    }, timeout=TIMEOUT_HTTP_REQUEST)
+    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
+    choice = res.body["choices"][0]
+    tool_calls = choice["message"].get("tool_calls")
+    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
+    tool_call = tool_calls[0]
+    assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
+    actual_arguments = json.loads(tool_call["function"]["arguments"])
+    assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
+    location = actual_arguments["location"]
+    assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
+    assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
+    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+
+    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
+    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
+
+    (None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    ('{"code":"print("}',  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
+    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
+    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+
+    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
+
+    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
+
+    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    (None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
+    # (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+])
+def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+    global server
+    server.n_slots = 1
+    server.jinja = True
+    server.n_ctx = 8192
+    server.n_predict = 128
+    server.model_hf_repo = hf_repo
+    server.model_hf_file = None
+    if isinstance(template_override, tuple):
+        (template_hf_repo, template_variant) = template_override
+        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
+        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 256,
+        "messages": [
+            {"role": "system", "content": "You are a coding assistant."},
+            {"role": "user", "content": "say hello world with python"},
+        ],
+        "tools": [PYTHON_TOOL],
+        # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n    print("Hello, World!")\nhello_world()` which is correct but a pain to test.
+        "temperature": 0.0,
+        "top_k": 1,
+        "top_p": 1.0,
+    }, timeout=TIMEOUT_HTTP_REQUEST)
+    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
+    choice = res.body["choices"][0]
+    tool_calls = choice["message"].get("tool_calls")
+    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
+    tool_call = tool_calls[0]
+    assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
+    actual_arguments = tool_call["function"]["arguments"]
+    if expected_arguments_override is not None:
+        assert actual_arguments == expected_arguments_override
+    else:
+        actual_arguments = json.loads(actual_arguments)
+        assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
+        code = actual_arguments["code"]
+        assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
+        assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
new file mode 100644
index 000000000..ce0680662
--- /dev/null
+++ b/examples/server/tests/utils.py
@@ -0,0 +1,415 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# type: ignore[reportUnusedImport]
+
+import subprocess
+import os
+import re
+import json
+import sys
+import requests
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import (
+    Any,
+    Callable,
+    ContextManager,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Tuple,
+    Set,
+)
+from re import RegexFlag
+import wget
+
+
+DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" not in os.environ else 30
+
+
+class ServerResponse:
+    headers: dict
+    status_code: int
+    body: dict | Any
+
+
+class ServerProcess:
+    # default options
+    debug: bool = False
+    server_port: int = 8080
+    server_host: str = "127.0.0.1"
+    model_hf_repo: str = "ggml-org/models"
+    model_hf_file: str | None = "tinyllamas/stories260K.gguf"
+    model_alias: str = "tinyllama-2"
+    temperature: float = 0.8
+    seed: int = 42
+
+    # custom options
+    model_alias: str | None = None
+    model_url: str | None = None
+    model_file: str | None = None
+    model_draft: str | None = None
+    n_threads: int | None = None
+    n_gpu_layer: int | None = None
+    n_batch: int | None = None
+    n_ubatch: int | None = None
+    n_ctx: int | None = None
+    n_ga: int | None = None
+    n_ga_w: int | None = None
+    n_predict: int | None = None
+    n_prompts: int | None = 0
+    slot_save_path: str | None = None
+    id_slot: int | None = None
+    cache_prompt: bool | None = None
+    n_slots: int | None = None
+    server_continuous_batching: bool | None = False
+    server_embeddings: bool | None = False
+    server_reranking: bool | None = False
+    server_metrics: bool | None = False
+    server_slots: bool | None = False
+    pooling: str | None = None
+    draft: int | None = None
+    api_key: str | None = None
+    lora_files: List[str] | None = None
+    disable_ctx_shift: int | None = False
+    draft_min: int | None = None
+    draft_max: int | None = None
+    no_webui: bool | None = None
+    jinja: bool | None = None
+    chat_template: str | None = None
+    chat_template_file: str | None = None
+
+    # session variables
+    process: subprocess.Popen | None = None
+
+    def __init__(self):
+        if "N_GPU_LAYERS" in os.environ:
+            self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"])
+        if "DEBUG" in os.environ:
+            self.debug = True
+        if "PORT" in os.environ:
+            self.server_port = int(os.environ["PORT"])
+
+    def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
+        if "LLAMA_SERVER_BIN_PATH" in os.environ:
+            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
+        elif os.name == "nt":
+            server_path = "../../../build/bin/Release/llama-server.exe"
+        else:
+            server_path = "../../../build/bin/llama-server"
+        server_args = [
+            "--host",
+            self.server_host,
+            "--port",
+            self.server_port,
+            "--temp",
+            self.temperature,
+            "--seed",
+            self.seed,
+        ]
+        if self.model_file:
+            server_args.extend(["--model", self.model_file])
+        if self.model_url:
+            server_args.extend(["--model-url", self.model_url])
+        if self.model_draft:
+            server_args.extend(["--model-draft", self.model_draft])
+        if self.model_hf_repo:
+            server_args.extend(["--hf-repo", self.model_hf_repo])
+        if self.model_hf_file:
+            server_args.extend(["--hf-file", self.model_hf_file])
+        if self.n_batch:
+            server_args.extend(["--batch-size", self.n_batch])
+        if self.n_ubatch:
+            server_args.extend(["--ubatch-size", self.n_ubatch])
+        if self.n_threads:
+            server_args.extend(["--threads", self.n_threads])
+        if self.n_gpu_layer:
+            server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
+        if self.draft is not None:
+            server_args.extend(["--draft", self.draft])
+        if self.server_continuous_batching:
+            server_args.append("--cont-batching")
+        if self.server_embeddings:
+            server_args.append("--embedding")
+        if self.server_reranking:
+            server_args.append("--reranking")
+        if self.server_metrics:
+            server_args.append("--metrics")
+        if self.server_slots:
+            server_args.append("--slots")
+        if self.pooling:
+            server_args.extend(["--pooling", self.pooling])
+        if self.model_alias:
+            server_args.extend(["--alias", self.model_alias])
+        if self.n_ctx:
+            server_args.extend(["--ctx-size", self.n_ctx])
+        if self.n_slots:
+            server_args.extend(["--parallel", self.n_slots])
+        if self.n_predict:
+            server_args.extend(["--n-predict", self.n_predict])
+        if self.slot_save_path:
+            server_args.extend(["--slot-save-path", self.slot_save_path])
+        if self.n_ga:
+            server_args.extend(["--grp-attn-n", self.n_ga])
+        if self.n_ga_w:
+            server_args.extend(["--grp-attn-w", self.n_ga_w])
+        if self.debug:
+            server_args.append("--verbose")
+        if self.lora_files:
+            for lora_file in self.lora_files:
+                server_args.extend(["--lora", lora_file])
+        if self.disable_ctx_shift:
+            server_args.extend(["--no-context-shift"])
+        if self.api_key:
+            server_args.extend(["--api-key", self.api_key])
+        if self.draft_max:
+            server_args.extend(["--draft-max", self.draft_max])
+        if self.draft_min:
+            server_args.extend(["--draft-min", self.draft_min])
+        if self.no_webui:
+            server_args.append("--no-webui")
+        if self.jinja:
+            server_args.append("--jinja")
+        if self.chat_template:
+            server_args.extend(["--chat-template", self.chat_template])
+        if self.chat_template_file:
+            server_args.extend(["--chat-template-file", self.chat_template_file])
+
+        args = [str(arg) for arg in [server_path, *server_args]]
+        print(f"bench: starting server with: {' '.join(args)}")
+
+        flags = 0
+        if "nt" == os.name:
+            flags |= subprocess.DETACHED_PROCESS
+            flags |= subprocess.CREATE_NEW_PROCESS_GROUP
+            flags |= subprocess.CREATE_NO_WINDOW
+
+        self.process = subprocess.Popen(
+            [str(arg) for arg in [server_path, *server_args]],
+            creationflags=flags,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
+            env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
+        )
+        server_instances.add(self)
+
+        print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
+
+        # wait for server to start
+        start_time = time.time()
+        while time.time() - start_time < timeout_seconds:
+            try:
+                response = self.make_request("GET", "/health", headers={
+                    "Authorization": f"Bearer {self.api_key}" if self.api_key else None
+                })
+                if response.status_code == 200:
+                    self.ready = True
+                    return  # server is ready
+            except Exception as e:
+                pass
+            print(f"Waiting for server to start...")
+            time.sleep(0.5)
+        raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")
+
+    def stop(self) -> None:
+        if self in server_instances:
+            server_instances.remove(self)
+        if self.process:
+            print(f"Stopping server with pid={self.process.pid}")
+            self.process.kill()
+            self.process = None
+
+    def make_request(
+        self,
+        method: str,
+        path: str,
+        data: dict | Any | None = None,
+        headers: dict | None = None,
+        timeout: float | None = None,
+    ) -> ServerResponse:
+        url = f"http://{self.server_host}:{self.server_port}{path}"
+        parse_body = False
+        if method == "GET":
+            response = requests.get(url, headers=headers, timeout=timeout)
+            parse_body = True
+        elif method == "POST":
+            response = requests.post(url, headers=headers, json=data, timeout=timeout)
+            parse_body = True
+        elif method == "OPTIONS":
+            response = requests.options(url, headers=headers, timeout=timeout)
+        else:
+            raise ValueError(f"Unimplemented method: {method}")
+        result = ServerResponse()
+        result.headers = dict(response.headers)
+        result.status_code = response.status_code
+        result.body = response.json() if parse_body else None
+        print("Response from server", json.dumps(result.body, indent=2))
+        return result
+
+    def make_stream_request(
+        self,
+        method: str,
+        path: str,
+        data: dict | None = None,
+        headers: dict | None = None,
+    ) -> Iterator[dict]:
+        url = f"http://{self.server_host}:{self.server_port}{path}"
+        if method == "POST":
+            response = requests.post(url, headers=headers, json=data, stream=True)
+        else:
+            raise ValueError(f"Unimplemented method: {method}")
+        for line_bytes in response.iter_lines():
+            line = line_bytes.decode("utf-8")
+            if '[DONE]' in line:
+                break
+            elif line.startswith('data: '):
+                data = json.loads(line[6:])
+                print("Partial response from server", json.dumps(data, indent=2))
+                yield data
+
+
+server_instances: Set[ServerProcess] = set()
+
+
+class ServerPreset:
+    @staticmethod
+    def tinyllama2() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "tinyllamas/stories260K.gguf"
+        server.model_alias = "tinyllama-2"
+        server.n_ctx = 256
+        server.n_batch = 32
+        server.n_slots = 2
+        server.n_predict = 64
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def bert_bge_small() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
+        server.model_alias = "bert-bge-small"
+        server.n_ctx = 512
+        server.n_batch = 128
+        server.n_ubatch = 128
+        server.n_slots = 2
+        server.seed = 42
+        server.server_embeddings = True
+        return server
+
+    @staticmethod
+    def tinyllama_infill() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
+        server.model_alias = "tinyllama-infill"
+        server.n_ctx = 2048
+        server.n_batch = 1024
+        server.n_slots = 1
+        server.n_predict = 64
+        server.temperature = 0.0
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def stories15m_moe() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/stories15M_MOE"
+        server.model_hf_file = "stories15M_MOE-F16.gguf"
+        server.model_alias = "stories15m-moe"
+        server.n_ctx = 2048
+        server.n_batch = 1024
+        server.n_slots = 1
+        server.n_predict = 64
+        server.temperature = 0.0
+        server.seed = 42
+        return server
+
+    @staticmethod
+    def jina_reranker_tiny() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
+        server.model_alias = "jina-reranker"
+        server.n_ctx = 512
+        server.n_batch = 512
+        server.n_slots = 1
+        server.seed = 42
+        server.server_reranking = True
+        return server
+
+
+def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
+    """
+    Run multiple functions in parallel and return results in the same order as calls. Equivalent to Promise.all in JS.
+
+    Example usage:
+
+    results = parallel_function_calls([
+        (func1, (arg1, arg2)),
+        (func2, (arg3, arg4)),
+    ])
+    """
+    results = [None] * len(function_list)
+    exceptions = []
+
+    def worker(index, func, args):
+        try:
+            result = func(*args)
+            results[index] = result
+        except Exception as e:
+            exceptions.append((index, str(e)))
+
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i, (func, args) in enumerate(function_list):
+            future = executor.submit(worker, i, func, args)
+            futures.append(future)
+
+        # Wait for all futures to complete
+        for future in as_completed(futures):
+            pass
+
+    # Check if there were any exceptions
+    if exceptions:
+        print("Exceptions occurred:")
+        for index, error in exceptions:
+            print(f"Function at index {index}: {error}")
+
+    return results
+
+
+def match_regex(regex: str, text: str) -> bool:
+    return (
+        re.compile(
+            regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL
+        ).search(text)
+        is not None
+    )
+
+
+def download_file(url: str, output_file_path: str | None = None) -> str:
+    """
+    Download a file from a URL to a local path. If the file already exists, it will not be downloaded again.
+
+    output_file_path is the local path to save the downloaded file. If not provided, the file will be saved in the root directory.
+
+    Returns the local path of the downloaded file.
+    """
+    file_name = url.split('/').pop()
+    output_file = f'./tmp/{file_name}' if output_file_path is None else output_file_path
+    if not os.path.exists(output_file):
+        print(f"Downloading {url} to {output_file}")
+        wget.download(url, out=output_file)
+        print(f"Done downloading to {output_file}")
+    else:
+        print(f"File already exists at {output_file}")
+    return output_file
+
+
+def is_slow_test_allowed():
+    return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"
diff --git a/examples/server/themes/README.md b/examples/server/themes/README.md
new file mode 100644
index 000000000..62e721a27
--- /dev/null
+++ b/examples/server/themes/README.md
@@ -0,0 +1,5 @@
+# LLaMA.cpp Server Wild Theme
+
+Simple themes directory of sample "public" directories. To try any of these add --path to your run like `server --path=wild`.
+
+![image](wild/wild.png)
diff --git a/examples/server/themes/buttons-top/README.md b/examples/server/themes/buttons-top/README.md
new file mode 100644
index 000000000..808c4cf81
--- /dev/null
+++ b/examples/server/themes/buttons-top/README.md
@@ -0,0 +1,7 @@
+# LLaMA.cpp Server Buttons Top Theme
+
+Simple tweaks to the UI. Chat buttons at the top of the page instead of bottom so you can hit Stop instead of chasing it down the page.
+
+To use simply run server with `--path=themes/buttons_top`
+
+![image](buttons_top.png)
diff --git a/examples/server/themes/buttons-top/buttons_top.png b/examples/server/themes/buttons-top/buttons_top.png
new file mode 100644
index 000000000..c54454519
Binary files /dev/null and b/examples/server/themes/buttons-top/buttons_top.png differ
diff --git a/examples/server/themes/buttons-top/favicon.ico b/examples/server/themes/buttons-top/favicon.ico
new file mode 100644
index 000000000..89e154a0a
Binary files /dev/null and b/examples/server/themes/buttons-top/favicon.ico differ
diff --git a/examples/server/public/index.html b/examples/server/themes/buttons-top/index.html
similarity index 96%
rename from examples/server/public/index.html
rename to examples/server/themes/buttons-top/index.html
index 84038ddce..3fb88fcc8 100644
--- a/examples/server/public/index.html
+++ b/examples/server/themes/buttons-top/index.html
@@ -199,10 +199,10 @@
   <script type="module">
     import {
       html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
-    } from '/index.js';
+    } from './index.js';
 
-    import { llama } from '/completion.js';
-    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
+    import { llama } from './completion.js';
+    import { SchemaConverter } from './json-schema-to-grammar.mjs';
     let selected_image = false;
     var slot_id = -1;
 
@@ -225,7 +225,6 @@
       top_k: 40, // <= 0 to use vocab size
       top_p: 0.95, // 1.0 = disabled
       min_p: 0.05, // 0 = disabled
-      tfs_z: 1.0, // 1.0 = disabled
       typical_p: 1.0, // 1.0 = disabled
       presence_penalty: 0.0, // 0.0 = disabled
       frequency_penalty: 0.0, // 0.0 = disabled
@@ -405,7 +404,7 @@
         throw new Error("already running");
       }
       controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
         const data = chunk.data;
 
         if (data.stop) {
@@ -627,17 +626,20 @@
       const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
       const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
       const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
 
       const grammarJsonSchemaPropOrder = signal('')
       const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
-      const convertJSONSchemaGrammar = () => {
+      const convertJSONSchemaGrammar = async () => {
         try {
-          const schema = JSON.parse(params.value.grammar)
-          const converter = new SchemaConverter(
-            grammarJsonSchemaPropOrder.value
+          let schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter({
+            prop_order: grammarJsonSchemaPropOrder.value
               .split(',')
-              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
-          )
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+            allow_fetch: true,
+          })
+          schema = await converter.resolveRefs(schema, 'input')
           converter.visit(schema, '')
           params.value = {
             ...params.value,
@@ -668,6 +670,15 @@
         `
       };
 
+      const BoolField = ({ label, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
+          </div>
+        `
+      };
+
       const userTemplateReset = (e) => {
         e.preventDefault();
         userTemplateResetToDefaultAndApply()
@@ -774,7 +785,6 @@
           <details>
             <summary>More options</summary>
             <fieldset class="two">
-              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
               ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
               ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
               ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
@@ -1001,6 +1011,10 @@
     }
 
     function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);
 
       return html`
         <div class="mode-${session.value.type}">
@@ -1008,14 +1022,14 @@
             <h1>llama.cpp</h1>
           </header>
 
-          <main id="content">
-            <${chatStarted.value ? ChatLog : ConfigForm} />
-          </main>
-
           <section id="write">
             <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
           </section>
 
+          <main id="content">
+            <${chatStarted.value ? ChatLog : ConfigForm} />
+          </main>
+
           <footer>
             <p><${ModelGenerationInfo} /></p>
             <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
@@ -1036,4 +1050,3 @@
 </body>
 
 </html>
-
diff --git a/examples/server/themes/wild/README.md b/examples/server/themes/wild/README.md
new file mode 100644
index 000000000..560bcc81b
--- /dev/null
+++ b/examples/server/themes/wild/README.md
@@ -0,0 +1,5 @@
+# LLaMA.cpp Server Wild Theme
+
+Simple tweaks to the UI. To use simply run server with `--path=themes/wild`
+
+![image](wild.png)
diff --git a/examples/server/themes/wild/favicon.ico b/examples/server/themes/wild/favicon.ico
new file mode 100644
index 000000000..89e154a0a
Binary files /dev/null and b/examples/server/themes/wild/favicon.ico differ
diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html
new file mode 100644
index 000000000..73f36d4b2
--- /dev/null
+++ b/examples/server/themes/wild/index.html
@@ -0,0 +1,1056 @@
+<html>
+
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
+  <title>llama.cpp - chat</title>
+
+  <style>
+    body {
+      font-family: system-ui;
+      font-size: 90%;
+      background-image: url('llamapattern.png');
+    }
+
+    #container {
+      margin: 0em auto;
+      display: flex;
+      flex-direction: column;
+      justify-content: space-between;
+      height: 100%;
+    }
+
+    main {
+      margin: 3px;
+      display: flex;
+      flex-direction: column;
+      justify-content: space-between;
+      gap: 1em;
+
+      flex-grow: 1;
+      overflow-y: auto;
+
+      border: 1px solid #ccc;
+      border-radius: 5px;
+      padding: 0.5em;
+
+      background-color: rgba(255,255,255,0.9);
+    }
+
+    body {
+      max-width: 600px;
+      min-width: 300px;
+      line-height: 1.2;
+      margin: 0 auto;
+      padding: 0 0.5em;
+    }
+
+    p {
+      overflow-wrap: break-word;
+      word-wrap: break-word;
+      hyphens: auto;
+      margin-top: 0.5em;
+      margin-bottom: 0.5em;
+    }
+
+    #write form {
+      margin: 1em 0 0 0;
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+      align-items: stretch;
+    }
+
+    .right {
+      display: flex;
+      flex-direction: row;
+      gap: 0.5em;
+      justify-content: flex-end;
+    }
+
+    fieldset {
+      border: none;
+      padding: 0;
+      margin: 0;
+    }
+
+    fieldset.two {
+      display: grid;
+      grid-template: "a a";
+      gap: 1em;
+    }
+
+    fieldset.three {
+      display: grid;
+      grid-template: "a a a";
+      gap: 1em;
+    }
+
+    details {
+      border: 1px solid #aaa;
+      border-radius: 4px;
+      padding: 0.5em 0.5em 0;
+      margin-top: 0.5em;
+    }
+
+    summary {
+      font-weight: bold;
+      margin: -0.5em -0.5em 0;
+      padding: 0.5em;
+      cursor: pointer;
+    }
+
+    details[open] {
+      padding: 0.5em;
+    }
+
+    .prob-set {
+      padding: 0.3em;
+      border-bottom: 1px solid #ccc;
+    }
+
+    .popover-content {
+      position: absolute;
+      background-color: white;
+      padding: 0.2em;
+      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+    }
+
+    textarea {
+      padding: 5px;
+      flex-grow: 1;
+      width: 100%;
+    }
+
+    pre code {
+      display: block;
+      background-color: #222;
+      color: #ddd;
+    }
+
+    code {
+      font-family: monospace;
+      padding: 0.1em 0.3em;
+      border-radius: 3px;
+    }
+
+    fieldset label {
+      margin: 0.5em 0;
+      display: block;
+    }
+
+    fieldset label.slim {
+      margin: 0 0.5em;
+      display: inline;
+    }
+
+    header,
+    footer {
+      text-align: center;
+    }
+
+    footer {
+      font-size: 80%;
+      color: #888;
+    }
+
+    .mode-chat textarea[name=prompt] {
+      height: 4.5em;
+    }
+
+    .mode-completion textarea[name=prompt] {
+      height: 10em;
+    }
+
+    [contenteditable] {
+      display: inline-block;
+      white-space: pre-wrap;
+      outline: 0px solid transparent;
+    }
+
+    @keyframes loading-bg-wipe {
+      0% {
+        background-position: 0%;
+      }
+
+      100% {
+        background-position: 100%;
+      }
+    }
+
+    .loading {
+      --loading-color-1: #eeeeee00;
+      --loading-color-2: #eeeeeeff;
+      background-size: 50% 100%;
+      background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
+      animation: loading-bg-wipe 2s linear infinite;
+    }
+
+    @media (prefers-color-scheme: dark) {
+      .loading {
+        --loading-color-1: #22222200;
+        --loading-color-2: #222222ff;
+      }
+
+      .popover-content {
+        background-color: black;
+      }
+    }
+  </style>
+
+  <script type="module">
+    import {
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
+    } from './index.js';
+
+    import { llama } from './completion.js';
+    import { SchemaConverter } from './json-schema-to-grammar.mjs';
+    let selected_image = false;
+    var slot_id = -1;
+
+    const session = signal({
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
+      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
+      historyTemplate: "{{name}}: {{message}}",
+      transcript: [],
+      type: "chat",  // "chat" | "completion"
+      char: "Llama",
+      user: "User",
+      image_selected: ''
+    })
+
+    const params = signal({
+      n_predict: 400,
+      temperature: 0.7,
+      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+      repeat_penalty: 1.18, // 1.0 = disabled
+      top_k: 40, // <= 0 to use vocab size
+      top_p: 0.95, // 1.0 = disabled
+      min_p: 0.05, // 0 = disabled
+      typical_p: 1.0, // 1.0 = disabled
+      presence_penalty: 0.0, // 0.0 = disabled
+      frequency_penalty: 0.0, // 0.0 = disabled
+      mirostat: 0, // 0/1/2
+      mirostat_tau: 5, // target entropy
+      mirostat_eta: 0.1, // learning rate
+      grammar: '',
+      n_probs: 0, // no completion_probabilities,
+      min_keep: 0, // min probs from each sampler,
+      image_data: [],
+      cache_prompt: true,
+      api_key: ''
+    })
+
+    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfully imported.
+
+      console.log('Processing saved templates and updating default template')
+      params.value = { ...params.value, image_data: [] };
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Resetting template to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      session.value = { ...session.value, image_selected: '' };
+      params.value = t.data.params;
+      params.value = { ...params.value, image_data: [] };
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
+
+    const llamaStats = signal(null)
+    const controller = signal(null)
+
+    // currently generating a completion?
+    const generating = computed(() => controller.value != null)
+
+    // has the user started a chat?
+    const chatStarted = computed(() => session.value.transcript.length > 0)
+
+    const transcriptUpdate = (transcript) => {
+      session.value = {
+        ...session.value,
+        transcript
+      }
+    }
+
+    // simple template replace
+    const template = (str, extraSettings) => {
+      let settings = session.value;
+      if (extraSettings) {
+        settings = { ...settings, ...extraSettings };
+      }
+      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
+    }
+
+    async function runLlama(prompt, llamaParams, char) {
+      const currentMessages = [];
+      const history = session.value.transcript;
+      if (controller.value) {
+        throw new Error("already running");
+      }
+      controller.value = new AbortController();
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
+        const data = chunk.data;
+
+        if (data.stop) {
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          slot_id = data.slot_id;
+          if (selected_image && !data.multimodal) {
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
+            return;
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+        }
+
+        if (data.timings) {
+          llamaStats.value = data;
+        }
+      }
+
+      controller.value = null;
+    }
+
+    // send message to server
+    const chat = async (msg) => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+
+      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
+
+      let prompt = template(session.value.template, {
+        message: msg,
+        history: session.value.transcript.flatMap(
+          ([name, data]) =>
+            template(
+              session.value.historyTemplate,
+              {
+                name,
+                message: Array.isArray(data) ?
+                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
+                  data,
+              }
+            )
+        ).join("\n"),
+      });
+      if (selected_image) {
+        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+      }
+      await runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
+      }, "{{char}}");
+    }
+
+    const runCompletion = () => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
+      }
+      const { prompt } = session.value;
+      transcriptUpdate([...session.value.transcript, ["", prompt]]);
+      runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: [],
+      }, "").finally(() => {
+        session.value.prompt = session.value.transcript.map(([_, data]) =>
+          Array.isArray(data) ? data.map(msg => msg.content).join('') : data
+        ).join('');
+        session.value.transcript = [];
+      })
+    }
+
+    const stop = (e) => {
+      e.preventDefault();
+      if (controller.value) {
+        controller.value.abort();
+        controller.value = null;
+      }
+    }
+
+    const reset = (e) => {
+      stop(e);
+      transcriptUpdate([]);
+    }
+
+    const uploadImage = (e) => {
+      e.preventDefault();
+      document.getElementById("fileInput").click();
+      document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+          const reader = new FileReader();
+          reader.onload = function () {
+            const image_data = reader.result;
+            session.value = { ...session.value, image_selected: image_data };
+            params.value = {
+              ...params.value, image_data: [
+                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+            }
+          };
+          selected_image = true;
+          reader.readAsDataURL(selectedFile);
+        }
+      });
+    }
+
+    function MessageInput() {
+      const message = useSignal("")
+
+      const submit = (e) => {
+        stop(e);
+        chat(message.value);
+        message.value = "";
+      }
+
+      const enterSubmits = (event) => {
+        if (event.which === 13 && !event.shiftKey) {
+          submit(event);
+        }
+      }
+
+      return html`
+        <form onsubmit=${submit}>
+          <div>
+            <textarea
+               className=${generating.value ? "loading" : null}
+               oninput=${(e) => message.value = e.target.value}
+               onkeypress=${enterSubmits}
+               placeholder="Say something..."
+               rows=2
+               type="text"
+               value="${message}"
+            />
+          </div>
+          <div class="right">
+            <button type="submit" disabled=${generating.value}>Send</button>
+            <button onclick=${uploadImage}>Upload Image</button>
+            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+            <button onclick=${reset}>Reset</button>
+          </div>
+        </form>
+      `
+    }
+
+    function CompletionControls() {
+      const submit = (e) => {
+        stop(e);
+        runCompletion();
+      }
+      return html`
+        <div>
+          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
+          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+          <button onclick=${reset}>Reset</button>
+        </div>`;
+    }
+
+    const ChatLog = (props) => {
+      const messages = session.value.transcript;
+      const container = useRef(null)
+
+      useEffect(() => {
+        // scroll to bottom (if needed)
+        const parent = container.current.parentElement;
+        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
+          parent.scrollTo(0, parent.scrollHeight)
+        }
+      }, [messages])
+
+      const isCompletionMode = session.value.type === 'completion'
+      const chatLine = ([user, data], index) => {
+        let message
+        const isArrayMessage = Array.isArray(data)
+        if (params.value.n_probs > 0 && isArrayMessage) {
+          message = html`<${Probabilities} data=${data} />`
+        } else {
+          const text = isArrayMessage ?
+            data.map(msg => msg.content).join('').replace(/^\s+/, '') :
+            data;
+          message = isCompletionMode ?
+            text :
+            html`<${Markdownish} text=${template(text)} />`
+        }
+        if (user) {
+          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+        } else {
+          return isCompletionMode ?
+            html`<span key=${index}>${message}</span>` :
+            html`<p key=${index}>${message}</p>`
+        }
+      };
+
+      const handleCompletionEdit = (e) => {
+        session.value.prompt = e.target.innerText;
+        session.value.transcript = [];
+      }
+
+      return html`
+        <div id="chat" ref=${container} key=${messages.length}>
+          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
+          <span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
+            ${messages.flatMap(chatLine)}
+          </span>
+        </div>`;
+    };
+
+    const ConfigForm = (props) => {
+      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
+      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
+      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
+      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
+
+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = async () => {
+        try {
+          let schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter({
+            prop_order: grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+            allow_fetch: true,
+          })
+          schema = await converter.resolveRefs(schema, 'input')
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
+      const FloatField = ({ label, max, min, name, step, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const IntField = ({ label, max, min, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const BoolField = ({ label, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
+          </div>
+        `
+      };
+
+      const userTemplateReset = (e) => {
+        e.preventDefault();
+        userTemplateResetToDefaultAndApply()
+      }
+
+      const UserTemplateResetButton = () => {
+        if (selectedUserTemplate.value.name == 'default') {
+          return html`
+            <button disabled>Using default template</button>
+          `
+        }
+
+        return html`
+          <button onclick=${userTemplateReset}>Reset all to default</button>
+        `
+      };
+
+      useEffect(() => {
+        // autosave template on every change
+        userTemplateAutosave()
+      }, [session.value, params.value])
+
+      const GrammarControl = () => (
+        html`
+          <div>
+            <label for="template">Grammar</label>
+            <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+            <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+            <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+          </div>
+          `
+      );
+
+      const PromptControlFieldSet = () => (
+        html`
+        <fieldset>
+          <div>
+            <label htmlFor="prompt">Prompt</label>
+            <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
+          </div>
+        </fieldset>
+        `
+      );
+
+      const ChatConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+
+          <fieldset class="two">
+            <div>
+              <label for="user">User name</label>
+              <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
+            </div>
+
+            <div>
+              <label for="bot">Bot name</label>
+              <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
+            </div>
+          </fieldset>
+
+          <fieldset>
+            <div>
+              <label for="template">Prompt template</label>
+              <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
+            </div>
+
+            <div>
+              <label for="template">Chat history template</label>
+              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
+            </div>
+            ${GrammarControl()}
+          </fieldset>
+      `
+      );
+
+      const CompletionConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+          <fieldset>${GrammarControl()}</fieldset>
+        `
+      );
+
+      return html`
+        <form>
+          <fieldset class="two">
+            <${UserTemplateResetButton}/>
+            <div>
+              <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
+              <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
+            </div>
+          </fieldset>
+
+          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
+
+          <fieldset class="two">
+            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
+            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
+            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
+          </fieldset>
+          <details>
+            <summary>More options</summary>
+            <fieldset class="two">
+              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
+            </fieldset>
+            <hr />
+            <fieldset class="three">
+              <div>
+                <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
+                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
+                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
+              </div>
+              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
+            </fieldset>
+            <fieldset>
+              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
+            </fieldset>
+            <fieldset>
+              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+            </fieldset>
+            <fieldset>
+              <label for="api_key">API Key</label>
+              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+            </fieldset>
+          </details>
+        </form>
+      `
+    }
+
+    const probColor = (p) => {
+      const r = Math.floor(192 * (1 - p));
+      const g = Math.floor(192 * p);
+      return `rgba(${r},${g},0,0.3)`;
+    }
+
+    const Probabilities = (params) => {
+      return params.data.map(msg => {
+        const { completion_probabilities } = msg;
+        if (
+          !completion_probabilities ||
+          completion_probabilities.length === 0
+        ) return msg.content
+
+        if (completion_probabilities.length > 1) {
+          // Not for byte pair
+          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
+
+          const splitData = completion_probabilities.map(prob => ({
+            content: prob.content,
+            completion_probabilities: [prob]
+          }))
+          return html`<${Probabilities} data=${splitData} />`
+        }
+
+        const { probs, content } = completion_probabilities[0]
+        const found = probs.find(p => p.tok_str === msg.content)
+        const pColor = found ? probColor(found.prob) : 'transparent'
+
+        const popoverChildren = html`
+          <div class="prob-set">
+            ${probs.map((p, index) => {
+          return html`
+                <div
+                  key=${index}
+                  title=${`prob: ${p.prob}`}
+                  style=${{
+              padding: '0.3em',
+              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+            }}
+                >
+                  <span>${p.tok_str}: </span>
+                  <span>${Math.floor(p.prob * 100)}%</span>
+                </div>
+              `
+        })}
+          </div>
+        `
+
+        return html`
+          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
+            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
+          </>
+        `
+      });
+    }
+
+    // poor mans markdown replacement
+    const Markdownish = (params) => {
+      const md = params.text
+        .replace(/&/g, '&amp;')
+        .replace(/</g, '&lt;')
+        .replace(/>/g, '&gt;')
+        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
+        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
+        .replace(/__(.*?)__/g, '<strong>$1</strong>')
+        .replace(/\*(.*?)\*/g, '<em>$1</em>')
+        .replace(/_(.*?)_/g, '<em>$1</em>')
+        .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
+        .replace(/`(.*?)`/g, '<code>$1</code>')
+        .replace(/\n/gim, '<br />');
+      return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
+    };
+
+    const ModelGenerationInfo = (params) => {
+      if (!llamaStats.value) {
+        return html`<span/>`
+      }
+      return html`
+        <span>
+          ${llamaStats.value.tokens_predicted} predicted, ${llamaStats.value.tokens_cached} cached, ${llamaStats.value.timings.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.timings.predicted_per_second.toFixed(2)} tokens per second
+        </span>
+      `
+    }
+
+    // simple popover impl
+    const Popover = (props) => {
+      const isOpen = useSignal(false);
+      const position = useSignal({ top: '0px', left: '0px' });
+      const buttonRef = useRef(null);
+      const popoverRef = useRef(null);
+
+      const togglePopover = () => {
+        if (buttonRef.current) {
+          const rect = buttonRef.current.getBoundingClientRect();
+          position.value = {
+            top: `${rect.bottom + window.scrollY}px`,
+            left: `${rect.left + window.scrollX}px`,
+          };
+        }
+        isOpen.value = !isOpen.value;
+      };
+
+      const handleClickOutside = (event) => {
+        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
+          isOpen.value = false;
+        }
+      };
+
+      useEffect(() => {
+        document.addEventListener('mousedown', handleClickOutside);
+        return () => {
+          document.removeEventListener('mousedown', handleClickOutside);
+        };
+      }, []);
+
+      return html`
+        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
+        ${isOpen.value && html`
+          <${Portal} into="#portal">
+            <div
+              ref=${popoverRef}
+              class="popover-content"
+              style=${{
+            top: position.value.top,
+            left: position.value.left,
+          }}
+            >
+              ${props.popoverChildren}
+            </div>
+          </${Portal}>
+        `}
+      `;
+    };
+
+    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
+    /** Redirect rendering of descendants into the given CSS selector */
+    class Portal extends Component {
+      componentDidUpdate(props) {
+        for (let i in props) {
+          if (props[i] !== this.props[i]) {
+            return setTimeout(this.renderLayer);
+          }
+        }
+      }
+
+      componentDidMount() {
+        this.isMounted = true;
+        this.renderLayer = this.renderLayer.bind(this);
+        this.renderLayer();
+      }
+
+      componentWillUnmount() {
+        this.renderLayer(false);
+        this.isMounted = false;
+        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
+      }
+
+      findNode(node) {
+        return typeof node === 'string' ? document.querySelector(node) : node;
+      }
+
+      renderLayer(show = true) {
+        if (!this.isMounted) return;
+
+        // clean up old node if moving bases:
+        if (this.props.into !== this.intoPointer) {
+          this.intoPointer = this.props.into;
+          if (this.into && this.remote) {
+            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
+          }
+          this.into = this.findNode(this.props.into);
+        }
+
+        this.remote = render(html`
+          <${PortalProxy} context=${this.context}>
+            ${show && this.props.children || null}
+          </${PortalProxy}>
+        `, this.into, this.remote);
+      }
+
+      render() {
+        return null;
+      }
+    }
+    // high-order component that renders its first child if it exists.
+    // used as a conditional rendering proxy.
+    class PortalProxy extends Component {
+      getChildContext() {
+        return this.props.context;
+      }
+      render({ children }) {
+        return children || null;
+      }
+    }
+
+    function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);
+
+      return html`
+        <div class="mode-${session.value.type}">
+          <header>
+            <img src="llama_cpp.png" style="width:100%"/>
+          </header>
+
+          <section id="write">
+            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
+          </section>
+
+          <main id="content">
+            <${chatStarted.value ? ChatLog : ConfigForm} />
+          </main>
+
+
+          <footer>
+            <p><${ModelGenerationInfo} /></p>
+            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
+          </footer>
+        </div>
+      `;
+    }
+
+    render(h(App), document.querySelector('#container'));
+  </script>
+</head>
+
+<body>
+  <div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  </div>
+  <div id="portal"></div>
+</body>
+
+</html>
diff --git a/examples/server/themes/wild/llama_cpp.png b/examples/server/themes/wild/llama_cpp.png
new file mode 100644
index 000000000..bad1dc9fc
Binary files /dev/null and b/examples/server/themes/wild/llama_cpp.png differ
diff --git a/examples/server/themes/wild/llamapattern.png b/examples/server/themes/wild/llamapattern.png
new file mode 100644
index 000000000..2a159ce6a
Binary files /dev/null and b/examples/server/themes/wild/llamapattern.png differ
diff --git a/examples/server/themes/wild/wild.png b/examples/server/themes/wild/wild.png
new file mode 100644
index 000000000..46ffa0f3e
Binary files /dev/null and b/examples/server/themes/wild/wild.png differ
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index d98541f26..5f97df5fd 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1,444 +1,387 @@
 #pragma once
 
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "common/base64.hpp"
+
+// increase max payload length to allow use of larger context size
+#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+#include "httplib.h"
+
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+#include "minja.hpp"
+#include "chat.hpp"
+#include "chat-template.hpp"
+
+#include <random>
+#include <sstream>
 #include <string>
 #include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
+#include <memory>
 
-#include "json.hpp"
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
 
-#include "../llava/clip.h"
+using json = nlohmann::ordered_json;
 
-using json = nlohmann::json;
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
 
-extern bool server_verbose;
-extern bool server_log_json;
+#define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
+#define QUE_INF(fmt, ...) LOG_INF("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...)                                            \
-    do                                                                   \
-    {                                                                    \
-        if (server_verbose)                                              \
-        {                                                                \
-            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
-        }                                                                \
-    } while (0)
-#endif
-
-#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-
-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-    SERVER_STATE_ERROR           // An error occurred, load_model failed
-};
-
-enum task_type {
-    TASK_TYPE_COMPLETION,
-    TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE,
-    TASK_TYPE_METRICS
-};
-
-struct task_server {
-    int id = -1; // to be filled by llama_server_queue
-    int target_id;
-    task_type type;
-    json data;
-    bool infill_mode = false;
-    bool embedding_mode = false;
-    int multitask_id = -1;
-};
-
-struct task_result {
-    int id;
-    int multitask_id = -1;
-    bool stop;
-    bool error;
-    json result_json;
-};
-
-struct task_multi {
-    int id;
-    std::set<int> subtasks_remaining{};
-    std::vector<task_result> results{};
-};
-
-// completion token output with probabilities
-struct completion_token_output {
-    struct token_prob
-    {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-    llama_token tok;
-    std::string text_to_send;
-};
-
-struct token_translator {
-    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
-    std::stringstream ss_tid;
-    ss_tid << std::this_thread::get_id();
-    json log = nlohmann::ordered_json{
-        {"tid", ss_tid.str()},
-        {"timestamp", time(nullptr)},
-    };
-
-    if (server_log_json) {
-        log.merge_patch(
-                {
-                        {"level",     level},
-                        {"function",  function},
-                        {"line",      line},
-                        {"msg",       message},
-                });
-        if (!extra.empty()) {
-            log.merge_patch(extra);
+template <typename T>
+static T json_value(const json & body, const std::string & key, const T & default_value) {
+    // Fallback null to default value
+    if (body.contains(key) && !body.at(key).is_null()) {
+        try {
+            return body.at(key);
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+            return default_value;
         }
-
-        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
     } else {
-        char buf[1024];
-        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-        std::stringstream ss;
-        ss << buf << " |";
-        for (const auto& el : log.items())
-        {
-            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
-            snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
-            ss << buf;
-        }
-
-        const std::string str = ss.str();
-        printf("%.*s\n", (int)str.size(), str.data());
-        fflush(stdout);
+        return default_value;
     }
 }
 
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 //
-// server utils
+// tokenizer and input processing utils
 //
 
-template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
-    // Fallback null to default value
-    return body.contains(key) && !body.at(key).is_null()
-        ? body.value(key, default_value)
-        : default_value;
+static bool json_is_array_of_numbers(const json & data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number_integer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
 }
 
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string & tmpl) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    std::vector<char> buf(1);
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
-    return res >= 0;
+// is array having BOTH numbers & strings?
+static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+    bool seen_string = false;
+    bool seen_number = false;
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            seen_string |= e.is_string();
+            seen_number |= e.is_number_integer();
+            if (seen_number && seen_string) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// get value by path(key1 / key2)
+static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
+    json result = json::object();
+
+    for (const std::string & path : paths) {
+        json current = js;
+        const auto keys = string_split<std::string>(path, /*separator*/ '/');
+        bool valid_path = true;
+        for (const std::string & k : keys) {
+            if (valid_path && current.is_object() && current.contains(k)) {
+                current = current[k];
+            } else {
+                valid_path = false;
+            }
+        }
+        if (valid_path) {
+            result[path] = current;
+        }
+    }
+    return result;
+}
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
+    // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+    // or the first element of the json_prompt array is a string.
+    llama_tokens prompt_tokens;
+
+    if (json_prompt.is_array()) {
+        bool first = true;
+        for (const auto & p : json_prompt) {
+            if (p.is_string()) {
+                auto s = p.template get<std::string>();
+
+                llama_tokens p;
+                if (first) {
+                    p = common_tokenize(vocab, s, add_special, parse_special);
+                    first = false;
+                } else {
+                    p = common_tokenize(vocab, s, false, parse_special);
+                }
+
+                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+            } else {
+                if (first) {
+                    first = false;
+                }
+
+                prompt_tokens.push_back(p.template get<llama_token>());
+            }
+        }
+    } else {
+        auto s = json_prompt.template get<std::string>();
+        prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
+    }
+
+    return prompt_tokens;
+}
+
+/**
+ * break the input "prompt" object into multiple prompt if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, 56], [78, 90, 12]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+ */
+static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<llama_tokens> result;
+    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+        // string or mixed
+        result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
+    } else if (json_is_array_of_numbers(json_prompt)) {
+        // array of tokens
+        result.push_back(json_prompt.get<llama_tokens>());
+    } else if (json_prompt.is_array()) {
+        // array of prompts
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+                result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
+            } else if (json_is_array_of_numbers(p)) {
+                // array of tokens
+                result.push_back(p.get<llama_tokens>());
+            } else {
+                throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
+            }
+        }
+    } else {
+        throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
+    }
+    if (result.empty()) {
+        throw std::runtime_error("\"prompt\" must not be empty");
+    }
+    return result;
+}
+
+// return the last index of character that can form a valid string
+// if the last character is potentially cut in half, return the index before the cut
+// if validate_utf8(text) == text.size(), then the whole text is valid utf8
+static size_t validate_utf8(const std::string& text) {
+    size_t len = text.size();
+    if (len == 0) return 0;
+
+    // Check the last few bytes to see if a multi-byte character is cut off
+    for (size_t i = 1; i <= 4 && i <= len; ++i) {
+        unsigned char c = text[len - i];
+        // Check for start of a multi-byte sequence from the end
+        if ((c & 0xE0) == 0xC0) {
+            // 2-byte character start: 110xxxxx
+            // Needs at least 2 bytes
+            if (i < 2) return len - i;
+        } else if ((c & 0xF0) == 0xE0) {
+            // 3-byte character start: 1110xxxx
+            // Needs at least 3 bytes
+            if (i < 3) return len - i;
+        } else if ((c & 0xF8) == 0xF0) {
+            // 4-byte character start: 11110xxx
+            // Needs at least 4 bytes
+            if (i < 4) return len - i;
+        }
+    }
+
+    // If no cut-off multi-byte character is found, return full length
+    return len;
+}
+
+//
+// template utils
+//
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
+    llama_tokens result;
+
+    result.reserve(doc.size() + query.size() + 4);
+    result.push_back(llama_vocab_bos(vocab));
+    result.insert(result.end(), query.begin(), query.end());
+    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(llama_vocab_sep(vocab));
+    result.insert(result.end(), doc.begin(), doc.end());
+    result.push_back(llama_vocab_eos(vocab));
+
+    return result;
+}
+
+// format infill task
+static llama_tokens format_infill(
+        const llama_vocab * vocab,
+        const json & input_prefix,
+        const json & input_suffix,
+        const json & input_extra,
+        const int n_batch,
+        const int n_predict,
+        const int n_ctx,
+        const bool spm_infill,
+        const llama_tokens & tokens_prompt
+    ) {
+    // TODO: optimize this block by reducing memory allocations and movement
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
+
+    auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
+
+    if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
+        // TODO: make project name an input
+        static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
+
+        extra_tokens.push_back(llama_vocab_fim_rep(vocab));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+    }
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text     = json_value(chunk, "text",     std::string());
+        const std::string filename = json_value(chunk, "filename", std::string("tmp"));
+
+        if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+            static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(vocab, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
+
+    if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
+
+        extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(),                3*(n_batch/4));
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
+
+    SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
+    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+    if (llama_vocab_get_add_bos(vocab)) {
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
+    }
+
+    SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_vocab_fim_mid(vocab));
+
+    return embd_inp;
 }
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
+    std::vector<common_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
-        auto &curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        const auto & curr_msg = messages[i];
+
+        std::string role = json_value(curr_msg, "role", std::string(""));
+
+        std::string content;
+        if (curr_msg.contains("content")) {
+            if (curr_msg["content"].is_string()) {
+                content = curr_msg["content"].get<std::string>();
+            } else if (curr_msg["content"].is_array()) {
+                for (const auto & part : curr_msg["content"]) {
+                    if (part.contains("text")) {
+                        content += "\n" + part["text"].get<std::string>();
+                    }
+                }
+            } else {
+                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+            }
+        } else {
+            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+        }
+
+        chat.push_back({role, content, /* tool_calls= */ {}});
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    std::string formatted_chat(buf.data(), res);
-    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
+    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
+    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
 
     return formatted_chat;
 }
 
-//
-// work queue utils
-//
-
-struct llama_server_queue {
-    int id = 0;
-    std::mutex mutex_tasks;
-    bool running;
-    // queues
-    std::vector<task_server> queue_tasks;
-    std::vector<task_server> queue_tasks_deferred;
-    std::vector<task_multi> queue_multitasks;
-    std::condition_variable condition_tasks;
-    // callback functions
-    std::function<void(task_server&)> callback_new_task;
-    std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_run_slots;
-
-    // Add a new task to the end of the queue
-    int post(task_server task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        if (task.id == -1) {
-            task.id = id++;
-            LOG_VERBOSE("new task id", {{"new_id", task.id}});
-        }
-        queue_tasks.push_back(std::move(task));
-        condition_tasks.notify_one();
-        return task.id;
-    }
-
-    // Add a new task, but defer until one slot is available
-    void defer(task_server task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        queue_tasks_deferred.push_back(std::move(task));
-    }
-
-    // Get the next id for creating anew task
-    int get_new_id() {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        int new_id = id++;
-        LOG_VERBOSE("new task id", {{"new_id", new_id}});
-        return new_id;
-    }
-
-    // Register function to process a new task
-    void on_new_task(std::function<void(task_server&)> callback) {
-        callback_new_task = callback;
-    }
-
-    // Register function to process a multitask when it is finished
-    void on_finish_multitask(std::function<void(task_multi&)> callback) {
-        callback_finish_multitask = callback;
-    }
-
-    // Register the function to be called when all slots data is ready to be processed
-    void on_run_slots(std::function<void(void)> callback) {
-        callback_run_slots = callback;
-    }
-
-    // Call when the state of one slot is changed
-    void notify_slot_changed() {
-        // move deferred tasks back to main loop
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        for (auto & task : queue_tasks_deferred) {
-            queue_tasks.push_back(std::move(task));
-        }
-        queue_tasks_deferred.clear();
-    }
-
-    // end the start_loop routine
-    void terminate() {
-        {
-            std::unique_lock<std::mutex> lock(mutex_tasks);
-            running = false;
-        }
-        condition_tasks.notify_all();
-    }
-
-    /**
-     * Main loop consists of these steps:
-     * - Wait until a new task arrives
-     * - Process the task (i.e. maybe copy data into slot)
-     * - Check if multitask is finished
-     * - Run all slots
-     */
-    void start_loop() {
-        running = true;
-        while (true) {
-            LOG_VERBOSE("new task may arrive", {});
-            {
-                while (true)
-                {
-                    std::unique_lock<std::mutex> lock(mutex_tasks);
-                    if (queue_tasks.empty()) {
-                        lock.unlock();
-                        break;
-                    }
-                    task_server task = queue_tasks.front();
-                    queue_tasks.erase(queue_tasks.begin());
-                    lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
-                    callback_new_task(task);
-                }
-                LOG_VERBOSE("update_multitasks", {});
-                // check if we have any finished multitasks
-                auto queue_iterator = queue_multitasks.begin();
-                while (queue_iterator != queue_multitasks.end())
-                {
-                    if (queue_iterator->subtasks_remaining.empty())
-                    {
-                        // all subtasks done == multitask is done
-                        task_multi current_multitask = *queue_iterator;
-                        callback_finish_multitask(current_multitask);
-                        // remove this multitask
-                        queue_iterator = queue_multitasks.erase(queue_iterator);
-                    }
-                    else
-                    {
-                        ++queue_iterator;
-                    }
-                }
-                // all tasks in the current loop is processed, slots data is now ready
-                LOG_VERBOSE("callback_run_slots", {});
-                callback_run_slots();
-            }
-            LOG_VERBOSE("wait for new task", {});
-            // wait for new task
-            {
-                std::unique_lock<std::mutex> lock(mutex_tasks);
-                if (queue_tasks.empty()) {
-                    if (!running) {
-                        LOG_VERBOSE("ending start_loop", {});
-                        return;
-                    }
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
-                }
-            }
-        }
-    }
-
-    //
-    // functions to manage multitasks
-    //
-
-    // add a multitask by specifying the id of all subtask (subtask is a task_server)
-    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        task_multi multi;
-        multi.id = multitask_id;
-        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
-        queue_multitasks.push_back(multi);
-    }
-
-    // updatethe remaining subtasks, while appending results to multitask
-    void update_multitask(int multitask_id, int subtask_id, task_result& result)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        for (auto& multitask : queue_multitasks)
-        {
-            if (multitask.id == multitask_id)
-            {
-                multitask.subtasks_remaining.erase(subtask_id);
-                multitask.results.push_back(result);
-            }
-        }
-    }
-};
-
-struct llama_server_response {
-    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
-    callback_multitask_t callback_update_multitask;
-    // for keeping track of all tasks waiting for the result
-    std::set<int> waiting_task_ids;
-    // the main result queue
-    std::vector<task_result> queue_results;
-    std::mutex mutex_results;
-    std::condition_variable condition_results;
-
-    // add the task_id to the list of tasks waiting for response
-    void add_waiting_task_id(int task_id) {
-        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.insert(task_id);
-    }
-
-    // when the request is finished, we can remove task associated with it
-    void remove_waiting_task_id(int task_id) {
-        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.erase(task_id);
-    }
-
-    // This function blocks the thread until there is a response for this task_id
-    task_result recv(int task_id) {
-        while (true)
-        {
-            std::unique_lock<std::mutex> lock(mutex_results);
-            condition_results.wait(lock, [&]{
-                return !queue_results.empty();
-            });
-
-            for (int i = 0; i < (int) queue_results.size(); i++)
-            {
-                if (queue_results[i].id == task_id)
-                {
-                    assert(queue_results[i].multitask_id == -1);
-                    task_result res = queue_results[i];
-                    queue_results.erase(queue_results.begin() + i);
-                    return res;
-                }
-            }
-        }
-
-        // should never reach here
-    }
-
-    // Register the function to update multitask
-    void on_multitask_update(callback_multitask_t callback) {
-        callback_update_multitask = callback;
-    }
-
-    // Send a new result to a waiting task_id
-    void send(task_result result) {
-        std::unique_lock<std::mutex> lock(mutex_results);
-        LOG_VERBOSE("send new result", {{"task_id", result.id}});
-        for (auto& task_id : waiting_task_ids) {
-            // LOG_TEE("waiting task id %i \n", task_id);
-            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
-            if (result.multitask_id == task_id)
-            {
-                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
-                callback_update_multitask(task_id, result.id, result);
-                continue;
-            }
-
-            if (result.id == task_id)
-            {
-                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
-                queue_results.push_back(result);
-                condition_results.notify_all();
-                return;
-            }
-        }
-    }
-};
-
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -448,13 +391,11 @@ static const std::string base64_chars =
              "abcdefghijklmnopqrstuvwxyz"
              "0123456789+/";
 
-static inline bool is_base64(uint8_t c)
-{
+static inline bool is_base64(uint8_t c) {
     return (isalnum(c) || (c == '+') || (c == '/'));
 }
 
-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
-{
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
@@ -466,13 +407,10 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
 
     std::vector<uint8_t> ret;
 
-    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
-    {
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
-        if (i == 4)
-        {
-            for (i = 0; i <4; i++)
-            {
+        if (i == 4) {
+            for (i = 0; i < 4; i++) {
                 char_array_4[i] = base64_chars.find(char_array_4[i]);
             }
 
@@ -480,23 +418,20 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
             char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
             char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
 
-            for (i = 0; (i < 3); i++)
-            {
+            for (i = 0; (i < 3); i++) {
                 ret.push_back(char_array_3[i]);
             }
+
             i = 0;
         }
     }
 
-    if (i)
-    {
-        for (j = i; j <4; j++)
-        {
+    if (i) {
+        for (j = i; j < 4; j++) {
             char_array_4[j] = 0;
         }
 
-        for (j = 0; j <4; j++)
-        {
+        for (j = 0; j < 4; j++) {
             char_array_4[j] = base64_chars.find(char_array_4[j]);
         }
 
@@ -504,8 +439,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
         char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
         char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
 
-        for (j = 0; (j < i - 1); j++)
-        {
+        for (j = 0; j < i - 1; j++) {
             ret.push_back(char_array_3[j]);
         }
     }
@@ -517,8 +451,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
 // random string / id
 //
 
-static std::string random_string()
-{
+static std::string random_string() {
     static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
 
     std::random_device rd;
@@ -533,102 +466,432 @@ static std::string random_string()
     return result;
 }
 
-static std::string gen_chatcmplid()
-{
-    std::stringstream chatcmplid;
-    chatcmplid << "chatcmpl-" << random_string();
-    return chatcmplid.str();
+static std::string gen_chatcmplid() {
+    return "chatcmpl-" + random_string();
 }
 
 //
 // other common utils
 //
 
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
-    {
-    }
-    return i;
+static bool ends_with(const std::string & str, const std::string & suffix) {
+    return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
 
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
-    return str.size() >= suffix.size() &&
-           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
-}
-
-static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
-{
-    if (!text.empty() && !stop.empty())
-    {
+static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
+    if (!text.empty() && !stop.empty()) {
         const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
-        {
-            if (stop[char_index] == text_last_char)
-            {
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
                 const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
-                {
+                if (ends_with(text, current_partial)) {
                     return text.size() - char_index - 1;
                 }
             }
         }
     }
+
     return std::string::npos;
 }
 
 // TODO: reuse llama_detokenize
 template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     std::string ret;
-    for (; begin != end; ++begin)
-    {
-        ret += llama_token_to_piece(ctx, *begin);
+    for (; begin != end; ++begin) {
+        ret += common_token_to_piece(ctx, *begin);
     }
+
     return ret;
 }
 
 // format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
+    std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
+
     // if the size is 1 and first bit is 1, meaning it's a partial character
     //   (size > 1 meaning it's already a known token)
-    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
-    {
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
         std::stringstream ss;
         ss << std::hex << (out[0] & 0xff);
         std::string res(ss.str());
         out = "byte: \\x" + res;
     }
+
     return out;
 }
 
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
+static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
+    const std::string str =
+        std::string(event) + ": " +
+        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+        "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
+
+    LOG_DBG("data stream, to_send: %s", str.c_str());
+
+    return sink.write(str.c_str(), str.size());
+}
+
+//
+// OAI utils
+//
+
+static json oaicompat_completion_params_parse(const json & body) {
+    json llama_params;
+
+    if (!body.contains("prompt")) {
+        throw std::runtime_error("\"prompt\" is required");
+    }
+
+    // Handle "stop" field
+    if (body.contains("stop") && body.at("stop").is_string()) {
+        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Handle "n" field
+    int n_choices = json_value(body, "n", 1);
+    if (n_choices != 1) {
+        throw std::runtime_error("Only one completion choice is allowed");
+    }
+
+    // Params supported by OAI but unsupported by llama.cpp
+    static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
+    for (const auto & param : unsupported_params) {
+        if (body.contains(param)) {
+            throw std::runtime_error("Unsupported param: " + param);
+        }
+    }
+
+    // Copy remaining properties to llama_params
+    for (const auto & item : body.items()) {
+        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
+        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
+            llama_params[item.key()] = item.value();
+        }
+    }
+
+    return llama_params;
+}
+
+static json oaicompat_completion_params_parse(
+    const json & body, /* openai api json semantics */
+    bool use_jinja,
+    const common_chat_templates & chat_templates)
 {
-    json out = json::array();
-    for (const auto &prob : probs)
-    {
-        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
-        {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
-            {
-                {"tok_str", tok_str},
-                {"prob",    p.prob},
+    json llama_params;
+    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
+        ? *chat_templates.template_tool_use
+        : *chat_templates.template_default;
+
+    auto tools = json_value(body, "tools", json());
+    auto stream = json_value(body, "stream", false);
+
+    if (tools.is_array() && !tools.empty()) {
+        if (stream) {
+            throw std::runtime_error("Cannot use tools with stream");
+        }
+        if (!use_jinja) {
+            throw std::runtime_error("tools param requires --jinja flag");
+        }
+    }
+    if (!use_jinja) {
+        if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
+            throw std::runtime_error("Unsupported param: tool_choice");
+        }
+    }
+
+    // Handle "stop" field
+    if (body.contains("stop") && body.at("stop").is_string()) {
+        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Handle "response_format" field
+    if (body.contains("response_format")) {
+        json response_format      = json_value(body, "response_format", json::object());
+        std::string response_type = json_value(response_format, "type", std::string());
+        if (response_type == "json_object") {
+            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+        } else if (response_type == "json_schema") {
+            json json_schema = json_value(response_format, "json_schema", json::object());
+            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
+        } else if (!response_type.empty() && response_type != "text") {
+            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+        }
+    }
+
+    // Apply chat template to the list of messages
+    if (use_jinja) {
+        auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
+        if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
+            throw std::runtime_error("Invalid tool_choice: " + tool_choice);
+        }
+        if (tool_choice != "none" && llama_params.contains("grammar")) {
+            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+        }
+        common_chat_inputs inputs;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
+        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+            inputs.parallel_tool_calls = false;
+        }
+        inputs.stream = stream;
+        // TODO: support mixing schema w/ tools beyond generic format.
+        inputs.json_schema = json_value(llama_params, "json_schema", json());
+        auto chat_params = common_chat_params_init(tmpl, inputs);
+
+        llama_params["chat_format"] = static_cast<int>(chat_params.format);
+        llama_params["prompt"] = chat_params.prompt;
+        llama_params["grammar"] = chat_params.grammar;
+        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : chat_params.grammar_triggers) {
+            grammar_triggers.push_back({
+                {"word", trigger.word},
+                {"at_start", trigger.at_start},
             });
         }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json{
-            {"content", tok_str},
-            {"probs",   probs_for_token},
+        llama_params["grammar_triggers"] = grammar_triggers;
+        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+        for (const auto & stop : chat_params.additional_stops) {
+            llama_params["stop"].push_back(stop);
+        }
+    } else {
+        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    }
+
+    // Handle "n" field
+    int n_choices = json_value(body, "n", 1);
+    if (n_choices != 1) {
+        throw std::runtime_error("Only one completion choice is allowed");
+    }
+
+    // Handle "logprobs" field
+    // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
+    if (json_value(body, "logprobs", false)) {
+        llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
+    } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
+        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
+    }
+
+    // Copy remaining properties to llama_params
+    // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
+    // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
+    for (const auto & item : body.items()) {
+        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
+        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
+            llama_params[item.key()] = item.value();
+        }
+    }
+
+    return llama_params;
+}
+
+static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
+    json data = json::array();
+    int32_t n_tokens = 0;
+    int i = 0;
+    for (const auto & elem : embeddings) {
+        json embedding_obj;
+
+        if (use_base64) {
+            const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
+            const char* data_ptr = reinterpret_cast<const char*>(vec.data());
+            size_t data_size = vec.size() * sizeof(float);
+            embedding_obj = {
+                {"embedding", base64::encode(data_ptr, data_size)},
+                {"index", i++},
+                {"object", "embedding"},
+                {"encoding_format", "base64"}
+            };
+        } else {
+            embedding_obj = {
+                {"embedding", json_value(elem, "embedding", json::array())},
+                {"index", i++},
+                {"object", "embedding"}
+            };
+        }
+        data.push_back(embedding_obj);
+
+        n_tokens += json_value(elem, "tokens_evaluated", 0);
+    }
+
+    json res = json {
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
+        }},
+        {"data", data}
+    };
+
+    return res;
+}
+
+static json format_response_rerank(const json & request, const json & ranks) {
+    json data = json::array();
+    int32_t n_tokens = 0;
+    int i = 0;
+    for (const auto & rank : ranks) {
+        data.push_back(json{
+            {"index",    i++},
+            {"relevance_score", json_value(rank, "score", 0.0)},
+        });
+
+        n_tokens += json_value(rank, "tokens_evaluated", 0);
+    }
+
+    json res = json {
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
+        }},
+        {"results", data}
+    };
+
+    return res;
+}
+
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
+    return json {
+        {"tokens", tokens}
+    };
+}
+
+static json format_detokenized_response(const std::string & content) {
+    return json {
+        {"content", content}
+    };
+}
+
+static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
+    json data = json::array();
+    for (const auto & lb : logit_bias) {
+        data.push_back(json{
+            {"bias", lb.bias},
+            {"token", lb.token},
         });
     }
-    return out;
+    return data;
+}
+
+static std::string safe_json_to_str(const json & data) {
+    return data.dump(-1, ' ', false, json::error_handler_t::replace);
+}
+
+static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+    std::vector<llama_token_data> cur;
+    const auto * logits = llama_get_logits_ith(ctx, idx);
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    cur.resize(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+    }
+
+    // sort tokens by logits
+    std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    });
+
+    // apply softmax
+    float max_l = cur[0].logit;
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < cur.size(); ++i) {
+        float p = expf(cur[i].logit - max_l);
+        cur[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < cur.size(); ++i) {
+        cur[i].p /= cum_sum;
+    }
+
+    return cur;
+}
+
+static bool are_lora_equal(
+        const std::vector<common_adapter_lora_info> & l1,
+        const std::vector<common_adapter_lora_info> & l2) {
+    if (l1.size() != l2.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < l1.size(); ++i) {
+        // we don't check lora.path to reduce the time complexity
+        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// parse lora config from JSON request, returned a copy of lora_base with updated scale
+static std::vector<common_adapter_lora_info> parse_lora_request(
+        const std::vector<common_adapter_lora_info> & lora_base,
+        const json & data) {
+    std::vector<common_adapter_lora_info> lora(lora_base);
+    int max_idx = lora.size();
+
+    // clear existing value
+    for (auto & entry : lora) {
+        entry.scale = 0.0f;
+    }
+
+    // set value
+    for (const auto & entry : data) {
+        int id      = json_value(entry, "id", -1);
+        float scale = json_value(entry, "scale", 0.0f);
+        if (0 <= id && id < max_idx) {
+            lora[id].scale = scale;
+        } else {
+            throw std::runtime_error("invalid adapter id");
+        }
+    }
+
+    return lora;
 }
diff --git a/examples/server/webui/.gitignore b/examples/server/webui/.gitignore
new file mode 100644
index 000000000..a547bf36d
--- /dev/null
+++ b/examples/server/webui/.gitignore
@@ -0,0 +1,24 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
diff --git a/examples/server/webui/.prettierignore b/examples/server/webui/.prettierignore
new file mode 100644
index 000000000..c0cb165b3
--- /dev/null
+++ b/examples/server/webui/.prettierignore
@@ -0,0 +1,10 @@
+**/.vscode
+**/.github
+**/.git
+**/.svn
+**/.hg
+**/node_modules
+**/dist
+**/build
+
+*.config.js
diff --git a/examples/server/webui/eslint.config.js b/examples/server/webui/eslint.config.js
new file mode 100644
index 000000000..7c0d39b89
--- /dev/null
+++ b/examples/server/webui/eslint.config.js
@@ -0,0 +1,26 @@
+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+
+export default tseslint.config(
+  { ignores: ['dist'] },
+  {
+    extends: [js.configs.recommended, ...tseslint.configs.recommended],
+    files: ['**/*.{ts,tsx}'],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+    },
+    plugins: {
+      'react-hooks': reactHooks,
+      'react-refresh': reactRefresh,
+    },
+    rules: {
+      ...reactHooks.configs.recommended.rules,
+      'react-refresh/only-export-components': 'off',
+      '@typescript-eslint/no-unused-vars': 'off',
+    },
+  },
+)
diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html
new file mode 100644
index 000000000..471f46b3a
--- /dev/null
+++ b/examples/server/webui/index.html
@@ -0,0 +1,16 @@
+<!doctype html>
+<html>
+  <head>
+    <meta charset="UTF-8" />
+    <meta
+      name="viewport"
+      content="width=device-width, initial-scale=1, maximum-scale=1"
+    />
+    <meta name="color-scheme" content="light dark" />
+    <title>🦙 llama.cpp - chat</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json
new file mode 100644
index 000000000..c6c5de3c0
--- /dev/null
+++ b/examples/server/webui/package-lock.json
@@ -0,0 +1,6608 @@
+{
+  "name": "webui",
+  "version": "0.0.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "webui",
+      "version": "0.0.0",
+      "dependencies": {
+        "@heroicons/react": "^2.2.0",
+        "@sec-ant/readable-stream": "^0.6.0",
+        "@vscode/markdown-it-katex": "^1.1.1",
+        "autoprefixer": "^10.4.20",
+        "daisyui": "^4.12.14",
+        "highlight.js": "^11.10.0",
+        "katex": "^0.16.15",
+        "postcss": "^8.4.49",
+        "react": "^18.3.1",
+        "react-dom": "^18.3.1",
+        "react-markdown": "^9.0.3",
+        "react-router": "^7.1.5",
+        "rehype-highlight": "^7.0.2",
+        "rehype-katex": "^7.0.1",
+        "remark-breaks": "^4.0.0",
+        "remark-gfm": "^4.0.0",
+        "remark-math": "^6.0.0",
+        "tailwindcss": "^3.4.15",
+        "textlinestream": "^1.1.1",
+        "vite-plugin-singlefile": "^2.0.3"
+      },
+      "devDependencies": {
+        "@eslint/js": "^9.17.0",
+        "@types/markdown-it": "^14.1.2",
+        "@types/node": "^22.13.1",
+        "@types/react": "^18.3.18",
+        "@types/react-dom": "^18.3.5",
+        "@vitejs/plugin-react": "^4.3.4",
+        "eslint": "^9.17.0",
+        "eslint-plugin-react-hooks": "^5.0.0",
+        "eslint-plugin-react-refresh": "^0.4.16",
+        "globals": "^15.14.0",
+        "prettier": "^3.4.2",
+        "sass-embedded": "^1.83.4",
+        "typescript": "~5.6.2",
+        "typescript-eslint": "^8.18.2",
+        "vite": "^6.0.5"
+      }
+    },
+    "node_modules/@alloc/quick-lru": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz",
+      "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/@ampproject/remapping": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz",
+      "integrity": "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@jridgewell/gen-mapping": "^0.3.5",
+        "@jridgewell/trace-mapping": "^0.3.24"
+      },
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@babel/code-frame": {
+      "version": "7.26.2",
+      "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.26.2.tgz",
+      "integrity": "sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-validator-identifier": "^7.25.9",
+        "js-tokens": "^4.0.0",
+        "picocolors": "^1.0.0"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/compat-data": {
+      "version": "7.26.5",
+      "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.26.5.tgz",
+      "integrity": "sha512-XvcZi1KWf88RVbF9wn8MN6tYFloU5qX8KjuF3E1PVBmJ9eypXfs4GRiJwLuTZL0iSnJUKn1BFPa5BPZZJyFzPg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/core": {
+      "version": "7.26.7",
+      "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.26.7.tgz",
+      "integrity": "sha512-SRijHmF0PSPgLIBYlWnG0hyeJLwXE2CgpsXaMOrtt2yp9/86ALw6oUlj9KYuZ0JN07T4eBMVIW4li/9S1j2BGA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@ampproject/remapping": "^2.2.0",
+        "@babel/code-frame": "^7.26.2",
+        "@babel/generator": "^7.26.5",
+        "@babel/helper-compilation-targets": "^7.26.5",
+        "@babel/helper-module-transforms": "^7.26.0",
+        "@babel/helpers": "^7.26.7",
+        "@babel/parser": "^7.26.7",
+        "@babel/template": "^7.25.9",
+        "@babel/traverse": "^7.26.7",
+        "@babel/types": "^7.26.7",
+        "convert-source-map": "^2.0.0",
+        "debug": "^4.1.0",
+        "gensync": "^1.0.0-beta.2",
+        "json5": "^2.2.3",
+        "semver": "^6.3.1"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/babel"
+      }
+    },
+    "node_modules/@babel/generator": {
+      "version": "7.26.5",
+      "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.26.5.tgz",
+      "integrity": "sha512-2caSP6fN9I7HOe6nqhtft7V4g7/V/gfDsC3Ag4W7kEzzvRGKqiv0pu0HogPiZ3KaVSoNDhUws6IJjDjpfmYIXw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/parser": "^7.26.5",
+        "@babel/types": "^7.26.5",
+        "@jridgewell/gen-mapping": "^0.3.5",
+        "@jridgewell/trace-mapping": "^0.3.25",
+        "jsesc": "^3.0.2"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-compilation-targets": {
+      "version": "7.26.5",
+      "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.26.5.tgz",
+      "integrity": "sha512-IXuyn5EkouFJscIDuFF5EsiSolseme1s0CZB+QxVugqJLYmKdxI1VfIBOst0SUu4rnk2Z7kqTwmoO1lp3HIfnA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/compat-data": "^7.26.5",
+        "@babel/helper-validator-option": "^7.25.9",
+        "browserslist": "^4.24.0",
+        "lru-cache": "^5.1.1",
+        "semver": "^6.3.1"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-module-imports": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.25.9.tgz",
+      "integrity": "sha512-tnUA4RsrmflIM6W6RFTLFSXITtl0wKjgpnLgXyowocVPrbYrLUXSBXDgTs8BlbmIzIdlBySRQjINYs2BAkiLtw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/traverse": "^7.25.9",
+        "@babel/types": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-module-transforms": {
+      "version": "7.26.0",
+      "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.26.0.tgz",
+      "integrity": "sha512-xO+xu6B5K2czEnQye6BHA7DolFFmS3LB7stHZFaOLb1pAwO1HWLS8fXA+eh0A2yIvltPVmx3eNNDBJA2SLHXFw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-module-imports": "^7.25.9",
+        "@babel/helper-validator-identifier": "^7.25.9",
+        "@babel/traverse": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      },
+      "peerDependencies": {
+        "@babel/core": "^7.0.0"
+      }
+    },
+    "node_modules/@babel/helper-plugin-utils": {
+      "version": "7.26.5",
+      "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.26.5.tgz",
+      "integrity": "sha512-RS+jZcRdZdRFzMyr+wcsaqOmld1/EqTghfaBGQQd/WnRdzdlvSZ//kF7U8VQTxf1ynZ4cjUcYgjVGx13ewNPMg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-string-parser": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz",
+      "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-validator-identifier": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz",
+      "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-validator-option": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.25.9.tgz",
+      "integrity": "sha512-e/zv1co8pp55dNdEcCynfj9X7nyUKUXoUEwfXqaZt0omVOmDe9oOTdKStH4GmAw6zxMFs50ZayuMfHDKlO7Tfw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helpers": {
+      "version": "7.26.7",
+      "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.7.tgz",
+      "integrity": "sha512-8NHiL98vsi0mbPQmYAGWwfcFaOy4j2HY49fXJCfuDcdE7fMIsH9a7GdaeXpIBsbT7307WU8KCMp5pUVDNL4f9A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/template": "^7.25.9",
+        "@babel/types": "^7.26.7"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/parser": {
+      "version": "7.26.7",
+      "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.7.tgz",
+      "integrity": "sha512-kEvgGGgEjRUutvdVvZhbn/BxVt+5VSpwXz1j3WYXQbXDo8KzFOPNG2GQbdAiNq8g6wn1yKk7C/qrke03a84V+w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/types": "^7.26.7"
+      },
+      "bin": {
+        "parser": "bin/babel-parser.js"
+      },
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@babel/plugin-transform-react-jsx-self": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.25.9.tgz",
+      "integrity": "sha512-y8quW6p0WHkEhmErnfe58r7x0A70uKphQm8Sp8cV7tjNQwK56sNVK0M73LK3WuYmsuyrftut4xAkjjgU0twaMg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-plugin-utils": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      },
+      "peerDependencies": {
+        "@babel/core": "^7.0.0-0"
+      }
+    },
+    "node_modules/@babel/plugin-transform-react-jsx-source": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.25.9.tgz",
+      "integrity": "sha512-+iqjT8xmXhhYv4/uiYd8FNQsraMFZIfxVSqxxVSZP0WbbSAWvBXAul0m/zu+7Vv4O/3WtApy9pmaTMiumEZgfg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-plugin-utils": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      },
+      "peerDependencies": {
+        "@babel/core": "^7.0.0-0"
+      }
+    },
+    "node_modules/@babel/template": {
+      "version": "7.25.9",
+      "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz",
+      "integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/code-frame": "^7.25.9",
+        "@babel/parser": "^7.25.9",
+        "@babel/types": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/traverse": {
+      "version": "7.26.7",
+      "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.26.7.tgz",
+      "integrity": "sha512-1x1sgeyRLC3r5fQOM0/xtQKsYjyxmFjaOrLJNtZ81inNjyJHGIolTULPiSc/2qe1/qfpFLisLQYFnnZl7QoedA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/code-frame": "^7.26.2",
+        "@babel/generator": "^7.26.5",
+        "@babel/parser": "^7.26.7",
+        "@babel/template": "^7.25.9",
+        "@babel/types": "^7.26.7",
+        "debug": "^4.3.1",
+        "globals": "^11.1.0"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/traverse/node_modules/globals": {
+      "version": "11.12.0",
+      "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz",
+      "integrity": "sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/@babel/types": {
+      "version": "7.26.7",
+      "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.7.tgz",
+      "integrity": "sha512-t8kDRGrKXyp6+tjUh7hw2RLyclsW4TRoRvRHtSyAX9Bb5ldlFh+90YAYY6awRXrlB4G5G2izNeGySpATlFzmOg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-string-parser": "^7.25.9",
+        "@babel/helper-validator-identifier": "^7.25.9"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@bufbuild/protobuf": {
+      "version": "2.2.3",
+      "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz",
+      "integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==",
+      "devOptional": true,
+      "license": "(Apache-2.0 AND BSD-3-Clause)"
+    },
+    "node_modules/@esbuild/aix-ppc64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.24.2.tgz",
+      "integrity": "sha512-thpVCb/rhxE/BnMLQ7GReQLLN8q9qbHmI55F4489/ByVg2aQaQ6kbcLb6FHkocZzQhxc4gx0sCk0tJkKBFzDhA==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "aix"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/android-arm": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.24.2.tgz",
+      "integrity": "sha512-tmwl4hJkCfNHwFB3nBa8z1Uy3ypZpxqxfTQOcHX+xRByyYgunVbZ9MzUUfb0RxaHIMnbHagwAxuTL+tnNM+1/Q==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/android-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.24.2.tgz",
+      "integrity": "sha512-cNLgeqCqV8WxfcTIOeL4OAtSmL8JjcN6m09XIgro1Wi7cF4t/THaWEa7eL5CMoMBdjoHOTh/vwTO/o2TRXIyzg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/android-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.24.2.tgz",
+      "integrity": "sha512-B6Q0YQDqMx9D7rvIcsXfmJfvUYLoP722bgfBlO5cGvNVb5V/+Y7nhBE3mHV9OpxBf4eAS2S68KZztiPaWq4XYw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/darwin-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.24.2.tgz",
+      "integrity": "sha512-kj3AnYWc+CekmZnS5IPu9D+HWtUI49hbnyqk0FLEJDbzCIQt7hg7ucF1SQAilhtYpIujfaHr6O0UHlzzSPdOeA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/darwin-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.24.2.tgz",
+      "integrity": "sha512-WeSrmwwHaPkNR5H3yYfowhZcbriGqooyu3zI/3GGpF8AyUdsrrP0X6KumITGA9WOyiJavnGZUwPGvxvwfWPHIA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/freebsd-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.24.2.tgz",
+      "integrity": "sha512-UN8HXjtJ0k/Mj6a9+5u6+2eZ2ERD7Edt1Q9IZiB5UZAIdPnVKDoG7mdTVGhHJIeEml60JteamR3qhsr1r8gXvg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/freebsd-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.24.2.tgz",
+      "integrity": "sha512-TvW7wE/89PYW+IevEJXZ5sF6gJRDY/14hyIGFXdIucxCsbRmLUcjseQu1SyTko+2idmCw94TgyaEZi9HUSOe3Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-arm": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.24.2.tgz",
+      "integrity": "sha512-n0WRM/gWIdU29J57hJyUdIsk0WarGd6To0s+Y+LwvlC55wt+GT/OgkwoXCXvIue1i1sSNWblHEig00GBWiJgfA==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.24.2.tgz",
+      "integrity": "sha512-7HnAD6074BW43YvvUmE/35Id9/NB7BeX5EoNkK9obndmZBUk8xmJJeU7DwmUeN7tkysslb2eSl6CTrYz6oEMQg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-ia32": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.24.2.tgz",
+      "integrity": "sha512-sfv0tGPQhcZOgTKO3oBE9xpHuUqguHvSo4jl+wjnKwFpapx+vUDcawbwPNuBIAYdRAvIDBfZVvXprIj3HA+Ugw==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-loong64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.24.2.tgz",
+      "integrity": "sha512-CN9AZr8kEndGooS35ntToZLTQLHEjtVB5n7dl8ZcTZMonJ7CCfStrYhrzF97eAecqVbVJ7APOEe18RPI4KLhwQ==",
+      "cpu": [
+        "loong64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-mips64el": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.24.2.tgz",
+      "integrity": "sha512-iMkk7qr/wl3exJATwkISxI7kTcmHKE+BlymIAbHO8xanq/TjHaaVThFF6ipWzPHryoFsesNQJPE/3wFJw4+huw==",
+      "cpu": [
+        "mips64el"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-ppc64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.24.2.tgz",
+      "integrity": "sha512-shsVrgCZ57Vr2L8mm39kO5PPIb+843FStGt7sGGoqiiWYconSxwTiuswC1VJZLCjNiMLAMh34jg4VSEQb+iEbw==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-riscv64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.24.2.tgz",
+      "integrity": "sha512-4eSFWnU9Hhd68fW16GD0TINewo1L6dRrB+oLNNbYyMUAeOD2yCK5KXGK1GH4qD/kT+bTEXjsyTCiJGHPZ3eM9Q==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-s390x": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.24.2.tgz",
+      "integrity": "sha512-S0Bh0A53b0YHL2XEXC20bHLuGMOhFDO6GN4b3YjRLK//Ep3ql3erpNcPlEFed93hsQAjAQDNsvcK+hV90FubSw==",
+      "cpu": [
+        "s390x"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.24.2.tgz",
+      "integrity": "sha512-8Qi4nQcCTbLnK9WoMjdC9NiTG6/E38RNICU6sUNqK0QFxCYgoARqVqxdFmWkdonVsvGqWhmm7MO0jyTqLqwj0Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/netbsd-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.24.2.tgz",
+      "integrity": "sha512-wuLK/VztRRpMt9zyHSazyCVdCXlpHkKm34WUyinD2lzK07FAHTq0KQvZZlXikNWkDGoT6x3TD51jKQ7gMVpopw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "netbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/netbsd-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.24.2.tgz",
+      "integrity": "sha512-VefFaQUc4FMmJuAxmIHgUmfNiLXY438XrL4GDNV1Y1H/RW3qow68xTwjZKfj/+Plp9NANmzbH5R40Meudu8mmw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "netbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/openbsd-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.24.2.tgz",
+      "integrity": "sha512-YQbi46SBct6iKnszhSvdluqDmxCJA+Pu280Av9WICNwQmMxV7nLRHZfjQzwbPs3jeWnuAhE9Jy0NrnJ12Oz+0A==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/openbsd-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.24.2.tgz",
+      "integrity": "sha512-+iDS6zpNM6EnJyWv0bMGLWSWeXGN/HTaF/LXHXHwejGsVi+ooqDfMCCTerNFxEkM3wYVcExkeGXNqshc9iMaOA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/sunos-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.24.2.tgz",
+      "integrity": "sha512-hTdsW27jcktEvpwNHJU4ZwWFGkz2zRJUz8pvddmXPtXDzVKTTINmlmga3ZzwcuMpUvLw7JkLy9QLKyGpD2Yxig==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "sunos"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/win32-arm64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.24.2.tgz",
+      "integrity": "sha512-LihEQ2BBKVFLOC9ZItT9iFprsE9tqjDjnbulhHoFxYQtQfai7qfluVODIYxt1PgdoyQkz23+01rzwNwYfutxUQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/win32-ia32": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.24.2.tgz",
+      "integrity": "sha512-q+iGUwfs8tncmFC9pcnD5IvRHAzmbwQ3GPS5/ceCyHdjXubwQWI12MKWSNSMYLJMq23/IUCvJMS76PDqXe1fxA==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/win32-x64": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.24.2.tgz",
+      "integrity": "sha512-7VTgWzgMGvup6aSqDPLiW5zHaxYJGTO4OokMjIlrCtf+VpEL+cXKtCvg723iguPYI5oaUNdS+/V7OU2gvXVWEg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@eslint-community/eslint-utils": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.1.tgz",
+      "integrity": "sha512-s3O3waFUrMV8P/XaF/+ZTp1X9XBZW1a4B97ZnjQF2KYWaFD2A8KyFBsrsfSjEmjn3RGWAIuvlneuZm3CUK3jbA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "eslint-visitor-keys": "^3.4.3"
+      },
+      "engines": {
+        "node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/eslint"
+      },
+      "peerDependencies": {
+        "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0"
+      }
+    },
+    "node_modules/@eslint-community/eslint-utils/node_modules/eslint-visitor-keys": {
+      "version": "3.4.3",
+      "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz",
+      "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": "^12.22.0 || ^14.17.0 || >=16.0.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/eslint"
+      }
+    },
+    "node_modules/@eslint-community/regexpp": {
+      "version": "4.12.1",
+      "resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.12.1.tgz",
+      "integrity": "sha512-CCZCDJuduB9OUkFkY2IgppNZMi2lBQgD2qzwXkEia16cge2pijY/aXi96CJMquDMn3nJdlPV1A5KrJEXwfLNzQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "^12.0.0 || ^14.0.0 || >=16.0.0"
+      }
+    },
+    "node_modules/@eslint/config-array": {
+      "version": "0.19.2",
+      "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.19.2.tgz",
+      "integrity": "sha512-GNKqxfHG2ySmJOBSHg7LxeUx4xpuCoFjacmlCoYWEbaPXLwvfIjixRI12xCQZeULksQb23uiA8F40w5TojpV7w==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@eslint/object-schema": "^2.1.6",
+        "debug": "^4.3.1",
+        "minimatch": "^3.1.2"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      }
+    },
+    "node_modules/@eslint/core": {
+      "version": "0.10.0",
+      "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.10.0.tgz",
+      "integrity": "sha512-gFHJ+xBOo4G3WRlR1e/3G8A6/KZAH6zcE/hkLRCZTi/B9avAG365QhFA8uOGzTMqgTghpn7/fSnscW++dpMSAw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@types/json-schema": "^7.0.15"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      }
+    },
+    "node_modules/@eslint/eslintrc": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.2.0.tgz",
+      "integrity": "sha512-grOjVNN8P3hjJn/eIETF1wwd12DdnwFDoyceUJLYYdkpbwq3nLi+4fqrTAONx7XDALqlL220wC/RHSC/QTI/0w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ajv": "^6.12.4",
+        "debug": "^4.3.2",
+        "espree": "^10.0.1",
+        "globals": "^14.0.0",
+        "ignore": "^5.2.0",
+        "import-fresh": "^3.2.1",
+        "js-yaml": "^4.1.0",
+        "minimatch": "^3.1.2",
+        "strip-json-comments": "^3.1.1"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/eslint"
+      }
+    },
+    "node_modules/@eslint/eslintrc/node_modules/globals": {
+      "version": "14.0.0",
+      "resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz",
+      "integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/@eslint/js": {
+      "version": "9.19.0",
+      "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.19.0.tgz",
+      "integrity": "sha512-rbq9/g38qjfqFLOVPvwjIvFFdNziEC5S65jmjPw5r6A//QH+W91akh9irMwjDN8zKUTak6W9EsAv4m/7Wnw0UQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      }
+    },
+    "node_modules/@eslint/object-schema": {
+      "version": "2.1.6",
+      "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.6.tgz",
+      "integrity": "sha512-RBMg5FRL0I0gs51M/guSAj5/e14VQ4tpZnQNWwuDT66P14I43ItmPfIZRhO9fUVIPOAQXU47atlywZ/czoqFPA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      }
+    },
+    "node_modules/@eslint/plugin-kit": {
+      "version": "0.2.5",
+      "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.2.5.tgz",
+      "integrity": "sha512-lB05FkqEdUg2AA0xEbUz0SnkXT1LcCTa438W4IWTUh4hdOnVbQyOJ81OrDXsJk/LSiJHubgGEFoR5EHq1NsH1A==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@eslint/core": "^0.10.0",
+        "levn": "^0.4.1"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      }
+    },
+    "node_modules/@heroicons/react": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/@heroicons/react/-/react-2.2.0.tgz",
+      "integrity": "sha512-LMcepvRaS9LYHJGsF0zzmgKCUim/X3N/DQKc4jepAXJ7l8QxJ1PmxJzqplF2Z3FE4PqBAIGyJAQ/w4B5dsqbtQ==",
+      "license": "MIT",
+      "peerDependencies": {
+        "react": ">= 16 || ^19.0.0-rc"
+      }
+    },
+    "node_modules/@humanfs/core": {
+      "version": "0.19.1",
+      "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
+      "integrity": "sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=18.18.0"
+      }
+    },
+    "node_modules/@humanfs/node": {
+      "version": "0.16.6",
+      "resolved": "https://registry.npmjs.org/@humanfs/node/-/node-0.16.6.tgz",
+      "integrity": "sha512-YuI2ZHQL78Q5HbhDiBA1X4LmYdXCKCMQIfw0pw7piHJwyREFebJUvrQN4cMssyES6x+vfUbx1CIpaQUKYdQZOw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@humanfs/core": "^0.19.1",
+        "@humanwhocodes/retry": "^0.3.0"
+      },
+      "engines": {
+        "node": ">=18.18.0"
+      }
+    },
+    "node_modules/@humanfs/node/node_modules/@humanwhocodes/retry": {
+      "version": "0.3.1",
+      "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.3.1.tgz",
+      "integrity": "sha512-JBxkERygn7Bv/GbN5Rv8Ul6LVknS+5Bp6RgDC/O8gEBU/yeH5Ui5C/OlWrTb6qct7LjjfT6Re2NxB0ln0yYybA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=18.18"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/nzakas"
+      }
+    },
+    "node_modules/@humanwhocodes/module-importer": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz",
+      "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=12.22"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/nzakas"
+      }
+    },
+    "node_modules/@humanwhocodes/retry": {
+      "version": "0.4.1",
+      "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.4.1.tgz",
+      "integrity": "sha512-c7hNEllBlenFTHBky65mhq8WD2kbN9Q6gk0bTk8lSBvc554jpXSkST1iePudpt7+A/AQvuHs9EMqjHDXMY1lrA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=18.18"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/nzakas"
+      }
+    },
+    "node_modules/@isaacs/cliui": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
+      "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==",
+      "license": "ISC",
+      "dependencies": {
+        "string-width": "^5.1.2",
+        "string-width-cjs": "npm:string-width@^4.2.0",
+        "strip-ansi": "^7.0.1",
+        "strip-ansi-cjs": "npm:strip-ansi@^6.0.1",
+        "wrap-ansi": "^8.1.0",
+        "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@jridgewell/gen-mapping": {
+      "version": "0.3.8",
+      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.8.tgz",
+      "integrity": "sha512-imAbBGkb+ebQyxKgzv5Hu2nmROxoDOXHh80evxdoXNOrvAnVx7zimzc1Oo5h9RlfV4vPXaE2iM5pOFbvOCClWA==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/set-array": "^1.2.1",
+        "@jridgewell/sourcemap-codec": "^1.4.10",
+        "@jridgewell/trace-mapping": "^0.3.24"
+      },
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@jridgewell/resolve-uri": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
+      "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@jridgewell/set-array": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz",
+      "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.0.0"
+      }
+    },
+    "node_modules/@jridgewell/sourcemap-codec": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
+      "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
+      "license": "MIT"
+    },
+    "node_modules/@jridgewell/trace-mapping": {
+      "version": "0.3.25",
+      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz",
+      "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/resolve-uri": "^3.1.0",
+        "@jridgewell/sourcemap-codec": "^1.4.14"
+      }
+    },
+    "node_modules/@nodelib/fs.scandir": {
+      "version": "2.1.5",
+      "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
+      "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==",
+      "license": "MIT",
+      "dependencies": {
+        "@nodelib/fs.stat": "2.0.5",
+        "run-parallel": "^1.1.9"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/@nodelib/fs.stat": {
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz",
+      "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/@nodelib/fs.walk": {
+      "version": "1.2.8",
+      "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz",
+      "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==",
+      "license": "MIT",
+      "dependencies": {
+        "@nodelib/fs.scandir": "2.1.5",
+        "fastq": "^1.6.0"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/@pkgjs/parseargs": {
+      "version": "0.11.0",
+      "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
+      "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==",
+      "license": "MIT",
+      "optional": true,
+      "engines": {
+        "node": ">=14"
+      }
+    },
+    "node_modules/@rollup/rollup-android-arm-eabi": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.34.2.tgz",
+      "integrity": "sha512-6Fyg9yQbwJR+ykVdT9sid1oc2ewejS6h4wzQltmJfSW53N60G/ah9pngXGANdy9/aaE/TcUFpWosdm7JXS1WTQ==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ]
+    },
+    "node_modules/@rollup/rollup-android-arm64": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.34.2.tgz",
+      "integrity": "sha512-K5GfWe+vtQ3kyEbihrimM38UgX57UqHp+oME7X/EX9Im6suwZfa7Hsr8AtzbJvukTpwMGs+4s29YMSO3rwWtsw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ]
+    },
+    "node_modules/@rollup/rollup-darwin-arm64": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.34.2.tgz",
+      "integrity": "sha512-PSN58XG/V/tzqDb9kDGutUruycgylMlUE59f40ny6QIRNsTEIZsrNQTJKUN2keMMSmlzgunMFqyaGLmly39sug==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@rollup/rollup-darwin-x64": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.34.2.tgz",
+      "integrity": "sha512-gQhK788rQJm9pzmXyfBB84VHViDERhAhzGafw+E5mUpnGKuxZGkMVDa3wgDFKT6ukLC5V7QTifzsUKdNVxp5qQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@rollup/rollup-freebsd-arm64": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.34.2.tgz",
+      "integrity": "sha512-eiaHgQwGPpxLC3+zTAcdKl4VsBl3r0AiJOd1Um/ArEzAjN/dbPK1nROHrVkdnoE6p7Svvn04w3f/jEZSTVHunA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ]
+    },
+    "node_modules/@rollup/rollup-freebsd-x64": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.34.2.tgz",
+      "integrity": "sha512-lhdiwQ+jf8pewYOTG4bag0Qd68Jn1v2gO1i0mTuiD+Qkt5vNfHVK/jrT7uVvycV8ZchlzXp5HDVmhpzjC6mh0g==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm-gnueabihf": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.34.2.tgz",
+      "integrity": "sha512-lfqTpWjSvbgQP1vqGTXdv+/kxIznKXZlI109WkIFPbud41bjigjNmOAAKoazmRGx+k9e3rtIdbq2pQZPV1pMig==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm-musleabihf": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.34.2.tgz",
+      "integrity": "sha512-RGjqULqIurqqv+NJTyuPgdZhka8ImMLB32YwUle2BPTDqDoXNgwFjdjQC59FbSk08z0IqlRJjrJ0AvDQ5W5lpw==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm64-gnu": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.34.2.tgz",
+      "integrity": "sha512-ZvkPiheyXtXlFqHpsdgscx+tZ7hoR59vOettvArinEspq5fxSDSgfF+L5wqqJ9R4t+n53nyn0sKxeXlik7AY9Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-arm64-musl": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.34.2.tgz",
+      "integrity": "sha512-UlFk+E46TZEoxD9ufLKDBzfSG7Ki03fo6hsNRRRHF+KuvNZ5vd1RRVQm8YZlGsjcJG8R252XFK0xNPay+4WV7w==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-loongarch64-gnu": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loongarch64-gnu/-/rollup-linux-loongarch64-gnu-4.34.2.tgz",
+      "integrity": "sha512-hJhfsD9ykx59jZuuoQgYT1GEcNNi3RCoEmbo5OGfG8RlHOiVS7iVNev9rhLKh7UBYq409f4uEw0cclTXx8nh8Q==",
+      "cpu": [
+        "loong64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-powerpc64le-gnu": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.34.2.tgz",
+      "integrity": "sha512-g/O5IpgtrQqPegvqopvmdCF9vneLE7eqYfdPWW8yjPS8f63DNam3U4ARL1PNNB64XHZDHKpvO2Giftf43puB8Q==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-riscv64-gnu": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.34.2.tgz",
+      "integrity": "sha512-bSQijDC96M6PuooOuXHpvXUYiIwsnDmqGU8+br2U7iPoykNi9JtMUpN7K6xml29e0evK0/g0D1qbAUzWZFHY5Q==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-s390x-gnu": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.34.2.tgz",
+      "integrity": "sha512-49TtdeVAsdRuiUHXPrFVucaP4SivazetGUVH8CIxVsNsaPHV4PFkpLmH9LeqU/R4Nbgky9lzX5Xe1NrzLyraVA==",
+      "cpu": [
+        "s390x"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-x64-gnu": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.34.2.tgz",
+      "integrity": "sha512-j+jFdfOycLIQ7FWKka9Zd3qvsIyugg5LeZuHF6kFlXo6MSOc6R1w37YUVy8VpAKd81LMWGi5g9J25P09M0SSIw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-linux-x64-musl": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.34.2.tgz",
+      "integrity": "sha512-aDPHyM/D2SpXfSNCVWCxyHmOqN9qb7SWkY1+vaXqMNMXslZYnwh9V/UCudl6psyG0v6Ukj7pXanIpfZwCOEMUg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@rollup/rollup-win32-arm64-msvc": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.34.2.tgz",
+      "integrity": "sha512-LQRkCyUBnAo7r8dbEdtNU08EKLCJMgAk2oP5H3R7BnUlKLqgR3dUjrLBVirmc1RK6U6qhtDw29Dimeer8d5hzQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
+    "node_modules/@rollup/rollup-win32-ia32-msvc": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.34.2.tgz",
+      "integrity": "sha512-wt8OhpQUi6JuPFkm1wbVi1BByeag87LDFzeKSXzIdGcX4bMLqORTtKxLoCbV57BHYNSUSOKlSL4BYYUghainYA==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
+    "node_modules/@rollup/rollup-win32-x64-msvc": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.34.2.tgz",
+      "integrity": "sha512-rUrqINax0TvrPBXrFKg0YbQx18NpPN3NNrgmaao9xRNbTwek7lOXObhx8tQy8gelmQ/gLaGy1WptpU2eKJZImg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
+    "node_modules/@sec-ant/readable-stream": {
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz",
+      "integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==",
+      "license": "MIT"
+    },
+    "node_modules/@types/babel__core": {
+      "version": "7.20.5",
+      "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
+      "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/parser": "^7.20.7",
+        "@babel/types": "^7.20.7",
+        "@types/babel__generator": "*",
+        "@types/babel__template": "*",
+        "@types/babel__traverse": "*"
+      }
+    },
+    "node_modules/@types/babel__generator": {
+      "version": "7.6.8",
+      "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.6.8.tgz",
+      "integrity": "sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/types": "^7.0.0"
+      }
+    },
+    "node_modules/@types/babel__template": {
+      "version": "7.4.4",
+      "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz",
+      "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/parser": "^7.1.0",
+        "@babel/types": "^7.0.0"
+      }
+    },
+    "node_modules/@types/babel__traverse": {
+      "version": "7.20.6",
+      "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.20.6.tgz",
+      "integrity": "sha512-r1bzfrm0tomOI8g1SzvCaQHo6Lcv6zu0EA+W2kHrt8dyrHQxGzBBL4kdkzIS+jBMV+EYcMAEAqXqYaLJq5rOZg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/types": "^7.20.7"
+      }
+    },
+    "node_modules/@types/cookie": {
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/@types/cookie/-/cookie-0.6.0.tgz",
+      "integrity": "sha512-4Kh9a6B2bQciAhf7FSuMRRkUWecJgJu9nPnx3yzpsfXX/c50REIqpHY4C82bXP90qrLtXtkDxTZosYO3UpOwlA==",
+      "license": "MIT"
+    },
+    "node_modules/@types/debug": {
+      "version": "4.1.12",
+      "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz",
+      "integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/ms": "*"
+      }
+    },
+    "node_modules/@types/estree": {
+      "version": "1.0.6",
+      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz",
+      "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==",
+      "license": "MIT"
+    },
+    "node_modules/@types/estree-jsx": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/@types/estree-jsx/-/estree-jsx-1.0.5.tgz",
+      "integrity": "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree": "*"
+      }
+    },
+    "node_modules/@types/hast": {
+      "version": "3.0.4",
+      "resolved": "https://registry.npmjs.org/@types/hast/-/hast-3.0.4.tgz",
+      "integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "*"
+      }
+    },
+    "node_modules/@types/json-schema": {
+      "version": "7.0.15",
+      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
+      "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@types/katex": {
+      "version": "0.16.7",
+      "resolved": "https://registry.npmjs.org/@types/katex/-/katex-0.16.7.tgz",
+      "integrity": "sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==",
+      "license": "MIT"
+    },
+    "node_modules/@types/linkify-it": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/@types/linkify-it/-/linkify-it-5.0.0.tgz",
+      "integrity": "sha512-sVDA58zAw4eWAffKOaQH5/5j3XeayukzDk+ewSsnv3p4yJEZHCCzMDiZM8e0OUrRvmpGZ85jf4yDHkHsgBNr9Q==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@types/markdown-it": {
+      "version": "14.1.2",
+      "resolved": "https://registry.npmjs.org/@types/markdown-it/-/markdown-it-14.1.2.tgz",
+      "integrity": "sha512-promo4eFwuiW+TfGxhi+0x3czqTYJkG8qB17ZUJiVF10Xm7NLVRSLUsfRTU/6h1e24VvRnXCx+hG7li58lkzog==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/linkify-it": "^5",
+        "@types/mdurl": "^2"
+      }
+    },
+    "node_modules/@types/mdast": {
+      "version": "4.0.4",
+      "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz",
+      "integrity": "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "*"
+      }
+    },
+    "node_modules/@types/mdurl": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/@types/mdurl/-/mdurl-2.0.0.tgz",
+      "integrity": "sha512-RGdgjQUZba5p6QEFAVx2OGb8rQDL/cPRG7GiedRzMcJ1tYnUANBncjbSB1NRGwbvjcPeikRABz2nshyPk1bhWg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@types/ms": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz",
+      "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==",
+      "license": "MIT"
+    },
+    "node_modules/@types/node": {
+      "version": "22.13.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.13.1.tgz",
+      "integrity": "sha512-jK8uzQlrvXqEU91UxiK5J7pKHyzgnI1Qnl0QDHIgVGuolJhRb9EEl28Cj9b3rGR8B2lhFCtvIm5os8lFnO/1Ew==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "undici-types": "~6.20.0"
+      }
+    },
+    "node_modules/@types/prop-types": {
+      "version": "15.7.14",
+      "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.14.tgz",
+      "integrity": "sha512-gNMvNH49DJ7OJYv+KAKn0Xp45p8PLl6zo2YnvDIbTd4J6MER2BmWN49TG7n9LvkyihINxeKW8+3bfS2yDC9dzQ==",
+      "license": "MIT"
+    },
+    "node_modules/@types/react": {
+      "version": "18.3.18",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.18.tgz",
+      "integrity": "sha512-t4yC+vtgnkYjNSKlFx1jkAhH8LgTo2N/7Qvi83kdEaUtMDiwpbLAktKDaAMlRcJ5eSxZkH74eEGt1ky31d7kfQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/prop-types": "*",
+        "csstype": "^3.0.2"
+      }
+    },
+    "node_modules/@types/react-dom": {
+      "version": "18.3.5",
+      "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.5.tgz",
+      "integrity": "sha512-P4t6saawp+b/dFrUr2cvkVsfvPguwsxtH6dNIYRllMsefqFzkZk5UIjzyDOv5g1dXIPdG4Sp1yCR4Z6RCUsG/Q==",
+      "dev": true,
+      "license": "MIT",
+      "peerDependencies": {
+        "@types/react": "^18.0.0"
+      }
+    },
+    "node_modules/@types/unist": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz",
+      "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==",
+      "license": "MIT"
+    },
+    "node_modules/@typescript-eslint/eslint-plugin": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.23.0.tgz",
+      "integrity": "sha512-vBz65tJgRrA1Q5gWlRfvoH+w943dq9K1p1yDBY2pc+a1nbBLZp7fB9+Hk8DaALUbzjqlMfgaqlVPT1REJdkt/w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@eslint-community/regexpp": "^4.10.0",
+        "@typescript-eslint/scope-manager": "8.23.0",
+        "@typescript-eslint/type-utils": "8.23.0",
+        "@typescript-eslint/utils": "8.23.0",
+        "@typescript-eslint/visitor-keys": "8.23.0",
+        "graphemer": "^1.4.0",
+        "ignore": "^5.3.1",
+        "natural-compare": "^1.4.0",
+        "ts-api-utils": "^2.0.1"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      },
+      "peerDependencies": {
+        "@typescript-eslint/parser": "^8.0.0 || ^8.0.0-alpha.0",
+        "eslint": "^8.57.0 || ^9.0.0",
+        "typescript": ">=4.8.4 <5.8.0"
+      }
+    },
+    "node_modules/@typescript-eslint/parser": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.23.0.tgz",
+      "integrity": "sha512-h2lUByouOXFAlMec2mILeELUbME5SZRN/7R9Cw2RD2lRQQY08MWMM+PmVVKKJNK1aIwqTo9t/0CvOxwPbRIE2Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@typescript-eslint/scope-manager": "8.23.0",
+        "@typescript-eslint/types": "8.23.0",
+        "@typescript-eslint/typescript-estree": "8.23.0",
+        "@typescript-eslint/visitor-keys": "8.23.0",
+        "debug": "^4.3.4"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      },
+      "peerDependencies": {
+        "eslint": "^8.57.0 || ^9.0.0",
+        "typescript": ">=4.8.4 <5.8.0"
+      }
+    },
+    "node_modules/@typescript-eslint/scope-manager": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.23.0.tgz",
+      "integrity": "sha512-OGqo7+dXHqI7Hfm+WqkZjKjsiRtFUQHPdGMXzk5mYXhJUedO7e/Y7i8AK3MyLMgZR93TX4bIzYrfyVjLC+0VSw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@typescript-eslint/types": "8.23.0",
+        "@typescript-eslint/visitor-keys": "8.23.0"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      }
+    },
+    "node_modules/@typescript-eslint/type-utils": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.23.0.tgz",
+      "integrity": "sha512-iIuLdYpQWZKbiH+RkCGc6iu+VwscP5rCtQ1lyQ7TYuKLrcZoeJVpcLiG8DliXVkUxirW/PWlmS+d6yD51L9jvA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@typescript-eslint/typescript-estree": "8.23.0",
+        "@typescript-eslint/utils": "8.23.0",
+        "debug": "^4.3.4",
+        "ts-api-utils": "^2.0.1"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      },
+      "peerDependencies": {
+        "eslint": "^8.57.0 || ^9.0.0",
+        "typescript": ">=4.8.4 <5.8.0"
+      }
+    },
+    "node_modules/@typescript-eslint/types": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.23.0.tgz",
+      "integrity": "sha512-1sK4ILJbCmZOTt9k4vkoulT6/y5CHJ1qUYxqpF1K/DBAd8+ZUL4LlSCxOssuH5m4rUaaN0uS0HlVPvd45zjduQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      }
+    },
+    "node_modules/@typescript-eslint/typescript-estree": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.23.0.tgz",
+      "integrity": "sha512-LcqzfipsB8RTvH8FX24W4UUFk1bl+0yTOf9ZA08XngFwMg4Kj8A+9hwz8Cr/ZS4KwHrmo9PJiLZkOt49vPnuvQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@typescript-eslint/types": "8.23.0",
+        "@typescript-eslint/visitor-keys": "8.23.0",
+        "debug": "^4.3.4",
+        "fast-glob": "^3.3.2",
+        "is-glob": "^4.0.3",
+        "minimatch": "^9.0.4",
+        "semver": "^7.6.0",
+        "ts-api-utils": "^2.0.1"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      },
+      "peerDependencies": {
+        "typescript": ">=4.8.4 <5.8.0"
+      }
+    },
+    "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
+      "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^1.0.0"
+      }
+    },
+    "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": {
+      "version": "9.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
+      "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "brace-expansion": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/@typescript-eslint/typescript-estree/node_modules/semver": {
+      "version": "7.7.1",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.1.tgz",
+      "integrity": "sha512-hlq8tAfn0m/61p4BVRcPzIGr6LKiMwo4VM6dGi6pt4qcRkmNzTcWq6eCEjEh+qXjkMDvPlOFFSGwQjoEa6gyMA==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver.js"
+      },
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/@typescript-eslint/utils": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.23.0.tgz",
+      "integrity": "sha512-uB/+PSo6Exu02b5ZEiVtmY6RVYO7YU5xqgzTIVZwTHvvK3HsL8tZZHFaTLFtRG3CsV4A5mhOv+NZx5BlhXPyIA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@eslint-community/eslint-utils": "^4.4.0",
+        "@typescript-eslint/scope-manager": "8.23.0",
+        "@typescript-eslint/types": "8.23.0",
+        "@typescript-eslint/typescript-estree": "8.23.0"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      },
+      "peerDependencies": {
+        "eslint": "^8.57.0 || ^9.0.0",
+        "typescript": ">=4.8.4 <5.8.0"
+      }
+    },
+    "node_modules/@typescript-eslint/visitor-keys": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.23.0.tgz",
+      "integrity": "sha512-oWWhcWDLwDfu++BGTZcmXWqpwtkwb5o7fxUIGksMQQDSdPW9prsSnfIOZMlsj4vBOSrcnjIUZMiIjODgGosFhQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@typescript-eslint/types": "8.23.0",
+        "eslint-visitor-keys": "^4.2.0"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      }
+    },
+    "node_modules/@ungap/structured-clone": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz",
+      "integrity": "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==",
+      "license": "ISC"
+    },
+    "node_modules/@vitejs/plugin-react": {
+      "version": "4.3.4",
+      "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.3.4.tgz",
+      "integrity": "sha512-SCCPBJtYLdE8PX/7ZQAs1QAZ8Jqwih+0VBLum1EGqmCCQal+MIUqLCzj3ZUy8ufbC0cAM4LRlSTm7IQJwWT4ug==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/core": "^7.26.0",
+        "@babel/plugin-transform-react-jsx-self": "^7.25.9",
+        "@babel/plugin-transform-react-jsx-source": "^7.25.9",
+        "@types/babel__core": "^7.20.5",
+        "react-refresh": "^0.14.2"
+      },
+      "engines": {
+        "node": "^14.18.0 || >=16.0.0"
+      },
+      "peerDependencies": {
+        "vite": "^4.2.0 || ^5.0.0 || ^6.0.0"
+      }
+    },
+    "node_modules/@vscode/markdown-it-katex": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz",
+      "integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==",
+      "license": "MIT",
+      "dependencies": {
+        "katex": "^0.16.4"
+      }
+    },
+    "node_modules/acorn": {
+      "version": "8.14.0",
+      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.0.tgz",
+      "integrity": "sha512-cl669nCJTZBsL97OF4kUQm5g5hC2uihk0NxY3WENAC0TYdILVkAyHymAntgxGkl7K+t0cXIrH5siy5S4XkFycA==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "acorn": "bin/acorn"
+      },
+      "engines": {
+        "node": ">=0.4.0"
+      }
+    },
+    "node_modules/acorn-jsx": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz",
+      "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==",
+      "dev": true,
+      "license": "MIT",
+      "peerDependencies": {
+        "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
+      }
+    },
+    "node_modules/ajv": {
+      "version": "6.12.6",
+      "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
+      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "fast-deep-equal": "^3.1.1",
+        "fast-json-stable-stringify": "^2.0.0",
+        "json-schema-traverse": "^0.4.1",
+        "uri-js": "^4.2.2"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/epoberezkin"
+      }
+    },
+    "node_modules/ansi-regex": {
+      "version": "6.1.0",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz",
+      "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-regex?sponsor=1"
+      }
+    },
+    "node_modules/ansi-styles": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+      "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+      "license": "MIT",
+      "dependencies": {
+        "color-convert": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/any-promise": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz",
+      "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==",
+      "license": "MIT"
+    },
+    "node_modules/anymatch": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz",
+      "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==",
+      "license": "ISC",
+      "dependencies": {
+        "normalize-path": "^3.0.0",
+        "picomatch": "^2.0.4"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/arg": {
+      "version": "5.0.2",
+      "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz",
+      "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==",
+      "license": "MIT"
+    },
+    "node_modules/argparse": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
+      "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
+      "dev": true,
+      "license": "Python-2.0"
+    },
+    "node_modules/autoprefixer": {
+      "version": "10.4.20",
+      "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz",
+      "integrity": "sha512-XY25y5xSv/wEoqzDyXXME4AFfkZI0P23z6Fs3YgymDnKJkCGOnkL0iTxCa85UTqaSgfcqyf3UA6+c7wUvx/16g==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/autoprefixer"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "browserslist": "^4.23.3",
+        "caniuse-lite": "^1.0.30001646",
+        "fraction.js": "^4.3.7",
+        "normalize-range": "^0.1.2",
+        "picocolors": "^1.0.1",
+        "postcss-value-parser": "^4.2.0"
+      },
+      "bin": {
+        "autoprefixer": "bin/autoprefixer"
+      },
+      "engines": {
+        "node": "^10 || ^12 || >=14"
+      },
+      "peerDependencies": {
+        "postcss": "^8.1.0"
+      }
+    },
+    "node_modules/bail": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
+      "integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/balanced-match": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
+      "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
+      "license": "MIT"
+    },
+    "node_modules/binary-extensions": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz",
+      "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/brace-expansion": {
+      "version": "1.1.11",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
+      "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^1.0.0",
+        "concat-map": "0.0.1"
+      }
+    },
+    "node_modules/braces": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
+      "license": "MIT",
+      "dependencies": {
+        "fill-range": "^7.1.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/browserslist": {
+      "version": "4.24.4",
+      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.4.tgz",
+      "integrity": "sha512-KDi1Ny1gSePi1vm0q4oxSF8b4DR44GF4BbmS2YdhPLOEqd8pDviZOGH/GsmRwoWJ2+5Lr085X7naowMwKHDG1A==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/browserslist"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/browserslist"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "caniuse-lite": "^1.0.30001688",
+        "electron-to-chromium": "^1.5.73",
+        "node-releases": "^2.0.19",
+        "update-browserslist-db": "^1.1.1"
+      },
+      "bin": {
+        "browserslist": "cli.js"
+      },
+      "engines": {
+        "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7"
+      }
+    },
+    "node_modules/buffer-builder": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz",
+      "integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==",
+      "devOptional": true,
+      "license": "MIT/X11"
+    },
+    "node_modules/callsites": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz",
+      "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/camelcase-css": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz",
+      "integrity": "sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/caniuse-lite": {
+      "version": "1.0.30001697",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001697.tgz",
+      "integrity": "sha512-GwNPlWJin8E+d7Gxq96jxM6w0w+VFeyyXRsjU58emtkYqnbwHqXm5uT2uCmO0RQE9htWknOP4xtBlLmM/gWxvQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/browserslist"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/caniuse-lite"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "CC-BY-4.0"
+    },
+    "node_modules/ccount": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz",
+      "integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/chalk": {
+      "version": "4.1.2",
+      "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+      "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^4.1.0",
+        "supports-color": "^7.1.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/chalk?sponsor=1"
+      }
+    },
+    "node_modules/character-entities": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-2.0.2.tgz",
+      "integrity": "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/character-entities-html4": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz",
+      "integrity": "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/character-entities-legacy": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-3.0.0.tgz",
+      "integrity": "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/character-reference-invalid": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/character-reference-invalid/-/character-reference-invalid-2.0.1.tgz",
+      "integrity": "sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/chokidar": {
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz",
+      "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==",
+      "license": "MIT",
+      "dependencies": {
+        "anymatch": "~3.1.2",
+        "braces": "~3.0.2",
+        "glob-parent": "~5.1.2",
+        "is-binary-path": "~2.1.0",
+        "is-glob": "~4.0.1",
+        "normalize-path": "~3.0.0",
+        "readdirp": "~3.6.0"
+      },
+      "engines": {
+        "node": ">= 8.10.0"
+      },
+      "funding": {
+        "url": "https://paulmillr.com/funding/"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.2"
+      }
+    },
+    "node_modules/chokidar/node_modules/glob-parent": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+      "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/color-convert": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+      "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+      "license": "MIT",
+      "dependencies": {
+        "color-name": "~1.1.4"
+      },
+      "engines": {
+        "node": ">=7.0.0"
+      }
+    },
+    "node_modules/color-name": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+      "license": "MIT"
+    },
+    "node_modules/colorjs.io": {
+      "version": "0.5.2",
+      "resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz",
+      "integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==",
+      "devOptional": true,
+      "license": "MIT"
+    },
+    "node_modules/comma-separated-tokens": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz",
+      "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/commander": {
+      "version": "8.3.0",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz",
+      "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 12"
+      }
+    },
+    "node_modules/concat-map": {
+      "version": "0.0.1",
+      "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
+      "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/convert-source-map": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz",
+      "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/cookie": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.0.2.tgz",
+      "integrity": "sha512-9Kr/j4O16ISv8zBBhJoi4bXOYNTkFLOqSL3UDB0njXxCXNezjeyVrJyGOWtgfs/q2km1gwBcfH8q1yEGoMYunA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/cross-spawn": {
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
+      "license": "MIT",
+      "dependencies": {
+        "path-key": "^3.1.0",
+        "shebang-command": "^2.0.0",
+        "which": "^2.0.1"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/css-selector-tokenizer": {
+      "version": "0.8.0",
+      "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz",
+      "integrity": "sha512-Jd6Ig3/pe62/qe5SBPTN8h8LeUg/pT4lLgtavPf7updwwHpvFzxvOQBHYj2LZDMjUnBzgvIUSjRcf6oT5HzHFg==",
+      "license": "MIT",
+      "dependencies": {
+        "cssesc": "^3.0.0",
+        "fastparse": "^1.1.2"
+      }
+    },
+    "node_modules/cssesc": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz",
+      "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==",
+      "license": "MIT",
+      "bin": {
+        "cssesc": "bin/cssesc"
+      },
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/csstype": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
+      "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
+      "license": "MIT"
+    },
+    "node_modules/culori": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/culori/-/culori-3.3.0.tgz",
+      "integrity": "sha512-pHJg+jbuFsCjz9iclQBqyL3B2HLCBF71BwVNujUYEvCeQMvV97R59MNK3R2+jgJ3a1fcZgI9B3vYgz8lzr/BFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": "^12.20.0 || ^14.13.1 || >=16.0.0"
+      }
+    },
+    "node_modules/daisyui": {
+      "version": "4.12.23",
+      "resolved": "https://registry.npmjs.org/daisyui/-/daisyui-4.12.23.tgz",
+      "integrity": "sha512-EM38duvxutJ5PD65lO/AFMpcw+9qEy6XAZrTpzp7WyaPeO/l+F/Qiq0ECHHmFNcFXh5aVoALY4MGrrxtCiaQCQ==",
+      "license": "MIT",
+      "dependencies": {
+        "css-selector-tokenizer": "^0.8",
+        "culori": "^3",
+        "picocolors": "^1",
+        "postcss-js": "^4"
+      },
+      "engines": {
+        "node": ">=16.9.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/daisyui"
+      }
+    },
+    "node_modules/debug": {
+      "version": "4.4.0",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz",
+      "integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==",
+      "license": "MIT",
+      "dependencies": {
+        "ms": "^2.1.3"
+      },
+      "engines": {
+        "node": ">=6.0"
+      },
+      "peerDependenciesMeta": {
+        "supports-color": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/decode-named-character-reference": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/decode-named-character-reference/-/decode-named-character-reference-1.0.2.tgz",
+      "integrity": "sha512-O8x12RzrUF8xyVcY0KJowWsmaJxQbmy0/EtnNtHRpsOcT7dFk5W598coHqBVpmWo1oQQfsCqfCmkZN5DJrZVdg==",
+      "license": "MIT",
+      "dependencies": {
+        "character-entities": "^2.0.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/deep-is": {
+      "version": "0.1.4",
+      "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz",
+      "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/dequal": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
+      "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/devlop": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz",
+      "integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==",
+      "license": "MIT",
+      "dependencies": {
+        "dequal": "^2.0.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/didyoumean": {
+      "version": "1.2.2",
+      "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz",
+      "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==",
+      "license": "Apache-2.0"
+    },
+    "node_modules/dlv": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz",
+      "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==",
+      "license": "MIT"
+    },
+    "node_modules/eastasianwidth": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz",
+      "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==",
+      "license": "MIT"
+    },
+    "node_modules/electron-to-chromium": {
+      "version": "1.5.91",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.91.tgz",
+      "integrity": "sha512-sNSHHyq048PFmZY4S90ax61q+gLCs0X0YmcOII9wG9S2XwbVr+h4VW2wWhnbp/Eys3cCwTxVF292W3qPaxIapQ==",
+      "license": "ISC"
+    },
+    "node_modules/emoji-regex": {
+      "version": "9.2.2",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
+      "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==",
+      "license": "MIT"
+    },
+    "node_modules/entities": {
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+      "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=0.12"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/entities?sponsor=1"
+      }
+    },
+    "node_modules/esbuild": {
+      "version": "0.24.2",
+      "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.24.2.tgz",
+      "integrity": "sha512-+9egpBW8I3CD5XPe0n6BfT5fxLzxrlDzqydF3aviG+9ni1lDC/OvMHcxqEFV0+LANZG5R1bFMWfUrjVsdwxJvA==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "bin": {
+        "esbuild": "bin/esbuild"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "optionalDependencies": {
+        "@esbuild/aix-ppc64": "0.24.2",
+        "@esbuild/android-arm": "0.24.2",
+        "@esbuild/android-arm64": "0.24.2",
+        "@esbuild/android-x64": "0.24.2",
+        "@esbuild/darwin-arm64": "0.24.2",
+        "@esbuild/darwin-x64": "0.24.2",
+        "@esbuild/freebsd-arm64": "0.24.2",
+        "@esbuild/freebsd-x64": "0.24.2",
+        "@esbuild/linux-arm": "0.24.2",
+        "@esbuild/linux-arm64": "0.24.2",
+        "@esbuild/linux-ia32": "0.24.2",
+        "@esbuild/linux-loong64": "0.24.2",
+        "@esbuild/linux-mips64el": "0.24.2",
+        "@esbuild/linux-ppc64": "0.24.2",
+        "@esbuild/linux-riscv64": "0.24.2",
+        "@esbuild/linux-s390x": "0.24.2",
+        "@esbuild/linux-x64": "0.24.2",
+        "@esbuild/netbsd-arm64": "0.24.2",
+        "@esbuild/netbsd-x64": "0.24.2",
+        "@esbuild/openbsd-arm64": "0.24.2",
+        "@esbuild/openbsd-x64": "0.24.2",
+        "@esbuild/sunos-x64": "0.24.2",
+        "@esbuild/win32-arm64": "0.24.2",
+        "@esbuild/win32-ia32": "0.24.2",
+        "@esbuild/win32-x64": "0.24.2"
+      }
+    },
+    "node_modules/escalade": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
+      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/escape-string-regexp": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
+      "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/eslint": {
+      "version": "9.19.0",
+      "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.19.0.tgz",
+      "integrity": "sha512-ug92j0LepKlbbEv6hD911THhoRHmbdXt2gX+VDABAW/Ir7D3nqKdv5Pf5vtlyY6HQMTEP2skXY43ueqTCWssEA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@eslint-community/eslint-utils": "^4.2.0",
+        "@eslint-community/regexpp": "^4.12.1",
+        "@eslint/config-array": "^0.19.0",
+        "@eslint/core": "^0.10.0",
+        "@eslint/eslintrc": "^3.2.0",
+        "@eslint/js": "9.19.0",
+        "@eslint/plugin-kit": "^0.2.5",
+        "@humanfs/node": "^0.16.6",
+        "@humanwhocodes/module-importer": "^1.0.1",
+        "@humanwhocodes/retry": "^0.4.1",
+        "@types/estree": "^1.0.6",
+        "@types/json-schema": "^7.0.15",
+        "ajv": "^6.12.4",
+        "chalk": "^4.0.0",
+        "cross-spawn": "^7.0.6",
+        "debug": "^4.3.2",
+        "escape-string-regexp": "^4.0.0",
+        "eslint-scope": "^8.2.0",
+        "eslint-visitor-keys": "^4.2.0",
+        "espree": "^10.3.0",
+        "esquery": "^1.5.0",
+        "esutils": "^2.0.2",
+        "fast-deep-equal": "^3.1.3",
+        "file-entry-cache": "^8.0.0",
+        "find-up": "^5.0.0",
+        "glob-parent": "^6.0.2",
+        "ignore": "^5.2.0",
+        "imurmurhash": "^0.1.4",
+        "is-glob": "^4.0.0",
+        "json-stable-stringify-without-jsonify": "^1.0.1",
+        "lodash.merge": "^4.6.2",
+        "minimatch": "^3.1.2",
+        "natural-compare": "^1.4.0",
+        "optionator": "^0.9.3"
+      },
+      "bin": {
+        "eslint": "bin/eslint.js"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "url": "https://eslint.org/donate"
+      },
+      "peerDependencies": {
+        "jiti": "*"
+      },
+      "peerDependenciesMeta": {
+        "jiti": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/eslint-plugin-react-hooks": {
+      "version": "5.1.0",
+      "resolved": "https://registry.npmjs.org/eslint-plugin-react-hooks/-/eslint-plugin-react-hooks-5.1.0.tgz",
+      "integrity": "sha512-mpJRtPgHN2tNAvZ35AMfqeB3Xqeo273QxrHJsbBEPWODRM4r0yB6jfoROqKEYrOn27UtRPpcpHc2UqyBSuUNTw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      },
+      "peerDependencies": {
+        "eslint": "^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0-0 || ^9.0.0"
+      }
+    },
+    "node_modules/eslint-plugin-react-refresh": {
+      "version": "0.4.18",
+      "resolved": "https://registry.npmjs.org/eslint-plugin-react-refresh/-/eslint-plugin-react-refresh-0.4.18.tgz",
+      "integrity": "sha512-IRGEoFn3OKalm3hjfolEWGqoF/jPqeEYFp+C8B0WMzwGwBMvlRDQd06kghDhF0C61uJ6WfSDhEZE/sAQjduKgw==",
+      "dev": true,
+      "license": "MIT",
+      "peerDependencies": {
+        "eslint": ">=8.40"
+      }
+    },
+    "node_modules/eslint-scope": {
+      "version": "8.2.0",
+      "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.2.0.tgz",
+      "integrity": "sha512-PHlWUfG6lvPc3yvP5A4PNyBL1W8fkDUccmI21JUu/+GKZBoH/W5u6usENXUrWFRsyoW5ACUjFGgAFQp5gUlb/A==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "esrecurse": "^4.3.0",
+        "estraverse": "^5.2.0"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/eslint"
+      }
+    },
+    "node_modules/eslint-visitor-keys": {
+      "version": "4.2.0",
+      "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.0.tgz",
+      "integrity": "sha512-UyLnSehNt62FFhSwjZlHmeokpRK59rcz29j+F1/aDgbkbRTk7wIc9XzdoasMUbRNKDM0qQt/+BJ4BrpFeABemw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/eslint"
+      }
+    },
+    "node_modules/espree": {
+      "version": "10.3.0",
+      "resolved": "https://registry.npmjs.org/espree/-/espree-10.3.0.tgz",
+      "integrity": "sha512-0QYC8b24HWY8zjRnDTL6RiHfDbAWn63qb4LMj1Z4b076A4une81+z03Kg7l7mn/48PUTqoLptSXez8oknU8Clg==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "acorn": "^8.14.0",
+        "acorn-jsx": "^5.3.2",
+        "eslint-visitor-keys": "^4.2.0"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/eslint"
+      }
+    },
+    "node_modules/esquery": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.6.0.tgz",
+      "integrity": "sha512-ca9pw9fomFcKPvFLXhBKUK90ZvGibiGOvRJNbjljY7s7uq/5YO4BOzcYtJqExdx99rF6aAcnRxHmcUHcz6sQsg==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "estraverse": "^5.1.0"
+      },
+      "engines": {
+        "node": ">=0.10"
+      }
+    },
+    "node_modules/esrecurse": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz",
+      "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "estraverse": "^5.2.0"
+      },
+      "engines": {
+        "node": ">=4.0"
+      }
+    },
+    "node_modules/estraverse": {
+      "version": "5.3.0",
+      "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
+      "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=4.0"
+      }
+    },
+    "node_modules/estree-util-is-identifier-name": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/estree-util-is-identifier-name/-/estree-util-is-identifier-name-3.0.0.tgz",
+      "integrity": "sha512-hFtqIDZTIUZ9BXLb8y4pYGyk6+wekIivNVTcmvk8NoOh+VeRn5y6cEHzbURrWbfp1fIqdVipilzj+lfaadNZmg==",
+      "license": "MIT",
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/esutils": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
+      "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/extend": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
+      "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
+      "license": "MIT"
+    },
+    "node_modules/fast-deep-equal": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
+      "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/fast-glob": {
+      "version": "3.3.3",
+      "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz",
+      "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==",
+      "license": "MIT",
+      "dependencies": {
+        "@nodelib/fs.stat": "^2.0.2",
+        "@nodelib/fs.walk": "^1.2.3",
+        "glob-parent": "^5.1.2",
+        "merge2": "^1.3.0",
+        "micromatch": "^4.0.8"
+      },
+      "engines": {
+        "node": ">=8.6.0"
+      }
+    },
+    "node_modules/fast-glob/node_modules/glob-parent": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+      "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/fast-json-stable-stringify": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
+      "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/fast-levenshtein": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz",
+      "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/fastparse": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.2.tgz",
+      "integrity": "sha512-483XLLxTVIwWK3QTrMGRqUfUpoOs/0hbQrl2oz4J0pAcm3A3bu84wxTFqGqkJzewCLdME38xJLJAxBABfQT8sQ==",
+      "license": "MIT"
+    },
+    "node_modules/fastq": {
+      "version": "1.19.0",
+      "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.0.tgz",
+      "integrity": "sha512-7SFSRCNjBQIZH/xZR3iy5iQYR8aGBE0h3VG6/cwlbrpdciNYBMotQav8c1XI3HjHH+NikUpP53nPdlZSdWmFzA==",
+      "license": "ISC",
+      "dependencies": {
+        "reusify": "^1.0.4"
+      }
+    },
+    "node_modules/file-entry-cache": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",
+      "integrity": "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "flat-cache": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
+    "node_modules/fill-range": {
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
+      "license": "MIT",
+      "dependencies": {
+        "to-regex-range": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/find-up": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz",
+      "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "locate-path": "^6.0.0",
+        "path-exists": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/flat-cache": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-4.0.1.tgz",
+      "integrity": "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "flatted": "^3.2.9",
+        "keyv": "^4.5.4"
+      },
+      "engines": {
+        "node": ">=16"
+      }
+    },
+    "node_modules/flatted": {
+      "version": "3.3.2",
+      "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.2.tgz",
+      "integrity": "sha512-AiwGJM8YcNOaobumgtng+6NHuOqC3A7MixFeDafM3X9cIUM+xUXoS5Vfgf+OihAYe20fxqNM9yPBXJzRtZ/4eA==",
+      "dev": true,
+      "license": "ISC"
+    },
+    "node_modules/foreground-child": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz",
+      "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==",
+      "license": "ISC",
+      "dependencies": {
+        "cross-spawn": "^7.0.0",
+        "signal-exit": "^4.0.1"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/fraction.js": {
+      "version": "4.3.7",
+      "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz",
+      "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==",
+      "license": "MIT",
+      "engines": {
+        "node": "*"
+      },
+      "funding": {
+        "type": "patreon",
+        "url": "https://github.com/sponsors/rawify"
+      }
+    },
+    "node_modules/fsevents": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
+      "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
+    "node_modules/function-bind": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
+      "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/gensync": {
+      "version": "1.0.0-beta.2",
+      "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz",
+      "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/glob": {
+      "version": "10.4.5",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz",
+      "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==",
+      "license": "ISC",
+      "dependencies": {
+        "foreground-child": "^3.1.0",
+        "jackspeak": "^3.1.2",
+        "minimatch": "^9.0.4",
+        "minipass": "^7.1.2",
+        "package-json-from-dist": "^1.0.0",
+        "path-scurry": "^1.11.1"
+      },
+      "bin": {
+        "glob": "dist/esm/bin.mjs"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/glob-parent": {
+      "version": "6.0.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz",
+      "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==",
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.3"
+      },
+      "engines": {
+        "node": ">=10.13.0"
+      }
+    },
+    "node_modules/glob/node_modules/brace-expansion": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
+      "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^1.0.0"
+      }
+    },
+    "node_modules/glob/node_modules/minimatch": {
+      "version": "9.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
+      "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
+      "license": "ISC",
+      "dependencies": {
+        "brace-expansion": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/globals": {
+      "version": "15.14.0",
+      "resolved": "https://registry.npmjs.org/globals/-/globals-15.14.0.tgz",
+      "integrity": "sha512-OkToC372DtlQeje9/zHIo5CT8lRP/FUgEOKBEhU4e0abL7J7CD24fD9ohiLN5hagG/kWCYj4K5oaxxtj2Z0Dig==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/graphemer": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz",
+      "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/has-flag": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+      "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+      "devOptional": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/hasown": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
+      "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
+      "license": "MIT",
+      "dependencies": {
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/hast-util-from-dom": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.1.tgz",
+      "integrity": "sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==",
+      "license": "ISC",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "hastscript": "^9.0.0",
+        "web-namespaces": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-from-html": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz",
+      "integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "devlop": "^1.1.0",
+        "hast-util-from-parse5": "^8.0.0",
+        "parse5": "^7.0.0",
+        "vfile": "^6.0.0",
+        "vfile-message": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-from-html-isomorphic": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/hast-util-from-html-isomorphic/-/hast-util-from-html-isomorphic-2.0.0.tgz",
+      "integrity": "sha512-zJfpXq44yff2hmE0XmwEOzdWin5xwH+QIhMLOScpX91e/NSGPsAzNCvLQDIEPyO2TXi+lBmU6hjLIhV8MwP2kw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "hast-util-from-dom": "^5.0.0",
+        "hast-util-from-html": "^2.0.0",
+        "unist-util-remove-position": "^5.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-from-parse5": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.2.tgz",
+      "integrity": "sha512-SfMzfdAi/zAoZ1KkFEyyeXBn7u/ShQrfd675ZEE9M3qj+PMFX05xubzRyF76CCSJu8au9jgVxDV1+okFvgZU4A==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "@types/unist": "^3.0.0",
+        "devlop": "^1.0.0",
+        "hastscript": "^9.0.0",
+        "property-information": "^6.0.0",
+        "vfile": "^6.0.0",
+        "vfile-location": "^5.0.0",
+        "web-namespaces": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-is-element": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz",
+      "integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-parse-selector": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz",
+      "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-to-jsx-runtime": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.2.tgz",
+      "integrity": "sha512-1ngXYb+V9UT5h+PxNRa1O1FYguZK/XL+gkeqvp7EdHlB9oHUG0eYRo/vY5inBdcqo3RkPMC58/H94HvkbfGdyg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree": "^1.0.0",
+        "@types/hast": "^3.0.0",
+        "@types/unist": "^3.0.0",
+        "comma-separated-tokens": "^2.0.0",
+        "devlop": "^1.0.0",
+        "estree-util-is-identifier-name": "^3.0.0",
+        "hast-util-whitespace": "^3.0.0",
+        "mdast-util-mdx-expression": "^2.0.0",
+        "mdast-util-mdx-jsx": "^3.0.0",
+        "mdast-util-mdxjs-esm": "^2.0.0",
+        "property-information": "^6.0.0",
+        "space-separated-tokens": "^2.0.0",
+        "style-to-object": "^1.0.0",
+        "unist-util-position": "^5.0.0",
+        "vfile-message": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-to-text": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/hast-util-to-text/-/hast-util-to-text-4.0.2.tgz",
+      "integrity": "sha512-KK6y/BN8lbaq654j7JgBydev7wuNMcID54lkRav1P0CaE1e47P72AWWPiGKXTJU271ooYzcvTAn/Zt0REnvc7A==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "@types/unist": "^3.0.0",
+        "hast-util-is-element": "^3.0.0",
+        "unist-util-find-after": "^5.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hast-util-whitespace": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz",
+      "integrity": "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/hastscript": {
+      "version": "9.0.0",
+      "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.0.tgz",
+      "integrity": "sha512-jzaLBGavEDKHrc5EfFImKN7nZKKBdSLIdGvCwDZ9TfzbF2ffXiov8CKE445L2Z1Ek2t/m4SKQ2j6Ipv7NyUolw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "comma-separated-tokens": "^2.0.0",
+        "hast-util-parse-selector": "^4.0.0",
+        "property-information": "^6.0.0",
+        "space-separated-tokens": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/highlight.js": {
+      "version": "11.11.1",
+      "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz",
+      "integrity": "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w==",
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=12.0.0"
+      }
+    },
+    "node_modules/html-url-attributes": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz",
+      "integrity": "sha512-ol6UPyBWqsrO6EJySPz2O7ZSr856WDrEzM5zMqp+FJJLGMW35cLYmmZnl0vztAZxRUoNZJFTCohfjuIJ8I4QBQ==",
+      "license": "MIT",
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/ignore": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz",
+      "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 4"
+      }
+    },
+    "node_modules/immutable": {
+      "version": "5.0.3",
+      "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz",
+      "integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==",
+      "devOptional": true,
+      "license": "MIT"
+    },
+    "node_modules/import-fresh": {
+      "version": "3.3.1",
+      "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz",
+      "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "parent-module": "^1.0.0",
+        "resolve-from": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/imurmurhash": {
+      "version": "0.1.4",
+      "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz",
+      "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.8.19"
+      }
+    },
+    "node_modules/inline-style-parser": {
+      "version": "0.2.4",
+      "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.4.tgz",
+      "integrity": "sha512-0aO8FkhNZlj/ZIbNi7Lxxr12obT7cL1moPfE4tg1LkX7LlLfC6DeX4l2ZEud1ukP9jNQyNnfzQVqwbwmAATY4Q==",
+      "license": "MIT"
+    },
+    "node_modules/is-alphabetical": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/is-alphabetical/-/is-alphabetical-2.0.1.tgz",
+      "integrity": "sha512-FWyyY60MeTNyeSRpkM2Iry0G9hpr7/9kD40mD/cGQEuilcZYS4okz8SN2Q6rLCJ8gbCt6fN+rC+6tMGS99LaxQ==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/is-alphanumerical": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/is-alphanumerical/-/is-alphanumerical-2.0.1.tgz",
+      "integrity": "sha512-hmbYhX/9MUMF5uh7tOXyK/n0ZvWpad5caBA17GsC6vyuCqaWliRG5K1qS9inmUhEMaOBIW7/whAnSwveW/LtZw==",
+      "license": "MIT",
+      "dependencies": {
+        "is-alphabetical": "^2.0.0",
+        "is-decimal": "^2.0.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/is-binary-path": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz",
+      "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==",
+      "license": "MIT",
+      "dependencies": {
+        "binary-extensions": "^2.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/is-core-module": {
+      "version": "2.16.1",
+      "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz",
+      "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==",
+      "license": "MIT",
+      "dependencies": {
+        "hasown": "^2.0.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/is-decimal": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/is-decimal/-/is-decimal-2.0.1.tgz",
+      "integrity": "sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/is-extglob": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
+      "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/is-fullwidth-code-point": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
+      "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/is-glob": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
+      "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==",
+      "license": "MIT",
+      "dependencies": {
+        "is-extglob": "^2.1.1"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/is-hexadecimal": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/is-hexadecimal/-/is-hexadecimal-2.0.1.tgz",
+      "integrity": "sha512-DgZQp241c8oO6cA1SbTEWiXeoxV42vlcJxgH+B3hi1AiqqKruZR3ZGF8In3fj4+/y/7rHvlOZLZtgJ/4ttYGZg==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/is-number": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
+      "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.12.0"
+      }
+    },
+    "node_modules/is-plain-obj": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz",
+      "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/isexe": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
+      "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+      "license": "ISC"
+    },
+    "node_modules/jackspeak": {
+      "version": "3.4.3",
+      "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz",
+      "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==",
+      "license": "BlueOak-1.0.0",
+      "dependencies": {
+        "@isaacs/cliui": "^8.0.2"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      },
+      "optionalDependencies": {
+        "@pkgjs/parseargs": "^0.11.0"
+      }
+    },
+    "node_modules/jiti": {
+      "version": "1.21.7",
+      "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz",
+      "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==",
+      "license": "MIT",
+      "bin": {
+        "jiti": "bin/jiti.js"
+      }
+    },
+    "node_modules/js-tokens": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
+      "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
+      "license": "MIT"
+    },
+    "node_modules/js-yaml": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
+      "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "argparse": "^2.0.1"
+      },
+      "bin": {
+        "js-yaml": "bin/js-yaml.js"
+      }
+    },
+    "node_modules/jsesc": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz",
+      "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "jsesc": "bin/jsesc"
+      },
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/json-buffer": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz",
+      "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/json-schema-traverse": {
+      "version": "0.4.1",
+      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
+      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/json-stable-stringify-without-jsonify": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz",
+      "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/json5": {
+      "version": "2.2.3",
+      "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz",
+      "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "json5": "lib/cli.js"
+      },
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/katex": {
+      "version": "0.16.21",
+      "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.21.tgz",
+      "integrity": "sha512-XvqR7FgOHtWupfMiigNzmh+MgUVmDGU2kXZm899ZkPfcuoPuFxyHmXsgATDpFZDAXCI8tvinaVcDo8PIIJSo4A==",
+      "funding": [
+        "https://opencollective.com/katex",
+        "https://github.com/sponsors/katex"
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "commander": "^8.3.0"
+      },
+      "bin": {
+        "katex": "cli.js"
+      }
+    },
+    "node_modules/keyv": {
+      "version": "4.5.4",
+      "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
+      "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "json-buffer": "3.0.1"
+      }
+    },
+    "node_modules/levn": {
+      "version": "0.4.1",
+      "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz",
+      "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "prelude-ls": "^1.2.1",
+        "type-check": "~0.4.0"
+      },
+      "engines": {
+        "node": ">= 0.8.0"
+      }
+    },
+    "node_modules/lilconfig": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz",
+      "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/antonk52"
+      }
+    },
+    "node_modules/lines-and-columns": {
+      "version": "1.2.4",
+      "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz",
+      "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==",
+      "license": "MIT"
+    },
+    "node_modules/locate-path": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
+      "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "p-locate": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/lodash.merge": {
+      "version": "4.6.2",
+      "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
+      "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/longest-streak": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz",
+      "integrity": "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/loose-envify": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
+      "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
+      "license": "MIT",
+      "dependencies": {
+        "js-tokens": "^3.0.0 || ^4.0.0"
+      },
+      "bin": {
+        "loose-envify": "cli.js"
+      }
+    },
+    "node_modules/lowlight": {
+      "version": "3.3.0",
+      "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-3.3.0.tgz",
+      "integrity": "sha512-0JNhgFoPvP6U6lE/UdVsSq99tn6DhjjpAj5MxG49ewd2mOBVtwWYIT8ClyABhq198aXXODMU6Ox8DrGy/CpTZQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "devlop": "^1.0.0",
+        "highlight.js": "~11.11.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/lru-cache": {
+      "version": "5.1.1",
+      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
+      "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "yallist": "^3.0.2"
+      }
+    },
+    "node_modules/markdown-table": {
+      "version": "3.0.4",
+      "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz",
+      "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/mdast-util-find-and-replace": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmjs.org/mdast-util-find-and-replace/-/mdast-util-find-and-replace-3.0.2.tgz",
+      "integrity": "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "escape-string-regexp": "^5.0.0",
+        "unist-util-is": "^6.0.0",
+        "unist-util-visit-parents": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-find-and-replace/node_modules/escape-string-regexp": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz",
+      "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/mdast-util-from-markdown": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.2.tgz",
+      "integrity": "sha512-uZhTV/8NBuw0WHkPTrCqDOl0zVe1BIng5ZtHoDk49ME1qqcjYmmLmOf0gELgcRMxN4w2iuIeVso5/6QymSrgmA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "@types/unist": "^3.0.0",
+        "decode-named-character-reference": "^1.0.0",
+        "devlop": "^1.0.0",
+        "mdast-util-to-string": "^4.0.0",
+        "micromark": "^4.0.0",
+        "micromark-util-decode-numeric-character-reference": "^2.0.0",
+        "micromark-util-decode-string": "^2.0.0",
+        "micromark-util-normalize-identifier": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0",
+        "unist-util-stringify-position": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-gfm": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.0.0.tgz",
+      "integrity": "sha512-dgQEX5Amaq+DuUqf26jJqSK9qgixgd6rYDHAv4aTBuA92cTknZlKpPfa86Z/s8Dj8xsAQpFfBmPUHWJBWqS4Bw==",
+      "license": "MIT",
+      "dependencies": {
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-gfm-autolink-literal": "^2.0.0",
+        "mdast-util-gfm-footnote": "^2.0.0",
+        "mdast-util-gfm-strikethrough": "^2.0.0",
+        "mdast-util-gfm-table": "^2.0.0",
+        "mdast-util-gfm-task-list-item": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-gfm-autolink-literal": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz",
+      "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "ccount": "^2.0.0",
+        "devlop": "^1.0.0",
+        "mdast-util-find-and-replace": "^3.0.0",
+        "micromark-util-character": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-gfm-footnote": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.0.0.tgz",
+      "integrity": "sha512-5jOT2boTSVkMnQ7LTrd6n/18kqwjmuYqo7JUPe+tRCY6O7dAuTFMtTPauYYrMPpox9hlN0uOx/FL8XvEfG9/mQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "devlop": "^1.1.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0",
+        "micromark-util-normalize-identifier": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-gfm-strikethrough": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz",
+      "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-gfm-table": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz",
+      "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "devlop": "^1.0.0",
+        "markdown-table": "^3.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-gfm-task-list-item": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz",
+      "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "devlop": "^1.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-math": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-math/-/mdast-util-math-3.0.0.tgz",
+      "integrity": "sha512-Tl9GBNeG/AhJnQM221bJR2HPvLOSnLE/T9cJI9tlc6zwQk2nPk/4f0cHkOdEixQPC/j8UtKDdITswvLAy1OZ1w==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "@types/mdast": "^4.0.0",
+        "devlop": "^1.0.0",
+        "longest-streak": "^3.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.1.0",
+        "unist-util-remove-position": "^5.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-mdx-expression": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/mdast-util-mdx-expression/-/mdast-util-mdx-expression-2.0.1.tgz",
+      "integrity": "sha512-J6f+9hUp+ldTZqKRSg7Vw5V6MqjATc+3E4gf3CFNcuZNWD8XdyI6zQ8GqH7f8169MM6P7hMBRDVGnn7oHB9kXQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree-jsx": "^1.0.0",
+        "@types/hast": "^3.0.0",
+        "@types/mdast": "^4.0.0",
+        "devlop": "^1.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-mdx-jsx": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-mdx-jsx/-/mdast-util-mdx-jsx-3.2.0.tgz",
+      "integrity": "sha512-lj/z8v0r6ZtsN/cGNNtemmmfoLAFZnjMbNyLzBafjzikOM+glrjNHPlf6lQDOTccj9n5b0PPihEBbhneMyGs1Q==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree-jsx": "^1.0.0",
+        "@types/hast": "^3.0.0",
+        "@types/mdast": "^4.0.0",
+        "@types/unist": "^3.0.0",
+        "ccount": "^2.0.0",
+        "devlop": "^1.1.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0",
+        "parse-entities": "^4.0.0",
+        "stringify-entities": "^4.0.0",
+        "unist-util-stringify-position": "^4.0.0",
+        "vfile-message": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-mdxjs-esm": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/mdast-util-mdxjs-esm/-/mdast-util-mdxjs-esm-2.0.1.tgz",
+      "integrity": "sha512-EcmOpxsZ96CvlP03NghtH1EsLtr0n9Tm4lPUJUBccV9RwUOneqSycg19n5HGzCf+10LozMRSObtVr3ee1WoHtg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree-jsx": "^1.0.0",
+        "@types/hast": "^3.0.0",
+        "@types/mdast": "^4.0.0",
+        "devlop": "^1.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "mdast-util-to-markdown": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-newline-to-break": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-newline-to-break/-/mdast-util-newline-to-break-2.0.0.tgz",
+      "integrity": "sha512-MbgeFca0hLYIEx/2zGsszCSEJJ1JSCdiY5xQxRcLDDGa8EPvlLPupJ4DSajbMPAnC0je8jfb9TiUATnxxrHUog==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-find-and-replace": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-phrasing": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-phrasing/-/mdast-util-phrasing-4.1.0.tgz",
+      "integrity": "sha512-TqICwyvJJpBwvGAMZjj4J2n0X8QWp21b9l0o7eXyVJ25YNWYbJDVIyD1bZXE6WtV6RmKJVYmQAKWa0zWOABz2w==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "unist-util-is": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-to-hast": {
+      "version": "13.2.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz",
+      "integrity": "sha512-QGYKEuUsYT9ykKBCMOEDLsU5JRObWQusAolFMeko/tYPufNkRffBAQjIE+99jbA87xv6FgmjLtwjh9wBWajwAA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "@types/mdast": "^4.0.0",
+        "@ungap/structured-clone": "^1.0.0",
+        "devlop": "^1.0.0",
+        "micromark-util-sanitize-uri": "^2.0.0",
+        "trim-lines": "^3.0.0",
+        "unist-util-position": "^5.0.0",
+        "unist-util-visit": "^5.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-to-markdown": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/mdast-util-to-markdown/-/mdast-util-to-markdown-2.1.2.tgz",
+      "integrity": "sha512-xj68wMTvGXVOKonmog6LwyJKrYXZPvlwabaryTjLh9LuvovB/KAH+kvi8Gjj+7rJjsFi23nkUxRQv1KqSroMqA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "@types/unist": "^3.0.0",
+        "longest-streak": "^3.0.0",
+        "mdast-util-phrasing": "^4.0.0",
+        "mdast-util-to-string": "^4.0.0",
+        "micromark-util-classify-character": "^2.0.0",
+        "micromark-util-decode-string": "^2.0.0",
+        "unist-util-visit": "^5.0.0",
+        "zwitch": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/mdast-util-to-string": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-4.0.0.tgz",
+      "integrity": "sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/merge2": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz",
+      "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/micromark": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.1.tgz",
+      "integrity": "sha512-eBPdkcoCNvYcxQOAKAlceo5SNdzZWfF+FcSupREAzdAh9rRmE239CEQAiTwIgblwnoM8zzj35sZ5ZwvSEOF6Kw==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "@types/debug": "^4.0.0",
+        "debug": "^4.0.0",
+        "decode-named-character-reference": "^1.0.0",
+        "devlop": "^1.0.0",
+        "micromark-core-commonmark": "^2.0.0",
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-chunked": "^2.0.0",
+        "micromark-util-combine-extensions": "^2.0.0",
+        "micromark-util-decode-numeric-character-reference": "^2.0.0",
+        "micromark-util-encode": "^2.0.0",
+        "micromark-util-normalize-identifier": "^2.0.0",
+        "micromark-util-resolve-all": "^2.0.0",
+        "micromark-util-sanitize-uri": "^2.0.0",
+        "micromark-util-subtokenize": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-core-commonmark": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/micromark-core-commonmark/-/micromark-core-commonmark-2.0.2.tgz",
+      "integrity": "sha512-FKjQKbxd1cibWMM1P9N+H8TwlgGgSkWZMmfuVucLCHaYqeSvJ0hFeHsIa65pA2nYbes0f8LDHPMrd9X7Ujxg9w==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "decode-named-character-reference": "^1.0.0",
+        "devlop": "^1.0.0",
+        "micromark-factory-destination": "^2.0.0",
+        "micromark-factory-label": "^2.0.0",
+        "micromark-factory-space": "^2.0.0",
+        "micromark-factory-title": "^2.0.0",
+        "micromark-factory-whitespace": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-chunked": "^2.0.0",
+        "micromark-util-classify-character": "^2.0.0",
+        "micromark-util-html-tag-name": "^2.0.0",
+        "micromark-util-normalize-identifier": "^2.0.0",
+        "micromark-util-resolve-all": "^2.0.0",
+        "micromark-util-subtokenize": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-extension-gfm": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz",
+      "integrity": "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==",
+      "license": "MIT",
+      "dependencies": {
+        "micromark-extension-gfm-autolink-literal": "^2.0.0",
+        "micromark-extension-gfm-footnote": "^2.0.0",
+        "micromark-extension-gfm-strikethrough": "^2.0.0",
+        "micromark-extension-gfm-table": "^2.0.0",
+        "micromark-extension-gfm-tagfilter": "^2.0.0",
+        "micromark-extension-gfm-task-list-item": "^2.0.0",
+        "micromark-util-combine-extensions": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-gfm-autolink-literal": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz",
+      "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==",
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-sanitize-uri": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-gfm-footnote": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz",
+      "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==",
+      "license": "MIT",
+      "dependencies": {
+        "devlop": "^1.0.0",
+        "micromark-core-commonmark": "^2.0.0",
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-normalize-identifier": "^2.0.0",
+        "micromark-util-sanitize-uri": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-gfm-strikethrough": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz",
+      "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==",
+      "license": "MIT",
+      "dependencies": {
+        "devlop": "^1.0.0",
+        "micromark-util-chunked": "^2.0.0",
+        "micromark-util-classify-character": "^2.0.0",
+        "micromark-util-resolve-all": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-gfm-table": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz",
+      "integrity": "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==",
+      "license": "MIT",
+      "dependencies": {
+        "devlop": "^1.0.0",
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-gfm-tagfilter": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz",
+      "integrity": "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==",
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-gfm-task-list-item": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz",
+      "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==",
+      "license": "MIT",
+      "dependencies": {
+        "devlop": "^1.0.0",
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-extension-math": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/micromark-extension-math/-/micromark-extension-math-3.1.0.tgz",
+      "integrity": "sha512-lvEqd+fHjATVs+2v/8kg9i5Q0AP2k85H0WUOwpIVvUML8BapsMvh1XAogmQjOCsLpoKRCVQqEkQBB3NhVBcsOg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/katex": "^0.16.0",
+        "devlop": "^1.0.0",
+        "katex": "^0.16.0",
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/micromark-factory-destination": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz",
+      "integrity": "sha512-Xe6rDdJlkmbFRExpTOmRj9N3MaWmbAgdpSrBQvCFqhezUn4AHqJHbaEnfbVYYiexVSs//tqOdY/DxhjdCiJnIA==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-factory-label": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-factory-label/-/micromark-factory-label-2.0.1.tgz",
+      "integrity": "sha512-VFMekyQExqIW7xIChcXn4ok29YE3rnuyveW3wZQWWqF4Nv9Wk5rgJ99KzPvHjkmPXF93FXIbBp6YdW3t71/7Vg==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "devlop": "^1.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-factory-space": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-factory-space/-/micromark-factory-space-2.0.1.tgz",
+      "integrity": "sha512-zRkxjtBxxLd2Sc0d+fbnEunsTj46SWXgXciZmHq0kDYGnck/ZSGj9/wULTV95uoeYiK5hRXP2mJ98Uo4cq/LQg==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-factory-title": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-factory-title/-/micromark-factory-title-2.0.1.tgz",
+      "integrity": "sha512-5bZ+3CjhAd9eChYTHsjy6TGxpOFSKgKKJPJxr293jTbfry2KDoWkhBb6TcPVB4NmzaPhMs1Frm9AZH7OD4Cjzw==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-factory-whitespace": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-factory-whitespace/-/micromark-factory-whitespace-2.0.1.tgz",
+      "integrity": "sha512-Ob0nuZ3PKt/n0hORHyvoD9uZhr+Za8sFoP+OnMcnWK5lngSzALgQYKMr9RJVOWLqQYuyn6ulqGWSXdwf6F80lQ==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-factory-space": "^2.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-character": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-character/-/micromark-util-character-2.1.1.tgz",
+      "integrity": "sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-chunked": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-chunked/-/micromark-util-chunked-2.0.1.tgz",
+      "integrity": "sha512-QUNFEOPELfmvv+4xiNg2sRYeS/P84pTW0TCgP5zc9FpXetHY0ab7SxKyAQCNCc1eK0459uoLI1y5oO5Vc1dbhA==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-symbol": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-classify-character": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-classify-character/-/micromark-util-classify-character-2.0.1.tgz",
+      "integrity": "sha512-K0kHzM6afW/MbeWYWLjoHQv1sgg2Q9EccHEDzSkxiP/EaagNzCm7T/WMKZ3rjMbvIpvBiZgwR3dKMygtA4mG1Q==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-combine-extensions": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-combine-extensions/-/micromark-util-combine-extensions-2.0.1.tgz",
+      "integrity": "sha512-OnAnH8Ujmy59JcyZw8JSbK9cGpdVY44NKgSM7E9Eh7DiLS2E9RNQf0dONaGDzEG9yjEl5hcqeIsj4hfRkLH/Bg==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-chunked": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-decode-numeric-character-reference": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/micromark-util-decode-numeric-character-reference/-/micromark-util-decode-numeric-character-reference-2.0.2.tgz",
+      "integrity": "sha512-ccUbYk6CwVdkmCQMyr64dXz42EfHGkPQlBj5p7YVGzq8I7CtjXZJrubAYezf7Rp+bjPseiROqe7G6foFd+lEuw==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-symbol": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-decode-string": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-decode-string/-/micromark-util-decode-string-2.0.1.tgz",
+      "integrity": "sha512-nDV/77Fj6eH1ynwscYTOsbK7rR//Uj0bZXBwJZRfaLEJ1iGBR6kIfNmlNqaqJf649EP0F3NWNdeJi03elllNUQ==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "decode-named-character-reference": "^1.0.0",
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-decode-numeric-character-reference": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-encode": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-encode/-/micromark-util-encode-2.0.1.tgz",
+      "integrity": "sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/micromark-util-html-tag-name": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-html-tag-name/-/micromark-util-html-tag-name-2.0.1.tgz",
+      "integrity": "sha512-2cNEiYDhCWKI+Gs9T0Tiysk136SnR13hhO8yW6BGNyhOC4qYFnwF1nKfD3HFAIXA5c45RrIG1ub11GiXeYd1xA==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/micromark-util-normalize-identifier": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-normalize-identifier/-/micromark-util-normalize-identifier-2.0.1.tgz",
+      "integrity": "sha512-sxPqmo70LyARJs0w2UclACPUUEqltCkJ6PhKdMIDuJ3gSf/Q+/GIe3WKl0Ijb/GyH9lOpUkRAO2wp0GVkLvS9Q==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-symbol": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-resolve-all": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-resolve-all/-/micromark-util-resolve-all-2.0.1.tgz",
+      "integrity": "sha512-VdQyxFWFT2/FGJgwQnJYbe1jjQoNTS4RjglmSjTUlpUMa95Htx9NHeYW4rGDJzbjvCsl9eLjMQwGeElsqmzcHg==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-sanitize-uri": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-sanitize-uri/-/micromark-util-sanitize-uri-2.0.1.tgz",
+      "integrity": "sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "micromark-util-character": "^2.0.0",
+        "micromark-util-encode": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-subtokenize": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/micromark-util-subtokenize/-/micromark-util-subtokenize-2.0.4.tgz",
+      "integrity": "sha512-N6hXjrin2GTJDe3MVjf5FuXpm12PGm80BrUAeub9XFXca8JZbP+oIwY4LJSVwFUCL1IPm/WwSVUN7goFHmSGGQ==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "devlop": "^1.0.0",
+        "micromark-util-chunked": "^2.0.0",
+        "micromark-util-symbol": "^2.0.0",
+        "micromark-util-types": "^2.0.0"
+      }
+    },
+    "node_modules/micromark-util-symbol": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-symbol/-/micromark-util-symbol-2.0.1.tgz",
+      "integrity": "sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/micromark-util-types": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/micromark-util-types/-/micromark-util-types-2.0.1.tgz",
+      "integrity": "sha512-534m2WhVTddrcKVepwmVEVnUAmtrx9bfIjNoQHRqfnvdaHQiFytEhJoTgpWJvDEXCO5gLTQh3wYC1PgOJA4NSQ==",
+      "funding": [
+        {
+          "type": "GitHub Sponsors",
+          "url": "https://github.com/sponsors/unifiedjs"
+        },
+        {
+          "type": "OpenCollective",
+          "url": "https://opencollective.com/unified"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/micromatch": {
+      "version": "4.0.8",
+      "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz",
+      "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==",
+      "license": "MIT",
+      "dependencies": {
+        "braces": "^3.0.3",
+        "picomatch": "^2.3.1"
+      },
+      "engines": {
+        "node": ">=8.6"
+      }
+    },
+    "node_modules/minimatch": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
+      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "brace-expansion": "^1.1.7"
+      },
+      "engines": {
+        "node": "*"
+      }
+    },
+    "node_modules/minipass": {
+      "version": "7.1.2",
+      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz",
+      "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==",
+      "license": "ISC",
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      }
+    },
+    "node_modules/ms": {
+      "version": "2.1.3",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
+      "license": "MIT"
+    },
+    "node_modules/mz": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz",
+      "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==",
+      "license": "MIT",
+      "dependencies": {
+        "any-promise": "^1.0.0",
+        "object-assign": "^4.0.1",
+        "thenify-all": "^1.0.0"
+      }
+    },
+    "node_modules/nanoid": {
+      "version": "3.3.8",
+      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz",
+      "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "bin": {
+        "nanoid": "bin/nanoid.cjs"
+      },
+      "engines": {
+        "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
+      }
+    },
+    "node_modules/natural-compare": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
+      "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/node-releases": {
+      "version": "2.0.19",
+      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.19.tgz",
+      "integrity": "sha512-xxOWJsBKtzAq7DY0J+DTzuz58K8e7sJbdgwkbMWQe8UYB6ekmsQ45q0M/tJDsGaZmbC+l7n57UV8Hl5tHxO9uw==",
+      "license": "MIT"
+    },
+    "node_modules/normalize-path": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz",
+      "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/normalize-range": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz",
+      "integrity": "sha512-bdok/XvKII3nUpklnV6P2hxtMNrCboOjAcyBuQnWEhO665FwrSNRxU+AqpsyvO6LgGYPspN+lu5CLtw4jPRKNA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/object-assign": {
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
+      "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/object-hash": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz",
+      "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/optionator": {
+      "version": "0.9.4",
+      "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
+      "integrity": "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "deep-is": "^0.1.3",
+        "fast-levenshtein": "^2.0.6",
+        "levn": "^0.4.1",
+        "prelude-ls": "^1.2.1",
+        "type-check": "^0.4.0",
+        "word-wrap": "^1.2.5"
+      },
+      "engines": {
+        "node": ">= 0.8.0"
+      }
+    },
+    "node_modules/p-limit": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
+      "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "yocto-queue": "^0.1.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/p-locate": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz",
+      "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "p-limit": "^3.0.2"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/package-json-from-dist": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz",
+      "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==",
+      "license": "BlueOak-1.0.0"
+    },
+    "node_modules/parent-module": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
+      "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "callsites": "^3.0.0"
+      },
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/parse-entities": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/parse-entities/-/parse-entities-4.0.2.tgz",
+      "integrity": "sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^2.0.0",
+        "character-entities-legacy": "^3.0.0",
+        "character-reference-invalid": "^2.0.0",
+        "decode-named-character-reference": "^1.0.0",
+        "is-alphanumerical": "^2.0.0",
+        "is-decimal": "^2.0.0",
+        "is-hexadecimal": "^2.0.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/parse-entities/node_modules/@types/unist": {
+      "version": "2.0.11",
+      "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz",
+      "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==",
+      "license": "MIT"
+    },
+    "node_modules/parse5": {
+      "version": "7.2.1",
+      "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz",
+      "integrity": "sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==",
+      "license": "MIT",
+      "dependencies": {
+        "entities": "^4.5.0"
+      },
+      "funding": {
+        "url": "https://github.com/inikulin/parse5?sponsor=1"
+      }
+    },
+    "node_modules/path-exists": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz",
+      "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/path-key": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
+      "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/path-parse": {
+      "version": "1.0.7",
+      "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz",
+      "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==",
+      "license": "MIT"
+    },
+    "node_modules/path-scurry": {
+      "version": "1.11.1",
+      "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz",
+      "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==",
+      "license": "BlueOak-1.0.0",
+      "dependencies": {
+        "lru-cache": "^10.2.0",
+        "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/path-scurry/node_modules/lru-cache": {
+      "version": "10.4.3",
+      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz",
+      "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==",
+      "license": "ISC"
+    },
+    "node_modules/picocolors": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
+      "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
+      "license": "ISC"
+    },
+    "node_modules/picomatch": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
+      "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/jonschlinkert"
+      }
+    },
+    "node_modules/pify": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz",
+      "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/pirates": {
+      "version": "4.0.6",
+      "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz",
+      "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/postcss": {
+      "version": "8.5.1",
+      "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.1.tgz",
+      "integrity": "sha512-6oz2beyjc5VMn/KV1pPw8fliQkhBXrVn1Z3TVyqZxU8kZpzEKhBdmCFqI6ZbmGtamQvQGuU1sgPTk8ZrXDD7jQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/postcss"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "nanoid": "^3.3.8",
+        "picocolors": "^1.1.1",
+        "source-map-js": "^1.2.1"
+      },
+      "engines": {
+        "node": "^10 || ^12 || >=14"
+      }
+    },
+    "node_modules/postcss-import": {
+      "version": "15.1.0",
+      "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz",
+      "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==",
+      "license": "MIT",
+      "dependencies": {
+        "postcss-value-parser": "^4.0.0",
+        "read-cache": "^1.0.0",
+        "resolve": "^1.1.7"
+      },
+      "engines": {
+        "node": ">=14.0.0"
+      },
+      "peerDependencies": {
+        "postcss": "^8.0.0"
+      }
+    },
+    "node_modules/postcss-js": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.0.1.tgz",
+      "integrity": "sha512-dDLF8pEO191hJMtlHFPRa8xsizHaM82MLfNkUHdUtVEV3tgTp5oj+8qbEqYM57SLfc74KSbw//4SeJma2LRVIw==",
+      "license": "MIT",
+      "dependencies": {
+        "camelcase-css": "^2.0.1"
+      },
+      "engines": {
+        "node": "^12 || ^14 || >= 16"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/postcss/"
+      },
+      "peerDependencies": {
+        "postcss": "^8.4.21"
+      }
+    },
+    "node_modules/postcss-load-config": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-4.0.2.tgz",
+      "integrity": "sha512-bSVhyJGL00wMVoPUzAVAnbEoWyqRxkjv64tUl427SKnPrENtq6hJwUojroMz2VB+Q1edmi4IfrAPpami5VVgMQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "lilconfig": "^3.0.0",
+        "yaml": "^2.3.4"
+      },
+      "engines": {
+        "node": ">= 14"
+      },
+      "peerDependencies": {
+        "postcss": ">=8.0.9",
+        "ts-node": ">=9.0.0"
+      },
+      "peerDependenciesMeta": {
+        "postcss": {
+          "optional": true
+        },
+        "ts-node": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/postcss-nested": {
+      "version": "6.2.0",
+      "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz",
+      "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "postcss-selector-parser": "^6.1.1"
+      },
+      "engines": {
+        "node": ">=12.0"
+      },
+      "peerDependencies": {
+        "postcss": "^8.2.14"
+      }
+    },
+    "node_modules/postcss-selector-parser": {
+      "version": "6.1.2",
+      "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz",
+      "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==",
+      "license": "MIT",
+      "dependencies": {
+        "cssesc": "^3.0.0",
+        "util-deprecate": "^1.0.2"
+      },
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/postcss-value-parser": {
+      "version": "4.2.0",
+      "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz",
+      "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==",
+      "license": "MIT"
+    },
+    "node_modules/prelude-ls": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz",
+      "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8.0"
+      }
+    },
+    "node_modules/prettier": {
+      "version": "3.4.2",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.4.2.tgz",
+      "integrity": "sha512-e9MewbtFo+Fevyuxn/4rrcDAaq0IYxPGLvObpQjiZBMAzB9IGmzlnG9RZy3FFas+eBMu2vA0CszMeduow5dIuQ==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "prettier": "bin/prettier.cjs"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/prettier/prettier?sponsor=1"
+      }
+    },
+    "node_modules/property-information": {
+      "version": "6.5.0",
+      "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz",
+      "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/punycode": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
+      "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/queue-microtask": {
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
+      "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/react": {
+      "version": "18.3.1",
+      "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz",
+      "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==",
+      "license": "MIT",
+      "dependencies": {
+        "loose-envify": "^1.1.0"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/react-dom": {
+      "version": "18.3.1",
+      "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz",
+      "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==",
+      "license": "MIT",
+      "dependencies": {
+        "loose-envify": "^1.1.0",
+        "scheduler": "^0.23.2"
+      },
+      "peerDependencies": {
+        "react": "^18.3.1"
+      }
+    },
+    "node_modules/react-markdown": {
+      "version": "9.0.3",
+      "resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz",
+      "integrity": "sha512-Yk7Z94dbgYTOrdk41Z74GoKA7rThnsbbqBTRYuxoe08qvfQ9tJVhmAKw6BJS/ZORG7kTy/s1QvYzSuaoBA1qfw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "devlop": "^1.0.0",
+        "hast-util-to-jsx-runtime": "^2.0.0",
+        "html-url-attributes": "^3.0.0",
+        "mdast-util-to-hast": "^13.0.0",
+        "remark-parse": "^11.0.0",
+        "remark-rehype": "^11.0.0",
+        "unified": "^11.0.0",
+        "unist-util-visit": "^5.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      },
+      "peerDependencies": {
+        "@types/react": ">=18",
+        "react": ">=18"
+      }
+    },
+    "node_modules/react-refresh": {
+      "version": "0.14.2",
+      "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.14.2.tgz",
+      "integrity": "sha512-jCvmsr+1IUSMUyzOkRcvnVbX3ZYC6g9TDrDbFuFmRDq7PD4yaGbLKNQL6k2jnArV8hjYxh7hVhAZB6s9HDGpZA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/react-router": {
+      "version": "7.1.5",
+      "resolved": "https://registry.npmjs.org/react-router/-/react-router-7.1.5.tgz",
+      "integrity": "sha512-8BUF+hZEU4/z/JD201yK6S+UYhsf58bzYIDq2NS1iGpwxSXDu7F+DeGSkIXMFBuHZB21FSiCzEcUb18cQNdRkA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/cookie": "^0.6.0",
+        "cookie": "^1.0.1",
+        "set-cookie-parser": "^2.6.0",
+        "turbo-stream": "2.4.0"
+      },
+      "engines": {
+        "node": ">=20.0.0"
+      },
+      "peerDependencies": {
+        "react": ">=18",
+        "react-dom": ">=18"
+      },
+      "peerDependenciesMeta": {
+        "react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/read-cache": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz",
+      "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==",
+      "license": "MIT",
+      "dependencies": {
+        "pify": "^2.3.0"
+      }
+    },
+    "node_modules/readdirp": {
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz",
+      "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==",
+      "license": "MIT",
+      "dependencies": {
+        "picomatch": "^2.2.1"
+      },
+      "engines": {
+        "node": ">=8.10.0"
+      }
+    },
+    "node_modules/rehype-highlight": {
+      "version": "7.0.2",
+      "resolved": "https://registry.npmjs.org/rehype-highlight/-/rehype-highlight-7.0.2.tgz",
+      "integrity": "sha512-k158pK7wdC2qL3M5NcZROZ2tR/l7zOzjxXd5VGdcfIyoijjQqpHd3JKtYSBDpDZ38UI2WJWuFAtkMDxmx5kstA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "hast-util-to-text": "^4.0.0",
+        "lowlight": "^3.0.0",
+        "unist-util-visit": "^5.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/rehype-katex": {
+      "version": "7.0.1",
+      "resolved": "https://registry.npmjs.org/rehype-katex/-/rehype-katex-7.0.1.tgz",
+      "integrity": "sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "@types/katex": "^0.16.0",
+        "hast-util-from-html-isomorphic": "^2.0.0",
+        "hast-util-to-text": "^4.0.0",
+        "katex": "^0.16.0",
+        "unist-util-visit-parents": "^6.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/remark-breaks": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/remark-breaks/-/remark-breaks-4.0.0.tgz",
+      "integrity": "sha512-IjEjJOkH4FuJvHZVIW0QCDWxcG96kCq7An/KVH2NfJe6rKZU2AsHeB3OEjPNRxi4QC34Xdx7I2KGYn6IpT7gxQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-newline-to-break": "^2.0.0",
+        "unified": "^11.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/remark-gfm": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.0.tgz",
+      "integrity": "sha512-U92vJgBPkbw4Zfu/IiW2oTZLSL3Zpv+uI7My2eq8JxKgqraFdU8YUGicEJCEgSbeaG+QDFqIcwwfMTOEelPxuA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-gfm": "^3.0.0",
+        "micromark-extension-gfm": "^3.0.0",
+        "remark-parse": "^11.0.0",
+        "remark-stringify": "^11.0.0",
+        "unified": "^11.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/remark-math": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/remark-math/-/remark-math-6.0.0.tgz",
+      "integrity": "sha512-MMqgnP74Igy+S3WwnhQ7kqGlEerTETXMvJhrUzDikVZ2/uogJCb+WHUg97hK9/jcfc0dkD73s3LN8zU49cTEtA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-math": "^3.0.0",
+        "micromark-extension-math": "^3.0.0",
+        "unified": "^11.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/remark-parse": {
+      "version": "11.0.0",
+      "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz",
+      "integrity": "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-from-markdown": "^2.0.0",
+        "micromark-util-types": "^2.0.0",
+        "unified": "^11.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/remark-rehype": {
+      "version": "11.1.1",
+      "resolved": "https://registry.npmjs.org/remark-rehype/-/remark-rehype-11.1.1.tgz",
+      "integrity": "sha512-g/osARvjkBXb6Wo0XvAeXQohVta8i84ACbenPpoSsxTOQH/Ae0/RGP4WZgnMH5pMLpsj4FG7OHmcIcXxpza8eQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/hast": "^3.0.0",
+        "@types/mdast": "^4.0.0",
+        "mdast-util-to-hast": "^13.0.0",
+        "unified": "^11.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/remark-stringify": {
+      "version": "11.0.0",
+      "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz",
+      "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/mdast": "^4.0.0",
+        "mdast-util-to-markdown": "^2.0.0",
+        "unified": "^11.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/resolve": {
+      "version": "1.22.10",
+      "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.10.tgz",
+      "integrity": "sha512-NPRy+/ncIMeDlTAsuqwKIiferiawhefFJtkNSW0qZJEqMEb+qBt/77B/jGeeek+F0uOeN05CDa6HXbbIgtVX4w==",
+      "license": "MIT",
+      "dependencies": {
+        "is-core-module": "^2.16.0",
+        "path-parse": "^1.0.7",
+        "supports-preserve-symlinks-flag": "^1.0.0"
+      },
+      "bin": {
+        "resolve": "bin/resolve"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/resolve-from": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz",
+      "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/reusify": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz",
+      "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==",
+      "license": "MIT",
+      "engines": {
+        "iojs": ">=1.0.0",
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/rollup": {
+      "version": "4.34.2",
+      "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.34.2.tgz",
+      "integrity": "sha512-sBDUoxZEaqLu9QeNalL8v3jw6WjPku4wfZGyTU7l7m1oC+rpRihXc/n/H+4148ZkGz5Xli8CHMns//fFGKvpIQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/estree": "1.0.6"
+      },
+      "bin": {
+        "rollup": "dist/bin/rollup"
+      },
+      "engines": {
+        "node": ">=18.0.0",
+        "npm": ">=8.0.0"
+      },
+      "optionalDependencies": {
+        "@rollup/rollup-android-arm-eabi": "4.34.2",
+        "@rollup/rollup-android-arm64": "4.34.2",
+        "@rollup/rollup-darwin-arm64": "4.34.2",
+        "@rollup/rollup-darwin-x64": "4.34.2",
+        "@rollup/rollup-freebsd-arm64": "4.34.2",
+        "@rollup/rollup-freebsd-x64": "4.34.2",
+        "@rollup/rollup-linux-arm-gnueabihf": "4.34.2",
+        "@rollup/rollup-linux-arm-musleabihf": "4.34.2",
+        "@rollup/rollup-linux-arm64-gnu": "4.34.2",
+        "@rollup/rollup-linux-arm64-musl": "4.34.2",
+        "@rollup/rollup-linux-loongarch64-gnu": "4.34.2",
+        "@rollup/rollup-linux-powerpc64le-gnu": "4.34.2",
+        "@rollup/rollup-linux-riscv64-gnu": "4.34.2",
+        "@rollup/rollup-linux-s390x-gnu": "4.34.2",
+        "@rollup/rollup-linux-x64-gnu": "4.34.2",
+        "@rollup/rollup-linux-x64-musl": "4.34.2",
+        "@rollup/rollup-win32-arm64-msvc": "4.34.2",
+        "@rollup/rollup-win32-ia32-msvc": "4.34.2",
+        "@rollup/rollup-win32-x64-msvc": "4.34.2",
+        "fsevents": "~2.3.2"
+      }
+    },
+    "node_modules/run-parallel": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",
+      "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "queue-microtask": "^1.2.2"
+      }
+    },
+    "node_modules/rxjs": {
+      "version": "7.8.1",
+      "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz",
+      "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==",
+      "devOptional": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "tslib": "^2.1.0"
+      }
+    },
+    "node_modules/sass-embedded": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.4.tgz",
+      "integrity": "sha512-Hf2burRA/y5PGxsg6jB9UpoK/xZ6g/pgrkOcdl6j+rRg1Zj8XhGKZ1MTysZGtTPUUmiiErqzkP5+Kzp95yv9GQ==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "@bufbuild/protobuf": "^2.0.0",
+        "buffer-builder": "^0.2.0",
+        "colorjs.io": "^0.5.0",
+        "immutable": "^5.0.2",
+        "rxjs": "^7.4.0",
+        "supports-color": "^8.1.1",
+        "sync-child-process": "^1.0.2",
+        "varint": "^6.0.0"
+      },
+      "bin": {
+        "sass": "dist/bin/sass.js"
+      },
+      "engines": {
+        "node": ">=16.0.0"
+      },
+      "optionalDependencies": {
+        "sass-embedded-android-arm": "1.83.4",
+        "sass-embedded-android-arm64": "1.83.4",
+        "sass-embedded-android-ia32": "1.83.4",
+        "sass-embedded-android-riscv64": "1.83.4",
+        "sass-embedded-android-x64": "1.83.4",
+        "sass-embedded-darwin-arm64": "1.83.4",
+        "sass-embedded-darwin-x64": "1.83.4",
+        "sass-embedded-linux-arm": "1.83.4",
+        "sass-embedded-linux-arm64": "1.83.4",
+        "sass-embedded-linux-ia32": "1.83.4",
+        "sass-embedded-linux-musl-arm": "1.83.4",
+        "sass-embedded-linux-musl-arm64": "1.83.4",
+        "sass-embedded-linux-musl-ia32": "1.83.4",
+        "sass-embedded-linux-musl-riscv64": "1.83.4",
+        "sass-embedded-linux-musl-x64": "1.83.4",
+        "sass-embedded-linux-riscv64": "1.83.4",
+        "sass-embedded-linux-x64": "1.83.4",
+        "sass-embedded-win32-arm64": "1.83.4",
+        "sass-embedded-win32-ia32": "1.83.4",
+        "sass-embedded-win32-x64": "1.83.4"
+      }
+    },
+    "node_modules/sass-embedded-android-arm": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.4.tgz",
+      "integrity": "sha512-9Z4pJAOgEkXa3VDY/o+U6l5XvV0mZTJcSl0l/mSPHihjAHSpLYnOW6+KOWeM8dxqrsqTYcd6COzhanI/a++5Gw==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-arm64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.4.tgz",
+      "integrity": "sha512-tgX4FzmbVqnQmD67ZxQDvI+qFNABrboOQgwsG05E5bA/US42zGajW9AxpECJYiMXVOHmg+d81ICbjb0fsVHskw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-ia32": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.4.tgz",
+      "integrity": "sha512-RsFOziFqPcfZXdFRULC4Ayzy9aK6R6FwQ411broCjlOBX+b0gurjRadkue3cfUEUR5mmy0KeCbp7zVKPLTK+5Q==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-riscv64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.4.tgz",
+      "integrity": "sha512-EHwh0nmQarBBrMRU928eTZkFGx19k/XW2YwbPR4gBVdWLkbTgCA5aGe8hTE6/1zStyx++3nDGvTZ78+b/VvvLg==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-android-x64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.4.tgz",
+      "integrity": "sha512-0PgQNuPWYy1jEOEPDVsV89KfqOsMLIp9CSbjBY7jRcwRhyVAcigqrUG6bDeNtojHUYKA1kU+Eh/85WxOHUOgBw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-darwin-arm64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.4.tgz",
+      "integrity": "sha512-rp2ywymWc3nymnSnAFG5R/8hvxWCsuhK3wOnD10IDlmNB7o4rzKby1c+2ZfpQGowlYGWsWWTgz8FW2qzmZsQRw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-darwin-x64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.4.tgz",
+      "integrity": "sha512-kLkN2lXz9PCgGfDS8Ev5YVcl/V2173L6379en/CaFuJJi7WiyPgBymW7hOmfCt4uO4R1y7CP2Uc08DRtZsBlAA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-arm": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.4.tgz",
+      "integrity": "sha512-nL90ryxX2lNmFucr9jYUyHHx21AoAgdCL1O5Ltx2rKg2xTdytAGHYo2MT5S0LIeKLa/yKP/hjuSvrbICYNDvtA==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-arm64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.4.tgz",
+      "integrity": "sha512-E0zjsZX2HgESwyqw31EHtI39DKa7RgK7nvIhIRco1d0QEw227WnoR9pjH3M/ZQy4gQj3GKilOFHM5Krs/omeIA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-ia32": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.4.tgz",
+      "integrity": "sha512-ew5HpchSzgAYbQoriRh8QhlWn5Kw2nQ2jHoV9YLwGKe3fwwOWA0KDedssvDv7FWnY/FCqXyymhLd6Bxae4Xquw==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-arm": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.4.tgz",
+      "integrity": "sha512-0RrJRwMrmm+gG0VOB5b5Cjs7Sd+lhqpQJa6EJNEaZHljJokEfpE5GejZsGMRMIQLxEvVphZnnxl6sonCGFE/QQ==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-arm64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.4.tgz",
+      "integrity": "sha512-IzMgalf6MZOxgp4AVCgsaWAFDP/IVWOrgVXxkyhw29fyAEoSWBJH4k87wyPhEtxSuzVHLxKNbc8k3UzdWmlBFg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-ia32": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.4.tgz",
+      "integrity": "sha512-LLb4lYbcxPzX4UaJymYXC+WwokxUlfTJEFUv5VF0OTuSsHAGNRs/rslPtzVBTvMeG9TtlOQDhku1F7G6iaDotA==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-riscv64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.4.tgz",
+      "integrity": "sha512-zoKlPzD5Z13HKin1UGR74QkEy+kZEk2AkGX5RelRG494mi+IWwRuWCppXIovor9+BQb9eDWPYPoMVahwN5F7VA==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-musl-x64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.4.tgz",
+      "integrity": "sha512-hB8+/PYhfEf2zTIcidO5Bpof9trK6WJjZ4T8g2MrxQh8REVtdPcgIkoxczRynqybf9+fbqbUwzXtiUao2GV+vQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-riscv64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.4.tgz",
+      "integrity": "sha512-83fL4n+oeDJ0Y4KjASmZ9jHS1Vl9ESVQYHMhJE0i4xDi/P3BNarm2rsKljq/QtrwGpbqwn8ujzOu7DsNCMDSHA==",
+      "cpu": [
+        "riscv64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-linux-x64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.4.tgz",
+      "integrity": "sha512-NlnGdvCmTD5PK+LKXlK3sAuxOgbRIEoZfnHvxd157imCm/s2SYF/R28D0DAAjEViyI8DovIWghgbcqwuertXsA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-win32-arm64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.4.tgz",
+      "integrity": "sha512-J2BFKrEaeSrVazU2qTjyQdAk+MvbzJeTuCET0uAJEXSKtvQ3AzxvzndS7LqkDPbF32eXAHLw8GVpwcBwKbB3Uw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-win32-ia32": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.4.tgz",
+      "integrity": "sha512-uPAe9T/5sANFhJS5dcfAOhOJy8/l2TRYG4r+UO3Wp4yhqbN7bggPvY9c7zMYS0OC8tU/bCvfYUDFHYMCl91FgA==",
+      "cpu": [
+        "ia32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded-win32-x64": {
+      "version": "1.83.4",
+      "resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.4.tgz",
+      "integrity": "sha512-C9fkDY0jKITdJFij4UbfPFswxoXN9O/Dr79v17fJnstVwtUojzVJWKHUXvF0Zg2LIR7TCc4ju3adejKFxj7ueA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/sass-embedded/node_modules/supports-color": {
+      "version": "8.1.1",
+      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
+      "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "has-flag": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/supports-color?sponsor=1"
+      }
+    },
+    "node_modules/scheduler": {
+      "version": "0.23.2",
+      "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz",
+      "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==",
+      "license": "MIT",
+      "dependencies": {
+        "loose-envify": "^1.1.0"
+      }
+    },
+    "node_modules/semver": {
+      "version": "6.3.1",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+      "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver.js"
+      }
+    },
+    "node_modules/set-cookie-parser": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz",
+      "integrity": "sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ==",
+      "license": "MIT"
+    },
+    "node_modules/shebang-command": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
+      "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
+      "license": "MIT",
+      "dependencies": {
+        "shebang-regex": "^3.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/shebang-regex": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
+      "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/signal-exit": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
+      "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==",
+      "license": "ISC",
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/source-map-js": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
+      "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/space-separated-tokens": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz",
+      "integrity": "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/string-width": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz",
+      "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==",
+      "license": "MIT",
+      "dependencies": {
+        "eastasianwidth": "^0.2.0",
+        "emoji-regex": "^9.2.2",
+        "strip-ansi": "^7.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/string-width-cjs": {
+      "name": "string-width",
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "license": "MIT",
+      "dependencies": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/string-width-cjs/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/string-width-cjs/node_modules/emoji-regex": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
+      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+      "license": "MIT"
+    },
+    "node_modules/string-width-cjs/node_modules/strip-ansi": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/stringify-entities": {
+      "version": "4.0.4",
+      "resolved": "https://registry.npmjs.org/stringify-entities/-/stringify-entities-4.0.4.tgz",
+      "integrity": "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==",
+      "license": "MIT",
+      "dependencies": {
+        "character-entities-html4": "^2.0.0",
+        "character-entities-legacy": "^3.0.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/strip-ansi": {
+      "version": "7.1.0",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
+      "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/strip-ansi?sponsor=1"
+      }
+    },
+    "node_modules/strip-ansi-cjs": {
+      "name": "strip-ansi",
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/strip-ansi-cjs/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/strip-json-comments": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
+      "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/style-to-object": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/style-to-object/-/style-to-object-1.0.8.tgz",
+      "integrity": "sha512-xT47I/Eo0rwJmaXC4oilDGDWLohVhR6o/xAQcPQN8q6QBuZVL8qMYL85kLmST5cPjAorwvqIA4qXTRQoYHaL6g==",
+      "license": "MIT",
+      "dependencies": {
+        "inline-style-parser": "0.2.4"
+      }
+    },
+    "node_modules/sucrase": {
+      "version": "3.35.0",
+      "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz",
+      "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==",
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/gen-mapping": "^0.3.2",
+        "commander": "^4.0.0",
+        "glob": "^10.3.10",
+        "lines-and-columns": "^1.1.6",
+        "mz": "^2.7.0",
+        "pirates": "^4.0.1",
+        "ts-interface-checker": "^0.1.9"
+      },
+      "bin": {
+        "sucrase": "bin/sucrase",
+        "sucrase-node": "bin/sucrase-node"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      }
+    },
+    "node_modules/sucrase/node_modules/commander": {
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz",
+      "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/supports-color": {
+      "version": "7.2.0",
+      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+      "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "has-flag": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/supports-preserve-symlinks-flag": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz",
+      "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/sync-child-process": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz",
+      "integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==",
+      "devOptional": true,
+      "license": "MIT",
+      "dependencies": {
+        "sync-message-port": "^1.0.0"
+      },
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
+    "node_modules/sync-message-port": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz",
+      "integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==",
+      "devOptional": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
+    "node_modules/tailwindcss": {
+      "version": "3.4.17",
+      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.17.tgz",
+      "integrity": "sha512-w33E2aCvSDP0tW9RZuNXadXlkHXqFzSkQew/aIa2i/Sj8fThxwovwlXHSPXTbAHwEIhBFXAedUhP2tueAKP8Og==",
+      "license": "MIT",
+      "dependencies": {
+        "@alloc/quick-lru": "^5.2.0",
+        "arg": "^5.0.2",
+        "chokidar": "^3.6.0",
+        "didyoumean": "^1.2.2",
+        "dlv": "^1.1.3",
+        "fast-glob": "^3.3.2",
+        "glob-parent": "^6.0.2",
+        "is-glob": "^4.0.3",
+        "jiti": "^1.21.6",
+        "lilconfig": "^3.1.3",
+        "micromatch": "^4.0.8",
+        "normalize-path": "^3.0.0",
+        "object-hash": "^3.0.0",
+        "picocolors": "^1.1.1",
+        "postcss": "^8.4.47",
+        "postcss-import": "^15.1.0",
+        "postcss-js": "^4.0.1",
+        "postcss-load-config": "^4.0.2",
+        "postcss-nested": "^6.2.0",
+        "postcss-selector-parser": "^6.1.2",
+        "resolve": "^1.22.8",
+        "sucrase": "^3.35.0"
+      },
+      "bin": {
+        "tailwind": "lib/cli.js",
+        "tailwindcss": "lib/cli.js"
+      },
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
+    "node_modules/textlinestream": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
+      "integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
+      "license": "MIT"
+    },
+    "node_modules/thenify": {
+      "version": "3.3.1",
+      "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz",
+      "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==",
+      "license": "MIT",
+      "dependencies": {
+        "any-promise": "^1.0.0"
+      }
+    },
+    "node_modules/thenify-all": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz",
+      "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==",
+      "license": "MIT",
+      "dependencies": {
+        "thenify": ">= 3.1.0 < 4"
+      },
+      "engines": {
+        "node": ">=0.8"
+      }
+    },
+    "node_modules/to-regex-range": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
+      "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
+      "license": "MIT",
+      "dependencies": {
+        "is-number": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=8.0"
+      }
+    },
+    "node_modules/trim-lines": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz",
+      "integrity": "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/trough": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/trough/-/trough-2.2.0.tgz",
+      "integrity": "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/ts-api-utils": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.0.1.tgz",
+      "integrity": "sha512-dnlgjFSVetynI8nzgJ+qF62efpglpWRk8isUEWZGWlJYySCTD6aKvbUDu+zbPeDakk3bg5H4XpitHukgfL1m9w==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.12"
+      },
+      "peerDependencies": {
+        "typescript": ">=4.8.4"
+      }
+    },
+    "node_modules/ts-interface-checker": {
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz",
+      "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==",
+      "license": "Apache-2.0"
+    },
+    "node_modules/tslib": {
+      "version": "2.8.1",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+      "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+      "devOptional": true,
+      "license": "0BSD"
+    },
+    "node_modules/turbo-stream": {
+      "version": "2.4.0",
+      "resolved": "https://registry.npmjs.org/turbo-stream/-/turbo-stream-2.4.0.tgz",
+      "integrity": "sha512-FHncC10WpBd2eOmGwpmQsWLDoK4cqsA/UT/GqNoaKOQnT8uzhtCbg3EoUDMvqpOSAI0S26mr0rkjzbOO6S3v1g==",
+      "license": "ISC"
+    },
+    "node_modules/type-check": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz",
+      "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "prelude-ls": "^1.2.1"
+      },
+      "engines": {
+        "node": ">= 0.8.0"
+      }
+    },
+    "node_modules/typescript": {
+      "version": "5.6.3",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz",
+      "integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "bin": {
+        "tsc": "bin/tsc",
+        "tsserver": "bin/tsserver"
+      },
+      "engines": {
+        "node": ">=14.17"
+      }
+    },
+    "node_modules/typescript-eslint": {
+      "version": "8.23.0",
+      "resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.23.0.tgz",
+      "integrity": "sha512-/LBRo3HrXr5LxmrdYSOCvoAMm7p2jNizNfbIpCgvG4HMsnoprRUOce/+8VJ9BDYWW68rqIENE/haVLWPeFZBVQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@typescript-eslint/eslint-plugin": "8.23.0",
+        "@typescript-eslint/parser": "8.23.0",
+        "@typescript-eslint/utils": "8.23.0"
+      },
+      "engines": {
+        "node": "^18.18.0 || ^20.9.0 || >=21.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/typescript-eslint"
+      },
+      "peerDependencies": {
+        "eslint": "^8.57.0 || ^9.0.0",
+        "typescript": ">=4.8.4 <5.8.0"
+      }
+    },
+    "node_modules/undici-types": {
+      "version": "6.20.0",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.20.0.tgz",
+      "integrity": "sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==",
+      "devOptional": true,
+      "license": "MIT"
+    },
+    "node_modules/unified": {
+      "version": "11.0.5",
+      "resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz",
+      "integrity": "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "bail": "^2.0.0",
+        "devlop": "^1.0.0",
+        "extend": "^3.0.0",
+        "is-plain-obj": "^4.0.0",
+        "trough": "^2.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-find-after": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz",
+      "integrity": "sha512-amQa0Ep2m6hE2g72AugUItjbuM8X8cGQnFoHk0pGfrFeT9GZhzN5SW8nRsiGKK7Aif4CrACPENkA6P/Lw6fHGQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "unist-util-is": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-is": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.0.tgz",
+      "integrity": "sha512-2qCTHimwdxLfz+YzdGfkqNlH0tLi9xjTnHddPmJwtIG9MGsdbutfTc4P+haPD7l7Cjxf/WZj+we5qfVPvvxfYw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-position": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/unist-util-position/-/unist-util-position-5.0.0.tgz",
+      "integrity": "sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-remove-position": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/unist-util-remove-position/-/unist-util-remove-position-5.0.0.tgz",
+      "integrity": "sha512-Hp5Kh3wLxv0PHj9m2yZhhLt58KzPtEYKQQ4yxfYFEO7EvHwzyDYnduhHnY1mDxoqr7VUwVuHXk9RXKIiYS1N8Q==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "unist-util-visit": "^5.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-stringify-position": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz",
+      "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-visit": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-5.0.0.tgz",
+      "integrity": "sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "unist-util-is": "^6.0.0",
+        "unist-util-visit-parents": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/unist-util-visit-parents": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-6.0.1.tgz",
+      "integrity": "sha512-L/PqWzfTP9lzzEa6CKs0k2nARxTdZduw3zyh8d2NVBnsyvHjSX4TWse388YrrQKbvI8w20fGjGlhgT96WwKykw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "unist-util-is": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/update-browserslist-db": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.2.tgz",
+      "integrity": "sha512-PPypAm5qvlD7XMZC3BujecnaOxwhrtoFR+Dqkk5Aa/6DssiH0ibKoketaj9w8LP7Bont1rYeoV5plxD7RTEPRg==",
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/browserslist"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/browserslist"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "escalade": "^3.2.0",
+        "picocolors": "^1.1.1"
+      },
+      "bin": {
+        "update-browserslist-db": "cli.js"
+      },
+      "peerDependencies": {
+        "browserslist": ">= 4.21.0"
+      }
+    },
+    "node_modules/uri-js": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
+      "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "punycode": "^2.1.0"
+      }
+    },
+    "node_modules/util-deprecate": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
+      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
+      "license": "MIT"
+    },
+    "node_modules/varint": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz",
+      "integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==",
+      "devOptional": true,
+      "license": "MIT"
+    },
+    "node_modules/vfile": {
+      "version": "6.0.3",
+      "resolved": "https://registry.npmjs.org/vfile/-/vfile-6.0.3.tgz",
+      "integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "vfile-message": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/vfile-location": {
+      "version": "5.0.3",
+      "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-5.0.3.tgz",
+      "integrity": "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "vfile": "^6.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/vfile-message": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.2.tgz",
+      "integrity": "sha512-jRDZ1IMLttGj41KcZvlrYAaI3CfqpLpfpf+Mfig13viT6NKvRzWZ+lXz0Y5D60w6uJIBAOGq9mSHf0gktF0duw==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/unist": "^3.0.0",
+        "unist-util-stringify-position": "^4.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/unified"
+      }
+    },
+    "node_modules/vite": {
+      "version": "6.0.11",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.11.tgz",
+      "integrity": "sha512-4VL9mQPKoHy4+FE0NnRE/kbY51TOfaknxAjt3fJbGJxhIpBZiqVzlZDEesWWsuREXHwNdAoOFZ9MkPEVXczHwg==",
+      "license": "MIT",
+      "dependencies": {
+        "esbuild": "^0.24.2",
+        "postcss": "^8.4.49",
+        "rollup": "^4.23.0"
+      },
+      "bin": {
+        "vite": "bin/vite.js"
+      },
+      "engines": {
+        "node": "^18.0.0 || ^20.0.0 || >=22.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/vitejs/vite?sponsor=1"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.3"
+      },
+      "peerDependencies": {
+        "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0",
+        "jiti": ">=1.21.0",
+        "less": "*",
+        "lightningcss": "^1.21.0",
+        "sass": "*",
+        "sass-embedded": "*",
+        "stylus": "*",
+        "sugarss": "*",
+        "terser": "^5.16.0",
+        "tsx": "^4.8.1",
+        "yaml": "^2.4.2"
+      },
+      "peerDependenciesMeta": {
+        "@types/node": {
+          "optional": true
+        },
+        "jiti": {
+          "optional": true
+        },
+        "less": {
+          "optional": true
+        },
+        "lightningcss": {
+          "optional": true
+        },
+        "sass": {
+          "optional": true
+        },
+        "sass-embedded": {
+          "optional": true
+        },
+        "stylus": {
+          "optional": true
+        },
+        "sugarss": {
+          "optional": true
+        },
+        "terser": {
+          "optional": true
+        },
+        "tsx": {
+          "optional": true
+        },
+        "yaml": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/vite-plugin-singlefile": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/vite-plugin-singlefile/-/vite-plugin-singlefile-2.1.0.tgz",
+      "integrity": "sha512-7tJo+UgZABlKpY/nubth/wxJ4+pUGREPnEwNOknxwl2MM0zTvF14KTU4Ln1lc140gjLLV5mjDrvuoquU7OZqCg==",
+      "license": "MIT",
+      "dependencies": {
+        "micromatch": "^4.0.8"
+      },
+      "engines": {
+        "node": ">18.0.0"
+      },
+      "peerDependencies": {
+        "rollup": "^4.28.1",
+        "vite": "^5.4.11 || ^6.0.0"
+      }
+    },
+    "node_modules/web-namespaces": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz",
+      "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    },
+    "node_modules/which": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+      "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+      "license": "ISC",
+      "dependencies": {
+        "isexe": "^2.0.0"
+      },
+      "bin": {
+        "node-which": "bin/node-which"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/word-wrap": {
+      "version": "1.2.5",
+      "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz",
+      "integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/wrap-ansi": {
+      "version": "8.1.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz",
+      "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^6.1.0",
+        "string-width": "^5.0.1",
+        "strip-ansi": "^7.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
+    "node_modules/wrap-ansi-cjs": {
+      "name": "wrap-ansi",
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
+      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
+    "node_modules/wrap-ansi-cjs/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/wrap-ansi-cjs/node_modules/emoji-regex": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
+      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+      "license": "MIT"
+    },
+    "node_modules/wrap-ansi-cjs/node_modules/string-width": {
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "license": "MIT",
+      "dependencies": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/wrap-ansi-cjs/node_modules/strip-ansi": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/wrap-ansi/node_modules/ansi-styles": {
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
+      "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/yallist": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz",
+      "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==",
+      "dev": true,
+      "license": "ISC"
+    },
+    "node_modules/yaml": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.7.0.tgz",
+      "integrity": "sha512-+hSoy/QHluxmC9kCIJyL/uyFmLmc+e5CFR5Wa+bpIhIj85LVb9ZH2nVnqrHoSvKogwODv0ClqZkmiSSaIH5LTA==",
+      "license": "ISC",
+      "bin": {
+        "yaml": "bin.mjs"
+      },
+      "engines": {
+        "node": ">= 14"
+      }
+    },
+    "node_modules/yocto-queue": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",
+      "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/zwitch": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz",
+      "integrity": "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==",
+      "license": "MIT",
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/wooorm"
+      }
+    }
+  }
+}
diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json
new file mode 100644
index 000000000..3be2b14de
--- /dev/null
+++ b/examples/server/webui/package.json
@@ -0,0 +1,59 @@
+{
+  "name": "webui",
+  "private": true,
+  "version": "0.0.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc -b && vite build",
+    "format": "eslint . && prettier --write .",
+    "lint": "eslint .",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "@heroicons/react": "^2.2.0",
+    "@sec-ant/readable-stream": "^0.6.0",
+    "@vscode/markdown-it-katex": "^1.1.1",
+    "autoprefixer": "^10.4.20",
+    "daisyui": "^4.12.14",
+    "highlight.js": "^11.10.0",
+    "katex": "^0.16.15",
+    "postcss": "^8.4.49",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1",
+    "react-markdown": "^9.0.3",
+    "react-router": "^7.1.5",
+    "rehype-highlight": "^7.0.2",
+    "rehype-katex": "^7.0.1",
+    "remark-breaks": "^4.0.0",
+    "remark-gfm": "^4.0.0",
+    "remark-math": "^6.0.0",
+    "tailwindcss": "^3.4.15",
+    "textlinestream": "^1.1.1",
+    "vite-plugin-singlefile": "^2.0.3"
+  },
+  "devDependencies": {
+    "@eslint/js": "^9.17.0",
+    "@types/markdown-it": "^14.1.2",
+    "@types/node": "^22.13.1",
+    "@types/react": "^18.3.18",
+    "@types/react-dom": "^18.3.5",
+    "@vitejs/plugin-react": "^4.3.4",
+    "eslint": "^9.17.0",
+    "eslint-plugin-react-hooks": "^5.0.0",
+    "eslint-plugin-react-refresh": "^0.4.16",
+    "globals": "^15.14.0",
+    "prettier": "^3.4.2",
+    "sass-embedded": "^1.83.4",
+    "typescript": "~5.6.2",
+    "typescript-eslint": "^8.18.2",
+    "vite": "^6.0.5"
+  },
+  "prettier": {
+    "trailingComma": "es5",
+    "tabWidth": 2,
+    "semi": true,
+    "singleQuote": true,
+    "bracketSameLine": false
+  }
+}
diff --git a/examples/server/webui/postcss.config.js b/examples/server/webui/postcss.config.js
new file mode 100644
index 000000000..2e7af2b7f
--- /dev/null
+++ b/examples/server/webui/postcss.config.js
@@ -0,0 +1,6 @@
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}
diff --git a/examples/server/webui/public/demo-conversation.json b/examples/server/webui/public/demo-conversation.json
new file mode 100644
index 000000000..338b4aea5
--- /dev/null
+++ b/examples/server/webui/public/demo-conversation.json
@@ -0,0 +1,33 @@
+{
+  "demo": true,
+  "id": "conv-1734086746930",
+  "lastModified": 1734087548943,
+  "messages": [
+    {
+      "id": 1734086764521,
+      "role": "user",
+      "content": "this is a demo conversation, used in dev mode"
+    },
+    {
+      "id": 1734087548327,
+      "role": "assistant",
+      "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\n$2x + y = z$\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
+      "timings": {
+        "prompt_n": 1,
+        "prompt_ms": 28.923,
+        "predicted_n": 25,
+        "predicted_ms": 573.016
+      }
+    },
+    {
+      "id": 1734087548328,
+      "role": "user",
+      "content": "this is a demo conversation, used in dev mode"
+    },
+    {
+      "id": 1734087548329,
+      "role": "assistant",
+      "content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```"
+    }
+  ]
+}
diff --git a/examples/server/webui/src/App.tsx b/examples/server/webui/src/App.tsx
new file mode 100644
index 000000000..2ce734682
--- /dev/null
+++ b/examples/server/webui/src/App.tsx
@@ -0,0 +1,47 @@
+import { HashRouter, Outlet, Route, Routes } from 'react-router';
+import Header from './components/Header';
+import Sidebar from './components/Sidebar';
+import { AppContextProvider, useAppContext } from './utils/app.context';
+import ChatScreen from './components/ChatScreen';
+import SettingDialog from './components/SettingDialog';
+
+function App() {
+  return (
+    <HashRouter>
+      <div className="flex flex-row drawer lg:drawer-open">
+        <AppContextProvider>
+          <Routes>
+            <Route element={<AppLayout />}>
+              <Route path="/chat/:convId" element={<ChatScreen />} />
+              <Route path="*" element={<ChatScreen />} />
+            </Route>
+          </Routes>
+        </AppContextProvider>
+      </div>
+    </HashRouter>
+  );
+}
+
+function AppLayout() {
+  const { showSettings, setShowSettings } = useAppContext();
+  return (
+    <>
+      <Sidebar />
+      <div
+        className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
+        id="main-scroll"
+      >
+        <Header />
+        <Outlet />
+      </div>
+      {
+        <SettingDialog
+          show={showSettings}
+          onClose={() => setShowSettings(false)}
+        />
+      }
+    </>
+  );
+}
+
+export default App;
diff --git a/examples/server/webui/src/Config.ts b/examples/server/webui/src/Config.ts
new file mode 100644
index 000000000..779ed9bf7
--- /dev/null
+++ b/examples/server/webui/src/Config.ts
@@ -0,0 +1,92 @@
+import daisyuiThemes from 'daisyui/src/theming/themes';
+import { isNumeric } from './utils/misc';
+
+export const isDev = import.meta.env.MODE === 'development';
+
+// constants
+export const BASE_URL = new URL('.', document.baseURI).href
+  .toString()
+  .replace(/\/$/, '');
+
+export const CONFIG_DEFAULT = {
+  // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
+  // Do not use nested objects, keep it single level. Prefix the key if you need to group them.
+  apiKey: '',
+  systemMessage: 'You are a helpful assistant.',
+  showTokensPerSecond: false,
+  showThoughtInProgress: false,
+  excludeThoughtOnReq: true,
+  // make sure these default values are in sync with `common.h`
+  samplers: 'edkypmxt',
+  temperature: 0.8,
+  dynatemp_range: 0.0,
+  dynatemp_exponent: 1.0,
+  top_k: 40,
+  top_p: 0.95,
+  min_p: 0.05,
+  xtc_probability: 0.0,
+  xtc_threshold: 0.1,
+  typical_p: 1.0,
+  repeat_last_n: 64,
+  repeat_penalty: 1.0,
+  presence_penalty: 0.0,
+  frequency_penalty: 0.0,
+  dry_multiplier: 0.0,
+  dry_base: 1.75,
+  dry_allowed_length: 2,
+  dry_penalty_last_n: -1,
+  max_tokens: -1,
+  custom: '', // custom json-stringified object
+  // experimental features
+  pyIntepreterEnabled: false,
+};
+export const CONFIG_INFO: Record<string, string> = {
+  apiKey: 'Set the API Key if you are using --api-key option for the server.',
+  systemMessage: 'The starting message that defines how model should behave.',
+  samplers:
+    'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
+  temperature:
+    'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
+  dynatemp_range:
+    'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
+  dynatemp_exponent:
+    'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
+  top_k: 'Keeps only k top tokens.',
+  top_p:
+    'Limits tokens to those that together have a cumulative probability of at least p',
+  min_p:
+    'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
+  xtc_probability:
+    'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
+  xtc_threshold:
+    'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
+  typical_p:
+    'Sorts and limits tokens based on the difference between log-probability and entropy.',
+  repeat_last_n: 'Last n tokens to consider for penalizing repetition',
+  repeat_penalty:
+    'Controls the repetition of token sequences in the generated text',
+  presence_penalty:
+    'Limits tokens based on whether they appear in the output or not.',
+  frequency_penalty:
+    'Limits tokens based on how often they appear in the output.',
+  dry_multiplier:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
+  dry_base:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
+  dry_allowed_length:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
+  dry_penalty_last_n:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
+  max_tokens: 'The maximum number of token per output.',
+  custom: '', // custom json-stringified object
+};
+// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
+export const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT)
+  .filter((e) => isNumeric(e[1]))
+  .map((e) => e[0]);
+// list of themes supported by daisyui
+export const THEMES = ['light', 'dark']
+  // make sure light & dark are always at the beginning
+  .concat(
+    Object.keys(daisyuiThemes).filter((t) => t !== 'light' && t !== 'dark')
+  );
diff --git a/examples/server/webui/src/components/CanvasPyInterpreter.tsx b/examples/server/webui/src/components/CanvasPyInterpreter.tsx
new file mode 100644
index 000000000..c2707fe20
--- /dev/null
+++ b/examples/server/webui/src/components/CanvasPyInterpreter.tsx
@@ -0,0 +1,195 @@
+import { useEffect, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import { OpenInNewTab, XCloseButton } from '../utils/common';
+import { CanvasType } from '../utils/types';
+import { PlayIcon, StopIcon } from '@heroicons/react/24/outline';
+import StorageUtils from '../utils/storage';
+
+const canInterrupt = typeof SharedArrayBuffer === 'function';
+
+// adapted from https://pyodide.org/en/stable/usage/webworker.html
+const WORKER_CODE = `
+importScripts("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.js");
+
+let stdOutAndErr = [];
+
+let pyodideReadyPromise = loadPyodide({
+  stdout: (data) => stdOutAndErr.push(data),
+  stderr: (data) => stdOutAndErr.push(data),
+});
+
+let alreadySetBuff = false;
+
+self.onmessage = async (event) => {
+  stdOutAndErr = [];
+
+  // make sure loading is done
+  const pyodide = await pyodideReadyPromise;
+  const { id, python, context, interruptBuffer } = event.data;
+
+  if (interruptBuffer && !alreadySetBuff) {
+    pyodide.setInterruptBuffer(interruptBuffer);
+    alreadySetBuff = true;
+  }
+
+  // Now load any packages we need, run the code, and send the result back.
+  await pyodide.loadPackagesFromImports(python);
+
+  // make a Python dictionary with the data from content
+  const dict = pyodide.globals.get("dict");
+  const globals = dict(Object.entries(context));
+  try {
+    self.postMessage({ id, running: true });
+    // Execute the python code in this context
+    const result = pyodide.runPython(python, { globals });
+    self.postMessage({ result, id, stdOutAndErr });
+  } catch (error) {
+    self.postMessage({ error: error.message, id });
+  }
+  interruptBuffer[0] = 0;
+};
+`;
+
+let worker: Worker;
+const interruptBuffer = canInterrupt
+  ? new Uint8Array(new SharedArrayBuffer(1))
+  : null;
+
+const startWorker = () => {
+  if (!worker) {
+    worker = new Worker(
+      URL.createObjectURL(new Blob([WORKER_CODE], { type: 'text/javascript' }))
+    );
+  }
+};
+
+if (StorageUtils.getConfig().pyIntepreterEnabled) {
+  startWorker();
+}
+
+const runCodeInWorker = (
+  pyCode: string,
+  callbackRunning: () => void
+): {
+  donePromise: Promise<string>;
+  interrupt: () => void;
+} => {
+  startWorker();
+  const id = Math.random() * 1e8;
+  const context = {};
+  if (interruptBuffer) {
+    interruptBuffer[0] = 0;
+  }
+
+  const donePromise = new Promise<string>((resolve) => {
+    worker.onmessage = (event) => {
+      const { error, stdOutAndErr, running } = event.data;
+      if (id !== event.data.id) return;
+      if (running) {
+        callbackRunning();
+        return;
+      } else if (error) {
+        resolve(error.toString());
+      } else {
+        resolve(stdOutAndErr.join('\n'));
+      }
+    };
+    worker.postMessage({ id, python: pyCode, context, interruptBuffer });
+  });
+
+  const interrupt = () => {
+    console.log('Interrupting...');
+    console.trace();
+    if (interruptBuffer) {
+      interruptBuffer[0] = 2;
+    }
+  };
+
+  return { donePromise, interrupt };
+};
+
+export default function CanvasPyInterpreter() {
+  const { canvasData, setCanvasData } = useAppContext();
+
+  const [code, setCode] = useState(canvasData?.content ?? ''); // copy to avoid direct mutation
+  const [running, setRunning] = useState(false);
+  const [output, setOutput] = useState('');
+  const [interruptFn, setInterruptFn] = useState<() => void>();
+  const [showStopBtn, setShowStopBtn] = useState(false);
+
+  const runCode = async (pycode: string) => {
+    interruptFn?.();
+    setRunning(true);
+    setOutput('Loading Pyodide...');
+    const { donePromise, interrupt } = runCodeInWorker(pycode, () => {
+      setOutput('Running...');
+      setShowStopBtn(canInterrupt);
+    });
+    setInterruptFn(() => interrupt);
+    const out = await donePromise;
+    setOutput(out);
+    setRunning(false);
+    setShowStopBtn(false);
+  };
+
+  // run code on mount
+  useEffect(() => {
+    setCode(canvasData?.content ?? '');
+    runCode(canvasData?.content ?? '');
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [canvasData?.content]);
+
+  if (canvasData?.type !== CanvasType.PY_INTERPRETER) {
+    return null;
+  }
+
+  return (
+    <div className="card bg-base-200 w-full h-full shadow-xl">
+      <div className="card-body">
+        <div className="flex justify-between items-center mb-4">
+          <span className="text-lg font-bold">Python Interpreter</span>
+          <XCloseButton
+            className="bg-base-100"
+            onClick={() => setCanvasData(null)}
+          />
+        </div>
+        <div className="grid grid-rows-3 gap-4 h-full">
+          <textarea
+            className="textarea textarea-bordered w-full h-full font-mono"
+            value={code}
+            onChange={(e) => setCode(e.target.value)}
+          ></textarea>
+          <div className="font-mono flex flex-col row-span-2">
+            <div className="flex items-center mb-2">
+              <button
+                className="btn btn-sm bg-base-100"
+                onClick={() => runCode(code)}
+                disabled={running}
+              >
+                <PlayIcon className="h-6 w-6" /> Run
+              </button>
+              {showStopBtn && (
+                <button
+                  className="btn btn-sm bg-base-100 ml-2"
+                  onClick={() => interruptFn?.()}
+                >
+                  <StopIcon className="h-6 w-6" /> Stop
+                </button>
+              )}
+              <span className="grow text-right text-xs">
+                <OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/11762">
+                  Report a bug
+                </OpenInNewTab>
+              </span>
+            </div>
+            <textarea
+              className="textarea textarea-bordered h-full dark-color"
+              value={output}
+              readOnly
+            ></textarea>
+          </div>
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/examples/server/webui/src/components/ChatMessage.tsx b/examples/server/webui/src/components/ChatMessage.tsx
new file mode 100644
index 000000000..ec72196ba
--- /dev/null
+++ b/examples/server/webui/src/components/ChatMessage.tsx
@@ -0,0 +1,235 @@
+import { useMemo, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import { Message, PendingMessage } from '../utils/types';
+import { classNames } from '../utils/misc';
+import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
+
+interface SplitMessage {
+  content: PendingMessage['content'];
+  thought?: string;
+  isThinking?: boolean;
+}
+
+export default function ChatMessage({
+  msg,
+  id,
+  scrollToBottom,
+  isPending,
+}: {
+  msg: Message | PendingMessage;
+  id?: string;
+  scrollToBottom: (requiresNearBottom: boolean) => void;
+  isPending?: boolean;
+}) {
+  const { viewingConversation, replaceMessageAndGenerate, config } =
+    useAppContext();
+  const [editingContent, setEditingContent] = useState<string | null>(null);
+  const timings = useMemo(
+    () =>
+      msg.timings
+        ? {
+            ...msg.timings,
+            prompt_per_second:
+              (msg.timings.prompt_n / msg.timings.prompt_ms) * 1000,
+            predicted_per_second:
+              (msg.timings.predicted_n / msg.timings.predicted_ms) * 1000,
+          }
+        : null,
+    [msg.timings]
+  );
+
+  // for reasoning model, we split the message into content and thought
+  // TODO: implement this as remark/rehype plugin in the future
+  const { content, thought, isThinking }: SplitMessage = useMemo(() => {
+    if (msg.content === null || msg.role !== 'assistant') {
+      return { content: msg.content };
+    }
+    let actualContent = '';
+    let thought = '';
+    let isThinking = false;
+    let thinkSplit = msg.content.split('<think>', 2);
+    actualContent += thinkSplit[0];
+    while (thinkSplit[1] !== undefined) {
+      // <think> tag found
+      thinkSplit = thinkSplit[1].split('</think>', 2);
+      thought += thinkSplit[0];
+      isThinking = true;
+      if (thinkSplit[1] !== undefined) {
+        // </think> closing tag found
+        isThinking = false;
+        thinkSplit = thinkSplit[1].split('<think>', 2);
+        actualContent += thinkSplit[0];
+      }
+    }
+    return { content: actualContent, thought, isThinking };
+  }, [msg]);
+
+  if (!viewingConversation) return null;
+
+  const regenerate = async () => {
+    replaceMessageAndGenerate(viewingConversation.id, msg.id, undefined, () =>
+      scrollToBottom(true)
+    );
+  };
+
+  return (
+    <div className="group" id={id}>
+      <div
+        className={classNames({
+          chat: true,
+          'chat-start': msg.role !== 'user',
+          'chat-end': msg.role === 'user',
+        })}
+      >
+        <div
+          className={classNames({
+            'chat-bubble markdown': true,
+            'chat-bubble-base-300': msg.role !== 'user',
+          })}
+        >
+          {/* textarea for editing message */}
+          {editingContent !== null && (
+            <>
+              <textarea
+                dir="auto"
+                className="textarea textarea-bordered bg-base-100 text-base-content max-w-2xl w-[calc(90vw-8em)] h-24"
+                value={editingContent}
+                onChange={(e) => setEditingContent(e.target.value)}
+              ></textarea>
+              <br />
+              <button
+                className="btn btn-ghost mt-2 mr-2"
+                onClick={() => setEditingContent(null)}
+              >
+                Cancel
+              </button>
+              <button
+                className="btn mt-2"
+                onClick={() =>
+                  replaceMessageAndGenerate(
+                    viewingConversation.id,
+                    msg.id,
+                    editingContent
+                  )
+                }
+              >
+                Submit
+              </button>
+            </>
+          )}
+          {/* not editing content, render message */}
+          {editingContent === null && (
+            <>
+              {content === null ? (
+                <>
+                  {/* show loading dots for pending message */}
+                  <span className="loading loading-dots loading-md"></span>
+                </>
+              ) : (
+                <>
+                  {/* render message as markdown */}
+                  <div dir="auto">
+                    {thought && (
+                      <details
+                        className="collapse bg-base-200 collapse-arrow mb-4"
+                        open={isThinking && config.showThoughtInProgress}
+                      >
+                        <summary className="collapse-title">
+                          {isPending && isThinking ? (
+                            <span>
+                              <span
+                                v-if="isGenerating"
+                                className="loading loading-spinner loading-md mr-2"
+                                style={{ verticalAlign: 'middle' }}
+                              ></span>
+                              <b>Thinking</b>
+                            </span>
+                          ) : (
+                            <b>Thought Process</b>
+                          )}
+                        </summary>
+                        <div className="collapse-content">
+                          <MarkdownDisplay
+                            content={thought}
+                            isGenerating={isPending}
+                          />
+                        </div>
+                      </details>
+                    )}
+                    <MarkdownDisplay
+                      content={content}
+                      isGenerating={isPending}
+                    />
+                  </div>
+                </>
+              )}
+              {/* render timings if enabled */}
+              {timings && config.showTokensPerSecond && (
+                <div className="dropdown dropdown-hover dropdown-top mt-2">
+                  <div
+                    tabIndex={0}
+                    role="button"
+                    className="cursor-pointer font-semibold text-sm opacity-60"
+                  >
+                    Speed: {timings.predicted_per_second.toFixed(1)} t/s
+                  </div>
+                  <div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
+                    <b>Prompt</b>
+                    <br />- Tokens: {timings.prompt_n}
+                    <br />- Time: {timings.prompt_ms} ms
+                    <br />- Speed: {timings.prompt_per_second.toFixed(1)} t/s
+                    <br />
+                    <b>Generation</b>
+                    <br />- Tokens: {timings.predicted_n}
+                    <br />- Time: {timings.predicted_ms} ms
+                    <br />- Speed: {timings.predicted_per_second.toFixed(1)} t/s
+                    <br />
+                  </div>
+                </div>
+              )}
+            </>
+          )}
+        </div>
+      </div>
+
+      {/* actions for each message */}
+      {msg.content !== null && (
+        <div
+          className={classNames({
+            'mx-4 mt-2 mb-2': true,
+            'text-right': msg.role === 'user',
+          })}
+        >
+          {/* user message */}
+          {msg.role === 'user' && (
+            <button
+              className="badge btn-mini show-on-hover"
+              onClick={() => setEditingContent(msg.content)}
+              disabled={msg.content === null}
+            >
+              ✍️ Edit
+            </button>
+          )}
+          {/* assistant message */}
+          {msg.role === 'assistant' && (
+            <>
+              {!isPending && (
+                <button
+                  className="badge btn-mini show-on-hover mr-2"
+                  onClick={regenerate}
+                  disabled={msg.content === null}
+                >
+                  🔄 Regenerate
+                </button>
+              )}
+              <CopyButton
+                className="badge btn-mini show-on-hover mr-2"
+                content={msg.content}
+              />
+            </>
+          )}
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx
new file mode 100644
index 000000000..dbc683ed1
--- /dev/null
+++ b/examples/server/webui/src/components/ChatScreen.tsx
@@ -0,0 +1,146 @@
+import { useEffect, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import StorageUtils from '../utils/storage';
+import { useNavigate } from 'react-router';
+import ChatMessage from './ChatMessage';
+import { CanvasType, PendingMessage } from '../utils/types';
+import { classNames } from '../utils/misc';
+import CanvasPyInterpreter from './CanvasPyInterpreter';
+
+export default function ChatScreen() {
+  const {
+    viewingConversation,
+    sendMessage,
+    isGenerating,
+    stopGenerating,
+    pendingMessages,
+    canvasData,
+  } = useAppContext();
+  const [inputMsg, setInputMsg] = useState('');
+  const navigate = useNavigate();
+
+  const currConvId = viewingConversation?.id ?? '';
+  const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId];
+
+  const scrollToBottom = (requiresNearBottom: boolean) => {
+    const mainScrollElem = document.getElementById('main-scroll');
+    if (!mainScrollElem) return;
+    const spaceToBottom =
+      mainScrollElem.scrollHeight -
+      mainScrollElem.scrollTop -
+      mainScrollElem.clientHeight;
+    if (!requiresNearBottom || spaceToBottom < 50) {
+      setTimeout(
+        () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
+        1
+      );
+    }
+  };
+
+  // scroll to bottom when conversation changes
+  useEffect(() => {
+    scrollToBottom(false);
+  }, [viewingConversation?.id]);
+
+  const sendNewMessage = async () => {
+    if (inputMsg.trim().length === 0 || isGenerating(currConvId)) return;
+    const convId = viewingConversation?.id ?? StorageUtils.getNewConvId();
+    const lastInpMsg = inputMsg;
+    setInputMsg('');
+    if (!viewingConversation) {
+      // if user is creating a new conversation, redirect to the new conversation
+      navigate(`/chat/${convId}`);
+    }
+    scrollToBottom(false);
+    // auto scroll as message is being generated
+    const onChunk = () => scrollToBottom(true);
+    if (!(await sendMessage(convId, inputMsg, onChunk))) {
+      // restore the input message if failed
+      setInputMsg(lastInpMsg);
+    }
+  };
+
+  const hasCanvas = !!canvasData;
+
+  return (
+    <div
+      className={classNames({
+        'grid lg:gap-8 grow transition-[300ms]': true,
+        'grid-cols-[1fr_0fr] lg:grid-cols-[1fr_1fr]': hasCanvas, // adapted for mobile
+        'grid-cols-[1fr_0fr]': !hasCanvas,
+      })}
+    >
+      <div
+        className={classNames({
+          'flex flex-col w-full max-w-[900px] mx-auto': true,
+          'hidden lg:flex': hasCanvas, // adapted for mobile
+          flex: !hasCanvas,
+        })}
+      >
+        {/* chat messages */}
+        <div id="messages-list" className="grow">
+          <div className="mt-auto flex justify-center">
+            {/* placeholder to shift the message to the bottom */}
+            {viewingConversation ? '' : 'Send a message to start'}
+          </div>
+          {viewingConversation?.messages.map((msg) => (
+            <ChatMessage
+              key={msg.id}
+              msg={msg}
+              scrollToBottom={scrollToBottom}
+            />
+          ))}
+
+          {pendingMsg && (
+            <ChatMessage
+              msg={pendingMsg}
+              scrollToBottom={scrollToBottom}
+              isPending
+              id="pending-msg"
+            />
+          )}
+        </div>
+
+        {/* chat input */}
+        <div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
+          <textarea
+            className="textarea textarea-bordered w-full"
+            placeholder="Type a message (Shift+Enter to add a new line)"
+            value={inputMsg}
+            onChange={(e) => setInputMsg(e.target.value)}
+            onKeyDown={(e) => {
+              if (e.key === 'Enter' && e.shiftKey) return;
+              if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                sendNewMessage();
+              }
+            }}
+            id="msg-input"
+            dir="auto"
+          ></textarea>
+          {isGenerating(currConvId) ? (
+            <button
+              className="btn btn-neutral ml-2"
+              onClick={() => stopGenerating(currConvId)}
+            >
+              Stop
+            </button>
+          ) : (
+            <button
+              className="btn btn-primary ml-2"
+              onClick={sendNewMessage}
+              disabled={inputMsg.trim().length === 0}
+            >
+              Send
+            </button>
+          )}
+        </div>
+      </div>
+      <div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
+        {canvasData?.type === CanvasType.PY_INTERPRETER && (
+          <CanvasPyInterpreter />
+        )}
+      </div>
+    </div>
+  );
+}
diff --git a/examples/server/webui/src/components/Header.tsx b/examples/server/webui/src/components/Header.tsx
new file mode 100644
index 000000000..505350313
--- /dev/null
+++ b/examples/server/webui/src/components/Header.tsx
@@ -0,0 +1,176 @@
+import { useEffect, useState } from 'react';
+import StorageUtils from '../utils/storage';
+import { useAppContext } from '../utils/app.context';
+import { classNames } from '../utils/misc';
+import daisyuiThemes from 'daisyui/src/theming/themes';
+import { THEMES } from '../Config';
+import { useNavigate } from 'react-router';
+
+export default function Header() {
+  const navigate = useNavigate();
+  const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
+  const { setShowSettings } = useAppContext();
+
+  const setTheme = (theme: string) => {
+    StorageUtils.setTheme(theme);
+    setSelectedTheme(theme);
+  };
+
+  useEffect(() => {
+    document.body.setAttribute('data-theme', selectedTheme);
+    document.body.setAttribute(
+      'data-color-scheme',
+      // @ts-expect-error daisyuiThemes complains about index type, but it should work
+      daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
+    );
+  }, [selectedTheme]);
+
+  const { isGenerating, viewingConversation } = useAppContext();
+  const isCurrConvGenerating = isGenerating(viewingConversation?.id ?? '');
+
+  const removeConversation = () => {
+    if (isCurrConvGenerating || !viewingConversation) return;
+    const convId = viewingConversation.id;
+    if (window.confirm('Are you sure to delete this conversation?')) {
+      StorageUtils.remove(convId);
+      navigate('/');
+    }
+  };
+
+  const downloadConversation = () => {
+    if (isCurrConvGenerating || !viewingConversation) return;
+    const convId = viewingConversation.id;
+    const conversationJson = JSON.stringify(viewingConversation, null, 2);
+    const blob = new Blob([conversationJson], { type: 'application/json' });
+    const url = URL.createObjectURL(blob);
+    const a = document.createElement('a');
+    a.href = url;
+    a.download = `conversation_${convId}.json`;
+    document.body.appendChild(a);
+    a.click();
+    document.body.removeChild(a);
+    URL.revokeObjectURL(url);
+  };
+
+  return (
+    <div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
+      {/* open sidebar button */}
+      <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
+        <svg
+          xmlns="http://www.w3.org/2000/svg"
+          width="16"
+          height="16"
+          fill="currentColor"
+          className="bi bi-list"
+          viewBox="0 0 16 16"
+        >
+          <path
+            fillRule="evenodd"
+            d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
+          />
+        </svg>
+      </label>
+
+      <div className="grow text-2xl font-bold ml-2">llama.cpp</div>
+
+      {/* action buttons (top right) */}
+      <div className="flex items-center">
+        <div v-if="messages.length > 0" className="dropdown dropdown-end">
+          {/* "..." button */}
+          <button
+            tabIndex={0}
+            role="button"
+            className="btn m-1"
+            disabled={isCurrConvGenerating}
+          >
+            <svg
+              xmlns="http://www.w3.org/2000/svg"
+              width="16"
+              height="16"
+              fill="currentColor"
+              className="bi bi-three-dots-vertical"
+              viewBox="0 0 16 16"
+            >
+              <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
+            </svg>
+          </button>
+          {/* dropdown menu */}
+          <ul
+            tabIndex={0}
+            className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
+          >
+            <li onClick={downloadConversation}>
+              <a>Download</a>
+            </li>
+            <li className="text-error" onClick={removeConversation}>
+              <a>Delete</a>
+            </li>
+          </ul>
+        </div>
+        <div className="tooltip tooltip-bottom" data-tip="Settings">
+          <button className="btn" onClick={() => setShowSettings(true)}>
+            {/* settings button */}
+            <svg
+              xmlns="http://www.w3.org/2000/svg"
+              width="16"
+              height="16"
+              fill="currentColor"
+              className="bi bi-gear"
+              viewBox="0 0 16 16"
+            >
+              <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
+              <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
+            </svg>
+          </button>
+        </div>
+
+        {/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
+        <div className="tooltip tooltip-bottom" data-tip="Themes">
+          <div className="dropdown dropdown-end dropdown-bottom">
+            <div tabIndex={0} role="button" className="btn m-1">
+              <svg
+                xmlns="http://www.w3.org/2000/svg"
+                width="16"
+                height="16"
+                fill="currentColor"
+                className="bi bi-palette2"
+                viewBox="0 0 16 16"
+              >
+                <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
+              </svg>
+            </div>
+            <ul
+              tabIndex={0}
+              className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
+            >
+              <li>
+                <button
+                  className={classNames({
+                    'btn btn-sm btn-block btn-ghost justify-start': true,
+                    'btn-active': selectedTheme === 'auto',
+                  })}
+                  onClick={() => setTheme('auto')}
+                >
+                  auto
+                </button>
+              </li>
+              {THEMES.map((theme) => (
+                <li key={theme}>
+                  <input
+                    type="radio"
+                    name="theme-dropdown"
+                    className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
+                    aria-label={theme}
+                    value={theme}
+                    checked={selectedTheme === theme}
+                    onChange={(e) => e.target.checked && setTheme(theme)}
+                  />
+                </li>
+              ))}
+            </ul>
+          </div>
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/examples/server/webui/src/components/MarkdownDisplay.tsx b/examples/server/webui/src/components/MarkdownDisplay.tsx
new file mode 100644
index 000000000..5b7a72591
--- /dev/null
+++ b/examples/server/webui/src/components/MarkdownDisplay.tsx
@@ -0,0 +1,310 @@
+import React, { useMemo, useState } from 'react';
+import Markdown, { ExtraProps } from 'react-markdown';
+import remarkGfm from 'remark-gfm';
+import rehypeHightlight from 'rehype-highlight';
+import rehypeKatex from 'rehype-katex';
+import remarkMath from 'remark-math';
+import remarkBreaks from 'remark-breaks';
+import 'katex/dist/katex.min.css';
+import { classNames, copyStr } from '../utils/misc';
+import { ElementContent, Root } from 'hast';
+import { visit } from 'unist-util-visit';
+import { useAppContext } from '../utils/app.context';
+import { CanvasType } from '../utils/types';
+
+export default function MarkdownDisplay({
+  content,
+  isGenerating,
+}: {
+  content: string;
+  isGenerating?: boolean;
+}) {
+  const preprocessedContent = useMemo(
+    () => preprocessLaTeX(content),
+    [content]
+  );
+  return (
+    <Markdown
+      remarkPlugins={[remarkGfm, remarkMath, remarkBreaks]}
+      rehypePlugins={[rehypeHightlight, rehypeKatex, rehypeCustomCopyButton]}
+      components={{
+        button: (props) => (
+          <CodeBlockButtons
+            {...props}
+            isGenerating={isGenerating}
+            origContent={preprocessedContent}
+          />
+        ),
+        // note: do not use "pre", "p" or other basic html elements here, it will cause the node to re-render when the message is being generated (this should be a bug with react-markdown, not sure how to fix it)
+      }}
+    >
+      {preprocessedContent}
+    </Markdown>
+  );
+}
+
+const CodeBlockButtons: React.ElementType<
+  React.ClassAttributes<HTMLButtonElement> &
+    React.HTMLAttributes<HTMLButtonElement> &
+    ExtraProps & { origContent: string; isGenerating?: boolean }
+> = ({ node, origContent, isGenerating }) => {
+  const { config } = useAppContext();
+  const startOffset = node?.position?.start.offset ?? 0;
+  const endOffset = node?.position?.end.offset ?? 0;
+
+  const copiedContent = useMemo(
+    () =>
+      origContent
+        .substring(startOffset, endOffset)
+        .replace(/^```[^\n]+\n/g, '')
+        .replace(/```$/g, ''),
+    [origContent, startOffset, endOffset]
+  );
+
+  const codeLanguage = useMemo(
+    () =>
+      origContent
+        .substring(startOffset, startOffset + 10)
+        .match(/^```([^\n]+)\n/)?.[1] ?? '',
+    [origContent, startOffset]
+  );
+
+  const canRunCode =
+    !isGenerating &&
+    config.pyIntepreterEnabled &&
+    codeLanguage.startsWith('py');
+
+  return (
+    <div
+      className={classNames({
+        'text-right sticky top-[7em] mb-2 mr-2 h-0': true,
+        'display-none': !node?.position,
+      })}
+    >
+      <CopyButton className="badge btn-mini" content={copiedContent} />
+      {canRunCode && (
+        <RunPyCodeButton
+          className="badge btn-mini ml-2"
+          content={copiedContent}
+        />
+      )}
+    </div>
+  );
+};
+
+export const CopyButton = ({
+  content,
+  className,
+}: {
+  content: string;
+  className?: string;
+}) => {
+  const [copied, setCopied] = useState(false);
+  return (
+    <button
+      className={className}
+      onClick={() => {
+        copyStr(content);
+        setCopied(true);
+      }}
+      onMouseLeave={() => setCopied(false)}
+    >
+      {copied ? 'Copied!' : '📋 Copy'}
+    </button>
+  );
+};
+
+export const RunPyCodeButton = ({
+  content,
+  className,
+}: {
+  content: string;
+  className?: string;
+}) => {
+  const { setCanvasData } = useAppContext();
+  return (
+    <>
+      <button
+        className={className}
+        onClick={() =>
+          setCanvasData({
+            type: CanvasType.PY_INTERPRETER,
+            content,
+          })
+        }
+      >
+        ▶️ Run
+      </button>
+    </>
+  );
+};
+
+/**
+ * This injects the "button" element before each "pre" element.
+ * The actual button will be replaced with a react component in the MarkdownDisplay.
+ * We don't replace "pre" node directly because it will cause the node to re-render, which causes this bug: https://github.com/ggerganov/llama.cpp/issues/9608
+ */
+function rehypeCustomCopyButton() {
+  return function (tree: Root) {
+    visit(tree, 'element', function (node) {
+      if (node.tagName === 'pre' && !node.properties.visited) {
+        const preNode = { ...node };
+        // replace current node
+        preNode.properties.visited = 'true';
+        node.tagName = 'div';
+        node.properties = {};
+        // add node for button
+        const btnNode: ElementContent = {
+          type: 'element',
+          tagName: 'button',
+          properties: {},
+          children: [],
+          position: node.position,
+        };
+        node.children = [btnNode, preNode];
+      }
+    });
+  };
+}
+
+/**
+ * The part below is copied and adapted from:
+ * https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
+ * (MIT License)
+ */
+
+// Regex to check if the processed content contains any potential LaTeX patterns
+const containsLatexRegex =
+  /\\\(.*?\\\)|\\\[.*?\\\]|\$.*?\$|\\begin\{equation\}.*?\\end\{equation\}/;
+
+// Regex for inline and block LaTeX expressions
+const inlineLatex = new RegExp(/\\\((.+?)\\\)/, 'g');
+const blockLatex = new RegExp(/\\\[(.*?[^\\])\\\]/, 'gs');
+
+// Function to restore code blocks
+const restoreCodeBlocks = (content: string, codeBlocks: string[]) => {
+  return content.replace(
+    /<<CODE_BLOCK_(\d+)>>/g,
+    (_, index) => codeBlocks[index]
+  );
+};
+
+// Regex to identify code blocks and inline code
+const codeBlockRegex = /(```[\s\S]*?```|`.*?`)/g;
+
+export const processLaTeX = (_content: string) => {
+  let content = _content;
+  // Temporarily replace code blocks and inline code with placeholders
+  const codeBlocks: string[] = [];
+  let index = 0;
+  content = content.replace(codeBlockRegex, (match) => {
+    codeBlocks[index] = match;
+    return `<<CODE_BLOCK_${index++}>>`;
+  });
+
+  // Escape dollar signs followed by a digit or space and digit
+  let processedContent = content.replace(/(\$)(?=\s?\d)/g, '\\$');
+
+  // If no LaTeX patterns are found, restore code blocks and return the processed content
+  if (!containsLatexRegex.test(processedContent)) {
+    return restoreCodeBlocks(processedContent, codeBlocks);
+  }
+
+  // Convert LaTeX expressions to a markdown compatible format
+  processedContent = processedContent
+    .replace(inlineLatex, (_: string, equation: string) => `$${equation}$`) // Convert inline LaTeX
+    .replace(blockLatex, (_: string, equation: string) => `$$${equation}$$`); // Convert block LaTeX
+
+  // Restore code blocks
+  return restoreCodeBlocks(processedContent, codeBlocks);
+};
+
+/**
+ * Preprocesses LaTeX content by replacing delimiters and escaping certain characters.
+ *
+ * @param content The input string containing LaTeX expressions.
+ * @returns The processed string with replaced delimiters and escaped characters.
+ */
+export function preprocessLaTeX(content: string): string {
+  // Step 1: Protect code blocks
+  const codeBlocks: string[] = [];
+  content = content.replace(/(```[\s\S]*?```|`[^`\n]+`)/g, (_, code) => {
+    codeBlocks.push(code);
+    return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
+  });
+
+  // Step 2: Protect existing LaTeX expressions
+  const latexExpressions: string[] = [];
+
+  // Protect block math ($$...$$), \[...\], and \(...\) as before.
+  content = content.replace(
+    /(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g,
+    (match) => {
+      latexExpressions.push(match);
+      return `<<LATEX_${latexExpressions.length - 1}>>`;
+    }
+  );
+
+  // Protect inline math ($...$) only if it does NOT match a currency pattern.
+  // We assume a currency pattern is one where the inner content is purely numeric (with optional decimals).
+  content = content.replace(/\$([^$]+)\$/g, (match, inner) => {
+    if (/^\s*\d+(?:\.\d+)?\s*$/.test(inner)) {
+      // This looks like a currency value (e.g. "$123" or "$12.34"),
+      // so don't protect it.
+      return match;
+    } else {
+      // Otherwise, treat it as a LaTeX expression.
+      latexExpressions.push(match);
+      return `<<LATEX_${latexExpressions.length - 1}>>`;
+    }
+  });
+
+  // Step 3: Escape dollar signs that are likely currency indicators.
+  // (Now that inline math is protected, this will only escape dollars not already protected)
+  content = content.replace(/\$(?=\d)/g, '\\$');
+
+  // Step 4: Restore LaTeX expressions
+  content = content.replace(
+    /<<LATEX_(\d+)>>/g,
+    (_, index) => latexExpressions[parseInt(index)]
+  );
+
+  // Step 5: Restore code blocks
+  content = content.replace(
+    /<<CODE_BLOCK_(\d+)>>/g,
+    (_, index) => codeBlocks[parseInt(index)]
+  );
+
+  // Step 6: Apply additional escaping functions
+  content = escapeBrackets(content);
+  content = escapeMhchem(content);
+
+  return content;
+}
+
+export function escapeBrackets(text: string): string {
+  const pattern =
+    /(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;
+  return text.replace(
+    pattern,
+    (
+      match: string,
+      codeBlock: string | undefined,
+      squareBracket: string | undefined,
+      roundBracket: string | undefined
+    ): string => {
+      if (codeBlock != null) {
+        return codeBlock;
+      } else if (squareBracket != null) {
+        return `$$${squareBracket}$$`;
+      } else if (roundBracket != null) {
+        return `$${roundBracket}$`;
+      }
+      return match;
+    }
+  );
+}
+
+export function escapeMhchem(text: string) {
+  return text.replaceAll('$\\ce{', '$\\\\ce{').replaceAll('$\\pu{', '$\\\\pu{');
+}
diff --git a/examples/server/webui/src/components/SettingDialog.tsx b/examples/server/webui/src/components/SettingDialog.tsx
new file mode 100644
index 000000000..592b93fa3
--- /dev/null
+++ b/examples/server/webui/src/components/SettingDialog.tsx
@@ -0,0 +1,536 @@
+import { useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import { CONFIG_DEFAULT, CONFIG_INFO } from '../Config';
+import { isDev } from '../Config';
+import StorageUtils from '../utils/storage';
+import { classNames, isBoolean, isNumeric, isString } from '../utils/misc';
+import {
+  BeakerIcon,
+  ChatBubbleOvalLeftEllipsisIcon,
+  Cog6ToothIcon,
+  FunnelIcon,
+  HandRaisedIcon,
+  SquaresPlusIcon,
+} from '@heroicons/react/24/outline';
+import { OpenInNewTab } from '../utils/common';
+
+type SettKey = keyof typeof CONFIG_DEFAULT;
+
+const BASIC_KEYS: SettKey[] = [
+  'temperature',
+  'top_k',
+  'top_p',
+  'min_p',
+  'max_tokens',
+];
+const SAMPLER_KEYS: SettKey[] = [
+  'dynatemp_range',
+  'dynatemp_exponent',
+  'typical_p',
+  'xtc_probability',
+  'xtc_threshold',
+];
+const PENALTY_KEYS: SettKey[] = [
+  'repeat_last_n',
+  'repeat_penalty',
+  'presence_penalty',
+  'frequency_penalty',
+  'dry_multiplier',
+  'dry_base',
+  'dry_allowed_length',
+  'dry_penalty_last_n',
+];
+
+enum SettingInputType {
+  SHORT_INPUT,
+  LONG_INPUT,
+  CHECKBOX,
+  CUSTOM,
+}
+
+interface SettingFieldInput {
+  type: Exclude<SettingInputType, SettingInputType.CUSTOM>;
+  label: string | React.ReactElement;
+  help?: string | React.ReactElement;
+  key: SettKey;
+}
+
+interface SettingFieldCustom {
+  type: SettingInputType.CUSTOM;
+  key: SettKey;
+  component:
+    | string
+    | React.FC<{
+        value: string | boolean | number;
+        onChange: (value: string) => void;
+      }>;
+}
+
+interface SettingSection {
+  title: React.ReactElement;
+  fields: (SettingFieldInput | SettingFieldCustom)[];
+}
+
+const ICON_CLASSNAME = 'w-4 h-4 mr-1 inline';
+
+const SETTING_SECTIONS: SettingSection[] = [
+  {
+    title: (
+      <>
+        <Cog6ToothIcon className={ICON_CLASSNAME} />
+        General
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.SHORT_INPUT,
+        label: 'API Key',
+        key: 'apiKey',
+      },
+      {
+        type: SettingInputType.LONG_INPUT,
+        label: 'System Message (will be disabled if left empty)',
+        key: 'systemMessage',
+      },
+      ...BASIC_KEYS.map(
+        (key) =>
+          ({
+            type: SettingInputType.SHORT_INPUT,
+            label: key,
+            key,
+          }) as SettingFieldInput
+      ),
+    ],
+  },
+  {
+    title: (
+      <>
+        <FunnelIcon className={ICON_CLASSNAME} />
+        Samplers
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.SHORT_INPUT,
+        label: 'Samplers queue',
+        key: 'samplers',
+      },
+      ...SAMPLER_KEYS.map(
+        (key) =>
+          ({
+            type: SettingInputType.SHORT_INPUT,
+            label: key,
+            key,
+          }) as SettingFieldInput
+      ),
+    ],
+  },
+  {
+    title: (
+      <>
+        <HandRaisedIcon className={ICON_CLASSNAME} />
+        Penalties
+      </>
+    ),
+    fields: PENALTY_KEYS.map((key) => ({
+      type: SettingInputType.SHORT_INPUT,
+      label: key,
+      key,
+    })),
+  },
+  {
+    title: (
+      <>
+        <ChatBubbleOvalLeftEllipsisIcon className={ICON_CLASSNAME} />
+        Reasoning
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.CHECKBOX,
+        label: 'Expand though process by default for generating message',
+        key: 'showThoughtInProgress',
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label:
+          'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
+        key: 'excludeThoughtOnReq',
+      },
+    ],
+  },
+  {
+    title: (
+      <>
+        <SquaresPlusIcon className={ICON_CLASSNAME} />
+        Advanced
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.CUSTOM,
+        key: 'custom', // dummy key, won't be used
+        component: () => {
+          const debugImportDemoConv = async () => {
+            const res = await fetch('/demo-conversation.json');
+            const demoConv = await res.json();
+            StorageUtils.remove(demoConv.id);
+            for (const msg of demoConv.messages) {
+              StorageUtils.appendMsg(demoConv.id, msg);
+            }
+          };
+          return (
+            <button className="btn" onClick={debugImportDemoConv}>
+              (debug) Import demo conversation
+            </button>
+          );
+        },
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label: 'Show tokens per second',
+        key: 'showTokensPerSecond',
+      },
+      {
+        type: SettingInputType.LONG_INPUT,
+        label: (
+          <>
+            Custom JSON config (For more info, refer to{' '}
+            <OpenInNewTab href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md">
+              server documentation
+            </OpenInNewTab>
+            )
+          </>
+        ),
+        key: 'custom',
+      },
+    ],
+  },
+  {
+    title: (
+      <>
+        <BeakerIcon className={ICON_CLASSNAME} />
+        Experimental
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.CUSTOM,
+        key: 'custom', // dummy key, won't be used
+        component: () => (
+          <>
+            <p className="mb-8">
+              Experimental features are not guaranteed to work correctly.
+              <br />
+              <br />
+              If you encounter any problems, create a{' '}
+              <OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/new?template=019-bug-misc.yml">
+                Bug (misc.)
+              </OpenInNewTab>{' '}
+              report on Github. Please also specify <b>webui/experimental</b> on
+              the report title and include screenshots.
+              <br />
+              <br />
+              Some features may require packages downloaded from CDN, so they
+              need internet connection.
+            </p>
+          </>
+        ),
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label: (
+          <>
+            <b>Enable Python interpreter</b>
+            <br />
+            <small className="text-xs">
+              This feature uses{' '}
+              <OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
+              downloaded from CDN. To use this feature, ask the LLM to generate
+              python code inside a markdown code block. You will see a "Run"
+              button on the code block, near the "Copy" button.
+            </small>
+          </>
+        ),
+        key: 'pyIntepreterEnabled',
+      },
+    ],
+  },
+];
+
+export default function SettingDialog({
+  show,
+  onClose,
+}: {
+  show: boolean;
+  onClose: () => void;
+}) {
+  const { config, saveConfig } = useAppContext();
+  const [sectionIdx, setSectionIdx] = useState(0);
+
+  // clone the config object to prevent direct mutation
+  const [localConfig, setLocalConfig] = useState<typeof CONFIG_DEFAULT>(
+    JSON.parse(JSON.stringify(config))
+  );
+
+  const resetConfig = () => {
+    if (window.confirm('Are you sure to reset all settings?')) {
+      setLocalConfig(CONFIG_DEFAULT);
+    }
+  };
+
+  const handleSave = () => {
+    // copy the local config to prevent direct mutation
+    const newConfig: typeof CONFIG_DEFAULT = JSON.parse(
+      JSON.stringify(localConfig)
+    );
+    // validate the config
+    for (const key in newConfig) {
+      const value = newConfig[key as SettKey];
+      const mustBeBoolean = isBoolean(CONFIG_DEFAULT[key as SettKey]);
+      const mustBeString = isString(CONFIG_DEFAULT[key as SettKey]);
+      const mustBeNumeric = isNumeric(CONFIG_DEFAULT[key as SettKey]);
+      if (mustBeString) {
+        if (!isString(value)) {
+          alert(`Value for ${key} must be string`);
+          return;
+        }
+      } else if (mustBeNumeric) {
+        const trimedValue = value.toString().trim();
+        const numVal = Number(trimedValue);
+        if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) {
+          alert(`Value for ${key} must be numeric`);
+          return;
+        }
+        // force conversion to number
+        // @ts-expect-error this is safe
+        newConfig[key] = numVal;
+      } else if (mustBeBoolean) {
+        if (!isBoolean(value)) {
+          alert(`Value for ${key} must be boolean`);
+          return;
+        }
+      } else {
+        console.error(`Unknown default type for key ${key}`);
+      }
+    }
+    if (isDev) console.log('Saving config', newConfig);
+    saveConfig(newConfig);
+    onClose();
+  };
+
+  const onChange = (key: SettKey) => (value: string | boolean) => {
+    // note: we do not perform validation here, because we may get incomplete value as user is still typing it
+    setLocalConfig({ ...localConfig, [key]: value });
+  };
+
+  return (
+    <dialog className={classNames({ modal: true, 'modal-open': show })}>
+      <div className="modal-box w-11/12 max-w-3xl">
+        <h3 className="text-lg font-bold mb-6">Settings</h3>
+        <div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
+          {/* Left panel, showing sections - Desktop version */}
+          <div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
+            {SETTING_SECTIONS.map((section, idx) => (
+              <div
+                key={idx}
+                className={classNames({
+                  'btn btn-ghost justify-start font-normal w-44 mb-1': true,
+                  'btn-active': sectionIdx === idx,
+                })}
+                onClick={() => setSectionIdx(idx)}
+                dir="auto"
+              >
+                {section.title}
+              </div>
+            ))}
+          </div>
+
+          {/* Left panel, showing sections - Mobile version */}
+          <div className="md:hidden flex flex-row gap-2 mb-4">
+            <details className="dropdown">
+              <summary className="btn bt-sm w-full m-1">
+                {SETTING_SECTIONS[sectionIdx].title}
+              </summary>
+              <ul className="menu dropdown-content bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
+                {SETTING_SECTIONS.map((section, idx) => (
+                  <div
+                    key={idx}
+                    className={classNames({
+                      'btn btn-ghost justify-start font-normal': true,
+                      'btn-active': sectionIdx === idx,
+                    })}
+                    onClick={() => setSectionIdx(idx)}
+                    dir="auto"
+                  >
+                    {section.title}
+                  </div>
+                ))}
+              </ul>
+            </details>
+          </div>
+
+          {/* Right panel, showing setting fields */}
+          <div className="grow overflow-y-auto px-4">
+            {SETTING_SECTIONS[sectionIdx].fields.map((field, idx) => {
+              const key = `${sectionIdx}-${idx}`;
+              if (field.type === SettingInputType.SHORT_INPUT) {
+                return (
+                  <SettingsModalShortInput
+                    key={key}
+                    configKey={field.key}
+                    value={localConfig[field.key]}
+                    onChange={onChange(field.key)}
+                    label={field.label as string}
+                  />
+                );
+              } else if (field.type === SettingInputType.LONG_INPUT) {
+                return (
+                  <SettingsModalLongInput
+                    key={key}
+                    configKey={field.key}
+                    value={localConfig[field.key].toString()}
+                    onChange={onChange(field.key)}
+                    label={field.label as string}
+                  />
+                );
+              } else if (field.type === SettingInputType.CHECKBOX) {
+                return (
+                  <SettingsModalCheckbox
+                    key={key}
+                    configKey={field.key}
+                    value={!!localConfig[field.key]}
+                    onChange={onChange(field.key)}
+                    label={field.label as string}
+                  />
+                );
+              } else if (field.type === SettingInputType.CUSTOM) {
+                return (
+                  <div key={key} className="mb-2">
+                    {typeof field.component === 'string'
+                      ? field.component
+                      : field.component({
+                          value: localConfig[field.key],
+                          onChange: onChange(field.key),
+                        })}
+                  </div>
+                );
+              }
+            })}
+
+            <p className="opacity-40 mb-6 text-sm mt-8">
+              Settings are saved in browser's localStorage
+            </p>
+          </div>
+        </div>
+
+        <div className="modal-action">
+          <button className="btn" onClick={resetConfig}>
+            Reset to default
+          </button>
+          <button className="btn" onClick={onClose}>
+            Close
+          </button>
+          <button className="btn btn-primary" onClick={handleSave}>
+            Save
+          </button>
+        </div>
+      </div>
+    </dialog>
+  );
+}
+
+function SettingsModalLongInput({
+  configKey,
+  value,
+  onChange,
+  label,
+}: {
+  configKey: SettKey;
+  value: string;
+  onChange: (value: string) => void;
+  label?: string;
+}) {
+  return (
+    <label className="form-control mb-2">
+      <div className="label inline">{label || configKey}</div>
+      <textarea
+        className="textarea textarea-bordered h-24"
+        placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
+        value={value}
+        onChange={(e) => onChange(e.target.value)}
+      />
+    </label>
+  );
+}
+
+function SettingsModalShortInput({
+  configKey,
+  value,
+  onChange,
+  label,
+}: {
+  configKey: SettKey;
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  value: any;
+  onChange: (value: string) => void;
+  label?: string;
+}) {
+  const helpMsg = CONFIG_INFO[configKey];
+
+  return (
+    <>
+      {/* on mobile, we simply show the help message here */}
+      {helpMsg && (
+        <div className="block md:hidden mb-1">
+          <b>{label || configKey}</b>
+          <br />
+          <p className="text-xs">{helpMsg}</p>
+        </div>
+      )}
+      <label className="input input-bordered join-item grow flex items-center gap-2 mb-2">
+        <div className="dropdown dropdown-hover">
+          <div tabIndex={0} role="button" className="font-bold hidden md:block">
+            {label || configKey}
+          </div>
+          {helpMsg && (
+            <div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
+              {helpMsg}
+            </div>
+          )}
+        </div>
+        <input
+          type="text"
+          className="grow"
+          placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
+          value={value}
+          onChange={(e) => onChange(e.target.value)}
+        />
+      </label>
+    </>
+  );
+}
+
+function SettingsModalCheckbox({
+  configKey,
+  value,
+  onChange,
+  label,
+}: {
+  configKey: SettKey;
+  value: boolean;
+  onChange: (value: boolean) => void;
+  label: string;
+}) {
+  return (
+    <div className="flex flex-row items-center mb-2">
+      <input
+        type="checkbox"
+        className="toggle"
+        checked={value}
+        onChange={(e) => onChange(e.target.checked)}
+      />
+      <span className="ml-4">{label || configKey}</span>
+    </div>
+  );
+}
diff --git a/examples/server/webui/src/components/Sidebar.tsx b/examples/server/webui/src/components/Sidebar.tsx
new file mode 100644
index 000000000..2fc934e46
--- /dev/null
+++ b/examples/server/webui/src/components/Sidebar.tsx
@@ -0,0 +1,95 @@
+import { useEffect, useMemo, useState } from 'react';
+import { classNames } from '../utils/misc';
+import { Conversation } from '../utils/types';
+import StorageUtils from '../utils/storage';
+import { useNavigate, useParams } from 'react-router';
+
+export default function Sidebar() {
+  const params = useParams();
+  const navigate = useNavigate();
+  const currConv = useMemo(
+    () => StorageUtils.getOneConversation(params.convId ?? ''),
+    [params.convId]
+  );
+
+  const [conversations, setConversations] = useState<Conversation[]>([]);
+
+  useEffect(() => {
+    const handleConversationChange = () => {
+      setConversations(StorageUtils.getAllConversations());
+    };
+    StorageUtils.onConversationChanged(handleConversationChange);
+    handleConversationChange();
+    return () => {
+      StorageUtils.offConversationChanged(handleConversationChange);
+    };
+  }, []);
+
+  return (
+    <>
+      <input
+        id="toggle-drawer"
+        type="checkbox"
+        className="drawer-toggle"
+        defaultChecked
+      />
+
+      <div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
+        <label
+          htmlFor="toggle-drawer"
+          aria-label="close sidebar"
+          className="drawer-overlay"
+        ></label>
+        <div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
+          <div className="flex flex-row items-center justify-between mb-4 mt-4">
+            <h2 className="font-bold ml-4">Conversations</h2>
+
+            {/* close sidebar button */}
+            <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
+              <svg
+                xmlns="http://www.w3.org/2000/svg"
+                width="16"
+                height="16"
+                fill="currentColor"
+                className="bi bi-arrow-bar-left"
+                viewBox="0 0 16 16"
+              >
+                <path
+                  fillRule="evenodd"
+                  d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
+                />
+              </svg>
+            </label>
+          </div>
+
+          {/* list of conversations */}
+          <div
+            className={classNames({
+              'btn btn-ghost justify-start': true,
+              'btn-active': !currConv,
+            })}
+            onClick={() => navigate('/')}
+          >
+            + New conversation
+          </div>
+          {conversations.map((conv) => (
+            <div
+              key={conv.id}
+              className={classNames({
+                'btn btn-ghost justify-start font-normal': true,
+                'btn-active': conv.id === currConv?.id,
+              })}
+              onClick={() => navigate(`/chat/${conv.id}`)}
+              dir="auto"
+            >
+              <span className="truncate">{conv.messages[0].content}</span>
+            </div>
+          ))}
+          <div className="text-center text-xs opacity-40 mt-auto mx-4">
+            Conversations are saved to browser's localStorage
+          </div>
+        </div>
+      </div>
+    </>
+  );
+}
diff --git a/examples/server/webui/src/index.scss b/examples/server/webui/src/index.scss
new file mode 100644
index 000000000..314b68232
--- /dev/null
+++ b/examples/server/webui/src/index.scss
@@ -0,0 +1,73 @@
+@use 'sass:meta';
+
+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+
+.markdown {
+  h1,
+  h2,
+  h3,
+  h4,
+  h5,
+  h6,
+  ul,
+  ol,
+  li {
+    all: revert;
+  }
+  pre {
+    @apply whitespace-pre-wrap rounded-lg p-2;
+    border: 1px solid currentColor;
+  }
+  p {
+    @apply mb-2;
+  }
+  /* TODO: fix markdown table */
+}
+
+.show-on-hover {
+  @apply md:opacity-0 md:group-hover:opacity-100;
+}
+.btn-mini {
+  @apply cursor-pointer hover:shadow-md;
+}
+.chat-screen {
+  max-width: 900px;
+}
+
+.chat-bubble-base-300 {
+  --tw-bg-opacity: 1;
+  --tw-text-opacity: 1;
+  @apply bg-base-300 text-base-content;
+}
+
+/* Highlight.js */
+[data-color-scheme='light'] {
+  @include meta.load-css('highlight.js/styles/stackoverflow-light');
+  .dark-color {
+    @apply bg-base-content text-base-100;
+  }
+}
+[data-color-scheme='dark'] {
+  @include meta.load-css('highlight.js/styles/stackoverflow-dark');
+}
+[data-color-scheme='auto'] {
+  @media (prefers-color-scheme: light) {
+    @include meta.load-css('highlight.js/styles/stackoverflow-light');
+    .dark-color {
+      @apply bg-base-content text-base-100;
+    }
+  }
+  @media (prefers-color-scheme: dark) {
+    @include meta.load-css('highlight.js/styles/stackoverflow-dark');
+  }
+}
+.hljs {
+  background: transparent !important;
+  padding: 0.5em !important;
+}
+
+.katex-display {
+  margin: 0 0 !important;
+}
diff --git a/examples/server/webui/src/main.tsx b/examples/server/webui/src/main.tsx
new file mode 100644
index 000000000..631e96a79
--- /dev/null
+++ b/examples/server/webui/src/main.tsx
@@ -0,0 +1,10 @@
+import { StrictMode } from 'react';
+import { createRoot } from 'react-dom/client';
+import './index.scss';
+import App from './App.tsx';
+
+createRoot(document.getElementById('root')!).render(
+  <StrictMode>
+    <App />
+  </StrictMode>
+);
diff --git a/examples/server/webui/src/utils/app.context.tsx b/examples/server/webui/src/utils/app.context.tsx
new file mode 100644
index 000000000..af6bd885f
--- /dev/null
+++ b/examples/server/webui/src/utils/app.context.tsx
@@ -0,0 +1,327 @@
+import React, { createContext, useContext, useEffect, useState } from 'react';
+import {
+  APIMessage,
+  CanvasData,
+  Conversation,
+  Message,
+  PendingMessage,
+} from './types';
+import StorageUtils from './storage';
+import {
+  filterThoughtFromMsgs,
+  normalizeMsgsForAPI,
+  getSSEStreamAsync,
+} from './misc';
+import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
+import { matchPath, useLocation } from 'react-router';
+
+interface AppContextValue {
+  // conversations and messages
+  viewingConversation: Conversation | null;
+  pendingMessages: Record<Conversation['id'], PendingMessage>;
+  isGenerating: (convId: string) => boolean;
+  sendMessage: (
+    convId: string,
+    content: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => Promise<boolean>;
+  stopGenerating: (convId: string) => void;
+  replaceMessageAndGenerate: (
+    convId: string,
+    origMsgId: Message['id'],
+    content?: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => Promise<void>;
+
+  // canvas
+  canvasData: CanvasData | null;
+  setCanvasData: (data: CanvasData | null) => void;
+
+  // config
+  config: typeof CONFIG_DEFAULT;
+  saveConfig: (config: typeof CONFIG_DEFAULT) => void;
+  showSettings: boolean;
+  setShowSettings: (show: boolean) => void;
+}
+
+// for now, this callback is only used for scrolling to the bottom of the chat
+type CallbackGeneratedChunk = () => void;
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+const AppContext = createContext<AppContextValue>({} as any);
+
+export const AppContextProvider = ({
+  children,
+}: {
+  children: React.ReactElement;
+}) => {
+  const { pathname } = useLocation();
+  const params = matchPath('/chat/:convId', pathname);
+  const convId = params?.params?.convId;
+
+  const [viewingConversation, setViewingConversation] =
+    useState<Conversation | null>(null);
+  const [pendingMessages, setPendingMessages] = useState<
+    Record<Conversation['id'], PendingMessage>
+  >({});
+  const [aborts, setAborts] = useState<
+    Record<Conversation['id'], AbortController>
+  >({});
+  const [config, setConfig] = useState(StorageUtils.getConfig());
+  const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
+  const [showSettings, setShowSettings] = useState(false);
+
+  // handle change when the convId from URL is changed
+  useEffect(() => {
+    // also reset the canvas data
+    setCanvasData(null);
+    const handleConversationChange = (changedConvId: string) => {
+      if (changedConvId !== convId) return;
+      setViewingConversation(StorageUtils.getOneConversation(convId));
+    };
+    StorageUtils.onConversationChanged(handleConversationChange);
+    setViewingConversation(StorageUtils.getOneConversation(convId ?? ''));
+    return () => {
+      StorageUtils.offConversationChanged(handleConversationChange);
+    };
+  }, [convId]);
+
+  const setPending = (convId: string, pendingMsg: PendingMessage | null) => {
+    // if pendingMsg is null, remove the key from the object
+    if (!pendingMsg) {
+      setPendingMessages((prev) => {
+        const newState = { ...prev };
+        delete newState[convId];
+        return newState;
+      });
+    } else {
+      setPendingMessages((prev) => ({ ...prev, [convId]: pendingMsg }));
+    }
+  };
+
+  const setAbort = (convId: string, controller: AbortController | null) => {
+    if (!controller) {
+      setAborts((prev) => {
+        const newState = { ...prev };
+        delete newState[convId];
+        return newState;
+      });
+    } else {
+      setAborts((prev) => ({ ...prev, [convId]: controller }));
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////
+  // public functions
+
+  const isGenerating = (convId: string) => !!pendingMessages[convId];
+
+  const generateMessage = async (
+    convId: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => {
+    if (isGenerating(convId)) return;
+
+    const config = StorageUtils.getConfig();
+    const currConversation = StorageUtils.getOneConversation(convId);
+    if (!currConversation) {
+      throw new Error('Current conversation is not found');
+    }
+
+    const abortController = new AbortController();
+    setAbort(convId, abortController);
+
+    let pendingMsg: PendingMessage = {
+      id: Date.now() + 1,
+      role: 'assistant',
+      content: null,
+    };
+    setPending(convId, pendingMsg);
+
+    try {
+      // prepare messages for API
+      let messages: APIMessage[] = [
+        ...(config.systemMessage.length === 0
+          ? []
+          : [{ role: 'system', content: config.systemMessage } as APIMessage]),
+        ...normalizeMsgsForAPI(currConversation?.messages ?? []),
+      ];
+      if (config.excludeThoughtOnReq) {
+        messages = filterThoughtFromMsgs(messages);
+      }
+      if (isDev) console.log({ messages });
+
+      // prepare params
+      const params = {
+        messages,
+        stream: true,
+        cache_prompt: true,
+        samplers: config.samplers,
+        temperature: config.temperature,
+        dynatemp_range: config.dynatemp_range,
+        dynatemp_exponent: config.dynatemp_exponent,
+        top_k: config.top_k,
+        top_p: config.top_p,
+        min_p: config.min_p,
+        typical_p: config.typical_p,
+        xtc_probability: config.xtc_probability,
+        xtc_threshold: config.xtc_threshold,
+        repeat_last_n: config.repeat_last_n,
+        repeat_penalty: config.repeat_penalty,
+        presence_penalty: config.presence_penalty,
+        frequency_penalty: config.frequency_penalty,
+        dry_multiplier: config.dry_multiplier,
+        dry_base: config.dry_base,
+        dry_allowed_length: config.dry_allowed_length,
+        dry_penalty_last_n: config.dry_penalty_last_n,
+        max_tokens: config.max_tokens,
+        timings_per_token: !!config.showTokensPerSecond,
+        ...(config.custom.length ? JSON.parse(config.custom) : {}),
+      };
+
+      // send request
+      const fetchResponse = await fetch(`${BASE_URL}/v1/chat/completions`, {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          ...(config.apiKey
+            ? { Authorization: `Bearer ${config.apiKey}` }
+            : {}),
+        },
+        body: JSON.stringify(params),
+        signal: abortController.signal,
+      });
+      if (fetchResponse.status !== 200) {
+        const body = await fetchResponse.json();
+        throw new Error(body?.error?.message || 'Unknown error');
+      }
+      const chunks = getSSEStreamAsync(fetchResponse);
+      for await (const chunk of chunks) {
+        // const stop = chunk.stop;
+        if (chunk.error) {
+          throw new Error(chunk.error?.message || 'Unknown error');
+        }
+        const addedContent = chunk.choices[0].delta.content;
+        const lastContent = pendingMsg.content || '';
+        if (addedContent) {
+          pendingMsg = {
+            id: pendingMsg.id,
+            role: 'assistant',
+            content: lastContent + addedContent,
+          };
+        }
+        const timings = chunk.timings;
+        if (timings && config.showTokensPerSecond) {
+          // only extract what's really needed, to save some space
+          pendingMsg.timings = {
+            prompt_n: timings.prompt_n,
+            prompt_ms: timings.prompt_ms,
+            predicted_n: timings.predicted_n,
+            predicted_ms: timings.predicted_ms,
+          };
+        }
+        setPending(convId, pendingMsg);
+        onChunk?.();
+      }
+    } catch (err) {
+      setPending(convId, null);
+      if ((err as Error).name === 'AbortError') {
+        // user stopped the generation via stopGeneration() function
+        // we can safely ignore this error
+      } else {
+        console.error(err);
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        alert((err as any)?.message ?? 'Unknown error');
+        throw err; // rethrow
+      }
+    }
+
+    if (pendingMsg.content) {
+      StorageUtils.appendMsg(currConversation.id, {
+        id: pendingMsg.id,
+        content: pendingMsg.content,
+        role: pendingMsg.role,
+        timings: pendingMsg.timings,
+      });
+    }
+    setPending(convId, null);
+    onChunk?.(); // trigger scroll to bottom
+  };
+
+  const sendMessage = async (
+    convId: string,
+    content: string,
+    onChunk?: CallbackGeneratedChunk
+  ): Promise<boolean> => {
+    if (isGenerating(convId) || content.trim().length === 0) return false;
+
+    StorageUtils.appendMsg(convId, {
+      id: Date.now(),
+      role: 'user',
+      content,
+    });
+
+    try {
+      await generateMessage(convId, onChunk);
+      return true;
+    } catch (_) {
+      // rollback
+      StorageUtils.popMsg(convId);
+    }
+    return false;
+  };
+
+  const stopGenerating = (convId: string) => {
+    setPending(convId, null);
+    aborts[convId]?.abort();
+  };
+
+  // if content is undefined, we remove last assistant message
+  const replaceMessageAndGenerate = async (
+    convId: string,
+    origMsgId: Message['id'],
+    content?: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => {
+    if (isGenerating(convId)) return;
+
+    StorageUtils.filterAndKeepMsgs(convId, (msg) => msg.id < origMsgId);
+    if (content) {
+      StorageUtils.appendMsg(convId, {
+        id: Date.now(),
+        role: 'user',
+        content,
+      });
+    }
+
+    await generateMessage(convId, onChunk);
+  };
+
+  const saveConfig = (config: typeof CONFIG_DEFAULT) => {
+    StorageUtils.setConfig(config);
+    setConfig(config);
+  };
+
+  return (
+    <AppContext.Provider
+      value={{
+        isGenerating,
+        viewingConversation,
+        pendingMessages,
+        sendMessage,
+        stopGenerating,
+        replaceMessageAndGenerate,
+        canvasData,
+        setCanvasData,
+        config,
+        saveConfig,
+        showSettings,
+        setShowSettings,
+      }}
+    >
+      {children}
+    </AppContext.Provider>
+  );
+};
+
+export const useAppContext = () => useContext(AppContext);
diff --git a/examples/server/webui/src/utils/common.tsx b/examples/server/webui/src/utils/common.tsx
new file mode 100644
index 000000000..09b08b5c9
--- /dev/null
+++ b/examples/server/webui/src/utils/common.tsx
@@ -0,0 +1,38 @@
+export const XCloseButton: React.ElementType<
+  React.ClassAttributes<HTMLButtonElement> &
+    React.HTMLAttributes<HTMLButtonElement>
+> = ({ className, ...props }) => (
+  <button className={`btn btn-square btn-sm ${className ?? ''}`} {...props}>
+    <svg
+      xmlns="http://www.w3.org/2000/svg"
+      className="h-6 w-6"
+      fill="none"
+      viewBox="0 0 24 24"
+      stroke="currentColor"
+    >
+      <path
+        strokeLinecap="round"
+        strokeLinejoin="round"
+        strokeWidth="2"
+        d="M6 18L18 6M6 6l12 12"
+      />
+    </svg>
+  </button>
+);
+
+export const OpenInNewTab = ({
+  href,
+  children,
+}: {
+  href: string;
+  children: string;
+}) => (
+  <a
+    className="underline"
+    href={href}
+    target="_blank"
+    rel="noopener noreferrer"
+  >
+    {children}
+  </a>
+);
diff --git a/examples/server/webui/src/utils/misc.ts b/examples/server/webui/src/utils/misc.ts
new file mode 100644
index 000000000..e3153fff5
--- /dev/null
+++ b/examples/server/webui/src/utils/misc.ts
@@ -0,0 +1,90 @@
+// @ts-expect-error this package does not have typing
+import TextLineStream from 'textlinestream';
+import { APIMessage, Message } from './types';
+
+// ponyfill for missing ReadableStream asyncIterator on Safari
+import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
+import { isDev } from '../Config';
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const isString = (x: any) => !!x.toLowerCase;
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const isBoolean = (x: any) => x === true || x === false;
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const isNumeric = (n: any) => !isString(n) && !isNaN(n) && !isBoolean(n);
+export const escapeAttr = (str: string) =>
+  str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
+
+// wrapper for SSE
+export async function* getSSEStreamAsync(fetchResponse: Response) {
+  if (!fetchResponse.body) throw new Error('Response body is empty');
+  const lines: ReadableStream<string> = fetchResponse.body
+    .pipeThrough(new TextDecoderStream())
+    .pipeThrough(new TextLineStream());
+  // @ts-expect-error asyncIterator complains about type, but it should work
+  for await (const line of asyncIterator(lines)) {
+    if (isDev) console.log({ line });
+    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
+      const data = JSON.parse(line.slice(5));
+      yield data;
+    } else if (line.startsWith('error:')) {
+      const data = JSON.parse(line.slice(6));
+      throw new Error(data.message || 'Unknown error');
+    }
+  }
+}
+
+// copy text to clipboard
+export const copyStr = (textToCopy: string) => {
+  // Navigator clipboard api needs a secure context (https)
+  if (navigator.clipboard && window.isSecureContext) {
+    navigator.clipboard.writeText(textToCopy);
+  } else {
+    // Use the 'out of viewport hidden text area' trick
+    const textArea = document.createElement('textarea');
+    textArea.value = textToCopy;
+    // Move textarea out of the viewport so it's not visible
+    textArea.style.position = 'absolute';
+    textArea.style.left = '-999999px';
+    document.body.prepend(textArea);
+    textArea.select();
+    document.execCommand('copy');
+  }
+};
+
+/**
+ * filter out redundant fields upon sending to API
+ */
+export function normalizeMsgsForAPI(messages: Message[]) {
+  return messages.map((msg) => {
+    return {
+      role: msg.role,
+      content: msg.content,
+    };
+  }) as APIMessage[];
+}
+
+/**
+ * recommended for DeepsSeek-R1, filter out content between <think> and </think> tags
+ */
+export function filterThoughtFromMsgs(messages: APIMessage[]) {
+  return messages.map((msg) => {
+    return {
+      role: msg.role,
+      content:
+        msg.role === 'assistant'
+          ? msg.content.split('</think>').at(-1)!.trim()
+          : msg.content,
+    } as APIMessage;
+  });
+}
+
+export function classNames(classes: Record<string, boolean>): string {
+  return Object.entries(classes)
+    .filter(([_, value]) => value)
+    .map(([key, _]) => key)
+    .join(' ');
+}
+
+export const delay = (ms: number) =>
+  new Promise((resolve) => setTimeout(resolve, ms));
diff --git a/examples/server/webui/src/utils/storage.ts b/examples/server/webui/src/utils/storage.ts
new file mode 100644
index 000000000..8c03fa781
--- /dev/null
+++ b/examples/server/webui/src/utils/storage.ts
@@ -0,0 +1,138 @@
+// coversations is stored in localStorage
+// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
+
+import { CONFIG_DEFAULT } from '../Config';
+import { Conversation, Message } from './types';
+
+const event = new EventTarget();
+
+type CallbackConversationChanged = (convId: string) => void;
+let onConversationChangedHandlers: [
+  CallbackConversationChanged,
+  EventListener,
+][] = [];
+const dispatchConversationChange = (convId: string) => {
+  event.dispatchEvent(
+    new CustomEvent('conversationChange', { detail: { convId } })
+  );
+};
+
+// convId is a string prefixed with 'conv-'
+const StorageUtils = {
+  /**
+   * manage conversations
+   */
+  getAllConversations(): Conversation[] {
+    const res = [];
+    for (const key in localStorage) {
+      if (key.startsWith('conv-')) {
+        res.push(JSON.parse(localStorage.getItem(key) ?? '{}'));
+      }
+    }
+    res.sort((a, b) => b.lastModified - a.lastModified);
+    return res;
+  },
+  /**
+   * can return null if convId does not exist
+   */
+  getOneConversation(convId: string): Conversation | null {
+    return JSON.parse(localStorage.getItem(convId) || 'null');
+  },
+  /**
+   * if convId does not exist, create one
+   */
+  appendMsg(convId: string, msg: Message): void {
+    if (msg.content === null) return;
+    const conv = StorageUtils.getOneConversation(convId) || {
+      id: convId,
+      lastModified: Date.now(),
+      messages: [],
+    };
+    conv.messages.push(msg);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+    dispatchConversationChange(convId);
+  },
+  /**
+   * Get new conversation id
+   */
+  getNewConvId(): string {
+    return `conv-${Date.now()}`;
+  },
+  /**
+   * remove conversation by id
+   */
+  remove(convId: string): void {
+    localStorage.removeItem(convId);
+    dispatchConversationChange(convId);
+  },
+  /**
+   * remove all conversations
+   */
+  filterAndKeepMsgs(
+    convId: string,
+    predicate: (msg: Message) => boolean
+  ): void {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    conv.messages = conv.messages.filter(predicate);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+    dispatchConversationChange(convId);
+  },
+  /**
+   * remove last message from conversation
+   */
+  popMsg(convId: string): Message | undefined {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    const msg = conv.messages.pop();
+    conv.lastModified = Date.now();
+    if (conv.messages.length === 0) {
+      StorageUtils.remove(convId);
+    } else {
+      localStorage.setItem(convId, JSON.stringify(conv));
+    }
+    dispatchConversationChange(convId);
+    return msg;
+  },
+
+  // event listeners
+  onConversationChanged(callback: CallbackConversationChanged) {
+    const fn = (e: Event) => callback((e as CustomEvent).detail.convId);
+    onConversationChangedHandlers.push([callback, fn]);
+    event.addEventListener('conversationChange', fn);
+  },
+  offConversationChanged(callback: CallbackConversationChanged) {
+    const fn = onConversationChangedHandlers.find(([cb, _]) => cb === callback);
+    if (fn) {
+      event.removeEventListener('conversationChange', fn[1]);
+    }
+    onConversationChangedHandlers = [];
+  },
+
+  // manage config
+  getConfig(): typeof CONFIG_DEFAULT {
+    const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
+    // to prevent breaking changes in the future, we always provide default value for missing keys
+    return {
+      ...CONFIG_DEFAULT,
+      ...savedVal,
+    };
+  },
+  setConfig(config: typeof CONFIG_DEFAULT) {
+    localStorage.setItem('config', JSON.stringify(config));
+  },
+  getTheme(): string {
+    return localStorage.getItem('theme') || 'auto';
+  },
+  setTheme(theme: string) {
+    if (theme === 'auto') {
+      localStorage.removeItem('theme');
+    } else {
+      localStorage.setItem('theme', theme);
+    }
+  },
+};
+
+export default StorageUtils;
diff --git a/examples/server/webui/src/utils/types.ts b/examples/server/webui/src/utils/types.ts
new file mode 100644
index 000000000..7cd12b40a
--- /dev/null
+++ b/examples/server/webui/src/utils/types.ts
@@ -0,0 +1,36 @@
+export interface TimingReport {
+  prompt_n: number;
+  prompt_ms: number;
+  predicted_n: number;
+  predicted_ms: number;
+}
+
+export interface Message {
+  id: number;
+  role: 'user' | 'assistant' | 'system';
+  content: string;
+  timings?: TimingReport;
+}
+
+export type APIMessage = Pick<Message, 'role' | 'content'>;
+
+export interface Conversation {
+  id: string; // format: `conv-{timestamp}`
+  lastModified: number; // timestamp from Date.now()
+  messages: Message[];
+}
+
+export type PendingMessage = Omit<Message, 'content'> & {
+  content: string | null;
+};
+
+export enum CanvasType {
+  PY_INTERPRETER,
+}
+
+export interface CanvasPyInterpreter {
+  type: CanvasType.PY_INTERPRETER;
+  content: string;
+}
+
+export type CanvasData = CanvasPyInterpreter;
diff --git a/examples/server/webui/src/vite-env.d.ts b/examples/server/webui/src/vite-env.d.ts
new file mode 100644
index 000000000..11f02fe2a
--- /dev/null
+++ b/examples/server/webui/src/vite-env.d.ts
@@ -0,0 +1 @@
+/// <reference types="vite/client" />
diff --git a/examples/server/webui/tailwind.config.js b/examples/server/webui/tailwind.config.js
new file mode 100644
index 000000000..c43066a19
--- /dev/null
+++ b/examples/server/webui/tailwind.config.js
@@ -0,0 +1,16 @@
+/** @type {import('tailwindcss').Config} */
+export default {
+  content: [
+    "./index.html",
+    "./src/**/*.{js,ts,jsx,tsx}",
+  ],
+  theme: {
+    extend: {},
+  },
+  plugins: [
+    require('daisyui'),
+  ],
+  daisyui: {
+    themes: ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'],
+  }
+}
diff --git a/examples/server/webui/tsconfig.app.json b/examples/server/webui/tsconfig.app.json
new file mode 100644
index 000000000..cb24c26f3
--- /dev/null
+++ b/examples/server/webui/tsconfig.app.json
@@ -0,0 +1,26 @@
+{
+  "compilerOptions": {
+    "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
+    "target": "ES2021",
+    "useDefineForClassFields": true,
+    "lib": ["ES2021", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "noUncheckedSideEffectImports": true
+  },
+  "include": ["src"]
+}
diff --git a/examples/server/webui/tsconfig.json b/examples/server/webui/tsconfig.json
new file mode 100644
index 000000000..1ffef600d
--- /dev/null
+++ b/examples/server/webui/tsconfig.json
@@ -0,0 +1,7 @@
+{
+  "files": [],
+  "references": [
+    { "path": "./tsconfig.app.json" },
+    { "path": "./tsconfig.node.json" }
+  ]
+}
diff --git a/examples/server/webui/tsconfig.node.json b/examples/server/webui/tsconfig.node.json
new file mode 100644
index 000000000..db0becc8b
--- /dev/null
+++ b/examples/server/webui/tsconfig.node.json
@@ -0,0 +1,24 @@
+{
+  "compilerOptions": {
+    "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
+    "target": "ES2022",
+    "lib": ["ES2023"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "noUncheckedSideEffectImports": true
+  },
+  "include": ["vite.config.ts"]
+}
diff --git a/examples/server/webui/vite.config.ts b/examples/server/webui/vite.config.ts
new file mode 100644
index 000000000..b8a0f03d9
--- /dev/null
+++ b/examples/server/webui/vite.config.ts
@@ -0,0 +1,80 @@
+import { defineConfig, PluginOption } from 'vite';
+import react from '@vitejs/plugin-react';
+import { viteSingleFile } from 'vite-plugin-singlefile';
+import path from 'node:path';
+import fs from 'node:fs';
+import zlib from 'node:zlib';
+
+/* eslint-disable */
+
+const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
+
+const GUIDE_FOR_FRONTEND = `
+<!--
+  This is a single file build of the frontend.
+  It is automatically generated by the build process.
+  Do not edit this file directly.
+  To make changes, refer to the "Web UI" section in the README.
+-->
+`.trim();
+
+const FRONTEND_PLUGINS = [react()];
+
+const BUILD_PLUGINS = [
+  ...FRONTEND_PLUGINS,
+  viteSingleFile(),
+  (function llamaCppPlugin() {
+    let config: any;
+    return {
+      name: 'llamacpp:build',
+      apply: 'build',
+      async configResolved(_config: any) {
+        config = _config;
+      },
+      writeBundle() {
+        const outputIndexHtml = path.join(config.build.outDir, 'index.html');
+        const content =
+          GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
+        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), {
+          level: 9,
+        });
+
+        // because gzip header contains machine-specific info, we must remove these data from the header
+        // timestamp
+        compressed[0x4] = 0;
+        compressed[0x5] = 0;
+        compressed[0x6] = 0;
+        compressed[0x7] = 0;
+        // OS
+        compressed[0x9] = 0;
+
+        if (compressed.byteLength > MAX_BUNDLE_SIZE) {
+          throw new Error(
+            `Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
+              `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`
+          );
+        }
+
+        const targetOutputFile = path.join(
+          config.build.outDir,
+          '../../public/index.html.gz'
+        );
+        fs.writeFileSync(targetOutputFile, compressed);
+      },
+    } satisfies PluginOption;
+  })(),
+];
+
+export default defineConfig({
+  // @ts-ignore
+  plugins: process.env.ANALYZE ? FRONTEND_PLUGINS : BUILD_PLUGINS,
+  server: {
+    proxy: {
+      '/v1': 'http://localhost:8080',
+    },
+    headers: {
+      'Cross-Origin-Embedder-Policy': 'require-corp',
+      'Cross-Origin-Opener-Policy': 'same-origin',
+    },
+  },
+});
diff --git a/examples/server_embd.py b/examples/server_embd.py
new file mode 100644
index 000000000..0e34c6cea
--- /dev/null
+++ b/examples/server_embd.py
@@ -0,0 +1,35 @@
+import asyncio
+import asyncio.threads
+import requests
+import numpy as np
+
+
+n = 8
+
+result = []
+
+async def requests_post_async(*args, **kwargs):
+    return await asyncio.threads.to_thread(requests.post, *args, **kwargs)
+
+async def main():
+    model_url = "http://127.0.0.1:6900"
+    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
+        url= f"{model_url}/embedding",
+        json= {"content": str(0)*1024}
+    ) for i in range(n)])
+
+    for response in responses:
+        embedding = response.json()["embedding"]
+        print(embedding[-8:])
+        result.append(embedding)
+
+asyncio.run(main())
+
+# compute cosine similarity
+
+for i in range(n-1):
+    for j in range(i+1, n):
+        embedding1 = np.array(result[i])
+        embedding2 = np.array(result[j])
+        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+        print(f"Similarity between {i} and {j}: {similarity:.2f}")
diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt
new file mode 100644
index 000000000..567f7fbbb
--- /dev/null
+++ b/examples/simple-chat/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-simple-chat)
+add_executable(${TARGET} simple-chat.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple-chat/README.md b/examples/simple-chat/README.md
new file mode 100644
index 000000000..f0099ce3d
--- /dev/null
+++ b/examples/simple-chat/README.md
@@ -0,0 +1,7 @@
+# llama.cpp/example/simple-chat
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
+
+```bash
+./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
+...
diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp
new file mode 100644
index 000000000..c5534cc13
--- /dev/null
+++ b/examples/simple-chat/simple-chat.cpp
@@ -0,0 +1,206 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // load dynamic backends
+    ggml_backend_load_all();
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_vocab_is_eog(vocab, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}
diff --git a/examples/main-cmake-pkg/.gitignore b/examples/simple-cmake-pkg/.gitignore
similarity index 99%
rename from examples/main-cmake-pkg/.gitignore
rename to examples/simple-cmake-pkg/.gitignore
index e32c11c7f..67c01d64c 100644
--- a/examples/main-cmake-pkg/.gitignore
+++ b/examples/simple-cmake-pkg/.gitignore
@@ -48,4 +48,3 @@
 build*/
 out/
 tmp/
-
diff --git a/examples/simple-cmake-pkg/CMakeLists.txt b/examples/simple-cmake-pkg/CMakeLists.txt
new file mode 100644
index 000000000..128e38c8f
--- /dev/null
+++ b/examples/simple-cmake-pkg/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.12)
+project(llama-simple-cmake-pkg)
+
+set(TARGET llama-simple-cmake-pkg)
+
+find_package(Llama REQUIRED)
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple-cmake-pkg/README.md b/examples/simple-cmake-pkg/README.md
new file mode 100644
index 000000000..8b30049e2
--- /dev/null
+++ b/examples/simple-cmake-pkg/README.md
@@ -0,0 +1,34 @@
+# llama.cpp/example/simple-cmake-pkg
+
+This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+
+## Building
+
+Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
+
+### Considerations
+
+When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package
+
+### Build llama.cpp and install to llama.cpp/inst
+
+```sh
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+cmake -S . -B build
+cmake --build build
+cmake --install build --prefix inst
+
+### Build simple-cmake-pkg
+
+```sh
+cd examples/simple-cmake-pkg
+cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake
+cmake --build build
+```
+
+### Run simple-cmake-pkg
+
+```sh
+./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+```
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 7da5ff6f3..104ecabfd 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET simple)
+set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple/README.md b/examples/simple/README.md
index 5d24b1046..937008b24 100644
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -3,7 +3,7 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
 
 ```bash
-./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
 
 ...
 
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 39e2d8ea4..10e79a0a6 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,183 +1,206 @@
-#include "common.h"
 #include "llama.h"
-
-#include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <string>
 #include <vector>
 
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
+}
+
 int main(int argc, char ** argv) {
-    gpt_params params;
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
-        return 1 ;
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
     }
 
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
+    // load dynamic backends
 
-    if (argc >= 3) {
-        params.prompt = argv[2];
-    }
-
-    if (params.prompt.empty()) {
-        params.prompt = "Hello my name is";
-    }
-
-    // total length of the sequence including the prompt
-    const int n_len = 32;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
+    ggml_backend_load_all();
 
     // initialize the model
 
     llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
 
-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    // tokenize the prompt
+
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+
     // initialize the context
 
     llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;
 
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
-    // tokenize the prompt
+    // initialize the sampler
 
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    const int n_ctx    = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
-
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
-        return 1;
-    }
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
     // print the prompt token-by-token
 
-    fprintf(stderr, "\n");
-
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }
 
-    fflush(stderr);
+    // prepare a batch for the prompt
 
-    // create a llama_batch with size 512
-    // we use this object to submit token data for decoding
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-    }
-
-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
-
-    if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
     // main loop
 
-    int n_cur    = batch.n_tokens;
-    int n_decode = 0;
-
     const auto t_main_start = ggml_time_us();
+    int n_decode = 0;
+    llama_token new_token_id;
 
-    while (n_cur <= n_len) {
-        // sample the next token
-        {
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
-                LOG_TEE("\n");
-
-                break;
-            }
-
-            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-            fflush(stdout);
-
-            // prepare the next batch
-            llama_batch_clear(batch);
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
-
-            n_decode += 1;
-        }
-
-        n_cur += 1;
-
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
+
+        n_pos += batch.n_tokens;
+
+        // sample the next token
+        {
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_vocab_is_eog(vocab, new_token_id)) {
+                break;
+            }
+
+            char buf[128];
+            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
+            fflush(stdout);
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+
+            n_decode += 1;
+        }
     }
 
-    LOG_TEE("\n");
+    printf("\n");
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx);
-
+    fprintf(stderr, "\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
     fprintf(stderr, "\n");
 
-    llama_batch_free(batch);
-
+    llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
+    llama_model_free(model);
 
     return 0;
 }
diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt
new file mode 100644
index 000000000..aeaea74fc
--- /dev/null
+++ b/examples/speculative-simple/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-speculative-simple)
+add_executable(${TARGET} speculative-simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/speculative-simple/README.md b/examples/speculative-simple/README.md
new file mode 100644
index 000000000..e3a6c6b4a
--- /dev/null
+++ b/examples/speculative-simple/README.md
@@ -0,0 +1,12 @@
+# llama.cpp/examples/speculative-simple
+
+Demonstration of basic greedy speculative decoding
+
+```bash
+./bin/llama-speculative-simple \
+    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
+    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
+    -f test.txt -c 0 -ngl 99 --color \
+    --sampling-seq k --top-k 1 -fa --temp 0.0 \
+    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+```
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
new file mode 100644
index 000000000..403ba2dd2
--- /dev/null
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -0,0 +1,261 @@
+#include "arg.h"
+#include "common.h"
+#include "sampling.h"
+#include "speculative.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+        return 1;
+    }
+
+    if (params.n_predict < -1) {
+        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
+        return 1;
+    }
+
+    common_init();
+
+    if (params.speculative.model.empty()) {
+        LOG_ERR("%s: --model-draft is required\n", __func__);
+        return 1;
+    }
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model_tgt = NULL;
+    //llama_model * model_dft = NULL;
+
+    llama_context * ctx_tgt = NULL;
+    llama_context * ctx_dft = NULL;
+
+    // load the target model
+    common_init_result llama_init_tgt = common_init_from_params(params);
+
+    model_tgt = llama_init_tgt.model.get();
+    ctx_tgt   = llama_init_tgt.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
+
+    // load the draft model
+    params.devices      = params.speculative.devices;
+    params.model        = params.speculative.model;
+    params.n_ctx        = params.speculative.n_ctx;
+    params.n_batch      = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
+    }
+
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+    common_init_result llama_init_dft = common_init_from_params(params);
+
+    //model_dft = llama_init_dft.model.get();
+    ctx_dft   = llama_init_dft.context.get();
+
+    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
+        return 1;
+    }
+
+    // Tokenize the prompt
+    std::vector<llama_token> inp;
+    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
+
+    if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
+        LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
+
+        return 1;
+    }
+
+    if (llama_n_batch(ctx_tgt) < (uint32_t) inp.size()) {
+        LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
+
+        return 1;
+    }
+
+    LOG("\n\n");
+
+    for (auto id : inp) {
+        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
+    }
+
+    // how many tokens to draft each time
+    int n_draft     = params.speculative.n_max;
+    int n_draft_min = params.speculative.n_min;
+
+    float p_min = params.speculative.p_min;
+
+    int n_predict = 0;
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    // used to determine end of generation
+    bool has_eos = false;
+
+    // ================================================
+    // everything until here is standard initialization
+    // the relevant stuff for speculative decoding starts here
+
+    const auto t_enc_start = ggml_time_us();
+
+    // target model sampling context
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
+
+    // eval the prompt
+    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
+
+    // note: keep the last token separate!
+    llama_token id_last = inp.back();
+
+    // all tokens currently in the target context
+    llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
+    prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
+
+    int n_past = inp.size() - 1;
+
+    // init the speculator
+    struct common_speculative_params params_spec;
+    params_spec.n_draft = n_draft;
+    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
+    params_spec.p_min   = p_min;
+
+    struct common_speculative * spec = common_speculative_init(ctx_dft);
+
+    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
+
+    const auto t_enc_end = ggml_time_us();
+
+    const auto t_dec_start = ggml_time_us();
+
+    while (true) {
+        // optionally, generate draft tokens that can be appended to the target batch
+        //
+        // this is the most important part of the speculation. the more probable tokens that are provided here
+        // the better the performance will be. in theory, this computation can be performed asynchronously and even
+        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
+        // from a cache or lookup tables.
+        //
+        llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
+
+        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
+
+        // always have a token to evaluate from before - id_last
+        common_batch_clear(batch_tgt);
+        common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);
+
+        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
+        {
+            // do not waste time on small drafts
+            if (draft.size() < (size_t) n_draft_min) {
+                draft.clear();
+            }
+
+            for (size_t i = 0; i < draft.size(); ++i) {
+                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+            }
+
+            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
+
+            llama_decode(ctx_tgt, batch_tgt);
+        }
+
+        // sample from the full target batch and return the accepted tokens based on the target sampler
+        //
+        // for each token to be accepted, the sampler would have to sample that same token
+        // in such cases, instead of decoding the sampled token as we normally do, we simply continue with the
+        // available logits from the batch and sample the next token until we run out of logits or the sampler
+        // disagrees with the draft
+        //
+        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
+
+        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
+
+        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
+
+        n_past    += ids.size() - 1;
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
+        n_accept  += ids.size() - 1;
+        n_predict += ids.size();
+
+        // process the accepted tokens and update contexts
+        //
+        // this is the standard token post-processing that we normally do
+        // in this case, we do it for a group of accepted tokens at once
+        //
+        for (size_t i = 0; i < ids.size(); ++i) {
+            prompt_tgt.push_back(id_last);
+
+            id_last = ids[i];
+
+            if (llama_vocab_is_eog(vocab, id_last)) {
+                has_eos = true;
+                break;
+            }
+
+            const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
+
+            if (params.use_color && i + 1 < ids.size()) {
+                LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
+            } else {
+                LOG("%s", token_str.c_str());
+            }
+        }
+
+        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
+
+        {
+            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
+
+            llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+        }
+
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            break;
+        }
+    }
+
+    auto t_dec_end = ggml_time_us();
+
+    const int n_input = inp.size();
+
+    LOG("\n\n");
+
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+
+    LOG_INF("\n");
+    LOG_INF("n_draft   = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    LOG_INF("\n");
+    LOG_INF("draft:\n\n");
+
+    llama_perf_context_print(ctx_dft);
+
+    LOG_INF("\n");
+    LOG_INF("target:\n\n");
+    common_perf_print(ctx_tgt, smpl);
+
+    common_sampler_free(smpl);
+    common_speculative_free(spec);
+
+    llama_backend_free();
+
+    LOG("\n\n");
+
+    return 0;
+}
diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt
index 810f3c46a..c84196bd9 100644
--- a/examples/speculative/CMakeLists.txt
+++ b/examples/speculative/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET speculative)
+set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/speculative/README.md b/examples/speculative/README.md
index 814efa592..a6608c5fe 100644
--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -6,3 +6,4 @@ More info:
 
 - https://github.com/ggerganov/llama.cpp/pull/2926
 - https://github.com/ggerganov/llama.cpp/pull/3624
+- https://github.com/ggerganov/llama.cpp/pull/5625
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 3848791d4..c7ccea50d 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,12 +1,18 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
-#include <cmath>
+#include <algorithm>
 #include <cstdio>
+#include <cstring>
+#include <random>
+#include <set>
 #include <string>
 #include <vector>
 
-#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
 
 struct seq_draft {
@@ -18,36 +24,41 @@ struct seq_draft {
     std::vector<int> i_batch_tgt;
 
     std::vector<llama_token> tokens;
+    std::vector<std::vector<llama_token_data>> dists;
 
-    struct llama_sampling_context * ctx_sampling;
+    struct common_sampler * smpl = nullptr;
 };
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (gpt_params_parse(argc, argv, params) == false) {
+    // needed to get candidate probs even for temp <= 0.0
+    params.sampling.n_probs = 128;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
     }
 
-    if (params.model_draft.empty()) {
-        fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
+    if (params.n_predict < -1) {
+        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
+        return 1;
+    }
+
+    common_init();
+
+    if (params.speculative.model.empty()) {
+        LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
 
     // max number of parallel drafting sequences (i.e. tree branches)
     const int n_seq_dft = params.n_parallel;
 
-    // probability threshold for accepting a token from the draft model
-    const float p_accept = params.p_accept;
-
     // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
-    const float p_split  = params.p_split;
+    const float p_draft_split = params.speculative.p_split;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("speculative", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+    std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
+    std::uniform_real_distribution<> u_dist;
 
     // init llama.cpp
     llama_backend_init();
@@ -60,40 +71,72 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;
 
     // load the target model
-    params.logits_all = true;
-    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
+    common_init_result llama_init_tgt = common_init_from_params(params);
+
+    model_tgt = llama_init_tgt.model.get();
+    ctx_tgt   = llama_init_tgt.context.get();
 
     // load the draft model
-    params.model = params.model_draft;
-    params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    params.devices = params.speculative.devices;
+    params.model = params.speculative.model;
+    params.n_gpu_layers = params.speculative.n_gpu_layers;
+    if (params.speculative.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
+    }
+
+    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+    common_init_result llama_init_dft = common_init_from_params(params);
+
+    model_dft = llama_init_dft.model.get();
+    ctx_dft   = llama_init_dft.context.get();
+
+    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
+    LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
+        LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        return 1;
+    }
+
+    if (
+        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
+    ) {
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        return 1;
     }
-    params.n_threads_batch = params.n_threads_batch_draft;
-    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
     {
-        const int n_vocab_tgt = llama_n_vocab(model_tgt);
-        const int n_vocab_dft = llama_n_vocab(model_dft);
+        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
+        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
         const int vocab_diff  = n_vocab_tgt > n_vocab_dft
             ? n_vocab_tgt - n_vocab_dft
             : n_vocab_dft - n_vocab_tgt;
 
         if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
-            fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
-                    n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+            LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
             return 1;
         }
 
         for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
-            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
-            const char * token_text_dft = llama_token_get_text(model_dft, i);
+            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
+            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
-                fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
-                        llama_token_to_piece(ctx_tgt, i).c_str(),
-                        llama_token_to_piece(ctx_dft, i).c_str());
+                LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
+                LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
                 return 1;
             }
         }
@@ -101,53 +144,39 @@ int main(int argc, char ** argv) {
 
 
     // Tokenize the prompt
-    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
-    LOG("add_bos tgt: %d\n", add_bos_tgt);
-
-    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
-    LOG("add_bos dft: %d\n", add_bos_dft);
-
-    if (add_bos_tgt != add_bos_dft) {
-        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
-        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
-        return 1;
-    }
-
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
+    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
 
     const int max_context_size     = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
+        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
     }
 
-    fflush(stderr);
-
     const int n_input = inp.size();
 
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt with both models
-    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
-    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input,     0,           0));
+    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1));
+    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
 
     const auto t_enc_end = ggml_time_us();
 
     // the 2 models should have the same vocab
-    //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
+    //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));
 
     // how many tokens to draft each time
-    int n_draft = params.n_draft;
+    int n_draft = params.speculative.n_max;
 
     int n_predict = 0;
     int n_drafted = 0;
@@ -159,21 +188,19 @@ int main(int argc, char ** argv) {
     // used to determine end of generation
     bool has_eos = false;
 
-    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    // target model sampling context (reuse the llama_context's sampling instance)
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
 
     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
 
-    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    params.sparams.temp = -1.0f;    // force greedy sampling with probs for the draft model
-
     for (int s = 0; s < n_seq_dft; ++s) {
-        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
+        // allocate llama_sampler for each draft sequence
+        drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
     }
 
-    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
+    llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
+    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
 
     const auto t_dec_start = ggml_time_us();
 
@@ -182,84 +209,216 @@ int main(int argc, char ** argv) {
     drafts[0].i_batch_tgt[0] = 0;
 
     while (true) {
+        std::set<int> active_seqs = {};
+
         // print current draft sequences
         for (int s = 0; s < n_seq_dft; ++s) {
             if (!drafts[s].active) {
                 continue;
             }
 
+            active_seqs.insert(s);
             const auto & tokens = drafts[s].tokens;
 
-            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
+            LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
         }
 
         int i_dft  = 0;
         int s_keep = 0;
 
+        llama_token token_id;
+        std::string token_str;
+
+        // loop until we fail to accept a drafted token or we run out of drafted tokens
         while (true) {
-            LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-
-            // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
-
-            llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
-
-            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
-
-            const std::string token_str = llama_token_to_piece(ctx_tgt, id);
-
-            if (!params.use_color) {
-                printf("%s", token_str.c_str());
-            }
-
-            if (id == llama_token_eos(model_tgt)) {
-                has_eos = true;
-            }
-
-            ++n_predict;
 
             // check if the target token matches any of the drafts
+            // for stochastic sampling, attempt to match the token with the drafted tokens
             {
-                bool matches = false;
+                bool accept = false;
+                if (params.sampling.temp > 0) {
+                    // stochastic verification
+                    common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
 
-                for (int s = 0; s < n_seq_dft; ++s) {
-                    if (!drafts[s].active) {
-                        continue;
+                    auto & dist_tgt = *common_sampler_get_candidates(smpl);
+
+                    float p_tgt = 0.0f;
+                    float p_dft = 0.0f;
+
+                    while (active_seqs.size() > 0) {
+                        // randomly select a sequence to verify from active sequences
+                        std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
+                        int s = *std::next(active_seqs.begin(), u_int_dist(rng));
+                        if (i_dft >= (int) drafts[s].tokens.size()) {
+                            drafts[s].active = false;
+                            active_seqs.erase(s);
+                            continue;
+                        }
+                        if (accept) {
+                            // if we already accepted a token, we can skip the rest
+                            if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
+                                drafts[s].active = false;
+                                active_seqs.erase(s);
+                            }
+                            continue;
+                        }
+
+                        LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+                        float r = u_dist(rng);
+                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
+
+                        //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
+
+                        // acquire the token probabilities assigned by the draft and target models
+                        for (size_t i = 0; i < dist_tgt.size; i++) {
+                            if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
+                                p_tgt = dist_tgt.data[i].p;
+                                break;
+                            }
+                        }
+                        for (size_t i = 0; i < dist_dft.size; i++) {
+                            if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
+                                p_dft = dist_dft.data[i].p;
+                                break;
+                            }
+                        }
+                        LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+                        if (r <= p_tgt / p_dft) {
+                            s_keep = s;
+                            accept = true;
+                            token_id = drafts[s].tokens[i_dft];
+                            token_str = common_token_to_piece(ctx_tgt, token_id);
+                            common_sampler_accept(smpl, token_id, true);
+
+                            LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+                            break;
+                        } else {
+                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+                            drafts[s].active = false;
+
+                            // calculate residual probability
+                            GGML_ASSERT(dist_tgt.sorted);
+                            GGML_ASSERT(dist_dft.sorted);
+
+                            // sort dist by id
+                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
+                                return a.id < b.id;
+                            });
+                            std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
+                                return a.id < b.id;
+                            });
+
+                            float sum_probs = 0.0f;
+
+                            for (size_t i = 0; i < dist_tgt.size; i++) {
+                                if (i < dist_dft.size) {
+                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
+                                } else {
+                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
+                                }
+
+                                sum_probs += dist_tgt.data[i].p;
+                            }
+
+                            for (size_t i = 0; i < dist_tgt.size; i++) {
+                                dist_tgt.data[i].p /= sum_probs;
+                            }
+
+                            // sort dist_tgt by p desc
+                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
+                                return a.p > b.p;
+                            });
+                        }
+
+                        active_seqs.erase(s);
+                        for(int i = 0; i < n_seq_dft; i++) {
+                            if (i == s) {
+                                continue;
+                            }
+                            if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+                                // synchronize active status for sequences with the same drafted token
+                                drafts[i].active = drafts[i].active && accept;
+                                if (!drafts[i].active) {
+                                    active_seqs.erase(s);
+                                }
+                            }
+                        }
                     }
 
-                    if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
-                        LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
+                    if (!accept) {
+                        // all drafted tokens were rejected
+                        // sample from the target model
+                        LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
+                        std::vector<float> probs(dist_tgt.size);
+                        for (size_t i = 0; i < dist_tgt.size; ++i) {
+                            probs[i] = dist_tgt.data[i].p;
+                        }
 
-                        s_keep = s;
-                        matches = true;
-                    } else {
-                        drafts[s].active = false;
+                        std::discrete_distribution<> dist(probs.begin(), probs.end());
+
+                        const int idx = dist(rng);
+
+                        token_id = dist_tgt.data[idx].id;
+                        common_sampler_accept(smpl, token_id, true);
+                        token_str = common_token_to_piece(ctx_tgt, token_id);
+                    }
+                } else {
+                    // greedy verification
+
+                    // sample from the target model
+                    LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+                    token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
+
+                    common_sampler_accept(smpl, token_id, true);
+
+                    token_str = common_token_to_piece(ctx_tgt, token_id);
+
+                    for (int s = 0; s < n_seq_dft; ++s) {
+                        if (!drafts[s].active) {
+                            continue;
+                        }
+
+                        if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
+                            LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+
+                            s_keep = s;
+                            accept = true;
+                        } else {
+                            drafts[s].active = false;
+                        }
                     }
                 }
 
-                if (matches) {
+                if (llama_vocab_is_eog(vocab_tgt, token_id)) {
+                    has_eos = true;
+                }
+                ++n_predict;
+
+                if (accept) {
                     ++n_accept;
                     ++n_past_tgt;
                     ++n_past_dft;
                     ++i_dft;
                     if (params.use_color) {
                         // Color token according to its origin sequence
-                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
-                        fflush(stdout);
+                        LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                    } else {
+                        LOG("%s", token_str.c_str());
                     }
                     continue;
+                } else {
+                    LOG("%s", token_str.c_str());
+                    break;
                 }
             }
-            if (params.use_color) {
-                printf("%s", token_str.c_str());
-            }
-            fflush(stdout);
+        }
 
-            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+        {
+            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
 
             // TODO: simplify
             {
-                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+                LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
 
                 llama_kv_cache_seq_keep(ctx_dft, s_keep);
                 llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
@@ -275,28 +434,31 @@ int main(int argc, char ** argv) {
                 drafts[s].active = false;
                 drafts[s].tokens.clear();
                 drafts[s].i_batch_tgt.clear();
+                drafts[s].dists.clear();
             }
             // note: will be erased after the speculation phase
-            drafts[0].tokens.push_back(id);
+            drafts[0].tokens.push_back(token_id);
+            drafts[0].dists.push_back(std::vector<llama_token_data>());
             drafts[0].i_batch_tgt.push_back(0);
 
-            llama_batch_clear(batch_dft);
-            llama_batch_add  (batch_dft, id, n_past_dft, { 0 }, true);
+            common_batch_clear(batch_dft);
+            common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
 
             llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
-            llama_decode         (ctx_dft, batch_dft);
+            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+            llama_decode(ctx_dft, batch_dft);
 
             ++n_past_dft;
+        }
 
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
             break;
         }
 
-        if (n_predict > params.n_predict || has_eos) {
-            break;
+        if (drafts[0].smpl) {
+            common_sampler_free(drafts[0].smpl);
         }
-
-        llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
+        drafts[0].smpl = common_sampler_clone(smpl);
 
         int n_seq_cur  = 1;
         int n_past_cur = n_past_dft;
@@ -309,8 +471,8 @@ int main(int argc, char ** argv) {
         drafts[0].drafting    = true;
         drafts[0].i_batch_dft = 0;
 
-        llama_batch_clear(batch_tgt);
-        llama_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
+        common_batch_clear(batch_tgt);
+        common_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
 
         // sample n_draft tokens from the draft model using tree-based sampling
         for (int i = 0; i < n_draft; ++i) {
@@ -325,27 +487,21 @@ int main(int argc, char ** argv) {
                     continue;
                 }
 
-                llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
+                common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
 
-                const auto & cur_p = drafts[s].ctx_sampling->cur;
+                const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
 
-                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
-                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                            k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
-                }
-
-                if (cur_p[0].p < p_accept) {
-                    LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
-                    drafts[s].drafting = false;
-                    continue;
+                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
+                    LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                 }
 
                 std::vector<int> sa(1, s);
 
                 // attempt to split the branch if the probability is high enough
                 for (int f = 1; f < 8; ++f) {
-                    if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
-                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
+                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
+                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
                         llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
                         llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@@ -367,10 +523,14 @@ int main(int argc, char ** argv) {
                         drafts[n_seq_cur].skip     = true;
 
                         drafts[n_seq_cur].tokens      = drafts[s].tokens;
+                        drafts[n_seq_cur].dists       = drafts[s].dists;
                         drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                         drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
 
-                        llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
+                        if (drafts[n_seq_cur].smpl) {
+                            common_sampler_free(drafts[n_seq_cur].smpl);
+                        }
+                        drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);
 
                         sa.push_back(n_seq_cur);
 
@@ -382,23 +542,25 @@ int main(int argc, char ** argv) {
 
                 // add drafted token for each sequence
                 for (int is = 0; is < (int) sa.size(); ++is) {
-                    const llama_token id = cur_p[is].id;
+                    const llama_token id = cur_p->data[is].id;
 
                     const int s = sa[is];
 
-                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
+                    common_sampler_accept(drafts[s].smpl, id, true);
 
                     drafts[s].tokens.push_back(id);
+                    // save cur_p.data into drafts[s].dists
+                    drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
 
                     // add unique drafted tokens to the target batch
                     drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
 
-                    llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+                    common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
 
                     // add the token to the batch for batched decoding with the draft model
                     drafts[s].i_batch_dft = batch_dft.n_tokens;
 
-                    llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
+                    common_batch_add(batch_dft, id, n_past_cur, { s }, true);
 
                     if (batch_tgt.n_tokens > n_draft) {
                         drafts[s].drafting = false;
@@ -428,7 +590,7 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
             }
 
-            // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
             llama_decode(ctx_tgt, batch_tgt);
             ++n_past_tgt;
         }
@@ -440,45 +602,43 @@ int main(int argc, char ** argv) {
             }
 
             drafts[s].tokens.erase(drafts[s].tokens.begin());
+            drafts[s].dists.erase(drafts[s].dists.begin());
         }
     }
 
     auto t_dec_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    LOG("\n\n");
 
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
 
-    LOG_TEE("\n");
-    LOG_TEE("n_draft   = %d\n", n_draft);
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_drafted = %d\n", n_drafted);
-    LOG_TEE("n_accept  = %d\n", n_accept);
-    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_INF("\n");
+    LOG_INF("n_draft   = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ndraft:\n");
-    llama_print_timings(ctx_dft);
+    LOG_INF("\n");
+    LOG_INF("draft:\n\n");
+    // TODO: print sampling/grammar timings for all drafts
+    llama_perf_context_print(ctx_dft);
 
-    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx_tgt);
+    LOG_INF("\n");
+    LOG_INF("target:\n\n");
+    common_perf_print(ctx_tgt, smpl);
 
-    llama_sampling_free(ctx_sampling);
+    common_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
-        llama_sampling_free(drafts[s].ctx_sampling);
+        common_sampler_free(drafts[s].smpl);
     }
 
     llama_batch_free(batch_dft);
 
-    llama_free(ctx_tgt);
-    llama_free_model(model_tgt);
-
-    llama_free(ctx_dft);
-    llama_free_model(model_dft);
-
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index 69cf8932e..e4d5083e6 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -2,7 +2,7 @@
 #  Copyright (C) 2024 Intel Corporation
 #  SPDX-License-Identifier: MIT
 
-set(TARGET ls-sycl-device)
+set(TARGET llama-ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
index b46f17f39..8819d87f5 100644
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -1,20 +1,20 @@
 # llama.cpp/example/sycl
 
-This example program provide the tools for llama.cpp for SYCL on Intel GPU.
+This example program provides the tools for llama.cpp for SYCL on Intel GPU.
 
 ## Tool
 
 |Tool Name| Function|Status|
 |-|-|-|
-|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
+|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
 
-### ls-sycl-device
+### llama-ls-sycl-device
 
 List all SYCL devices with ID, compute capability, max work group size, ect.
 
-1. Build the llama.cpp for SYCL for all targets.
+1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.
 
-2. Enable oneAPI running environment
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*
 
 ```
 source /opt/intel/oneapi/setvars.sh
@@ -23,25 +23,19 @@ source /opt/intel/oneapi/setvars.sh
 3. Execute
 
 ```
-./build/bin/ls-sycl-device
+./build/bin/llama-ls-sycl-device
 ```
 
 Check the ID in startup log, like:
 
 ```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+found 2 SYCL devices:
+|  |                   |                                       |       |Max    |        |Max  |Global |                     |
+|  |                   |                                       |       |compute|Max work|sub  |mem    |                     |
+|ID|        Device Type|                                   Name|Version|units  |group   |group|size   |       Driver version|
+|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
+| 0| [level_zero:gpu:0]|                Intel Arc A770 Graphics|    1.3|    512|    1024|   32| 16225M|            1.3.29138|
+| 1| [level_zero:gpu:1]|                 Intel UHD Graphics 750|    1.3|     32|     512|   32| 62631M|            1.3.29138|
 
 ```
 
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
index 26ad2f7da..8fe0a6790 100755
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -8,13 +8,16 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
 
 #for FP32
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
-#build example/main only
+#build example/main
 #cmake --build . --config Release --target main
 
+#build example/llama-bench
+#cmake --build . --config Release --target llama-bench
+
 #build all binary
-cmake --build . --config Release -v
+cmake --build . --config Release -j -v
diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
index 52442e4ca..74a8b7fd8 100644
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -7,7 +7,7 @@
 
 #include "ggml-sycl.h"
 
-int main(int argc, char ** argv) {
+int main() {
     ggml_backend_sycl_print_sycl_devices();
     return 0;
 }
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index f5f4c1e98..3b9ba3b2d 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -4,16 +4,24 @@
 #  Copyright (C) 2024 Intel Corporation
 #  SPDX-License-Identifier: MIT
 
-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
-if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
-else
-    export GGML_SYCL_DEVICE=0
-fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
 
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=33
+CONEXT=8192
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    echo "use $GGML_SYCL_DEVICE as main GPU"
+    #use signle GPU only
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
+fi
diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat
index f9d43f8ed..17dd1ff5c 100644
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -3,21 +3,31 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT
 
-mkdir -p build
+
+IF not exist build (mkdir build)
 cd build
+if %errorlevel% neq 0 goto ERROR
+
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+if %errorlevel% neq 0 goto ERROR
 
 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" ..  -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 
 ::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-
-
+cmake -G "Ninja" ..  -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main
 
 ::  build all binary
-make -j
+cmake --build . -j
+if %errorlevel% neq 0 goto ERROR
+
 cd ..
+exit /B 0
+
+:ERROR
+echo comomand error: %errorlevel%
+exit /B %errorlevel%
diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat
index cf621c675..c2918d6dc 100644
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -6,8 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
 
-set GGML_SYCL_DEVICE=0
-rem set GGML_SYCL_DEBUG=1
-.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
-
-
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt
index 5e6654d7e..1690b53e5 100644
--- a/examples/tokenize/CMakeLists.txt
+++ b/examples/tokenize/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET tokenize)
+set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index d95a92475..7375759eb 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -1,44 +1,416 @@
 #include "common.h"
+//#include "log.h" // TODO: start using log.h
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
+#include <cstring>
+#include <fstream>
 #include <string>
 #include <vector>
+#include <iostream> // TODO: remove me
 
-int main(int argc, char ** argv) {
-    if (argc < 3 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <shellapi.h>   // For CommandLineToArgvW
+#endif
+
+static void print_usage_information(const char * argv0) {
+    printf("usage: %s [options]\n\n", argv0);
+    printf("The tokenize program tokenizes a prompt using a given model,\n");
+    printf("and prints the resulting tokens to standard output.\n\n");
+    printf("It needs a model file, a prompt, and optionally other flags\n");
+    printf("to control the behavior of the tokenizer.\n\n");
+    printf("    The possible options are:\n");
+    printf("\n");
+    printf("    -h, --help                           print this help and exit\n");
+    printf("    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    printf("    --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    printf("                                         The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    printf("    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    printf("    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    printf("    --stdin                              read prompt from standard input.\n");
+    printf("    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    printf("    --no-escape                          do not escape input (such as \\n, \\t, etc.).\n");
+    printf("    --no-parse-special                   do not parse control tokens.\n");
+    printf("    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    printf("    --show-count                         print the total number of tokens.\n");
+}
+
+static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) text;
+    (void) user_data;
+}
+
+static std::string read_prompt_from_file(const char * filepath, bool & success) {
+    success = false;
+
+    std::ifstream in(filepath, std::ios::binary);
+    if (!in) {
+        fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
+        return std::string();
+    }
+    // do not assume the file is seekable (e.g. /dev/stdin)
+    std::stringstream buffer;
+    buffer << in.rdbuf();
+    if (in.fail()) {
+        fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
+        return std::string();
+    }
+
+    success = true;
+    return buffer.str();
+}
+
+//
+// Function: ingest_args(...) -> vector<string>
+//
+//  Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
+//  strings, as an STL vector<string>.
+//
+//  In particular, it handles character encoding shenanigans on Windows.
+//
+// Note: raw_argc and raw_argv are not actually read at all on Windows.
+//       On Windows we call GetCommandLineW to get the arguments in wchar_t
+//       format, ignoring the regular argc/argv arguments to main().
+//
+// TODO: potential opportunity to roll common stuff into common/console.cpp
+//       in relation to Windows wchar_t shenanigans.
+static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
+    std::vector<std::string> argv;
+
+    // Handle Windows, if given non-ASCII arguments.
+    // We convert wchar_t arguments into UTF-8 char* on this platform.
+    // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
+    // without throwing tantrums.
+#if defined(_WIN32)
+    int argc;
+    const LPWSTR cmdline_wargv = GetCommandLineW();
+    LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
+
+    // silence unused arg warnings
+    (void) raw_argc;
+    (void) raw_argv;
+
+    for (int i = 0; i < argc; ++i) {
+        int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
+        char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
+        GGML_ASSERT(output_buf);
+
+        WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
+        output_buf[length_needed] = '\0';
+
+        argv.push_back(output_buf);
+        free(output_buf);
+    }
+
+    LocalFree((HLOCAL) wargv);
+#else
+    int argc = raw_argc;
+    for (int i = 0; i < argc; ++i) {
+        argv.push_back(raw_argv[i]);
+    }
+#endif
+
+    GGML_ASSERT((unsigned int) argc == argv.size());
+
+    return argv;
+}
+
+//
+// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
+//
+// writes a string to standard output; taking into account that on Windows
+// to display correctly you have to use special handling. Works even if the
+// user has not set a unicode code page on a Windows cmd.exe.
+//
+// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
+// a human-readable is written instead.
+//
+// On non-Windows systems, simply printfs() the string.
+static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
+        invalid_utf8 = false;
+
+#if defined(_WIN32)
+        // Are we in a console?
+        HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+        DWORD dwMode = 0;
+
+        // According to Microsoft docs:
+        // "WriteConsole fails if it is used with a standard handle that is redirected to a file."
+        // Also according to the docs, you can use GetConsoleMode to check for that.
+        if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
+            printf("%s", str);
+            return;
+        }
+
+        // MultiByteToWideChar reports an error if str is empty, don't report
+        // them as invalid_utf8.
+        if (*str == 0) {
+            return;
+        }
+        int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
+        if (length_needed == 0) {
+            DWORD err = GetLastError();
+            if (err == ERROR_NO_UNICODE_TRANSLATION) {
+                invalid_utf8 = true;
+                int len = strlen(str);
+                printf("<");
+                for (int i = 0; i < len; ++i) {
+                    if (i > 0) {
+                        printf(" ");
+                    }
+                    printf("%02x", (uint8_t) str[i]);
+                }
+                printf(">");
+                return;
+            }
+            GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
+        }
+
+        LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
+        GGML_ASSERT(wstr);
+
+        MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
+        WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
+
+        free(wstr);
+#else
+        // TODO: reporting invalid_utf8 would be useful on non-Windows too.
+        // printf will silently just write bad unicode.
+        printf("%s", str);
+#endif
+}
+
+int main(int raw_argc, char ** raw_argv) {
+    const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
+    const int argc = argv.size();
+
+    if (argc <= 1) {
+        print_usage_information(argv[0].c_str());
         return 1;
     }
 
-    const char * model_path = argv[1];
-    const char * prompt     = argv[2];
+    //////
+    // Read out all the command line arguments.
+    //////
 
-    const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
+    // variables where to put any arguments we see.
+    bool printing_ids = false;
+    bool no_bos = false;
+    bool no_escape = false;
+    bool no_parse_special = false;
+    bool disable_logging = false;
+    bool show_token_count = false;
+    const char * model_path = NULL;
+    const char * prompt_path = NULL;
+    const char * prompt_arg = NULL;
+
+    // track which arguments were explicitly given
+    // used for sanity checking down the line
+    bool model_path_set = false;
+    bool prompt_path_set = false;
+    bool prompt_set = false;
+    bool stdin_set = false;
+
+    int iarg = 1;
+    for (; iarg < argc; ++iarg) {
+        std::string arg{argv[iarg]};
+        if (arg == "-h" || arg == "--help") {
+            print_usage_information(argv[0].c_str());
+            return 0;
+        }
+        else if (arg == "--ids") {
+            printing_ids = true;
+        }
+        else if (arg == "-m" || arg == "--model") {
+            if (model_path_set) {
+                fprintf(stderr, "Error: -m or --model specified multiple times.\n");
+                return 1;
+            }
+            model_path = argv[++iarg].c_str();
+            model_path_set = true;
+        }
+        else if (arg == "--no-bos") {
+            no_bos = true;
+        }
+        else if (arg == "--no-escape") {
+            no_escape = true;
+        }
+        else if (arg == "--no-parse-special") {
+            no_parse_special = true;
+        }
+        else if (arg == "-p" || arg == "--prompt") {
+            if (prompt_set) {
+                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
+                return 1;
+            }
+            prompt_arg = argv[++iarg].c_str();
+            prompt_set = true;
+        }
+        else if (arg == "-f" || arg == "--file") {
+            if (prompt_path_set) {
+                fprintf(stderr, "Error: -f or --file specified multiple times.\n");
+                return 1;
+            }
+            prompt_path = argv[++iarg].c_str();
+            prompt_path_set = true;
+        }
+        else if (arg == "--stdin") {
+            stdin_set = true;
+        }
+        else if (arg == "--log-disable") {
+            disable_logging = true;
+        }
+        else if (arg == "--show-count") {
+            show_token_count = true;
+        }
+        else {
+            fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
+            return 1;
+        }
+    }
+
+    //////
+    // Sanity check the command line arguments.
+    //////
+
+    // Check that we have the required stuff set.
+    if (model_path_set && model_path == NULL) {
+        fprintf(stderr, "Error: --model requires an argument.\n");
+        return 1;
+    }
+    if (!model_path_set) {
+        fprintf(stderr, "Error: must specify --model.\n");
+        return 1;
+    }
+    if (prompt_path_set && prompt_path == NULL) {
+        fprintf(stderr, "Error: --file requires an argument.\n");
+        return 1;
+    }
+    if (prompt_set && prompt_arg == NULL) {
+        fprintf(stderr, "Error: --prompt requires an argument.\n");
+        return 1;
+    }
+    const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
+    if (prompts_set > 1) {
+        fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
+        return 1;
+    }
+    // Must have some prompt.
+    if (prompts_set == 0) {
+        fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
+        return 1;
+    }
+
+    GGML_ASSERT(model_path);
+    GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
+
+    //////
+    // Figure out where will the prompt come from.
+    //////
+
+    std::string prompt;
+    if (prompt_path_set) {
+        bool success = false;
+        prompt = read_prompt_from_file(prompt_path, success);
+        if (!success) {
+            return 1;
+        }
+    } else if (prompt_set) {
+        prompt = prompt_arg;
+    } else {
+        GGML_ASSERT(stdin_set);
+        // we read stdin *after* loading model (early exit if model cannot
+        // be loaded, which can be a nicer user experience)
+    }
+
+    //////
+    // Start actually doing the tokenizing stuff.
+    //////
+
+    if (disable_logging) {
+        llama_log_set(llama_log_callback_null, NULL);
+    }
 
     llama_backend_init();
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;
-    llama_model * model = llama_load_model_from_file(model_path, model_params);
+    llama_model * model = llama_model_load_from_file(model_path, model_params);
+    if (!model) {
+        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     llama_context_params ctx_params = llama_context_default_params();
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr, "Error: could not create context.\n");
+        return 1;
+    }
 
-    const bool add_bos = llama_should_add_bos_token(model);
+    // read entire prompt from stdin?
+    if (stdin_set) {
+        GGML_ASSERT(!prompt_path_set && !prompt_set);
+
+        std::stringstream stdin_buffer;
+        stdin_buffer << std::cin.rdbuf();
+        if (std::cin.fail()) {
+            fprintf(stderr, "Error: could not read the entire standard input.\n");
+            return 1;
+        }
+
+        prompt = stdin_buffer.str();
+    }
+
+    const bool model_wants_add_bos = llama_vocab_get_add_bos(vocab);
+    const bool add_bos = model_wants_add_bos && !no_bos;
+    const bool parse_special = !no_parse_special;
+    const bool escape = !no_escape;
+
+    if (escape) {
+        string_process_escapes(prompt);
+    }
 
     std::vector<llama_token> tokens;
+    tokens = common_tokenize(vocab, prompt, add_bos, parse_special);
 
-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    if (printing_ids) {
+        printf("[");
+    }
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
-            printf("%d\n", tokens[i]);
+            if (i > 0) {
+                printf(", ");
+            }
+            printf("%d", tokens[i]);
         } else {
-            printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
+            bool invalid_utf8 = false;
+            printf("%6d -> '", tokens[i]);
+            write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+            if (invalid_utf8) {
+                printf("' (utf-8 decode failure)\n");
+            } else {
+                printf("'\n");
+            }
         }
     }
 
+    if (printing_ids) {
+        printf("]\n");
+    }
+
+    if (show_token_count) {
+        printf("Total number of tokens: %zu\n", tokens.size());
+    }
+    // silence valgrind
+    llama_free(ctx);
+    llama_model_free(model);
+
     return 0;
 }
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
deleted file mode 100644
index 4459516d0..000000000
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET train-text-from-scratch)
-add_executable(${TARGET} train-text-from-scratch.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md
deleted file mode 100644
index 1b3454069..000000000
--- a/examples/train-text-from-scratch/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# train-text-from-scratch
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# train
-./bin/train-text-from-scratch \
-        --vocab-model ../models/ggml-vocab-llama.gguf \
-        --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in  chk-shakespeare-256x16-LATEST.gguf \
-        --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
-        --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
-        --train-data "shakespeare.txt" \
-        -t 6 -b 16 --seed 1 --adam-iter 256 \
-        --no-checkpointing
-
-# predict
-./bin/main -m ggml-shakespeare-256x16-f32.gguf
-```
-
-Output files will be saved every N iterations (config with `--save-every N`).
-The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
-
-To train GGUF models just pass them to `--checkpoint-in FN`.
diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
deleted file mode 100644
index ed93673bc..000000000
--- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+++ /dev/null
@@ -1,499 +0,0 @@
-#!/usr/bin/env python3
-# train-text-from-scratch checkpoint --> gguf conversion
-
-import argparse
-import os
-import struct
-import sys
-import numpy as np
-from pathlib import Path
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
-import gguf
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.product(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        assert(tuple(ne) == tuple(self.ne))
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationParamsV0:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.type                 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_threads            = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.past                 = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.delta                = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.print_forward_graph  = struct.unpack('<?', bytes(data[offset:offset + 1]))[0];  offset += 4 # 32bit-aligned
-        self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0];  offset += 4 # 32bit-aligned
-        self.adam_n_iter          = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_sched           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_decay           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_alpha           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_beta1           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_beta2           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps_f           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps_g           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_n_iter         = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_eps            = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_ftol           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_wolfe          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_min_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_max_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_linesearch     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version == 0:
-            params = OptimizationParamsV0()
-            offset = params.load(data, offset)
-            self.past = params.past
-            self.lbfgs_m = params.lbfgs_m
-            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0];  offset += 8
-            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-            self.type = params.type
-
-            self.adam_m  = Tensor('f', [self.nx])
-            self.adam_v  = Tensor('f', [self.nx])
-            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-            self.lbfgs_x    = Tensor('f', [self.nx])
-            self.lbfgs_xp   = Tensor('f', [self.nx])
-            self.lbfgs_g    = Tensor('f', [self.nx])
-            self.lbfgs_gp   = Tensor('f', [self.nx])
-            self.lbfgs_d    = Tensor('f', [self.nx])
-            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-            if self.type == 0:
-                # these tensors are stored, but we don't need their data
-                x  = Tensor('f', [self.nx])
-                g  = Tensor('f', [self.nx])
-                g2 = Tensor('f', [self.nx])
-                mh = Tensor('f', [self.nx])
-                vh = Tensor('f', [self.nx])
-
-                offset = x.load(data, offset)
-                offset = g.load(data, offset)
-                offset = g2.load(data, offset)
-                offset = self.adam_m.load(data, offset)
-                offset = self.adam_v.load(data, offset)
-                offset = mh.load(data, offset)
-                offset = vh.load(data, offset)
-                offset = self.adam_pf.load(data, offset)
-
-                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            elif self.type == 1:
-                offset = self.lbfgs_x.load(data, offset)
-                offset = self.lbfgs_xp.load(data, offset)
-                offset = self.lbfgs_g.load(data, offset)
-                offset = self.lbfgs_gp.load(data, offset)
-                offset = self.lbfgs_d.load(data, offset)
-                offset = self.lbfgs_pf.load(data, offset)
-                offset = self.lbfgs_lmal.load(data, offset)
-                offset = self.lbfgs_lmys.load(data, offset)
-                offset = self.lbfgs_lms.load(data, offset)
-                offset = self.lbfgs_lmy.load(data, offset)
-
-                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            else:
-                raise ValueError('Unknown optimizer type')
-
-
-        elif self.version == 1:
-            self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-            self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-            self.adam_m  = Tensor('f', [self.nx])
-            self.adam_v  = Tensor('f', [self.nx])
-            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-            self.lbfgs_x    = Tensor('f', [self.nx])
-            self.lbfgs_xp   = Tensor('f', [self.nx])
-            self.lbfgs_g    = Tensor('f', [self.nx])
-            self.lbfgs_gp   = Tensor('f', [self.nx])
-            self.lbfgs_d    = Tensor('f', [self.nx])
-            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-            # forgot to save type in version 1:
-            # guess self.type from number of remaining bytes
-            size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                    [self.adam_m, self.adam_v]
-                                    +([self.adam_pf] if (self.past > 0) else [])])
-            size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                    [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                     self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                     self.lbfgs_lmal, self.lbfgs_lmys,
-                                     self.lbfgs_lms, self.lbfgs_lmy]
-                                     +([self.lbfgs_pf] if (self.past > 0) else [])])
-            # due to alignment padding the size might not by exact
-            # but the difference in size for both types is significant,
-            # so we can just use whichever is closest
-            remaining = len(data) - offset
-            if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-                self.type = 0
-            else:
-                self.type = 1
-
-            if self.type == 0:
-                offset = self.adam_m.load(data, offset)
-                offset = self.adam_v.load(data, offset)
-                offset = self.adam_pf.load(data,offset)
-
-                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            elif self.type == 1:
-                offset = self.lbfgs_x.load(data, offset)
-                offset = self.lbfgs_xp.load(data, offset)
-                offset = self.lbfgs_g.load(data, offset)
-                offset = self.lbfgs_gp.load(data, offset)
-                offset = self.lbfgs_d.load(data, offset)
-                offset = self.lbfgs_pf.load(data, offset)
-                offset = self.lbfgs_lmal.load(data, offset)
-                offset = self.lbfgs_lmys.load(data, offset)
-                offset = self.lbfgs_lms.load(data, offset)
-                offset = self.lbfgs_lmy.load(data, offset)
-
-                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError('Invalid version of checkpoint file')
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class ModelParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-        return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
-
-class Layer:
-    def __init__(self, params, bid):
-        self.bid = bid
-        self.att_norm = Tensor('f', [params.n_embd])
-        self.wq       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wk       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wv       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wo       = Tensor('f', [params.n_embd, params.n_embd])
-        self.ffn_norm = Tensor('f', [params.n_embd])
-        self.w1       = Tensor('f', [params.n_embd, params.get_n_ff()])
-        self.w2       = Tensor('f', [params.get_n_ff(), params.n_embd])
-        self.w3       = Tensor('f', [params.n_embd, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm.load(data, offset)
-        offset = self.wq.load(data, offset)
-        offset = self.wk.load(data, offset)
-        offset = self.wv.load(data, offset)
-        offset = self.wo.load(data, offset)
-        offset = self.ffn_norm.load(data, offset)
-        offset = self.w1.load(data, offset)
-        offset = self.w2.load(data, offset)
-        offset = self.w3.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
-        self.wq.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid))
-        self.wk.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid))
-        self.wv.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid))
-        self.wo.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid))
-        self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid))
-        self.w1.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid))
-        self.w2.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid))
-        self.w3.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid))
-
-class Model:
-    def __init__(self):
-        self.params = ModelParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-
-        self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
-        self.norm     = Tensor('f', [self.params.n_embd])
-        self.output   = Tensor('f', [self.params.n_embd, self.params.n_vocab])
-
-        offset = self.tok_embd.load(data, offset)
-        offset = self.norm.load(data, offset)
-        offset = self.output.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-
-        self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
-        self.norm.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
-        self.output.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class Checkpoint:
-    def __init__(self):
-        self.model = Model()
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcp':
-            raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input train checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = Checkpoint()
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
deleted file mode 100644
index 7eafe8515..000000000
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ /dev/null
@@ -1,1248 +0,0 @@
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "common.h"
-#include "train.h"
-#include "llama.h"
-#include <unordered_map>
-#include <vector>
-#include <cassert>
-#include <climits>
-#include <cstring>
-#include <cstdarg>
-#include <ctime>
-#include <random>
-#include <stdexcept>
-#include <algorithm>
-#include <string>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct my_llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;
-    uint32_t n_embd  = 4096;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-    uint32_t n_ff    = 11008;
-
-    // float f_norm_eps     = 1e-5f; // falcon
-    float f_norm_rms_eps = 1e-5f; // llama
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
-};
-
-struct my_llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-};
-
-struct my_llama_model {
-    struct ggml_context * ctx = NULL;
-    ggml_backend_buffer_t data = NULL;
-
-    my_llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<my_llama_layer> layers;
-};
-
-// gguf constants (sync with gguf.py)
-static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL     = "train_model";
-static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
-
-static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
-static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
-
-static const char * LLM_KV_CONTEXT_LENGTH              = "%s.context_length";
-static const char * LLM_KV_EMBEDDING_LENGTH            = "%s.embedding_length";
-static const char * LLM_KV_BLOCK_COUNT                 = "%s.block_count";
-static const char * LLM_KV_FEED_FORWARD_LENGTH         = "%s.feed_forward_length";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT        = "%s.attention.head_count";
-static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
-static const char * LLM_KV_ROPE_DIMENSION_COUNT        = "%s.rope.dimension_count";
-static const char * LLM_KV_ROPE_FREQ_BASE              = "%s.rope.freq_base"; // TODO load in llama.cpp
-static const char * LLM_KV_ROPE_SCALE_LINEAR           = "%s.rope.scale_linear";
-
-static const char * LLM_KV_TOKENIZER_MODEL             = "tokenizer.ggml.model";
-static const char * LLM_KV_TOKENIZER_LIST              = "tokenizer.ggml.tokens";
-static const char * LLM_KV_TOKENIZER_TOKEN_TYPE        = "tokenizer.ggml.token_type";
-static const char * LLM_KV_TOKENIZER_SCORES            = "tokenizer.ggml.scores";
-static const char * LLM_KV_TOKENIZER_MERGES            = "tokenizer.ggml.merges";
-static const char * LLM_KV_TOKENIZER_BOS_ID            = "tokenizer.ggml.bos_token_id";
-static const char * LLM_KV_TOKENIZER_EOS_ID            = "tokenizer.ggml.eos_token_id";
-static const char * LLM_KV_TOKENIZER_UNK_ID            = "tokenizer.ggml.unknown_token_id";
-static const char * LLM_KV_TOKENIZER_SEP_ID            = "tokenizer.ggml.seperator_token_id";
-static const char * LLM_KV_TOKENIZER_PAD_ID            = "tokenizer.ggml.padding_token_id";
-
-static const char * LLM_TENSOR_TOKEN_EMBD    = "token_embd";
-static const char * LLM_TENSOR_OUTPUT_NORM   = "output_norm";
-static const char * LLM_TENSOR_OUTPUT        = "output";
-static const char * LLM_TENSOR_ATTN_NORM     = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q        = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K        = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V        = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT      = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM      = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE      = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
-}
-
-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.ffn_gate);
-        ggml_set_param(ctx, layer.ffn_down);
-        ggml_set_param(ctx, layer.ffn_up);
-    }
-}
-
-static void init_model(struct my_llama_model * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_ff    = hparams.n_ff;
-
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    // context for model tensors without their data
-    struct ggml_init_params ctx_model_params;
-    ctx_model_params.mem_size   = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_model_params.mem_buffer = NULL;
-    ctx_model_params.no_alloc   = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_model_params);
-    model->ctx = ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-
-    ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD));
-    ggml_set_name(model->norm,           tn(LLM_TENSOR_OUTPUT_NORM));
-    ggml_set_name(model->output,         tn(LLM_TENSOR_OUTPUT));
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-        layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);
-        layer.ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-
-        ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
-
-        ggml_set_name(layer.wq,             tni(LLM_TENSOR_ATTN_Q, i));
-        ggml_set_name(layer.wk,             tni(LLM_TENSOR_ATTN_K, i));
-        ggml_set_name(layer.wv,             tni(LLM_TENSOR_ATTN_V, i));
-        ggml_set_name(layer.wo,             tni(LLM_TENSOR_ATTN_OUT, i));
-
-        ggml_set_name(layer.ffn_norm,       tni(LLM_TENSOR_FFN_NORM, i));
-
-        ggml_set_name(layer.ffn_gate,       tni(LLM_TENSOR_FFN_GATE, i));
-        ggml_set_name(layer.ffn_down,       tni(LLM_TENSOR_FFN_DOWN, i));
-        ggml_set_name(layer.ffn_up,         tni(LLM_TENSOR_FFN_UP, i));
-    }
-
-    set_param_model(model);
-
-    // allocate data
-    model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-}
-
-static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings, rnd);
-    randomize_tensor_normal(model->norm,           rnd);
-    randomize_tensor_normal(model->output,         rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wq, rnd);
-        randomize_tensor_normal(layer.wk, rnd);
-        randomize_tensor_normal(layer.wv, rnd);
-        randomize_tensor_normal(layer.wo, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.ffn_gate, rnd);
-        randomize_tensor_normal(layer.ffn_down, rnd);
-        randomize_tensor_normal(layer.ffn_up,   rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static struct ggml_tensor * llama_build_train_graphs(
-        struct my_llama_model * model,
-        ggml_gallocr_t          alloc,
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        const  int              n_tokens,
-        const  int              n_batch,
-        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing,
-        const  bool             measure_only) {
-
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
-    const int n_past = 0;
-    const int N = n_tokens;
-    const auto & hparams = model->hparams;
-    const int n_ctx      = hparams.n_ctx;
-    const int n_vocab    = hparams.n_vocab;
-    const int n_embd     = hparams.n_embd;
-    const int n_layer    = hparams.n_layer;
-    const int n_head     = hparams.n_head;
-    const int n_rot      = hparams.n_rot;
-    const int n_ff       = hparams.n_ff;
-    const float f_norm_rms_eps  = hparams.f_norm_rms_eps;
-    const float rope_freq_base  = hparams.rope_freq_base;
-    const float rope_freq_scale = hparams.rope_freq_scale;
-
-    auto set_name = [](struct ggml_tensor * t, const char * n) {
-        ggml_set_name(t, n);
-        if (t->grad) {
-            ggml_format_name(t->grad, "%s->grad", n);
-        }
-    };
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_set_input(KQ_pos);
-
-    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
-                (struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silcence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_custom(
-            ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets,      "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    checkpoints.push_back(tokens_input);
-    checkpoints.push_back(targets);
-    checkpoints.push_back(t00);
-    checkpoints.push_back(t01);
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, f_norm_rms_eps);                    set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, layer.attention_norm, t02);              set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                               set_name(t04, "t04");     assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, layer.wq, t04);                          set_name(t05, "t05");     assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06");     assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope              (t06);                                         set_name(t07, "t07");     assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, layer.wk, t04);                          set_name(t08, "t08");     assert_shape_2d(t08, n_embd, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09");     assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t10 = rope              (t09);                                         set_name(t10, "t10");     assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t11 = ggml_mul_mat      (ctx, t04, layer.wv);                          set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd);
-        struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12");     assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                        set_name(t13, "t13");     assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                        set_name(t14, "t14");     assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                        set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                        set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                 set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);          set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past);            set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_3 = ggml_soft_max_inplace     (ctx, t16_2);                    set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
-            t16 = ggml_mul_mat(ctx, t15, t16_3);                                                    set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        }
-        struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                        set_name(t17, "t17");     assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                    set_name(t18, "t18");     assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                 set_name(t19, "t19");     assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, layer.wo, t19);                          set_name(t20, "t20");     assert_shape_2d(t20, n_embd, N*n_batch);
-        struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                               set_name(t21, "t21");     assert_shape_2d(t21, n_embd, N*n_batch);
-        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, f_norm_rms_eps);                    set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, layer.ffn_norm, t22);                    set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
-        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                               set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, layer.ffn_up, t24);                      set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, layer.ffn_gate, t24);                    set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
-        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                    set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
-        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                               set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, layer.ffn_down, t28);                    set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
-        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                               set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
-        cur = t30;
-        checkpoints.push_back(cur);
-    }
-    struct ggml_tensor * t31   = ggml_rms_norm          (ctx, cur, f_norm_rms_eps);                 set_name(t31, "t31");     assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat            (ctx, model->norm, t31);                    set_name(t32, "t32");     assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = ggml_mul               (ctx, t32, t31);                            set_name(t33, "t33");     assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat           (ctx, model->output, t33);                  set_name(t34, "t34");     assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = ggml_reshape_3d        (ctx, t34, n_vocab, N, n_batch);            set_name(t35, "t35");     assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                        set_name(t36, "t36");     assert_shape_1d(t36, 1);
-
-    checkpoints.push_back(t31);
-    checkpoints.push_back(t32);
-    checkpoints.push_back(t33);
-    checkpoints.push_back(t34);
-    checkpoints.push_back(t35);
-    checkpoints.push_back(t36);
-
-    ggml_build_forward_expand(gf, t36);
-
-    if (enable_checkpointing) {
-        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
-    } else {
-        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, true);
-    }
-
-    if (alloc) {
-        // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
-        int n_leafs_before = gb->n_leafs;
-        int n_nodes_before = gb->n_nodes;
-        // output tensors
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
-        // input gradient
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
-        // KQ_pos
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
-        GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-        ggml_set_input(t36->grad);
-
-        // allocating checkpoints in one block to reduce memory fragmentation
-        // note: they will be freed in reverse order
-        for (int i = 0; i < (int) checkpoints.size(); ++i) {
-            if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-                ggml_set_input(checkpoints[i]);
-            }
-        }
-
-        //int n_leafs_after = gb->n_leafs;
-        //int n_nodes_after = gb->n_nodes;
-        if (measure_only) {
-            // FIXME: will still allocate
-            ggml_gallocr_reserve(alloc, gb);
-        } else {
-            ggml_gallocr_alloc_graph(alloc, gb);
-
-            if (!measure_only) {
-                int * data = (int *) KQ_pos->data;
-                for (int i = 0; i < N; ++i) {
-                    data[i] = n_past + i;
-                }
-            }
-        }
-
-        // remove the additional nodes and leafs
-        for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
-            gb->leafs[i] = NULL;
-        }
-        for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
-            gb->nodes[i] = NULL;
-        }
-        gb->n_leafs = n_leafs_before;
-        gb->n_nodes = n_nodes_before;
-    }
-
-    *logits = t35;
-    return t36;
-}
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-} while (0)
-
-static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-    std::string arch;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [&arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
-        return keybuf.data();
-    };
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    GGML_ASSERT(arch == "llama");
-
-    uint32_t ftype_u;
-    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
-    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-
-    // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
-    GGUF_GET_KEY(fctx, model->hparams.n_ctx,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-
-    GGUF_GET_KEY(fctx, model->hparams.n_embd,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(fctx, model->hparams.n_ff,    gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(fctx, model->hparams.n_head,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
-
-    model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head;
-    GGUF_GET_KEY(fctx, model->hparams.n_rot,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
-
-    float rope_freq_scale = 1.0f;
-    GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
-    GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    if (rope_freq_scale != 1.0f) {
-        model->hparams.rope_freq_scale = 1.0f / rope_freq_scale;
-    }
-
-    init_model(model);
-
-    copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD));
-    copy_tensor_by_name(model->norm,           f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
-    copy_tensor_by_name(model->output,         f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i));
-        copy_tensor_by_name(layer.wq,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i));
-        copy_tensor_by_name(layer.wk,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i));
-        copy_tensor_by_name(layer.wv,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
-        copy_tensor_by_name(layer.wo,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
-        copy_tensor_by_name(layer.ffn_norm,       f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
-        copy_tensor_by_name(layer.ffn_gate,       f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
-        copy_tensor_by_name(layer.ffn_down,       f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
-        copy_tensor_by_name(layer.ffn_up,         f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
-    }
-}
-
-static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
-    const char * arch = "llama";
-    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch);
-        return keybuf.data();
-    };
-
-    // set arch
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
-    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
-
-    // set hparams
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx                  );
-    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd                 );
-    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff                   );
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head                 );
-    gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT),                 model->hparams.n_layer                );
-    gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT),        model->hparams.n_rot                  );
-
-    gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps         );
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE),              model->hparams.rope_freq_base         ); // TODO load in llama.cpp
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR),           1.0f / model->hparams.rope_freq_scale );
-
-    // set vocab by copying from vocab_model gguf file
-    {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ NULL,
-        };
-        struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params);
-
-        const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
-        if (token_idx == -1) {
-            die("cannot find tokenizer vocab in model file");
-        }
-        const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);
-
-        const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
-        if (score_idx == -1) {
-            die("cannot find tokenizer scores in model file");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
-        if (toktype_idx == -1) {
-            die("cannot find token type list in GGUF file");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
-
-        std::string tokenizer_name;
-        GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
-
-        gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str());
-        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab);
-        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab);
-
-        int32_t special_bos_id = 1;
-        int32_t special_eos_id = 2;
-        int32_t special_unk_id = 0;
-        int32_t special_sep_id = -1;
-        int32_t special_pad_id = -1;
-        if (tokenizer_name == "llama") {
-            // default special tokens
-            special_bos_id = 1;
-            special_eos_id = 2;
-            special_unk_id = 0;
-            special_sep_id = -1;
-            special_pad_id = -1;
-        } else if (tokenizer_name == "gpt2") {
-            // read and copy bpe merges
-            const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
-            if (merges_keyidx == -1) {
-                die("cannot find tokenizer merges in model file");
-            }
-
-            const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
-
-            std::vector<const char*> merges;
-            merges.resize(n_merges);
-            for (int i = 0; i < n_merges; i++) {
-                merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i);
-            }
-            gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges);
-
-            // default special tokens
-            special_bos_id = 11;
-            special_eos_id = 11;
-            special_unk_id = -1;
-            special_sep_id = -1;
-            special_pad_id = -1;
-        } else {
-            fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__);
-        }
-
-        std::vector<const char*> tokens;
-        tokens.resize(n_vocab);
-        for (uint32_t i = 0; i < n_vocab; i++) {
-            tokens[i] = gguf_get_arr_str(vctx, token_idx, i);
-        }
-        gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab);
-
-        GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
-        GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
-        GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
-        GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
-        GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
-
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), special_pad_id);
-
-        gguf_free(vctx);
-    }
-
-    // add tensors
-    gguf_add_tensor(fctx, model->tok_embeddings);
-    gguf_add_tensor(fctx, model->norm);
-    gguf_add_tensor(fctx, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-
-        gguf_add_tensor(fctx, layer.attention_norm);
-        gguf_add_tensor(fctx, layer.wq);
-        gguf_add_tensor(fctx, layer.wk);
-        gguf_add_tensor(fctx, layer.wv);
-        gguf_add_tensor(fctx, layer.wo);
-        gguf_add_tensor(fctx, layer.ffn_norm);
-        gguf_add_tensor(fctx, layer.ffn_gate);
-        gguf_add_tensor(fctx, layer.ffn_down);
-        gguf_add_tensor(fctx, layer.ffn_up);
-    }
-}
-
-static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_llama_model_gguf(fctx, fn_vocab_model, model);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) {
-    load_llama_model_gguf(fctx, f_ggml_ctx, model);
-    if (load_train_state_gguf(fctx, f_ggml_ctx, train)) {
-        std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL;
-        GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-        GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    } else {
-        printf("%s: loaded llama model as checkpoint\n", __func__);
-    }
-}
-
-static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    save_llama_model_gguf(fctx, fn_vocab_model, model);
-    save_train_state_gguf(fctx, train);
-}
-
-static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
-    struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
-    struct gguf_context * fctx = gguf_init_from_file(filename, params);
-    if (fctx == NULL) {
-        return false;
-    }
-
-    load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
-
-    return true;
-}
-
-static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_checkpoint_gguf(fctx, fn_vocab_model, model, train);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-struct train_params {
-    struct train_params_common common;
-
-    const char * fn_vocab_model;
-    const char * fn_model_out;
-
-    bool only_write_model;
-
-    int n_ctx;
-    int n_embd;
-    int n_head;
-    int n_layer;
-    int n_ff;
-
-    float f_norm_rms_eps;
-    float rope_freq_base;
-    float rope_freq_scale;
-};
-
-static struct train_params get_default_train_params() {
-    struct train_params params;
-    params.common = get_default_train_params_common();
-    params.fn_vocab_model    = "ggml-vic7b-uncensored-q4_0.bin";
-    params.fn_model_out      = "ggml-checkpoint-f32.bin";
-
-    params.only_write_model = false;
-
-    params.n_ctx      =  128;
-    params.n_embd     =  256;
-    params.n_head     =    8;
-    params.n_layer    =   16;
-    params.n_ff       =  768;
-
-    params.f_norm_rms_eps  = 1e-5f;
-    params.rope_freq_base  = 10000.0f;
-    params.rope_freq_scale = 1.0f;
-
-    return params;
-}
-
-static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-
-    fprintf(stderr, "  --vocab-model FNAME        model path from which to load vocab (default '%s')\n", params->fn_vocab_model);
-    fprintf(stderr, "  --model-out FNAME          path to save ggml model (default '%s')\n", params->fn_model_out);
-    fprintf(stderr, "  --only-write-model         only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n");
-    fprintf(stderr, "  --embd N                   Embedding size used for new models (default %d)\n", params->n_embd);
-    fprintf(stderr, "  --ff N                     Feedforward size used for new models. (default %d)\n", params->n_ff);
-    fprintf(stderr, "  --head N                   Number of heads for new models (default %d)\n", params->n_head);
-    fprintf(stderr, "  --layer N                  Number of layers for new models (default %d)\n", params->n_layer);
-    fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
-    fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-
-    print_common_train_usage(argc, argv, &params->common);
-}
-
-static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct train_params default_params = get_default_train_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
-            if (invalid_param) {
-                break;
-            } else if (params->common.print_usage) {
-                train_print_usage(argc, argv, &default_params);
-                exit(0);
-            }
-        } else if (arg == "--vocab-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_vocab_model = argv[i];
-        } else if (arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "--only-write-model") {
-            params->only_write_model = true;
-        } else if (arg == "--embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_embd = std::stoi(argv[i]);
-        } else if (arg == "--ff") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_ff = std::stoi(argv[i]);
-        } else if (arg == "--head") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_head = std::stoi(argv[i]);
-        } else if (arg == "--layer") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_layer = std::stoi(argv[i]);
-        } else if (arg == "--norm-rms-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->f_norm_rms_eps = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-
-    return true;
-}
-
-struct save_train_files_data {
-    const char            * fn_checkpoint_out;
-    const char            * fn_model_out;
-    const char            * fn_vocab_model;
-    const char            * pattern_fn_it;
-    const char            * fn_latest;
-    struct my_llama_model * model;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train);
-        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model, train);
-
-    }
-    if (strlen(data->fn_model_out) > 0) {
-        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model);
-        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_model* model) {
-    int64_t nx = 0;
-    nx += ggml_nelements(model->tok_embeddings);
-    nx += ggml_nelements(model->norm);
-    nx += ggml_nelements(model->output);
-
-    for (uint32_t i = 0; i < model->layers.size(); ++i) {
-        auto & layer = model->layers[i];
-        nx += ggml_nelements(layer.attention_norm);
-        nx += ggml_nelements(layer.wq);
-        nx += ggml_nelements(layer.wk);
-        nx += ggml_nelements(layer.wv);
-        nx += ggml_nelements(layer.wo);
-        nx += ggml_nelements(layer.ffn_norm);
-        nx += ggml_nelements(layer.ffn_gate);
-        nx += ggml_nelements(layer.ffn_down);
-        nx += ggml_nelements(layer.ffn_up);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params mparams = llama_model_default_params();
-    mparams.vocab_only = true;
-
-    struct llama_context_params cparams = llama_context_default_params();
-
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, mparams);
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, cparams);
-
-    struct my_llama_model model;
-    model.hparams.n_vocab = llama_n_vocab(lmodel);
-    model.hparams.n_ctx   = params.common.n_ctx;
-    model.hparams.n_embd  = params.n_embd;
-    model.hparams.n_head  = params.n_head;
-    model.hparams.n_layer = params.n_layer;
-    model.hparams.n_ff    = params.n_ff;
-    // llama.cpp requires n_rot to be exactly n_embd / n_head
-    model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
-    model.hparams.f_norm_rms_eps  = params.f_norm_rms_eps;
-    model.hparams.rope_freq_base  = params.rope_freq_base;
-    model.hparams.rope_freq_scale = params.rope_freq_scale;
-
-    struct train_state      * train = init_train_state();
-    struct ggml_opt_context * opt   = train->opt;
-
-    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    opt->params.print_forward_graph     = false;
-    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
-    opt->params.n_threads               = params.common.n_threads;
-    opt->params.past                    = params.common.opt_past;
-    opt->params.delta                   = params.common.opt_delta;
-    opt->params.max_no_improvement      = params.common.opt_max_no_improvement;
-    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
-    opt->params.adam.n_iter             = params.common.adam_n_iter;
-    opt->params.adam.sched              = 1.0f;
-    opt->params.adam.alpha              = params.common.adam_alpha;
-    opt->params.adam.decay              = params.common.adam_decay;
-    opt->params.adam.decay_min_ndim     = params.common.adam_decay_min_ndim;
-    opt->params.adam.beta1              = params.common.adam_beta1;
-    opt->params.adam.beta2              = params.common.adam_beta2;
-    opt->params.adam.gclip              = params.common.adam_gclip;
-    opt->params.adam.eps_f              = params.common.adam_eps_f;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
-    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
-        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
-        }
-
-        const bool opt_past_changed = opt->params.past != params.common.opt_past;
-
-        if (opt_past_changed) {
-            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value train from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
-            // need to discard previous optimizer past function value statistics and opt_init with new shapes
-            // TODO
-        }
-    } else {
-        init_model(&model);
-        randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
-        if (!params.only_write_model) {
-            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model));
-        }
-    }
-    opt->iter = train->train_its;
-
-    print_params(&model.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
-    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
-    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
-
-    if (params.only_write_model) {
-        save_train_files_data save_data;
-        save_data.fn_checkpoint_out = "";
-        save_data.fn_model_out      = params.fn_model_out;
-        save_data.fn_vocab_model    = params.fn_vocab_model;
-        save_data.pattern_fn_it     = params.common.pattern_fn_it;
-        save_data.fn_latest         = params.common.fn_latest;
-        save_data.model             = &model;
-
-        save_train_files(&save_data, train);
-
-        free_train_state(train);
-        ggml_free(model.ctx);
-        llama_free(lctx);
-        llama_free_model(lmodel);
-        return 0;
-    }
-
-    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.common.n_batch;
-
-    // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        ggml_tensor_overhead() * 2, // mem_size
-        NULL,                       // mem_buffer
-        true,                       // no_alloc
-    };
-    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
-
-    // the input tensors
-    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
-
-    // measure required memory for input tensors
-    // allocate input tensors
-    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
-    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL,                           // mem_buffer
-        true,                           // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss   = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf     = NULL;
-    struct ggml_cgraph * gb     = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_train_graphs(
-            &model, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_train_graphs(
-        &model, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
-    tokenize_file(lctx,
-            params.common.fn_train_data,
-            params.common.sample_start,
-            params.common.include_sample_start,
-            params.common.overlapping_samples,
-            n_tokens,
-            train_tokens,
-            train_samples_begin,
-            train_samples_size);
-    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
-
-    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
-
-    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
-    if (changed_train_data) {
-        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
-    }
-    if (params.common.force_reshuffle) {
-        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
-    }
-    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
-        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
-        train->shuffle_sample_count = train_samples_size.size();
-        train->shuffle_next_sample = 0;
-        train->shuffle_samples_hash = shuffle_samples_hash;
-    }
-    std::vector<size_t> train_shuffled_samples_offs;
-    std::vector<size_t> train_shuffled_samples_begin;
-    std::vector<size_t> train_shuffled_samples_size;
-    train_shuffled_samples_offs.resize(train_samples_begin.size());
-    train_shuffled_samples_begin.resize(train_samples_begin.size());
-    train_shuffled_samples_size.resize(train_samples_size.size());
-    train->shuffle_rng_state_next = shuffle_samples(
-        train->shuffle_rng_state_current,
-        train_shuffled_samples_offs.data(),
-        train_shuffled_samples_begin.data(),
-        train_shuffled_samples_size.data(),
-        train_samples_begin.data(),
-        train_samples_size.data(),
-        train_samples_size.size());
-    printf("%s: begin training\n", __func__);
-
-    save_train_files_data save_data;
-    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
-    save_data.fn_model_out      = params.fn_model_out;
-    save_data.fn_vocab_model    = params.fn_vocab_model;
-    save_data.pattern_fn_it     = params.common.pattern_fn_it;
-    save_data.fn_latest         = params.common.fn_latest;
-    save_data.model             = &model;
-
-    struct train_opt_callback_data opt_cb_data;
-    opt_cb_data.params                 = &params.common;
-    opt_cb_data.train                  = train;
-    opt_cb_data.save_cb                = &save_train_files;
-    opt_cb_data.save_data              = &save_data;
-    opt_cb_data.lctx                   = lctx;
-    opt_cb_data.last_save_iter         = opt->iter;
-    opt_cb_data.tokens_data            = train_tokens.data();
-    opt_cb_data.tokens_size            = train_tokens.size();
-    opt_cb_data.samples_begin          = train_samples_begin.data();
-    opt_cb_data.samples_size           = train_samples_size.data();
-    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
-    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
-    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
-    opt_cb_data.samples_count          = train_samples_size.size();
-    opt_cb_data.tokens_input           = tokens_input;
-    opt_cb_data.target_probs           = target_probs;
-    opt_cb_data.first_iter             = opt->iter;
-    opt_cb_data.first_epoch            = train->train_epochs;
-    opt_cb_data.iter_at_last_epoch     = -1;
-    opt_cb_data.last_time              = ggml_time_ms();
-    opt_cb_data.millis_per_iter        = 0.0;
-
-    // measure required memory for work buffer
-    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
-
-    // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        max_work_size, // mem_size
-        NULL,          // mem_buffer
-        false,         // no_alloc
-    };
-    struct ggml_context * ctx_work = ggml_init(ctx_work_params);
-
-    int64_t t0 = ggml_time_ms();
-
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_compute);
-    ggml_free(ctx_input);
-
-    int64_t t1 = ggml_time_ms();
-    printf("%s: total training time: ", __func__);
-    print_duration((double) (t1 - t0));
-    printf("\n");
-
-    int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    if (new_iters > 0) {
-        train->train_its     += new_iters;
-        train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
-
-        save_train_files(&save_data, train);
-        opt_cb_data.last_save_iter = opt->iter;
-    }
-
-    ggml_free(opt->ctx);
-    free_train_state(train);
-    ggml_free(model.ctx);
-    llama_free(lctx);
-    llama_free_model(lmodel);
-    return 0;
-}
diff --git a/examples/ts-type-to-grammar.sh b/examples/ts-type-to-grammar.sh
new file mode 100755
index 000000000..9abba2a3d
--- /dev/null
+++ b/examples/ts-type-to-grammar.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
+# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
+#
+set -euo pipefail
+
+readonly type="$1"
+
+# Create a temporary directory
+TMPDIR=""
+trap 'rm -fR "$TMPDIR"' EXIT
+TMPDIR=$(mktemp -d)
+
+DTS_FILE="$TMPDIR/type.d.ts"
+SCHEMA_FILE="$TMPDIR/schema.json"
+
+echo "export type MyType = $type" > "$DTS_FILE"
+
+# This is a fork of typescript-json-schema, actively maintained as of March 2024:
+# https://github.com/vega/ts-json-schema-generator
+npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type MyType -e none > "$SCHEMA_FILE"
+
+# Alternative, not actively maintained as of March 2024:
+# https://github.com/YousefED/typescript-json-schema
+# npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
+
+./examples/json_schema_to_grammar.py "$SCHEMA_FILE"
diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt
new file mode 100644
index 000000000..c72bd814c
--- /dev/null
+++ b/examples/tts/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-tts)
+add_executable(${TARGET} tts.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/tts/README.md b/examples/tts/README.md
new file mode 100644
index 000000000..4509763c6
--- /dev/null
+++ b/examples/tts/README.md
@@ -0,0 +1,117 @@
+# llama.cpp/example/tts
+This example demonstrates the Text To Speech feature. It uses a
+[model](https://www.outeai.com/blog/outetts-0.2-500m) from
+[outeai](https://www.outeai.com/).
+
+## Quickstart
+If you have built llama.cpp with `-DLLAMA_CURL=ON` you can simply run the
+following command and the required models will be downloaded automatically:
+```console
+$ build/bin/llama-tts --tts-oute-default -p "Hello world" && aplay output.wav
+```
+For details about the models and how to convert them to the required format
+see the following sections.
+
+### Model conversion
+Checkout or download the model that contains the LLM model:
+```console
+$ pushd models
+$ git clone --branch main --single-branch --depth 1 https://huggingface.co/OuteAI/OuteTTS-0.2-500M
+$ cd OuteTTS-0.2-500M && git lfs install && git lfs pull
+$ popd
+```
+Convert the model to .gguf format:
+```console
+(venv) python convert_hf_to_gguf.py models/OuteTTS-0.2-500M \
+    --outfile models/outetts-0.2-0.5B-f16.gguf --outtype f16
+```
+The generated model will be `models/outetts-0.2-0.5B-f16.gguf`.
+
+We can optionally quantize this to Q8_0 using the following command:
+```console
+$ build/bin/llama-quantize models/outetts-0.2-0.5B-f16.gguf \
+    models/outetts-0.2-0.5B-q8_0.gguf q8_0
+```
+The quantized model will be `models/outetts-0.2-0.5B-q8_0.gguf`.
+
+Next we do something simlar for the audio decoder. First download or checkout
+the model for the voice decoder:
+```console
+$ pushd models
+$ git clone --branch main --single-branch --depth 1 https://huggingface.co/novateur/WavTokenizer-large-speech-75token
+$ cd WavTokenizer-large-speech-75token && git lfs install && git lfs pull
+$ popd
+```
+This model file is PyTorch checkpoint (.ckpt) and we first need to convert it to
+huggingface format:
+```console
+(venv) python examples/tts/convert_pt_to_hf.py \
+    models/WavTokenizer-large-speech-75token/wavtokenizer_large_speech_320_24k.ckpt
+...
+Model has been successfully converted and saved to models/WavTokenizer-large-speech-75token/model.safetensors
+Metadata has been saved to models/WavTokenizer-large-speech-75token/index.json
+Config has been saved to models/WavTokenizer-large-speech-75tokenconfig.json
+```
+Then we can convert the huggingface format to gguf:
+```console
+(venv) python convert_hf_to_gguf.py models/WavTokenizer-large-speech-75token \
+    --outfile models/wavtokenizer-large-75-f16.gguf --outtype f16
+...
+INFO:hf-to-gguf:Model successfully exported to models/wavtokenizer-large-75-f16.gguf
+```
+
+### Running the example
+
+With both of the models generated, the LLM model and the voice decoder model,
+we can run the example:
+```console
+$ build/bin/llama-tts -m  ./models/outetts-0.2-0.5B-q8_0.gguf \
+    -mv ./models/wavtokenizer-large-75-f16.gguf \
+    -p "Hello world"
+...
+main: audio written to file 'output.wav'
+```
+The output.wav file will contain the audio of the prompt. This can be heard
+by playing the file with a media player. On Linux the following command will
+play the audio:
+```console
+$ aplay output.wav
+```
+
+### Running the example with llama-server
+Running this example with `llama-server` is also possible and requires two
+server instances to be started. One will serve the LLM model and the other
+will serve the voice decoder model.
+
+The LLM model server can be started with the following command:
+```console
+$ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020
+```
+
+And the voice decoder model server can be started using:
+```console
+./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none
+```
+
+Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio.
+
+First create a virtual environment for python and install the required
+dependencies (this in only required to be done once):
+```console
+$ python3 -m venv venv
+$ source venv/bin/activate
+(venv) pip install requests numpy
+```
+
+And then run the python script using:
+```conole
+(venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world"
+spectrogram generated: n_codes: 90, n_embd: 1282
+converting to audio ...
+audio generated: 28800 samples
+audio written to file "output.wav"
+```
+And to play the audio we can again use aplay or any other media player:
+```console
+$ aplay output.wav
+```
diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
new file mode 100644
index 000000000..8909a65fd
--- /dev/null
+++ b/examples/tts/convert_pt_to_hf.py
@@ -0,0 +1,180 @@
+# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
+# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
+#
+# TODO: this script is LLM-generated and probably very inefficient and should be rewritten
+
+import torch
+import json
+import os
+import sys
+import re
+
+from safetensors.torch import save_file
+
+# default
+model_path = './model.pt';
+
+# read from CLI
+if len(sys.argv) > 1:
+    model_path = sys.argv[1]
+
+# get the directory of the input model
+path_dst = os.path.dirname(model_path)
+
+print(f"Loading model from {model_path}")
+
+model = torch.load(model_path, map_location='cpu')
+
+#print(model)
+
+# print all keys
+for key in model.keys():
+    print(key)
+    if key == 'hyper_parameters':
+        #print(model[key])
+        # dump as json pretty
+        print(json.dumps(model[key], indent=4))
+    #if key != 'state_dict' and key != 'optimizer_states':
+    #    print(model[key])
+
+# Check if the loaded model is a state_dict or a model instance
+if isinstance(model, torch.nn.Module):
+    state_dict = model.state_dict()
+else:
+    state_dict = model
+
+# Print the structure of the state_dict to understand its format
+print("State dictionary keys:")
+for key in state_dict.keys():
+    print(key)
+
+# Ensure the state_dict is flat and contains only torch.Tensor objects
+def flatten_state_dict(state_dict, parent_key='', sep='.'):
+    items = []
+    items_new = []
+
+    for k, v in state_dict.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, torch.Tensor):
+            items.append((new_key, v))
+        elif isinstance(v, dict):
+            items.extend(flatten_state_dict(v, new_key, sep=sep).items())
+            return dict(items)
+
+    size_total_mb = 0
+
+    for key, value in list(items):
+        # keep only what we need for inference
+        if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \
+           not key.startswith('state_dict.backbone.') and \
+           not key.startswith('state_dict.head.out'):
+               print('Skipping key: ', key)
+               continue
+
+        new_key = key
+
+        new_key = new_key.replace('state_dict.', '')
+        new_key = new_key.replace('pos_net', 'posnet')
+
+        # check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight"
+        if new_key.startswith("backbone.posnet."):
+            match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key)
+            if match:
+               new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}"
+
+        # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
+        if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
+            new_key = "backbone.embedding.weight"
+
+        # these are the only rows used
+        # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100
+        if new_key.endswith("norm.scale.weight"):
+            new_key = new_key.replace("norm.scale.weight", "norm.weight")
+            value = value[0]
+
+        if new_key.endswith("norm.shift.weight"):
+            new_key = new_key.replace("norm.shift.weight", "norm.bias")
+            value = value[0]
+
+        if new_key.endswith("gamma"):
+            new_key = new_key.replace("gamma", "gamma.weight")
+
+        # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias
+        if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")):
+            value = value.unsqueeze(1)
+
+        if new_key.endswith("dwconv.bias"):
+            value = value.unsqueeze(1)
+
+        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
+        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")
+
+        size_total_mb += size_mb
+
+        #print(key, '->', new_key, ': ', value)
+        #print(key, '->', new_key)
+
+        items_new.append((new_key, value))
+
+    print(f"Total size: {size_total_mb:8.2f} MB")
+
+    return dict(items_new)
+
+flattened_state_dict = flatten_state_dict(state_dict)
+
+
+# Convert the model to the safetensors format
+output_path = path_dst + '/model.safetensors'
+save_file(flattened_state_dict, output_path)
+
+print(f"Model has been successfully converted and saved to {output_path}")
+
+# Calculate the total size of the .safetensors file
+total_size = os.path.getsize(output_path)
+
+# Create the weight map
+weight_map = {
+    "model.safetensors": ["*"]  # Assuming all weights are in one file
+}
+
+# Create metadata for the index.json file
+metadata = {
+    "total_size": total_size,
+    "weight_map": weight_map
+}
+
+# Save the metadata to index.json
+index_path = path_dst + '/index.json'
+with open(index_path, 'w') as f:
+    json.dump(metadata, f, indent=4)
+
+print(f"Metadata has been saved to {index_path}")
+
+config = {
+    "architectures": [
+        "WavTokenizerDec"
+    ],
+    "hidden_size": 1282,
+    "n_embd_features": 512,
+    "n_ff": 2304,
+    "vocab_size": 4096,
+    "n_head": 1,
+    "layer_norm_epsilon": 1e-6,
+    "group_norm_epsilon": 1e-6,
+    "group_norm_groups": 32,
+    "max_position_embeddings": 8192, # ?
+    "n_layer": 12,
+    "posnet": {
+        "n_embd": 768,
+        "n_layer": 6
+    },
+    "convnext": {
+        "n_embd": 768,
+        "n_layer": 12
+    },
+}
+
+with open(path_dst + '/config.json', 'w') as f:
+    json.dump(config, f, indent=4)
+
+print(f"Config has been saved to {path_dst + 'config.json'}")
diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py
new file mode 100644
index 000000000..3791f9fc3
--- /dev/null
+++ b/examples/tts/tts-outetts.py
@@ -0,0 +1,299 @@
+import sys
+#import json
+#import struct
+import requests
+import re
+import struct
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor
+
+
+def fill_hann_window(size, periodic=True):
+    if periodic:
+        return np.hanning(size + 1)[:-1]
+    return np.hanning(size)
+
+
+def irfft(n_fft, complex_input):
+    return np.fft.irfft(complex_input, n=n_fft)
+
+
+def fold(buffer, n_out, n_win, n_hop, n_pad):
+    result = np.zeros(n_out)
+    n_frames = len(buffer) // n_win
+
+    for i in range(n_frames):
+        start = i * n_hop
+        end = start + n_win
+        result[start:end] += buffer[i * n_win:(i + 1) * n_win]
+
+    return result[n_pad:-n_pad] if n_pad > 0 else result
+
+
+def process_frame(args):
+    l, n_fft, ST, hann = args
+    frame = irfft(n_fft, ST[l])
+    frame = frame * hann
+    hann2 = hann * hann
+    return frame, hann2
+
+
+def embd_to_audio(embd, n_codes, n_embd, n_thread=4):
+    embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)
+
+    n_fft = 1280
+    n_hop = 320
+    n_win = 1280
+    n_pad = (n_win - n_hop) // 2
+    n_out = (n_codes - 1) * n_hop + n_win
+
+    hann = fill_hann_window(n_fft, True)
+
+    E = np.zeros((n_embd, n_codes), dtype=np.float32)
+    for l in range(n_codes):
+        for k in range(n_embd):
+            E[k, l] = embd[l, k]
+
+    half_embd = n_embd // 2
+    S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64)
+
+    for k in range(half_embd):
+        for l in range(n_codes):
+            mag = E[k, l]
+            phi = E[k + half_embd, l]
+
+            mag = np.clip(np.exp(mag), 0, 1e2)
+            S[l, k] = mag * np.exp(1j * phi)
+
+    res = np.zeros(n_codes * n_fft)
+    hann2_buffer = np.zeros(n_codes * n_fft)
+
+    with ThreadPoolExecutor(max_workers=n_thread) as executor:
+        args = [(l, n_fft, S, hann) for l in range(n_codes)]
+        results = list(executor.map(process_frame, args))
+
+        for l, (frame, hann2) in enumerate(results):
+            res[l*n_fft:(l+1)*n_fft] = frame
+            hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2
+
+    audio = fold(res, n_out, n_win, n_hop, n_pad)
+    env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad)
+
+    mask = env > 1e-10
+    audio[mask] /= env[mask]
+
+    return audio
+
+
+def save_wav(filename, audio_data, sample_rate):
+    num_channels = 1
+    bits_per_sample = 16
+    bytes_per_sample = bits_per_sample // 8
+    data_size = len(audio_data) * bytes_per_sample
+    byte_rate = sample_rate * num_channels * bytes_per_sample
+    block_align = num_channels * bytes_per_sample
+    chunk_size = 36 + data_size  # 36 = size of header minus first 8 bytes
+
+    header = struct.pack(
+        '<4sI4s4sIHHIIHH4sI',
+        b'RIFF',
+        chunk_size,
+        b'WAVE',
+        b'fmt ',
+        16,                # fmt chunk size
+        1,                 # audio format (PCM)
+        num_channels,
+        sample_rate,
+        byte_rate,
+        block_align,
+        bits_per_sample,
+        b'data',
+        data_size
+    )
+
+    audio_data = np.clip(audio_data * 32767, -32768, 32767)
+    pcm_data = audio_data.astype(np.int16)
+
+    with open(filename, 'wb') as f:
+        f.write(header)
+        f.write(pcm_data.tobytes())
+
+
+def process_text(text: str):
+    text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed
+    text = re.sub(r'[-_/,\.\\]', ' ', text)
+    text = re.sub(r'[^a-z\s]', '', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text.split()
+
+# usage:
+# python tts-outetts.py http://server-llm:port http://server-dec:port "text"
+
+if len(sys.argv) <= 3:
+    print("usage: python tts-outetts.py http://server-llm:port http://server-dec:port \"text\"")
+    exit(1)
+
+host_llm = sys.argv[1]
+host_dec = sys.argv[2]
+text = sys.argv[3]
+
+prefix = """<|im_start|>
+<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>"""
+
+words = process_text(text)
+words = "<|text_sep|>".join([i.strip() for i in words])
+words += "<|text_end|>\n"
+
+# voice data
+# TODO: load from json
+#suffix = """<|audio_start|>
+#the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
+#overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
+#package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
+#from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
+#just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
+#two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
+#people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
+#is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
+#pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
+#remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
+#sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
+#i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
+#have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
+#some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
+#critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
+#about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
+#some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
+#of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
+#the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
+#gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
+#aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
+#but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
+#its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
+#still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
+#really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
+#enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
+#and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
+#it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
+#looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
+#lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>"""
+
+# TODO: tokenization is slow for some reason - here is pre-tokenized input
+suffix = [ 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, 152460, 153375, 151670, 198, 74455,
+          155808, 151669, 151799, 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, 151970, 153413,
+          152419, 153334, 153289, 153374, 153199, 152040, 153260, 152721, 152680, 153297, 152419, 153248, 152400,
+          152691, 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, 152256, 152991, 152299, 152688, 153163,
+          153016, 152789, 153198, 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, 152461, 153321,
+          153309, 151750, 152137, 153340, 152573, 152267, 153347, 151789, 152681, 153339, 151992, 152512, 151751,
+          152179, 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, 152311, 151670, 198, 1499, 155791,
+          151669, 152276, 152454, 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, 153043, 152325,
+          153267, 152622, 151670, 198, 4250, 155797, 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
+          152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, 152112, 153204, 151722, 152542, 151670, 198,
+          19789, 155796, 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, 152191, 151734, 152312, 152810,
+          152237, 153224, 153169, 153224, 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, 152265, 151946,
+          151808, 152412, 152363, 152305, 153156, 152733, 152810, 153157, 152016, 152100, 152069, 153234, 152317,
+          152589, 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, 153376, 152272, 152433, 152325,
+          151941, 151670, 198, 285, 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, 152474, 152680,
+          152157, 153255, 152324, 151682, 151670, 198, 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
+          152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, 153070, 151883, 152890, 152489, 153144,
+          153375, 152358, 151685, 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, 151902, 152720,
+          153377, 152027, 152378, 152821, 153207, 153459, 153028, 153068, 152507, 153255, 152158, 152921, 151958,
+          152609, 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, 152606, 152162, 152186, 153071,
+          152244, 153118, 153375, 153018, 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, 153380,
+          153502, 152702, 152115, 153181, 152735, 153277, 153457, 152393, 153112, 152595, 151670, 198, 19098, 155808,
+          151669, 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, 153163, 152922, 153402, 152034,
+          152591, 153438, 152215, 151673, 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, 152718,
+          152862, 153347, 151670, 198, 72, 155780, 151669, 151795, 152111, 152746, 152377, 153471, 152309, 151670, 198,
+          19016, 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, 152939, 152536, 152091, 151815, 152733,
+          151672, 151670, 198, 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, 153504, 152589, 153333,
+          151839, 151941, 153038, 153180, 151670, 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, 152801,
+          152985, 153400, 152393, 152818, 152765, 152249, 152600, 151699, 152302, 152752, 153018, 153009, 151992,
+          153054, 152847, 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, 152048, 152757, 152428,
+          153195, 151906, 153006, 153178, 153250, 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
+          152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, 153321, 152217, 153039, 152935, 153400, 152122,
+          152531, 153106, 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, 152901, 152885, 152594,
+          153446, 153080, 151670, 198, 14689, 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, 151673,
+          151690, 151698, 152714, 152846, 152981, 153171, 153384, 153364, 153188, 153246, 151670, 198, 1055, 155779,
+          151669, 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, 155780, 151669, 153483, 153240, 152241,
+          152558, 152697, 153046, 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, 153034, 153434,
+          153372, 153347, 151887, 152453, 152758, 152133, 152510, 152694, 152431, 152321, 153088, 152676, 152223,
+          152581, 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, 152903, 152859, 152989, 151748,
+          152669, 152661, 152650, 152409, 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, 152988,
+          152894, 151819, 152391, 153019, 152058, 153062, 153230, 151826, 152112, 152306, 152264, 152769, 153390,
+          152384, 152435, 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, 151919, 151893, 152558,
+          152817, 152946, 152956, 152129, 152715, 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
+          151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, 152679, 152533, 152382, 152374, 152611, 153341,
+          153163, 152285, 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, 151669, 151764, 152360, 153295,
+          152634, 153342, 152199, 152271, 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, 152016, 152385,
+          152629, 152495, 151826, 153321, 152958, 152180, 151886, 153432, 152922, 152128, 153024, 153040, 152593,
+          152287, 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, 152680, 153331, 151699, 152316, 152938,
+          152289, 152433, 153384, 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, 152489, 151941,
+          152049, 152034, 153053, 152179, 153160, 151676, 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
+          152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, 153135, 152291, 153235, 152143, 152583,
+          152402, 153483, 152678, 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, 152548, 153442,
+          152109, 152659, 153325, 152781, 152570, 152957, 151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
+          151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, 151792, 153409, 153327, 152990, 151670, 198,
+          275, 155781, 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, 151670, 198, 94273, 155799,
+          151669, 152953, 152938, 153427, 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, 152257,
+          152987, 152777, 153448, 152408, 151696, 152408, 152326, 152699, 151670, 198, 385, 16239, 155828, 151669,
+          152306, 152268, 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, 152918, 152923, 152467,
+          152331, 153053, 153330, 151889, 153444, 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
+          152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, 152109, 152255, 151739, 152267, 152759,
+          153318, 153165, 153349, 151670, ]
+
+response = requests.post(
+    host_llm + "/completion",
+    json={
+        "prompt": [prefix + words, *suffix],
+        "n_predict": 1024,
+        "cache_prompt": True,
+        "return_tokens": True,
+        "samplers": ["top_k"],
+        "top_k": 16,
+        "seed": 1003,
+    }
+)
+
+response_json = response.json()
+
+#print(json.dumps(response_json, indent=4))
+#print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n"))
+#print(json.dumps(response_json["timings"], indent=4))
+#print(json.dumps(response_json["tokens"], indent=4))
+
+codes = response_json["tokens"]
+
+codes = [t - 151672 for t in codes if t >= 151672 and t <= 155772]
+
+response = requests.post(
+    host_dec + "/embeddings",
+    json={
+        "input": [*codes],
+    }
+)
+
+response_json = response.json()
+
+#print(json.dumps(response_json, indent=4))
+
+# spectrogram
+embd = response_json[0]["embedding"]
+
+n_codes = len(embd)
+n_embd = len(embd[0])
+
+print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
+
+# post-process the spectrogram to convert to audio
+print('converting to audio ...')
+audio = embd_to_audio(embd, n_codes, n_embd)
+print('audio generated: %d samples' % len(audio))
+
+filename = "output.wav"
+sample_rate = 24000 # sampling rate
+
+# zero out first 0.25 seconds
+audio[:24000 // 4] = 0.0
+
+save_wav(filename, audio, sample_rate)
+print('audio written to file "%s"' % filename)
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
new file mode 100644
index 000000000..f78f76303
--- /dev/null
+++ b/examples/tts/tts.cpp
@@ -0,0 +1,973 @@
+#include "arg.h"
+#include "common.h"
+#include "sampling.h"
+#include "log.h"
+#include "llama.h"
+
+#define _USE_MATH_DEFINES // For M_PI on MSVC
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <string>
+#include <thread>
+#include <vector>
+
+//
+// Terminal utils
+//
+
+#define SQR(X)    ((X) * (X))
+#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
+
+/**
+ * Quantizes 24-bit RGB to xterm256 code range [16,256).
+ */
+static int rgb2xterm256(int r, int g, int b) {
+    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
+    int av, ir, ig, ib, il, qr, qg, qb, ql;
+    av = r * .299 + g * .587 + b * .114 + .5;
+    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
+    qr = cube[(ir = UNCUBE(r))];
+    qg = cube[(ig = UNCUBE(g))];
+    qb = cube[(ib = UNCUBE(b))];
+    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
+        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
+        return ir * 36 + ig * 6 + ib + 020;
+    return il + 0350;
+}
+
+static std::string set_xterm256_foreground(int r, int g, int b) {
+    int x = rgb2xterm256(r, g, b);
+    std::ostringstream oss;
+    oss << "\033[38;5;" << x << "m";
+    return oss.str();
+}
+
+const std::vector<std::string> k_colors = {
+    set_xterm256_foreground(220,   5,  12),
+    set_xterm256_foreground(232,  96,  28),
+    set_xterm256_foreground(241, 147,  45),
+    set_xterm256_foreground(246, 193,  65),
+    set_xterm256_foreground(247, 240,  86),
+    set_xterm256_foreground(144, 201, 135),
+    set_xterm256_foreground( 78, 178, 101),
+};
+
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello!\"\n", argv[0]);
+    LOG("\n");
+}
+
+struct wav_header {
+    char riff[4] = {'R', 'I', 'F', 'F'};
+    uint32_t chunk_size;
+    char wave[4] = {'W', 'A', 'V', 'E'};
+    char fmt[4] = {'f', 'm', 't', ' '};
+    uint32_t fmt_chunk_size = 16;
+    uint16_t audio_format = 1; // PCM
+    uint16_t num_channels = 1; // Mono
+    uint32_t sample_rate;
+    uint32_t byte_rate;
+    uint16_t block_align;
+    uint16_t bits_per_sample = 16;
+    char data[4] = {'d', 'a', 't', 'a'};
+    uint32_t data_size;
+};
+
+static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
+    std::ofstream file(fname, std::ios::binary);
+    if (!file) {
+        LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
+        return;
+    }
+
+    wav_header header;
+    header.sample_rate = sample_rate;
+    header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8);
+    header.block_align = header.num_channels * (header.bits_per_sample / 8);
+    header.data_size = data.size() * (header.bits_per_sample / 8);
+    header.chunk_size = 36 + header.data_size;
+
+    file.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+    for (const auto & sample : data) {
+        int16_t pcm_sample = static_cast<int16_t>(std::clamp(sample * 32767.0, -32768.0, 32767.0));
+        file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
+    }
+
+    file.close();
+}
+
+static void fill_hann_window(int length, bool periodic, float * output) {
+    int offset = -1;
+    if (periodic) {
+        offset = 0;
+    }
+    for (int i = 0; i < length; i++) {
+        output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+    }
+}
+
+// very poor-man fft
+static void twiddle(float * real, float * imag, int k, int N) {
+    float angle = 2 * M_PI * k / N;
+    *real = cos(angle);
+    *imag = sin(angle);
+}
+
+static void irfft(int n, const float * inp_cplx, float * out_real) {
+    int N = n / 2 + 1;
+
+    std::vector<float> real_input(N);
+    std::vector<float> imag_input(N);
+    for (int i = 0; i < N; ++i) {
+        real_input[i] = inp_cplx[2 * i];
+        imag_input[i] = inp_cplx[2 * i + 1];
+    }
+
+    std::vector<float> real_output(n);
+    std::vector<float> imag_output(n);
+
+    for (int k = 0; k < n; ++k) {
+        real_output[k] = 0.0f;
+        imag_output[k] = 0.0f;
+        for (int m = 0; m < N; ++m) {
+            float twiddle_real;
+            float twiddle_imag;
+
+            twiddle(&twiddle_real, &twiddle_imag, k * m, n);
+
+            real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
+            imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
+        }
+    }
+
+    for (int i = 0; i < n; ++i) {
+        out_real[i] = real_output[i] / N;
+    }
+}
+
+//
+//  y = torch.nn.functional.fold(
+//       data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
+//  )[:, 0, 0, pad:-pad]
+//
+// data.shape =  torch.Size([1, 1280, 261])
+// output_size =  84480
+// win_length =  1280
+// hop_length =  320
+// pad =  480
+//
+static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
+    int64_t output_height = n_out;
+    int64_t kernel_w = n_win;
+    int64_t stride_w = n_hop;
+    int64_t width    = n_out;
+
+    output.resize(width, 0.0f);
+
+    int64_t col_idx = 0;
+    for (int64_t w_col = 0; w_col < width; ++w_col) {
+        int64_t start = w_col * stride_w - n_pad;
+        int64_t end   = start + kernel_w;
+
+        for (int64_t w_im = start; w_im < end; ++w_im) {
+            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
+                output[w_im] += data[col_idx];
+            }
+            col_idx++;
+        }
+    }
+
+    output.resize(n_out - 2 * n_pad);
+}
+
+// TODO: not optimized at all
+static std::vector<float> embd_to_audio(
+        const float * embd,
+        const int n_codes,
+        const int n_embd,
+        const int n_thread) {
+    const int n_fft = 1280;
+    const int n_hop = 320;
+    const int n_win = 1280;
+    const int n_pad = (n_win - n_hop)/2;
+    const int n_out = (n_codes - 1)*n_hop + n_win;
+
+    std::vector<float> hann(n_fft);
+
+    fill_hann_window(hann.size(), true, hann.data());
+
+    int n_spec = n_embd*n_codes;
+
+    std::vector<float> E (n_spec);
+    std::vector<float> S (n_spec);
+    std::vector<float> ST(n_spec);
+
+    for (int l = 0; l < n_codes; ++l) {
+        for (int k = 0; k < n_embd; ++k) {
+            E[k*n_codes + l] = embd[l*n_embd + k];
+        }
+    }
+
+    for (int k = 0; k < n_embd/2; ++k) {
+        for (int l = 0; l < n_codes; ++l) {
+            float mag = E[(k           )*n_codes + l];
+            float phi = E[(k + n_embd/2)*n_codes + l];
+
+            mag = exp(mag);
+
+            if (mag > 1e2) {
+                mag = 1e2;
+            }
+            S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
+            S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
+        }
+    }
+
+    for (int l = 0; l < n_codes; ++l) {
+        for (int k = 0; k < n_embd/2; ++k) {
+            ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
+            ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
+        }
+    }
+
+    std::vector<float> res  (n_codes*n_fft);
+    std::vector<float> hann2(n_codes*n_fft);
+
+    std::vector<std::thread> workers(n_thread);
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i] = std::thread([&, i]() {
+            for (int l = i; l < n_codes; l += n_thread) {
+                irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
+                for (int j = 0; j < n_fft; ++j) {
+                    res  [l*n_fft + j] *= hann[j];
+                    hann2[l*n_fft + j]  = hann[j] * hann[j];
+                }
+            }
+        });
+    }
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i].join();
+    }
+
+    std::vector<float> audio;
+    std::vector<float> env;
+
+    fold(res,   n_out, n_win, n_hop, n_pad, audio);
+    fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+
+    for (size_t i = 0; i < audio.size(); ++i) {
+        audio[i] /= env[i];
+    }
+
+    return audio;
+}
+
+static const std::map<int, std::string> ones = {
+    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
+    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
+    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
+    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
+};
+
+static const std::map<int, std::string> tens = {
+    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
+    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
+};
+
+// Convert a number less than 1000 to words
+static std::string convert_less_than_thousand(int num) {
+    std::string result;
+
+    if (num >= 100) {
+        result += ones.at(num / 100) + " hundred ";
+        num %= 100;
+    }
+
+    if (num >= 20) {
+        result += tens.at(num / 10);
+        if (num % 10 > 0) {
+            result += "-" + ones.at(num % 10);
+        }
+    } else if (num > 0) {
+        result += ones.at(num);
+    }
+
+    return result;
+}
+
+static std::string number_to_words(const std::string & number_str) {
+    try {
+        size_t decimal_pos = number_str.find('.');
+        std::string integer_part = number_str.substr(0, decimal_pos);
+
+        int int_number = std::stoi(integer_part);
+        std::string result;
+
+        if (int_number == 0) {
+            result = "zero";
+        } else {
+            if (int_number >= 1000000000) {
+                int billions = int_number / 1000000000;
+                result += convert_less_than_thousand(billions) + " billion ";
+                int_number %= 1000000000;
+            }
+
+            if (int_number >= 1000000) {
+                int millions = int_number / 1000000;
+                result += convert_less_than_thousand(millions) + " million ";
+                int_number %= 1000000;
+            }
+
+            if (int_number >= 1000) {
+                int thousands = int_number / 1000;
+                result += convert_less_than_thousand(thousands) + " thousand ";
+                int_number %= 1000;
+            }
+
+            if (int_number > 0) {
+                result += convert_less_than_thousand(int_number);
+            }
+        }
+
+        // Handle decimal part
+        if (decimal_pos != std::string::npos) {
+            result += " point";
+            std::string decimal_part = number_str.substr(decimal_pos + 1);
+            for (char digit : decimal_part) {
+                result += " " + ones.at(digit - '0');
+            }
+        }
+
+        return result;
+    } catch (const std::exception& e) {
+        // Skip if fails
+        return " ";
+    }
+}
+
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+static std::string process_text(const std::string & text) {
+
+    // For now I skipped text romanization as I am unsure how to handle
+    // uroman and MeCab implementations in C++
+    // maybe something like https://github.com/anyascii/anyascii/ could work.
+    // currently only English would be supported in this function
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                  processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+
+    /*
+        Replace spaces with the separator token same as in line 365
+
+        for (auto & c : prompt_user) {
+        if (c == ' ') {
+            prompt_clean += "<|text_sep|>";
+    */
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
+
+    return processed_text;
+}
+
+static void prompt_add(llama_tokens & prompt, llama_token token) {
+    prompt.push_back(token);
+}
+
+static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
+    prompt.insert(prompt.end(), tokens.begin(), tokens.end());
+}
+
+static void prompt_add(llama_tokens & prompt, const llama_vocab * vocab, const std::string & txt, bool add_special, bool parse_special) {
+    auto tmp = common_tokenize(vocab, txt, add_special, parse_special);
+    prompt_add(prompt, tmp);
+}
+
+static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
+    prompt.clear();
+
+    prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
+}
+
+static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
+    const std::string& delimiter = "<|text_sep|>";
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    //first token is always a newline, as it was not previously added
+    result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
+
+    while (end != std::string::npos) {
+        std::string current_word = str.substr(start, end - start);
+        auto tmp = common_tokenize(vocab, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = str.substr(start);
+    auto tmp = common_tokenize(vocab, current_word, false, true);
+    if (tmp.size() > 0) {
+        result.push_back(tmp[0]);
+    }
+    return result;
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.prompt = "";
+
+    params.n_predict = 4096;
+    params.n_batch   = 8192;
+    params.n_ctx     = 8192;
+
+    params.sampling.top_k = 4;
+    params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, };
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
+        return 1;
+    }
+
+    const int n_parallel = params.n_parallel;
+    const int n_predict  = params.n_predict;
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model_ttc = NULL; // text-to-codes
+    llama_model * model_cts = NULL; // codes-to-speech
+
+    llama_context * ctx_ttc = NULL;
+    llama_context * ctx_cts = NULL;
+
+    common_init_result llama_init_ttc = common_init_from_params(params);
+
+    model_ttc = llama_init_ttc.model.get();
+    ctx_ttc   = llama_init_ttc.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
+
+    // TODO: refactor in a common struct
+    params.model     = params.vocoder.model;
+    params.model_url = params.vocoder.model_url;
+    params.hf_repo   = params.vocoder.hf_repo;
+    params.hf_file   = params.vocoder.hf_file;
+
+    params.embedding = true;
+
+    common_init_result llama_init_cts = common_init_from_params(params);
+
+    model_cts = llama_init_cts.model.get();
+    ctx_cts   = llama_init_cts.context.get();
+
+    std::vector<common_sampler *> smpl(n_parallel);
+    for (int i = 0; i < n_parallel; ++i) {
+        params.sampling.no_perf = (i != 0);
+        params.sampling.seed = params.sampling.seed + 1;
+
+        smpl[i] = common_sampler_init(model_ttc, params.sampling);
+    }
+
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl[0]));
+    LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl[0]).c_str());
+
+    LOG_INF("%s: loading done\n", __func__);
+
+    const auto t_main_start = ggml_time_us();
+
+    std::vector<llama_token> codes;
+    std::vector<llama_token> guide_tokens;
+
+    // process prompt and generate voice codes
+    {
+        LOG_INF("%s: constructing prompt ..\n", __func__);
+
+        std::vector<llama_token> prompt_inp;
+
+        prompt_init(prompt_inp, vocab);
+
+        prompt_add(prompt_inp, vocab, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
+
+        // convert the input text into the necessary format expected by OuteTTS
+        {
+            std::string prompt_clean = process_text(params.prompt);
+            if (params.vocoder.use_guide_tokens) {
+                guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
+            }
+
+            LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
+
+            prompt_add(prompt_inp, vocab, prompt_clean, false, true);
+        }
+
+        prompt_add(prompt_inp, vocab, "<|text_end|>\n", false, true);
+
+        // disabled to save time on tokenizing each time
+        // TODO: load voices from the json files
+#if 0
+        const std::string voice_data = R"(<|audio_start|>
+the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
+overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
+package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
+from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
+just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
+two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
+people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
+is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
+pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
+remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
+sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
+i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
+have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
+some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
+critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
+about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
+some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
+of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
+the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
+gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
+aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
+but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
+its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
+still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
+really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
+enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
+and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
+it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
+looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
+lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
+
+        auto tmp = common_tokenize(vocab, voice_data, false, true);
+        printf("\n\n");
+        for (int i = 0; i < tmp.size(); ++i) {
+            printf("%d, ", tmp[i]);
+        }
+        printf("\n\n");
+#else
+        prompt_add(prompt_inp, llama_tokens {
+            151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585,
+            152460, 153375, 151670, 198, 74455, 155808, 151669, 151799,
+            151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470,
+            151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040,
+            153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691,
+            153368, 153437, 151670, 198, 1722, 155828, 151669, 152607,
+            152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198,
+            152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207,
+            152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267,
+            153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179,
+            153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904,
+            152311, 151670, 198, 1499, 155791, 151669, 152276, 152454,
+            153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226,
+            153043, 152325, 153267, 152622, 151670, 198, 4250, 155797,
+            151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271,
+            152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213,
+            152112, 153204, 151722, 152542, 151670, 198, 19789, 155796,
+            151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002,
+            152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224,
+            152244, 153387, 153404, 151670, 198, 16069, 155811, 151669,
+            152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733,
+            152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589,
+            152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504,
+            153376, 152272, 152433, 152325, 151941, 151670, 198, 285,
+            155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381,
+            152474, 152680, 152157, 153255, 152324, 151682, 151670, 198,
+            32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682,
+            152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488,
+            153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685,
+            152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669,
+            151902, 152720, 153377, 152027, 152378, 152821, 153207, 153459,
+            153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609,
+            152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470,
+            152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018,
+            152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736,
+            153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457,
+            152393, 153112, 152595, 151670, 198, 19098, 155808, 151669,
+            152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239,
+            153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673,
+            152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482,
+            152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795,
+            152111, 152746, 152377, 153471, 152309, 151670, 198, 19016,
+            155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701,
+            152939, 152536, 152091, 151815, 152733, 151672, 151670, 198,
+            14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042,
+            153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670,
+            198, 36996, 8303, 155832, 151669, 152231, 152256, 152835,
+            152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600,
+            151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847,
+            153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458,
+            152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250,
+            152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418,
+            152228, 152733, 151670, 198, 9096, 155801, 151669, 151698,
+            153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106,
+            152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851,
+            152901, 152885, 152594, 153446, 153080, 151670, 198, 14689,
+            155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191,
+            151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384,
+            153364, 153188, 153246, 151670, 198, 1055, 155779, 151669,
+            151869, 152388, 152711, 153334, 151736, 151670, 198, 1782,
+            155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046,
+            151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605,
+            153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133,
+            152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581,
+            152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032,
+            152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409,
+            151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469,
+            152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230,
+            151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435,
+            152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540,
+            151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715,
+            153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450,
+            151670, 198, 8088, 155792, 151669, 152452, 153497, 153353,
+            152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285,
+            153411, 152495, 153141, 152320, 151670, 198, 1199, 155781,
+            151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271,
+            151670, 198, 43366, 155799, 151669, 152308, 151682, 152889,
+            152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180,
+            151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287,
+            151677, 151670, 198, 53660, 155808, 151669, 151727, 152092,
+            152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384,
+            151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691,
+            152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676,
+            153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350,
+            152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234,
+            153135, 152291, 153235, 152143, 152583, 152402, 153483, 152678,
+            152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825,
+            152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957,
+            151752, 152265, 153381, 152515, 151670, 198, 437, 155787,
+            151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174,
+            151792, 153409, 153327, 152990, 151670, 198, 275, 155781,
+            151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974,
+            151670, 198, 94273, 155799, 151669, 152953, 152938, 153427,
+            152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331,
+            152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326,
+            152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268,
+            153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110,
+            152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444,
+            152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751,
+            152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499,
+            152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349,
+            151670,});
+#endif
+
+        // print the prompt token-by-token
+
+        LOG("\n");
+
+        for (auto id : prompt_inp) {
+            LOG("%s", common_token_to_piece(ctx_ttc, id).c_str());
+        }
+
+        LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size());
+
+        LOG("\n");
+
+        // create a llama_batch
+        // we use this object to submit token data for decoding
+        llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel);
+
+        std::vector<llama_seq_id> seq_ids(n_parallel, 0);
+        for (int32_t i = 0; i < n_parallel; ++i) {
+            seq_ids[i] = i;
+        }
+
+        // evaluate the initial prompt
+        for (size_t i = 0; i < prompt_inp.size(); ++i) {
+            common_batch_add(batch, prompt_inp[i], i, seq_ids, false);
+        }
+        GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());
+
+        // llama_decode will output logits only for the last token of the prompt
+        batch.logits[batch.n_tokens - 1] = true;
+
+        if (llama_decode(ctx_ttc, batch) != 0) {
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+
+        if (n_parallel > 1) {
+            LOG_INF("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        }
+
+        llama_synchronize(ctx_ttc);
+
+        LOG_INF("%s: time for prompt: %.3f ms\n\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
+
+        const auto t_dec_start = ggml_time_us();
+
+        // main loop
+
+        // remember the batch index of the last token for each parallel sequence
+        // we need this to determine which logits to sample from
+        std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
+
+        int n_past   = batch.n_tokens;
+        int n_decode = 0;
+
+        bool next_token_uses_guide_token = true;
+
+        while (n_decode <= n_predict) {
+            // prepare the next batch
+            common_batch_clear(batch);
+
+            // sample the next token for each parallel sequence / stream
+            for (int32_t i = 0; i < n_parallel; ++i) {
+                if (i_batch[i] < 0) {
+                    // the stream has already finished
+                    continue;
+                }
+
+                llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+
+                //guide tokens help prevent hallucinations by forcing the TTS to use the correct word
+                if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
+                    llama_token guide_token = guide_tokens[0];
+                    guide_tokens.erase(guide_tokens.begin());
+                    new_token_id = guide_token; //ensure correct word fragment is used
+                }
+
+                //this is the token id that always precedes a new word
+                next_token_uses_guide_token = (new_token_id == 198);
+
+                common_sampler_accept(smpl[i], new_token_id, true);
+
+                codes.push_back(new_token_id);
+
+                const auto * cands = common_sampler_get_candidates(smpl[i]);
+
+                // is it an end of generation? -> mark the stream as finished
+                if (llama_vocab_is_eog(vocab, new_token_id) || n_decode == n_predict) {
+                    std::string reason;
+                    if (llama_vocab_is_eog(vocab, new_token_id)) {
+                        reason = "eos";
+                    } else {
+                        reason = "n_predict";
+                    }
+
+                    i_batch[i] = -1;
+
+                    LOG("\n");
+                    if (n_parallel > 1) {
+                        LOG_CNT("\n");
+                        LOG_INF("%s: stream %d finished at n_past = %d, reason = '%s'\n", __func__, i, n_past, reason.c_str());
+                    }
+
+                    continue;
+                }
+
+                {
+                    const float p = cands->data[cands->selected].p;
+
+                    const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) ((3*p)*float(k_colors.size()))));
+
+                    LOG_CNT("%s%d%s", k_colors[col].c_str(), i, "\033[0m");
+                    //LOG_CNT("%d", i);
+                }
+
+                i_batch[i] = batch.n_tokens;
+
+                // push this new token for next evaluation
+                common_batch_add(batch, new_token_id, n_past, { i }, true);
+            }
+
+            // all streams are finished
+            if (batch.n_tokens == 0) {
+                break;
+            }
+
+            n_decode += 1;
+            n_past += 1;
+
+            // evaluate the current batch with the transformer model
+            if (llama_decode(ctx_ttc, batch)) {
+                LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+                return 1;
+            }
+        }
+
+        llama_batch_free(batch);
+
+        LOG("\n");
+        LOG_INF("%s: time for decoder:       %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f);
+    }
+
+    common_perf_print(ctx_ttc, smpl[0]);
+
+    //std::vector<llama_token> codes = {198, 88225, 155856, 151669, 152205,
+    //    153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695,
+    //    153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010,
+    //    153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286,
+    //    152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296,
+    //    153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690,
+    //    153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061,
+    //    153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670,
+    //    198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683,
+    //    152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908,
+    //    151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359,
+    //    153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424,
+    //    151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670,
+    //    198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729,
+    //    152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669,
+    //    153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670,
+    //    198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501,
+    //    152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242,
+    //    153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360,
+    //    153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055,
+    //    152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670,
+    //    198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441,
+    //    152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831,
+    //    153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133,
+    //    153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109,
+    //    152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055,
+    //    155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729,
+    //    151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337,
+    //    153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153,
+    //    153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365,
+    //    153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218,
+    //    152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464,
+    //    152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855,
+    //    152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418,
+    //    153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645};
+
+    {
+        const std::string inp_txt = common_detokenize(ctx_ttc, codes, true);
+
+        LOG("\n");
+        LOG_INF("codes: '%s'\n", inp_txt.c_str());
+        LOG_INF("%s: codes size: %d\n", __func__, (int) codes.size());
+    }
+
+    // remove all non-audio tokens (i.e. < 151672 || > 155772)
+    codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
+
+    {
+        const std::string inp_txt = common_detokenize(ctx_ttc, codes, true);
+        LOG_INF("codes audio: '%s'\n", inp_txt.c_str());
+        LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size());
+    }
+
+    for (auto & token : codes) {
+        token -= 151672;
+    }
+
+    const auto t_voc_start = ggml_time_us();
+
+    const int n_codes = codes.size();
+
+    llama_batch batch = llama_batch_init(n_codes, 0, 1);
+
+    for (size_t i = 0; i < codes.size(); ++i) {
+        common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits?
+    }
+    GGML_ASSERT(batch.n_tokens == n_codes);
+
+    if (llama_decode(ctx_cts, batch) != 0) {
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
+    llama_synchronize(ctx_cts);
+
+    LOG_INF("%s: time for vocoder:      %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f);
+
+    const auto t_spec_start = ggml_time_us();
+
+#if 1
+    // spectral operations
+    const int n_embd = llama_model_n_embd(model_cts);
+    const float * embd = llama_get_embeddings(ctx_cts);
+
+    auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
+
+#else
+    // read the spectrogram from a file for debugging purposes
+    std::vector<float> audio;
+    {
+        std::ifstream fin("out.bin", std::ios::binary);
+        if (!fin) {
+            LOG_ERR("%s: failed to open file '%s'\n", __func__, "out.bin");
+            return 1;
+        }
+
+        std::vector<float> embd;
+
+        int n_codes;
+        int n_embd;
+
+        fin.read(reinterpret_cast<char *>(&n_codes), sizeof(int));
+        fin.read(reinterpret_cast<char *>(&n_embd), sizeof(int));
+
+        embd.resize(n_codes * n_embd);
+        fin.read(reinterpret_cast<char *>(embd.data()), n_codes * n_embd * sizeof(float));
+        fin.close();
+
+        LOG_INF("%s: n_codes: %d, n_embd: %d\n", __func__, n_codes, n_embd);
+
+        audio = embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads);
+    }
+#endif
+
+    const std::string fname = "output.wav";
+
+    const int n_sr = 24000; // sampling rate
+
+    // zero out first 0.25 seconds
+    for (int i = 0; i < 24000/4; ++i) {
+        audio[i] = 0.0f;
+    }
+
+    LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
+    LOG_INF("%s: total time:            %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
+
+    save_wav16(fname, audio, n_sr);
+
+    LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/flake.lock b/flake.lock
index 9f659ba8f..d114f4422 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1706830856,
-        "narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=",
+        "lastModified": 1730504689,
+        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f",
+        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1708655239,
-        "narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
+        "lastModified": 1732014248,
+        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
+        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
         "type": "github"
       },
       "original": {
@@ -36,20 +36,14 @@
     },
     "nixpkgs-lib": {
       "locked": {
-        "dir": "lib",
-        "lastModified": 1706550542,
-        "narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "97b17f32362e475016f942bbdfda4a4a72a8a652",
-        "type": "github"
+        "lastModified": 1730504152,
+        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
       },
       "original": {
-        "dir": "lib",
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
+        "type": "tarball",
+        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
       }
     },
     "root": {
diff --git a/flake.nix b/flake.nix
index dc4e503c3..26a258816 100644
--- a/flake.nix
+++ b/flake.nix
@@ -63,7 +63,7 @@
   # nix-repl> :lf github:ggerganov/llama.cpp
   # Added 13 variables.
   # nix-repl> outputs.apps.x86_64-linux.quantize
-  # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; }
+  # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/llama-quantize"; type = "app"; }
   # ```
   outputs =
     { self, flake-parts, ... }@inputs:
@@ -107,11 +107,12 @@
         # ```
         #
         # Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format
-        flake.overlays.default =
-          (final: prev: {
+        flake.overlays.default = (
+          final: prev: {
             llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
             inherit (final.llamaPackages) llama-cpp;
-          });
+          }
+        );
 
         systems = [
           "aarch64-darwin"
@@ -131,6 +132,9 @@
             ...
           }:
           {
+            # For standardised reproducible formatting with `nix fmt`
+            formatter = pkgs.nixfmt-rfc-style;
+
             # Unlike `.#packages`, legacyPackages may contain values of
             # arbitrary types (including nested attrsets) and may even throw
             # exceptions. This attribute isn't recursed into by `nix flake
@@ -141,6 +145,9 @@
             # the same path you would with an overlay.
             legacyPackages = {
               llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+              llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix {
+                inherit llamaVersion;
+              };
               llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
               llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
             };
@@ -151,9 +158,10 @@
               {
                 default = config.legacyPackages.llamaPackages.llama-cpp;
                 vulkan = config.packages.default.override { useVulkan = true; };
+                windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
+                python-scripts = config.legacyPackages.llamaPackages.python-scripts;
               }
               // lib.optionalAttrs pkgs.stdenv.isLinux {
-                opencl = config.packages.default.override { useOpenCL = true; };
                 cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
 
                 mpi-cpu = config.packages.default.override { useMpi = true; };
@@ -164,9 +172,14 @@
               };
 
             # Packages exposed in `.#checks` will be built by the CI and by
-            # `nix flake check`. Currently we expose all packages, but we could
-            # make more granular choices
-            checks = config.packages;
+            # `nix flake check`.
+            #
+            # We could test all outputs e.g. as `checks = confg.packages`.
+            #
+            # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
+            checks = {
+              inherit (config.packages) default vulkan;
+            };
           };
       };
 }
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
deleted file mode 100644
index 0e5bf0ae1..000000000
--- a/ggml-backend-impl.h
+++ /dev/null
@@ -1,124 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
-        void (*GGML_CALL free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan (async)
-        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    struct ggml_backend {
-        ggml_guid_t guid;
-
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml-backend.c b/ggml-backend.c
deleted file mode 100644
index c86673b04..000000000
--- a/ggml-backend.c
+++ /dev/null
@@ -1,1845 +0,0 @@
-#include "ggml-backend-impl.h"
-#include "ggml-alloc.h"
-#include "ggml-impl.h"
-
-#include <assert.h>
-#include <limits.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-// backend buffer type
-
-const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name(buft);
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    return buft->iface.alloc_buffer(buft, size);
-}
-
-size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_alignment(buft);
-}
-
-size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
-    // get_max_size is optional, defaults to SIZE_MAX
-    if (buft->iface.get_max_size) {
-        return buft->iface.get_max_size(buft);
-    }
-    return SIZE_MAX;
-}
-
-GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
-    // get_alloc_size is optional, defaults to ggml_nbytes
-    if (buft->iface.get_alloc_size) {
-        size_t size = buft->iface.get_alloc_size(buft, tensor);
-        assert(size >= ggml_nbytes(tensor));
-        return size;
-    }
-    return ggml_nbytes(tensor);
-}
-
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
-bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
-    if (buft->iface.is_host) {
-        return buft->iface.is_host(buft);
-    }
-    return false;
-}
-
-// backend buffer
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-               ggml_backend_buffer_type_t      buft,
-        struct ggml_backend_buffer_i           iface,
-               ggml_backend_buffer_context_t   context,
-               size_t                          size) {
-    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
-
-    (*buffer) = (struct ggml_backend_buffer) {
-        /* .interface = */ iface,
-        /* .buft      = */ buft,
-        /* .context   = */ context,
-        /* .size      = */ size,
-        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
-    };
-
-    return buffer;
-}
-
-const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name(buffer);
-}
-
-void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
-    if (buffer == NULL) {
-        return;
-    }
-
-    if (buffer->iface.free_buffer != NULL) {
-        buffer->iface.free_buffer(buffer);
-    }
-    free(buffer);
-}
-
-size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
-    return buffer->size;
-}
-
-void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
-    void * base = buffer->iface.get_base(buffer);
-
-    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
-
-    return base;
-}
-
-GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    // init_tensor is optional
-    if (buffer->iface.init_tensor) {
-        buffer->iface.init_tensor(buffer, tensor);
-    }
-}
-
-size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
-}
-
-size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
-}
-
-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
-}
-
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
-bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
-}
-
-void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
-    buffer->usage = usage;
-
-    // FIXME: add a generic callback to the buffer interface
-    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
-        ggml_backend_multi_buffer_set_usage(buffer, usage);
-    }
-}
-
-ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
-    return buffer->buft;
-}
-
-void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
-    if (buffer->iface.reset) {
-        buffer->iface.reset(buffer);
-    }
-}
-
-bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
-    if (dst_buf->iface.cpy_tensor) {
-        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
-    }
-    return false;
-}
-
-// backend
-
-ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return NULL;
-    }
-    return backend->guid;
-}
-
-const char * ggml_backend_name(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return "NULL";
-    }
-    return backend->iface.get_name(backend);
-}
-
-void ggml_backend_free(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return;
-    }
-
-    backend->iface.free(backend);
-}
-
-ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->iface.get_default_buffer_type(backend);
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
-    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
-}
-
-size_t ggml_backend_get_alignment(ggml_backend_t backend) {
-    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
-}
-
-size_t ggml_backend_get_max_size(ggml_backend_t backend) {
-    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
-}
-
-void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (backend->iface.set_tensor_async == NULL) {
-        ggml_backend_tensor_set(tensor, data, offset, size);
-    } else {
-        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
-    }
-}
-
-void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-
-    if (backend->iface.get_tensor_async == NULL) {
-        ggml_backend_tensor_get(tensor, data, offset, size);
-    } else {
-        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
-    }
-}
-
-GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (!size) {
-        return;
-    }
-
-    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
-}
-
-GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-
-    if (!size) {
-        return;
-    }
-
-    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
-}
-
-void ggml_backend_synchronize(ggml_backend_t backend) {
-    if (backend->iface.synchronize == NULL) {
-        return;
-    }
-
-    backend->iface.synchronize(backend);
-}
-
-ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_plan_create(backend, cgraph);
-}
-
-void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_free(backend, plan);
-}
-
-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_compute(backend, plan);
-}
-
-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
-}
-
-bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return backend->iface.supports_op(backend, op);
-}
-
-// backend copy
-
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
-void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
-
-    if (src == dst) {
-        return;
-    }
-
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
-    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
-        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
-#endif
-        size_t nbytes = ggml_nbytes(src);
-        void * data = malloc(nbytes);
-        ggml_backend_tensor_get(src, data, 0, nbytes);
-        ggml_backend_tensor_set(dst, data, 0, nbytes);
-        free(data);
-    }
-}
-
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
-
-    if (src == dst) {
-        return;
-    }
-
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
-        }
-    }
-
-    size_t nbytes = ggml_nbytes(src);
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
-    }
-    else {
-        ggml_backend_tensor_copy(src, dst);
-    }
-}
-
-
-// backend registry
-
-#define GGML_MAX_BACKENDS_REG 16
-
-struct ggml_backend_reg {
-    char name[128];
-    ggml_backend_init_fn init_fn;
-    ggml_backend_buffer_type_t default_buffer_type;
-    void * user_data;
-};
-
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
-static size_t ggml_backend_registry_count = 0;
-
-GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
-
-GGML_CALL static void ggml_backend_registry_init(void) {
-    static bool initialized = false;
-
-    if (initialized) {
-        return;
-    }
-
-    initialized = true;
-
-    ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
-
-    // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
-    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
-    ggml_backend_cuda_reg_devices();
-#endif
-
-#ifdef GGML_USE_SYCL
-    extern void ggml_backend_sycl_reg_devices(void);
-    ggml_backend_sycl_reg_devices();
-#endif
-
-#ifdef GGML_USE_METAL
-    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-    ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
-#endif
-
-#ifdef GGML_USE_VULKAN
-    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
-    ggml_backend_vk_reg_devices();
-#endif
-
-#ifdef GGML_USE_KOMPUTE
-    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
-    ggml_backend_kompute_reg_devices();
-#endif
-}
-
-GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
-
-    size_t id = ggml_backend_registry_count;
-
-    ggml_backend_registry[id] = (struct ggml_backend_reg) {
-        /* .name                = */ {0},
-        /* .fn                  = */ init_fn,
-        /* .default_buffer_type = */ default_buffer_type,
-        /* .user_data           = */ user_data,
-    };
-
-    snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
-
-#ifndef NDEBUG
-    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
-#endif
-
-    ggml_backend_registry_count++;
-}
-
-size_t ggml_backend_reg_get_count(void) {
-    ggml_backend_registry_init();
-
-    return ggml_backend_registry_count;
-}
-
-size_t ggml_backend_reg_find_by_name(const char * name) {
-    ggml_backend_registry_init();
-
-    for (size_t i = 0; i < ggml_backend_registry_count; i++) {
-        // TODO: case insensitive in a portable way
-        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
-            return i;
-        }
-    }
-
-    // not found
-    return SIZE_MAX;
-}
-
-// init from backend:params string
-ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
-    ggml_backend_registry_init();
-
-    const char * params = strchr(backend_str, ':');
-    char backend_name[128];
-    if (params == NULL) {
-        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
-        params = "";
-    } else {
-        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
-        params++;
-    }
-
-    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
-
-    if (backend_i == SIZE_MAX) {
-        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
-        return NULL;
-    }
-
-    return ggml_backend_reg_init_backend(backend_i, params);
-}
-
-const char * ggml_backend_reg_get_name(size_t i) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_registry[i].name;
-}
-
-ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
-}
-
-ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_registry[i].default_buffer_type;
-}
-
-ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
-}
-
-// backend CPU
-
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
-GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CPU";
-
-    GGML_UNUSED(buffer);
-}
-
-GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
-    uintptr_t data = (uintptr_t)buffer->context;
-
-    // align the buffer
-    if (data % TENSOR_ALIGNMENT != 0) {
-        data = GGML_PAD(data, TENSOR_ALIGNMENT);
-    }
-
-    return (void *)data;
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    memcpy((char *)tensor->data + offset, data, size);
-
-    GGML_UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        memcpy(dst->data, src->data, ggml_nbytes(src));
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_name,
-    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cpu_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// for buffers from ptr, free is not called
-static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_name,
-    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
-    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cpu_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU";
-
-    GGML_UNUSED(buft);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
-    if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
-        return NULL;
-    }
-
-    return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
-}
-
-GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
-
-    GGML_UNUSED(buft);
-}
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type;
-}
-
-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buft);
-}
-
-GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buf);
-}
-
-GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    //void * ptr = hbw_malloc(size);
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context  = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-struct ggml_backend_cpu_context {
-    int n_threads;
-    void * work_data;
-    size_t work_size;
-
-    ggml_abort_callback abort_callback;
-    void *              abort_callback_data;
-};
-
-GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
-    return "CPU";
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-    free(cpu_ctx->work_data);
-    free(cpu_ctx);
-    free(backend);
-}
-
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
-struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
-    struct ggml_cgraph cgraph;
-};
-
-GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
-
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-    }
-
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return cpu_plan;
-}
-
-GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    free(cpu_plan->cplan.work_data);
-    free(cpu_plan);
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-
-    if (cpu_ctx->work_size < cplan.work_size) {
-        // TODO: may be faster to free and use malloc to avoid the copy
-        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = cpu_ctx->work_data;
-
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    ggml_graph_compute(cgraph, &cplan);
-    return true;
-}
-
-GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
-        case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
-        default:
-            return true;
-    }
-
-    GGML_UNUSED(backend);
-}
-
-static struct ggml_backend_i cpu_backend_i = {
-    /* .get_name                = */ ggml_backend_cpu_name,
-    /* .free                    = */ ggml_backend_cpu_free,
-    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
-    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .supports_op             = */ ggml_backend_cpu_supports_op,
-};
-
-static ggml_guid_t ggml_backend_cpu_guid(void) {
-    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
-    return &guid;
-}
-
-ggml_backend_t ggml_backend_cpu_init(void) {
-    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
-    ctx->work_data           = NULL;
-    ctx->work_size           = 0;
-    ctx->abort_callback      = NULL;
-    ctx->abort_callback_data = NULL;
-
-    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
-    if (cpu_backend == NULL) {
-        free(ctx);
-        return NULL;
-    }
-
-    *cpu_backend = (struct ggml_backend) {
-        /* .guid      = */ ggml_backend_cpu_guid(),
-        /* .interface = */ cpu_backend_i,
-        /* .context   = */ ctx
-    };
-    return cpu_backend;
-}
-
-GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
-}
-
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->n_threads = n_threads;
-}
-
-void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->abort_callback = abort_callback;
-    ctx->abort_callback_data = abort_callback_data;
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
-    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
-    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
-}
-
-GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
-    return ggml_backend_cpu_init();
-
-    GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-}
-
-// multi-buffer buffer
-
-struct ggml_backend_multi_buffer_context {
-    ggml_backend_buffer_t * buffers;
-    size_t n_buffers;
-};
-
-typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
-
-GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
-GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
-    for (size_t i = 0; i < ctx->n_buffers; i++) {
-        ggml_backend_buffer_free(ctx->buffers[i]);
-    }
-
-    free(ctx->buffers);
-    free(ctx);
-}
-
-GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
-    for (size_t i = 0; i < ctx->n_buffers; i++) {
-        ggml_backend_buffer_clear(ctx->buffers[i], value);
-    }
-}
-
-static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
-    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
-        /* .get_name        = */ ggml_backend_multi_buffer_get_name,
-        /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
-        /* .get_base        = */ NULL,
-        /* .init_tensor     = */ NULL,
-        /* .set_tensor      = */ NULL,
-        /* .get_tensor      = */ NULL,
-        /* .cpy_tensor      = */ NULL,
-        /* .clear           = */ ggml_backend_multi_buffer_clear,
-        /* .reset           = */ NULL,
-    };
-
-    return multi_backend_buffer_i;
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
-    ctx->n_buffers = n_buffers;
-    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
-
-    GGML_ASSERT(ctx->buffers != NULL);
-
-    size_t total_size = 0;
-    for (size_t i = 0; i < n_buffers; i++) {
-        ctx->buffers[i] = buffers[i];
-        total_size += ggml_backend_buffer_get_size(buffers[i]);
-    }
-
-    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
-}
-
-GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
-}
-
-GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
-    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
-    for (size_t i = 0; i < ctx->n_buffers; i++) {
-        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
-    }
-}
-
-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
-    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        dup->nb[i] = tensor->nb[i];
-    }
-    return dup;
-}
-
-static bool ggml_is_view_op(enum ggml_op op) {
-    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-// scheduler
-
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
-
-struct ggml_backend_sched_split {
-    int backend_id;
-    int i_start;
-    int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
-    int n_inputs;
-    // graph view of this split
-    struct ggml_cgraph graph;
-};
-
-struct ggml_backend_sched {
-    bool is_reset; // true if the scheduler has been reset since the last graph split
-
-    int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
-
-    ggml_gallocr_t galloc;
-
-    // hash keys of the nodes in the graph
-    struct ggml_hash_set    hash_set;
-    // hash values
-    int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
-
-    int * node_backend_ids; // [n_nodes]
-    int n_nodes;
-
-    // copy of the graph with modified inputs
-    struct ggml_cgraph * graph;
-
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
-    int n_splits;
-
-    struct ggml_context * ctx;
-
-    ggml_backend_sched_eval_callback callback_eval;
-    void * callback_eval_user_data;
-
-    // align context_buffer to GGML_MEM_ALIGN
-    #ifdef _MSC_VER
-    __declspec(align(GGML_MEM_ALIGN))
-    #else
-    __attribute__((aligned(GGML_MEM_ALIGN)))
-    #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-};
-
-#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
-#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
-
-// returns the priority of the backend, lower id is higher priority
-static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->backends[i] == backend) {
-            return i;
-        }
-    }
-    return -1;
-}
-
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
-    if (buffer == NULL) {
-        return -1;
-    }
-
-    // find highest prio backend that supports the buffer type
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
-            return i;
-        }
-    }
-    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-    return -1; // silence warning
-}
-
-#if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
-#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
-#define GET_CAUSE(node) causes[hash_id(node)]
-#else
-#define SET_CAUSE(node, ...)
-#define GET_CAUSE(node) ""
-#endif
-
-// returns the backend that should be used for the node based on the current locations
-static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
-    // TODO: use supports_op to check if the backend supports the op
-
-    // assign pre-allocated nodes to their backend
-    // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
-    if (cur_backend != -1) {
-        SET_CAUSE(node, "1.dst");
-        return cur_backend;
-    }
-    // view_src
-    if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
-        if (cur_backend != -1) {
-            SET_CAUSE(node, "1.vsrc");
-            return cur_backend;
-        }
-    }
-    // assign nodes that use weights to the backend of the weights
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        const struct ggml_tensor * src = tensor->src[i];
-        if (src == NULL) {
-            continue;
-        }
-        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
-            // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(node, "1.wgt%d", i);
-            return src_backend;
-        }
-    }
-
-    return -1;
-}
-
-static char * fmt_size(size_t size) {
-    static char buffer[128];
-    if (size >= 1024*1024) {
-        sprintf(buffer, "%zuM", size/1024/1024);
-    } else {
-        sprintf(buffer, "%zuK", size/1024);
-    }
-    return buffer;
-}
-
-static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    int cur_split = 0;
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
-            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
-                sched->splits[cur_split].n_inputs);
-            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
-                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
-            }
-            fprintf(stderr, "\n");
-            cur_split++;
-        }
-        struct ggml_tensor * node = graph->nodes[i];
-        if (ggml_is_view_op(node->op)) {
-            continue;
-        }
-        ggml_backend_t tensor_backend = tensor_backend(node);
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = tensor_backend(src);
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
-        }
-        fprintf(stderr, "\n");
-    }
-}
-
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
-// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    // reset splits
-    sched->n_splits = 0;
-    sched->is_reset = false;
-
-    struct ggml_init_params params = {
-        /* .mem_size =   */ sizeof(sched->context_buffer),
-        /* .mem_buffer = */ sched->context_buffer,
-        /* .no_alloc =   */ true
-    };
-
-    ggml_free(sched->ctx);
-
-    sched->ctx = ggml_init(params);
-    if (sched->ctx == NULL) {
-        fprintf(stderr, "%s: failed to initialize context\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    // pass 1: assign backends to ops with pre-allocated inputs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        if (tensor_backend_id(leaf) != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
-    }
-
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        if (tensor_backend_id(node) != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
-        // src
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            if (tensor_backend_id(src) == -1) {
-                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
-            }
-        }
-    }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-    // pass 2: expand current backend assignments
-    // assign the same backend to adjacent nodes
-    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
-    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-    // pass 2.1 expand gpu up
-    {
-        int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
-                    // skip cpu (lowest prio backend)
-                    cur_backend_id = -1;
-                } else {
-                    cur_backend_id = tensor_backend_id;
-                }
-            } else {
-                tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.1");
-            }
-        }
-    }
-
-    // pass 2.2 expand gpu down
-    {
-        int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
-                    // skip cpu (lowest prio backend)
-                    cur_backend_id = -1;
-                } else {
-                    cur_backend_id = tensor_backend_id;
-                }
-            } else {
-                tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.2");
-            }
-        }
-    }
-
-    // pass 2.3 expand rest up
-    {
-        int cur_backend_id = -1;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
-            } else {
-                tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.3");
-            }
-        }
-    }
-
-    // pass 2.4 expand rest down
-    {
-        int cur_backend_id = -1;
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
-            } else {
-                tensor_backend_id(node) = cur_backend_id;
-                SET_CAUSE(node, "2.4");
-            }
-        }
-    }
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-    // pass 3: assign backends to remaining src from dst and view_src
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
-                if (src->view_src != NULL) {
-                    // views are always on the same backend as the source
-                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
-                    SET_CAUSE(src, "3.vsrc");
-                } else {
-                    tensor_backend_id(src) = cur_backend_id;
-                    SET_CAUSE(src, "3.cur");
-                }
-            }
-        }
-    }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-    // pass 4: split graph, find tensors that need to be copied
-    {
-        int cur_split = 0;
-        // find the backend of the first split, skipping view ops
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].backend_id = tensor_backend_id(node);
-                break;
-            }
-        }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        int cur_backend_id = sched->splits[0].backend_id;
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-
-            int tensor_backend_id = tensor_backend_id(node);
-
-            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
-
-            if (tensor_backend_id != cur_backend_id) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
-                sched->splits[cur_split].backend_id = tensor_backend_id;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_backend_id = tensor_backend_id;
-            }
-
-            // find inputs that are not on the same backend
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * src = node->src[j];
-                if (src == NULL) {
-                    continue;
-                }
-                int src_backend_id = tensor_backend_id(src);
-                assert(src_backend_id != -1); // all inputs should be assigned by now
-                if (src_backend_id != tensor_backend_id) {
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = sched->backends[cur_backend_id];
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->tensor_copies[id][cur_backend_id] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-
-                        int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                        sched->splits[cur_split].inputs[n_inputs] = src;
-                    }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id];
-                }
-            }
-        }
-        sched->splits[cur_split].i_end = graph->n_nodes;
-        sched->n_splits = cur_split + 1;
-    }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = tensor_backend(node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = tensor_backend(src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
-    // create copies of the graph for each split
-    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
-    for (int i = 0; i < sched->n_splits; i++) {
-        struct ggml_backend_sched_split * split = &sched->splits[i];
-        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
-
-        for (int j = 0; j < split->n_inputs; j++) {
-            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
-
-            // add a dependency to the input source so that it is not freed before the copy is done
-            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
-            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
-
-            // add a dependency to the input copy so that it is allocated at the start of the split
-            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
-            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
-        }
-
-        for (int j = split->i_start; j < split->i_end; j++) {
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
-            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
-        }
-    }
-    sched->graph = graph_copy;
-}
-
-static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
-    // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-#ifndef NDEBUG
-        fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
-#endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
-        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
-            fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
-            return false;
-        }
-    }
-
-    return true;
-}
-
-static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
-    struct ggml_backend_sched_split * splits = sched->splits;
-
-    for (int i = 0; i < sched->n_splits; i++) {
-        struct ggml_backend_sched_split * split = &splits[i];
-        int split_backend_id = split->backend_id;
-        ggml_backend_t split_backend = sched->backends[split_backend_id];
-
-        // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
-        for (int j = 0; j < split->n_inputs; j++) {
-            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
-
-            GGML_ASSERT(input->buffer != NULL);
-            GGML_ASSERT(input_cpy->buffer != NULL);
-
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
-        }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
-
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
-
-
-        uint64_t compute_start_us = ggml_time_us();
-        if (!sched->callback_eval) {
-            if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
-                return false;
-            }
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
-        } else {
-            // similar to ggml_backend_compare_graph_backend
-            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
-                struct ggml_tensor * t = split->graph.nodes[j0];
-
-                // check if the user needs data from this node
-                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
-
-                int j1 = j0;
-
-                // determine the range [j0, j1] of nodes that can be computed together
-                while (!need && j1 < split->graph.n_nodes - 1) {
-                    t = split->graph.nodes[++j1];
-                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
-                }
-
-                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
-
-                if (!ggml_backend_graph_compute(split_backend, &gv)) {
-                    return false;
-                }
-
-                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
-                    break;
-                }
-
-                j0 = j1;
-            }
-        }
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }
-
-#if 0
-    // per-backend timings
-    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (copy_us[i] > 0 || compute_us[i] > 0) {
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
-        }
-    }
-#endif
-
-    return true;
-}
-
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
-    GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
-
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
-
-    // initialize hash table
-    sched->hash_set          = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
-    sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-
-    sched->n_backends = n_backends;
-    for (int i = 0; i < n_backends; i++) {
-        sched->backends[i] = backends[i];
-        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
-    }
-
-    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
-
-    ggml_backend_sched_reset(sched);
-
-    return sched;
-}
-
-void ggml_backend_sched_free(ggml_backend_sched_t sched) {
-    if (sched == NULL) {
-        return;
-    }
-    ggml_gallocr_free(sched->galloc);
-    ggml_free(sched->ctx);
-    free(sched->hash_set.keys);
-    free(sched->tensor_backend_id);
-    free(sched->tensor_copies);
-    free(sched->node_backend_ids);
-    free(sched);
-}
-
-void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
-    // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys,      0, sizeof(sched->hash_set.keys[0])     * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);
-
-    sched->is_reset = true;
-}
-
-bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    ggml_backend_sched_split_graph(sched, measure_graph);
-
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
-        return false;
-    }
-
-    ggml_backend_sched_reset(sched);
-    return true;
-}
-
-bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-
-    if (!sched->is_reset) {
-        ggml_backend_sched_reset(sched);
-    }
-
-    ggml_backend_sched_split_graph(sched, graph);
-    if (!ggml_backend_sched_alloc_splits(sched)) {
-        return false;
-    }
-
-    if (!ggml_backend_sched_compute_splits(sched)) {
-        return false;
-    }
-
-    return true;
-}
-
-void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
-    sched->callback_eval = callback;
-    sched->callback_eval_user_data = user_data;
-}
-
-int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
-    return sched->n_splits;
-}
-
-size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
-}
-
-void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    tensor_backend_id(node) = backend_index;
-}
-
-ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-    int backend_index = tensor_backend_id(node);
-    if (backend_index == -1) {
-        return NULL;
-    }
-    return sched->backends[backend_index];
-}
-
-// utils
-
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->view_src != NULL);
-    GGML_ASSERT(tensor->view_src->buffer != NULL);
-    GGML_ASSERT(tensor->view_src->data != NULL);
-
-    tensor->buffer = buffer;
-    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
-    GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
-    GGML_ASSERT(tensor->view_src == NULL);
-    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
-    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
-                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
-
-    tensor->buffer = buffer;
-    tensor->data = addr;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
-    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
-
-    GGML_ASSERT(src != NULL);
-    GGML_ASSERT(src->data && "graph must be allocated");
-
-    size_t id = ggml_hash_insert(hash_set, src);
-    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return node_copies[ggml_hash_find(hash_set, src)];
-    }
-
-    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
-    if (src->view_src != NULL) {
-        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
-        dst->view_offs = src->view_offs;
-    }
-    dst->op = src->op;
-    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
-    ggml_set_name(dst, src->name);
-
-    // copy src
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        struct ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            continue;
-        }
-        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
-    }
-
-    node_copies[id] = dst;
-    return dst;
-}
-
-static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
-    size_t id = ggml_hash_find(hash_set, src);
-    if (node_init[id]) {
-        return;
-    }
-    node_init[id] = true;
-
-    struct ggml_tensor * dst = node_copies[id];
-    if (dst->view_src != NULL) {
-        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst->view_src->buffer, dst);
-    }
-    else {
-        ggml_backend_tensor_copy(src, dst);
-    }
-
-    // init src
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        struct ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            continue;
-        }
-        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
-    }
-}
-
-struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
-    struct ggml_hash_set hash_set = {
-        /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
-    };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
-    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
-
-    struct ggml_init_params params = {
-        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true
-    };
-
-    struct ggml_context * ctx_allocated = ggml_init(params);
-    struct ggml_context * ctx_unallocated = ggml_init(params);
-
-    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-        fprintf(stderr, "failed to allocate context for graph copy\n");
-        free(hash_set.keys);
-        free(node_copies);
-        free(node_init);
-        ggml_free(ctx_allocated);
-        ggml_free(ctx_unallocated);
-        return (struct ggml_backend_graph_copy) {
-            /* .buffer           = */ NULL,
-            /* .ctx_allocated    = */ NULL,
-            /* .ctx_unallocated  = */ NULL,
-            /* .graph            = */ NULL,
-        };
-    }
-
-    // dup nodes
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
-    }
-
-    // allocate nodes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
-    if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        free(hash_set.keys);
-        free(node_copies);
-        free(node_init);
-        ggml_free(ctx_allocated);
-        ggml_free(ctx_unallocated);
-        return (struct ggml_backend_graph_copy) {
-            /* .buffer           = */ NULL,
-            /* .ctx_allocated    = */ NULL,
-            /* .ctx_unallocated  = */ NULL,
-            /* .graph            = */ NULL,
-        };
-    }
-
-    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
-
-    // copy data and init views
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
-    }
-
-    // build graph copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
-        graph_copy->nodes[i] = node_copy;
-    }
-    graph_copy->n_nodes = graph->n_nodes;
-
-    free(hash_set.keys);
-    free(node_copies);
-    free(node_init);
-
-    return (struct ggml_backend_graph_copy) {
-        /* .buffer           = */ buffer,
-        /* .ctx_allocated    = */ ctx_allocated,
-        /* .ctx_unallocated  = */ ctx_unallocated,
-        /* .graph            = */ graph_copy,
-    };
-}
-
-void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
-    ggml_backend_buffer_free(copy.buffer);
-    ggml_free(copy.ctx_allocated);
-    ggml_free(copy.ctx_unallocated);
-}
-
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
-    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
-    if (copy.buffer == NULL) {
-        return false;
-    }
-
-    struct ggml_cgraph * g1 = graph;
-    struct ggml_cgraph * g2 = copy.graph;
-
-    assert(g1->n_nodes == g2->n_nodes);
-
-    for (int i = 0; i < g1->n_nodes; i++) {
-        //printf("eval %d/%d\n", i, g1->n_nodes);
-        struct ggml_tensor * t1 = g1->nodes[i];
-        struct ggml_tensor * t2 = g2->nodes[i];
-
-        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
-
-        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
-        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
-
-        ggml_backend_graph_compute(backend1, &g1v);
-        ggml_backend_graph_compute(backend2, &g2v);
-
-        if (ggml_is_view_op(t1->op)) {
-            continue;
-        }
-
-        // compare results, calculate rms etc
-        if (!callback(i, t1, t2, user_data)) {
-            break;
-        }
-    }
-
-    ggml_backend_graph_copy_free(copy);
-
-    return true;
-}
diff --git a/ggml-backend.h b/ggml-backend.h
deleted file mode 100644
index 8fb54bd92..000000000
--- a/ggml-backend.h
+++ /dev/null
@@ -1,208 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
-
-    // buffer
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-    };
-
-    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
-
-    //
-    // Backend
-    //
-
-    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
-
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    // Create a backend buffer from an existing pointer
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backends to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
-        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
-
-        // initialize buffers from a measure graph
-        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
-
-        // in build_graph:
-        build_graph(...) {
-            // manually assign nodes to a backend (optional, should not be needed in most cases)
-            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
-        }
-
-        // allocate backend buffers from measure graph
-        ggml_backend_sched_init_measure(sched, measure_graph);
-
-        // the scheduler is now ready to compute graphs
-
-        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
-    */
-
-    struct ggml_backend_sched;
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // when ask == true, the scheduler wants to know if the user wants to observe this node
-    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
-    //
-    // when ask == false, the scheduler is passing the node tensor to the user for observation
-    // if the user returns false, the scheduler will cancel the graph compute
-    //
-    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
-    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
-    GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
-    // Initialize backend buffers from a measure graph
-    GGML_API bool                  ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-    // Get the number of splits of the last graph
-    GGML_API int                   ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-
-    GGML_API size_t                ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void                  ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t        ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                  ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-
-    // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void                  ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    // Set a callback to be called for each resulting node during graph compute
-    GGML_API void                  ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
deleted file mode 100644
index 0c6501e98..000000000
--- a/ggml-cuda.cu
+++ /dev/null
@@ -1,12347 +0,0 @@
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
-#include <algorithm>
-#include <assert.h>
-#include <atomic>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdint>
-#include <float.h>
-#include <limits>
-#include <stdint.h>
-#include <stdio.h>
-#include <string>
-#include <vector>
-#include <map>
-#include <array>
-
-// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
-#define STRINGIZE_IMPL(...) #__VA_ARGS__
-#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
-
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap abort
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-#else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
-#endif // defined(GGML_USE_HIPBLAS)
-
-#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
-
-#define CC_PASCAL     600
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_OFFSET_AMD 1000000
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
-
-#define GGML_CUDA_MAX_NODES 8192
-
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
-#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-#endif // defined(GGML_USE_HIPBLAS)
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-
-[[noreturn]]
-static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
-    int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
-
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
-    // abort with GGML_ASSERT to get a stack trace
-    GGML_ASSERT(!"CUDA error");
-}
-
-#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
-     do {                                                                           \
-        auto err_ = (err);                                                          \
-        if (err_ != (success)) {                                                    \
-            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
-        }                                                                           \
-    } while (0)
-
-#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
-
-#if CUDART_VERSION >= 12000
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        return cublasGetStatusString(err);
-    }
-#else
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        switch (err) {
-            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
-            default: return "unknown error";
-        }
-    }
-#endif // CUDART_VERSION >= 12000
-
-#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
-
-#if !defined(GGML_USE_HIPBLAS)
-static const char * cu_get_error_str(CUresult err) {
-    const char * err_str;
-    cuGetErrorString(err, &err_str);
-    return err_str;
-}
-#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
-#endif
-
-#if CUDART_VERSION >= 11100
-#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
-#else
-#define GGML_CUDA_ASSUME(x)
-#endif // CUDART_VERSION >= 11100
-
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<half> to_fp16_cuda_t;
-
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_cuda_op_mul_mat_t)(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
-typedef void (*ggml_cuda_op_flatten_t)(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
-
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct {
-    half    d;              // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct {
-    half2   dm;             // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct {
-    half d;                 // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct {
-    half2 dm;               // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct {
-    half    d;              // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct {
-    half2   ds;             // ds.x = delta, ds.y = sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
-typedef void (*load_tiles_cuda_t)(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
-
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half2 dm;                // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2]; // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    half d;             // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half    dm[2];             // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                  // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                  // super-block scale
-    int8_t scales[QK_K/16];  // block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                     // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    half    d;         // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define QR2_XXS 8
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-#define QR2_XS 8
-#define QI2_XS (QK_K / (4*QR2_XS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-// 2.5625 bpw quants
-#define QR2_S 8
-#define QI2_S (QK_K / (4*QR2_S))
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t scales[QK_K/32];
-} block_iq2_s;
-static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
-
-#define QR3_XXS 8
-#define QI3_XXS (QK_K / (4*QR3_XXS))
-typedef struct {
-    half d;
-    uint8_t qs[3*(QK_K/8)];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-#define QR3_XS 8
-#define QI3_XS (QK_K / (4*QR3_XS))
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
-#define IQ3S_N_SCALE QK_K/64
-#endif
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t signs[QK_K/8];
-    uint8_t scales[IQ3S_N_SCALE];
-} block_iq3_s;
-static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
-
-#define QR1_S 8
-#define QI1_S (QK_K / (4*QR1_S))
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
-} block_iq1_s;
-static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
-
-#define QK4_NL 32
-#define QR4_NL 2
-#define QI4_NL (QK4_NL / (4*QR4_NL))
-typedef struct {
-    half d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
-
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#define QR4_XS QR4_NL
-#define QI4_XS QI4_NL
-#else
-// QR4_XS = 8 is very slightly faster than QR4_XS = 4
-#define QR4_XS 8
-#define QI4_XS (QK_K / (4*QR4_XS))
-typedef struct {
-    half d;
-    uint16_t scales_h;
-    uint8_t  scales_l[QK_K/64];
-    uint8_t  qs[QK_K/2];
-} block_iq4_xs;
-static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
-
-#define WARP_SIZE 32
-#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-
-#define CUDA_GELU_BLOCK_SIZE 256
-#define CUDA_SILU_BLOCK_SIZE 256
-#define CUDA_TANH_BLOCK_SIZE 256
-#define CUDA_RELU_BLOCK_SIZE 256
-#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
-#define CUDA_HARDSWISH_BLOCK_SIZE 256
-#define CUDA_SQR_BLOCK_SIZE 256
-#define CUDA_CPY_BLOCK_SIZE 32
-#define CUDA_SCALE_BLOCK_SIZE 256
-#define CUDA_CLAMP_BLOCK_SIZE 256
-#define CUDA_ROPE_BLOCK_SIZE 256
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-#define CUDA_ALIBI_BLOCK_SIZE 32
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-#define CUDA_UPSCALE_BLOCK_SIZE 256
-#define CUDA_CONCAT_BLOCK_SIZE 256
-#define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ACC_BLOCK_SIZE 256
-#define CUDA_IM2COL_BLOCK_SIZE 256
-#define CUDA_POOL2D_BLOCK_SIZE 256
-
-#define CUDA_Q8_0_NE_ALIGN 2048
-
-// dmmv = dequantize_mul_mat_vec
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
-#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
-#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
-
-#define MUL_MAT_SRC1_COL_STRIDE 128
-
-#define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
-
-// this is faster on Windows
-// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-static void ggml_cuda_set_device(const int device) {
-    int current_device;
-    CUDA_CHECK(cudaGetDevice(&current_device));
-
-    if (device == current_device) {
-        return;
-    }
-
-    CUDA_CHECK(cudaSetDevice(device));
-}
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
-
-struct cuda_device_capabilities {
-    int     cc;                 // compute capability
-    size_t  smpb;               // max. shared memory per block
-    bool    vmm;                // virtual memory support
-    size_t  vmm_granularity;    // granularity of virtual memory
-};
-
-static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-[[noreturn]]
-static __device__ void no_device_code(
-    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    (void) arch_list;
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    __trap();
-
-    (void) no_device_code; // suppress unused function warning
-}
-
-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-#ifdef GGML_CUDA_F16
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-   }
-   return a;
-#else
-   (void) a;
-   NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-#endif // GGML_CUDA_F16
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-//#pragma unroll
-//    for (int mask = 16; mask > 0; mask >>= 1) {
-//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-//    }
-//    return x;
-//#else
-//    (void) x;
-//    NO_DEVICE_CODE;
-//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-//}
-
-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __device__ __forceinline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __device__ __forceinline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __device__ __forceinline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13) {
-
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static __global__ void gelu_f32(const float * x, float * dst, const int k) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
-}
-
-static __global__ void silu_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (1.0f + expf(-x[i]));
-}
-
-static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
-    const float GELU_QUICK_COEF = -1.702f;
-    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
-}
-
-static __global__ void tanh_f32(const float * x, float * dst, int k) {
-    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = tanhf(x[i]);
-}
-
-static __global__ void relu_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fmaxf(x[i], 0);
-}
-
-static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
-}
-
-static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
-}
-
-static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
-    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
-}
-
-static __global__ void sqr_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float2 mean_var = make_float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x += xi;
-        mean_var.y += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var);
-    if (block_size > WARP_SIZE) {
-        __shared__ float2 s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        __syncthreads();
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var);
-    }
-
-    const float mean = mean_var.x / ncols;
-    const float var = mean_var.y / ncols - mean * mean;
-    const float inv_std = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (blockIdx.z < ne02) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * gridDim.y;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            (blockIdx.z - ne02) * ne0 *  gridDim.y;
-            dst[offset_dst] = y[offset_src];
-    }
-}
-
-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = blockIdx.y / scale_factor;
-    int offset_src =
-        i00 +
-        i01 * ne00 +
-        blockIdx.z * nb02;
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    dst[offset_dst] = x[offset_src];
-}
-
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-template <int block_size>
-static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-    int start = blockIdx.x * group_size;
-    int end = start + group_size;
-
-    start += threadIdx.x;
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float variance = tmp / group_size;
-    float scale = rsqrtf(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = rsqrtf(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 8.0f) * d;
-    v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 16.0f) * d;
-    v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x = x[ib].qs[iqs + 0];
-    v.y = x[ib].qs[iqs + 1];
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
-    v.x *= d;
-    v.y *= d;
-#endif // GGML_CUDA_F16
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-
-    const int i = blockIdx.x;
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int ib = 8*i + ir;
-    if (ib >= nb32) {
-        return;
-    }
-
-    dst_t * y = yy + 256*i + 32*ir + 4*il;
-
-    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
-    const float d = __half2float(x->d);
-    const float dm = -8*d;
-
-    const uint8_t * q = x->qs + 4*il;
-
-    for (int l = 0; l < 4; ++l) {
-        y[l+ 0] = d * (q[l] & 0xF) + dm;
-        y[l+16] = d * (q[l] >>  4) + dm;
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-
-    const int i = blockIdx.x;
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int ib = 8*i + ir;
-    if (ib >= nb32) {
-        return;
-    }
-
-    dst_t * y = yy + 256*i + 32*ir + 4*il;
-
-    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
-    const float2 d = __half22float2(x->dm);
-
-    const uint8_t * q = x->qs + 4*il;
-
-    for (int l = 0; l < 4; ++l) {
-        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
-        y[l+16] = d.x * (q[l] >>  4) + d.y;
-    }
-}
-
-//================================== k-quants
-
-template<typename dst_t>
-static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    dst_t * y = yy + i*QK_K + 128*n;
-
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
-    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
-    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i = blockIdx.x;
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-#if QK_K == 256
-    const int r = threadIdx.x/4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16*is0 + 4*(threadIdx.x%4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    float d_all = x[i].d;
-    float dl = d_all * (us - 32);
-
-    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = threadIdx.x;
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
-}
-
-#if QK_K == 256
-static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-template<typename dst_t>
-static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const int i = blockIdx.x;
-
-#if QK_K == 256
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-
-    const float dall = __low2half(x[i].dm);
-    const float dmin = __high2half(x[i].dm);
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l +32] = d2 * (q[l] >>  4) - m2;
-    }
-#else
-    const int tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const int i = blockIdx.x;
-
-#if QK_K == 256
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const float dall = __low2half(x[i].dm);
-    const float dmin = __high2half(x[i].dm);
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
-    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const int i = blockIdx.x;
-#if QK_K == 256
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
-}
-
-static const __device__ uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const __device__ uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint64_t iq2s_grid[1024] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
-    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
-    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
-    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
-    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
-    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
-    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
-    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
-    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
-    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
-    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
-    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
-    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
-    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
-    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
-    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
-    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
-    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
-    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
-    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
-    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
-    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
-    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
-    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
-    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
-    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
-    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
-    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
-    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
-    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
-    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
-    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
-    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
-    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
-    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
-    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
-    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
-    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
-    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
-    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
-    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
-    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
-    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
-    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
-    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
-    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
-    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
-    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
-    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
-    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
-    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
-    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
-    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
-    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
-    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
-    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
-    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
-    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
-    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
-    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
-    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
-    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
-    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
-    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
-    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
-    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
-    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
-    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
-    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
-    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
-    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
-    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
-    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
-    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
-    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
-    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
-    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
-    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
-    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
-    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
-    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
-    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
-    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
-    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
-    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
-    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
-    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
-    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
-    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
-    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
-    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
-    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
-    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
-    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
-    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
-    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
-    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
-    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
-    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
-    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
-    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
-    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
-    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
-    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
-    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
-    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
-    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
-    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
-    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
-    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
-    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
-    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
-    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
-    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
-    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
-    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
-    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
-    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
-    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
-    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
-    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
-    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
-    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
-    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
-    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
-    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
-    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
-    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
-    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
-    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
-    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
-    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
-    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
-    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
-    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
-    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
-    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
-    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
-    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
-    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
-    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
-    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
-    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
-    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
-    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
-    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
-    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
-    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
-    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
-    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
-    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
-    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
-    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
-    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
-    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
-    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
-    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
-    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
-    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
-    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
-    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
-    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
-    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
-    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
-    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
-    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
-    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
-    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
-    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
-    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
-    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
-    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
-    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
-    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
-    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
-    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
-    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
-    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
-    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
-    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
-    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
-    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
-    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
-    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
-    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
-    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
-    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
-    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
-    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
-    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
-    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
-    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
-    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
-    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
-    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
-    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
-    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
-    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
-    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
-    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
-    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
-    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
-    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
-    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
-    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
-    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
-    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
-    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
-    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
-    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
-    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
-    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
-    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
-    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
-    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
-    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
-    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
-    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
-    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
-    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
-    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
-    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
-    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
-    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
-    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
-    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
-    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
-    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
-    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
-    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
-    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
-    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
-    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
-    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
-    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
-    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
-    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
-    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
-    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
-    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
-    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
-    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
-    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint32_t iq3xxs_grid[256] = {
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-};
-
-static const __device__ uint32_t iq3xs_grid[512] = {
-    0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
-    0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
-    0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
-    0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
-    0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
-    0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
-    0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
-    0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
-    0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
-    0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
-    0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
-    0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
-    0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
-    0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
-    0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
-    0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
-    0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
-    0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
-    0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
-    0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
-    0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
-    0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
-    0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
-    0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
-    0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
-    0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
-    0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
-    0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
-    0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
-    0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
-    0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
-    0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
-    0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
-    0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
-    0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
-    0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
-    0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
-    0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
-    0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
-    0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
-    0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
-    0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
-    0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
-    0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
-    0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
-    0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
-    0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
-    0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
-    0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
-    0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
-    0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
-    0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
-    0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
-    0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
-    0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
-    0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
-    0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
-    0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
-    0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
-    0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
-    0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
-    0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
-    0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
-    0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
-};
-
-
-static const __device__ uint64_t iq1s_grid[512] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
-};
-
-static const __device__ uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-static const __device__ uint64_t ksigns64[128] = {
-    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
-    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
-    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
-    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
-    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
-    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
-    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
-    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
-    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
-    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
-    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
-    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
-    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
-    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
-    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
-    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
-    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
-    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
-    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
-    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
-    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
-    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
-    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
-    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
-    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
-    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
-    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
-    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
-    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
-    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
-    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
-    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
-};
-//#endif
-
-static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return true;
-        default:
-            return false;
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * q2 = x[i].qs + 4*ib;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
-    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq2_xs * x = (const block_iq2_xs *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * q2 = x[i].qs + 4*ib;
-    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
-    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
-    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq2_s * x = (const block_iq2_s *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
-    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
-    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint8_t  * q3 = x[i].qs + 8*ib;
-    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
-    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
-    const uint32_t aux32 = gas[0] | (gas[1] << 16);
-    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
-    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 4; ++j) {
-        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-    }
-#else
-    assert(false);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq3_s * x = (const block_iq3_s *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint8_t * qs = x[i].qs + 8*ib;
-    const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
-    const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
-    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
-    const uint8_t signs = x[i].signs[4*ib + il];
-    for (int j = 0; j < 4; ++j) {
-        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-    }
-#else
-    assert(false);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq1_s * x = (const block_iq1_s  *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const int i8 = 4*ib+il;
-    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
-    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
-    const float d = (float)x[i].d * (2*(h & 7) + 1);
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
-#else
-    assert(false);
-#endif
-
-}
-
-static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
-
-    const int tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
-    const uint8_t  * q4 = x[ib].qs + 4*il;
-    const float d = (float)x[ib].d;
-    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
-        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
-    }
-
-}
-
-#if QK_K != 64
-template<typename dst_t>
-static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const int i   = blockIdx.x;
-    const block_iq4_xs * x = (const block_iq4_xs *)vx;
-
-    const int tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
-    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
-    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
-    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
-        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
-    }
-}
-#endif
-
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
-            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-
-    const int row = blockIdx.x;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/2;  // 0...15
-    const int ix  = threadIdx.x%2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-    }
-
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-#if QK_K == 256
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const half * x = (const half *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x = x[ib + iqs + 0];
-    v.y = x[ib + iqs + 1];
-}
-
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
-    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix >= kx_padded) {
-        return;
-    }
-
-    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
-
-    const int i_padded = iy*kx_padded + ix;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = fabsf(xi);
-    float sum = xi;
-
-    amax = warp_reduce_max(amax);
-    sum = warp_reduce_sum(sum);
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
-}
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0]        = v.x;
-    y[iybs + iqs + y_offset] = v.y;
-}
-
-template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    const src_t * x = (src_t *) vx;
-
-    y[i] = x[i];
-}
-
-template <bool need_check>
-static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
-#if __CUDA_ARCH__ >= CC_PASCAL
-    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
-
-    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
-    const int * x0 = ((int *) vx) + blockIdx.x * nint;
-    half2 * y2 = (half2 *) (y + i0);
-
-    __shared__ int vals[nint];
-
-#pragma unroll
-    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
-        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
-            break;
-        }
-
-        const int ix = ix0 + threadIdx.x;
-        vals[ix] = x0[ix];
-    }
-
-#pragma unroll
-    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
-        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
-            return;
-        }
-
-        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
-        const half    d = *b0;
-        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
-
-        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
-    }
-#else
-    (void) vx; (void) y; (void) k;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_PASCAL
-}
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int * v, const int * u, const float & d4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-#else
-    const float2 dm4f = __half22float2(dm4);
-    const float2 ds8f = __half22float2(ds8);
-    const float d4d8 = dm4f.x * ds8f.x;
-    const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-#else
-    const float2 dm5f = __half22float2(dm5);
-    const float2 ds8f = __half22float2(ds8);
-    const float d5d8 = dm5f.x * ds8f.x;
-    const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const float & d8_1) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-    return d8_0*d8_1 * sumi;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-#else
-    const float2 dm8f = __half22float2(dm8);
-    const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8f.x * ds8f.x;
-    const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return dm2f.x*sumf_d - dm2f.y*sumf_m;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float & d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d3, const float & d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-
-    }
-
-    const float2 dm5f = __half22float2(dm5);
-
-    return dm5f.x*sumf_d - dm5f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
-    const float & d6, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
-
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
-        }
-
-        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
-    }
-
-    return d6 * sumf_d;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
-
-    *x_ql = tile_x_qs;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_0;
-    const int kqsx = k % QI4_0;
-
-    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
-        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (const float *) x_dm;
-
-    int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
-
-    *x_ql = tile_x_qs;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_1;
-    const int kqsx = k % QI4_1;
-
-    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
-    int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
-
-    *x_ql = tile_x_ql;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_0;
-    const int kqsx = k % QI5_0;
-
-    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
-        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
-        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
-        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
-        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
-        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
-        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
-        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
-        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
-        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
-        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
-        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset < nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_1;
-    const int kqsx = k % QI5_1;
-
-    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
-        int qs0 = (ql >>  0) & 0x0F0F0F0F;
-        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
-        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
-        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4) & 0x0F0F0F0F;
-        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-
-    int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
-
-    *x_ql = tile_x_qs;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI8_0;
-    const int kqsx = k % QI8_0;
-    float * x_dmf = (float *) x_dm;
-
-    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
-        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const int kbx = k / QI2_K;
-    const int ky  = (k % QI2_K) * QR2_K;
-    const float * y_df = (const float *) y_ds;
-
-    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
-    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
-    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
-    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
-        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-    }
-
-    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-
-    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = bq3_K->d;
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
-    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI3_K;
-    const int kqsx = k % QI3_K;
-
-    const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-
-        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-
-        const int ksc = k % (QI3_K/4);
-
-        const int ksc_low = ksc % (QI3_K/8);
-        const int shift_low = 4 * (ksc / (QI3_K/8));
-        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
-        const int ksc_high = QI3_K/8;
-        const int shift_high = 2 * ksc;
-        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
-        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-
-    const int kbx  = k / QI3_K;
-    const int ky  = (k % QI3_K) * QR3_K;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-
-    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
-        const int shift = 2 * ((ky % 32) / 8);
-        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
-        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
-        const int vlh = (vh << 2) & 0x04040404;
-
-        v[l] = __vsubss4(vll, vlh);
-    }
-
-    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2half(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI4_K; // == k if QK_K == 256
-
-    const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
-    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI5_K; // == k if QK_K == 256
-
-    const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR5_K*kqsx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
-        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
-    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
-    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR6_K*kqsx;
-
-        const int ql = get_int_from_uint8(bxi->ql, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-
-        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-
-    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
-    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
-
-#if QR2_XXS == 8
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = q2[2] | (q2[3] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
-        for (int j = 0; j < 8; ++j) {
-            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * sumi;
-#else
-    // iqs is 0...15
-    const int ib32 = iqs/2;
-    const int il = iqs%2;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
-    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
-    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 8; ++j) {
-        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
-        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
-    }
-    return d * (sumi1 + sumi2);
-#endif
-#else
-    assert(false);
-    return 0.f;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
-
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
-        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
-        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
-        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
-        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
-        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
-        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    (void) ksigns64;
-    assert(false);
-    return 0.f;
-#endif
-#else
-    (void) ksigns64;
-    assert(false);
-    return 0.f;
-#endif
-}
-
-// TODO
-static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
-
-    const int ib32 = iqs;
-    const int8_t  * q8 = bq8_1[ib32].qs;
-    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
-        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
-        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
-        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
-        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
-        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
-        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    (void) ksigns64;
-    assert(false);
-    return 0.f;
-#endif
-#else
-    (void) ksigns64;
-    assert(false);
-    return 0.f;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
-
-    const int ib32 = iqs;
-    const uint8_t  * q3 = bq2->qs + 8*ib32;
-    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = gas[0] | (gas[1] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
-        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
-        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
-        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
-        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
-    return d * sumi;
-#else
-    assert(false);
-    return 0.f;
-#endif
-#else
-    assert(false);
-    return 0.f;
-#endif
-}
-
-// TODO: don't use lookup table for signs
-static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
-
-    const int ib32 = iqs;
-    const uint8_t  * qs = bq2->qs + 8*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
-        const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
-        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
-        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
-        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
-    return d * sumi;
-#else
-    assert(false);
-    return 0.f;
-#endif
-#else
-    assert(false);
-    return 0.f;
-#endif
-}
-
-
-static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
-
-    const int ib32 = iqs;
-    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
-    const uint8_t h1 = bq1->scales[2*ib32+0];
-    const uint8_t h2 = bq1->scales[2*ib32+1];
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const int * q8 = (const int *)bq8_1[ib32].qs;
-    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
-    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
-    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
-    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
-    for (int j = 0; j < 2; ++j) {
-        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
-        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
-        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
-        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
-    }
-#else
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
-    const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
-    const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
-    const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
-    for (int j = 0; j < 8; ++j) {
-        sumi1 += q8[j+ 0] * grid1[j];
-        sumi2 += q8[j+ 8] * grid2[j];
-        sumi3 += q8[j+16] * grid3[j];
-        sumi4 += q8[j+24] * grid4[j];
-    }
-#endif
-    const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
-    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
-                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
-#else
-    assert(false);
-    return 0.f;
-#endif
-}
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
-        int & val1, int & val2) {
-
-    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
-    aux32 = q4 & 0x0f0f0f0f;
-    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
-    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
-    val1 = v1 | (v2 << 16);
-    aux32 = (q4 >> 4) & 0x0f0f0f0f;
-    v1 = values[q8[0]] | (values[q8[1]] << 8);
-    v2 = values[q8[2]] | (values[q8[3]] << 8);
-    val2 = v1 | (v2 << 16);
-}
-#endif
-
-static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
-    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
-
-    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
-    int v1, v2;
-    int sumi1 = 0, sumi2 = 0;
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
-        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
-        get_int_from_table_16(aux, values, v1, v2);
-        sumi1 = __dp4a(v1, q8[l+0], sumi1);
-        sumi2 = __dp4a(v2, q8[l+4], sumi2);
-    }
-
-#else
-    const uint8_t * q4 = bq->qs + 4*iqs;
-    const int8_t  * q8 = bq8_1->qs + 4*iqs;
-
-    int sumi1 = 0, sumi2 = 0;
-    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
-        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
-        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
-    }
-#endif
-    const float d = (float)bq->d * __low2float(bq8_1->ds);
-    return d * (sumi1 + sumi2);
-}
-
-static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#if QK_K == 256
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-
-    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
-    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
-    //// iqs is 0...7
-    //const int ib64 = iqs/2;
-    //const int il = iqs%2;
-    //const int32_t  * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
-    //const int32_t  * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
-    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
-    //const uint32_t * q4_2 = q4_1 + 4;
-    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
-    //const int8_t ls2 = (bq4->scales_l[ib64] >>  4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
-    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
-    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
-    //int v1, v2;
-    //int sumi1 = 0, sumi2 = 0;
-    //for (int j = 0; j < 2; ++j) {
-    //    get_int_from_table_16(q4_1[j], values, v1, v2);
-    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
-    //    get_int_from_table_16(q4_2[j], values, v1, v2);
-    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
-    //}
-    //return d1 * sumi1 + d2 * sumi2;
-
-    // iqs is 0...7
-    const int ib32 = iqs;
-    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
-    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
-    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
-    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
-    int v1, v2;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 4; ++j) {
-        get_int_from_table_16(q4[j], values, v1, v2);
-        sumi1 = __dp4a(v1, q8[j+0], sumi1);
-        sumi2 = __dp4a(v2, q8[j+4], sumi2);
-    }
-    return d * (sumi1 + sumi2);
-
-    //// iqs is 0...15
-    //const int ib32 = iqs/2;
-    //const int il = iqs%2;
-    //const int32_t  * q8 = (const int *)bq8_1[ib32].qs + 2*il;
-    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
-    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
-    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
-    //int v1, v2;
-    //int sumi1 = 0, sumi2 = 0;
-    //for (int j = 0; j < 2; ++j) {
-    //    get_int_from_table_16(q4[j], values, v1, v2);
-    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
-    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
-    //}
-    //return d * (sumi1 + sumi2);
-#else
-    assert(false);
-    return 0.f;
-#endif
-#else
-    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
-#endif
-}
-
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __device__ __forceinline__ void mul_mat_q(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-    const int & ncols_dst = ncols_y;
-
-    const int row_dst_0 = blockIdx.x*mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const int col_dst_0 = blockIdx.y*mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    int   * tile_x_ql = nullptr;
-    half2 * tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE];
-    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
-
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE + threadIdx.x;
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
-                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
-                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
-                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = __low2half(*dsi_src);
-                }
-            }
-
-            __syncthreads();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                            threadIdx.x + i, threadIdx.y + j, k);
-                    }
-                }
-            }
-
-            __syncthreads();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + threadIdx.y;
-
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            const int row_dst = row_dst_0 + threadIdx.x + i;
-
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-        }
-    }
-}
-
-#define  MMQ_X_Q4_0_RDNA2  64
-#define  MMQ_Y_Q4_0_RDNA2  128
-#define NWARPS_Q4_0_RDNA2  8
-#define  MMQ_X_Q4_0_RDNA1  64
-#define  MMQ_Y_Q4_0_RDNA1  64
-#define NWARPS_Q4_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_0_AMPERE 4
-#define  MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
-#define  MMQ_X_Q4_0_AMPERE 64
-#define  MMQ_Y_Q4_0_AMPERE 128
-#define NWARPS_Q4_0_AMPERE 4
-#endif
-#define  MMQ_X_Q4_0_PASCAL 64
-#define  MMQ_Y_Q4_0_PASCAL 64
-#define NWARPS_Q4_0_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-    const int nwarps = NWARPS_Q4_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-    const int nwarps = NWARPS_Q4_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-    const int nwarps = NWARPS_Q4_0_AMPERE;
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-    const int nwarps = NWARPS_Q4_0_PASCAL;
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_0_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_1_RDNA2  64
-#define  MMQ_Y_Q4_1_RDNA2  128
-#define NWARPS_Q4_1_RDNA2  8
-#define  MMQ_X_Q4_1_RDNA1  64
-#define  MMQ_Y_Q4_1_RDNA1  64
-#define NWARPS_Q4_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_1_AMPERE 4
-#define  MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
-#define  MMQ_X_Q4_1_AMPERE 64
-#define  MMQ_Y_Q4_1_AMPERE 128
-#define NWARPS_Q4_1_AMPERE 4
-#endif
-#define  MMQ_X_Q4_1_PASCAL 64
-#define  MMQ_Y_Q4_1_PASCAL 64
-#define NWARPS_Q4_1_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-    const int nwarps = NWARPS_Q4_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-    const int nwarps = NWARPS_Q4_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-    const int nwarps = NWARPS_Q4_1_AMPERE;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-    const int nwarps = NWARPS_Q4_1_PASCAL;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_1_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_0_RDNA2  64
-#define  MMQ_Y_Q5_0_RDNA2  128
-#define NWARPS_Q5_0_RDNA2  8
-#define  MMQ_X_Q5_0_RDNA1  64
-#define  MMQ_Y_Q5_0_RDNA1  64
-#define NWARPS_Q5_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_0_AMPERE 4
-#define  MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
-#define  MMQ_X_Q5_0_AMPERE 128
-#define  MMQ_Y_Q5_0_AMPERE 64
-#define NWARPS_Q5_0_AMPERE 4
-#endif
-#define  MMQ_X_Q5_0_PASCAL 64
-#define  MMQ_Y_Q5_0_PASCAL 64
-#define NWARPS_Q5_0_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-    const int nwarps = NWARPS_Q5_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-    const int nwarps = NWARPS_Q5_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-    const int nwarps = NWARPS_Q5_0_AMPERE;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-    const int nwarps = NWARPS_Q5_0_PASCAL;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_0_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_1_RDNA2  64
-#define  MMQ_Y_Q5_1_RDNA2  128
-#define NWARPS_Q5_1_RDNA2  8
-#define  MMQ_X_Q5_1_RDNA1  64
-#define  MMQ_Y_Q5_1_RDNA1  64
-#define NWARPS_Q5_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_1_AMPERE 4
-#define  MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
-#define  MMQ_X_Q5_1_AMPERE 128
-#define  MMQ_Y_Q5_1_AMPERE 64
-#define NWARPS_Q5_1_AMPERE 4
-#endif
-#define  MMQ_X_Q5_1_PASCAL 64
-#define  MMQ_Y_Q5_1_PASCAL 64
-#define NWARPS_Q5_1_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-    const int nwarps = NWARPS_Q5_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-    const int nwarps = NWARPS_Q5_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-    const int nwarps = NWARPS_Q5_1_AMPERE;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-    const int nwarps = NWARPS_Q5_1_PASCAL;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_1_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q8_0_RDNA2  64
-#define  MMQ_Y_Q8_0_RDNA2  128
-#define NWARPS_Q8_0_RDNA2  8
-#define  MMQ_X_Q8_0_RDNA1  64
-#define  MMQ_Y_Q8_0_RDNA1  64
-#define NWARPS_Q8_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q8_0_AMPERE 4
-#define  MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
-#define  MMQ_X_Q8_0_AMPERE 128
-#define  MMQ_Y_Q8_0_AMPERE 64
-#define NWARPS_Q8_0_AMPERE 4
-#endif
-#define  MMQ_X_Q8_0_PASCAL 64
-#define  MMQ_Y_Q8_0_PASCAL 64
-#define NWARPS_Q8_0_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-    const int nwarps = NWARPS_Q8_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-    const int nwarps = NWARPS_Q8_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-    const int nwarps = NWARPS_Q8_0_AMPERE;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-    const int nwarps = NWARPS_Q8_0_PASCAL;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q8_0_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q2_K_RDNA2  64
-#define  MMQ_Y_Q2_K_RDNA2  128
-#define NWARPS_Q2_K_RDNA2  8
-#define  MMQ_X_Q2_K_RDNA1  128
-#define  MMQ_Y_Q2_K_RDNA1  32
-#define NWARPS_Q2_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q2_K_AMPERE 4
-#define  MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
-#define  MMQ_X_Q2_K_AMPERE 64
-#define  MMQ_Y_Q2_K_AMPERE 128
-#define NWARPS_Q2_K_AMPERE 4
-#endif
-#define  MMQ_X_Q2_K_PASCAL 64
-#define  MMQ_Y_Q2_K_PASCAL 64
-#define NWARPS_Q2_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-    const int nwarps = NWARPS_Q2_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-    const int nwarps = NWARPS_Q2_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-    const int nwarps = NWARPS_Q2_K_AMPERE;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-    const int nwarps = NWARPS_Q2_K_PASCAL;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q2_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q3_K_RDNA2  128
-#define  MMQ_Y_Q3_K_RDNA2  64
-#define NWARPS_Q3_K_RDNA2  8
-#define  MMQ_X_Q3_K_RDNA1  32
-#define  MMQ_Y_Q3_K_RDNA1  128
-#define NWARPS_Q3_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q3_K_AMPERE 4
-#define  MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
-#define  MMQ_X_Q3_K_AMPERE 128
-#define  MMQ_Y_Q3_K_AMPERE 128
-#define NWARPS_Q3_K_AMPERE 4
-#endif
-#define  MMQ_X_Q3_K_PASCAL 64
-#define  MMQ_Y_Q3_K_PASCAL 64
-#define NWARPS_Q3_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-    const int nwarps = NWARPS_Q3_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-    const int nwarps = NWARPS_Q3_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-    const int nwarps = NWARPS_Q3_K_AMPERE;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-    const int nwarps = NWARPS_Q3_K_PASCAL;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q3_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_K_RDNA2  64
-#define  MMQ_Y_Q4_K_RDNA2  128
-#define NWARPS_Q4_K_RDNA2  8
-#define  MMQ_X_Q4_K_RDNA1  32
-#define  MMQ_Y_Q4_K_RDNA1  64
-#define NWARPS_Q4_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_K_AMPERE 4
-#define  MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
-#define  MMQ_X_Q4_K_AMPERE 64
-#define  MMQ_Y_Q4_K_AMPERE 128
-#define NWARPS_Q4_K_AMPERE 4
-#endif
-#define  MMQ_X_Q4_K_PASCAL 64
-#define  MMQ_Y_Q4_K_PASCAL 64
-#define NWARPS_Q4_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-    const int nwarps = NWARPS_Q4_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-    const int nwarps = NWARPS_Q4_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-    const int nwarps = NWARPS_Q4_K_AMPERE;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-    const int nwarps = NWARPS_Q4_K_PASCAL;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_K_RDNA2  64
-#define  MMQ_Y_Q5_K_RDNA2  128
-#define NWARPS_Q5_K_RDNA2  8
-#define  MMQ_X_Q5_K_RDNA1  32
-#define  MMQ_Y_Q5_K_RDNA1  64
-#define NWARPS_Q5_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_K_AMPERE 4
-#define  MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
-#define  MMQ_X_Q5_K_AMPERE 64
-#define  MMQ_Y_Q5_K_AMPERE 128
-#define NWARPS_Q5_K_AMPERE 4
-#endif
-#define  MMQ_X_Q5_K_PASCAL 64
-#define  MMQ_Y_Q5_K_PASCAL 64
-#define NWARPS_Q5_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-    const int nwarps = NWARPS_Q5_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-    const int nwarps = NWARPS_Q5_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-    const int nwarps = NWARPS_Q5_K_AMPERE;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-    const int nwarps = NWARPS_Q5_K_PASCAL;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q6_K_RDNA2  64
-#define  MMQ_Y_Q6_K_RDNA2  128
-#define NWARPS_Q6_K_RDNA2  8
-#define  MMQ_X_Q6_K_RDNA1  32
-#define  MMQ_Y_Q6_K_RDNA1  64
-#define NWARPS_Q6_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q6_K_AMPERE 4
-#define  MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
-#define  MMQ_X_Q6_K_AMPERE 64
-#define  MMQ_Y_Q6_K_AMPERE 64
-#define NWARPS_Q6_K_AMPERE 4
-#endif
-#define  MMQ_X_Q6_K_PASCAL 64
-#define  MMQ_Y_Q6_K_PASCAL 64
-#define NWARPS_Q6_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-    const int nwarps = NWARPS_Q6_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-    const int nwarps = NWARPS_Q6_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-    const int nwarps = NWARPS_Q6_K_AMPERE;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-    const int nwarps = NWARPS_Q6_K_PASCAL;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q6_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-// tell the compiler to use as many registers as it wants, see nwarps definition below
-__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-static __global__ void mul_mat_vec_q(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
-    constexpr int nwarps              = 1;
-    constexpr int rows_per_cuda_block = 1;
-#else
-    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
-    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-
-    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    const     int row0 = rows_per_cuda_block*blockIdx.x;
-    const     int blocks_per_row_x = ncols_x / qk;
-    const     int blocks_per_col_y = nrows_y / QK8_1;
-    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
-
-// partial sum for each thread
-    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
-        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
-
-        // x block quant index when casting the quants to int
-        const int kqs = vdr * (tid % (qi/vdr));
-
-#pragma unroll
-        for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
-            for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp[j][i] += vec_dot_q_cuda(
-                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
-            }
-        }
-    }
-
-    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
-    if (threadIdx.y > 0) {
-#pragma unroll
-        for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
-            for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
-            }
-        }
-    }
-    __syncthreads();
-    if (threadIdx.y > 0) {
-        return;
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int j = 0; j < ncols_y; ++j) {
-#pragma unroll
-        for (int i = 0; i < rows_per_cuda_block; ++i) {
-#pragma unroll
-            for (int l = 0; l < nwarps-1; ++l) {
-                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
-            }
-            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
-        }
-
-        if (threadIdx.x < rows_per_cuda_block) {
-            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
-        }
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
-#else
-            tmp += v.x * y[iybs + iqs + j/qr + 0];
-            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static __global__ void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi = __half2float(x[ix]);
-
-        const int row_y = col_x;
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y   = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi = __half2float(x[ix]);
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    half * dsti = (half *) cdsti;
-
-    *dsti = __float2half(*xi);
-}
-
-static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    const half * xi = (const half *) cxi;
-    half * dsti = (half *) cdsti;
-
-    *dsti = *xi;
-}
-
-static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
-    const half * xi = (const half *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                                   const int nb12, const int nb13) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= ne) {
-        return;
-    }
-
-    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-    // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i03 = i/(ne00 * ne01 * ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-    const int i13 = i/(ne10 * ne11 * ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
-
-    cpy_1(cx + x_offset, cdst + dst_offset);
-}
-
-static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f; // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax = fmaxf(amax, fabsf(v));
-    }
-
-    const float d = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j]*id;
-
-        dsti->qs[j] = roundf(x0);
-    }
-}
-
-static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < fabsf(v)) {
-            amax = fabsf(v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
-        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) vmin = v;
-        if (v > vmax) vmax = v;
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x = d;
-    dsti->dm.y = vmin;
-
-    for (int j = 0; j < QK4_1/2; ++j) {
-        const float x0 = (xi[0       + j] - vmin)*id;
-        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
-        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                                 const int nb12, const int nb13) {
-    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int i03 = i/(ne00 * ne01 * ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-    const int i13 = i/(ne10 * ne11 * ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-
-    cpy_blck(cx + x_offset, cdst + dst_offset);
-}
-
-static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static __device__ void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static __global__ void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-) {
-    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, -float(col)/ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos>
-static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-) {
-    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    if (ib > 0) {
-        const int i = row*ncols + ib*n_dims + ic;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
-        return;
-    }
-
-    const int i  = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
-static __global__ void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
-    const float sin_theta = sinf(theta);
-    const float cos_theta = cosf(theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
-    const float sin_block_theta = sinf(block_theta);
-    const float cos_block_theta = cosf(block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
-static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = powf(m0, k + 1);
-    } else {
-        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-template<typename T>
-static inline __device__ void swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
-    // bitonic sort
-    int col = threadIdx.x;
-    int row = blockIdx.y;
-
-    if (col >= ncols) return;
-
-    const float * x_row = x + row * ncols;
-    int * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    __syncthreads();
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-}
-
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
-    const int col = blockDim.y*blockIdx.y + threadIdx.y;
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-template <bool vals_smem, int ncols_template, int block_size_template>
-static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
-    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-
-    const int tid  = threadIdx.x;
-    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
-
-    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
-    float slope = 0.0f;
-
-    // ALiBi
-    if (max_bias > 0.0f) {
-        const int h = rowx/nrows_y; // head index
-
-        const float base = h < n_head_log2 ? m0 : m1;
-        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-        slope = powf(base, exp);
-    }
-
-    extern __shared__ float data_soft_max_f32[];
-    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
-    // shared memory buffer to cache values between iterations:
-    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
-
-    float max_val = -INFINITY;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            break;
-        }
-
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-
-        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
-
-        vals[col] = val;
-        max_val = max(max_val, val);
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = max_val;
-        }
-        __syncthreads();
-
-        max_val = buf_iw[lane_id];
-        max_val = warp_reduce_max(max_val);
-    }
-
-    float tmp = 0.0f; // partial sum
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            break;
-        }
-
-        const float val = expf(vals[col] - max_val);
-        tmp += val;
-        vals[col] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp;
-        }
-        __syncthreads();
-
-        tmp = buf_iw[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float inv_sum = 1.0f / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            return;
-        }
-
-        const int idst = rowx*ncols + col;
-        dst[idst] = vals[col] * inv_sum;
-    }
-}
-
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = scale * x[i];
-}
-
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-template <typename T>
-static  __global__ void im2col_kernel(
-        const float * x, T * dst, int batch_offset,
-        int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
-        int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (i >= pelements) {
-        return;
-    }
-
-    const int ksize = OW * (KH > 1 ? KW : 1);
-    const int kx = i / ksize;
-    const int kd = kx * ksize;
-    const int ky = (i - kd) / OW;
-    const int ix = i % OW;
-
-    const int oh = blockIdx.y;
-    const int batch = blockIdx.z / IC;
-    const int ic = blockIdx.z % IC;
-
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = oh * s1 + ky * d1 - p1;
-
-    const int64_t offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
-    }
-}
-
-template <typename Ti, typename To>
-static  __global__ void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op) {
-        int idx = threadIdx.x + blockIdx.x * blockDim.x;
-        if (idx >= parallel_elements) {
-            return;
-        }
-
-        const int I_HW = ih * iw;
-        const int O_HW = oh * ow;
-        const int nc = idx / O_HW;
-        const int cur_oh = idx % O_HW / ow;
-        const int cur_ow = idx % O_HW % ow;
-        const Ti* i_ptr = src + nc * I_HW;
-        To* o_ptr = dst + nc * O_HW;
-        const int start_h = cur_oh * sh - ph;
-        const int bh = max(0, start_h);
-        const int eh = min(ih, start_h + kh);
-        const int start_w = cur_ow * sw - pw;
-        const int bw = max(0, start_w);
-        const int ew = min(iw, start_w + kw);
-        const To scale = 1. / (kh * kw);
-        To res = 0;
-
-        switch (op) {
-            case GGML_OP_POOL_AVG: res = 0; break;
-            case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-        }
-
-        for (int i = bh; i < eh; i += 1) {
-            for (int j = bw; j < ew; j += 1) {
-    #if __CUDA_ARCH__ >= 350
-                Ti cur = __ldg(i_ptr + i * iw + j);
-    #else
-                Ti cur = i_ptr[i * iw + j];
-    #endif
-                switch (op) {
-                    case GGML_OP_POOL_AVG: res += cur * scale; break;
-                    case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
-                }
-            }
-        }
-        o_ptr[cur_oh * ow + cur_ow] = res;
-}
-
-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    (void) dst;
-}
-
-template<typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    (void) dst;
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template<typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
-            cudaStream_t stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            dim3 block_dims;
-            block_dims.x = std::min<unsigned int>(hne0, block_size);
-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
-            dim3 block_nums(
-                (hne0 + block_dims.x - 1) / block_dims.x,
-                (ne1 + block_dims.y - 1) / block_dims.y,
-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
-            );
-
-            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s10, */ s11, s12, s13);
-            } else {
-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s10, */ s11, s12, s13);
-            }
-        }
-    }
-};
-
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
-}
-
-static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
-    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
-    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
-    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
-}
-
-static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
-    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    }
-}
-
-static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
-}
-
-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
-    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
-}
-
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02,
-    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
-}
-
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, ky, 1);
-    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
-    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
-    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
-        const bool need_check = false;
-        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
-    } else {
-        const bool need_check = true;
-        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
-    }
-}
-
-template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb32 = k / 32;
-    const int nb = (k + 255) / 256;
-    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb32 = k / 32;
-    const int nb = (k + 255) / 256;
-    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = (k + QK_K - 1) / QK_K;
-    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
-#else
-    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    int id;
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            CUDA_CHECK(cudaGetDevice(&id));
-            if (g_device_caps[id].cc >= CC_PASCAL) {
-                return dequantize_block_q8_0_f16_cuda;
-            }
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_IQ2_XXS:
-            return dequantize_row_iq2_xxs_cuda;
-        case GGML_TYPE_IQ2_XS:
-            return dequantize_row_iq2_xs_cuda;
-        case GGML_TYPE_IQ2_S:
-            return dequantize_row_iq2_s_cuda;
-        case GGML_TYPE_IQ3_XXS:
-            return dequantize_row_iq3_xxs_cuda;
-        case GGML_TYPE_IQ1_S:
-            return dequantize_row_iq1_s_cuda;
-        case GGML_TYPE_IQ4_NL:
-            return dequantize_row_iq4_nl_cuda;
-        case GGML_TYPE_IQ4_XS:
-            return dequantize_row_iq4_xs_cuda;
-        case GGML_TYPE_IQ3_S:
-            return dequantize_row_iq3_s_cuda;
-        case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_cuda;
-        case GGML_TYPE_Q4_1:
-            return dequantize_row_q4_1_cuda;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_IQ2_XXS:
-            return dequantize_row_iq2_xxs_cuda;
-        case GGML_TYPE_IQ2_XS:
-            return dequantize_row_iq2_xs_cuda;
-        case GGML_TYPE_IQ2_S:
-            return dequantize_row_iq2_s_cuda;
-        case GGML_TYPE_IQ3_XXS:
-            return dequantize_row_iq3_xxs_cuda;
-        case GGML_TYPE_IQ1_S:
-            return dequantize_row_iq1_s_cuda;
-        case GGML_TYPE_IQ4_NL:
-            return dequantize_row_iq4_nl_cuda;
-        case GGML_TYPE_IQ4_XS:
-            return dequantize_row_iq4_xs_cuda;
-        case GGML_TYPE_IQ3_S:
-            return dequantize_row_iq3_s_cuda;
-        case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
-        default:
-            return nullptr;
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32, 1, 1);
-    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
-static void mul_mat_vec_q_cuda(
-    const void * vx, const void * vy, float * dst,
-    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-    GGML_ASSERT(ncols_x % qk == 0);
-    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-
-    int64_t nwarps = 1;
-    int64_t rows_per_cuda_block = 1;
-
-    if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
-        switch(ncols_y) {
-            case 1:
-                nwarps = 4;
-                rows_per_cuda_block = 1;
-                break;
-            case 2:
-            case 3:
-            case 4:
-                nwarps = 4;
-                rows_per_cuda_block = 2;
-                break;
-            case 5:
-            case 6:
-            case 7:
-            case 8:
-                nwarps = 2;
-                rows_per_cuda_block = 2;
-                break;
-            default:
-                GGML_ASSERT(false);
-                break;
-        }
-    }
-    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
-    const dim3 block_nums(nblocks, 1, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    switch (ncols_y) {
-        case 1:
-            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 2:
-            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 3:
-            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 4:
-            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 5:
-            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 6:
-            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 7:
-            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        case 8:
-            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-static void ggml_mul_mat_q4_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-        nwarps = NWARPS_Q4_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-        nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-        nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-        nwarps = NWARPS_Q4_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q4_1_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-        nwarps = NWARPS_Q4_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-        nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-        nwarps = NWARPS_Q4_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-        nwarps = NWARPS_Q4_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q5_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-        nwarps = NWARPS_Q5_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-        nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-        nwarps = NWARPS_Q5_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-        nwarps = NWARPS_Q5_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q5_1_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-        nwarps = NWARPS_Q5_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-        nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-        nwarps = NWARPS_Q5_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-        nwarps = NWARPS_Q5_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q8_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-        nwarps = NWARPS_Q8_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-        nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q8_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-        nwarps = NWARPS_Q8_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q8_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-        nwarps = NWARPS_Q8_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q2_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-        nwarps = NWARPS_Q2_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-        nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q2_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-        nwarps = NWARPS_Q2_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q2_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-        nwarps = NWARPS_Q2_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q3_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-#if QK_K == 256
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-        nwarps = NWARPS_Q3_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-        nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q3_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-        nwarps = NWARPS_Q3_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q3_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-        nwarps = NWARPS_Q3_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-#endif
-}
-
-static void ggml_mul_mat_q4_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-        nwarps = NWARPS_Q4_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-        nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-        nwarps = NWARPS_Q4_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-        nwarps = NWARPS_Q4_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q5_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-        nwarps = NWARPS_Q5_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-        nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-        nwarps = NWARPS_Q5_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-        nwarps = NWARPS_Q5_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q6_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-        nwarps = NWARPS_Q6_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-        nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q6_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-        nwarps = NWARPS_Q6_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q6_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-        nwarps = NWARPS_Q6_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_p021_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
-}
-
-
-static void ggml_cpy_f16_f32_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_f32_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_f16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_q8_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK8_0 == 0);
-    const int num_blocks = ne / QK8_0;
-    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_q4_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK4_0 == 0);
-    const int num_blocks = ne / QK4_0;
-    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_q4_1_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK4_1 == 0);
-    const int num_blocks = ne / QK4_1;
-    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f16_f16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-
-
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
-}
-
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
-}
-
-template<typename T>
-static void rope_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(nrows, num_blocks_x, 1);
-    if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
-    } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
-    }
-}
-
-template<typename T>
-static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(nrows, num_blocks_x, 1);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
-    } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
-    }
-}
-
-static void rope_glm_f32_cuda(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, int n_ctx, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
-}
-
-static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
-                           const int k_rows, const int n_heads_log2_floor, const float m0,
-                           const float m1, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
-}
-
-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(nrows, 1, 1);
-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-}
-
-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // bitonic sort requires ncols to be power of 2
-    GGML_ASSERT((ncols & (ncols - 1)) == 0);
-
-    const dim3 block_dims(ncols, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-    } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(nrows_x, block_num_x, 1);
-    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
-}
-
-static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const dim3 block_dims(nth,     1, 1);
-    const dim3 block_nums(nrows_x, 1, 1);
-    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
-    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-
-    const uint32_t n_head_kv   = nrows_x/nrows_y;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
-
-    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-    if (shmem < g_device_caps[g_main_device].smpb) {
-        switch (ncols_x) {
-            case 32:
-                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 64:
-                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 128:
-                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 256:
-                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 512:
-                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 1024:
-                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 2048:
-                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 4096:
-                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            default:
-                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-        }
-    } else {
-        const size_t shmem_low = WARP_SIZE*sizeof(float);
-        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-    }
-}
-
-template <typename T>
-static void im2col_cuda(const float* x, T* dst,
-    int IW, int IH, int OW, int OH, int KW, int KH, int IC,
-    int batch, int batch_offset, int offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, batch * IC);
-    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
-}
-
-// buffer pool for cuda
-#define MAX_CUDA_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
-
-// #define DEBUG_CUDA_MALLOC
-struct ggml_cuda_buffer {
-    void * ptr = nullptr;
-    size_t size = 0;
-};
-
-static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
-
-static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-#ifdef DEBUG_CUDA_MALLOC
-    int nnz = 0;
-    size_t max_size = 0;
-#endif
-    size_t best_diff = 1ull << 36;
-    int ibest = -1;
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
-        if (b.ptr != nullptr) {
-#ifdef DEBUG_CUDA_MALLOC
-            ++nnz;
-            if (b.size > max_size) max_size = b.size;
-#endif
-            if (b.size >= size) {
-                size_t diff = b.size - size;
-                if (diff < best_diff) {
-                    best_diff = diff;
-                    ibest = i;
-                    if (!best_diff) {
-                        void * ptr = b.ptr;
-                        *actual_size = b.size;
-                        b.ptr = nullptr;
-                        b.size = 0;
-                        return ptr;
-                    }
-                }
-            }
-        }
-    }
-    if (ibest >= 0) {
-        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
-        void * ptr = b.ptr;
-        *actual_size = b.size;
-        b.ptr = nullptr;
-        b.size = 0;
-        return ptr;
-    }
-    void * ptr;
-    size_t look_ahead_size = (size_t) (1.05 * size);
-    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-    ggml_cuda_set_device(device);
-    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
-    *actual_size = look_ahead_size;
-    g_cuda_pool_size[device] += look_ahead_size;
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
-#endif
-    return ptr;
-}
-
-static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
-        if (b.ptr == nullptr) {
-            b.ptr = ptr;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    ggml_cuda_set_device(device);
-    CUDA_CHECK(cudaFree(ptr));
-    g_cuda_pool_size[device] -= size;
-}
-
-#if !defined(GGML_USE_HIPBLAS)
-// pool with virtual memory
-static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
-
-static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-
-    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
-    const size_t alignment = 128;
-    size = alignment * ((size + alignment - 1) / alignment);
-
-    size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
-
-    if (size > avail) {
-        // round up to the next multiple of the granularity
-        size_t reserve_size = size - avail;
-        const size_t granularity = g_device_caps[device].vmm_granularity;
-        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-
-        GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-
-        // allocate more physical memory
-        CUmemAllocationProp prop = {};
-        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id = device;
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
-
-        // reserve virtual address space (if not already reserved)
-        if (g_cuda_pool_addr[device] == 0) {
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
-        }
-
-        // map at the end of the pool
-        CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
-
-        // the memory allocation handle is no longer needed after mapping
-        CU_CHECK(cuMemRelease(handle));
-
-        // set access
-        CUmemAccessDesc access = {};
-        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access.location.id = device;
-        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
-
-        // add to the pool
-        g_cuda_pool_size[device] += reserve_size;
-
-        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
-        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-        //       (unsigned long long) (reserve_size/1024/1024));
-    }
-
-    GGML_ASSERT(g_cuda_pool_addr[device] != 0);
-
-    void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
-    *actual_size = size;
-    g_cuda_pool_used[device] += size;
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
-#endif
-
-    return ptr;
-}
-
-static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
-#endif
-
-    g_cuda_pool_used[device] -= size;
-
-    // all deallocations must be in reverse order of the allocations
-    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
-}
-
-static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
-    if (g_device_caps[device].vmm) {
-        return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
-    } else {
-        return ggml_cuda_pool_malloc_leg(device, size, actual_size);
-    }
-}
-
-static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
-    if (g_device_caps[device].vmm) {
-        ggml_cuda_pool_free_vmm(device, ptr, size);
-    } else {
-        ggml_cuda_pool_free_leg(device, ptr, size);
-    }
-}
-#else
-#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
-#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
-#endif // !defined(GGML_USE_HIPBLAS)
-
-template<typename T>
-struct cuda_pool_alloc {
-    int device = -1;
-    T * ptr = nullptr;
-    size_t actual_size = 0;
-
-    // size is in number of elements
-    T * alloc(size_t size) {
-        GGML_ASSERT(ptr == nullptr);
-        CUDA_CHECK(cudaGetDevice(&device));
-        ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
-        return ptr;
-    }
-
-    cuda_pool_alloc(size_t size) {
-        alloc(size);
-    }
-
-    ~cuda_pool_alloc() {
-        if (ptr != nullptr) {
-            ggml_cuda_pool_free(device, ptr, actual_size);
-        }
-    }
-
-    T * get() {
-        return ptr;
-    }
-
-    cuda_pool_alloc() = default;
-    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
-    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
-};
-
-static bool g_cublas_loaded = false;
-
-GGML_CALL bool ggml_cublas_loaded(void) {
-    return g_cublas_loaded;
-}
-
-GGML_CALL void ggml_init_cublas() {
-    static bool initialized = false;
-
-    if (!initialized) {
-
-#ifdef __HIP_PLATFORM_AMD__
-        // Workaround for a rocBLAS bug when using multiple graphics cards:
-        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-        rocblas_initialize();
-        CUDA_CHECK(cudaDeviceSynchronize());
-#endif
-
-        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
-            initialized = true;
-            g_cublas_loaded = false;
-            fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
-            return;
-        }
-
-        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
-        int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
-        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int id = 0; id < g_device_count; ++id) {
-            int device_vmm = 0;
-
-#if !defined(GGML_USE_HIPBLAS)
-            CUdevice device;
-            CU_CHECK(cuDeviceGet(&device, id));
-            CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
-
-            if (device_vmm) {
-                CUmemAllocationProp alloc_prop = {};
-                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-                alloc_prop.location.id = id;
-                CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-            }
-#endif // !defined(GGML_USE_HIPBLAS)
-            g_device_caps[id].vmm = !!device_vmm;
-
-            cudaDeviceProp prop;
-            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-
-            g_default_tensor_split[id] = total_vram;
-            total_vram += prop.totalGlobalMem;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
-#else
-            g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_device_caps[id].smpb = prop.sharedMemPerBlock;
-        }
-        for (int id = 0; id < g_device_count; ++id) {
-            g_default_tensor_split[id] /= total_vram;
-        }
-
-        for (int id = 0; id < g_device_count; ++id) {
-            ggml_cuda_set_device(id);
-
-            // create cuda streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
-            }
-
-            // create cublas handle
-            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
-            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-        }
-
-        // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-        initialized = true;
-        g_cublas_loaded = true;
-    }
-}
-
-GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
-    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * ptr = nullptr;
-    cudaError_t err = cudaMallocHost((void **) &ptr, size);
-    if (err != cudaSuccess) {
-        // clear the error
-        cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size/1024.0/1024.0, cudaGetErrorString(err));
-        return nullptr;
-    }
-
-    return ptr;
-}
-
-GGML_CALL void ggml_cuda_host_free(void * ptr) {
-    CUDA_CHECK(cudaFreeHost(ptr));
-}
-
-static cudaError_t ggml_cuda_cpy_tensor_2d(
-    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
-
-    cudaMemcpyKind kind;
-    char * src_ptr;
-    if (src->backend == GGML_BACKEND_TYPE_CPU) {
-        kind = cudaMemcpyHostToDevice;
-        src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
-        kind = cudaMemcpyDeviceToDevice;
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
-        int id;
-        CUDA_CHECK(cudaGetDevice(&id));
-        src_ptr = (char *) extra->data_device[id];
-    } else {
-        GGML_ASSERT(false);
-    }
-    char * dst_ptr = (char *) dst;
-
-    const int64_t ne0 = src->ne[0];
-    const int64_t nb0 = src->nb[0];
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const int64_t ts = ggml_type_size(type);
-    const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
-
-    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
-    }
-}
-
-static void ggml_cuda_op_get_rows(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-template<class op>
-static void ggml_cuda_op_bin_bcast(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_cuda_op_repeat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-
-    (void) src1;
-    (void) src1_d;
-}
-
-static void ggml_cuda_op_add(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-static void ggml_cuda_op_acc(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    (void) dst;
-}
-
-static void ggml_cuda_op_mul(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-static void ggml_cuda_op_div(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-static void ggml_cuda_op_gelu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_silu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_gelu_quick(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_tanh(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_relu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_hardsigmoid(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_hardswish(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_leaky_relu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_sqr(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_group_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_concat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
-static void ggml_cuda_op_upscale(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    const int scale_factor = dst->op_params[0];
-
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_pad(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_rms_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    const int64_t ne00 = src0->ne[0];
-
-    const int64_t ne10 = src1->ne[0];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-}
-
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
-    for (int id = 0; id < g_device_count; ++id) {
-        if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_device_caps[id].cc) {
-                min_compute_capability = g_device_caps[id].cc;
-            }
-            if (max_compute_capability < g_device_caps[id].cc) {
-                max_compute_capability = g_device_caps[id].cc;
-            }
-        }
-    }
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-}
-
-static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
-    const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
-
-    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
-    *row_low -= *row_low % rounding;
-
-    if (id == g_device_count - 1) {
-        *row_high = nrows;
-    } else {
-        *row_high = nrows*tensor_split[id + 1];
-        *row_high -= *row_high % rounding;
-    }
-}
-
-static void ggml_cuda_op_mul_mat_vec_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    const int64_t ne10 = src1->ne[0];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ2_XXS:
-            mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ2_XS:
-            mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ2_S:
-            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ3_XXS:
-            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ1_S:
-            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ4_NL:
-            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ4_XS:
-            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        case GGML_TYPE_IQ3_S:
-            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
-                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-static void ggml_cuda_op_dequantize_mul_mat_vec(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    cuda_pool_alloc<half> src1_dfloat_a;
-    half * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-        GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-static void ggml_cuda_op_mul_mat_cublas(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    GGML_ASSERT(src0_dd_i  != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i   != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
-
-    const int compute_capability = g_device_caps[id].cc;
-
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-        //printf("this branch\n");
-        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        cuda_pool_alloc<half> src0_as_f16;
-        if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = row_diff*ne00;
-            src0_as_f16.alloc(ne);
-            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
-        }
-        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
-
-        cuda_pool_alloc<half> src1_as_f16;
-        if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = src1_ncols*ne10;
-            src1_as_f16.alloc(ne);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
-        }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
-
-        const half alpha_f16 = 1.0f;
-        const half beta_f16 = 0.0f;
-
-        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-        CUBLAS_CHECK(
-            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                    row_diff, src1_ncols, ne10,
-                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
-                                src1_ptr,       CUDA_R_16F, ne10,
-                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else {
-        cuda_pool_alloc<float> src0_ddq_as_f32;
-        cuda_pool_alloc<float> src1_ddq_as_f32;
-
-        if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32.alloc(row_diff*ne00);
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
-        }
-        if (src1->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src1_ddq_as_f32.alloc(src1_ncols*ne10);
-            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
-        }
-
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
-        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
-
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-
-        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-        CUBLAS_CHECK(
-            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                    row_diff, src1_ncols, ne10,
-                    &alpha, src0_ddf_i,  ne00,
-                            src1_ddf1_i, ne10,
-                    &beta,  dst_dd_i,    ldc));
-    }
-
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
-}
-
-static void ggml_cuda_op_rope(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
-    }
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_alibi(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_pool2d(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = src0->ne[1];
-    const int64_t IW = src0->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
-    dim3 block_nums(num_blocks);
-    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_im2col(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const int64_t batch = src1->ne[3];
-    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
-
-    if(dst->type == GGML_TYPE_F16) {
-        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-    } else {
-        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-    }
-
-    (void) src0;
-    (void) src0_dd;
-}
-
-static void ggml_cuda_op_sum_rows(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_argsort(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_diag_mask_inf(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_soft_max(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-
-    const int64_t ne00    = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src0->ne[1];
-
-    float scale    = 1.0f;
-    float max_bias = 0.0f;
-
-    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-
-    // positions tensor
-    float * src2_dd = nullptr;
-    cuda_pool_alloc<float> src2_f;
-
-    ggml_tensor * src2 = dst->src[2];
-    const bool use_src2 = src2 != nullptr;
-
-    if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
-    }
-
-    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
-}
-
-static void ggml_cuda_op_scale(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float scale;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_clamp(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device =             src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_TYPE_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    cuda_pool_alloc<float> src0_f;
-    cuda_pool_alloc<float> src1_f;
-    cuda_pool_alloc<float>  dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_set_peer_access(const int n_tokens) {
-    static bool peer_access_enabled = false;
-
-    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
-
-    if (peer_access_enabled == enable_peer_access) {
-        return;
-    }
-
-#ifdef NDEBUG
-    for (int id = 0; id < g_device_count; ++id) {
-        ggml_cuda_set_device(id);
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-
-    for (int id = 0; id < g_device_count; ++id) {
-        ggml_cuda_set_device(id);
-
-        for (int id_other = 0; id_other < g_device_count; ++id_other) {
-            if (id == id_other) {
-                continue;
-            }
-            if (id != g_main_device && id_other != g_main_device) {
-                continue;
-            }
-
-            int can_access_peer;
-            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
-            if (can_access_peer) {
-                if (enable_peer_access) {
-                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
-                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
-                        CUDA_CHECK(err);
-                    }
-                } else {
-                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
-                    if (err != cudaErrorPeerAccessNotEnabled) {
-                        CUDA_CHECK(err);
-                    }
-                }
-            }
-        }
-    }
-#endif // NDEBUG
-
-    peer_access_enabled = enable_peer_access;
-}
-
-// FIXME: move this somewhere else
-struct ggml_backend_cuda_split_buffer_type_context {
-    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
-};
-
-static void ggml_cuda_op_mul_mat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-    const bool convert_src1_to_q8_1) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int64_t nrows1 = ggml_nrows(src1);
-
-    GGML_ASSERT(ne03 == ne13);
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
-
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-
-    const int64_t i02_divisor = ne12 / ne02;
-
-    const size_t src0_ts = ggml_type_size(src0->type);
-    const size_t src0_bs = ggml_blck_size(src0->type);
-    const size_t q8_1_ts = sizeof(block_q8_1);
-    const size_t q8_1_bs = QK8_1;
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src0_is_contiguous = ggml_is_contiguous(src0);
-    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    GGML_ASSERT(!(split && ne02 > 1));
-    GGML_ASSERT(!(split && ne03 > 1));
-    GGML_ASSERT(!(split && ne02 < ne12));
-
-    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
-    if (split) {
-        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
-        // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
-        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
-        tensor_split = buft_ctx->tensor_split;
-    }
-
-    struct dev_data {
-        cuda_pool_alloc<char>  src0_dd_alloc;
-        cuda_pool_alloc<float> src1_ddf_alloc;
-        cuda_pool_alloc<char>  src1_ddq_alloc;
-        cuda_pool_alloc<float>   dst_dd_alloc;
-
-        char  *  src0_dd = nullptr;
-        float * src1_ddf = nullptr; // float
-        char  * src1_ddq = nullptr; // q8_1
-        float *   dst_dd = nullptr;
-
-        int64_t  row_low;
-        int64_t row_high;
-    };
-
-    dev_data dev[GGML_CUDA_MAX_DEVICES];
-
-    int used_devices = 0;
-
-    for (int id = 0; id < g_device_count; ++id) {
-        // by default, use all rows
-        dev[id].row_low  = 0;
-        dev[id].row_high = ne01;
-
-        // for multi GPU, get the row boundaries from tensor split
-        // and round to mul_mat_q tile sizes
-        if (split) {
-            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
-
-            if (id != 0) {
-                dev[id].row_low  = ne01*tensor_split[id];
-                if (dev[id].row_low < ne01) {
-                    dev[id].row_low -= dev[id].row_low % rounding;
-                }
-            }
-
-            if (id != g_device_count - 1) {
-                dev[id].row_high  = ne01*tensor_split[id + 1];
-                if (dev[id].row_high < ne01) {
-                    dev[id].row_high -= dev[id].row_high % rounding;
-                }
-            }
-        }
-    }
-
-    for (int id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
-            continue;
-        }
-
-        used_devices++;
-
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-        const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-
-        ggml_cuda_set_device(id);
-        cudaStream_t stream = g_cudaStreams[id][0];
-
-        if (src0_on_device && src0_is_contiguous) {
-            dev[id].src0_dd = (char *) src0_extra->data_device[id];
-        } else {
-            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
-        }
-
-        if (src1_on_device && src1_is_contiguous) {
-            dev[id].src1_ddf = (float *) src1_extra->data_device[id];
-        } else {
-            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
-        }
-
-        if (convert_src1_to_q8_1) {
-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
-
-            if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
-                CUDA_CHECK(cudaGetLastError());
-            }
-        }
-
-        if (dst_on_device) {
-            dev[id].dst_dd = (float *) dst_extra->data_device[id];
-        } else {
-            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
-            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
-        }
-    }
-
-    // if multiple devices are used they need to wait for the main device
-    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && used_devices > 1) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
-    }
-
-    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
-        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
-        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-
-        for (int id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
-                continue;
-            }
-
-            const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-            const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-            const int64_t row_diff = dev[id].row_high - dev[id].row_low;
-
-            ggml_cuda_set_device(id);
-            cudaStream_t stream = g_cudaStreams[id][is];
-
-            // wait for main GPU data if necessary
-            if (split && (id != g_main_device || is != 0)) {
-                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
-            }
-
-            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
-                const int64_t i03 = i0 / ne12;
-                const int64_t i02 = i0 % ne12;
-
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
-
-                // for split tensors the data begins at i0 == i0_offset_low
-                char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
-                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
-                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-
-                // the main device memory buffer can be on VRAM scratch, with space for all partial results
-                // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
-                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
-                }
-
-                // copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
-                    if (id != g_main_device) {
-                        if (convert_src1_to_q8_1) {
-                            char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
-                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
-                        } else {
-                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
-                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
-                                                            src1_ncols*ne10*sizeof(float), stream));
-                        }
-                    }
-                } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
-                    CUDA_CHECK(cudaGetLastError());
-                }
-
-                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
-                }
-
-                // do the computation
-                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
-                CUDA_CHECK(cudaGetLastError());
-
-                // copy dst to host or other device if necessary
-                if (!dst_on_device) {
-                    void * dst_off_device;
-                    cudaMemcpyKind kind;
-                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                        dst_off_device = dst->data;
-                        kind = cudaMemcpyDeviceToHost;
-                    } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device];
-                        kind = cudaMemcpyDeviceToDevice;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                    if (split) {
-                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
-                        // dst is NOT transposed.
-                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
-                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
-                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-#if !defined(GGML_USE_HIPBLAS)
-                        if (kind == cudaMemcpyDeviceToDevice) {
-                            // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
-                            cudaMemcpy3DPeerParms p = {};
-                            p.dstDevice = g_main_device;
-                            p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
-                            p.srcDevice = id;
-                            p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
-                            p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
-                            CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-                        } else
-#endif
-                        {
-                            CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
-                                                            dst_dd_i, row_diff*sizeof(float),
-                                                            row_diff*sizeof(float), src1_ncols,
-                                                            kind, stream));
-                        }
-                    } else {
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0;
-                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
-                    }
-                }
-
-                // add event for the main device to wait on until other device is done
-                if (split && (id != g_main_device || is != 0)) {
-                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
-                }
-            }
-        }
-    }
-
-    // main device waits for all other devices to be finished
-    if (split && g_device_count > 1) {
-        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
-        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
-
-        ggml_cuda_set_device(g_main_device);
-        for (int id = 0; id < g_device_count; ++id) {
-            if (dev[id].row_low == dev[id].row_high) {
-                continue;
-            }
-            for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
-}
-
-static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
-}
-
-static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
-}
-
-static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
-}
-
-static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
-}
-
-static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
-}
-
-static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
-}
-
-static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
-}
-
-static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
-}
-
-static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
-}
-
-static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
-}
-
-static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
-}
-
-static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
-}
-static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
-}
-
-static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
-}
-
-static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
-}
-
-static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
-}
-
-static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
-}
-
-static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
-}
-
-static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
-}
-
-static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
-}
-
-GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-             dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
-static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-
-static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    const int64_t row_stride_x = nb01 / sizeof(half);
-    const int64_t channel_stride_x = nb02 / sizeof(half);
-
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-
-static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, char * dst,
-        const void ** ptrs_src, void ** ptrs_dst,
-        int64_t ne12, int64_t ne13,
-        int64_t ne23,
-        size_t  nb02, size_t  nb03,
-        size_t  nb12, size_t  nb13,
-        size_t  nbd2, size_t  nbd3,
-        int64_t r2,   int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
-}
-
-static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t ne_dst = ggml_nelements(dst);
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-    half * src0_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    cuda_pool_alloc<half> src1_f16_alloc;
-    if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-        const int64_t ne_src1 = ggml_nelements(src1);
-        src1_f16_alloc.alloc(ne_src1);
-        GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
-    }
-    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
-
-    cuda_pool_alloc<half> dst_f16;
-    char * dst_t;
-
-    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
-    cudaDataType_t      cu_data_type    = CUDA_R_16F;
-
-    // dst strides
-    size_t nbd2 = dst->nb[2];
-    size_t nbd3 = dst->nb[3];
-
-    const half  alpha_f16 = 1.0f;
-    const half  beta_f16  = 0.0f;
-
-    const float alpha_f32 = 1.0f;
-    const float beta_f32  = 0.0f;
-
-    const void * alpha = &alpha_f16;
-    const void * beta  = &beta_f16;
-
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(ne_dst);
-
-        nbd2 /= sizeof(float) / sizeof(half);
-        nbd3 /= sizeof(float) / sizeof(half);
-    } else {
-        dst_t = (char *) dst_ddf;
-
-        cu_compute_type = CUBLAS_COMPUTE_32F;
-        cu_data_type    = CUDA_R_32F;
-
-        alpha = &alpha_f32;
-        beta  = &beta_f32;
-    }
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-#if 0
-    // use cublasGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                CUBLAS_CHECK(
-                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
-                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use cublasGemmStridedBatchedEx
-        CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                ne01, ne11, ne10,
-                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
-                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
-                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
-                ne12*ne13,
-                cu_compute_type,
-                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    } else {
-        // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
-
-        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
-        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
-
-        dim3 block_dims(ne13, ne12);
-        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_f16, src1_f16, dst_t,
-                ptrs_src.get(), ptrs_dst.get(),
-                ne12, ne13,
-                ne23,
-                nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
-                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
-                nbd2, nbd3,
-                r2, r3);
-        CUDA_CHECK(cudaGetLastError());
-
-        CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
-                ne23,
-                cu_compute_type,
-                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-#endif
-
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
-    }
-}
-
-static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
-        ( dst->backend == GGML_BACKEND_TYPE_GPU);
-
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-
-    int64_t min_compute_capability = INT_MAX;
-
-    bool any_pascal_with_slow_fp16 = false;
-    if (split) {
-        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
-        auto & tensor_split = buft_ctx->tensor_split;
-        for (int id = 0; id < g_device_count; ++id) {
-            // skip devices that are not going to do any work:
-            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
-                continue;
-            }
-
-            if (min_compute_capability > g_device_caps[id].cc) {
-                min_compute_capability = g_device_caps[id].cc;
-            }
-            if (g_device_caps[id].cc == 610) {
-                any_pascal_with_slow_fp16 = true;
-            }
-        }
-    } else {
-        min_compute_capability    = g_device_caps[g_main_device].cc;
-        any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
-    }
-
-    // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
-
-    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-
-    bool              use_mul_mat_q =  ggml_cuda_supports_mmq(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-
-#ifdef CUDA_USE_TENSOR_CORES
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
-#endif // CUDA_USE_TENSOR_CORES
-
-#else
-
-    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
-
-    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
-    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
-    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;
-
-#ifdef CUDA_USE_TENSOR_CORES
-    // when tensor cores are available, use them for large batch size
-    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q     = use_mul_mat_q     && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
-#endif // CUDA_USE_TENSOR_CORES
-
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
-        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
-    } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-    } else if (use_mul_mat_vec_q) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-    } else if (use_mul_mat_q) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-    } else {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-    }
-}
-
-#if 0
-template<typename ... Srcs>
-static __global__ void k_compute_batched_ptrs_id(
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3,
-        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
-        const half * src1_f16, half * dst_f16,
-        const int32_t * ids, const int id,
-        Srcs... src0s) {
-
-    int i = ids[id];
-
-    half * src0_f16;
-    const void * srcs_ar[] = { (const half *) src0s... };
-    if (src0_type == GGML_TYPE_F16) {
-        src0_f16 = (half *) srcs_ar[i];
-    } else {
-        src0_f16 = src0_as_f16;
-        if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
-        }
-    }
-
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src00 = dst->src[2];
-
-    const int id = dst->op_params[0];
-
-    GGML_ASSERT(!ggml_is_transposed(src00));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
-
-    //const int64_t nb01 = src00->nb[1];
-    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device];
-    //half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-    // use cublasGemmBatchedEx
-    const int ne23 = ne12*ne13;
-
-    const void ** ptrs_src = nullptr;
-          void ** ptrs_dst = nullptr;
-
-    size_t ptrs_src_s = 0;
-    size_t ptrs_dst_s = 0;
-
-    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-    int64_t src0_ne = ggml_nelements(src00);
-    half * src0_as_f16 = nullptr;
-    size_t src0_as = 0;
-    if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
-    }
-
-    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
-            ptrs_src, ptrs_dst,
-            ne12, ne13,
-            ne23,
-            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
-            nb12, nb13,
-            dst->nb[2], dst->nb[3],
-            r2, r3,
-            src00->type, src0_as_f16, src0_ne,
-            src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    CUBLAS_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-            ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
-            ne23,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    if (src0_as != 0) {
-        ggml_cuda_pool_free(src0_as_f16, src0_as);
-    }
-    if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-    }
-    if (ptrs_dst_s != 0) {
-        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-    }
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-#endif
-
-static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#if 0
-    ggml_cuda_mul_mat_id_cublas(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    const size_t nb11 = src1->nb[1];
-    const size_t nb1  =  dst->nb[1];
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-
-    cudaStream_t stream = g_cudaStreams[g_main_device][0];
-
-    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
-        CUDA_CHECK(cudaStreamSynchronize(stream));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
-
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
-
-    ggml_tensor_extra_gpu src1_row_extra;
-    ggml_tensor_extra_gpu dst_row_extra;
-
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    src1_row.backend = GGML_BACKEND_TYPE_GPU;
-    dst_row.backend  = GGML_BACKEND_TYPE_GPU;
-
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
-
-    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
-
-    if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-        GGML_ASSERT(dst->backend  == GGML_BACKEND_TYPE_GPU);
-
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-        }
-    } else {
-        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
-        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
-
-        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
-        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
-
-        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
-            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
-            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
-
-        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
-                                        nb11, src1_kind, stream));
-                num_src1_rows++;
-            }
-
-            if (num_src1_rows == 0) {
-                continue;
-            }
-
-            src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;
-
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-
-            num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-                                        nb1, dst_kind, stream));
-                num_src1_rows++;
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaStreamSynchronize(stream));
-    }
-}
-
-static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
-}
-
-static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
-}
-
-static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
-    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    //GGML_ASSERT(src0->ne[3] == 1);
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-    const int64_t nb03 = src0->nb[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-
-    //GGML_ASSERT(src1->ne[3] == 1);
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
-                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-
-    (void) dst;
-}
-
-static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    ggml_cuda_cpy(src0, dst, nullptr);
-    (void) src1;
-}
-
-static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
-}
-
-static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
-}
-
-static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
-}
-
-static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
-}
-
-static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
-}
-
-static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
-}
-
-static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
-}
-
-static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
-}
-
-static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
-}
-
-static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
-}
-
-GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
-    if (main_device >= g_device_count) {
-        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-                main_device, g_device_count, g_main_device);
-        return;
-    }
-
-    if (g_main_device != main_device && g_device_count > 1) {
-        g_main_device = main_device;
-        //cudaDeviceProp prop;
-        //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
-        //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
-    }
-}
-
-GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
-
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
-            return false;
-        }
-    }
-
-    switch (tensor->op) {
-        case GGML_OP_REPEAT:
-            func = ggml_cuda_repeat;
-            break;
-        case GGML_OP_GET_ROWS:
-            func = ggml_cuda_get_rows;
-            break;
-        case GGML_OP_DUP:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_ADD:
-            func = ggml_cuda_add;
-            break;
-        case GGML_OP_ACC:
-            func = ggml_cuda_acc;
-            break;
-        case GGML_OP_MUL:
-            func = ggml_cuda_mul;
-            break;
-        case GGML_OP_DIV:
-            func = ggml_cuda_div;
-            break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(tensor)) {
-                case GGML_UNARY_OP_GELU:
-                    func = ggml_cuda_gelu;
-                    break;
-                case GGML_UNARY_OP_SILU:
-                    func = ggml_cuda_silu;
-                    break;
-                case GGML_UNARY_OP_GELU_QUICK:
-                    func = ggml_cuda_gelu_quick;
-                    break;
-                case GGML_UNARY_OP_TANH:
-                    func = ggml_cuda_tanh;
-                    break;
-                case GGML_UNARY_OP_RELU:
-                    func = ggml_cuda_relu;
-                    break;
-                case GGML_UNARY_OP_HARDSIGMOID:
-                    func = ggml_cuda_hardsigmoid;
-                    break;
-                case GGML_UNARY_OP_HARDSWISH:
-                    func = ggml_cuda_hardswish;
-                    break;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_NORM:
-            func = ggml_cuda_norm;
-            break;
-        case GGML_OP_GROUP_NORM:
-            func = ggml_cuda_group_norm;
-            break;
-        case GGML_OP_CONCAT:
-            func = ggml_cuda_concat;
-            break;
-        case GGML_OP_UPSCALE:
-            func = ggml_cuda_upscale;
-            break;
-        case GGML_OP_PAD:
-            func = ggml_cuda_pad;
-            break;
-        case GGML_OP_LEAKY_RELU:
-            func = ggml_cuda_leaky_relu;
-            break;
-        case GGML_OP_RMS_NORM:
-            func = ggml_cuda_rms_norm;
-            break;
-        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat;
-            break;
-        case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat_id;
-            break;
-        case GGML_OP_SCALE:
-            func = ggml_cuda_scale;
-            break;
-        case GGML_OP_SQR:
-            func = ggml_cuda_sqr;
-            break;
-        case GGML_OP_CLAMP:
-            func = ggml_cuda_clamp;
-            break;
-        case GGML_OP_CPY:
-            func = ggml_cuda_cpy;
-            break;
-        case GGML_OP_CONT:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            func = ggml_cuda_nop;
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-            func = ggml_cuda_diag_mask_inf;
-            break;
-        case GGML_OP_SOFT_MAX:
-            func = ggml_cuda_soft_max;
-            break;
-        case GGML_OP_ROPE:
-            func = ggml_cuda_rope;
-            break;
-        case GGML_OP_ALIBI:
-            func = ggml_cuda_alibi;
-            break;
-        case GGML_OP_IM2COL:
-            func = ggml_cuda_im2col;
-            break;
-        case GGML_OP_POOL_2D:
-            func = ggml_cuda_pool2d;
-            break;
-        case GGML_OP_SUM_ROWS:
-            func = ggml_cuda_sum_rows;
-            break;
-        case GGML_OP_ARGSORT:
-            func = ggml_cuda_argsort;
-            break;
-        default:
-            return false;
-    }
-
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
-    }
-
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return true;
-    }
-    func(tensor->src[0], tensor->src[1], tensor);
-    return true;
-}
-
-GGML_CALL int ggml_cuda_get_device_count() {
-    int device_count;
-    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
-        return 0;
-    }
-    return device_count;
-}
-
-GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
-    cudaDeviceProp prop;
-    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-    snprintf(description, description_size, "%s", prop.name);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_backend_cuda_context {
-    int device;
-    std::string name;
-};
-
-// cuda buffer
-
-struct ggml_backend_cuda_buffer_context {
-    int device;
-    void * dev_ptr = nullptr;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-    std::string name;
-
-    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
-        device(device), dev_ptr(dev_ptr),
-        name(GGML_CUDA_NAME + std::to_string(device)) {
-    }
-
-    ~ggml_backend_cuda_buffer_context() {
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-        // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        memset(extra, 0, sizeof(*extra));
-
-        return extra;
-    }
-};
-
-GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
-GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
-}
-
-GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->dev_ptr));
-    delete ctx;
-}
-
-GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    return ctx->dev_ptr;
-}
-
-GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
-
-    extra->data_device[ctx->device] = tensor->data;
-
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
-    tensor->extra = extra;
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
-        }
-    }
-}
-
-GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_cuda(src->buffer)) {
-        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
-        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-        ggml_cuda_set_device(src_ctx->device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        ggml_cuda_set_device(dst_ctx->device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaDeviceSynchronize());
-
-        return true;
-    }
-    return false;
-}
-
-GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cuda_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// cuda buffer type
-struct ggml_backend_cuda_buffer_type_context {
-    int device;
-    std::string name;
-};
-
-GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-
-    return ctx->name.c_str();
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-
-    ggml_cuda_set_device(buft_ctx->device);
-
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
-    void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
-        return nullptr;
-    }
-
-    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
-}
-
-GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    size_t size = ggml_nbytes(tensor);
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return size;
-
-    UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
-static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
-    /* .is_host          = */ NULL,
-};
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    // FIXME: this is not thread safe
-    if (device >= ggml_backend_cuda_get_device_count()) {
-        return nullptr;
-    }
-
-    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
-
-    static bool ggml_backend_cuda_buffer_type_initialized = false;
-
-    if (!ggml_backend_cuda_buffer_type_initialized) {
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_cuda_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
-                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
-            };
-        }
-        ggml_backend_cuda_buffer_type_initialized = true;
-    }
-
-    return &ggml_backend_cuda_buffer_types[device];
-}
-
-// cuda split buffer
-
-struct ggml_backend_cuda_split_buffer_context {
-    ~ggml_backend_cuda_split_buffer_context() {
-        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int id = 0; id < g_device_count; ++id) {
-                for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                    if (extra->events[id][is] != nullptr) {
-                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
-                    }
-                }
-                if (extra->data_device[id] != nullptr) {
-                    CUDA_CHECK(cudaFree(extra->data_device[id]));
-                }
-            }
-            delete extra;
-        }
-    }
-
-    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
-};
-
-GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Split";
-
-    UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
-}
-
-GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
-    delete ctx;
-}
-
-GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
-    return (void *)0x1000;
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
-
-    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
-    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-
-    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
-
-    ctx->tensor_extras.push_back(extra);
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        // FIXME: do not crash if cudaMalloc fails
-        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_cuda_set_device(id);
-        char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
-
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
-        }
-
-        extra->data_device[id] = buf;
-
-        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
-        }
-    }
-    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
-    tensor->extra = extra;
-}
-
-GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    // split tensors must always be set in their entirety at once
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        const char * buf_host = (const char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
-    }
-}
-
-GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    // split tensors must always be set in their entirety at once
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        char * buf_host = (char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
-    }
-}
-
-GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    UNUSED(buffer);
-    UNUSED(value);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// cuda split buffer type
-
-GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_CUDA_NAME "_Split";
-
-    UNUSED(buft);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
-    // instead, we allocate them for each tensor separately in init_tensor
-    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
-    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
-    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
-}
-
-GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
-
-    size_t total_size = 0;
-
-    const int64_t ne0 = tensor->ne[0];
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        total_size += ggml_nbytes_split(tensor, nrows_split);
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return total_size;
-}
-
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return false;
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
-    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
-};
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
-    // FIXME: this is not thread safe
-    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
-
-    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
-
-    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
-    if (all_zero) {
-        tensor_split_arr = g_default_tensor_split;
-    } else {
-        float split_sum = 0.0f;
-        for (int i = 0; i < g_device_count; ++i) {
-            tensor_split_arr[i] = split_sum;
-            split_sum += tensor_split[i];
-        }
-        for (int i = 0; i < g_device_count; ++i) {
-            tensor_split_arr[i] /= split_sum;
-        }
-    }
-
-    auto it = buft_map.find(tensor_split_arr);
-    if (it != buft_map.end()) {
-        return &it->second;
-    }
-
-    struct ggml_backend_buffer_type buft {
-        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
-        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
-    };
-
-    auto result = buft_map.emplace(tensor_split_arr, buft);
-    return &result.first->second;
-}
-
-// host buffer type
-
-GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_CUDA_NAME "_Host";
-
-    UNUSED(buft);
-}
-
-GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Host";
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cuda_host_free(buffer->context);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cuda_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_cuda_buffer_type_host;
-}
-
-// backend
-
-GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return cuda_ctx->name.c_str();
-}
-
-GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    delete cuda_ctx;
-    delete backend;
-}
-
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
-GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
-}
-
-GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
-}
-
-GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
-        return true;
-    }
-
-    return false;
-}
-
-GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-
-    UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_main_device(cuda_ctx->device);
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_TYPE_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-            continue;
-        }
-
-#ifndef NDEBUG
-        assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-        assert(node->extra != nullptr);
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
-                assert(node->src[j]->extra != nullptr);
-            }
-        }
-#endif
-
-        bool ok = ggml_cuda_compute_forward(&params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
-    }
-
-    return true;
-}
-
-GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_HARDSIGMOID:
-                case GGML_UNARY_OP_HARDSWISH:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_TANH:
-                    return true;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
-                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
-                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
-                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
-                        return false;
-                    }
-                }
-                return true;
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                        return true;
-                    default:
-                        return false;
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                return false;
-            } break;
-        case GGML_OP_DUP:
-        case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-            } break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
-        case GGML_OP_ADD:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
-        case GGML_OP_IM2COL:
-        case GGML_OP_POOL_2D:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_ACC:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        default:
-            return false;
-    }
-
-    UNUSED(backend);
-}
-
-static ggml_backend_i ggml_backend_cuda_interface = {
-    /* .get_name                = */ ggml_backend_cuda_name,
-    /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
-    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ ggml_backend_cuda_supports_op,
-};
-
-static ggml_guid_t ggml_backend_cuda_guid() {
-    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
-    return &guid;
-}
-
-GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
-
-    if (device < 0 || device >= ggml_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
-        return nullptr;
-    }
-
-    // not strictly necessary, but it may reduce the overhead of the first graph_compute
-    ggml_cuda_set_main_device(device);
-
-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
-        /* .device = */ device,
-        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
-    };
-
-    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_cuda_guid(),
-        /* .interface = */ ggml_backend_cuda_interface,
-        /* .context   = */ ctx
-    };
-
-    return cuda_backend;
-}
-
-GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
-}
-
-GGML_CALL int ggml_backend_cuda_get_device_count() {
-    return ggml_cuda_get_device_count();
-}
-
-GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
-    ggml_cuda_get_device_description(device, description, description_size);
-}
-
-GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
-    ggml_cuda_set_device(device);
-
-    CUDA_CHECK(cudaMemGetInfo(free, total));
-}
-
-// backend registry
-GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
-    return cuda_backend;
-
-    UNUSED(params);
-}
-
-extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-
-GGML_CALL int ggml_backend_cuda_reg_devices() {
-    int device_count = ggml_cuda_get_device_count();
-    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
-    for (int i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
-    }
-    return device_count;
-}
diff --git a/ggml-cuda.h b/ggml-cuda.h
deleted file mode 100644
index b1ebd61d7..000000000
--- a/ggml-cuda.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API GGML_CALL void   ggml_init_cublas(void);
-
-// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API GGML_CALL bool   ggml_cublas_loaded(void);
-
-GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
-GGML_API GGML_CALL void   ggml_cuda_host_free(void * ptr);
-
-GGML_API GGML_CALL bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API GGML_CALL bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API GGML_CALL int    ggml_cuda_get_device_count(void);
-GGML_API GGML_CALL void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml-impl.h b/ggml-impl.h
deleted file mode 100644
index c5637e4d4..000000000
--- a/ggml-impl.h
+++ /dev/null
@@ -1,261 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-// GGML internal header
-
-#include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    __fp16 tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    __fp16 tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
-
-struct ggml_hash_set ggml_hash_set_new(size_t size);
-
-bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// return index, asserts if table is full
-size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml-metal.m b/ggml-metal.m
deleted file mode 100644
index 71fcca560..000000000
--- a/ggml-metal.m
+++ /dev/null
@@ -1,2831 +0,0 @@
-#import "ggml-metal.h"
-
-#import "ggml-backend-impl.h"
-#import "ggml.h"
-
-#import <Foundation/Foundation.h>
-
-#import <Metal/Metal.h>
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#ifdef GGML_METAL_NDEBUG
-#define GGML_METAL_LOG_INFO(...)
-#define GGML_METAL_LOG_WARN(...)
-#define GGML_METAL_LOG_ERROR(...)
-#else
-#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#endif
-
-#define UNUSED(x) (void)(x)
-
-struct ggml_metal_kernel {
-    id<MTLComputePipelineState> pipeline;
-};
-
-enum ggml_metal_kernel_type {
-    GGML_METAL_KERNEL_TYPE_ADD,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW,
-    GGML_METAL_KERNEL_TYPE_MUL,
-    GGML_METAL_KERNEL_TYPE_MUL_ROW,
-    GGML_METAL_KERNEL_TYPE_DIV,
-    GGML_METAL_KERNEL_TYPE_DIV_ROW,
-    GGML_METAL_KERNEL_TYPE_SCALE,
-    GGML_METAL_KERNEL_TYPE_SCALE_4,
-    GGML_METAL_KERNEL_TYPE_TANH,
-    GGML_METAL_KERNEL_TYPE_RELU,
-    GGML_METAL_KERNEL_TYPE_GELU,
-    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
-    GGML_METAL_KERNEL_TYPE_SILU,
-    GGML_METAL_KERNEL_TYPE_SOFT_MAX,
-    GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
-    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
-    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
-    GGML_METAL_KERNEL_TYPE_RMS_NORM,
-    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
-    GGML_METAL_KERNEL_TYPE_NORM,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_ROPE_F32,
-    GGML_METAL_KERNEL_TYPE_ROPE_F16,
-    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
-    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
-    GGML_METAL_KERNEL_TYPE_IM2COL_F32,
-    GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
-    GGML_METAL_KERNEL_TYPE_PAD_F32,
-    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
-    GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,
-  //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
-  //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
-    GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
-    GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
-    GGML_METAL_KERNEL_TYPE_CONCAT,
-    GGML_METAL_KERNEL_TYPE_SQR,
-    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
-
-    GGML_METAL_KERNEL_TYPE_COUNT
-};
-
-struct ggml_metal_context {
-    int n_cb;
-
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue;
-
-    dispatch_queue_t d_queue;
-
-    struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
-
-    bool support_simdgroup_reduction;
-    bool support_simdgroup_mm;
-
-    bool should_capture_next_compute;
-};
-
-// MSL code
-// TODO: move the contents here when ready
-//       for now it is easier to work in a separate file
-// static NSString * const msl_library_source = @"see metal.metal";
-
-// Here to assist with NSBundle Path Hack
-@interface GGMLMetalClass : NSObject
-@end
-@implementation GGMLMetalClass
-@end
-
-static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
-ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
-void * ggml_metal_log_user_data = NULL;
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
-    if (ggml_metal_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
-        } else {
-            char* buffer2 = malloc(len+1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(buffer2, len+1, format, args);
-            buffer2[len] = 0;
-            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
-            free(buffer2);
-        }
-        va_end(args);
-    }
-}
-
-static void * ggml_metal_host_malloc(size_t n) {
-    void * data = NULL;
-    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
-    if (result != 0) {
-        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
-        return NULL;
-    }
-
-    return data;
-}
-
-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
-    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
-
-#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
-    // Show all the Metal device instances in the system
-    NSArray * devices = MTLCopyAllDevices();
-    for (id<MTLDevice> device in devices) {
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
-    }
-    [devices release]; // since it was created by a *Copy* C method
-#endif
-
-    // Pick and show default Metal device
-    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
-
-    // Configure context
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
-    ctx->device = device;
-    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->queue  = [ctx->device newCommandQueue];
-    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
-
-    id<MTLLibrary> metal_library;
-
-    // load library
-    {
-        NSBundle * bundle = nil;
-#ifdef SWIFT_PACKAGE
-        bundle = SWIFTPM_MODULE_BUNDLE;
-#else
-        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-#endif
-        NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
-            // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
-            metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
-            if (error) {
-                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                return NULL;
-            }
-        } else {
-#if GGML_METAL_EMBED_LIBRARY
-            GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
-
-            extern const char ggml_metallib_start[];
-            extern const char ggml_metallib_end[];
-
-            NSString * src  = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
-#else
-            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
-
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
-
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
-
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
-            } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-            }
-            if (sourcePath == nil) {
-                GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal.metal";
-            }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
-            if (error) {
-                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                return NULL;
-            }
-#endif
-
-            @autoreleasepool {
-                // dictionary of preprocessor macros
-                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
-
-#ifdef GGML_QKK_64
-                prep[@"QK_K"] = @(64);
-#endif
-
-                MTLCompileOptions* options = [MTLCompileOptions new];
-                options.preprocessorMacros = prep;
-
-                //[options setFastMathEnabled:false];
-
-                metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
-                if (error) {
-                    GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                    return NULL;
-                }
-            }
-        }
-    }
-
-    // print MTL GPU family:
-    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
-
-    const NSInteger MTLGPUFamilyMetal3 = 5001;
-
-    // determine max supported GPU family
-    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
-    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-    {
-        for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
-                break;
-            }
-        }
-
-        for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
-                break;
-            }
-        }
-
-        for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
-                break;
-            }
-        }
-    }
-
-    ctx->support_simdgroup_reduction  = [ctx->device supportsFamily:MTLGPUFamilyApple7];
-    ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3];
-
-    ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
-
-    GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-
-    ctx->should_capture_next_compute = false;
-
-#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
-    if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
-    }
-#elif TARGET_OS_OSX
-    if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
-    } else {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
-    }
-#endif
-
-    // load kernels
-    {
-        NSError * error = nil;
-
-        for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
-            ctx->kernels[i].pipeline = nil;
-        }
-
-        /*
-            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
-                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
-                    (int) kernel->pipeline.threadExecutionWidth); \
-        */
-#define GGML_METAL_ADD_KERNEL(e, name, supported) \
-        if (supported) { \
-            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
-            id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
-            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
-            [metal_function release]; \
-            if (error) { \
-                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
-                [metal_library release]; \
-                return NULL; \
-            } \
-        } else { \
-            GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \
-        }
-
-        // simd_sum and simd_max requires MTLGPUFamilyApple7
-
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                       add,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                   add_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                       mul,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                   mul_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                       div,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                   div_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                     scale,                  true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                   scale_4,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                      tanh,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                      relu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                      gelu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                gelu_quick,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                      silu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX,                  soft_max,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,                soft_max_4,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,             diag_mask_inf,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,           diag_mask_inf_8,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,              get_rows_f32,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,              get_rows_f16,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,             get_rows_q4_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,             get_rows_q4_1,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,             get_rows_q5_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,             get_rows_q5_1,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,             get_rows_q8_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,             get_rows_q2_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,             get_rows_q3_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,             get_rows_q4_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,             get_rows_q5_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,             get_rows_q6_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,          get_rows_iq3_xxs,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,            get_rows_iq3_s,         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,            get_rows_iq2_s,         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,            get_rows_iq1_s,         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,           get_rows_iq4_nl,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,           get_rows_iq4_xs,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                      norm,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,            mul_mv_f32_f32,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,            mul_mv_f16_f16,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,            mul_mv_f16_f32,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,       mul_mv_f16_f32_1row,    ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,         mul_mv_f16_f32_l4,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,           mul_mv_q4_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,           mul_mv_q4_1_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,           mul_mv_q5_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,           mul_mv_q5_1_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,           mul_mv_q8_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,           mul_mv_q2_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,           mul_mv_q3_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,           mul_mv_q4_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,           mul_mv_q5_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,           mul_mv_q6_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,        mul_mv_iq3_xxs_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,          mul_mv_iq3_s_f32,       ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,          mul_mv_iq2_s_f32,       ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,          mul_mv_iq1_s_f32,       ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,         mul_mv_iq4_nl_f32,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,         mul_mv_iq4_xs_f32,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,      ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,      ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,    mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,      mul_mv_id_f16_f32_l4,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,        mul_mv_id_q4_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,        mul_mv_id_q4_1_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,        mul_mv_id_q5_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,        mul_mv_id_q5_1_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,        mul_mv_id_q8_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,        mul_mv_id_q2_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,        mul_mv_id_q3_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,        mul_mv_id_q4_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,        mul_mv_id_q5_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,        mul_mv_id_q6_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,  ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,     mul_mv_id_iq3_xxs_f32,  ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,       mul_mv_id_iq3_s_f32,    ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,       mul_mv_id_iq2_s_f32,    ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,       mul_mv_id_iq1_s_f32,    ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,      mul_mv_id_iq4_nl_f32,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,      mul_mv_id_iq4_xs_f32,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,           mul_mm_q4_1_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,           mul_mm_q5_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,           mul_mm_q5_1_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,           mul_mm_q8_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,           mul_mm_q2_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,           mul_mm_q3_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,           mul_mm_q4_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,           mul_mm_q5_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,           mul_mm_q6_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,        mul_mm_iq3_xxs_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,          mul_mm_iq3_s_f32,       ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,          mul_mm_iq2_s_f32,       ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,          mul_mm_iq1_s_f32,       ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,         mul_mm_iq4_nl_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,         mul_mm_iq4_xs_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,        mul_mm_id_q4_1_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,        mul_mm_id_q5_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,        mul_mm_id_q5_1_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,        mul_mm_id_q8_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,        mul_mm_id_q2_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,        mul_mm_id_q3_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,        mul_mm_id_q4_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,        mul_mm_id_q5_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,        mul_mm_id_q6_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,  ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,   ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,     mul_mm_id_iq3_xxs_f32,  ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,       mul_mm_id_iq3_s_f32,    ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,       mul_mm_id_iq2_s_f32,    ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,       mul_mm_id_iq1_s_f32,    ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,      mul_mm_id_iq4_nl_f32,   ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,      mul_mm_id_iq4_xs_f32,   ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,              true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                im2col_f16,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                im2col_f32,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,               upscale_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                   pad_f32,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,       argsort_f32_i32_asc,    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,      argsort_f32_i32_desc,   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,            leaky_relu_f32,         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,               cpy_f32_f16,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,               cpy_f32_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,              cpy_f32_q8_0,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,              cpy_f32_q4_0,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,              cpy_f32_q4_1,           true);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,              cpy_f32_q5_0,           true);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,              cpy_f32_q5_1,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,               cpy_f16_f16,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,               cpy_f16_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                    concat,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                       sqr,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                  sum_rows,               true);
-    }
-
-    [metal_library release];
-    return ctx;
-}
-
-static void ggml_metal_free(struct ggml_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
-
-    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
-        [ctx->kernels[i].pipeline release];
-    }
-
-    [ctx->queue release];
-    [ctx->device release];
-
-    dispatch_release(ctx->d_queue);
-
-    free(ctx);
-}
-
-// temporarily defined here for compatibility between ggml-backend and the old API
-
-struct ggml_backend_metal_buffer {
-    void   * data;
-    size_t   size;
-
-    id<MTLBuffer> metal;
-};
-
-struct ggml_backend_metal_buffer_context {
-    void * all_data;
-    size_t all_size;
-    bool owned;
-
-    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
-    int n_buffers;
-    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
-};
-
-// finds the Metal buffer that contains the tensor data on the GPU device
-// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
-// Metal buffer based on the host memory pointer
-//
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
-    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
-
-    const int64_t tsize = ggml_nbytes(t);
-
-    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
-
-    // find the view that contains the tensor fully
-    for (int i = 0; i < buf_ctx->n_buffers; ++i) {
-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
-
-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
-            *offs = (size_t) ioffs;
-
-            //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
-
-            return buf_ctx->buffers[i].metal;
-        }
-    }
-
-    GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
-
-    return nil;
-}
-
-static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_TANH:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_SILU:
-                    return true;
-                default:
-                    return false;
-            }
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_CONCAT:
-        case GGML_OP_ADD:
-        case GGML_OP_ACC:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_SUM_ROWS:
-            return true;
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_GROUP_NORM:
-            return ctx->support_simdgroup_reduction;
-        case GGML_OP_NORM:
-        case GGML_OP_ALIBI:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
-            return true;
-        case GGML_OP_POOL_1D:
-        case GGML_OP_POOL_2D:
-            return false;
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction &&
-                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
-        case GGML_OP_CPY:
-        case GGML_OP_DUP:
-        case GGML_OP_CONT:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F32:
-                        switch (op->type) {
-                           case GGML_TYPE_F16:
-                           case GGML_TYPE_F32:
-                           case GGML_TYPE_Q8_0:
-                           case GGML_TYPE_Q4_0:
-                           case GGML_TYPE_Q4_1:
-                                return true;
-                           default:
-                                return false;
-                        }
-                    case GGML_TYPE_F16:
-                        switch (op->type) {
-                           case GGML_TYPE_F16:
-                           case GGML_TYPE_F32:
-                                return true;
-                           default:
-                                return false;
-                        }
-                    default:
-                        return false;
-                };
-            }
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_GET_ROWS:
-            {
-                return op->ne[3] == 1;
-            }
-        default:
-            return false;
-    }
-}
-
-static bool ggml_metal_graph_compute(
-        struct ggml_metal_context * ctx,
-               struct ggml_cgraph * gf) {
-
-    @autoreleasepool {
-    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
-    edesc.dispatchType = MTLDispatchTypeSerial;
-
-    // create multiple command buffers and enqueue them
-    // then, we encode the graph into the command buffers in parallel
-
-    const int n_nodes  = gf->n_nodes;
-    const int n_cb = ctx->n_cb;
-    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
-
-    const bool should_capture = ctx->should_capture_next_compute;
-    if (should_capture) {
-        ctx->should_capture_next_compute = false;
-
-        MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
-        descriptor.captureObject = ctx->queue;
-
-        NSError * error = nil;
-        if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
-            GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
-            GGML_ASSERT(!"capture failed");
-        }
-    }
-
-    id<MTLCommandBuffer> command_buffer_builder[n_cb];
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
-        command_buffer_builder[cb_idx] = command_buffer;
-
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffer enqueue];
-    }
-
-    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
-
-    dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
-        const int cb_idx = iter;
-
-        size_t offs_src0 = 0;
-        size_t offs_src1 = 0;
-        size_t offs_src2 = 0;
-        size_t offs_dst  = 0;
-
-        id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
-        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-
-        const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-        const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
-
-        for (int i = node_start; i < node_end; ++i) {
-            if (i == -1) {
-                [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
-                continue;
-            }
-
-            //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-
-            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
-            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
-            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-            struct ggml_tensor * dst  = gf->nodes[i];
-
-            switch (dst->op) {
-                case GGML_OP_NONE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_VIEW:
-                case GGML_OP_TRANSPOSE:
-                case GGML_OP_PERMUTE:
-                    {
-                        // noop -> next node
-                    } continue;
-                default:
-                    {
-                    } break;
-            }
-
-            if (!ggml_metal_supports_op(ctx, dst)) {
-                GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                GGML_ASSERT(!"unsupported op");
-            }
-
-            if (should_capture) {
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
-            }
-
-            const int64_t  ne00 = src0 ? src0->ne[0] : 0;
-            const int64_t  ne01 = src0 ? src0->ne[1] : 0;
-            const int64_t  ne02 = src0 ? src0->ne[2] : 0;
-            const int64_t  ne03 = src0 ? src0->ne[3] : 0;
-
-            const uint64_t nb00 = src0 ? src0->nb[0] : 0;
-            const uint64_t nb01 = src0 ? src0->nb[1] : 0;
-            const uint64_t nb02 = src0 ? src0->nb[2] : 0;
-            const uint64_t nb03 = src0 ? src0->nb[3] : 0;
-
-            const int64_t  ne10 = src1 ? src1->ne[0] : 0;
-            const int64_t  ne11 = src1 ? src1->ne[1] : 0;
-            const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-            const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
-
-            const uint64_t nb10 = src1 ? src1->nb[0] : 0;
-            const uint64_t nb11 = src1 ? src1->nb[1] : 0;
-            const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-            const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
-
-            const int64_t  ne0  = dst ? dst->ne[0] : 0;
-            const int64_t  ne1  = dst ? dst->ne[1] : 0;
-            const int64_t  ne2  = dst ? dst->ne[2] : 0;
-            const int64_t  ne3  = dst ? dst->ne[3] : 0;
-
-            const uint64_t nb0  = dst ? dst->nb[0] : 0;
-            const uint64_t nb1  = dst ? dst->nb[1] : 0;
-            const uint64_t nb2  = dst ? dst->nb[2] : 0;
-            const uint64_t nb3  = dst ? dst->nb[3] : 0;
-
-            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
-            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
-
-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
-            id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
-            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
-
-            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-            //if (src0) {
-            //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-            //            ggml_is_contiguous(src0), src0->name);
-            //}
-            //if (src1) {
-            //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-            //            ggml_is_contiguous(src1), src1->name);
-            //}
-            //if (dst) {
-            //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-            //            dst->name);
-            //}
-
-            switch (dst->op) {
-                case GGML_OP_CONCAT:
-                    {
-                        const int64_t nb = ne00;
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
-
-                        const int nth = MIN(1024, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ADD:
-                case GGML_OP_MUL:
-                case GGML_OP_DIV:
-                    {
-                        const size_t offs = 0;
-
-                        bool bcast_row = false;
-
-                        int64_t nb = ne00;
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-
-                            // src1 is a row
-                            GGML_ASSERT(ne11 == 1);
-
-                            nb = ne00 / 4;
-                            switch (dst->op) {
-                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
-                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
-                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            }
-
-                            bcast_row = true;
-                        } else {
-                            switch (dst->op) {
-                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
-                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            }
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
-
-                        if (bcast_row) {
-                            const int64_t n = ggml_nelements(dst)/4;
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } else {
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        }
-                    } break;
-                case GGML_OP_ACC:
-                    {
-                        GGML_ASSERT(src0t == GGML_TYPE_F32);
-                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                        GGML_ASSERT(dstt  == GGML_TYPE_F32);
-
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-                        GGML_ASSERT(ggml_is_contiguous(src1));
-
-                        const size_t pnb1 = ((int32_t *) dst->op_params)[0];
-                        const size_t pnb2 = ((int32_t *) dst->op_params)[1];
-                        const size_t pnb3 = ((int32_t *) dst->op_params)[2];
-                        const size_t offs = ((int32_t *) dst->op_params)[3];
-
-                        const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-                        if (!inplace) {
-                            // run a separete kernel to cpy src->dst
-                            // not sure how to avoid this
-                            // TODO: make a simpler cpy_bytes kernel
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        }
-
-                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
-                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
-                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
-                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
-                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
-                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-
-                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_SCALE:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        const float scale = *(const float *) dst->op_params;
-
-                        int64_t n = ggml_nelements(dst);
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (n % 4 == 0) {
-                            n /= 4;
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
-                        } else {
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_UNARY:
-                    switch (ggml_get_unary_op(gf->nodes[i])) {
-                        case GGML_UNARY_OP_TANH:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_RELU:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_GELU:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-                                GGML_ASSERT(n % 4 == 0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_GELU_QUICK:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-                                GGML_ASSERT(n % 4 == 0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        case GGML_UNARY_OP_SILU:
-                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                const int64_t n = ggml_nelements(dst);
-                                GGML_ASSERT(n % 4 == 0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } break;
-                        default:
-                            {
-                                GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                GGML_ASSERT(false);
-                            }
-                    } break;
-                case GGML_OP_SQR:
-                    {
-                        GGML_ASSERT(ggml_is_contiguous(src0));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SUM_ROWS:
-                    {
-                        GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
-                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_SOFT_MAX:
-                    {
-                        int nth = 32; // SIMD width
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (ne00%4 == 0) {
-                            while (nth < ne00/4 && nth < 256) {
-                                nth *= 2;
-                            }
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
-                        } else {
-                            while (nth < ne00 && nth < 1024) {
-                                nth *= 2;
-                            }
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
-                        }
-
-                        const float scale    = ((float *) dst->op_params)[0];
-                        const float max_bias = ((float *) dst->op_params)[1];
-
-                        const int64_t nrows_x = ggml_nrows(src0);
-                        const int64_t nrows_y = src0->ne[1];
-                        const uint32_t n_head_kv   = nrows_x/nrows_y;
-                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
-
-                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                        if (id_src1) {
-                            [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
-                        } else {
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
-                        }
-                        if (id_src2) {
-                            [encoder setBuffer:id_src2 offset:offs_src2   atIndex:2];
-                        } else {
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:2];
-                        }
-                        [encoder setBuffer:id_dst   offset:offs_dst          atIndex:3];
-                        [encoder setBytes:&ne00     length:sizeof(ne00)      atIndex:4];
-                        [encoder setBytes:&ne01     length:sizeof(ne01)      atIndex:5];
-                        [encoder setBytes:&ne02     length:sizeof(ne02)      atIndex:6];
-                        [encoder setBytes:&scale    length:sizeof(scale)     atIndex:7];
-                        [encoder setBytes:&max_bias length:sizeof(max_bias)  atIndex:8];
-                        [encoder setBytes:&m0       length:sizeof(m0)        atIndex:9];
-                        [encoder setBytes:&m1       length:sizeof(m1)        atIndex:10];
-                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
-                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_DIAG_MASK_INF:
-                    {
-                        const int n_past = ((int32_t *)(dst->op_params))[0];
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        if (ne00%8 == 0) {
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
-                        } else {
-                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
-
-                        if (ne00%8 == 0) {
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        }
-                        else {
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        }
-                    } break;
-                case GGML_OP_MUL_MAT:
-                    {
-                        GGML_ASSERT(ne00 == ne10);
-
-                        // TODO: assert that dim2 and dim3 are contiguous
-                        GGML_ASSERT(ne12 % ne02 == 0);
-                        GGML_ASSERT(ne13 % ne03 == 0);
-
-                        const uint r2 = ne12/ne02;
-                        const uint r3 = ne13/ne03;
-
-                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                        // to the matrix-vector kernel
-                        int ne11_mm_min = 1;
-
-#if 0
-                        // the numbers below are measured on M2 Ultra for 7B and 13B models
-                        // these numbers do not translate to other devices or model sizes
-                        // TODO: need to find a better approach
-                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                            switch (src0t) {
-                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q4_0:
-                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                case GGML_TYPE_Q5_0:                          // not tested yet
-                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                default:             ne11_mm_min = 1;  break;
-                            }
-                        }
-#endif
-
-                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                            !ggml_is_transposed(src0) &&
-                            !ggml_is_transposed(src1) &&
-                            src1t == GGML_TYPE_F32 &&
-                            ne00 % 32 == 0 && ne00 >= 64 &&
-                            (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
-                            //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
-                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
-                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
-                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
-                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                        } else {
-                            int nth0 = 32;
-                            int nth1 = 1;
-                            int nrows = 1;
-                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            // use custom matrix x vector kernel
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
-                                        nrows = 4;
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        if (src1t == GGML_TYPE_F32) {
-                                            if (ne11 * ne12 < 4) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
-                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
-                                                nrows = ne11;
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } else {
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
-                                            nrows = 4;
-                                        }
-                                    } break;
-                                case GGML_TYPE_Q4_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q8_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q2_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q3_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_K:
-                                    {
-                                        nth0 = 4; //1;
-                                        nth1 = 8; //32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q6_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_NL:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
-                                    } break;
-                                default:
-                                    {
-                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                        GGML_ASSERT(false && "not implemented");
-                                    }
-                            };
-
-                            if (ggml_is_quantized(src0t)) {
-                                GGML_ASSERT(ne00 >= nth0*nth1);
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
-                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
-                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
-
-                            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1  ||
-                                src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0 ||
-                                src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
-                                const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
-                                const int mem_size = 32*sizeof(float);
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                            }
-                            else if (src0t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src0t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            } else {
-                                const int64_t ny = (ne11 + nrows - 1)/nrows;
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                        }
-                    } break;
-                case GGML_OP_MUL_MAT_ID:
-                    {
-                        //GGML_ASSERT(ne00 == ne10);
-                        //GGML_ASSERT(ne03 == ne13);
-
-                        GGML_ASSERT(src0t == GGML_TYPE_I32);
-
-                        const int n_as = ((int32_t *) dst->op_params)[1];
-
-                        // TODO: make this more general
-                        GGML_ASSERT(n_as <= 8);
-
-                        // max size of the src1ids array in the kernel stack
-                        GGML_ASSERT(ne11 <= 512);
-
-                        const int64_t  ne20 = src2 ? src2->ne[0] : 0;
-                        const int64_t  ne21 = src2 ? src2->ne[1] : 0;
-                        const int64_t  ne22 = src2 ? src2->ne[2] : 0;
-                        const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
-
-                        const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
-                        const uint64_t nb21 = src2 ? src2->nb[1] : 0;
-                        const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-                        const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
-
-                        const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
-
-                        GGML_ASSERT(!ggml_is_transposed(src2));
-                        GGML_ASSERT(!ggml_is_transposed(src1));
-
-                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                        const uint r2 = ne12/ne22;
-                        const uint r3 = ne13/ne23;
-
-                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                        // to the matrix-vector kernel
-                        int ne11_mm_min = n_as;
-
-                        const int idx = ((int32_t *) dst->op_params)[0];
-
-                        // batch size
-                        GGML_ASSERT(ne01 == ne11);
-
-                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        // !!!
-                        // TODO: for now, always use mat-vec kernels until we figure out how to improve the
-                        //       indirect matrix multiplication
-                        // !!!
-                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                            ne20 % 32 == 0 && ne20 >= 64 &&
-                            ne11 > ne11_mm_min) {
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src2->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
-                                case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
-                                case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
-                                case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
-                            [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                            [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
-                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                            [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
-                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
-                            [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
-                            // TODO: how to make this an array? read Metal docs
-                            for (int j = 0; j < 8; ++j) {
-                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
-
-                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
-                            }
-
-                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                        } else {
-                            int nth0 = 32;
-                            int nth1 = 1;
-                            int nrows = 1;
-                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            // use custom matrix x vector kernel
-                            switch (src2t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                        nth0 = 32;
-                                        nth1 = 1;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_1:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q8_0:
-                                    {
-                                        nth0 = 8;
-                                        nth1 = 8;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q2_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q3_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q4_K:
-                                    {
-                                        nth0 = 4; //1;
-                                        nth1 = 8; //32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q5_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_Q6_K:
-                                    {
-                                        nth0 = 2;
-                                        nth1 = 32;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_XXS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ3_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ2_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ1_S:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_NL:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
-                                    } break;
-                                case GGML_TYPE_IQ4_XS:
-                                    {
-                                        nth0 = 4;
-                                        nth1 = 16;
-                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
-                                    } break;
-                                default:
-                                    {
-                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-                                        GGML_ASSERT(false && "not implemented");
-                                    }
-                            };
-
-                            if (ggml_is_quantized(src2t)) {
-                                GGML_ASSERT(ne20 >= nth0*nth1);
-                            }
-
-                            const int64_t _ne1 = 1; // kernels needs a reference in constant memory
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
-                            [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                            [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                            [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
-                            [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
-                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
-                            [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
-                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
-                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
-                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
-                            [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
-                            // TODO: how to make this an array? read Metal docs
-                            for (int j = 0; j < 8; ++j) {
-                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
-
-                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
-                            }
-
-                            if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1  ||
-                                src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1  || src2t == GGML_TYPE_Q8_0 ||
-                                src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
-                                const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
-                                const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_IQ4_NL || src2t == GGML_TYPE_IQ4_XS) {
-                                const int mem_size = 32*sizeof(float);
-                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                            }
-                            else if (src2t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            } else {
-                                const int64_t ny = (_ne1 + nrows - 1)/nrows;
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                        }
-                    } break;
-                case GGML_OP_GET_ROWS:
-                    {
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (src0->type) {
-                            case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
-                            case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
-                            case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
-                            case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
-                            case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
-                            case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
-                            case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
-                            case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
-                            case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
-                            case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
-                            case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
-                            case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
-                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
-                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
-                            case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
-                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S  ].pipeline; break;
-                            case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S  ].pipeline; break;
-                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
-                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
-                            case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
-                            case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
-                            default: GGML_ASSERT(false && "not implemented");
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
-                        [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
-                        [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
-                        [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
-                        [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
-                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
-                    } break;
-                case GGML_OP_RMS_NORM:
-                    {
-                        GGML_ASSERT(ne00 % 4 == 0);
-
-                        float eps;
-                        memcpy(&eps, dst->op_params, sizeof(float));
-
-                        int nth = 32; // SIMD width
-
-                        while (nth < ne00/4 && nth < 1024) {
-                            nth *= 2;
-                        }
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                        const int64_t nrows = ggml_nrows(src0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_GROUP_NORM:
-                    {
-                        GGML_ASSERT(ne00 % 4 == 0);
-
-                        //float eps;
-                        //memcpy(&eps, dst->op_params, sizeof(float));
-
-                        const float eps = 1e-6f; // TODO: temporarily hardcoded
-
-                        const int32_t n_groups = ((int32_t *) dst->op_params)[0];
-
-                        int nth = 32; // SIMD width
-
-                        //while (nth < ne00/4 && nth < 1024) {
-                        //    nth *= 2;
-                        //}
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
-                        [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
-                        [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
-                        [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
-                        [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
-                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_NORM:
-                    {
-                        float eps;
-                        memcpy(&eps, dst->op_params, sizeof(float));
-
-                        const int nth = MIN(256, ne00);
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                        [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
-
-                        const int64_t nrows = ggml_nrows(src0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ALIBI:
-                    {
-                        GGML_ASSERT((src0t == GGML_TYPE_F32));
-
-                        const int nth = MIN(1024, ne00);
-
-                        //const int n_past = ((int32_t *) dst->op_params)[0];
-                        const int n_head = ((int32_t *) dst->op_params)[1];
-                        float max_bias;
-                        memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-                        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-                        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
-                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
-                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
-                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
-                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
-                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
-                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
-                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
-                        [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
-                        [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
-                        [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ROPE:
-                    {
-                        GGML_ASSERT(ne10 == ne02);
-
-                        const int nth = MIN(1024, ne00);
-
-                        const int n_past     = ((int32_t *) dst->op_params)[0];
-                        const int n_dims     = ((int32_t *) dst->op_params)[1];
-                        const int mode       = ((int32_t *) dst->op_params)[2];
-                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
-                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (src0->type) {
-                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
-                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
-                            default: GGML_ASSERT(false);
-                        };
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
-                        [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
-                        [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
-                        [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
-                        [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
-                        [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
-                        [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
-                        [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
-                        [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
-                        [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
-                        [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
-                        [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
-                        [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
-                        [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
-                        [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
-                        [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
-                        [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
-                        [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
-                        [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
-                        [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                        [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                        [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                        [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                        [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                        [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_IM2COL:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F16);
-                        GGML_ASSERT(src1->type == GGML_TYPE_F32);
-                        GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-                        const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-                        const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-                        const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-                        const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-                        const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-                        const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-
-                        const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-                        const int32_t N  = src1->ne[is_2D ? 3 : 2];
-                        const int32_t IC = src1->ne[is_2D ? 2 : 1];
-                        const int32_t IH = is_2D ? src1->ne[1] : 1;
-                        const int32_t IW =         src1->ne[0];
-
-                        const int32_t KH = is_2D ? src0->ne[1] : 1;
-                        const int32_t KW =         src0->ne[0];
-
-                        const int32_t OH = is_2D ? dst->ne[2] : 1;
-                        const int32_t OW =         dst->ne[1];
-
-                        const int32_t CHW = IC * KH * KW;
-
-                        const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
-                        const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (dst->type) {
-                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
-                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-                            default: GGML_ASSERT(false);
-                        };
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
-                        [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
-                        [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
-                        [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
-                        [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
-                        [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
-                        [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
-                        [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
-                        [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
-                        [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
-                        [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
-                    } break;
-                case GGML_OP_UPSCALE:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        const int sf = dst->op_params[0];
-
-                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-                        [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
-
-                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_PAD:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-
-                        const int nth = MIN(1024, ne0);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                case GGML_OP_ARGSORT:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-                        GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-                        const int nrows = ggml_nrows(src0);
-
-                        enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (order) {
-                            case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                            case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-                            default: GGML_ASSERT(false);
-                        };
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
-                    } break;
-                case GGML_OP_LEAKY_RELU:
-                    {
-                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                        float slope;
-                        memcpy(&slope, dst->op_params, sizeof(float));
-
-                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
-                        [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
-
-                        const int64_t n = ggml_nelements(dst);
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
-                case GGML_OP_DUP:
-                case GGML_OP_CPY:
-                case GGML_OP_CONT:
-                    {
-                        GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-
-                        int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-                        id<MTLComputePipelineState> pipeline = nil;
-
-                        switch (src0t) {
-                            case GGML_TYPE_F32:
-                                {
-                                    GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
-
-                                    switch (dstt) {
-                                        case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
-                                        case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
-                                        case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
-                                        case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
-                                        case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
-                                      //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
-                                      //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
-                                        default: GGML_ASSERT(false && "not implemented");
-                                    };
-                                } break;
-                            case GGML_TYPE_F16:
-                                {
-                                    switch (dstt) {
-                                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-                                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
-                                        default: GGML_ASSERT(false && "not implemented");
-                                    };
-                                } break;
-                            default: GGML_ASSERT(false && "not implemented");
-                        }
-
-                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                        [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                        [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                        [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                        [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                        [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                        [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                        [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                        [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                        [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                        [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                        [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                        [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                        [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                    } break;
-                default:
-                    {
-                        GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                        GGML_ASSERT(false);
-                    }
-            }
-
-            if (should_capture) {
-                [encoder popDebugGroup];
-            }
-        }
-
-        [encoder endEncoding];
-
-        [command_buffer commit];
-    });
-
-    // Wait for completion and check status of each command buffer
-    // needed to detect if the device ran out-of-memory for example (#1881)
-
-    for (int i = 0; i < n_cb; ++i) {
-        id<MTLCommandBuffer> command_buffer = command_buffers[i];
-        [command_buffer waitUntilCompleted];
-
-        MTLCommandBufferStatus status = [command_buffer status];
-        if (status != MTLCommandBufferStatusCompleted) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
-        }
-    }
-
-    if (should_capture) {
-        [[MTLCaptureManager sharedCaptureManager] stopCapture];
-    }
-
-    }
-    return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-// default buffer
-static id<MTLDevice> g_backend_device = nil;
-static int g_backend_device_ref_count = 0;
-
-static id<MTLDevice> ggml_backend_metal_get_device(void) {
-    if (g_backend_device == nil) {
-        g_backend_device = MTLCreateSystemDefaultDevice();
-    }
-
-    g_backend_device_ref_count++;
-
-    return g_backend_device;
-}
-
-static void ggml_backend_metal_free_device(void) {
-    assert(g_backend_device_ref_count > 0);
-
-    g_backend_device_ref_count--;
-
-    if (g_backend_device_ref_count == 0) {
-        [g_backend_device release];
-        g_backend_device = nil;
-    }
-}
-
-GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "Metal";
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
-
-    for (int i = 0; i < ctx->n_buffers; i++) {
-        [ctx->buffers[i].metal release];
-    }
-    ggml_backend_metal_free_device();
-
-    if (ctx->owned) {
-        free(ctx->all_data);
-    }
-
-    free(ctx);
-}
-
-GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
-
-    return ctx->all_data;
-}
-
-GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    memcpy((char *)tensor->data + offset, data, size);
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        memcpy(dst->data, src->data, ggml_nbytes(src));
-        return true;
-    }
-    return false;
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
-
-    memset(ctx->all_data, value, ctx->all_size);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
-    /* .get_name        = */ ggml_backend_metal_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_metal_buffer_get_base,
-    /* .init_tensor     = */ NULL,
-    /* .set_tensor      = */ ggml_backend_metal_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_metal_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_metal_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// default buffer type
-
-GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "Metal";
-
-    UNUSED(buft);
-}
-
-static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
-#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
-    if (@available(macOS 10.12, iOS 16.0, *)) {
-        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-                device.currentAllocatedSize / 1024.0 / 1024.0,
-                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-        } else {
-            GGML_METAL_LOG_INFO("\n");
-        }
-    } else {
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-    }
-#endif
-    UNUSED(device);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += (size_page - (size_aligned % size_page));
-    }
-
-    id<MTLDevice> device = ggml_backend_metal_get_device();
-
-    ctx->all_data = ggml_metal_host_malloc(size_aligned);
-    ctx->all_size = size_aligned;
-    ctx->owned = true;
-    ctx->n_buffers = 1;
-
-    ctx->buffers[0].data = ctx->all_data;
-    ctx->buffers[0].size = size;
-    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                    length:size_aligned
-                    options:MTLResourceStorageModeShared
-                    deallocator:nil];
-
-    if (ctx->buffers[0].metal == nil) {
-        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-        free(ctx);
-        ggml_backend_metal_free_device();
-        return NULL;
-    }
-
-    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
-    ggml_backend_metal_log_allocated_size(device);
-
-    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
-}
-
-GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 32;
-    UNUSED(buft);
-}
-
-GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    id<MTLDevice> device = ggml_backend_metal_get_device();
-    size_t max_size = device.maxBufferLength;
-    ggml_backend_metal_free_device();
-
-    return max_size;
-
-    UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
-
-    UNUSED(buft);
-}
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
-            /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
-            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_type_metal;
-}
-
-// buffer from ptr
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
-
-    ctx->all_data = data;
-    ctx->all_size = size;
-    ctx->owned = false;
-    ctx->n_buffers = 0;
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    // page-align the data ptr
-    {
-        const uintptr_t offs = (uintptr_t) data % size_page;
-        data  = (void *) ((char *) data - offs);
-        size += offs;
-    }
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += (size_page - (size_aligned % size_page));
-    }
-
-    id<MTLDevice> device = ggml_backend_metal_get_device();
-
-    // the buffer fits into the max buffer size allowed by the device
-    if (size_aligned <= device.maxBufferLength) {
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
-
-        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-            return false;
-        }
-
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
-
-        ++ctx->n_buffers;
-    } else {
-        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
-        // one of the views
-        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-        const size_t size_step = device.maxBufferLength - size_ovlp;
-        const size_t size_view = device.maxBufferLength;
-
-        for (size_t i = 0; i < size; i += size_step) {
-            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
-
-            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
-            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-
-            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-
-            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
-            if (i + size_step < size) {
-                GGML_METAL_LOG_INFO("\n");
-            }
-
-            ++ctx->n_buffers;
-        }
-    }
-
-    ggml_backend_metal_log_allocated_size(device);
-
-    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
-}
-
-// backend
-
-GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
-    return "Metal";
-
-    UNUSED(backend);
-}
-
-GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-    ggml_metal_free(ctx);
-    free(backend);
-}
-
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_metal_buffer_type();
-
-    UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
-
-    return ggml_metal_graph_compute(metal_ctx, cgraph);
-}
-
-GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
-
-    return ggml_metal_supports_op(metal_ctx, op);
-}
-
-static struct ggml_backend_i ggml_backend_metal_i = {
-    /* .get_name                = */ ggml_backend_metal_name,
-    /* .free                    = */ ggml_backend_metal_free,
-    /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_metal_graph_compute,
-    /* .supports_op             = */ ggml_backend_metal_supports_op,
-};
-
-void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_metal_log_callback  = log_callback;
-    ggml_metal_log_user_data = user_data;
-}
-
-static ggml_guid_t ggml_backend_metal_guid(void) {
-    static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
-    return &guid;
-}
-
-ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
-
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
-
-    *metal_backend = (struct ggml_backend) {
-        /* .guid      = */ ggml_backend_metal_guid(),
-        /* .interface = */ ggml_backend_metal_i,
-        /* .context   = */ ctx,
-    };
-
-    return metal_backend;
-}
-
-bool ggml_backend_is_metal(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid());
-}
-
-void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
-    GGML_ASSERT(ggml_backend_is_metal(backend));
-
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-
-    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-}
-
-bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
-    GGML_ASSERT(ggml_backend_is_metal(backend));
-
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-
-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
-}
-
-void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
-    GGML_ASSERT(ggml_backend_is_metal(backend));
-
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-    ctx->should_capture_next_compute = true;
-}
-
-GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
-
-GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
-    return ggml_backend_metal_init();
-
-    GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-}
diff --git a/ggml-metal.metal b/ggml-metal.metal
deleted file mode 100644
index 74a5e0b03..000000000
--- a/ggml-metal.metal
+++ /dev/null
@@ -1,7729 +0,0 @@
-#include <metal_stdlib>
-
-using namespace metal;
-
-#define MAX(x, y) ((x) > (y) ? (x) : (y))
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
-
-#define QK4_0 32
-#define QR4_0 2
-typedef struct {
-    half    d;             // delta
-    uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-
-#define QK4_1 32
-typedef struct {
-    half d;                 // delta
-    half m;                 // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-
-#define QK5_0 32
-typedef struct {
-    half d;                // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-#define QK5_1 32
-typedef struct {
-    half d;                 // delta
-    half m;                 // min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-
-#define QK8_0 32
-typedef struct {
-    half    d;         // delta
-    int8_t  qs[QK8_0]; // quants
-} block_q8_0;
-
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-
-enum ggml_sort_order {
-    GGML_SORT_ASC,
-    GGML_SORT_DESC,
-};
-
-// general-purpose kernel for addition, multiplication and division of two tensors
-// pros: works for non-contiguous tensors, supports broadcast across all dims
-// cons: not very efficient
-kernel void kernel_add(
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        constant  int64_t & offs,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10));
-    }
-}
-
-kernel void kernel_mul(
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10));
-    }
-}
-
-kernel void kernel_div(
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10));
-    }
-}
-
-// assumption: src1 is a row
-// broadcast src1 into src0
-kernel void kernel_add_row(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        constant   uint64_t & nb [[buffer(28)]],
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] + src1[tpig % nb];
-}
-
-kernel void kernel_mul_row(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        constant   uint64_t & nb  [[buffer(28)]],
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src1[tpig % nb];
-}
-
-kernel void kernel_div_row(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        constant   uint64_t & nb  [[buffer(28)]],
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] / src1[tpig % nb];
-}
-
-kernel void kernel_scale(
-        device const float * src0,
-        device       float * dst,
-        constant     float & scale,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
-}
-
-kernel void kernel_scale_4(
-        device const float4 * src0,
-        device       float4 * dst,
-        constant     float  & scale,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
-}
-
-kernel void kernel_relu(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = max(0.0f, src0[tpig]);
-}
-
-kernel void kernel_tanh(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float & x = src0[tpig];
-    dst[tpig] = precise::tanh(x);
-}
-
-constant float GELU_COEF_A     = 0.044715f;
-constant float GELU_QUICK_COEF = -1.702f;
-constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
-
-kernel void kernel_gelu(
-    device const float4 * src0,
-    device       float4 * dst,
-    uint tpig[[thread_position_in_grid]]) {
-    device const float4 & x = src0[tpig];
-
-    // BEWARE !!!
-    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
-    // This was observed with Falcon 7B and 40B models
-    //
-    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-kernel void kernel_gelu_quick(
-    device const float4 * src0,
-    device       float4 * dst,
-    uint tpig[[thread_position_in_grid]]) {
-    device const float4 & x = src0[tpig];
-
-    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
-}
-
-kernel void kernel_silu(
-        device const float4 * src0,
-        device       float4 * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float4 & x = src0[tpig];
-    dst[tpig] = x / (1.0f + exp(-x));
-}
-
-kernel void kernel_sqr(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src0[tpig];
-}
-
-kernel void kernel_sum_rows(
-        device const float * src0,
-        device       float * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tpig[[thread_position_in_grid]]) {
-    int64_t i3 = tpig.z;
-    int64_t i2 = tpig.y;
-    int64_t i1 = tpig.x;
-
-    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
-        return;
-    }
-
-    device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
-    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
-
-    float row_sum = 0;
-
-    for (int64_t i0 = 0; i0 < ne00; i0++) {
-        row_sum += src_row[i0];
-    }
-
-    dst_row[0] = row_sum;
-}
-
-kernel void kernel_soft_max(
-        device const float * src0,
-        device const float * src1,
-        device const float * src2,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant     float & scale,
-        constant     float & max_bias,
-        constant     float & m0,
-        constant     float & m1,
-        constant  uint32_t & n_head_log2,
-        threadgroup  float * buf [[threadgroup(0)]],
-        uint  tgpig[[threadgroup_position_in_grid]],
-        uint  tpitg[[thread_position_in_threadgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint    ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = (tgpig) / (ne02*ne01);
-    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
-    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
-
-    device const float * psrc0 =         src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-    device const float * pmask = src1 != src0 ? src1                               + i01*ne00 : nullptr;
-    device const float * ppos  = src2 != src0 ? src2                                          : nullptr;
-    device       float * pdst  =         dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    float slope = 0.0f;
-
-    // ALiBi
-    if (max_bias > 0.0f) {
-        const int64_t h = i02;
-
-        const float base = h < n_head_log2 ? m0 : m1;
-        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-        slope = pow(base, exp);
-    }
-
-    // parallel max
-    float lmax = -INFINITY;
-
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
-    }
-
-    // find the max value in the block
-    float max_val = simd_max(lmax);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = -INFINITY;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = max_val;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        max_val = buf[tiisg];
-        max_val = simd_max(max_val);
-    }
-
-    // parallel sum
-    float lsum = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
-        lsum += exp_psrc0;
-        pdst[i00] = exp_psrc0;
-    }
-
-    // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
-    threadgroup_barrier(mem_flags::mem_none);
-
-    float sum = simd_sum(lsum);
-
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        sum = buf[tiisg];
-        sum = simd_sum(sum);
-    }
-
-    const float inv_sum = 1.0f/sum;
-
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        pdst[i00] *= inv_sum;
-    }
-}
-
-kernel void kernel_soft_max_4(
-        device const float * src0,
-        device const float * src1,
-        device const float * src2,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant     float & scale,
-        constant     float & max_bias,
-        constant     float & m0,
-        constant     float & m1,
-        constant  uint32_t & n_head_log2,
-        threadgroup  float * buf [[threadgroup(0)]],
-        uint  tgpig[[threadgroup_position_in_grid]],
-        uint  tpitg[[thread_position_in_threadgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint    ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = (tgpig) / (ne02*ne01);
-    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
-    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
-
-    device const float4 * psrc4 =                (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
-    device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 +                                      i01*ne00) : nullptr;
-    device const float4 * ppos  = src2 != src0 ? (device const float4 *)(src2)                                                 : nullptr;
-    device       float4 * pdst4 =                (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
-
-    float slope = 0.0f;
-
-    if (max_bias > 0.0f) {
-        const int64_t h = i02;
-
-        const float base = h < n_head_log2 ? m0 : m1;
-        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-        slope = pow(base, exp);
-    }
-
-    // parallel max
-    float4 lmax4 = -INFINITY;
-
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
-    }
-
-    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-
-    float max_val = simd_max(lmax);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = -INFINITY;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = max_val;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        max_val = buf[tiisg];
-        max_val = simd_max(max_val);
-    }
-
-    // parallel sum
-    float4 lsum4 = 0.0f;
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
-        lsum4 += exp_psrc4;
-        pdst4[i00] = exp_psrc4;
-    }
-
-    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
-
-    // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
-    threadgroup_barrier(mem_flags::mem_none);
-
-    float sum = simd_sum(lsum);
-
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        sum = buf[tiisg];
-        sum = simd_sum(sum);
-    }
-
-    const float inv_sum = 1.0f/sum;
-
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        pdst4[i00] *= inv_sum;
-    }
-}
-
-kernel void kernel_diag_mask_inf(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant       int & n_past,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i02 = tpig[2];
-    const int64_t i01 = tpig[1];
-    const int64_t i00 = tpig[0];
-
-    if (i00 > n_past + i01) {
-        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
-    } else {
-        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
-    }
-}
-
-kernel void kernel_diag_mask_inf_8(
-        device const float4 * src0,
-        device       float4 * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne01,
-        constant        int & n_past,
-        uint3 tpig[[thread_position_in_grid]]) {
-
-    const int64_t i = 2*tpig[0];
-
-    dst[i+0] = src0[i+0];
-    dst[i+1] = src0[i+1];
-    int64_t i4 = 4*i;
-    const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
-    const int64_t i01 = i4/(ne00);      i4 -= i01*ne00;
-    const int64_t i00 = i4;
-    for (int k = 3; k >= 0; --k) {
-        if (i00 + 4 + k <= n_past + i01) {
-            break;
-        }
-        dst[i+1][k] = -INFINITY;
-        if (i00 + k > n_past + i01) {
-            dst[i][k] = -INFINITY;
-        }
-    }
-}
-
-kernel void kernel_norm(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant     float & eps,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
-    // MEAN
-    // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00];
-    }
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-    const float mean  = sum[0] / ne00;
-
-    // recenter and VARIANCE
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    device float * y = dst + tgpig*ne00;
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-        sum[tpitg] += y[i00] * y[i00];
-    }
-
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-    const float variance = sum[0] / ne00;
-
-    const float scale = 1.0f/sqrt(variance + eps);
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = y[i00] * scale;
-    }
-}
-
-kernel void kernel_rms_norm(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant     float & eps,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-
-    float4 sumf = 0;
-    float all_sum = 0;
-
-    // parallel sum
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        sumf += x[i00] * x[i00];
-    }
-    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
-    all_sum = simd_sum(all_sum);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = all_sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        all_sum = buf[tiisg];
-        all_sum = simd_sum(all_sum);
-    }
-
-    const float mean  = all_sum/ne00;
-    const float scale = 1.0f/sqrt(mean + eps);
-
-    device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        y[i00] = x[i00] * scale;
-    }
-}
-
-kernel void kernel_group_norm(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int32_t & n_groups,
-        constant     float & eps,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    const int64_t ne = ne00*ne01*ne02;
-    const int64_t gs = ne00*ne01*((ne02 + n_groups - 1) / n_groups);
-
-    int start = tgpig * gs;
-    int end   = start + gs;
-
-    start += tpitg;
-
-    if (end >= ne) {
-        end = ne;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += ntg) {
-        tmp += src0[j];
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    tmp = simd_sum(tmp);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = tmp;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        tmp = buf[tiisg];
-        tmp = simd_sum(tmp);
-    }
-
-    const float mean = tmp / gs;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += ntg) {
-        float xi = src0[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = simd_sum(tmp);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = tmp;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        tmp = buf[tiisg];
-        tmp = simd_sum(tmp);
-    }
-
-    const float variance = tmp / gs;
-    const float scale = 1.0f/sqrt(variance + eps);
-    for (int j = start; j < end; j += ntg) {
-        dst[j] *= scale;
-    }
-}
-
-// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q4 quants begin (0 or QK4_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
-    }
-    return d * (sumy * -8.f + acc[0] + acc[1]);
-}
-
-// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q4 quants begin (0 or QK4_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-    float m = qb_curr->m;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
-    }
-    return d * (acc[0] + acc[1]) + sumy * m;
-}
-
-// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q5 quants begin (0 or QK5_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 3 + il/2);
-           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
-    }
-    return d * (sumy * -16.f + acc[0] + acc[1]);
-}
-
-// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q5 quants begin (0 or QK5_1/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-    float m = qb_curr->m;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
-           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
-    }
-    return d * (acc[0] + acc[1]) + sumy * m;
-}
-
-// putting them in the kernel cause a significant performance penalty
-#define N_DST 4        // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
-//Note: This is a template, but strictly speaking it only applies to
-//      quantizations where the block size is 32. It also does not
-//      guard against the number of rows not being divisible by
-//      N_DST, so this is another explicit assumption of the implementation.
-template<typename block_q_type, int nr, int nsg, int nw>
-void mul_vec_q_n_f32_impl(
-        device const void  * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-                   uint3 tgpig, uint tiisg, uint sgitg) {
-    const int nb = ne00/QK4_0;
-
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * nsg + sgitg) * nr;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[16]; // src1 vector cache
-    float sumf[nr] = {0.f};
-
-    const int ix = (tiisg/2);
-    const int il = (tiisg%2)*8;
-
-    device const float * yb = y + ix * QK4_0 + il;
-
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        float sumy = 0;
-        for (int i = 0; i < 8; i += 2) {
-            sumy += yb[i] + yb[i+1];
-            yl[i+0] = yb[i+ 0];
-            yl[i+1] = yb[i+ 1]/256.f;
-
-            sumy += yb[i+16] + yb[i+17];
-            yl[i+8] = yb[i+16]/16.f;
-            yl[i+9] = yb[i+17]/4096.f;
-        }
-
-        for (int row = 0; row < nr; row++) {
-            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
-        }
-
-        yb += QK4_0 * 16;
-    }
-
-    for (int row = 0; row < nr; ++row) {
-        const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0 && first_row + row < ne01) {
-            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
-        }
-    }
-}
-
-kernel void kernel_mul_mv_q4_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q4_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q5_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q5_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-
-#define NB_Q8_0 8
-
-void kernel_mul_mv_q8_0_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    const int nr  = N_DST;
-    const int nsg = N_SIMDGROUP;
-    const int nw  = N_SIMDWIDTH;
-
-    const int nb = ne00/QK8_0;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * nsg + sgitg) * nr;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[NB_Q8_0];
-    float sumf[nr]={0.f};
-
-    const int ix = tiisg/4;
-    const int il = tiisg%4;
-
-    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
-
-    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
-    for (int ib = ix; ib < nb; ib += nw/4) {
-        for (int i = 0; i < NB_Q8_0; ++i) {
-            yl[i] = yb[i];
-        }
-
-        for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
-            float sumq = 0.f;
-            for (int iq = 0; iq < NB_Q8_0; ++iq) {
-                sumq += qs[iq] * yl[iq];
-            }
-            sumf[row] += sumq*x[ib+row*nb].d;
-        }
-
-        yb += NB_Q8_0 * nw;
-    }
-
-    for (int row = 0; row < nr; ++row) {
-        const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_q8_0_f32")]]
-kernel void kernel_mul_mv_q8_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-#define N_F32_F32 4
-
-void kernel_mul_mv_f32_f32_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_F32_F32;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const float * x = (device const float *) (src0 + offset0);
-
-    if (ne00 < 128) {
-        for (int row = 0; row < N_F32_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
-                sumf += (float) x[i] * (float) y[i];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    } else {
-        device const float4 * x4 = (device const float4 *)x;
-        for (int row = 0; row < N_F32_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
-            device const float4 * y4 = (device const float4 *) y;
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_f32_f32")]]
-kernel void kernel_mul_mv_f32_f32(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
-}
-
-#define N_F16_F16 4
-
-kernel void kernel_mul_mv_f16_f16(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_F16_F16;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half * x = (device const half *) (src0 + offset0);
-
-    if (ne00 < 128) {
-        for (int row = 0; row < N_F16_F16; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
-                sumf += (half) x[i] * (half) y[i];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    } else {
-        device const half4 * x4 = (device const half4 *)x;
-        for (int row = 0; row < N_F16_F16; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const half  * y  = (device const half  *) (src1 + r1*nb11 + im*nb12);
-            device const half4 * y4 = (device const half4 *) y;
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i];
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    }
-}
-
-void kernel_mul_mv_f16_f32_1row_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half  * x = (device const half  *) (src0 + offset0);
-    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
-
-    float sumf = 0;
-    if (ne00 < 128) {
-        for (int i = tiisg; i < ne00; i += 32) {
-            sumf += (float) x[i] * (float) y[i];
-        }
-        float all_sum = simd_sum(sumf);
-        if (tiisg == 0) {
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-        }
-    } else {
-        device const half4  * x4 = (device const half4  *) x;
-        device const float4 * y4 = (device const float4 *) y;
-        for (int i = tiisg; i < ne00/4; i += 32) {
-            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
-        }
-        float all_sum = simd_sum(sumf);
-        if (tiisg == 0) {
-            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_f16_f32_1row")]]
-kernel void kernel_mul_mv_f16_f32_1row(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
-}
-
-#define N_F16_F32 4
-
-void kernel_mul_mv_f16_f32_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_F16_F32;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half * x = (device const half *) (src0 + offset0);
-
-    if (ne00 < 128) {
-        for (int row = 0; row < N_F16_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
-                sumf += (float) x[i] * (float) y[i];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    } else {
-        device const half4 * x4 = (device const half4 *)x;
-        for (int row = 0; row < N_F16_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
-            device const float4 * y4 = (device const float4 *) y;
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_f16_f32")]]
-kernel void kernel_mul_mv_f16_f32(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
-}
-
-// Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mv_f16_f32_l4(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
-
-    const int nrows = ne11;
-    const int64_t r0 = tgpig.x;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half4 * x4 = (device const half4 *) (src0 + offset0);
-
-    for (int r1 = 0; r1 < nrows; ++r1) {
-        device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
-
-        float sumf = 0;
-        for (int i = tiisg; i < ne00/4; i += 32) {
-            for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
-        }
-
-        float all_sum = simd_sum(sumf);
-        if (tiisg == 0) {
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-        }
-    }
-}
-
-kernel void kernel_alibi_f32(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        constant     float & m0,
-        constant     float & m1,
-        constant       int & n_heads_log2_floor,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-  //const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    const int64_t k = i3*ne3 + i2;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = pow(m0, k + 1);
-    } else {
-        m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    device       char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1;
-    device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        const  float   src_v = *(device float *)(src_row + i00*nb00);
-        device float * dst_v =  (device float *)(dst_row + i00*nb0);
-        *dst_v = i00 * m_k + src_v;
-    }
-}
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    thread float * cos_theta, thread float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
-    }
-    *cos_theta = cos(theta) * mscale;
-    *sin_theta = sin(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
-}
-
-static void rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
-    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
-}
-
-typedef void (rope_t)(
-        device const    void * src0,
-        device const int32_t * src1,
-        device         float * dst,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant     int64_t & ne03,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant    uint64_t & nb03,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant     int64_t & ne2,
-        constant     int64_t & ne3,
-        constant    uint64_t & nb0,
-        constant    uint64_t & nb1,
-        constant    uint64_t & nb2,
-        constant    uint64_t & nb3,
-        constant         int & n_past,
-        constant         int & n_dims,
-        constant         int & mode,
-        constant         int & n_orig_ctx,
-        constant       float & freq_base,
-        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]);
-
-template<typename T>
-kernel void kernel_rope(
-        device const    void * src0,
-        device const int32_t * src1,
-        device         float * dst,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant     int64_t & ne03,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant    uint64_t & nb03,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant     int64_t & ne2,
-        constant     int64_t & ne3,
-        constant    uint64_t & nb0,
-        constant    uint64_t & nb1,
-        constant    uint64_t & nb2,
-        constant    uint64_t & nb3,
-        constant         int & n_past,
-        constant         int & n_dims,
-        constant         int & mode,
-        constant         int & n_orig_ctx,
-        constant       float & freq_base,
-        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]) {
-    const int64_t i3 = tgpig[2];
-    const int64_t i2 = tgpig[1];
-    const int64_t i1 = tgpig[0];
-
-    const bool is_neox = mode & 2;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    device const int32_t * pos = src1;
-
-    const int64_t p = pos[i2];
-
-    const float theta_0 = (float)p;
-    const float inv_ndims = -1.f/n_dims;
-
-    if (!is_neox) {
-        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
-
-            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
-            float cos_theta, sin_theta;
-            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-            const T x0 = src[0];
-            const T x1 = src[1];
-
-            dst_data[0] = x0*cos_theta - x1*sin_theta;
-            dst_data[1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
-            if (ic < n_dims) {
-                const int64_t ib = 0;
-
-                // simplified from `(ib * n_dims + ic) * inv_ndims`
-                const float cur_rot = inv_ndims*ic - ib;
-
-                const float theta = theta_0 * pow(freq_base, cur_rot);
-                float cos_theta, sin_theta;
-                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-                const int64_t i0 = ib*n_dims + ic/2;
-
-                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                const float x0 = src[0];
-                const float x1 = src[n_dims/2];
-
-                dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-            } else {
-                const int64_t i0 = ic;
-
-                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                dst_data[0] = src[0];
-                dst_data[1] = src[1];
-            }
-        }
-    }
-}
-
-template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
-template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
-
-typedef void (im2col_t)(
-        device const float * x,
-        device        char * dst,
-        constant   int32_t & ofs0,
-        constant   int32_t & ofs1,
-        constant   int32_t & IW,
-        constant   int32_t & IH,
-        constant   int32_t & CHW,
-        constant   int32_t & s0,
-        constant   int32_t & s1,
-        constant   int32_t & p0,
-        constant   int32_t & p1,
-        constant   int32_t & d0,
-        constant   int32_t & d1,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3  tgpg[[threadgroups_per_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]);
-
-template <typename T>
-kernel void kernel_im2col(
-        device const float * x,
-        device        char * dst,
-        constant   int32_t & ofs0,
-        constant   int32_t & ofs1,
-        constant   int32_t & IW,
-        constant   int32_t & IH,
-        constant   int32_t & CHW,
-        constant   int32_t & s0,
-        constant   int32_t & s1,
-        constant   int32_t & p0,
-        constant   int32_t & p1,
-        constant   int32_t & d0,
-        constant   int32_t & d1,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3  tgpg[[threadgroups_per_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0;
-    const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1;
-
-    const int32_t offset_dst =
-        (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
-        (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
-
-    device T * pdst = (device T *) (dst);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        pdst[offset_dst] = 0.0f;
-    } else {
-        const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
-        pdst[offset_dst] = x[offset_src + iih * IW + iiw];
-    }
-}
-
-template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
-template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
-
-kernel void kernel_upscale_f32(
-    device  const char * src0,
-    device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    constant   int32_t & sf,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
-
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
-
-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1/sf;
-
-    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        dst_ptr[i0] = src0_ptr[i0/sf];
-    }
-}
-
-kernel void kernel_pad_f32(
-    device  const char * src0,
-    device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
-
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
-
-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1;
-
-    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
-
-    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
-        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-            if (i0 < ne00) {
-                dst_ptr[i0] = src0_ptr[i0];
-            } else {
-                dst_ptr[i0] = 0.0f;
-            }
-        }
-
-        return;
-    }
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        dst_ptr[i0] = 0.0f;
-    }
-}
-
-// bitonic sort implementation following the CUDA kernels as reference
-typedef void (argsort_t)(
-        device const float * x,
-        device     int32_t * dst,
-        constant   int64_t & ncols,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]]);
-
-template<ggml_sort_order order>
-kernel void kernel_argsort_f32_i32(
-        device const float   * x,
-        device       int32_t * dst,
-        constant     int64_t & ncols,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]]) {
-    // bitonic sort
-    int col = tpitg[0];
-    int row = tgpig[1];
-
-    if (col >= ncols) return;
-
-    device const float   * x_row   = x   + row * ncols;
-    device       int32_t * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        SWAP(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        SWAP(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            threadgroup_barrier(mem_flags::mem_threadgroup);
-        }
-    }
-}
-
-template [[host_name("kernel_argsort_f32_i32_asc")]]  kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ASC>;
-template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_DESC>;
-
-kernel void kernel_leaky_relu_f32(
-        device const float * src0,
-        device       float * dst,
-        constant     float & slope,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
-}
-
-kernel void kernel_cpy_f16_f16(
-        device  const half * src0,
-        device        half * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f16_f32(
-        device  const half * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f32_f16(
-        device const float * src0,
-        device        half * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f32_f32(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f32_q8_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0;
-
-    device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < QK8_0; j++) {
-            const float v = src[j];
-            amax = MAX(amax, fabs(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        dst_data[i00/QK8_0].d = d;
-
-        for (int j = 0; j < QK8_0; ++j) {
-            const float x0 = src[j]*id;
-
-            dst_data[i00/QK8_0].qs[j] = round(x0);
-        }
-    }
-}
-
-kernel void kernel_cpy_f32_q4_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0;
-
-    device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < QK4_0; j++) {
-            const float v = src[j];
-            if (amax < fabs(v)) {
-                amax = fabs(v);
-                max  = v;
-            }
-        }
-
-        const float d = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        dst_data[i00/QK4_0].d = d;
-
-        for (int j = 0; j < QK4_0/2; ++j) {
-            const float x0 = src[0       + j]*id;
-            const float x1 = src[QK4_0/2 + j]*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
-
-            dst_data[i00/QK4_0].qs[j]  = xi0;
-            dst_data[i00/QK4_0].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-kernel void kernel_cpy_f32_q4_1(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1;
-
-    device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < QK4_1; j++) {
-            const float v = src[j];
-            if (min > v) min = v;
-            if (max < v) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        dst_data[i00/QK4_1].d = d;
-        dst_data[i00/QK4_1].m = min;
-
-        for (int j = 0; j < QK4_1/2; ++j) {
-            const float x0 = (src[0       + j] - min)*id;
-            const float x1 = (src[QK4_1/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            dst_data[i00/QK4_1].qs[j]  = xi0;
-            dst_data[i00/QK4_1].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-kernel void kernel_concat(
-    device  const char * src0,
-    device  const char * src1,
-    device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne10,
-    constant   int64_t & ne11,
-    constant   int64_t & ne12,
-    constant   int64_t & ne13,
-    constant  uint64_t & nb10,
-    constant  uint64_t & nb11,
-    constant  uint64_t & nb12,
-    constant  uint64_t & nb13,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
-
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        if (i02 < ne02) {
-            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
-            src0_ptr += ntg.x*nb00;
-        } else {
-            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
-            src1_ptr += ntg.x*nb10;
-        }
-        dst_ptr += ntg.x*nb0;
-    }
-}
-
-//============================================ k-quants ======================================================
-
-#ifndef QK_K
-#define QK_K 256
-#else
-static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half d;           // super-block scale for quantized scales
-    half dmin;        // super-block scale for quantized mins
-} block_q2_K;
-// 84 bytes / block
-
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    half d;             // super-block scale
-} block_q3_K;
-
-#if QK_K == 64
-typedef struct {
-    half    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-typedef struct {
-    half d;             // super-block scale for quantized scales
-    half dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-#endif
-
-#if QK_K == 64
-typedef struct {
-    half  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-#else
-typedef struct {
-    half d;                      // super-block scale for quantized scales
-    half dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-// 176 bytes / block
-#endif
-
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    half d;                  // super-block scale
-} block_q6_K;
-// 210 bytes / block
-
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-// 66 bytes / block for QK_K = 256, so 2.0625 bpw
-
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-// 74 bytes / block for QK_K = 256, so 2.3125 bpw
-
-// 2.5625 bpw quants
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t scales[QK_K/32];
-} block_iq2_s;
-
-typedef struct {
-    half d;
-    uint8_t qs[3*QK_K/8];
-} block_iq3_xxs;
-// 98 bytes / block for QK_K = 256, so 3.0625 bpw
-
-// 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
-#define IQ3S_N_SCALE QK_K/64
-#endif
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t signs[QK_K/8];
-    uint8_t scales[IQ3S_N_SCALE];
-} block_iq3_s;
-
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
-} block_iq1_s;
-
-// Non-linear quants
-#define QK4_NL 32
-typedef struct {
-    half    d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
-typedef struct {
-    half d;
-    uint16_t scales_h;
-    uint8_t  scales_l[QK_K/64];
-    uint8_t  qs[QK_K/2];
-} block_iq4_xs;
-#endif
-
-//====================================== dot products =========================
-
-void kernel_mul_mv_q2_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q2_K) * nb;
-
-#if QK_K == 256
-    const int ix = tiisg/8;  // 0...3
-    const int it = tiisg%8;  // 0...7
-    const int iq = it/4;     // 0 or 1
-    const int ir = it%4;     // 0...3
-    const int is = (8*ir)/16;// 0 or 1
-
-    device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir;
-
-    for (int ib = ix; ib < nb; ib += 4) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+64]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+96]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales + 8*iq + is;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-            float dall = dh[0];
-            float dmin = dh[1] * 1.f/16.f;
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 4 * QK_K;
-    }
-#else
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0...1
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    for (int ib = ix; ib < nb; ib += 16) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 16 * QK_K;
-    }
-#endif
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_q2_K_f32")]]
-kernel void kernel_mul_mv_q2_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-#if QK_K == 256
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-
-    //const uint16_t kmask1 = 0x3030;
-    //const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = tiisg/4;
-    const int ix  = tiisg%4;
-    const int ip  = tid/4;          // 0 or 1
-    const int il  = 2*((tid%4)/2);  // 0 or 2
-    const int ir  = tid%2;
-    const int n   = 8;
-    const int l0  = n*ir;
-
-    // One would think that the Metal compiler would figure out that ip and il can only have
-    // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
-    // with these two tales.
-    //
-    // Possible masks for the high bit
-    const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200},  // ip = 0, il = 0
-                           {0x0004, 0x0400, 0x0008, 0x0800},  // ip = 0, il = 2
-                           {0x0010, 0x1000, 0x0020, 0x2000},  // ip = 1, il = 0
-                           {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
-
-    // Possible masks for the low 2 bits
-    const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
-
-    const ushort4 hm = mm[2*ip + il/2];
-
-    const int shift = 2*il;
-    const float    v1 = il == 0 ? 4.f : 64.f;
-    const float    v2 = 4.f * v1;
-
-    const uint16_t s_shift1 = 4*ip;
-    const uint16_t s_shift2 = s_shift1 + il;
-
-    const int q_offset = 32*ip + l0;
-    const int y_offset = 128*ip + 32*il + l0;
-
-    const int step = sizeof(block_q3_K) * nb / 2;
-
-    device const float * y1 = yy + ix*QK_K + y_offset;
-
-    uint32_t scales32, aux32;
-    thread uint16_t * scales16 = (thread uint16_t *)&scales32;
-    thread const int8_t * scales = (thread const int8_t *)&scales32;
-
-    float sumf1[2] = {0.f};
-    float sumf2[2] = {0.f};
-    for (int i = ix; i < nb; i += 4) {
-
-        for (int l = 0; l < 8; ++l) {
-            yl[l+ 0] = y1[l+ 0];
-            yl[l+ 8] = y1[l+16];
-            yl[l+16] = y1[l+32];
-            yl[l+24] = y1[l+48];
-        }
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0);
-        device const uint16_t * a = (device const uint16_t *)(x[i].scales);
-        device const half * dh = &x[i].d;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d_all = (float)dh[0];
-
-            scales16[0] = a[4];
-            scales16[1] = a[5];
-            aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
-            scales16[0] = a[il+0];
-            scales16[1] = a[il+1];
-            scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
-
-            float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
-            for (int l = 0; l < n; l += 2) {
-                const int32_t qs = q[l/2];
-                s1 += yl[l+0] * (qs & qm[il/2][0]);
-                s2 += yl[l+1] * (qs & qm[il/2][1]);
-                s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
-                s4 += yl[l+16] * (qs & qm[il/2][2]);
-                s5 += yl[l+17] * (qs & qm[il/2][3]);
-                s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
-            }
-            float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
-            float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
-            sumf1[row] += d1 * (scales[0] - 32);
-            sumf2[row] += d2 * (scales[2] - 32);
-
-            s1 = s2 = s3 = s4 = s5 = s6 = 0;
-            for (int l = 0; l < n; l += 2) {
-                const int32_t qs = q[l/2+8];
-                s1 += yl[l+8] * (qs & qm[il/2][0]);
-                s2 += yl[l+9] * (qs & qm[il/2][1]);
-                s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
-                s4 += yl[l+24] * (qs & qm[il/2][2]);
-                s5 += yl[l+25] * (qs & qm[il/2][3]);
-                s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
-            }
-            d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
-            d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
-            sumf1[row] += d1 * (scales[1] - 32);
-            sumf2[row] += d2 * (scales[3] - 32);
-
-            q  += step;
-            h  += step;
-            a  += step;
-            dh += step;
-
-        }
-
-        y1 += 4 * QK_K;
-
-    }
-
-    for (int row = 0; row < 2; ++row) {
-        const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
-        sumf1[row] = simd_sum(sumf);
-    }
-    if (tiisg == 0) {
-        for (int row = 0; row < 2; ++row) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row];
-        }
-    }
-}
-#else
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/4;
-    const int il = 4 * (tiisg%4);// 0, 4, 8, 12
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    float2 sum = {0.f, 0.f};
-
-    for (int i = ix; i < nb; i += 8) {
-
-        const float d_all = (float)(x[i].d);
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
-        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
-        device const float    * y = yy + i * QK_K + il;
-
-        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
-        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
-        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
-        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-
-        for (int l = 0; l < 4; l += 2) {
-            const uint16_t hm = h[l/2] >> iq;
-            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :  4))
-                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
-                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
-                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
-            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
-                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
-                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
-                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
-        }
-
-    }
-    const float sumf = sum[0] + sum[1] * 1.f/256.f;
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-
-}
-#endif
-
-[[host_name("kernel_mul_mv_q3_K_f32")]]
-kernel void kernel_mul_mv_q3_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-#if QK_K == 256
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int ix = tiisg/8;  // 0...3
-    const int it = tiisg%8;  // 0...7
-    const int iq = it/4;     // 0 or 1
-    const int ir = it%4;     // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[16];
-    float yh[16];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
-
-    uint16_t sc16[4];
-    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
-
-    for (int ib = ix; ib < nb; ib += 4) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
-            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
-            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
-            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq;
-        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & kmask1;
-            sc16[1] = sc[2] & kmask1;
-            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
-            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
-
-            device const uint16_t * q2 = q1 + 32;
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
-                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
-                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
-                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
-                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
-                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
-                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
-                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
-                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
-                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
-
-            q1 += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 4 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#else
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int ix = tiisg/4;  // 0...7
-    const int it = tiisg%4;  // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[8];
-    float yh[8];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    uint16_t sc16[4];
-
-    for (int ib = ix; ib < nb; ib += 8) {
-
-        float2 sumy = {0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
-            yh[i] = y4[i+32]; sumy[1] += yh[i];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & 0x000f;
-            sc16[1] = sc[0] & 0x0f00;
-            sc16[2] = sc[0] & 0x00f0;
-            sc16[3] = sc[0] & 0xf000;
-
-            float2 acc1 = {0.f, 0.f};
-            float2 acc2 = {0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
-                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
-                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
-                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-
-            qs += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 8 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif
-
-[[host_name("kernel_mul_mv_q4_K_f32")]]
-kernel void kernel_mul_mv_q4_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_q5_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float sumf[2]={0.f};
-
-    const int step = sizeof(block_q5_K) * nb;
-
-#if QK_K == 256
-#
-    float yl[16], yh[16];
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = tiisg/4;
-    const int ix  = tiisg%4;
-    const int iq  = tid/4;
-    const int ir  = tid%4;
-    const int n   = 8;
-
-    const int l0 = n*ir;
-    const int q_offset = 32*iq + l0;
-    const int y_offset = 64*iq + l0;
-
-    const uint8_t hm1 = 1u << (2*iq);
-    const uint8_t hm2 = hm1 << 1;
-    const uint8_t hm3 = hm1 << 4;
-    const uint8_t hm4 = hm2 << 4;
-
-    uint16_t sc16[4];
-    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
-
-    device const float * y1 = yy + ix*QK_K + y_offset;
-
-    for (int i = ix; i < nb; i += 4) {
-
-        device const uint8_t * q1 = x[i].qs + q_offset;
-        device const uint8_t * qh = x[i].qh + l0;
-        device const half * dh = &x[i].d;
-        device const uint16_t * a = (device const uint16_t *)x[i].scales + iq;
-
-        device const float * y2 = y1 + 128;
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 8; ++l) {
-            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
-            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
-            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
-            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
-        }
-
-        for (int row = 0; row < 2; ++row) {
-
-            device const uint8_t * q2 = q1 + 64;
-
-            sc16[0] = a[0] & kmask1;
-            sc16[1] = a[2] & kmask1;
-            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
-            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
-
-            float4 acc1 = {0.f};
-            float4 acc2 = {0.f};
-            for (int l = 0; l < n; ++l) {
-                uint8_t h = qh[l];
-                acc1[0] += yl[l+0] * (q1[l] & 0x0F);
-                acc1[1] += yl[l+8] * (q1[l] & 0xF0);
-                acc1[2] += yh[l+0] * (q2[l] & 0x0F);
-                acc1[3] += yh[l+8] * (q2[l] & 0xF0);
-                acc2[0] += h & hm1 ? yl[l+0] : 0.f;
-                acc2[1] += h & hm2 ? yl[l+8] : 0.f;
-                acc2[2] += h & hm3 ? yh[l+0] : 0.f;
-                acc2[3] += h & hm4 ? yh[l+8] : 0.f;
-            }
-            const float dall = dh[0];
-            const float dmin = dh[1];
-            sumf[row] += dall * (sc8[0] * (acc1[0] +  16.f*acc2[0]) +
-                                 sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
-                                 sc8[4] * (acc1[2] +  16.f*acc2[2]) +
-                                 sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
-                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
-
-            q1 += step;
-            qh += step;
-            dh += step/2;
-            a  += step/2;
-
-        }
-
-        y1 += 4 * QK_K;
-
-    }
-#else
-    float yl[8], yh[8];
-
-    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
-    const int ix = tiisg%8;
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    device const float * y = yy + ix*QK_K + il;
-
-    for (int i = ix; i < nb; i += 8) {
-
-        for (int l = 0; l < 4; ++l) {
-            yl[l+0] = y[l+ 0];
-            yl[l+4] = y[l+16];
-            yh[l+0] = y[l+32];
-            yh[l+4] = y[l+48];
-        }
-
-        device const half * dh = &x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d = dh[0];
-
-            float2 acc = {0.f, 0.f};
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t hl = h[l] >> iq;
-                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
-                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
-                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
-                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
-            }
-            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-
-            q += step;
-            h += step;
-            s += step;
-            dh += step/2;
-
-        }
-
-        y += 8 * QK_K;
-    }
-#endif
-
-    for (int row = 0; row < 2; ++row) {
-        const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_q5_K_f32")]]
-kernel void kernel_mul_mv_q5_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_q6_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const uint8_t kmask1 = 0x03;
-    const uint8_t kmask2 = 0x0C;
-    const uint8_t kmask3 = 0x30;
-    const uint8_t kmask4 = 0xC0;
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int     im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float sumf = 0;
-
-#if QK_K == 256
-    const int tid  = tiisg/2;
-    const int ix   = tiisg%2;
-    const int ip   = tid/8;         // 0 or 1
-    const int il   = tid%8;
-    const int n    = 4;
-    const int l0   = n*il;
-    const int is   = 8*ip + l0/16;
-
-    const int y_offset = 128*ip + l0;
-    const int q_offset_l = 64*ip + l0;
-    const int q_offset_h = 32*ip + l0;
-
-    for (int i = ix; i < nb; i += 2) {
-
-        device const uint8_t * q1 = x[i].ql + q_offset_l;
-        device const uint8_t * q2 = q1 + 32;
-        device const uint8_t * qh = x[i].qh + q_offset_h;
-        device const int8_t  * sc = x[i].scales + is;
-
-        device const float * y = yy + i * QK_K + y_offset;
-
-        const float dall = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < n; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+64] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
-            sums[3] += y[l+96] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-
-        sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
-
-    }
-
-#else
-    const int ix  = tiisg/4;
-    const int il  = 4*(tiisg%4);
-
-    for (int i = ix; i < nb; i += 8) {
-        device const float * y = yy + i * QK_K + il;
-        device const uint8_t * ql = x[i].ql + il;
-        device const uint8_t * qh = x[i].qh + il;
-        device const int8_t  * s  = x[i].scales;
-
-        const float d = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 4; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
-            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
-    }
-
-#endif
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-}
-
-[[host_name("kernel_mul_mv_q6_K_f32")]]
-kernel void kernel_mul_mv_q6_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-// ======================= "True" 2-bit
-
-constexpr constant static uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-constexpr constant static uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-constexpr constant static uint64_t iq2s_grid[1024] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
-    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
-    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
-    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
-    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
-    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
-    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
-    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
-    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
-    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
-    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
-    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
-    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
-    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
-    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
-    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
-    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
-    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
-    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
-    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
-    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
-    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
-    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
-    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
-    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
-    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
-    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
-    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
-    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
-    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
-    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
-    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
-    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
-    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
-    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
-    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
-    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
-    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
-    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
-    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
-    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
-    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
-    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
-    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
-    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
-    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
-    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
-    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
-    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
-    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
-    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
-    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
-    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
-    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
-    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
-    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
-    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
-    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
-    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
-    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
-    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
-    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
-    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
-    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
-    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
-    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
-    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
-    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
-    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
-    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
-    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
-    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
-    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
-    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
-    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
-    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
-    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
-    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
-    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
-    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
-    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
-    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
-    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
-    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
-    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
-    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
-    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
-    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
-    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
-    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
-    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
-    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
-    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
-    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
-    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
-    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
-    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
-    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
-    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
-    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
-    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
-    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
-    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
-    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
-    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
-    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
-    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
-    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
-    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
-    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
-    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
-    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
-    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
-    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
-    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
-    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
-    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
-    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
-    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
-    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
-    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
-    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
-    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
-    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
-    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
-    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
-    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
-    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
-    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
-    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
-    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
-    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
-    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
-    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
-    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
-    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
-    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
-    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
-    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
-    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
-    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
-    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
-    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
-    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
-    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
-    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
-    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
-    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
-    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
-    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
-    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
-    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
-    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
-    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
-    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
-    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
-    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
-    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
-    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
-    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
-    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
-    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
-    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
-    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
-    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
-    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
-    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
-    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
-    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
-    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
-    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
-    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
-    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
-    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
-    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
-    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
-    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
-    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
-    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
-    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
-    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
-    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
-    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
-    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
-    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
-    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
-    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
-    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
-    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
-    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
-    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
-    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
-    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
-    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
-    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
-    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
-    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
-    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
-    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
-    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
-    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
-    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
-    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
-    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
-    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
-    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
-    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
-    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
-    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
-    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
-    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
-    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
-    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
-    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
-    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
-    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
-    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
-    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
-    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
-    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
-    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
-    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
-    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
-    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
-    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
-    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
-    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
-    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
-    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
-    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
-    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
-    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
-    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
-    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
-    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
-    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
-    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
-    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
-    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
-    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
-    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
-    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
-    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
-};
-
-constexpr constant static uint32_t iq3xxs_grid[256] = {
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-};
-
-constexpr constant static uint32_t iq3xs_grid[512] = {
-    0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
-    0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
-    0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
-    0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
-    0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
-    0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
-    0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
-    0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
-    0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
-    0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
-    0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
-    0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
-    0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
-    0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
-    0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
-    0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
-    0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
-    0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
-    0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
-    0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
-    0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
-    0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
-    0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
-    0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
-    0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
-    0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
-    0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
-    0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
-    0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
-    0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
-    0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
-    0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
-    0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
-    0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
-    0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
-    0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
-    0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
-    0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
-    0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
-    0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
-    0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
-    0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
-    0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
-    0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
-    0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
-    0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
-    0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
-    0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
-    0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
-    0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
-    0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
-    0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
-    0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
-    0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
-    0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
-    0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
-    0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
-    0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
-    0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
-    0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
-    0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
-    0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
-    0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
-    0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
-};
-
-#define NGRID_IQ1S 512
-constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
-};
-
-constexpr constant static uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-constexpr constant static uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-void kernel_mul_mv_iq2_xxs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0;
-    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
-    {
-        int nval = 4;
-        int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i];
-        nval = 2;
-        pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq2_xxs * xr = x + ibl;
-        device const uint16_t * q2 = xr->qs + 4 * ib;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            device const uint8_t * aux8 = (device const uint8_t *)q2;
-            const uint32_t aux32 = q2[2] | (q2[3] << 16);
-            const float d = db * (0.5f + (aux32 >> 28));
-
-            float sum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + aux8[l]);
-                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-            }
-            sumf[row] += d * sum;
-
-            dh += nb*sizeof(block_iq2_xxs)/2;
-            q2 += nb*sizeof(block_iq2_xxs)/2;
-        }
-
-        y4 += 32 * 32;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq2_xxs_f32")]]
-kernel void kernel_mul_mv_iq2_xxs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_iq2_xs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 512);
-    {
-        int nval = 8;
-        int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i];
-        nval = 2;
-        pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq2_xs * xr = x + ibl;
-        device const uint16_t * q2 = xr->qs + 4 * ib;
-        device const uint8_t  * sc = xr->scales + ib;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            const uint8_t ls1 = sc[0] & 0xf;
-            const uint8_t ls2 = sc[0] >>  4;
-            const float d1 = db * (0.5f + ls1);
-            const float d2 = db * (0.5f + ls2);
-
-            float sum1 = 0, sum2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
-                const uint8_t signs = shared_signs[(q2[l] >> 9)];
-                for (int j = 0; j < 8; ++j) {
-                    sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-            }
-            for (int l = 2; l < 4; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
-                const uint8_t signs = shared_signs[(q2[l] >> 9)];
-                for (int j = 0; j < 8; ++j) {
-                    sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-            }
-            sumf[row] += d1 * sum1 + d2 * sum2;
-
-            dh += nb*sizeof(block_iq2_xs)/2;
-            q2 += nb*sizeof(block_iq2_xs)/2;
-            sc += nb*sizeof(block_iq2_xs);
-        }
-
-        y4 += 32 * 32;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq2_xs_f32")]]
-kernel void kernel_mul_mv_iq2_xs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_iq3_xxs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0;
-    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
-    {
-        int nval = 4;
-        int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i];
-        nval = 2;
-        pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq3_xxs * xr = x + ibl;
-        device const uint8_t  * q3 = xr->qs + 8 * ib;
-        device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            const uint32_t aux32 = gas[0] | (gas[1] << 16);
-            const float d = db * (0.5f + (aux32 >> 28));
-
-            float2 sum = {0};
-            for (int l = 0; l < 4; ++l) {
-                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]);
-                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]);
-                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-                    sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-                }
-            }
-            sumf[row] += d * (sum[0] + sum[1]);
-
-            dh  += nb*sizeof(block_iq3_xxs)/2;
-            q3  += nb*sizeof(block_iq3_xxs);
-            gas += nb*sizeof(block_iq3_xxs)/2;
-        }
-
-        y4 += 32 * 32;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq3_xxs_f32")]]
-kernel void kernel_mul_mv_iq3_xxs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_iq3_s_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
-    {
-        int nval = 8;
-        int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq3_s * xr = x + ibl;
-        device const uint8_t * qs = xr->qs + 8 * ib;
-        device const uint8_t * qh = xr->qh + ib;
-        device const uint8_t * sc = xr->scales + (ib/2);
-        device const uint8_t * signs = xr->signs + 4 * ib;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf));
-
-            float2 sum = {0};
-            for (int l = 0; l < 4; ++l) {
-                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
-                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
-                    sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
-                }
-            }
-            sumf[row] += d * (sum[0] + sum[1]);
-
-            dh  += nb*sizeof(block_iq3_s)/2;
-            qs  += nb*sizeof(block_iq3_s);
-            qh  += nb*sizeof(block_iq3_s);
-            sc  += nb*sizeof(block_iq3_s);
-            signs += nb*sizeof(block_iq3_s);
-        }
-
-        y4 += 32 * 32;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq3_s_f32")]]
-kernel void kernel_mul_mv_iq3_s_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_iq2_s_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    //threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    //{
-    //    int nval = 32;
-    //    int pos  = (32*sgitg + tiisg)*nval;
-    //    for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq2_s * xr = x + ibl;
-        device const uint8_t * qs = xr->qs + 4 * ib;
-        device const uint8_t * qh = xr->qh + ib;
-        device const uint8_t * sc = xr->scales + ib;
-        device const uint8_t * signs = qs + QK_K/8;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            const float d1 = db * (0.5f + (sc[0] & 0xf));
-            const float d2 = db * (0.5f + (sc[0] >>  4));
-
-            float2 sum = {0};
-            for (int l = 0; l < 2; ++l) {
-                //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
-                //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
-                constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
-                constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sum[0] += yl[8*l + j +  0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
-                    sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
-                }
-            }
-            sumf[row] += d1 * sum[0] + d2 * sum[1];
-
-            dh  += nb*sizeof(block_iq2_s)/2;
-            qs  += nb*sizeof(block_iq2_s);
-            qh  += nb*sizeof(block_iq2_s);
-            sc  += nb*sizeof(block_iq2_s);
-            signs += nb*sizeof(block_iq2_s);
-        }
-
-        y4 += 32 * 32;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq2_s_f32")]]
-kernel void kernel_mul_mv_iq2_s_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_iq1_s_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
-    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[16];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
-
-    device const float * y4 = y + 32 * ix + 16 * il;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
-
-        for (int i = 0; i < 16; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq1_s * xr = x + ibl;
-        device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
-        device const uint8_t * sc = xr->scales + 2 * ib + il;
-        device const half    * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
-            constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
-
-            float2 sum = {0};
-            for (int j = 0; j < 8; ++j) {
-                sum[0] += yl[j+ 0] * grid1[j];
-                sum[1] += yl[j+ 8] * grid2[j];
-            }
-            sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));
-
-            dh += nb*sizeof(block_iq1_s)/2;
-            qs += nb*sizeof(block_iq1_s);
-            sc += nb*sizeof(block_iq1_s);
-        }
-
-        y4 += 16 * 32;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-
-constexpr constant static float kvalues_iq4nl_f[16] = {
-    -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
-};
-
-void kernel_mul_mv_iq4_nl_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup float  * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK4_NL;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = (r0 * 2 + sgitg) * 2;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0 or 1
-
-    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    float4 yl[4];
-    float sumf[2]={0.f}, all_sum;
-
-    device const float * yb = y + ix * QK4_NL + it * 8;
-
-    uint32_t aux32[2];
-    thread const uint8_t * q8 = (thread const uint8_t *)aux32;
-
-    float4 qf1, qf2;
-
-    for (int ib = ix; ib < nb; ib += 16) {
-
-        device const float4 * y4 = (device const float4 *)yb;
-        yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
-
-        for (int row = 0; row < 2; ++row) {
-
-            device const block_iq4_nl & xb = x[row*nb + ib];
-            device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
-
-            float4 acc1 = {0.f}, acc2 = {0.f};
-
-            aux32[0] = q4[0] | (q4[1] << 16);
-            aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
-            aux32[0] &= 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
-            acc1 += yl[0] * qf1;
-            acc2 += yl[1] * qf2;
-
-            aux32[0] = q4[2] | (q4[3] << 16);
-            aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
-            aux32[0] &= 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
-            acc1 += yl[2] * qf1;
-            acc2 += yl[3] * qf2;
-
-            acc1 += acc2;
-
-            sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
-
-        }
-
-        yb += 16 * QK4_NL;
-    }
-
-    for (int row = 0; row < 2; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-
-#if QK_K != 64
-void kernel_mul_mv_iq4_xs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup float  * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = (r0 * 2 + sgitg) * 2;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-    device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/16;  // 0 or 1
-    const int it = tiisg%16;  // 0...15
-    const int ib = it/2;
-    const int il = it%2;
-
-    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    float4 yl[4];
-    float sumf[2]={0.f}, all_sum;
-
-    device const float * yb = y + ix * QK_K + ib * 32 + il * 8;
-
-    uint32_t aux32[2];
-    thread const uint8_t * q8 = (thread const uint8_t *)aux32;
-
-    float4 qf1, qf2;
-
-    for (int ibl = ix; ibl < nb; ibl += 2) {
-
-        device const float4 * y4 = (device const float4 *)yb;
-        yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
-
-        for (int row = 0; row < 2; ++row) {
-
-            device const block_iq4_xs & xb = x[row*nb + ibl];
-            device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);
-
-            float4 acc1 = {0.f}, acc2 = {0.f};
-
-            aux32[0] = q4[0] & 0x0f0f0f0f;
-            aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
-            acc1 += yl[0] * qf1;
-            acc2 += yl[1] * qf2;
-
-            aux32[0] = q4[1] & 0x0f0f0f0f;
-            aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f;
-            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
-            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
-            acc1 += yl[2] * qf1;
-            acc2 += yl[3] * qf2;
-
-            acc1 += acc2;
-
-            const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32;
-            sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
-
-        }
-
-        yb += 2 * QK_K;
-    }
-
-    for (int row = 0; row < 2; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif
-
-[[host_name("kernel_mul_mv_iq1_s_f32")]]
-kernel void kernel_mul_mv_iq1_s_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-[[host_name("kernel_mul_mv_iq4_nl_f32")]]
-kernel void kernel_mul_mv_iq4_nl_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup float * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-[[host_name("kernel_mul_mv_iq4_xs_f32")]]
-kernel void kernel_mul_mv_iq4_xs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup float * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
-#if QK_K == 64
-    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#else
-    kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-#endif
-}
-
-//============================= templates and their specializations =============================
-
-// NOTE: this is not dequantizing - we are simply fitting the template
-template <typename type4x4>
-void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
-    float4x4 temp = *(((device float4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
-}
-
-template <typename type4x4>
-void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
-    half4x4 temp = *(((device half4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
-}
-
-template <typename type4x4>
-void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
-    const float d1 = il ? (xb->d / 16.h) : xb->d;
-    const float d2 = d1 / 256.f;
-    const float md = -8.h * xb->d;
-    const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = mask0 << 8;
-
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
-        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
-    const float d1 = il ? (xb->d / 16.h) : xb->d;
-    const float d2 = d1 / 256.f;
-    const float  m = xb->m;
-    const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = mask0 << 8;
-
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
-        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
-    const float d = xb->d;
-    const float md = -16.h * xb->d;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + md;
-        reg[i/2][2*(i%2)+1] = d * x1 + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
-    const float d = xb->d;
-    const float m = xb->m;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + m;
-        reg[i/2][2*(i%2)+1] = d * x1 + m;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
-    device const int8_t * qs = ((device const int8_t *)xb->qs);
-    const half d = xb->d;
-
-    for (int i = 0; i < 16; i++) {
-        reg[i/4][i%4] = (qs[i + 16*il] * d);
-    }
-}
-
-template <typename type4x4>
-void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
-    const float d = xb->d;
-    const float min = xb->dmin;
-    device const uint8_t * q = (device const uint8_t *)xb->qs;
-    float dl, ml;
-    uint8_t sc = xb->scales[il];
-
-#if QK_K == 256
-    q = q + 32*(il/8) + 16*(il&1);
-    il = (il/2)%4;
-#endif
-    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
-    const half d_all = xb->d;
-    device const uint8_t * q = (device const uint8_t *)xb->qs;
-    device const uint8_t * h = (device const uint8_t *)xb->hmask;
-    device const int8_t * scales = (device const int8_t *)xb->scales;
-
-#if QK_K == 256
-    q = q + 32 * (il/8) + 16 * (il&1);
-    h = h + 16 * (il&1);
-    uint8_t m = 1 << (il/2);
-    uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
-                                 ((il/4)>0 ? 12  : 3);
-    uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
-    uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
-    int16_t  dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
-                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
-    const float ml = 4.f * dl;
-
-    il = (il/2) & 3;
-    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    dl *= coef;
-
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
-    }
-#else
-    float    kcoef = il&1 ? 1.f/16.f : 1.f;
-    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
-    float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
-    float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    uint8_t  m = 1<<(il*2);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
-    }
-#endif
-}
-
-static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
-    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
-                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
-}
-
-template <typename type4x4>
-void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
-    device const uchar * q = xb->qs;
-
-#if QK_K == 256
-    short is = (il/4) * 2;
-    q = q + (il/4) * 32 + 16 * (il&1);
-    il = il & 3;
-    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d   = il < 2 ? xb->d : xb->d / 16.h;
-    const float min = xb->dmin;
-    const float dl = d * sc[0];
-    const float ml = min * sc[1];
-#else
-    (void) get_scale_min_k4_just2;
-
-    q = q + 16 * (il&1);
-    device const uint8_t * s = xb->scales;
-    device const half2 * dh = (device const half2 *)xb->d;
-    const float2 d = (float2)dh[0];
-    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
-#endif
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
-    device const uint8_t * q  = xb->qs;
-    device const uint8_t * qh = xb->qh;
-
-#if QK_K == 256
-    short is = (il/4) * 2;
-    q  = q + 32 * (il/4) + 16 * (il&1);
-    qh = qh + 16 * (il&1);
-    uint8_t ul = 1 << (il/2);
-    il = il & 3;
-    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d = il < 2 ? xb->d : xb->d / 16.f;
-    const float min = xb->dmin;
-    const float dl = d * sc[0];
-    const float ml = min * sc[1];
-
-    const ushort mask  = il<2 ? 0x0F : 0xF0;
-    const float qh_val = il<2 ? 16.f : 256.f;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
-    }
-#else
-    q = q + 16 * (il&1);
-    device const int8_t * s = xb->scales;
-    const float dl = xb->d * s[il];
-    uint8_t m = 1<<(il*2);
-    const float  coef = il<2 ? 1.f  : 1.f/16.f;
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
-    }
-#endif
-}
-
-template <typename type4x4>
-void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
-    const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
-    device const int8_t * scales = (device const int8_t *)xb->scales;
-
-#if QK_K == 256
-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
-    float sc = scales[(il%2) + 2 * ((il/2))];
-    il = (il/2) & 3;
-#else
-    ql = ql + 16 * (il&1);
-    float sc = scales[il];
-#endif
-    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
-    const float       coef = il>1 ? 1.f/16.f          : 1.f;
-    const float ml = d_all * sc * 32.f;
-    const float dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
-    device const uint16_t * q2 = xb->qs + 4*ib32;
-    const uint32_t aux32_g = q2[0] | (q2[1] << 16);
-    const uint32_t aux32_s = q2[2] | (q2[3] << 16);
-    thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
-    const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
-    constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-    grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
-    for (int i = 0; i < 8; ++i) {
-        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint16_t * q2 = xb->qs + 4*ib32;
-    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
-    constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
-    uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-    grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
-    signs = ksigns_iq2xs[q2[2*il+1] >> 9];
-    for (int i = 0; i < 8; ++i) {
-        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint8_t * q3 = xb->qs + 8*ib32;
-    device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
-    const uint32_t aux32 = gas[0] | (gas[1] << 16);
-    const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
-    constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
-    constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
-    uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    for (int i = 0; i < 4; ++i) {
-        reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
-        reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
-    }
-    grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
-    grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
-    signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
-    for (int i = 0; i < 4; ++i) {
-        reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
-        reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint8_t * qs = xb->qs + 8*ib32;
-    device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
-    const uint8_t qh = xb->qh[ib32] >> 4*il;
-    const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f;
-    constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256)));
-    constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256)));
-    for (int i = 0; i < 4; ++i) {
-        reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
-        reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
-    }
-    grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256)));
-    grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256)));
-    for (int i = 0; i < 4; ++i) {
-        reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
-        reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
-    device const uint8_t * signs = qs + QK_K/8;
-    const uint8_t qh = xb->qh[ib32] >> 4*il;
-    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
-    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
-    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
-        reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    device const uint8_t * qs = xb->qs + 2*il;
-    device const uint8_t * sc = xb->scales + il;
-    const float dl1 = d * (2*(sc[0] & 7) + 1);
-    const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1);
-    constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
-    constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4+0][i%4] = dl1 * grid1[i];
-        reg[i/4+2][i%4] = dl2 * grid2[i];
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
-    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
-    const float d = xb->d;
-    uint32_t aux32;
-    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
-    for (int i = 0; i < 4; ++i) {
-        aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
-        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
-        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
-        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
-        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
-#if QK_K == 64
-    dequantize_iq4_nl(xb, il, reg);
-#else
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
-    const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
-    const float d = (float)xb->d * (ls - 32);
-    uint32_t aux32;
-    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
-    for (int i = 0; i < 4; ++i) {
-        aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
-        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
-        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
-        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
-        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
-    }
-#endif
-}
-
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
-kernel void kernel_get_rows(
-        device const  void * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    //const int64_t i = tgpig;
-    //const int64_t r = ((device int32_t *) src1)[i];
-
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int64_t ind = tiitg; ind < ne00/16; ind += tptg.x) {
-        float4x4 temp;
-        dequantize_func(
-            ((device const block_q *) ((device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp);
-        *(((device float4x4 *) ((device char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
-    }
-}
-
-kernel void kernel_get_rows_f32(
-        device const  void * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
-        ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
-            ((device float *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
-    }
-}
-
-kernel void kernel_get_rows_f16(
-        device const  void * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
-        ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
-            ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
-    }
-}
-
-kernel void kernel_get_rows_i32(
-        device const  void * src0,
-        device const  char * src1,
-        device     int32_t * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
-        ((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
-            ((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
-    }
-}
-
-
-#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
-#define BLOCK_SIZE_K 32
-#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
-#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
-#define THREAD_PER_BLOCK 128
-#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers
-#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers
-#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
-#define SG_MAT_ROW 8
-
-// each block_q contains 16*nl weights
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-void kernel_mul_mm_impl(device const  uchar * src0,
-                        device const  uchar * src1,
-                        device        float * dst,
-                        constant    int64_t & ne00,
-                        constant    int64_t & ne02,
-                        constant   uint64_t & nb01,
-                        constant   uint64_t & nb02,
-                        constant    int64_t & ne12,
-                        constant   uint64_t & nb10,
-                        constant   uint64_t & nb11,
-                        constant   uint64_t & nb12,
-                        constant    int64_t & ne0,
-                        constant    int64_t & ne1,
-                        constant       uint & r2,
-                        constant       uint & r3,
-                        threadgroup   uchar * shared_memory [[threadgroup(0)]],
-                        uint3                 tgpig[[threadgroup_position_in_grid]],
-                        uint                  tiitg[[thread_index_in_threadgroup]],
-                        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
-    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
-
-    const uint r0 = tgpig.y;
-    const uint r1 = tgpig.x;
-    const uint im = tgpig.z;
-
-    // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
-
-    // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
-
-    simdgroup_half8x8  ma[4];
-    simdgroup_float8x8 mb[2];
-    simdgroup_float8x8 c_res[8];
-    for (int i = 0; i < 8; i++){
-        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    short il = (tiitg % THREAD_PER_ROW);
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
-    ushort offset1 = il/nl;
-
-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
-    device const float   * y = (device const float   *)(src1
-        + nb12 * im
-        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
-        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
-
-    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        // load data and store to threadgroup memory
-        half4x4 temp_a;
-        dequantize_func(x, il, temp_a);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        #pragma unroll(16)
-        for (int i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
-        }
-
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
-
-        il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
-        y += BLOCK_SIZE_K;
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        // load matrices from threadgroup memory and conduct outer products
-        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
-
-        #pragma unroll(4)
-        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
-            #pragma unroll(4)
-            for (int i = 0; i < 4; i++) {
-                simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
-            }
-            simdgroup_barrier(mem_flags::mem_none);
-            #pragma unroll(2)
-            for (int i = 0; i < 2; i++) {
-                simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
-            }
-
-            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
-
-            #pragma unroll(8)
-            for (int i = 0; i < 8; i++){
-                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
-            }
-        }
-    }
-
-    if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
-                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
-        }
-    } else {
-        // block is smaller than 64x32, we should avoid writing data outside of the matrix
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-        if (sgitg == 0) {
-            for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                    *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
-                }
-            }
-        }
-    }
-}
-
-// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in src1ids
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-void kernel_mul_mm_id_impl(
-        device const  uchar * src0,
-        device const  uchar * src1,
-        thread        short * src1ids,
-        device        float * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne02,
-        constant   uint64_t & nb01,
-        constant   uint64_t & nb02,
-        constant    int64_t & ne12,
-        constant   uint64_t & nb10,
-        constant   uint64_t & nb11,
-        constant   uint64_t & nb12,
-        constant    int64_t & ne0,
-                    int64_t   ne1,
-        constant       uint & r2,
-        constant       uint & r3,
-        threadgroup   uchar * shared_memory,
-        uint3                 tgpig[[threadgroup_position_in_grid]],
-        uint                  tiitg[[thread_index_in_threadgroup]],
-        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
-    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
-
-    const uint r0 = tgpig.y;
-    const uint r1 = tgpig.x;
-    const uint im = tgpig.z;
-
-    if (r1 * BLOCK_SIZE_N >= ne1) return;
-
-    // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
-
-    // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
-
-    simdgroup_half8x8  ma[4];
-    simdgroup_float8x8 mb[2];
-    simdgroup_float8x8 c_res[8];
-    for (int i = 0; i < 8; i++){
-        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    short il = (tiitg % THREAD_PER_ROW);
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
-    ushort offset1 = il/nl;
-
-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
-    device const float   * y = (device const float   *)(src1
-        + nb12 * im
-        + nb11 * src1ids[r1 * BLOCK_SIZE_N + thread_col]
-        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
-
-    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        // load data and store to threadgroup memory
-        half4x4 temp_a;
-        dequantize_func(x, il, temp_a);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        for (int i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
-        }
-
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
-
-        il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
-        y += BLOCK_SIZE_K;
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        // load matrices from threadgroup memory and conduct outer products
-        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
-
-        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
-            for (int i = 0; i < 4; i++) {
-                simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
-            }
-            simdgroup_barrier(mem_flags::mem_none);
-            for (int i = 0; i < 2; i++) {
-                simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
-            }
-
-            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
-
-            for (int i = 0; i < 8; i++){
-                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
-            }
-        }
-    }
-
-    {
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        device float * C = dst + (BLOCK_SIZE_M * r0) + im*ne1*ne0;
-        if (sgitg == 0) {
-            for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                    *(C + i + src1ids[j + r1*BLOCK_SIZE_N] * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
-                }
-            }
-        }
-    }
-}
-
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-kernel void kernel_mul_mm(device const  uchar * src0,
-                          device const  uchar * src1,
-                          device        float * dst,
-                          constant    int64_t & ne00,
-                          constant    int64_t & ne02,
-                          constant   uint64_t & nb01,
-                          constant   uint64_t & nb02,
-                          constant    int64_t & ne12,
-                          constant   uint64_t & nb10,
-                          constant   uint64_t & nb11,
-                          constant   uint64_t & nb12,
-                          constant    int64_t & ne0,
-                          constant    int64_t & ne1,
-                          constant       uint & r2,
-                          constant       uint & r3,
-                          threadgroup   uchar * shared_memory [[threadgroup(0)]],
-                          uint3                 tgpig[[threadgroup_position_in_grid]],
-                          uint                  tiitg[[thread_index_in_threadgroup]],
-                          uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
-    kernel_mul_mm_impl<block_q, nl, dequantize_func>(
-        src0,
-        src1,
-        dst,
-        ne00,
-        ne02,
-        nb01,
-        nb02,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_memory,
-        tgpig,
-        tiitg,
-        sgitg);
-}
-
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-kernel void kernel_mul_mm_id(
-        device const   uchar * ids,
-        device const   uchar * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const   uchar * src00,
-        device const   uchar * src01,
-        device const   uchar * src02,
-        device const   uchar * src03,
-        device const   uchar * src04,
-        device const   uchar * src05,
-        device const   uchar * src06,
-        device const   uchar * src07,
-        threadgroup    uchar * shared_memory [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const uchar * src0s[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    // expert id
-    const int32_t id = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    // row indices of src1 for expert id
-    int64_t _ne1 = 0;
-    short src1ids[512];
-
-    for (int64_t i1 = 0; i1 < ne1; i1++) {
-        if (((device int32_t *) (ids + i1*nbi1))[idx] == id) {
-            src1ids[_ne1++] = i1;
-        }
-    }
-
-    kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
-        src0s[id],
-        src1,
-        src1ids,
-        dst,
-        ne00,
-        ne02,
-        nb01,
-        nb02,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        _ne1,
-        r2,
-        r3,
-        shared_memory,
-        tgpig,
-        tiitg,
-        sgitg);
-}
-
-#if QK_K == 256
-#define QK_NL 16
-#else
-#define QK_NL 4
-#endif
-
-//
-// get rows
-//
-
-typedef void (get_rows_t)(
-        device const void * src0,
-        device const char * src1,
-        device      float * dst,
-        constant  int64_t & ne00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant  int64_t & ne10,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        uint3, uint, uint3);
-
-//template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows<float4x4,   1, dequantize_f32>;
-//template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
-template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
-template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
-template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
-template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
-template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
-template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_get_rows_iq2_xs")]]  kernel get_rows_t kernel_get_rows<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
-template [[host_name("kernel_get_rows_iq3_s")]]   kernel get_rows_t kernel_get_rows<block_iq3_s,   QK_NL, dequantize_iq3_s>;
-template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_rows<block_iq2_s,   QK_NL, dequantize_iq2_s>;
-template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
-template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
-template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
-
-//
-// matrix-matrix multiplication
-//
-
-typedef void (mat_mm_t)(
-        device const  uchar * src0,
-        device const  uchar * src1,
-        device        float * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne02,
-        constant   uint64_t & nb01,
-        constant   uint64_t & nb02,
-        constant    int64_t & ne12,
-        constant   uint64_t & nb10,
-        constant   uint64_t & nb11,
-        constant   uint64_t & nb12,
-        constant    int64_t & ne0,
-        constant    int64_t & ne1,
-        constant       uint & r2,
-        constant       uint & r3,
-        threadgroup   uchar *,
-        uint3, uint, uint);
-
-template [[host_name("kernel_mul_mm_f32_f32")]]  kernel mat_mm_t kernel_mul_mm<float4x4,   1,     dequantize_f32>;
-template [[host_name("kernel_mul_mm_f16_f32")]]  kernel mat_mm_t kernel_mul_mm<half4x4,    1,     dequantize_f16>;
-template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2,     dequantize_q4_0>;
-template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_0, 2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_1, 2,     dequantize_q5_1>;
-template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2,     dequantize_q8_0>;
-template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
-template [[host_name("kernel_mul_mm_iq3_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq3_s,   QK_NL, dequantize_iq3_s>;
-template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq2_s,   QK_NL, dequantize_iq2_s>;
-template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
-template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
-#else
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
-
-//
-// indirect matrix-matrix multiplication
-//
-
-typedef void (mat_mm_id_t)(
-        device const   uchar * ids,
-        device const   uchar * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const   uchar * src00,
-        device const   uchar * src01,
-        device const   uchar * src02,
-        device const   uchar * src03,
-        device const   uchar * src04,
-        device const   uchar * src05,
-        device const   uchar * src06,
-        device const   uchar * src07,
-        threadgroup    uchar *,
-        uint3, uint, uint);
-
-template [[host_name("kernel_mul_mm_id_f32_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<float4x4,   1,     dequantize_f32>;
-template [[host_name("kernel_mul_mm_id_f16_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<half4x4,    1,     dequantize_f16>;
-template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0, 2,     dequantize_q4_0>;
-template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1, 2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0, 2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1, 2,     dequantize_q5_1>;
-template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0, 2,     dequantize_q8_0>;
-template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K, QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K, QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K, QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K, QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
-template [[host_name("kernel_mul_mm_id_iq3_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s,   QK_NL, dequantize_iq3_s>;
-template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s,   QK_NL, dequantize_iq2_s>;
-template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
-template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
-#if QK_K == 64
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
-#else
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
-#endif
-
-//
-// matrix-vector multiplication
-//
-
-[[host_name("kernel_mul_mv_id_f32_f32")]]
-kernel void kernel_mul_mv_id_f32_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_f32_f32_impl(
-        src0[id],
-        src1 + bid*nb11,
-        dst  + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        nb00,
-        nb01,
-        nb02,
-        ne10,
-        ne11,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg);
-}
-
-[[host_name("kernel_mul_mv_id_f16_f32")]]
-kernel void kernel_mul_mv_id_f16_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_f16_f32_impl(
-        src0[id],
-        src1 + bid*nb11,
-        dst  + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        nb00,
-        nb01,
-        nb02,
-        ne10,
-        ne11,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg);
-}
-
-[[host_name("kernel_mul_mv_id_q8_0_f32")]]
-kernel void kernel_mul_mv_id_q8_0_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q8_0_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q4_0_f32")]]
-kernel void kernel_mul_mv_id_q4_0_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q4_1_f32")]]
-kernel void kernel_mul_mv_id_q4_1_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q5_0_f32")]]
-kernel void kernel_mul_mv_id_q5_0_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q5_1_f32")]]
-kernel void kernel_mul_mv_id_q5_1_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q2_K_f32")]]
-kernel void kernel_mul_mv_id_q2_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q2_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q3_K_f32")]]
-kernel void kernel_mul_mv_id_q3_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q3_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q4_K_f32")]]
-kernel void kernel_mul_mv_id_q4_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q4_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q5_K_f32")]]
-kernel void kernel_mul_mv_id_q5_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q5_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q6_K_f32")]]
-kernel void kernel_mul_mv_id_q6_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q6_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq2_xxs_f32")]]
-kernel void kernel_mul_mv_id_iq2_xxs_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq2_xxs_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq2_xs_f32")]]
-kernel void kernel_mul_mv_id_iq2_xs_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq2_xs_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]]
-kernel void kernel_mul_mv_id_iq3_xxs_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq3_xxs_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq3_s_f32")]]
-kernel void kernel_mul_mv_id_iq3_s_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq3_s_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq2_s_f32")]]
-kernel void kernel_mul_mv_id_iq2_s_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq2_s_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq1_s_f32")]]
-kernel void kernel_mul_mv_id_iq1_s_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq1_s_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
-kernel void kernel_mul_mv_id_iq4_nl_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup float    * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq4_nl_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq4_xs_f32")]]
-kernel void kernel_mul_mv_id_iq4_xs_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup float    * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-#if QK_K == 64
-    kernel_mul_mv_iq4_nl_f32_impl(
-#else
-    kernel_mul_mv_iq4_xs_f32_impl(
-#endif
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
diff --git a/ggml-mpi.c b/ggml-mpi.c
deleted file mode 100644
index ae176d707..000000000
--- a/ggml-mpi.c
+++ /dev/null
@@ -1,216 +0,0 @@
-#include "ggml-mpi.h"
-
-#include "ggml.h"
-
-#include <mpi.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_mpi_context {
-    int rank;
-    int size;
-};
-
-void ggml_mpi_backend_init(void) {
-    MPI_Init(NULL, NULL);
-}
-
-void ggml_mpi_backend_free(void) {
-    MPI_Finalize();
-}
-
-struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
-
-    return ctx;
-}
-
-void ggml_mpi_free(struct ggml_mpi_context * ctx) {
-    free(ctx);
-}
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
-    return ctx->rank;
-}
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads) {
-    UNUSED(ctx_mpi);
-
-    // synchronize the worker node parameters with the root node
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
-}
-
-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
-    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
-    if (t == NULL) {
-        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
-        return -1;
-    }
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->nodes[i] == t) {
-            return i;
-        }
-    }
-
-    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
-    return -1;
-}
-
-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT;   break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    MPI_Status status; UNUSED(status);
-
-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers) {
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
-    if (inp_tokens == NULL) {
-        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
-        return;
-    }
-
-    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
-    if (inp0 == NULL) {
-        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
-        return;
-    }
-
-    GGML_ASSERT(inp0 == gf->nodes[0]);
-
-    // distribute the compute graph into slices across the MPI nodes
-    //
-    // the main node (0) processes the last layers + the remainder of the compute graph
-    // and is responsible to pass the input tokens to the first node (1)
-    //
-    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
-    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
-    // ...
-    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
-    // node 0:   [(n-1) * n_per_node,            n_nodes)
-    //
-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
-    }
-
-    {
-        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
-        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
-        const int il0 =               (mpi_idx + 0) * n_per_node;
-        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-
-        char name_l0[GGML_MAX_NAME];
-        char name_l1[GGML_MAX_NAME];
-
-        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
-        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
-        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
-        if (idx_l0 < 0 || idx_l1 < 0) {
-            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
-            return;
-        }
-
-        // attach the input data to all nodes that need it
-        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
-        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] =  inp0;
-            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] =  inp0;
-            }
-        }
-
-        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
-        for (int i = 1; i < idx_l1 - idx_l0; i++) {
-            gf->nodes[i] = gf->nodes[idx_l0 + i];
-            gf->grads[i] = gf->grads[idx_l0 + i];
-        }
-
-        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
-        if (mpi_idx != 0) {
-            gf->nodes[0]->op = GGML_OP_NONE;
-        }
-
-        gf->n_nodes = idx_l1 - idx_l0;
-
-        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
-    }
-}
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers) {
-    UNUSED(n_layers);
-
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    // send the output data to the next node
-    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
-    }
-}
diff --git a/ggml-mpi.h b/ggml-mpi.h
deleted file mode 100644
index eda119d44..000000000
--- a/ggml-mpi.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-struct ggml_context;
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mpi_context;
-
-void ggml_mpi_backend_init(void);
-void ggml_mpi_backend_free(void);
-
-struct ggml_mpi_context * ggml_mpi_init(void);
-void ggml_mpi_free(struct ggml_mpi_context * ctx);
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx);
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads);
-
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers);
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                            int   n_layers);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
deleted file mode 100644
index df619a884..000000000
--- a/ggml-opencl.cpp
+++ /dev/null
@@ -1,2296 +0,0 @@
-#include "ggml.h"
-#include "ggml-opencl.h"
-#include "ggml-backend-impl.h"
-
-#include <array>
-#include <atomic>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
-#include <sstream>
-#include <vector>
-
-#define CL_TARGET_OPENCL_VERSION 120
-#include <clblast.h>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define CL_DMMV_LOCAL_SIZE 32
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 1
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#define MULTILINE_QUOTE(...) #__VA_ARGS__
-static std::string program_source = MULTILINE_QUOTE(
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-struct __attribute__ ((packed)) block_q4_0
-{
-    half d;
-    uint8_t qs[QK4_0 / 2];
-};
-
-struct __attribute__ ((packed)) block_q4_1
-{
-    half d;
-    half m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-struct __attribute__ ((packed)) block_q5_0
-{
-    half d;
-    uint32_t qh;
-    uint8_t qs[QK5_0 / 2];
-};
-
-struct __attribute__ ((packed)) block_q5_1
-{
-    half d;
-    half m;
-    uint32_t qh;
-    uint8_t qs[QK5_1 / 2];
-};
-
-struct __attribute__ ((packed)) block_q8_0
-{
-    half d;
-    int8_t qs[QK8_0];
-};
-
-struct __attribute__((packed)) block_q2_K
-{
-    uint8_t scales[16];
-    uint8_t qs[64];
-    half d;
-    half dmin;
-};
-
-struct __attribute__((packed)) block_q3_K
-{
-    uint8_t hmask[32];
-    uint8_t qs[64];
-    uint8_t scales[12];
-    half d;
-};
-
-struct __attribute__((packed)) block_q4_K
-{
-    half d;
-    half dmin;
-    uint8_t scales[12];
-    uint8_t qs[128];
-};
-
-struct __attribute__((packed)) block_q5_K
-{
-    half d;
-    half dmin;
-    uint8_t scales[12];
-    uint8_t qh[32];
-    uint8_t qs[128];
-};
-
-struct __attribute__((packed)) block_q6_K
-{
-    uint8_t ql[128];
-    uint8_t qh[64];
-    int8_t scales[16];
-    half d;
-};
-
-__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
-    const uint i = get_global_id(0);
-
-    y[i] = vload_half(0, &x[i]);
-}
-
-void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-
-    const uint8_t vui = x[ib].qs[iqs];
-
-    const int8_t vi0 = vui & 0xF;
-    const int8_t vi1 = vui >> 4;
-
-    *v0 = (vi0 - 8)*d;
-    *v1 = (vi1 - 8)*d;
-}
-void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-    const float m = vload_half(0, &x[ib].m);
-
-    const uint8_t vui = x[ib].qs[iqs];
-
-    const int8_t vi0 = vui & 0xF;
-    const int8_t vi1 = vui >> 4;
-
-    *v0 = vi0*d + m;
-    *v1 = vi1*d + m;
-}
-void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-
-    uint32_t qh = x[ib].qh;
-
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
-    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1) - 16;
-
-    *v0 = x0*d;
-    *v1 = x1*d;
-}
-void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-    const float m = vload_half(0, &x[ib].m);
-
-    uint32_t qh = x[ib].qh;
-
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-    *v0 = x0*d + m;
-    *v1 = x1*d + m;
-}
-void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-
-    const int8_t vi0 = x[ib].qs[iqs + 0];
-    const int8_t vi1 = x[ib].qs[iqs + 1];
-
-    *v0 = vi0*d;
-    *v1 = vi1*d;
-}
-void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
-    *v0 = vload_half(0, &x[ib + 0]);
-    *v1 = vload_half(0, &x[ib + 1]);
-}
-);
-
-static std::string k_quants_source = MULTILINE_QUOTE(
-inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
-{
-    if (j < 4)
-    {
-        *d = q[j] & 63;
-        *m = q[j + 4] & 63;
-    }
-    else
-    {
-        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
-        *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
-    }
-}
-
-__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int n = tid / 32;
-    const int l = tid - 32 * n;
-    const int is = 8 * n + l / 16;
-
-    const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
-
-    const float dall = vload_half(0, &x[i].d);
-    const float dmin = vload_half(0, &x[i].dmin);
-
-    y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4);
-    y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4);
-    y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4);
-    y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4);
-}
-
-__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
-{
-    int r = get_local_id(0) / 4;
-    int i = get_group_id(0) + get_global_offset(0);
-    int tid = r / 2;
-    int is0 = r % 2;
-    int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
-    int n = tid / 4;
-    int j = tid - 4 * n;
-
-    uint8_t m = 1 << (4 * n + j);
-    int is = 8 * n + 2 * j + is0;
-    int shift = 2 * j;
-
-    int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4)
-              : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4)
-              : is < 12  ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4)
-              : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
-    float d_all = vload_half(0, &x[i].d);
-    float dl = d_all * (us - 32);
-
-    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
-    const __global uint8_t *q = x[i].qs + 32 * n;
-    const __global uint8_t *hm = x[i].hmask;
-
-    for (int l = l0; l < l0 + 4; ++l)
-        y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-}
-
-__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int il = tid / 8;
-    const int ir = tid % 8;
-    const int is = 2 * il;
-    const int n = 4;
-
-    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
-
-    const float dall = vload_half(0, &x[i].d);
-    const float dmin = vload_half(0, &x[i].dmin);
-
-    __global const uint8_t *q = x[i].qs + 32 * il + n * ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-    float d1 = dall * sc;
-    float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-    float d2 = dall * sc;
-    float m2 = dmin * m;
-    for (int l = 0; l < n; ++l)
-    {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l + 32] = d2 * (q[l] >> 4) - m2;
-    }
-}
-
-__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int il = tid / 16;
-    const int ir = tid % 16;
-    const int is = 2 * il;
-
-    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
-
-    const float dall = vload_half(0, &x[i].d);
-    const float dmin = vload_half(0, &x[i].dmin);
-
-    __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir;
-    __global const uint8_t *qh = x[i].qh + 2 * ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    uint8_t hm = 1 << (2 * il);
-    y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1;
-    y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2;
-}
-
-__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int ip = tid / 32;
-    const int il = tid - 32 * ip;
-    const int is = 8 * ip + il / 16;
-
-    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
-
-    const float d = vload_half(0, &x[i].d);
-
-    __global const uint8_t *ql = x[i].ql + 64 * ip + il;
-    const uint8_t qh = x[i].qh[32 * ip + il];
-    __global const int8_t *sc = x[i].scales + is;
-
-    y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-}
-
-__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-
-    const int row = get_group_id(0);
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    __global const struct block_q2_K * x = xx + ib0;
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    tmp[16 * ix + tid] = 0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const float   * y = yy + i * QK_K + y_offset;
-        __global const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = vload_half(0, &x[i].d);
-        const float dmin = vload_half(0, &x[i].dmin);
-
-        __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int row = get_group_id(0);
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    __global const struct block_q3_K * x = xx + ib0;
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    tmp[16 * ix + tid] = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const float   * y  = yy + i * QK_K + y_offset;
-        __global const uint8_t * q = x[i].qs + q_offset;
-        __global const uint8_t * h = x[i].hmask + l0;
-
-        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = vload_half(0, &x[i].d);
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp[16 * ix + tid] += d * sum;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-
-    //to rename it later, just to test now
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int row = get_group_id(0);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
-
-    const int step = 8/K_QUANTS_PER_ITERATION;
-
-    const int il  = tid/step;     // 0...3
-    const int ir  = tid - step*il;// 0...3
-    const int n   = 2*K_QUANTS_PER_ITERATION;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    __global const struct block_q4_K * x = xx + ib0;
-
-    tmp[16 * ix + tid] = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const uint8_t * q1 = x[i].qs + q_offset;
-        __global const uint8_t * q2 = q1 + 64;
-        __global const float   * y1 = yy + i*QK_K + y_offset;
-        __global const float   * y2 = y1 + 128;
-
-        const float dall = vload_half(0, &x[i].d);
-        const float dmin = vload_half(0, &x[i].dmin);
-
-        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 s = (float4)(0.f);
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int row = get_group_id(0);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    const int tid = get_local_id(0)/2;  // 0...15
-    const int ix  = get_local_id(0)%2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    __global const struct block_q5_K * x = xx + ib0;
-
-    tmp[16 * ix + tid] = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        __global const uint8_t * ql1 = x[i].qs + q_offset;
-        __global const uint8_t * ql2 = ql1 + 64;
-        __global const uint8_t * qh  = x[i].qh + l0;
-        __global const float   * y1  = yy + i*QK_K + y_offset;
-        __global const float   * y2  = y1 + 128;
-
-        const float dall = vload_half(0, &x[i].d);
-        const float dmin = vload_half(0, &x[i].dmin);
-
-        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = (float4)(0.f);
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * ((ql1[l+ 0] >>  4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * ((ql1[l+16] >>  4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * ((ql2[l+ 0] >>  4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * ((ql2[l+16] >>  4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
-
-    const int row = get_group_id(0);
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    __global const struct block_q6_K * x = xx + ib0;
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-\n#if K_QUANTS_PER_ITERATION == 1\n
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-
-\n#else\n
-
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-
-\n#endif\n
-
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    tmp[16 * ix + tid] = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const float   * y  = yy + i * QK_K + y_offset;
-        __global const uint8_t * ql = x[i].ql + ql_offset;
-        __global const uint8_t * qh = x[i].qh + qh_offset;
-        __global const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = vload_half(0, &x[i].d);
-
-\n#if K_QUANTS_PER_ITERATION == 1\n
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp[16 * ix + tid] += sum;
-\n#else\n
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp[16 * ix + tid] += sum;
-\n#endif\n
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-);
-
-
-std::string dequant_template = MULTILINE_QUOTE(
-__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
-    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
-
-    if (i >= get_global_size(0)) {
-        return;
-    }
-
-    const uint qk = QUANT_K;
-    const uint qr = QUANT_R;
-
-    const int ib = i/qk + get_global_offset(0); // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    float v0, v1;
-    DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
-    y[iybs + iqs + 0] = v0;
-    y[iybs + iqs + y_offset] = v1;
-}
-);
-
-std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
-__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int local_size = get_local_size(0);
-    const int row = get_group_id(0);
-    const int tid = get_local_id(0);
-
-    const uint qk = QUANT_K;
-    const uint qr = QUANT_R;
-
-    const int col_step = local_size * 2;
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    x += get_global_offset(0);
-
-    tmp[tid] = 0;
-
-    for (int col = tid*2; col < ncols; col += col_step) {
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // dequantize
-        float v0, v1;
-        DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=local_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-);
-
-
-std::string mul_template = MULTILINE_QUOTE(
-__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
-    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
-
-    if (i >= get_global_size(0)) {
-        return;
-    }
-
-    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
-}
-);
-
-std::string add_template = MULTILINE_QUOTE(
-__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
-    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
-
-    if (i >= get_global_size(0)) {
-        return;
-    }
-
-    dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
-}
-);
-
-#define CL_CHECK(err)                                               \
-    do {                                                            \
-        cl_int err_ = (err);                                        \
-        if (err_ != CL_SUCCESS) {                                   \
-            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
-                #err, err_, __FILE__, __LINE__);                    \
-            exit(1);                                                \
-        }                                                           \
-    } while (0)
-
-#define CLBLAST_CHECK(err)                                          \
-    do {                                                            \
-        CLBlastStatusCode err_ = (err);                             \
-        if (err_ != CLBlastSuccess) {                               \
-            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
-                #err, err_, __FILE__, __LINE__);                    \
-            exit(1);                                                \
-        }                                                           \
-    } while (0)
-
-std::array<std::string, 5> dequant_str_keys = {
-    "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
-};
-
-std::array<std::string, 30> dequant_str_values = {
-    "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
-    "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
-    "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
-    "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
-    "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
-    "convert_row_f16", "half", "1", "1", "convert_f16"
-};
-
-std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
-    "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
-    "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
-    "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
-    "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
-    "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
-    "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
-};
-
-std::array<std::string, 2> mul_str_keys = {
-    "KERNEL_NAME", "TYPE"
-};
-std::array<std::string, 2> mul_str_values = {
-    "mul_f32", "float"
-};
-
-static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
-    size_t pos = 0;
-    while ((pos = s.find(from, pos)) != std::string::npos) {
-         s.replace(pos, from.length(), to);
-         pos += to.length();
-    }
-    return s;
-}
-
-static std::string generate_kernels() {
-    std::stringstream src;
-    src << program_source << '\n';
-    src << k_quants_source << '\n';
-    for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
-        std::string dequant_kernel = dequant_template;
-        std::string dmmv_kernel = dequant_mul_mat_vec_template;
-        for (size_t j = 0; j < dequant_str_keys.size(); j++) {
-            replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
-            replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
-        }
-        src << dequant_kernel << '\n';
-        src << dmmv_kernel << '\n';
-    }
-    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
-        std::string mul_kernel = mul_template;
-        for (size_t j = 0; j < mul_str_keys.size(); j++) {
-            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
-        }
-        src << mul_kernel << '\n';
-    }
-    src << add_template << '\n';
-
-    return src.str();
-}
-
-static cl_platform_id platform;
-static cl_device_id device;
-static cl_context context;
-static cl_command_queue queue;
-static cl_program program;
-static cl_kernel convert_row_f16_cl;
-static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
-static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
-static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
-static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
-static cl_kernel mul_f32_cl;
-static cl_kernel add_f32_cl;
-static bool fp16_support;
-
-static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
-    cl_program p;
-    char *program_log;
-    size_t program_size;
-    size_t log_size;
-    int err;
-
-    program_size = strlen(program_buffer);
-
-    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
-    if(err < 0) {
-        fprintf(stderr, "OpenCL error creating program");
-        exit(1);
-    }
-
-    std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
-                               "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
-                               "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
-
-    err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
-    if(err < 0) {
-
-        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-        program_log = (char*) malloc(log_size + 1);
-        program_log[log_size] = '\0';
-        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
-        fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log);
-        free(program_log);
-        exit(1);
-    }
-
-    return p;
-}
-
-void ggml_cl_init(void) {
-    static bool initialized = false;
-    if (initialized) {
-        return;
-    }
-    initialized = true;
-
-    cl_int err;
-
-    struct cl_device;
-    struct cl_platform {
-        cl_platform_id id;
-        unsigned number;
-        char name[128];
-        char vendor[128];
-        struct cl_device * devices;
-        unsigned n_devices;
-        struct cl_device * default_device;
-    };
-
-    struct cl_device {
-        struct cl_platform * platform;
-        cl_device_id id;
-        unsigned number;
-        cl_device_type type;
-        char name[128];
-    };
-
-    enum { NPLAT = 16, NDEV = 16 };
-
-    struct cl_platform platforms[NPLAT];
-    unsigned n_platforms = 0;
-    struct cl_device devices[NDEV];
-    unsigned n_devices = 0;
-    struct cl_device * default_device = NULL;
-
-    platform = NULL;
-    device = NULL;
-
-    cl_platform_id platform_ids[NPLAT];
-    CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
-
-    for (unsigned i = 0; i < n_platforms; i++) {
-        struct cl_platform * p = &platforms[i];
-        p->number = i;
-        p->id = platform_ids[i];
-        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
-        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
-
-        cl_device_id device_ids[NDEV];
-        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
-        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
-            p->n_devices = 0;
-        } else {
-            CL_CHECK(clGetDeviceIDsError);
-        }
-        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
-        p->default_device = NULL;
-
-        for (unsigned j = 0; j < p->n_devices; j++) {
-            struct cl_device * d = &devices[n_devices];
-            d->number = n_devices++;
-            d->id = device_ids[j];
-            d->platform = p;
-            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
-            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
-
-            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
-                p->default_device = d;
-            }
-        }
-
-        if (default_device == NULL && p->default_device != NULL) {
-            default_device = p->default_device;
-        }
-    }
-
-    if (n_devices == 0) {
-        fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
-        exit(1);
-    }
-
-    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
-    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
-    int user_platform_number = -1;
-    int user_device_number = -1;
-
-    unsigned n;
-    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
-        user_platform_number = (int)n;
-    }
-    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
-        user_device_number = (int)n;
-    }
-    if (user_platform_number != -1 && user_device_number != -1) {
-        cl_platform* platform = &platforms[user_platform_number];
-        if ((unsigned)user_device_number >= platform->n_devices) {
-            fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number);
-            exit(1);
-        }
-        default_device = &platform->devices[user_device_number];
-    } else {
-
-        struct cl_device * selected_devices = devices;
-        unsigned n_selected_devices = n_devices;
-
-        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
-            for (unsigned i = 0; i < n_platforms; i++) {
-                struct cl_platform * p = &platforms[i];
-                if (strstr(p->name, user_platform_string) != NULL ||
-                    strstr(p->vendor, user_platform_string) != NULL) {
-                    user_platform_number = (int)i;
-                    break;
-                }
-            }
-            if (user_platform_number == -1) {
-                fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
-                exit(1);
-            }
-        }
-        if (user_platform_number != -1) {
-            struct cl_platform * p = &platforms[user_platform_number];
-            selected_devices = p->devices;
-            n_selected_devices = p->n_devices;
-            default_device = p->default_device;
-            if (n_selected_devices == 0) {
-                fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
-                exit(1);
-            }
-        }
-
-        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
-            for (unsigned i = 0; i < n_selected_devices; i++) {
-                struct cl_device * d = &selected_devices[i];
-                if (strstr(d->name, user_device_string) != NULL) {
-                    user_device_number = d->number;
-                    break;
-                }
-            }
-            if (user_device_number == -1) {
-                fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
-                exit(1);
-            }
-        }
-        if (user_device_number != -1) {
-            selected_devices = &devices[user_device_number];
-            n_selected_devices = 1;
-            default_device = &selected_devices[0];
-        }
-
-        GGML_ASSERT(n_selected_devices > 0);
-
-        if (default_device == NULL) {
-            default_device = &selected_devices[0];
-        }
-    }
-
-    fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
-    fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
-    if (default_device->type != CL_DEVICE_TYPE_GPU) {
-        fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
-    }
-
-    platform = default_device->platform->id;
-    device = default_device->id;
-
-    size_t ext_str_size;
-    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char *ext_buffer = (char *)alloca(ext_str_size + 1);
-    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
-    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
-    // Disabled due to faulty outputs
-    // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = false;  // strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
-
-    cl_context_properties properties[] = {
-        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
-    };
-
-    CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
-
-    CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
-        (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
-        (queue = clCreateCommandQueue(context, device, 0, &err), err)
-    )));
-
-    const std::string kernel_src = generate_kernels();
-
-    program = build_program_from_source(context, device, kernel_src.c_str());
-
-    // FP16 to FP32 kernel
-    CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
-
-    // Dequantize kernels
-    CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
-    CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
-    CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
-    CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
-    CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
-    CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
-    CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
-    CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
-    CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
-    CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
-    CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
-
-    // dequant mul mat kernel
-    CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
-    CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
-
-    // mul kernel
-    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
-
-    CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
-}
-
-static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return &dequantize_row_q4_0_cl;
-        case GGML_TYPE_Q4_1:
-            return &dequantize_row_q4_1_cl;
-        case GGML_TYPE_Q5_0:
-            return &dequantize_row_q5_0_cl;
-        case GGML_TYPE_Q5_1:
-            return &dequantize_row_q5_1_cl;
-        case GGML_TYPE_Q8_0:
-            return &dequantize_row_q8_0_cl;
-        case GGML_TYPE_Q2_K:
-            return &dequantize_block_q2_k_cl;
-        case GGML_TYPE_Q3_K:
-            return &dequantize_block_q3_k_cl;
-        case GGML_TYPE_Q4_K:
-            return &dequantize_block_q4_k_cl;
-        case GGML_TYPE_Q5_K:
-            return &dequantize_block_q5_k_cl;
-        case GGML_TYPE_Q6_K:
-            return &dequantize_block_q6_k_cl;
-        case GGML_TYPE_F16:
-            return &convert_row_f16_cl;
-        default:
-            return nullptr;
-    }
-}
-
-static size_t ggml_cl_global_denom(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-            return 4;
-        case GGML_TYPE_Q4_K:
-            return 8;
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return 4;
-        case GGML_TYPE_F16:
-        default:
-            return 1;
-    }
-}
-
-static size_t ggml_cl_local_size(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 0;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-            return 64;
-        case GGML_TYPE_Q4_K:
-            return 32;
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return 64;
-        case GGML_TYPE_F16:
-        default:
-            return 0;
-    }
-}
-
-static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return &dequantize_mul_mat_vec_q4_0_cl;
-        case GGML_TYPE_Q4_1:
-            return &dequantize_mul_mat_vec_q4_1_cl;
-        case GGML_TYPE_Q5_0:
-            return &dequantize_mul_mat_vec_q5_0_cl;
-        case GGML_TYPE_Q5_1:
-            return &dequantize_mul_mat_vec_q5_1_cl;
-        case GGML_TYPE_Q8_0:
-            return &dequantize_mul_mat_vec_q8_0_cl;
-        case GGML_TYPE_F16:
-            return &convert_mul_mat_vec_f16_cl;
-        case GGML_TYPE_Q2_K:
-            return &dequantize_mul_mat_vec_q2_K_cl;
-        case GGML_TYPE_Q3_K:
-            return &dequantize_mul_mat_vec_q3_K_cl;
-        case GGML_TYPE_Q4_K:
-            return &dequantize_mul_mat_vec_q4_K_cl;
-        case GGML_TYPE_Q5_K:
-            return &dequantize_mul_mat_vec_q5_K_cl;
-        case GGML_TYPE_Q6_K:
-            return &dequantize_mul_mat_vec_q6_K_cl;
-        default:
-            return nullptr;
-    }
-}
-
-// buffer pool for cl
-#define MAX_CL_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-struct cl_buffer {
-    cl_mem mem;
-    size_t size = 0;
-};
-
-static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
-static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
-
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cl_pool_lock);
-    cl_int err;
-
-    int best_i = -1;
-    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
-    int worst_i = -1;
-    size_t worst_size = 0; //largest unused buffer seen so far
-    for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer &b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size && b.size < best_size)
-        {
-            best_i = i;
-            best_size = b.size;
-        }
-        if (b.size > 0 && b.size > worst_size)
-        {
-            worst_i = i;
-            worst_size = b.size;
-        }
-    }
-    if(best_i!=-1) //found the smallest buffer that fits our needs
-    {
-        cl_buffer& b = g_cl_buffer_pool[best_i];
-        cl_mem mem = b.mem;
-        *actual_size = b.size;
-        b.size = 0;
-        return mem;
-    }
-    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
-    {
-         cl_buffer& b = g_cl_buffer_pool[worst_i];
-         cl_mem mem = b.mem;
-         b.size = 0;
-         clReleaseMemObject(mem);
-    }
-    cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
-    *actual_size = size;
-    return mem;
-}
-
-static void ggml_cl_pool_free(cl_mem mem, size_t size) {
-    scoped_spin_lock lock(g_cl_pool_lock);
-
-    for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size == 0) {
-            b.mem = mem;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n");
-    clReleaseMemObject(mem);
-}
-
-void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
-        return;
-    }
-
-    cl_mem mem = (cl_mem)tensor->extra;
-    clReleaseMemObject(mem);
-}
-
-static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
-    cl_int err;
-    const uint64_t ne0 = src->ne[0];
-    const uint64_t ne1 = src->ne[1];
-    const uint64_t nb0 = src->nb[0];
-    const uint64_t nb1 = src->nb[1];
-    const uint64_t nb2 = src->nb[2];
-    const uint64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const size_t ts = ggml_type_size(type);
-    const size_t bs = ggml_blck_size(type);
-    const uint64_t row_size = ts*ne0/bs;
-
-    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == row_size) {
-        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
-    }
-    if (nb0 == ts) {
-        const size_t buffer_origin[3] = { offset, 0, 0 };
-        const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { row_size, ne1, 1 };
-        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
-    }
-    std::vector<cl_event> events;
-    if (ev && ne1>1) events.reserve(ne1-1);
-    for (uint64_t i1 = 0; i1 < ne1; i1++) {
-        // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
-        const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts, ne0/bs, 1 };
-        // if an event is requested, make the last write wait for all previous writes to complete
-        if (ev && i1) {
-            events.push_back(*ev);
-        }
-        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
-        if (err != CL_SUCCESS) {
-            for (auto event : events) {
-                clReleaseEvent(event);
-            }
-            return err;
-        }
-    }
-    for (auto event : events) {
-        CL_CHECK(clReleaseEvent(event));
-    }
-    return CL_SUCCESS;
-}
-
-static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    size_t x_size;
-    size_t d_size;
-
-    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
-
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev;
-
-            // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
-
-            const int64_t i13 = i03%ne13;
-            const int64_t i12 = i02%ne12;
-            const int i1 = i13*ne12*ne11 + i12*ne11;
-
-            cl_int x_offset = 0;
-            cl_int y_offset = i1*ne10;
-            cl_int d_offset = 0;
-
-            size_t global = ne00 * ne01;
-            cl_int ky = ne10 * ne11;
-
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-
-            CL_CHECK(clReleaseEvent(ev));
-            CL_CHECK(clFinish(queue));
-
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
-        }
-    }
-    ggml_cl_pool_free(d_X, x_size);
-    ggml_cl_pool_free(d_D, d_size);
-}
-
-void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cl_mul_f32(src0, src1, dst);
-}
-
-static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    size_t x_size;
-    size_t d_size;
-
-    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
-
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev;
-
-            // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
-
-            const int64_t i13 = i03%ne13;
-            const int64_t i12 = i02%ne12;
-            const int i1 = i13*ne12*ne11 + i12*ne11;
-
-            cl_int x_offset = 0;
-            cl_int y_offset = i1*ne10;
-            cl_int d_offset = 0;
-
-            size_t global = ne00 * ne01;
-            cl_int ky = ne10 * ne11;
-
-            CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-
-            CL_CHECK(clReleaseEvent(ev));
-            CL_CHECK(clFinish(queue));
-
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
-        }
-    }
-    ggml_cl_pool_free(d_X, x_size);
-    ggml_cl_pool_free(d_D, d_size);
-}
-
-void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cl_add_f32(src0, src1, dst);
-}
-
-static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-
-    size_t x_size;
-    size_t y_size;
-    size_t d_size;
-    cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
-        d_X = (cl_mem) src0->extra;
-    } else {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
-    }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D =  dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem)  dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
-
-    size_t x_offset = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
-                    x_offset = (i03 * ne02 + i02) * x_ne;
-                } else {
-                    // copy src0 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                }
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_TYPE_CPU) {
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-                    }
-
-                    CL_CHECK(clFinish(queue));
-
-                    // compute
-                    cl_event ev_sgemm;
-                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                               ne01, ne11, ne10,
-                                                               alpha,
-                                                               d_X, x_offset, ne00,
-                                                               d_Y, 0, ne10,
-                                                               beta,
-                                                               d_D, 0, ne01,
-                                                               &queue, &ev_sgemm);
-
-                    if (status != clblast::StatusCode::kSuccess) {
-                        GGML_ASSERT(false);
-                    }
-
-                    // copy dst to host
-                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-                    }
-                }
-            }
-        }
-    }
-
-    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
-        ggml_cl_pool_free(d_X, x_size);
-    }
-    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
-        ggml_cl_pool_free(d_Y, y_size);
-    }
-    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
-        ggml_cl_pool_free(d_D, d_size);
-    }
-}
-
-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
-    GGML_ASSERT(fp16_support);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
-    const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-
-    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
-    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
-    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
-
-    size_t x_size;
-    size_t y_size;
-    size_t d_size;
-    cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
-        d_X = (cl_mem) src0->extra;
-    } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
-    }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
-
-    bool src1_cont_rows = nb10 == sizeof(float);
-    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
-
-    size_t x_offset = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
-                    x_offset = (i03 * ne02 + i02) * x_ne;
-                } else {
-                    // copy src0 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                }
-
-                // FIXME: convert on device
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    // convert src1 to fp16
-                    // TODO: use multiple threads
-                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-                    if (src1_cont_rows) {
-                        if (src1_cont_cols) {
-                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
-                        }
-                        else {
-                            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
-                            }
-                        }
-                    }
-                    else {
-                        for (int64_t i11 = 0; i11 < ne11; i11++) {
-                            for (int64_t i10 = 0; i10 < ne10; i10++) {
-                                // very slow due to no inlining
-                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
-                            }
-                        }
-                    }
-
-                    // copy src1 to device
-                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-                    CL_CHECK(clFinish(queue));
-
-                    // compute
-                    cl_event ev_sgemm;
-                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                               ne01, ne11, ne10,
-                                                               alpha,
-                                                               d_X, x_offset, ne00,
-                                                               d_Y, 0, ne10,
-                                                               beta,
-                                                               d_D, 0, ne01,
-                                                               &queue, &ev_sgemm);
-
-                    if (status != clblast::StatusCode::kSuccess) {
-                        GGML_ASSERT(false);
-                    }
-
-                    // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                        ggml_fp16_to_fp32_row(tmp, d, d_ne);
-                    } else {
-                        // FIXME: convert dst to fp32 on device
-                    }
-                }
-            }
-        }
-    }
-
-    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
-        ggml_cl_pool_free(d_X, x_size);
-    }
-    ggml_cl_pool_free(d_Y, y_size);
-    ggml_cl_pool_free(d_D, d_size);
-}
-
-static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
-
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
-    const size_t q_sz = ggml_type_size(type) * x_bps;
-
-    size_t x_size;
-    size_t y_size;
-    size_t d_size;
-    size_t q_size;
-    cl_mem d_X;
-    if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
-    }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
-    cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
-    }
-
-    cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
-    cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
-    GGML_ASSERT(to_fp32_cl != nullptr);
-
-    const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
-
-    size_t ev_idx = 0;
-    std::vector<cl_event> events;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy and dequantize src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                // copy src0 to device if necessary
-                if (src0->backend == GGML_BACKEND_TYPE_CPU) {
-                    events.emplace_back();
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
-                    d_Q = (cl_mem) src0->extra;
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (!mul_mat_vec) {
-                    // convert src0 to fp32 on device
-                    const size_t global = x_ne / global_denom;
-                    const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-                }
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                        // copy src1 to device
-                        events.emplace_back();
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                        // compute
-                        const size_t global = ne01 * local;
-                        const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                        const cl_int ncols = ne00;
-                        events.emplace_back();
-                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-                    } else { // CLBlast matrix matrix multiplication
-                        // copy src1 to device
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                        // wait for conversion
-                        CL_CHECK(clFinish(queue));
-
-                        // compute
-                        events.emplace_back();
-                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                   ne01, ne11, ne10,
-                                                                   alpha,
-                                                                   d_X, 0, ne00,
-                                                                   d_Y, 0, ne10,
-                                                                   beta,
-                                                                   d_D, 0, ne01,
-                                                                   &queue, events.data() + ev_idx++);
-
-                        if (status != clblast::StatusCode::kSuccess) {
-                            GGML_ASSERT(false);
-                        }
-                    }
-
-                    // copy dst to host
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-                    for (auto *event : events) {
-                        clReleaseEvent(event);
-                    }
-
-                    ev_idx = 0;
-                    events.clear();
-                }
-            }
-        }
-    }
-
-    if (!mul_mat_vec) {
-        ggml_cl_pool_free(d_X, x_size);
-    }
-    ggml_cl_pool_free(d_Y, y_size);
-    ggml_cl_pool_free(d_D, d_size);
-    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_cl_pool_free(d_Q, q_size);
-    }
-}
-
-
-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
-        return true;
-    }
-
-    return false;
-}
-
-static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
-    // If device doesn't support FP16
-    if (!fp16_support) {
-        return false;
-    }
-
-    size_t src0_sz = ggml_nbytes(src0);
-    size_t src1_sz = ggml_nbytes(src1);
-
-    // mul_mat_q: src0 is converted to fp32 on device
-    size_t mul_mat_q_transfer = src0_sz + src1_sz;
-
-    // mul_mat_f16: src1 is converted to fp16 on cpu
-    size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
-
-    // choose the smaller one to transfer to the device
-    // TODO: this is not always the best choice due to the overhead of converting to fp16
-    return mul_mat_f16_transfer < mul_mat_q_transfer;
-}
-
-void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
-    GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
-
-    if (src0->type == GGML_TYPE_F32) {
-        ggml_cl_mul_mat_f32(src0, src1, dst);
-    }
-    else if (src0->type == GGML_TYPE_F16) {
-        if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-            ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
-        }
-        else {
-            ggml_cl_mul_mat_q_f32(src0, src1, dst);
-        }
-    }
-    else if (ggml_is_quantized(src0->type)) {
-        ggml_cl_mul_mat_q_f32(src0, src1, dst);
-    }
-    else {
-        GGML_ASSERT(false);
-    }
-}
-
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
-    }
-    return 0;
-}
-
-void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
-    const int64_t ne0 = tensor->ne[0];
-    const int64_t ne1 = tensor->ne[1];
-    const int64_t ne2 = tensor->ne[2];
-    const int64_t ne3 = tensor->ne[3];
-
-    const ggml_type type = tensor->type;
-    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
-    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
-
-    size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
-
-    tensor->data = data;
-    // copy tensor to device
-    size_t offset = 0;
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
-            offset += s_sz;
-        }
-    }
-
-    CL_CHECK(clFinish(queue));
-
-    tensor->extra = dst;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-}
-
-// ggml-backend
-
-// buffer
-
-struct ggml_backend_opencl_buffer_context {
-    ~ggml_backend_opencl_buffer_context() {
-        if (buffer) {
-            clReleaseMemObject(buffer);
-        }
-        for (auto * sub_buffer : sub_buffers) {
-            clReleaseMemObject(sub_buffer);
-        }
-    }
-
-    cl_mem buffer;
-    std::vector<cl_mem> sub_buffers;
-};
-
-static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
-static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "OpenCL";
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    delete ctx;
-}
-
-static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return cl_ptr_base;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        tensor->extra = tensor->view_src->extra;
-    } else {
-        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-        cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
-        cl_int err;
-        cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
-        CL_CHECK(err);
-        ctx->sub_buffers.push_back(sub_buffer);
-        tensor->extra = sub_buffer;
-    }
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
-}
-
-static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    cl_mem tensor_buffer = (cl_mem) tensor->extra;
-    CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    cl_mem tensor_buffer = (cl_mem) tensor->extra;
-    CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-}
-
-static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    for (auto * sub_buffer : ctx->sub_buffers) {
-        clReleaseMemObject(sub_buffer);
-    }
-    ctx->sub_buffers.clear();
-}
-
-static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_opencl_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_opencl_buffer_clear,
-    /* .reset           = */ ggml_backend_opencl_buffer_reset,
-};
-
-// buffer type
-
-static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
-    return "OpenCL";
-
-    GGML_UNUSED(buffer_type);
-}
-
-static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
-    ggml_cl_init();
-
-    cl_int err;
-    cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
-    if (err != CL_SUCCESS) {
-        fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
-        return nullptr;
-    }
-
-    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
-
-    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-    // FIXME: not thread safe, device may not be initialized yet
-    static cl_uint alignment = -1;
-    if (alignment == (cl_uint)-1) {
-        ggml_cl_init();
-        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
-    }
-    return alignment;
-
-    GGML_UNUSED(buffer_type);
-}
-
-static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
-    static size_t max_size = -1;
-    if (max_size == (size_t)-1) {
-        ggml_cl_init();
-        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
-    }
-    return max_size;
-}
-
-static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
-    //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buffer_type);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
-    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
-    /* .get_alloc_size   = */ NULL,
-    /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
-    /* .is_host          = */ NULL,
-};
-
-
-ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
-    static ggml_backend_buffer_type buffer_type = {
-        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
-        /* .context = */ nullptr,
-    };
-
-    return &buffer_type;
-}
-
-#if 0
-// host buffer type
-
-static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return "CL_Host";
-
-    GGML_UNUSED(buft);
-}
-
-static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CL_Host";
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cl_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cl_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_opencl_buffer_type_host;
-}
-
-// backend
-
-static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
-    return "OpenCL";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_opencl_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        ggml_tensor * node = graph->nodes[i];
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
-                break;
-            case GGML_OP_MUL:
-                ggml_cl_mul(node->src[0], node->src[1], node);
-                break;
-            default:
-                GGML_ASSERT(false);
-        }
-    }
-
-    return true;
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_MUL_MAT:
-            return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
-        case GGML_OP_MUL:
-            // return ggml_can_repeat_rows(op->src[1], op->src[0]);
-            return true;
-        default:
-            return false;
-    }
-
-    GGML_UNUSED(backend);
-}
-
-static ggml_backend_i opencl_backend_i = {
-    /* .get_name                = */ ggml_backend_opencl_name,
-    /* .free                    = */ ggml_backend_opencl_free,
-    /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
-    /* .supports_op             = */ ggml_backend_opencl_supports_op,
-};
-
-ggml_backend_t ggml_backend_opencl_init() {
-    ggml_backend_t backend = new ggml_backend {
-        /* .interface = */ opencl_backend_i,
-        /* .context   = */ nullptr
-    };
-
-    return backend;
-}
-
-bool ggml_backend_is_opencl(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_opencl_name;
-}
-#endif
diff --git a/ggml-opencl.h b/ggml-opencl.h
deleted file mode 100644
index 257a6be6a..000000000
--- a/ggml-opencl.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-GGML_API void ggml_cl_init(void);
-
-GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
-GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-
-// GGML_API void * ggml_cl_host_malloc(size_t size);
-// GGML_API void   ggml_cl_host_free(void * ptr);
-
-GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
-
-GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-// backend API
-
-// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
-
-// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
-// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml-quants.c b/ggml-quants.c
deleted file mode 100644
index 371826f14..000000000
--- a/ggml-quants.c
+++ /dev/null
@@ -1,12643 +0,0 @@
-#include "ggml-quants.h"
-#include "ggml-impl.h"
-
-#include <math.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <stdlib.h> // for qsort
-#include <stdio.h>  // for GGML_ASSERT
-
-#ifdef __ARM_NEON
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define UNUSED GGML_UNUSED
-
-#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = _mm_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = _mm_sign_epi8(y, x);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = _mm_maddubs_epi16(ax, sy);
-    const __m128i ones = _mm_set1_epi16(1);
-    return _mm_madd_epi16(ones, dot);
-}
-
-#if __AVX__ || __AVX2__ || __AVX512F__
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-    __m128 res = _mm256_extractf128_ps(x, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    return _mm_cvtss_f32(res);
-}
-
-// horizontally add 8 int32_t
-static inline int hsum_i32_8(const __m256i a) {
-    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-// horizontally add 4 int32_t
-static inline int hsum_i32_4(const __m128i a) {
-    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
-    const __m128i sum64 = _mm_add_epi32(hi64, a);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-#if defined(__AVX2__) || defined(__AVX512F__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m256i shuf_mask = _mm256_set_epi64x(
-            0x0303030303030303, 0x0202020202020202,
-            0x0101010101010101, 0x0000000000000000);
-    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
-    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytes = _mm256_or_si256(bytes, bit_mask);
-    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    return _mm256_and_si256(lowMask, bytes);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-    const __m256i ones = _mm256_set1_epi16(1);
-    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_float(dot);
-#endif
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-#if __AVXVNNIINT8__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return mul_sum_us8_pairs_float(ax, sy);
-#endif
-}
-
-static inline __m128i packNibbles( __m256i bytes )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-#if __AVX512F__
-    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
-    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
-    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
-#else
-    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
-    __m256i high = _mm256_andnot_si256( lowByte, bytes );
-    __m256i low = _mm256_and_si256( lowByte, bytes );
-    high = _mm256_srli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-
-    // Compress uint16_t lanes into bytes
-    __m128i r0 = _mm256_castsi256_si128( bytes );
-    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
-    return _mm_packus_epi16( r0, r1 );
-#endif
-}
-#elif defined(__AVX__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
-    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
-    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
-    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytesl = _mm_or_si128(bytesl, bit_mask);
-    bytesh = _mm_or_si128(bytesh, bit_mask);
-    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
-    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return MM256_SET_M128I(bytesh, bytesl);
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    // Load 16 bytes from memory
-    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
-    __m128i tmph = _mm_srli_epi16(tmpl, 4);
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    tmpl = _mm_and_si128(lowMask, tmpl);
-    tmph = _mm_and_si128(lowMask, tmph);
-    return MM256_SET_M128I(tmph, tmpl);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
-    const __m128i ones = _mm_set1_epi16(1);
-    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
-    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-    const __m128i axl = _mm256_castsi256_si128(ax);
-    const __m128i axh = _mm256_extractf128_si256(ax, 1);
-    const __m128i syl = _mm256_castsi256_si128(sy);
-    const __m128i syh = _mm256_extractf128_si256(sy, 1);
-    // Perform multiplication and create 16-bit values
-    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-    const __m128i doth = _mm_maddubs_epi16(axh, syh);
-    return sum_i16_pairs_float(doth, dotl);
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-    const __m128i xl = _mm256_castsi256_si128(x);
-    const __m128i xh = _mm256_extractf128_si256(x, 1);
-    const __m128i yl = _mm256_castsi256_si128(y);
-    const __m128i yh = _mm256_extractf128_si256(y, 1);
-    // Get absolute values of x vectors
-    const __m128i axl = _mm_sign_epi8(xl, xl);
-    const __m128i axh = _mm_sign_epi8(xh, xh);
-    // Sign the values of the y vectors
-    const __m128i syl = _mm_sign_epi8(yl, xl);
-    const __m128i syh = _mm_sign_epi8(yh, xh);
-    // Perform multiplication and create 16-bit values
-    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-    const __m128i doth = _mm_maddubs_epi16(axh, syh);
-    return sum_i16_pairs_float(doth, dotl);
-}
-
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
-
-    return _mm_packus_epi16( bytes1, bytes2);
-}
-#endif
-#elif defined(__SSSE3__)
-// horizontally add 4x4 floats
-static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
-    __m128 res_0 =_mm_hadd_ps(a, b);
-    __m128 res_1 =_mm_hadd_ps(c, d);
-    __m128 res =_mm_hadd_ps(res_0, res_1);
-    res =_mm_hadd_ps(res, res);
-    res =_mm_hadd_ps(res, res);
-
-    return _mm_cvtss_f32(res);
-}
-#endif // __AVX__ || __AVX2__ || __AVX512F__
-#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
-
-#if defined(__ARM_NEON)
-
-#ifdef _MSC_VER
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif
-
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-// vaddvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-#define ggml_vqtbl1q_s8   vqtbl1q_s8
-#define ggml_vqtbl1q_u8   vqtbl1q_u8
-
-#endif
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif
-
-#endif
-
-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
-#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
-#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
-#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
-#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
-#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
-#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
-#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
-#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
-
-// precomputed tables for expanding 8bits to 8 bytes:
-static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
-static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
-#endif
-
-// reference implementation for deterministic creation of model files
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
-    static const int qk = QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max  = v;
-            }
-        }
-
-        const float d  = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q4_0_reference(x, y, k);
-}
-
-
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
-    const int qk = QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q4_1_reference(x, y, k);
-}
-
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max  = v;
-            }
-        }
-
-        const float d  = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
-
-            const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
-            const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
-
-            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(qh));
-    }
-}
-
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q5_0_reference(x, y, k);
-}
-
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
-    const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
-            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
-
-            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
-
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q5_1_reference(x, y, k);
-}
-
-// reference implementation for deterministic creation of model files
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < QK8_0; j++) {
-            const float v = x[i*QK8_0 + j];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < QK8_0; ++j) {
-            const float x0 = x[i*QK8_0 + j]*id;
-
-            y[i].qs[j] = roundf(x0);
-        }
-    }
-}
-
-void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
-    assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
-                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
-                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
-                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
-
-            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
-            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
-            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
-            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#elif defined(__riscv_v_intrinsic)
-
-    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
-
-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
-        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0f, vl);
-        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
-        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
-
-        // convert to integer
-        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
-
-        // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
-    }
-#else
-    GGML_UNUSED(nb);
-    // scalar
-    quantize_row_q8_0_reference(x, y, k);
-#endif
-}
-
-// reference implementation for deterministic creation of model files
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
-    assert(QK8_1 == 32);
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < QK8_1; j++) {
-            const float v = x[i*QK8_1 + j];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int sum = 0;
-
-        for (int j = 0; j < QK8_1/2; ++j) {
-            const float v0 = x[i*QK8_1           + j]*id;
-            const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
-
-            y[i].qs[          j] = roundf(v0);
-            y[i].qs[QK8_1/2 + j] = roundf(v1);
-
-            sum += y[i].qs[          j];
-            sum += y[i].qs[QK8_1/2 + j];
-        }
-
-        y[i].s = sum*d;
-    }
-}
-
-void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int32x4_t accv = vdupq_n_s32(0);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-
-            accv = vaddq_s32(accv, vi);
-        }
-
-        y[i].s = d * vaddvq_s32(accv);
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
-                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
-                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
-                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        v128_t accv = wasm_i32x4_splat(0);
-
-        for (int j = 0; j < 8; j++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
-
-            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
-            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
-            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
-            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
-
-            accv = wasm_i32x4_add(accv, vi);
-        }
-
-        y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
-                      wasm_i32x4_extract_lane(accv, 1) +
-                      wasm_i32x4_extract_lane(accv, 2) +
-                      wasm_i32x4_extract_lane(accv, 3));
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
-        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#elif defined(__riscv_v_intrinsic)
-
-    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
-
-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
-        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0, vl);
-        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
-        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
-
-        const float d  = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
-
-        // convert to integer
-        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
-
-        // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
-
-        // compute sum for y[i].s
-        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
-        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
-
-        // set y[i].s
-        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
-        y[i].s = sum*d;
-    }
-#else
-    GGML_UNUSED(nb);
-    // scalar
-    quantize_row_q8_1_reference(x, y, k);
-#endif
-}
-
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F) - 8;
-            const int x1 = (x[i].qs[j] >>   4) - 8;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
-    static const int qk = QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F);
-            const int x1 = (x[i].qs[j] >>   4);
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
-    static const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
-            const int x1 = (x[i].qs[j] >>   4) | xh_1;
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK8_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk; ++j) {
-            y[i*qk + j] = x[i].qs[j]*d;
-        }
-    }
-}
-
-//
-// 2-6 bit quantization in super-blocks
-//
-
-//
-// ===================== Helper functions
-//
-static inline int nearest_int(float fval) {
-    assert(fval <= 4194303.f);
-    float val = fval + 12582912.f;
-    int i; memcpy(&i, &val, sizeof(int));
-    return (i & 0x007fffff) - 0x00400000;
-}
-
-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
-        const float * restrict qw) {
-    float max = 0;
-    float amax = 0;
-    for (int i = 0; i < n; ++i) {
-        float ax = fabsf(x[i]);
-        if (ax > amax) { amax = ax; max = x[i]; }
-    }
-    if (amax < 1e-30f) { // all zero
-        for (int i = 0; i < n; ++i) {
-            L[i] = 0;
-        }
-        return 0.f;
-    }
-    float iscale = -nmax / max;
-    if (rmse_type == 0) {
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        return 1/iscale;
-    }
-    bool return_early = false;
-    if (rmse_type < 0) {
-        rmse_type = -rmse_type;
-        return_early = true;
-    }
-    float sumlx = 0;
-    float suml2 = 0;
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 0; i < n; ++i) {
-#else
-    for (int i = 0; i < n; ++i) {
-#endif
-        int l = nearest_int(iscale * x[i]);
-        l = MAX(-nmax, MIN(nmax-1, l));
-        L[i] = l + nmax;
-        float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
-        sumlx += w*x[i]*l;
-        suml2 += w*l*l;
-    }
-    float scale = sumlx/suml2;
-    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
-    float best = scale * sumlx;
-    for (int is = -9; is <= 9; ++is) {
-        if (is == 0) {
-            continue;
-        }
-        iscale = -(nmax + 0.1f*is) / max;
-        sumlx = suml2 = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
-            sumlx += w*x[i]*l;
-            suml2 += w*l*l;
-        }
-        if (suml2 > 0 && sumlx*sumlx > best*suml2) {
-            for (int i = 0; i < n; ++i) {
-                int l = nearest_int(iscale * x[i]);
-                L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-            }
-            scale = sumlx/suml2; best = scale*sumlx;
-        }
-    }
-    return scale;
-}
-
-static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
-    float max = 0;
-    float amax = 0;
-    for (int i = 0; i < n; ++i) {
-        float ax = fabsf(x[i]);
-        if (ax > amax) { amax = ax; max = x[i]; }
-    }
-    if (!amax) { // all zero
-        for (int i = 0; i < n; ++i) { L[i] = 0; }
-        return 0.f;
-    }
-    float iscale = -nmax / max;
-    if (do_rmse) {
-        float sumlx = 0;
-        float suml2 = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            L[i] = l;
-            float w = x[i]*x[i];
-            sumlx += w*x[i]*l;
-            suml2 += w*l*l;
-        }
-        for (int itry = 0; itry < 5; ++itry) {
-            int n_changed = 0;
-            for (int i = 0; i < n; ++i) {
-                float w = x[i]*x[i];
-                float slx = sumlx - w*x[i]*L[i];
-                if (slx > 0) {
-                    float sl2 = suml2 - w*L[i]*L[i];
-                    int new_l = nearest_int(x[i] * sl2 / slx);
-                    new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                    if (new_l != L[i]) {
-                        slx += w*x[i]*new_l;
-                        sl2 += w*new_l*new_l;
-                        if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                            L[i] = new_l; sumlx = slx; suml2 = sl2;
-                            ++n_changed;
-                        }
-                    }
-                }
-            }
-            if (!n_changed) {
-                break;
-            }
-        }
-        for (int i = 0; i < n; ++i) {
-            L[i] += nmax;
-        }
-        return sumlx / suml2;
-    }
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale * x[i]);
-        l = MAX(-nmax, MIN(nmax-1, l));
-        L[i] = l + nmax;
-    }
-    return 1/iscale;
-}
-
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
-        int ntry, float alpha) {
-    float min = x[0];
-    float max = x[0];
-    for (int i = 1; i < n; ++i) {
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-    }
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = 0;
-        return 0.f;
-    }
-    if (min > 0) min = 0;
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    for (int itry = 0; itry < ntry; ++itry) {
-        float sumlx = 0; int suml2 = 0;
-        bool did_change = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            if (l != L[i]) {
-                L[i] = l;
-                did_change = true;
-            }
-            sumlx += (x[i] - min)*l;
-            suml2 += l*l;
-        }
-        scale = sumlx/suml2;
-        float sum = 0;
-        for (int i = 0; i < n; ++i) {
-            sum += x[i] - scale*L[i];
-        }
-        min = alpha*min + (1 - alpha)*sum/n;
-        if (min > 0) min = 0;
-        iscale = 1/scale;
-        if (!did_change) break;
-    }
-    *the_min = -min;
-    return scale;
-}
-
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
-        float rmin, float rdelta, int nstep, bool use_mad) {
-    float min = x[0];
-    float max = x[0];
-    float sum_w = weights[0];
-    float sum_x = sum_w * x[0];
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 1; i < n; ++i) {
-#else
-    for (int i = 1; i < n; ++i) {
-#endif
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-        float w = weights[i];
-        sum_w += w;
-        sum_x += w * x[i];
-    }
-    if (min > 0) min = 0;
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = -min;
-        return 0.f;
-    }
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    float best_mad = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale*(x[i] - min));
-        L[i] = MAX(0, MIN(nmax, l));
-        float diff = scale * L[i] + min - x[i];
-        diff = use_mad ? fabsf(diff) : diff * diff;
-        float w = weights[i];
-        best_mad += w * diff;
-    }
-    if (nstep < 1) {
-        *the_min = -min;
-        return scale;
-    }
-    for (int is = 0; is <= nstep; ++is) {
-        iscale = (rmin + rdelta*is + nmax)/(max - min);
-        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            Laux[i] = l;
-            float w = weights[i];
-            sum_l += w*l;
-            sum_l2 += w*l*l;
-            sum_xl += w*l*x[i];
-        }
-        float D = sum_w * sum_l2 - sum_l * sum_l;
-        if (D > 0) {
-            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
-            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
-            if (this_min > 0) {
-                this_min = 0;
-                this_scale = sum_xl / sum_l2;
-            }
-            float mad = 0;
-            for (int i = 0; i < n; ++i) {
-                float diff = this_scale * Laux[i] + this_min - x[i];
-                diff = use_mad ? fabsf(diff) : diff * diff;
-                float w = weights[i];
-                mad += w * diff;
-            }
-            if (mad < best_mad) {
-                for (int i = 0; i < n; ++i) {
-                    L[i] = Laux[i];
-                }
-                best_mad = mad;
-                scale = this_scale;
-                min = this_min;
-            }
-        }
-    }
-    *the_min = -min;
-    return scale;
-}
-
-#if QK_K == 256
-static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
-    if (j < 4) {
-        *d = q[j] & 63; *m = q[j + 4] & 63;
-    } else {
-        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-//========================- 2-bit (de)-quantization
-
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[16];
-    float   weights[16];
-    float mins[QK_K/16];
-    float scales[QK_K/16];
-
-    const float q4scale = 15.f;
-
-    for (int i = 0; i < nb; i++) {
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
-            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-        if (max_scale > 0) {
-            float iscale = q4scale/max_scale;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*scales[j]);
-                y[i].scales[j] = l;
-            }
-            y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
-        } else {
-            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-        if (max_min > 0) {
-            float iscale = q4scale/max_min;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*mins[j]);
-                y[i].scales[j] |= (l << 4);
-            }
-            y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
-        } else {
-            y[i].dmin = GGML_FP32_TO_FP16(0.f);
-        }
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int((x[16*j + ii] + dm)/d);
-                l = MAX(0, MIN(3, l));
-                L[16*j + ii] = l;
-            }
-        }
-
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
-            }
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float min = GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * q = x[i].qs;
-
-#if QK_K == 256
-        int is = 0;
-        float dl, ml;
-        for (int n = 0; n < QK_K; n += 128) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-
-                uint8_t sc = x[i].scales[is++];
-                dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
-
-                sc = x[i].scales[is++];
-                dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
-
-                shift += 2;
-            }
-            q += 32;
-        }
-#else
-        float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
-        float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
-        float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
-        float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
-        for (int l = 0; l < 16; ++l) {
-            y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1;
-            y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2;
-            y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3;
-            y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4;
-        }
-        y += QK_K;
-#endif
-    }
-}
-
-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
-    quantize_row_q2_K_reference(x, vy, k);
-}
-
-size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
-        quantize_row_q2_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q2_K));
-}
-
-static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
-        float rmin, float rdelta, int nstep, bool use_mad) {
-    float min = x[0];
-    float max = x[0];
-    float sum_w = weights ? weights[0] : x[0]*x[0];
-    float sum_x = sum_w * x[0];
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 1; i < n; ++i) {
-#else
-    for (int i = 1; i < n; ++i) {
-#endif
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-        float w = weights ? weights[i] : x[i]*x[i];
-        sum_w += w;
-        sum_x += w * x[i];
-    }
-    if (min > 0) {
-        min = 0;
-    }
-    if (max <= min) {
-        memset(L, 0, n);
-        *the_min = -min;
-        return 0.f;
-    }
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    float best_mad = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale*(x[i] - min));
-        L[i] = MAX(0, MIN(nmax, l));
-        float diff = scale * L[i] + min - x[i];
-        diff = use_mad ? fabsf(diff) : diff*diff;
-        float w = weights ? weights[i] : x[i]*x[i];
-        best_mad += w * diff;
-    }
-    if (nstep < 1) {
-        *the_min = -min;
-        return scale;
-    }
-    for (int is = 0; is <= nstep; ++is) {
-        iscale = (rmin + rdelta*is + nmax)/(max - min);
-        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            Laux[i] = l;
-            float w = weights ? weights[i] : x[i]*x[i];
-            sum_l  += w*l;
-            sum_l2 += w*l*l;
-            sum_xl += w*l*x[i];
-        }
-        float D = sum_w * sum_l2 - sum_l * sum_l;
-        if (D > 0) {
-            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
-            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
-            if (this_min > 0) {
-                this_min = 0;
-                this_scale = sum_xl / sum_l2;
-            }
-            float mad = 0;
-            for (int i = 0; i < n; ++i) {
-                float diff = this_scale * Laux[i] + this_min - x[i];
-                diff = use_mad ? fabsf(diff) : diff*diff;
-                float w = weights ? weights[i] : x[i]*x[i];
-                mad += w * diff;
-            }
-            if (mad < best_mad) {
-                for (int i = 0; i < n; ++i) {
-                    L[i] = Laux[i];
-                }
-                best_mad = mad;
-                scale = this_scale;
-                min = this_min;
-            }
-        }
-    }
-    *the_min = -min;
-    return scale;
-}
-
-static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
-    float max = 0;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-    }
-    if (!max) { // all zero
-        for (int i = 0; i < n; ++i) { L[i] = 0; }
-        return 0.f;
-    }
-    float iscale = nmax / max;
-    for (int i = 0; i < n; ++i) {
-        L[i] = nearest_int(iscale * x[i]);
-    }
-    float scale = 1/iscale;
-    float best_mse = 0;
-    for (int i = 0; i < n; ++i) {
-        float diff = x[i] - scale*L[i];
-        float w = quant_weights[i];
-        best_mse += w*diff*diff;
-    }
-    for (int is = -4; is <= 4; ++is) {
-        if (is == 0) continue;
-        float iscale_is = (0.1f*is + nmax)/max;
-        float scale_is = 1/iscale_is;
-        float mse = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale_is*x[i]);
-            l = MIN(nmax, l);
-            float diff = x[i] - scale_is*l;
-            float w = quant_weights[i];
-            mse += w*diff*diff;
-        }
-        if (mse < best_mse) {
-            best_mse = mse;
-            iscale = iscale_is;
-        }
-    }
-    float sumlx = 0;
-    float suml2 = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale * x[i]);
-        l = MIN(nmax, l);
-        L[i] = l;
-        float w = quant_weights[i];
-        sumlx += w*x[i]*l;
-        suml2 += w*l*l;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = quant_weights[i];
-            float slx = sumlx - w*x[i]*L[i];
-            float sl2 = suml2 - w*L[i]*L[i];
-            if (slx > 0 && sl2 > 0) {
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MIN(nmax, new_l);
-                if (new_l != L[i]) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = new_l; sumlx = slx; suml2 = sl2;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) {
-            break;
-        }
-    }
-    return sumlx / suml2;
-}
-
-static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
-    GGML_ASSERT(quant_weights);
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-    const bool requantize = true;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[16];
-    float mins[QK_K/16];
-    float scales[QK_K/16];
-    float sw[QK_K/16];
-    float weight[16];
-    uint8_t Ls[QK_K/16], Lm[QK_K/16];
-
-    for (int i = 0; i < nb; i++) {
-        memset(sw, 0, QK_K/16*sizeof(float));
-        float sumx2 = 0;
-        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
-        float sigma2 = sumx2/QK_K;
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float * restrict qw = quant_weights + QK_K * i + 16*j;
-            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
-            for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
-            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
-        }
-
-        float dm, mm;
-#if QK_K == 64
-        float max_scale = 0, max_min = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            max_scale = MAX(max_scale, scales[j]);
-            max_min   = MAX(max_min,   mins[j]);
-        }
-        dm = max_scale/15;
-        mm = max_min/15;
-        if (max_scale) {
-            float id = 1/dm;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(id*scales[j]);
-                Ls[j] = MAX(0, MIN(15, l));
-            }
-        } else {
-            memset(Ls, 0, QK_K/16);
-        }
-        if (max_min) {
-            float id = 1/mm;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(id*mins[j]);
-                Lm[j] = MAX(0, MIN(15, l));
-            }
-        } else {
-            memset(Lm, 0, QK_K/16);
-        }
-#else
-        dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
-        mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
-#endif
-        y[i].d    = GGML_FP32_TO_FP16(dm);
-        y[i].dmin = GGML_FP32_TO_FP16(mm);
-        dm        = GGML_FP16_TO_FP32(y[i].d);
-        mm        = GGML_FP16_TO_FP32(y[i].dmin);
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
-        }
-
-        if (requantize) {
-            for (int j = 0; j < QK_K/16; ++j) {
-                const float d = dm * (y[i].scales[j] & 0xF);
-                if (!d) continue;
-                const float m = mm * (y[i].scales[j] >> 4);
-                for (int ii = 0; ii < 16; ++ii) {
-                    int l = nearest_int((x[16*j + ii] + m)/d);
-                    l = MAX(0, MIN(3, l));
-                    L[16*j + ii] = l;
-                }
-            }
-        }
-
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
-            }
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
-    if (!quant_weights) {
-        quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
-    }
-    else {
-        char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
-            quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
-            src += n_per_row;
-            qrow += row_size;
-        }
-    }
-    return nrow * row_size;
-}
-
-//========================= 3-bit (de)-quantization
-
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    int8_t L[QK_K];
-    float scales[QK_K / 16];
-
-    for (int i = 0; i < nb; i++) {
-
-        float max_scale = 0;
-        float amax = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
-            float scale = fabsf(scales[j]);
-            if (scale > amax) {
-                amax = scale; max_scale = scales[j];
-            }
-        }
-
-#if QK_K == 256
-        memset(y[i].scales, 0, 12);
-        if (max_scale) {
-            float iscale = -32.f/max_scale;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int8_t l = nearest_int(iscale*scales[j]);
-                l = MAX(-32, MIN(31, l)) + 32;
-                if (j < 8) {
-                    y[i].scales[j] = l & 0xF;
-                } else {
-                    y[i].scales[j-8] |= ((l & 0xF) << 4);
-                }
-                l >>= 4;
-                y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
-            }
-            y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        } else {
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-
-        int8_t sc;
-        for (int j = 0; j < QK_K/16; ++j) {
-            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
-#else
-        if (max_scale) {
-            float iscale = -8.f/max_scale;
-            for (int j = 0; j < QK_K/16; j+=2) {
-                int l1 = nearest_int(iscale*scales[j]);
-                l1 = 8 + MAX(-8, MIN(7, l1));
-                int l2 = nearest_int(iscale*scales[j+1]);
-                l2 = 8 + MAX(-8, MIN(7, l2));
-                y[i].scales[j/2] = l1 | (l2 << 4);
-            }
-            y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        } else {
-            for (int j = 0; j < QK_K/16; j+=2) {
-                y[i].scales[j/2] = 0;
-            }
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-        for (int j = 0; j < QK_K/16; ++j) {
-            int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
-            float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8);
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
-#endif
-
-        memset(y[i].hmask, 0, QK_K/8);
-        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
-        int m = 0;
-        uint8_t hm = 1;
-        for (int j = 0; j < QK_K; ++j) {
-            if (L[j] > 3) {
-                y[i].hmask[m] |= hm;
-                L[j] -= 4;
-            }
-            if (++m == QK_K/8) {
-                m = 0; hm <<= 1;
-            }
-        }
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
-            }
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
-        }
-#endif
-
-        x += QK_K;
-    }
-}
-
-#if QK_K == 256
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    uint32_t aux[4];
-    const int8_t * scales = (const int8_t*)aux;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        uint8_t m = 1;
-
-        memcpy(aux, x[i].scales, 12);
-        uint32_t tmp = aux[2];
-        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-
-        int is = 0;
-        float dl;
-        for (int n = 0; n < QK_K; n += 128) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-
-                dl = d_all * (scales[is++] - 32);
-                for (int l = 0; l < 16; ++l) {
-                    *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
-                }
-
-                dl = d_all * (scales[is++] - 32);
-                for (int l = 0; l < 16; ++l) {
-                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
-                }
-
-                shift += 2;
-                m <<= 1;
-            }
-            q += 32;
-        }
-
-    }
-}
-#else
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    assert(QK_K == 64);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-
-        const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
-        const float d2 = d_all * ((x[i].scales[0] >>  4) - 8);
-        const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
-        const float d4 = d_all * ((x[i].scales[1] >>  4) - 8);
-
-        for (int l=0; l<8; ++l) {
-            uint8_t h = hm[l];
-            y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
-            y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
-            y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
-            y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
-            y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
-            y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
-            y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
-            y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
-        }
-        y += QK_K;
-    }
-}
-#endif
-
-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
-    quantize_row_q3_K_reference(x, vy, k);
-}
-
-size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
-        quantize_row_q3_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q3_K));
-}
-
-static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
-#if QK_K != 256
-    (void)quant_weights;
-    quantize_row_q3_K_reference(x, y, n_per_row);
-#else
-    assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
-
-    int8_t L[QK_K];
-    float scales[QK_K / 16];
-    float weight[16];
-    float sw[QK_K / 16];
-    int8_t Ls[QK_K / 16];
-
-    for (int i = 0; i < nb; i++) {
-
-        float sumx2 = 0;
-        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
-        float sigma2 = 2*sumx2/QK_K;
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            if (quant_weights) {
-                const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
-                for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
-            } else {
-                for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
-            }
-            float sumw = 0;
-            for (int l = 0; l < 16; ++l) sumw += weight[l];
-            sw[j] = sumw;
-
-            scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
-
-        }
-
-        memset(y[i].scales, 0, 12);
-
-        float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
-        for (int j = 0; j < QK_K/16; ++j) {
-            int l = Ls[j];
-            if (j < 8) {
-                y[i].scales[j] = l & 0xF;
-            } else {
-                y[i].scales[j-8] |= ((l & 0xF) << 4);
-            }
-            l >>= 4;
-            y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
-        }
-        y[i].d = GGML_FP32_TO_FP16(d_block);
-
-        int8_t sc;
-        for (int j = 0; j < QK_K/16; ++j) {
-            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
-
-        memset(y[i].hmask, 0, QK_K/8);
-        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
-        int m = 0;
-        uint8_t hm = 1;
-        for (int j = 0; j < QK_K; ++j) {
-            if (L[j] > 3) {
-                y[i].hmask[m] |= hm;
-                L[j] -= 4;
-            }
-            if (++m == QK_K/8) {
-                m = 0; hm <<= 1;
-            }
-        }
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
-            }
-        }
-
-        x += QK_K;
-    }
-#endif
-}
-
-size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
-    if (!quant_weights) {
-        quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
-    }
-    else {
-        char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
-            quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
-            src += n_per_row;
-            qrow += row_size;
-        }
-    }
-    return nrow * row_size;
-}
-
-// ====================== 4-bit (de)-quantization
-
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[32];
-    float   weights[32];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
-
-    for (int i = 0; i < nb; i++) {
-
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-#if QK_K == 256
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(15, l));
-                L[32*j + ii] = l;
-            }
-        }
-#else
-        const float s_factor = 15.f;
-        float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? s_factor/max_min   : 0.f;
-        int d1 = nearest_int(inv_scale*scales[0]);
-        int m1 = nearest_int(inv_min*mins[0]);
-        int d2 = nearest_int(inv_scale*scales[1]);
-        int m2 = nearest_int(inv_min*mins[1]);
-        y[i].scales[0] = d1 | (m1 << 4);
-        y[i].scales[1] = d2 | (m2 << 4);
-        y[i].d[0] = GGML_FP32_TO_FP16(max_scale/s_factor);
-        y[i].d[1] = GGML_FP32_TO_FP16(max_min/s_factor);
-
-        float sumlx = 0;
-        int   suml2 = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            const uint8_t sd = y[i].scales[j] & 0xF;
-            const uint8_t sm = y[i].scales[j] >>  4;
-            const float d = GGML_FP16_TO_FP32(y[i].d[0]) * sd;
-            if (!d) continue;
-            const float m = GGML_FP16_TO_FP32(y[i].d[1]) * sm;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + m)/d);
-                l = MAX(0, MIN(15, l));
-                L[32*j + ii] = l;
-                sumlx += (x[32*j + ii] + m)*l*sd;
-                suml2 += l*l*sd*sd;
-            }
-        }
-        if (suml2) {
-            y[i].d[0] = GGML_FP32_TO_FP16(sumlx/suml2);
-        }
-#endif
-        uint8_t * q = y[i].qs;
-        for (int j = 0; j < QK_K; j += 64) {
-            for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
-            q += 32;
-        }
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const uint8_t * q = x[i].qs;
-
-#if QK_K == 256
-
-        const float d   = GGML_FP16_TO_FP32(x[i].d);
-        const float min = GGML_FP16_TO_FP32(x[i].dmin);
-
-        int is = 0;
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K; j += 64) {
-            get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-            const float d1 = d * sc; const float m1 = min * m;
-            get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-            const float d2 = d * sc; const float m2 = min * m;
-            for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
-            for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l]  >> 4) - m2;
-            q += 32; is += 2;
-        }
-#else
-        const float dall = GGML_FP16_TO_FP32(x[i].d[0]);
-        const float mall = GGML_FP16_TO_FP32(x[i].d[1]);
-        const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4);
-        const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4);
-        for (int l = 0; l < 32; ++l) {
-            y[l+ 0] = d1 * (q[l] & 0xF) - m1;
-            y[l+32] = d2 * (q[l] >>  4) - m2;
-        }
-        y += QK_K;
-#endif
-
-    }
-}
-
-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_q4_K * restrict y = vy;
-    quantize_row_q4_K_reference(x, y, k);
-}
-
-size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
-        quantize_row_q4_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q4_K));
-}
-
-static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
-#if QK_K != 256
-    (void)quant_weights;
-    quantize_row_q4_K_reference(x, y, n_per_row);
-#else
-    assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[32];
-    uint8_t Ls[QK_K/32];
-    uint8_t Lm[QK_K/32];
-    float   weights[32];
-    float   sw[QK_K/32];
-    float   mins[QK_K/32];
-    float   scales[QK_K/32];
-
-    for (int i = 0; i < nb; i++) {
-
-        float sum_x2 = 0;
-        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
-        float sigma2 = 2*sum_x2/QK_K;
-        float av_x = sqrtf(sigma2);
-
-        for (int j = 0; j < QK_K/32; ++j) {
-            if (quant_weights) {
-                const float * qw = quant_weights + QK_K*i + 32*j;
-                for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
-            } else {
-                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            }
-            float sumw = 0;
-            for (int l = 0; l < 32; ++l) sumw += weights[l];
-            sw[j] = sumw;
-            scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
-        }
-
-        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
-        float m_block = make_qp_quants(QK_K/32, 63, mins,   Lm, sw);
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = Ls[j];
-            uint8_t lm = Lm[j];
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(d_block);
-        y[i].dmin = GGML_FP32_TO_FP16(m_block);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(15, l));
-                L[32*j + ii] = l;
-            }
-        }
-        uint8_t * q = y[i].qs;
-        for (int j = 0; j < QK_K; j += 64) {
-            for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
-            q += 32;
-        }
-
-        x += QK_K;
-
-    }
-#endif
-}
-
-size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
-    if (!quant_weights) {
-        quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
-    }
-    else {
-        char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
-            quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
-            src += n_per_row;
-            qrow += row_size;
-        }
-    }
-    return nrow * row_size;
-}
-
-// ====================== 5-bit (de)-quantization
-
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-#if QK_K == 256
-    uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
-    float weights[32];
-    uint8_t Laux[32];
-#else
-    int8_t L[QK_K];
-    float scales[QK_K/16];
-#endif
-
-    for (int i = 0; i < nb; i++) {
-
-#if QK_K == 256
-
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(31, l));
-                L[32*j + ii] = l;
-            }
-        }
-
-        uint8_t * restrict qh = y[i].qh;
-        uint8_t * restrict ql = y[i].qs;
-        memset(qh, 0, QK_K/8);
-
-        uint8_t m1 = 1, m2 = 2;
-        for (int n = 0; n < QK_K; n += 64) {
-            for (int j = 0; j < 32; ++j) {
-                int l1 = L[n + j];
-                if (l1 > 15) {
-                    l1 -= 16; qh[j] |= m1;
-                }
-                int l2 = L[n + j + 32];
-                if (l2 > 15) {
-                    l2 -= 16; qh[j] |= m2;
-                }
-                ql[j] = l1 | (l2 << 4);
-            }
-            m1 <<= 2; m2 <<= 2;
-            ql += 32;
-        }
-#else
-        float max_scale = 0, amax = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL);
-            float abs_scale = fabsf(scales[j]);
-            if (abs_scale > amax) {
-                amax = abs_scale;
-                max_scale = scales[j];
-            }
-        }
-
-        float iscale = -128.f/max_scale;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int l = nearest_int(iscale*scales[j]);
-            y[i].scales[j] = MAX(-128, MIN(127, l));
-        }
-        y[i].d = GGML_FP32_TO_FP16(1/iscale);
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) continue;
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-16, MIN(15, l));
-                L[16*j + ii] = l + 16;
-            }
-        }
-
-        uint8_t * restrict qh = y[i].qh;
-        uint8_t * restrict ql = y[i].qs;
-        memset(qh, 0, QK_K/8);
-
-        for (int j = 0; j < 32; ++j) {
-            int jm = j%8;
-            int is = j/8;
-            int l1 = L[j];
-            if (l1 > 15) {
-                l1 -= 16; qh[jm] |= (1 << is);
-            }
-            int l2 = L[j + 32];
-            if (l2 > 15) {
-                l2 -= 16; qh[jm] |= (1 << (4 + is));
-            }
-            ql[j] = l1 | (l2 << 4);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const uint8_t * ql = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-
-#if QK_K == 256
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float min = GGML_FP16_TO_FP32(x[i].dmin);
-
-        int is = 0;
-        uint8_t sc, m;
-        uint8_t u1 = 1, u2 = 2;
-        for (int j = 0; j < QK_K; j += 64) {
-            get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-            const float d1 = d * sc; const float m1 = min * m;
-            get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-            const float d2 = d * sc; const float m2 = min * m;
-            for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
-            for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
-            ql += 32; is += 2;
-            u1 <<= 2; u2 <<= 2;
-        }
-#else
-        float d = GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * restrict s = x[i].scales;
-        for (int l = 0; l < 8; ++l) {
-            y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
-            y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
-            y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
-            y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
-            y[l+32] = d * s[2] * ((ql[l+ 0] >>  4) - (qh[l] & 0x10 ? 0 : 16));
-            y[l+40] = d * s[2] * ((ql[l+ 8] >>  4) - (qh[l] & 0x20 ? 0 : 16));
-            y[l+48] = d * s[3] * ((ql[l+16] >>  4) - (qh[l] & 0x40 ? 0 : 16));
-            y[l+56] = d * s[3] * ((ql[l+24] >>  4) - (qh[l] & 0x80 ? 0 : 16));
-        }
-        y += QK_K;
-#endif
-    }
-}
-
-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_q5_K * restrict y = vy;
-    quantize_row_q5_K_reference(x, y, k);
-}
-
-size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
-        quantize_row_q5_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q5_K));
-}
-
-static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
-#if QK_K != 256
-    (void)quant_weights;
-    quantize_row_q5_K_reference(x, y, n_per_row);
-#else
-    assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[32];
-    uint8_t Ls[QK_K/32];
-    uint8_t Lm[QK_K/32];
-    float   mins[QK_K/32];
-    float   scales[QK_K/32];
-    float   sw[QK_K/32];
-    float   weights[32];
-
-    for (int i = 0; i < nb; i++) {
-
-        float sum_x2 = 0;
-        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
-        float sigma2 = 2*sum_x2/QK_K;
-        float av_x = sqrtf(sigma2);
-
-        for (int j = 0; j < QK_K/32; ++j) {
-            if (quant_weights) {
-                const float * qw = quant_weights + QK_K*i + 32*j;
-                for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
-            } else {
-                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            }
-            float sumw = 0;
-            for (int l = 0; l < 32; ++l) sumw += weights[l];
-            sw[j] = sumw;
-
-            scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
-        }
-
-        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
-        float m_block = make_qp_quants(QK_K/32, 63, mins,   Lm, sw);
-
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = Ls[j];
-            uint8_t lm = Lm[j];
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(d_block);
-        y[i].dmin = GGML_FP32_TO_FP16(m_block);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(31, l));
-                L[32*j + ii] = l;
-            }
-        }
-
-        uint8_t * restrict qh = y[i].qh;
-        uint8_t * restrict ql = y[i].qs;
-        memset(qh, 0, QK_K/8);
-
-        uint8_t m1 = 1, m2 = 2;
-        for (int n = 0; n < QK_K; n += 64) {
-            for (int j = 0; j < 32; ++j) {
-                int l1 = L[n + j];
-                if (l1 > 15) {
-                    l1 -= 16; qh[j] |= m1;
-                }
-                int l2 = L[n + j + 32];
-                if (l2 > 15) {
-                    l2 -= 16; qh[j] |= m2;
-                }
-                ql[j] = l1 | (l2 << 4);
-            }
-            m1 <<= 2; m2 <<= 2;
-            ql += 32;
-        }
-
-        x += QK_K;
-
-    }
-#endif
-}
-
-size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
-    if (!quant_weights) {
-        quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
-    }
-    else {
-        char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
-            quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
-            src += n_per_row;
-            qrow += row_size;
-        }
-    }
-    return nrow * row_size;
-}
-
-// ====================== 6-bit (de)-quantization
-
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    int8_t L[QK_K];
-    float   scales[QK_K/16];
-
-    for (int i = 0; i < nb; i++) {
-
-        float max_scale = 0;
-        float max_abs_scale = 0;
-
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-
-            const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
-            scales[ib] = scale;
-
-            const float abs_scale = fabsf(scale);
-            if (abs_scale > max_abs_scale) {
-                max_abs_scale = abs_scale;
-                max_scale = scale;
-            }
-
-        }
-
-        if (!max_abs_scale) {
-            memset(&y[i], 0, sizeof(block_q6_K));
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-            x += QK_K;
-            continue;
-        }
-
-        float iscale = -128.f/max_scale;
-        y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
-        }
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-32, MIN(31, l));
-                L[16*j + ii] = l + 32;
-            }
-        }
-
-        uint8_t * restrict ql = y[i].ql;
-        uint8_t * restrict qh = y[i].qh;
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                const uint8_t q1 = L[j + l +  0] & 0xF;
-                const uint8_t q2 = L[j + l + 32] & 0xF;
-                const uint8_t q3 = L[j + l + 64] & 0xF;
-                const uint8_t q4 = L[j + l + 96] & 0xF;
-                ql[l+ 0] = q1 | (q3 << 4);
-                ql[l+32] = q2 | (q4 << 4);
-                qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
-            }
-            ql += 64;
-            qh += 32;
-        }
-#else
-        for (int l = 0; l < 32; ++l) {
-            const uint8_t q1 = L[l +  0] & 0xF;
-            const uint8_t q2 = L[l + 32] & 0xF;
-            ql[l] = q1 | (q2 << 4);
-        }
-        for (int l = 0; l < 16; ++l) {
-            qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict ql = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict sc = x[i].scales;
-
-#if QK_K == 256
-        for (int n = 0; n < QK_K; n += 128) {
-            for (int l = 0; l < 32; ++l) {
-                int is = l/16;
-                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-                y[l +  0] = d * sc[is + 0] * q1;
-                y[l + 32] = d * sc[is + 2] * q2;
-                y[l + 64] = d * sc[is + 4] * q3;
-                y[l + 96] = d * sc[is + 6] * q4;
-            }
-            y  += 128;
-            ql += 64;
-            qh += 32;
-            sc += 8;
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-            const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-            const int8_t q3 = (int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-            const int8_t q4 = (int8_t)((ql[l+16]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            y[l+ 0] = d * sc[0] * q1;
-            y[l+16] = d * sc[1] * q2;
-            y[l+32] = d * sc[2] * q3;
-            y[l+48] = d * sc[3] * q4;
-        }
-        y  += 64;
-#endif
-
-    }
-}
-
-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_q6_K * restrict y = vy;
-    quantize_row_q6_K_reference(x, y, k);
-}
-
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
-        quantize_row_q6_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q6_K));
-}
-
-static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
-#if QK_K != 256
-    (void)quant_weights;
-    quantize_row_q6_K_reference(x, y, n_per_row);
-#else
-    assert(n_per_row % QK_K == 0);
-    const int nb = n_per_row / QK_K;
-
-    int8_t L[QK_K];
-    float   scales[QK_K/16];
-    //float   weights[16];
-
-    for (int i = 0; i < nb; i++) {
-
-        //float sum_x2 = 0;
-        //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
-        //float sigma2 = sum_x2/QK_K;
-
-        float max_scale = 0;
-        float max_abs_scale = 0;
-
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-
-            float scale;
-            if (quant_weights) {
-                const float * qw = quant_weights + QK_K*i + 16*ib;
-                //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
-                //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
-                scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
-            } else {
-                scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
-            }
-            scales[ib] = scale;
-
-            const float abs_scale = fabsf(scale);
-            if (abs_scale > max_abs_scale) {
-                max_abs_scale = abs_scale;
-                max_scale = scale;
-            }
-
-        }
-
-        if (!max_abs_scale) {
-            memset(&y[i], 0, sizeof(block_q6_K));
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-            x += QK_K;
-            continue;
-        }
-
-        float iscale = -128.f/max_scale;
-        y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
-        }
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-32, MIN(31, l));
-                L[16*j + ii] = l + 32;
-            }
-        }
-
-        uint8_t * restrict ql = y[i].ql;
-        uint8_t * restrict qh = y[i].qh;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                const uint8_t q1 = L[j + l +  0] & 0xF;
-                const uint8_t q2 = L[j + l + 32] & 0xF;
-                const uint8_t q3 = L[j + l + 64] & 0xF;
-                const uint8_t q4 = L[j + l + 96] & 0xF;
-                ql[l+ 0] = q1 | (q3 << 4);
-                ql[l+32] = q2 | (q4 << 4);
-                qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
-            }
-            ql += 64;
-            qh += 32;
-        }
-
-        x += QK_K;
-
-    }
-#endif
-}
-
-size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
-    if (!quant_weights) {
-        quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
-    }
-    else {
-        char * qrow = (char *)dst;
-        for (int row = 0; row < nrow; ++row) {
-            quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
-            src += n_per_row;
-            qrow += row_size;
-        }
-    }
-    return nrow * row_size;
-}
-
-static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
-    static_assert(QK4_0 == 32, "QK4_0 must be 32");
-
-    if (!quant_weights) {
-        quantize_row_q4_0_reference(x, y, n_per_row);
-        return;
-    }
-
-    float weight[QK4_0];
-    int8_t L[QK4_0];
-
-    float sum_x2 = 0;
-    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
-    float sigma2 = sum_x2/n_per_row;
-
-    const int nb = n_per_row/QK4_0;
-    for (int ib = 0; ib < nb; ++ib) {
-        const float * xb = x + QK4_0 * ib;
-        const float * qw = quant_weights + QK4_0 * ib;
-        for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
-        y[ib].d = GGML_FP32_TO_FP16(d);
-        for (int j = 0; j < 16; ++j) {
-            y[ib].qs[j] = L[j] | (L[j+16] << 4);
-        }
-    }
-}
-
-size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    if (!quant_weights) {
-        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
-    }
-    size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += row_size;
-    }
-    return nrow * row_size;
-}
-
-static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
-    static_assert(QK4_1 == 32, "QK4_1 must be 32");
-
-    if (!quant_weights) {
-        quantize_row_q4_1_reference(x, y, n_per_row);
-        return;
-    }
-
-    float weight[QK4_1];
-    uint8_t L[QK4_1], Laux[QK4_1];
-
-    float sum_x2 = 0;
-    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
-    float sigma2 = sum_x2/n_per_row;
-
-    const int nb = n_per_row/QK4_1;
-    for (int ib = 0; ib < nb; ++ib) {
-        const float * xb = x + QK4_1 * ib;
-        const float * qw = quant_weights + QK4_1 * ib;
-        for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float min;
-        float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
-        y[ib].d = GGML_FP32_TO_FP16(d);
-        y[ib].m = GGML_FP32_TO_FP16(-min);
-        for (int j = 0; j < 16; ++j) {
-            y[ib].qs[j] = L[j] | (L[j+16] << 4);
-        }
-    }
-}
-
-size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    if (!quant_weights) {
-        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
-    }
-    size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += row_size;
-    }
-    return nrow * row_size;
-}
-
-static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
-    static_assert(QK5_0 == 32, "QK5_0 must be 32");
-
-    if (!quant_weights) {
-        quantize_row_q5_0_reference(x, y, n_per_row);
-        return;
-    }
-
-    float weight[QK5_0];
-    int8_t L[QK5_0];
-
-    float sum_x2 = 0;
-    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
-    float sigma2 = sum_x2/n_per_row;
-
-    const int nb = n_per_row/QK5_0;
-    for (int ib = 0; ib < nb; ++ib) {
-        const float * xb = x + QK5_0 * ib;
-        const float * qw = quant_weights + QK5_0 * ib;
-        for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
-        y[ib].d = GGML_FP32_TO_FP16(d);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < 16; ++j) {
-            const uint8_t xi0 = L[j];
-            const uint8_t xi1 = L[j+16];
-            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
-        }
-
-        memcpy(&y[ib].qh, &qh, sizeof(qh));
-    }
-}
-
-size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    if (!quant_weights) {
-        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
-    }
-    size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += row_size;
-    }
-    return nrow * row_size;
-}
-
-static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
-    static_assert(QK5_1 == 32, "QK5_1 must be 32");
-
-    if (!quant_weights) {
-        quantize_row_q5_1_reference(x, y, n_per_row);
-        return;
-    }
-
-    float weight[QK5_1];
-    uint8_t L[QK5_1], Laux[QK5_1];
-
-    float sum_x2 = 0;
-    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
-    float sigma2 = sum_x2/n_per_row;
-
-    const int nb = n_per_row/QK5_1;
-    for (int ib = 0; ib < nb; ++ib) {
-        const float * xb = x + QK5_1 * ib;
-        const float * qw = quant_weights + QK5_1 * ib;
-        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float min;
-        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
-        y[ib].d = GGML_FP32_TO_FP16(d);
-        y[ib].m = GGML_FP32_TO_FP16(-min);
-
-        uint32_t qh = 0;
-        for (int j = 0; j < 16; ++j) {
-            const uint8_t xi0 = L[j];
-            const uint8_t xi1 = L[j+16];
-            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
-        }
-        memcpy(&y[ib].qh, &qh, sizeof(qh));
-    }
-}
-
-size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    if (!quant_weights) {
-        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
-    }
-    size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += row_size;
-    }
-    return nrow * row_size;
-}
-
-// ====================== "True" 2-bit (de)-quantization
-
-static const  uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const uint64_t iq2s_grid[1024] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
-    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
-    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
-    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
-    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
-    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
-    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
-    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
-    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
-    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
-    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
-    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
-    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
-    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
-    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
-    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
-    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
-    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
-    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
-    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
-    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
-    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
-    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
-    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
-    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
-    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
-    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
-    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
-    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
-    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
-    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
-    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
-    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
-    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
-    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
-    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
-    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
-    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
-    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
-    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
-    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
-    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
-    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
-    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
-    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
-    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
-    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
-    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
-    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
-    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
-    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
-    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
-    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
-    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
-    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
-    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
-    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
-    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
-    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
-    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
-    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
-    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
-    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
-    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
-    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
-    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
-    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
-    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
-    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
-    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
-    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
-    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
-    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
-    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
-    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
-    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
-    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
-    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
-    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
-    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
-    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
-    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
-    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
-    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
-    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
-    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
-    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
-    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
-    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
-    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
-    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
-    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
-    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
-    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
-    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
-    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
-    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
-    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
-    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
-    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
-    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
-    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
-    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
-    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
-    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
-    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
-    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
-    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
-    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
-    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
-    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
-    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
-    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
-    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
-    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
-    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
-    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
-    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
-    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
-    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
-    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
-    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
-    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
-    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
-    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
-    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
-    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
-    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
-    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
-    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
-    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
-    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
-    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
-    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
-    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
-    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
-    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
-    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
-    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
-    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
-    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
-    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
-    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
-    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
-    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
-    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
-    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
-    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
-    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
-    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
-    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
-    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
-    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
-    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
-    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
-    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
-    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
-    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
-    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
-    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
-    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
-    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
-    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
-    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
-    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
-    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
-    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
-    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
-    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
-    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
-    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
-    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
-    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
-    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
-    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
-    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
-    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
-    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
-    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
-    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
-    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
-    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
-    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
-    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
-    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
-    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
-    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
-    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
-    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
-    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
-    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
-    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
-    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
-    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
-    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
-    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
-    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
-    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
-    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
-    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
-    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
-    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
-    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
-    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
-    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
-    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
-    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
-    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
-    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
-    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
-    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
-    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
-    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
-    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
-    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
-    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
-    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
-    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
-    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
-    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
-    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
-    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
-    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
-    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
-    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
-    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
-    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
-    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
-    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
-    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
-    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
-    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
-    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
-    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
-    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
-    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
-    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
-    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
-    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
-    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
-    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
-    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
-    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const uint32_t iq3xxs_grid[256] = {
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-};
-
-static const uint32_t iq3xs_grid[512] = {
-    0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
-    0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
-    0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
-    0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
-    0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
-    0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
-    0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
-    0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
-    0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
-    0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
-    0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
-    0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
-    0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
-    0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
-    0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
-    0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
-    0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
-    0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
-    0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
-    0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
-    0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
-    0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
-    0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
-    0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
-    0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
-    0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
-    0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
-    0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
-    0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
-    0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
-    0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
-    0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
-    0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
-    0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
-    0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
-    0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
-    0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
-    0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
-    0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
-    0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
-    0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
-    0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
-    0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
-    0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
-    0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
-    0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
-    0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
-    0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
-    0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
-    0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
-    0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
-    0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
-    0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
-    0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
-    0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
-    0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
-    0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
-    0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
-    0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
-    0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
-    0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
-    0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
-    0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
-    0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
-};
-
-#define NGRID_IQ2XXS 512
-static const  uint64_t iq1s_grid[NGRID_IQ2XXS] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
-
-};
-
-static const uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
-            const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-        }
-    }
-}
-
-// ====================== 2.3125 bpw (de)-quantization
-
-void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    float db[2];
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
-            db[1] = d * (0.5f + (x[i].scales[ib32] >>  4)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-        }
-    }
-}
-
-// ====================== 2.5625 bpw (de)-quantization
-
-void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    float db[2];
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
-            db[1] = d * (0.5f + (x[i].scales[ib32] >>  4)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const float dl = db[l/2];
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-            qs += 4;
-            signs += 4;
-        }
-    }
-}
-
-// ====================== 3.0625 bpw (de)-quantization
-
-void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint32_t aux32;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * scales_and_signs = qs + QK_K/4;
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t));
-            const float db = d * (0.5f + (aux32 >> 28)) * 0.5f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]);
-                for (int j = 0; j < 4; ++j) {
-                    y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-                    y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-            qs += 8;
-        }
-    }
-}
-
-// ====================== 3.3125 bpw (de)-quantization
-
-void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = x[i].signs;
-
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f;
-            const float db2 = d * (0.5f + (x[i].scales[ib32/2] >>  4)) * 0.5f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
-                    y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-            qs += 8;
-            signs += 4;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
-                    y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-            qh += 2;
-            qs += 8;
-            signs += 4;
-        }
-    }
-}
-
-// ====================== 1.5625 bpw (de)-quantization
-
-void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    float db[4];
-    uint16_t idx[4];
-    //const int8_t * grid[4];
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const uint8_t * sc = x[i].scales;
-        const uint8_t * qs = x[i].qs;
-
-        for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
-            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
-            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
-            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
-            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
-            //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
-            //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
-            //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
-            //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
-            db[0] = d * (2*(sc[0] & 7) + 1);
-            db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
-            db[2] = d * (2*(sc[1] & 7) + 1);
-            db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
-                for (int j = 0; j < 8; ++j) {
-                    //y[j] = db[l] * grid[l][j];
-                    y[j] = db[l] * grid[j];
-                }
-                y += 8;
-            }
-            qs += 4;
-            sc += 2;
-        }
-    }
-}
-
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-
-void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
-    assert(k % QK4_NL == 0);
-    const int nb = k / QK4_NL;
-
-    for (int i = 0; i < nb; i++) {
-
-        const uint8_t * qs = x[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        for (int j = 0; j < QK4_NL/2; ++j) {
-            y[j+       0] = d * kvalues_iq4nl[qs[j] & 0xf];
-            y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >>  4];
-        }
-        y  += QK4_NL;
-        qs += QK4_NL/2;
-    }
-}
-
-void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-#if QK_K == 64
-    dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
-#else
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const uint8_t * qs = x[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
-            const float dl = d * (ls - 32);
-            for (int j = 0; j < 16; ++j) {
-                y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
-                y[j+16] = dl * kvalues_iq4nl[qs[j] >>  4];
-            }
-            y  += 32;
-            qs += 16;
-        }
-    }
-#endif
-}
-
-//===================================== Q8_K ==============================================
-
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        float max = 0;
-        float amax = 0;
-        for (int j = 0; j < QK_K; ++j) {
-            float ax = fabsf(x[j]);
-            if (ax > amax) {
-                amax = ax; max = x[j];
-            }
-        }
-        if (!amax) {
-            y[i].d = 0;
-            memset(y[i].qs, 0, QK_K);
-            x += QK_K;
-            continue;
-        }
-        //const float iscale = -128.f/max;
-        // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
-        const float iscale = -127.f/max;
-        for (int j = 0; j < QK_K; ++j) {
-            int v = nearest_int(iscale*x[j]);
-            y[i].qs[j] = MIN(127, v);
-        }
-        for (int j = 0; j < QK_K/16; ++j) {
-            int sum = 0;
-            for (int ii = 0; ii < 16; ++ii) {
-                sum += y[i].qs[j*16 + ii];
-            }
-            y[i].bsums[j] = sum;
-        }
-        y[i].d = 1/iscale;
-        x += QK_K;
-    }
-}
-
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-        for (int j = 0; j < QK_K; ++j) {
-            *y++ = x[i].d * x[i].qs[j];
-        }
-    }
-}
-
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q8_K_reference(x, y, k);
-}
-
-//===================================== Dot ptoducts =================================
-
-//
-// Helper functions
-//
-#if __AVX__ || __AVX2__ || __AVX512F__
-
-// shuffles to pick the required scales in dot products
-static inline __m256i get_scale_shuffle_q3k(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
-    };
-    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
-}
-static inline __m256i get_scale_shuffle_k4(int i) {
-    static const uint8_t k_shuffle[256] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
-         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
-         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
-        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
-        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
-    };
-    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
-}
-static inline __m128i get_scale_shuffle(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
-        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
-        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
-    };
-    return _mm_loadu_si128((const __m128i*)k_shuffle + i);
-}
-#endif
-
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = vx + bx;
-
-        const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
-
-        float32x4_t sumv0 = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; i++) {
-            const block_q4_0 * restrict b_x0 = &vx0[i];
-            const block_q4_0 * restrict b_x1 = &vx1[i];
-            const block_q8_0 * restrict b_y0 = &vy0[i];
-            const block_q8_0 * restrict b_y1 = &vy1[i];
-
-            const uint8x16_t m4b = vdupq_n_u8(0x0F);
-            const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
-            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
-
-            // 4-bit -> 8-bit
-            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-            // sub 8
-            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
-            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
-            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
-            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
-
-            // load y
-            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
-            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
-            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
-            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
-
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
-
-            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-
-            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-
-            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-
-            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-
-            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
-        }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
-        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-
-        vst1_f32(s, vget_low_f32(sumv2));
-        vst1_f32(s + bs, vget_high_f32(sumv2));
-        return;
-    }
-#endif
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &x[i + 0];
-        const block_q4_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        // dot product into int32x4_t
-        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
-
-        __m256i qx = bytes_from_nibbles_32(x[i].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        qx = _mm256_sub_epi8( qx, off );
-
-        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
-
-        const __m128i lowMask = _mm_set1_epi8(0xF);
-        const __m128i off = _mm_set1_epi8(8);
-
-        const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
-        by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
-
-        // Apply the scale, and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__SSSE3__)
-    // set constants
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    const __m128i off = _mm_set1_epi8(8);
-
-    // Initialize accumulator with zeros
-    __m128 acc_0 = _mm_setzero_ps();
-    __m128 acc_1 = _mm_setzero_ps();
-    __m128 acc_2 = _mm_setzero_ps();
-    __m128 acc_3 = _mm_setzero_ps();
-
-    // First round without accumulation
-    {
-        _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        acc_0 = _mm_mul_ps( d_0_1, p0 );
-        acc_1 = _mm_mul_ps( d_0_1, p1 );
-        acc_2 = _mm_mul_ps( d_2_3, p2 );
-        acc_3 = _mm_mul_ps( d_2_3, p3 );
-    }
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    // Main loop
-    for (int i = 2; i < nb; i+=2) {
-        _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
-        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
-        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
-        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
-
-        // Acummulate
-        acc_0 = _mm_add_ps(p0_d, acc_0);
-        acc_1 = _mm_add_ps(p1_d, acc_1);
-        acc_2 = _mm_add_ps(p2_d, acc_2);
-        acc_3 = _mm_add_ps(p3_d, acc_3);
-    }
-
-    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        // subtract offset
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[i].qs[j] & 0x0F) - 8;
-            const int v1 = (x[i].qs[j] >>   4) - 8;
-
-            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = vx + bx;
-        const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = vy + by;
-
-        float32x4_t sumv0 = vdupq_n_f32(0.0f);
-        float32x4_t summs0 = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; i++) {
-            const block_q4_1 * restrict b_x0 = &vx0[i];
-            const block_q4_1 * restrict b_x1 = &vx1[i];
-            const block_q8_1 * restrict b_y0 = &vy0[i];
-            const block_q8_1 * restrict b_y1 = &vy1[i];
-
-            float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
-                                   GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
-                                   GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
-                                   GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
-            summs0 += summs_t;
-
-            const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
-            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
-
-            // 4-bit -> 8-bit
-            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-            // load y
-            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
-            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
-            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
-            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
-
-            // mmla into int32x4_t
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
-
-            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-
-            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-
-            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-
-            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
-        }
-
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
-        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-        sumv2 = sumv2 + summs0;
-
-        vst1_f32(s, vget_low_f32(sumv2));
-        vst1_f32(s + bs, vget_high_f32(sumv2));
-        return;
-    }
-#endif
-    // TODO: add WASM SIMD
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs = 0;
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_1 * restrict x0 = &x[i + 0];
-        const block_q4_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i + 0];
-        const block_q8_1 * restrict y1 = &y[i + 1];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        // dot product into int32x4_t
-        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float d0 = GGML_FP16_TO_FP32(x[i].d);
-        const float d1 = y[i].d;
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        const __m256 d0v = _mm256_set1_ps( d0 );
-        const __m256 d1v = _mm256_set1_ps( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i qx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
-
-        // Accumulate d0*d1*x*y
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d0d1, xy, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
-#endif
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[i].qs[j] & 0x0F);
-            const int v1 = (x[i].qs[j] >>   4);
-
-            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(qk == QK5_0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q5_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        // extract the 5th bit via lookup table ((!b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q8_0 * restrict y0 = &y[i];
-
-        const v128_t m4b  = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_1[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
-        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-
-        __m256i qx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        qx = _mm256_or_si256(qx, bxhi);
-
-        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8((char)0xF0);
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-
-        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_andnot_si128(bxhil, mask);
-        bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx_0 = MM256_SET_M128I(bxh, bxl);
-
-        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // These temporary registers are for masking and shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
-
-    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
-    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
-
-    for (int i = 0; i < nb; i++) {
-        memcpy(&qh, x[i].qh, sizeof(uint32_t));
-
-        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-
-        // ((qh & (1u << (j + 16))) >> (j + 12));
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
-        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(qk == QK5_1);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q5_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i];
-        const block_q8_1 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
-        summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
-
-        // extract the 5th bit via lookup table ((b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit
-        const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    float summs = 0.0f;
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
-
-        const v128_t m4b = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit
-        const v128_t v0lf = wasm_v128_or(v0l, qhl);
-        const v128_t v0hf = wasm_v128_or(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv,
-                wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        __m256i qx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        qx = _mm256_or_si256(qx, bxhi);
-
-        const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8(0x10);
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_and_si128(bxhil, mask);
-        bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx_0 = MM256_SET_M128I(bxh, bxl);
-
-        const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
-
-        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // temporary registers for shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
-
-    for (int i = 0; i < nb; i++) {
-        memcpy(&qh, x[i].qh, sizeof(uint32_t));
-
-        // load qh
-        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
-
-        // ((qh >> (j +  0)) << 4) & 0x10;
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
-
-        // ((qh >> (j + 12))     ) & 0x10;
-        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[i].qs[j] >>  4) | xh_1;
-
-            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q8_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = vx + bx;
-        const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
-
-        float32x4_t sumv0 = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; i++) {
-            const block_q8_0 * restrict b_x0 = &vx0[i];
-            const block_q8_0 * restrict b_y0 = &vy0[i];
-
-            const block_q8_0 * restrict b_x1 = &vx1[i];
-            const block_q8_0 * restrict b_y1 = &vy1[i];
-
-            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
-            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
-            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
-            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
-
-            // load y
-            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
-            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
-            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
-            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
-
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
-                             GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
-                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
-                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
-
-            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
-
-            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
-
-            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
-
-            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
-
-            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
-                                                                                       l1, r1)), l2, r2)), l3, r3))), scale);
-        }
-        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
-        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-
-        vst1_f32(s, vget_low_f32(sumv2));
-        vst1_f32(s + bs, vget_high_f32(sumv2));
-        return;
-    }
-#endif
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q8_0 * restrict x0 = &x[i + 0];
-        const block_q8_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const int8x16_t x0_0 = vld1q_s8(x0->qs);
-        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
-        const int8x16_t x1_0 = vld1q_s8(x1->qs);
-        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
-
-        // load y
-        const int8x16_t y0_0 = vld1q_s8(y0->qs);
-        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
-        const int8x16_t y1_0 = vld1q_s8(y1->qs);
-        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
-
-        // Multiply q with scale and accumulate
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-    size_t vl = __riscv_vsetvl_e8m1(qk);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
-        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-
-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
-
-        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
-        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
-
-        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[i].qs[j]*y[i].qs[j];
-        }
-
-        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
-    }
-
-    *s = sumf;
-#endif
-}
-
-#if QK_K == 256
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m3 = vdupq_n_u8(0x3);
-    const uint8x16_t m4 = vdupq_n_u8(0xF);
-
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    ggml_int8x16x2_t q2bytes;
-    uint8_t aux[16];
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint8_t * restrict sc = x[i].scales;
-
-        const uint8x16_t mins_and_scales = vld1q_u8(sc);
-        const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
-        vst1q_u8(aux, scales);
-
-        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
-        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
-                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
-        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
-                                       vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1])));
-        sum += dmin * vaddvq_s32(vaddq_s32(s0, s1));
-
-        int isum = 0;
-        int is = 0;
-
-// We use this macro instead of a function call because for some reason
-// the code runs 2-3% slower, even if the function is declared inline
-#define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
-
-#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
-        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
-        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
-        MULTIPLY_ACCUM_WITH_SCALE((index));
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
-
-            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
-            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
-
-            MULTIPLY_ACCUM_WITH_SCALE(0);
-
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
-
-            is += 8;
-        }
-
-        sum += d * isum;
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m128i m4 = _mm_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
-        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
-        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
-
-        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
-        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
-        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
-            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
-            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
-            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
-
-            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
-            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
-            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
-            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
-
-            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
-            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
-            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
-            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
-
-            p0 = _mm256_add_epi32(p0, p1);
-            p2 = _mm256_add_epi32(p2, p3);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
-        }
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(0x3);
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(0x2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // load mins and scales from block_q2_K.scales[QK_K/16]
-        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
-        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
-        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
-
-        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
-        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
-        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
-
-        // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
-
-        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
-        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
-        const __m128i scales[2] = { scales_0, scales_1 };
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
-            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
-            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
-            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
-            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
-            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-
-            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
-            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
-            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
-            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
-            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
-            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
-            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
-            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
-            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
-
-            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
-            __m128i shuffle = _mm_set1_epi16(0x0100);
-            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
-
-            p0 = _mm_add_epi32(p0, p1);
-            p2 = _mm_add_epi32(p2, p3);
-            p4 = _mm_add_epi32(p4, p5);
-            p6 = _mm_add_epi32(p6, p7);
-
-            // isum in 32bits*4*2
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
-        }
-
-        // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        size_t vl = 16;
-
-        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
-        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
-
-        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
-
-        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
-        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
-        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
-        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
-        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-
-        sumf  += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
-
-        vl = 32;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
-
-        uint8_t is=0;
-        int isum=0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load Q2
-            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
-
-            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
-            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
-            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
-            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
-
-            // duplicate scale elements for product
-            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
-            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
-            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
-            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
-
-            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
-            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
-            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
-            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
-
-            // load Q8
-            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
-            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
-            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
-
-            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
-            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
-            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
-            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
-
-            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
-
-            q2+=32;  q8+=128;  is=8;
-
-        }
-
-        sumf += dall * isum;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
-#endif
-}
-
-#else
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m3 = vdupq_n_u8(0x3);
-
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q2bytes;
-
-    uint32_t aux32[2];
-    const uint8_t * scales = (const uint8_t *)aux32;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-
-        aux32[0] = sc[0] & 0x0f0f0f0f;
-        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
-
-        int isum1 = 0, isum2 = 0;
-
-        const uint8x16_t q2bits = vld1q_u8(q2);
-
-        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
-
-        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
-        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
-        q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
-        q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
-
-        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
-        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
-        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
-
-        sum += d * (isum1 + isum2);
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t ud, um;
-    const uint8_t * restrict db = (const uint8_t *)&ud;
-    const uint8_t * restrict mb = (const uint8_t *)&um;
-
-    float summs = 0;
-
-    // TODO: optimize this
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-        ud = (sc[0] >> 0) & 0x0f0f0f0f;
-        um = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
-        summs += dmin * smin;
-
-        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
-        const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
-
-        const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0));
-        const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1));
-        const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0));
-        const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1));
-
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc);
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc);
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc);
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t ud, um;
-    const uint8_t * restrict db = (const uint8_t *)&ud;
-    const uint8_t * restrict mb = (const uint8_t *)&um;
-
-    float summs = 0;
-
-    // TODO: optimize this
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-        ud = (sc[0] >> 0) & 0x0f0f0f0f;
-        um = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
-        summs += dmin * smin;
-
-        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m128i q2_0 = _mm_and_si128(q2bits, m3);
-        const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-        const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-        const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
-        const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
-        const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
-        const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
-
-        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
-
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __riscv_v_intrinsic
-
-    uint32_t aux32[2];
-    const uint8_t * scales = (const uint8_t *)aux32;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-
-        aux32[0] = sc[0] & 0x0f0f0f0f;
-        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
-
-        int isum1 = 0;
-        int isum2 = 0;
-
-        size_t vl = 16;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        // load Q2
-        vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
-
-        vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
-        vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
-        vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
-        vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
-
-        // load Q8, and take product with Q2
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
-        vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
-        vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
-        vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
-
-        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
-        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
-        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
-        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
-
-        sumf += d * (isum1 + isum2);
-
-    }
-
-    *s = sumf;
-
-#else
-
-    float sumf = 0;
-
-    int isum[QK_K/16];
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memset(isum, 0, (QK_K/16)*sizeof(int));
-        for (int l =  0; l < 16; ++l) {
-            isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
-            isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
-            isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
-            isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
-        }
-        for (int l = 0; l < QK_K/16; ++l) {
-            isum[l] *= (sc[l] & 0xF);
-        }
-        sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
-    }
-    *s = sumf;
-#endif
-}
-#endif
-
-#if QK_K == 256
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    const uint8x16_t m3b = vdupq_n_u8(0x3);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-
-    const uint8x16_t m0 = vdupq_n_u8(1);
-    const uint8x16_t m1 = vshlq_n_u8(m0, 1);
-    const uint8x16_t m2 = vshlq_n_u8(m0, 2);
-    const uint8x16_t m3 = vshlq_n_u8(m0, 3);
-    const int8_t m32 = 32;
-
-    ggml_int8x16x4_t q3bytes;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q3h;
-
-        int32_t isum = 0;
-
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= m32;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
-            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
-            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
-            q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
-            q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1);
-            q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1);
-
-            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0]));
-            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1]));
-            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
-            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
-
-            scale += 4;
-
-            q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
-            q3h.val[1] = vbicq_u8(m2, qhbits.val[1]);
-            q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1);
-            q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1);
-
-            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0]));
-            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1]));
-            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
-            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
-
-            scale += 4;
-
-            if (j == 0) {
-                qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4);
-                qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4);
-            }
-
-        }
-        sum += d * isum;
-
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m256i mone = _mm256_set1_epi8(1);
-    const __m128i m32 = _mm_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t aux[3];
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        __m128i scales128 = _mm_set_epi32(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = _mm_sub_epi8(scales128, m32);
-        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
-        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
-        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
-
-        // high bit
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
-
-        // integer accumulator
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-        int is  = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits
-            const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
-
-            // prepare low and high bits
-            const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
-            const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
-            const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
-            const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
-            const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            // load Q8 quants
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
-            __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
-            __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
-
-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-            // multiply with scales
-            p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
-            p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
-            p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
-            p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
-
-            // accumulate
-            p16_0 = _mm256_add_epi32(p16_0, p16_1);
-            p16_2 = _mm256_add_epi32(p16_2, p16_3);
-            sumi  = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
-
-        }
-
-        // multiply with block scale and accumulate
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i mone = _mm_set1_epi8(1);
-    const __m128i m32 = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    const uint32_t *aux;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // Set up scales
-        aux = (const uint32_t *)x[i].scales;
-        __m128i scales128 = _mm_set_epi32(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = _mm_sub_epi8(scales128, m32);
-        const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
-        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
-        const __m128i scales[2] = { scales_0, scales_1 };
-
-        // high bit *128*2 from block_q3_K.hmask[QK_K/8]
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
-
-        // integer accumulator
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
-            const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
-            const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
-
-            // prepare low and high bits
-            const int bit = j << 2;
-
-            const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
-            const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
-            const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
-            const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
-
-            const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
-            const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
-            const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
-            const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
-
-            const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
-            const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
-            const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
-            const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
-
-            const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
-            const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
-            const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
-            const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
-
-            // load Q8 quants from block_q8_K.qs[QK_K]
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
-            __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
-
-            __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
-            __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
-
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            // multiply with scales
-            __m128i shuffle = _mm_set1_epi16(0x0100);
-            p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
-
-            // accumulate
-            p16_0 = _mm_add_epi32(p16_0, p16_1);
-            p16_2 = _mm_add_epi32(p16_2, p16_3);
-            p16_4 = _mm_add_epi32(p16_4, p16_5);
-            p16_6 = _mm_add_epi32(p16_6, p16_7);
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
-
-        }
-
-        // multiply with block scale and accumulate
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        memcpy(aux, x[i].scales, 12);
-        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= 32;
-
-
-        size_t vl = 32;
-        uint8_t m =  1;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
-
-        int sum_t = 0;
-
-        for (int j = 0; j < QK_K; j += 128) {
-
-            vl = 32;
-
-            // load Q3
-            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
-
-            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
-            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
-            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
-            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
-
-            // compute mask for subtraction
-            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
-            m <<= 1;
-
-            // load Q8 and take product with Q3
-            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
-            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
-            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
-            vl = 16;
-
-            // retrieve lane to multiply with scale
-            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
-            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
-            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
-            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
-            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
-            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
-            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
-            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
-            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
-            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
-
-            sum_t +=  __riscv_vmv_x_s_i32m1_i32(isum3);
-
-            q3 += 32;    q8 += 128;   scale += 8;
-
-        }
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        sumf += d*sum_t;
-
-    }
-
-    *s = sumf;
-
-#else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-
-#else
-
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    const uint8x16_t m3b = vdupq_n_u8(0x3);
-    const uint8x16_t mh  = vdupq_n_u8(4);
-
-    ggml_int8x16x4_t q3bytes;
-
-    uint16_t aux16[2];
-    int8_t * scales = (int8_t *)aux16;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        ggml_uint8x16x4_t q3h;
-
-        const uint8x8_t  hbits    = vld1_u8(x[i].hmask);
-        const uint8x16_t q3bits   = vld1q_u8(x[i].qs);
-        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        for (int j = 0; j < 4; ++j) scales[j] -= 8;
-
-        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
-        q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
-        q3h.val[1] = vandq_u8(mh, htmp);
-        q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2));
-        q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4));
-
-        q3bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b),                q3h.val[0]));
-        q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1]));
-        q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
-        q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6),                q3h.val[3]));
-
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
-
-        sum += d * isum;
-
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m256i m1 = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint64_t aux64;
-
-    uint16_t aux16[2];
-    const int8_t * aux8 = (const int8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
-
-        memcpy(&aux64, x[i].hmask, 8);
-
-        const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
-        __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
-        q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
-        q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
-
-        // load low 2 bits
-        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
-
-        // prepare low and high bits
-        const __m256i q3aux  = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
-        const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
-        const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
-
-        // load Q8 quants
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-        // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-        // and 2 if the high bit was set)
-        const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
-        const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
-
-        __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
-        __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
-
-        p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-
-        // multiply with scales
-        p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-        p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-        p16_0 = _mm256_add_epi32(p16_0, p16_1);
-
-        // multiply with block scale and accumulate
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint64_t aux64;
-
-    uint16_t aux16[2];
-    const int8_t * aux8 = (const int8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
-        const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
-        const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
-        const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
-
-        memcpy(&aux64, x[i].hmask, 8);
-
-        __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
-        __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
-        __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
-        q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
-        q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
-        q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
-        q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
-
-        // load low 2 bits
-        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
-
-        // prepare low and high bits
-        const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
-        const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
-        const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
-        const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
-
-        // load Q8 quants
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
-        // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-        // and 2 if the high bit was set)
-        const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
-        const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
-        const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
-        const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
-
-        __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
-        __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
-        __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
-        __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
-
-        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-
-        // multiply with scales
-        p16_0 = _mm_madd_epi16(scale_0, p16_0);
-        p16_1 = _mm_madd_epi16(scale_1, p16_1);
-        p16_2 = _mm_madd_epi16(scale_2, p16_2);
-        p16_3 = _mm_madd_epi16(scale_3, p16_3);
-
-        p16_0 = _mm_add_epi32(p16_0, p16_2);
-        p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
-
-        // multiply with block scale and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    uint16_t aux16[2];
-    int8_t * scales = (int8_t *)aux16;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        for (int j = 0; j < 4; ++j) scales[j] -= 8;
-
-        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        // load qh
-        vuint8mf4_t qh_x1   = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
-        vuint8mf2_t qh_x2   = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
-
-        size_t vl = 16;
-
-        // extend and combine both qh_x1 and qh_x2
-        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
-
-        vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
-        vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
-        vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
-        vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
-
-        // load Q3
-        vuint8mf2_t q3_x  = __riscv_vle8_v_u8mf2(q3, vl);
-
-        vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
-        vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
-        vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
-        vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
-
-        vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
-        vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
-        vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
-        vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
-
-        // load Q8 and take product with Q3
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
-        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
-        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
-        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
-
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
-
-        sumf += d * isum;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    int32_t scales[4];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-        int8_t * restrict a = aux8;
-        for (int l = 0; l < 8; ++l) {
-            a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4);
-            a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4);
-            a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4);
-            a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4);
-            a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4);
-            a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4);
-            a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4);
-            a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4);
-        }
-
-        scales[0] = (x[i].scales[0] & 0xF) - 8;
-        scales[1] = (x[i].scales[0] >>  4) - 8;
-        scales[2] = (x[i].scales[1] & 0xF) - 8;
-        scales[3] = (x[i].scales[1] >>  4) - 8;
-
-        memset(aux32, 0, 8*sizeof(int32_t));
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l];
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-#endif
-
-#if QK_K == 256
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x2_t q4bytes;
-    ggml_int8x16x2_t q8bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-
-        uint32x2_t mins8 = { 0 };
-        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
-        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
-
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[0] &= kmask1;
-
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        sumf -= dmin * vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        int32_t sumi1 = 0;
-        int32_t sumi2 = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-
-            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
-        }
-
-        sumf += d * (sumi1 + sumi2);
-
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-    __m128 acc_m = _mm_setzero_ps();
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
-
-        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4l = _mm256_and_si256(q4bits, m4);
-            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
-
-            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
-            p16l = _mm256_madd_epi16(scale_l, p16l);
-
-            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
-            p16h = _mm256_madd_epi16(scale_h, p16h);
-            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
-
-            sumi = _mm256_add_epi32(sumi, sumj);
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(0x2);
-
-    __m256 acc = _mm256_setzero_ps();
-    __m128 acc_m = _mm_setzero_ps();
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
-            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
-            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
-            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
-
-            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
-            p16l = _mm_madd_epi16(scale_l, p16l);
-            sumi_0 = _mm_add_epi32(sumi_0, p16l);
-            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
-            p16l = _mm_madd_epi16(scale_l, p16l);
-            sumi_1 = _mm_add_epi32(sumi_1, p16l);
-
-            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
-            p16h = _mm_madd_epi16(scale_h, p16h);
-            sumi_0 = _mm_add_epi32(sumi_0, p16h);
-            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
-            p16h = _mm_madd_epi16(scale_h, p16h);
-            sumi_1 = _mm_add_epi32(sumi_1, p16h);
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
-
-#elif defined __riscv_v_intrinsic
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        size_t vl = 8;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-        vint16mf2_t q8sums   = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf4_t mins8  = __riscv_vle8_v_u8mf4(mins, vl);
-        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-        vint32m1_t  prod   = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        vl = 32;
-
-        int32_t sum_1 = 0;
-        int32_t sum_2 = 0;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q4
-            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
-
-            // load Q8 and multiply it with lower Q4 nibble
-            vint8m1_t  q8_0 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t  q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
-            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
-            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
-
-            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
-
-            // load Q8 and multiply it with upper Q4 nibble
-            vint8m1_t  q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
-            vint8m1_t  q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
-            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
-            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
-
-            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
-
-            q4 += 32;    q8 += 64;
-
-        }
-
-        sumf += d*(sum_1 + sum_2);
-
-    }
-
-    *s = sumf;
-
-#else
-
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-#else
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    float sumf = 0;
-
-    ggml_int8x16x2_t q4bytes;
-    ggml_int8x16x4_t q8bytes;
-
-    float sum_mins = 0.f;
-
-    uint16_t aux16[2];
-    const uint8_t * restrict scales = (const uint8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t * restrict a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
-
-        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
-
-        q8bytes = ggml_vld1q_s8_x4(q8);
-        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-
-        const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-        const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
-
-        q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-        q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-
-        const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
-        const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
-
-        sumf += d * (sumi1 + sumi2);
-    }
-
-    *s = sumf - sum_mins;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    uint16_t aux16[2];
-    const uint8_t * scales = (const uint8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
-        const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
-        const __m256 vd = _mm256_set1_ps(d);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
-        const __m256i q4l = _mm256_and_si256(q4bits, m4);
-        const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
-
-        const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
-        const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
-
-        const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc);
-
-        const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc);
-
-    }
-
-    *s = hsum_float_8(acc) - summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    uint16_t aux16[2];
-    const uint8_t * scales = (const uint8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
-        const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
-        const __m256 vd = _mm256_set1_ps(d);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
-        const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
-        const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
-        const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
-        const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
-        const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
-        const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
-        const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
-        const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
-        const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
-
-        const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
-        const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
-
-        const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
-        const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
-
-    }
-
-    *s = hsum_float_8(acc) - summs;
-
-#elif defined __riscv_v_intrinsic
-
-    uint16_t s16[2];
-    const uint8_t * restrict scales = (const uint8_t *)s16;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
-        s16[0] = b[0] & 0x0f0f;
-        s16[1] = (b[0] >> 4) & 0x0f0f;
-
-        sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
-
-        size_t vl = 32;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        // load Q4
-        vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
-
-        // load Q8 and multiply it with lower Q4 nibble
-        vint8m1_t  q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
-        vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
-        vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
-
-        sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
-
-        // load Q8 and multiply it with upper Q4 nibble
-        vint8m1_t  q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
-        vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-        vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
-
-        sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
-
-    }
-
-    *s = sumf;
-
-#else
-
-    uint8_t aux8[QK_K];
-    int16_t aux16[16];
-    float   sums [8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint16_t s16[2];
-    const uint8_t * restrict scales = (const uint8_t *)s16;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-        uint8_t * restrict a = aux8;
-        for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF;
-        for (int l = 0; l < 32; ++l) a[l+32] = q4[l]  >> 4;
-
-        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
-        s16[0] = b[0] & 0x0f0f;
-        s16[1] = (b[0] >> 4) & 0x0f0f;
-
-        sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
-
-        for (int j = 0; j < QK_K/32; ++j) {
-            for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
-            q8 += 16; a += 16;
-            for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l];
-            q8 += 16; a += 16;
-            const float dl = d * scales[j];
-            for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]);
-        }
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-#endif
-
-#if QK_K == 256
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy,  size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mone = vdupq_n_u8(1);
-    const uint8x16_t mtwo = vdupq_n_u8(2);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        int32_t sumi_mins = vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q5h;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
-            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
-            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
-            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
-            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
-
-            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
-            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
-            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
-            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
-
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * sumi_mins;
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m256i mone  = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-   for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-#if QK_K == 256
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-#else
-        // TODO
-        const float d = 0, dmin = 0;
-#endif
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
-        __m256i hmask = mone;
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
-
-            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
-
-            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m128i mone  = _mm_set1_epi8(1);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
-        __m128i hmask = mone;
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        int bit = 0;
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-
-            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
-            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
-            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            __m128i q5_0  = _mm_add_epi8(q5l_0, q5h_0);
-            __m128i q5_1  = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_0 = _mm_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm_madd_epi16(scale_0, p16_1);
-
-            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
-            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
-            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            q5_0  = _mm_add_epi8(q5l_0, q5h_0);
-            q5_1  = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_2 = _mm_madd_epi16(scale_1, p16_2);
-            p16_3 = _mm_madd_epi16(scale_1, p16_3);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __riscv_v_intrinsic
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-    float sums = 0.0;
-
-    size_t vl;
-
-    for (int i = 0; i < nb; ++i) {
-
-        vl = 8;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-
-        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
-        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        vl = 32;
-        int32_t aux32 = 0;
-        int is = 0;
-
-        uint8_t m = 1;
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q5 and Q8
-            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
-            vint8m1_t  q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t  q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
-
-            // compute mask for addition
-            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
-            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
-            m <<= 1;
-
-            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
-            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
-            m <<= 1;
-
-            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
-            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
-
-            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
-            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
-
-            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
-            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
-
-            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
-            q5 += 32;    q8 += 64;
-
-        }
-
-        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
-        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
-
-    }
-
-    *s = sumf+sums;
-
-#else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#else
-
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mh = vdupq_n_u8(16);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-    ggml_uint8x16x4_t q5h;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * sc = x[i].scales;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint8x8_t qhbits = vld1_u8(qh);
-
-        const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
-        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
-
-        const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
-        q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
-        q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2));
-        q5h.val[2] = vbicq_u8(mh, htmp);
-        q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2));
-
-        q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0]));
-        q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1]));
-        q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
-        q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
-
-        int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
-        int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
-        int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
-        int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
-
-        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i mone  = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
-
-        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
-
-        int64_t aux64;
-        memcpy(&aux64, x[i].qh, 8);
-        const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
-
-        const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
-        const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
-
-        const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-        const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0));
-        const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1));
-        const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0));
-        const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1));
-
-        const __m256i dot = _mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1));
-
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mone  = _mm_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
-
-        const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
-        const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
-        const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
-        const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
-
-        int64_t aux64;
-        memcpy(&aux64, x[i].qh, 8);
-        const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
-
-        const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
-        const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
-        const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
-        const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
-
-        const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
-        const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
-        const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
-        const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
-        const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
-        const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
-        const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
-        const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
-        const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
-        const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
-        const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
-
-        const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
-        const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
-
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * sc = x[i].scales;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        // load qh
-        vuint8mf4_t qh_x1   = __riscv_vle8_v_u8mf4(qh, 8);
-        vuint8mf2_t qh_x2   = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
-
-        size_t vl = 16;
-
-        // combine both qh_1 and qh_2
-        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
-
-        vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
-        vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
-        vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
-        vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
-
-        vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
-        vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
-        vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
-        vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
-
-        // load q5
-        vuint8mf2_t q5_x1  = __riscv_vle8_v_u8mf2(q5, vl);
-        vuint8mf2_t q5_x2  = __riscv_vle8_v_u8mf2(q5+16, vl);
-
-        vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
-        vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
-        vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
-        vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
-
-        vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
-        vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
-        vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
-        vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
-
-        // load Q8 and multiply it with Q5
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
-        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
-        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
-        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
-
-        int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
-        int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
-        int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
-        int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
-
-        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t aux8[QK_K];
-    int16_t aux16[16];
-    float   sums [8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        int8_t * restrict a = aux8;
-        for (int l = 0; l < 32; ++l) {
-            a[l+ 0] = q4[l] & 0xF;
-            a[l+32] = q4[l]  >> 4;
-        }
-        for (int is = 0; is < 8; ++is) {
-            uint8_t m = 1 << is;
-            for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
-        }
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * restrict sc = x[i].scales;
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float dl = d * sc[j];
-            for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l <  8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]);
-            q8 += 16; a += 16;
-        }
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-#endif
-
-
-#if QK_K == 256
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-    //const int8x16_t  m32s = vdupq_n_s8(32);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
-
-        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
-                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
-        int32_t isum_mins = vaddvq_s32(prod);
-
-        int32_t isum = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
-            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
-            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 2);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-            scale += 4;
-
-            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            shifted = vshrq_n_u8(qhbits.val[0], 4);
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[0], 6);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 6);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-            scale += 4;
-        }
-        //sum += isum * d_all * y[i].d;
-        sum += d_all * y[i].d * (isum - 32 * isum_mins);
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
-
-            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
-            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
-
-            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
-            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
-            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
-            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
-
-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
-            p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
-            p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
-
-        }
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m32s = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
-            const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
-
-            const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
-            const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
-            const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
-            const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
-            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
-            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
-
-            const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-
-            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
-            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
-            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
-            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
-            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
-            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
-            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
-            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
-
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
-            __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
-
-            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
-            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
-
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-
-            p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
-            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
-            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
-            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
-            p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
-            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
-
-        }
-
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        size_t vl;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        int sum_t = 0;
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            vl = 32;
-
-            // load qh
-            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
-
-            // load Q6
-            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
-            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
-
-            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
-            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
-            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
-            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
-
-            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
-            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
-            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
-            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
-
-            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
-            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
-            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
-            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
-
-            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
-            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
-            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
-            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
-
-            // load Q8 and take product
-            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
-            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
-            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
-            vl = 16;
-
-            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
-            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
-            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
-            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
-            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
-            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
-            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
-            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
-            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
-            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
-
-            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
-
-            q6 += 64;   qh += 32;   q8 += 128;   is=8;
-
-        }
-
-        sumf += d * sum_t;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#else
-
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int8x16_t  m32s = vdupq_n_s8(32);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        int32_t isum = 0;
-
-        uint8x16_t qhbits = vld1q_u8(qh);
-        ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
-        ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
-
-        q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
-        uint8x16_t shifted = vshrq_n_u8(qhbits, 2);
-        q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-        shifted = vshrq_n_u8(qhbits, 4);
-        q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-        shifted = vshrq_n_u8(qhbits, 6);
-        q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-        q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-        q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-        q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
-        q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
-
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-        sum += isum * d_all * y[i].d;
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
-        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
-        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
-        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
-        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
-
-        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
-        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
-
-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
-
-        const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-        const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-        __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-
-        __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
-        __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
-
-        p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-
-        p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
-        p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
-
-        sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(3);
-    const __m128i m32s = _mm_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
-        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
-        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
-        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
-        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
-
-        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
-        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
-
-        const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
-        const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
-        const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
-        const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
-
-        const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
-        const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
-        const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
-        const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
-        __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
-        __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
-        __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
-
-        __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
-        __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
-        __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
-        __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
-
-        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-
-        p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-        p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
-        p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-        p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
-
-        sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-        sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        int32_t isum = 0;
-
-        size_t vl = 16;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        // load Q6
-        vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
-        vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
-
-        // load qh
-        vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
-
-        vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
-        vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
-        vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
-        vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-
-        vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
-        vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
-        vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
-        vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
-
-        vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
-        vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
-        vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
-        vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
-
-        // load Q8 and take product
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
-        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
-        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
-        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
-
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
-
-        sumf += isum * d_all * y[i].d;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int l = 0; l < 16; ++l) {
-            a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-            a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-            a[l+32] = (int8_t)((q4[l+ 0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-            a[l+48] = (int8_t)((q4[l+16] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-        }
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#endif
-
-#if defined (__AVX2__) || defined (__ARM_NEON)
-static const int8_t keven_signs_q2xs[1024] = {
-     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
-     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
-     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
-     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
-     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
-     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
-     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
-     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
-     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
-     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
-     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
-     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
-     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
-     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
-     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
-     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
-     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
-     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
-     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
-     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
-     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
-     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
-     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
-     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
-     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
-     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
-     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
-     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
-     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
-     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
-     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
-     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-#endif
-
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq2_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >>  7) & 127))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
-        }
-        sumf += d*(sumf1 + sumf2);
-    }
-    *s = 0.25f * sumf;
-
-#elif defined(__AVX2__)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
-                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[1] >> 28;
-            const uint16_t ls2 = aux32[3] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq2_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    int32x4x4_t scales32;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        const uint8x8_t scales8 = vld1_u8(x[i].scales);
-        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
-        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
-        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
-        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
-        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
-        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
-        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
-        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
-        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
-        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
-        int32x4_t sumi = vdupq_n_s32(0);
-        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
-            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
-            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
-            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
-            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
-            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
-            q2 += 8;
-        }
-        sumf += d*vaddvq_s32(sumi);
-    }
-    *s = 0.125f * sumf;
-
-#elif defined(__AVX2__)
-
-    const __m256i mone = _mm256_set1_epi8(1);
-    static const char block_sign_shuffle_mask_1[32] = {
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-    };
-    static const char block_sign_shuffle_mask_2[32] = {
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
-    };
-    static const uint8_t bit_selector_mask_bytes[32] = {
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
-    const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
-    const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
-
-#if QK_K == 64
-    static const uint8_t k_bit_helper[16] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
-    const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper);
-    const __m128i m511 = _mm_set1_epi16(511);
-    typedef union {
-        __m128i vec_index;
-        uint16_t index[8];
-    } index_t;
-
-    index_t idx;
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs);
-        idx.vec_index = _mm_and_si128(q2_data, m511);
-
-        const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9);
-        const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13);
-        const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper);
-
-        const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
-        const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
-        const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
-
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
-        const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
-
-        const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]],
-                                               iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]);
-        const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]],
-                                               iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]);
-
-        __m256i signs;
-        signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1);
-        signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-        const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
-
-        signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2);
-        signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-        const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
-
-        const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-        const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-
-        const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
-        const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
-
-        const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-#else
-
-    static const uint8_t k_bit_helper[32] = {
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
-    };
-    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
-    const __m256i m511 = _mm256_set1_epi16(511);
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    uint64_t aux64;
-
-    // somewhat hacky, but gives a significant boost in performance
-    __m256i aux_gindex;
-    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        __m128i stmp = _mm_set1_epi64x(aux64);
-        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
-        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
-
-            const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2);  q2 += 16;
-            aux_gindex = _mm256_and_si256(q2_data, m511);
-
-            const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
-            const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
-            const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
-
-            const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
-            const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
-
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
-                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
-                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
-            const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
-                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
-            const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
-                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
-
-            const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
-            const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
-            const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
-            const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
-
-            __m256i signs;
-            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
-
-            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
-
-            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
-
-            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
-            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
-            const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
-
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const __m256i dot3  = _mm256_maddubs_epi16(q2_3, q8s_3);
-            const __m256i dot4  = _mm256_maddubs_epi16(q2_4, q8s_4);
-
-            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
-            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
-            const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
-            const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
-
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-#endif
-
-#else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq2_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
-
-    const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
-    const uint8x16_t   mask2 = vld1q_u8(k_mask2);
-    const uint8x16_t m1 = vdupq_n_u8(1);
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    uint8x16x2_t vs;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t  * restrict q8 = y[i].qs;
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
-                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
-            qs += 8;
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
-
-            q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
-            q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
-
-            signs += 4;
-
-            q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
-            q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
-
-            const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
-            const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
-            const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
-            const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
-
-            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
-            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >>  4));
-            sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
-            sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >>  4));
-        }
-        sumf += d*(sumi1 + sumi2);
-    }
-
-    *s = 0.125f * sumf;
-
-#elif defined(__AVX2__)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
-    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
-
-    uint64_t aux64;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
-        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
-                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
-                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
-                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
-                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
-                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
-                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
-            qs += 8;
-
-            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
-
-            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
-
-            signs += 4;
-
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
-
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
-#endif
-
-}
-
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq3_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[2];
-
-    ggml_int8x16x4_t q3s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t   * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
-            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
-            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
-            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
-            q3 += 16;
-            q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >>  7) & 127))));
-            q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
-            q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
-            q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
-            q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
-            q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
-            q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
-        }
-        sumf += d*(sumf1 + sumf2);
-    }
-    *s = 0.5f * sumf;
-
-#elif defined(__AVX2__)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[2];
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
-                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
-            q3 += 8;
-            memcpy(aux32, gas, 8); gas += 8;
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
-                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[0] >> 28;
-            const uint16_t ls2 = aux32[1] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.25f * hsum_float_8(accumf);
-
-#else
-
-    uint32_t aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict gas = x[i].qs + QK_K/4;
-        const int8_t  * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq3_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
-
-    const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1);
-    const uint8x16_t   mask2 = vld1q_u8(k_mask2);
-
-    uint8x16x2_t vs;
-    ggml_int8x16x4_t q3s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t   * restrict q8 = y[i].qs;
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)],
-                                          iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]};
-            const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)],
-                                          iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]};
-            const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)],
-                                          iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]};
-            const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)],
-                                          iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]};
-            qs += 16;
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
-
-            q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0]));
-            q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1]));
-
-            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16)));
-            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
-            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
-            vs.val[0] = vceqq_u8(vs.val[0], mask2);
-            vs.val[1] = vceqq_u8(vs.val[1], mask2);
-
-            signs += 4;
-
-            q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0]));
-            q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1]));
-
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
-            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf));
-            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >>  4));
-        }
-        sumf += d*(sumi1 + sumi2);
-    }
-    *s = 0.25f * sumf;
-
-#elif defined(__AVX2__)
-
-   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-   };
-
-    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    };
-
-    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
-    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)],
-                                                  iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)],
-                                                  iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)],
-                                                  iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)],
-                                                  iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)],
-                                                  iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)],
-                                                  iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)],
-                                                  iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]);
-            qs += 8;
-            const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)],
-                                                  iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)],
-                                                  iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)],
-                                                  iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)],
-                                                  iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)],
-                                                  iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)],
-                                                  iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)],
-                                                  iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]);
-            qs += 8;
-
-            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
-
-            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
-            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
-            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
-            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
-
-            signs += 4;
-
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
-            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.25f * hsum_float_8(accumf);
-
-#else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * restrict qs = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const uint8_t * restrict signs = x[i].signs;
-        const int8_t  * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
-#endif
-}
-
-
-#ifdef __AVX2__
-static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return _mm256_maddubs_epi16(ax, sy);
-}
-#endif
-
-void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_iq1_s * restrict x = vx;
-    const block_q8_K  * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    // TODO: implement for QK_K = 64
-#if defined __ARM_NEON && QK_K == 256
-
-    const uint8x16_t m8 = vdupq_n_u8(0x08);
-    const uint8x16_t m7 = vdupq_n_u8(0x07);
-    const uint8x16_t m1 = vdupq_n_u8(0x01);
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    uint16_t gindex[8];
-    uint16x8x2_t vindex;
-    int8x16x4_t q1b;
-    ggml_int8x16x4_t q8b;
-    uint16x8x4_t scales;
-    int32x4x2_t sumi;
-    int32x4x2_t dotq;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        sumi.val[0] = sumi.val[1] = vzero;
-
-        for (int i128 = 0; i128 < QK_K/128; ++i128) {
-            const uint8x16_t ql = vld1q_u8(qs); qs += 16;
-            const uint8x8_t tm1 = vld1_u8 (sc); sc +=  8;
-            const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
-            const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
-            const uint8x16_t hbit = vandq_u8(qh, m8);
-            vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
-            vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
-            const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
-            scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
-            scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
-
-            for (int l = 0; l < 2; ++l) {
-                vst1q_u16(gindex+0, vindex.val[l]);
-                q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
-                q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
-                q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
-                q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
-                q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-                dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
-                dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
-
-                sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
-                sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
-            }
-        }
-
-        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
-    }
-
-    *s = sumf;
-
-    // TODO: implement for QK_K = 64
-#elif defined __AVX2__ && QK_K == 256
-
-    const __m128i m8 = _mm_set1_epi8(0x08);
-    const __m128i m7 = _mm_set1_epi8(0x07);
-    const __m128i m1 = _mm_set1_epi8(0x01);
-    const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
-    const __m128i shuffle_s[4] = {
-        _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
-        _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
-        _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
-        _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
-    };
-
-    uint64_t aux64;
-
-    typedef union m256i_uint16 {
-        __m256i reg;
-        uint16_t s[16];
-    } m256i_uint16_t;
-
-    m256i_uint16_t v_gindex;
-
-    __m256 accum = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        __m256i sumi = _mm256_setzero_si256();
-        for (int i128 = 0; i128 < QK_K/128; ++i128) {
-            const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
-            memcpy(&aux64, sc, 8); sc += 8;
-            const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
-            const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
-            v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
-            const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
-
-            for (int i32 = 0; i32 < 4; ++i32) {
-                const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-                const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]],
-                                                      iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]);
-                const __m256i dot = mul_add_epi8(q1b, q8b);
-                const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
-                const __m256i p   = _mm256_madd_epi16(s16, dot);
-                sumi = _mm256_add_epi32(sumi, p);
-            }
-
-        }
-
-        accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
-
-    }
-
-    *s = hsum_float_8(accum);
-
-#else
-
-    int db[4];
-    uint16_t idx[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const int8_t  * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int sumi = 0;
-        for (int i32 = 0; i32 < QK_K/32; ++i32) {
-            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
-            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
-            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
-            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
-            db[0] = (2*(sc[0] & 7) + 1);
-            db[1] = (2*((sc[0] >> 4) & 7) + 1);
-            db[2] = (2*(sc[1] & 7) + 1);
-            db[3] = (2*((sc[1] >> 4) & 7) + 1);
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
-                int suml = 0;
-                for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
-                sumi += db[l] * suml;
-                q8 += 8;
-            }
-            qs += 4;
-            sc += 2;
-        }
-
-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
-    }
-
-    *s = sumf;
-
-#endif
-}
-
-void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK4_NL == 0);
-    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
-
-    const block_iq4_nl * restrict x = vx;
-    const block_q8_0   * restrict y = vy;
-
-    const int nb = n / QK4_NL;
-
-#if defined __ARM_NEON
-    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
-    const uint8x16_t m4b = vdupq_n_u8(0x0f);
-    uint8x16x2_t q4bits;
-    int8x16x4_t q4b;
-    int8x16x4_t q8b;
-    int32x4_t prod_1, prod_2;
-
-    float sumf = 0;
-
-    for (int ib = 0; ib < nb; ib += 2) {
-
-        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
-        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
-        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
-        q8b.val[1]    = vld1q_s8(y[ib+0].qs + 16);
-        q8b.val[2]    = vld1q_s8(y[ib+1].qs);
-        q8b.val[3]    = vld1q_s8(y[ib+1].qs + 16);
-
-        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
-        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
-        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
-
-        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
-        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
-
-        sumf +=
-            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
-            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m256i mone = _mm256_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
-    for (int ib = 0; ib < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
-        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
-        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
-        const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-        const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
-        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
-        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
-        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
-                _mm256_cvtepi32_ps(p_1), accum1);
-        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
-                _mm256_cvtepi32_ps(p_2), accum2);
-
-        y += 2;
-        x += 2;
-    }
-
-    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
-
-#else
-    float sumf = 0;
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
-        int sumi1 = 0, sumi2 = 0;
-        for (int j = 0; j < QK4_NL/2; ++j) {
-            sumi1 += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-    assert(n % QK_K == 0);
-#if QK_K == 64
-    ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
-#else
-
-    const block_iq4_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined __ARM_NEON
-    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
-    const uint8x16_t m4b = vdupq_n_u8(0x0f);
-    ggml_uint8x16x2_t q4bits;
-    ggml_int8x16x4_t q4b;
-    ggml_int8x16x4_t q8b;
-    int32x4_t prod_1, prod_2;
-
-    float sumf = 0;
-
-    for (int ibl = 0; ibl < nb; ++ibl) {
-
-        const int8_t  * q8 = y[ibl].qs;
-        const uint8_t * q4 = x[ibl].qs;
-        uint16_t h = x[ibl].scales_h;
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib = 0; ib < QK_K/64; ++ib) {
-
-            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
-            q8b    = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
-            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
-            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
-
-            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
-            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
-
-            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
-            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;
-            h >>= 4;
-            sumi1 += vaddvq_s32(prod_1) * ls1;
-            sumi2 += vaddvq_s32(prod_2) * ls2;
-
-        }
-
-        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
-    const __m128i m4b  = _mm_set1_epi8(0x0f);
-
-    __m256 accum = _mm256_setzero_ps();
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        uint16_t sh = x[ibl].scales_h;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
-            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
-            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-            const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
-            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
-            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
-            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
-            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
-            sh >>= 4;
-            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
-            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
-            sumi1 = _mm256_add_epi32(p_1, sumi1);
-            sumi2 = _mm256_add_epi32(p_2, sumi2);
-        }
-        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
-                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
-    }
-
-    *s = hsum_float_8(accum);
-
-#else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t  * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
-#endif
-#endif
-}
-
-// ================================ IQ2 quantization =============================================
-
-typedef struct {
-    uint64_t * grid;
-    int      * map;
-    uint16_t * neighbours;
-} iq2_entry_t;
-
-static iq2_entry_t iq2_data[4] = {
-    {NULL, NULL, NULL},
-    {NULL, NULL, NULL},
-    {NULL, NULL, NULL},
-    {NULL, NULL, NULL},
-};
-
-static inline int iq2_data_index(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
-    return type == GGML_TYPE_IQ2_XXS ? 0 :
-           type == GGML_TYPE_IQ2_XS  ? 1 :
-           type == GGML_TYPE_IQ1_S   ? 2 : 3;
-}
-
-static inline int iq2_grid_size(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
-    return type == GGML_TYPE_IQ2_XXS ? 256 :
-           type == GGML_TYPE_IQ2_XS  ? 512 :
-           type == GGML_TYPE_IQ1_S   ? 512 : 1024;
-}
-
-static int iq2_compare_func(const void * left, const void * right) {
-    const int * l = (const int *)left;
-    const int * r = (const int *)right;
-    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
-}
-
-void iq2xs_init_impl(enum ggml_type type) {
-    const int gindex = iq2_data_index(type);
-    const int grid_size = iq2_grid_size(type);
-    if (iq2_data[gindex].grid) {
-        return;
-    }
-    static const uint16_t kgrid_2bit_256[256] = {
-            0,     2,     5,     8,    10,    17,    20,    32,    34,    40,    42,    65,    68,    80,    88,    97,
-          100,   128,   130,   138,   162,   257,   260,   272,   277,   320,   388,   408,   512,   514,   546,   642,
-         1025,  1028,  1040,  1057,  1060,  1088,  1090,  1096,  1120,  1153,  1156,  1168,  1188,  1280,  1282,  1288,
-         1312,  1350,  1385,  1408,  1425,  1545,  1552,  1600,  1668,  1700,  2048,  2053,  2056,  2068,  2088,  2113,
-         2116,  2128,  2130,  2184,  2308,  2368,  2562,  2580,  4097,  4100,  4112,  4129,  4160,  4192,  4228,  4240,
-         4245,  4352,  4360,  4384,  4432,  4442,  4480,  4644,  4677,  5120,  5128,  5152,  5157,  5193,  5248,  5400,
-         5474,  5632,  5654,  6145,  6148,  6160,  6208,  6273,  6400,  6405,  6560,  6737,  8192,  8194,  8202,  8260,
-         8289,  8320,  8322,  8489,  8520,  8704,  8706,  9217,  9220,  9232,  9280,  9302,  9472,  9537,  9572,  9872,
-        10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
-        16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
-        17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
-        20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
-        22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
-        25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
-        33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
-        37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
-    };
-    static const uint16_t kgrid_2bit_512[512] = {
-            0,     2,     5,     8,    10,    17,    20,    22,    25,    32,    34,    37,    40,    65,    68,    70,
-           73,    80,    82,    85,    88,    97,   100,   128,   130,   133,   136,   145,   148,   153,   160,   257,
-          260,   262,   265,   272,   274,   277,   280,   282,   289,   292,   320,   322,   325,   328,   337,   340,
-          352,   360,   385,   388,   400,   512,   514,   517,   520,   529,   532,   544,   577,   580,   592,   597,
-          640,   650,  1025,  1028,  1030,  1033,  1040,  1042,  1045,  1048,  1057,  1060,  1088,  1090,  1093,  1096,
-         1105,  1108,  1110,  1120,  1153,  1156,  1168,  1280,  1282,  1285,  1288,  1297,  1300,  1312,  1345,  1348,
-         1360,  1377,  1408,  1537,  1540,  1552,  1574,  1600,  1602,  1668,  2048,  2050,  2053,  2056,  2058,  2065,
-         2068,  2080,  2085,  2113,  2116,  2128,  2136,  2176,  2208,  2218,  2305,  2308,  2320,  2368,  2433,  2441,
-         2560,  2592,  2600,  2710,  2720,  4097,  4100,  4102,  4105,  4112,  4114,  4117,  4120,  4129,  4132,  4160,
-         4162,  4165,  4168,  4177,  4180,  4192,  4202,  4225,  4228,  4240,  4352,  4354,  4357,  4360,  4369,  4372,
-         4384,  4417,  4420,  4432,  4480,  4500,  4502,  4609,  4612,  4614,  4624,  4672,  4704,  5120,  5122,  5125,
-         5128,  5137,  5140,  5152,  5185,  5188,  5193,  5200,  5220,  5248,  5377,  5380,  5392,  5440,  5632,  5652,
-         5705,  6145,  6148,  6160,  6162,  6208,  6228,  6278,  6400,  6405,  6502,  6737,  6825,  8192,  8194,  8197,
-         8200,  8202,  8209,  8212,  8224,  8257,  8260,  8272,  8320,  8352,  8449,  8452,  8464,  8512,  8520,  8549,
-         8704,  8738,  8832,  8872,  9217,  9220,  9232,  9257,  9280,  9472,  9537,  9554,  9625,  9729,  9754,  9894,
-        10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
-        16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
-        16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
-        16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
-        17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
-        18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
-        20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
-        21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
-        22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
-        24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
-        32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
-        33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
-        33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
-        35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
-        37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
-        40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
-        42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
-    };
-    static const uint16_t kgrid_1bit_512[512] = {
-           10,    33,    41,    85,   132,   134,   160,   162,   277,   337,   340,   345,   357,   405,   516,   545,
-          553,   598,   641,   650,   681,  1042,  1044,  1097,  1169,  1176,  1320,  1345,  1365,  1378,  1434,  1444,
-         1545,  1617,  1642,  1685,  2053,  2080,  2089,  2133,  2176,  2182,  2208,  2214,  2306,  2384,  2393,  2440,
-         2453,  2581,  2664,  2690,  2721,  4117,  4161,  4182,  4184,  4261,  4357,  4369,  4372,  4377,  4390,  4422,
-         4432,  4437,  4449,  4457,  4485,  4497,  4505,  4629,  4677,  4696,  4774,  5205,  5217,  5225,  5386,  5397,
-         5409,  5445,  5457,  5460,  5461,  5462,  5465,  5472,  5477,  5525,  5545,  5650,  5668,  5717,  5729,  5769,
-         5777,  6212,  6234,  6244,  6293,  6424,  6482,  6485,  6502,  6505,  6529,  6538,  6565,  6656,  6682,  6788,
-         6806,  6820,  8218,  8224,  8226,  8232,  8277,  8326,  8354,  8469,  8521,  8530,  8549,  8596,  8737,  8794,
-         9221,  9253,  9348,  9369,  9380,  9474,  9557,  9633,  9732,  9753,  9793,  9830,  9862,  9880, 10240, 10272,
-        10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
-        16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
-        17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
-        18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
-        20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
-        20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
-        21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
-        21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
-        21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
-        22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
-        23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
-        25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
-        25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
-        26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
-        32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
-        33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
-        34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
-        37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
-        38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
-        38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
-        39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
-        41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
-        42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
-    };
-    static const uint16_t kgrid_2bit_1024[1024] = {
-            0,     2,     5,     8,    10,    17,    20,    22,    25,    32,    34,    37,    40,    65,    68,    70,
-           73,    80,    82,    85,    88,    97,   100,   102,   105,   128,   130,   133,   136,   145,   148,   160,
-          165,   170,   257,   260,   262,   265,   272,   274,   277,   280,   289,   292,   320,   322,   325,   328,
-          337,   340,   342,   345,   352,   357,   360,   385,   388,   400,   402,   405,   417,   420,   512,   514,
-          517,   520,   529,   532,   544,   554,   577,   580,   582,   585,   592,   597,   640,   645,   650,   660,
-          674,  1025,  1028,  1030,  1033,  1040,  1042,  1045,  1048,  1057,  1060,  1062,  1065,  1088,  1090,  1093,
-         1096,  1098,  1105,  1108,  1110,  1113,  1120,  1122,  1125,  1153,  1156,  1158,  1161,  1168,  1173,  1176,
-         1185,  1188,  1280,  1282,  1285,  1288,  1290,  1297,  1300,  1302,  1305,  1312,  1317,  1320,  1345,  1348,
-         1350,  1353,  1360,  1362,  1365,  1368,  1377,  1380,  1408,  1410,  1413,  1416,  1425,  1428,  1440,  1537,
-         1540,  1542,  1545,  1552,  1557,  1600,  1605,  1608,  1617,  1620,  1632,  1665,  1668,  1680,  2048,  2050,
-         2053,  2056,  2065,  2068,  2070,  2073,  2080,  2085,  2090,  2113,  2116,  2118,  2121,  2128,  2130,  2133,
-         2136,  2145,  2148,  2176,  2181,  2196,  2218,  2305,  2308,  2320,  2322,  2325,  2328,  2337,  2368,  2373,
-         2376,  2385,  2388,  2400,  2433,  2448,  2560,  2577,  2580,  2594,  2600,  2602,  2640,  2713,  4097,  4100,
-         4102,  4105,  4112,  4114,  4117,  4120,  4129,  4132,  4134,  4160,  4162,  4165,  4168,  4177,  4180,  4182,
-         4185,  4192,  4194,  4197,  4200,  4225,  4228,  4230,  4240,  4245,  4248,  4257,  4260,  4352,  4354,  4357,
-         4360,  4362,  4369,  4372,  4374,  4377,  4384,  4386,  4389,  4392,  4417,  4420,  4422,  4425,  4432,  4434,
-         4437,  4440,  4449,  4452,  4480,  4482,  4485,  4488,  4497,  4500,  4609,  4612,  4617,  4624,  4629,  4641,
-         4644,  4672,  4677,  4689,  4692,  4737,  4740,  4752,  5120,  5122,  5125,  5128,  5137,  5140,  5142,  5145,
-         5152,  5157,  5160,  5185,  5188,  5190,  5193,  5200,  5202,  5205,  5208,  5217,  5220,  5248,  5250,  5253,
-         5256,  5265,  5268,  5280,  5377,  5380,  5382,  5385,  5392,  5394,  5397,  5400,  5409,  5412,  5440,  5442,
-         5445,  5448,  5457,  5460,  5472,  5505,  5508,  5520,  5632,  5637,  5640,  5649,  5652,  5664,  5697,  5700,
-         5712,  5760,  5802,  6145,  6148,  6150,  6153,  6160,  6165,  6168,  6177,  6208,  6210,  6213,  6216,  6225,
-         6228,  6240,  6273,  6276,  6400,  6402,  6405,  6408,  6417,  6420,  6432,  6465,  6468,  6480,  6505,  6562,
-         6660,  6672,  6720,  6742,  8192,  8194,  8197,  8200,  8209,  8212,  8214,  8217,  8224,  8229,  8234,  8257,
-         8260,  8272,  8274,  8277,  8292,  8320,  8330,  8340,  8362,  8449,  8452,  8464,  8466,  8469,  8481,  8512,
-         8514,  8517,  8529,  8532,  8544,  8577,  8580,  8592,  8704,  8714,  8738,  8744,  8746,  8772,  8784,  8840,
-         8842,  8872,  9217,  9220,  9222,  9225,  9232,  9237,  9240,  9249,  9252,  9280,  9282,  9285,  9288,  9297,
-         9300,  9312,  9345,  9348,  9360,  9472,  9477,  9480,  9489,  9492,  9504,  9537,  9540,  9552,  9574,  9600,
-         9729,  9732,  9744,  9792,  9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
-        10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
-        16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
-        16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
-        16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
-        16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
-        17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
-        17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
-        17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
-        17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
-        18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
-        18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
-        18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
-        20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
-        20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
-        20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
-        21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
-        21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
-        22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
-        22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
-        24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
-        24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
-        25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
-        26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
-        32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
-        33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
-        33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
-        33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
-        34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
-        35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
-        36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
-        37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
-        38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
-        39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
-        41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
-        42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
-    };
-
-    const int kmap_size = 43692;
-    //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
-    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
-    const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
-                             type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
-                             type == GGML_TYPE_IQ1_S   ? kgrid_1bit_512 : kgrid_2bit_1024;
-    uint64_t * kgrid_q2xs;
-    int      * kmap_q2xs;
-    uint16_t * kneighbors_q2xs;
-
-    printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
-    uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
-    for (int k = 0; k < grid_size; ++k) {
-        int8_t * pos = (int8_t *)(the_grid + k);
-        for (int i = 0; i < 8; ++i) {
-            int l = (kgrid[k] >> 2*i) & 0x3;
-            pos[i] = 2*l + 1;
-        }
-    }
-    kgrid_q2xs = the_grid;
-    iq2_data[gindex].grid = the_grid;
-    kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
-    iq2_data[gindex].map = kmap_q2xs;
-    for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
-    uint64_t aux64;
-    uint8_t * aux8 = (uint8_t *)&aux64;
-    for (int i = 0; i < grid_size; ++i) {
-        aux64 = kgrid_q2xs[i];
-        uint16_t index = 0;
-        for (int k=0; k<8; ++k) {
-            uint16_t q = (aux8[k] - 1)/2;
-            index |= (q << 2*k);
-        }
-        kmap_q2xs[index] = i;
-    }
-    int8_t pos[8];
-    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
-    int num_neighbors = 0, num_not_in_map = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q2xs[i] >= 0) continue;
-        ++num_not_in_map;
-        for (int k = 0; k < 8; ++k) {
-            int l = (i >> 2*k) & 0x3;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
-        int n = 0; int d2 = dist2[0];
-        int nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            ++n;
-        }
-        num_neighbors += n;
-    }
-    printf("%s: %d neighbours in total\n", __func__, num_neighbors);
-    kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
-    iq2_data[gindex].neighbours = kneighbors_q2xs;
-    int counter = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q2xs[i] >= 0) continue;
-        for (int k = 0; k < 8; ++k) {
-            int l = (i >> 2*k) & 0x3;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
-        kmap_q2xs[i] = -(counter + 1);
-        int d2 = dist2[0];
-        uint16_t * start = &kneighbors_q2xs[counter++];
-        int n = 0, nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            kneighbors_q2xs[counter++] = dist2[2*j+1];
-            ++n;
-        }
-        *start = n;
-    }
-    free(dist2);
-}
-
-void iq2xs_free_impl(enum ggml_type type) {
-    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S);
-    const int gindex = iq2_data_index(type);
-    if (iq2_data[gindex].grid) {
-        free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
-        free(iq2_data[gindex].map);        iq2_data[gindex].map  = NULL;
-        free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
-    }
-}
-
-static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
-        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
-    int num_neighbors = neighbours[0];
-    GGML_ASSERT(num_neighbors > 0);
-    float best_d2 = FLT_MAX;
-    int grid_index = -1;
-    for (int j = 1; j <= num_neighbors; ++j) {
-        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
-        float d2 = 0;
-        for (int i = 0; i < 8; ++i) {
-            float q = pg[i];
-            float diff = scale*q - xval[i];
-            d2 += weight[i]*diff*diff;
-        }
-        if (d2 < best_d2) {
-            best_d2 = d2; grid_index = neighbours[j];
-        }
-    }
-    GGML_ASSERT(grid_index >= 0);
-    const int8_t * pg = (const int8_t *)(grid + grid_index);
-    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
-    return grid_index;
-}
-
-static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
-
-    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
-
-    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
-    const int      * kmap_q2xs       = iq2_data[gindex].map;
-    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
-
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
-    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(n%QK_K == 0);
-
-    const int kMaxQ = 3;
-
-    const int nbl = n/QK_K;
-
-    block_iq2_xxs * y = vy;
-
-    float scales[QK_K/32];
-    float weight[32];
-    float xval[32];
-    int8_t L[32];
-    int8_t Laux[32];
-    float  waux[32];
-    uint8_t block_signs[4];
-    uint32_t q2[2*(QK_K/32)];
-
-    for (int ibl = 0; ibl < nbl; ++ibl) {
-
-        y[ibl].d = GGML_FP32_TO_FP16(0.f);
-        memset(q2, 0, QK_K/4);
-
-        float max_scale = 0;
-
-        const float * xbl = x + QK_K*ibl;
-        float sumx2 = 0;
-        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
-        float sigma2 = sumx2/QK_K;
-
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const float * xb = xbl + 32*ib;
-            const float * qw = quant_weights + QK_K*ibl + 32*ib;
-            for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
-            for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
-            for (int k = 0; k < 4; ++k) {
-                int nflip = 0;
-                uint8_t s = 0;
-                for (int i = 0; i < 8; ++i) {
-                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
-                    else {
-                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
-                    }
-                }
-                if (nflip%2) {
-                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
-                    for (int i = 1; i < 8; ++i) {
-                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
-                        if (ax < min) {
-                            min = ax; imin = i;
-                        }
-                    }
-                    xval[8*k+imin] = -xval[8*k+imin];
-                    s ^= (1 << imin);
-                }
-                block_signs[k] = s & 127;
-            }
-            float max = xval[0];
-            for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
-            if (!max) {
-                scales[ib] = 0;
-                memset(L, 0, 32);
-                continue;
-            }
-            float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
-            float eff_max = scale*kMaxQ;
-            float best = 0;
-            for (int is = -6; is <= 6; ++is) {
-                float id = (2*kMaxQ-1+is*0.1f)/eff_max;
-                float this_scale = 1/id;
-                for (int k = 0; k < 4; ++k) {
-                    for (int i = 0; i < 8; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
-                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
-                    }
-                    uint16_t u = 0;
-                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
-                    int grid_index = kmap_q2xs[u];
-                    if (grid_index < 0) {
-                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 32; ++i) {
-                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                    scale = sumqx/sumq2; best = scale*sumqx;
-                    memcpy(L, Laux, 32);
-                }
-            }
-            if (scale > 0) {
-                float id = 1/scale;
-                for (int k = 0; k < 4; ++k) {
-                    uint16_t u = 0;
-                    for (int i = 0; i < 8; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
-                        l = MAX(0, MIN(kMaxQ-1, l));
-                        u |= (l << 2*i);
-                    }
-                    int grid_index = kmap_q2xs[u];
-                    if (grid_index < 0) {
-                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
-                    }
-                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
-                    for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 32; ++i) {
-                    float w = weight[i];
-                    float q = 2*L[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0) scale = sumqx/sumq2;
-            }
-            if (scale < 0) {
-                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
-                // and correspondingly flip quant signs.
-                scale = -scale;
-                for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
-            }
-            for (int k = 0; k < 4; ++k) {
-                uint16_t u = 0;
-                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
-                int grid_index = kmap_q2xs[u];
-                if (grid_index < 0) {
-                    printf("Oops: found point %u not on grid:", u);
-                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
-                    printf("\n");
-                    GGML_ASSERT(false);
-                }
-                q2[2*ib+0] |= (grid_index << 8*k);
-                q2[2*ib+1] |= (block_signs[k] << 7*k);
-            }
-            GGML_ASSERT(scale >= 0);
-            scales[ib] = scale;
-            max_scale = MAX(max_scale, scale);
-        }
-
-        if (!max_scale) {
-            memset(y[ibl].qs, 0, QK_K/4);
-            continue;
-        }
-
-        float d = max_scale/31;
-        y[ibl].d = GGML_FP32_TO_FP16(d);
-        float id = 1/d;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            int l = nearest_int(0.5f*(id*scales[ib]-1));
-            l = MAX(0, MIN(15, l));
-            q2[2*ib+1] |= ((uint32_t)l << 28);
-        }
-        memcpy(y[ibl].qs, q2, QK_K/4);
-    }
-}
-
-static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
-
-    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
-
-    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
-    const int      * kmap_q2xs       = iq2_data[gindex].map;
-    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
-
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
-    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(n%QK_K == 0);
-
-    const int kMaxQ = 3;
-
-    const int nbl = n/QK_K;
-
-    block_iq2_xs * y = vy;
-
-    float scales[QK_K/16];
-    float weight[16];
-    float xval[16];
-    int8_t L[16];
-    int8_t Laux[16];
-    float  waux[16];
-    bool   is_on_grid[2];
-    bool   is_on_grid_aux[2];
-    uint8_t block_signs[2];
-    uint16_t q2[2*(QK_K/16)];
-
-    for (int ibl = 0; ibl < nbl; ++ibl) {
-
-        y[ibl].d = GGML_FP32_TO_FP16(0.f);
-        memset(q2, 0, QK_K/4);
-        memset(y[ibl].scales, 0, QK_K/32);
-
-        float max_scale = 0;
-
-        const float * xbl = x + QK_K*ibl;
-        float sumx2 = 0;
-        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
-        float sigma2 = sumx2/QK_K;
-
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            const float * xb = xbl + 16*ib;
-            const float * qw = quant_weights + QK_K*ibl + 16*ib;
-            for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
-            for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
-            for (int k = 0; k < 2; ++k) {
-                int nflip = 0;
-                uint8_t s = 0;
-                for (int i = 0; i < 8; ++i) {
-                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
-                    else {
-                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
-                    }
-                }
-                if (nflip%2) {
-                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
-                    for (int i = 1; i < 8; ++i) {
-                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
-                        if (ax < min) {
-                            min = ax; imin = i;
-                        }
-                    }
-                    xval[8*k+imin] = -xval[8*k+imin];
-                    s ^= (1 << imin);
-                }
-                block_signs[k] = s & 127;
-            }
-            float max = xval[0];
-            for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
-            if (!max) {
-                scales[ib] = 0;
-                memset(L, 0, 16);
-                continue;
-            }
-            float best = 0;
-            float scale = max/(2*kMaxQ-1);
-            is_on_grid[0] = is_on_grid[1] = true;
-            for (int is = -9; is <= 9; ++is) {
-                float id = (2*kMaxQ-1+is*0.1f)/max;
-                float this_scale = 1/id;
-                for (int k = 0; k < 2; ++k) {
-                    for (int i = 0; i < 8; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
-                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
-                    }
-                    uint16_t u = 0;
-                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
-                    int grid_index = kmap_q2xs[u];
-                    is_on_grid_aux[k] = true;
-                    if (grid_index < 0) {
-                        is_on_grid_aux[k] = false;
-                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 16; ++i) {
-                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                    scale = sumqx/sumq2; best = scale*sumqx;
-                    for (int i = 0; i < 16; ++i) L[i] = Laux[i];
-                    for (int k = 0; k <  2; ++k) is_on_grid[k] = is_on_grid_aux[k];
-                }
-            }
-            int n_not_ongrid = 0;
-            for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
-            if (n_not_ongrid > 0 && scale > 0) {
-                float id = 1/scale;
-                for (int k = 0; k < 2; ++k) {
-                    if (is_on_grid[k]) continue;
-                    uint16_t u = 0;
-                    for (int i = 0; i < 8; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
-                        l = MAX(0, MIN(kMaxQ-1, l));
-                        u |= (l << 2*i);
-                        L[8*k + i] = l;
-                    }
-                    int grid_index = kmap_q2xs[u];
-                    if (grid_index < 0) {
-                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 16; ++i) {
-                    float w = weight[i];
-                    float q = 2*L[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0) scale = sumqx/sumq2;
-            }
-            if (scale < 0) {
-                scale = -scale;
-                for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
-            }
-            for (int k = 0; k < 2; ++k) {
-                uint16_t u = 0;
-                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
-                int grid_index = kmap_q2xs[u];
-                if (grid_index < 0) {
-                    printf("Oops: found point %u not on grid:", u);
-                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
-                    printf("\n");
-                    GGML_ASSERT(false);
-                }
-                q2[2*ib+k] = grid_index | (block_signs[k] << 9);
-            }
-            GGML_ASSERT(scale >= 0);
-            scales[ib] = scale;
-            max_scale = MAX(max_scale, scale);
-        }
-
-        if (!max_scale) {
-            memset(y[ibl].qs, 0, QK_K/4);
-            continue;
-        }
-
-        float d = max_scale/31;
-        y[ibl].d = GGML_FP32_TO_FP16(d);
-        float id = 1/d;
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            int l = nearest_int(0.5f*(id*scales[ib]-1));
-            l = MAX(0, MIN(15, l));
-            if (ib%2 == 0) y[ibl].scales[ib/2] = l;
-            else y[ibl].scales[ib/2] |= (l << 4);
-        }
-        memcpy(y[ibl].qs, q2, QK_K/4);
-
-    }
-}
-
-size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq2_xxs);
-    }
-    return nrow * nblock * sizeof(block_iq2_xxs);
-}
-
-size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq2_xs);
-    }
-    return nrow * nblock * sizeof(block_iq2_xs);
-}
-
-//
-// ============================================= 3-bit using D4 lattice
-//
-
-typedef struct {
-    uint32_t * grid;
-    int      * map;
-    uint16_t * neighbours;
-} iq3_entry_t;
-
-static iq3_entry_t iq3_data[2] = {
-    {NULL, NULL, NULL},
-    {NULL, NULL, NULL},
-};
-
-static inline int iq3_data_index(int grid_size) {
-    (void)grid_size;
-    GGML_ASSERT(grid_size == 256 || grid_size == 512);
-    return grid_size == 256 ? 0 : 1;
-}
-
-static int iq3_compare_func(const void * left, const void * right) {
-    const int * l = (const int *)left;
-    const int * r = (const int *)right;
-    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
-}
-
-void iq3xs_init_impl(int grid_size) {
-    const int gindex = iq3_data_index(grid_size);
-    if (iq3_data[gindex].grid) {
-        return;
-    }
-    static const uint16_t kgrid_256[256] = {
-            0,     2,     4,     9,    11,    15,    16,    18,    25,    34,    59,    61,    65,    67,    72,    74,
-           81,    85,    88,    90,    97,   108,   120,   128,   130,   132,   137,   144,   146,   153,   155,   159,
-          169,   175,   189,   193,   199,   200,   202,   213,   248,   267,   287,   292,   303,   315,   317,   321,
-          327,   346,   362,   413,   436,   456,   460,   462,   483,   497,   513,   515,   520,   522,   529,   531,
-          536,   538,   540,   551,   552,   576,   578,   585,   592,   594,   641,   643,   648,   650,   657,   664,
-          698,   704,   706,   720,   729,   742,   758,   769,   773,   808,   848,   852,   870,   889,   901,   978,
-          992,  1024,  1026,  1033,  1035,  1040,  1042,  1046,  1049,  1058,  1089,  1091,  1093,  1096,  1098,  1105,
-         1112,  1139,  1143,  1144,  1152,  1154,  1161,  1167,  1168,  1170,  1183,  1184,  1197,  1217,  1224,  1228,
-         1272,  1276,  1309,  1323,  1347,  1367,  1377,  1404,  1473,  1475,  1486,  1509,  1537,  1544,  1546,  1553,
-         1555,  1576,  1589,  1594,  1600,  1602,  1616,  1625,  1636,  1638,  1665,  1667,  1672,  1685,  1706,  1722,
-         1737,  1755,  1816,  1831,  1850,  1856,  1862,  1874,  1901,  1932,  1950,  1971,  2011,  2032,  2052,  2063,
-         2077,  2079,  2091,  2095,  2172,  2192,  2207,  2208,  2224,  2230,  2247,  2277,  2308,  2345,  2356,  2389,
-         2403,  2424,  2501,  2504,  2506,  2520,  2570,  2593,  2616,  2624,  2630,  2646,  2669,  2700,  2714,  2746,
-         2754,  2795,  2824,  2835,  2839,  2874,  2882,  2905,  2984,  3028,  3042,  3092,  3108,  3110,  3124,  3153,
-         3185,  3215,  3252,  3288,  3294,  3364,  3397,  3434,  3483,  3523,  3537,  3587,  3589,  3591,  3592,  3610,
-         3626,  3670,  3680,  3722,  3749,  3754,  3776,  3789,  3803,  3824,  3857,  3873,  3904,  3906,  3924,  3992,
-    };
-    static const uint16_t kgrid_512[512] = {
-            0,     1,     2,     5,     7,     8,     9,    10,    12,    14,    16,    17,    21,    27,    32,    34,
-           37,    39,    41,    43,    48,    50,    57,    60,    63,    64,    65,    66,    68,    72,    73,    77,
-           80,    83,    87,    89,    93,   100,   113,   117,   122,   128,   129,   133,   135,   136,   139,   142,
-          145,   149,   152,   156,   162,   165,   167,   169,   171,   184,   187,   195,   201,   205,   208,   210,
-          217,   219,   222,   228,   232,   234,   247,   249,   253,   256,   267,   271,   273,   276,   282,   288,
-          291,   297,   312,   322,   324,   336,   338,   342,   347,   353,   357,   359,   374,   379,   390,   393,
-          395,   409,   426,   441,   448,   450,   452,   464,   466,   470,   475,   488,   492,   512,   513,   514,
-          516,   520,   521,   523,   525,   527,   528,   530,   537,   540,   542,   556,   558,   561,   570,   576,
-          577,   579,   582,   584,   588,   593,   600,   603,   609,   616,   618,   632,   638,   640,   650,   653,
-          655,   656,   660,   666,   672,   675,   685,   688,   698,   705,   708,   711,   712,   715,   721,   727,
-          728,   732,   737,   754,   760,   771,   773,   778,   780,   793,   795,   802,   806,   808,   812,   833,
-          840,   843,   849,   856,   858,   873,   912,   916,   919,   932,   934,   961,   963,   968,   970,   977,
-          989,   993,  1010,  1016,  1024,  1025,  1027,  1029,  1031,  1032,  1034,  1036,  1038,  1041,  1043,  1047,
-         1048,  1050,  1057,  1059,  1061,  1064,  1066,  1079,  1080,  1083,  1085,  1088,  1090,  1096,  1099,  1103,
-         1106,  1109,  1113,  1116,  1122,  1129,  1153,  1156,  1159,  1169,  1171,  1176,  1183,  1185,  1195,  1199,
-         1209,  1212,  1216,  1218,  1221,  1225,  1234,  1236,  1241,  1243,  1250,  1256,  1270,  1281,  1287,  1296,
-         1299,  1306,  1309,  1313,  1338,  1341,  1348,  1353,  1362,  1375,  1376,  1387,  1400,  1408,  1410,  1415,
-         1425,  1453,  1457,  1477,  1481,  1494,  1496,  1507,  1512,  1538,  1545,  1547,  1549,  1551,  1554,  1561,
-         1563,  1565,  1570,  1572,  1575,  1577,  1587,  1593,  1601,  1603,  1605,  1612,  1617,  1619,  1632,  1648,
-         1658,  1662,  1664,  1674,  1680,  1690,  1692,  1704,  1729,  1736,  1740,  1745,  1747,  1751,  1752,  1761,
-         1763,  1767,  1773,  1787,  1795,  1801,  1806,  1810,  1817,  1834,  1840,  1844,  1857,  1864,  1866,  1877,
-         1882,  1892,  1902,  1915,  1934,  1953,  1985,  1987,  2000,  2002,  2013,  2048,  2052,  2058,  2064,  2068,
-         2071,  2074,  2081,  2088,  2104,  2114,  2119,  2121,  2123,  2130,  2136,  2141,  2147,  2153,  2157,  2177,
-         2179,  2184,  2189,  2193,  2203,  2208,  2223,  2226,  2232,  2244,  2249,  2251,  2256,  2258,  2265,  2269,
-         2304,  2306,  2324,  2335,  2336,  2361,  2373,  2375,  2385,  2418,  2443,  2460,  2480,  2504,  2509,  2520,
-         2531,  2537,  2562,  2568,  2572,  2578,  2592,  2596,  2599,  2602,  2614,  2620,  2625,  2627,  2629,  2634,
-         2641,  2650,  2682,  2688,  2697,  2707,  2712,  2718,  2731,  2754,  2759,  2760,  2775,  2788,  2793,  2805,
-         2811,  2817,  2820,  2832,  2842,  2854,  2890,  2902,  2921,  2923,  2978,  3010,  3012,  3026,  3081,  3083,
-         3085,  3097,  3099,  3120,  3136,  3152,  3159,  3188,  3210,  3228,  3234,  3245,  3250,  3256,  3264,  3276,
-         3281,  3296,  3349,  3363,  3378,  3392,  3395,  3420,  3440,  3461,  3488,  3529,  3531,  3584,  3588,  3591,
-         3600,  3602,  3614,  3616,  3628,  3634,  3650,  3657,  3668,  3683,  3685,  3713,  3716,  3720,  3726,  3729,
-         3736,  3753,  3778,  3802,  3805,  3819,  3841,  3845,  3851,  3856,  3880,  3922,  3938,  3970,  3993,  4032,
-    };
-
-    const int kmap_size = 4096;
-    const int nwant = grid_size == 256 ? 2 : 3;
-    const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
-    uint32_t * kgrid_q3xs;
-    int      * kmap_q3xs;
-    uint16_t * kneighbors_q3xs;
-
-    printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
-    uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
-    for (int k = 0; k < grid_size; ++k) {
-        int8_t * pos = (int8_t *)(the_grid + k);
-        for (int i = 0; i < 4; ++i) {
-            int l = (kgrid[k] >> 3*i) & 0x7;
-            pos[i] = 2*l + 1;
-        }
-    }
-    kgrid_q3xs = the_grid;
-    iq3_data[gindex].grid = the_grid;
-    kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
-    iq3_data[gindex].map = kmap_q3xs;
-    for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
-    uint32_t aux32;
-    uint8_t * aux8 = (uint8_t *)&aux32;
-    for (int i = 0; i < grid_size; ++i) {
-        aux32 = kgrid_q3xs[i];
-        uint16_t index = 0;
-        for (int k=0; k<4; ++k) {
-            uint16_t q = (aux8[k] - 1)/2;
-            index |= (q << 3*k);
-        }
-        kmap_q3xs[index] = i;
-    }
-    int8_t pos[4];
-    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
-    int num_neighbors = 0, num_not_in_map = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q3xs[i] >= 0) continue;
-        ++num_not_in_map;
-        for (int k = 0; k < 4; ++k) {
-            int l = (i >> 3*k) & 0x7;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
-        int n = 0; int d2 = dist2[0];
-        int nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            ++n;
-        }
-        num_neighbors += n;
-    }
-    printf("%s: %d neighbours in total\n", __func__, num_neighbors);
-    kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
-    iq3_data[gindex].neighbours = kneighbors_q3xs;
-    int counter = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q3xs[i] >= 0) continue;
-        for (int k = 0; k < 4; ++k) {
-            int l = (i >> 3*k) & 0x7;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
-        kmap_q3xs[i] = -(counter + 1);
-        int d2 = dist2[0];
-        uint16_t * start = &kneighbors_q3xs[counter++];
-        int n = 0, nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            kneighbors_q3xs[counter++] = dist2[2*j+1];
-            ++n;
-        }
-        *start = n;
-    }
-    free(dist2);
-}
-
-void iq3xs_free_impl(int grid_size) {
-    GGML_ASSERT(grid_size == 256 || grid_size == 512);
-    const int gindex = iq3_data_index(grid_size);
-    if (iq3_data[gindex].grid) {
-        free(iq3_data[gindex].grid);       iq3_data[gindex].grid = NULL;
-        free(iq3_data[gindex].map);        iq3_data[gindex].map  = NULL;
-        free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
-    }
-}
-
-static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
-        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
-    int num_neighbors = neighbours[0];
-    GGML_ASSERT(num_neighbors > 0);
-    float best_d2 = FLT_MAX;
-    int grid_index = -1;
-    for (int j = 1; j <= num_neighbors; ++j) {
-        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
-        float d2 = 0;
-        for (int i = 0; i < 4; ++i) {
-            float q = pg[i];
-            float diff = scale*q - xval[i];
-            d2 += weight[i]*diff*diff;
-        }
-        if (d2 < best_d2) {
-            best_d2 = d2; grid_index = neighbours[j];
-        }
-    }
-    GGML_ASSERT(grid_index >= 0);
-    const int8_t * pg = (const int8_t *)(grid + grid_index);
-    for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
-    return grid_index;
-}
-
-static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n,
-        const float * restrict quant_weights) {
-
-    const int gindex = iq3_data_index(grid_size);
-
-    const uint32_t * kgrid_q3xs      = iq3_data[gindex].grid;
-    const int      * kmap_q3xs       = iq3_data[gindex].map;
-    const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
-
-    //GGML_ASSERT(quant_weights   && "missing quantization weights");
-    GGML_ASSERT(kgrid_q3xs      && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kmap_q3xs       && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(n%QK_K == 0);
-
-    const int kMaxQ = 8;
-
-    const int nbl = n/QK_K;
-
-    ggml_fp16_t * dh;
-    uint8_t * qs;
-    int block_size;
-    if (grid_size == 256) {
-        block_iq3_xxs * y = vy;
-        dh = &y->d;
-        qs = y->qs;
-        block_size = sizeof(block_iq3_xxs);
-    } else {
-        block_iq3_s * y = vy;
-        dh = &y->d;
-        qs = y->qs;
-        block_size = sizeof(block_iq3_s);
-    }
-    int quant_size = block_size - sizeof(ggml_fp16_t);
-
-    float scales[QK_K/32];
-    float weight[32];
-    float xval[32];
-    int8_t L[32];
-    int8_t Laux[32];
-    float  waux[32];
-    bool   is_on_grid[8];
-    bool   is_on_grid_aux[8];
-    uint8_t block_signs[8];
-    uint8_t q3[3*(QK_K/8)+QK_K/32];
-    uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
-    uint8_t  * qh = q3 + 3*(QK_K/8);
-
-    for (int ibl = 0; ibl < nbl; ++ibl) {
-
-        dh[0] = GGML_FP32_TO_FP16(0.f);
-        memset(q3, 0, 3*QK_K/8+QK_K/32);
-
-        float max_scale = 0;
-
-        const float * xbl = x + QK_K*ibl;
-        float sumx2 = 0;
-        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
-        float sigma2 = 2*sumx2/QK_K;
-
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const float * xb = xbl + 32*ib;
-            if (quant_weights) {
-                const float * qw = quant_weights + QK_K*ibl + 32*ib;
-                for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
-            } else {
-                for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
-            }
-            for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
-            for (int k = 0; k < 4; ++k) {
-                int nflip = 0;
-                uint8_t s = 0;
-                for (int i = 0; i < 8; ++i) {
-                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
-                    else {
-                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
-                    }
-                }
-                if (nflip%2) {
-                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
-                    for (int i = 1; i < 8; ++i) {
-                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
-                        if (ax < min) {
-                            min = ax; imin = i;
-                        }
-                    }
-                    xval[8*k+imin] = -xval[8*k+imin];
-                    s ^= (1 << imin);
-                }
-                block_signs[k] = s & 127;
-            }
-            float max = xval[0];
-            for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
-            if (!max) {
-                scales[ib] = 0;
-                memset(L, 0, 32);
-                continue;
-            }
-            float best = 0;
-            float scale = max/(2*kMaxQ-1);
-            for (int is = -15; is <= 15; ++is) {
-                float id = (2*kMaxQ-1+is*0.2f)/max;
-                float this_scale = 1/id;
-                for (int k = 0; k < 8; ++k) {
-                    for (int i = 0; i < 4; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
-                        Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
-                    }
-                    uint16_t u = 0;
-                    for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
-                    int grid_index = kmap_q3xs[u];
-                    is_on_grid_aux[k] = true;
-                    if (grid_index < 0) {
-                        is_on_grid_aux[k] = false;
-                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
-                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 32; ++i) {
-                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                    scale = sumqx/sumq2; best = scale*sumqx;
-                    for (int i = 0; i < 32; ++i) L[i] = Laux[i];
-                    for (int k = 0; k <  8; ++k) is_on_grid[k] = is_on_grid_aux[k];
-                }
-            }
-            int n_not_ongrid = 0;
-            for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
-            if (n_not_ongrid > 0 && scale > 0) {
-                float id = 1/scale;
-                for (int k = 0; k < 8; ++k) {
-                    if (is_on_grid[k]) continue;
-                    uint16_t u = 0;
-                    for (int i = 0; i < 4; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
-                        l = MAX(0, MIN(kMaxQ-1, l));
-                        u |= (l << 3*i);
-                    }
-                    int grid_index = kmap_q3xs[u];
-                    if (grid_index < 0) {
-                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
-                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
-                    }
-                    const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
-                    for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 32; ++i) {
-                    float w = weight[i];
-                    float q = 2*L[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0) scale = sumqx/sumq2;
-            }
-            if (scale < 0) {
-                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
-                // and correspondingly flip quant signs.
-                scale = -scale;
-                for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
-            }
-            for (int k = 0; k < 8; ++k) {
-                uint16_t u = 0;
-                for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
-                int grid_index = kmap_q3xs[u];
-                if (grid_index < 0) {
-                    printf("Oops: found point %u not on grid:", u);
-                    for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
-                    printf("\n");
-                    GGML_ASSERT(false);
-                }
-                if (grid_size == 256) {
-                    q3[8*ib+k] = grid_index;
-                } else {
-                    q3[8*ib+k] = grid_index & 255;
-                    qh[ib] |= ((grid_index >> 8) << k);
-                }
-
-            }
-            scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
-            GGML_ASSERT(scale >= 0);
-            scales[ib] = scale;
-            max_scale = MAX(max_scale, scale);
-        }
-
-        if (!max_scale) {
-            memset(qs, 0, quant_size);
-            dh += block_size/sizeof(ggml_fp16_t);
-            qs += block_size;
-            continue;
-        }
-
-        float d = max_scale/31;
-        dh[0] = GGML_FP32_TO_FP16(d * 1.0125f);  // small improvement via this fudge factor
-        float id = 1/d;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            int l = nearest_int(0.5f*(id*scales[ib]-1));
-            l = MAX(0, MIN(15, l));
-            scales_and_signs[ib] |= ((uint32_t)l << 28);
-        }
-        memcpy(qs, q3, quant_size);
-
-        dh += block_size/sizeof(ggml_fp16_t);
-        qs += block_size;
-
-    }
-}
-
-size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq3_xxs);
-    }
-    return nrow * nblock * sizeof(block_iq3_xxs);
-}
-
-void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq3_xxs * restrict y = vy;
-    quantize_row_iq3_xxs_reference(x, y, k);
-}
-
-void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
-    assert(k % QK_K == 0);
-    quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
-}
-
-static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
-        const float * restrict quant_weights,
-        float   * scales,
-        float   * weight,
-        float   * xval,
-        int8_t  * L,
-        int8_t  * Laux,
-        float   * waux,
-        bool    * is_on_grid,
-        bool    * is_on_grid_aux,
-        uint8_t * block_signs) {
-
-    const int gindex = iq3_data_index(512);
-
-    const uint32_t * kgrid_q3xs      = iq3_data[gindex].grid;
-    const int      * kmap_q3xs       = iq3_data[gindex].map;
-    const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
-
-    //GGML_ASSERT(quant_weights   && "missing quantization weights");
-    GGML_ASSERT(kgrid_q3xs      && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kmap_q3xs       && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(n%QK_K == 0);
-
-    const int kMaxQ = 8;
-
-    const int nbl = n/QK_K;
-
-    block_iq3_s * y = vy;
-
-    const int bs4 = block_size/4;
-    const int bs8 = block_size/8;
-
-    for (int ibl = 0; ibl < nbl; ++ibl) {
-
-        memset(&y[ibl], 0, sizeof(block_iq3_s));
-        y[ibl].d = GGML_FP32_TO_FP16(0.f);
-
-        uint8_t * qs = y[ibl].qs;
-        uint8_t * qh = y[ibl].qh;
-        uint8_t * signs = y[ibl].signs;
-
-        float max_scale = 0;
-
-        const float * xbl = x + QK_K*ibl;
-        float sumx2 = 0;
-        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
-        float sigma2 = 2*sumx2/QK_K;
-
-        for (int ib = 0; ib < QK_K/block_size; ++ib) {
-            const float * xb = xbl + block_size*ib;
-            if (quant_weights) {
-                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
-                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
-            } else {
-                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
-            }
-            for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
-            for (int k = 0; k < bs8; ++k) {
-                uint8_t s = 0;
-                for (int i = 0; i < 8; ++i) {
-                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
-                    else {
-                        xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
-                    }
-                }
-                block_signs[k] = s;
-            }
-            float max = xval[0];
-            for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
-            if (!max) {
-                scales[ib] = 0;
-                continue;
-            }
-            float best = 0;
-            float scale = max/(2*kMaxQ-1);
-            for (int is = -15; is <= 15; ++is) {
-                float id = (2*kMaxQ-1+is*0.2f)/max;
-                float this_scale = 1/id;
-                for (int k = 0; k < bs4; ++k) {
-                    for (int i = 0; i < 4; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
-                        Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
-                    }
-                    uint16_t u = 0;
-                    for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
-                    int grid_index = kmap_q3xs[u];
-                    is_on_grid_aux[k] = true;
-                    if (grid_index < 0) {
-                        is_on_grid_aux[k] = false;
-                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
-                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < block_size; ++i) {
-                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                    scale = sumqx/sumq2; best = scale*sumqx;
-                    for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
-                    for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
-                }
-            }
-            int n_not_ongrid = 0;
-            for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
-            if (n_not_ongrid > 0 && scale > 0) {
-                float id = 1/scale;
-                for (int k = 0; k < bs4; ++k) {
-                    if (is_on_grid[k]) continue;
-                    uint16_t u = 0;
-                    for (int i = 0; i < 4; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
-                        l = MAX(0, MIN(kMaxQ-1, l));
-                        u |= (l << 3*i);
-                    }
-                    int grid_index = kmap_q3xs[u];
-                    if (grid_index < 0) {
-                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
-                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
-                    }
-                    const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
-                    for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < block_size; ++i) {
-                    float w = weight[i];
-                    float q = 2*L[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0) scale = sumqx/sumq2;
-            }
-            if (scale < 0) {
-                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
-                // and correspondingly flip quant signs.
-                scale = -scale;
-                for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
-            }
-            for (int k = 0; k < bs4; ++k) {
-                uint16_t u = 0;
-                for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
-                int grid_index = kmap_q3xs[u];
-                if (grid_index < 0) {
-                    printf("Oops: found point %u not on grid:", u);
-                    for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
-                    printf("\n");
-                    GGML_ASSERT(false);
-                }
-                qs[k] = grid_index & 255;
-                qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
-            }
-            qs += bs4;
-            for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
-            signs += bs8;
-            GGML_ASSERT(scale >= 0);
-            scales[ib] = scale;
-            max_scale = MAX(max_scale, scale);
-        }
-
-        if (!max_scale) {
-            continue;
-        }
-
-        float d = max_scale/31;
-        y[ibl].d = GGML_FP32_TO_FP16(d);
-        float id = 1/d;
-        for (int ib = 0; ib < QK_K/block_size; ib += 2) {
-            int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
-            l1 = MAX(0, MIN(15, l1));
-            int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
-            l2 = MAX(0, MIN(15, l2));
-            y[ibl].scales[ib/2] = l1 | (l2 << 4);
-        }
-
-    }
-}
-
-#define IQ3S_BLOCK_SIZE 32
-size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    float scales[QK_K/IQ3S_BLOCK_SIZE];
-    float weight[IQ3S_BLOCK_SIZE];
-    float xval[IQ3S_BLOCK_SIZE];
-    int8_t L[IQ3S_BLOCK_SIZE];
-    int8_t Laux[IQ3S_BLOCK_SIZE];
-    float  waux[IQ3S_BLOCK_SIZE];
-    bool   is_on_grid[IQ3S_BLOCK_SIZE/4];
-    bool   is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
-    uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
-                scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq3_s);
-    }
-    return nrow * nblock * sizeof(block_iq3_s);
-}
-
-void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq3_s * restrict y = vy;
-    quantize_row_iq3_s_reference(x, y, k);
-}
-
-void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
-    assert(k % QK_K == 0);
-    quantize_iq3_s(x, y, 1, k, NULL, NULL);
-}
-
-
-// =================================== 1.5 bpw ===================================================
-
-static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
-        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
-    int num_neighbors = neighbours[0];
-    GGML_ASSERT(num_neighbors > 0);
-    float best_score = 0;
-    int grid_index = -1;
-    for (int j = 1; j <= num_neighbors; ++j) {
-        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
-        float sumqx = 0, sumq2 = 0;
-        for (int i = 0; i < 8; ++i) {
-            float q = (pg[i] - 3)/2;
-            float w = weight[i];
-            sumqx += w*q*xval[i];
-            sumq2 += w*q*q;
-        }
-        if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-            *scale = sumqx/sumq2; best_score = *scale * sumqx;
-            grid_index = neighbours[j];
-        }
-    }
-    if (grid_index < 0) {
-        for (int i = 0; i < ngrid; ++i) {
-            const int8_t * grid_i = (const int8_t *)(grid + i);
-            float sumqx = 0, sumq2 = 0;
-            for (int j = 0; j < 8; ++j) {
-                float w = weight[j];
-                float q = (grid_i[j] - 3)/2;
-                sumqx += w*q*xval[j];
-                sumq2 += w*q*q;
-            }
-            if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-                *scale = sumqx/sumq2; best_score = *scale*sumqx;
-                grid_index = i;
-            }
-        }
-    }
-    if (grid_index < 0) {
-        printf("Oops, did not find grid point\n");
-        printf("Have %d neighbours\n", num_neighbors);
-        for (int j = 1; j <= num_neighbors; ++j) {
-            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
-            float sumqx = 0, sumq2 = 0;
-            for (int i = 0; i < 8; ++i) {
-                float q = (pg[i] - 3)/2;
-                float w = weight[i];
-                sumqx += w*q*xval[i];
-                sumq2 += w*q*q;
-            }
-            printf("    neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
-        }
-    }
-    GGML_ASSERT(grid_index >= 0);
-    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    *scale *= 1.05f;  // This is a fudge factor. Don't ask me why it improves the result.
-    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    const int8_t * pg = (const int8_t *)(grid + grid_index);
-    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
-    return grid_index;
-}
-
-static int iq1_sort_helper(const void * left, const void * right) {
-    const float * l = left;
-    const float * r = right;
-    return *l < *r ? -1 : *l > *r ? 1 : 0;
-}
-
-static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
-
-    const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
-
-    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
-    const int      * kmap_q2xs       = iq2_data[gindex].map;
-    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
-
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
-    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(n%QK_K == 0);
-
-    const int nbl = n/QK_K;
-
-    block_iq1_s * y = vy;
-
-    float  scales[QK_K/8];
-    float  weight[8];
-    int8_t L[8];
-    float  sumx[9];
-    float  sumw[9];
-    float  pairs[16];
-    int * idx = (int *)(pairs + 1);
-    uint8_t hbit[QK_K/8];
-
-    for (int ibl = 0; ibl < nbl; ++ibl) {
-
-        y[ibl].d = GGML_FP32_TO_FP16(0.f);
-        memset(y[ibl].qs, 0, QK_K/8);
-        memset(y[ibl].scales, 0, QK_K/16);
-
-        float max_scale = 0;
-
-        const float * xbl = x + QK_K*ibl;
-        float sumx2 = 0;
-        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
-        float sigma2 = sumx2/QK_K;
-
-        for (int ib = 0; ib < QK_K/8; ++ib) {
-            const float * xb = xbl + 8*ib;
-            const float * qw = quant_weights + QK_K*ibl + 8*ib;
-            for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
-            float max = fabsf(xb[0]);
-            for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
-            if (!max) {
-                scales[ib] = 0;
-                memset(L, 1, 8);
-                continue;
-            }
-            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
-            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
-            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
-            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
-            // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
-            // for each possible and score for each split.
-            for (int j = 0; j < 8; ++j) {
-                pairs[2*j] = xb[j];
-                idx[2*j] = j;
-            }
-            qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
-            {
-                sumx[0] = sumw[0] = 0;
-                for (int j = 0; j < 8; ++j) {
-                    int i = idx[2*j];
-                    sumx[j+1] = sumx[j] + weight[i]*xb[i];
-                    sumw[j+1] = sumw[j] + weight[i];
-                }
-            }
-            float best_score = 0, scale = max;
-            int besti1 = 0, besti2 = 0;
-            for (int i1 = 0; i1 <= 8; ++i1) {
-                for (int i2 = i1; i2 <= 8; ++i2) {
-                    float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
-                    float sumq2 =  (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
-                    if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
-                        scale = sumqx/sumq2; best_score = scale*sumqx;
-                        besti1 = i1; besti2 = i2;
-                    }
-                }
-            }
-            for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
-            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
-            for (int j = besti2; j <      8; ++j) L[idx[2*j]] = 2;
-            if (scale < 0) {
-                for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
-                scale = -scale;
-            }
-            // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
-            // grid point that minimizes SSD.
-            uint16_t u = 0;
-            for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
-            int grid_index = kmap_q2xs[u];
-            if (grid_index < 0) {
-                const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
-                GGML_ASSERT(grid_index >= 0);
-            }
-            y[ibl].qs[ib] = grid_index & 255;
-            hbit[ib] = grid_index >> 8;
-            GGML_ASSERT(scale >= 0);
-            scales[ib] = scale;
-            max_scale = MAX(max_scale, scale);
-        }
-
-        if (!max_scale) {
-            memset(y[ibl].qs, 0, QK_K/8);
-            continue;
-        }
-
-        float d = max_scale/15;
-        y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
-        float id = 1/d;
-        for (int ib = 0; ib < QK_K/8; ++ib) {
-            int l = nearest_int(0.5f*(id*scales[ib]-1));
-            l = MAX(0, MIN(7, l));
-            if (hbit[ib]) l |= 8;
-            y[ibl].scales[ib/2] |= (l << 4*(ib%2));
-        }
-    }
-}
-
-size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq1_s);
-    }
-    return nrow * nblock * sizeof(block_iq1_s);
-}
-
-// ============================ 4-bit non-linear quants
-
-static inline int best_index_int8(int n, const int8_t * val, float x) {
-    if (x <= val[0]) return 0;
-    if (x >= val[n-1]) return n-1;
-    int ml = 0, mu = n-1;
-    while (mu-ml > 1) {
-        int mav = (ml+mu)/2;
-        if (x < val[mav]) mu = mav; else ml = mav;
-    }
-    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
-}
-
-static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
-        ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
-        float * scales, float * weight, uint8_t * L,
-        const int8_t * values,
-        const float * quant_weights) {
-
-    const int ntry = 7;
-
-    float sigma2 = 0;
-    for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
-    sigma2 *= 2.f/super_block_size;
-
-    memset(q4, 0, super_block_size/2);
-    dh[0] = GGML_FP32_TO_FP16(0.f);
-
-    float max_scale = 0, amax_scale = 0;
-    for (int ib = 0; ib < super_block_size/block_size; ++ib) {
-        const float * xb = x + ib*block_size;
-        if (quant_weights) {
-            const float * qw = quant_weights + ib*block_size;
-            for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        } else {
-            for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
-        }
-        float amax = 0, max = 0;
-        for (int j = 0; j < block_size; ++j) {
-            float ax = fabsf(xb[j]);
-            if (ax > amax) {
-                amax = ax; max = xb[j];
-            }
-        }
-        if (!amax) {
-            scales[ib] = 0;
-            continue;
-        }
-        float d = -max/values[0];
-        float id = 1/d;
-        float sumqx = 0, sumq2 = 0;
-        for (int j = 0; j < block_size; ++j) {
-            float al = id*xb[j];
-            int l = best_index_int8(16, values, al);
-            float q = values[l];
-            float w = weight[j];
-            sumqx += w*q*xb[j];
-            sumq2 += w*q*q;
-        }
-        d = sumqx/sumq2;
-        float best = d*sumqx;
-        for (int itry = -ntry; itry <= ntry; ++itry) {
-            id = (itry + values[0])/max;
-            sumqx = sumq2 = 0;
-            for (int j = 0; j < block_size; ++j) {
-                float al = id*xb[j];
-                int l = best_index_int8(16, values, al);
-                float q = values[l];
-                float w = weight[j];
-                sumqx += w*q*xb[j];
-                sumq2 += w*q*q;
-            }
-            if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                d = sumqx/sumq2; best = d * sumqx;
-            }
-        }
-        scales[ib] = d;
-        float abs_d = fabsf(d);
-        if (abs_d > amax_scale) {
-            amax_scale = abs_d; max_scale = d;
-        }
-    }
-
-    if (super_block_size/block_size > 1) {
-        int nb = super_block_size/block_size;
-        memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
-        float d = -max_scale/32;
-        dh[0] = GGML_FP32_TO_FP16(d);
-        float id = d ? 1/d : 0.f;
-        for (int ib = 0; ib < super_block_size/block_size; ++ib) {
-            int l = nearest_int(id*scales[ib]);
-            l = MAX(-32, MIN(31, l));
-            float dl = d * l;
-            float idl = dl ? 1/dl : 0.f;
-            uint8_t * Lb = L + ib*block_size;
-            const float * xb = x + ib*block_size;
-            for (int j = 0; j < block_size; ++j) {
-                Lb[j] = best_index_int8(16, values, idl*xb[j]);
-            }
-            l += 32;
-            uint8_t l_l = l & 0xf;
-            uint8_t l_h = l >>  4;
-            if (ib%2 == 0) scales_l[ib/2] = l_l;
-            else scales_l[ib/2] |= (l_l << 4);
-            scales_h[ib/8] |= (l_h << 2*(ib%8));
-        }
-    } else {
-        dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        float id = scales[0] ? 1/scales[0] : 0;
-        for (int j = 0; j < super_block_size; ++j) {
-            L[j] = best_index_int8(16, values, id*x[j]);
-        }
-    }
-
-    for (int i = 0; i < super_block_size/32; ++i) {
-        for (int j = 0; j < 16; ++j) {
-            q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
-        }
-    }
-}
-
-size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK4_NL == 0);
-    int nblock = n_per_row/QK4_NL;
-    char * qrow = (char *)dst;
-    uint8_t L[QK4_NL];
-    float weight[QK4_NL];
-    uint16_t unused_h;
-    uint8_t * unused_l = NULL;
-    float scale;
-    for (int row = 0; row < nrow; ++row) {
-        block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
-        for (int ibl = 0; ibl < nblock; ++ibl) {
-            const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
-            quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                    &scale, weight, L, kvalues_iq4nl, qw);
-        }
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq4_nl);
-    }
-    return nrow * nblock * sizeof(block_iq4_nl);
-}
-
-void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_NL == 0);
-    block_iq4_nl * restrict y = vy;
-    quantize_row_iq4_nl_reference(x, y, k);
-}
-
-void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
-    assert(k % QK4_NL == 0);
-    quantize_iq4_nl(x, y, 1, k, NULL, NULL);
-}
-
-size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-#if QK_K == 64
-    return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
-#else
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    char * qrow = (char *)dst;
-    uint8_t L[QK_K];
-    float weight[32];
-    float scales[QK_K/32];
-    for (int row = 0; row < nrow; ++row) {
-        block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
-        for (int ibl = 0; ibl < nblock; ++ibl) {
-            const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
-            quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                    scales, weight, L, kvalues_iq4nl, qw);
-        }
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq4_xs);
-    }
-    return nrow * nblock * sizeof(block_iq4_xs);
-#endif
-}
-
-void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq4_xs * restrict y = vy;
-    quantize_row_iq4_xs_reference(x, y, k);
-}
-
-void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
-    assert(k % QK_K == 0);
-    quantize_iq4_xs(x, y, 1, k, NULL, NULL);
-}
-
-// =============================== 2.5625 bpw
-
-static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
-
-    const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
-
-    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
-    const int      * kmap_q2xs       = iq2_data[gindex].map;
-    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
-
-    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
-    GGML_ASSERT(n%QK_K == 0);
-
-    const int kMaxQ = 3;
-
-    const int nbl = n/QK_K;
-
-    block_iq2_s * y = vy;
-
-    float scales[QK_K/16];
-    float weight[16];
-    float xval[16];
-    int8_t L[16];
-    int8_t Laux[16];
-    float  waux[16];
-    bool   is_on_grid[2];
-    bool   is_on_grid_aux[2];
-    uint8_t block_signs[2];
-
-    for (int ibl = 0; ibl < nbl; ++ibl) {
-
-        memset(&y[ibl], 0, sizeof(block_iq2_s));
-        y[ibl].d = GGML_FP32_TO_FP16(0.f);
-
-        float max_scale = 0;
-
-        const float * xbl = x + QK_K*ibl;
-        float sumx2 = 0;
-        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
-        float sigma2 = 2*sumx2/QK_K;
-
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            const float * xb = xbl + 16*ib;
-            if (quant_weights) {
-                const float * qw = quant_weights + QK_K*ibl + 16*ib;
-                for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
-            } else {
-                for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
-            }
-            for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
-            for (int k = 0; k < 2; ++k) {
-                uint8_t s = 0;
-                for (int i = 0; i < 8; ++i) {
-                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
-                    else {
-                        xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
-                    }
-                }
-                block_signs[k] = s;
-            }
-            float max = xval[0];
-            for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
-            if (!max) {
-                scales[ib] = 0;
-                continue;
-            }
-            float best = 0;
-            float scale = max/(2*kMaxQ-1);
-            is_on_grid[0] = is_on_grid[1] = true;
-            for (int is = -9; is <= 9; ++is) {
-                float id = (2*kMaxQ-1+is*0.1f)/max;
-                float this_scale = 1/id;
-                for (int k = 0; k < 2; ++k) {
-                    for (int i = 0; i < 8; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
-                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
-                    }
-                    uint16_t u = 0;
-                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
-                    int grid_index = kmap_q2xs[u];
-                    is_on_grid_aux[k] = true;
-                    if (grid_index < 0) {
-                        is_on_grid_aux[k] = false;
-                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 16; ++i) {
-                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                    scale = sumqx/sumq2; best = scale*sumqx;
-                    for (int i = 0; i < 16; ++i) L[i] = Laux[i];
-                    for (int k = 0; k <  2; ++k) is_on_grid[k] = is_on_grid_aux[k];
-                }
-            }
-            int n_not_ongrid = 0;
-            for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
-            if (n_not_ongrid > 0 && scale > 0) {
-                float id = 1/scale;
-                for (int k = 0; k < 2; ++k) {
-                    if (is_on_grid[k]) continue;
-                    uint16_t u = 0;
-                    for (int i = 0; i < 8; ++i) {
-                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
-                        l = MAX(0, MIN(kMaxQ-1, l));
-                        u |= (l << 2*i);
-                        L[8*k + i] = l;
-                    }
-                    int grid_index = kmap_q2xs[u];
-                    if (grid_index < 0) {
-                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
-                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
-                    }
-                }
-                float sumqx = 0, sumq2 = 0;
-                for (int i = 0; i < 16; ++i) {
-                    float w = weight[i];
-                    float q = 2*L[i] + 1;
-                    sumqx += w*xval[i]*q;
-                    sumq2 += w*q*q;
-                }
-                if (sumq2 > 0) scale = sumqx/sumq2;
-            }
-            if (scale < 0) {
-                scale = -scale;
-                for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
-            }
-            for (int k = 0; k < 2; ++k) {
-                uint16_t u = 0;
-                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
-                int grid_index = kmap_q2xs[u];
-                if (grid_index < 0) {
-                    printf("Oops: found point %u not on grid:", u);
-                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
-                    printf("\n");
-                    GGML_ASSERT(false);
-                }
-                const int i8 = 2*ib + k;
-                y[ibl].qs[i8] = grid_index & 255;
-                y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
-                y[ibl].qs[QK_K/8 + i8] = block_signs[k];
-            }
-            GGML_ASSERT(scale >= 0);
-            scales[ib] = scale;
-            max_scale = MAX(max_scale, scale);
-        }
-
-        if (!max_scale) {
-            continue;
-        }
-
-        float d = max_scale/31;
-        y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
-        float id = 1/d;
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            int l = nearest_int(0.5f*(id*scales[ib]-1));
-            l = MAX(0, MIN(15, l));
-            if (ib%2 == 0) y[ibl].scales[ib/2] = l;
-            else y[ibl].scales[ib/2] |= (l << 4);
-        }
-    }
-}
-
-size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
-    (void)hist;
-    GGML_ASSERT(n_per_row%QK_K == 0);
-    int nblock = n_per_row/QK_K;
-    char * qrow = (char *)dst;
-    for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
-        src += n_per_row;
-        qrow += nblock*sizeof(block_iq2_s);
-    }
-    return nrow * nblock * sizeof(block_iq2_s);
-}
-
-void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
-    assert(k % QK_K == 0);
-    quantize_iq2_s(x, y, 1, k, NULL, NULL);
-}
-
-void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_s * restrict y = vy;
-    quantize_row_iq2_s_reference(x, y, k);
-}
diff --git a/ggml-quants.h b/ggml-quants.h
deleted file mode 100644
index 316e35687..000000000
--- a/ggml-quants.h
+++ /dev/null
@@ -1,362 +0,0 @@
-#pragma once
-
-#include "ggml-impl.h"
-
-// GGML internal header
-
-#include <stdint.h>
-#include <stddef.h>
-
-#define QK4_0 32
-typedef struct {
-    ggml_fp16_t d;          // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-typedef struct {
-    ggml_fp16_t d;          // delta
-    ggml_fp16_t m;          // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    ggml_fp16_t m;         // min
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    int8_t  qs[QK8_0];     // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-typedef struct {
-    float d;               // delta
-    float s;               // d * sum(qs[i])
-    int8_t  qs[QK8_1];     // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
-
-//
-// Super-block quantization structures
-//
-
-// Super-block size
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-// 2-bit quantization
-// weight is represented as x = a * q + b
-// 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    ggml_fp16_t d;           // super-block scale for quantized scales
-    ggml_fp16_t dmin;        // super-block scale for quantized mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-// 3-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_fp16_t d;             // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[12];        // scales, quantized with 6 bits
-    ggml_fp16_t d;             // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
-
-// 4-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_fp16_t d[2];          // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    ggml_fp16_t d;             // super-block scale for quantized scales
-    ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-// 5-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_fp16_t d;               // super-block scale
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    ggml_fp16_t d;               // super-block scale for quantized scales
-    ggml_fp16_t dmin;            // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    ggml_fp16_t d;           // super-block scale
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
-
-// This is only used for intermediate quantization and dot products
-typedef struct {
-    float   d;              // delta
-    int8_t  qs[QK_K];       // quants
-    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
-
-// (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 2.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-// 2.3125 bpw quants
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-// 2.5625 bpw quants
-typedef struct {
-    ggml_fp16_t d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t scales[QK_K/32];
-} block_iq2_s;
-static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
-
-// (Almost) "true" 3-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 3.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_fp16_t d;
-    uint8_t qs[3*QK_K/8];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-// 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
-#define IQ3S_N_SCALE QK_K/64
-#endif
-typedef struct {
-    ggml_fp16_t d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t signs[QK_K/8];
-    uint8_t scales[IQ3S_N_SCALE];
-} block_iq3_s;
-static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
-
-typedef struct {
-    ggml_fp16_t d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
-} block_iq1_s;
-static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
-
-// Non-linear quants
-#define QK4_NL 32
-typedef struct {
-    ggml_fp16_t d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
-
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-//typedef struct block_iq4_nl block_iq4_xs;
-#else
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t scales_h;
-    uint8_t  scales_l[QK_K/64];
-    uint8_t  qs[QK_K/2];
-} block_iq4_xs;
-static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
-
-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int k);
-void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int k);
-void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int k);
-
-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-
-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-
-// Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-
-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-
-// Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-//
-// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-//
-size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq2_s  (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq1_s  (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq3_s  (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-
-void iq2xs_init_impl(enum ggml_type type);
-void iq2xs_free_impl(enum ggml_type type);
-void iq3xs_init_impl(int grid_size);
-void iq3xs_free_impl(int grid_size);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
deleted file mode 100644
index 6f391b0c6..000000000
--- a/ggml-sycl.cpp
+++ /dev/null
@@ -1,15198 +0,0 @@
-//
-// MIT license
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: MIT
-//
-
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-
-#include <algorithm>
-#include <assert.h>
-#include <atomic>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdint>
-#include <float.h>
-#include <limits>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-#include <cmath>
-#include <iostream>
-#include <fstream>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-
-#include <sycl/sycl.hpp>
-#include <sycl/half_type.hpp>
-
-#include "ggml-sycl.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
-/*
-Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
-*/
-// COPY from DPCT head files
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-#include <map>
-
-#if defined(__linux__)
-#include <sys/mman.h>
-#elif defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#else
-#error "Only support Windows and Linux."
-#endif
-
-#if defined(__linux__)
-#include <unistd.h>
-#include <sys/syscall.h>
-#endif
-#if defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-#define DPCT_COMPATIBILITY_TEMP (900)
-
-#if defined(_MSC_VER)
-#define __dpct_align__(n) __declspec(align(n))
-#define __dpct_inline__ __forceinline
-#else
-#define __dpct_align__(n) __attribute__((aligned(n)))
-#define __dpct_inline__ __inline__ __attribute__((always_inline))
-#endif
-
-#if defined(_MSC_VER)
-#define __dpct_noinline__ __declspec(noinline)
-#else
-#define __dpct_noinline__ __attribute__((noinline))
-#endif
-
-namespace dpct
-{
-    typedef sycl::queue *queue_ptr;
-    typedef sycl::event *event_ptr;
-    typedef char *device_ptr;
-    typedef uint8_t byte_t;
-    typedef sycl::buffer<byte_t> buffer_t;
-
-    /// SYCL default exception handler
-    inline auto exception_handler = [](sycl::exception_list exceptions)
-    {
-        for (std::exception_ptr const &e : exceptions)
-        {
-            try
-            {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e)
-            {
-                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
-                          << e.what() << std::endl
-                          << "Exception caught at file:" << __FILE__
-                          << ", line:" << __LINE__ << std::endl;
-            }
-        }
-    };
-
-    enum error_code
-    {
-        success = 0,
-        default_error = 999
-    };
-
-    enum memcpy_direction
-    {
-        host_to_host,
-        host_to_device,
-        device_to_host,
-        device_to_device,
-        automatic
-    };
-
-    enum memory_region
-    {
-        global = 0, // device global memory
-        constant,   // device constant memory
-        local,      // device local memory
-        shared,     // memory which can be accessed by host and device
-    };
-
-    enum class library_data_t : unsigned char
-    {
-        real_float = 0,
-        complex_float,
-        real_double,
-        complex_double,
-        real_half,
-        complex_half,
-        real_bfloat16,
-        complex_bfloat16,
-        real_int4,
-        complex_int4,
-        real_uint4,
-        complex_uint4,
-        real_int8,
-        complex_int8,
-        real_uint8,
-        complex_uint8,
-        real_int16,
-        complex_int16,
-        real_uint16,
-        complex_uint16,
-        real_int32,
-        complex_int32,
-        real_uint32,
-        complex_uint32,
-        real_int64,
-        complex_int64,
-        real_uint64,
-        complex_uint64,
-        real_int8_4,
-        real_int8_32,
-        real_uint8_4,
-        library_data_t_size
-    };
-
-    template <typename T>
-    struct DataType
-    {
-        using T2 = T;
-    };
-    template <typename T>
-    struct DataType<sycl::vec<T, 2>>
-    {
-        using T2 = std::complex<T>;
-    };
-
-    static void destroy_event(event_ptr event)
-    {
-        delete event;
-    }
-
-    static inline unsigned int get_tid()
-    {
-#if defined(__linux__)
-        return syscall(SYS_gettid);
-#elif defined(_WIN64)
-        return GetCurrentThreadId();
-#else
-#error "Only support Windows and Linux."
-#endif
-    }
-
-    namespace detail
-    {
-        static void get_version(const sycl::device &dev, int &major, int &minor)
-        {
-            // Version string has the following format:
-            // a. OpenCL<space><major.minor><space><vendor-specific-information>
-            // b. <major.minor>
-            std::string ver;
-            ver = dev.get_info<sycl::info::device::version>();
-            std::string::size_type i = 0;
-            while (i < ver.size())
-            {
-                if (isdigit(ver[i]))
-                    break;
-                i++;
-            }
-            major = std::stoi(&(ver[i]));
-            while (i < ver.size())
-            {
-                if (ver[i] == '.')
-                    break;
-                i++;
-            }
-            i++;
-            minor = std::stoi(&(ver[i]));
-        }
-
-        template <typename tag, typename T>
-        class generic_error_type
-        {
-        public:
-            generic_error_type() = default;
-            generic_error_type(T value) : value{value} {}
-            operator T() const { return value; }
-
-        private:
-            T value;
-        };
-
-    } // namespace detail
-
-    /// Pitched 2D/3D memory data.
-    class pitched_data
-    {
-    public:
-        pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-        pitched_data(void *data, size_t pitch, size_t x, size_t y)
-            : _data(data), _pitch(pitch), _x(x), _y(y) {}
-
-        void *get_data_ptr() { return _data; }
-        void set_data_ptr(void *data) { _data = data; }
-
-        size_t get_pitch() { return _pitch; }
-        void set_pitch(size_t pitch) { _pitch = pitch; }
-
-        size_t get_x() { return _x; }
-        void set_x(size_t x) { _x = x; };
-
-        size_t get_y() { return _y; }
-        void set_y(size_t y) { _y = y; }
-
-    private:
-        void *_data;
-        size_t _pitch, _x, _y;
-    };
-
-    class device_info
-    {
-    public:
-        // get interface
-        const char *get_name() const { return _name; }
-        char *get_name() { return _name; }
-        template <typename WorkItemSizesTy = sycl::range<3>,
-                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                       std::is_same_v<WorkItemSizesTy, int *>,
-                                   int> = 0>
-        auto get_max_work_item_sizes() const
-        {
-            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-                return sycl::range<3>(_max_work_item_sizes_i[0],
-                                      _max_work_item_sizes_i[1],
-                                      _max_work_item_sizes_i[2]);
-            else
-            {
-                return _max_work_item_sizes_i;
-            }
-        }
-        template <typename WorkItemSizesTy = sycl::range<3>,
-                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                       std::is_same_v<WorkItemSizesTy, int *>,
-                                   int> = 0>
-        auto get_max_work_item_sizes()
-        {
-            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-                return sycl::range<3>(_max_work_item_sizes_i[0],
-                                      _max_work_item_sizes_i[1],
-                                      _max_work_item_sizes_i[2]);
-            else
-            {
-                return _max_work_item_sizes_i;
-            }
-        }
-        bool get_host_unified_memory() const { return _host_unified_memory; }
-        int get_major_version() const { return _major; }
-        int get_minor_version() const { return _minor; }
-        int get_integrated() const { return _integrated; }
-        int get_max_clock_frequency() const { return _frequency; }
-        int get_max_compute_units() const { return _max_compute_units; }
-        int get_max_work_group_size() const { return _max_work_group_size; }
-        int get_max_sub_group_size() const { return _max_sub_group_size; }
-        int get_max_work_items_per_compute_unit() const
-        {
-            return _max_work_items_per_compute_unit;
-        }
-        int get_max_register_size_per_work_group() const
-        {
-            return _max_register_size_per_work_group;
-        }
-        template <typename NDRangeSizeTy = size_t *,
-                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                       std::is_same_v<NDRangeSizeTy, int *>,
-                                   int> = 0>
-        auto get_max_nd_range_size() const
-        {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-                return _max_nd_range_size;
-            else
-                return _max_nd_range_size_i;
-        }
-        template <typename NDRangeSizeTy = size_t *,
-                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                       std::is_same_v<NDRangeSizeTy, int *>,
-                                   int> = 0>
-        auto get_max_nd_range_size()
-        {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-                return _max_nd_range_size;
-            else
-                return _max_nd_range_size_i;
-        }
-        size_t get_global_mem_size() const { return _global_mem_size; }
-        size_t get_local_mem_size() const { return _local_mem_size; }
-        size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
-        /// Returns the maximum clock rate of device's global memory in kHz. If
-        /// compiler does not support this API then returns default value 3200000 kHz.
-        unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
-        /// Returns the maximum bus width between device and memory in bits. If
-        /// compiler does not support this API then returns default value 64 bits.
-        unsigned int get_memory_bus_width() const { return _memory_bus_width; }
-        uint32_t get_device_id() const { return _device_id; }
-        std::array<unsigned char, 16> get_uuid() const { return _uuid; }
-        /// Returns global memory cache size in bytes.
-        unsigned int get_global_mem_cache_size() const
-        {
-            return _global_mem_cache_size;
-        }
-
-        // set interface
-        void set_name(const char *name)
-        {
-            size_t length = strlen(name);
-            if (length < 256)
-            {
-                std::memcpy(_name, name, length + 1);
-            }
-            else
-            {
-                std::memcpy(_name, name, 255);
-                _name[255] = '\0';
-            }
-        }
-        void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
-        {
-            for (int i = 0; i < 3; ++i)
-                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-        }
-        [[deprecated]] void
-        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
-        {
-            for (int i = 0; i < 3; ++i)
-            {
-                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-            }
-        }
-        void set_host_unified_memory(bool host_unified_memory)
-        {
-            _host_unified_memory = host_unified_memory;
-        }
-        void set_major_version(int major) { _major = major; }
-        void set_minor_version(int minor) { _minor = minor; }
-        void set_integrated(int integrated) { _integrated = integrated; }
-        void set_max_clock_frequency(int frequency) { _frequency = frequency; }
-        void set_max_compute_units(int max_compute_units)
-        {
-            _max_compute_units = max_compute_units;
-        }
-        void set_global_mem_size(size_t global_mem_size)
-        {
-            _global_mem_size = global_mem_size;
-        }
-        void set_local_mem_size(size_t local_mem_size)
-        {
-            _local_mem_size = local_mem_size;
-        }
-        void set_max_mem_alloc_size(size_t max_mem_alloc_size)
-        {
-            _max_mem_alloc_size = max_mem_alloc_size;
-        }
-        void set_max_work_group_size(int max_work_group_size)
-        {
-            _max_work_group_size = max_work_group_size;
-        }
-        void set_max_sub_group_size(int max_sub_group_size)
-        {
-            _max_sub_group_size = max_sub_group_size;
-        }
-        void
-        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
-        {
-            _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
-        }
-        void set_max_nd_range_size(int max_nd_range_size[])
-        {
-            for (int i = 0; i < 3; i++)
-            {
-                _max_nd_range_size[i] = max_nd_range_size[i];
-                _max_nd_range_size_i[i] = max_nd_range_size[i];
-            }
-        }
-        void set_memory_clock_rate(unsigned int memory_clock_rate)
-        {
-            _memory_clock_rate = memory_clock_rate;
-        }
-        void set_memory_bus_width(unsigned int memory_bus_width)
-        {
-            _memory_bus_width = memory_bus_width;
-        }
-        void
-        set_max_register_size_per_work_group(int max_register_size_per_work_group)
-        {
-            _max_register_size_per_work_group = max_register_size_per_work_group;
-        }
-        void set_device_id(uint32_t device_id)
-        {
-            _device_id = device_id;
-        }
-        void set_uuid(std::array<unsigned char, 16> uuid)
-        {
-            _uuid = std::move(uuid);
-        }
-        void set_global_mem_cache_size(unsigned int global_mem_cache_size)
-        {
-            _global_mem_cache_size = global_mem_cache_size;
-        }
-
-    private:
-        char _name[256];
-        int _max_work_item_sizes_i[3];
-        bool _host_unified_memory = false;
-        int _major;
-        int _minor;
-        int _integrated = 0;
-        int _frequency;
-        // Set estimated value 3200000 kHz as default value.
-        unsigned int _memory_clock_rate = 3200000;
-        // Set estimated value 64 bits as default value.
-        unsigned int _memory_bus_width = 64;
-        unsigned int _global_mem_cache_size;
-        int _max_compute_units;
-        int _max_work_group_size;
-        int _max_sub_group_size;
-        int _max_work_items_per_compute_unit;
-        int _max_register_size_per_work_group;
-        size_t _global_mem_size;
-        size_t _local_mem_size;
-        size_t _max_mem_alloc_size;
-        size_t _max_nd_range_size[3];
-        int _max_nd_range_size_i[3];
-        uint32_t _device_id;
-        std::array<unsigned char, 16> _uuid;
-    };
-
-    static int get_major_version(const sycl::device &dev)
-    {
-        int major, minor;
-        detail::get_version(dev, major, minor);
-        return major;
-    }
-
-    static int get_minor_version(const sycl::device &dev)
-    {
-        int major, minor;
-        detail::get_version(dev, major, minor);
-        return minor;
-    }
-
-    static void get_device_info(device_info &out, const sycl::device &dev)
-    {
-        device_info prop;
-        prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
-
-        int major, minor;
-        detail::get_version(dev, major, minor);
-        prop.set_major_version(major);
-        prop.set_minor_version(minor);
-
-        prop.set_max_work_item_sizes(
-#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
-            // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
-            // is an enum class element
-            dev.get_info<sycl::info::device::max_work_item_sizes>());
-#else
-            // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
-            // an int
-            dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
-#endif
-        prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
-
-        prop.set_max_clock_frequency(
-            dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
-
-        prop.set_max_compute_units(
-            dev.get_info<sycl::info::device::max_compute_units>());
-        prop.set_max_work_group_size(
-            dev.get_info<sycl::info::device::max_work_group_size>());
-        prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
-        prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
-        prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
-
-#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
-        if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
-        {
-            unsigned int tmp =
-                dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
-            if (tmp != 0)
-                prop.set_memory_clock_rate(1000 * tmp);
-        }
-        if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
-        {
-            prop.set_memory_bus_width(
-                dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
-        }
-        if (dev.has(sycl::aspect::ext_intel_device_id))
-        {
-            prop.set_device_id(
-                dev.get_info<sycl::ext::intel::info::device::device_id>());
-        }
-        if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
-        {
-            prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
-        }
-#elif defined(_MSC_VER) && !defined(__clang__)
-#pragma message("get_device_info: querying memory_clock_rate and \
-        memory_bus_width are not supported by the compiler used. \
-        Use 3200000 kHz as memory_clock_rate default value. \
-        Use 64 bits as memory_bus_width default value.")
-#else
-#warning "get_device_info: querying memory_clock_rate and \
-        memory_bus_width are not supported by the compiler used. \
-        Use 3200000 kHz as memory_clock_rate default value. \
-        Use 64 bits as memory_bus_width default value."
-#endif
-
-        size_t max_sub_group_size = 1;
-        std::vector<size_t> sub_group_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-
-        for (const auto &sub_group_size : sub_group_sizes)
-        {
-            if (max_sub_group_size < sub_group_size)
-                max_sub_group_size = sub_group_size;
-        }
-
-        prop.set_max_sub_group_size(max_sub_group_size);
-
-        prop.set_max_work_items_per_compute_unit(
-            dev.get_info<sycl::info::device::max_work_group_size>());
-        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-        prop.set_max_nd_range_size(max_nd_range_size);
-
-        // Estimates max register size per work group, feel free to update the value
-        // according to device properties.
-        prop.set_max_register_size_per_work_group(65536);
-
-        prop.set_global_mem_cache_size(
-            dev.get_info<sycl::info::device::global_mem_cache_size>());
-        out = prop;
-    }
-
-    /// dpct device extension
-    class device_ext : public sycl::device
-    {
-        typedef std::mutex mutex_type;
-
-    public:
-        device_ext() : sycl::device(), _ctx(*this) {}
-        ~device_ext()
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            clear_queues();
-        }
-        device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            init_queues();
-        }
-
-        int is_native_atomic_supported() { return 0; }
-        int get_major_version() const
-        {
-            return dpct::get_major_version(*this);
-        }
-
-        int get_minor_version() const
-        {
-            return dpct::get_minor_version(*this);
-        }
-
-        int get_max_compute_units() const
-        {
-            return get_device_info().get_max_compute_units();
-        }
-
-        /// Return the maximum clock frequency of this device in KHz.
-        int get_max_clock_frequency() const
-        {
-            return get_device_info().get_max_clock_frequency();
-        }
-
-        int get_integrated() const { return get_device_info().get_integrated(); }
-
-        int get_max_sub_group_size() const
-        {
-            return get_device_info().get_max_sub_group_size();
-        }
-
-        int get_max_register_size_per_work_group() const
-        {
-            return get_device_info().get_max_register_size_per_work_group();
-        }
-
-        int get_max_work_group_size() const
-        {
-            return get_device_info().get_max_work_group_size();
-        }
-
-        int get_mem_base_addr_align() const
-        {
-            return get_info<sycl::info::device::mem_base_addr_align>();
-        }
-
-        size_t get_global_mem_size() const
-        {
-            return get_device_info().get_global_mem_size();
-        }
-
-        size_t get_max_mem_alloc_size() const
-        {
-            return get_device_info().get_max_mem_alloc_size();
-        }
-
-        /// Get the number of bytes of free and total memory on the SYCL device.
-        /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
-        /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-        void get_memory_info(size_t &free_memory, size_t &total_memory)
-        {
-#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
-            if (!has(sycl::aspect::ext_intel_free_memory))
-            {
-                std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-                free_memory = 0;
-            }
-            else
-            {
-                free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
-            }
-#else
-            std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-            free_memory = 0;
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma message("Querying the number of bytes of free memory is not supported")
-#else
-#warning "Querying the number of bytes of free memory is not supported"
-#endif
-#endif
-            total_memory = get_device_info().get_global_mem_size();
-        }
-
-        void get_device_info(device_info &out) const
-        {
-            dpct::get_device_info(out, *this);
-        }
-
-        device_info get_device_info() const
-        {
-            device_info prop;
-            dpct::get_device_info(prop, *this);
-            return prop;
-        }
-
-        void reset()
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            clear_queues();
-            init_queues();
-        }
-
-        sycl::queue &in_order_queue() { return *_q_in_order; }
-
-        sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
-
-        sycl::queue &default_queue()
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return out_of_order_queue();
-#else
-            return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        void queues_wait_and_throw()
-        {
-            std::unique_lock<mutex_type> lock(m_mutex);
-            std::vector<std::shared_ptr<sycl::queue>> current_queues(
-                _queues);
-            lock.unlock();
-            for (const auto &q : current_queues)
-            {
-                q->wait_and_throw();
-            }
-            // Guard the destruct of current_queues to make sure the ref count is safe.
-            lock.lock();
-        }
-
-        sycl::queue *create_queue(bool enable_exception_handler = false)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return create_out_of_order_queue(enable_exception_handler);
-#else
-            return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        sycl::queue *create_in_order_queue(bool enable_exception_handler = false)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(enable_exception_handler,
-                                     sycl::property::queue::in_order());
-        }
-
-        sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(enable_exception_handler);
-        }
-
-        void destroy_queue(sycl::queue *&queue)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                         [=](const std::shared_ptr<sycl::queue> &q) -> bool
-                                         {
-                                             return q.get() == queue;
-                                         }),
-                          _queues.end());
-            queue = nullptr;
-        }
-        void set_saved_queue(sycl::queue *q)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            _saved_queue = q;
-        }
-        sycl::queue *get_saved_queue() const
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return _saved_queue;
-        }
-        sycl::context get_context() const { return _ctx; }
-
-    private:
-        void clear_queues()
-        {
-            _queues.clear();
-            _q_in_order = _q_out_of_order = _saved_queue = nullptr;
-        }
-
-        void init_queues()
-        {
-            _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
-            _q_out_of_order = create_queue_impl(true);
-            _saved_queue = &default_queue();
-        }
-
-        /// Caller should acquire resource \p m_mutex before calling this function.
-        template <class... Properties>
-        sycl::queue *create_queue_impl(bool enable_exception_handler,
-                                       Properties... properties)
-        {
-            sycl::async_handler eh = {};
-            if (enable_exception_handler)
-            {
-                eh = exception_handler;
-            }
-            _queues.push_back(std::make_shared<sycl::queue>(
-                _ctx, *this, eh,
-                sycl::property_list(
-#ifdef DPCT_PROFILING_ENABLED
-                    sycl::property::queue::enable_profiling(),
-#endif
-                    properties...)));
-
-            return _queues.back().get();
-        }
-
-        void get_version(int &major, int &minor) const
-        {
-            detail::get_version(*this, major, minor);
-        }
-        sycl::queue *_q_in_order, *_q_out_of_order;
-        sycl::queue *_saved_queue;
-        sycl::context _ctx;
-        std::vector<std::shared_ptr<sycl::queue>> _queues;
-        mutable mutex_type m_mutex;
-    };
-
-    /// device manager
-    class dev_mgr
-    {
-    public:
-        device_ext &current_device()
-        {
-            unsigned int dev_id = current_device_id();
-            check_id(dev_id);
-            return *_devs[dev_id];
-        }
-        device_ext &cpu_device() const
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            if (_cpu_device == -1)
-            {
-                throw std::runtime_error("no valid cpu device");
-            }
-            else
-            {
-                return *_devs[_cpu_device];
-            }
-        }
-        device_ext &get_device(unsigned int id) const
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            check_id(id);
-            return *_devs[id];
-        }
-        unsigned int current_device_id() const
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            auto it = _thread2dev_map.find(get_tid());
-            if (it != _thread2dev_map.end())
-                return it->second;
-            return DEFAULT_DEVICE_ID;
-        }
-
-        /// Select device with a device ID.
-        /// \param [in] id The id of the device which can
-        /// be obtained through get_device_id(const sycl::device).
-        void select_device(unsigned int id)
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            check_id(id);
-            _thread2dev_map[get_tid()] = id;
-        }
-        unsigned int device_count() { return _devs.size(); }
-
-        unsigned int get_device_id(const sycl::device &dev)
-        {
-            unsigned int id = 0;
-            for (auto dev_item : _devs)
-            {
-                if (*dev_item == dev)
-                {
-                    break;
-                }
-                id++;
-            }
-            return id;
-        }
-
-        template <class DeviceSelector>
-        std::enable_if_t<
-            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
-        {
-            sycl::device selected_device = sycl::device(selector);
-            unsigned int selected_device_id = get_device_id(selected_device);
-            select_device(selected_device_id);
-        }
-
-        /// Returns the instance of device manager singleton.
-        static dev_mgr &instance()
-        {
-            static dev_mgr d_m;
-            return d_m;
-        }
-        dev_mgr(const dev_mgr &) = delete;
-        dev_mgr &operator=(const dev_mgr &) = delete;
-        dev_mgr(dev_mgr &&) = delete;
-        dev_mgr &operator=(dev_mgr &&) = delete;
-
-    private:
-        mutable std::recursive_mutex m_mutex;
-        dev_mgr()
-        {
-            sycl::device default_device =
-                sycl::device(sycl::default_selector_v);
-            _devs.push_back(std::make_shared<device_ext>(default_device));
-
-            std::vector<sycl::device> sycl_all_devs =
-                sycl::device::get_devices(sycl::info::device_type::all);
-            // Collect other devices except for the default device.
-            if (default_device.is_cpu())
-                _cpu_device = 0;
-            for (auto &dev : sycl_all_devs)
-            {
-                if (dev == default_device)
-                {
-                    continue;
-                }
-                _devs.push_back(std::make_shared<device_ext>(dev));
-                if (_cpu_device == -1 && dev.is_cpu())
-                {
-                    _cpu_device = _devs.size() - 1;
-                }
-            }
-        }
-        void check_id(unsigned int id) const
-        {
-            if (id >= _devs.size())
-            {
-                throw std::runtime_error("invalid device id");
-            }
-        }
-        std::vector<std::shared_ptr<device_ext>> _devs;
-        /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
-        /// thread id in _thread2dev_map, which means default device should be used
-        /// for the current thread.
-        const unsigned int DEFAULT_DEVICE_ID = 0;
-        /// thread-id to device-id map.
-        std::map<unsigned int, unsigned int> _thread2dev_map;
-        int _cpu_device = -1;
-    };
-
-    static inline sycl::queue &get_default_queue()
-    {
-        return dev_mgr::instance().current_device().default_queue();
-    }
-
-    namespace detail
-    {
-        enum class pointer_access_attribute
-        {
-            host_only = 0,
-            device_only,
-            host_device,
-            end
-        };
-
-        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
-                                                              const void *ptr)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return mem_mgr::instance().is_device_ptr(ptr)
-                       ? pointer_access_attribute::device_only
-                       : pointer_access_attribute::host_only;
-#else
-            switch (sycl::get_pointer_type(ptr, q.get_context()))
-            {
-            case sycl::usm::alloc::unknown:
-                return pointer_access_attribute::host_only;
-            case sycl::usm::alloc::device:
-                return pointer_access_attribute::device_only;
-            case sycl::usm::alloc::shared:
-            case sycl::usm::alloc::host:
-                return pointer_access_attribute::host_device;
-            }
-#endif
-        }
-
-        template <typename ArgT>
-        inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
-        {
-            static_assert((unsigned char)library_data_t::library_data_t_size <=
-                              std::numeric_limits<unsigned char>::max() &&
-                          "library_data_t size exceeds limit.");
-            static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
-            return (std::uint64_t)Val;
-        }
-
-        template <typename FirstT, typename... RestT>
-        inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
-                                                               RestT... RestVal)
-        {
-            static_assert((std::uint8_t)library_data_t::library_data_t_size <=
-                              std::numeric_limits<unsigned char>::max() &&
-                          "library_data_t size exceeds limit.");
-            static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
-            static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
-            return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
-        }
-
-        class mem_mgr
-        {
-            mem_mgr()
-            {
-                // Reserved address space, no real memory allocation happens here.
-#if defined(__linux__)
-                mapped_address_space =
-                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#elif defined(_WIN64)
-                mapped_address_space = (byte_t *)VirtualAlloc(
-                    NULL,               // NULL specified as the base address parameter
-                    mapped_region_size, // Size of allocation
-                    MEM_RESERVE,        // Allocate reserved pages
-                    PAGE_NOACCESS);     // Protection = no access
-#else
-#error "Only support Windows and Linux."
-#endif
-                next_free = mapped_address_space;
-            };
-
-        public:
-            using buffer_id_t = int;
-
-            struct allocation
-            {
-                buffer_t buffer;
-                byte_t *alloc_ptr;
-                size_t size;
-            };
-
-            ~mem_mgr()
-            {
-#if defined(__linux__)
-                munmap(mapped_address_space, mapped_region_size);
-#elif defined(_WIN64)
-                VirtualFree(mapped_address_space, 0, MEM_RELEASE);
-#else
-#error "Only support Windows and Linux."
-#endif
-            };
-
-            mem_mgr(const mem_mgr &) = delete;
-            mem_mgr &operator=(const mem_mgr &) = delete;
-            mem_mgr(mem_mgr &&) = delete;
-            mem_mgr &operator=(mem_mgr &&) = delete;
-
-            /// Allocate
-            void *mem_alloc(size_t size)
-            {
-                if (!size)
-                    return nullptr;
-                std::lock_guard<std::mutex> lock(m_mutex);
-                if (next_free + size > mapped_address_space + mapped_region_size)
-                {
-                    throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
-                }
-                // Allocation
-                sycl::range<1> r(size);
-                buffer_t buf(r);
-                allocation A{buf, next_free, size};
-                // Map allocation to device pointer
-                void *result = next_free;
-                m_map.emplace(next_free + size, A);
-                // Update pointer to the next free space.
-                next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
-
-                return result;
-            }
-
-            /// Deallocate
-            void mem_free(const void *ptr)
-            {
-                if (!ptr)
-                    return;
-                std::lock_guard<std::mutex> lock(m_mutex);
-                auto it = get_map_iterator(ptr);
-                m_map.erase(it);
-            }
-
-            /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-            allocation translate_ptr(const void *ptr)
-            {
-                std::lock_guard<std::mutex> lock(m_mutex);
-                auto it = get_map_iterator(ptr);
-                return it->second;
-            }
-
-            /// Check if the pointer represents device pointer or not.
-            bool is_device_ptr(const void *ptr) const
-            {
-                std::lock_guard<std::mutex> lock(m_mutex);
-                return (mapped_address_space <= ptr) &&
-                       (ptr < mapped_address_space + mapped_region_size);
-            }
-
-            /// Returns the instance of memory manager singleton.
-            static mem_mgr &instance()
-            {
-                static mem_mgr m;
-                return m;
-            }
-
-        private:
-            std::map<byte_t *, allocation> m_map;
-            mutable std::mutex m_mutex;
-            byte_t *mapped_address_space;
-            byte_t *next_free;
-            const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
-            const size_t alignment = 256;
-            /// This padding may be defined to some positive value to debug
-            /// out of bound accesses.
-            const size_t extra_padding = 0;
-
-            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
-            {
-                auto it = m_map.upper_bound((byte_t *)ptr);
-                if (it == m_map.end())
-                {
-                    // Not a virtual pointer.
-                    throw std::runtime_error("can not get buffer from non-virtual pointer");
-                }
-                const allocation &alloc = it->second;
-                if (ptr < alloc.alloc_ptr)
-                {
-                    // Out of bound.
-                    // This may happen if there's a gap between allocations due to alignment
-                    // or extra padding and pointer points to this gap.
-                    throw std::runtime_error("invalid virtual pointer");
-                }
-                return it;
-            }
-        };
-
-        template <class T, memory_region Memory, size_t Dimension>
-        class accessor;
-        template <memory_region Memory, class T = byte_t>
-        class memory_traits
-        {
-        public:
-            static constexpr sycl::access::target target =
-                sycl::access::target::device;
-            static constexpr sycl::access_mode mode =
-                (Memory == constant) ? sycl::access_mode::read
-                                     : sycl::access_mode::read_write;
-            static constexpr size_t type_size = sizeof(T);
-            using element_t =
-                typename std::conditional<Memory == constant, const T, T>::type;
-            using value_t = typename std::remove_cv<T>::type;
-            template <size_t Dimension = 1>
-            using accessor_t = typename std::conditional<
-                Memory == local, sycl::local_accessor<value_t, Dimension>,
-                sycl::accessor<T, Dimension, mode, target>>::type;
-            using pointer_t = T *;
-        };
-
-        static inline void *dpct_malloc(size_t size, sycl::queue &q)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
-            return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
-        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
-                                        sycl::queue &q)
-        {
-            pitch = PITCH_DEFAULT_ALIGN(x);
-            return dpct_malloc(pitch * y * z, q);
-        }
-
-        /**
-         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] dev_ptr Pointer to the virtual device memory address.
-         * @param [in] value The value to be set.
-         * @param [in] size Number of elements to be set to the value.
-         * @return An event representing the memset operation.
-         */
-        template <typename valueT>
-        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
-                                              valueT value, size_t size)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            assert(mm.is_device_ptr(dev_ptr));
-            auto alloc = mm.translate_ptr(dev_ptr);
-            size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-            return q.submit([&](sycl::handler &cgh)
-                            {
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    auto new_buffer = alloc.buffer.reinterpret<valueT>(
-        sycl::range<1>(alloc.size / sizeof(valueT)));
-    sycl::accessor<valueT, 1, sycl::access_mode::write,
-                sycl::access::target::device>
-        acc(new_buffer, cgh, r, o);
-    cgh.fill(acc, value); });
-#else
-            return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        /**
-         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] data Pointer to the pitched device memory region.
-         * @param [in] value The value to be set.
-         * @param [in] size 3D memory region by number of elements.
-         * @return An event list representing the memset operations.
-         */
-        template <typename valueT>
-        static inline std::vector<sycl::event>
-        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
-                    sycl::range<3> size)
-        {
-            std::vector<sycl::event> event_list;
-            size_t slice = data.get_pitch() * data.get_y();
-            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
-            for (size_t z = 0; z < size.get(2); ++z)
-            {
-                unsigned char *data_ptr = data_surface;
-                for (size_t y = 0; y < size.get(1); ++y)
-                {
-                    event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
-                    data_ptr += data.get_pitch();
-                }
-                data_surface += slice;
-            }
-            return event_list;
-        }
-
-        /**
-         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] ptr Pointer to the virtual device memory.
-         * @param [in] pitch The pitch size by number of elements, including padding.
-         * @param [in] val The value to be set.
-         * @param [in] x The width of memory region by number of elements.
-         * @param [in] y The height of memory region by number of elements.
-         * @return An event list representing the memset operations.
-         */
-        template <typename valueT>
-        static inline std::vector<sycl::event>
-        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
-                    size_t y)
-        {
-            return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
-                               sycl::range<3>(x, y, 1));
-        }
-
-        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
-                                                        const void *from_ptr,
-                                                        memcpy_direction dir)
-        {
-            switch (dir)
-            {
-            case memcpy_direction::host_to_host:
-            case memcpy_direction::host_to_device:
-            case memcpy_direction::device_to_host:
-            case memcpy_direction::device_to_device:
-                return dir;
-            case memcpy_direction::automatic:
-            {
-                // table[to_attribute][from_attribute]
-                static const memcpy_direction
-                    direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
-                                       {{memcpy_direction::host_to_host,
-                                         memcpy_direction::device_to_host,
-                                         memcpy_direction::host_to_host},
-                                        {memcpy_direction::host_to_device,
-                                         memcpy_direction::device_to_device,
-                                         memcpy_direction::device_to_device},
-                                        {memcpy_direction::host_to_host,
-                                         memcpy_direction::device_to_device,
-                                         memcpy_direction::device_to_device}};
-                return direction_table[static_cast<unsigned>(get_pointer_attribute(
-                    q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
-            }
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-        }
-
-        static sycl::event
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                    memcpy_direction direction,
-                    const std::vector<sycl::event> &dep_events = {})
-        {
-            if (!size)
-                return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-            switch (real_direction)
-            {
-            case host_to_host:
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-            case host_to_device:
-            {
-                auto alloc = mm.translate_ptr(to_ptr);
-                size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                        sycl::access::target::device>
-        acc(alloc.buffer, cgh, r, o);
-    cgh.copy(from_ptr, acc); });
-            }
-            case device_to_host:
-            {
-                auto alloc = mm.translate_ptr(from_ptr);
-                size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                        sycl::access::target::device>
-        acc(alloc.buffer, cgh, r, o);
-    cgh.copy(acc, to_ptr); });
-            }
-            case device_to_device:
-            {
-                auto to_alloc = mm.translate_ptr(to_ptr);
-                auto from_alloc = mm.translate_ptr(from_ptr);
-                size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    auto r = sycl::range<1>(size);
-    auto to_o = sycl::id<1>(to_offset);
-    auto from_o = sycl::id<1>(from_offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                        sycl::access::target::device>
-        to_acc(to_alloc.buffer, cgh, r, to_o);
-    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                        sycl::access::target::device>
-        from_acc(from_alloc.buffer, cgh, r, from_o);
-    cgh.copy(from_acc, to_acc); });
-            }
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-#else
-            return q.memcpy(to_ptr, from_ptr, size, dep_events);
-            GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        // Get actual copy range and make sure it will not exceed range.
-        static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                            size_t pitch)
-        {
-            return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-        }
-
-        static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                        size_t pitch)
-        {
-            return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-        }
-
-        /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-        /// and \p from_range to another specified by \p to_ptr and \p to_range.
-        static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                    sycl::range<3> to_range, sycl::range<3> from_range,
-                    sycl::id<3> to_id, sycl::id<3> from_id,
-                    sycl::range<3> size, memcpy_direction direction,
-                    const std::vector<sycl::event> &dep_events = {})
-        {
-            // RAII for host pointer
-            class host_buffer
-            {
-                void *_buf;
-                size_t _size;
-                sycl::queue &_q;
-                const std::vector<sycl::event> &_deps; // free operation depends
-
-            public:
-                host_buffer(size_t size, sycl::queue &q,
-                            const std::vector<sycl::event> &deps)
-                    : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-                void *get_ptr() const { return _buf; }
-                size_t get_size() const { return _size; }
-                ~host_buffer()
-                {
-                    if (_buf)
-                    {
-                        _q.submit([&](sycl::handler &cgh)
-                                  {
-        cgh.depends_on(_deps);
-        cgh.host_task([buf = _buf] { std::free(buf); }); });
-                    }
-                }
-            };
-            std::vector<sycl::event> event_list;
-
-            size_t to_slice = to_range.get(1) * to_range.get(0),
-                   from_slice = from_range.get(1) * from_range.get(0);
-            unsigned char *to_surface =
-                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-            const unsigned char *from_surface =
-                (const unsigned char *)from_ptr +
-                get_offset(from_id, from_slice, from_range.get(0));
-
-            if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
-            {
-                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                    direction, dep_events)};
-            }
-            direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-            size_t size_slice = size.get(1) * size.get(0);
-            switch (direction)
-            {
-            case host_to_host:
-                for (size_t z = 0; z < size.get(2); ++z)
-                {
-                    unsigned char *to_ptr = to_surface;
-                    const unsigned char *from_ptr = from_surface;
-                    if (to_range.get(0) == from_range.get(0) &&
-                        to_range.get(0) == size.get(0))
-                    {
-                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                                         direction, dep_events));
-                    }
-                    else
-                    {
-                        for (size_t y = 0; y < size.get(1); ++y)
-                        {
-                            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                                             direction, dep_events));
-                            to_ptr += to_range.get(0);
-                            from_ptr += from_range.get(0);
-                        }
-                    }
-                    to_surface += to_slice;
-                    from_surface += from_slice;
-                }
-                break;
-            case host_to_device:
-            {
-                host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                                event_list);
-                std::vector<sycl::event> host_events;
-                if (to_slice == size_slice)
-                {
-                    // Copy host data to a temp host buffer with the shape of target.
-                    host_events =
-                        dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                                    host_to_host, dep_events);
-                }
-                else
-                {
-                    // Copy host data to a temp host buffer with the shape of target.
-                    host_events = dpct_memcpy(
-                        q, buf.get_ptr(), from_surface, to_range, from_range,
-                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-                        // If has padding data, not sure whether it is useless. So fill temp
-                        // buffer with it.
-                        std::vector<sycl::event>{
-                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                                        device_to_host, dep_events)});
-                }
-                // Copy from temp host buffer to device with only one submit.
-                event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                                 buf.get_size(), host_to_device,
-                                                 host_events));
-                break;
-            }
-            case device_to_host:
-            {
-                host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                                event_list);
-                // Copy from host temp buffer to host target with reshaping.
-                event_list = dpct_memcpy(
-                    q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-                    sycl::id<3>(0, 0, 0), size, host_to_host,
-                    // Copy from device to temp host buffer with only one submit.
-                    std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                         buf.get_size(),
-                                                         device_to_host, dep_events)});
-                break;
-            }
-            case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-            {
-                auto &mm = mem_mgr::instance();
-                auto to_alloc = mm.translate_ptr(to_surface);
-                auto from_alloc = mm.translate_ptr(from_surface);
-                size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-                event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                              {
-    cgh.depends_on(dep_events);
-    auto to_o = sycl::id<1>(to_offset);
-    auto from_o = sycl::id<1>(from_offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                        sycl::access::target::device>
-        to_acc(to_alloc.buffer, cgh,
-                get_copy_range(size, to_slice, to_range.get(0)), to_o);
-    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                        sycl::access::target::device>
-        from_acc(from_alloc.buffer, cgh,
-                get_copy_range(size, from_slice, from_range.get(0)), from_o);
-    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-        size,
-        [=](sycl::id<3> id) {
-            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                from_acc[get_offset(id, from_slice, from_range.get(0))];
-        }); }));
-            }
-#else
-                event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                              {
-    cgh.depends_on(dep_events);
-    cgh.parallel_for<class dpct_memcpy_3d_detail>(
-        size,
-        [=](sycl::id<3> id) {
-            to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                from_surface[get_offset(id, from_slice, from_range.get(0))];
-        }); }));
-#endif
-            break;
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-            return event_list;
-        }
-
-        /// memcpy 2D/3D matrix specified by pitched_data.
-        static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                    memcpy_direction direction = automatic)
-        {
-            return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                               size, direction);
-        }
-
-        /// memcpy 2D matrix with pitch.
-        static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                    memcpy_direction direction = automatic)
-        {
-            return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                               sycl::range<3>(from_pitch, y, 1),
-                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                               sycl::range<3>(x, y, 1), direction);
-        }
-
-        namespace deprecated
-        {
-
-            template <typename T, sycl::usm::alloc AllocKind>
-            class usm_allocator
-            {
-            private:
-                using Alloc = sycl::usm_allocator<T, AllocKind>;
-                Alloc _impl;
-
-            public:
-                using value_type = typename std::allocator_traits<Alloc>::value_type;
-                using pointer = typename std::allocator_traits<Alloc>::pointer;
-                using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
-                using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
-                using const_void_pointer =
-                    typename std::allocator_traits<Alloc>::const_void_pointer;
-                using reference = typename std::allocator_traits<Alloc>::value_type &;
-                using const_reference =
-                    const typename std::allocator_traits<Alloc>::value_type &;
-                using difference_type =
-                    typename std::allocator_traits<Alloc>::difference_type;
-                using size_type = typename std::allocator_traits<Alloc>::size_type;
-                using propagate_on_container_copy_assignment = typename std::allocator_traits<
-                    Alloc>::propagate_on_container_copy_assignment;
-                using propagate_on_container_move_assignment = typename std::allocator_traits<
-                    Alloc>::propagate_on_container_move_assignment;
-                using propagate_on_container_swap =
-                    typename std::allocator_traits<Alloc>::propagate_on_container_swap;
-                using is_always_equal =
-                    typename std::allocator_traits<Alloc>::is_always_equal;
-
-                template <typename U>
-                struct rebind
-                {
-                    typedef usm_allocator<U, AllocKind> other;
-                };
-
-                usm_allocator() : _impl(dpct::get_default_queue()) {}
-                ~usm_allocator() {}
-                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
-                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
-                pointer address(reference r) { return &r; }
-                const_pointer address(const_reference r) { return &r; }
-                pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
-                {
-                    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
-                }
-                void deallocate(pointer p, size_type cnt)
-                {
-                    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
-                }
-                size_type max_size() const
-                {
-                    return std::allocator_traits<Alloc>::max_size(_impl);
-                }
-                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
-                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
-            };
-
-        } // namespace deprecated
-
-        inline void dpct_free(void *ptr,
-                              const sycl::queue &q)
-        {
-            if (ptr)
-            {
-#ifdef DPCT_USM_LEVEL_NONE
-                detail::mem_mgr::instance().mem_free(ptr);
-#else
-                sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-            }
-        }
-
-        template <typename T>
-        inline auto get_memory(const void *x)
-        {
-            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-            return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
-            return new_x;
-#endif
-        }
-
-        template <typename T>
-        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
-        {
-            using Ty = typename DataType<T>::T2;
-            Ty s_h;
-            if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
-                detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
-                    .wait();
-            else
-                s_h = *reinterpret_cast<const Ty *>(s);
-            return s_h;
-        }
-
-    } // namespace detail
-
-    template <typename T>
-    inline auto get_value(const T *s, sycl::queue &q)
-    {
-        return detail::get_value(s, q);
-    }
-
-    namespace detail
-    {
-        template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                              oneapi::mkl::transpose b_trans, int m, int n, int k,
-                              const void *alpha, const void *a, int lda, const void *b,
-                              int ldb, const void *beta, void *c, int ldc)
-        {
-#ifndef __INTEL_MKL__
-            GGML_UNUSED(q);
-            GGML_UNUSED(a_trans);
-            GGML_UNUSED(b_trans);
-            GGML_UNUSED(m);
-            GGML_UNUSED(n);
-            GGML_UNUSED(k);
-            GGML_UNUSED(alpha);
-            GGML_UNUSED(a);
-            GGML_UNUSED(lda);
-            GGML_UNUSED(b);
-            GGML_UNUSED(ldb);
-            GGML_UNUSED(beta);
-            GGML_UNUSED(c);
-            GGML_UNUSED(ldc);
-            throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                                     "Project does not support this API.");
-#else
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-            auto data_a = get_memory<const Ta>(a);
-            auto data_b = get_memory<const Tb>(b);
-            auto data_c = get_memory<Tc>(c);
-            oneapi::mkl::blas::column_major::gemm(
-                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-                data_b, ldb, beta_value, data_c, ldc);
-#endif
-        }
-
-        template <typename VecT, class BinaryOperation, class = void>
-        class vectorized_binary
-        {
-        public:
-            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
-            {
-                VecT v4;
-                for (size_t i = 0; i < v4.size(); ++i)
-                {
-                    v4[i] = binary_op(a[i], b[i]);
-                }
-                return v4;
-            }
-        };
-
-        template <typename VecT, class BinaryOperation>
-        class vectorized_binary<
-            VecT, BinaryOperation,
-            std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
-        {
-        public:
-            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
-            {
-                return binary_op(a, b).template as<VecT>();
-            }
-        };
-
-        template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
-                                    const void *alpha, const void **a, int lda,
-                                    const void **b, int ldb, const void *beta, void **c,
-                                    int ldc, int batch_size)
-        {
-            struct matrix_info_t
-            {
-                oneapi::mkl::transpose transpose_info[2];
-                Ts value_info[2];
-                std::int64_t size_info[3];
-                std::int64_t ld_info[3];
-                std::int64_t groupsize_info;
-            };
-
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-
-            matrix_info_t *matrix_info =
-                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
-            matrix_info->transpose_info[0] = a_trans;
-            matrix_info->transpose_info[1] = b_trans;
-            matrix_info->value_info[0] = alpha_value;
-            matrix_info->value_info[1] = beta_value;
-            matrix_info->size_info[0] = m;
-            matrix_info->size_info[1] = n;
-            matrix_info->size_info[2] = k;
-            matrix_info->ld_info[0] = lda;
-            matrix_info->ld_info[1] = ldb;
-            matrix_info->ld_info[2] = ldc;
-            matrix_info->groupsize_info = batch_size;
-
-            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-                q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
-                matrix_info->size_info, matrix_info->size_info + 1,
-                matrix_info->size_info + 2, matrix_info->value_info,
-                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
-                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
-                matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
-
-            q.submit([&](sycl::handler &cgh)
-                     {
-    cgh.depends_on(e);
-    cgh.host_task([=] { std::free(matrix_info); }); });
-        }
-
-        template <class Ta, class Tb, class Tc, class Ts>
-        inline void
-        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                        oneapi::mkl::transpose b_trans, int m, int n,
-                        int k, const void *alpha, const void *a, int lda,
-                        long long int stride_a, const void *b, int ldb,
-                        long long int stride_b, const void *beta, void *c,
-                        int ldc, long long int stride_c, int batch_size)
-        {
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-            auto data_a = get_memory<const Ta>(a);
-            auto data_b = get_memory<const Tb>(b);
-            auto data_c = get_memory<Tc>(c);
-            oneapi::mkl::blas::column_major::gemm_batch(
-                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-                stride_a, data_b, ldb, stride_b, beta_value,
-                data_c, ldc, stride_c, batch_size);
-        }
-
-    } // namespace detail
-
-    template <typename VecT, class BinaryOperation>
-    inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                      const BinaryOperation binary_op)
-    {
-        sycl::vec<unsigned, 1> v0{a}, v1{b};
-        auto v2 = v0.as<VecT>();
-        auto v3 = v1.as<VecT>();
-        auto v4 =
-            detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
-        v0 = v4.template as<sycl::vec<unsigned, 1>>();
-        return v0;
-    }
-
-    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                                  memcpy_direction direction = automatic,
-                                  sycl::queue &q = dpct::get_default_queue())
-    {
-        detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
-    }
-
-    static inline unsigned int select_device(unsigned int id)
-    {
-        dev_mgr::instance().select_device(id);
-        return id;
-    }
-
-    template <typename T>
-    T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-                               unsigned int logical_sub_group_size = 32)
-    {
-        unsigned int id = g.get_local_linear_id();
-        unsigned int start_index =
-            id / logical_sub_group_size * logical_sub_group_size;
-        unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
-        return sycl::select_from_group(g, x,
-                                       target_offset < logical_sub_group_size
-                                           ? start_index + target_offset
-                                           : id);
-    }
-
-    template <typename T>
-    sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
-    {
-        return sycl::vec<T, 1>(val)
-            .template as<sycl::vec<
-                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-            .template convert<T>();
-    }
-
-    template <typename T1, typename T2>
-    using dot_product_acc_t =
-        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                           uint32_t, int32_t>;
-
-    template <typename T1, typename T2, typename T3>
-    inline auto dp4a(T1 a, T2 b, T3 c)
-    {
-        dot_product_acc_t<T1, T2> res = c;
-        auto va = extract_and_sign_or_zero_extend4(a);
-        auto vb = extract_and_sign_or_zero_extend4(b);
-        res += va[0] * vb[0];
-        res += va[1] * vb[1];
-        res += va[2] * vb[2];
-        res += va[3] * vb[3];
-        return res;
-    }
-
-    struct sub_sat
-    {
-        template <typename T>
-        auto operator()(const T x, const T y) const
-        {
-            return sycl::sub_sat(x, y);
-        }
-    };
-
-    template <typename S, typename T>
-    inline T vectorized_min(T a, T b)
-    {
-        sycl::vec<T, 1> v0{a}, v1{b};
-        auto v2 = v0.template as<S>();
-        auto v3 = v1.template as<S>();
-        auto v4 = sycl::min(v2, v3);
-        v0 = v4.template as<sycl::vec<T, 1>>();
-        return v0;
-    }
-
-    inline float pow(const float a, const int b) { return sycl::pown(a, b); }
-    inline double pow(const double a, const int b) { return sycl::pown(a, b); }
-    inline float pow(const float a, const float b) { return sycl::pow(a, b); }
-    inline double pow(const double a, const double b) { return sycl::pow(a, b); }
-    template <typename T, typename U>
-    inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
-    pow(const T a, const U b)
-    {
-        return sycl::pow(a, static_cast<T>(b));
-    }
-    template <typename T, typename U>
-    inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
-    pow(const T a, const U b)
-    {
-        return sycl::pow(static_cast<double>(a), static_cast<double>(b));
-    }
-
-    inline double min(const double a, const float b)
-    {
-        return sycl::fmin(a, static_cast<double>(b));
-    }
-    inline double min(const float a, const double b)
-    {
-        return sycl::fmin(static_cast<double>(a), b);
-    }
-    inline float min(const float a, const float b) { return sycl::fmin(a, b); }
-    inline double min(const double a, const double b) { return sycl::fmin(a, b); }
-    inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
-    {
-        return sycl::min(a, static_cast<std::uint32_t>(b));
-    }
-    inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
-    {
-        return sycl::min(static_cast<std::uint32_t>(a), b);
-    }
-    inline std::int32_t min(const std::int32_t a, const std::int32_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
-    {
-        return sycl::min(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
-    {
-        return sycl::min(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::int64_t min(const std::int64_t a, const std::int64_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
-    {
-        return sycl::min(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
-    {
-        return sycl::min(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
-    {
-        return sycl::min(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
-    {
-        return sycl::min(static_cast<std::uint64_t>(a), b);
-    }
-    // max function overloads.
-    // For floating-point types, `float` or `double` arguments are acceptable.
-    // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-    // `std::int64_t` type arguments are acceptable.
-    inline double max(const double a, const float b)
-    {
-        return sycl::fmax(a, static_cast<double>(b));
-    }
-    inline double max(const float a, const double b)
-    {
-        return sycl::fmax(static_cast<double>(a), b);
-    }
-    inline float max(const float a, const float b) { return sycl::fmax(a, b); }
-    inline double max(const double a, const double b) { return sycl::fmax(a, b); }
-    inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
-    {
-        return sycl::max(a, static_cast<std::uint32_t>(b));
-    }
-    inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
-    {
-        return sycl::max(static_cast<std::uint32_t>(a), b);
-    }
-    inline std::int32_t max(const std::int32_t a, const std::int32_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
-    {
-        return sycl::max(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
-    {
-        return sycl::max(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::int64_t max(const std::int64_t a, const std::int64_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
-    {
-        return sycl::max(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
-    {
-        return sycl::max(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
-    {
-        return sycl::max(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
-    {
-        return sycl::max(static_cast<std::uint64_t>(a), b);
-    }
-
-    inline void
-    has_capability_or_fail(const sycl::device &dev,
-                           const std::initializer_list<sycl::aspect> &props)
-    {
-        for (const auto &it : props)
-        {
-            if (dev.has(it))
-                continue;
-            switch (it)
-            {
-            case sycl::aspect::fp64:
-                throw std::runtime_error("'double' is not supported in '" +
-                                         dev.get_info<sycl::info::device::name>() +
-                                         "' device");
-                break;
-            case sycl::aspect::fp16:
-                throw std::runtime_error("'half' is not supported in '" +
-                                         dev.get_info<sycl::info::device::name>() +
-                                         "' device");
-                break;
-            default:
-#define __SYCL_ASPECT(ASPECT, ID) \
-    case sycl::aspect::ASPECT:    \
-        return #ASPECT;
-#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
-#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
-                auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
-                {
-                    switch (AspectNum)
-                    {
-#include <sycl/info/aspects.def>
-#include <sycl/info/aspects_deprecated.def>
-                    default:
-                        return "unknown aspect";
-                    }
-                };
-#undef __SYCL_ASPECT_DEPRECATED_ALIAS
-#undef __SYCL_ASPECT_DEPRECATED
-#undef __SYCL_ASPECT
-                throw std::runtime_error(
-                    "'" + getAspectNameStr(it) + "' is not supported in '" +
-                    dev.get_info<sycl::info::device::name>() + "' device");
-            }
-            break;
-        }
-    }
-
-    static inline unsigned int get_current_device_id()
-    {
-        return dev_mgr::instance().current_device_id();
-    }
-
-    static inline device_ext &get_current_device()
-    {
-        return dev_mgr::instance().current_device();
-    }
-
-    static inline sycl::queue &get_in_order_queue()
-    {
-        return dev_mgr::instance().current_device().in_order_queue();
-    }
-
-    static sycl::event
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
-    {
-        if (!size)
-            return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-        auto &mm = mem_mgr::instance();
-        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-        switch (real_direction)
-        {
-        case host_to_host:
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-        case host_to_device:
-        {
-            auto alloc = mm.translate_ptr(to_ptr);
-            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        auto r = sycl::range<1>(size);
-        auto o = sycl::id<1>(offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                            sycl::access::target::device>
-            acc(alloc.buffer, cgh, r, o);
-        cgh.copy(from_ptr, acc); });
-        }
-        case device_to_host:
-        {
-            auto alloc = mm.translate_ptr(from_ptr);
-            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        auto r = sycl::range<1>(size);
-        auto o = sycl::id<1>(offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                            sycl::access::target::device>
-            acc(alloc.buffer, cgh, r, o);
-        cgh.copy(acc, to_ptr); });
-        }
-        case device_to_device:
-        {
-            auto to_alloc = mm.translate_ptr(to_ptr);
-            auto from_alloc = mm.translate_ptr(from_ptr);
-            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        auto r = sycl::range<1>(size);
-        auto to_o = sycl::id<1>(to_offset);
-        auto from_o = sycl::id<1>(from_offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                            sycl::access::target::device>
-            to_acc(to_alloc.buffer, cgh, r, to_o);
-        sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                            sycl::access::target::device>
-            from_acc(from_alloc.buffer, cgh, r, from_o);
-        cgh.copy(from_acc, to_acc); });
-        }
-        default:
-            throw std::runtime_error("dpct_memcpy: invalid direction value");
-        }
-#else
-        return q.memcpy(to_ptr, from_ptr, size, dep_events);
-        GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-    // Get actual copy range and make sure it will not exceed range.
-    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                        size_t pitch)
-    {
-        return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-    }
-
-    static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                    size_t pitch)
-    {
-        return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-    }
-
-    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-    /// and \p from_range to another specified by \p to_ptr and \p to_range.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                sycl::range<3> to_range, sycl::range<3> from_range,
-                sycl::id<3> to_id, sycl::id<3> from_id,
-                sycl::range<3> size, memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
-    {
-        // RAII for host pointer
-        class host_buffer
-        {
-            void *_buf;
-            size_t _size;
-            sycl::queue &_q;
-            const std::vector<sycl::event> &_deps; // free operation depends
-
-        public:
-            host_buffer(size_t size, sycl::queue &q,
-                        const std::vector<sycl::event> &deps)
-                : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-            void *get_ptr() const { return _buf; }
-            size_t get_size() const { return _size; }
-            ~host_buffer()
-            {
-                if (_buf)
-                {
-                    _q.submit([&](sycl::handler &cgh)
-                              {
-            cgh.depends_on(_deps);
-            cgh.host_task([buf = _buf] { std::free(buf); }); });
-                }
-            }
-        };
-        std::vector<sycl::event> event_list;
-
-        size_t to_slice = to_range.get(1) * to_range.get(0),
-               from_slice = from_range.get(1) * from_range.get(0);
-        unsigned char *to_surface =
-            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-        const unsigned char *from_surface =
-            (const unsigned char *)from_ptr +
-            get_offset(from_id, from_slice, from_range.get(0));
-
-        if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
-        {
-            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                direction, dep_events)};
-        }
-        direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-        size_t size_slice = size.get(1) * size.get(0);
-        switch (direction)
-        {
-        case host_to_host:
-            for (size_t z = 0; z < size.get(2); ++z)
-            {
-                unsigned char *to_ptr = to_surface;
-                const unsigned char *from_ptr = from_surface;
-                if (to_range.get(0) == from_range.get(0) &&
-                    to_range.get(0) == size.get(0))
-                {
-                    event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                                     direction, dep_events));
-                }
-                else
-                {
-                    for (size_t y = 0; y < size.get(1); ++y)
-                    {
-                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                                         direction, dep_events));
-                        to_ptr += to_range.get(0);
-                        from_ptr += from_range.get(0);
-                    }
-                }
-                to_surface += to_slice;
-                from_surface += from_slice;
-            }
-            break;
-        case host_to_device:
-        {
-            host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                            event_list);
-            std::vector<sycl::event> host_events;
-            if (to_slice == size_slice)
-            {
-                // Copy host data to a temp host buffer with the shape of target.
-                host_events =
-                    dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                                host_to_host, dep_events);
-            }
-            else
-            {
-                // Copy host data to a temp host buffer with the shape of target.
-                host_events = dpct_memcpy(
-                    q, buf.get_ptr(), from_surface, to_range, from_range,
-                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-                    // If has padding data, not sure whether it is useless. So fill temp
-                    // buffer with it.
-                    std::vector<sycl::event>{
-                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                                    device_to_host, dep_events)});
-            }
-            // Copy from temp host buffer to device with only one submit.
-            event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                             buf.get_size(), host_to_device,
-                                             host_events));
-            break;
-        }
-        case device_to_host:
-        {
-            host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                            event_list);
-            // Copy from host temp buffer to host target with reshaping.
-            event_list = dpct_memcpy(
-                q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-                sycl::id<3>(0, 0, 0), size, host_to_host,
-                // Copy from device to temp host buffer with only one submit.
-                std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                     buf.get_size(),
-                                                     device_to_host, dep_events)});
-            break;
-        }
-        case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-        {
-            auto &mm = mem_mgr::instance();
-            auto to_alloc = mm.translate_ptr(to_surface);
-            auto from_alloc = mm.translate_ptr(from_surface);
-            size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-        cgh.depends_on(dep_events);
-        auto to_o = sycl::id<1>(to_offset);
-        auto from_o = sycl::id<1>(from_offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                            sycl::access::target::device>
-            to_acc(to_alloc.buffer, cgh,
-                    get_copy_range(size, to_slice, to_range.get(0)), to_o);
-        sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                            sycl::access::target::device>
-            from_acc(from_alloc.buffer, cgh,
-                    get_copy_range(size, from_slice, from_range.get(0)), from_o);
-        cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-            size,
-            [=](sycl::id<3> id) {
-                to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                    from_acc[get_offset(id, from_slice, from_range.get(0))];
-            }); }));
-        }
-#else
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-        cgh.depends_on(dep_events);
-        cgh.parallel_for<class dpct_memcpy_3d_detail>(
-            size,
-            [=](sycl::id<3> id) {
-                to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                    from_surface[get_offset(id, from_slice, from_range.get(0))];
-            }); }));
-#endif
-        break;
-        default:
-            throw std::runtime_error("dpct_memcpy: invalid direction value");
-        }
-        return event_list;
-    }
-
-    /// memcpy 2D/3D matrix specified by pitched_data.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                memcpy_direction direction = automatic)
-    {
-        return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                           size, direction);
-    }
-
-    /// memcpy 2D matrix with pitch.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                memcpy_direction direction = automatic)
-    {
-        return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                           sycl::range<3>(from_pitch, y, 1),
-                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                           sycl::range<3>(x, y, 1), direction);
-    }
-
-    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                     oneapi::mkl::transpose b_trans, int m, int n, int k,
-                     const void *alpha, const void *a, library_data_t a_type,
-                     int lda, const void *b, library_data_t b_type, int ldb,
-                     const void *beta, void *c, library_data_t c_type, int ldc,
-                     library_data_t scaling_type)
-    {
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
-        std::uint64_t key =
-            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-        switch (key)
-        {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_impl<std::complex<float>, std::complex<float>,
-                              std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_impl<std::complex<double>, std::complex<double>,
-                              std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                                          lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                                     ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                                          a, lda, b, ldb, &beta_half, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                              oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-        {
-            float alpha_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-            float beta_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
-                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
-        }
-    } // gemm()
-
-    /// Computes a batch of matrix-matrix product with general matrices.
-    /// \param [in] q The queue where the routine should be executed.
-    /// \param [in] a_trans Specifies the operation applied to A.
-    /// \param [in] b_trans Specifies the operation applied to B.
-    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-    /// \param [in] alpha Scaling factor for the matrix-matrix product.
-    /// \param [in] a Input matrix A.
-    /// \param [in] a_type Data type of the matrix A.
-    /// \param [in] lda Leading dimension of A.
-    /// \param [in] b Input matrix B.
-    /// \param [in] b_type Data type of the matrix B.
-    /// \param [in] ldb Leading dimension of B.
-    /// \param [in] beta Scaling factor for matrix C.
-    /// \param [in, out] c Input/Output matrix C.
-    /// \param [in] c_type Data type of the matrix C.
-    /// \param [in] ldc Leading dimension of C.
-    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-    /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a[],
-                           library_data_t a_type, int lda, const void *b[],
-                           library_data_t b_type, int ldb, const void *beta,
-                           void *c[], library_data_t c_type, int ldc,
-                           int batch_size, library_data_t scaling_type)
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-        throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
-        std::uint64_t key =
-            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-        switch (key)
-        {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, b, ldb, beta, c, ldc,
-                                                batch_size);
-            break;
-        }
-#ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           b, ldb, beta, c, ldc, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-        {
-            float alpha_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-            float beta_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                           a, lda, b, ldb, &beta_float, c, ldc,
-                                           batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-#endif
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-                batch_size);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
-        }
-#endif
-    }
-
-    /// Computes a batch of matrix-matrix product with general matrices.
-    /// \param [in] q The queue where the routine should be executed.
-    /// \param [in] a_trans Specifies the operation applied to A.
-    /// \param [in] b_trans Specifies the operation applied to B.
-    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-    /// \param [in] alpha Scaling factor for the matrix-matrix product.
-    /// \param [in] a Input matrix A.
-    /// \param [in] a_type Data type of the matrix A.
-    /// \param [in] lda Leading dimension of A.
-    /// \param [in] stride_a Stride between the different A matrices.
-    /// \param [in] b Input matrix B.
-    /// \param [in] b_type Data type of the matrix B.
-    /// \param [in] ldb Leading dimension of B.
-    /// \param [in] stride_b Stride between the different B matrices.
-    /// \param [in] beta Scaling factor for matrix C.
-    /// \param [in, out] c Input/Output matrix C.
-    /// \param [in] c_type Data type of the matrix C.
-    /// \param [in] ldc Leading dimension of C.
-    /// \param [in] stride_c Stride between the different C matrices.
-    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-    /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a, library_data_t a_type,
-                           int lda, long long int stride_a, const void *b,
-                           library_data_t b_type, int ldb, long long int stride_b,
-                           const void *beta, void *c, library_data_t c_type,
-                           int ldc, long long int stride_c, int batch_size,
-                           library_data_t scaling_type)
-    {
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
-        std::uint64_t key =
-            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-        switch (key)
-        {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, stride_a, b, ldb, stride_b,
-                                                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-#ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           stride_a, b, ldb, stride_b, beta, c, ldc,
-                                           stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                                                  a, lda, stride_a, b, ldb, stride_b,
-                                                  beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-#endif
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
-                &beta_half, c, ldc, stride_c, batch_size);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
-        }
-    }
-
-    static inline void
-    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-                      size_t from_pitch, size_t x, size_t y,
-                      memcpy_direction direction = automatic,
-                      sycl::queue &q = get_default_queue())
-    {
-        detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
-                            direction);
-    }
-
-    using err0 = detail::generic_error_type<struct err0_tag, int>;
-    using err1 = detail::generic_error_type<struct err1_tag, int>;
-
-} // COPY from DPCT head files
-
-
-static int g_ggml_sycl_debug=0;
-#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
-
-#define CHECK_TRY_ERROR(expr)                                                  \
-  [&]() {                                                                      \
-    try {                                                                      \
-      expr;                                                                    \
-      return dpct::success;                                                    \
-    } catch (std::exception const &e) {                                        \
-      std::cerr << e.what()<< "\nException caught at file:" << __FILE__        \
-        << ", line:" << __LINE__ <<", func:"<<__func__<< std::endl;            \
-      return dpct::default_error;                                              \
-    }                                                                          \
-  }()
-
-// #define DEBUG_SYCL_MALLOC
-
-static int g_work_group_size = 0;
-// typedef sycl::half ggml_fp16_t;
-
-#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
-#define VER_4VEC   610          //todo for hardward optimize.
-#define VER_GEN9      700       //todo for hardward optimize.
-#define VER_GEN12 1000000       //todo for hardward optimize.
-#define VER_GEN13      (VER_GEN12 + 1030)   //todo for hardward optimize.
-
-#define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares
-
-
-//define for XMX in Intel GPU
-//TODO: currently, it's not used for XMX really.
-#define SYCL_USE_XMX
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define XMX_MAX_BATCH_SIZE 32
-
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-
-static void crash(){
-    int *ptr = NULL;
-    *ptr = 0;
-}
-
-static void ggml_sycl_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
-    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
-    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
-    GGML_ASSERT(!"SYCL error");
-}
-
-#define SYCL_CHECK(err) do {                                                   \
-    auto err_ = (err); if (err_ != 0) ggml_sycl_error(                         \
-        #err, __func__, __FILE__, __LINE__,                                    \
-        "Meet error in this line code!");   \
-} while (0)
-
-#if DPCT_COMPAT_RT_VERSION >= 11100
-#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
-#else
-#define GGML_SYCL_ASSUME(x)
-#endif // DPCT_COMPAT_RT_VERSION >= 11100
-
-#ifdef GGML_SYCL_F16
-typedef sycl::half dfloat; // dequantize float
-typedef sycl::half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef sycl::float2 dfloat2;
-#endif //GGML_SYCL_F16
-
-bool   ggml_sycl_loaded(void);
-void * ggml_sycl_host_malloc(size_t size);
-void   ggml_sycl_host_free(void * ptr);
-bool   ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void   ggml_sycl_set_tensor_split(const float * tensor_split);
-void   ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor);
-void   ggml_sycl_free_data(struct ggml_tensor * tensor);
-void   ggml_sycl_assign_buffers(struct ggml_tensor * tensor);
-void   ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-void   ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-void   ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor);
-void   ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
-void   ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
-void   ggml_sycl_set_main_device(int main_device);
-void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
-void   ggml_sycl_set_scratch_size(size_t scratch_size);
-void   ggml_sycl_free_scratch(void);
-void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
-bool   ggml_backend_is_sycl(ggml_backend_t backend);
-int    ggml_backend_sycl_get_device(ggml_backend_t backend);
-int    get_main_device();
-void   print_ggml_tensor(const char*name, struct ggml_tensor *src);
-void   log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
-
-static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8,
-                                              const int &i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8,
-                                                     const int &i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
-                                                      const int &i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-template <typename T>
-using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
-                             int k, dpct::queue_ptr stream);
-typedef to_t_sycl_t<float> to_fp32_sycl_t;
-typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
-
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-typedef void (*ggml_sycl_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_sycl_op_mul_mat_t)(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream);
-typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const dpct::queue_ptr &main_stream);
-
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct dpct_type_471834 {
-    sycl::half d;           // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct dpct_type_143705 {
-    sycl::half2 dm;         // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct dpct_type_673649 {
-    sycl::half d;           // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct dpct_type_135589 {
-    sycl::half2 dm;         // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct dpct_type_122878 {
-    sycl::half d;           // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct dpct_type_143721 {
-    sycl::half2 ds;         // ds.x = delta, ds.y = sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
-typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
-                                      int **x_qh, int **x_sc);
-typedef void (*load_tiles_sycl_t)(const void *__restrict__ vx,
-                                  int *__restrict__ x_ql,
-                                  sycl::half2 *__restrict__ x_dm,
-                                  int *__restrict__ x_qh,
-                                  int *__restrict__ x_sc, const int &i_offset,
-                                  const int &i_max, const int &k,
-                                  const int &blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_sycl_t)(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
-    const int &i, const int &j, const int &k);
-
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct dpct_type_619598 {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    sycl::half2 dm;          // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct dpct_type_138576 {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2]; // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    sycl::half d; // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half    dm[2];             // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct dpct_type_154943 {
-    sycl::half2 dm;            // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                  // super-block scale
-    int8_t scales[QK_K/16];  // block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct dpct_type_866817 {
-    sycl::half2 dm;               // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct dpct_type_107281 {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    sycl::half d;            // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define WARP_SIZE 32
-#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-
-#define SYCL_GELU_BLOCK_SIZE 256
-#define SYCL_SILU_BLOCK_SIZE 256
-#define SYCL_TANH_BLOCK_SIZE 256
-#define SYCL_RELU_BLOCK_SIZE 256
-#define SYCL_SQR_BLOCK_SIZE 256
-#define SYCL_CPY_BLOCK_SIZE 32
-#define SYCL_SCALE_BLOCK_SIZE 256
-#define SYCL_CLAMP_BLOCK_SIZE 256
-#define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
-#define SYCL_ALIBI_BLOCK_SIZE 32
-#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
-#define SYCL_QUANTIZE_BLOCK_SIZE 256
-#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
-#define SYCL_GET_ROWS_BLOCK_SIZE 256
-#define SYCL_UPSCALE_BLOCK_SIZE 256
-#define SYCL_CONCAT_BLOCK_SIZE 256
-#define SYCL_PAD_BLOCK_SIZE 256
-#define SYCL_ACC_BLOCK_SIZE 256
-#define SYCL_IM2COL_BLOCK_SIZE 256
-
-// dmmv = dequantize_mul_mat_vec
-#ifndef GGML_SYCL_DMMV_X
-#define GGML_SYCL_DMMV_X 32
-#endif
-#ifndef GGML_SYCL_MMV_Y
-#define GGML_SYCL_MMV_Y 1
-#endif
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE
-#define GGML_SYCL_PEER_MAX_BATCH_SIZE 128
-#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
-
-#define MUL_MAT_SRC1_COL_STRIDE 128
-
-#define MAX_STREAMS 8
-static dpct::queue_ptr g_syclStreams[GGML_SYCL_MAX_DEVICES][MAX_STREAMS] = {
-    {0}};
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    dpct::event_ptr
-        events[GGML_SYCL_MAX_DEVICES]
-              [MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
-
-inline dpct::err0 ggml_sycl_set_device(const int device) try {
-    int current_device;
-
-    SYCL_CHECK(CHECK_TRY_ERROR(
-        current_device = dpct::dev_mgr::instance().current_device_id()));
-
-    // GGML_SYCL_DEBUG("ggml_sycl_set_device device=%d, current_device=%d\n", device, current_device);
-    if (device == current_device) {
-        return 0;
-    }
-
-    return CHECK_TRY_ERROR(dpct::select_device(device));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  crash();
-  std::exit(1);
-}
-
-static int g_device_count = -1;
-static int g_all_sycl_device_count = -1;
-static int g_main_device = -1;
-static int g_main_device_index = -1;
-
-static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
-
-struct sycl_device_capabilities {
-    int     cc;                 // compute capability
-    bool    vmm;                // virtual memory support
-    size_t  vmm_granularity;    // granularity of virtual memory
-    int device_id;
-};
-
-static sycl_device_capabilities g_device_caps[GGML_SYCL_MAX_DEVICES] = { {0, false, 0, -1} };
-
-struct sycl_device_id2index {
-    int index;
-};
-
-static sycl_device_id2index g_sycl_device_id2index[GGML_SYCL_MAX_DEVICES] = { {-1} };
-
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 0; // disabled by default
-static size_t g_scratch_offset = 0;
-
-static dpct::queue_ptr g_sycl_handles[GGML_SYCL_MAX_DEVICES] = {nullptr};
-
-int get_main_device(){
-    return g_main_device;
-}
-
-[[noreturn]]
-static void bad_arch(const sycl::stream &stream_ct1) {
-    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
-                  "current GPU architecture.\n";
-    // __trap();
-    std::exit(1);
-
-    (void) bad_arch; // suppress unused function warning
-}
-
-void log_ggml_var_device(const char*name, float *src, size_t total_elements, bool src_on_device){
-    if(!g_ggml_sycl_debug) return;
-    if(!src){
-        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
-        return;
-    }
-    char filename[1024];
-    sprintf(filename, "%s.txt", name);
-    printf("GGML Tensor:%s save to %s\n", name, filename);
-
-    size_t total_size = total_elements*sizeof(float);
-    float *local_buf = NULL;
-    // printf("total_size %d2, src_on_device %d\n", total_size, src_on_device);
-    if(src_on_device) {
-        local_buf = (float *) ggml_sycl_host_malloc(total_size);
-        // printf("local buf %p size %d bytes\n", local_buf, total_size);
-        ggml_sycl_set_device(g_main_device);
-        dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-        main_stream->memcpy(local_buf, src, total_size);
-    }
-    else {
-        local_buf = (float *)src;
-        // printf("local buf from src-> data %p\n", local_buf);
-    }
-
-    std::ofstream logfile;
-    logfile.open(filename);
-    // printf("local buf element %d\n", total_elements);
-    for(size_t i=0; i<total_elements; i++){
-        if((i+1)%20 ==0) logfile <<std::endl;
-        else logfile << local_buf[i] <<" ";
-    }
-    logfile <<std::endl;
-    logfile.close();
-
-    if(src_on_device) ggml_sycl_host_free(local_buf);
-}
-
-//todo: debug for crash in some case
-void print_ggml_tensor(const char*name, struct ggml_tensor *src){
-    if(!g_ggml_sycl_debug) return;
-    if(!src){
-        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
-        return;
-    }
-
-    size_t total_elements = ggml_nelements(src);
-
-    const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    float *src_data =NULL;
-    if(src_on_device) {
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *)  src->extra;
-        src_data = (float*)src_extra->data_device[g_main_device_index];
-    }
-    else {
-        src_data = (float *)src->data;
-    }
-
-    log_ggml_var_device(name, src_data, total_elements, src_on_device);
-}
-
-static int log_file_name_idx=0;
-void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt) {
-    stop_cnt = 4;
-    if(log_file_name_idx>=stop_cnt) return;
-    char filename[1280];
-    sprintf(filename, "%s_%07d", name, log_file_name_idx);
-    log_file_name_idx++;
-    print_ggml_tensor(filename, src);
-    // print_ggml_tensor("ggml_sycl_rms_norm_src0", (ggml_tensor *)src0);
-    // print_ggml_tensor("ggml_sycl_rms_norm_src1", (ggml_tensor *)src1);
-    // int *ptr = NULL;
-    // *ptr = 0;
-}
-
-static __dpct_inline__ float warp_reduce_sum(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
-    }
-    return x;
-}
-
-static __dpct_inline__ sycl::float2
-warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
-                                                mask);
-        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
-                                                mask);
-    }
-    return a;
-}
-
-static __dpct_inline__ float warp_reduce_max(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
-                              item_ct1.get_sub_group(), x, mask));
-    }
-    return x;
-}
-
-static __dpct_inline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __dpct_inline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __dpct_inline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __dpct_inline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) /
-                   ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) %
-                   ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0;
-         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-static void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void gelu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f * xi *
-             (1.0f +
-              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
-}
-
-static void silu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
-}
-
-static void gelu_quick_f32(const float *x, float *dst, int k,
-                           const sycl::nd_item<3> &item_ct1) {
-    const float GELU_QUICK_COEF = -1.702f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
-}
-
-static void tanh_f32(const float *x, float *dst, int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::tanh((float)(x[i]));
-}
-
-static void relu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0);
-}
-
-static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
-             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
-}
-
-static void sqr_f32(const float * x, float * dst, const int k,
-                    const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x() += xi;
-        mean_var.y() += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        /*
-        DPCT1118:0: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var, item_ct1);
-    }
-
-    const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
-    const float inv_std = sycl::rsqrt(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
-                       const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(0) < ne02) { // src0
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = y[offset_src];
-    }
-}
-
-static void upscale_f32(const float  *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
-}
-
-static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
-                    const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
-        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
-                         item_ct1.get_group(0) * ne00 * ne01;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    int start = item_ct1.get_group(2) * group_size;
-    int end = start + group_size;
-
-    start += item_ct1.get_local_id(2);
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:1: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:2: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float variance = tmp / group_size;
-    float scale = sycl::rsqrt(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:3: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = sycl::rsqrt(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x() = vui & 0xF;
-    v.y() = vui >> 4;
-
-#ifdef GGML_SYCL_F16
-    // v = v - {8.0f, 8.0f};
-    // v = v * {d, d};
-    v.s0() = (v.s0() - 8.0f) * d;
-    v.s1() = (v.s1() - 8.0f) * d;
-
-#else
-    v.x() = (v.x() - 8.0f) * d;
-    v.y() = (v.y() - 8.0f) * d;
-#endif // GGML_SYCL_F16
-}
-
-static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = x[ib].dm[0];
-    const dfloat m = x[ib].dm[1];
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x() = vui & 0xF;
-    v.y() = vui >> 4;
-
-#ifdef GGML_SYCL_F16
-    // v = v * {d, d};
-    // v = v + {m, m};
-    v.s0() = (v.s0() * d) + m;
-    v.s1() = (v.s1() * d) + m;
-
-#else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
-#endif // GGML_SYCL_F16
-}
-
-static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
-
-#ifdef GGML_SYCL_F16
-    // v = v - {16.0f, 16.0f};
-    // v = v * {d, d};
-    v.s0() = (v.s0() - 16.0f) * d;
-    v.s1() = (v.s1() - 16.0f) * d;
-
-#else
-    v.x() = (v.x() - 16.0f) * d;
-    v.y() = (v.y() - 16.0f) * d;
-#endif // GGML_SYCL_F16
-}
-
-static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = x[ib].dm[0];
-    const dfloat m = x[ib].dm[1];
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
-
-#ifdef GGML_SYCL_F16
-    // v = v * {d, d};
-    // v = v + {m, m};
-    v.s0() = (v.s0() * d) + m;
-    v.s1() = (v.s1() * d) + m;
-#else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
-#endif // GGML_SYCL_F16
-}
-
-static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x() = x[ib].qs[iqs + 0];
-    v.y() = x[ib].qs[iqs + 1];
-
-#ifdef GGML_SYCL_F16
-    // v = v * {d, d};
-    v.s0() *= d;
-    v.s1() *= d;
-#else
-    v.x() *= d;
-    v.y() *= d;
-#endif // GGML_SYCL_F16
-}
-
-//================================== k-quants
-
-template<typename dst_t>
-static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_group(2);
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    dst_t * y = yy + i*QK_K + 128*n;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
-    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
-    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
-}
-
-template<typename dst_t>
-static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_group(2);
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-#if QK_K == 256
-    const int r = item_ct1.get_local_id(2) / 4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    float d_all = x[i].d;
-    float dl = d_all * (us - 32);
-
-    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = threadIdx.x;
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
-}
-
-#if QK_K == 256
-static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-template<typename dst_t>
-static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-
-#if QK_K == 256
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-
-    const float dall = x[i].dm[0];
-    const float dmin = x[i].dm[1];
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l +32] = d2 * (q[l] >>  4) - m2;
-    }
-#else
-    const int tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-
-#if QK_K == 256
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = item_ct1.get_local_id(2);
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const float dall = x[i].dm[0];
-    const float dmin = x[i].dm[1];
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
-    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-#if QK_K == 256
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
-}
-
-/*
-DPCT1110:4: The total declared local variable size in device function
-dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = x[i].dm[0];
-        const float dmin = x[i].dm[1];
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:5: The total declared local variable size in device function
-dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:6: The total declared local variable size in device function
-dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = x[i].dm[0];
-        const float dmin = x[i].dm[1];
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
-            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
-                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
-               dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:7: The total declared local variable size in device function
-dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
-    const int ix = item_ct1.get_local_id(2) % 2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = x[i].dm[0];
-        const float dmin = x[i].dm[1];
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x() +=
-                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
-                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
-            sum.y() +=
-                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
-                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
-            sum.z() +=
-                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
-                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
-            sum.w() +=
-                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
-                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
-                       sum.w() * sc[5]) -
-               dmin * smin;
-    }
-
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-#if QK_K == 256
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const sycl::half *x = (const sycl::half *)vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x() = x[ib + iqs + 0];
-    v.y() = x[ib + iqs + 1];
-}
-
-static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const float * x = (const float *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x() = x[ib + iqs + 0];
-    v.y() = x[ib + iqs + 1];
-}
-
-static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2);
-
-    if (ix >= kx_padded) {
-        return;
-    }
-
-    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                   item_ct1.get_local_id(1);
-
-    const int i_padded = iy*kx_padded + ix;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = sycl::fabs((float)xi);
-    float sum = xi;
-
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
-                                    item_ct1.get_sub_group(), amax, mask));
-        sum +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
-    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
-}
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12,
-            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
-
-    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                     item_ct1.get_local_id(2)) *
-                    2;
-    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) /
-                    ne12;
-    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) %
-                    ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0] = v.x();
-    dst_row[iybs + iqs + y_offset] = v.y();
-}
-
-template<typename src0_t, typename dst_t>
-static void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12,
-            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
-
-    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                    item_ct1.get_local_id(2);
-    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) /
-                    ne12;
-    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) %
-                    ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
-                             const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  2 * item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0] = v.x();
-    y[iybs + iqs + y_offset] = v.y();
-}
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d4,
-                                                    const sycl::half2 &ds8) {
-    int sumi = 0;
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
-        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
-    }
-
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
-                                                    const sycl::half2 &dm4,
-                                                    const sycl::half2 &ds8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
-        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
-    }
-
-#ifdef GGML_SYCL_F16
-    const sycl::float2 tmp =
-        (dm4 * ds8).convert<float, sycl::rounding_mode::automatic>();
-    const float d4d8 = tmp.x();
-    const float m4s8 = tmp.y();
-#else
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-    const float d4d8 = dm4f.x() * ds8f.x();
-    const float m4s8 = dm4f.y() * ds8f.y();
-#endif // GGML_SYCL_F16
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float
-vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const float &d5, const sycl::half2 &ds8) {
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = dpct::dp4a(vi0, u[2 * i + 0],
-                          sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = dpct::dp4a(vi1, u[2 * i + 1],
-                          sumi); // SIMD dot product of quantized values
-    }
-
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
-}
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float
-vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = dpct::dp4a(vi0, u[2 * i + 0],
-                          sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = dpct::dp4a(vi1, u[2 * i + 1],
-                          sumi); // SIMD dot product of quantized values
-    }
-
-#ifdef GGML_SYCL_F16
-     const sycl::float2 tmp =
-        (dm5 * ds8).convert<float, sycl::rounding_mode::automatic>();
-    const float d5d8 = tmp.x();
-    const float m5s8 = tmp.y();
-
-
-#else
-    const sycl::float2 dm5f =
-        dm5.convert<float, sycl::rounding_mode::automatic>();
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-    const float d5d8 = dm5f.x() * ds8f.x();
-    const float m5s8 = dm5f.y() * ds8f.y();
-#endif // GGML_SYCL_F16
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d8_0,
-                                                    const float &d8_1) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(v[i], u[i], sumi);
-    }
-
-    return d8_0*d8_1 * sumi;
-}
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
-                                                    const sycl::half2 &dm8,
-                                                    const sycl::half2 &ds8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(v[i], u[i], sumi);
-    }
-
-#ifdef GGML_SYCL_F16
-    const sycl::float2 tmp =
-        (dm8 * ds8).convert<float, sycl::rounding_mode::automatic>();
-    const float d8d8 = tmp.x();
-    const float m8s8 = tmp.y();
-#else
-    const sycl::float2 dm8f =
-        dm8.convert<float, sycl::rounding_mode::automatic>();
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-    const float d8d8 = dm8f.x() * ds8f.x();
-    const float m8s8 = dm8f.y() * ds8f.y();
-#endif // GGML_SYCL_F16
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
-    const sycl::half2 &dm2, const float *__restrict__ d8) {
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d +=
-            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] *
-                  dpct::dp4a(
-                      m, u[i],
-                      0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const sycl::float2 dm2f =
-        dm2.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const uint8_t *__restrict__ scales,
-                           const sycl::half2 &dm2, const float &d8) {
-
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m = dpct::dp4a(m, u[i],
-                                sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const sycl::float2 dm2f =
-        dm2.convert<float, sycl::rounding_mode::automatic>();
-
-    return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m);
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int &vl, const int &vh, const int *__restrict__ u,
-    const uint8_t *__restrict__ scales, const int &scale_offset,
-    const float &d3, const float *__restrict__ d8) {
-
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi =
-            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
-
-        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const int8_t *__restrict__ scales, const float &d3,
-                           const float &d8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const float *__restrict__ d8) {
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 =
-            dpct::dp4a(v1i, u[2 * i + 1],
-                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
-        const int dot2 =
-            dpct::dp4a(0x01010101, u[2 * i + 1],
-                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-}
-
-// contiguous u/y values
-static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
-                                u[i * QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const sycl::float2 ds8f =
-            ds8[i].convert<float, sycl::rounding_mode::automatic>();
-
-        sumf_d += ds8f.x() * (sc[i] * sumi_d);
-        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int *__restrict__ vl, const int *__restrict__ vh,
-    const int *__restrict__ u, const uint8_t *__restrict__ sc,
-    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
-    const float *__restrict__ d8) {
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 =
-            dpct::dp4a(v0i, u[2 * i + 0],
-                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
-        const int dot2 =
-            dpct::dp4a(0x01010101, u[2 * i + 0],
-                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-
-    }
-
-    const sycl::float2 dm5f =
-        dm5.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
-}
-
-// contiguous u/y values
-static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
-                                sumi_d); // SIMD dot product
-        }
-
-        const sycl::float2 ds8f =
-            ds8[i].convert<float, sycl::rounding_mode::automatic>();
-
-        sumf_d += ds8f.x() * (sc[i] * sumi_d);
-        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
-                            const int *__restrict__ u,
-                            const int8_t *__restrict__ scales, const float &d,
-                            const float *__restrict__ d8) {
-
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = dpct::vectorized_binary<sycl::char4>(
-            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const int8_t *__restrict__ sc, const float &d6,
-                           const float *__restrict__ d8) {
-
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
-                                    sumi_d.x()); // SIMD dot product
-            sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
-                                    sumi_d.x()); // SIMD dot product
-
-            sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
-                                    sumi_d.y()); // SIMD dot product
-            sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
-                                    sumi_d.y()); // SIMD dot product
-        }
-
-        sumf_d += d8[i0 / 4] *
-                  (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y());
-    }
-
-    return d6 * sumf_d;
-}
-
-static __dpct_inline__ float
-vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs_q4_0, float *tile_x_d_q4_0) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs_q4_0;
-    *x_dm = (sycl::half2 *)tile_x_d_q4_0;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_0;
-    const int kqsx = k % QI4_0;
-
-    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
-        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (const float *) x_dm;
-
-    int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs_q4_1;
-    *x_dm = tile_x_dm_q4_1;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_1;
-    const int kqsx = k % QI4_1;
-
-    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
-    int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q5_0, float *tile_x_d_q5_0) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_ql_q5_0;
-    *x_dm = (sycl::half2 *)tile_x_d_q5_0;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_0;
-    const int kqsx = k % QI5_0;
-
-    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
-        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
-        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
-        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
-        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
-        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
-        qs0 = dpct::vectorized_binary<sycl::char4>(
-            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
-        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
-        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
-        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
-        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
-        qs1 = dpct::vectorized_binary<sycl::char4>(
-            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_ql_q5_1;
-    *x_dm = tile_x_dm_q5_1;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset < nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_1;
-    const int kqsx = k % QI5_1;
-
-    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
-        int qs0 = (ql >>  0) & 0x0F0F0F0F;
-        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
-        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
-        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4) & 0x0F0F0F0F;
-        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-
-    int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
-                                                      bq8_1->ds[0]);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs_q8_0, float *tile_x_d_q8_0) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs_q8_0;
-    *x_dm = (sycl::half2 *)tile_x_d_q8_0;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI8_0;
-    const int kqsx = k % QI8_0;
-    float * x_dmf = (float *) x_dm;
-
-    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
-        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[0];
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K,
-                    int *tile_x_sc_q2_K) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql_q2_K;
-    *x_dm = tile_x_dm_q2_K;
-    *x_sc = tile_x_sc_q2_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
-    }
-}
-
-static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const int kbx = k / QI2_K;
-    const int ky  = (k % QI2_K) * QR2_K;
-    const float * y_df = (const float *) y_ds;
-
-    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
-    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
-    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
-    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
-        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-    }
-
-    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-
-    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = bq3_K->d;
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[0];
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K,
-                    int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) {
-
-    *x_ql = tile_x_ql_q3_K;
-    *x_dm = tile_x_dm_q3_K;
-    *x_qh = tile_x_qh_q3_K;
-    *x_sc = tile_x_sc_q3_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI3_K;
-    const int kqsx = k % QI3_K;
-
-    const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-
-        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-
-        const int ksc = k % (QI3_K/4);
-
-        const int ksc_low = ksc % (QI3_K/8);
-        const int shift_low = 4 * (ksc / (QI3_K/8));
-        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
-        const int ksc_high = QI3_K/8;
-        const int shift_high = 2 * ksc;
-        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
-        const int sc = dpct::vectorized_binary<sycl::char4>(
-            sc_low | sc_high, 0x20202020, dpct::sub_sat());
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-
-    const int kbx  = k / QI3_K;
-    const int ky  = (k % QI3_K) * QR3_K;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-
-    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
-        const int shift = 2 * ((ky % 32) / 8);
-        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
-        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
-        const int vlh = (vh << 2) & 0x04040404;
-
-        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
-    }
-
-    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[0];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K,
-                    int *tile_x_sc_q4_K) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql_q4_K;
-    *x_dm = tile_x_dm_q4_K;
-    *x_sc = tile_x_sc_q4_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI4_K; // == k if QK_K == 256
-
-    const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
-    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[0];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K,
-                    int *tile_x_sc_q5_K) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql_q5_K;
-    *x_dm = tile_x_dm_q5_K;
-    *x_sc = tile_x_sc_q5_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI5_K; // == k if QK_K == 256
-
-    const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR5_K*kqsx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
-        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
-    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
-    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_SYCL_ASSUME(i_offset >= 0);
-    GGML_SYCL_ASSUME(i_offset <  nwarps);
-    GGML_SYCL_ASSUME(k >= 0);
-    GGML_SYCL_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR6_K*kqsx;
-
-        const int ql = get_int_from_uint8(bxi->ql, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-
-        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
-        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
-            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
-                                                 dpct::sub_sat());
-        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
-            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
-                                                 dpct::sub_sat());
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-    }
-}
-
-static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-
-    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
-    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
-          int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
-          vec_dot_q_mul_mat_sycl_t vec_dot>
-/*
-DPCT1110:8: The total declared local variable size in device function mul_mat_q
-exceeds 128 bytes and may cause high register pressure. Consult with your
-hardware vendor to find the total register size available and adjust the code,
-or use smaller sub-group size to avoid high register pressure.
-*/
-static __dpct_inline__ void
-mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
-          float *__restrict__ dst, const int ncols_x, const int nrows_x,
-          const int ncols_y, const int nrows_y, const int nrows_dst,
-          int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
-          int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
-          sycl::half2 *tile_y_ds) {
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-    const int & ncols_dst = ncols_y;
-
-    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
-                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
-                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
-                   blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = dpct::min(
-                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
-                    ncols_y - 1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
-                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
-                                    kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(
-                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids =
-                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
-                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
-                    mmq_x;
-                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
-                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const sycl::half2 *dsi_src =
-                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
-                       ir * (WARP_SIZE / QI8_1) + kby]
-                         .ds;
-                sycl::half2 *dsi_dst =
-                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src)[0];
-                }
-            }
-
-            /*
-            DPCT1118:9: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
-                            item_ct1.get_local_id(1) + j, k);
-                    }
-                }
-            }
-
-            /*
-            DPCT1118:10: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
-
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
-
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-        }
-    }
-}
-
-#define  MMQ_X_Q4_0_RDNA2  64
-#define  MMQ_Y_Q4_0_RDNA2  128
-#define NWARPS_Q4_0_RDNA2  8
-#define  MMQ_X_Q4_0_RDNA1  64
-#define  MMQ_Y_Q4_0_RDNA1  64
-#define NWARPS_Q4_0_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q4_0_AMPERE 4
-#define  MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
-#define  MMQ_X_Q4_0_AMPERE 64
-#define  MMQ_Y_Q4_0_AMPERE 128
-#define NWARPS_Q4_0_AMPERE 4
-#endif
-#define  MMQ_X_Q4_0_PASCAL 64
-#define  MMQ_Y_Q4_0_PASCAL 64
-#define NWARPS_Q4_0_PASCAL 8
-
-template <bool need_check> static void
-    mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-
-    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-    const int nwarps = NWARPS_Q4_0_AMPERE;
-    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_qs_q4_0, tile_x_d_q4_0);
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-              load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ,
-              vec_dot_q4_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q4_1_RDNA2  64
-#define  MMQ_Y_Q4_1_RDNA2  128
-#define NWARPS_Q4_1_RDNA2  8
-#define  MMQ_X_Q4_1_RDNA1  64
-#define  MMQ_Y_Q4_1_RDNA1  64
-#define NWARPS_Q4_1_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q4_1_AMPERE 4
-#define  MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
-#define  MMQ_X_Q4_1_AMPERE 64
-#define  MMQ_Y_Q4_1_AMPERE 128
-#define NWARPS_Q4_1_AMPERE 4
-#endif
-#define  MMQ_X_Q4_1_PASCAL 64
-#define  MMQ_Y_Q4_1_PASCAL 64
-#define NWARPS_Q4_1_PASCAL 8
-
-template <bool need_check> static void
-    mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,
-    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-    const int nwarps = NWARPS_Q4_1_AMPERE;
-    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_qs_q4_1, tile_x_dm_q4_1);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-              load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ,
-              vec_dot_q4_1_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q5_0_RDNA2  64
-#define  MMQ_Y_Q5_0_RDNA2  128
-#define NWARPS_Q5_0_RDNA2  8
-#define  MMQ_X_Q5_0_RDNA1  64
-#define  MMQ_Y_Q5_0_RDNA1  64
-#define NWARPS_Q5_0_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q5_0_AMPERE 4
-#define  MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
-#define  MMQ_X_Q5_0_AMPERE 128
-#define  MMQ_Y_Q5_0_AMPERE 64
-#define NWARPS_Q5_0_AMPERE 4
-#endif
-#define  MMQ_X_Q5_0_PASCAL 64
-#define  MMQ_Y_Q5_0_PASCAL 64
-#define NWARPS_Q5_0_PASCAL 8
-
-template <bool need_check> static void
-    mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-    const int nwarps = NWARPS_Q5_0_AMPERE;
-    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q5_0, tile_x_d_q5_0);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-              load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ,
-              vec_dot_q5_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q5_1_RDNA2  64
-#define  MMQ_Y_Q5_1_RDNA2  128
-#define NWARPS_Q5_1_RDNA2  8
-#define  MMQ_X_Q5_1_RDNA1  64
-#define  MMQ_Y_Q5_1_RDNA1  64
-#define NWARPS_Q5_1_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q5_1_AMPERE 4
-#define  MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
-#define  MMQ_X_Q5_1_AMPERE 128
-#define  MMQ_Y_Q5_1_AMPERE 64
-#define NWARPS_Q5_1_AMPERE 4
-#endif
-#define  MMQ_X_Q5_1_PASCAL 64
-#define  MMQ_Y_Q5_1_PASCAL 64
-#define NWARPS_Q5_1_PASCAL 8
-
-template <bool need_check> static void
-mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,
-    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-    const int nwarps = NWARPS_Q5_1_AMPERE;
-    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q5_1, tile_x_dm_q5_1);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-              load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ,
-              vec_dot_q5_1_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q8_0_RDNA2  64
-#define  MMQ_Y_Q8_0_RDNA2  128
-#define NWARPS_Q8_0_RDNA2  8
-#define  MMQ_X_Q8_0_RDNA1  64
-#define  MMQ_Y_Q8_0_RDNA1  64
-#define NWARPS_Q8_0_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q8_0_AMPERE 4
-#define  MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
-#define  MMQ_X_Q8_0_AMPERE 128
-#define  MMQ_Y_Q8_0_AMPERE 64
-#define NWARPS_Q8_0_AMPERE 4
-#endif
-#define  MMQ_X_Q8_0_PASCAL 64
-#define  MMQ_Y_Q8_0_PASCAL 64
-#define NWARPS_Q8_0_PASCAL 8
-
-template <bool need_check> static void
-    mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-    const int nwarps = NWARPS_Q8_0_AMPERE;
-    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_qs_q8_0, tile_x_d_q8_0);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-              load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ,
-              vec_dot_q8_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q2_K_RDNA2  64
-#define  MMQ_Y_Q2_K_RDNA2  128
-#define NWARPS_Q2_K_RDNA2  8
-#define  MMQ_X_Q2_K_RDNA1  128
-#define  MMQ_Y_Q2_K_RDNA1  32
-#define NWARPS_Q2_K_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q2_K_AMPERE 4
-#define  MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
-#define  MMQ_X_Q2_K_AMPERE 64
-#define  MMQ_Y_Q2_K_AMPERE 128
-#define NWARPS_Q2_K_AMPERE 4
-#endif
-#define  MMQ_X_Q2_K_PASCAL 64
-#define  MMQ_Y_Q2_K_PASCAL 64
-#define NWARPS_Q2_K_PASCAL 8
-
-template <bool need_check> static void
-mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,
-    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,
-    sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-    const int nwarps = NWARPS_Q2_K_AMPERE;
-    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K);
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ,
-              vec_dot_q2_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q3_K_RDNA2  128
-#define  MMQ_Y_Q3_K_RDNA2  64
-#define NWARPS_Q3_K_RDNA2  8
-#define  MMQ_X_Q3_K_RDNA1  32
-#define  MMQ_Y_Q3_K_RDNA1  128
-#define NWARPS_Q3_K_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q3_K_AMPERE 4
-#define  MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
-#define  MMQ_X_Q3_K_AMPERE 128
-#define  MMQ_Y_Q3_K_AMPERE 128
-#define NWARPS_Q3_K_AMPERE 4
-#endif
-#define  MMQ_X_Q3_K_PASCAL 64
-#define  MMQ_Y_Q3_K_PASCAL 64
-#define NWARPS_Q3_K_PASCAL 8
-
-template <bool need_check> static void
-mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,
-    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-    const int nwarps = NWARPS_Q3_K_AMPERE;
-    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K,
-                               tile_x_sc_q3_K);
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ,
-              vec_dot_q3_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q4_K_RDNA2  64
-#define  MMQ_Y_Q4_K_RDNA2  128
-#define NWARPS_Q4_K_RDNA2  8
-#define  MMQ_X_Q4_K_RDNA1  32
-#define  MMQ_Y_Q4_K_RDNA1  64
-#define NWARPS_Q4_K_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q4_K_AMPERE 4
-#define  MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
-#define  MMQ_X_Q4_K_AMPERE 64
-#define  MMQ_Y_Q4_K_AMPERE 128
-#define NWARPS_Q4_K_AMPERE 4
-#endif
-#define  MMQ_X_Q4_K_PASCAL 64
-#define  MMQ_Y_Q4_K_PASCAL 64
-#define NWARPS_Q4_K_PASCAL 8
-
-template <bool need_check> static void
-    mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,
-    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,
-    sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-    const int nwarps = NWARPS_Q4_K_AMPERE;
-    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K);
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ,
-              vec_dot_q4_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q5_K_RDNA2  64
-#define  MMQ_Y_Q5_K_RDNA2  128
-#define NWARPS_Q5_K_RDNA2  8
-#define  MMQ_X_Q5_K_RDNA1  32
-#define  MMQ_Y_Q5_K_RDNA1  64
-#define NWARPS_Q5_K_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q5_K_AMPERE 4
-#define  MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
-#define  MMQ_X_Q5_K_AMPERE 64
-#define  MMQ_Y_Q5_K_AMPERE 128
-#define NWARPS_Q5_K_AMPERE 4
-#endif
-#define  MMQ_X_Q5_K_PASCAL 64
-#define  MMQ_Y_Q5_K_PASCAL 64
-#define NWARPS_Q5_K_PASCAL 8
-
-template <bool need_check> static void
-mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,
-    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,
-    sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-    const int nwarps = NWARPS_Q5_K_AMPERE;
-    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K);
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ,
-              vec_dot_q5_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-#define  MMQ_X_Q6_K_RDNA2  64
-#define  MMQ_Y_Q6_K_RDNA2  128
-#define NWARPS_Q6_K_RDNA2  8
-#define  MMQ_X_Q6_K_RDNA1  32
-#define  MMQ_Y_Q6_K_RDNA1  64
-#define NWARPS_Q6_K_RDNA1  8
-#if defined(SYCL_USE_XMX)
-#define  MMQ_X_Q6_K_AMPERE 4
-#define  MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
-#define  MMQ_X_Q6_K_AMPERE 64
-#define  MMQ_Y_Q6_K_AMPERE 64
-#define NWARPS_Q6_K_AMPERE 4
-#endif
-#define  MMQ_X_Q6_K_PASCAL 64
-#define  MMQ_Y_Q6_K_PASCAL 64
-#define NWARPS_Q6_K_PASCAL 8
-
-template <bool need_check> static void
-    mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    // int   * tile_x_ql = nullptr;
-    // sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    // int   * tile_x_sc = nullptr;
-
-//sycl_todo: change according to hardware
-    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-    const int nwarps = NWARPS_Q6_K_AMPERE;
-    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql, tile_x_dm, tile_x_sc);
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ,
-              vec_dot_q6_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-}
-
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
-static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
-// partial sum for each thread
-    float tmp = 0.0f;
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row * blocks_per_row + i +
-                        item_ct1.get_local_id(2) / (qi / vdr); // x block index
-
-        const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) *
-                        (qk / QK8_1); // y block index that aligns with ibx
-
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
-                                   const sycl::nd_item<3> &item_ct1) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = item_ct1.get_local_id(2);
-
-    const int iter_stride = 2*GGML_SYCL_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_SYCL_F16
-    sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_SYCL_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_SYCL_F16
-            dfloat2 t1{y[iybs + iqs + j / qr + 0],
-                        y[iybs + iqs + j / qr + y_offset]};
-
-            tmp += v * t1;
-#else
-            tmp += v.x() * y[iybs + iqs + j / qr + 0];
-            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
-#endif // GGML_SYCL_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-#ifdef GGML_SYCL_F16
-        dst[row] = tmp.x() + tmp.y();
-#else
-        dst[row] = tmp;
-#endif // GGML_SYCL_F16
-    }
-}
-
-static void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
-    const sycl::nd_item<3> &item_ct1) {
-
-    const sycl::half *x = (const sycl::half *)vx;
-
-    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                      item_ct1.get_local_id(1);
-    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                        item_ct1.get_local_id(0);
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x;
-         col_x0 += item_ct1.get_local_range(2)) {
-        const int col_x = col_x0 + item_ct1.get_local_id(2);
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi =
-            sycl::vec<sycl::half, 1>(x[ix])
-                .convert<float, sycl::rounding_mode::automatic>()[0];
-
-        const int row_y = col_x;
-
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
-    const sycl::nd_item<3> &item_ct1) {
-
-    const sycl::half *x = (const sycl::half *)vx;
-
-    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                      item_ct1.get_local_id(1);
-    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                        item_ct1.get_local_id(0);
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y   = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x;
-         col_x0 += item_ct1.get_local_range(2)) {
-        const int col_x = col_x0 + item_ct1.get_local_id(2);
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi =
-            sycl::vec<sycl::half, 1>(x[ix])
-                .convert<float, sycl::rounding_mode::automatic>()[0];
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    sycl::half *dsti = (sycl::half *)cdsti;
-
-    *dsti = sycl::vec<float, 1>(*xi)
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-}
-
-static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    const sycl::half *xi = (const sycl::half *)cxi;
-    sycl::half *dsti = (sycl::half *)cdsti;
-
-    *dsti = *xi;
-}
-
-static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
-    const sycl::half *xi = (const sycl::half *)cxi;
-    float *dsti = (float *)cdsti;
-
-    *dsti = *xi;
-}
-
-static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
-    const int16_t *xi = (const int16_t *)cxi;
-    int16_t *dsti = (int16_t *)cdsti;
-
-    *dsti = *xi;
-}
-
-static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
-    const int32_t *xi = (const int32_t *)cxi;
-    int32_t *dsti = (int32_t *)cdsti;
-
-    *dsti = *xi;
-}
-
-template <cpy_kernel_t cpy_1>
-static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                        const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= ne) {
-        return;
-    }
-
-    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-    // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i03 = i/(ne00 * ne01 * ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-    const int i13 = i/(ne10 * ne11 * ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
-
-    cpy_1(cx + x_offset, cdst + dst_offset);
-}
-
-static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f; // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax = sycl::fmax(amax, sycl::fabs((float)v));
-    }
-
-    const float d = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j]*id;
-
-        dsti->qs[j] = sycl::round((float)x0);
-    }
-}
-
-static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < sycl::fabs((float)v)) {
-            amax = sycl::fabs((float)v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) vmin = v;
-        if (v > vmax) vmax = v;
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x() = d;
-    dsti->dm.y() = vmin;
-
-    for (int j = 0; j < QK4_1/2; ++j) {
-        const float x0 = (xi[0       + j] - vmin)*id;
-        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-template <cpy_kernel_t cpy_blck, int qk>
-static void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                      const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-                      const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                      const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
-    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2)) *
-                  qk;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int i03 = i/(ne00 * ne01 * ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
-
-    const int i13 = i/(ne10 * ne11 * ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-
-    cpy_blck(cx + x_offset, cdst + dst_offset);
-}
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
-    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
-    }
-    *cos_theta = sycl::cos(theta) * mscale;
-    *sin_theta = sycl::sin(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos>
-static void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    if (ib > 0) {
-        const int i = row*ncols + ib*n_dims + ic;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
-        return;
-    }
-
-    const int i  = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base =
-        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
-static void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-, const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
-    const float sin_theta = sycl::sin((float)theta);
-    const float cos_theta = sycl::cos((float)theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta =
-        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
-    const float sin_block_theta = sycl::sin((float)block_theta);
-    const float cos_block_theta = sycl::cos((float)block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1,
-                                 const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(1);
-    const int col = item_ct1.get_local_id(2);
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum, item_ct1);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-template<typename T>
-static inline void swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
-                              const sycl::nd_item<3> &item_ct1) {
-    // bitonic sort
-    int col = item_ct1.get_local_id(2);
-    int row = item_ct1.get_group(1);
-
-    if (col >= ncols) return;
-
-    const float * x_row = x + row * ncols;
-    int * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    /*
-    DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
-    performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            /*
-            DPCT1118:11: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-        }
-    }
-}
-
-static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
-                              const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-
-template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
-                         const int nrows_y, const float scale, const float max_bias, const float m0,
-                         const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
-    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-
-    const int tid = item_ct1.get_local_id(2);
-    const int rowx = item_ct1.get_group(2);
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
-
-    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-
-    float slope = 0.0f;
-
-    // ALiBi
-    if (max_bias > 0.0f) {
-        const uint32_t h = rowx/nrows_y; // head index
-
-        const float base = h < n_head_log2 ? m0 : m1;
-        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-        slope = sycl::pow(base, float(exp));
-    }
-
-    float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
-    float max_val = -INFINITY;
-
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            break;
-        }
-
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-
-        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
-
-        vals[col] = val;
-        max_val = sycl::max(max_val, val);
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val, item_ct1);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf[lane_id] = -INFINITY;
-        }
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-
-        if (lane_id == 0) {
-            buf[warp_id] = max_val;
-        }
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-
-        max_val = buf[lane_id];
-        max_val = warp_reduce_max(max_val, item_ct1);
-    }
-
-    float tmp = 0.f;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-                if (ncols_template == 0 && col >= ncols) {
-            break;
-        }
-
-        const float val = sycl::native::exp(vals[col] - max_val);
-        tmp += val;
-        vals[col] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf[lane_id] = 0.f;
-        }
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-
-        if (lane_id == 0) {
-            buf[warp_id] = tmp;
-        }
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-
-        tmp = buf[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float inv_sum = 1.f / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            return;
-        }
-
-        const int idst = rowx*ncols + col;
-        dst[idst] = vals[col] * inv_sum;
-    }
-}
-
-static void scale_f32(const float * x, float * dst, const float scale, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = scale * x[i];
-}
-
-static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-template <typename T>
-static void im2col_kernel(const float *x, T *dst, int offset_delta,
-                           int IW, int IH, int OW, int KW, int KH,
-                           int pelements, int CHW, int s0, int s1, int p0,
-                           int p1, int d0, int d1,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_id(2) +
-                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (i >= pelements) {
-        return;
-    }
-
-    const int ksize = OW * (KH > 1 ? KW : 1);
-    const int kx = i / ksize;
-    const int kd = kx * ksize;
-    const int ky = (i - kd) / OW;
-    const int ix = i % OW;
-
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
-
-    const int64_t offset_dst =
-        (item_ct1.get_group(1) * OW + ix) * CHW +
-        (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] =
-            sycl::vec<float, 1>(0.0f)
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    } else {
-        const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
-        dst[offset_dst] =
-            sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_sycl(const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst, const void *src0_dd,
-                          const int32_t *src1_dd, float *dst_dd,
-                          dpct::queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             k_get_rows<qk, qr, dq>(
-                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-                         });
-
-    (void) dst;
-}
-
-template <typename src0_t>
-static void get_rows_sycl_float(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const src0_t *src0_dd, const int32_t *src1_dd,
-                                float *dst_dd, dpct::queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-            });
-    }
-
-    (void) dst;
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_sycl {
-    template <typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor *src0,
-                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
-                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
-                    dpct::queue_ptr stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] = std::min<unsigned int>(
-                ne1, block_size / (unsigned int)block_dims[2]);
-            block_dims[0] = std::min(
-                std::min<unsigned int>(
-                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
-                                   (unsigned int)block_dims[1]),
-                64U);
-
-            sycl::range<3> block_nums(
-                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                (ne1 + block_dims[1] - 1) / block_dims[1],
-                (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(),
-                                                 {sycl::aspect::fp16});
-
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
-                                              sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(
-                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
-                                s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:16: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(),
-                                             {sycl::aspect::fp16});
-
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
-                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
-        }
-    }
-};
-
-static void acc_f32_sycl(const float *x, const float *y, float *dst,
-                         const int n_elements, const int ne10, const int ne11,
-                         const int ne12, const int nb1, const int nb2,
-                         const int offset, dpct::queue_ptr stream) {
-    int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
-                    item_ct1);
-        });
-}
-
-static void gelu_f32_sycl(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void silu_f32_sycl(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            silu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
-                                dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_quick_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void tanh_f32_sycl(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            tanh_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void relu_f32_sycl(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            relu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
-                                const float negative_slope,
-                                dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
-        });
-}
-
-static void sqr_f32_sycl(const float *x, float *dst, const int k,
-                         dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            sqr_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void norm_f32_sycl(const float *x, float *dst, const int ncols,
-                          const int nrows, const float eps,
-                          dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                            s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = g_work_group_size;
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                       s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
-static void group_norm_f32_sycl(const float *x, float *dst,
-                                const int num_groups, const int group_size,
-                                const int ne_elements, dpct::queue_ptr stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32(
-                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = g_work_group_size;
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32(x, dst, group_size, ne_elements,
-                                             eps_ct4, item_ct1,
-                                             s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
-static void concat_f32_sycl(const float *x, const float *y, float *dst,
-                            const int ne0, int ne1, int ne2, int ne02,
-                            dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            concat_f32(x, y, dst, ne0, ne02, item_ct1);
-        });
-}
-
-static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
-        });
-}
-
-static void pad_f32_sycl(const float *x, float *dst, const int ne00,
-                         const int ne01, const int ne02, const int ne0,
-                         const int ne1, const int ne2, dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
-        });
-}
-
-static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
-                              const int nrows, const float eps,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                                s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = g_work_group_size;
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                           s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
-static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
-                                   const int ky, const int kx_padded,
-                                   dpct::queue_ptr stream) {
-    const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
-    const sycl::range<3> num_blocks(1, ky, block_num_x);
-    const sycl::range<3> block_size(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
-            });
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_sycl(const void *__restrict__ vx,
-                                  dst_t *__restrict__ y, const int k,
-                                  dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(
-                sycl::range<3>(1, 1, num_blocks) *
-                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
-                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
-            });
-    }
-}
-
-template <typename dst_t>
-static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q4_K(vx, y, item_ct1);
-                             });
-    }
-}
-
-template <typename dst_t>
-static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_sycl;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_sycl;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_sycl;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_sycl;
-        case GGML_TYPE_F32:
-            return dequantize_block_sycl<1, 1, convert_f32>;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_sycl;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_sycl;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_sycl;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_sycl;
-        case GGML_TYPE_F16:
-            return dequantize_block_sycl<1, 1, convert_f16>;
-        default:
-            return nullptr;
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const sycl::range<3> block_dims(1, 1, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
-                                         float *dst, const int ncols,
-                                         const int nrows,
-                                         dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
-                                                          nrows, item_ct1);
-            });
-    }
-}
-
-template <int qk, int qi, typename block_q_t, int vdr,
-          vec_dot_q_sycl_t vec_dot_q_sycl>
-static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
-                                         float *dst, const int ncols,
-                                         const int nrows,
-                                         dpct::queue_ptr stream) {
-  GGML_ASSERT(ncols % QK4_0 == 0);
-  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-  const sycl::range<3> block_nums(1, 1, block_num_y);
-  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-  stream->parallel_for(
-      sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
-  ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-        mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
-            vx, vy, dst, ncols, nrows, item_ct1);
-      });
-}
-
-int get_device_index_by_id(int id){
-    int res = g_sycl_device_id2index[id].index;
-    // GGML_SYCL_DEBUG("get_device_index_by_id id=%d device_index=%d\n", id, res);
-    GGML_ASSERT(res>=0);
-    return res;
-}
-
-int get_device_id_by_index(int index){
-    int res = g_device_caps[index].device_id;
-    GGML_ASSERT(res>=0);
-    return res;
-}
-
-
-int get_current_device_index(){
-    return get_device_index_by_id(dpct::dev_mgr::instance().current_device_id());
-}
-
-static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-        nwarps = NWARPS_Q4_0_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-        nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q4_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-        nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q4_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-        nwarps = NWARPS_Q4_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
-                            tile_x_d_q4_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
-                            tile_x_d_q4_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-        nwarps = NWARPS_Q4_1_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-        nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q4_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-        nwarps = NWARPS_Q4_1_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q4_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-        nwarps = NWARPS_Q4_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-        nwarps = NWARPS_Q5_0_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-        nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q5_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-        nwarps = NWARPS_Q5_0_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q5_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-        nwarps = NWARPS_Q5_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
-                            tile_x_d_q5_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
-                            tile_x_d_q5_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-        nwarps = NWARPS_Q5_1_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-        nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q5_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-        nwarps = NWARPS_Q5_1_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q5_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-        nwarps = NWARPS_Q5_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-        nwarps = NWARPS_Q8_0_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-        nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q8_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-        nwarps = NWARPS_Q8_0_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q8_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-        nwarps = NWARPS_Q8_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q8_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
-                            tile_x_d_q8_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q8_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
-                            tile_x_d_q8_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-        nwarps = NWARPS_Q2_K_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-        nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q2_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-        nwarps = NWARPS_Q2_K_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q2_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-        nwarps = NWARPS_Q2_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q2_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q2_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-#if QK_K == 256
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-        nwarps = NWARPS_Q3_K_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-        nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q3_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-        nwarps = NWARPS_Q3_K_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q3_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-        nwarps = NWARPS_Q3_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q3_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
-                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q3_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
-                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-#endif
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-        nwarps = NWARPS_Q4_K_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-        nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q4_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-        nwarps = NWARPS_Q4_K_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q4_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-        nwarps = NWARPS_Q4_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-        nwarps = NWARPS_Q5_K_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-        nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q5_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-        nwarps = NWARPS_Q5_K_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q5_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-        nwarps = NWARPS_Q5_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= VER_GEN13) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-        nwarps = NWARPS_Q6_K_RDNA2;
-    } else if (compute_capability >= VER_GEN12) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-        nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= VER_GEN9) {
-        mmq_x  =  MMQ_X_Q6_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-        nwarps = NWARPS_Q6_K_AMPERE;
-    } else if (compute_capability >= VER_4VEC) {
-        mmq_x  =  MMQ_X_Q6_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-        nwarps = NWARPS_Q6_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q6_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q6_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
-                                           float *dst, const int ncols_x,
-                                           const int nrows_x,
-                                           const int nchannels_x,
-                                           const int nchannels_y,
-                                           dpct::queue_ptr stream) {
-
-    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
-                                     nchannels_y, item_ct1);
-            });
-    }
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_sycl(
-    const void *vx, const float *y, float *dst, const int ncols_x,
-    const int nrows_x, const int row_stride_x, const int nchannels_x,
-    const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
-
-    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
-                                       row_stride_x, channel_stride_x,
-                                       nchannels_y / nchannels_x, item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int ne02, const int nb00,
-                                  const int nb01, const int nb02,
-                                  const int nb03, const int ne10,
-                                  const int ne11, const int ne12,
-                                  const int nb10, const int nb11,
-                                  const int nb12, const int nb13,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int ne02, const int nb00,
-                                  const int nb01, const int nb02,
-                                  const int nb03, const int ne10,
-                                  const int ne11, const int ne12,
-                                  const int nb10, const int nb11,
-                                  const int nb12, const int nb13,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK8_0 == 0);
-    const int num_blocks = ne / QK8_0;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
-                                 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                 item_ct1);
-                         });
-}
-
-static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK4_0 == 0);
-    const int num_blocks = ne / QK4_0;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
-                                 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                 item_ct1);
-                         });
-}
-
-static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int ne02, const int nb00,
-                                   const int nb01, const int nb02,
-                                   const int nb03, const int ne10,
-                                   const int ne11, const int ne12,
-                                   const int nb10, const int nb11,
-                                   const int nb12, const int nb13,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK4_1 == 0);
-    const int num_blocks = ne / QK4_1;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
-                                 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                 item_ct1);
-                         });
-}
-
-static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int ne02, const int nb00,
-                                  const int nb01, const int nb02,
-                                  const int nb03, const int ne10,
-                                  const int ne11, const int ne12,
-                                  const int nb10, const int nb11,
-                                  const int nb12, const int nb13,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int ne02, const int nb00,
-                                  const int nb01, const int nb02,
-                                  const int nb03, const int ne10,
-                                  const int ne11, const int ne12,
-                                  const int nb10, const int nb11,
-                                  const int nb12, const int nb13,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-    {
-        // dpct::has_capability_or_fail(stream->get_device(),
-        //                              {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int ne02, const int nb00,
-                                  const int nb01, const int nb02,
-                                  const int nb03, const int ne10,
-                                  const int ne11, const int ne12,
-                                  const int nb10, const int nb11,
-                                  const int nb12, const int nb13,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
-    {
-        // dpct::has_capability_or_fail(stream->get_device(),
-        //                              {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
-                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
-                                           item_ct1);
-            });
-    }
-}
-
-static void scale_f32_sycl(const float *x, float *dst, const float scale,
-                           const int k, dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            scale_f32(x, dst, scale, k, item_ct1);
-        });
-}
-
-static void clamp_f32_sycl(const float *x, float *dst, const float min,
-                           const float max, const int k,
-                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            clamp_f32(x, dst, min, max, k, item_ct1);
-        });
-}
-
-template <typename T>
-static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
-                      const int32_t *pos, float freq_scale, int p_delta_rows,
-                      float freq_base, float ext_factor, float attn_factor,
-                      rope_corr_dims corr_dims, dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-    if (pos == nullptr) {
-        /*
-        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                               freq_base, ext_factor, attn_factor, corr_dims,
-                               item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                              freq_base, ext_factor, attn_factor, corr_dims,
-                              item_ct1);
-            });
-    }
-}
-
-template <typename T>
-static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
-                           const int32_t *pos, float freq_scale,
-                           int p_delta_rows, float freq_base, float ext_factor,
-                           float attn_factor, rope_corr_dims corr_dims,
-                           dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
-                                    p_delta_rows, ext_factor, attn_factor,
-                                    corr_dims, theta_scale, inv_ndims,
-                                    item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
-                                   p_delta_rows, ext_factor, attn_factor,
-                                   corr_dims, theta_scale, inv_ndims, item_ct1);
-            });
-    }
-}
-
-static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
-                              const int32_t *pos, float freq_scale,
-                              int p_delta_rows, float freq_base, int n_ctx,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4);
-    const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
-                                          p_delta_rows, freq_base, n_ctx,
-                                          item_ct1);
-                         });
-}
-
-static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
-                           const int nrows, const int k_rows,
-                           const int n_heads_log2_floor, const float m0,
-                           const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             alibi_f32(x, dst, ncols, k_rows,
-                                       n_heads_log2_floor, m0, m1, item_ct1);
-                         });
-}
-
-static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
-                              const int nrows, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    const sycl::range<3> block_nums(1, nrows, 1);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(32)]] {
-                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
-                             });
-}
-
-static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
-                                 const int nrows, ggml_sort_order order,
-                                 dpct::queue_ptr stream) {
-    // bitonic sort requires ncols to be power of 2
-    GGML_ASSERT((ncols & (ncols - 1)) == 0);
-
-    const sycl::range<3> block_dims(1, 1, ncols);
-    const sycl::range<3> block_nums(1, nrows, 1);
-    if (order == GGML_SORT_ORDER_ASC) {
-        /*
-        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
-            });
-    } else if (order == GGML_SORT_ORDER_DESC) {
-        /*
-        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
-            });
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static void diag_mask_inf_f32_sycl(const float *x, float *dst,
-                                   const int ncols_x, const int nrows_x,
-                                   const int rows_per_channel, const int n_past,
-                                   dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             diag_mask_inf_f32(x, dst, ncols_x,
-                                               rows_per_channel, n_past,
-                                               item_ct1);
-                         });
-}
-
-template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
-                                   const int nrows_y, const float scale, const float max_bias, const float m0,
-                                   const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
-                                   const size_t n_local_scratch, dpct::queue_ptr stream) {
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
-                                                                             nrows_y, scale, max_bias, m0,
-                                                                             m1, n_head_log2, item_ct1,
-                                                                             local_buf_acc.get_pointer());
-            });
-    });
-}
-
-static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
-                              float * dst, const int ncols_x, const int nrows_x,
-                              const int nrows_y, const float scale, const float max_bias,
-                              dpct::queue_ptr stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const sycl::range<3> block_dims(1, 1, nth);
-    const sycl::range<3> block_nums(1, 1, nrows_x);
-    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
-    static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-
-    const uint32_t n_head_kv   = nrows_x/nrows_y;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
-
-    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-    const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
-    if (n_local_scratch*sizeof(float) < local_mem_size) {
-        switch (ncols_x) {
-            case 32:
-                soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                     max_bias, m0, m1, n_head_log2, block_nums,
-                                                     block_dims, n_local_scratch, stream);
-                break;
-            case 64:
-                soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                     max_bias, m0, m1, n_head_log2, block_nums,
-                                                     block_dims, n_local_scratch, stream);
-                break;
-            case 128:
-                soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                       max_bias, m0, m1, n_head_log2, block_nums,
-                                                       block_dims, n_local_scratch, stream);
-                break;
-            case 256:
-                soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                       max_bias, m0, m1, n_head_log2, block_nums,
-                                                       block_dims, n_local_scratch, stream);
-                break;
-            case 512:
-                soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                       max_bias, m0, m1, n_head_log2, block_nums,
-                                                       block_dims, n_local_scratch, stream);
-                break;
-            case 1024:
-                soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                         max_bias, m0, m1, n_head_log2, block_nums,
-                                                         block_dims, n_local_scratch, stream);
-                break;
-            case 2048:
-                soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                         max_bias, m0, m1, n_head_log2, block_nums,
-                                                         block_dims, n_local_scratch, stream);
-                break;
-            case 4096:
-                soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                         max_bias, m0, m1, n_head_log2, block_nums,
-                                                         block_dims, n_local_scratch, stream);
-                break;
-            default:
-                soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                                   max_bias, m0, m1, n_head_log2, block_nums,
-                                                   block_dims, n_local_scratch, stream);
-                break;
-        }
-    } else {
-        soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                            max_bias, m0, m1, n_head_log2, block_nums,
-                                            block_dims, WARP_SIZE, stream);
-    }
-}
-
-template <typename T>
-static void im2col_sycl(const float *x, T *dst, int IW, int IH,
-                                int OW, int OH, int KW, int KH, int IC,
-                                int offset_delta, int s0, int s1, int p0,
-                                int p1, int d0, int d1,
-                                dpct::queue_ptr stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
-    sycl::range<3> block_nums(IC, OH, num_blocks);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums *
-                                  sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
-                               parallel_elements, (IC * KH * KW), s0, s1, p0,
-                               p1, d0, d1, item_ct1);
-            });
-    }
-}
-
-// buffer pool for sycl
-#define MAX_SYCL_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-static std::atomic_flag g_sycl_pool_lock = ATOMIC_FLAG_INIT;
-
-// #define DEBUG_SYCL_MALLOC
-struct sycl_buffer {
-    void * ptr = nullptr;
-    size_t size = 0;
-};
-
-static sycl_buffer g_sycl_buffer_pool[GGML_SYCL_MAX_DEVICES][MAX_SYCL_BUFFERS];
-static size_t g_sycl_pool_size[GGML_SYCL_MAX_DEVICES] = {0};
-
-static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try {
-    scoped_spin_lock lock(g_sycl_pool_lock);
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg index %d\n", id);
-#ifdef DEBUG_SYCL_MALLOC
-    int nnz = 0;
-    size_t max_size = 0;
-#endif
-    size_t best_diff = 1ull << 36;
-    int ibest = -1;
-    for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
-        sycl_buffer& b = g_sycl_buffer_pool[id][i];
-        if (b.ptr != nullptr) {
-#ifdef DEBUG_SYCL_MALLOC
-            ++nnz;
-            if (b.size > max_size) max_size = b.size;
-#endif
-            if (b.size >= size) {
-                size_t diff = b.size - size;
-                if (diff < best_diff) {
-                    best_diff = diff;
-                    ibest = i;
-                    if (!best_diff) {
-                        void * ptr = b.ptr;
-                        *actual_size = b.size;
-                        b.ptr = nullptr;
-                        b.size = 0;
-                        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 1 %p\n", ptr);
-                        return ptr;
-                    }
-                }
-            }
-        }
-    }
-    if (ibest >= 0) {
-        sycl_buffer& b = g_sycl_buffer_pool[id][ibest];
-        void * ptr = b.ptr;
-        *actual_size = b.size;
-        b.ptr = nullptr;
-        b.size = 0;
-        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 2 %p\n", ptr);
-        return ptr;
-    }
-    void * ptr;
-    size_t look_ahead_size = (size_t) (1.05 * size);
-    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-
-    const dpct::queue_ptr stream = g_syclStreams[id][0];
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
-                             look_ahead_size, *stream)));
-    *actual_size = look_ahead_size;
-    g_sycl_pool_size[id] += look_ahead_size;
-
-#ifdef DEBUG_SYCL_MALLOC
-    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
-#endif
-    // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return %p\n", ptr);
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_pool_free_leg(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_sycl_pool_lock);
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-
-    const dpct::queue_ptr stream = g_syclStreams[id][0];
-    for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
-        sycl_buffer& b = g_sycl_buffer_pool[id][i];
-        if (b.ptr == nullptr) {
-            b.ptr = ptr;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_SYCL_BUFFERS\n");
-    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *stream)));
-    g_sycl_pool_size[id] -= size;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-// pool with virtual memory
-/*
-DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
-*/
-// static std::vector<CUmemGenericAllocationHandle>
-//     g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES];
-static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0};
-static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0};
-
-static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try {
-    GGML_UNUSED(size);
-    GGML_UNUSED(actual_size);
-    return NULL;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_pool_free_vmm(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_sycl_pool_lock);
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-#ifdef DEBUG_SYCL_MALLOC
-    printf("sycl pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
-#endif
-
-    g_sycl_pool_used[id] -= size;
-
-    // all deallocations must be in reverse order of the allocations
-    GGML_ASSERT(ptr == (void *) (g_sycl_pool_addr[id] + g_sycl_pool_used[id]));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void *ggml_sycl_pool_malloc(size_t size, size_t *actual_size) try {
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    if (g_device_caps[id].vmm) {
-        return ggml_sycl_pool_malloc_vmm(size, actual_size);
-    } else {
-        return ggml_sycl_pool_malloc_leg(size, actual_size);
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_pool_free(void *ptr, size_t size) try {
-    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-    if (g_device_caps[id].vmm) {
-        ggml_sycl_pool_free_vmm(ptr, size);
-    } else {
-        ggml_sycl_pool_free_leg(ptr, size);
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-
-template<typename T>
-struct sycl_pool_alloc {
-    T * ptr = nullptr;
-    size_t actual_size = 0;
-
-    // size is in number of elements
-    T * alloc(size_t size) {
-        GGML_ASSERT(ptr == nullptr);
-        ptr = (T *) ggml_sycl_pool_malloc(size * sizeof(T), &this->actual_size);
-        // GGML_SYCL_DEBUG("alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
-        return ptr;
-    }
-
-    sycl_pool_alloc(size_t size) {
-        alloc(size);
-    }
-
-    ~sycl_pool_alloc() {
-        if (ptr != nullptr) {
-            ggml_sycl_pool_free(ptr, actual_size);
-        }
-    }
-
-    T * get() {
-        return ptr;
-    }
-
-    sycl_pool_alloc() = default;
-    sycl_pool_alloc(const sycl_pool_alloc &) = delete;
-    sycl_pool_alloc(sycl_pool_alloc &&) = delete;
-    sycl_pool_alloc& operator=(const sycl_pool_alloc &) = delete;
-    sycl_pool_alloc& operator=(sycl_pool_alloc &&) = delete;
-};
-
-static bool g_sycl_loaded = false;
-
-bool ggml_sycl_loaded(void) {
-    return g_sycl_loaded;
-}
-
-void ggml_backend_sycl_print_sycl_devices(){
-    int device_count = dpct::dev_mgr::instance().device_count();
-    fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    for (int id = 0; id < device_count; ++id) {
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(id))));
-        sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
-        fprintf(stderr, "  Device %d: %s,\tcompute capability %d.%d,\n\tmax compute_units %d,\tmax work group size %d,\tmax sub group size %d,\tglobal mem size %lu\n", id,
-                prop.get_name(), prop.get_major_version(),
-                prop.get_minor_version(),
-                prop.get_max_compute_units(),
-                prop.get_max_work_group_size(),
-                prop.get_max_sub_group_size(),
-                prop.get_global_mem_size()
-                );
-    }
-    // fprintf(stderr, "\n");
-}
-
-int get_sycl_env(const char* env_name, int default_val){
-    char * user_device_string = getenv(env_name);
-    int user_number = default_val;
-
-    unsigned n;
-    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1) {
-            user_number = (int)n;
-        } else {
-            user_number=default_val;
-        }
-    return user_number;
-}
-
-int get_work_group_size(int user_device_id){
-    dpct::device_info prop;
-    dpct::get_device_info(
-        prop,
-        dpct::dev_mgr::instance().get_device(user_device_id));
-    return prop.get_max_work_group_size();
-}
-
-void ggml_init_sycl() try {
-    static bool initialized = false;
-
-    if (!initialized) {
-        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-
-        printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug);
-
-        int user_device_id = get_sycl_env("GGML_SYCL_DEVICE", 0);
-
-        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
-                                 dpct::dev_mgr::instance().device_count()) !=
-            0) {
-            initialized = true;
-            g_sycl_loaded = false;
-            return;
-        }
-        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        int64_t total_vram = 0;
-
-#if defined(GGML_SYCL_F16)
-        fprintf(stderr, "%s: GGML_SYCL_F16:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_SYCL_F16:   no\n", __func__);
-#endif
-
-
-#if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-        ggml_backend_sycl_print_sycl_devices();
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_sycl_device_id2index[id].index = -1;
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-        }
-
-        int device_inx = -1;
-        for (int id = 0; id < g_all_sycl_device_count; ++id) {
-            if(id!=user_device_id) continue;
-
-            device_inx++;
-
-            g_device_caps[device_inx].vmm = 0;
-            g_device_caps[device_inx].device_id = id;
-            g_sycl_device_id2index[id].index = device_inx;
-
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(id))));
-
-            g_tensor_split[device_inx] = total_vram;
-            total_vram += prop.get_global_mem_size();
-
-            g_device_caps[device_inx].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-
-        }
-        device_inx = -1;
-        for (int id = 0; id < g_all_sycl_device_count; ++id) {
-            if(id!=user_device_id) continue;
-            device_inx++;
-            g_tensor_split[device_inx] /= total_vram;
-        }
-
-        device_inx = -1;
-        for (int id = 0; id < g_all_sycl_device_count; ++id) {
-            if(id!=user_device_id) continue;
-            device_inx++;
-            SYCL_CHECK(ggml_sycl_set_device(id));
-
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                /*
-                DPCT1025:88: The SYCL queue is created ignoring the flag and
-                priority options.
-                */
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[device_inx][is] =
-                        dpct::get_current_device().create_queue()));
-            }
-
-            const dpct::queue_ptr stream = g_syclStreams[device_inx][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[device_inx] =
-                                              stream));
-            /*
-            DPCT1027:89: The call to syclSetMathMode was replaced with 0
-            because this functionality is redundant in SYCL.
-            */
-            SYCL_CHECK(0);
-        }
-
-        // configure logging to stdout
-        // SYCL_CHECK(syclLoggerConfigure(1, 1, 0, nullptr));
-
-        //hardcode, force set to 1 device
-        g_device_count = 1;
-        ggml_sycl_set_main_device(user_device_id);
-        ggml_sycl_set_device(user_device_id);
-        g_work_group_size = get_work_group_size(user_device_id);
-        // fprintf(stderr, "Using Device %d\n", user_device_id);
-
-        // for (int id = 0; id < g_all_sycl_device_count; ++id) {
-        //     GGML_SYCL_DEBUG("id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
-        //     g_device_caps[id].device_id, id, g_sycl_device_id2index[id].index);
-        // }
-
-        initialized = true;
-        g_sycl_loaded = true;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-
-void ggml_sycl_set_tensor_split(const float * tensor_split) {
-    if (tensor_split == nullptr) {
-        return;
-    }
-    bool all_zero = true;
-    for (int i = 0; i < g_device_count; ++i) {
-        if (tensor_split[i] != 0.0f) {
-            all_zero = false;
-            break;
-        }
-    }
-    if (all_zero) {
-        return;
-    }
-    float split_sum = 0.0f;
-    for (int i = 0; i < g_device_count; ++i) {
-        g_tensor_split[i] = split_sum;
-        split_sum += tensor_split[i];
-    }
-    for (int i = 0; i < g_device_count; ++i) {
-        g_tensor_split[i] /= split_sum;
-    }
-}
-
-void *ggml_sycl_host_malloc(size_t size) try {
-    if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * ptr = nullptr;
-    //allow to use dpct::get_in_order_queue() for host malloc
-    dpct::err0 err = CHECK_TRY_ERROR(
-        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
-    /*
-    DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.
-    */
-    if (err != 0) {
-        // clear the error
-        /*
-        DPCT1026:83: The call to syclGetLastError was removed because this
-        functionality is redundant in SYCL.
-        */
-        /*
-        DPCT1001:81: The statement could not be removed.
-        */
-        fprintf(
-            stderr,
-            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            /*
-            DPCT1009:84: SYCL uses exceptions to report errors and does not use
-            the error codes. The original code was commented out and a warning
-            string was inserted. You need to rewrite this code.
-            */
-            size / 1024.0 / 1024.0,
-            "syclGetErrorString is not supported" /*syclGetErrorString(err)*/);
-        return nullptr;
-    }
-
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_sycl_host_free(void *ptr) try {
-    //allow to use dpct::get_in_order_queue() for host malloc
-    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
-                                          const struct ggml_tensor *src,
-                                          int64_t i3, int64_t i2,
-                                          int64_t i1_low, int64_t i1_high,
-                                          dpct::queue_ptr stream) try {
-
-    dpct::memcpy_direction kind;
-    char * src_ptr;
-    if (src->backend == GGML_BACKEND_TYPE_CPU) {
-        kind = dpct::host_to_device;
-        src_ptr = (char *) src->data;
-        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
-    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
-        kind = dpct::device_to_device;
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
-        int id;
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            id = get_current_device_index()));
-        // GGML_SYCL_DEBUG("current device index %d\n", id);
-        src_ptr = (char *) extra->data_device[id];
-    } else {
-        // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
-        GGML_ASSERT(false);
-    }
-    char * dst_ptr = (char *) dst;
-
-    GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
-    GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
-    const enum ggml_type type = src->type;
-    const int64_t ts = ggml_type_size(type);
-    const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
-
-    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        // GGML_SYCL_DEBUG("stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
-        // return CHECK_TRY_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
-        return CHECK_TRY_ERROR(dpct::async_dpct_memcpy(dst_ptr, x, i1_diff * nb1,
-                                    kind, *stream));
-
-    } else if (nb0 == ts) {
-        return CHECK_TRY_ERROR(
-            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
-                                    ts * ne0 / bs, i1_diff, kind, *stream));
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            dpct::err0 r = CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
-            /*
-            DPCT1001:85: The statement could not be removed.
-            */
-            /*
-            DPCT1000:86: Error handling if-stmt was detected but could not be
-            rewritten.
-            */
-            if (r != 0) return r;
-        }
-        return 0;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_op_get_rows(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_d, const float *src1_d,
-                                  float *dst_d, const dpct::queue_ptr &stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_sycl_float(src0, src1, dst, (const sycl::half *)src0_d,
-                                src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_sycl_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-template <class op>
-inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd,
-                                   const dpct::queue_ptr &main_stream) {
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
-             (sycl::half *)dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
-             main_stream);
-    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
-        op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
-             main_stream);
-    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
-        op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
-             main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_sycl_op_repeat(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_d, const float *src1_d,
-                                float *dst_d,
-                                const dpct::queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-
-    (void) src1;
-    (void) src1_d;
-}
-
-inline void ggml_sycl_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_sycl_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    (void) dst;
-}
-
-inline void ggml_sycl_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_sycl_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_sycl_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_gelu_quick(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_leaky_relu(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_group_norm(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_concat(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
-inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    const int scale_factor = dst->op_params[0];
-
-    upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_sycl(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_rms_norm(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_mul_mat_q(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) try {
-
-    const int64_t ne00 = src0->ne[0];
-
-    const int64_t ne10 = src1->ne[0];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int device_id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(device_id = dpct::dev_mgr::instance().current_device_id()));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static int64_t get_row_rounding(ggml_type type) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_device_caps[id].cc) {
-                min_compute_capability = g_device_caps[id].cc;
-            }
-            if (max_compute_capability < g_device_caps[id].cc) {
-                max_compute_capability = g_device_caps[id].cc;
-            }
-        }
-    }
-
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= VER_GEN9 ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-            return max_compute_capability >= VER_GEN9 ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-}
-
-inline void ggml_sycl_op_mul_mat_vec_q(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) {
-
-    GGML_ASSERT(ggml_nrows(src1) == 1);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    // TODO: support these quantization types
-    GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
-                  src0->type == GGML_TYPE_IQ2_XS ||
-                  src0->type == GGML_TYPE_IQ3_XXS ||
-                  src0->type == GGML_TYPE_IQ1_S));
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-          mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
-                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q4_1:
-          mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
-                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q5_0:
-          mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
-                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q5_1:
-          mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
-                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q8_0:
-          mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
-                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q2_K:
-          mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
-                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q3_K:
-          mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
-                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q4_K:
-          mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
-                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q5_K:
-          mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
-                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        case GGML_TYPE_Q6_K:
-          mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
-                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-          break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-inline void ggml_sycl_op_dequantize_mul_mat_vec(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int64_t row_diff = row_high - row_low;
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_SYCL_F16
-    sycl_pool_alloc<sycl::half> src1_dfloat_a;
-    sycl::half *src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        if (src1->type == GGML_TYPE_F16) {
-            src1_dfloat = (sycl::half *)src1->data + src1_padded_row_size;
-        } else {
-            src1_dfloat = src1_dfloat_a.alloc(ne00);
-            ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
-                                  ne00, ne00, ne01, ne02, nb00, nb01, nb02,
-                                  nb03, ne10, ne11, ne12, nb10, nb11, nb12,
-                                  nb13, stream);
-        }
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_SYCL_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-inline void ggml_sycl_op_mul_mat_sycl(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) try {
-
-    GGML_ASSERT(src0_dd_i  != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i   != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    int device_id = dpct::dev_mgr::instance().current_device_id();
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_index()));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
-
-#ifdef GGML_SYCL_F16
-    bool use_fp16 = true;  // TODO(Yu) SYCL capability check
-#else
-    bool use_fp16 = false;
-#endif
-    // if (compute_capability >= VER_GEN9 && (src0->type == GGML_TYPE_F16 ||
-    // ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff ==
-    // src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
-        dst->op_params[0] == GGML_PREC_DEFAULT) {
-
-        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
-        sycl_pool_alloc<sycl::half> src0_as_f16;
-        if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
-            GGML_ASSERT(to_fp16_sycl != nullptr);
-            size_t ne = row_diff*ne00;
-            src0_as_f16.alloc(ne);
-            to_fp16_sycl(src0_dd_i, src0_as_f16.get(), ne, stream);
-        }
-        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src0_dd_i
-                                         : src0_as_f16.get();
-
-        sycl_pool_alloc<sycl::half> src1_as_f16;
-        if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
-            GGML_ASSERT(to_fp16_sycl != nullptr);
-            size_t ne = src1_ncols*ne10;
-            src1_as_f16.alloc(ne);
-            to_fp16_sycl(src1_ddf_i, src1_as_f16.get(), ne, stream);
-        }
-        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
-                ? (const sycl::half *)src1->data + src1_padded_row_size
-                                         : src1_as_f16.get();
-        sycl_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
-
-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16 = 0.0f;
-
-        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[id] = stream));
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
-            *g_sycl_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
-            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
-            dst_f16.get(), dpct::library_data_t::real_half, ldc,
-            dpct::library_data_t::real_half)));
-
-        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
-        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    }
-    else {
-        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
-        sycl_pool_alloc<float> src0_ddq_as_f32;
-
-        if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
-            GGML_ASSERT(to_fp32_sycl != nullptr);
-            src0_ddq_as_f32.alloc(row_diff*ne00);
-            to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
-        }
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
-
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-
-        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[id] = stream));
-        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
-            *g_sycl_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            dpct::get_value(&alpha, *g_sycl_handles[id]), src0_ddf_i, ne00,
-            src1_ddf_i, ne10, dpct::get_value(&beta, *g_sycl_handles[id]),
-            dst_dd_i, ldc)));
-    }
-
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
-    }
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
-                           ne00, n_dims, nrows, pos, freq_scale, ne01,
-                           freq_base, ext_factor, attn_factor, corr_dims,
-                           main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
-                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                      attn_factor, corr_dims, main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-
-    if (dst->type == GGML_TYPE_F16) {
-        im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-    } else {
-        im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-    }
-
-    (void) src0;
-    (void) src0_dd;
-}
-
-inline void ggml_sycl_op_sum_rows(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_argsort(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_diag_mask_inf(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src0->ne[1];
-
-    float scale = 1.0f;
-    float max_bias = 0.0f;
-
-    memcpy(&scale, dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias, dst->op_params + 1, sizeof(float));
-
-    // positions tensor
-    float * src2_dd = nullptr;
-    sycl_pool_alloc<float> src2_f;
-
-    ggml_tensor * src2 = dst->src[2];
-    const bool use_src2 = src2 != nullptr;
-
-    if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
-    }
-
-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
-                      nrows_x, nrows_y, scale, max_bias, main_stream);
-}
-
-inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float scale;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
-    /*
-    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    SYCL_CHECK(0);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
-    /*
-    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    SYCL_CHECK(0);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_sycl_op_flatten(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_sycl_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device =             src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_TYPE_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    sycl_pool_alloc<float> src0_f;
-    sycl_pool_alloc<float> src1_f;
-    sycl_pool_alloc<float>  dst_f;
-
-    ggml_sycl_set_device(g_main_device);
-    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-    // GGML_SYCL_DEBUG("g_main_device_index=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
-        // g_main_device_index, main_stream, src0_on_device, src1_on_device, dst_on_device);
-
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device_index];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        // GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
-        SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
-        // printf("zjy dst_ddf=%p main_stream=%p g_main_device_index=%d\n", dst_ddf, main_stream, g_main_device_index);
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
-        // src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    // do the computation
-    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    /*
-    DPCT1010:89: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    SYCL_CHECK(0);
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
-    // print_ggml_tensor("tensor", dst);
-}
-catch (sycl::exception const &exc) {
-
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_set_peer_access(const int n_tokens) {
-    static bool peer_access_enabled = false;
-
-    const bool enable_peer_access = n_tokens <= GGML_SYCL_PEER_MAX_BATCH_SIZE;
-
-    if (peer_access_enabled == enable_peer_access) {
-        return;
-    }
-
-#ifdef NDEBUG
-    for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
-        // SYCL_CHECK(syclDeviceSynchronize());
-    }
-
-    for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
-        int device_id = g_device_caps[id].device_id;
-
-        for (int id_other = 0; id_other < g_device_count; ++id_other) {
-            int device_id_other = g_device_caps[id_other].device_id;
-            if (device_id == id_other) {
-                continue;
-            }
-            if (device_id != g_main_device && device_id_other != g_main_device) {
-                continue;
-            }
-
-            // int can_access_peer;
-            // SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other));
-            // if (can_access_peer) {
-            //     if (enable_peer_access) {
-            //         SYCL_CHECK(syclDeviceEnablePeerAccess(id_other, 0));
-            //     } else {
-            //         SYCL_CHECK(syclDeviceDisablePeerAccess(id_other));
-            //     }
-            // }
-        }
-    }
-#endif // NDEBUG
-
-    peer_access_enabled = enable_peer_access;
-}
-
-static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 ggml_sycl_op_mul_mat_t op,
-                                 const bool convert_src1_to_q8_1) try {
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-    const int64_t nrows1 = ggml_nrows(src1);
-
-    GGML_ASSERT(ne03 == ne13);
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
-
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-
-    const int64_t i02_divisor = ne12 / ne02;
-
-    const size_t src0_ts = ggml_type_size(src0->type);
-    const size_t src0_bs = ggml_blck_size(src0->type);
-    const size_t q8_1_ts = sizeof(block_q8_1);
-    const size_t q8_1_bs = QK8_1;
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src0_is_contiguous = ggml_is_contiguous(src0);
-    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-    int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    GGML_ASSERT(!(split && ne02 > 1));
-    GGML_ASSERT(!(split && ne03 > 1));
-    GGML_ASSERT(!(split && ne02 < ne12));
-
-    // dd = data device
-    char  *  src0_dd[GGML_SYCL_MAX_DEVICES] = {nullptr};
-    float * src1_ddf[GGML_SYCL_MAX_DEVICES] = {nullptr}; // float
-    char  * src1_ddq[GGML_SYCL_MAX_DEVICES] = {nullptr}; // q8_1
-    float *   dst_dd[GGML_SYCL_MAX_DEVICES] = {nullptr};
-
-    // as = actual size
-    size_t  src0_as[GGML_SYCL_MAX_DEVICES] = {0};
-    size_t src1_asf[GGML_SYCL_MAX_DEVICES] = {0};
-    size_t src1_asq[GGML_SYCL_MAX_DEVICES] = {0};
-    size_t   dst_as[GGML_SYCL_MAX_DEVICES] = {0};
-
-    int64_t  row_low[GGML_SYCL_MAX_DEVICES];
-    int64_t row_high[GGML_SYCL_MAX_DEVICES];
-
-    int used_devices = 0;
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        // by default, use all rows
-        row_low[id]  = 0;
-        row_high[id] = ne01;
-
-        // for multi GPU, get the row boundaries from tensor split
-        // and round to mul_mat_q tile sizes
-        if (split) {
-            const int64_t rounding = get_row_rounding(src0->type);
-
-            if (id != 0) {
-                row_low[id]  = ne01*g_tensor_split[id];
-                if (row_low[id] < ne01) {
-                    row_low[id] -= row_low[id] % rounding;
-                }
-            }
-
-            if (id != g_device_count - 1) {
-                row_high[id]  = ne01*g_tensor_split[id + 1];
-                if (row_high[id] < ne01) {
-                    row_high[id] -= row_high[id] % rounding;
-                }
-            }
-        }
-    }
-    for (int64_t id = 0; id < g_device_count; ++id) {
-
-        if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
-            continue;
-        }
-
-        used_devices++;
-
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
-        const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
-
-        ggml_sycl_set_device(get_device_id_by_index(id));
-        const dpct::queue_ptr stream = g_syclStreams[id][0];
-
-        if (src0_on_device && src0_is_contiguous) {
-            src0_dd[id] = (char *) src0_extra->data_device[id];
-        } else {
-            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_sycl_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
-        }
-
-        if (src1_on_device && src1_is_contiguous) {
-            src1_ddf[id] = (float *) src1_extra->data_device[id];
-        } else {
-            src1_ddf[id] = (float *) ggml_sycl_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
-        }
-
-        if (convert_src1_to_q8_1) {
-            src1_ddq[id] = (char *) ggml_sycl_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
-
-            if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_sycl(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                /*
-                DPCT1010:90: SYCL uses exceptions to report errors and does not
-                use the error codes. The call was replaced with 0. You need to
-                rewrite this code.
-                */
-                SYCL_CHECK(0);
-            }
-        }
-
-        if (dst_on_device) {
-            dst_dd[id] = (float *) dst_extra->data_device[id];
-        } else {
-            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_sycl_pool_malloc(size_dst_ddf, &dst_as[id]);
-        }
-    }
-
-    // if multiple devices are used they need to wait for the main device
-    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && used_devices > 1) {
-        SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-        /*
-        DPCT1024:91: The original code returned the error code that was further
-        consumed by the program logic. This original code was replaced with 0.
-        You may need to rewrite the program logic consuming the error code.
-        */
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            *src0_extra->events[g_main_device_index][0] =
-                g_syclStreams[g_main_device_index][0]->ext_oneapi_submit_barrier()));
-    }
-
-    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
-        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
-        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
-                continue;
-            }
-
-            const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
-            const bool  dst_on_device =  dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
-            const int64_t row_diff = row_high[id] - row_low[id];
-
-            ggml_sycl_set_device(get_device_id_by_index(id));
-            const dpct::queue_ptr stream = g_syclStreams[id][is];
-
-            // wait for main GPU data if necessary
-            if (split && (id != g_main_device_index || is != 0)) {
-                SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
-                    {*src0_extra->events[g_main_device_index][0]})));
-            }
-
-            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
-                const int64_t i03 = i0 / ne12;
-                const int64_t i02 = i0 % ne12;
-
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
-
-                // for split tensors the data begins at i0 == i0_offset_low
-                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
-                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
-                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-
-                // the main device memory buffer can be on VRAM scratch, with space for all partial results
-                // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
-                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
-                }
-
-                // copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
-                    if (id != g_main_device_index) {
-                        if (convert_src1_to_q8_1) {
-                            char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
-                            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
-                                src1_ddq_i, src1_ddq_i_source,
-                                src1_ncols * src1_padded_col_size * q8_1_ts /
-                                    q8_1_bs)));
-                        } else {
-                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device_index];
-                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
-                                src1_ddf_i, src1_ddf_i_source,
-                                src1_ncols * ne10 * sizeof(float))));
-                        }
-                    }
-                } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
-                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
-                    quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
-                    /*
-                    DPCT1010:92: SYCL uses exceptions to report errors and does
-                    not use the error codes. The call was replaced with 0. You
-                    need to rewrite this code.
-                    */
-                    SYCL_CHECK(0);
-                }
-
-                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
-                }
-                if (src1->type == GGML_TYPE_F16) {
-                    src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
-                }
-                // do the computation
-                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                   row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
-                /*
-                DPCT1010:93: SYCL uses exceptions to report errors and does not
-                use the error codes. The call was replaced with 0. You need to
-                rewrite this code.
-                */
-                SYCL_CHECK(0);
-
-                // copy dst to host or other device if necessary
-                if (!dst_on_device) {
-                    void * dst_off_device;
-                    dpct::memcpy_direction kind;
-                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                        dst_off_device = dst->data;
-                        kind = dpct::device_to_host;
-                    } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device_index];
-                        kind = dpct::device_to_device;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                    if (split) {
-                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
-                        // dst is NOT transposed.
-                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
-                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
-                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0 + row_low[id];
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, kind, *stream)));
-                    } else {
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0;
-                        SYCL_CHECK(CHECK_TRY_ERROR(
-                            stream->memcpy(dhf_dst_i, dst_dd_i,
-                                           src1_ncols * ne0 * sizeof(float))));
-                    }
-                }
-
-                // add event for the main device to wait on until other device is done
-                if (split && (id != g_main_device_index || is != 0)) {
-                    /*
-                    DPCT1024:94: The original code returned the error code that
-                    was further consumed by the program logic. This original
-                    code was replaced with 0. You may need to rewrite the
-                    program logic consuming the error code.
-                    */
-                    SYCL_CHECK(CHECK_TRY_ERROR(
-                        *src0_extra->events[id][is] =
-                            stream->ext_oneapi_submit_barrier()));
-                }
-            }
-        }
-    }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
-            continue;
-        }
-        SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
-
-        // free buffers again when done
-        if (dst_as[id] > 0) {
-            ggml_sycl_pool_free(dst_dd[id], dst_as[id]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_sycl_pool_free(src1_ddq[id], src1_asq[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_sycl_pool_free(src1_ddf[id], src1_asf[id]);
-        }
-        if (src0_as[id] > 0) {
-            ggml_sycl_pool_free(src0_dd[id], src0_as[id]);
-        }
-    }
-
-    // main device waits for all other devices to be finished
-    if (split && g_device_count > 1) {
-        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
-        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
-
-        SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if (row_low[id] == row_high[id]) {
-                continue;
-            }
-            for (int64_t is = 0; is < is_max; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[g_main_device_index][0]->ext_oneapi_submit_barrier(
-                        {*src0_extra->events[id][is]})));
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_repeat);
-}
-
-static void ggml_sycl_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_get_rows);
-}
-
-static void ggml_sycl_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_add);
-    // log_tensor_with_cnt("log_ggml_sycl_add_src0", (struct ggml_tensor *) src0, 6);
-    // log_tensor_with_cnt("log_ggml_sycl_add_src1", (struct ggml_tensor *)src1, 6);
-    // log_tensor_with_cnt("log_ggml_sycl_add_dst", dst, 6);
-}
-
-static void ggml_sycl_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_acc);
-}
-
-static void ggml_sycl_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_mul);
-    // log_tensor_with_cnt("log_ggml_sycl_mul_src0", (struct ggml_tensor *)src0, 6);
-    // log_tensor_with_cnt("log_ggml_sycl_mul_src1", (struct ggml_tensor *)src1, 6);
-    // log_tensor_with_cnt("log_ggml_sycl_mul_dst", dst, 6);
-
-}
-
-static void ggml_sycl_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_div);
-}
-
-static void ggml_sycl_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_gelu);
-}
-
-static void ggml_sycl_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_silu);
-}
-
-static void ggml_sycl_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_gelu_quick);
-}
-
-static void ggml_sycl_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_tanh);
-}
-
-static void ggml_sycl_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_relu);
-}
-
-static void ggml_sycl_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_leaky_relu);
-}
-
-static void ggml_sycl_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_sqr);
-}
-
-static void ggml_sycl_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_norm);
-}
-
-static void ggml_sycl_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_group_norm);
-}
-
-static void ggml_sycl_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_concat);
-}
-
-static void ggml_sycl_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_upscale);
-}
-
-static void ggml_sycl_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pad);
-}
-
-
-static void ggml_sycl_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rms_norm);
-    // log_tensor_with_cnt("log_ggml_sycl_rms_norm_src0", (struct ggml_tensor *)src0, 6);
-    // log_tensor_with_cnt("log_ggml_sycl_rms_norm_src1", (struct ggml_tensor *)src1, 6);
-    // log_tensor_with_cnt("log_ggml_sycl_rms_norm_dst", dst, 6);
-}
-
-bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_sycl_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-             dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
-static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst) try {
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device_index];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
-
-    ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
-                                     const ggml_tensor *src1,
-                                     ggml_tensor *dst) try {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device_index];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
-
-    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
-    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
-
-    ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
-                                   const sycl::half *src1_as_f16, char *dst,
-                                   const void **ptrs_src, void **ptrs_dst,
-                                   int64_t ne12, int64_t ne13, int64_t ne23,
-                                   size_t nb02, size_t nb03, size_t nb12,
-                                   size_t nb13, size_t nbd2, size_t nbd3,
-                                   int64_t r2, int64_t r3,
-                                   const sycl::nd_item<3> &item_ct1) {
-    int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                  item_ct1.get_local_id(2);
-    int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
-                  item_ct1.get_local_id(1);
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
-}
-
-static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
-                                             const ggml_tensor *src1,
-                                             ggml_tensor *dst) try {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t ne_dst  = ggml_nelements(dst);
-
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(g_sycl_handles[g_main_device_index] = main_stream));
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device_index];
-    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
-
-    // convert src1 to fp16
-    sycl_pool_alloc<sycl::half> src1_f16_alloc;
-    if (src1->type != GGML_TYPE_F16) {
-      const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
-      const int64_t ne_src1 = ggml_nelements(src1);
-      src1_f16_alloc.alloc(ne_src1);
-      GGML_ASSERT(to_fp16_sycl != nullptr);
-      to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
-    }
-    sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
-                                                       : src1_f16_alloc.get();
-
-    sycl_pool_alloc<sycl::half> dst_f16;
-    char * dst_t;
-
-    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_half;
-    dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
-
-    // dst strides
-    size_t nbd2 = dst->nb[2];
-    size_t nbd3 = dst->nb[3];
-
-    const sycl::half alpha_f16 = 1.0f;
-    const sycl::half beta_f16 = 0.0f;
-
-    const float alpha_f32 = 1.0f;
-    const float beta_f32  = 0.0f;
-
-    const void * alpha = &alpha_f16;
-    const void * beta  = &beta_f16;
-
-    // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
-    // once oneMKL open source supports half, half, float, float: datatypes
-    dst_t = (char *) dst_f16.alloc(ne_dst);
-
-    nbd2 /= sizeof(float) / sizeof(sycl::half);
-    nbd3 /= sizeof(float) / sizeof(sycl::half);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-#if 0
-    // use syclGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                SYCL_CHECK(
-                        syclGemmEx(g_sycl_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , SYCL_R_16F,   nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F,   nb11/sizeof(float),
-                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use syclGemmStridedBatchedEx
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-            *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
-            (const char *)src0_as_f16, dpct::library_data_t::real_half,
-            nb01 / nb00, nb02 / nb00,
-            (const char *)src1_f16, dpct::library_data_t::real_half,
-            nb11 / nb10, nb12 / nb10, beta,
-            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
-            ne12 * ne13, cu_compute_type)));
-    } else {
-        // use syclGemmBatchedEx
-        const int ne23 = ne12*ne13;
-
-        sycl_pool_alloc<const void *> ptrs_src(2*ne23);
-        sycl_pool_alloc<      void *> ptrs_dst(1*ne23);
-
-        sycl::range<3> block_dims(1, ne12, ne13);
-        /*
-        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(main_stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            main_stream->submit([&](sycl::handler &cgh) {
-                const void **ptrs_src_get = ptrs_src.get();
-                void **ptrs_dst_get = ptrs_dst.get();
-                size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
-                size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
-                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
-                                 [=](sycl::nd_item<3> item_ct1) {
-                                     k_compute_batched_ptrs(
-                                         src0_as_f16, src1_f16,
-                                         dst_t, ptrs_src_get,
-                                         ptrs_dst_get, ne12, ne13, ne23,
-                                         nb02, nb03, nb12_scaled, nb13_scaled,
-                                         nbd2, nbd3, r2, r3, item_ct1);
-                                 });
-            });
-        }
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-            *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
-            (const void **)(ptrs_src.get() + 0 * ne23),
-            dpct::library_data_t::real_half, nb01 / nb00,
-            (const void **)(ptrs_src.get() + 1 * ne23),
-            dpct::library_data_t::real_half, nb11 / nb10, beta,
-            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
-            cu_compute_type)));
-    }
-#endif
-
-    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
-    to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
-        ( dst->backend == GGML_BACKEND_TYPE_GPU);
-
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-
-    int64_t min_compute_capability = INT_MAX;
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            min_compute_capability = g_device_caps[id].cc;
-        }
-    }
-
-#ifdef SYCL_USE_XMX
-    const bool use_xmx = true;
-#else
-    const bool use_xmx = false;
-#endif
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
-        ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
-        ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
-        // KQ + KQV multi-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
-        ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) {
-#ifdef GGML_SYCL_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
-#endif // GGML_SYCL_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
-            } else {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-
-            if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-            }
-
-            if (use_mul_mat_q) {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
-            } else {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-            }
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-#if 0
-template<typename ... Srcs>
-static __global__ void k_compute_batched_ptrs_id(
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3,
-        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
-        const half * src1_f16, half * dst_f16,
-        const int32_t * ids, const int id,
-        Srcs... src0s) {
-
-    int i = ids[id];
-
-    half * src0_f16;
-    const void * srcs_ar[] = { (const half *) src0s... };
-    if (src0_type == GGML_TYPE_F16) {
-        src0_f16 = (half *) srcs_ar[i];
-    } else {
-        src0_f16 = src0_as_f16;
-        if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_sycl_t to_fp16 = ggml_get_to_fp16_sycl(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, syclStreamFireAndForget);
-        }
-    }
-
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src00 = dst->src[2];
-
-    const int id = dst->op_params[0];
-
-    GGML_ASSERT(!ggml_is_transposed(src00));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
-
-    //const int64_t nb01 = src00->nb[1];
-    GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-
-    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
-    //const int64_t nb11 = src1->nb[1];
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    syclStream_t main_stream = g_syclStreams[g_main_device_index][0];
-
-    SYCL_CHECK(syclSetStream(g_sycl_handles[g_main_device_index], main_stream));
-
-    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device_index];
-    //half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
-
-    // convert src1 to fp16
-    const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
-    GGML_ASSERT(to_fp16_sycl != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_sycl_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_sycl(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_sycl_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-    // use syclGemmBatchedEx
-    const int ne23 = ne12*ne13;
-
-    const void ** ptrs_src = nullptr;
-          void ** ptrs_dst = nullptr;
-
-    size_t ptrs_src_s = 0;
-    size_t ptrs_dst_s = 0;
-
-    ptrs_src = (const void **) ggml_sycl_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_sycl_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-    int64_t src0_ne = ggml_nelements(src00);
-    half * src0_as_f16 = nullptr;
-    size_t src0_as = 0;
-    if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_sycl_pool_malloc(src0_ne * sizeof(half), &src0_as);
-    }
-
-    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
-            ptrs_src, ptrs_dst,
-            ne12, ne13,
-            ne23,
-            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
-            nb12, nb13,
-            dst->nb[2], dst->nb[3],
-            r2, r3,
-            src00->type, src0_as_f16, src0_ne,
-            src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device_index] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device_index] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device_index] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device_index] : nullptr
-    );
-    SYCL_CHECK(syclGetLastError());
-
-    SYCL_CHECK(
-    syclGemmBatchedEx(g_sycl_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
-            ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), SYCL_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), SYCL_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), SYCL_R_16F, ne01,
-            ne23,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    if (src0_as != 0) {
-        ggml_sycl_pool_free(src0_as_f16, src0_as);
-    }
-    if (ptrs_src_s != 0) {
-        ggml_sycl_pool_free(ptrs_src, ptrs_src_s);
-    }
-    if (ptrs_dst_s != 0) {
-        ggml_sycl_pool_free(ptrs_dst, ptrs_dst_s);
-    }
-
-    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
-    to_fp32_sycl(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_sycl_pool_free(src1_as_f16, src1_as);
-    ggml_sycl_pool_free(dst_f16, dst_as);
-}
-#endif
-
-static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
-                                 const ggml_tensor *src1,
-                                 ggml_tensor *dst) try {
-#if 0
-    ggml_sycl_mul_mat_id_sycl(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-
-    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
-
-    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
-        SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
-
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
-
-    ggml_tensor_extra_gpu src1_row_extra;
-    ggml_tensor_extra_gpu dst_row_extra;
-
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    src1_row.backend = GGML_BACKEND_TYPE_GPU;
-    dst_row.backend  = GGML_BACKEND_TYPE_GPU;
-
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
-
-    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
-    char * dst_original  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device_index];
-
-    if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-        GGML_ASSERT(dst->backend  == GGML_BACKEND_TYPE_GPU);
-
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
-            //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            src1_row_extra.data_device[g_main_device_index] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device_index] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
-        }
-    } else {
-        sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
-        sycl_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
-
-        src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get();
-        dst_row_extra.data_device[g_main_device_index]  =  dst_contiguous.get();
-
-        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11)));
-                num_src1_rows++;
-            }
-
-            if (num_src1_rows == 0) {
-                continue;
-            }
-
-            src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;
-
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
-
-            num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
-                    dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
-                num_src1_rows++;
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_scale);
-}
-
-static void ggml_sycl_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_clamp);
-}
-
-static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst) try {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
-    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
-    char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
-        ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-        ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
-                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-
-    (void) dst;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_sycl_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    ggml_sycl_cpy(src0, dst, nullptr);
-    (void) src1;
-}
-
-static void ggml_sycl_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_diag_mask_inf);
-}
-
-static void ggml_sycl_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_soft_max);
-}
-
-static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
-}
-
-static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
-}
-
-static void ggml_sycl_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_im2col);
-}
-
-static void ggml_sycl_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_sum_rows);
-}
-
-static void ggml_sycl_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_argsort);
-}
-
-static void ggml_sycl_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
-}
-
-static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
-}
-
-void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
-    const int64_t nrows = ggml_nrows(tensor);
-
-    const int64_t ne0 = tensor->ne[0];
-
-    const size_t nb1 = tensor->nb[1];
-
-    ggml_backend_type backend = tensor->backend;
-    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
-            continue;
-        }
-        ggml_sycl_set_device(get_device_id_by_index(id));
-        const dpct::queue_ptr stream = g_syclStreams[id][0];
-
-        int64_t row_low, row_high;
-        if (backend == GGML_BACKEND_TYPE_GPU) {
-            row_low = 0;
-            row_high = nrows;
-        } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-            const int64_t rounding = get_row_rounding(tensor->type);
-
-            row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low % rounding;
-
-            if (id == g_device_count - 1) {
-                row_high = nrows;
-            } else {
-                row_high = nrows*g_tensor_split[id + 1];
-                row_high -= row_high % rounding;
-            }
-        } else {
-            GGML_ASSERT(false);
-        }
-        if (row_low == row_high) {
-            continue;
-        }
-
-        int64_t nrows_split = row_high - row_low;
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        char * buf;
-        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
-                                        size, *stream)));
-        char * buf_host = (char *)data + offset_split;
-
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                (*stream)
-                .memset(buf + original_size, 0, size - original_size)
-                .wait()));
-        }
-
-        SYCL_CHECK(CHECK_TRY_ERROR((*stream)
-                                    .memcpy(buf, buf_host, original_size)
-                                    .wait()));
-
-        extra->data_device[id] = buf;
-
-        if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
-                                                new sycl::event()));
-            }
-        }
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
-    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        const dpct::queue_ptr stream = g_syclStreams[id][0];
-        if (extra->data_device[id] != nullptr) {
-            SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
-            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(extra->data_device[id], *stream)));
-        }
-
-        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-            if (extra->events[id][is] != nullptr) {
-                SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    dpct::destroy_event(extra->events[id][is])));
-            }
-        }
-    }
-
-    delete extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
-static size_t g_temp_tensor_extra_index = 0;
-
-static ggml_tensor_extra_gpu * ggml_sycl_alloc_temp_tensor_extra() {
-    if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_SYCL_MAX_NODES];
-    }
-
-    size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_SYCL_MAX_NODES;
-    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
-    memset(extra, 0, sizeof(*extra));
-
-    return extra;
-}
-
-static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
-                                          bool scratch, bool force_inplace,
-                                          bool no_alloc) try {
-    if (scratch && g_scratch_size == 0) {
-        return;
-    }
-
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
-
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
-        const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
-        }
-    }
-    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
-    }
-
-    if (scratch && no_alloc) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra;
-
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW ||
-        force_inplace;
-    const size_t size = ggml_nbytes(tensor);
-
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
-
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
-        size_t offset = 0;
-        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->op_params, sizeof(size_t));
-        }
-        extra = ggml_sycl_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device_index] = src0_ddc + offset;
-    } else if (tensor->op == GGML_OP_CPY) {
-        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
-        void * src1_ddv = src1_extra->data_device[g_main_device_index];
-        extra = ggml_sycl_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device_index] = src1_ddv;
-    } else if (scratch) {
-        GGML_ASSERT(size <= g_scratch_size);
-        if (g_scratch_offset + size > g_scratch_size) {
-            g_scratch_offset = 0;
-        }
-
-        char * data = (char *) g_scratch_buffer;
-        if (data == nullptr) {
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                data = (char *)sycl::malloc_device(
-                    g_scratch_size, *stream)));
-            g_scratch_buffer = data;
-        }
-        extra = ggml_sycl_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device_index] = data + g_scratch_offset;
-
-        g_scratch_offset += size;
-
-        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
-    } else { // allocate new buffers outside of scratch
-        void * data;
-        SYCL_CHECK(CHECK_TRY_ERROR(data = (void *)sycl::malloc_device(
-                                        size, *stream)));
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            (*stream).memset(data, 0, size).wait()));
-        extra = new ggml_tensor_extra_gpu;
-        memset(extra, 0, sizeof(*extra));
-        extra->data_device[g_main_device_index] = data;
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
-                                     size_t offset) try {
-    if (g_scratch_size == 0) {
-        return;
-    }
-    if (g_scratch_buffer == nullptr) {
-        ggml_sycl_set_device(g_main_device);
-        const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
-        SYCL_CHECK(
-            CHECK_TRY_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
-                                 g_scratch_size, *stream)));
-    }
-
-    ggml_tensor_extra_gpu * extra = ggml_sycl_alloc_temp_tensor_extra();
-
-    const bool inplace = tensor->view_src != nullptr;
-
-    if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
-        size_t view_offset = 0;
-        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
-        }
-        extra->data_device[g_main_device_index] = src0_ddc + view_offset;
-    } else {
-        extra->data_device[g_main_device_index] = (char *) g_scratch_buffer + offset;
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
-                                    .memcpy(extra->data_device[g_main_device_index],
-                                            tensor->data, ggml_nbytes(tensor))
-                                    .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_sycl_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_sycl_assign_buffers_impl(tensor, true, false, false);
-}
-
-void ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
-    ggml_sycl_assign_buffers_impl(tensor, true, false, true);
-}
-
-void ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_sycl_assign_buffers_impl(tensor, false, false, false);
-}
-
-void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_sycl_assign_buffers_impl(tensor, false, true, false);
-}
-
-void ggml_sycl_set_main_device(const int main_device) try {
-
-    if (main_device >= g_all_sycl_device_count) {
-        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-                main_device, g_all_sycl_device_count, g_main_device);
-        return;
-    }
-
-    if (g_main_device != main_device && g_device_count >= 1) {
-        g_main_device = main_device;
-        g_main_device_index = get_device_index_by_id(g_main_device);
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(g_main_device))));
-        fprintf(stderr, "Using device %d (%s) as main device\n",
-                g_main_device, prop.get_name());
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_sycl_set_scratch_size(const size_t scratch_size) {
-    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
-    // it still won't always work as expected, but it's better than nothing
-    if (scratch_size > g_scratch_size) {
-        ggml_sycl_free_scratch();
-    }
-    g_scratch_size = std::max(g_scratch_size, scratch_size);
-}
-
-void ggml_sycl_free_scratch() try {
-    if (g_scratch_buffer == nullptr) {
-        return;
-    }
-    ggml_sycl_set_device(g_main_device);
-    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
-
-    SYCL_CHECK(CHECK_TRY_ERROR(
-        sycl::free(g_scratch_buffer, *stream)));
-    g_scratch_buffer = nullptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_sycl_loaded) return false;
-
-    ggml_sycl_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
-            return false;
-        }
-    }
-
-    switch (tensor->op) {
-        case GGML_OP_REPEAT:
-            func = ggml_sycl_repeat;
-            break;
-        case GGML_OP_GET_ROWS:
-            func = ggml_sycl_get_rows;
-            break;
-        case GGML_OP_DUP:
-            func = ggml_sycl_dup;
-            break;
-        case GGML_OP_ADD:
-            func = ggml_sycl_add;
-            break;
-        case GGML_OP_ACC:
-            func = ggml_sycl_acc;
-            break;
-        case GGML_OP_MUL:
-            func = ggml_sycl_mul;
-            break;
-        case GGML_OP_DIV:
-            func = ggml_sycl_div;
-            break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(tensor)) {
-                case GGML_UNARY_OP_GELU:
-                    func = ggml_sycl_gelu;
-                    break;
-                case GGML_UNARY_OP_SILU:
-                    func = ggml_sycl_silu;
-                    break;
-                case GGML_UNARY_OP_GELU_QUICK:
-                    func = ggml_sycl_gelu_quick;
-                    break;
-                case GGML_UNARY_OP_TANH:
-                    func = ggml_sycl_tanh;
-                    break;
-                case GGML_UNARY_OP_RELU:
-                    func = ggml_sycl_relu;
-                    break;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_NORM:
-            func = ggml_sycl_norm;
-            break;
-        case GGML_OP_GROUP_NORM:
-            func = ggml_sycl_group_norm;
-            break;
-        case GGML_OP_CONCAT:
-            func = ggml_sycl_concat;
-            break;
-        case GGML_OP_UPSCALE:
-            func = ggml_sycl_upscale;
-            break;
-        case GGML_OP_PAD:
-            func = ggml_sycl_pad;
-            break;
-        case GGML_OP_LEAKY_RELU:
-            func = ggml_sycl_leaky_relu;
-            break;
-        case GGML_OP_RMS_NORM:
-            func = ggml_sycl_rms_norm;
-            break;
-        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_sycl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_sycl_mul_mat;
-            break;
-        case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_sycl_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_sycl_mul_mat_id;
-            break;
-        case GGML_OP_SCALE:
-            func = ggml_sycl_scale;
-            break;
-        case GGML_OP_SQR:
-            func = ggml_sycl_sqr;
-            break;
-        case GGML_OP_CLAMP:
-            func = ggml_sycl_clamp;
-            break;
-        case GGML_OP_CPY:
-            func = ggml_sycl_cpy;
-            break;
-        case GGML_OP_CONT:
-            func = ggml_sycl_dup;
-            break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            func = ggml_sycl_nop;
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-            func = ggml_sycl_diag_mask_inf;
-            break;
-        case GGML_OP_SOFT_MAX:
-            func = ggml_sycl_soft_max;
-            break;
-        case GGML_OP_ROPE:
-            func = ggml_sycl_rope;
-            break;
-        case GGML_OP_ALIBI:
-            func = ggml_sycl_alibi;
-            break;
-        case GGML_OP_IM2COL:
-            func = ggml_sycl_im2col;
-            break;
-        case GGML_OP_SUM_ROWS:
-            func = ggml_sycl_sum_rows;
-            break;
-        case GGML_OP_ARGSORT:
-            func = ggml_sycl_argsort;
-            break;
-        default:
-            return false;
-    }
-
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
-    }
-
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return true;
-    }
-    func(tensor->src[0], tensor->src[1], tensor);
-    return true;
-}
-
-GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
-    int max_compute_units = -1;
-    for(int i=0;i<max_len;i++) id_list[i] = 0;
-
-    int device_count = dpct::dev_mgr::instance().device_count();
-
-    for(int id=0; id< device_count; id++){
-        sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        if (!device.is_gpu()) continue;
-        dpct::device_info prop;
-        dpct::get_device_info(prop, device);
-        if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
-    }
-
-    for(int id=0;id< device_count;id++){
-        sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        if (!device.is_gpu()) continue;
-        dpct::device_info prop;
-        dpct::get_device_info(prop, device);
-        if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
-            id_list[id] = 1;
-        }
-    }
-    return;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-int ggml_sycl_get_device_count() try {
-    int device_count;
-    if (CHECK_TRY_ERROR(device_count =
-                             dpct::dev_mgr::instance().device_count()) != 0) {
-        return 0;
-    }
-    return device_count;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
-                                      size_t description_size) try {
-    dpct::device_info prop;
-    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-        prop, dpct::dev_mgr::instance().get_device(device))));
-    snprintf(description, description_size, "%s", prop.get_name());
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_backend_sycl_context {
-    int device;
-    std::string name;
-};
-
-// sycl buffer
-
-struct ggml_backend_sycl_buffer_context {
-    int device;
-    void * dev_ptr = nullptr;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-    std::string name;
-
-     ggml_backend_sycl_buffer_context(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
-
-    ~ ggml_backend_sycl_buffer_context() {
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_sycl_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_SYCL_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_SYCL_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        memset(extra, 0, sizeof(*extra));
-
-        return extra;
-    }
-};
-
-GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
-GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
-}
-
-static void
-ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-    ggml_sycl_set_device(ctx->device);
-    int device_index = get_device_index_by_id(ctx->device);
-    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
-
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(sycl::free(ctx->dev_ptr, *stream)));
-    delete ctx;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-    return ctx->dev_ptr;
-}
-
-static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
-                                                 ggml_tensor *tensor) try {
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_sycl_alloc_temp_tensor_extra();
-
-    extra->data_device[ctx->device] = tensor->data;
-
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
-    tensor->extra = extra;
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[ctx->device][0]->memset(
-                (char *)tensor->data + original_size, 0,
-                padded_size - original_size)));
-        }
-    }
-
-    UNUSED(buffer);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
-                                                ggml_tensor *tensor,
-                                                const void *data, size_t offset,
-                                                size_t size) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    ggml_sycl_set_device(ctx->device);
-    int device_index = get_device_index_by_id(ctx->device);
-    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream)
-                             .memcpy((char *)tensor->data + offset, data, size)
-                             .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
-                                                const ggml_tensor *tensor,
-                                                void *data, size_t offset,
-                                                size_t size) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    ggml_sycl_set_device(ctx->device);
-    int device_index = get_device_index_by_id(ctx->device);
-    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
-
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    SYCL_CHECK(CHECK_TRY_ERROR(
-        (*stream)
-            .memcpy(data, (const char *)tensor->data + offset, size)
-            .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
-                                           uint8_t value) try {
-     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
-
-    ggml_sycl_set_device(ctx->device);
-    int device_index = get_device_index_by_id(ctx->device);
-    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
-                                    .memset(ctx->dev_ptr, value, buffer->size)
-                                    .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_sycl_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_sycl_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// sycl buffer type
-struct ggml_backend_sycl_buffer_type_context {
-    int device;
-    std::string name;
-};
-
-GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-
-    return ctx->name.c_str();
-}
-
-static ggml_backend_buffer_t
-ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
-                                           size_t size) try {
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    int device = (int) buft_ctx->device;
-
-    ggml_sycl_set_device(device);
-    int device_index = get_device_index_by_id(device);
-    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
-    size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
-
-    void * dev_ptr;
-    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
-                                    size, *stream)));
-
-     ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(device, dev_ptr);
-
-    return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    return dpct::get_current_device().get_max_mem_alloc_size();
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return size;
-
-    UNUSED(buft);
-}
-
-static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_sycl(backend);
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_sycl_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
-    /* .get_max_size     = */ ggml_backend_sycl_buffer_type_get_max_size,
-    /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
-    /* .is_host          = */ nullptr,
-};
-
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
-
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-            ggml_backend_sycl_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
-            };
-        }
-        ggml_backend_sycl_buffer_type_initialized = true;
-    }
-
-    return &ggml_backend_sycl_buffer_types[device];
-}
-
-// host buffer type
-
-GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_SYCL_NAME "_Host";
-
-    UNUSED(buft);
-}
-
-GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_SYCL_NAME "_Host";
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_sycl_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_sycl_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    // FIXME: this is a hack to avoid having to implement a new buffer type
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_sycl_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_sycl_buffer_type_host;
-}
-
-// backend
-
-static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
-    return GGML_SYCL_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_sycl_free(ggml_backend_t backend) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-
-    delete sycl_ctx;
-    delete backend;
-}
-
-static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-
-    return ggml_backend_sycl_buffer_type(sycl_ctx->device);
-}
-
-static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
-                                               ggml_tensor *tensor,
-                                               const void *data, size_t offset,
-                                               size_t size) try {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
-        (char *)tensor->data + offset, data, size)));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
-                                               const ggml_tensor *tensor,
-                                               void *data, size_t offset,
-                                               size_t size) try {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
-        data, (const char *)tensor->data + offset, size)));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-
-    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
-
-    UNUSED(backend);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, const ggml_cgraph * cgraph) {
-    GGML_ASSERT(!"not implemented");
-
-    return nullptr;
-
-    UNUSED(backend);
-    UNUSED(cgraph);
-}
-
-static void ggml_backend_sycl_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-
-    ggml_sycl_set_main_device(sycl_ctx->device);
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_TYPE_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
-            continue;
-
-        assert(node->backend == GGML_BACKEND_TYPE_GPU);
-        assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
-        assert(node->extra != nullptr);
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
-                assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
-                assert(node->src[j]->extra != nullptr);
-            }
-        }
-
-        bool ok = ggml_sycl_compute_forward(&params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            syclDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            syclMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), syclMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
-    }
-
-    UNUSED(backend);
-    return true;
-}
-
-static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_TANH:
-                    return true;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
-                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-
-                if (a->type == GGML_TYPE_IQ1_S) {
-                    return false;
-                }
-                if (a->type == GGML_TYPE_IQ3_XXS) {
-                  return false;
-                }
-                if (a->type == GGML_TYPE_IQ2_XXS) {
-                    return false;
-                }
-                if (a->type == GGML_TYPE_IQ2_XS) {
-                    return false;
-                }
-
-                return true;
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                        return true;
-                    default:
-                        return false;
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                return false;
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                if (src0_type == GGML_TYPE_F32) {
-                    return true;
-                } else {
-                    return false;
-                }
-            } break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
-        case GGML_OP_REPEAT:
-        case GGML_OP_DUP:
-        case GGML_OP_ADD:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
-        case GGML_OP_IM2COL:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_ACC:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        default:
-            return false;
-    }
-
-    UNUSED(backend);
-}
-
-static ggml_backend_i ggml_backend_sycl_interface = {
-    /* .get_name                = */ ggml_backend_sycl_name,
-    /* .free                    = */ ggml_backend_sycl_free,
-    /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ ggml_backend_sycl_synchronize,
-    /* .graph_plan_create       = */ ggml_backend_sycl_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_sycl_graph_plan_free,
-    /* .graph_plan_compute      = */ ggml_backend_sycl_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
-    /* .supports_op             = */ ggml_backend_sycl_supports_op,
-};
-
-static ggml_guid_t ggml_backend_sycl_guid() {
-    static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
-    return &guid;
-}
-
-ggml_backend_t ggml_backend_sycl_init(int device) {
-    ggml_init_sycl(); // TODO: remove from ggml.c
-
-    if (device < 0 || device >= ggml_sycl_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
-        return nullptr;
-    }
-
-    // not strictly necessary, but it may reduce the overhead of the first graph_compute
-    ggml_sycl_set_main_device(device);
-
-    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
-        /* .device = */ device,
-        /* .name   = */ GGML_SYCL_NAME + std::to_string(device),
-    };
-
-    ggml_backend_t sycl_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_sycl_guid(),
-        /* .interface = */ ggml_backend_sycl_interface,
-        /* .context   = */ ctx
-    };
-
-    return sycl_backend;
-}
-
-bool ggml_backend_is_sycl(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
-}
-
-static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
-    ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
-    return sycl_backend;
-
-    UNUSED(params);
-}
-
-extern "C" int ggml_backend_sycl_reg_devices();
-
-int ggml_backend_sycl_reg_devices() {
-    int device_count = ggml_sycl_get_device_count();
-
-    for (int i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
-    }
-    return device_count;
-}
diff --git a/ggml-sycl.h b/ggml-sycl.h
deleted file mode 100644
index 891f2d00a..000000000
--- a/ggml-sycl.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//
-//  MIT license
-//  Copyright (C) 2024 Intel Corporation
-//  SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_SYCL_MAX_DEVICES       16
-#define GGML_SYCL_NAME "SYCL"
-
-GGML_API void   ggml_init_sycl(void);
-GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
-GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml-vulkan-shaders.hpp b/ggml-vulkan-shaders.hpp
deleted file mode 100644
index e5e7a8414..000000000
--- a/ggml-vulkan-shaders.hpp
+++ /dev/null
@@ -1,51714 +0,0 @@
-#include <cstdint>
-
-unsigned char add_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x1f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x20,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x22,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x22,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x25,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x27,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x27,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x2d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x12,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x20,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x24,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x25,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x81,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x38,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t add_f32_len = 1456;
-
-unsigned char clamp_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x55,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x12,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x21,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x29,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x4c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x4d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1d,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0xb8,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x35,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x36,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0xba,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x11,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x36,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x11,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x26,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x48,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x4c,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t clamp_f32_len = 1448;
-
-unsigned char cpy_f16_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xaa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x0e,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x93,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x94,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x94,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x94,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x96,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9f,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa7,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x08,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x0e,0x00,0x0e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0f,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x92,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x95,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x95,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x9c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa1,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x07,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xa8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0a,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x18,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x92,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa1,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t cpy_f16_f16_len = 2480;
-
-unsigned char cpy_f32_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xad,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x0e,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x93,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x94,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x94,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x94,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x96,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa0,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x08,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x0e,0x00,0x0e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0f,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x92,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x95,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x95,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x9c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x9e,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa2,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x07,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xab,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0a,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xac,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x19,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x18,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xab,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa2,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x9c,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x92,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa6,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xa7,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xab,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xab,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t cpy_f32_f16_len = 2524;
-
-unsigned char cpy_f32_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xaa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x0e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x93,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x94,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x94,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x96,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x9d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x9d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa7,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x08,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x08,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x0e,0x00,
-0x0e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0f,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x92,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x95,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x95,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x9c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x07,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xa8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0a,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x19,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x18,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa1,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x92,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t cpy_f32_f32_len = 2472;
-
-unsigned char dequant_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x86,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x4e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x4e,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x50,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x5e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x5e,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x83,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x14,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x18,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x23,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x48,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x5d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x5f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x5f,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x0a,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x84,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0d,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0xa8,0x00,0x04,0x00,
-0x23,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x23,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x33,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x31,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x32,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x84,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x33,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x48,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x53,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x48,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x53,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x70,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x53,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x7f,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x84,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x84,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t dequant_f16_len = 1816;
-
-unsigned char dequant_q2_K_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x13,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x33,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x5f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x5f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x5f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x61,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x61,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x61,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x63,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x63,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x03,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x14,0x00,0x02,0x00,
-0x11,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x5d,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,
-0x5f,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x60,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x61,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x69,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x6c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x71,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x7b,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x7c,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x15,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x2a,0x00,0x03,0x00,0x11,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x29,0x00,0x03,0x00,0x11,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x04,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x18,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0x09,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x02,0x01,0x00,0x00,0x0d,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x10,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x12,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x30,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x69,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x6c,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x71,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xa0,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x69,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xab,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x6c,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x71,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xbf,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x69,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0xdc,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x6c,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x71,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xdf,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x69,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0xfb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0xfb,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x6c,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x71,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x7e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,
-0xff,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x11,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x2f,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x0a,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x01,0x00,0x00,0x04,0x01,0x00,0x00,
-0x0a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x04,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x04,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q2_K_len = 4056;
-
-unsigned char dequant_q3_K_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x42,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x33,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x71,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x73,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x75,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x77,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x77,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x77,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x77,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x79,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x79,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x06,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x07,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x07,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x07,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x09,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x09,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2f,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x14,0x00,0x02,0x00,
-0x11,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x71,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x73,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x75,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x76,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x77,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7a,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x7f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xde,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe2,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x06,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x08,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x07,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x08,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x15,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x2a,0x00,0x03,0x00,
-0x11,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x29,0x00,0x03,0x00,
-0x11,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x30,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x18,0x00,0x00,0x00,0x31,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x0d,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x10,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x12,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x38,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x30,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2f,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x19,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x6f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6f,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x92,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x97,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x96,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x97,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xab,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xb1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb0,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x52,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xb1,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc5,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7f,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7f,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0xd6,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xb1,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb1,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0xc4,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x97,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x97,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6f,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0x91,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x97,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x3b,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x76,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0xde,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0xde,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0xde,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xfd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x49,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x49,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0xf6,0x00,0x04,0x00,0xff,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x05,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfe,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x17,0x01,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x18,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7f,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xab,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x09,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x24,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0xde,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x25,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0xde,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0xed,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0x76,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x27,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0xe2,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xfd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xff,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x38,0x01,0x00,0x00,0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x11,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x36,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x41,0x01,0x00,0x00,
-0x30,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t dequant_q3_K_len = 4828;
-
-unsigned char dequant_q4_0_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x19,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x51,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x76,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x76,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x76,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x78,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x14,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x1e,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x53,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x5f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x00,0x00,0x00,0x41,
-0x1d,0x00,0x03,0x00,0x75,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x76,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x0a,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x2c,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xff,0x02,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0x13,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x02,0x03,0x00,0x00,0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x03,0x03,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x04,0x03,0x00,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x05,0x03,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x06,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x07,0x03,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x08,0x03,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x09,0x03,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,
-0x18,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x19,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x03,0x00,0x00,0x1a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x10,0x03,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x12,0x03,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x03,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x03,0x00,0x00,0x1d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x15,0x03,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x03,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x03,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x18,0x03,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x0d,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x24,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0xa8,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x24,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x24,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x32,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x33,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9b,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x34,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x83,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x87,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x83,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc1,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0xff,0x02,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xcb,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xf3,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xff,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x00,0x01,0x00,0x00,
-0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x02,0x01,0x00,0x00,0x01,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0x00,0x01,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x03,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x02,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x05,0x01,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x07,0x01,0x00,0x00,
-0x06,0x01,0x00,0x00,0xfd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x07,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x11,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x07,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x27,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x27,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x29,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,0x83,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x37,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x38,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x39,0x01,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x02,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x41,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x43,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x03,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x50,0x01,0x00,0x00,
-0x4f,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x51,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x54,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x55,0x01,0x00,0x00,0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x03,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x57,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x61,0x01,0x00,0x00,0x60,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x04,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x75,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x76,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x05,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x78,0x01,0x00,0x00,
-0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x78,0x01,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x7e,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x7e,0x01,0x00,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x05,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x85,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x89,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x06,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0x7f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x91,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x93,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x07,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,0x83,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x08,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xb9,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xbb,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x09,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xc9,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xca,0x01,0x00,0x00,
-0xc9,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xcb,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xce,0x01,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd8,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe3,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xed,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xef,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf4,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0xff,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x01,0x02,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x01,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0x80,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x09,0x02,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0x09,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0x14,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0d,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x16,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x18,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x19,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x83,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,0x15,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x25,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x0e,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0x31,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x33,0x02,0x00,0x00,
-0x32,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x3c,0x02,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x3c,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x40,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x41,0x02,0x00,0x00,0x40,0x02,0x00,0x00,0x67,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x43,0x02,0x00,0x00,0x40,0x02,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0x44,0x02,0x00,0x00,
-0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x45,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x02,0x00,0x00,0x80,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x51,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x10,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5b,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x66,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x11,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x67,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x69,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x11,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x78,0x02,0x00,0x00,
-0x77,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x75,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x79,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x80,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x81,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x82,0x02,0x00,0x00,0x81,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x83,0x02,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x80,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x83,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x90,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x83,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x13,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x14,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xab,0x02,0x00,0x00,
-0xaa,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x15,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0x67,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x80,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xc7,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0xc7,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc9,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x16,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xd1,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0xd1,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,0x78,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd3,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xdd,0x02,0x00,0x00,
-0xdc,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xde,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xe0,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x67,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xe2,0x02,0x00,0x00,0xe1,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xe4,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xe4,0x02,0x00,0x00,0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xe6,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0xdd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xed,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x17,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xef,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf0,0x02,0x00,0x00,
-0xef,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xf1,0x02,0x00,0x00,0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xed,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xf1,0x02,0x00,0x00,
-0xf0,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x02,0x00,0x00,0x80,0x00,0x00,0x00,0x18,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xf9,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,0xf9,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,
-0x78,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xf8,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xfb,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q4_0_len = 8856;
-
-unsigned char dequant_q4_1_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x59,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x51,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x51,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x53,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x14,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x18,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x4a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,
-0x51,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x53,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x64,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x68,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x7a,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x7b,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x03,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x03,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x41,0x03,0x00,0x00,
-0x13,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x42,0x03,0x00,0x00,0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x43,0x03,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x03,0x00,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x45,0x03,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x46,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x47,0x03,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x48,0x03,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x18,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x03,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x03,0x00,0x00,0x19,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x1a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x03,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x50,0x03,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x51,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x52,0x03,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x03,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x54,0x03,0x00,0x00,0x1d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x55,0x03,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x56,0x03,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x57,0x03,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x58,0x03,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xa0,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x0d,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa1,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x24,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0xa8,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0xaf,0x00,0x05,0x00,0x24,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x24,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x32,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x33,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x34,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x57,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x8c,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x3f,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xd3,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x64,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x40,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xff,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x12,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x81,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1d,0x01,0x00,0x00,0x85,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x21,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x21,0x01,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x85,0x00,0x00,0x00,0x41,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x17,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x34,0x01,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x37,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x39,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x40,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x38,0x01,0x00,0x00,
-0x38,0x01,0x00,0x00,0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00,
-0x85,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,0x43,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x4d,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x85,0x00,0x00,0x00,
-0x42,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x55,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x57,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x60,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x64,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x43,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x68,0x01,0x00,0x00,
-0x67,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x67,0x01,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x6d,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x6e,0x01,0x00,0x00,0x64,0x01,0x00,0x00,0x64,0x01,0x00,0x00,
-0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x6e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x85,0x00,0x00,0x00,
-0x43,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x78,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x79,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x75,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x79,0x01,0x00,0x00,
-0x78,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x85,0x00,0x00,0x00,0x44,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x81,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x83,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x83,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x91,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x97,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0x98,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x81,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x9b,0x01,0x00,0x00,0x99,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0x85,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0x85,0x00,0x00,0x00,0x46,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xaf,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x47,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xc2,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xc2,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xc4,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x85,0x00,0x00,0x00,0x47,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd8,0x01,0x00,0x00,0x85,0x00,0x00,0x00,
-0x48,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd9,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xdb,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xd8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xdb,0x01,0x00,0x00,
-0xda,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xe4,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x64,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x49,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xee,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xed,0x01,0x00,0x00,0xef,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xe5,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x85,0x00,0x00,0x00,
-0x49,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,
-0xfb,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xfd,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x85,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0xf3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x07,0x02,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x04,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x07,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x11,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x13,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x4b,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x16,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x18,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x18,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x19,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x14,0x02,0x00,0x00,0x14,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x85,0x00,0x00,0x00,0x4b,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x85,0x00,0x00,0x00,0x4c,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x32,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x33,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x3c,0x02,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,
-0x3c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0x41,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x42,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x43,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x45,0x02,0x00,0x00,
-0x44,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x48,0x02,0x00,0x00,0x45,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x49,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x40,0x02,0x00,0x00,
-0x40,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x49,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x85,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x55,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x55,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x02,0x00,0x00,0x85,0x00,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x5f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x69,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x64,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x4f,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x72,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x73,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x74,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x74,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x76,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,0x85,0x00,0x00,0x00,
-0x4f,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x80,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x81,0x02,0x00,0x00,
-0x80,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x85,0x00,0x00,0x00,0x50,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0x77,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x8b,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x51,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x98,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x85,0x00,0x00,0x00,0x51,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0xab,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xad,0x02,0x00,0x00,0xac,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x85,0x00,0x00,0x00,0x52,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb7,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xc7,0x02,0x00,0x00,
-0xc6,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc7,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0xc7,0x02,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xcb,0x02,0x00,0x00,
-0xca,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xcc,0x02,0x00,0x00,0xc9,0x02,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xcd,0x02,0x00,0x00,
-0xcc,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xce,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xcd,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,
-0x85,0x00,0x00,0x00,0x53,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xd9,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd9,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,0x85,0x00,0x00,0x00,
-0x54,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xcf,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xe1,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xe3,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xec,0x02,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xed,0x02,0x00,0x00,0xec,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xef,0x02,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xf0,0x02,0x00,0x00,0xef,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x64,0x00,0x00,0x00,0xf1,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x55,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0xf1,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xf3,0x02,0x00,0x00,0xf2,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xf4,0x02,0x00,0x00,
-0xf3,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xf5,0x02,0x00,0x00,0xf4,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf3,0x02,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xf7,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xf8,0x02,0x00,0x00,
-0xf5,0x02,0x00,0x00,0xf7,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0xf9,0x02,0x00,0x00,0xf8,0x02,0x00,0x00,
-0xed,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0xfa,0x02,0x00,0x00,0xf0,0x02,0x00,0x00,0xf0,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,
-0xf9,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x85,0x00,0x00,0x00,
-0x55,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x03,0x03,0x00,0x00,0xfb,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x04,0x03,0x00,0x00,
-0x03,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x05,0x03,0x00,0x00,0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x01,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x05,0x03,0x00,0x00,
-0x04,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x03,0x00,0x00,0x85,0x00,0x00,0x00,0x56,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0xfb,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0f,0x03,0x00,0x00,0x0e,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x18,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x18,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x1b,0x03,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x1d,0x03,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x57,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x1d,0x03,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x1e,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x20,0x03,0x00,0x00,0x1f,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x21,0x03,0x00,0x00,0x20,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x22,0x03,0x00,0x00,0x1f,0x03,0x00,0x00,
-0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x23,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x24,0x03,0x00,0x00,0x21,0x03,0x00,0x00,
-0x23,0x03,0x00,0x00,0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x24,0x03,0x00,0x00,0x19,0x03,0x00,0x00,
-0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x26,0x03,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x1c,0x03,0x00,0x00,0x81,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x27,0x03,0x00,0x00,0x25,0x03,0x00,0x00,
-0x26,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x03,0x00,0x00,0x85,0x00,0x00,0x00,0x57,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,
-0x27,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x30,0x03,0x00,0x00,0x2f,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x31,0x03,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x2d,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x31,0x03,0x00,0x00,0x30,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0x85,0x00,0x00,0x00,0x58,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x39,0x03,0x00,0x00,0x27,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x39,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x3b,0x03,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3b,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa0,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t dequant_q4_1_len = 9704;
-
-unsigned char dequant_q4_K_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xb1,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x33,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4b,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x4c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x4c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x4e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x4e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x50,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x08,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x09,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x09,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x36,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x14,0x00,0x02,0x00,
-0x11,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x42,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x45,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x4b,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x4c,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x08,0x01,0x00,0x00,
-0x45,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x15,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x58,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2a,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0x29,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x23,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x37,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x18,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x38,0x01,0x00,0x00,0x34,0x01,0x00,0x00,0x0d,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x10,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x12,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x30,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x52,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x45,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x52,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x45,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x71,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x76,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x47,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x47,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x7f,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x42,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x42,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xbc,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xbb,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x47,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0xd1,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x47,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x42,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x42,0x00,0x00,0x00,
-0xff,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x76,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x50,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x17,0x01,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0x45,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0x1c,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x66,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0x63,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xff,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0x30,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x30,0x01,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x66,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x76,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x50,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x66,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x63,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x66,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x68,0x01,0x00,0x00,
-0x67,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x66,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x76,0x00,0x00,0x00,0x72,0x01,0x00,0x00,0x50,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x71,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0x73,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x75,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x01,0x00,0x00,0x75,0x01,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x79,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x77,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x7b,0x01,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x66,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x63,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x86,0x01,0x00,0x00,
-0x85,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x66,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x76,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x50,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x91,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0x92,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x97,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x98,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x99,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x66,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x63,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xa4,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x11,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x39,0x01,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x2f,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x3d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x48,0x01,0x00,0x00,0x37,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x3d,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q4_K_len = 5940;
-
-unsigned char dequant_q5_0_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x9b,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x54,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x58,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa1,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa1,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x14,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x54,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x55,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x56,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x61,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x80,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x84,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x00,0x00,0x80,0x41,0x1d,0x00,0x03,0x00,0x9e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9f,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x0a,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x2c,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x82,0x04,0x00,0x00,0x0d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x83,0x04,0x00,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x84,0x04,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x85,0x04,0x00,0x00,0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x86,0x04,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x87,0x04,0x00,0x00,
-0x13,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x04,0x00,0x00,0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x04,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8a,0x04,0x00,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x04,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x04,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8d,0x04,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x04,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x04,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x90,0x04,0x00,0x00,
-0x18,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x91,0x04,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x92,0x04,0x00,0x00,0x19,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x04,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x94,0x04,0x00,0x00,0x1a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x95,0x04,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x96,0x04,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x97,0x04,0x00,0x00,0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x98,0x04,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x04,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x04,0x00,0x00,0x1f,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0d,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc4,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0xa8,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x24,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x32,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5a,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x61,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbe,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0xd6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x82,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x00,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x02,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x02,0x01,0x00,0x00,0x01,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x83,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x37,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0x1e,0x01,0x00,0x00,0x84,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x24,0x01,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x30,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x33,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x32,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x33,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x37,0x01,0x00,0x00,0x36,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xa9,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x37,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x41,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x85,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0x37,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x4a,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x4b,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0xa3,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0x60,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x86,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x64,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x6e,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x65,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x71,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x73,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x6e,0x01,0x00,0x00,0x73,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x75,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x76,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x55,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xa9,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x80,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x87,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x76,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x8a,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x94,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0x96,0x01,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x98,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x9b,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0x98,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x9c,0x01,0x00,0x00,0x71,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x48,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xab,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xac,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,0xab,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0xac,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xb1,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb1,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0xa9,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xbf,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x88,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc9,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xd3,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0xda,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0x89,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xde,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe1,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x83,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x89,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xea,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xed,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xec,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xd3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xa9,0x00,0x00,0x00,0x89,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,
-0xf4,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xfe,0x01,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x8a,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0xf4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x11,0x02,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x12,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x14,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x14,0x02,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x18,0x02,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x16,0x02,0x00,0x00,0x19,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x8b,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x1d,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x85,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x8b,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x25,0x02,0x00,0x00,0x24,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x27,0x02,0x00,0x00,0x26,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x1e,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x27,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x26,0x02,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x22,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0x31,0x02,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x12,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x39,0x02,0x00,0x00,0xa9,0x00,0x00,0x00,0x8b,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x3c,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x3d,0x02,0x00,0x00,0x3c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x8c,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x45,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x44,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x55,0x02,0x00,0x00,
-0x54,0x02,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x57,0x02,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x58,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x59,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0x58,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,
-0x59,0x02,0x00,0x00,0x8d,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x87,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x60,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,0x60,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x63,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x8d,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x66,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x68,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x61,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x70,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x72,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0xa9,0x00,0x00,0x00,0x8d,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x78,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x7c,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x02,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x8e,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x85,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x83,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x86,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x90,0x02,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x92,0x02,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x97,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x98,0x02,0x00,0x00,0x8f,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x88,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x8f,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xa3,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0xa7,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xa8,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xaa,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0xaa,0x02,0x00,0x00,0xac,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0xad,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xaf,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xae,0x02,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,
-0xaf,0x02,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xa9,0x00,0x00,0x00,0x8f,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xb1,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xbb,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x90,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc5,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xce,0x02,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,
-0xce,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xd1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0xd1,0x02,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0xd5,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xd3,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xd7,0x02,0x00,0x00,0x91,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xd9,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xda,0x02,0x00,0x00,0xd9,0x02,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdb,0x02,0x00,0x00,
-0xda,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xdd,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,0x8a,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xde,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x91,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe4,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,
-0xdb,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,
-0xdf,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xec,0x02,0x00,0x00,0xe9,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xed,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xee,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,0xed,0x02,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xef,0x02,0x00,0x00,
-0xee,0x02,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xf0,0x02,0x00,0x00,0xef,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x02,0x00,0x00,0xa9,0x00,0x00,0x00,0x91,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xf8,0x02,0x00,0x00,
-0xf0,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xf9,0x02,0x00,0x00,0xf8,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xf6,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xfa,0x02,0x00,0x00,0xf9,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x92,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0xf0,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x03,0x03,0x00,0x00,0x02,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x04,0x03,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x04,0x03,0x00,0x00,0x03,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,
-0x0d,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x10,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x10,0x03,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x11,0x03,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x14,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x16,0x03,0x00,0x00,0x12,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x16,0x03,0x00,0x00,0x93,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x18,0x03,0x00,0x00,0x17,0x03,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x18,0x03,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x19,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x16,0x03,0x00,0x00,0x8c,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x1d,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x20,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x93,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x21,0x03,0x00,0x00,0x20,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x21,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x23,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x25,0x03,0x00,0x00,
-0x1a,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x26,0x03,0x00,0x00,0x23,0x03,0x00,0x00,0x25,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x26,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x28,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x2a,0x03,0x00,0x00,
-0x1e,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x03,0x00,0x00,0x28,0x03,0x00,0x00,0x2a,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,
-0x2b,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x2d,0x03,0x00,0x00,0x27,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,
-0x2d,0x03,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,
-0x0e,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x35,0x03,0x00,0x00,0xa9,0x00,0x00,0x00,0x93,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x37,0x03,0x00,0x00,
-0x2f,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0x37,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x39,0x03,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x35,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x39,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x03,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x94,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x41,0x03,0x00,0x00,0x2f,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x42,0x03,0x00,0x00,0x41,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x43,0x03,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x43,0x03,0x00,0x00,0x42,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x03,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x4c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x50,0x03,0x00,0x00,0x4f,0x03,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x51,0x03,0x00,0x00,
-0x50,0x03,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x53,0x03,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x54,0x03,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x55,0x03,0x00,0x00,0x51,0x03,0x00,0x00,0x54,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x56,0x03,0x00,0x00,
-0x55,0x03,0x00,0x00,0x95,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x57,0x03,0x00,0x00,0x56,0x03,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x58,0x03,0x00,0x00,0x57,0x03,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x59,0x03,0x00,0x00,
-0x58,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x5b,0x03,0x00,0x00,0x55,0x03,0x00,0x00,0x8e,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x5b,0x03,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x03,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x95,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x60,0x03,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x61,0x03,0x00,0x00,
-0x60,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x62,0x03,0x00,0x00,0x61,0x03,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x64,0x03,0x00,0x00,
-0x59,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x65,0x03,0x00,0x00,0x62,0x03,0x00,0x00,0x64,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x66,0x03,0x00,0x00,
-0x65,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x67,0x03,0x00,0x00,0x61,0x03,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x69,0x03,0x00,0x00,
-0x5d,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x03,0x00,0x00,0x67,0x03,0x00,0x00,0x69,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x6b,0x03,0x00,0x00,
-0x6a,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x6c,0x03,0x00,0x00,0x66,0x03,0x00,0x00,0x6b,0x03,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x6d,0x03,0x00,0x00,
-0x6c,0x03,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x6e,0x03,0x00,0x00,0x6d,0x03,0x00,0x00,
-0x4d,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x03,0x00,0x00,0xa9,0x00,0x00,0x00,0x95,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x76,0x03,0x00,0x00,
-0x6e,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x77,0x03,0x00,0x00,0x76,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x78,0x03,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x74,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x78,0x03,0x00,0x00,0x77,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x03,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x96,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x80,0x03,0x00,0x00,0x6e,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x81,0x03,0x00,0x00,0x80,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x82,0x03,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x7f,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x82,0x03,0x00,0x00,0x81,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x8b,0x03,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x8c,0x03,0x00,0x00,
-0x8b,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x8e,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x8f,0x03,0x00,0x00,0x8e,0x03,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x90,0x03,0x00,0x00,
-0x8f,0x03,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x92,0x03,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x93,0x03,0x00,0x00,
-0x92,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x94,0x03,0x00,0x00,0x90,0x03,0x00,0x00,0x93,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x95,0x03,0x00,0x00,
-0x94,0x03,0x00,0x00,0x77,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x96,0x03,0x00,0x00,0x95,0x03,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x97,0x03,0x00,0x00,0x96,0x03,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x98,0x03,0x00,0x00,
-0x97,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x9a,0x03,0x00,0x00,0x94,0x03,0x00,0x00,0x90,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9b,0x03,0x00,0x00,
-0x9a,0x03,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x03,0x00,0x00,0x9b,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x9e,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x9f,0x03,0x00,0x00,0x9e,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa0,0x03,0x00,0x00,
-0x9f,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa1,0x03,0x00,0x00,0xa0,0x03,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa3,0x03,0x00,0x00,
-0x98,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa4,0x03,0x00,0x00,0xa1,0x03,0x00,0x00,0xa3,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xa5,0x03,0x00,0x00,
-0xa4,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa6,0x03,0x00,0x00,0xa0,0x03,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa8,0x03,0x00,0x00,
-0x9c,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa9,0x03,0x00,0x00,0xa6,0x03,0x00,0x00,0xa8,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xaa,0x03,0x00,0x00,
-0xa9,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xab,0x03,0x00,0x00,0xa5,0x03,0x00,0x00,0xaa,0x03,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xac,0x03,0x00,0x00,
-0xab,0x03,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xad,0x03,0x00,0x00,0xac,0x03,0x00,0x00,
-0x8c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x03,0x00,0x00,0xa9,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xb5,0x03,0x00,0x00,
-0xad,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xb6,0x03,0x00,0x00,0xb5,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xb7,0x03,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb3,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb7,0x03,0x00,0x00,0xb6,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x03,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x97,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xbf,0x03,0x00,0x00,0xad,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc0,0x03,0x00,0x00,0xbf,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0xc1,0x03,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xbe,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc1,0x03,0x00,0x00,0xc0,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xca,0x03,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xcb,0x03,0x00,0x00,
-0xca,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xcd,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xce,0x03,0x00,0x00,0xcd,0x03,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xcf,0x03,0x00,0x00,
-0xce,0x03,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xd1,0x03,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd2,0x03,0x00,0x00,
-0xd1,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd3,0x03,0x00,0x00,0xcf,0x03,0x00,0x00,0xd2,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd4,0x03,0x00,0x00,
-0xd3,0x03,0x00,0x00,0x82,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xd5,0x03,0x00,0x00,0xd4,0x03,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd6,0x03,0x00,0x00,0xd5,0x03,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd7,0x03,0x00,0x00,
-0xd6,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd9,0x03,0x00,0x00,0xd3,0x03,0x00,0x00,0x92,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xda,0x03,0x00,0x00,
-0xd9,0x03,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x03,0x00,0x00,0xda,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0xdd,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0xde,0x03,0x00,0x00,0xdd,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xdf,0x03,0x00,0x00,
-0xde,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe0,0x03,0x00,0x00,0xdf,0x03,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe2,0x03,0x00,0x00,
-0xd7,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe3,0x03,0x00,0x00,0xe0,0x03,0x00,0x00,0xe2,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xe4,0x03,0x00,0x00,
-0xe3,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe5,0x03,0x00,0x00,0xdf,0x03,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe7,0x03,0x00,0x00,
-0xdb,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe8,0x03,0x00,0x00,0xe5,0x03,0x00,0x00,0xe7,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xe9,0x03,0x00,0x00,
-0xe8,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0xea,0x03,0x00,0x00,0xe4,0x03,0x00,0x00,0xe9,0x03,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0xeb,0x03,0x00,0x00,
-0xea,0x03,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0xec,0x03,0x00,0x00,0xeb,0x03,0x00,0x00,
-0xcb,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x03,0x00,0x00,0xa9,0x00,0x00,0x00,0x82,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xf4,0x03,0x00,0x00,
-0xec,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xf5,0x03,0x00,0x00,0xf4,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0xf6,0x03,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xf2,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xf6,0x03,0x00,0x00,0xf5,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x03,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x98,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xfe,0x03,0x00,0x00,0xec,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xff,0x03,0x00,0x00,0xfe,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xfd,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x00,0x04,0x00,0x00,0xff,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x09,0x04,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x0a,0x04,0x00,0x00,
-0x09,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x0c,0x04,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x04,0x00,0x00,0x0c,0x04,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x0e,0x04,0x00,0x00,
-0x0d,0x04,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x10,0x04,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x11,0x04,0x00,0x00,
-0x10,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x12,0x04,0x00,0x00,0x0e,0x04,0x00,0x00,0x11,0x04,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x13,0x04,0x00,0x00,
-0x12,0x04,0x00,0x00,0x84,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x14,0x04,0x00,0x00,0x13,0x04,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x15,0x04,0x00,0x00,0x14,0x04,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x04,0x00,0x00,
-0x15,0x04,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x18,0x04,0x00,0x00,0x12,0x04,0x00,0x00,0x94,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x19,0x04,0x00,0x00,
-0x18,0x04,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x04,0x00,0x00,0x19,0x04,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x1c,0x04,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x84,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x1d,0x04,0x00,0x00,0x1c,0x04,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x1e,0x04,0x00,0x00,
-0x1d,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1f,0x04,0x00,0x00,0x1e,0x04,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x21,0x04,0x00,0x00,
-0x16,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x22,0x04,0x00,0x00,0x1f,0x04,0x00,0x00,0x21,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x23,0x04,0x00,0x00,
-0x22,0x04,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x24,0x04,0x00,0x00,0x1e,0x04,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x26,0x04,0x00,0x00,
-0x1a,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x27,0x04,0x00,0x00,0x24,0x04,0x00,0x00,0x26,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x28,0x04,0x00,0x00,
-0x27,0x04,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x29,0x04,0x00,0x00,0x23,0x04,0x00,0x00,0x28,0x04,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x2a,0x04,0x00,0x00,
-0x29,0x04,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x2b,0x04,0x00,0x00,0x2a,0x04,0x00,0x00,
-0x0a,0x04,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x04,0x00,0x00,0xa9,0x00,0x00,0x00,0x84,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x33,0x04,0x00,0x00,
-0x2b,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x34,0x04,0x00,0x00,0x33,0x04,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x35,0x04,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x31,0x04,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x35,0x04,0x00,0x00,0x34,0x04,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3c,0x04,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x99,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x3d,0x04,0x00,0x00,0x2b,0x04,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x3e,0x04,0x00,0x00,0x3d,0x04,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x3f,0x04,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3c,0x04,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3f,0x04,0x00,0x00,0x3e,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x48,0x04,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x49,0x04,0x00,0x00,
-0x48,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x4b,0x04,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x4c,0x04,0x00,0x00,0x4b,0x04,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x4d,0x04,0x00,0x00,
-0x4c,0x04,0x00,0x00,0x48,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x04,0x00,0x00,0x67,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x50,0x04,0x00,0x00,
-0x4f,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x51,0x04,0x00,0x00,0x4d,0x04,0x00,0x00,0x50,0x04,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x52,0x04,0x00,0x00,
-0x51,0x04,0x00,0x00,0x86,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x04,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x54,0x04,0x00,0x00,0x53,0x04,0x00,0x00,0x52,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x55,0x04,0x00,0x00,
-0x54,0x04,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x57,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x96,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x58,0x04,0x00,0x00,
-0x57,0x04,0x00,0x00,0x52,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x59,0x04,0x00,0x00,0x58,0x04,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x5b,0x04,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x86,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x5c,0x04,0x00,0x00,0x5b,0x04,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5d,0x04,0x00,0x00,
-0x5c,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x5e,0x04,0x00,0x00,0x5d,0x04,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x60,0x04,0x00,0x00,
-0x55,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x61,0x04,0x00,0x00,0x5e,0x04,0x00,0x00,0x60,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x62,0x04,0x00,0x00,
-0x61,0x04,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x63,0x04,0x00,0x00,0x5d,0x04,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x65,0x04,0x00,0x00,
-0x59,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x66,0x04,0x00,0x00,0x63,0x04,0x00,0x00,0x65,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x67,0x04,0x00,0x00,
-0x66,0x04,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x68,0x04,0x00,0x00,0x62,0x04,0x00,0x00,0x67,0x04,0x00,0x00,
-0x83,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x69,0x04,0x00,0x00,
-0x68,0x04,0x00,0x00,0xcf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x6a,0x04,0x00,0x00,0x69,0x04,0x00,0x00,
-0x49,0x04,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x70,0x04,0x00,0x00,0xa9,0x00,0x00,0x00,0x86,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x72,0x04,0x00,0x00,
-0x6a,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x73,0x04,0x00,0x00,0x72,0x04,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,0x74,0x04,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x70,0x04,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x74,0x04,0x00,0x00,0x73,0x04,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x04,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x9a,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x7c,0x04,0x00,0x00,0x6a,0x04,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x7d,0x04,0x00,0x00,0x7c,0x04,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x7e,0x04,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x7b,0x04,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7e,0x04,0x00,0x00,0x7d,0x04,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc3,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t dequant_q5_0_len = 13952;
-
-unsigned char dequant_q5_1_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x95,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x51,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x51,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x53,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x9b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x9b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbd,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x00,0x06,0x00,0x14,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x18,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x4a,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x4d,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x51,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x53,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x64,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x80,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x9a,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x04,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x04,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x04,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7f,0x04,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x80,0x04,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x81,0x04,0x00,0x00,0x13,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x82,0x04,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x83,0x04,0x00,0x00,0x05,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x84,0x04,0x00,0x00,0x15,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x85,0x04,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x86,0x04,0x00,0x00,0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x87,0x04,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x04,0x00,0x00,
-0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x04,0x00,0x00,0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x04,0x00,0x00,0x18,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8b,0x04,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x04,0x00,0x00,0x19,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x04,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8e,0x04,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x04,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x90,0x04,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x91,0x04,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x92,0x04,0x00,0x00,0x1d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x93,0x04,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x94,0x04,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xbe,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0d,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbf,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0xa8,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x2c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x24,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x32,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x33,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xbe,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x57,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xab,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xab,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xd6,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0xd8,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x7c,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xfb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x7d,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x06,0x01,0x00,0x00,
-0x05,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x12,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x37,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x17,0x01,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x18,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x7e,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x27,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x10,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0x13,0x01,0x00,0x00,0x81,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x30,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x39,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x37,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x3b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x7f,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x44,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x55,0x01,0x00,0x00,0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7c,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x61,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x65,0x01,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x68,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x6d,0x01,0x00,0x00,
-0x67,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x6e,0x01,0x00,0x00,0x6d,0x01,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0x6e,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x76,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0x70,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x78,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x76,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x7a,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,0x81,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x70,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x84,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0x96,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x99,0x01,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x99,0x01,0x00,0x00,0x48,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x69,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xaa,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xab,0x01,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xac,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,0xab,0x01,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0xac,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0x91,0x01,0x00,0x00,
-0x91,0x01,0x00,0x00,0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb9,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x82,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc3,0x01,0x00,0x00,
-0xc2,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd3,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x83,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,0xd3,0x01,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xd8,0x01,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x7d,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,
-0xde,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x83,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xde,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xe0,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xed,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x81,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xed,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,0x83,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xf8,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x84,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0xee,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xff,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x02,0x02,0x00,0x00,0x01,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x0b,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x11,0x02,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x11,0x02,0x00,0x00,0x85,0x04,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x13,0x02,0x00,0x00,
-0x12,0x02,0x00,0x00,0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x14,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0x14,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x17,0x02,0x00,0x00,0x7f,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x19,0x02,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7c,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x85,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x15,0x02,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x22,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x27,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x24,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x85,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x35,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x37,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x37,0x02,0x00,0x00,
-0x36,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,0x86,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x41,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x41,0x02,0x00,0x00,0x40,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x51,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x87,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x52,0x02,0x00,0x00,0x51,0x02,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x53,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x58,0x02,0x00,0x00,0x56,0x02,0x00,0x00,0x81,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x58,0x02,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,0x5c,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x87,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x54,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x61,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x63,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x64,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,0x69,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x66,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0x64,0x02,0x00,0x00,0x66,0x02,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x68,0x02,0x00,0x00,
-0x67,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x63,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x72,0x02,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x87,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x75,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x76,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x72,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x76,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x88,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x80,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x90,0x02,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x89,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x82,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x98,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x89,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x93,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xa7,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0xa8,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xaa,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,0x89,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,
-0xab,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x8a,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,0xab,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbf,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xce,0x02,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xce,0x02,0x00,0x00,0x8b,0x04,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xd1,0x02,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0xd1,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd4,0x02,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0x84,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7c,0x00,0x00,0x00,0xda,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x8b,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0xdb,0x02,0x00,0x00,0xda,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xdd,0x02,0x00,0x00,
-0xdc,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xdd,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xdc,0x02,0x00,0x00,0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xe4,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xe8,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0xe8,0x02,0x00,0x00,0xe9,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x8b,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf3,0x02,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xf4,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xf0,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xf4,0x02,0x00,0x00,
-0xf3,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfb,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,0x8c,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xfc,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,0xfc,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xfe,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xfe,0x02,0x00,0x00,0xfd,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x07,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x08,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x03,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x8d,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x0f,0x03,0x00,0x00,0x0e,0x03,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x10,0x03,0x00,0x00,
-0x0f,0x03,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x10,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x13,0x03,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x15,0x03,0x00,0x00,0x13,0x03,0x00,0x00,0x86,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x16,0x03,0x00,0x00,
-0x15,0x03,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,0x16,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,0x19,0x03,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x8d,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,0x19,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,
-0x1a,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x1b,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x11,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x1c,0x03,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x20,0x03,0x00,0x00,
-0x1f,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x21,0x03,0x00,0x00,0x1b,0x03,0x00,0x00,0x69,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x23,0x03,0x00,0x00,
-0x17,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x24,0x03,0x00,0x00,0x21,0x03,0x00,0x00,0x23,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x25,0x03,0x00,0x00,
-0x24,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x26,0x03,0x00,0x00,0x20,0x03,0x00,0x00,0x25,0x03,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x26,0x03,0x00,0x00,0x08,0x03,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x28,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x27,0x03,0x00,0x00,0x28,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x8d,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x31,0x03,0x00,0x00,0x29,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0x31,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x33,0x03,0x00,0x00,0x32,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x8e,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x29,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x3c,0x03,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x3d,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x3d,0x03,0x00,0x00,
-0x3c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x46,0x03,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x47,0x03,0x00,0x00,0x46,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x4a,0x03,0x00,0x00,0x49,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x4c,0x03,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x4c,0x03,0x00,0x00,0x8f,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x4f,0x03,0x00,0x00,0x4e,0x03,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x50,0x03,0x00,0x00,
-0x4f,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x52,0x03,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x54,0x03,0x00,0x00,0x52,0x03,0x00,0x00,
-0x88,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x55,0x03,0x00,0x00,0x54,0x03,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x56,0x03,0x00,0x00,
-0x55,0x03,0x00,0x00,0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,
-0x58,0x03,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x8f,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x59,0x03,0x00,0x00,
-0x58,0x03,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x59,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x5b,0x03,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x5d,0x03,0x00,0x00,0x50,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x5e,0x03,0x00,0x00,0x5b,0x03,0x00,0x00,
-0x5d,0x03,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x5f,0x03,0x00,0x00,0x5e,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x60,0x03,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x62,0x03,0x00,0x00,0x56,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x63,0x03,0x00,0x00,0x60,0x03,0x00,0x00,
-0x62,0x03,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x64,0x03,0x00,0x00,0x63,0x03,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x65,0x03,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x64,0x03,0x00,0x00,0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x66,0x03,0x00,0x00,0x65,0x03,0x00,0x00,0x47,0x03,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x67,0x03,0x00,0x00,
-0x4a,0x03,0x00,0x00,0x4a,0x03,0x00,0x00,0x81,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x68,0x03,0x00,0x00,0x66,0x03,0x00,0x00,
-0x67,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x03,0x00,0x00,0xa4,0x00,0x00,0x00,0x8f,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x70,0x03,0x00,0x00,
-0x68,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x71,0x03,0x00,0x00,0x70,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x72,0x03,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x6e,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x72,0x03,0x00,0x00,0x71,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x03,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x90,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x7a,0x03,0x00,0x00,0x68,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x7b,0x03,0x00,0x00,0x7a,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x7c,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x79,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7c,0x03,0x00,0x00,0x7b,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x85,0x03,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x86,0x03,0x00,0x00,
-0x85,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x88,0x03,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x89,0x03,0x00,0x00,0x88,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x8b,0x03,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x8c,0x03,0x00,0x00,0x8b,0x03,0x00,0x00,0x71,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x8d,0x03,0x00,0x00,
-0x8c,0x03,0x00,0x00,0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x8e,0x03,0x00,0x00,0x8d,0x03,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x03,0x00,0x00,0x8e,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x91,0x03,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x93,0x03,0x00,0x00,
-0x91,0x03,0x00,0x00,0x8a,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x94,0x03,0x00,0x00,0x93,0x03,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x95,0x03,0x00,0x00,0x94,0x03,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7c,0x00,0x00,0x00,0x97,0x03,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x98,0x03,0x00,0x00,0x97,0x03,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x99,0x03,0x00,0x00,0x98,0x03,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9a,0x03,0x00,0x00,
-0x99,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x9c,0x03,0x00,0x00,0x8f,0x03,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9d,0x03,0x00,0x00,
-0x9a,0x03,0x00,0x00,0x9c,0x03,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x9e,0x03,0x00,0x00,0x9d,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x9f,0x03,0x00,0x00,
-0x99,0x03,0x00,0x00,0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xa1,0x03,0x00,0x00,0x95,0x03,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xa2,0x03,0x00,0x00,
-0x9f,0x03,0x00,0x00,0xa1,0x03,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0xa3,0x03,0x00,0x00,0xa2,0x03,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xa4,0x03,0x00,0x00,
-0x9e,0x03,0x00,0x00,0xa3,0x03,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xa5,0x03,0x00,0x00,0xa4,0x03,0x00,0x00,
-0x86,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xa6,0x03,0x00,0x00,0x89,0x03,0x00,0x00,0x89,0x03,0x00,0x00,
-0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xa7,0x03,0x00,0x00,
-0xa5,0x03,0x00,0x00,0xa6,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xad,0x03,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xaf,0x03,0x00,0x00,0xa7,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xb0,0x03,0x00,0x00,
-0xaf,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xb1,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xad,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xb1,0x03,0x00,0x00,
-0xb0,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x03,0x00,0x00,0xa4,0x00,0x00,0x00,0x91,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0xb9,0x03,0x00,0x00,
-0xa7,0x03,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xba,0x03,0x00,0x00,0xb9,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0xbb,0x03,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb8,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xbb,0x03,0x00,0x00,0xba,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xc4,0x03,0x00,0x00,
-0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0xc5,0x03,0x00,0x00,0xc4,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xc7,0x03,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xc8,0x03,0x00,0x00,
-0xc7,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xca,0x03,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0xcb,0x03,0x00,0x00,0xca,0x03,0x00,0x00,
-0x7c,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xcc,0x03,0x00,0x00,0xcb,0x03,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xcd,0x03,0x00,0x00,
-0xcc,0x03,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xce,0x03,0x00,0x00,0xcd,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd0,0x03,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd2,0x03,0x00,0x00,0xd0,0x03,0x00,0x00,0x8c,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0xd3,0x03,0x00,0x00,
-0xd2,0x03,0x00,0x00,0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xd4,0x03,0x00,0x00,0xd3,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,0xd6,0x03,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x7c,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0xd7,0x03,0x00,0x00,0xd6,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd8,0x03,0x00,0x00,
-0xd7,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xd9,0x03,0x00,0x00,0xd8,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xdb,0x03,0x00,0x00,
-0xce,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xdc,0x03,0x00,0x00,0xd9,0x03,0x00,0x00,0xdb,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xdd,0x03,0x00,0x00,
-0xdc,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xde,0x03,0x00,0x00,0xd8,0x03,0x00,0x00,0x69,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe0,0x03,0x00,0x00,
-0xd4,0x03,0x00,0x00,0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0xe1,0x03,0x00,0x00,0xde,0x03,0x00,0x00,0xe0,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0xe2,0x03,0x00,0x00,
-0xe1,0x03,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xe3,0x03,0x00,0x00,0xdd,0x03,0x00,0x00,0xe2,0x03,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0xe4,0x03,0x00,0x00,
-0xe3,0x03,0x00,0x00,0xc5,0x03,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0xe5,0x03,0x00,0x00,0xc8,0x03,0x00,0x00,
-0xc8,0x03,0x00,0x00,0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0xe6,0x03,0x00,0x00,0xe4,0x03,0x00,0x00,0xe5,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x03,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x7c,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0xee,0x03,0x00,0x00,0xe6,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xef,0x03,0x00,0x00,0xee,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0xf0,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xec,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xf0,0x03,0x00,0x00,0xef,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf7,0x03,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x92,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0xf8,0x03,0x00,0x00,0xe6,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf9,0x03,0x00,0x00,
-0xf8,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0xfa,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xf7,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xfa,0x03,0x00,0x00,
-0xf9,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x03,0x04,0x00,0x00,0x58,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x04,0x04,0x00,0x00,0x03,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x06,0x04,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x07,0x04,0x00,0x00,0x06,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x09,0x04,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x0a,0x04,0x00,0x00,
-0x09,0x04,0x00,0x00,0x7e,0x04,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x0b,0x04,0x00,0x00,0x0a,0x04,0x00,0x00,
-0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x0c,0x04,0x00,0x00,0x0b,0x04,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0d,0x04,0x00,0x00,
-0x0c,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0f,0x04,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x0f,0x04,0x00,0x00,
-0x8e,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x12,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x04,0x00,0x00,
-0x12,0x04,0x00,0x00,0x41,0x00,0x08,0x00,0x7c,0x00,0x00,0x00,
-0x15,0x04,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x7e,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x16,0x04,0x00,0x00,
-0x15,0x04,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x17,0x04,0x00,0x00,0x16,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x18,0x04,0x00,0x00,0x17,0x04,0x00,0x00,
-0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x1a,0x04,0x00,0x00,0x0d,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x1b,0x04,0x00,0x00,0x18,0x04,0x00,0x00,
-0x1a,0x04,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x1c,0x04,0x00,0x00,0x1b,0x04,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x04,0x00,0x00,0x17,0x04,0x00,0x00,
-0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x1f,0x04,0x00,0x00,0x13,0x04,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x20,0x04,0x00,0x00,0x1d,0x04,0x00,0x00,
-0x1f,0x04,0x00,0x00,0x70,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
-0x21,0x04,0x00,0x00,0x20,0x04,0x00,0x00,0x50,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x22,0x04,0x00,0x00,0x1c,0x04,0x00,0x00,
-0x21,0x04,0x00,0x00,0x8e,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x23,0x04,0x00,0x00,0x22,0x04,0x00,0x00,0x04,0x04,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x24,0x04,0x00,0x00,
-0x07,0x04,0x00,0x00,0x07,0x04,0x00,0x00,0x81,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x25,0x04,0x00,0x00,0x23,0x04,0x00,0x00,
-0x24,0x04,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x04,0x00,0x00,0xa4,0x00,0x00,0x00,0x7e,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x2d,0x04,0x00,0x00,
-0x25,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x2e,0x04,0x00,0x00,0x2d,0x04,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x2f,0x04,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x2b,0x04,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x2f,0x04,0x00,0x00,0x2e,0x04,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x04,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x93,0x04,0x00,0x00,0x51,0x00,0x05,0x00,
-0x4a,0x00,0x00,0x00,0x37,0x04,0x00,0x00,0x25,0x04,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x38,0x04,0x00,0x00,0x37,0x04,0x00,0x00,0x41,0x00,0x06,0x00,
-0x57,0x00,0x00,0x00,0x39,0x04,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x36,0x04,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x39,0x04,0x00,0x00,0x38,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x42,0x04,0x00,0x00,0x58,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x43,0x04,0x00,0x00,
-0x42,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x45,0x04,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x46,0x04,0x00,0x00,0x45,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x48,0x04,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x49,0x04,0x00,0x00,0x48,0x04,0x00,0x00,0x80,0x04,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x4a,0x04,0x00,0x00,
-0x49,0x04,0x00,0x00,0x69,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x4b,0x04,0x00,0x00,0x4a,0x04,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x04,0x00,0x00,0x4b,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x4e,0x04,0x00,0x00,0x65,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x50,0x04,0x00,0x00,
-0x4e,0x04,0x00,0x00,0x90,0x04,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x51,0x04,0x00,0x00,0x50,0x04,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x52,0x04,0x00,0x00,0x51,0x04,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7c,0x00,0x00,0x00,0x54,0x04,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x80,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x55,0x04,0x00,0x00,0x54,0x04,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x56,0x04,0x00,0x00,0x55,0x04,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x57,0x04,0x00,0x00,
-0x56,0x04,0x00,0x00,0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x59,0x04,0x00,0x00,0x4c,0x04,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5a,0x04,0x00,0x00,
-0x57,0x04,0x00,0x00,0x59,0x04,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x5b,0x04,0x00,0x00,0x5a,0x04,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5c,0x04,0x00,0x00,
-0x56,0x04,0x00,0x00,0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x5e,0x04,0x00,0x00,0x52,0x04,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x5f,0x04,0x00,0x00,
-0x5c,0x04,0x00,0x00,0x5e,0x04,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x60,0x04,0x00,0x00,0x5f,0x04,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x61,0x04,0x00,0x00,
-0x5b,0x04,0x00,0x00,0x60,0x04,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x62,0x04,0x00,0x00,0x61,0x04,0x00,0x00,
-0x43,0x04,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x63,0x04,0x00,0x00,0x46,0x04,0x00,0x00,0x46,0x04,0x00,0x00,
-0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x64,0x04,0x00,0x00,
-0x62,0x04,0x00,0x00,0x63,0x04,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6a,0x04,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x80,0x04,0x00,0x00,0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,
-0x6c,0x04,0x00,0x00,0x64,0x04,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x6d,0x04,0x00,0x00,
-0x6c,0x04,0x00,0x00,0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,
-0x6e,0x04,0x00,0x00,0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x6a,0x04,0x00,0x00,0x3e,0x00,0x03,0x00,0x6e,0x04,0x00,0x00,
-0x6d,0x04,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x75,0x04,0x00,0x00,0xa4,0x00,0x00,0x00,0x94,0x04,0x00,0x00,
-0x51,0x00,0x05,0x00,0x4a,0x00,0x00,0x00,0x76,0x04,0x00,0x00,
-0x64,0x04,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x77,0x04,0x00,0x00,0x76,0x04,0x00,0x00,
-0x41,0x00,0x06,0x00,0x57,0x00,0x00,0x00,0x78,0x04,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x75,0x04,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x78,0x04,0x00,0x00,0x77,0x04,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbe,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbe,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q5_1_len = 13548;
-
-unsigned char dequant_q5_K_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xa0,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x33,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4b,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x4e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x4e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x4e,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x4e,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x50,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x12,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x13,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x13,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x13,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x15,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x15,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x89,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x14,0x00,0x02,0x00,
-0x11,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x42,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x45,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x4b,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x4e,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x4f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x50,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x82,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x12,0x01,0x00,0x00,
-0x45,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x13,0x01,0x00,0x00,
-0x12,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x14,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x14,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0x40,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x15,0x00,0x00,0x00,0x89,0x01,0x00,0x00,0x88,0x01,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2a,0x00,0x03,0x00,
-0x11,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,0x29,0x00,0x03,0x00,
-0x11,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x8a,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x18,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x0d,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x10,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x12,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x30,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2f,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x19,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x45,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x54,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x76,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x75,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x76,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x76,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x47,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x8f,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x47,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x42,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x42,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x82,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc1,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd7,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,
-0xe4,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe4,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc1,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc1,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x47,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0xd6,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x47,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x42,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x42,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0x41,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x29,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x82,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,0x98,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7b,0x00,0x00,0x00,0x21,0x01,0x00,0x00,
-0x52,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0xab,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x27,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x54,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x67,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0x36,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x37,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x38,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x39,0x01,0x00,0x00,0x98,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x66,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7b,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x52,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x0a,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x41,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0xab,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x09,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x42,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x47,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x54,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0xab,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x60,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x7f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x42,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x54,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x67,0x00,0x00,0x00,
-0x69,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0x71,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x71,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x78,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,0xab,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x09,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7f,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x7f,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,
-0x83,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xff,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x83,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x54,0x00,0x00,0x00,
-0x85,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x11,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x2f,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x90,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x99,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x90,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8a,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q5_K_len = 5988;
-
-unsigned char dequant_q6_K_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x10,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x23,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x33,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x62,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x62,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x62,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x64,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x64,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x79,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x79,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x05,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x14,0x00,0x02,0x00,
-0x11,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x5b,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x5e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x61,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x62,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x65,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x6f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x78,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7a,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x15,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x5c,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x2a,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x29,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x06,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x18,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x07,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x07,0x01,0x00,0x00,0x03,0x01,0x00,0x00,0x0d,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x0e,0x01,0x00,0x00,0x10,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x12,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x30,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x19,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x74,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x61,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x83,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x6c,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x57,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x5e,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x6f,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x6f,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x61,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x74,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x9f,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x83,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x6c,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x57,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6f,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x61,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x74,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xbf,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x83,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x57,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x57,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6f,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x61,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x74,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xdf,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x83,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x57,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x57,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xfb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x6f,0x00,0x00,0x00,
-0xff,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x61,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0xff,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x74,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x01,0x01,0x00,0x00,
-0x00,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x11,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0x2f,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0f,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x0c,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x06,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x06,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q6_K_len = 4296;
-
-unsigned char dequant_q8_0_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x23,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x51,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x70,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x70,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x14,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x49,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x4c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x1e,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x51,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x52,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x5f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x6f,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x70,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x71,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x71,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x07,0x03,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x08,0x03,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x03,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x10,0x03,0x00,0x00,0x0d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x03,0x00,0x00,0x13,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,0x14,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x18,0x03,0x00,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,0x17,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,
-0x18,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x19,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x03,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x21,0x03,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x22,0x03,0x00,0x00,0x1f,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0d,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x96,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0xa8,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x18,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xaf,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x24,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x32,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x18,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x81,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x90,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xbb,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc5,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x07,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x08,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0xd6,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd8,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe4,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x08,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xee,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x09,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xff,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0xff,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
-0x00,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x02,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0x02,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x09,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x03,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x0d,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x14,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x24,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x27,0x01,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x40,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0x57,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x49,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x50,0x01,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x51,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x0e,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x67,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x68,0x01,0x00,0x00,
-0x67,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x69,0x01,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x69,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x10,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x77,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x0f,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x87,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x88,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,0x10,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x7e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0x90,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x92,0x01,0x00,0x00,0x91,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x9b,0x01,0x00,0x00,
-0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x11,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x9c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,0x11,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x12,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbb,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0x57,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x13,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x13,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0xd8,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xda,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x14,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0xe4,0x01,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xe1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xe4,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xed,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x15,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xff,0x01,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x15,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x01,0x02,0x00,0x00,0xf9,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0x01,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x03,0x02,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xff,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x03,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0a,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,0x16,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0x0b,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0d,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x16,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x18,0x02,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x17,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x18,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x19,0x02,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x18,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x21,0x02,0x00,0x00,
-0x17,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x22,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x2c,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x18,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0x34,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0x34,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0x36,0x02,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x36,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,0x57,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x40,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x41,0x02,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x19,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x43,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x44,0x02,0x00,0x00,0x43,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x44,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,0x40,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x19,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0x55,0x02,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x55,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x5f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x69,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x6b,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x70,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x72,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x73,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x74,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x1b,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x7e,0x02,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x85,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x86,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x87,0x02,0x00,0x00,0x86,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x88,0x02,0x00,0x00,0x87,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x1d,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0x93,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x98,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xa7,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb1,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x57,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xba,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x1f,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x20,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xc6,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x1f,0x03,0x00,0x00,0x51,0x00,0x05,0x00,
-0x49,0x00,0x00,0x00,0xce,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xce,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x56,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd0,0x02,0x00,0x00,0xcf,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x20,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0xd9,0x02,0x00,0x00,
-0xd8,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0xda,0x02,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xda,0x02,0x00,0x00,
-0xd9,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0xe4,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,
-0x54,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x21,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0xe8,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x5f,0x00,0x00,0x00,0xea,0x02,0x00,0x00,0x54,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x22,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xeb,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0xed,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xee,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,0xed,0x02,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xef,0x02,0x00,0x00,
-0xee,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x21,0x03,0x00,0x00,0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,
-0xf7,0x02,0x00,0x00,0xef,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0xf8,0x02,0x00,0x00,
-0xf7,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,
-0xf9,0x02,0x00,0x00,0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xf5,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xf9,0x02,0x00,0x00,
-0xf8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x00,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x51,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0xef,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0x01,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x03,0x03,0x00,0x00,
-0x72,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x00,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x03,0x03,0x00,0x00,0x02,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x95,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t dequant_q8_0_len = 8868;
-
-unsigned char diag_mask_inf_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x4c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x15,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x30,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x30,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x32,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x32,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x49,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x18,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x1d,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x29,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2f,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x31,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x31,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x18,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x4a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x4b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1f,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x34,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0xac,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x83,0x00,0x05,0x00,
-0x29,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x47,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x4a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x4a,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t diag_mask_inf_f32_len = 1480;
-
-unsigned char f32_to_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x11,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x33,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x34,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x36,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x36,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x40,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x40,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x42,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x11,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x12,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x23,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x32,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x34,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x3f,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x41,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x23,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x29,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x27,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x28,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x23,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x29,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x29,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x23,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x31,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x4a,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x32,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x4f,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x31,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x31,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t f32_to_f16_len = 1596;
-
-unsigned char gelu_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x4b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x36,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x38,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x38,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x48,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x12,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x21,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x2a,0x42,0x4c,0x3f,
-0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x13,0x27,0x37,0x3d,0x1d,0x00,0x03,0x00,
-0x35,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x00,0x00,0x00,0x3f,0x2b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x00,0x00,0x00,0x40,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x49,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x4a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x49,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x11,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x11,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x88,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x26,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x45,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x49,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x49,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t gelu_f32_len = 1484;
-
-unsigned char get_rows_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x65,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x78,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x1d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x30,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x51,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x58,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x62,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x58,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x51,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x58,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x58,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x6e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x58,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x76,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_f16_len = 1948;
-
-unsigned char get_rows_f16_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x7a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x65,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x1d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x30,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x51,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x58,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x6d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x78,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x58,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x58,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x6d,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x6e,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x6d,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x75,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t get_rows_f16_f32_len = 1932;
-
-unsigned char get_rows_q4_0_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x98,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x79,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x1e,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x63,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x67,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x41,0x1d,0x00,0x03,0x00,
-0x79,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x2c,0x00,0x05,0x00,
-0x67,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x8f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x90,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x63,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x67,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x83,0x00,0x05,0x00,0x67,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x67,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5c,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x1c,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5c,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x8c,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8f,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_q4_0_len = 2372;
-
-unsigned char get_rows_q4_0_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x97,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x79,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x1e,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x63,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x67,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x41,0x1d,0x00,0x03,0x00,
-0x79,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2c,0x00,0x05,0x00,0x67,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x8e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8f,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x8e,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x63,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x67,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x67,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x67,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x83,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x84,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x1c,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x83,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x8b,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x8e,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8e,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_q4_0_f32_len = 2356;
-
-unsigned char get_rows_q4_1_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x96,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x58,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x7f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x7f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x7f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x81,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x81,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x93,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x1d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x30,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x68,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x7e,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x7f,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x80,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x80,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x94,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x95,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x94,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x6c,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x1c,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5c,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x5c,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x91,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x94,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x94,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_q4_1_len = 2440;
-
-unsigned char get_rows_q4_1_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x58,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x7f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x7f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x7f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x81,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x81,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x92,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x1d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x30,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x68,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x7e,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x7f,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x80,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x80,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x88,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x93,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x94,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x93,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x68,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x6c,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x6c,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x81,0x00,0x05,0x00,
-0x6c,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x88,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x89,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x88,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x90,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x93,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x93,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t get_rows_q4_1_f32_len = 2424;
-
-unsigned char get_rows_q5_0_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x59,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x5a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa2,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xa3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xa3,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xa3,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb7,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,0x58,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x5b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x5b,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x84,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x88,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x00,0x00,0x80,0x41,0x1d,0x00,0x03,0x00,
-0xa2,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa4,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xa4,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x2c,0x00,0x05,0x00,
-0x88,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xb8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5e,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x84,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x88,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x83,0x00,0x05,0x00,
-0x88,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x88,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x1c,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5e,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xad,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x5e,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xb5,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_q5_0_len = 2884;
-
-unsigned char get_rows_q5_0_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x58,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x59,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x5a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa2,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xa3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xa3,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xa3,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb6,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,0x58,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x5a,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x5b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x5b,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x84,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x88,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x00,0x00,0x80,0x41,0x1d,0x00,0x03,0x00,
-0xa2,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa4,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xa4,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xac,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2c,0x00,0x05,0x00,0x88,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xb7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5e,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x84,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x88,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x83,0x00,0x05,0x00,0x88,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x88,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xac,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xad,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xac,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t get_rows_q5_0_f32_len = 2868;
-
-unsigned char get_rows_q5_1_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xb6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x58,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x58,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x9f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x9f,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa1,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa1,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb3,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1e,0x00,0x06,0x00,0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x1e,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x69,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x80,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x84,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x9e,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9f,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xb4,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb5,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xb4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x69,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x80,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x5c,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xa9,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x5c,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xb1,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xb4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb4,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t get_rows_q5_1_len = 2796;
-
-unsigned char get_rows_q5_1_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xb5,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x58,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x58,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x9f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x9f,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa1,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa1,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1e,0x00,0x06,0x00,0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x1e,0x00,0x06,0x00,0x56,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x69,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x80,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x84,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x9e,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9f,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xa8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xb3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xb4,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xb3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5c,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x69,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x80,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x1c,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xa9,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xb3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xb3,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_q5_1_f32_len = 2780;
-
-unsigned char get_rows_q8_0_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x8b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x58,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x74,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x74,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x76,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x88,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x1e,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x55,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x56,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x73,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x75,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x89,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x89,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x5a,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x63,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x63,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x5e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,0x5e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x1c,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x51,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5a,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x5a,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x86,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x89,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x89,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t get_rows_q8_0_len = 2296;
-
-unsigned char get_rows_q8_0_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x8a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x56,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x56,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x58,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x74,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x74,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x76,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x1d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x1e,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x55,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x56,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x73,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x75,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x88,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x89,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x10,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x88,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x27,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x30,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5a,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x51,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x63,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x10,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x63,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x10,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x7d,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7e,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x7d,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x85,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x88,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x88,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t get_rows_q8_0_f32_len = 2280;
-
-unsigned char matmul_f16_aligned_l_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x5b,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x04,0x03,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8a,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x8b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8b,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbb,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x01,0x03,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x02,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x02,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x04,0x03,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x04,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x8a,0x01,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x8b,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x8c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x8c,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x0f,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x10,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3c,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x3d,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x3c,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x01,0x03,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x03,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x03,0x03,0x00,0x00,0x04,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x16,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1f,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x10,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x29,0x03,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x42,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3e,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x2a,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x37,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x3e,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x05,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x27,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x48,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,
-0x51,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x57,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0x74,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x42,0x03,0x00,0x00,
-0x78,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x91,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xab,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xc4,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xca,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xce,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xd2,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xdf,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xea,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xea,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x3b,0x03,0x00,0x00,0x65,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0x3e,0x03,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x42,0x03,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf7,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0x44,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xfd,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xff,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x48,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x48,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x01,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x05,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x00,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x13,0x02,0x00,0x00,
-0x48,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x48,0x03,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,0x20,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x44,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x02,0x00,0x00,0x27,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x27,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x09,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x02,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x02,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x48,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xff,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x01,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x49,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x2f,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x33,0x02,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x57,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x37,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x3b,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x02,0x00,0x00,
-0x49,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x49,0x03,0x00,0x00,0x47,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x44,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x80,0x01,0x00,0x00,
-0x52,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x55,0x02,0x00,0x00,
-0x54,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x49,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x61,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5d,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x61,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x63,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x63,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0x66,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x65,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x69,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x64,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x50,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x64,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x50,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x52,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x75,0x02,0x00,0x00,0x74,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x50,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x52,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x52,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xba,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x92,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x94,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x75,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x50,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x65,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xfa,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfa,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x44,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf9,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x2a,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x8c,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc1,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xc7,0x02,0x00,0x00,0x2b,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc3,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xc7,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x26,0x03,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xcf,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd4,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xd3,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x02,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x47,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0x24,0x03,0x00,0x00,0xe4,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x2e,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe3,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe7,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xe3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x30,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x22,0x03,0x00,0x00,0xec,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x30,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xeb,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xef,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x30,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf5,0x02,0x00,0x00,0xf2,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xf7,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xf5,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf6,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xfc,0x02,0x00,0x00,0xfb,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xfc,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb8,0x00,0x00,0x00,0xfe,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0xfd,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xfe,0x02,0x00,0x00,0xff,0x02,0x00,0x00,
-0x00,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0xff,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x0a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x08,0x03,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x03,0x00,0x00,0xbf,0x02,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x0d,0x03,0x00,0x00,0xd7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0x2b,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x13,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x16,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x17,0x03,0x00,0x00,0x19,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x1d,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x1d,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x1f,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x04,0x03,0x00,0x00,0x34,0x00,0x00,0x00,
-0x11,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x20,0x03,0x00,0x00,
-0x1e,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,0x00,0x03,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x00,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xeb,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc4,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0x2b,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc3,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_aligned_l_len = 11936;
-
-unsigned char matmul_f16_aligned_l_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf3,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x53,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x56,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x56,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x58,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x58,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x99,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9a,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xf4,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x49,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x55,0x01,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x57,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x57,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xaa,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xab,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xd7,0x01,0x00,0x00,
-0xba,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xd8,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x99,0x02,0x00,0x00,0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x9a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9b,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x9b,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xb7,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xab,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xda,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x8d,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x0e,0x01,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x30,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3a,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x01,0x00,0x00,0xda,0x02,0x00,0x00,0x43,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x44,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x61,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x79,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x25,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x82,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x87,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0xda,0x02,0x00,0x00,0x8e,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x92,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x92,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x39,0x02,0x00,0x00,0x95,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x94,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x98,0x01,0x00,0x00,0x93,0x01,0x00,0x00,0x94,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xe0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9c,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa0,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x9b,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xac,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xf2,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xca,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xe1,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc9,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcd,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd5,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd0,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xef,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xef,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xe1,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x37,0x02,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xe2,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf7,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xfb,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x35,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xff,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x03,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfe,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x33,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x07,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x0b,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0x31,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0f,0x02,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x13,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0xac,0x01,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0xd9,0x01,0x00,0x00,0x17,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x08,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0xe8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x00,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x95,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x95,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x92,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x94,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0x40,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x52,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5b,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5f,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x61,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x63,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x67,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x62,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x73,0x02,0x00,0x00,
-0xc3,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x76,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7f,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7b,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7f,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x81,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x81,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x83,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x87,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x83,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x82,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x8f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8d,0x02,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x8f,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x8e,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x98,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x96,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x97,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0xab,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,
-0xad,0x02,0x00,0x00,0xae,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xb7,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x98,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x84,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x84,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x81,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x83,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x64,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x64,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x63,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_aligned_l_fp32_len = 10512;
-
-unsigned char matmul_f16_aligned_m_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x5b,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x04,0x03,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8a,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x8b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8b,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbb,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x01,0x03,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x02,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x02,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x04,0x03,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x04,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x8a,0x01,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x8b,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x8c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x8c,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x0f,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x10,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3c,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x3d,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x3c,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x01,0x03,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x03,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x03,0x03,0x00,0x00,0x04,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x16,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1f,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x10,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x29,0x03,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x42,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3e,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x2a,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x37,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x3e,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x05,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x27,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x48,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,
-0x51,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x57,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0x74,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x42,0x03,0x00,0x00,
-0x78,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x91,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xab,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xc4,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xca,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xce,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xd2,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xdf,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xea,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xea,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x3b,0x03,0x00,0x00,0x65,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0x3e,0x03,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x42,0x03,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf7,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0x44,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xfd,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xff,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x48,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x48,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x01,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x05,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x00,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x13,0x02,0x00,0x00,
-0x48,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x48,0x03,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,0x20,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x44,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x02,0x00,0x00,0x27,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x27,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x09,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x02,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x02,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x48,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xff,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x01,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x49,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x2f,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x33,0x02,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x57,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x37,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x3b,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x02,0x00,0x00,
-0x49,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x49,0x03,0x00,0x00,0x47,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x44,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x80,0x01,0x00,0x00,
-0x52,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x55,0x02,0x00,0x00,
-0x54,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x49,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x61,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5d,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x61,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x63,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x63,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0x66,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x65,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x69,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x64,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x50,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x64,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x50,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x52,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x75,0x02,0x00,0x00,0x74,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x50,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x52,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x52,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xba,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x92,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x94,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x75,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x50,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x65,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xfa,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfa,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x44,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf9,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x2a,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x8c,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc1,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xc7,0x02,0x00,0x00,0x2b,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc3,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xc7,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x26,0x03,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xcf,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd4,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xd3,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x02,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x47,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0x24,0x03,0x00,0x00,0xe4,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x2e,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe3,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe7,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xe3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x30,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x22,0x03,0x00,0x00,0xec,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x30,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xeb,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xef,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x30,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf5,0x02,0x00,0x00,0xf2,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xf7,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xf5,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf6,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xfc,0x02,0x00,0x00,0xfb,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xfc,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb8,0x00,0x00,0x00,0xfe,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0xfd,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xfe,0x02,0x00,0x00,0xff,0x02,0x00,0x00,
-0x00,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0xff,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x0a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x08,0x03,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x03,0x00,0x00,0xbf,0x02,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x0d,0x03,0x00,0x00,0xd7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0x2b,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x13,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x16,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x17,0x03,0x00,0x00,0x19,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x1d,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x1d,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x1f,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x04,0x03,0x00,0x00,0x34,0x00,0x00,0x00,
-0x11,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x20,0x03,0x00,0x00,
-0x1e,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,0x00,0x03,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x00,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xeb,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc4,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0x2b,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc3,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_aligned_m_len = 11936;
-
-unsigned char matmul_f16_aligned_m_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf3,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x53,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x56,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x56,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x58,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x58,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x99,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9a,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xf4,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x49,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x55,0x01,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x57,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x57,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xaa,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xab,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xd7,0x01,0x00,0x00,
-0xba,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xd8,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x99,0x02,0x00,0x00,0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x9a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9b,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x9b,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xb7,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xab,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xda,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x8d,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x0e,0x01,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x30,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3a,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x01,0x00,0x00,0xda,0x02,0x00,0x00,0x43,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x44,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x61,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x79,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x25,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x82,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x87,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0xda,0x02,0x00,0x00,0x8e,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x92,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x92,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x39,0x02,0x00,0x00,0x95,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x94,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x98,0x01,0x00,0x00,0x93,0x01,0x00,0x00,0x94,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xe0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9c,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa0,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x9b,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xac,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xf2,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xca,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xe1,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc9,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcd,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd5,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd0,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xef,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xef,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xe1,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x37,0x02,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xe2,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf7,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xfb,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x35,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xff,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x03,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfe,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x33,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x07,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x0b,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0x31,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0f,0x02,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x13,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0xac,0x01,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0xd9,0x01,0x00,0x00,0x17,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x08,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0xe8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x00,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x95,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x95,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x92,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x94,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0x40,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x52,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5b,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5f,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x61,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x63,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x67,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x62,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x73,0x02,0x00,0x00,
-0xc3,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x76,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7f,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7b,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7f,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x81,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x81,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x83,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x87,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x83,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x82,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x8f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8d,0x02,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x8f,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x8e,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x98,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x96,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x97,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0xab,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,
-0xad,0x02,0x00,0x00,0xae,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xb7,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x98,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x84,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x84,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x81,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x83,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x64,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x64,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x63,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_aligned_m_fp32_len = 10512;
-
-unsigned char matmul_f16_aligned_s_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x5b,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x04,0x03,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8a,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x8b,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8b,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbb,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x01,0x03,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x02,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x02,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x04,0x03,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x04,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x8a,0x01,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x8b,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x8c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x8c,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x0f,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x10,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3c,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x3d,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x3c,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x01,0x03,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x03,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x03,0x03,0x00,0x00,0x04,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x16,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1f,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x10,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x29,0x03,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x42,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3e,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x2a,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x37,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x3e,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x05,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x27,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x48,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,
-0x51,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x57,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0x74,0x00,0x00,0x00,
-0x3b,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x42,0x03,0x00,0x00,
-0x78,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x91,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xab,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xc4,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xca,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xce,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xd2,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xdf,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xea,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xea,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0x80,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x3b,0x03,0x00,0x00,0x65,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0x3e,0x03,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x42,0x03,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf7,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0x44,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xfd,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xff,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x48,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x48,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x01,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x05,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x00,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x13,0x02,0x00,0x00,
-0x48,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x48,0x03,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,0x20,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x44,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x02,0x00,0x00,0x27,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x27,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x09,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x02,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x02,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x48,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xff,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x01,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x49,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x2f,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x33,0x02,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x57,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x37,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x3b,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x02,0x00,0x00,
-0x49,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x49,0x03,0x00,0x00,0x47,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x44,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x80,0x01,0x00,0x00,
-0x52,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x26,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x55,0x02,0x00,0x00,
-0x54,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x49,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x61,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5d,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x61,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x63,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x63,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0x66,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x65,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x69,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x64,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x50,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x64,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x50,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x52,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x75,0x02,0x00,0x00,0x74,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x50,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x52,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x52,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x26,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xba,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x92,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x94,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x75,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x50,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x65,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x4a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xfa,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfa,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x44,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf9,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x2a,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x8c,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc1,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xc7,0x02,0x00,0x00,0x2b,0x03,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc3,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xc7,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x26,0x03,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xcf,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd4,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xd3,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x02,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x47,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0x24,0x03,0x00,0x00,0xe4,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x2e,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe3,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe7,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xe3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x30,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x22,0x03,0x00,0x00,0xec,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x30,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xeb,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xef,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x30,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf5,0x02,0x00,0x00,0xf2,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xf7,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xf5,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf6,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xfc,0x02,0x00,0x00,0xfb,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xfc,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb8,0x00,0x00,0x00,0xfe,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0xfd,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xfe,0x02,0x00,0x00,0xff,0x02,0x00,0x00,
-0x00,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0xff,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x0a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x08,0x03,0x00,0x00,
-0x0b,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x03,0x00,0x00,0xbf,0x02,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x0d,0x03,0x00,0x00,0xd7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0x2b,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x13,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x16,0x03,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x17,0x03,0x00,0x00,0x19,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x1d,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x1d,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x1f,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x04,0x03,0x00,0x00,0x34,0x00,0x00,0x00,
-0x11,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x20,0x03,0x00,0x00,
-0x1e,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,0x00,0x03,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x00,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x30,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xeb,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc4,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0x2b,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xc3,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_aligned_s_len = 11936;
-
-unsigned char matmul_f16_aligned_s_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf3,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x53,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x55,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x56,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x56,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x56,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x58,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x58,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x99,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9a,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xf4,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x49,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x55,0x01,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x57,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x57,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xaa,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xab,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xd7,0x01,0x00,0x00,
-0xba,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xd8,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x99,0x02,0x00,0x00,0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x9a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9b,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x9b,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xb7,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xab,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xda,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x8d,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x0e,0x01,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x30,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3a,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x01,0x00,0x00,0xda,0x02,0x00,0x00,0x43,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x44,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x61,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x79,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x25,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x82,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x87,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0xda,0x02,0x00,0x00,0x8e,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x92,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x92,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x39,0x02,0x00,0x00,0x95,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x94,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x98,0x01,0x00,0x00,0x93,0x01,0x00,0x00,0x94,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xe0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9c,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa0,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x9b,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xac,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xf2,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xca,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xe1,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc9,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcd,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd5,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd0,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xef,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xef,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xe1,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x37,0x02,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xe2,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf7,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xfb,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x35,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xff,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x03,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfe,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x33,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x07,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x0b,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0x31,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0xea,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0f,0x02,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x13,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x17,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x1b,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0xac,0x01,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0xd9,0x01,0x00,0x00,0x17,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x08,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0xe8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x00,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x95,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x95,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x92,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x94,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0x40,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x52,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5b,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5f,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x61,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x63,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x67,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x62,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x73,0x02,0x00,0x00,
-0xc3,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x76,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7f,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7b,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7f,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x81,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x81,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x83,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x87,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x83,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x82,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x8f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8d,0x02,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x8f,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x8e,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x98,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x96,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x97,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0xab,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,
-0xad,0x02,0x00,0x00,0xae,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xb7,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x98,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x84,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x84,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0xc8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x81,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x83,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x64,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x64,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x63,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_aligned_s_fp32_len = 10512;
-
-unsigned char matmul_f16_f32_aligned_l_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x65,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x0f,0x03,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8c,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x8d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8d,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8d,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xc6,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0c,0x03,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x0d,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0d,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x0d,0x03,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0f,0x03,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0f,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x8a,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x8c,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x8d,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8e,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x8e,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x91,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x1a,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x1b,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x31,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x48,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x49,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x48,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x0c,0x03,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x0d,0x03,0x00,0x00,
-0x0c,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0e,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x03,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x21,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1b,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x49,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x33,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x48,0x03,0x00,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x34,0x03,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0x34,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x44,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x44,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x48,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x12,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x22,0x01,0x00,0x00,
-0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x28,0x01,0x00,0x00,0x27,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x44,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x49,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x50,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x44,0x03,0x00,0x00,
-0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x45,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6f,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x01,0x00,0x00,0x4c,0x03,0x00,0x00,0x78,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x85,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa3,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xab,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbf,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xcd,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe9,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xed,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xf7,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x45,0x03,0x00,0x00,
-0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x48,0x03,0x00,0x00,0xfb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x4c,0x03,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x02,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x02,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x04,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x08,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x03,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x52,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x36,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x52,0x03,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x0c,0x02,0x00,0x00,
-0x0d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x10,0x02,0x00,0x00,0x0b,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x64,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x14,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x18,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0x14,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x13,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x22,0x02,0x00,0x00,0x24,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x28,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x32,0x02,0x00,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x02,0x00,0x00,0x64,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x14,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x52,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0c,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x53,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x39,0x02,0x00,0x00,0x3a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x39,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x40,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x61,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x42,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x46,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x41,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x53,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x53,0x03,0x00,0x00,0x52,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x80,0x01,0x00,0x00,0x5d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,0x60,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x60,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x42,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x54,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x3a,0x02,0x00,0x00,
-0xaa,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x68,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6c,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x67,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x58,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x67,0x02,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x70,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x76,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x76,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x78,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7c,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7e,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x80,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x84,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,0x80,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x58,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x5c,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x96,0x02,0x00,0x00,0x95,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x31,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9f,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x80,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x71,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x68,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x02,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x04,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x34,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0xc7,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc7,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x35,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0xcf,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x35,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xce,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd2,0x02,0x00,0x00,0xcd,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x36,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcd,0x02,0x00,0x00,0x30,0x03,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xda,0x02,0x00,0x00,0x36,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd6,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xda,0x02,0x00,0x00,
-0xd5,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x02,0x00,0x00,0x36,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xde,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x35,0x03,0x00,0x00,0x52,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe9,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,
-0xef,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x38,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xee,0x02,0x00,0x00,0xef,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf2,0x02,0x00,0x00,
-0xed,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xed,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xed,0x02,0x00,0x00,0x2c,0x03,0x00,0x00,0xf7,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xf6,0x02,0x00,0x00,0xf7,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xfa,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,
-0xf6,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf5,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0x3a,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0xfd,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x00,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x02,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x01,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x05,0x03,0x00,0x00,0xea,0x02,0x00,0x00,0x38,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x06,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0x06,0x03,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x02,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x02,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x00,0x03,0x00,0x00,0xf5,0x02,0x00,0x00,0x08,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0x0b,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x09,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0xea,0x02,0x00,0x00,0x38,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x13,0x03,0x00,0x00,0x16,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x03,0x00,0x00,0xca,0x02,0x00,0x00,
-0x17,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x18,0x03,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x35,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x03,0x00,0x00,0x1e,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x21,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x03,0x00,0x00,0x36,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x24,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x28,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x27,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x28,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x91,0x01,0x00,0x00,0x2a,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x34,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2a,0x03,0x00,0x00,0x29,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0b,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf6,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xef,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xef,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x03,0x00,0x00,
-0x36,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0x35,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_f32_aligned_l_len = 12096;
-
-unsigned char matmul_f16_f32_aligned_l_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf0,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x51,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x56,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x57,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x57,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x57,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x59,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x59,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x51,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x97,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x98,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x98,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x98,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9a,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9a,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xf4,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x49,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x55,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x58,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x58,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5b,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xba,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa9,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xd5,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0xd4,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x97,0x02,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x98,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x99,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x99,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xa9,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xd3,0x02,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0e,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x13,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x1c,0x01,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x30,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x36,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x36,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x74,0x00,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x43,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5e,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x70,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x5b,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x89,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x90,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x37,0x02,0x00,0x00,0x93,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x92,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x96,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x91,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x98,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x98,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdd,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x9e,0x01,0x00,0x00,0xdd,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9a,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9e,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x99,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa6,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0xac,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0xae,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xef,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xdd,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x98,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc5,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xde,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc7,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcb,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xef,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd3,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd3,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xde,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,
-0xde,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xed,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0xec,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0xde,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc5,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf3,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0x35,0x02,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xdf,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf5,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf4,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0x33,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfd,0x01,0x00,0x00,0xfe,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x01,0x02,0x00,0x00,0xfc,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x03,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x03,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x31,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x05,0x02,0x00,0x00,
-0x06,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x04,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x11,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x11,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x15,0x02,0x00,0x00,
-0x16,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x17,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x20,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x15,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x28,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x06,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,
-0xe5,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x03,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x93,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0xd9,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x92,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x49,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x54,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5d,0x02,0x00,0x00,0x58,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x65,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x61,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x65,0x02,0x00,0x00,
-0x60,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x60,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x72,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x77,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x79,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7d,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0xb7,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x81,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x85,0x02,0x00,0x00,0x80,0x02,0x00,0x00,
-0x81,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x80,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x6d,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x8d,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8b,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x8d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x96,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x94,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x95,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0xab,0x02,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xad,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5b,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x96,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x96,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x82,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x82,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x81,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x62,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x62,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x61,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_f32_aligned_l_fp32_len = 10464;
-
-unsigned char matmul_f16_f32_aligned_m_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x65,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x0f,0x03,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8c,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x8d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8d,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8d,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xc6,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0c,0x03,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x0d,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0d,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x0d,0x03,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0f,0x03,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0f,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x8a,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x8c,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x8d,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8e,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x8e,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x91,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x1a,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x1b,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x31,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x48,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x49,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x48,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x0c,0x03,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x0d,0x03,0x00,0x00,
-0x0c,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0e,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x03,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x21,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1b,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x49,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x33,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x48,0x03,0x00,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x34,0x03,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0x34,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x44,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x44,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x48,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x12,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x22,0x01,0x00,0x00,
-0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x28,0x01,0x00,0x00,0x27,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x44,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x49,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x50,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x44,0x03,0x00,0x00,
-0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x45,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6f,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x01,0x00,0x00,0x4c,0x03,0x00,0x00,0x78,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x85,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa3,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xab,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbf,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xcd,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe9,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xed,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xf7,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x45,0x03,0x00,0x00,
-0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x48,0x03,0x00,0x00,0xfb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x4c,0x03,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x02,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x02,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x04,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x08,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x03,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x52,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x36,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x52,0x03,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x0c,0x02,0x00,0x00,
-0x0d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x10,0x02,0x00,0x00,0x0b,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x64,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x14,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x18,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0x14,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x13,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x22,0x02,0x00,0x00,0x24,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x28,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x32,0x02,0x00,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x02,0x00,0x00,0x64,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x14,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x52,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0c,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x53,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x39,0x02,0x00,0x00,0x3a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x39,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x40,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x61,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x42,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x46,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x41,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x53,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x53,0x03,0x00,0x00,0x52,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x80,0x01,0x00,0x00,0x5d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,0x60,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x60,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x42,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x54,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x3a,0x02,0x00,0x00,
-0xaa,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x68,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6c,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x67,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x58,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x67,0x02,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x70,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x76,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x76,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x78,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7c,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7e,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x80,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x84,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,0x80,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x58,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x5c,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x96,0x02,0x00,0x00,0x95,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x31,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9f,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x80,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x71,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x68,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x02,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x04,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x34,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0xc7,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc7,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x35,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0xcf,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x35,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xce,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd2,0x02,0x00,0x00,0xcd,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x36,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcd,0x02,0x00,0x00,0x30,0x03,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xda,0x02,0x00,0x00,0x36,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd6,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xda,0x02,0x00,0x00,
-0xd5,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x02,0x00,0x00,0x36,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xde,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x35,0x03,0x00,0x00,0x52,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe9,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,
-0xef,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x38,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xee,0x02,0x00,0x00,0xef,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf2,0x02,0x00,0x00,
-0xed,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xed,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xed,0x02,0x00,0x00,0x2c,0x03,0x00,0x00,0xf7,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xf6,0x02,0x00,0x00,0xf7,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xfa,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,
-0xf6,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf5,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0x3a,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0xfd,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x00,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x02,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x01,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x05,0x03,0x00,0x00,0xea,0x02,0x00,0x00,0x38,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x06,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0x06,0x03,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x02,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x02,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x00,0x03,0x00,0x00,0xf5,0x02,0x00,0x00,0x08,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0x0b,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x09,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0xea,0x02,0x00,0x00,0x38,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x13,0x03,0x00,0x00,0x16,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x03,0x00,0x00,0xca,0x02,0x00,0x00,
-0x17,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x18,0x03,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x35,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x03,0x00,0x00,0x1e,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x21,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x03,0x00,0x00,0x36,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x24,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x28,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x27,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x28,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x91,0x01,0x00,0x00,0x2a,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x34,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2a,0x03,0x00,0x00,0x29,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0b,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf6,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xef,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xef,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x03,0x00,0x00,
-0x36,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0x35,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_f32_aligned_m_len = 12096;
-
-unsigned char matmul_f16_f32_aligned_m_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf0,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x51,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x56,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x57,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x57,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x57,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x59,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x59,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x51,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x97,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x98,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x98,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x98,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9a,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9a,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xf4,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x49,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x55,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x58,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x58,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5b,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xba,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa9,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xd5,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0xd4,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x97,0x02,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x98,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x99,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x99,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xa9,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xd3,0x02,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0e,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x13,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x1c,0x01,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x30,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x36,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x36,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x74,0x00,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x43,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5e,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x70,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x5b,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x89,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x90,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x37,0x02,0x00,0x00,0x93,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x92,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x96,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x91,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x98,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x98,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdd,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x9e,0x01,0x00,0x00,0xdd,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9a,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9e,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x99,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa6,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0xac,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0xae,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xef,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xdd,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x98,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc5,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xde,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc7,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcb,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xef,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd3,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd3,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xde,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,
-0xde,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xed,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0xec,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0xde,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc5,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf3,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0x35,0x02,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xdf,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf5,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf4,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0x33,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfd,0x01,0x00,0x00,0xfe,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x01,0x02,0x00,0x00,0xfc,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x03,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x03,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x31,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x05,0x02,0x00,0x00,
-0x06,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x04,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x11,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x11,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x15,0x02,0x00,0x00,
-0x16,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x17,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x20,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x15,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x28,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x06,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,
-0xe5,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x03,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x93,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0xd9,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x92,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x49,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x54,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5d,0x02,0x00,0x00,0x58,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x65,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x61,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x65,0x02,0x00,0x00,
-0x60,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x60,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x72,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x77,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x79,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7d,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0xb7,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x81,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x85,0x02,0x00,0x00,0x80,0x02,0x00,0x00,
-0x81,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x80,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x6d,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x8d,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8b,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x8d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x96,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x94,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x95,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0xab,0x02,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xad,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5b,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x96,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x96,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x82,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x82,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x81,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x62,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x62,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x61,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_f32_aligned_m_fp32_len = 10464;
-
-unsigned char matmul_f16_f32_aligned_s_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x65,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x0f,0x03,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8c,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x8d,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x8d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8d,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8d,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xc6,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0c,0x03,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x0d,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x0d,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x0d,0x03,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0f,0x03,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0f,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x8a,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,0x8b,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x8c,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x8d,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8e,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x8e,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x91,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x1a,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x1b,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x31,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x48,0x02,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x49,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x48,0x02,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x0c,0x03,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x0d,0x03,0x00,0x00,
-0x0c,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0e,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x03,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x21,0x03,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1b,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x49,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x33,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x48,0x03,0x00,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x34,0x03,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0x34,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x44,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x44,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x48,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x12,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x22,0x01,0x00,0x00,
-0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x28,0x01,0x00,0x00,0x27,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x36,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x40,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x44,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x49,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,
-0x50,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0xeb,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xfc,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x44,0x03,0x00,0x00,
-0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x45,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6f,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0x45,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x01,0x00,0x00,0x4c,0x03,0x00,0x00,0x78,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x85,0x01,0x00,0x00,0x87,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa3,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xab,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xae,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbf,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xcd,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xdb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe9,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xed,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x91,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xf7,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x45,0x03,0x00,0x00,
-0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x48,0x03,0x00,0x00,0xfb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x4c,0x03,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x02,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x02,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x04,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x08,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x03,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x52,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x36,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x52,0x03,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x0c,0x02,0x00,0x00,
-0x0d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x10,0x02,0x00,0x00,0x0b,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x64,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x14,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x18,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0x14,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x13,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x52,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x22,0x02,0x00,0x00,0x24,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x28,0x02,0x00,0x00,0x64,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x32,0x02,0x00,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x02,0x00,0x00,0x64,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x14,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x52,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0c,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x53,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x39,0x02,0x00,0x00,0x3a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x39,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x40,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x61,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x42,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x46,0x02,0x00,0x00,
-0x41,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x41,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x53,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x53,0x03,0x00,0x00,0x52,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x80,0x01,0x00,0x00,0x5d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,0x60,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x60,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x42,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x54,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x3a,0x02,0x00,0x00,
-0xaa,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x68,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6c,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x67,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x58,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x67,0x02,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x70,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x76,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x76,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x78,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7c,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7e,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x80,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x84,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,0x80,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x58,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x5c,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0x31,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x96,0x02,0x00,0x00,0x95,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x31,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9f,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,0x5c,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x80,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x5a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x71,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,0x54,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x68,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x05,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x4e,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x02,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x04,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x02,0x00,0x00,0x34,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0xc7,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc7,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x35,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0xcf,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x35,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xce,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd2,0x02,0x00,0x00,0xcd,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x36,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcd,0x02,0x00,0x00,0x30,0x03,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xda,0x02,0x00,0x00,0x36,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd6,0x02,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xda,0x02,0x00,0x00,
-0xd5,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x02,0x00,0x00,0x36,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xde,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xe1,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x35,0x03,0x00,0x00,0x52,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe9,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,0x2e,0x03,0x00,0x00,
-0xef,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x38,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xee,0x02,0x00,0x00,0xef,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf2,0x02,0x00,0x00,
-0xed,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xed,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xed,0x02,0x00,0x00,0x2c,0x03,0x00,0x00,0xf7,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xfa,0x02,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xf6,0x02,0x00,0x00,0xf7,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xfa,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,
-0xf6,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf5,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0x3a,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0xfd,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x02,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x00,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x02,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x01,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x05,0x03,0x00,0x00,0xea,0x02,0x00,0x00,0x38,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x06,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0x06,0x03,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x02,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x02,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x09,0x03,0x00,0x00,
-0x00,0x03,0x00,0x00,0xf5,0x02,0x00,0x00,0x08,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0x0b,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x09,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0a,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0xea,0x02,0x00,0x00,0x38,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x13,0x03,0x00,0x00,0x16,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x03,0x00,0x00,0xca,0x02,0x00,0x00,
-0x17,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x18,0x03,0x00,0x00,0xe2,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x1a,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x35,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x03,0x00,0x00,0x1e,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x21,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x03,0x00,0x00,0x36,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x24,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x28,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x27,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x29,0x03,0x00,0x00,0x28,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x91,0x01,0x00,0x00,0x2a,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x34,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2a,0x03,0x00,0x00,0x29,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0b,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf6,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xef,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xef,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0x38,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xec,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd7,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x03,0x00,0x00,
-0x36,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0x35,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_f32_aligned_s_len = 12096;
-
-unsigned char matmul_f16_f32_aligned_s_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf0,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x51,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf9,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2c,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2d,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x56,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x57,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x57,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x57,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x59,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x59,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x51,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x97,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x98,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x98,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x98,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9a,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9a,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xf4,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xff,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x49,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x48,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x4a,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x55,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x58,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x58,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5b,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xba,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xa9,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xd5,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0xd4,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x97,0x02,0x00,0x00,
-0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x98,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x99,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x99,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xa9,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xd3,0x02,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x01,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0e,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x13,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfb,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xf4,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x1c,0x01,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfb,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0xf9,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x30,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x36,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x36,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x74,0x00,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x43,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5e,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x6b,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x70,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x5b,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xff,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0x5b,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x85,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x89,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x90,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x37,0x02,0x00,0x00,0x93,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x92,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x96,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x91,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x98,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x98,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdd,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x9e,0x01,0x00,0x00,0xdd,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9a,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9e,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x99,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa6,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0xac,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xef,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,
-0xae,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xef,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xdd,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x98,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc5,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xde,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc7,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcb,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xef,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd3,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd3,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xde,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x01,0x00,0x00,
-0xde,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xec,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xd9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xff,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xed,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0xec,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0xde,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc5,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf3,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0x35,0x02,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xdf,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf5,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf4,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0x33,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfd,0x01,0x00,0x00,0xfe,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x01,0x02,0x00,0x00,0xfc,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x03,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x03,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x31,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x05,0x02,0x00,0x00,
-0x06,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x04,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x11,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x11,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x15,0x02,0x00,0x00,
-0x16,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x17,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0xaa,0x01,0x00,0x00,0x20,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x15,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x28,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x0b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x06,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,
-0xe5,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x03,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfb,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0xdf,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x93,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0xd9,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x90,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x92,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x49,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x54,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x54,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5d,0x02,0x00,0x00,0x58,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x65,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x61,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x65,0x02,0x00,0x00,
-0x60,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x60,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x69,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x72,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x71,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x77,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x79,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7d,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0xb7,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x81,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x85,0x02,0x00,0x00,0x80,0x02,0x00,0x00,
-0x81,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x80,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x6d,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x8d,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8b,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x93,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x8d,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x96,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x94,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x95,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,
-0xa5,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0xab,0x02,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xad,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x5b,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x96,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x96,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x82,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x82,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x81,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x77,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x62,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x62,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x61,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_f32_aligned_s_fp32_len = 10464;
-
-unsigned char matmul_f16_f32_l_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcd,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2e,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x74,0x02,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x75,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x77,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x02,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf9,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x07,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xb0,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x74,0x02,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x76,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x83,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xac,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x57,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5f,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x20,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x63,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0x66,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x14,0x02,0x00,0x00,0x6d,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6c,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x70,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x74,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x78,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x80,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x86,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x90,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x9a,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x75,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa6,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xae,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0xaa,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x12,0x02,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd4,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd0,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd4,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcf,0x01,0x00,0x00,0x10,0x02,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xdc,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xd8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe0,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe4,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xec,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,
-0x03,0x02,0x00,0x00,0xb2,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x04,0x02,0x00,0x00,
-0x03,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x07,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xb9,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x05,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x07,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0x29,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x3a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,
-0x35,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x42,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x49,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x22,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0x56,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x55,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5e,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x62,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x6a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x68,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x70,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x73,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x71,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x80,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x34,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x92,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x56,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_f32_l_len = 10172;
-
-unsigned char matmul_f16_f32_l_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc9,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x47,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xef,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x82,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xae,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x70,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x71,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x82,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x97,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x03,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x98,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x56,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5e,0x01,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x10,0x02,0x00,0x00,
-0x6c,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0xb2,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6f,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x79,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x72,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7b,0x01,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x97,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xbe,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe2,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x08,0x02,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x03,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x03,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x36,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x32,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x36,0x02,0x00,0x00,
-0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3a,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,0x39,0x02,0x00,0x00,
-0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x39,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x45,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x39,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x64,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x66,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x64,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x65,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x59,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x6f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0x34,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x8e,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x33,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_f32_l_fp32_len = 10100;
-
-unsigned char matmul_f16_f32_m_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcd,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2e,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x74,0x02,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x75,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x77,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x02,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf9,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x07,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xb0,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x74,0x02,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x76,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x83,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xac,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x57,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5f,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x20,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x63,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0x66,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x14,0x02,0x00,0x00,0x6d,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6c,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x70,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x74,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x78,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x80,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x86,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x90,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x9a,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x75,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa6,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xae,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0xaa,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x12,0x02,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd4,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd0,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd4,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcf,0x01,0x00,0x00,0x10,0x02,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xdc,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xd8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe0,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe4,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xec,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,
-0x03,0x02,0x00,0x00,0xb2,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x04,0x02,0x00,0x00,
-0x03,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x07,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xb9,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x05,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x07,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0x29,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x3a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,
-0x35,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x42,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x49,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x22,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0x56,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x55,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5e,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x62,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x6a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x68,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x70,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x73,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x71,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x80,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x34,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x92,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x56,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_f32_m_len = 10172;
-
-unsigned char matmul_f16_f32_m_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc9,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x47,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xef,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x82,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xae,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x70,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x71,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x82,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x97,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x03,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x98,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x56,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5e,0x01,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x10,0x02,0x00,0x00,
-0x6c,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0xb2,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6f,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x79,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x72,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7b,0x01,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x97,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xbe,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe2,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x08,0x02,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x03,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x03,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x36,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x32,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x36,0x02,0x00,0x00,
-0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3a,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,0x39,0x02,0x00,0x00,
-0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x39,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x45,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x39,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x64,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x66,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x64,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x65,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x59,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x6f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0x34,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x8e,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x33,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_f32_m_fp32_len = 10100;
-
-unsigned char matmul_f16_f32_s_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcd,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2e,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x74,0x02,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x75,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x77,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x02,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf9,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x07,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xb0,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x74,0x02,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x76,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x83,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xac,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x57,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5f,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x20,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x63,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0x66,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x14,0x02,0x00,0x00,0x6d,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6c,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x70,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x78,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x74,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x78,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x80,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x86,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x90,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x9a,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x75,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa6,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xae,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0xaa,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x12,0x02,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd4,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd0,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd4,0x01,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcf,0x01,0x00,0x00,0x10,0x02,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xdc,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xd8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe0,0x01,0x00,0x00,
-0xe1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe4,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xec,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,
-0x03,0x02,0x00,0x00,0xb2,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x04,0x02,0x00,0x00,
-0x03,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x07,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xb9,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x05,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x07,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,0xb6,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x16,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x27,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0x29,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x3a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,
-0x35,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x35,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x42,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x49,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x22,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0x56,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x55,0x02,0x00,0x00,
-0x94,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5e,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x62,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x6a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x68,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x70,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x73,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x71,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x80,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x34,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x92,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x56,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_f32_s_len = 10172;
-
-unsigned char matmul_f16_f32_s_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc9,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x47,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xef,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x82,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xae,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x70,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x71,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x82,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x97,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x03,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x98,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x56,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5e,0x01,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x65,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x10,0x02,0x00,0x00,
-0x6c,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0xb2,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6f,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x79,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x72,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7b,0x01,0x00,0x00,
-0x7a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x85,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x97,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xbe,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe2,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x08,0x02,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x03,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x03,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x08,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xb8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x25,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x36,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x32,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x36,0x02,0x00,0x00,
-0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x31,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3a,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,0x39,0x02,0x00,0x00,
-0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x39,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x45,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x39,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x64,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x66,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x64,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x65,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x64,0x02,0x00,0x00,
-0x59,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x6f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x99,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0x52,0x01,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0x34,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x8e,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x33,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_f32_s_fp32_len = 10100;
-
-unsigned char matmul_f16_l_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcc,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x73,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x73,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x73,0x02,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x75,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x75,0x02,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf9,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x07,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x59,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x80,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x97,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xae,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x72,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x73,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x74,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x73,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x90,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xab,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x9b,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x54,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x55,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x14,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0xac,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,0x61,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x64,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x68,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x12,0x02,0x00,0x00,
-0x6b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6e,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6e,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x70,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x69,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x73,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x76,0x01,0x00,0x00,0x71,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7a,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x7e,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x94,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x82,0x01,0x00,0x00,0x86,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,0x96,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0xcb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x70,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x10,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe2,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xc3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x97,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x01,0x02,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x06,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x03,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x05,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xc3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x25,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x32,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x35,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x38,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x34,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x38,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3c,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x40,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x44,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x4d,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x54,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x58,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x60,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5c,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x60,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x66,0x02,0x00,0x00,
-0x63,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x68,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x66,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x67,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x6c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x68,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x68,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x87,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x90,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x71,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_l_len = 10156;
-
-unsigned char matmul_f16_l_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xca,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x47,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xef,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x81,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xae,0x01,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x70,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x72,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8e,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x82,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x03,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x99,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb1,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x56,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5e,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x20,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x62,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x67,0x01,0x00,0x00,0xb1,0x02,0x00,0x00,0x65,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x10,0x02,0x00,0x00,0x6c,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7b,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x85,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x95,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x97,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x74,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xb8,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0xb8,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd6,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0x0a,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe2,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0x08,0x02,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe5,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x83,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x01,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x03,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,0x17,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x33,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x36,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x32,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x36,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x3e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,
-0x39,0x02,0x00,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x39,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x56,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x5a,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x61,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x66,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x64,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x65,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x6f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x77,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x8f,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x33,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x33,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x32,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_l_fp32_len = 10116;
-
-unsigned char matmul_f16_m_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcc,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x73,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x73,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x73,0x02,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x75,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x75,0x02,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf9,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x07,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x59,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x80,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x97,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xae,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x72,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x73,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x74,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x73,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x90,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xab,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x9b,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x54,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x55,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x14,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0xac,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,0x61,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x64,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x68,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x12,0x02,0x00,0x00,
-0x6b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6e,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6e,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x70,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x69,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x73,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x76,0x01,0x00,0x00,0x71,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7a,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x7e,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x94,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x82,0x01,0x00,0x00,0x86,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,0x96,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0xcb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x70,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x10,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe2,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xc3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x97,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x01,0x02,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x06,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x03,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x05,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xc3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x25,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x32,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x35,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x38,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x34,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x38,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3c,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x40,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x44,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x4d,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x54,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x58,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x60,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5c,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x60,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x66,0x02,0x00,0x00,
-0x63,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x68,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x66,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x67,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x6c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x68,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x68,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x87,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x90,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x71,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_m_len = 10156;
-
-unsigned char matmul_f16_m_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xca,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x47,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xef,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x81,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xae,0x01,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x70,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x72,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8e,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x82,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x03,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x99,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb1,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x56,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5e,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x20,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x62,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x67,0x01,0x00,0x00,0xb1,0x02,0x00,0x00,0x65,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x10,0x02,0x00,0x00,0x6c,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7b,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x85,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x95,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x97,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x74,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xb8,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0xb8,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd6,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0x0a,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe2,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0x08,0x02,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe5,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x83,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x01,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x03,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,0x17,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x33,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x36,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x32,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x36,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x3e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,
-0x39,0x02,0x00,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x39,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x56,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x5a,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x61,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x66,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x64,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x65,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x6f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x77,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x8f,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x33,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x33,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x32,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_m_fp32_len = 10116;
-
-unsigned char matmul_f16_s_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcc,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x73,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x73,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x73,0x02,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x75,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x75,0x02,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf9,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x07,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x59,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x80,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x91,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x97,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xae,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x72,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x73,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x74,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x73,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x90,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x81,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xab,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xac,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x9b,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x54,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x55,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0a,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x5d,0x01,0x00,0x00,
-0x14,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0xac,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,0x61,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x64,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x68,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x12,0x02,0x00,0x00,
-0x6b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6e,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6e,0x01,0x00,0x00,
-0x69,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x70,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x69,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x73,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x76,0x01,0x00,0x00,0x71,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7a,0x01,0x00,0x00,
-0x79,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x7e,0x01,0x00,0x00,0x79,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x94,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x96,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,
-0x98,0x01,0x00,0x00,0x82,0x01,0x00,0x00,0x86,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,0x96,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0xcb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x73,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x70,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa8,0x01,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x0a,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x10,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x0e,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd6,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,
-0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe2,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xc3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x97,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x82,0x01,0x00,0x00,
-0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x97,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x01,0x02,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x02,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x06,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x03,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x05,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xc3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x19,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x25,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,0x27,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x32,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x35,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x38,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x34,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x38,0x02,0x00,0x00,0x33,0x02,0x00,0x00,
-0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x33,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3a,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3c,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x40,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,0x3c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x44,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x47,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x02,0x00,0x00,0x4d,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x55,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x54,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x58,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x60,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5c,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x60,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x66,0x02,0x00,0x00,
-0x63,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x68,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x66,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x67,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x6c,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x68,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x68,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x70,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x50,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x79,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x84,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x87,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x90,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x71,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f16_s_len = 10156;
-
-unsigned char matmul_f16_s_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xca,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x16,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x45,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x45,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x47,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2a,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x71,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x73,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xef,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x51,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x3a,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x44,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x46,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x46,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x81,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xad,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xae,0x01,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xaf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x70,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x72,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x72,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8e,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x82,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaf,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0xad,0x02,0x00,0x00,0x03,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x0c,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x15,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x28,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x12,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x2f,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x2d,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x99,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x37,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0xb1,0x02,0x00,0x00,0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x55,0x01,0x00,0x00,0x54,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x56,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5e,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x37,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x60,0x01,0x00,0x00,0xaa,0x02,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x20,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x62,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x67,0x01,0x00,0x00,0xb1,0x02,0x00,0x00,0x65,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x10,0x02,0x00,0x00,0x6c,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6b,0x01,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6f,0x01,0x00,0x00,0x6a,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7b,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0x85,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,
-0xb7,0x02,0x00,0x00,0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,
-0x8b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x95,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x97,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x83,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x98,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x79,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x74,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xb8,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa0,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa4,0x01,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0xc6,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xc6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0xb8,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xce,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd2,0x01,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xcd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0x0c,0x02,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd6,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xda,0x01,0x00,0x00,0xd5,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0x0a,0x02,0x00,0x00,0xdf,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xde,0x01,0x00,0x00,
-0xdf,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe2,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdd,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0x08,0x02,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xea,0x01,0x00,0x00,
-0xe5,0x01,0x00,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe5,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,
-0xec,0x01,0x00,0x00,0xbf,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x83,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x01,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x03,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdf,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd7,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,0x17,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,0x29,0x02,0x00,0x00,
-0x2c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x33,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x36,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x32,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x36,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x38,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x95,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x3e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x3a,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,
-0x39,0x02,0x00,0x00,0x3a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x39,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x9a,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x56,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x9f,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x5a,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x61,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x66,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x64,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x65,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x66,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x66,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x6f,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x77,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x82,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x8f,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x33,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x33,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x32,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f16_s_fp32_len = 10116;
-
-unsigned char matmul_f32_aligned_l_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x6a,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,0x14,0x03,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x69,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6a,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x92,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x93,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x93,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x95,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x95,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xcb,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x03,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x12,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x12,0x03,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x14,0x03,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x14,0x03,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x00,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0x51,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x85,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,0x86,0x01,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x87,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x87,0x01,0x00,0x00,0x88,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x92,0x01,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x93,0x01,0x00,0x00,
-0x92,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x94,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x94,0x01,0x00,0x00,0x95,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x03,0x02,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x1f,0x02,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x20,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x36,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4d,0x02,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x4e,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x57,0x02,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x11,0x03,0x00,0x00,0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x12,0x03,0x00,0x00,0x11,0x03,0x00,0x00,0x20,0x00,0x04,0x00,
-0x13,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x13,0x03,0x00,0x00,0x14,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x26,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x20,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x38,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x38,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x51,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x39,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x39,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x49,0x03,0x00,0x00,0x37,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x01,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x06,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1d,0x01,0x00,0x00,0x1c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x30,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0x38,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x39,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x36,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x48,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x4a,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x59,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x49,0x03,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x4a,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x51,0x03,0x00,0x00,0x80,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x98,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9a,0x01,0x00,0x00,0x99,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xac,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb6,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd2,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xda,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xda,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0xde,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe0,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xe4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0x4d,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x51,0x03,0x00,0x00,
-0x03,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x53,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0xb1,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0f,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x57,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x12,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x57,0x03,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x11,0x02,0x00,0x00,
-0x12,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x15,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x10,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x17,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x17,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x69,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x39,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x19,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1d,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x18,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x34,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x37,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x69,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x17,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x19,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x11,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x58,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x11,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x40,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x58,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3f,0x02,0x00,0x00,
-0x40,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x43,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x45,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x45,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x66,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x67,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x4b,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x58,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0x57,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x58,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x60,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x63,0x02,0x00,0x00,
-0x88,0x01,0x00,0x00,0x62,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x65,0x02,0x00,0x00,0x64,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x02,0x00,0x00,0x66,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x45,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x47,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x40,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x58,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x59,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x5d,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0xad,0x02,0x00,0x00,
-0x76,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x75,0x02,0x00,0x00,0x76,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5f,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0xab,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x81,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7d,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x81,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x83,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x83,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x61,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x85,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x89,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x84,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x61,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x36,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa4,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x83,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x76,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x76,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x75,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x07,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x09,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x39,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xca,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xcd,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xce,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x37,0x03,0x00,0x00,0xd4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0x3a,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd3,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd7,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0xd3,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd9,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x35,0x03,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x3b,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xdb,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xdf,0x02,0x00,0x00,
-0xda,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xda,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x3b,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xeb,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf1,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xda,0x02,0x00,0x00,0x33,0x03,0x00,0x00,
-0xf4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf7,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf3,0x02,0x00,0x00,0xf4,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf7,0x02,0x00,0x00,
-0xf2,0x02,0x00,0x00,0xf3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x31,0x03,0x00,0x00,0xfc,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xff,0x02,0x00,0x00,
-0x3f,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfb,0x02,0x00,0x00,0xfc,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xff,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xfb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xfa,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x03,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x3f,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x05,0x03,0x00,0x00,0x02,0x03,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x07,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x05,0x03,0x00,0x00,
-0x06,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x06,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0xef,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x0c,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x07,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x07,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0xfa,0x02,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x06,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0x10,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x0e,0x03,0x00,0x00,
-0x0f,0x03,0x00,0x00,0x10,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0f,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x03,0x00,0x00,0xef,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x19,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x18,0x03,0x00,0x00,0x1b,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x1d,0x03,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x03,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x23,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x23,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x26,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x27,0x03,0x00,0x00,0x29,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x2d,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x2e,0x03,0x00,0x00,0x2d,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0xfc,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,0x14,0x03,0x00,0x00,
-0x34,0x00,0x00,0x00,0x21,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2f,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x10,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x10,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfc,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfb,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf1,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf3,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x35,0x03,0x00,0x00,
-0x3b,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xdb,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f32_aligned_l_len = 12168;
-
-unsigned char matmul_f32_aligned_l_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xe9,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf5,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf6,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf6,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x28,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x50,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x51,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x51,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x01,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x01,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4a,0x02,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x90,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x91,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x91,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x93,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x93,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xe8,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf5,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfd,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x02,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x45,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x50,0x01,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xa1,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xcf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x90,0x02,0x00,0x00,0xba,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x92,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x92,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa2,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x87,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x32,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xfe,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x02,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfa,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x18,0x01,0x00,0x00,0x17,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x31,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x74,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x3e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfa,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x60,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,0x69,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0x71,0x01,0x00,0x00,0x46,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x71,0x01,0x00,0x00,
-0x70,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x21,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7e,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x2b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x82,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x85,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x89,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x89,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x30,0x02,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x8b,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x8b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x8a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x91,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x91,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x93,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x97,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x92,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x99,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x99,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe8,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9b,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9f,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xab,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x99,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x94,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x94,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xd6,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x91,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbe,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xc0,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xc4,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xbf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcc,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc1,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xbe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc0,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xec,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xec,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x2e,0x02,0x00,0x00,0xef,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xee,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xf2,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xed,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf4,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x2c,0x02,0x00,0x00,
-0xf7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf6,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xfa,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0xff,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfe,0x01,0x00,0x00,0xff,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x02,0x02,0x00,0x00,0xfd,0x01,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x04,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x28,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x06,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x0a,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,0xde,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0xa3,0x01,0x00,0x00,0x19,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0xd0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x24,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x23,0x02,0x00,0x00,0x25,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0xe0,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xde,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfe,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xef,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xef,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xee,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x89,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x37,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x38,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x76,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x71,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7a,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7e,0x02,0x00,0x00,
-0x79,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x81,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x86,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x86,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x86,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x86,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x8f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x8d,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xad,0x02,0x00,0x00,0xac,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xfa,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xae,0x02,0x00,0x00,0xad,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f32_aligned_l_fp32_len = 10348;
-
-unsigned char matmul_f32_aligned_m_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x6a,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,0x14,0x03,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x69,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6a,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x92,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x93,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x93,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x95,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x95,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xcb,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x03,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x12,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x12,0x03,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x14,0x03,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x14,0x03,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x00,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0x51,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x85,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,0x86,0x01,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x87,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x87,0x01,0x00,0x00,0x88,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x92,0x01,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x93,0x01,0x00,0x00,
-0x92,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x94,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x94,0x01,0x00,0x00,0x95,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x03,0x02,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x1f,0x02,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x20,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x36,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4d,0x02,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x4e,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x57,0x02,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x11,0x03,0x00,0x00,0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x12,0x03,0x00,0x00,0x11,0x03,0x00,0x00,0x20,0x00,0x04,0x00,
-0x13,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x13,0x03,0x00,0x00,0x14,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x26,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x20,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x38,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x38,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x51,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x39,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x39,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x49,0x03,0x00,0x00,0x37,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x01,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x06,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1d,0x01,0x00,0x00,0x1c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x30,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0x38,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x39,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x36,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x48,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x4a,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x59,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x49,0x03,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x4a,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x51,0x03,0x00,0x00,0x80,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x98,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9a,0x01,0x00,0x00,0x99,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xac,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb6,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd2,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xda,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xda,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0xde,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe0,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xe4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0x4d,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x51,0x03,0x00,0x00,
-0x03,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x53,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0xb1,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0f,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x57,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x12,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x57,0x03,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x11,0x02,0x00,0x00,
-0x12,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x15,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x10,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x17,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x17,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x69,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x39,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x19,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1d,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x18,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x34,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x37,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x69,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x17,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x19,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x11,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x58,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x11,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x40,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x58,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3f,0x02,0x00,0x00,
-0x40,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x43,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x45,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x45,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x66,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x67,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x4b,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x58,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0x57,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x58,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x60,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x63,0x02,0x00,0x00,
-0x88,0x01,0x00,0x00,0x62,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x65,0x02,0x00,0x00,0x64,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x02,0x00,0x00,0x66,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x45,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x47,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x40,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x58,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x59,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x5d,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0xad,0x02,0x00,0x00,
-0x76,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x75,0x02,0x00,0x00,0x76,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5f,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0xab,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x81,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7d,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x81,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x83,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x83,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x61,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x85,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x89,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x84,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x61,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x36,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa4,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x83,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x76,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x76,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x75,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x07,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x09,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x39,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xca,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xcd,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xce,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x37,0x03,0x00,0x00,0xd4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0x3a,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd3,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd7,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0xd3,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd9,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x35,0x03,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x3b,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xdb,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xdf,0x02,0x00,0x00,
-0xda,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xda,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x3b,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xeb,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf1,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xda,0x02,0x00,0x00,0x33,0x03,0x00,0x00,
-0xf4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf7,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf3,0x02,0x00,0x00,0xf4,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf7,0x02,0x00,0x00,
-0xf2,0x02,0x00,0x00,0xf3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x31,0x03,0x00,0x00,0xfc,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xff,0x02,0x00,0x00,
-0x3f,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfb,0x02,0x00,0x00,0xfc,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xff,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xfb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xfa,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x03,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x3f,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x05,0x03,0x00,0x00,0x02,0x03,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x07,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x05,0x03,0x00,0x00,
-0x06,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x06,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0xef,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x0c,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x07,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x07,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0xfa,0x02,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x06,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0x10,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x0e,0x03,0x00,0x00,
-0x0f,0x03,0x00,0x00,0x10,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0f,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x03,0x00,0x00,0xef,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x19,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x18,0x03,0x00,0x00,0x1b,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x1d,0x03,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x03,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x23,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x23,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x26,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x27,0x03,0x00,0x00,0x29,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x2d,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x2e,0x03,0x00,0x00,0x2d,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0xfc,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,0x14,0x03,0x00,0x00,
-0x34,0x00,0x00,0x00,0x21,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2f,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x10,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x10,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfc,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfb,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf1,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf3,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x35,0x03,0x00,0x00,
-0x3b,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xdb,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f32_aligned_m_len = 12168;
-
-unsigned char matmul_f32_aligned_m_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xe9,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf5,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf6,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf6,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x28,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x50,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x51,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x51,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x01,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x01,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4a,0x02,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x90,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x91,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x91,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x93,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x93,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xe8,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf5,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfd,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x02,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x45,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x50,0x01,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xa1,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xcf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x90,0x02,0x00,0x00,0xba,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x92,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x92,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa2,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x87,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x32,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xfe,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x02,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfa,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x18,0x01,0x00,0x00,0x17,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x31,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x74,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x3e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfa,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x60,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,0x69,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0x71,0x01,0x00,0x00,0x46,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x71,0x01,0x00,0x00,
-0x70,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x21,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7e,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x2b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x82,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x85,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x89,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x89,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x30,0x02,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x8b,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x8b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x8a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x91,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x91,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x93,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x97,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x92,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x99,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x99,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe8,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9b,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9f,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xab,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x99,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x94,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x94,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xd6,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x91,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbe,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xc0,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xc4,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xbf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcc,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc1,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xbe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc0,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xec,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xec,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x2e,0x02,0x00,0x00,0xef,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xee,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xf2,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xed,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf4,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x2c,0x02,0x00,0x00,
-0xf7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf6,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xfa,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0xff,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfe,0x01,0x00,0x00,0xff,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x02,0x02,0x00,0x00,0xfd,0x01,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x04,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x28,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x06,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x0a,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,0xde,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0xa3,0x01,0x00,0x00,0x19,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0xd0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x24,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x23,0x02,0x00,0x00,0x25,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0xe0,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xde,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfe,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xef,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xef,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xee,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x89,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x37,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x38,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x76,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x71,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7a,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7e,0x02,0x00,0x00,
-0x79,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x81,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x86,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x86,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x86,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x86,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x8f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x8d,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xad,0x02,0x00,0x00,0xac,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xfa,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xae,0x02,0x00,0x00,0xad,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f32_aligned_m_fp32_len = 10348;
-
-unsigned char matmul_f32_aligned_s_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x6a,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x95,0x01,0x00,0x00,0xcb,0x02,0x00,0x00,0x14,0x03,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xf8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xf8,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x69,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6a,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x92,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x93,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x93,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x93,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x95,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x95,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xcb,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x03,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x12,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x12,0x03,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x14,0x03,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x14,0x03,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xea,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xea,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0xf5,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x18,0x00,0x04,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x00,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6b,0x01,0x00,0x00,0x51,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x85,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x1c,0x00,0x04,0x00,0x86,0x01,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0x87,0x01,0x00,0x00,0x04,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x87,0x01,0x00,0x00,0x88,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x92,0x01,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x93,0x01,0x00,0x00,
-0x92,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x94,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x94,0x01,0x00,0x00,0x95,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x03,0x02,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x1f,0x02,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x20,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x36,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4d,0x02,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,0x20,0x00,0x04,0x00,
-0x4e,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x57,0x02,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x11,0x03,0x00,0x00,0xba,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x12,0x03,0x00,0x00,0x11,0x03,0x00,0x00,0x20,0x00,0x04,0x00,
-0x13,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,0x12,0x03,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x13,0x03,0x00,0x00,0x14,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x19,0x03,0x00,0x00,0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x26,0x03,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x20,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x38,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x38,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x51,0x03,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x39,0x03,0x00,0x00,0x7a,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x39,0x03,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x49,0x03,0x00,0x00,0x37,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd9,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x4d,0x03,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x01,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x06,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1d,0x01,0x00,0x00,0x1c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x30,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x34,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0x38,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x39,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x36,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x45,0x01,0x00,0x00,
-0x43,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x48,0x01,0x00,0x00,0x47,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x49,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x4a,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x59,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x61,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x68,0x01,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x49,0x03,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x4a,0x03,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x77,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x74,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x51,0x03,0x00,0x00,0x80,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x01,0x00,0x00,
-0x81,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,
-0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x98,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9a,0x01,0x00,0x00,0x99,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xac,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xb0,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb6,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xd2,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xda,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdb,0x01,0x00,0x00,
-0xda,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xde,0x01,0x00,0x00,0xdd,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,0xde,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xdb,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe0,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xe4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xee,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf6,0x01,0x00,0x00,0x62,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0xfc,0x00,0x00,0x00,0xf9,0x01,0x00,0x00,0x95,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x6d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0x4d,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x51,0x03,0x00,0x00,
-0x03,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x07,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x07,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x53,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0xb1,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x09,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x0d,0x02,0x00,0x00,0x08,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x08,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0f,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x57,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x12,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x57,0x03,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x11,0x02,0x00,0x00,
-0x12,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x15,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x10,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x17,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x17,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x69,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x39,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x19,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1d,0x02,0x00,0x00,
-0x18,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x18,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x25,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x57,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x69,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x34,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x25,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x37,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x69,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x17,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x19,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x12,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x57,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x11,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x58,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x11,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x40,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x58,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x3f,0x02,0x00,0x00,
-0x40,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x43,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x45,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x45,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x66,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x67,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x4b,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x53,0x02,0x00,0x00,
-0x51,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x02,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x58,0x02,0x00,0x00,0x58,0x03,0x00,0x00,0x57,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x58,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5c,0x02,0x00,0x00,0x66,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0x5e,0x02,0x00,0x00,
-0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x60,0x02,0x00,0x00,0x53,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x00,0x01,0x00,0x00,0x63,0x02,0x00,0x00,
-0x88,0x01,0x00,0x00,0x62,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x65,0x02,0x00,0x00,0x64,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x67,0x02,0x00,0x00,0x66,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x45,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x47,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x40,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x02,0x00,0x00,
-0x58,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x59,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x6d,0x02,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x71,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x5d,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0xad,0x02,0x00,0x00,
-0x76,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x79,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x75,0x02,0x00,0x00,0x76,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x74,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5f,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0xab,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x81,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x7d,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x81,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7c,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x83,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x83,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x61,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x85,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x89,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x84,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x61,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x91,0x02,0x00,0x00,
-0x61,0x03,0x00,0x00,0x41,0x00,0x05,0x00,0x36,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x73,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x36,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,
-0xa1,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa4,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0x61,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x83,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7e,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xab,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x7d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x76,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x76,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x5d,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x75,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x59,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x0a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x0a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x53,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x07,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x09,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xff,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0x39,0x03,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xca,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,0xcb,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xcd,0x02,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xce,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xc6,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x37,0x03,0x00,0x00,0xd4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0x3a,0x03,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd3,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd7,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0xd3,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xd9,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x35,0x03,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x3b,0x03,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xdb,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xdf,0x02,0x00,0x00,
-0xda,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xda,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x02,0x00,0x00,0x3b,0x03,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe4,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xeb,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xee,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xf1,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xda,0x02,0x00,0x00,0x33,0x03,0x00,0x00,
-0xf4,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xf7,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf3,0x02,0x00,0x00,0xf4,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf7,0x02,0x00,0x00,
-0xf2,0x02,0x00,0x00,0xf3,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf2,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x03,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf2,0x02,0x00,0x00,0x31,0x03,0x00,0x00,0xfc,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xff,0x02,0x00,0x00,
-0x3f,0x03,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfb,0x02,0x00,0x00,0xfc,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xff,0x02,0x00,0x00,0xfa,0x02,0x00,0x00,
-0xfb,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xfa,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x03,0x00,0x00,
-0xe7,0x02,0x00,0x00,0x3f,0x03,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x05,0x03,0x00,0x00,0x02,0x03,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x07,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x05,0x03,0x00,0x00,
-0x06,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x06,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0xef,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x0c,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x07,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x07,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0xfa,0x02,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x06,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0x10,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x0e,0x03,0x00,0x00,
-0x0f,0x03,0x00,0x00,0x10,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x0f,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x03,0x00,0x00,0xef,0x02,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x12,0x00,0x00,0x00,0x19,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,0x1a,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x18,0x03,0x00,0x00,0x1b,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,0xcf,0x02,0x00,0x00,
-0x1c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x1d,0x03,0x00,0x00,0xe7,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x03,0x00,0x00,
-0x1f,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x23,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x23,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x26,0x03,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x27,0x03,0x00,0x00,0x29,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc3,0x00,0x00,0x00,0x2d,0x03,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x2e,0x03,0x00,0x00,0x2d,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0xfc,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,0x14,0x03,0x00,0x00,
-0x34,0x00,0x00,0x00,0x21,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x2f,0x03,0x00,0x00,0x2e,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x10,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x10,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfc,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x03,0x00,0x00,0x3f,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfb,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf1,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf3,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xdc,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xdc,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x35,0x03,0x00,0x00,
-0x3b,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd9,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xdb,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd4,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x03,0x00,0x00,0x3a,0x03,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f32_aligned_s_len = 12168;
-
-unsigned char matmul_f32_aligned_s_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xe9,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb2,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf5,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf6,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf6,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf8,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x28,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x50,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x51,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x51,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x51,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x01,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x01,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4a,0x02,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x90,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x91,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x91,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x93,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x93,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xba,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xc3,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xe8,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0xf4,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf5,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xf7,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfd,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x02,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x45,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x45,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x50,0x01,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x52,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x52,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xa1,0x01,0x00,0x00,0xba,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xce,0x01,0x00,0x00,0xba,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xcf,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x90,0x02,0x00,0x00,0xba,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x92,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x92,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x98,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa2,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb7,0x02,0x00,0x00,0xb7,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x87,0x01,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0xcd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x7a,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x32,0x02,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd5,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd9,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xfe,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x02,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfa,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x18,0x01,0x00,0x00,0x17,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0xea,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xc8,0x02,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0xc9,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x31,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x35,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x30,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x74,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
-0xd0,0x02,0x00,0x00,0x3e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0xfa,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x60,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,0x69,0x01,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0xfd,0x00,0x00,0x00,0x71,0x01,0x00,0x00,0x46,0x01,0x00,0x00,
-0x6d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x71,0x01,0x00,0x00,
-0x70,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x76,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,0x79,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0x41,0x00,0x07,0x00,0xfa,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x34,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x21,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x7e,0x01,0x00,0x00,0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x2b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x31,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0xcc,0x02,0x00,0x00,0x82,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x87,0x01,0x00,0x00,0xd0,0x02,0x00,0x00,
-0x85,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x89,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x89,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x30,0x02,0x00,0x00,0x8c,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0xd2,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x8b,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x8f,0x01,0x00,0x00,0x8a,0x01,0x00,0x00,
-0x8b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x8a,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x91,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x91,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,
-0xbc,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x93,0x01,0x00,0x00,
-0x94,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x97,0x01,0x00,0x00,0x92,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x92,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x99,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x99,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe8,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0xba,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x9f,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9b,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9f,0x01,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9a,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0xd6,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xab,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xac,0x01,0x00,0x00,0xae,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xea,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0xe8,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x99,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x94,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x94,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xd6,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x91,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x93,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbe,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xc0,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xc4,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xbf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xcc,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd7,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0xd2,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xfd,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x46,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xe6,0x01,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe6,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0xe5,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc1,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x01,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xbe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xc0,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xec,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xec,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xd8,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x2e,0x02,0x00,0x00,0xef,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xee,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xf2,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xee,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xed,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xf4,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xf4,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0x2c,0x02,0x00,0x00,
-0xf7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0xfa,0x01,0x00,0x00,0xdc,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xf6,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xfa,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xde,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x2a,0x02,0x00,0x00,0xff,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xde,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xfe,0x01,0x00,0x00,0xff,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x02,0x02,0x00,0x00,0xfd,0x01,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xfd,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x04,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0x28,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x06,0x02,0x00,0x00,
-0x05,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x0a,0x02,0x00,0x00,0x05,0x02,0x00,0x00,0x06,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,0xde,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x13,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0xe0,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0xa3,0x01,0x00,0x00,0x19,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0xd0,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,
-0x23,0x02,0x00,0x00,0xc0,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xba,0x00,0x00,0x00,0x24,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xba,0x00,0x00,0x00,
-0x25,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x24,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x23,0x02,0x00,0x00,0x25,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0xe0,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x04,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x06,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xff,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xff,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xde,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xfc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xfe,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xf4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xf6,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xef,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xef,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xec,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xee,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x89,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8b,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x02,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x02,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x37,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x44,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x4d,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb5,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x52,0x02,0x00,0x00,0x53,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x51,0x02,0x00,0x00,
-0x52,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x51,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x5b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x59,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x38,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0x65,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x59,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x76,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x72,0x02,0x00,0x00,
-0x73,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x71,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x71,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x78,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7a,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7e,0x02,0x00,0x00,
-0x79,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x79,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x81,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x81,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x86,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x84,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x86,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x85,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb8,0x00,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x86,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x86,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb8,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x8f,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x8d,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x8f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xb9,0x02,0x00,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x02,0x00,0x00,0xa4,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc3,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0xad,0x02,0x00,0x00,0xac,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xfa,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xae,0x02,0x00,0x00,0xad,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x78,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x70,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x72,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5b,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x53,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x53,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x52,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f32_aligned_s_fp32_len = 10348;
-
-unsigned char matmul_f32_l_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcd,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x48,0x01,0x00,0x00,0x2e,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x18,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x46,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x46,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x46,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x48,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x48,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2e,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x75,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x77,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xf0,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x3c,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x3c,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x46,0x01,0x00,0x00,0x45,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x48,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xb0,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x74,0x02,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x76,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x83,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,
-0x0a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x12,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x14,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xac,0x02,0x00,0x00,0x1b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x9c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x21,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x25,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x30,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0x33,0x01,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x20,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x38,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x36,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x44,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5f,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x22,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x22,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x1b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x63,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x66,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x14,0x02,0x00,0x00,0x6d,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x6c,0x01,0x00,0x00,0x6d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x70,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x75,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x74,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x78,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x80,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9a,0x01,0x00,0x00,0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x75,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x75,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa6,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xae,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xba,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xb8,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x12,0x02,0x00,0x00,0xd1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd0,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd4,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0x10,0x02,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xdc,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xdc,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe0,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe4,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xe7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe8,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xec,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0x03,0x02,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x07,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,0x09,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x05,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x07,0x02,0x00,0x00,0x09,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe0,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd9,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x27,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x31,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x37,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x3a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x36,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3a,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x42,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x49,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x54,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x96,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x56,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x62,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x62,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x68,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x6a,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x68,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x68,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x73,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x71,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x90,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x56,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f32_l_len = 10164;
-
-unsigned char matmul_f32_l_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc6,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x45,0x01,0x00,0x00,0x27,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf9,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x14,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x15,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x43,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x45,0x01,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x27,0x02,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6d,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x6e,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x6e,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x6e,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x06,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x09,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0x14,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x37,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x38,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x39,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x39,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x42,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x44,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xab,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x6d,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x6e,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x6f,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x6f,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0xad,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x02,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x06,0x01,0x00,0x00,0x07,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x07,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x13,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x13,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xa5,0x02,0x00,0x00,0x18,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x1e,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x22,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x26,0x01,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2b,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x95,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x1d,0x01,0x00,0x00,0x32,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x35,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x06,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x53,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5b,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x18,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x5f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x62,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x0d,0x02,0x00,0x00,0x69,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x68,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x6c,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x67,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x71,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x70,0x01,0x00,0x00,
-0x71,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x74,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x76,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7c,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x77,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x82,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x76,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x78,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x70,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x9d,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa1,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa9,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xad,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x0b,0x02,0x00,0x00,0xcc,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcb,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xcf,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0x09,0x02,0x00,0x00,
-0xd4,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd3,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd7,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xd3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x07,0x02,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xdb,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xdf,0x01,0x00,0x00,0xda,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xda,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0x05,0x02,0x00,0x00,0xe2,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe3,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe7,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xec,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xfd,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,
-0x02,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xfe,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe3,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdb,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0xb5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x66,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x68,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x14,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x26,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x2f,0x02,0x00,0x00,0x30,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x2e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0x38,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x37,0x02,0x00,0x00,
-0x38,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3b,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x02,0x00,0x00,0x15,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x40,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x4d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x4d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x50,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x4f,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x53,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x4e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x58,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x57,0x02,0x00,0x00,0x58,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5b,0x02,0x00,0x00,
-0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x56,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x63,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x61,0x02,0x00,0x00,0x62,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x62,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x68,0x02,0x00,0x00,
-0x67,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x63,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x62,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x6c,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6a,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x76,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x78,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x81,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x83,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x06,0x01,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x8b,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x4f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x96,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f32_l_fp32_len = 10048;
-
-unsigned char matmul_f32_m_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcd,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x48,0x01,0x00,0x00,0x2e,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x18,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x46,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x46,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x46,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x48,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x48,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2e,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x75,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x77,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xf0,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x3c,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x3c,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x46,0x01,0x00,0x00,0x45,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x48,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xb0,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x74,0x02,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x76,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x83,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,
-0x0a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x12,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x14,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xac,0x02,0x00,0x00,0x1b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x9c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x21,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x25,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x30,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0x33,0x01,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x20,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x38,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x36,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x44,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5f,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x22,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x22,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x1b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x63,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x66,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x14,0x02,0x00,0x00,0x6d,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x6c,0x01,0x00,0x00,0x6d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x70,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x75,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x74,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x78,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x80,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9a,0x01,0x00,0x00,0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x75,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x75,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa6,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xae,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xba,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xb8,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x12,0x02,0x00,0x00,0xd1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd0,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd4,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0x10,0x02,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xdc,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xdc,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe0,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe4,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xe7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe8,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xec,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0x03,0x02,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x07,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,0x09,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x05,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x07,0x02,0x00,0x00,0x09,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe0,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd9,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x27,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x31,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x37,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x3a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x36,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3a,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x42,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x49,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x54,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x96,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x56,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x62,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x62,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x68,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x6a,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x68,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x68,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x73,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x71,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x90,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x56,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f32_m_len = 10164;
-
-unsigned char matmul_f32_m_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc6,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x45,0x01,0x00,0x00,0x27,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf9,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x14,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x15,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x43,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x45,0x01,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x27,0x02,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6d,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x6e,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x6e,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x6e,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x06,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x09,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0x14,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x37,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x38,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x39,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x39,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x42,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x44,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xab,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x6d,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x6e,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x6f,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x6f,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0xad,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x02,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x06,0x01,0x00,0x00,0x07,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x07,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x13,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x13,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xa5,0x02,0x00,0x00,0x18,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x1e,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x22,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x26,0x01,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2b,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x95,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x1d,0x01,0x00,0x00,0x32,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x35,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x06,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x53,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5b,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x18,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x5f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x62,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x0d,0x02,0x00,0x00,0x69,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x68,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x6c,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x67,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x71,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x70,0x01,0x00,0x00,
-0x71,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x74,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x76,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7c,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x77,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x82,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x76,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x78,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x70,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x9d,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa1,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa9,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xad,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x0b,0x02,0x00,0x00,0xcc,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcb,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xcf,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0x09,0x02,0x00,0x00,
-0xd4,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd3,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd7,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xd3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x07,0x02,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xdb,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xdf,0x01,0x00,0x00,0xda,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xda,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0x05,0x02,0x00,0x00,0xe2,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe3,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe7,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xec,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xfd,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,
-0x02,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xfe,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe3,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdb,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0xb5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x66,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x68,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x14,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x26,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x2f,0x02,0x00,0x00,0x30,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x2e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0x38,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x37,0x02,0x00,0x00,
-0x38,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3b,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x02,0x00,0x00,0x15,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x40,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x4d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x4d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x50,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x4f,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x53,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x4e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x58,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x57,0x02,0x00,0x00,0x58,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5b,0x02,0x00,0x00,
-0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x56,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x63,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x61,0x02,0x00,0x00,0x62,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x62,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x68,0x02,0x00,0x00,
-0x67,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x63,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x62,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x6c,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6a,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x76,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x78,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x81,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x83,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x06,0x01,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x8b,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x4f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x96,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f32_m_fp32_len = 10048;
-
-unsigned char matmul_f32_s_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xcd,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x09,0x00,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x48,0x01,0x00,0x00,0x2e,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x60,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xfa,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xfc,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x17,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x18,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x46,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x46,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x46,0x01,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x48,0x01,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x48,0x01,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2e,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x74,0x02,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x75,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x75,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x02,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x77,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xec,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xef,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xf0,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xf0,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x07,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x3b,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x3c,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x3c,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x45,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x46,0x01,0x00,0x00,0x45,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x47,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x47,0x01,0x00,0x00,
-0x48,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x82,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x83,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x93,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0xb0,0x01,0x00,0x00,0xec,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,0x86,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x26,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x74,0x02,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x75,0x02,0x00,0x00,
-0x74,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0x76,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x76,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbe,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x83,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xb1,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x68,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x02,0x00,0x00,0x93,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x79,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x03,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x07,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfc,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x0c,0x01,0x00,0x00,
-0x0a,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x0d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x12,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0b,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x14,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xac,0x02,0x00,0x00,0x1b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x9c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x21,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x25,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x20,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x30,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2e,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0x33,0x01,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x30,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x30,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x20,0x01,0x00,0x00,
-0x35,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x38,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x36,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x37,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x73,0x00,0x00,0x00,
-0xad,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
-0x34,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x44,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5f,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x22,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x22,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x1b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x21,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x63,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x66,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x14,0x02,0x00,0x00,0x6d,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x6c,0x01,0x00,0x00,0x6d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x70,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x75,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0xba,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x74,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x78,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x73,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x7a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xcc,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x80,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7c,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x80,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0xba,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x8c,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x92,0x01,0x00,0x00,
-0x93,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9a,0x01,0x00,0x00,0x98,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xcc,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7a,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x75,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x75,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0xba,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x72,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x74,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xa2,0x01,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa6,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa1,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xae,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xae,0x01,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xaa,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa9,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,0xba,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xb8,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc3,0x01,0x00,0x00,0xb6,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0b,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xec,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc8,0x01,0x00,0x00,0xc7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0xc9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xaa,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa2,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xce,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x12,0x02,0x00,0x00,0xd1,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd0,0x01,0x00,0x00,
-0xd1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd4,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcf,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0x10,0x02,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xdc,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd8,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xdc,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd7,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xde,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xde,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0x0e,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xe0,0x01,0x00,0x00,0xe1,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xe4,0x01,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xe0,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xdf,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe6,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xe7,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xec,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe8,0x01,0x00,0x00,
-0xe7,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xec,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe7,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xc4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x99,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,
-0x99,0x01,0x00,0x00,0x03,0x02,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0xec,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x73,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x07,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,0x09,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x05,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x07,0x02,0x00,0x00,0x09,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0xc4,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe6,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,
-0xc2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xde,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe0,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd9,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd6,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd8,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xce,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd0,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6a,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x6c,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x27,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x32,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x31,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x34,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x37,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x3a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x36,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3a,0x02,0x00,0x00,0x35,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3c,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x3f,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x3e,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x42,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x49,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0xba,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x4e,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x02,0x00,0x00,0x68,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x51,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x54,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x96,0x02,0x00,0x00,
-0x57,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x5a,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5a,0x02,0x00,0x00,
-0x55,0x02,0x00,0x00,0x56,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x62,0x02,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x5e,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x62,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x5d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x4a,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x68,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x6a,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x68,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6d,0x02,0x00,0x00,0x6f,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6a,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x6a,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,0x71,0x02,0x00,0x00,
-0x68,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,0x73,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x71,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x72,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x52,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x12,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x7f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x02,0x00,0x00,0x80,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x88,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0xc2,0x00,0x00,0x00,0x90,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x90,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x07,0x01,0x00,0x00,0x92,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x34,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x73,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x73,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x57,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0xa0,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x54,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x56,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x3e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x37,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x34,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x36,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t matmul_f32_s_len = 10164;
-
-unsigned char matmul_f32_s_fp32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc6,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0f,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x3a,0x01,0x00,0x00,0x45,0x01,0x00,0x00,0x27,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x10,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x10,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x10,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb1,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf8,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf9,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf9,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfb,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x14,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x15,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x01,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x43,0x01,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x45,0x01,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x01,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x27,0x02,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6d,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x6e,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x6e,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x6e,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x70,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x10,0x00,0x10,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0xb7,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xbd,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc2,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0xee,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xef,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xef,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xf9,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xfa,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xfa,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x06,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x09,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x32,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x33,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0x14,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x51,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x37,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x38,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x39,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x39,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x04,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x42,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0x44,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x44,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x86,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x7e,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x7d,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,0x80,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0xab,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x20,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0xab,0x01,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0x86,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x80,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x0d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x6d,0x02,0x00,0x00,0xb9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x6e,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0x6f,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0x6f,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x05,0x00,0x00,0x00,
-0x34,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x82,0x02,0x00,0x00,
-0x84,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbe,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7f,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xac,0x01,0x00,0x00,0xad,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x94,0x02,0x00,0x00,0xb6,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x94,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xad,0x02,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x64,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x93,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x95,0x02,0x00,0x00,0x79,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xd5,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd8,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x95,0x02,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe3,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xeb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xa5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xa9,0x02,0x00,0x00,0x02,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x06,0x01,0x00,0x00,0x07,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x07,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x0a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x0b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x13,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x13,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xeb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xeb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0xa5,0x02,0x00,0x00,0x18,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x9c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x1e,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x22,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x1d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0x26,0x01,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x2d,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x2b,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x95,0x02,0x00,0x00,0x6e,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0xb7,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x1d,0x01,0x00,0x00,0x32,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x35,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x53,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x73,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x06,0x01,0x00,0x00,0x50,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x34,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x41,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x51,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x53,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x73,0x00,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,
-0x56,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5b,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x18,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x1c,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1e,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x5f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xad,0x02,0x00,0x00,
-0x62,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x0d,0x02,0x00,0x00,0x69,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,
-0xaf,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x68,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x6c,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x67,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x71,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x70,0x01,0x00,0x00,
-0x71,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x74,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x76,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x76,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x97,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x7c,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x78,0x01,0x00,0x00,0x77,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7c,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x78,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x77,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x82,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x01,0x00,0x00,0xb3,0x02,0x00,0x00,0x61,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x86,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,0x64,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x8c,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,
-0x8f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x90,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x92,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x84,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x01,0x00,0x00,0xc5,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x76,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x78,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x71,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x71,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x70,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x70,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x9d,0x01,0x00,0x00,
-0x9e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa1,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9c,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa9,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x01,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0x59,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0xb4,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb6,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xba,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x09,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x3a,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xad,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa5,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,
-0xb4,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9b,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x9d,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc9,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x9d,0x01,0x00,0x00,
-0x0b,0x02,0x00,0x00,0xcc,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcb,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xcf,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xca,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0x09,0x02,0x00,0x00,
-0xd4,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xd3,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd7,0x01,0x00,0x00,
-0xd2,0x01,0x00,0x00,0xd3,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xd2,0x01,0x00,0x00,0x07,0x02,0x00,0x00,0xdc,0x01,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0xdf,0x01,0x00,0x00,
-0xbb,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xdb,0x01,0x00,0x00,0xdc,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xdf,0x01,0x00,0x00,0xda,0x01,0x00,0x00,
-0xdb,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xda,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0x05,0x02,0x00,0x00,0xe2,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xe3,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe7,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,0xb5,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xec,0x01,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xef,0x01,0x00,0x00,0xb9,0x02,0x00,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0xbd,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0xfd,0x01,0x00,0x00,0xad,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xbf,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0xb9,0x00,0x00,0x00,
-0x02,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf8,0x01,0x00,0x00,0xfe,0x01,0x00,0x00,0x01,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x02,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe1,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xe3,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xdc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdc,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xdb,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd1,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xd3,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0xb5,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x69,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x69,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x66,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x68,0x01,0x00,0x00,0xe0,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcc,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x6c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc9,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x02,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x14,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x59,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x12,0x00,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x21,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x02,0x00,0x00,0x47,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x26,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0x22,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xb4,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x2f,0x02,0x00,0x00,0x30,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x33,0x02,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x2f,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x2e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0x38,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x3b,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x37,0x02,0x00,0x00,
-0x38,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x3b,0x02,0x00,0x00,0x36,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x36,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,0x97,0x02,0x00,0x00,
-0x61,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x02,0x00,0x00,0x15,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,
-0x64,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x40,0x02,0x00,0x00,
-0x42,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,0x68,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x48,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x4d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x4d,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x99,0x02,0x00,0x00,0x3e,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x50,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,
-0xb7,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x4f,0x02,0x00,0x00,
-0x50,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x53,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x4e,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x55,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x58,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x62,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x57,0x02,0x00,0x00,0x58,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5b,0x02,0x00,0x00,
-0x56,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x56,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x43,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x63,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x61,0x02,0x00,0x00,0x62,0x02,0x00,0x00,0x63,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x62,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x68,0x02,0x00,0x00,
-0x67,0x02,0x00,0x00,0xb0,0x00,0x05,0x00,0xb7,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x68,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x63,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x63,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0xb7,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x56,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x62,0x02,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x6c,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6a,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x12,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x77,0x02,0x00,0x00,
-0x76,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x78,0x02,0x00,0x00,0x74,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x78,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0xb1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x81,0x02,0x00,0x00,0x7f,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x02,0x00,0x00,0x81,0x02,0x00,0x00,0x82,0x02,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x86,0x02,0x00,0x00,0x83,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x88,0x02,0x00,0x00,0x86,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0xc2,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x06,0x01,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x70,0x02,0x00,0x00,0x34,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x8b,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6c,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6c,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x58,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x58,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x55,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x57,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4d,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x4f,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x38,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x38,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x37,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x30,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x96,0x02,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2d,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2f,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t matmul_f32_s_fp32_len = 10048;
-
-unsigned char mul_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x3e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x1f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x20,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x22,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x22,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x25,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x27,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x27,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x2d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x12,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x20,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x21,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x24,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x25,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x2c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x38,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_f32_len = 1456;
-
-unsigned char mul_mat_vec_f16_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xb6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x4f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x51,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x62,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x63,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x63,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x65,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xa8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xa8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xa8,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb2,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x17,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x19,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x4e,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x4f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x62,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x6d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0xa7,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xa8,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x54,0x00,0x00,0x00,
-0x55,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x54,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x6d,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x6d,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x88,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0xaa,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xa6,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa4,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa5,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1e,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x6d,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb1,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa6,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa6,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_f16_f32_len = 2372;
-
-unsigned char mul_mat_vec_nc_f16_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xb3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1b,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x1b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6c,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x6d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x6d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x6d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6f,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x78,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7a,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa8,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xa9,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xa9,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xab,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xab,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb0,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x1e,0x00,0x09,0x00,0x1b,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x20,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x34,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x36,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x37,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x37,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x34,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x3b,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x46,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x6b,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x6e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x6e,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x71,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x77,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x79,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x79,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xa8,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x20,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x3b,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x3c,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3e,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3e,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x40,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x46,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x50,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x40,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x51,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x20,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x71,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x6b,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x34,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x7c,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x34,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x34,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3c,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x41,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x41,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x8a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8a,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x1e,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0xad,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x8c,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x90,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8b,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0x46,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x96,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x95,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x3b,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x34,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x34,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3c,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x96,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x96,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x8d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8d,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x1e,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x8a,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8c,0x00,0x00,0x00,0xaa,0x00,0x05,0x00,
-0x46,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xa7,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa6,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x3b,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x34,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x7c,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xaf,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_nc_f16_f32_len = 2824;
-
-unsigned char mul_mat_vec_p021_f16_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xbc,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x1b,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x1b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x63,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x64,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x64,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x76,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x77,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x77,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x79,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x79,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xad,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xae,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xae,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xae,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb0,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb0,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb5,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1e,0x00,0x08,0x00,
-0x1b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x20,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x32,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x32,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x39,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x44,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x62,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x65,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x68,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x76,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x77,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x78,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xad,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xae,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xaf,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x12,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x20,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x20,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x20,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x39,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x3a,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0xb0,0x00,0x05,0x00,0x44,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x3e,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x3d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x44,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x4f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x4e,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x3e,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x4f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x68,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x62,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x32,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x7b,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x32,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x32,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x32,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x3a,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x3f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x3e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8f,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x1e,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0xad,0x00,0x05,0x00,0x44,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x91,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x95,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x90,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x44,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x99,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x39,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x32,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x32,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x32,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x3a,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x9b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9b,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x92,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x92,0x00,0x00,0x00,
-0xc3,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x91,0x00,0x00,0x00,
-0xaa,0x00,0x05,0x00,0x44,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xac,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xab,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x39,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x32,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x7b,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xac,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xac,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_p021_f16_f32_len = 2768;
-
-unsigned char mul_mat_vec_q2_K_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc6,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x68,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x6a,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x6d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x6d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x6d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x6e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x6f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x6f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x6f,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x71,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x71,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x8c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x8c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x8c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8e,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xb1,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xb2,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xb2,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xb2,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb4,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xb4,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbc,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x47,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x49,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x4a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x5d,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x68,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x6a,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x6b,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x6d,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x6e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x6f,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x70,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x70,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x8b,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x8c,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x8d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x8d,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x50,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0x60,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x84,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,0x70,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xab,0x01,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x08,0x01,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xb1,0x02,0x00,0x00,0x47,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xb2,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x20,0x00,0x04,0x00,0xb3,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,0xb3,0x02,0x00,0x00,
-0xb4,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x0a,0x00,0x00,0x00,0xbc,0x02,0x00,0x00,0x48,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x56,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x56,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x5d,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x57,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0x75,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x6b,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x75,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x6b,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x84,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x84,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x47,0x00,0x00,0x00,
-0xc2,0x02,0x00,0x00,0x51,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x85,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x47,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0x51,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0x85,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x16,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x80,0x02,0x00,0x00,
-0x85,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x5d,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x25,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8a,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x85,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x97,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x97,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x97,0x00,0x00,0x00,
-0xe4,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0xfc,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
-0x95,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x97,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x33,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9f,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0x71,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x13,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x0b,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0x25,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
-0x21,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x47,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
-0x01,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x95,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x97,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x31,0x01,0x00,0x00,
-0x30,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0x33,0x00,0x00,0x00,0x36,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,0x38,0x01,0x00,0x00,
-0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x37,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x38,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x39,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0xaf,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x66,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x48,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,0x49,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x4a,0x01,0x00,0x00,0xb4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x27,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x95,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x97,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x8e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x58,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x33,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x61,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x61,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x64,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x6e,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x6e,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x70,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x72,0x01,0x00,0x00,0x71,0x01,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x73,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,
-0x75,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x65,0x01,0x00,0x00,0x73,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x95,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x97,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x01,0x00,0x00,
-0x33,0x00,0x00,0x00,0x84,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9f,0x00,0x00,0x00,0x86,0x01,0x00,0x00,0x71,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x85,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x86,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x89,0x01,0x00,0x00,
-0x88,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x89,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x8a,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,0x8b,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x95,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x97,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x98,0x01,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x99,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x47,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0x95,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x97,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xac,0x01,0x00,0x00,0x33,0x00,0x00,0x00,0xab,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9f,0x00,0x00,0x00,0xad,0x01,0x00,0x00,
-0x71,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xac,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xae,0x01,0x00,0x00,0xad,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0xae,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xb2,0x01,0x00,0x00,0xb1,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xb2,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0xbc,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x66,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x84,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0xbd,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xb4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xb3,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x81,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0xa0,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd8,0x01,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd9,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xe4,0x01,0x00,0x00,0xbf,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,0xc6,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xee,0x01,0x00,0x00,
-0xed,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xef,0x01,0x00,0x00,0xee,0x01,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xf0,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,0xf0,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xce,0x01,0x00,0x00,
-0xda,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0xe4,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,
-0x03,0x02,0x00,0x00,0x02,0x02,0x00,0x00,0x36,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x04,0x02,0x00,0x00,
-0x03,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x05,0x02,0x00,0x00,0x04,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0x05,0x02,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x07,0x02,0x00,0x00,0x06,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x47,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,0x07,0x02,0x00,0x00,
-0xf2,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x0a,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x11,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x1b,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x1c,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,0x09,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x30,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x38,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x66,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x32,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x34,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x29,0x02,0x00,0x00,
-0x35,0x02,0x00,0x00,0x20,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x57,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x47,0x02,0x00,0x00,
-0x5f,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,
-0x48,0x02,0x00,0x00,0x47,0x02,0x00,0x00,0x36,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x49,0x02,0x00,0x00,
-0x48,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x49,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x4c,0x02,0x00,0x00,0x4b,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x47,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x4c,0x02,0x00,0x00,
-0x37,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x7e,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0x86,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x66,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,
-0x60,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x63,0x02,0x00,0x00,
-0x62,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,
-0x65,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x63,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xa5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x75,0x02,0x00,0x00,0xad,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x66,0x00,0x00,0x00,0x76,0x02,0x00,0x00,0x75,0x02,0x00,0x00,
-0x36,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x76,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0xa4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x47,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x65,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0x8f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x84,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x86,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x80,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x47,0x00,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0xc5,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x53,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x47,0x00,0x00,0x00,
-0x8e,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x53,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x91,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,0x25,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x56,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x92,0x02,0x00,0x00,
-0x92,0x02,0x00,0x00,0x93,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x95,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x95,0x02,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x58,0x00,0x00,0x00,0xac,0x02,0x00,0x00,
-0x98,0x02,0x00,0x00,0xad,0x00,0x05,0x00,0x5d,0x00,0x00,0x00,
-0x9b,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x97,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x9b,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x96,0x02,0x00,0x00,0xb1,0x00,0x05,0x00,0x5d,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0x26,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xa0,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x9e,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0x9f,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0x26,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x4b,0x00,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xa6,0x02,0x00,0x00,0xa5,0x02,0x00,0x00,0x41,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0xa7,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x47,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xa8,0x02,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0xa7,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x02,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x92,0x02,0x00,0x00,0x92,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x98,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x98,0x02,0x00,0x00,0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xac,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x8f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x95,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x97,0x02,0x00,0x00,0xaa,0x00,0x05,0x00,0x5d,0x00,0x00,0x00,
-0xae,0x02,0x00,0x00,0x26,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xb0,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xae,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xaf,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x15,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x11,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x47,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x97,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x16,0x00,0x00,0x00,0xb8,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xbb,0x02,0x00,0x00,0xba,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xb0,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xb0,0x02,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_q2_K_f32_len = 7628;
-
-unsigned char mul_mat_vec_q3_K_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x08,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0xf9,0x02,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x73,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x79,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x79,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x79,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8f,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x90,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x90,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x90,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x92,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x92,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf6,0x02,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf7,0x02,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf7,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf7,0x02,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf9,0x02,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xf9,0x02,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x02,0x03,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x4f,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x6a,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x73,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x75,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x78,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1e,0x00,0x06,0x00,0x79,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x7a,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x7b,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x82,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x8f,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x90,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x91,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x91,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x37,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x60,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x70,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x06,0x02,0x00,0x00,0x0b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x50,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x05,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x85,0x02,0x00,0x00,0x70,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd8,0x02,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0xf6,0x02,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xf7,0x02,0x00,0x00,
-0xf6,0x02,0x00,0x00,0x20,0x00,0x04,0x00,0xf8,0x02,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xf7,0x02,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xf8,0x02,0x00,0x00,0xf9,0x02,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,
-0x02,0x03,0x00,0x00,0x50,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0x01,0x03,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x5b,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x63,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x63,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x03,0x03,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,
-0x66,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x03,0x03,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x6b,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x64,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x03,0x03,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x03,0x03,0x00,0x00,0x41,0x00,0x07,0x00,0x82,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x88,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x88,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x4f,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0x59,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x89,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x06,0x03,0x00,0x00,
-0x16,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0xca,0x02,0x00,0x00,
-0x89,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x06,0x03,0x00,0x00,0x25,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x8e,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x89,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x06,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9a,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x06,0x03,0x00,0x00,0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x06,0x03,0x00,0x00,0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0xcf,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xab,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9a,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0xc1,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x05,0x01,0x00,0x00,0x25,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x07,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x81,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0xcc,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x3f,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x11,0x01,0x00,0x00,0x13,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
-0x15,0x01,0x00,0x00,0xab,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,
-0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x18,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4f,0x00,0x00,0x00,0x1c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0x1b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x24,0x01,0x00,0x00,0x98,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9a,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
-0x92,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x2a,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x60,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0xac,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x60,0x00,0x00,0x00,
-0x37,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0x38,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x39,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,
-0x81,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x30,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x41,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x44,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x41,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x4c,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x35,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x57,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x5a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,0xab,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
-0x5f,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x4f,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x1c,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x98,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9a,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x92,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x72,0x01,0x00,0x00,0x71,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0x72,0x01,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x74,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x77,0x01,0x00,0x00,0x76,0x01,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
-0xef,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x7f,0x01,0x00,0x00,0x7c,0x01,0x00,0x00,0x38,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x80,0x01,0x00,0x00,
-0x7f,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x81,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x81,0x01,0x00,0x00,
-0x81,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x01,0x00,0x00,0x82,0x01,0x00,0x00,0x39,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x01,0x00,0x00,
-0x77,0x01,0x00,0x00,0x83,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x84,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0x85,0x01,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x86,0x01,0x00,0x00,0x44,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x88,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0x6c,0x01,0x00,0x00,0x88,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x92,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x93,0x01,0x00,0x00,
-0x92,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x93,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x94,0x01,0x00,0x00,
-0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x35,0x00,0x00,0x00,0x9f,0x01,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x9d,0x01,0x00,0x00,0x9f,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,0xab,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa5,0x01,0x00,0x00,0x95,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x4f,0x00,0x00,0x00,
-0xa8,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x89,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,0x62,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,
-0x98,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9a,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x92,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xb8,0x01,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb9,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xba,0x01,0x00,0x00,
-0xb9,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0xba,0x01,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,
-0xc1,0x01,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,0x81,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0x39,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,0xbb,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0xc9,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xcb,0x01,0x00,0x00,0xca,0x01,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xcb,0x01,0x00,0x00,0x44,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,0xce,0x01,0x00,0x00,
-0xb1,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xd5,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0xd7,0x01,0x00,0x00,
-0xd6,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xd8,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,0xd8,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xda,0x01,0x00,0x00,
-0xd9,0x01,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0xe6,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xe7,0x01,0x00,0x00,
-0xe6,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0xe7,0x01,0x00,0x00,0xab,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,0xe8,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xea,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xeb,0x01,0x00,0x00,0xda,0x01,0x00,0x00,0xea,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0xec,0x01,0x00,0x00,
-0xeb,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x4f,0x00,0x00,0x00,
-0xee,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0xec,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x98,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9a,0x00,0x00,0x00,0xf7,0x01,0x00,0x00,0x92,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xfd,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xff,0x01,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0xff,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x01,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x02,0x02,0x00,0x00,0x01,0x02,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0x07,0x02,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x06,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x08,0x02,0x00,0x00,0x07,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0x08,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0x0b,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x0c,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,0x81,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x39,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x02,0x02,0x00,0x00,
-0x0f,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x11,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x12,0x02,0x00,0x00,0x11,0x02,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x13,0x02,0x00,0x00,
-0x12,0x02,0x00,0x00,0x44,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x14,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,0x15,0x02,0x00,0x00,
-0xf8,0x01,0x00,0x00,0x14,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x1e,0x02,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x25,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x20,0x02,0x00,0x00,0x1f,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x21,0x02,0x00,0x00,
-0x20,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x21,0x02,0x00,0x00,0x81,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,0x13,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0xab,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x30,0x02,0x00,0x00,
-0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x32,0x02,0x00,0x00,0x31,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0x22,0x02,0x00,0x00,0x32,0x02,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x34,0x02,0x00,0x00,
-0x33,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x4f,0x00,0x00,0x00,
-0x36,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x15,0x02,0x00,0x00,0x34,0x02,0x00,0x00,0xee,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,
-0x98,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9a,0x00,0x00,0x00,0x3f,0x02,0x00,0x00,0x92,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,0x45,0x02,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x44,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0x46,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0x48,0x02,0x00,0x00,
-0x46,0x02,0x00,0x00,0x60,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x48,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x4a,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x50,0x02,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x53,0x02,0x00,0x00,0x50,0x02,0x00,0x00,0x38,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x54,0x02,0x00,0x00,
-0x53,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x55,0x02,0x00,0x00,0x54,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x55,0x02,0x00,0x00,
-0x81,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x02,0x00,0x00,0x56,0x02,0x00,0x00,0x39,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x02,0x00,0x00,
-0x4b,0x02,0x00,0x00,0x57,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x58,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5a,0x02,0x00,0x00,
-0x59,0x02,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,0x44,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x5c,0x02,0x00,0x00,
-0x5b,0x02,0x00,0x00,0x85,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x40,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x66,0x02,0x00,0x00,
-0xd6,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0x66,0x02,0x00,0x00,0x39,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x68,0x02,0x00,0x00,
-0x67,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x69,0x02,0x00,0x00,0x68,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,0x69,0x02,0x00,0x00,
-0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x73,0x02,0x00,0x00,0xe2,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x35,0x00,0x00,0x00,0x76,0x02,0x00,0x00,0x73,0x02,0x00,0x00,
-0x59,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x76,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x78,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0xab,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,0x79,0x02,0x00,0x00,
-0x78,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,0x79,0x02,0x00,0x00,
-0x16,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x02,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4f,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x36,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x02,0x00,0x00,0x98,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9a,0x00,0x00,0x00,0x87,0x02,0x00,0x00,
-0x92,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x86,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x87,0x02,0x00,0x00,0x41,0x00,0x08,0x00,0xa0,0x00,0x00,0x00,
-0x8d,0x02,0x00,0x00,0x7d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x8e,0x02,0x00,0x00,
-0x8d,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x35,0x00,0x00,0x00,
-0x90,0x02,0x00,0x00,0x8e,0x02,0x00,0x00,0x60,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x92,0x02,0x00,0x00,0x91,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x92,0x02,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x07,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x35,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x38,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x9d,0x02,0x00,0x00,0x81,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9f,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x39,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x93,0x02,0x00,0x00,0x9f,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0xa0,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x44,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0xa4,0x02,0x00,0x00,0xa3,0x02,0x00,0x00,0x85,0x00,0x05,0x00,
-0x4f,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,0x88,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0xae,0x02,0x00,0x00,0xd6,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x35,0x00,0x00,0x00,0xaf,0x02,0x00,0x00,0xae,0x02,0x00,0x00,
-0x70,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xb0,0x02,0x00,0x00,0xaf,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xb1,0x02,0x00,0x00,0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x35,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0xe2,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x35,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,
-0xbb,0x02,0x00,0x00,0x9f,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc0,0x02,0x00,0x00,
-0xbf,0x02,0x00,0x00,0xab,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,
-0xc1,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xc2,0x02,0x00,0x00,
-0xc1,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xc2,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xc4,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4f,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xa5,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x4f,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x07,0x03,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x02,0x00,0x00,0x06,0x03,0x00,0x00,0x38,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x88,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8a,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0xd3,0x02,0x00,0x00,0x5b,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4f,0x00,0x00,0x00,0xd4,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x07,0x03,0x00,0x00,
-0xd3,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,0x5b,0x00,0x00,0x00,
-0xd4,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0x66,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x66,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd7,0x02,0x00,0x00,0x03,0x03,0x00,0x00,
-0x25,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x63,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x65,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x37,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xda,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xda,0x02,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x04,0x03,0x00,0x00,0x54,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0xf1,0x02,0x00,0x00,0xdd,0x02,0x00,0x00,0xad,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,0x04,0x03,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xdc,0x02,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe0,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdb,0x02,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,0x26,0x00,0x00,0x00,
-0x04,0x03,0x00,0x00,0xf7,0x00,0x03,0x00,0xe5,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe3,0x02,0x00,0x00,
-0xe4,0x02,0x00,0x00,0xe5,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe4,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0x26,0x00,0x00,0x00,0x04,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xea,0x02,0x00,0x00,
-0x53,0x00,0x00,0x00,0xe9,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,0xea,0x02,0x00,0x00,
-0x41,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xec,0x02,0x00,0x00,
-0x53,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xed,0x02,0x00,0x00,0xec,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x4f,0x00,0x00,0x00,0xee,0x02,0x00,0x00,
-0xed,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xec,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe5,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x02,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x37,0x01,0x00,0x00,0x37,0x01,0x00,0x00,
-0xd8,0x02,0x00,0x00,0xf9,0x00,0x02,0x00,0xdd,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdd,0x02,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf1,0x02,0x00,0x00,0x04,0x03,0x00,0x00,
-0x38,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xda,0x02,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xdc,0x02,0x00,0x00,0xaa,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0xf3,0x02,0x00,0x00,0x26,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xf5,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xf3,0x02,0x00,0x00,
-0xf4,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf4,0x02,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0xfa,0x02,0x00,0x00,0x15,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xfb,0x02,0x00,0x00,
-0xfa,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfd,0x02,0x00,0x00,0xfb,0x02,0x00,0x00,0x11,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xfe,0x02,0x00,0x00,
-0x53,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0xff,0x02,0x00,0x00,0xfe,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9a,0x00,0x00,0x00,0x00,0x03,0x00,0x00,
-0xf9,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x00,0x03,0x00,0x00,0xff,0x02,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xf5,0x02,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xf5,0x02,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t mul_mat_vec_q3_K_f32_len = 9252;
-
-unsigned char mul_mat_vec_q4_0_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xd1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x28,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x78,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x79,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x79,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xbf,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xc0,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xc0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xc0,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xc2,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xca,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x17,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x1e,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x63,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x00,0x00,0x00,0x41,0x1d,0x00,0x03,0x00,0x78,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x79,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7a,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x83,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xbf,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xc0,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xc1,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x2c,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x83,0x00,0x05,0x00,0x63,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x63,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x83,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x83,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x17,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa3,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xad,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xa4,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xae,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xac,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xad,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1e,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xae,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xae,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa6,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa6,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa5,0x00,0x00,0x00,0xaa,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xbe,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xbc,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x83,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xbe,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xbe,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t mul_mat_vec_q4_0_f32_len = 3180;
-
-unsigned char mul_mat_vec_q4_1_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xd4,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x28,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x51,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x52,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x52,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x80,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x80,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xc4,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xc5,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xc5,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xc5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xc7,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xc7,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xcf,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x17,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x19,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x4c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x68,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x7d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x7f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x7f,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x88,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0xc4,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xc5,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xc6,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xc6,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x68,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x68,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x68,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x88,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x17,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x88,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0xd2,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x24,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0xad,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xae,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xb3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xb1,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xb2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x81,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xb3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xb3,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0xa5,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xab,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xab,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xaa,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xc1,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc2,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1e,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x88,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc3,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_q4_1_f32_len = 3248;
-
-unsigned char mul_mat_vec_q4_K_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xa2,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0xef,0x01,0x00,0x00,
-0x94,0x03,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x76,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x77,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x77,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x77,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x78,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x79,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x79,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x79,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xec,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xed,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xed,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xed,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xef,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xef,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x91,0x03,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x92,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x92,0x03,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x92,0x03,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x94,0x03,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x94,0x03,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9c,0x03,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4d,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x50,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x58,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x63,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x70,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x71,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x74,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,0x77,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x79,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x93,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0x41,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x42,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x43,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xec,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xee,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xed,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xee,0x01,0x00,0x00,0xef,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xf4,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x36,0x02,0x00,0x00,
-0x22,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x23,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x72,0x03,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x73,0x03,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0x91,0x03,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x92,0x03,0x00,0x00,
-0x91,0x03,0x00,0x00,0x20,0x00,0x04,0x00,0x93,0x03,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x92,0x03,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x93,0x03,0x00,0x00,0x94,0x03,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0x9c,0x03,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x58,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x59,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x5c,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x03,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x71,0x03,0x00,0x00,
-0x5d,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x9d,0x03,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x64,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5d,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x9d,0x03,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x9d,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x70,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x7f,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x70,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x9a,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x93,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x9a,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xcf,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x93,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0xfb,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0xff,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0xd0,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0d,0x01,0x00,0x00,0x0c,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,0x16,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0xdf,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x18,0x01,0x00,0x00,
-0x25,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,0x19,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x1c,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0xed,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x27,0x01,0x00,0x00,0x26,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x31,0x01,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x32,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x32,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x33,0x01,0x00,0x00,0xdf,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
-0x25,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x35,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x37,0x01,0x00,0x00,
-0x36,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x38,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x93,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
-0x40,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x41,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x43,0x01,0x00,0x00,
-0x42,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x44,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,0x46,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x4e,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x9a,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x50,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x46,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,0x58,0x01,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x59,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x5c,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x5d,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00,
-0x46,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x93,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x65,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x69,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,
-0x6a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x6c,0x01,0x00,0x00,0x6b,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x74,0x01,0x00,0x00,
-0x73,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x7c,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x85,0x01,0x00,0x00,0x58,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x86,0x01,0x00,0x00,
-0x85,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x8e,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x95,0x01,0x00,0x00,0x46,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,
-0x96,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x95,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x97,0x01,0x00,0x00,
-0x96,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x98,0x01,0x00,0x00,0x97,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x99,0x01,0x00,0x00,0x98,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x9a,0x00,0x00,0x00,0x9b,0x01,0x00,0x00,0x9a,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x9c,0x01,0x00,0x00,
-0x9b,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa3,0x01,0x00,0x00,0x46,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0xa5,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa8,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0xa9,0x01,0x00,0x00,0xa8,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xaa,0x01,0x00,0x00,0xa9,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,
-0x46,0x00,0x00,0x00,0xb0,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x93,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0xb1,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,0xb3,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb5,0x01,0x00,0x00,
-0xb4,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0xb5,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0xb7,0x01,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0xb7,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,0x46,0x00,0x00,0x00,
-0xbe,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x93,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x7b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0xbf,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,0xc2,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc4,0x01,0x00,0x00,
-0xc3,0x01,0x00,0x00,0xd4,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x9a,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,0xc4,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xc6,0x01,0x00,0x00,
-0xc5,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0x96,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xd7,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xd8,0x01,0x00,0x00,0xd7,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xe0,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xe1,0x01,0x00,0x00,0xe0,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xe9,0x01,0x00,0x00,0xc0,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xea,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x15,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf1,0x01,0x00,0x00,
-0xf0,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf3,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,0x69,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0xf3,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xf8,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0x90,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,
-0xff,0x01,0x00,0x00,0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0xfe,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0xff,0x01,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x51,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x00,0x02,0x00,0x00,0x02,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,
-0x03,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x09,0x02,0x00,0x00,0xf3,0x01,0x00,0x00,0x25,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x0a,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x09,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0x0a,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x0d,0x02,0x00,0x00,0x5e,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,
-0x04,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x14,0x02,0x00,0x00,0xf3,0x01,0x00,0x00,0x64,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x15,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x14,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x16,0x02,0x00,0x00,
-0x15,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x6c,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x16,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x0f,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x20,0x02,0x00,0x00,0xf3,0x01,0x00,0x00,0x42,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x21,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x22,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x24,0x02,0x00,0x00,0x74,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,0xf3,0x01,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,
-0x2c,0x02,0x00,0x00,0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2b,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,0x7d,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x30,0x02,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x24,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x02,0x00,0x00,0xf3,0x01,0x00,0x00,0x36,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x38,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0x38,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x3b,0x02,0x00,0x00,0x86,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x39,0x02,0x00,0x00,0x3b,0x02,0x00,0x00,
-0x31,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x02,0x00,0x00,0xf3,0x01,0x00,0x00,0x42,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x44,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x43,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x45,0x02,0x00,0x00,
-0x44,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x8f,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x02,0x00,0x00,0xf1,0x01,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x50,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x52,0x02,0x00,0x00,0x9c,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x90,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,
-0x59,0x02,0x00,0x00,0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x58,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x5c,0x02,0x00,0x00,0xaa,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x52,0x02,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x63,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x25,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x64,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x63,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x65,0x02,0x00,0x00,
-0x64,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x67,0x02,0x00,0x00,0xb8,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x69,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x65,0x02,0x00,0x00,0x67,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x64,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x6f,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x72,0x02,0x00,0x00,0xc6,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x74,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x70,0x02,0x00,0x00,0x72,0x02,0x00,0x00,
-0x69,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x42,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x7b,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0xcf,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,
-0x85,0x02,0x00,0x00,0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x02,0x00,0x00,0x85,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0xd8,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0x86,0x02,0x00,0x00,0x88,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x36,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x90,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x8f,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x91,0x02,0x00,0x00,
-0x90,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x93,0x02,0x00,0x00,0xe1,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x95,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x93,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9a,0x02,0x00,0x00,0x4e,0x02,0x00,0x00,0x42,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xf4,0x01,0x00,0x00,0x9b,0x02,0x00,0x00,
-0xef,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x9e,0x02,0x00,0x00,0xea,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x9e,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xf5,0x01,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xc7,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xa7,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x1c,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,
-0xb5,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xc6,0x02,0x00,0x00,0x7b,0x02,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x38,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0xca,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xc6,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd1,0x02,0x00,0x00,0xff,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0xd5,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xd1,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0xca,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xdc,0x02,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xd5,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,
-0xbd,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,0x85,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0xf6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xf2,0x02,0x00,0x00,
-0xc8,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x01,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xfd,0x02,0x00,0x00,
-0xa9,0x02,0x00,0x00,0xf6,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x08,0x03,0x00,0x00,0x38,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x08,0x03,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x01,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x13,0x03,0x00,0x00,0x64,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x17,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x13,0x03,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x0c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x90,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x17,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0x15,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x2d,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x29,0x03,0x00,0x00,
-0xa9,0x02,0x00,0x00,0x22,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x34,0x03,0x00,0x00,0x44,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x38,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x34,0x03,0x00,0x00,
-0xb3,0x02,0x00,0x00,0x2d,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x3f,0x03,0x00,0x00,0x6f,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x43,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x3f,0x03,0x00,0x00,
-0xbd,0x02,0x00,0x00,0x38,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x4e,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x4a,0x03,0x00,0x00,
-0xc8,0x02,0x00,0x00,0x43,0x03,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x56,0x03,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0x5b,0x03,0x00,0x00,0x49,0x02,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x56,0x03,0x00,0x00,0x5b,0x03,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x5f,0x03,0x00,0x00,0xe4,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x61,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x74,0x02,0x00,0x00,
-0x5f,0x03,0x00,0x00,0x5c,0x03,0x00,0x00,0x70,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x64,0x03,0x00,0x00,0x00,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x4d,0x00,0x00,0x00,0x66,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xa0,0x02,0x00,0x00,
-0x64,0x03,0x00,0x00,0x61,0x03,0x00,0x00,0x85,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x6a,0x03,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x4e,0x03,0x00,0x00,0x7f,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xa1,0x03,0x00,0x00,0x6a,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x4d,0x00,0x00,0x00,0x6b,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x66,0x03,0x00,0x00,
-0xa1,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x6d,0x03,0x00,0x00,0x59,0x00,0x00,0x00,0x81,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x6e,0x03,0x00,0x00,0x6d,0x03,0x00,0x00,
-0x6b,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x59,0x00,0x00,0x00,
-0x6e,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x71,0x03,0x00,0x00,0x9d,0x03,0x00,0x00,0x25,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x5c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x5e,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x72,0x03,0x00,0x00,
-0x72,0x03,0x00,0x00,0x73,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x75,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x75,0x03,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9e,0x03,0x00,0x00,
-0x52,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x8c,0x03,0x00,0x00,
-0x78,0x03,0x00,0x00,0xad,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x7b,0x03,0x00,0x00,0x9e,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x77,0x03,0x00,0x00,0x78,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x7b,0x03,0x00,0x00,
-0x76,0x03,0x00,0x00,0x77,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x76,0x03,0x00,0x00,0xb1,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x7e,0x03,0x00,0x00,0x26,0x00,0x00,0x00,0x9e,0x03,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x80,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x7e,0x03,0x00,0x00,0x7f,0x03,0x00,0x00,
-0x80,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x7f,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x84,0x03,0x00,0x00,
-0x26,0x00,0x00,0x00,0x9e,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x85,0x03,0x00,0x00,0x51,0x00,0x00,0x00,
-0x84,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x86,0x03,0x00,0x00,0x85,0x03,0x00,0x00,0x41,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x87,0x03,0x00,0x00,0x51,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x88,0x03,0x00,0x00,0x87,0x03,0x00,0x00,0x81,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x89,0x03,0x00,0x00,0x88,0x03,0x00,0x00,
-0x86,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,0x87,0x03,0x00,0x00,
-0x89,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,0x80,0x03,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x80,0x03,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x72,0x03,0x00,0x00,0x72,0x03,0x00,0x00,0x73,0x03,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x78,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x78,0x03,0x00,0x00,0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x03,0x00,0x00,0x9e,0x03,0x00,0x00,0x90,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x75,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x77,0x03,0x00,0x00,0xaa,0x00,0x05,0x00,0x63,0x00,0x00,0x00,
-0x8e,0x03,0x00,0x00,0x26,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x90,0x03,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x8e,0x03,0x00,0x00,0x8f,0x03,0x00,0x00,
-0x90,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x8f,0x03,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x95,0x03,0x00,0x00,
-0x15,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x96,0x03,0x00,0x00,0x95,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x98,0x03,0x00,0x00,
-0x96,0x03,0x00,0x00,0x11,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x58,0x00,0x00,0x00,0x99,0x03,0x00,0x00,0x51,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x9a,0x03,0x00,0x00,0x99,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0xf4,0x01,0x00,0x00,0x9b,0x03,0x00,0x00,0x94,0x03,0x00,0x00,
-0x16,0x00,0x00,0x00,0x98,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x9b,0x03,0x00,0x00,0x9a,0x03,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x90,0x03,0x00,0x00,0xf8,0x00,0x02,0x00,0x90,0x03,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_q4_K_f32_len = 9176;
-
-unsigned char mul_mat_vec_q5_0_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xfa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x28,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x55,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x55,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x55,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x57,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa2,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xa3,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xa3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xa3,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa5,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xe7,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xe7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xe7,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xe9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xf1,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x17,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x4c,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x55,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x59,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x61,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x81,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x85,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x00,0x00,0x80,0x41,
-0x1d,0x00,0x03,0x00,0xa2,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xa4,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xa4,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xad,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0xe6,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xe7,0x00,0x00,0x00,
-0xe6,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xe8,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xe8,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x2c,0x00,0x05,0x00,0x85,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
-0x59,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x61,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x61,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x79,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x81,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x8d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x50,0x00,0x05,0x00,
-0x85,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x85,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x8e,0x00,0x05,0x00,0x85,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xad,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xad,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xad,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xd0,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xd5,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd3,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xcd,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcd,0x00,0x00,0x00,
-0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xaa,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe5,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe3,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe4,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xad,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xf0,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe5,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe5,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_q5_0_f32_len = 3676;
-
-unsigned char mul_mat_vec_q5_1_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf5,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x28,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x51,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x52,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x9e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x9e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x9e,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa0,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xe3,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0xe4,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xe4,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xe4,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xe6,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xe6,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xee,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x17,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x19,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x1e,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x30,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x4c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x50,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x64,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x7c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x80,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x9d,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9e,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x9f,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xa8,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0xe3,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xe4,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xe5,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xe5,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x64,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x09,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x09,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x7c,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x09,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x80,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x50,0x00,0x05,0x00,0x80,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x80,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa8,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x51,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa8,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0xc4,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xc7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xc7,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0xad,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xc9,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xcd,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc8,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xd2,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd6,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0xc4,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xca,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xc7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xaa,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xe2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe0,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe1,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2b,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa8,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xed,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe2,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t mul_mat_vec_q5_1_f32_len = 3604;
-
-unsigned char mul_mat_vec_q5_K_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x8d,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x7f,0x04,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x80,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x81,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x83,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x84,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x84,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x84,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x85,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x86,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x86,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x86,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x88,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x88,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xf7,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xf8,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xf8,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xf8,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xfa,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xfa,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7c,0x04,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7d,0x04,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7d,0x04,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7d,0x04,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7f,0x04,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7f,0x04,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x87,0x04,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x5a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x5c,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5d,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x70,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x7d,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x80,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x81,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x83,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x84,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x85,0x00,0x00,0x00,
-0x84,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x86,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x87,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x87,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x8c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x47,0x01,0x00,0x00,
-0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x41,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbb,0x01,0x00,0x00,
-0x50,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc9,0x01,0x00,0x00,0x51,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xf7,0x01,0x00,0x00,0x5a,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xf8,0x01,0x00,0x00,0xf7,0x01,0x00,0x00,0x20,0x00,0x04,0x00,
-0xf9,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xf9,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xff,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x84,0x02,0x00,0x00,0x21,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x31,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x5d,0x04,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x5e,0x04,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0x7c,0x04,0x00,0x00,
-0x5a,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x7d,0x04,0x00,0x00,
-0x7c,0x04,0x00,0x00,0x20,0x00,0x04,0x00,0x7e,0x04,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x7d,0x04,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7e,0x04,0x00,0x00,0x7f,0x04,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0x87,0x04,0x00,0x00,
-0x5b,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x26,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x65,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x66,0x00,0x00,0x00,
-0x64,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x69,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x88,0x04,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x5c,0x04,0x00,0x00,0x6a,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x88,0x04,0x00,0x00,0x1b,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x71,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x6a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x88,0x04,0x00,0x00,0x1a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x88,0x04,0x00,0x00,0x41,0x00,0x08,0x00,
-0x8c,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x8c,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xcc,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xee,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0xae,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x05,0x01,0x00,0x00,0x04,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x05,0x01,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,0x25,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0xfb,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0xda,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0x13,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x16,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
-0x1f,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x21,0x01,0x00,0x00,0x20,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x22,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0x25,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x17,0x01,0x00,0x00,0x23,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x26,0x01,0x00,0x00,
-0x25,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0xf7,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x31,0x01,0x00,0x00,0x30,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,
-0x3b,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x3c,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x25,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x33,0x01,0x00,0x00,0x3f,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x41,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x42,0x01,0x00,0x00,
-0x41,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,
-0x49,0x01,0x00,0x00,0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x46,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x49,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x4c,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x4d,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
-0x4e,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x55,0x01,0x00,0x00,0x46,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x57,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x58,0x01,0x00,0x00,
-0x57,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x59,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x59,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x5b,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5b,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00,
-0x46,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0x63,0x01,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x47,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x64,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
-0x65,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x68,0x01,0x00,0x00,
-0x67,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x69,0x01,0x00,0x00,0x68,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00,0x46,0x00,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,
-0x71,0x01,0x00,0x00,0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x70,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x72,0x01,0x00,0x00,
-0x71,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x74,0x01,0x00,0x00,0x73,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x75,0x01,0x00,0x00,
-0x74,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x76,0x01,0x00,0x00,0x75,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x77,0x01,0x00,0x00,
-0x76,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x7e,0x01,0x00,0x00,0x49,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x87,0x01,0x00,0x00,0x56,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x88,0x01,0x00,0x00,0x87,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x90,0x01,0x00,0x00,0x63,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x91,0x01,0x00,0x00,0x90,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x99,0x01,0x00,0x00,0x71,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0x99,0x01,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x46,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0xa0,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0xa2,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xa4,0x01,0x00,0x00,0xa3,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa5,0x01,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0xa5,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xae,0x01,0x00,0x00,
-0x46,0x00,0x00,0x00,0xad,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0xaf,0x01,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x47,0x01,0x00,0x00,
-0xae,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb0,0x01,0x00,0x00,0xaf,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xb1,0x01,0x00,0x00,0xb0,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb2,0x01,0x00,0x00,
-0xb1,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x01,0x00,0x00,0xb2,0x01,0x00,0x00,0xde,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0xb4,0x01,0x00,0x00,
-0xb3,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xb5,0x01,0x00,0x00,0xb4,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbc,0x01,0x00,0x00,0x46,0x00,0x00,0x00,
-0xbb,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0xbc,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,0xbf,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x01,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xde,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0xc1,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xc3,0x01,0x00,0x00,
-0xc2,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xca,0x01,0x00,0x00,0x46,0x00,0x00,0x00,0xc9,0x01,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0xca,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xcd,0x01,0x00,0x00,
-0xcc,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xce,0x01,0x00,0x00,0xcd,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,
-0xde,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xd9,0x01,0x00,0x00,
-0xa1,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0xda,0x01,0x00,0x00,0xd9,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xe2,0x01,0x00,0x00,
-0xaf,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0xe3,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,
-0xbd,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0xf5,0x01,0x00,0x00,0xf4,0x01,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x15,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,0xfb,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,
-0xfc,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xff,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0xfe,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x03,0x02,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0x09,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x0a,0x02,0x00,0x00,0x09,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x0c,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,
-0x56,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x02,0x00,0x00,0x0c,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x02,0x00,0x00,0x0d,0x02,0x00,0x00,
-0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x0e,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x10,0x02,0x00,0x00,0x0f,0x02,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x11,0x02,0x00,0x00,0x04,0x02,0x00,0x00,
-0x10,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x12,0x02,0x00,0x00,0x11,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x18,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,
-0x50,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,
-0x19,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x18,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x19,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x22,0x02,0x00,0x00,0x40,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x9e,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x88,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x22,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x24,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x26,0x02,0x00,0x00,
-0x24,0x02,0x00,0x00,0x56,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x27,0x02,0x00,0x00,0x26,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,
-0x27,0x02,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x28,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x2a,0x02,0x00,0x00,
-0x29,0x02,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x1d,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x2c,0x02,0x00,0x00,0x2b,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x2d,0x02,0x00,0x00,
-0x1a,0x02,0x00,0x00,0x2c,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x12,0x02,0x00,0x00,
-0x2d,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x33,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0x34,0x02,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x33,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x35,0x02,0x00,0x00,
-0x34,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x37,0x02,0x00,0x00,0x69,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x38,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x02,0x00,0x00,
-0x40,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x41,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0x56,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x42,0x02,0x00,0x00,0x41,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x43,0x02,0x00,0x00,0x42,0x02,0x00,0x00,
-0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x45,0x02,0x00,0x00,0x44,0x02,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,0x38,0x02,0x00,0x00,
-0x45,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x35,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,0x6f,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0x4f,0x02,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x4e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x50,0x02,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x52,0x02,0x00,0x00,0x77,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x53,0x02,0x00,0x00,0x52,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x02,0x00,0x00,
-0x40,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,
-0x9e,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x88,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x58,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x5a,0x02,0x00,0x00,0x59,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x5c,0x02,0x00,0x00,0x5a,0x02,0x00,0x00,
-0x56,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x5d,0x02,0x00,0x00,0x5c,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5e,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,
-0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x61,0x02,0x00,0x00,0x53,0x02,0x00,0x00,
-0x60,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x61,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x50,0x02,0x00,0x00,0x62,0x02,0x00,0x00,
-0x49,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,0x42,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0x6b,0x02,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x6b,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x7f,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x75,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0xc4,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0x77,0x02,0x00,0x00,0x56,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x78,0x02,0x00,0x00,
-0x75,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x79,0x02,0x00,0x00,0x78,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7a,0x02,0x00,0x00,
-0x79,0x02,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x7a,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,
-0x7b,0x02,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x7c,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0xfe,0x01,0x00,0x00,0x84,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xff,0x01,0x00,0x00,0x86,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0x85,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x87,0x02,0x00,0x00,0x86,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x89,0x02,0x00,0x00,
-0x88,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x89,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x23,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0x97,0x02,0x00,0x00,0x96,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x98,0x02,0x00,0x00,
-0x97,0x02,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,
-0x8a,0x02,0x00,0x00,0x98,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,0x99,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,
-0x87,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x6c,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,
-0x9b,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0xfe,0x01,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0xa3,0x02,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xa6,0x02,0x00,0x00,0x91,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x02,0x00,0x00,0xa6,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0xae,0x02,0x00,0x00,
-0x3e,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0xb1,0x02,0x00,0x00,0xae,0x02,0x00,0x00,0x77,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xb2,0x02,0x00,0x00,
-0xb1,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xb2,0x02,0x00,0x00,0xab,0x00,0x05,0x00,
-0x70,0x00,0x00,0x00,0xb4,0x02,0x00,0x00,0xb3,0x02,0x00,0x00,
-0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xb5,0x02,0x00,0x00,0xb4,0x02,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb6,0x02,0x00,0x00,0xa7,0x02,0x00,0x00,0xb5,0x02,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0xb6,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0xb9,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xa4,0x02,0x00,0x00,0xb7,0x02,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xfe,0x01,0x00,0x00,0xbe,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xff,0x01,0x00,0x00,0xc0,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xc3,0x02,0x00,0x00,
-0x9a,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xcb,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0xce,0x02,0x00,0x00,
-0xcb,0x02,0x00,0x00,0x77,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xcf,0x02,0x00,0x00,0xce,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0xd1,0x02,0x00,0x00,0xd0,0x02,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xd2,0x02,0x00,0x00,
-0xd1,0x02,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd3,0x02,0x00,0x00,
-0xc4,0x02,0x00,0x00,0xd2,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xd4,0x02,0x00,0x00,0xd3,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xd4,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdb,0x02,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,
-0xdc,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0xdb,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0xdd,0x02,0x00,0x00,0xdc,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xdf,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0xe6,0x02,0x00,0x00,0x09,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0xe8,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x59,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xe9,0x02,0x00,0x00,0xe8,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xea,0x02,0x00,0x00,0xe9,0x02,0x00,0x00,
-0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0xeb,0x02,0x00,0x00,
-0xea,0x02,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xec,0x02,0x00,0x00,0xeb,0x02,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xed,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xec,0x02,0x00,0x00,0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0xee,0x02,0x00,0x00,0xed,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x02,0x00,0x00,0xdb,0x02,0x00,0x00,
-0x50,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,
-0xf5,0x02,0x00,0x00,0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf4,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0xf6,0x02,0x00,0x00,0xf5,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xf8,0x02,0x00,0x00,0xb5,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf9,0x02,0x00,0x00,
-0xf8,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x00,0x03,0x00,0x00,0x23,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0x00,0x03,0x00,0x00,
-0x59,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x03,0x03,0x00,0x00,0x02,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x04,0x03,0x00,0x00,0x03,0x03,0x00,0x00,
-0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x05,0x03,0x00,0x00,
-0x04,0x03,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x06,0x03,0x00,0x00,0x05,0x03,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x07,0x03,0x00,0x00,0xf9,0x02,0x00,0x00,
-0x06,0x03,0x00,0x00,0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x08,0x03,0x00,0x00,0x07,0x03,0x00,0x00,0x85,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x09,0x03,0x00,0x00,0xf6,0x02,0x00,0x00,
-0x08,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xdd,0x02,0x00,0x00,0xee,0x02,0x00,0x00,0x09,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0xdb,0x02,0x00,0x00,0x5f,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xff,0x01,0x00,0x00,0x10,0x03,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x11,0x03,0x00,0x00,0x10,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x13,0x03,0x00,0x00,
-0xc3,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x14,0x03,0x00,0x00,0x13,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x1b,0x03,0x00,0x00,0x3e,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,
-0x1b,0x03,0x00,0x00,0x59,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x1d,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1f,0x03,0x00,0x00,
-0x1e,0x03,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0x20,0x03,0x00,0x00,0x1f,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x21,0x03,0x00,0x00,
-0x20,0x03,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0x14,0x03,0x00,0x00,0x21,0x03,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x23,0x03,0x00,0x00,0x22,0x03,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,0x25,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x11,0x03,0x00,0x00,
-0x23,0x03,0x00,0x00,0x0a,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2a,0x03,0x00,0x00,0xdb,0x02,0x00,0x00,
-0x6f,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,
-0x2b,0x03,0x00,0x00,0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x2c,0x03,0x00,0x00,0x2b,0x03,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x2e,0x03,0x00,0x00,0xd1,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x03,0x00,0x00,
-0x2e,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x36,0x03,0x00,0x00,0x59,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x4d,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0x36,0x03,0x00,0x00,
-0x59,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x39,0x03,0x00,0x00,0x38,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x03,0x00,0x00,0x39,0x03,0x00,0x00,
-0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x03,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x03,0x00,0x00,0x2f,0x03,0x00,0x00,
-0x3c,0x03,0x00,0x00,0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x3e,0x03,0x00,0x00,0x3d,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0x40,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,0x3e,0x03,0x00,0x00,
-0x25,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x03,0x00,0x00,0xdb,0x02,0x00,0x00,0x42,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0x47,0x03,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x46,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x48,0x03,0x00,0x00,
-0x47,0x03,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x4a,0x03,0x00,0x00,0xda,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x4b,0x03,0x00,0x00,0x4a,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x51,0x03,0x00,0x00,
-0x09,0x02,0x00,0x00,0xc4,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0x53,0x03,0x00,0x00,0x59,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x54,0x03,0x00,0x00,
-0x51,0x03,0x00,0x00,0x53,0x03,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x55,0x03,0x00,0x00,0x54,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x56,0x03,0x00,0x00,
-0x55,0x03,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0x57,0x03,0x00,0x00,0x56,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x58,0x03,0x00,0x00,
-0x57,0x03,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x59,0x03,0x00,0x00,
-0x4b,0x03,0x00,0x00,0x58,0x03,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x59,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x60,0x03,0x00,0x00,
-0xdb,0x02,0x00,0x00,0x84,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xff,0x01,0x00,0x00,0x61,0x03,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0x60,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x62,0x03,0x00,0x00,0x61,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x64,0x03,0x00,0x00,
-0xe3,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x65,0x03,0x00,0x00,0x64,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0x6c,0x03,0x00,0x00,0x23,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0x6f,0x03,0x00,0x00,
-0x6c,0x03,0x00,0x00,0x53,0x03,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x70,0x03,0x00,0x00,0x6f,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x71,0x03,0x00,0x00,
-0x70,0x03,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0x72,0x03,0x00,0x00,0x71,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x73,0x03,0x00,0x00,
-0x72,0x03,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x74,0x03,0x00,0x00,
-0x65,0x03,0x00,0x00,0x73,0x03,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x75,0x03,0x00,0x00,0x74,0x03,0x00,0x00,
-0x85,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x76,0x03,0x00,0x00,
-0x62,0x03,0x00,0x00,0x75,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0x77,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x48,0x03,0x00,0x00,0x5a,0x03,0x00,0x00,
-0x76,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x7c,0x03,0x00,0x00,0xdb,0x02,0x00,0x00,0xa1,0x02,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0x7d,0x03,0x00,0x00,
-0xfa,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x7c,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x7e,0x03,0x00,0x00,
-0x7d,0x03,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x80,0x03,0x00,0x00,0xec,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x81,0x03,0x00,0x00,0x80,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x88,0x03,0x00,0x00,
-0x3e,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,
-0x8b,0x03,0x00,0x00,0x88,0x03,0x00,0x00,0x53,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x8c,0x03,0x00,0x00,
-0x8b,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8d,0x03,0x00,0x00,0x8c,0x03,0x00,0x00,0xab,0x00,0x05,0x00,
-0x70,0x00,0x00,0x00,0x8e,0x03,0x00,0x00,0x8d,0x03,0x00,0x00,
-0x16,0x00,0x00,0x00,0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x03,0x00,0x00,0x8e,0x03,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x90,0x03,0x00,0x00,0x81,0x03,0x00,0x00,0x8f,0x03,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x91,0x03,0x00,0x00,
-0x90,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0x93,0x03,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x7e,0x03,0x00,0x00,0x91,0x03,0x00,0x00,0x77,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x98,0x03,0x00,0x00,
-0xdb,0x02,0x00,0x00,0xbe,0x02,0x00,0x00,0x41,0x00,0x06,0x00,
-0xff,0x01,0x00,0x00,0x99,0x03,0x00,0x00,0xfa,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0x98,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x9a,0x03,0x00,0x00,0x99,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x9c,0x03,0x00,0x00,
-0xf5,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x03,0x00,0x00,0x9c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4d,0x00,0x00,0x00,0xa4,0x03,0x00,0x00,0x59,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x4d,0x00,0x00,0x00,0xa7,0x03,0x00,0x00,
-0xa4,0x03,0x00,0x00,0x53,0x03,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xa8,0x03,0x00,0x00,0xa7,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa9,0x03,0x00,0x00,
-0xa8,0x03,0x00,0x00,0xab,0x00,0x05,0x00,0x70,0x00,0x00,0x00,
-0xaa,0x03,0x00,0x00,0xa9,0x03,0x00,0x00,0x16,0x00,0x00,0x00,
-0xa9,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xab,0x03,0x00,0x00,
-0xaa,0x03,0x00,0x00,0x5f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x03,0x00,0x00,
-0x9d,0x03,0x00,0x00,0xab,0x03,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xad,0x03,0x00,0x00,0xac,0x03,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,0xaf,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x9a,0x03,0x00,0x00,
-0xad,0x03,0x00,0x00,0x93,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xb6,0x03,0x00,0x00,0x00,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xbd,0x03,0x00,0x00,
-0x19,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xbe,0x03,0x00,0x00,0xb6,0x03,0x00,0x00,0xbd,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xc5,0x03,0x00,0x00,
-0x34,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xc6,0x03,0x00,0x00,0xbe,0x03,0x00,0x00,0xc5,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xcd,0x03,0x00,0x00,
-0x4f,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0xce,0x03,0x00,0x00,0xc6,0x03,0x00,0x00,0xcd,0x03,0x00,0x00,
-0x70,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0xd0,0x03,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0xd8,0x03,0x00,0x00,0x6b,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xdf,0x03,0x00,0x00,0x86,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xe0,0x03,0x00,0x00,
-0xd8,0x03,0x00,0x00,0xdf,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xe7,0x03,0x00,0x00,0xa3,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xe8,0x03,0x00,0x00,
-0xe0,0x03,0x00,0x00,0xe7,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xef,0x03,0x00,0x00,0xc0,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xf0,0x03,0x00,0x00,
-0xe8,0x03,0x00,0x00,0xef,0x03,0x00,0x00,0x70,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0xf2,0x03,0x00,0x00,0xd1,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0xf3,0x03,0x00,0x00,
-0xf0,0x03,0x00,0x00,0xf2,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x5a,0x00,0x00,0x00,0xf4,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xce,0x03,0x00,0x00,0xd0,0x03,0x00,0x00,
-0xf3,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0xfa,0x03,0x00,0x00,0xdc,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x01,0x04,0x00,0x00,0xf5,0x02,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x02,0x04,0x00,0x00,
-0xfa,0x03,0x00,0x00,0x01,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x09,0x04,0x00,0x00,0x10,0x03,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x0a,0x04,0x00,0x00,
-0x02,0x04,0x00,0x00,0x09,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x11,0x04,0x00,0x00,0x2b,0x03,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x12,0x04,0x00,0x00,
-0x0a,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x70,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x14,0x04,0x00,0x00,0x26,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,0x16,0x04,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x12,0x04,0x00,0x00,
-0x14,0x04,0x00,0x00,0xf4,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x1d,0x04,0x00,0x00,0x47,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x24,0x04,0x00,0x00,
-0x61,0x03,0x00,0x00,0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x25,0x04,0x00,0x00,0x1d,0x04,0x00,0x00,0x24,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x2c,0x04,0x00,0x00,
-0x7d,0x03,0x00,0x00,0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x2d,0x04,0x00,0x00,0x25,0x04,0x00,0x00,0x2c,0x04,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x34,0x04,0x00,0x00,
-0x99,0x03,0x00,0x00,0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,
-0x35,0x04,0x00,0x00,0x2d,0x04,0x00,0x00,0x34,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x37,0x04,0x00,0x00,
-0x42,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0x39,0x04,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x35,0x04,0x00,0x00,0x37,0x04,0x00,0x00,0x16,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x41,0x04,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x45,0x04,0x00,0x00,0xb4,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x46,0x04,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x45,0x04,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0x47,0x04,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x64,0x02,0x00,0x00,0x41,0x04,0x00,0x00,0x46,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x4a,0x04,0x00,0x00,
-0xee,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0x4c,0x04,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x40,0x03,0x00,0x00,0x4a,0x04,0x00,0x00,0x47,0x04,0x00,0x00,
-0x70,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,0x4f,0x04,0x00,0x00,
-0x0a,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,
-0x51,0x04,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xaf,0x03,0x00,0x00,0x4f,0x04,0x00,0x00,0x4c,0x04,0x00,0x00,
-0x85,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x55,0x04,0x00,0x00,
-0x97,0x00,0x00,0x00,0x39,0x04,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x8c,0x04,0x00,0x00,0x55,0x04,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x5a,0x00,0x00,0x00,0x56,0x04,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x51,0x04,0x00,0x00,0x8c,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x58,0x04,0x00,0x00,0x66,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x59,0x04,0x00,0x00,
-0x58,0x04,0x00,0x00,0x56,0x04,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x66,0x00,0x00,0x00,0x59,0x04,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x04,0x00,0x00,0x88,0x04,0x00,0x00,
-0x25,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x69,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6b,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x5d,0x04,0x00,0x00,0x5d,0x04,0x00,0x00,0x5e,0x04,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x60,0x04,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x60,0x04,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x89,0x04,0x00,0x00,0x5f,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x77,0x04,0x00,0x00,0x63,0x04,0x00,0x00,0xad,0x00,0x05,0x00,
-0x70,0x00,0x00,0x00,0x66,0x04,0x00,0x00,0x89,0x04,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x62,0x04,0x00,0x00,
-0x63,0x04,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x66,0x04,0x00,0x00,0x61,0x04,0x00,0x00,0x62,0x04,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x61,0x04,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x70,0x00,0x00,0x00,0x69,0x04,0x00,0x00,0x26,0x00,0x00,0x00,
-0x89,0x04,0x00,0x00,0xf7,0x00,0x03,0x00,0x6b,0x04,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x69,0x04,0x00,0x00,
-0x6a,0x04,0x00,0x00,0x6b,0x04,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6a,0x04,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x04,0x00,0x00,0x26,0x00,0x00,0x00,0x89,0x04,0x00,0x00,
-0x41,0x00,0x05,0x00,0x65,0x00,0x00,0x00,0x70,0x04,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x6f,0x04,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x71,0x04,0x00,0x00,0x70,0x04,0x00,0x00,
-0x41,0x00,0x05,0x00,0x65,0x00,0x00,0x00,0x72,0x04,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x73,0x04,0x00,0x00,0x72,0x04,0x00,0x00,
-0x81,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x74,0x04,0x00,0x00,
-0x73,0x04,0x00,0x00,0x71,0x04,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x72,0x04,0x00,0x00,0x74,0x04,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x6b,0x04,0x00,0x00,0xf8,0x00,0x02,0x00,0x6b,0x04,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x5d,0x04,0x00,0x00,0x5d,0x04,0x00,0x00,
-0x5e,0x04,0x00,0x00,0xf9,0x00,0x02,0x00,0x63,0x04,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x63,0x04,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x77,0x04,0x00,0x00,0x89,0x04,0x00,0x00,
-0x50,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x60,0x04,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x62,0x04,0x00,0x00,0xaa,0x00,0x05,0x00,
-0x70,0x00,0x00,0x00,0x79,0x04,0x00,0x00,0x26,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x7b,0x04,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x79,0x04,0x00,0x00,
-0x7a,0x04,0x00,0x00,0x7b,0x04,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7a,0x04,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x80,0x04,0x00,0x00,0x15,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x81,0x04,0x00,0x00,
-0x80,0x04,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x83,0x04,0x00,0x00,0x81,0x04,0x00,0x00,0x11,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x65,0x00,0x00,0x00,0x84,0x04,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x5a,0x00,0x00,0x00,0x85,0x04,0x00,0x00,0x84,0x04,0x00,0x00,
-0x41,0x00,0x06,0x00,0xff,0x01,0x00,0x00,0x86,0x04,0x00,0x00,
-0x7f,0x04,0x00,0x00,0x16,0x00,0x00,0x00,0x83,0x04,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x86,0x04,0x00,0x00,0x85,0x04,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7b,0x04,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7b,0x04,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t mul_mat_vec_q5_K_f32_len = 12048;
-
-unsigned char mul_mat_vec_q6_K_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x97,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
-0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
-0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x57,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x9c,0x01,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0c,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x13,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x13,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x76,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x79,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x7b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x7b,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x7c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x7d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x7d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x7d,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x7f,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x7f,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x91,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x92,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x92,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x92,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x99,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x9a,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x9a,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x9a,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9c,0x01,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9c,0x01,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xa5,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x13,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x56,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x56,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5e,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x69,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x74,0x00,0x00,0x00,
-0x72,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x79,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x7a,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x7b,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x7c,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x7d,0x00,0x00,0x00,
-0x7c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x7e,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x7e,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x84,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x91,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x92,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x93,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x93,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xa5,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xb3,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x72,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,
-0x60,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,
-0x08,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,0x99,0x01,0x00,0x00,
-0x53,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9a,0x01,0x00,0x00,
-0x99,0x01,0x00,0x00,0x20,0x00,0x04,0x00,0x9b,0x01,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9a,0x01,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x9b,0x01,0x00,0x00,0x9c,0x01,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,
-0xa5,0x01,0x00,0x00,0x54,0x00,0x00,0x00,0xa4,0x01,0x00,0x00,
-0xa4,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8e,0x03,0x00,0x00,0x21,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8f,0x03,0x00,0x00,0x41,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x90,0x03,0x00,0x00,
-0x61,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x91,0x03,0x00,0x00,0x22,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x92,0x03,0x00,0x00,0x42,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x03,0x00,0x00,
-0x62,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x94,0x03,0x00,0x00,0x23,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x95,0x03,0x00,0x00,0x43,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x96,0x03,0x00,0x00,
-0x63,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x50,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x26,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x5e,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x5f,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x62,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x62,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa6,0x01,0x00,0x00,0x2b,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x79,0x01,0x00,0x00,0x63,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x69,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x1b,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x64,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x6a,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x63,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0xa6,0x01,0x00,0x00,0x41,0x00,0x07,0x00,
-0x84,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x7a,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xa5,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xb3,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0xd4,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xa5,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xb3,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xe7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x72,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0xfb,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0xff,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xab,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x99,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x07,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x08,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xa5,0x00,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0x10,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
-0x11,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x14,0x01,0x00,0x00,0x12,0x01,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,
-0xb4,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,0x36,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x1f,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x01,0x00,0x00,0x1f,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x28,0x01,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x29,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2b,0x01,0x00,0x00,
-0x2a,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x83,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x2c,0x01,0x00,0x00,0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,0x20,0x01,0x00,0x00,
-0x2d,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x2e,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x30,0x01,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x01,0x00,0x00,
-0x30,0x01,0x00,0x00,0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x32,0x01,0x00,0x00,0x31,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,0x34,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x32,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3c,0x01,0x00,0x00,0x99,0x00,0x00,0x00,
-0x3b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3c,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x44,0x01,0x00,0x00,0x4c,0x00,0x00,0x00,
-0x43,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0xa5,0x00,0x00,0x00,
-0x45,0x01,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x46,0x01,0x00,0x00,
-0x45,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0x46,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x47,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x4a,0x01,0x00,0x00,0x48,0x01,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x53,0x01,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x36,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x55,0x01,0x00,0x00,
-0x54,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xc0,0x00,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x5e,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x61,0x01,0x00,0x00,
-0x60,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x62,0x01,0x00,0x00,0x61,0x01,0x00,0x00,0x83,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
-0x62,0x01,0x00,0x00,0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x56,0x01,0x00,0x00,
-0x63,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x65,0x01,0x00,0x00,0x64,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,
-0x66,0x01,0x00,0x00,0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,
-0x68,0x01,0x00,0x00,0x34,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x01,0x00,0x00,0x99,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0xb8,0x01,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xb6,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0xb9,0x01,0x00,0x00,0xb8,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0xbd,0x01,0x00,0x00,0xa6,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xbe,0x01,0x00,0x00,
-0xbd,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0xbf,0x01,0x00,0x00,0xb9,0x01,0x00,0x00,0xbe,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0xc0,0x01,0x00,0x00,
-0xbf,0x01,0x00,0x00,0x87,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,0x41,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xc2,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xc5,0x01,0x00,0x00,
-0xc4,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xc6,0x01,0x00,0x00,0xc5,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x01,0x00,0x00,0xc6,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x01,0x00,0x00,
-0xc7,0x01,0x00,0x00,0xb8,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xca,0x01,0x00,0x00,0x47,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,
-0xcb,0x01,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0xca,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xcc,0x01,0x00,0x00,
-0xcb,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0xcd,0x01,0x00,0x00,0xcc,0x01,0x00,0x00,0x16,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xce,0x01,0x00,0x00,
-0xcd,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xcf,0x01,0x00,0x00,0xce,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd0,0x01,0x00,0x00,0xcf,0x01,0x00,0x00,
-0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd1,0x01,0x00,0x00,0xd0,0x01,0x00,0x00,0x36,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xd2,0x01,0x00,0x00,
-0xc8,0x01,0x00,0x00,0xd1,0x01,0x00,0x00,0x72,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0xd3,0x01,0x00,0x00,0xd2,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd4,0x01,0x00,0x00,
-0xd3,0x01,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x01,0x00,0x00,0xd4,0x01,0x00,0x00,0x43,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xd6,0x01,0x00,0x00,
-0xd5,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xdc,0x01,0x00,0x00,0x99,0x00,0x00,0x00,0x8e,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,0xdd,0x01,0x00,0x00,
-0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xdc,0x01,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xde,0x01,0x00,0x00,
-0xdd,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0xe2,0x01,0x00,0x00,0xdb,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xe3,0x01,0x00,0x00,0xe2,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0xe4,0x01,0x00,0x00,
-0xde,0x01,0x00,0x00,0xe3,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xe5,0x01,0x00,0x00,0xe4,0x01,0x00,0x00,
-0x87,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x01,0x00,0x00,0x41,0x00,0x00,0x00,0x8e,0x03,0x00,0x00,
-0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xe8,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xea,0x01,0x00,0x00,0xe9,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xeb,0x01,0x00,0x00,
-0xea,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xec,0x01,0x00,0x00,0xeb,0x01,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xed,0x01,0x00,0x00,0xec,0x01,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xf1,0x01,0x00,0x00,0xcb,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xf2,0x01,0x00,0x00,0xf1,0x01,0x00,0x00,
-0x25,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xf3,0x01,0x00,0x00,0xf2,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x01,0x00,0x00,0xf3,0x01,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf5,0x01,0x00,0x00,
-0xf4,0x01,0x00,0x00,0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf6,0x01,0x00,0x00,0xf5,0x01,0x00,0x00,
-0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf7,0x01,0x00,0x00,0xed,0x01,0x00,0x00,0xf6,0x01,0x00,0x00,
-0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0xf8,0x01,0x00,0x00,
-0xf7,0x01,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfa,0x01,0x00,0x00,0xf9,0x01,0x00,0x00,
-0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0xfb,0x01,0x00,0x00,0xfa,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xfc,0x01,0x00,0x00,0xe5,0x01,0x00,0x00,
-0xfb,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,
-0xfd,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xc0,0x01,0x00,0x00,0xd6,0x01,0x00,0x00,0xfc,0x01,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x02,0x02,0x00,0x00,
-0x99,0x00,0x00,0x00,0x8f,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9d,0x00,0x00,0x00,0x03,0x02,0x00,0x00,0x94,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x04,0x02,0x00,0x00,0x03,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x08,0x02,0x00,0x00,
-0x0f,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x09,0x02,0x00,0x00,0x08,0x02,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x0a,0x02,0x00,0x00,0x04,0x02,0x00,0x00,
-0x09,0x02,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x0b,0x02,0x00,0x00,0x0a,0x02,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x10,0x02,0x00,0x00,
-0xc4,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0x11,0x02,0x00,0x00,0x10,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x12,0x02,0x00,0x00,
-0x11,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x13,0x02,0x00,0x00,0x12,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x17,0x02,0x00,0x00,0xcb,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x18,0x02,0x00,0x00,
-0x17,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x19,0x02,0x00,0x00,0x18,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1a,0x02,0x00,0x00,
-0x19,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x1a,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x02,0x00,0x00,
-0x1b,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x02,0x00,0x00,0x13,0x02,0x00,0x00,
-0x1c,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x1e,0x02,0x00,0x00,0x1d,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x02,0x00,0x00,0x1e,0x02,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x20,0x02,0x00,0x00,
-0x1f,0x02,0x00,0x00,0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x21,0x02,0x00,0x00,0x20,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,0x23,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x0b,0x02,0x00,0x00,
-0x21,0x02,0x00,0x00,0xfd,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x28,0x02,0x00,0x00,0x99,0x00,0x00,0x00,
-0x90,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0x29,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x28,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x2a,0x02,0x00,0x00,0x29,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x2e,0x02,0x00,0x00,0x45,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x2e,0x02,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x30,0x02,0x00,0x00,0x2a,0x02,0x00,0x00,0x2f,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x31,0x02,0x00,0x00,
-0x30,0x02,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x36,0x02,0x00,0x00,0xe9,0x01,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x37,0x02,0x00,0x00,
-0x36,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x38,0x02,0x00,0x00,0x37,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x02,0x00,0x00,
-0x38,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x3d,0x02,0x00,0x00,0xcb,0x01,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0x3e,0x02,0x00,0x00,0x3d,0x02,0x00,0x00,
-0x43,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x3f,0x02,0x00,0x00,0x3e,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x40,0x02,0x00,0x00,0x3f,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x02,0x00,0x00,
-0x40,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x02,0x00,0x00,0x41,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x02,0x00,0x00,0x39,0x02,0x00,0x00,0x42,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x44,0x02,0x00,0x00,
-0x43,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x45,0x02,0x00,0x00,0x44,0x02,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x46,0x02,0x00,0x00,0x45,0x02,0x00,0x00,
-0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x47,0x02,0x00,0x00,0x46,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x53,0x00,0x00,0x00,0x49,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x31,0x02,0x00,0x00,0x47,0x02,0x00,0x00,
-0x23,0x02,0x00,0x00,0x81,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x4a,0x02,0x00,0x00,0x6a,0x01,0x00,0x00,0x49,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x56,0x02,0x00,0x00,
-0x99,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9d,0x00,0x00,0x00,0x58,0x02,0x00,0x00,0x94,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x56,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x59,0x02,0x00,0x00,0x58,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x5d,0x02,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x5d,0x02,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x5f,0x02,0x00,0x00,0x59,0x02,0x00,0x00,
-0x5e,0x02,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x60,0x02,0x00,0x00,0x5f,0x02,0x00,0x00,0x87,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x62,0x02,0x00,0x00,
-0x41,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xb3,0x00,0x00,0x00,0x64,0x02,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x62,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x65,0x02,0x00,0x00,0x64,0x02,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x66,0x02,0x00,0x00,0x65,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x67,0x02,0x00,0x00,
-0x66,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x02,0x00,0x00,0x67,0x02,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x02,0x00,0x00,
-0x47,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x41,0x00,0x08,0x00,
-0xb3,0x00,0x00,0x00,0x6b,0x02,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x6a,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x6c,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0x6d,0x02,0x00,0x00,0x6c,0x02,0x00,0x00,
-0x16,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x6e,0x02,0x00,0x00,0x6d,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x6f,0x02,0x00,0x00,0x6e,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x70,0x02,0x00,0x00,
-0x6f,0x02,0x00,0x00,0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x71,0x02,0x00,0x00,0x70,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x72,0x02,0x00,0x00,0x68,0x02,0x00,0x00,0x71,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x73,0x02,0x00,0x00,
-0x72,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x74,0x02,0x00,0x00,0x73,0x02,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x75,0x02,0x00,0x00,0x74,0x02,0x00,0x00,
-0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x76,0x02,0x00,0x00,0x75,0x02,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7c,0x02,0x00,0x00,0x99,0x00,0x00,0x00,
-0x91,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0x7d,0x02,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x7c,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x7e,0x02,0x00,0x00,0x7d,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x82,0x02,0x00,0x00,0xdb,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x83,0x02,0x00,0x00,
-0x82,0x02,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x84,0x02,0x00,0x00,0x7e,0x02,0x00,0x00,0x83,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x85,0x02,0x00,0x00,
-0x84,0x02,0x00,0x00,0x87,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x88,0x02,0x00,0x00,0x41,0x00,0x00,0x00,
-0x91,0x03,0x00,0x00,0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,
-0x89,0x02,0x00,0x00,0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x88,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x8a,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x8b,0x02,0x00,0x00,0x8a,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x8c,0x02,0x00,0x00,0x8b,0x02,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x8c,0x02,0x00,0x00,0xb8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x91,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x92,0x02,0x00,0x00,
-0x91,0x02,0x00,0x00,0x25,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x93,0x02,0x00,0x00,0x92,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x94,0x02,0x00,0x00,
-0x93,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x95,0x02,0x00,0x00,0x94,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x96,0x02,0x00,0x00,
-0x95,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x97,0x02,0x00,0x00,0x8d,0x02,0x00,0x00,
-0x96,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x98,0x02,0x00,0x00,0x97,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x99,0x02,0x00,0x00,0x98,0x02,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x99,0x02,0x00,0x00,0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x9b,0x02,0x00,0x00,0x9a,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x9c,0x02,0x00,0x00,
-0x85,0x02,0x00,0x00,0x9b,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x53,0x00,0x00,0x00,0x9d,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x60,0x02,0x00,0x00,0x76,0x02,0x00,0x00,
-0x9c,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa2,0x02,0x00,0x00,0x99,0x00,0x00,0x00,0x92,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,0xa3,0x02,0x00,0x00,
-0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xa2,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xa4,0x02,0x00,0x00,
-0xa3,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0xa8,0x02,0x00,0x00,0x0f,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xa9,0x02,0x00,0x00,0xa8,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0xaa,0x02,0x00,0x00,
-0xa4,0x02,0x00,0x00,0xa9,0x02,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xab,0x02,0x00,0x00,0xaa,0x02,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0xb0,0x02,0x00,0x00,0x64,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0xb1,0x02,0x00,0x00,0xb0,0x02,0x00,0x00,
-0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0xb2,0x02,0x00,0x00,0xb1,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x02,0x00,0x00,0xb2,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xb7,0x02,0x00,0x00,
-0x6b,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0xb8,0x02,0x00,0x00,0xb7,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xb9,0x02,0x00,0x00,
-0xb8,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xba,0x02,0x00,0x00,0xb9,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xbb,0x02,0x00,0x00,0xba,0x02,0x00,0x00,
-0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xbc,0x02,0x00,0x00,0xbb,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xbd,0x02,0x00,0x00,
-0xb3,0x02,0x00,0x00,0xbc,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0xbe,0x02,0x00,0x00,0xbd,0x02,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbf,0x02,0x00,0x00,
-0xbe,0x02,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x02,0x00,0x00,0xbf,0x02,0x00,0x00,0x43,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xc1,0x02,0x00,0x00,
-0xc0,0x02,0x00,0x00,0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,
-0xc3,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xab,0x02,0x00,0x00,0xc1,0x02,0x00,0x00,0x9d,0x02,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,
-0x99,0x00,0x00,0x00,0x93,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9d,0x00,0x00,0x00,0xc9,0x02,0x00,0x00,0x94,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xc8,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xca,0x02,0x00,0x00,0xc9,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0xce,0x02,0x00,0x00,
-0x45,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0xcf,0x02,0x00,0x00,0xce,0x02,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xd0,0x02,0x00,0x00,0xca,0x02,0x00,0x00,
-0xcf,0x02,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0xd1,0x02,0x00,0x00,0xd0,0x02,0x00,0x00,0x87,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0xd6,0x02,0x00,0x00,
-0x89,0x02,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0xd7,0x02,0x00,0x00,0xd6,0x02,0x00,0x00,0x36,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0xd8,0x02,0x00,0x00,
-0xd7,0x02,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x02,0x00,0x00,0xd8,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0xdd,0x02,0x00,0x00,0x6b,0x02,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0xde,0x02,0x00,0x00,
-0xdd,0x02,0x00,0x00,0x43,0x01,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0xdf,0x02,0x00,0x00,0xde,0x02,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe0,0x02,0x00,0x00,
-0xdf,0x02,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe1,0x02,0x00,0x00,0xe0,0x02,0x00,0x00,0x83,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe2,0x02,0x00,0x00,
-0xe1,0x02,0x00,0x00,0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x02,0x00,0x00,0xd9,0x02,0x00,0x00,
-0xe2,0x02,0x00,0x00,0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0xe4,0x02,0x00,0x00,0xe3,0x02,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x02,0x00,0x00,0xe4,0x02,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe6,0x02,0x00,0x00,
-0xe5,0x02,0x00,0x00,0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xe7,0x02,0x00,0x00,0xe6,0x02,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,0xe9,0x02,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xd1,0x02,0x00,0x00,
-0xe7,0x02,0x00,0x00,0xc3,0x02,0x00,0x00,0x81,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0xea,0x02,0x00,0x00,0x4a,0x02,0x00,0x00,
-0xe9,0x02,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x02,0x00,0x00,0x99,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,0xf8,0x02,0x00,0x00,
-0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xf6,0x02,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0xf9,0x02,0x00,0x00,
-0xf8,0x02,0x00,0x00,0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0xfd,0x02,0x00,0x00,0xa6,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xfe,0x02,0x00,0x00,0xfd,0x02,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0xff,0x02,0x00,0x00,
-0xf9,0x02,0x00,0x00,0xfe,0x02,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0xff,0x02,0x00,0x00,
-0x87,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x02,0x03,0x00,0x00,0x41,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,0x04,0x03,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x02,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x05,0x03,0x00,0x00,0x04,0x03,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x06,0x03,0x00,0x00,
-0x05,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x07,0x03,0x00,0x00,0x06,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x08,0x03,0x00,0x00,0x07,0x03,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0a,0x03,0x00,0x00,0x47,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0xb3,0x00,0x00,0x00,0x0b,0x03,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x0a,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x0c,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x0c,0x03,0x00,0x00,0x16,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0e,0x03,0x00,0x00,0x0d,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x03,0x00,0x00,
-0x0e,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x10,0x03,0x00,0x00,0x0f,0x03,0x00,0x00,0x83,0x00,0x00,0x00,
-0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x11,0x03,0x00,0x00,
-0x10,0x03,0x00,0x00,0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x12,0x03,0x00,0x00,0x08,0x03,0x00,0x00,
-0x11,0x03,0x00,0x00,0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x13,0x03,0x00,0x00,0x12,0x03,0x00,0x00,0x72,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x14,0x03,0x00,0x00,0x13,0x03,0x00,0x00,
-0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x15,0x03,0x00,0x00,
-0x14,0x03,0x00,0x00,0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x16,0x03,0x00,0x00,0x15,0x03,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,
-0x99,0x00,0x00,0x00,0x94,0x03,0x00,0x00,0x41,0x00,0x06,0x00,
-0x9d,0x00,0x00,0x00,0x1d,0x03,0x00,0x00,0x94,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x1c,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x1e,0x03,0x00,0x00,0x1d,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x22,0x03,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x23,0x03,0x00,0x00,0x22,0x03,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x24,0x03,0x00,0x00,0x1e,0x03,0x00,0x00,
-0x23,0x03,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x25,0x03,0x00,0x00,0x24,0x03,0x00,0x00,0x87,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x28,0x03,0x00,0x00,
-0x41,0x00,0x00,0x00,0x94,0x03,0x00,0x00,0x41,0x00,0x08,0x00,
-0xb3,0x00,0x00,0x00,0x29,0x03,0x00,0x00,0x7f,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x28,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x2a,0x03,0x00,0x00,0x29,0x03,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x2b,0x03,0x00,0x00,0x2a,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2c,0x03,0x00,0x00,
-0x2b,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2d,0x03,0x00,0x00,0x2c,0x03,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x31,0x03,0x00,0x00,
-0x0b,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0x32,0x03,0x00,0x00,0x31,0x03,0x00,0x00,0x25,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x33,0x03,0x00,0x00,
-0x32,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x34,0x03,0x00,0x00,0x33,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x35,0x03,0x00,0x00,0x34,0x03,0x00,0x00,
-0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x36,0x03,0x00,0x00,0x35,0x03,0x00,0x00,0x36,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x37,0x03,0x00,0x00,
-0x2d,0x03,0x00,0x00,0x36,0x03,0x00,0x00,0x72,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x38,0x03,0x00,0x00,0x37,0x03,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x39,0x03,0x00,0x00,
-0x38,0x03,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x39,0x03,0x00,0x00,0x43,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x3a,0x03,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x3c,0x03,0x00,0x00,0x25,0x03,0x00,0x00,0x3b,0x03,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,0x3d,0x03,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x00,0x03,0x00,0x00,
-0x16,0x03,0x00,0x00,0x3c,0x03,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x03,0x00,0x00,0x99,0x00,0x00,0x00,
-0x95,0x03,0x00,0x00,0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,
-0x43,0x03,0x00,0x00,0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x42,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x44,0x03,0x00,0x00,0x43,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x48,0x03,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x49,0x03,0x00,0x00,
-0x48,0x03,0x00,0x00,0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,
-0x4a,0x03,0x00,0x00,0x44,0x03,0x00,0x00,0x49,0x03,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x4b,0x03,0x00,0x00,
-0x4a,0x03,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x50,0x03,0x00,0x00,0x04,0x03,0x00,0x00,
-0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x51,0x03,0x00,0x00,
-0x50,0x03,0x00,0x00,0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x52,0x03,0x00,0x00,0x51,0x03,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x03,0x00,0x00,
-0x52,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x57,0x03,0x00,0x00,0x0b,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0x58,0x03,0x00,0x00,0x57,0x03,0x00,0x00,
-0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x59,0x03,0x00,0x00,0x58,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x03,0x00,0x00,0x59,0x03,0x00,0x00,
-0xc7,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5b,0x03,0x00,0x00,
-0x5a,0x03,0x00,0x00,0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x03,0x00,0x00,0x5b,0x03,0x00,0x00,
-0x36,0x00,0x00,0x00,0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5d,0x03,0x00,0x00,0x53,0x03,0x00,0x00,0x5c,0x03,0x00,0x00,
-0x72,0x00,0x04,0x00,0x77,0x00,0x00,0x00,0x5e,0x03,0x00,0x00,
-0x5d,0x03,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x5f,0x03,0x00,0x00,0x5e,0x03,0x00,0x00,0x82,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x60,0x03,0x00,0x00,0x5f,0x03,0x00,0x00,
-0x43,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x61,0x03,0x00,0x00,0x60,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x53,0x00,0x00,0x00,0x63,0x03,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x03,0x00,0x00,0x61,0x03,0x00,0x00,
-0x3d,0x03,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x03,0x00,0x00,0x99,0x00,0x00,0x00,0x96,0x03,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,0x69,0x03,0x00,0x00,
-0x94,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x68,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x6a,0x03,0x00,0x00,
-0x69,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,0x77,0x00,0x00,0x00,
-0x6e,0x03,0x00,0x00,0x45,0x01,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x6f,0x03,0x00,0x00,0x6e,0x03,0x00,0x00,
-0x85,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x70,0x03,0x00,0x00,
-0x6a,0x03,0x00,0x00,0x6f,0x03,0x00,0x00,0x85,0x00,0x05,0x00,
-0x53,0x00,0x00,0x00,0x71,0x03,0x00,0x00,0x70,0x03,0x00,0x00,
-0x87,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x76,0x03,0x00,0x00,0x29,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0x77,0x03,0x00,0x00,0x76,0x03,0x00,0x00,
-0x36,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x78,0x03,0x00,0x00,0x77,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x79,0x03,0x00,0x00,0x78,0x03,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x7d,0x03,0x00,0x00,
-0x0b,0x03,0x00,0x00,0xc2,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0x7e,0x03,0x00,0x00,0x7d,0x03,0x00,0x00,0x43,0x01,0x00,0x00,
-0x71,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x7f,0x03,0x00,0x00,
-0x7e,0x03,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x80,0x03,0x00,0x00,0x7f,0x03,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x81,0x03,0x00,0x00,0x80,0x03,0x00,0x00,
-0x83,0x00,0x00,0x00,0xc4,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x82,0x03,0x00,0x00,0x81,0x03,0x00,0x00,0x36,0x00,0x00,0x00,
-0xc5,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x83,0x03,0x00,0x00,
-0x79,0x03,0x00,0x00,0x82,0x03,0x00,0x00,0x72,0x00,0x04,0x00,
-0x77,0x00,0x00,0x00,0x84,0x03,0x00,0x00,0x83,0x03,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x85,0x03,0x00,0x00,
-0x84,0x03,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x86,0x03,0x00,0x00,0x85,0x03,0x00,0x00,0x43,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x53,0x00,0x00,0x00,0x87,0x03,0x00,0x00,
-0x86,0x03,0x00,0x00,0x0c,0x00,0x08,0x00,0x53,0x00,0x00,0x00,
-0x89,0x03,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x71,0x03,0x00,0x00,0x87,0x03,0x00,0x00,0x63,0x03,0x00,0x00,
-0x81,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x8a,0x03,0x00,0x00,
-0xea,0x02,0x00,0x00,0x89,0x03,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x75,0x01,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x76,0x01,0x00,0x00,
-0x75,0x01,0x00,0x00,0x8a,0x03,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x5f,0x00,0x00,0x00,0x76,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0xa6,0x01,0x00,0x00,
-0x25,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x62,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x64,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x7a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,0x7b,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x7d,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7d,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0x58,0x00,0x00,0x00,0x64,0x00,0x00,0x00,
-0x94,0x01,0x00,0x00,0x80,0x01,0x00,0x00,0xad,0x00,0x05,0x00,
-0x69,0x00,0x00,0x00,0x83,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x7f,0x01,0x00,0x00,
-0x80,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x83,0x01,0x00,0x00,0x7e,0x01,0x00,0x00,0x7f,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7e,0x01,0x00,0x00,0xb1,0x00,0x05,0x00,
-0x69,0x00,0x00,0x00,0x86,0x01,0x00,0x00,0x26,0x00,0x00,0x00,
-0xa7,0x01,0x00,0x00,0xf7,0x00,0x03,0x00,0x88,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x86,0x01,0x00,0x00,
-0x87,0x01,0x00,0x00,0x88,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x87,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8c,0x01,0x00,0x00,0x26,0x00,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x5e,0x00,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x57,0x00,0x00,0x00,0x8c,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x8e,0x01,0x00,0x00,0x8d,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x5e,0x00,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x57,0x00,0x00,0x00,0x26,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x90,0x01,0x00,0x00,0x8f,0x01,0x00,0x00,
-0x81,0x00,0x05,0x00,0x53,0x00,0x00,0x00,0x91,0x01,0x00,0x00,
-0x90,0x01,0x00,0x00,0x8e,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x8f,0x01,0x00,0x00,0x91,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x88,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x88,0x01,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x7a,0x01,0x00,0x00,0x7a,0x01,0x00,0x00,
-0x7b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x80,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x80,0x01,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x94,0x01,0x00,0x00,0xa7,0x01,0x00,0x00,
-0x95,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x7d,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7f,0x01,0x00,0x00,0xaa,0x00,0x05,0x00,
-0x69,0x00,0x00,0x00,0x96,0x01,0x00,0x00,0x26,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x98,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x96,0x01,0x00,0x00,
-0x97,0x01,0x00,0x00,0x98,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x97,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x15,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9e,0x01,0x00,0x00,
-0x9d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xa0,0x01,0x00,0x00,0x9e,0x01,0x00,0x00,0x11,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x5e,0x00,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x57,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0xa2,0x01,0x00,0x00,0xa1,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x9d,0x00,0x00,0x00,0xa3,0x01,0x00,0x00,
-0x9c,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0xa0,0x01,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xa3,0x01,0x00,0x00,0xa2,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x98,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x98,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t mul_mat_vec_q6_K_f32_len = 10992;
-
-unsigned char mul_mat_vec_q8_0_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xc9,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x11,0x00,0x02,0x00,0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,
-0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,
-0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,
-0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,
-0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,
-0x00,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x0c,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x13,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x28,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x28,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x50,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x51,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x54,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x73,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x73,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x73,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x75,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xba,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xbc,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xc4,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x17,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x19,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x1e,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x30,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x4d,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x4f,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x1e,0x00,0x04,0x00,0x50,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x51,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x52,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x5a,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x72,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x73,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x74,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x74,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x7d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x08,0x01,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xba,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xbb,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xbb,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x0a,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0e,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x31,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x87,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x8b,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x87,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x82,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x41,0x00,0x07,0x00,0x56,0x00,0x00,0x00,0x57,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0x58,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x4e,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x61,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x5f,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x54,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x40,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x72,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x50,0x00,0x05,0x00,0x5a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x8e,0x00,0x05,0x00,
-0x5a,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x59,0x00,0x00,0x00,0x51,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x79,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x7d,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x17,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x51,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x7d,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x17,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x1f,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x98,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x22,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x24,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x99,0x00,0x00,0x00,
-0x99,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x9d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x9d,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0xa0,0x00,0x00,0x00,0xad,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x9f,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xa3,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x9e,0x00,0x00,0x00,0xb1,0x00,0x05,0x00,0x30,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xa8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xa6,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa7,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x1e,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xa8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xa8,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x99,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x9a,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xa0,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa0,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x9d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x9f,0x00,0x00,0x00,0xaa,0x00,0x05,0x00,
-0x30,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xb8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xb6,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb7,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2b,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x7d,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xb8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xb8,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t mul_mat_vec_q8_0_f32_len = 3120;
-
-unsigned char norm_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xb5,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0b,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x27,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x27,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x27,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x27,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x27,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x33,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x34,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x36,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x36,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x95,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x97,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x97,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xad,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x1c,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2c,0x00,0x05,0x00,
-0x15,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1d,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x27,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x28,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x28,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x2f,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x33,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x34,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x35,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x42,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0xac,0xc5,0x27,0x37,0x1d,0x00,0x03,0x00,0x94,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x95,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x96,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x96,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1e,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xae,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x23,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x22,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x3d,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x42,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x81,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x43,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x42,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x14,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x4c,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x21,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x23,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x57,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x57,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x2a,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x5a,0x00,0x00,0x00,0xad,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x59,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0x61,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x63,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x61,0x00,0x00,0x00,
-0x62,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x62,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1d,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x15,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x15,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x63,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x63,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x52,0x00,0x00,0x00,
-0x52,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x5a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x5a,0x00,0x00,0x00,
-0xc3,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x57,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x59,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x42,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x73,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x88,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x42,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
-0x4b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x7b,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,0x88,0x00,0x05,0x00,
-0x14,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x7f,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x14,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x14,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x8b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8b,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x59,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x8d,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x93,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x3d,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xa7,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x3d,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x8b,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x8d,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t norm_f32_len = 2572;
-
-unsigned char relu_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x1f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x20,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x27,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x27,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x31,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x12,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x20,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x24,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x25,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x32,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x33,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x32,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x29,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x2e,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x32,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x32,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t relu_f32_len = 1212;
-
-unsigned char rms_norm_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x9e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0b,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x25,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x25,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x25,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x31,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x32,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x32,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x32,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x34,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x34,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x82,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x83,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x83,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x83,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x85,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x85,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x14,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x17,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x25,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x28,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x28,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x2d,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x31,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x32,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x33,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x33,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x3b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x48,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x28,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x28,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x28,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x72,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x82,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x83,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x84,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x84,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x99,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1f,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x20,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x3b,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x3c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x14,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,
-0x44,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x1f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x21,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x48,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x4d,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x28,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x4c,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x50,0x00,0x00,0x00,0xad,0x00,0x05,0x00,0x2d,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x4f,0x00,0x00,0x00,0x50,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x53,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x4e,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x2d,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x59,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x57,0x00,0x00,0x00,
-0x58,0x00,0x00,0x00,0x59,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x58,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x62,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x59,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x59,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x48,0x00,0x00,0x00,
-0x48,0x00,0x00,0x00,0x49,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x50,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x50,0x00,0x00,0x00,
-0xc3,0x00,0x05,0x00,0x28,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x4d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x4f,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x1b,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x88,0x00,0x05,0x00,0x14,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,
-0x14,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x79,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x79,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0x2d,0x00,0x00,0x00,
-0x81,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x7b,0x00,0x00,0x00,0x7a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x81,0x00,0x00,0x00,
-0x7a,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x7a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x3b,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x14,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x14,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x3b,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x79,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x7b,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t rms_norm_f32_len = 2344;
-
-unsigned char rope_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x1c,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x67,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x88,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x89,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x89,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x89,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x8b,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8b,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xaa,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xab,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xab,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xab,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xad,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xad,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xbb,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xbb,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xbb,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xbd,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xd5,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x00,0x00,0x80,0x3f,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x1e,0x00,0x09,0x00,0x2a,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0xcd,0xcc,0xcc,0x3d,
-0x17,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x65,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x69,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x88,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x8a,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x8d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0xa9,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xac,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xab,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xac,0x00,0x00,0x00,
-0xad,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xb0,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xba,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xbb,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xbc,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xbc,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x65,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xd6,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x6e,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd7,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x69,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x69,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x72,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x3c,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x77,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x75,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x76,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xd6,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x8d,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x95,0x00,0x00,0x00,
-0x9c,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xe4,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0xb7,0x00,0x05,0x00,
-0x3c,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x02,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xe9,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xea,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,
-0xeb,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x2f,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,
-0x0f,0x01,0x00,0x00,0xec,0x00,0x00,0x00,0x83,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0xee,0x00,0x00,0x00,
-0xec,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x11,0x01,0x00,0x00,0x88,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x10,0x01,0x00,0x00,
-0x12,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x15,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x17,0x01,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1a,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x1a,0x01,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0xf9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x06,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0xe2,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x02,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x02,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x19,0x01,0x00,0x00,0xe2,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0xea,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x18,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x08,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0d,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x08,0x01,0x00,0x00,
-0x19,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xb0,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xb0,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x06,0x01,0x00,0x00,0x1b,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0xc6,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xb0,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xc8,0x00,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0x0a,0x01,0x00,0x00,0xd0,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
-0xa9,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xb0,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd6,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd6,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t rope_f16_len = 3156;
-
-unsigned char rope_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x17,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x67,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x88,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x89,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x89,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x89,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8b,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x8b,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xa9,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xaa,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xaa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xaa,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xac,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xac,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xb7,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xb8,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xb8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xb8,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xba,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xba,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x1e,0x00,0x09,0x00,0x2a,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x3c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0xcd,0xcc,0xcc,0x3d,0x17,0x00,0x04,0x00,0x65,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x65,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x69,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x72,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x88,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x89,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x8a,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x89,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x8a,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x8d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xa9,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xaa,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xab,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xab,0x00,0x00,0x00,0xac,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xaf,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xb7,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xb8,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xb9,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xb9,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,0x65,0x00,0x00,0x00,
-0xd0,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xd1,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x6e,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd2,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x69,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x6a,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x69,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x72,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x77,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x75,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x76,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd1,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x72,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x84,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x8d,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
-0x8b,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
-0x8e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x92,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x88,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x92,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
-0xe3,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xfd,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xe4,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe5,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x2f,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x6c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,
-0x09,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,0xe7,0x00,0x00,0x00,
-0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0xe9,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,
-0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x0b,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x0e,0x01,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x10,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xed,0x00,0x00,0x00,
-0x12,0x01,0x00,0x00,0xe3,0x00,0x00,0x00,0x83,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0xe3,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x9e,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xfd,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xfd,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0xfc,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0xe0,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,
-0xe5,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x13,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x13,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x14,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0xaf,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x7f,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xaf,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0x16,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0xaf,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0xba,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x7f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xaf,0x00,0x00,0x00,0xce,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xd1,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xd1,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t rope_f32_len = 3072;
-
-unsigned char rope_neox_f16_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x5f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,
-0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c,
-0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00,
-0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0xcf,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2a,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x68,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x95,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x96,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x96,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x98,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x98,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x9c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x9c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xcc,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0xcd,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xcd,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xcf,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xcf,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x16,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,
-0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x00,0x00,0x80,0x3f,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x1e,0x00,0x0c,0x00,0x2a,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0xcd,0xcc,0xcc,0x3d,0x17,0x00,0x04,0x00,0x66,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x67,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x6a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x73,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x94,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x96,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x97,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9c,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xa1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x94,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0xcc,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xce,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xcd,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xce,0x00,0x00,0x00,
-0xcf,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xd1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x66,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x69,0x00,0x00,0x00,
-0x15,0x01,0x00,0x00,0x69,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x00,0x00,0x00,0x3f,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x17,0x01,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x6f,0x00,0x00,0x00,
-0x18,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x18,0x01,0x00,0x00,
-0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x6a,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x73,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x3c,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x78,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x76,0x00,0x00,0x00,
-0x77,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x77,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x17,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x73,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0xac,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x85,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x87,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x85,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x87,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x86,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x8c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,
-0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
-0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa1,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x69,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,
-0xa9,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0xa6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa1,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x98,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x17,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x87,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0xb6,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0xbf,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0xc8,0x00,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
-0xc5,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0xd1,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x6f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,
-0x6d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0x22,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00,
-0x22,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x26,0x01,0x00,0x00,0xd8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
-0x29,0x01,0x00,0x00,0x20,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2a,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x43,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x2f,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x4f,0x01,0x00,0x00,0xe5,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x2f,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
-0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x51,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x56,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x58,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00,
-0x58,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x83,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x57,0x01,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0x36,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x5d,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00,
-0xe2,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x36,0x01,0x00,0x00,
-0x39,0x01,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x3d,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
-0x3e,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x40,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x43,0x01,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x43,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
-0x87,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0x87,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,
-0x2b,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x45,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x59,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x47,0x01,0x00,0x00,0x45,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x59,0x01,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x49,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa1,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x94,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0xf1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0xf7,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa1,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x94,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xfb,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x03,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,
-0x73,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x05,0x01,0x00,0x00,
-0x04,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,
-0x06,0x01,0x00,0x00,0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0xb9,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x06,0x01,0x00,0x00,
-0x05,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x11,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,0x47,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x73,0x00,0x04,0x00,
-0x94,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x12,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
-0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x14,0x01,0x00,0x00,0x13,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x17,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x17,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t rope_neox_f16_len = 3876;
-
-unsigned char rope_neox_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x5a,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x97,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2a,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x68,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x95,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x95,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x97,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x97,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x9b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x9b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x9d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xcc,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xcc,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xce,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xce,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x11,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x16,0x00,0x03,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x1c,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x1e,0x00,0x0c,0x00,0x2a,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2f,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
-0x3c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0xcd,0xcc,0xcc,0x3d,0x17,0x00,0x04,0x00,
-0x66,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x67,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x66,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x67,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x6a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0x6f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x73,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x94,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x95,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x96,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x9b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x9c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa0,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xcb,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xcc,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xcd,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0xcd,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x00,0x01,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x66,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
-0x69,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x69,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00,
-0x00,0x00,0x00,0x3f,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x12,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x6f,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x13,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x76,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x78,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x12,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x07,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0xac,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,
-0x85,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x87,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x85,0x00,0x00,0x00,0x86,0x00,0x00,0x00,
-0x87,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x86,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,
-0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x93,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa0,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x12,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
-0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0xb4,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0xb7,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x73,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0xbb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x71,0x00,0x00,0x00,
-0xbd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0xc1,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xc4,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
-0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0xc7,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0xc8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x56,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xd0,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xd1,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x57,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
-0xe0,0x00,0x00,0x00,0x6d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,
-0xe4,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x2f,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x21,0x01,0x00,0x00,0xd7,0x00,0x00,0x00,
-0xe1,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,
-0x23,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00,
-0x23,0x01,0x00,0x00,0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,
-0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x20,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x3e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x25,0x01,0x00,0x00,0x26,0x01,0x00,0x00,
-0x3e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0x27,0x01,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
-0x27,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,
-0x29,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x86,0x00,0x05,0x00,
-0x07,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,0xe4,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x4b,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,0x83,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00,
-0x28,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x4d,0x01,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x4f,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x83,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x52,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x2e,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00,
-0x52,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x06,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x24,0x01,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x34,0x01,0x00,0x00,0xe1,0x00,0x00,0x00,0x2e,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x21,0x01,0x00,0x00,
-0x31,0x01,0x00,0x00,0x34,0x01,0x00,0x00,0x88,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,
-0xd7,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x39,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x38,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0x3b,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x53,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x3e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x3e,0x01,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x55,0x01,0x00,0x00,
-0x1e,0x01,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
-0x26,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x21,0x01,0x00,0x00,0x87,0x00,0x00,0x00,
-0x35,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,
-0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
-0x55,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,
-0x44,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,
-0x54,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x46,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x55,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xef,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0xef,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0xb8,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x00,0x01,0x00,0x00,
-0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
-0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0xa0,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x97,0x00,0x00,0x00,
-0x42,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x02,0x01,0x00,0x00,0x01,0x01,0x00,0x00,0x85,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0xf8,0x00,0x00,0x00,
-0x42,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,
-0x0e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xf0,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x0d,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0x0f,0x01,0x00,0x00,
-0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x12,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x12,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t rope_neox_f32_len = 3792;
-
-unsigned char scale_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x37,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x1f,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x20,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x25,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x27,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x27,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x12,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x1f,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x20,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x21,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x24,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x25,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2d,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x35,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x36,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,
-0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x35,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x29,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x31,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x35,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x35,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t scale_f32_len = 1256;
-
-unsigned char silu_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x3b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x38,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x12,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x21,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x29,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x36,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
-0x37,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x39,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x3a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x39,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x7f,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x0c,0x00,0x06,0x00,0x11,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x88,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x35,0x00,0x00,0x00,
-0x34,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x39,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x39,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
-0x38,0x00,0x01,0x00,
-};
-const uint64_t silu_f32_len = 1264;
-
-unsigned char soft_max_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x0e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x0c,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x11,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x17,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x3a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x3b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x3b,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x3b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x53,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x04,0x00,0x54,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x54,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x54,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x56,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x56,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xbf,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0xc0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0xc0,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0xc2,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0xc2,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xff,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x16,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,
-0x17,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x18,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x18,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x1c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x1c,0x00,0x04,0x00,
-0x21,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x22,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x22,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x27,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x34,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x3a,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x3b,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x3c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x3c,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x44,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x48,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x53,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x54,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x55,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x54,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x55,0x00,0x00,0x00,0x56,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0x08,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x1d,0x00,0x03,0x00,
-0xbf,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0xc0,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0xc1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0xc1,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0xff,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0xfe,0x00,0x00,0x00,0xfe,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0x00,0x00,0x80,0xff,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x1d,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x27,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x28,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x2e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x1c,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0x34,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x33,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x2c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x44,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x43,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x48,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0xac,0x00,0x05,0x00,0x34,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x52,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x4e,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x5f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x51,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,0x5a,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x44,0x00,0x00,0x00,
-0x5d,0x00,0x00,0x00,0x56,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x5c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0x5e,0x00,0x00,0x00,0x5d,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x52,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x5f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x52,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x52,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x16,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x5e,0x00,0x00,0x00,0x51,0x00,0x00,0x00,
-0x60,0x00,0x00,0x00,0x5f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x16,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x09,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x16,0x00,0x00,0x00,
-0x63,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x39,0x00,0x00,0x00,0x62,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x28,0x00,0x00,0x00,0x63,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2e,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x66,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x20,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x2b,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2d,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x67,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x6c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x6c,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x1a,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,
-0xad,0x00,0x05,0x00,0x34,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0x31,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0x6e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x72,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x6d,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x01,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0x34,0x00,0x00,0x00,
-0x76,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
-0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x28,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x75,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x27,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0x82,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
-0x0c,0x00,0x07,0x00,0x16,0x00,0x00,0x00,0x83,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x28,0x00,0x00,0x00,
-0x83,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x78,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x78,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,
-0x67,0x00,0x00,0x00,0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6f,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6f,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x1b,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x6c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x6e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x27,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
-0x88,0x00,0x00,0x00,0xe0,0x00,0x04,0x00,0x67,0x00,0x00,0x00,
-0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x28,0x00,0x00,0x00,0x60,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8e,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x02,0x01,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x91,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,0x34,0x00,0x00,0x00,
-0x96,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x33,0x00,0x00,0x00,
-0xf6,0x00,0x04,0x00,0x90,0x00,0x00,0x00,0x91,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x96,0x00,0x00,0x00,
-0x8f,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x41,0x00,0x06,0x00,
-0x44,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x48,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
-0xac,0x00,0x05,0x00,0x34,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0xaa,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xa7,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xa9,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xb0,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x02,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x44,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,
-0x56,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0xb2,0x00,0x00,0x00,
-0xb1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xb3,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xaa,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xaa,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x16,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0xb2,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x60,0x00,0x00,0x00,
-0xb3,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x16,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
-0x83,0x00,0x05,0x00,0x16,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
-0xb5,0x00,0x00,0x00,0x89,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,
-0x16,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x16,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x28,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x44,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,
-0x31,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc5,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x91,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x91,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,
-0x02,0x01,0x00,0x00,0x20,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8e,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x90,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x67,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,
-0x1a,0x00,0x00,0x00,0x03,0x01,0x00,0x00,0x6b,0x00,0x00,0x00,
-0x90,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
-0xad,0x00,0x05,0x00,0x34,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0x31,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,
-0xcb,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xcf,0x00,0x00,0x00,0xca,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xca,0x00,0x00,0x00,
-0x7c,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0x03,0x01,0x00,0x00,0xb0,0x00,0x05,0x00,0x34,0x00,0x00,0x00,
-0xd3,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0xd5,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfa,0x00,0x04,0x00,0xd3,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd4,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x27,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0xda,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,
-0xdc,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x81,0x00,0x05,0x00,0x16,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0xde,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x28,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xd5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xd5,0x00,0x00,0x00,
-0xe0,0x00,0x04,0x00,0x67,0x00,0x00,0x00,0x67,0x00,0x00,0x00,
-0x68,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcc,0x00,0x00,0x00,0xc3,0x00,0x05,0x00,
-0x1a,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
-0x1b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0xc9,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xcb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x88,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0xe8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0xe8,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x0f,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0xfd,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x34,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x33,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0xea,0x00,0x00,0x00,
-0xe9,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0xf0,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0xe9,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x33,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x04,0x01,0x00,0x00,
-0x41,0x00,0x06,0x00,0x44,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,
-0xc2,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x16,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,
-0xf8,0x00,0x00,0x00,0x88,0x00,0x05,0x00,0x16,0x00,0x00,0x00,
-0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0xf8,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xfd,0x00,0x00,0x00,
-0x04,0x01,0x00,0x00,0x20,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0xe8,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xea,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-};
-const uint64_t soft_max_f32_len = 3752;
-
-unsigned char split_k_reduce_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x50,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x11,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x47,0x00,0x03,0x00,0x11,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2e,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x30,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x30,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x3e,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x3f,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x3f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x3f,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x41,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x41,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x47,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,
-0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,0x09,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x1e,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x12,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x12,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x16,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x19,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x1e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x1e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x14,0x00,0x00,0x00,0x29,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x2d,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x2e,0x00,0x00,0x00,
-0x2d,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x2f,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x3e,0x00,0x00,0x00,
-0x1e,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x3f,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x40,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x40,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x00,0x01,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
-0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
-0xf7,0x00,0x03,0x00,0x48,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x49,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x16,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x15,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
-0x19,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x1c,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1a,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x1b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x48,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x23,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x23,0x00,0x00,0x00,
-0xf5,0x00,0x07,0x00,0x1e,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0x21,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
-0x16,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x29,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0xb0,0x00,0x05,0x00,
-0x19,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x2b,0x00,0x00,0x00,0xf6,0x00,0x04,0x00,0x25,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x25,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x24,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x18,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x37,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x1e,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x81,0x00,0x05,0x00,0x1e,0x00,0x00,0x00,
-0x3b,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
-0x4e,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x23,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x25,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x37,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
-0x41,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x44,0x00,0x00,0x00,0x4f,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x48,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x48,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t split_k_reduce_len = 1416;
-
-unsigned char sqr_f32_data[] = {
-0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0x37,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
-0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
-0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
-0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x12,0x00,0x00,0x00,0x02,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x12,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x12,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x21,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x2c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x34,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
-0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x17,0x00,0x04,0x00,
-0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x03,0x00,0x11,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x1e,0x00,0x06,0x00,0x12,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x13,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x13,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x15,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x15,0x00,0x00,0x00,
-0x16,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x17,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x1a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x21,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x22,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
-0x29,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x2b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x33,0x00,0x00,0x00,
-0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x35,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x36,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x35,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1d,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x24,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x27,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
-0x30,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
-0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x3e,0x00,0x03,0x00,0x31,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
-0xf9,0x00,0x02,0x00,0x35,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x35,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
-
-};
-const uint64_t sqr_f32_len = 1188;
-
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
deleted file mode 100644
index ae9cb3c1c..000000000
--- a/ggml-vulkan.cpp
+++ /dev/null
@@ -1,5895 +0,0 @@
-#include "ggml-vulkan.h"
-
-#ifdef GGML_VULKAN_RUN_TESTS
-#include <chrono>
-#endif
-
-#include <vulkan/vulkan.hpp>
-
-#include <algorithm>
-#include <cmath>
-#include <iostream>
-#include <iomanip>
-#include <limits>
-#include <tuple>
-#include <vector>
-#include <sstream>
-#include <utility>
-#include <memory>
-
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
-#include "ggml-vulkan-shaders.hpp"
-
-#define VK_API_VERSION VK_API_VERSION_1_2
-
-#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
-
-#define VK_VENDOR_ID_AMD 0x1002
-#define VK_VENDOR_ID_APPLE 0x106b
-#define VK_VENDOR_ID_INTEL 0x8086
-#define VK_VENDOR_ID_NVIDIA 0x10de
-
-#define VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN 0
-#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
-#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
-
-#define VK_NUM_TYPES 16
-
-#define GGML_VK_MAX_NODES 8192
-
-#define MAX_VK_BUFFERS 256
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 1
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#define VK_CHECK(err, msg)                                          \
-    do {                                                            \
-        vk::Result err_ = (err);                                    \
-        if (err_ != vk::Result::eSuccess) {                         \
-            fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n",  \
-                #err, to_string(err_).c_str(), __FILE__, __LINE__); \
-            exit(1);                                                \
-        }                                                           \
-    } while (0)
-
-struct ggml_backend_vk_context;
-
-struct vk_queue {
-    uint32_t queue_family_index;
-    vk::Queue queue;
-    vk::CommandPool pool;
-    uint32_t cmd_buffer_idx;
-    std::vector<vk::CommandBuffer> cmd_buffers;
-
-    vk::PipelineStageFlags stage_flags;
-};
-
-struct vk_device {
-    vk::PhysicalDevice physical_device;
-    vk::PhysicalDeviceProperties properties;
-    std::string name;
-    uint64_t max_memory_allocation_size;
-    bool fp16;
-    vk::Device device;
-    uint32_t vendor_id;
-    vk_queue compute_queue;
-    vk_queue transfer_queue;
-    bool single_queue;
-    uint32_t descriptor_set_mode;
-    uint32_t subgroup_size;
-    bool uma;
-
-    ~vk_device() {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "destroy device " << name << std::endl;
-#endif
-        device.destroy();
-    }
-};
-
-struct vk_buffer_struct {
-    vk::Buffer buffer;
-    vk::DeviceMemory device_memory;
-    vk::MemoryPropertyFlags memory_property_flags;
-    void * ptr;
-    size_t size = 0;
-
-    ggml_backend_vk_context * ctx;
-
-    std::shared_ptr<vk_device> device;
-
-    ~vk_buffer_struct() {
-        if (size == 0) {
-            return;
-        }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
-#endif
-
-        device->device.freeMemory(device_memory);
-        device->device.destroyBuffer(buffer);
-    }
-};
-
-typedef std::shared_ptr<vk_buffer_struct> vk_buffer;
-typedef std::weak_ptr<vk_buffer_struct> vk_buffer_ref;
-
-struct vk_subbuffer {
-    vk_buffer buffer;
-    uint64_t offset;
-    uint64_t size;
-};
-
-struct vk_pipeline {
-    std::string name;
-    vk::ShaderModule shader_module;
-    vk::DescriptorSetLayout dsl;
-    std::vector<vk::DescriptorPool> descriptor_pools;
-    std::vector<vk::DescriptorSet> descriptor_sets;
-    uint32_t descriptor_set_idx;
-    vk::PipelineLayout layout;
-    vk::Pipeline pipeline;
-    uint32_t push_constant_size;
-    uint32_t parameter_count;
-    std::array<uint32_t, 3> wg_denoms;
-    uint32_t align;
-};
-
-struct vk_semaphore {
-    vk::Semaphore s;
-    uint64_t value;
-};
-
-struct vk_submission {
-    vk::CommandBuffer buffer;
-    std::vector<vk_semaphore> wait_semaphores;
-    std::vector<vk_semaphore> signal_semaphores;
-};
-
-typedef std::vector<vk_submission> vk_sequence;
-
-struct vk_op_push_constants {
-    uint32_t KX;
-    uint32_t KY;
-    float param1;
-    float param2;
-};
-
-struct vk_op_cpy_push_constants {
-    uint32_t ne;
-    uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02;
-    uint32_t ne10; uint32_t ne11; uint32_t nb10; uint32_t nb11; uint32_t nb12;
-    uint32_t d_offset;
-};
-
-struct vk_op_diag_mask_push_constants {
-    uint32_t ncols;
-    uint32_t rows_per_channel;
-    int32_t n_past;
-};
-
-struct vk_op_rope_push_constants {
-    uint32_t ncols;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
-};
-
-struct vk_op_rope_neox_push_constants {
-    uint32_t ncols;
-    uint32_t ndims;
-    float freq_scale;
-    uint32_t p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
-    float theta_scale;
-    float inv_ndims;
-};
-
-// Allow pre-recording command buffers
-struct vk_staging_memcpy {
-    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
-
-    void * dst;
-    const void * src;
-    size_t n;
-};
-
-struct vk_context {
-    size_t idx;
-
-    vk_submission * s;
-    std::vector<vk_sequence> seqs;
-
-    ggml_tensor * exit_tensor;
-
-    std::vector<vk_staging_memcpy> in_memcpys;
-    std::vector<vk_staging_memcpy> out_memcpys;
-
-    vk_queue * q;
-};
-
-struct ggml_tensor_extra_gpu {
-    bool ready;
-
-    size_t ctx_idx;
-
-    vk_buffer_ref buffer_gpu;
-    uint64_t offset;
-
-    void reset() {
-        ready = false;
-        ctx_idx = 0;
-        buffer_gpu.reset();
-        offset = 0;
-    }
-};
-
-struct ggml_vk_garbage_collector {
-    std::vector<vk_pipeline *> pipelines;
-    std::vector<vk_semaphore> tl_semaphores;
-    std::vector<vk_semaphore> semaphores;
-    std::vector<vk::Event> events;
-    std::vector<vk_buffer> temp_buffers;
-    std::vector<vk_context> contexts;
-};
-
-struct ggml_backend_vk_context {
-    std::string name;
-
-    std::weak_ptr<vk_device> device;
-    vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s;
-    vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s;
-    vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s;
-    vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s;
-    vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s;
-    vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s;
-    vk_pipeline pipeline_matmul_split_k_reduce;
-    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
-    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
-    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
-    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_mul_f32;
-    vk_pipeline pipeline_add_f32;
-    vk_pipeline pipeline_scale_f32;
-    vk_pipeline pipeline_sqr_f32;
-    vk_pipeline pipeline_clamp_f32;
-    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
-    vk_pipeline pipeline_norm_f32;
-    vk_pipeline pipeline_rms_norm_f32;
-    vk_pipeline pipeline_gelu_f32;
-    vk_pipeline pipeline_silu_f32;
-    vk_pipeline pipeline_relu_f32;
-    vk_pipeline pipeline_diag_mask_inf_f32;
-    vk_pipeline pipeline_soft_max_f32;
-    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
-    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
-
-    size_t semaphore_idx, event_idx;
-    ggml_vk_garbage_collector gc;
-    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
-    vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k;
-    vk::Fence fence;
-    vk_buffer staging;
-    size_t staging_size;
-    size_t staging_offset;
-    vk_buffer sync_staging;
-
-    vk_buffer buffer_pool[MAX_VK_BUFFERS];
-
-    vk_context * compute_ctx;
-    vk_context * transfer_ctx;
-
-    bool disable;
-    bool initialized;
-
-    size_t idx;
-};
-
-struct vk_instance {
-    vk::Instance instance;
-
-    std::vector<size_t> device_indices;
-
-    std::shared_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
-    ggml_backend_t backends[GGML_VK_MAX_DEVICES];
-    ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES];
-    ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES];
-    bool initialized[GGML_VK_MAX_DEVICES];
-};
-
-#ifdef GGML_VULKAN_CHECK_RESULTS
-static size_t vk_skip_checks;
-static size_t vk_output_tensor;
-
-static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name);
-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor);
-#endif
-
-typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-
-static bool vk_instance_initialized = false;
-static vk_instance vk_instance;
-
-GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
-
-static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
-#endif
-    GGML_ASSERT(parameter_count > 0);
-    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
-
-    pipeline.name = name;
-    pipeline.parameter_count = parameter_count;
-    pipeline.push_constant_size = push_constant_size;
-    pipeline.wg_denoms = wg_denoms;
-    pipeline.align = align;
-
-    vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
-    pipeline.shader_module = ctx->device.lock()->device.createShaderModule(shader_module_create_info);
-
-    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
-    std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
-    for (uint32_t i = 0; i < parameter_count; i++) {
-        dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
-        dsl_binding_flags.push_back({});
-    }
-
-    vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
-
-    vk::PushConstantRange pcr(
-        vk::ShaderStageFlagBits::eCompute,
-        0,
-        pipeline.push_constant_size
-    );
-
-    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
-        {},
-        dsl_binding);
-    descriptor_set_layout_create_info.setPNext(&dslbfci);
-    pipeline.dsl = ctx->device.lock()->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
-
-    // Check if device supports multiple descriptors per pool
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
-        const uint32_t alloc_count = 2;
-
-        // Try allocating multiple sets from one pool
-        // This fails on AMD for some reason, so add a fall back to allocating one pool per set
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
-        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
-        vk::DescriptorPool pool = ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info);
-
-        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
-        for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline.dsl;
-        }
-        try {
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
-            std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-        } catch(vk::OutOfPoolMemoryError const&) {
-            ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
-        }
-
-        ctx->device.lock()->device.destroyDescriptorPool(pool);
-    }
-
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
-        vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
-        pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info));
-    }
-
-    pipeline.descriptor_set_idx = 0;
-
-    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr);
-    pipeline.layout = ctx->device.lock()->device.createPipelineLayout(pipeline_layout_create_info);
-
-    std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
-
-    for (size_t i = 0; i < specialization_constants.size(); i++) {
-        specialization_entries[i].constantID = i;
-        specialization_entries[i].offset = i * sizeof(uint32_t);
-        specialization_entries[i].size = sizeof(uint32_t);
-    }
-
-    vk::SpecializationInfo specialization_info(
-        specialization_entries.size(),
-        specialization_entries.data(),
-        specialization_constants.size() * sizeof(uint32_t),
-        specialization_constants.data()
-    );
-
-    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
-            vk::PipelineShaderStageCreateFlags(),
-            vk::ShaderStageFlagBits::eCompute,
-            pipeline.shader_module,
-            entrypoint.c_str(),
-            &specialization_info);
-    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
-        vk::PipelineCreateFlags(),
-        pipeline_shader_create_info,
-        pipeline.layout);
-    pipeline.pipeline = ctx->device.lock()->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-
-    ctx->gc.pipelines.push_back(&pipeline);
-}
-
-static void ggml_vk_destroy_pipeline(ggml_backend_vk_context * ctx, vk_pipeline * pipeline) {
-    for (auto& pool : pipeline->descriptor_pools) {
-        ctx->device.lock()->device.destroyDescriptorPool(pool);
-    }
-    pipeline->descriptor_pools.clear();
-    pipeline->descriptor_sets.clear();
-    pipeline->descriptor_set_idx = 0;
-
-    ctx->device.lock()->device.destroyDescriptorSetLayout(pipeline->dsl);
-
-    ctx->device.lock()->device.destroyPipelineLayout(pipeline->layout);
-
-    ctx->device.lock()->device.destroyShaderModule(pipeline->shader_module);
-
-    ctx->device.lock()->device.destroyPipeline(pipeline->pipeline);
-}
-
-static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl;
-#endif
-    if (pipeline.descriptor_sets.size() >= pipeline.descriptor_set_idx + n) {
-        // Enough descriptors are available
-        return;
-    }
-
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        const uint32_t alloc_count = pipeline.descriptor_set_idx + n - pipeline.descriptor_sets.size();
-
-        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
-        for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline.dsl;
-        }
-        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[0], alloc_count, layouts.data());
-        std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-        pipeline.descriptor_sets.insert(pipeline.descriptor_sets.end(), sets.begin(), sets.end());
-    } else {
-        for (uint32_t i = pipeline.descriptor_sets.size(); i < pipeline.descriptor_set_idx + n; i++) {
-            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
-            vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size);
-            pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info));
-
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[i], 1, &pipeline.dsl);
-            std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-            pipeline.descriptor_sets.push_back(sets[0]);
-        }
-    }
-}
-
-static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline.name << ")" << std::endl;
-#endif
-    pipeline.descriptor_set_idx = 0;
-}
-
-static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
-#endif
-    if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
-        // Reuse command buffer
-        return q.cmd_buffers[q.cmd_buffer_idx++];
-    }
-
-    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
-        q.pool,
-        vk::CommandBufferLevel::ePrimary,
-        1);
-    const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device.lock()->device.allocateCommandBuffers(command_buffer_alloc_info);
-    auto buf = cmd_buffers.front();
-
-    q.cmd_buffers.push_back(buf);
-    q.cmd_buffer_idx++;
-
-    return buf;
-}
-
-static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_submission()" << std::endl;
-#endif
-    vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
-    s.wait_semaphores = std::move(wait_semaphores);
-    s.signal_semaphores = std::move(signal_semaphores);
-    return s;
-}
-
-static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
-#endif
-    if (ctx->seqs.empty()) {
-        return;
-    }
-
-    std::vector<std::vector<uint64_t>> tl_wait_vals;
-    std::vector<std::vector<uint64_t>> tl_signal_vals;
-    std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
-    std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
-    std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
-    std::vector<vk::SubmitInfo> submit_infos;
-    int idx = -1;
-    std::vector<std::vector<vk::PipelineStageFlags>> stage_flags;
-
-    size_t reserve = 0;
-
-    for (const auto& sequence : ctx->seqs) {
-        reserve += sequence.size();
-    }
-
-    // Pre-reserve vectors to prevent reallocation, which invalidates pointers
-    tl_wait_semaphores.reserve(reserve);
-    tl_wait_vals.reserve(reserve);
-    tl_signal_semaphores.reserve(reserve);
-    tl_signal_vals.reserve(reserve);
-    tl_submit_infos.reserve(reserve);
-    submit_infos.reserve(reserve);
-    stage_flags.reserve(reserve);
-
-    for (const auto& sequence : ctx->seqs) {
-        for (const auto& submission : sequence) {
-            stage_flags.push_back({});
-            idx++;
-            tl_wait_vals.push_back({});
-            tl_wait_semaphores.push_back({});
-            tl_signal_vals.push_back({});
-            tl_signal_semaphores.push_back({});
-            for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
-                stage_flags[idx].push_back(ctx->q->stage_flags);
-                tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
-                tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
-            }
-            for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
-                tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
-                tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
-            }
-            tl_submit_infos.push_back({
-                (uint32_t) submission.wait_semaphores.size(),
-                tl_wait_vals[idx].data(),
-                (uint32_t) submission.signal_semaphores.size(),
-                tl_signal_vals[idx].data(),
-            });
-            tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo;
-            tl_submit_infos[idx].pNext = nullptr;
-            vk::SubmitInfo si{
-                (uint32_t) submission.wait_semaphores.size(),
-                tl_wait_semaphores[idx].data(),
-                stage_flags[idx].data(),
-                1,
-                &submission.buffer,
-                (uint32_t) submission.signal_semaphores.size(),
-                tl_signal_semaphores[idx].data(),
-            };
-            si.setPNext(&tl_submit_infos[idx]);
-            submit_infos.push_back(si);
-        }
-    }
-
-    ctx->q->queue.submit(submit_infos, fence);
-
-    ctx->seqs.clear();
-}
-
-static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
-#endif
-    const uint32_t qfsize = queue_family_props.size();
-
-    // Try with avoid preferences first
-    for (uint32_t i = 0; i < qfsize; i++) {
-        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) {
-            return i;
-        }
-    }
-
-    // Fall back to only required
-    for (size_t i = 0; i < qfsize; i++) {
-        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) {
-            return i;
-        }
-    }
-
-    // Fall back to reusing compute queue
-    for (size_t i = 0; i < qfsize; i++) {
-        if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) {
-            return i;
-        }
-    }
-
-    // Fall back to ignoring min_num_queries
-    for (size_t i = 0; i < qfsize; i++) {
-        if (queue_family_props[i].queueFlags & required) {
-            return i;
-        }
-    }
-
-    std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
-
-    for(auto &q_family : queue_family_props) {
-        std::cerr << "Queue number: "  + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl;
-    }
-    abort();
-}
-
-static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_queue()" << std::endl;
-#endif
-    q.queue_family_index = queue_family_index;
-
-    vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
-    q.pool = ctx->device.lock()->device.createCommandPool(command_pool_create_info_compute);
-
-    q.cmd_buffer_idx = 0;
-
-    q.queue = ctx->device.lock()->device.getQueue(queue_family_index, queue_index);
-
-    q.stage_flags = stage_flags;
-}
-
-static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_context()" << std::endl;
-#endif
-    ctx->gc.contexts.emplace_back();
-    vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
-    memset((void *) result, 0, sizeof(vk_context));
-    result->idx = ctx->gc.contexts.size() - 1;
-    result->q = &q;
-    return result;
-}
-
-static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
-    vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
-    vk::SemaphoreCreateInfo ci{};
-    ci.setPNext(&tci);
-    vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci);
-    ctx->gc.semaphores.push_back({ semaphore, 0 });
-    return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
-}
-
-static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
-    if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
-        vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
-        vk::SemaphoreCreateInfo ci{};
-        ci.setPNext(&tci);
-        vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci);
-        ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
-    }
-    return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
-}
-
-static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
-    if (ctx->event_idx >= ctx->gc.events.size()) {
-        ctx->gc.events.push_back(ctx->device.lock()->device.createEvent({}));
-    }
-    return ctx->gc.events[ctx->event_idx++];
-}
-
-static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
-#endif
-    // Requires command buffers to be done
-
-    ctx->device.lock()->device.resetCommandPool(q.pool);
-    q.cmd_buffer_idx = 0;
-}
-
-static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
-    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
-        vk::MemoryType memory_type = mem_props->memoryTypes[i];
-        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
-            (flags & memory_type.propertyFlags) == flags &&
-            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
-            return static_cast<int32_t>(i);
-        }
-    }
-    return UINT32_MAX;
-}
-
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
-#endif
-    vk_buffer buf = std::make_shared<vk_buffer_struct>();
-
-    if (size == 0) {
-        buf->size = 0;
-        return buf;
-    }
-
-    buf->size = size;
-    vk::BufferCreateInfo buffer_create_info{
-        vk::BufferCreateFlags(),
-        size,
-        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
-        vk::SharingMode::eExclusive,
-        0,
-        nullptr,
-    };
-
-    buf->buffer = ctx->device.lock()->device.createBuffer(buffer_create_info);
-
-    vk::MemoryRequirements mem_req = ctx->device.lock()->device.getBufferMemoryRequirements(buf->buffer);
-
-    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device.lock()->physical_device.getMemoryProperties();
-
-    uint32_t memory_type_index = UINT32_MAX;
-
-    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
-    buf->memory_property_flags = req_flags;
-
-    if (memory_type_index == UINT32_MAX && fallback_flags) {
-        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
-        buf->memory_property_flags = fallback_flags;
-    }
-
-    if (memory_type_index == UINT32_MAX) {
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
-        buf->size = 0;
-        throw vk::OutOfDeviceMemoryError("No suitable memory type found");
-    }
-
-    try {
-        buf->device_memory = ctx->device.lock()->device.allocateMemory({ mem_req.size, memory_type_index });
-    } catch (const vk::SystemError& e) {
-        // Out of Host/Device memory, clean up buffer
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
-        buf->size = 0;
-        throw e;
-    }
-    buf->ptr = nullptr;
-
-    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
-    }
-
-    ctx->device.lock()->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
-
-    buf->ctx = ctx;
-
-    buf->device = ctx->device.lock();
-
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "Created buffer " << buf->buffer << std::endl;
-#endif
-
-    return buf;
-}
-
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
-    try {
-        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
-    } catch (const vk::SystemError& e) {
-        std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
-        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-        throw e;
-    }
-}
-
-static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
-    vk_buffer buf;
-    try {
-        if (ctx->device.lock()->uma) {
-            // Fall back to host memory type
-            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-        } else {
-            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-        }
-    } catch (const vk::SystemError& e) {
-        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-        throw e;
-    }
-
-    return buf;
-}
-
-static void ggml_vk_destroy_buffer(vk_buffer& buf) {
-    buf.reset();
-}
-
-static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
-    return { buf, 0, VK_WHOLE_SIZE };
-}
-
-static void ggml_vk_sync_buffers(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_sync_buffers()" << std::endl;
-#endif
-    const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
-
-    ctx->s->buffer.pipelineBarrier(
-        ctx->q->stage_flags,
-        ctx->q->stage_flags,
-        {},
-        mem_barriers,
-        {},
-        {}
-    );
-}
-
-static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_wait_events()" << std::endl;
-#endif
-    if (events.empty()) {
-        return;
-    }
-
-    ctx->s->buffer.waitEvents(
-        events,
-        ctx->q->stage_flags,
-        ctx->q->stage_flags,
-        {},
-        {},
-        {}
-    );
-}
-
-static bool ggml_vk_build_shader(ggml_type type) {
-    switch(type) {
-    case GGML_TYPE_F16:
-    case GGML_TYPE_Q4_0:
-    case GGML_TYPE_Q4_1:
-    case GGML_TYPE_Q5_0:
-    case GGML_TYPE_Q5_1:
-    case GGML_TYPE_Q8_0:
-    case GGML_TYPE_Q2_K:
-    case GGML_TYPE_Q3_K:
-    case GGML_TYPE_Q4_K:
-    case GGML_TYPE_Q5_K:
-    case GGML_TYPE_Q6_K:
-        return true;
-    default:
-        return false;
-    }
-}
-
-static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
-#endif
-
-    // mulmat
-    std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, ctx->device.lock()->subgroup_size * 2, 64, 2, 4, 4, ctx->device.lock()->subgroup_size };
-    std::initializer_list<uint32_t> warptile_m = { 128,  64,  64, 16, ctx->device.lock()->subgroup_size, 32, 2, 4, 2, ctx->device.lock()->subgroup_size };
-    std::initializer_list<uint32_t> warptile_s = { ctx->device.lock()->subgroup_size,  32,  32, 16, 32, 32, 2, 2, 2, ctx->device.lock()->subgroup_size };
-
-    std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
-    std::array<uint32_t, 3> m_wg_denoms = { 64,  64, 1 };
-    std::array<uint32_t, 3> s_wg_denoms = { 32,  32, 1 };
-
-    uint32_t l_align = 128;
-    uint32_t m_align =  64;
-    uint32_t s_align =  32;
-
-    if (ctx->device.lock()->fp16) {
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_l_len, matmul_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_m_len, matmul_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_s_len, matmul_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_l_len, matmul_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_m_len, matmul_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_s_len, matmul_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_l_len, matmul_f16_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_m_len, matmul_f16_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_s_len, matmul_f16_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_l_len, matmul_f16_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_m_len, matmul_f16_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_s_len, matmul_f16_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_l_len, matmul_f16_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_m_len, matmul_f16_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_s_len, matmul_f16_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
-    } else {
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_s_fp32_len, matmul_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_l_fp32_len, matmul_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_m_fp32_len, matmul_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_s_fp32_len, matmul_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_l_fp32_len, matmul_f16_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_m_fp32_len, matmul_f16_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_s_fp32_len, matmul_f16_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_l_fp32_len, matmul_f16_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_m_fp32_len, matmul_f16_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_s_fp32_len, matmul_f16_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
-
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_l_fp32_len, matmul_f16_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_m_fp32_len, matmul_f16_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_s_fp32_len, matmul_f16_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
-    }
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32",  mul_mat_vec_f16_f32_len,  mul_mat_vec_f16_f32_data,  "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1);
-
-    // dequant shaders
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   f32_to_f16_len,   f32_to_f16_data,   "main", 2, 4 * sizeof(int), {      64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16",  dequant_f16_len,  dequant_f16_data,  "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1);
-
-    // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16",  get_rows_f16_len,  get_rows_f16_data,  "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-}
-
-static void ggml_vk_print_gpu_info(size_t idx) {
-    GGML_ASSERT(idx < vk_instance.device_indices.size());
-    size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
-#endif
-    GGML_ASSERT(vk_instance.initialized);
-
-    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
-
-    if (dev_num >= devices.size()) {
-        std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
-        throw std::runtime_error("Device not found");
-    }
-
-    vk::PhysicalDevice physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = physical_device.enumerateDeviceExtensionProperties();
-
-    vk::PhysicalDeviceProperties2 props2;
-    vk::PhysicalDeviceMaintenance3Properties props3;
-    vk::PhysicalDeviceSubgroupProperties subgroup_props;
-    props2.pNext = &props3;
-    props3.pNext = &subgroup_props;
-    physical_device.getProperties2(&props2);
-
-    const size_t subgroup_size = subgroup_props.subgroupSize;
-    const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
-
-    bool fp16_storage = false;
-    bool fp16_compute = false;
-
-    for (auto properties : ext_props) {
-        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
-            fp16_storage = true;
-        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
-            fp16_compute = true;
-        }
-    }
-
-    const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16");
-    bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr;
-
-    bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
-
-    vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures();
-
-    VkPhysicalDeviceFeatures2 device_features2;
-    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    device_features2.pNext = nullptr;
-    device_features2.features = (VkPhysicalDeviceFeatures)device_features;
-
-    VkPhysicalDeviceVulkan11Features vk11_features;
-    vk11_features.pNext = nullptr;
-    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-    device_features2.pNext = &vk11_features;
-
-    VkPhysicalDeviceVulkan12Features vk12_features;
-    vk12_features.pNext = nullptr;
-    vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
-    vk11_features.pNext = &vk12_features;
-
-    vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
-
-    fp16 = fp16 && vk12_features.shaderFloat16;
-
-    std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
-
-    if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
-        std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
-    }
-}
-
-static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
-static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
-
-void ggml_vk_instance_init() {
-    if (vk_instance_initialized) {
-        return;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_instance_init()" << std::endl;
-#endif
-
-    vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-
-    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
-    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
-#ifdef __APPLE__
-    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
-#endif
-
-    std::vector<const char*> layers;
-
-    if (validation_ext) {
-        layers.push_back("VK_LAYER_KHRONOS_validation");
-    }
-    std::vector<const char*> extensions;
-    if (validation_ext) {
-        extensions.push_back("VK_EXT_validation_features");
-    }
-#ifdef __APPLE__
-    if (portability_enumeration_ext) {
-        extensions.push_back("VK_KHR_portability_enumeration");
-    }
-#endif
-    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
-#ifdef __APPLE__
-    if (portability_enumeration_ext) {
-        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
-    }
-#endif
-
-    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
-    vk::ValidationFeaturesEXT validation_features;
-
-    if (validation_ext) {
-        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-        validation_features = {
-            features_enable,
-            {},
-        };
-        validation_features.setPNext(nullptr);
-        instance_create_info.setPNext(&validation_features);
-
-        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-    }
-    vk_instance.instance = vk::createInstance(instance_create_info);
-
-    memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
-
-    size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
-
-    // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
-    char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
-    if (devices_env != nullptr) {
-        std::string devices(devices_env);
-        std::replace(devices.begin(), devices.end(), ',', ' ');
-
-        std::stringstream ss(devices);
-        size_t tmp;
-        while (ss >> tmp) {
-            if(tmp >= num_available_devices) {
-                std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl;
-                throw std::runtime_error("Invalid Vulkan device index");
-            }
-            vk_instance.device_indices.push_back(tmp);
-        }
-    } else {
-        vk_instance.device_indices.push_back(0);
-    }
-
-    vk_instance_initialized = true;
-}
-
-static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
-    GGML_ASSERT(idx < vk_instance.device_indices.size());
-    size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
-#endif
-    ggml_vk_instance_init();
-
-    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
-
-    if (dev_num >= devices.size()) {
-        std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
-        throw std::runtime_error("Device not found");
-    }
-
-    vk_instance.devices[idx] = std::make_shared<vk_device>();
-    ctx->device = vk_instance.devices[idx];
-    ctx->device.lock()->physical_device = devices[dev_num];
-    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
-
-    bool maintenance4_support = false;
-
-    // Check if maintenance4 is supported
-    for (const auto& properties : ext_props) {
-        if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
-            maintenance4_support = true;
-        }
-    }
-
-    vk::PhysicalDeviceProperties2 props2;
-    vk::PhysicalDeviceMaintenance3Properties props3;
-    vk::PhysicalDeviceMaintenance4Properties props4;
-    vk::PhysicalDeviceSubgroupProperties subgroup_props;
-    props2.pNext = &props3;
-    props3.pNext = &subgroup_props;
-    if (maintenance4_support) {
-        subgroup_props.pNext = &props4;
-    }
-    ctx->device.lock()->physical_device.getProperties2(&props2);
-    ctx->device.lock()->properties = props2.properties;
-
-    if (maintenance4_support) {
-        ctx->device.lock()->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
-    } else {
-        ctx->device.lock()->max_memory_allocation_size = props3.maxMemoryAllocationSize;
-    }
-
-    ctx->device.lock()->vendor_id = ctx->device.lock()->properties.vendorID;
-    ctx->device.lock()->subgroup_size = subgroup_props.subgroupSize;
-    ctx->device.lock()->uma = ctx->device.lock()->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
-
-    bool fp16_storage = false;
-    bool fp16_compute = false;
-
-    for (const auto& properties : ext_props) {
-        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
-            fp16_storage = true;
-        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
-            fp16_compute = true;
-        }
-    }
-
-    const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16");
-    bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr;
-
-    ctx->device.lock()->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
-
-    std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device.lock()->physical_device.getQueueFamilyProperties();
-
-    // Try to find a non-graphics compute queue and transfer-focused queues
-    const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
-    const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
-
-    const float priorities[] = { 1.0f, 1.0f };
-    ctx->device.lock()->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
-
-    std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
-    if (compute_queue_family_index != transfer_queue_family_index) {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
-    } else if(!ctx->device.lock()->single_queue) {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
-    } else {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
-    }
-    vk::DeviceCreateInfo device_create_info;
-    std::vector<const char *> device_extensions;
-    vk::PhysicalDeviceFeatures device_features = ctx->device.lock()->physical_device.getFeatures();
-
-    VkPhysicalDeviceFeatures2 device_features2;
-    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    device_features2.pNext = nullptr;
-    device_features2.features = (VkPhysicalDeviceFeatures)device_features;
-
-    VkPhysicalDeviceVulkan11Features vk11_features;
-    vk11_features.pNext = nullptr;
-    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-    device_features2.pNext = &vk11_features;
-
-    VkPhysicalDeviceVulkan12Features vk12_features;
-    vk12_features.pNext = nullptr;
-    vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
-    vk11_features.pNext = &vk12_features;
-
-    vkGetPhysicalDeviceFeatures2(ctx->device.lock()->physical_device, &device_features2);
-
-    ctx->device.lock()->fp16 = ctx->device.lock()->fp16 && vk12_features.shaderFloat16;
-
-    if (!vk11_features.storageBuffer16BitAccess) {
-        std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
-        throw std::runtime_error("Unsupported device");
-    }
-
-    device_extensions.push_back("VK_KHR_16bit_storage");
-
-#ifdef GGML_VULKAN_VALIDATE
-    device_extensions.push_back("VK_KHR_shader_non_semantic_info");
-#endif
-
-    if (ctx->device.lock()->fp16) {
-        device_extensions.push_back("VK_KHR_shader_float16_int8");
-    }
-    ctx->device.lock()->name = ctx->device.lock()->properties.deviceName.data();
-
-    device_create_info = {
-        vk::DeviceCreateFlags(),
-        device_queue_create_infos,
-        {},
-        device_extensions
-    };
-    device_create_info.setPNext(&device_features2);
-    ctx->device.lock()->device = ctx->device.lock()->physical_device.createDevice(device_create_info);
-
-    ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
-
-    // Shaders
-    ggml_vk_load_shaders(ctx);
-
-    // Queues
-    ggml_vk_create_queue(ctx, ctx->device.lock()->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
-    if (!ctx->device.lock()->single_queue) {
-        const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
-        ggml_vk_create_queue(ctx, ctx->device.lock()->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
-    } else {
-        // TODO: Use pointer or reference to avoid copy
-        ctx->device.lock()->transfer_queue = ctx->device.lock()->compute_queue;
-    }
-
-    ctx->fence = ctx->device.lock()->device.createFence({});
-
-    ctx->compute_ctx = nullptr;
-    ctx->transfer_ctx = nullptr;
-
-    ctx->disable = false;
-    ctx->initialized = true;
-
-    ctx->idx = idx;
-
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
-    vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
-    const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR");
-    vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor));
-#endif
-}
-
-static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
-#endif
-    switch (type) {
-        case GGML_TYPE_F32:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            break;
-        default:
-            return nullptr;
-    }
-
-    return &ctx->pipeline_dequant[type];
-}
-
-static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
-    switch (type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            break;
-        default:
-            return nullptr;
-    }
-
-    return &ctx->pipeline_dequant_mul_mat_vec_f32[type];
-}
-
-static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
-#endif
-    int best_i = -1;
-    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
-    int worst_i = -1;
-    size_t worst_size = 0; //largest unused buffer seen so far
-    for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
-        vk_buffer &b = ctx->buffer_pool[i];
-        if (b != nullptr && b->size >= size && b->size < best_size) {
-            best_i = i;
-            best_size = b->size;
-        }
-        if (b != nullptr && b->size > worst_size) {
-            worst_i = i;
-            worst_size = b->size;
-        }
-    }
-    if(best_i != -1) {
-        //found the smallest buffer that fits our needs
-        vk_buffer b = ctx->buffer_pool[best_i];
-        ctx->buffer_pool[best_i].reset();
-        return b;
-    }
-    if(worst_i != -1) {
-        //no buffer that fits our needs, resize largest one to save memory
-        vk_buffer& b = ctx->buffer_pool[worst_i];
-        ggml_vk_destroy_buffer(b);
-    }
-
-    return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-}
-
-static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
-#endif
-    for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
-        vk_buffer& b = ctx->buffer_pool[i];
-        if (b == nullptr) {
-            b = buffer;
-            return;
-        }
-    }
-    std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl;
-    ggml_vk_destroy_buffer(buffer);
-}
-
-// Returns an available temporary buffer that may only be used temporarily, it will be reused
-static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) {
-    // Try to find existing temp buffer with enough capacity
-    for (auto& buffer : ctx->gc.temp_buffers) {
-        if (buffer->size >= size) {
-            return buffer;
-        }
-    }
-
-    // Otherwise create new buffer
-    vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
-    ctx->gc.temp_buffers.push_back(buf);
-
-    return buf;
-}
-
-static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
-#endif
-    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-
-    if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
-            size/1024.0/1024.0);
-        ctx->device.lock()->device.freeMemory(buf->device_memory);
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
-        return nullptr;
-    }
-
-    ctx->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
-
-    return buf->ptr;
-}
-
-static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
-    if (ptr == nullptr) {
-        return;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
-#endif
-    vk_buffer buf;
-    size_t index;
-    for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
-        const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]);
-        const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]);
-        if (ptr >= addr && ptr < endr) {
-            buf = std::get<2>(ctx->pinned_memory[i]);
-            index = i;
-            break;
-        }
-    }
-    if (buf == nullptr) {
-        fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n");
-        return;
-    }
-
-    ggml_vk_destroy_buffer(buf);
-
-    ctx->pinned_memory.erase(ctx->pinned_memory.begin() + index);
-}
-
-static void ggml_vk_host_get(ggml_backend_vk_context * ctx, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
-    buf = nullptr;
-    buf_offset = 0;
-    for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
-        const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]);
-        const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]);
-        if (ptr >= addr && ptr < endr) {
-            buf = std::get<2>(ctx->pinned_memory[i]);
-            buf_offset = ((const uint8_t *)ptr) - addr;
-            break;
-        }
-    }
-}
-
-static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_queue& q, bool one_time = true) {
-    vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
-    if (one_time) {
-        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-    } else {
-        s.buffer.begin({ vk::CommandBufferUsageFlags{} });
-    }
-
-    return s;
-}
-
-static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
-    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]);
-    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]);
-    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]);
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline.name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
-#endif
-    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
-    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
-    GGML_ASSERT(pipeline.descriptor_set_idx < pipeline.descriptor_sets.size());
-    GGML_ASSERT(buffers.size() == pipeline.parameter_count);
-    vk::DescriptorSet& descriptor_set = pipeline.descriptor_sets[pipeline.descriptor_set_idx++];
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
-        descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
-    }
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
-        write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
-    }
-
-    ctx->device.lock()->device.updateDescriptorSets(write_descriptor_sets, {});
-
-    subctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
-    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
-    subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                pipeline.layout,
-                                0,
-                                { descriptor_set },
-                                {});
-    subctx->s->buffer.dispatch(wg0, wg1, wg2);
-}
-
-static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-    s.buffer.end();
-
-    s.wait_semaphores = std::move(wait_semaphores);
-    s.signal_semaphores = std::move(signal_semaphores);
-}
-
-static void ggml_vk_ctx_end(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
-#endif
-    if (ctx->s == nullptr) {
-        return;
-    }
-
-    ctx->s->buffer.end();
-    ctx->s = nullptr;
-}
-
-static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
-#endif
-    if (subctx->s != nullptr) {
-        ggml_vk_ctx_end(subctx);
-    }
-
-    subctx->seqs.push_back({ ggml_vk_begin_submission(ctx, *subctx->q) });
-    subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
-}
-
-static size_t ggml_vk_align_size(size_t width, size_t align) {
-    return CEIL_DIV(width, align) * align;
-}
-
-static void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys = nullptr) {
-    if (memcpys == nullptr) {
-        memcpy(dst, src, size);
-    } else {
-        memcpys->emplace_back(dst, src, size);
-    }
-}
-
-static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
-    if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
-        ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-    }
-}
-
-static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
-#endif
-    GGML_ASSERT(!ggml_is_contiguous(tensor));
-    // Buffer is already mapped
-    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
-        GGML_ASSERT(false);
-    }
-    // Check if src is pinned memory
-    vk_buffer buf;
-    size_t buf_offset;
-    ggml_vk_host_get(ctx, tensor->data, buf, buf_offset);
-
-    const uint64_t ne0 = tensor->ne[0];
-    const uint64_t ne1 = tensor->ne[1];
-    const uint64_t ne2 = tensor->ne[2];
-    const uint64_t ne3 = tensor->ne[3];
-    const uint64_t nb0 = tensor->nb[0];
-    const uint64_t nb1 = tensor->nb[1];
-    const uint64_t nb2 = tensor->nb[2];
-    const uint64_t nb3 = tensor->nb[3];
-    const ggml_type type = tensor->type;
-    const uint64_t ts = ggml_type_size(type);
-    const uint64_t bs = ggml_blck_size(type);
-
-    const uint64_t dstnb0 = ts;
-    const uint64_t dstnb1 = dstnb0*(ne0/bs);
-    const uint64_t dstnb2 = dstnb1*ne1;
-    const uint64_t dstnb3 = dstnb2*ne2;
-
-    const uint64_t ne = ggml_nelements(tensor);
-
-    if (buf != nullptr) {
-        // Memory is pinned, use as staging buffer
-        std::vector<vk::BufferCopy> slices;
-
-        for (uint64_t i3 = 0; i3 < ne3; i3++) {
-            for (uint64_t i2 = 0; i2 < ne2; i2++) {
-                // Find longest contiguous slice
-                if (ne1*nb1 == dstnb2) {
-                    slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
-                } else {
-                    for (uint64_t i1 = 0; i1 < ne1; i1++) {
-                        if (ne0*nb0/bs == dstnb1) {
-                            slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
-                        } else {
-                            const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
-                            const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
-                            for (uint64_t i0 = 0; i0 < ne0; i0++) {
-                                slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 });
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        ggml_vk_sync_buffers(subctx);
-        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
-        return;
-    }
-
-    // Staging buffer required
-    vk_buffer staging = ctx->staging;
-    size_t staging_offset = ctx->staging_offset;
-    const size_t copy_size = ts*ne/bs;
-    if (ctx->staging->size < ctx->staging_offset + copy_size) {
-        if (sync_staging) {
-            // Create temporary larger buffer
-            ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);
-
-            staging = ctx->sync_staging;
-            staging_offset = 0;
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    VkBufferCopy buf_copy{ staging_offset, offset, copy_size };
-
-    ggml_vk_sync_buffers(subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
-
-    for (uint64_t i3 = 0; i3 < ne3; i3++) {
-        for (uint64_t i2 = 0; i2 < ne2; i2++) {
-            // Find longest contiguous slice
-            if (ne1*nb1 == dstnb2) {
-                deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
-            } else {
-                for (uint64_t i1 = 0; i1 < ne1; i1++) {
-                    if (ne0*nb0/bs == dstnb1) {
-                        deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
-                    } else {
-                        const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
-                        const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
-                        for (uint64_t i0 = 0; i0 < ne0; i0++) {
-                            deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
-#endif
-    // Make sure ctx owns the buffer
-    GGML_ASSERT(dst->ctx == ctx);
-
-    // Buffer is already mapped
-    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
-        GGML_ASSERT(false);
-    }
-    // Check if src is pinned memory
-    vk_buffer buf = nullptr;
-    size_t buf_offset;
-    ggml_vk_host_get(ctx, src, buf, buf_offset);
-
-    if (buf != nullptr) {
-        // Memory is pinned, use as staging buffer
-        std::vector<vk::BufferCopy> slices(1);
-        if (width == spitch) {
-            // Only do single write if stride is equal
-            slices[0].srcOffset = buf_offset;
-            slices[0].dstOffset = offset;
-            slices[0].size = width * height;
-        } else {
-            slices.resize(height);
-            for (size_t i = 0; i < height; i++) {
-                slices[i].srcOffset = buf_offset + i * spitch;
-                slices[i].dstOffset = offset + i * width;
-                slices[i].size = width;
-            }
-        }
-
-        ggml_vk_sync_buffers(subctx);
-        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
-        return;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
-
-    // Staging buffer required
-    vk_buffer staging = ctx->staging;
-    size_t staging_offset = ctx->staging_offset;
-    const size_t copy_size = width*height;
-    if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
-        if (sync_staging) {
-            ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);
-
-            staging = ctx->sync_staging;
-            staging_offset = 0;
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    VkBufferCopy buf_copy = {
-        staging_offset,
-        offset,
-        copy_size};
-
-    ggml_vk_sync_buffers(subctx);
-    vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
-
-    if (width == spitch) {
-        deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &subctx->in_memcpys);
-    } else {
-        for (size_t i = 0; i < height; i++) {
-            deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
-        }
-    }
-}
-
-static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
-#endif
-    return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging);
-}
-
-static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
-#endif
-    // Buffer is already mapped
-    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
-
-        for (size_t i = 0; i < height; i++) {
-            memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
-        }
-    } else {
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-        ggml_vk_ctx_begin(ctx, subctx);
-        ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true);
-        ggml_vk_ctx_end(subctx);
-
-        for (auto& cpy : subctx->in_memcpys) {
-            memcpy(cpy.dst, cpy.src, cpy.n);
-        }
-
-        ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
-    }
-}
-
-static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
-#endif
-    ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1);
-}
-
-static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
-#endif
-    GGML_ASSERT(width > 0);
-    GGML_ASSERT(height > 0);
-    GGML_ASSERT(src != nullptr);
-    // Make sure ctx owns the buffer
-    GGML_ASSERT(src->ctx == ctx);
-
-    // Check if dst is pinned memory
-    vk_buffer buf = nullptr;
-    size_t buf_offset;
-    ggml_vk_host_get(ctx, dst, buf, buf_offset);
-
-    std::vector<vk::BufferCopy> slices(1);
-    if (width == spitch && width == dpitch) {
-        // Only do single write if stride is equal
-        slices[0].srcOffset = offset;
-        slices[0].dstOffset = buf_offset;
-        slices[0].size = width * height;
-    } else {
-        slices.resize(height);
-        for (size_t i = 0; i < height; i++) {
-            slices[i].srcOffset = offset + i * spitch;
-            slices[i].dstOffset = buf_offset + i * dpitch;
-            slices[i].size = width;
-        }
-    }
-
-    if (buf != nullptr) {
-        // Memory is pinned, use as staging buffer
-        ggml_vk_sync_buffers(subctx);
-        subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
-
-        return;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "STAGING" << std::endl;
-#endif
-
-    // Fall back to staging buffer
-    vk_buffer staging = ctx->staging;
-    const size_t copy_size = dpitch * height;
-    if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) {
-        if (sync_staging) {
-            // Create temporary larger buffer
-            ggml_vk_ensure_sync_staging_buffer(ctx, copy_size);
-
-            staging = ctx->sync_staging;
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    ggml_vk_sync_buffers(subctx);
-    subctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices);
-
-    deferred_memcpy(dst, staging->ptr, copy_size, &subctx->out_memcpys);
-}
-
-static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
-    return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst, size, size, size, 1, sync_staging);
-}
-
-static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
-#endif
-    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
-
-        memcpy(dst, (uint8_t *) src->ptr + offset, size);
-    } else {
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-        ggml_vk_ctx_begin(ctx, subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true);
-        ggml_vk_ctx_end(subctx);
-
-        ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
-
-        for (auto& cpy : subctx->out_memcpys) {
-            memcpy(cpy.dst, cpy.src, cpy.n);
-        }
-    }
-}
-
-static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
-#endif
-    // Make sure both buffers are on same ctx
-    GGML_ASSERT(src->ctx == dst->ctx);
-
-    VkBufferCopy bc{ src_offset, dst_offset, size };
-
-    vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
-}
-
-static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
-    if (src->ctx == dst->ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl;
-#endif
-        // Copy within the device
-        ggml_backend_vk_context * ctx = src->ctx;
-
-        VkBufferCopy bc{ src_offset, dst_offset, size };
-
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-        ggml_vk_ctx_begin(ctx, subctx);
-        ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
-        ggml_vk_ctx_end(subctx);
-        ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
-    } else {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
-#endif
-        // Copy device to device
-        ggml_backend_vk_context * src_ctx = src->ctx;
-        ggml_backend_vk_context * dst_ctx = dst->ctx;
-
-        ggml_vk_ensure_sync_staging_buffer(src_ctx, size);
-        ggml_vk_ensure_sync_staging_buffer(dst_ctx, size);
-
-        // Copy to src staging buffer
-        ggml_vk_buffer_copy(src_ctx->sync_staging, 0, src, src_offset, size);
-        // memcpy to dst staging buffer
-        memcpy(dst_ctx->sync_staging->ptr, src_ctx->sync_staging->ptr, size);
-        // Copy to dst buffer
-        ggml_vk_buffer_copy(dst, dst_offset, dst_ctx->sync_staging, 0, size);
-    }
-}
-
-static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
-#endif
-    // Make sure ctx owns the buffer
-    GGML_ASSERT(dst->ctx == ctx);
-
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
-    subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
-    ggml_vk_ctx_end(subctx);
-
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-}
-
-static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
-#endif
-    const uint64_t ne0 = src->ne[0];
-    const uint64_t ne1 = src->ne[1];
-    const uint64_t nb0 = src->nb[0];
-    const uint64_t nb1 = src->nb[1];
-    const uint64_t nb2 = src->nb[2];
-    const uint64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const size_t ts = ggml_type_size(type);
-    const size_t bs = ggml_blck_size(type);
-    const size_t row_length = ts*ne0/bs;
-
-    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
-    if (nb0 == ts && nb1 == row_length) {
-        return ggml_vk_buffer_write_async(ctx, subctx, dst, offset, x, i1*nb1);
-    }
-    if (nb0 == ts && (i1 == ne1 || !ggml_is_permuted(src))) {
-        return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, x, nb1, row_length, i1);
-    }
-
-    GGML_ASSERT(i3 == 0);
-    GGML_ASSERT(i2 == 0);
-    GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src));
-
-    return ggml_vk_buffer_write_nc_async(ctx, subctx, dst, offset, src);
-}
-
-static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
-#endif
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-    const uint64_t ne2 = dst->ne[2];
-    const uint64_t ne3 = dst->ne[3];
-    const uint64_t nb0 = dst->nb[0];
-    const uint64_t nb1 = dst->nb[1];
-    // const uint64_t nb2 = dst->nb[2];
-    // const uint64_t nb3 = dst->nb[3];
-    const enum ggml_type type = dst->type;
-    const size_t ts = ggml_type_size(type);
-    const size_t bs = ggml_blck_size(type);
-    const size_t row_length = ts*ne0/bs;
-
-    if (ggml_is_contiguous(dst)) {
-        return ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst->data, ne1*nb1*ne2*ne3);
-    }
-    if (nb0 == ts) {
-        return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3);
-    }
-    GGML_ASSERT(false);
-}
-
-static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")";
-#endif
-    if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " = 4" << std::endl;
-#endif
-        return 4;
-    }
-
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " = 1" << std::endl;
-#endif
-    return 1;
-}
-
-static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, int m, int n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
-#endif
-    if (m <= 32 || n <= 32) {
-        return ctx->pipeline_matmul_f32_aligned_s.align;
-    }
-    if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
-        return ctx->pipeline_matmul_f32_aligned_m.align;
-    }
-    return ctx->pipeline_matmul_f32_aligned_l.align;
-}
-
-static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
-    if (bit16_x && bit16_y) {
-        if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
-        }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
-    }
-    if (bit16_x && !bit16_y) {
-        if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
-        }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
-
-    if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
-}
-
-static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-    if (bit16_x && bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
-    }
-    if (bit16_x && !bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
-}
-
-static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-    if (bit16_x && bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
-    }
-    if (bit16_x && !bit16_y) {
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
-}
-
-static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
-#endif
-    switch (ctx->device.lock()->vendor_id) {
-    case VK_VENDOR_ID_AMD:
-        return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
-    case VK_VENDOR_ID_APPLE:
-        return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
-    case VK_VENDOR_ID_INTEL:
-        return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
-    }
-
-    if (bit16_x && bit16_y) {
-        if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
-        }
-        if (m <= 64 || n <= 64) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
-        }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " L" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
-    }
-    if (bit16_x && !bit16_y) {
-        if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
-        }
-        if (m <= 64 || n <= 64) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
-        }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " L" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_l : &ctx->pipeline_matmul_f16_f32_l;
-    }
-    if (!bit16_x && bit16_y) {
-        GGML_ASSERT(false);
-    }
-
-    if (m <= 32 || n <= 32) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " S" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
-    }
-    if (m <= 64 || n <= 64) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " M" << std::endl;
-#endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << " L" << std::endl;
-#endif
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_l : &ctx->pipeline_matmul_f32_l;
-}
-
-static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
-#endif
-    ggml_vk_sync_buffers(subctx);
-    if (split_k == 1) {
-        const std::array<uint32_t, 14> pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch });
-        return;
-    }
-
-    GGML_ASSERT(batch_stride_d == m * n);
-
-    const std::array<uint32_t, 14> pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
-    // Make sure enough workgroups get assigned for split k to work
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch });
-    ggml_vk_sync_buffers(subctx);
-    const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
-}
-
-static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return &ctx->pipeline_cpy_f32_f32;
-    }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return &ctx->pipeline_cpy_f32_f16;
-    }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return &ctx->pipeline_cpy_f16_f16;
-    }
-
-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
-    GGML_ASSERT(false);
-}
-
-static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
-    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
-#endif
-    const int tensor_type_size = ggml_type_size(tensor->type);
-    const int dst_type_size = ggml_type_size(buffer_type);
-
-    const uint32_t ne = tensor->ne[0] * tensor->ne[1] * tensor->ne[2];
-
-    const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1];
-
-    const vk_op_cpy_push_constants pc = {
-        (uint32_t)ne,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1],                       1                   , (uint32_t)tensor->ne[0]                   , nb2,
-        0,
-    };
-    ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 });
-}
-
-static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
-    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
-    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t ne10 = src1->ne[0];
-    const uint64_t ne11 = src1->ne[1];
-    const uint64_t ne12 = src1->ne[2];
-    const uint64_t ne13 = src1->ne[3];
-
-    const uint64_t ne20 = dst->ne[0];
-    const uint64_t ne21 = dst->ne[1];
-
-    const uint64_t r2 = ne12 / ne02;
-    const uint64_t r3 = ne13 / ne03;
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-
-    vk_buffer d_Qx;
-    size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
-    size_t qy_buf_offset = 0;
-
-    bool src0_uma = false;
-    bool src1_uma = false;
-
-    if (ctx->device.lock()->uma) {
-        ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
-        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
-        src0_uma = d_Qx != nullptr;
-        src1_uma = d_Qy != nullptr;
-    }
-
-    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
-
-    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
-
-    const bool qx_needs_dequant = src0->type != GGML_TYPE_F16 || x_non_contig;
-    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
-
-    // Not implemented
-    GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
-
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-
-    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, ne01, ne11));
-    const bool aligned = ne10 == kpad;
-
-    const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
-
-    vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned);
-
-    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
-    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
-    const uint64_t x_sz = sizeof(ggml_fp16_t) * x_ne;
-    const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
-    const uint64_t d_sz = sizeof(float) * d_ne;
-
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
-    GGML_ASSERT(d_D != nullptr);
-    GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
-    vk_buffer d_X;
-    uint64_t x_buf_offset = 0;
-    vk_buffer d_Y;
-    uint64_t y_buf_offset = 0;
-    if (load_x) {
-        d_Qx = ctx->prealloc_qx;
-    } else if (!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
-        GGML_ASSERT(d_Qx != nullptr);
-    }
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
-        GGML_ASSERT(d_Qy != nullptr);
-    }
-    if (qx_needs_dequant) {
-        d_X = ctx->prealloc_x;
-        GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
-    } else {
-        d_X = d_Qx;
-        x_buf_offset = qx_buf_offset;
-        GGML_ASSERT(qx_sz == x_sz);  // NOLINT
-    }
-    if (qy_needs_dequant) {
-        d_Y = ctx->prealloc_y;
-        GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03);
-    } else {
-        d_Y = d_Qy;
-        y_buf_offset = qy_buf_offset;
-        GGML_ASSERT(qy_sz == y_sz);
-    }
-
-    vk_pipeline * to_fp16_vk_0 = nullptr;
-    vk_pipeline * to_fp16_vk_1 = nullptr;
-
-    if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
-    } else {
-        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
-    }
-    if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
-    } else {
-        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
-    }
-    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
-    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
-
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne12 * ne13);
-    if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, x_non_contig ? 1 : ne12 * ne13);
-    }
-    if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
-    }
-    if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, ne12 * ne13);
-    }
-
-    if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type, false);
-    } else if (load_x || qx_needs_dequant) {
-        if (load_x) {
-            // copy data to device
-            ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
-            ctx->staging_offset = qx_sz * ne02 * ne03;
-        }
-
-        if (qx_needs_dequant) {
-            const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
-        }
-    }
-    if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, dst->type);
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
-    uint32_t stride_batch_x = ne00*ne01;
-    uint32_t stride_batch_y = ne10*ne11;
-
-    if (!ggml_vk_dim01_contiguous(src0) && !load_x && !qx_needs_dequant) {
-        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
-    }
-
-    if (!ggml_vk_dim01_contiguous(src1) && !load_y && !qy_needs_dequant) {
-        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
-    }
-
-    // compute
-    ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21);  // NOLINT
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) ((char *) dst->data);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
-    }
-}
-
-static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",  backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",  backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",  backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
-    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
-    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t ne10 = src1->ne[0];
-    const uint64_t ne11 = src1->ne[1];
-    const uint64_t ne12 = src1->ne[2];
-    const uint64_t ne13 = src1->ne[3];
-
-    GGML_ASSERT(ne11 == 1);
-
-    const uint64_t nb2  = dst->nb[2];
-    const uint64_t nb3  = dst->nb[3];
-
-    const uint64_t r2 = ne12 / ne02;
-    const uint64_t r3 = ne13 / ne03;
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-
-    vk_buffer d_Qx;
-    size_t qx_buf_offset = 0;
-    vk_buffer d_Qy;
-    size_t qy_buf_offset = 0;
-
-    bool src0_uma = false;
-    bool src1_uma = false;
-
-    if (ctx->device.lock()->uma) {
-        ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
-        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
-        src0_uma = d_Qx != nullptr;
-        src1_uma = d_Qy != nullptr;
-    }
-
-    const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
-    const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
-
-    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
-
-    const bool qx_needs_dequant = x_non_contig;
-    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
-
-    const uint64_t x_ne = ne01 * ne00;
-    const uint64_t y_ne = ne11 * ne10;
-    const uint64_t d_ne = ne11 * ne01;
-
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
-    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
-    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
-    const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
-    const uint64_t d_sz = sizeof(float) * d_ne;
-
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
-    GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_X;
-    uint64_t x_buf_offset = 0;
-    vk_buffer d_Y;
-    uint64_t y_buf_offset = 0;
-    if (load_x) {
-        d_Qx = ctx->prealloc_qx;
-    } else if(!src1_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
-        GGML_ASSERT(d_Qx != nullptr);
-    }
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if(!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
-        GGML_ASSERT(d_Qy != nullptr);
-    }
-    if (qx_needs_dequant) {
-        d_X = ctx->prealloc_x;
-    } else {
-        d_X = d_Qx;
-        x_buf_offset = qx_buf_offset;
-        GGML_ASSERT(qx_sz == x_sz);
-    }
-    if (qy_needs_dequant) {
-        d_Y = ctx->prealloc_y;
-    } else {
-        d_Y = d_Qy;
-        y_buf_offset = qy_buf_offset;
-        GGML_ASSERT(qy_sz == y_sz);
-    }
-
-    vk_pipeline * to_fp16_vk_0 = nullptr;
-    vk_pipeline* to_fp16_vk_1 = nullptr;
-    if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
-    }
-    if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
-    } else {
-        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
-    }
-    vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type);
-    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
-    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
-    GGML_ASSERT(dmmv != nullptr);
-
-    // Allocate descriptor sets
-    if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1);
-    }
-    if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
-    }
-    ggml_pipeline_allocate_descriptor_sets(ctx, *dmmv, ne12 * ne13);
-
-    if (x_non_contig) {
-        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment));
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type);
-    } else if (load_x) {
-        // copy data to device
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
-    }
-    if (y_non_contig) {
-        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, src1->type);
-    } else if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
-    for (uint64_t i13 = 0; i13 < ne13; i13++) {
-        const uint64_t i03 = i13 / r3;
-        for (uint64_t i12 = 0; i12 < ne12; i12++) {
-            const uint64_t i02 = i12 / r2;
-
-            const uint64_t it_idx0 = (i03 * ne02 + i02);
-            const uint64_t it_idx1 = (i13 * ne12 + i12);
-            const uint64_t x_offset = x_buf_offset + x_sz * it_idx0;
-            const uint64_t qy_offset = qy_buf_offset + qy_sz * it_idx1;
-            const uint64_t y_offset = y_buf_offset + y_sz * it_idx1;
-            const uint64_t d_offset = d_buf_offset + d_sz * it_idx1;
-
-            const uint64_t y_buffer_offset = (y_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-            const uint64_t y_shader_offset = y_offset - y_buffer_offset;
-
-            const uint64_t d_buffer_offset = (d_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-            const uint64_t d_shader_offset = d_offset - d_buffer_offset;
-
-            if (!y_non_contig && qy_needs_dequant) {
-                const std::vector<int> pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 };
-                ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1});
-            }
-
-            // compute
-            const std::array<int, 3> pc = { (int)ne00, (int)(y_shader_offset / ggml_type_size(src1->type)), (int)(d_shader_offset / ggml_type_size(dst->type))};
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
-
-            if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                // copy dst to host
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                ggml_vk_sync_buffers(subctx);
-                ggml_vk_buffer_read_async(ctx, subctx, d_D, d_offset, d, sizeof(float) * d_ne);
-            }
-        }
-    }
-}
-
-static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",  backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",  backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",  backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]);  // NOLINT
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]);  // NOLINT
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    // const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t ne10 = src1->ne[0];
-    const uint64_t ne11 = src1->ne[1];
-    const uint64_t ne12 = src1->ne[2];
-    // const uint64_t ne13 = src1->ne[3];
-
-    GGML_ASSERT(ne11 == 1);
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-
-    vk_buffer d_Qy;
-    size_t qy_buf_offset = 0;
-
-    bool src1_uma = false;
-
-    if (ctx->device.lock()->uma) {
-        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
-        src1_uma = d_Qy != nullptr;
-    }
-
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const uint64_t x_ne = ne00 * ne01 * ne02;
-    const uint64_t y_ne = ne10 * ne11 * ne12;
-    const uint64_t d_ne = ne01 * ne11 * ne12;
-
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
-    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
-    const uint64_t d_sz = sizeof(float) * d_ne;
-
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
-    GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
-    GGML_ASSERT(d_Qx != nullptr);
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
-        GGML_ASSERT(d_Qx != nullptr);
-    }
-
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, 1);
-
-    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-    const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
-
-    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
-
-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
-    // compute
-    const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
-    ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
-}
-
-static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",  backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",  backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",  backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
-#endif
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    // const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t nb01 = src0->nb[1];
-    const uint64_t nb02 = src0->nb[2];
-
-    // const uint64_t ne10 = src1->ne[0];
-    const uint64_t ne11 = src1->ne[1];
-    const uint64_t ne12 = src1->ne[2];
-    // const uint64_t ne13 = src1->ne[3];
-
-    GGML_ASSERT(ne11 == 1);
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-
-    vk_buffer d_Qy = nullptr;
-    size_t qy_buf_offset = 0;
-
-    bool src1_uma = false;
-
-    if (ctx->device.lock()->uma) {
-        ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
-        src1_uma = d_Qy != nullptr;
-    }
-
-    const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    const uint64_t d_ne = ne01 * ne11 * ne12;
-
-    const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
-    const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
-
-    const uint64_t qx_sz = ggml_nbytes(src0);
-    const uint64_t qy_sz = ggml_nbytes(src1);
-    const uint64_t d_sz = sizeof(float) * d_ne;
-
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
-    GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
-    GGML_ASSERT(d_Qx != nullptr);
-    if (load_y) {
-        d_Qy = ctx->prealloc_qy;
-    } else {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
-        GGML_ASSERT(d_Qx != nullptr);
-    }
-
-    // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, 1);
-
-    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-    const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
-
-    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
-
-    if (load_y) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
-    }
-
-    // compute
-    const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
-    ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
-}
-
-static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
-    const uint64_t ne10 = src1->ne[0];
-
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-           (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
-           dst->type == GGML_TYPE_F32 &&
-           ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
-}
-
-static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
-#endif
-    if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst);
-    } else if (src1->ne[1] == 1 && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
-        ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst);
-    } else {
-        ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst);
-    }
-}
-
-static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-    const uint64_t ne2 = dst->ne[2];
-    const uint64_t ne3 = dst->ne[3];
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t nb0 = dst->nb[0];
-    const uint64_t nb1 = dst->nb[1];
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];
-
-    const uint64_t nb00 = src0->nb[0];
-    const uint64_t nb01 = src0->nb[1];
-    const uint64_t nb02 = src0->nb[2];
-    const uint64_t nb03 = src0->nb[3];
-
-    const uint64_t nr0 = ne0/ne00;
-    const uint64_t nr1 = ne1/ne01;
-    const uint64_t nr2 = ne2/ne02;
-    const uint64_t nr3 = ne3/ne03;
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-
-    const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
-    vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
-
-    std::vector<vk::BufferCopy> copies;
-
-    for                         (uint64_t i3 = 0; i3 < nr3;  i3++) {
-        for                     (uint64_t k3 = 0; k3 < ne03; k3++) {
-            for                 (uint64_t i2 = 0; i2 < nr2;  i2++) {
-                for             (uint64_t k2 = 0; k2 < ne02; k2++) {
-                    for         (uint64_t i1 = 0; i1 < nr1;  i1++) {
-                        for     (uint64_t k1 = 0; k1 < ne01; k1++) {
-                            for (uint64_t i0 = 0; i0 < nr0;  i0++) {
-                                copies.push_back({
-                                    src_offset + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0,
-                                    dst_offset + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
-                                    ne00*nb0,
-                                });
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    ggml_vk_sync_buffers(subctx);
-    subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(src1);
-}
-
-
-static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
-    switch (op) {
-    case GGML_OP_ADD:
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_add_f32;
-        }
-        return nullptr;
-    case GGML_OP_GET_ROWS:
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        if (dst->type == GGML_TYPE_F16) {
-            return &ctx->pipeline_get_rows[src0->type];
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_get_rows_f32[src0->type];
-        }
-        return nullptr;
-    case GGML_OP_MUL:
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_mul_f32;
-        }
-        return nullptr;
-    case GGML_OP_SCALE:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_scale_f32;
-        }
-        return nullptr;
-    case GGML_OP_SQR:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_sqr_f32;
-        }
-        return nullptr;
-    case GGML_OP_CLAMP:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_clamp_f32;
-        }
-        return nullptr;
-    case GGML_OP_CPY:
-    case GGML_OP_CONT:
-    case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
-    case GGML_OP_NORM:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_norm_f32;
-        }
-        return nullptr;
-    case GGML_OP_RMS_NORM:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_rms_norm_f32;
-        }
-        return nullptr;
-    case GGML_OP_UNARY:
-        switch (ggml_get_unary_op(dst)) {
-            case GGML_UNARY_OP_SILU:
-                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_silu_f32;
-                }
-                break;
-            case GGML_UNARY_OP_GELU:
-                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_gelu_f32;
-                }
-                break;
-            case GGML_UNARY_OP_RELU:
-                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_relu_f32;
-                }
-                break;
-            default:
-                break;
-        }
-        return nullptr;
-    case GGML_OP_DIAG_MASK_INF:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_diag_mask_inf_f32;
-        }
-        return nullptr;
-    case GGML_OP_SOFT_MAX:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_soft_max_f32;
-        }
-        return nullptr;
-    case GGML_OP_ROPE:
-        {
-            const int mode = ((const int32_t *) dst->op_params)[2];
-            const bool is_neox = mode & 2;
-            const bool is_glm  = mode & 4;
-
-            if (is_glm) {
-                return nullptr;
-            }
-
-            if (is_neox) {
-                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_rope_neox_f32;
-                }
-                if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                    return &ctx->pipeline_rope_neox_f16;
-                }
-            } else {
-                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_rope_f32;
-                }
-                if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                    return &ctx->pipeline_rope_f16;
-                }
-            }
-            return nullptr;
-        }
-    default:
-        return nullptr;
-    }
-}
-
-static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
-    switch(op) {
-    case GGML_OP_REPEAT:
-        return ggml_vk_op_repeat;
-    default:
-        return nullptr;
-    }
-}
-
-template<typename PC>
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    if (src1 != nullptr) {
-        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
-#endif
-    GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)));  // NOLINT
-    GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0));  // NOLINT
-    GGML_ASSERT(src1 == nullptr || ggml_vk_dim01_contiguous(src1));  // NOLINT
-    GGML_ASSERT(dst->extra != nullptr);
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    const uint64_t ne03 = src0->ne[3];
-    const uint64_t ne0 = ne00 * ne01;
-    const bool use_src1 = src1 != nullptr;
-    const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
-    const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
-    const uint64_t ne12 = use_src1 ? src1->ne[2] : 0;
-    const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
-    const uint64_t ne1 = ne10 * ne11;
-    // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
-    const uint64_t nb2  = dst->nb[2];
-    const uint64_t nb3  = dst->nb[3];
-
-    vk_pipeline * pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
-    ggml_vk_func_t op_func;
-
-    if (pipeline == nullptr) {
-        op_func = ggml_vk_op_get_func(op);
-        if (op_func == nullptr) {
-            std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
-            if (src1 != nullptr) {
-                std::cerr << " and " << ggml_type_name(src1->type);
-            }
-            std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
-            GGML_ASSERT(false);
-        }
-
-        op_func(ctx, subctx, src0, src1, dst);
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-
-    vk_buffer d_X = nullptr;
-    size_t x_buf_offset = 0;
-    vk_buffer d_Y = nullptr;
-    size_t y_buf_offset = 0;
-
-    bool src0_uma = false;
-    bool src1_uma = false;
-
-    if (ctx->device.lock()->uma) {
-        ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
-        src0_uma = d_X != nullptr;
-        if (use_src1) {
-            ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
-            src1_uma = d_Y != nullptr;
-        }
-    }
-
-    const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
-    const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
-
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
-    uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
-    uint64_t d_sz = ggml_type_size(dst->type) * ne0;
-
-    vk_buffer d_D = extra->buffer_gpu.lock();
-
-    // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
-        y_sz = VK_WHOLE_SIZE;
-    }
-
-    GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-    GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY);  // NOLINT
-    if (transfer_src0) {
-        d_X = ctx->prealloc_qx;
-    } else if(!src0_uma) {
-        d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
-        GGML_ASSERT(d_X != nullptr);
-    }
-    if (transfer_src1) {
-        d_Y = ctx->prealloc_qy;
-    } else if (use_src1 && !src1_uma) {
-        d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
-        GGML_ASSERT(d_Y != nullptr);
-    }
-
-    if (op == GGML_OP_CPY) {
-        GGML_ASSERT(!transfer_src0);
-        GGML_ASSERT(!transfer_src1);
-        x_sz = ggml_nbytes(src0);
-        d_sz = ggml_nbytes(dst);
-
-        if (extra_src0->offset + x_sz >= d_X->size) {
-            x_sz = VK_WHOLE_SIZE;
-        }
-        if (extra->offset + d_sz >= d_D->size) {
-            d_sz = VK_WHOLE_SIZE;
-        }
-    }
-
-    std::array<uint32_t, 3> elements;
-
-    // copy src0 to device
-    if (transfer_src0) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
-        ctx->staging_offset = x_sz * ne02 * ne03;
-    }
-    if (transfer_src1) {
-        ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
-    }
-
-    // Single call if dimension 2 is contiguous
-    if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1);
-
-        switch (dst->op) {
-        case GGML_OP_NORM:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
-            elements = { (uint32_t)ggml_nrows(src0), 1, 1 };
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_ROPE:
-            elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
-            break;
-        default:
-            elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
-            break;
-        }
-
-        if (op != GGML_OP_CPY) {
-            if (x_sz != VK_WHOLE_SIZE) {
-                x_sz *= ne02 * ne03;
-            }
-            if (y_sz != VK_WHOLE_SIZE) {
-                y_sz *= ne12 * ne13;
-            }
-            if (d_sz != VK_WHOLE_SIZE) {
-                d_sz *= ne02 * ne03;
-            }
-        }
-
-        if (!use_src1 && op == GGML_OP_SOFT_MAX) {
-            // Empty src1 is possible on soft_max, but the shader needs a buffer
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-        } else if (use_src1) {
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-        } else {
-            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-        }
-        if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
-            ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-        } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
-            // copy dst to host
-            float * d = (float *) dst->data;
-            ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
-        }
-    } else {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne02 * ne03);
-
-        switch (dst->op) {
-        case GGML_OP_NORM:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
-            elements = { (uint32_t)ne01, 1, 1 };
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_ROPE:
-            elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
-            break;
-        default:
-            elements = { (uint32_t)ne0, 1, 1 };
-            break;
-        }
-
-        for (uint64_t i03 = 0; i03 < ne03; i03++) {
-            for (uint64_t i02 = 0; i02 < ne02; i02++) {
-                const uint32_t it_idx0 = (i03 * ne02 + i02);
-                const uint32_t it_idx1 = use_src1 ? ((i03 % ne13) * ne12 + (i02 % ne12)) : 0;
-                const uint32_t x_offset = x_sz * it_idx0;
-                const uint32_t y_offset = y_sz * it_idx1;
-                const uint32_t d_offset = d_sz * it_idx0;
-
-                if (!use_src1 && op == GGML_OP_SOFT_MAX) {
-                    // Empty src1 is possible on soft_max, but the shader needs a buffer
-                    ggml_vk_sync_buffers(subctx);
-                    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
-                } else if (use_src1) {
-                    ggml_vk_sync_buffers(subctx);
-                    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
-                } else {
-                    ggml_vk_sync_buffers(subctx);
-                    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
-                }
-                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                    // copy dst to host
-                    ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
-                }
-            }
-        }
-    }
-}
-
-static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
-}
-
-static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
-}
-
-static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
-}
-
-static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
-}
-
-static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
-}
-
-static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
-}
-
-static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, op_params[0], op_params[1] });
-}
-
-static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    const int src0_type_size = ggml_type_size(src0->type);
-    const int dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
-    ggml_vk_op_f32<vk_op_cpy_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size,
-        d_offset,
-    });
-}
-
-static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f });
-}
-
-static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
-}
-
-static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
-}
-
-static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    int32_t * op_params = (int32_t *)dst->op_params;
-    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
-}
-
-static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? ggml_nrows(src1) : 0), op_params[0], 0.0f });
-}
-
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int n_dims        = ((int32_t *) dst->op_params)[1];
-    const int mode          = ((int32_t *) dst->op_params)[2];
-    // const int n_ctx         = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx    = ((int32_t *) dst->op_params)[4];
-    const float freq_base   = ((float *)   dst->op_params)[5];
-    const float freq_scale  = ((float *)   dst->op_params)[6];
-    const float ext_factor  = ((float *)   dst->op_params)[7];
-    const float attn_factor = ((float *)   dst->op_params)[8];
-    const float beta_fast   = ((float *)   dst->op_params)[9];
-    const float beta_slow   = ((float *)   dst->op_params)[10];
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    GGML_ASSERT(!is_glm);
-
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    if (is_neox) {
-        const float theta_scale = powf(freq_base, -2.0f/n_dims);
-        const float inv_ndims = -1.0f / n_dims;
-        ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
-    } else {
-        ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
-    }
-}
-
-static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    // If backend is CPU, data from src0 has to be copied off the device
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-        vk_buffer d_D = extra_src0->buffer_gpu.lock();
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
-    }
-}
-
-#ifdef GGML_VULKAN_RUN_TESTS
-static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
-    if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
-        return;
-    }
-    i0 = std::max(i0, 5);
-    i1 = std::max(i1, 5);
-    i2 = std::max(i2, 0);
-    fprintf(stderr, "         ");
-    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
-        fprintf(stderr, "%7d ", idx1);
-    }
-    fprintf(stderr, "\n");
-    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
-        fprintf(stderr, "%7d: ", idx0);
-        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
-            if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) {
-                float val;
-                if (type == GGML_TYPE_F32) {
-                    val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
-                } else if (type == GGML_TYPE_F16) {
-                    val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
-                }
-                fprintf(stderr, "% 7.2f ", val);
-            } else {
-                fprintf(stderr, "        ");
-            }
-        }
-        fprintf(stderr, "\n");
-    }
-}
-
-template <typename X_TYPE, typename Y_TYPE>
-static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl;
-#endif
-    const size_t x_ne = m * k * batch;
-    const size_t y_ne = k * n * batch;
-    const size_t d_ne = m * n * batch;
-
-    vk_pipeline * p;
-    std::string shname;
-    if (shader_size == 0) {
-        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f32_aligned_s;
-            shname = "F32_ALIGNED_S";
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_f32_aligned_s;
-            shname = "F16_F32_ALIGNED_S";
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_aligned_s;
-            shname = "F16_ALIGNED_S";
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else if (shader_size == 1) {
-        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f32_aligned_m;
-            shname = "F32_ALIGNED_M";
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_f32_aligned_m;
-            shname = "F16_F32_ALIGNED_M";
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_aligned_m;
-            shname = "F16_ALIGNED_M";
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else if (shader_size == 2) {
-        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f32_aligned_l;
-            shname = "F32_ALIGNED_L";
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_f32_aligned_l;
-            shname = "F16_F32_ALIGNED_L";
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_aligned_l;
-            shname = "F16_ALIGNED_L";
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        GGML_ASSERT(0);
-    }
-
-    const size_t kpad = ggml_vk_align_size(k, p->align);
-
-    if (k != kpad) {
-        if (shader_size == 0) {
-            if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f32_s;
-                shname = "F32_S";
-            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_f32_s;
-                shname = "F16_F32_S";
-            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_s;
-                shname = "F16_S";
-            }
-        } else if (shader_size == 1) {
-            if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f32_m;
-                shname = "F32_M";
-            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_f32_m;
-                shname = "F16_F32_M";
-            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_m;
-                shname = "F16_M";
-            }
-        } else if (shader_size == 2) {
-            if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f32_l;
-                shname = "F32_L";
-            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_f32_l;
-                shname = "F16_F32_L";
-            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_l;
-                shname = "F16_L";
-            }
-        }
-    }
-
-    ggml_pipeline_allocate_descriptor_sets(ctx, *p, num_it);
-    if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, num_it);
-
-        if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
-            // Resize buffer
-            if (ctx->prealloc_split_k != nullptr) {
-                ggml_vk_destroy_buffer(ctx->prealloc_split_k);
-            }
-            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
-        }
-    }
-
-    vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-
-    X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
-    Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
-    float* d = (float *) malloc(sizeof(float) * d_ne);
-
-    for (size_t i = 0; i < x_ne; i++) {
-        if (std::is_same<float, X_TYPE>()) {
-            x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
-        } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
-            x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-    for (size_t i = 0; i < y_ne; i++) {
-        if (std::is_same<float, Y_TYPE>()) {
-            y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
-        } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
-    ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
-
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
-    for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx, subctx);
-        ggml_vk_matmul(ctx, subctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n);
-        ggml_vk_ctx_end(subctx);
-    }
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-
-    auto end = std::chrono::high_resolution_clock::now();
-    double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-
-    // copy dst to host
-    ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);
-
-    float * d_chk = (float *) malloc(sizeof(float) * d_ne);
-
-    ggml_init_params iparams = {
-        /*.mem_size   =*/ 1024*1024*1024,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-
-    ggml_context * ggml_ctx = ggml_init(iparams);
-
-    ggml_type src0_type;
-    ggml_type src1_type;
-
-    if (std::is_same<float, X_TYPE>()) {
-        src0_type = GGML_TYPE_F32;
-    } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
-        src0_type = GGML_TYPE_F16;
-    } else {
-        GGML_ASSERT(false);
-    }
-    if (std::is_same<float, Y_TYPE>()) {
-        src1_type = GGML_TYPE_F32;
-    } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
-        src1_type = GGML_TYPE_F16;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
-    ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch);
-    ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);
-
-    src0_ggml->data = x;
-    src1_ggml->data = y;
-    tensor_ggml->data = d_chk;
-
-    ctx->disable = true;
-
-    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
-    ggml_build_forward_expand(cgraph, tensor_ggml);
-
-    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
-
-    ctx->disable = false;
-
-    ggml_free(ggml_ctx);
-
-    double avg_err = 0.0;
-    int first_err_n = -1;
-    int first_err_m = -1;
-    int first_err_b = -1;
-
-    for (size_t i = 0; i < m*n*batch; i++) {
-        double err = std::fabs(d[i] - d_chk[i]);
-        avg_err += err;
-
-        if (err > 0.05f && first_err_n == -1) {
-            first_err_b = i / (m * n);
-            first_err_n = (i % (m * n)) / m;
-            first_err_m = (i % (m * n)) % m;
-        }
-    }
-
-    avg_err /= m * n;
-
-    std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl;
-
-    if (avg_err > 0.1) {
-        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
-        std::cerr << "Actual result: " << std::endl << std::endl;
-        ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
-        std::cerr << "Expected result: " << std::endl << std::endl;
-        ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
-
-        if (split_k > 1) {
-            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
-
-            std::cerr << "d_buf0: " << std::endl << std::endl;
-            ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
-
-            std::cerr << "d_buf1: " << std::endl << std::endl;
-            ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
-
-            std::cerr << "d_buf2: " << std::endl << std::endl;
-            ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
-
-            std::cerr << "d_buf3: " << std::endl << std::endl;
-            ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
-
-            free(split_k_buf);
-        }
-    }
-
-    free(d_chk);
-
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue);
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue);
-
-    ggml_vk_destroy_buffer(d_X);
-    ggml_vk_destroy_buffer(d_Y);
-    ggml_vk_destroy_buffer(d_D);
-
-    ggml_pipeline_cleanup(*p);
-    ggml_pipeline_cleanup(ctx->pipeline_matmul_split_k_reduce);
-
-    free(x);
-    free(y);
-    free(d);
-}
-
-static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
-        return;
-    }
-    i0 = std::max(i0, 5);
-    i1 = std::max(i1, 5);
-    i2 = std::max(i2, 0);
-    i3 = std::max(i3, 0);
-    fprintf(stderr, "         ");
-    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
-        fprintf(stderr, "%7d ", idx1);
-    }
-    fprintf(stderr, "\n");
-    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
-        fprintf(stderr, "%7d: ", idx0);
-        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
-            if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
-                float val;
-                if (tensor->type == GGML_TYPE_F32) {
-                    val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
-                } else if (tensor->type == GGML_TYPE_F16) {
-                    val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
-                }
-                fprintf(stderr, "% 7.2f ", val);
-            } else {
-                fprintf(stderr, "        ");
-            }
-        }
-        fprintf(stderr, "\n");
-    }
-}
-
-static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_t ne1, size_t ne2, size_t ne3) {
-    const size_t ne = ne0 * ne1 * ne2 * ne3;
-
-    ggml_init_params iparams = {
-        /*.mem_size   =*/ 1024*1024*1024,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-
-    ggml_context * ggml_ctx = ggml_init(iparams);
-
-    ggml_tensor * tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne2, ne1, ne3);  // NOLINT
-    ggml_tensor * result_tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne1, ne2, ne3);
-
-    float * data = (float *) ggml_vk_host_malloc(ctx, ggml_nbytes(tensor));
-    tensor->data = data;
-
-    float * result_data = (float *) malloc(ggml_nbytes(tensor));
-    result_tensor->data = result_data;
-
-    // Permute
-    {
-        size_t tmp = tensor->nb[2];
-        tensor->nb[2] = tensor->nb[1];
-        tensor->nb[1] = tmp;
-
-        tensor->ne[2] = ne2;
-        tensor->ne[1] = ne1;
-    }
-
-    for (size_t i = 0; i < ne; i++) {
-        data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
-    }
-
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
-
-    vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
-
-    ggml_vk_h2d_tensor_2d(ctx, subctx, buffer, 0, tensor, 0, 0, ggml_nrows(tensor));
-
-    ggml_vk_ctx_end(subctx);
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-
-    ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor));
-
-    double avg_err = 0.0;
-    int first_err_i0 = -1;
-    int first_err_i1 = -1;
-    int first_err_i2 = -1;
-    int first_err_i3 = -1;
-
-    for (size_t i3 = 0; i3 < ne3; i3++) {
-        for (size_t i2 = 0; i2 < ne2; i2++) {
-            for (size_t i1 = 0; i1 < ne1; i1++) {
-                for (size_t i0 = 0; i0 < ne0; i0++) {
-                    float correct = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
-                    float result = *(float *) ((char *) result_data + i3*ne2*ne1*ne0*sizeof(float) + i2*ne1*ne0*sizeof(float) + i1*ne0*sizeof(float) + i0*sizeof(float));
-                    double err = std::fabs(result - correct);
-
-                    avg_err += err;
-
-                    if (err > 0.05f && first_err_i0 == -1) {
-                        first_err_i0 = i0;
-                        first_err_i1 = i1;
-                        first_err_i2 = i2;
-                        first_err_i3 = i3;
-                    }
-                }
-            }
-        }
-    }
-
-    avg_err /= ne;
-
-    std::cerr << "TEST nc copy ne0=" << ne0 << " ne1=" << ne1 << " ne2=" << ne2 << " ne3=" << ne3 << " avg_err=" << avg_err << std::endl;
-
-    if (avg_err > 0.1) {
-        std::cerr << "i0 = " << first_err_i0 << " i1 = " << first_err_i1 << " i2 = " << first_err_i2 << " i3 = " << first_err_i3 << std::endl;
-        std::cerr << "Actual result: " << std::endl << std::endl;
-        ggml_vk_print_tensor_area(result_tensor, first_err_i0, first_err_i1, first_err_i2, first_err_i3);
-        std::cerr << "Expected result: " << std::endl << std::endl;
-        ggml_vk_print_tensor_area(tensor, first_err_i0, first_err_i1, first_err_i2, first_err_i3);
-    }
-
-    ggml_free(ggml_ctx);
-
-    ggml_vk_destroy_buffer(buffer);
-
-    ggml_vk_host_free(ctx, data);
-    free(result_data);
-}
-
-static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
-#endif
-    // Check transfers are correct
-    vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-
-    float * x;
-    float * y;
-    if (pinned) {
-        x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
-        y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
-    } else {
-        x = (float *) malloc(sizeof(float) * ne);
-        y = (float *) malloc(sizeof(float) * ne);
-    }
-
-    for (size_t i = 0; i < ne; i++) {
-        x[i] = rand() / (float)RAND_MAX;
-    }
-
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
-
-    auto begin = std::chrono::high_resolution_clock::now();
-
-    ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
-
-    for (auto& cpy : subctx->in_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
-    subctx->in_memcpys.clear();
-
-    ggml_vk_ctx_end(subctx);
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-
-    auto end = std::chrono::high_resolution_clock::now();
-
-    double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-
-    ggml_vk_ctx_begin(ctx, subctx);
-
-    begin = std::chrono::high_resolution_clock::now();
-
-    ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
-
-    ggml_vk_ctx_end(subctx);
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-
-    for (auto& cpy : subctx->out_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
-    subctx->out_memcpys.clear();
-
-    end = std::chrono::high_resolution_clock::now();
-
-    double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-
-    double avg_err = 0.0;
-    for (size_t i = 0; i < ne; i++) {
-        avg_err += std::fabs(x[i] - y[i]);
-    }
-
-    double kb = ne * sizeof(float) / 1024.0;
-
-    std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
-
-    ggml_vk_destroy_buffer(buffer);
-
-    if (pinned) {
-        ggml_vk_host_free(ctx, x);
-        ggml_vk_host_free(ctx, y);
-    } else {
-        free(x);
-        free(y);
-    }
-}
-
-static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl;
-#endif
-    const size_t x_sz = sizeof(float) * ne;
-    const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
-    const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
-    float * x = (float *) malloc(x_sz);
-    void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
-
-    for (size_t i = 0; i < ne; i++) {
-        x[i] = rand() / (float)RAND_MAX;
-    }
-
-    std::vector<int64_t> hist_cur(1 << 4, 0);
-
-    vk_pipeline& p = ctx->pipeline_dequant[quant];
-
-    switch(quant) {
-    case GGML_TYPE_Q4_0:
-        ggml_quantize_q4_0(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q4_1:
-        ggml_quantize_q4_1(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q5_0:
-        ggml_quantize_q5_0(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q5_1:
-        ggml_quantize_q4_1(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q8_0:
-        ggml_quantize_q8_0(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q2_K:
-        ggml_quantize_q2_K(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q3_K:
-        ggml_quantize_q3_K(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q4_K:
-        ggml_quantize_q4_K(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q5_K:
-        ggml_quantize_q5_K(x, qx, ne, ne, hist_cur.data());
-        break;
-    case GGML_TYPE_Q6_K:
-        ggml_quantize_q6_K(x, qx, ne, ne, hist_cur.data());
-        break;
-    default:
-        GGML_ASSERT(false);
-    }
-
-    ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
-
-    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
-
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
-    const std::vector<int> pc = { 1, (int)ne, (int)ne, (int)ne };
-    ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
-    ggml_vk_ctx_end(subctx);
-
-    auto begin = std::chrono::high_resolution_clock::now();
-
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-
-    auto end = std::chrono::high_resolution_clock::now();
-
-    double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-    ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);
-
-    double avg_err = 0.0;
-    for (size_t i = 0; i < ne; i++) {
-        avg_err += std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
-    }
-
-    std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err / ne << std::endl;
-
-    ggml_vk_destroy_buffer(x_buf);
-    ggml_vk_destroy_buffer(qx_buf);
-
-    free(x);
-    free(qx);
-    free(x_chk);
-}
-#endif
-
-static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
-#endif
-    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    extra->reset();
-    tensor->extra = extra;
-    return extra;
-}
-
-static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph * graph) {
-    GGML_ASSERT(node != nullptr);
-
-    for (int i = graph->n_nodes - 1; i >= 0; i--) {
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == node) {
-                return graph->nodes[i];
-            }
-        }
-    }
-
-    return nullptr;
-}
-
-static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
-#endif
-    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
-
-    if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-    if (extra == nullptr) {
-        // Workaround for CPU backend BLAS matmul calls
-        extra = ggml_vk_tensor_create_extra(node);
-    }
-
-    ggml_tensor * src0 = node->src[0];
-    ggml_tensor * src1 = node->src[1];
-
-    const bool use_src0 = src0 != nullptr;
-    const int64_t ne00 = use_src0 ? src0->ne[0] : 0;
-    const int64_t ne01 = use_src0 ? src0->ne[1] : 0;
-    const int64_t ne02 = use_src0 ? src0->ne[2] : 0;
-    const int64_t ne03 = use_src0 ? src0->ne[3] : 0;
-    const bool use_src1 = src1 != nullptr && node->op != GGML_OP_CPY && node->op != GGML_OP_CONT && node->op != GGML_OP_DUP;
-    const int64_t ne10 = use_src1 ? src1->ne[0] : 0;
-    const int64_t ne11 = use_src1 ? src1->ne[1] : 0;
-    const int64_t ne12 = use_src1 ? src1->ne[2] : 0;
-    const int64_t ne13 = use_src1 ? src1->ne[3] : 0;
-    const int64_t ne20 = node->ne[0];
-    const int64_t ne21 = node->ne[1];
-    const int64_t ne22 = node->ne[2];
-    const int64_t ne23 = node->ne[3];
-
-    const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
-
-    int split_k;
-    if (node->op == GGML_OP_MUL_MAT) {
-        split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
-    } else {
-        split_k = 1;
-    }
-    const uint32_t x_ne = ne00 * ne01;
-    const uint32_t y_ne = ne10 * ne11;
-    const uint32_t d_ne = ne20 * ne21;
-
-    const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-    const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
-    const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-    const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
-    uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
-    const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
-
-    if (extra->buffer_gpu.expired()) {
-        // Workaround for CPU backend BLAS matmul calls
-        extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz);
-    }
-
-    switch (node->op) {
-    case GGML_OP_REPEAT:
-    case GGML_OP_GET_ROWS:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_ADD:
-    case GGML_OP_SCALE:
-    case GGML_OP_SQR:
-    case GGML_OP_CLAMP:
-    case GGML_OP_CPY:
-    case GGML_OP_CONT:
-    case GGML_OP_DUP:
-    case GGML_OP_MUL:
-    case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
-    case GGML_OP_DIAG_MASK_INF:
-    case GGML_OP_SOFT_MAX:
-    case GGML_OP_ROPE:
-        break;
-    case GGML_OP_UNARY:
-        switch (ggml_get_unary_op(node)) {
-        case GGML_UNARY_OP_SILU:
-        case GGML_UNARY_OP_GELU:
-        case GGML_UNARY_OP_RELU:
-            break;
-        default:
-            return;
-        }
-        break;
-    case GGML_OP_MUL_MAT:
-        if (ctx->prealloc_size_qx < qx_sz) {
-            ctx->prealloc_size_qx = qx_sz;
-        }
-        if (ctx->prealloc_size_qy < qy_sz) {
-            ctx->prealloc_size_qy = qy_sz;
-        }
-        if (ctx->prealloc_size_x < x_sz) {
-            ctx->prealloc_size_x = x_sz;
-        }
-        if (ctx->prealloc_size_y < y_sz) {
-            ctx->prealloc_size_y = y_sz;
-        }
-        if (ctx->prealloc_size_split_k < split_k_size) {
-            ctx->prealloc_size_split_k = split_k_size;
-        }
-        if (ctx->staging_size < x_sz + y_sz) {
-            ctx->staging_size = x_sz + y_sz;
-        }
-        break;
-    default:
-        return;
-    }
-}
-
-static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
-#endif
-#if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-    ggml_vk_test_transfer(ctx, 8192 * 1000, false);
-    ggml_vk_test_transfer(ctx, 8192 * 1000, true);
-
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q6_K);
-
-    const std::vector<size_t> vals {
-        8, 8, 8,
-        100, 46, 576,
-        623, 111, 128,
-        100, 46, 558,
-        512, 1, 256,
-        128, 110, 622,
-        511, 511, 127,
-        511, 511, 7,
-        511, 511, 17,
-        49, 49, 128,
-        128, 49, 49,
-        4096, 49, 4096,
-        11008, 49, 4096,
-        4096, 49, 11008,
-        32000, 49, 4096,
-        512, 512, 128,
-        128, 512, 512,
-        4096, 512, 4096,
-        11008, 512, 4096,
-        4096, 512, 11008,
-        32000, 512, 4096,
-    };
-    const size_t num_it = 1;
-    for (size_t i = 0; i < vals.size(); i += 3) {
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
-        std::cerr << std::endl;
-    }
-
-    GGML_ASSERT(false);
-#endif
-
-    if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
-        // Resize buffer
-        if (ctx->prealloc_qx != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qx);
-        }
-        ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
-    }
-    if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
-        // Resize buffer
-        if (ctx->prealloc_qy != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_qy);
-        }
-        ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
-    }
-    if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
-        // Resize buffer
-        if (ctx->prealloc_x != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_x);
-        }
-        ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x);
-    }
-    if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
-        // Resize buffer
-        if (ctx->prealloc_y != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_y);
-        }
-        ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y);
-    }
-    if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
-        // Resize buffer
-        if (ctx->prealloc_split_k != nullptr) {
-            ggml_vk_destroy_buffer(ctx->prealloc_split_k);
-        }
-        ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k);
-    }
-    if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) {
-        // Resize buffer
-        if (ctx->staging != nullptr) {
-            ggml_vk_destroy_buffer(ctx->staging);
-        }
-        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-    }
-}
-
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-    const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
-        || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
-        return;
-    }
-
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl;
-#endif
-    ctx->semaphore_idx = 0;
-    ctx->staging_offset = 0;
-
-    const ggml_tensor * src0 = node->src[0];
-    const ggml_tensor * src1 = node->src[1];
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
-    switch (node->op) {
-    case GGML_OP_UNARY:
-        switch (ggml_get_unary_op(node)) {
-        case GGML_UNARY_OP_SILU:
-        case GGML_UNARY_OP_GELU:
-        case GGML_UNARY_OP_RELU:
-            break;
-        default:
-            return;
-        }
-        break;
-    case GGML_OP_REPEAT:
-    // case GGML_OP_GET_ROWS:
-    case GGML_OP_ADD:
-    case GGML_OP_MUL:
-    case GGML_OP_SCALE:
-    case GGML_OP_SQR:
-    case GGML_OP_CLAMP:
-    case GGML_OP_CPY:
-    case GGML_OP_CONT:
-    case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
-    case GGML_OP_DIAG_MASK_INF:
-    case GGML_OP_SOFT_MAX:
-    case GGML_OP_ROPE:
-    case GGML_OP_MUL_MAT:
-    case GGML_OP_NONE:
-        break;
-    default:
-        if (any_on_device) {
-            std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
-            GGML_ASSERT(false);
-        }
-        return;
-    }
-
-    if (ctx->compute_ctx == nullptr) {
-        ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
-        ggml_vk_ctx_begin(ctx, ctx->compute_ctx);
-    }
-
-    switch (node->op) {
-    case GGML_OP_REPEAT:
-        ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_GET_ROWS:
-        ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_ADD:
-        ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_MUL:
-        ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_SCALE:
-        ggml_vk_scale(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_SQR:
-        ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_CLAMP:
-        ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_CPY:
-    case GGML_OP_CONT:
-    case GGML_OP_DUP:
-        ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
-        ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_NORM:
-        ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_RMS_NORM:
-        ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_UNARY:
-        switch (ggml_get_unary_op(node)) {
-        case GGML_UNARY_OP_SILU:
-        case GGML_UNARY_OP_GELU:
-        case GGML_UNARY_OP_RELU:
-            ggml_vk_unary(ctx, ctx->compute_ctx, src0, node);
-            break;
-        default:
-            return;
-        }
-        break;
-    case GGML_OP_DIAG_MASK_INF:
-        ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node);
-
-        break;
-    case GGML_OP_SOFT_MAX:
-        ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_ROPE:
-        ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    case GGML_OP_MUL_MAT:
-        ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node);
-
-        break;
-    default:
-        return;
-    }
-
-    extra->ready = true;
-    extra->ctx_idx = ctx->compute_ctx->idx;
-
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    // Force context reset on each node so that each tensor ends up in its own context
-    // and can be run and compared to its CPU equivalent separately
-    last_node = true;
-#endif
-
-    if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
-        ggml_vk_ctx_end(ctx->compute_ctx);
-        ctx->compute_ctx->exit_tensor = node;
-        ctx->compute_ctx = nullptr;
-    }
-}
-
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
-
-    if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) {
-        return false;
-    }
-
-    ggml_tensor_extra_gpu * extra = nullptr;
-
-    switch (tensor->op) {
-    case GGML_OP_ADD:
-    case GGML_OP_GET_ROWS:
-    case GGML_OP_MUL:
-    case GGML_OP_SCALE:
-    case GGML_OP_SQR:
-    case GGML_OP_CLAMP:
-    case GGML_OP_CPY:
-    case GGML_OP_CONT:
-    case GGML_OP_DUP:
-    case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
-    case GGML_OP_DIAG_MASK_INF:
-    case GGML_OP_SOFT_MAX:
-    case GGML_OP_ROPE:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
-        extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-        break;
-    case GGML_OP_UNARY:
-        switch (ggml_get_unary_op(tensor)) {
-        case GGML_UNARY_OP_SILU:
-        case GGML_UNARY_OP_GELU:
-        case GGML_UNARY_OP_RELU:
-            extra = (ggml_tensor_extra_gpu *) tensor->extra;
-            break;
-        default:
-            return false;
-        }
-        break;
-    case GGML_OP_MUL_MAT:
-        if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-            return false;
-        }
-
-        extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-        break;
-    default:
-        return false;
-    }
-
-    if (extra == nullptr) {
-        return false;
-    }
-
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return true;
-    }
-
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
-#endif
-
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(ctx, params, tensor);
-#endif
-
-    GGML_ASSERT(extra->ready);
-
-    vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
-
-    // Only run if ctx hasn't been submitted yet
-    if (!subctx.seqs.empty()) {
-        // Do staging buffer copies
-        for (auto& cpy : subctx.in_memcpys) {
-            memcpy(cpy.dst, cpy.src, cpy.n);
-        }
-
-        ggml_vk_submit(&subctx, ctx->fence);
-    }
-
-    if (tensor == subctx.exit_tensor) {
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
-
-        // Do staging buffer copies
-        for (auto& cpy : subctx.out_memcpys) {
-            memcpy(cpy.dst, cpy.src, cpy.n);
-        }
-        subctx.in_memcpys.clear();
-        subctx.out_memcpys.clear();
-    }
-
-    extra->ready = false;
-
-    return true;
-}
-
-// Clean up after graph processing is done
-static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
-#endif
-    for (auto& buffer : ctx->gc.temp_buffers) {
-        ggml_vk_pool_free(ctx, buffer);
-    }
-    ctx->gc.temp_buffers.clear();
-
-    for (auto * pipeline : ctx->gc.pipelines) {
-        ggml_pipeline_cleanup(*pipeline);
-    }
-
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue);
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue);
-
-    for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
-        ctx->device.lock()->device.destroySemaphore({ ctx->gc.semaphores[i].s });
-    }
-    ctx->gc.semaphores.clear();
-
-    for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
-        ctx->device.lock()->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
-    }
-    ctx->gc.tl_semaphores.clear();
-    ctx->semaphore_idx = 0;
-
-    ctx->event_idx = 0;
-
-    for (auto& event : ctx->gc.events) {
-        ctx->device.lock()->device.resetEvent(event);
-    }
-
-    ctx->staging_offset = 0;
-
-    ctx->compute_ctx = nullptr;
-    ctx->transfer_ctx = nullptr;
-    ctx->gc.contexts.clear();
-}
-
-// Clean up on backend free
-static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl;
-#endif
-    ggml_vk_graph_cleanup(ctx);
-
-    ggml_vk_destroy_buffer(ctx->prealloc_qx);
-    ggml_vk_destroy_buffer(ctx->prealloc_qy);
-    ggml_vk_destroy_buffer(ctx->prealloc_x);
-    ggml_vk_destroy_buffer(ctx->prealloc_y);
-    ggml_vk_destroy_buffer(ctx->prealloc_split_k);
-    ggml_vk_destroy_buffer(ctx->staging);
-    ggml_vk_destroy_buffer(ctx->sync_staging);
-
-    for (auto& buffer : ctx->buffer_pool) {
-        ggml_vk_destroy_buffer(buffer);
-    }
-
-    ctx->prealloc_size_qx = 0;
-    ctx->prealloc_size_qy = 0;
-    ctx->prealloc_size_x = 0;
-    ctx->prealloc_size_y = 0;
-    ctx->prealloc_size_split_k = 0;
-    ctx->staging_size = 0;
-
-    for (auto& event : ctx->gc.events) {
-        ctx->device.lock()->device.destroyEvent(event);
-    }
-    ctx->gc.events.clear();
-
-    for (auto* pipeline : ctx->gc.pipelines) {
-        ggml_vk_destroy_pipeline(ctx, pipeline);
-    }
-    ctx->gc.pipelines.clear();
-
-    ctx->device.lock()->device.destroyFence(ctx->fence);
-
-    ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->compute_queue.pool);
-    if (!ctx->device.lock()->single_queue) {
-        ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->transfer_queue.pool);
-    }
-}
-
-GGML_CALL static int ggml_vk_get_device_count() {
-    ggml_vk_instance_init();
-
-    return vk_instance.device_indices.size();
-}
-
-GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
-    ggml_vk_instance_init();
-
-    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
-
-    vk::PhysicalDeviceProperties props;
-    devices[device].getProperties(&props);
-
-    snprintf(description, description_size, "%s", props.deviceName.data());
-}
-
-// CPU assist interface
-
-void ggml_vk_init_cpu_assist() {
-    ggml_vk_instance_init();
-
-    std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
-
-    for (int i = 0; i < ggml_vk_get_device_count(); i++) {
-        ggml_vk_print_gpu_info(i);
-    }
-    // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
-    ggml_backend_vk_init(0);
-}
-
-void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers_graph(ctx, node);
-}
-
-void ggml_vk_preallocate_buffers_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_preallocate_buffers(ctx);
-}
-
-void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_build_graph(ctx, node, last_node);
-}
-
-bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return false;
-    }
-
-    return ggml_vk_compute_forward(ctx, params, tensor);
-}
-
-void ggml_vk_graph_cleanup_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized) {
-        return;
-    }
-
-    ggml_vk_graph_cleanup(ctx);
-}
-
-void ggml_vk_free_cpu_assist() {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
-        return;
-    }
-
-    ggml_backend_vk_free(vk_instance.backends[0]);
-}
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-// device backend
-
-static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
-
-struct ggml_backend_vk_buffer_context {
-    ggml_backend_vk_context * ctx;
-    vk_buffer dev_buffer;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-    std::string name;
-
-    ggml_backend_vk_buffer_context(ggml_backend_vk_context * ctx, vk_buffer&& dev_buffer, std::string& name) :
-        ctx(ctx),
-        dev_buffer(dev_buffer),
-        name(name) {
-    }
-
-    ~ggml_backend_vk_buffer_context() {
-        ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        extra->reset();
-
-        return extra;
-    }
-};
-
-GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
-GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
-}
-
-GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl;
-#endif
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-    ggml_vk_destroy_buffer(ctx->dev_buffer);
-    delete ctx;
-}
-
-GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return vk_ptr_base;
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl;
-#endif
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
-        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
-    } else {
-        extra->buffer_gpu = ctx->dev_buffer;
-        extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
-    }
-
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
-    tensor->extra = extra;
-}
-
-GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    vk_buffer buf = extra->buffer_gpu.lock();
-
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
-}
-
-GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
-#endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    vk_buffer buf = extra->buffer_gpu.lock();
-
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
-}
-
-GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
-        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-
-        vk_buffer src_buf = src_extra->buffer_gpu.lock();
-        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
-
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
-
-        return true;
-    }
-    return false;
-}
-
-GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
-    ggml_vk_buffer_memset(ctx->ctx, ctx->dev_buffer, 0, value, buffer->size);
-}
-
-static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
-    /* .get_name        = */ ggml_backend_vk_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_vk_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_vk_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_vk_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_vk_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_vk_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_vk_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_vk_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// vk buffer type
-struct ggml_backend_vk_buffer_type_context {
-    std::string name;
-    ggml_backend_vk_context * ctx;
-};
-
-GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-
-    return ctx->name.c_str();
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
-    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
-
-    ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
-
-    return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
-}
-
-GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
-}
-
-GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->ctx->device.lock()->max_memory_allocation_size;
-}
-
-GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    return ggml_nbytes(tensor);
-
-    UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
-static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_vk_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_vk_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_vk_buffer_type_get_alignment,
-    /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
-    /* .get_alloc_size   = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
-    /* .is_host          = */ NULL,
-};
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl;
-#endif
-
-    GGML_ASSERT(idx < vk_instance.device_indices.size());
-
-    ggml_backend_vk_init(idx);
-
-    return &vk_instance.buffer_types[idx];
-}
-
-// host buffer type
-
-GGML_CALL static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_VK_NAME "_Host";
-
-    UNUSED(buft);
-}
-
-GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_VK_NAME "_Host";
-
-    UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl;
-#endif
-    ggml_vk_host_free(&vk_instance.contexts[0], buffer->context);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
-#endif
-    void * ptr = nullptr;
-    try {
-        ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
-    } catch (vk::SystemError& e) {
-        std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
-        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return vk_instance.contexts[0].device.lock()->properties.limits.minMemoryMapAlignment;
-
-    UNUSED(buft);
-}
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_vk_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_vk_host_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    if (!vk_instance.contexts[0].initialized) {
-        // Fall back to CPU
-        return ggml_backend_cpu_buffer_type();
-    }
-
-    return &ggml_backend_vk_buffer_type_host;
-}
-
-// backend
-
-GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) {
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return ctx->name.c_str();
-}
-
-GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl;
-#endif
-
-    size_t idx = ctx->idx;
-
-    ggml_vk_cleanup(ctx);
-
-    // Release device
-    vk_instance.devices[ctx->idx].reset();
-    ctx->initialized = false;
-
-    vk_instance.initialized[idx] = false;
-    vk_instance.backends[idx] = nullptr;
-    memset(&vk_instance.buffer_types[idx], 0, sizeof(ggml_backend_buffer_type));
-    delete backend;
-}
-
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    GGML_ASSERT(ctx->initialized);
-
-    return ggml_backend_vk_buffer_type(ctx->idx);
-}
-
-GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
-#endif
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    if (ctx->transfer_ctx == nullptr) {
-        // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-        ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
-    }
-
-    vk_buffer buf = extra->buffer_gpu.lock();
-
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
-}
-
-GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
-#endif
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    if (ctx->transfer_ctx == nullptr) {
-        // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-        ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
-    }
-
-    vk_buffer buf = extra->buffer_gpu.lock();
-
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
-}
-
-GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
-#endif
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
-        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-
-        if (ctx->transfer_ctx == nullptr) {
-            // Initialize new transfer context
-            ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
-            ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
-        }
-
-        vk_buffer src_buf = src_extra->buffer_gpu.lock();
-        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
-
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src));
-        return true;
-    }
-
-    return false;
-}
-
-GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
-#endif
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    if(ctx->transfer_ctx == nullptr) {
-        return;
-    }
-
-    ggml_vk_ctx_end(ctx->transfer_ctx);
-
-    for (auto& cpy : ctx->transfer_ctx->in_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
-
-    ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
-
-    for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
-
-    ctx->transfer_ctx = nullptr;
-}
-
-GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers(ctx);
-
-    int last_node = cgraph->n_nodes - 1;
-
-    // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
-        last_node -= 1;
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node);
-    }
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_TYPE_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-            continue;
-        }
-
-        bool ok = ggml_vk_compute_forward(ctx, &params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(ctx, &params, node);
-        }
-#endif
-        GGML_ASSERT(ok);
-    }
-
-    ggml_vk_graph_cleanup(ctx);
-
-    return true;
-
-    UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
-                    return true;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_MUL_MAT:
-            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
-                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                return true;
-            } break;
-        // case GGML_OP_GET_ROWS:
-        //     {
-        //         switch (op->src[0]->type) {
-        //             case GGML_TYPE_F16:
-        //             case GGML_TYPE_F32:
-        //             case GGML_TYPE_Q4_0:
-        //             case GGML_TYPE_Q4_1:
-        //             case GGML_TYPE_Q5_0:
-        //             case GGML_TYPE_Q5_1:
-        //             case GGML_TYPE_Q8_0:
-        //                 return true;
-        //             default:
-        //                 return false;
-        //         }
-        //     } break;
-        case GGML_OP_CPY:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                return false;
-            } break;
-        case GGML_OP_DUP:
-        // case GGML_OP_REPEAT:
-        //     {
-        //         ggml_type src0_type = op->src[0]->type;
-        //         return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-        //     } break;
-        case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-                const bool is_glm  = mode & 4;
-
-                return !is_glm;
-            } break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
-        case GGML_OP_ADD:
-        case GGML_OP_MUL:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-            return true;
-        default:
-            return false;
-    }
-
-    UNUSED(backend);
-}
-
-// TODO: enable async and synchronize
-static ggml_backend_i ggml_backend_vk_interface = {
-    /* .get_name                = */ ggml_backend_vk_name,
-    /* .free                    = */ ggml_backend_vk_free,
-    /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
-    /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
-    /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
-    /* .synchronize             = */ NULL,  // ggml_backend_vk_synchronize,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_vk_graph_compute,
-    /* .supports_op             = */ ggml_backend_vk_supports_op,
-};
-
-static ggml_guid_t ggml_backend_vk_guid() {
-    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
-    return &guid;
-}
-
-GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
-    if (vk_instance.initialized[idx]) {
-        return vk_instance.backends[idx];
-    }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl;
-#endif
-
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[idx];
-    ggml_vk_init(ctx, idx);
-    ctx->name = GGML_VK_NAME + std::to_string(idx);
-    vk_instance.buffer_types[idx] = {
-        /* .iface    = */ ggml_backend_vk_buffer_type_interface,
-        /* .context  = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
-    };
-    vk_instance.initialized[idx] = true;
-
-    ggml_backend_t vk_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_vk_guid(),
-        /* .interface = */ ggml_backend_vk_interface,
-        /* .context   = */ &vk_instance.contexts[ctx->idx],
-    };
-
-    vk_instance.backends[idx] = vk_backend;
-
-    return vk_backend;
-}
-
-GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
-}
-
-GGML_CALL int ggml_backend_vk_get_device_count() {
-    return ggml_vk_get_device_count();
-}
-
-GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
-    ggml_vk_get_device_description(device, description, description_size);
-}
-
-GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
-
-    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
-
-    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
-
-    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
-        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
-            *total = heap.size;
-            *free = heap.size;
-            break;
-        }
-    }
-}
-
-// backend registry
-GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) {
-    ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data);
-    return vk_backend;
-
-    UNUSED(params);
-}
-
-extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
-
-GGML_CALL int ggml_backend_vk_reg_devices() {
-    for (auto idx : vk_instance.device_indices) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx);
-        ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx);
-    }
-    return vk_instance.device_indices.size();
-}
-
-// Extension availability
-static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
-#ifdef GGML_VULKAN_VALIDATE
-    bool portability_enumeration_ext = false;
-    // Check for portability enumeration extension for MoltenVK support
-    for (const auto& properties : instance_extensions) {
-        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
-            return true;
-        }
-    }
-    if (!portability_enumeration_ext) {
-        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
-    }
-#endif
-    return false;
-
-    UNUSED(instance_extensions);
-}
-static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
-#ifdef __APPLE__
-    bool portability_enumeration_ext = false;
-    // Check for portability enumeration extension for MoltenVK support
-    for (const auto& properties : instance_extensions) {
-        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
-            return true;
-        }
-    }
-    if (!portability_enumeration_ext) {
-        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
-    }
-#endif
-    return false;
-
-    UNUSED(instance_extensions);
-}
-
-// checks
-
-#ifdef GGML_VULKAN_CHECK_RESULTS
-static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
-    if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
-        return;
-    }
-    for (int j = 0; j < level; j++) {
-        std::cerr << " ";
-    }
-    std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
-
-    done.push_back(tensor);
-
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (tensor->src[i] != nullptr) {
-            ggml_vk_print_graph_origin(tensor->src[i], done, level + 1);
-        }
-    }
-}
-
-static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
-    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
-        return;
-    }
-    i0 = std::max(i0, 5);
-    i1 = std::max(i1, 5);
-    i2 = std::max(i2, 0);
-    i3 = std::max(i3, 0);
-    fprintf(stderr, "         ");
-    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
-        fprintf(stderr, "%7d ", idx1);
-    }
-    fprintf(stderr, "\n");
-    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
-        fprintf(stderr, "%7d: ", idx0);
-        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
-            if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
-                float val;
-                if (tensor->type == GGML_TYPE_F32) {
-                    val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
-                } else if (tensor->type == GGML_TYPE_F16) {
-                    val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
-                }
-                fprintf(stderr, "% 7.2f ", val);
-            } else {
-                fprintf(stderr, "        ");
-            }
-        }
-        fprintf(stderr, "\n");
-    }
-}
-
-static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
-    void * tensor_data = tensor->data;
-
-    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
-        const size_t tensor_size = ggml_nbytes(tensor);
-        tensor_data = malloc(tensor_size);
-
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
-    }
-
-    std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
-    std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
-    if (tensor->src[0] != nullptr) {
-        std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
-    }
-    if (tensor->src[1] != nullptr) {
-        std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
-    }
-    std::cerr << std::endl << "Result:" << std::endl;
-    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
-    std::cerr << std::endl;
-    std::cerr << std::endl << "Result:" << std::endl;
-    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
-    std::cerr << std::endl;
-    std::vector<const ggml_tensor *> done;
-    ggml_vk_print_graph_origin(tensor, done);
-
-    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
-        free(tensor_data);
-    }
-}
-
-static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
-    return;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
-    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
-        return;
-    }
-    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    float val = 0.0f;
-                    if (tensor->type == GGML_TYPE_F32) {
-                        val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
-                    } else if (tensor->type == GGML_TYPE_F16) {
-                        val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
-                    }
-                    if (std::isnan(val)) {
-                        std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
-                        std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
-                        std::cerr << std::endl;
-                        ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
-                        std::cerr << std::endl;
-                        std::vector<const ggml_tensor *> done;
-                        ggml_vk_print_graph_origin(tensor, done);
-                        GGML_ASSERT(false);
-                    }
-                }
-            }
-        }
-    }
-}
-
-void * comp_result;
-size_t comp_size;
-size_t comp_nb[GGML_MAX_DIMS];
-size_t check_counter = 0;
-static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
-    if (params->ith != 0) {
-        return;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
-        return;
-    }
-
-    check_counter++;
-    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
-        return;
-    }
-
-    ggml_tensor * src0 = tensor->src[0];
-    ggml_tensor * src1 = tensor->src[1];
-
-    struct ggml_init_params iparams = {
-        /*.mem_size   =*/ 1024*1024*1024,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ggml_ctx = ggml_init(iparams);
-
-    struct ggml_tensor * src0_clone = nullptr;
-    struct ggml_tensor * src1_clone = nullptr;
-    struct ggml_tensor * tensor_clone = nullptr;
-
-    size_t src0_size;
-    size_t src1_size;
-
-    void * src0_buffer;
-    void * src1_buffer;
-
-    if (src0 != nullptr) {
-        src0_clone = ggml_dup_tensor(ggml_ctx, src0);
-
-        src0_size = ggml_nbytes(src0);
-
-        src0_buffer = malloc(src0_size);
-        src0_clone->data = src0_buffer;
-        if (src0->backend == GGML_BACKEND_TYPE_CPU) {
-            memcpy(src0_clone->data, src0->data, src0_size);
-            memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
-            uint64_t offset = extra->offset;
-            if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
-                for (int i3 = 0; i3 < src0->ne[3]; i3++) {
-                    for (int i2 = 0; i2 < src0->ne[2]; i2++) {
-                        const int idx = i3*src0->ne[2] + i2;
-                        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
-                    }
-                }
-
-                src0_clone->nb[0] = src0->nb[0];
-                src0_clone->nb[1] = src0->nb[1];
-                for (int i = 2; i < GGML_MAX_DIMS; i++) {
-                    src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1];
-                }
-            } else {
-                vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                if (offset + src0_size >= buffer_gpu->size) {
-                    src0_size = buffer_gpu->size - offset;
-                }
-                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src0_clone->data, src0_size);
-                memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-            }
-        } else {
-            GGML_ASSERT(false);
-        }
-
-        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-            ggml_vk_print_tensor(ctx, src0, "src0");
-        }
-
-        ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
-    }
-    if (src1 != nullptr) {
-        src1_clone = ggml_dup_tensor(ggml_ctx, src1);
-
-        src1_size = ggml_nbytes(src1);
-
-        src1_buffer = malloc(src1_size);
-        src1_clone->data = src1_buffer;
-        if (src1->backend == GGML_BACKEND_TYPE_CPU) {
-            memcpy(src1_clone->data, src1->data, src1_size);
-            memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-        } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
-            uint64_t offset = extra->offset;
-            if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
-                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
-                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
-                        const int idx = i3*src1->ne[2] + i2;
-                        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                        ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
-                    }
-                }
-
-                src1_clone->nb[0] = src1->nb[0];
-                src1_clone->nb[1] = src1->nb[1];
-                for (int i = 2; i < GGML_MAX_DIMS; i++) {
-                    src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1];
-                }
-            } else {
-                vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                if (offset + src1_size >= buffer_gpu->size) {
-                    src1_size = buffer_gpu->size - offset;
-                }
-                ggml_vk_buffer_read(ctx, buffer_gpu, offset, src1_clone->data, src1_size);
-                memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-            }
-        } else {
-            GGML_ASSERT(false);
-        }
-
-        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-            ggml_vk_print_tensor(ctx, src1, "src1");
-            std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
-            std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
-            if (src1->src[0] != nullptr) {
-                std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
-            }
-            if (src1->src[1] != nullptr) {
-                std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
-            }
-            std::cerr << std::endl << "Result:" << std::endl;
-            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
-            std::cerr << std::endl;
-            std::cerr << std::endl << "Result:" << std::endl;
-            ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 1, 0);
-            std::cerr << std::endl;
-            std::vector<const ggml_tensor *> done;
-            ggml_vk_print_graph_origin(src1_clone, done);
-        }
-
-        ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
-    }
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
-    } else if (tensor->op == GGML_OP_MUL) {
-        tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
-    } else if (tensor->op == GGML_OP_SCALE) {
-        tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
-    } else if (tensor->op == GGML_OP_SQR) {
-        tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
-    } else if (tensor->op == GGML_OP_CLAMP) {
-        tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
-    } else if (tensor->op == GGML_OP_ADD) {
-        tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
-    } else if (tensor->op == GGML_OP_NORM) {
-        tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
-    } else if (tensor->op == GGML_OP_RMS_NORM) {
-        tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
-    } else if (tensor->op == GGML_OP_SOFT_MAX) {
-            tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
-    } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
-        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(float *)tensor->op_params);
-    } else if (tensor->op == GGML_OP_ROPE) {
-        const int n_dims      = ((int32_t *) tensor->op_params)[1];
-        const int mode        = ((int32_t *) tensor->op_params)[2];
-        const int n_ggml_ctx       = ((int32_t *) tensor->op_params)[3];
-        const int n_orig_ggml_ctx  = ((int32_t *) tensor->op_params)[4];
-        float freq_base       = ((float *)   tensor->op_params)[5];
-        float freq_scale      = ((float *)   tensor->op_params)[6];
-        float ext_factor      = ((float *)   tensor->op_params)[7];
-        float attn_factor     = ((float *)   tensor->op_params)[8];
-        float beta_fast       = ((float *)   tensor->op_params)[9];
-        float beta_slow       = ((float *)   tensor->op_params)[10];
-        tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
-    } else if (tensor->op == GGML_OP_UNARY) {
-        switch (ggml_get_unary_op(tensor)) {
-        case GGML_UNARY_OP_SILU:
-            tensor_clone = ggml_silu(ggml_ctx, src0_clone);
-            break;
-        case GGML_UNARY_OP_GELU:
-            tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
-            break;
-        case GGML_UNARY_OP_RELU:
-            tensor_clone = ggml_relu(ggml_ctx, src0_clone);
-            break;
-        default:
-            std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
-            GGML_ASSERT(false);
-        }
-    } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
-        if (src1 == nullptr) {
-            tensor_clone = ggml_dup(ggml_ctx, src0_clone);
-            tensor_clone->type = tensor->type;
-        } else {
-            tensor_clone = ggml_cpy(ggml_ctx, src0_clone, src1_clone);
-        }
-    } else if (tensor->op == GGML_OP_CONT) {
-        tensor_clone = ggml_cont_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-    } else if (tensor->op == GGML_OP_RESHAPE) {
-        tensor_clone = ggml_reshape_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-    } else if (tensor->op == GGML_OP_VIEW) {
-        tensor_clone = ggml_view_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
-    } else if (tensor->op == GGML_OP_PERMUTE) {
-        int32_t * params = (int32_t *)tensor->op_params;
-        tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
-    } else if (tensor->op == GGML_OP_TRANSPOSE) {
-        tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
-    } else {
-        std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
-        GGML_ASSERT(false);
-    }
-
-    // Disable vulkan here to avoid the hooks in ggml.c
-    ctx->disable = true;
-
-    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
-    ggml_build_forward_expand(cgraph, tensor_clone);
-
-    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
-
-    ctx->disable = false;
-
-    ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
-    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-        ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
-    }
-
-    comp_size = ggml_nbytes(tensor_clone);
-
-    comp_result = malloc(comp_size);
-    memcpy(comp_result, tensor_clone->data, comp_size);
-    memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);
-
-    if (src0 != nullptr) {
-        free(src0_buffer);
-    }
-    if (src1 != nullptr) {
-        free(src1_buffer);
-    }
-
-    ggml_free(ggml_ctx);
-}
-
-static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) {
-    if (params->ith != 0) {
-        return;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
-        return;
-    }
-    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
-        return;
-    }
-
-    ggml_tensor * src0 = tensor->src[0];
-    ggml_tensor * src1 = tensor->src[1];
-
-    void * tensor_data = tensor->data;
-
-    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
-        size_t tensor_size = ggml_nbytes(tensor);
-        tensor_data = malloc(tensor_size);
-
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
-        }
-
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
-    }
-
-    float first_error_result = -1.0f;
-    float first_error_correct = -1.0f;
-    std::array<int, 4> first_error = { -1, -1, -1, -1 };
-    double avg_err = 0.0;
-    size_t counter = 0;
-
-    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
-        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                    const bool buffer_size_fit = i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0] < comp_size;
-                    float correct = 0.0f;
-                    float result = 0.0f;
-
-                    if (buffer_size_fit) {
-                        if (tensor->type == GGML_TYPE_F32) {
-                            correct = *(float *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
-                            result  = *(float *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
-                        } else if (tensor->type == GGML_TYPE_F16) {
-                            correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
-                            result  = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
-                        } else {
-                            std::cerr << "comp_size=" << comp_size << " but required is " << (i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]) << std::endl;
-                        }
-                    } else {
-                        std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
-                        GGML_ASSERT(false);
-                    }
-
-                    if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
-                        std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
-                        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
-                        if (src0 != nullptr) {
-                            std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
-                        }
-                        if (src1 != nullptr) {
-                            std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
-                        }
-                        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
-                        std::cerr << std::endl << "Result:" << std::endl;
-                        ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
-                        std::cerr << std::endl << "Correct:" << std::endl;
-                        ggml_vk_print_tensor_area(tensor, comp_result, i0, i1, i2, i3);
-                        std::cerr << std::endl;
-                        std::vector<const ggml_tensor *> done;
-                        ggml_vk_print_graph_origin(tensor, done);
-                        GGML_ASSERT(false);
-                    }
-                    if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
-                        first_error[0] = i0;
-                        first_error[1] = i1;
-                        first_error[2] = i2;
-                        first_error[3] = i3;
-                        first_error_result = result;
-                        first_error_correct = correct;
-                    }
-
-                    // Special case, value is infinite, avoid NaN result in avg_err
-                    // NaN also appears in results, if both are nan error is 0
-                    if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) {
-                        avg_err += std::fabs(correct - result);
-                    }
-                    counter++;
-                }
-            }
-        }
-    }
-
-    avg_err /= counter;
-
-    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
-        std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
-        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
-        if (src0 != nullptr) {
-            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
-        }
-        if (src1 != nullptr) {
-            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
-        }
-        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
-        std::cerr << std::endl << "Result:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
-        std::cerr << std::endl << "Correct:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
-        std::cerr << std::endl;
-        std::cerr << std::endl << "Result:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
-        std::cerr << std::endl << "Correct:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 1, 0);
-        std::cerr << std::endl;
-        std::vector<const ggml_tensor *> done;
-        ggml_vk_print_graph_origin(tensor, done);
-    }
-
-    if (avg_err > 0.05 || std::isnan(avg_err)) {
-        std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
-        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
-        if (src0 != nullptr) {
-            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
-        }
-        if (src1 != nullptr) {
-            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
-        }
-        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
-        std::cerr << std::endl << "Result:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
-        std::cerr << std::endl << "Correct:" << std::endl;
-        ggml_vk_print_tensor_area(tensor, comp_result, first_error[0], first_error[1], first_error[2], first_error[3]);
-        std::cerr << std::endl;
-        std::vector<const ggml_tensor *> done;
-        ggml_vk_print_graph_origin(tensor, done);
-        GGML_ASSERT(false);
-    } else {
-        std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
-    }
-
-    free(comp_result);
-    comp_result = nullptr;
-    comp_size = 0;
-
-    if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
-        free(tensor_data);
-    }
-}
-
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
-
-    ggml_vk_check_results_0(ctx, params, tensor);
-}
-#endif
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
deleted file mode 100644
index 9645126b4..000000000
--- a/ggml-vulkan.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_VK_NAME "Vulkan"
-#define GGML_VK_MAX_DEVICES 16
-
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
-
-GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml.c b/ggml.c
deleted file mode 100644
index f29b9f13f..000000000
--- a/ggml.c
+++ /dev/null
@@ -1,21255 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
-#define _USE_MATH_DEFINES // For M_PI on MSVC
-
-#include "ggml-impl.h"
-#include "ggml-quants.h"
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-#include <alloca.h>
-#endif
-
-#include <assert.h>
-#include <errno.h>
-#include <time.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <float.h>
-#include <limits.h>
-#include <stdarg.h>
-#include <signal.h>
-#if defined(__gnu_linux__)
-#include <syscall.h>
-#endif
-
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-
-// disable POSIX deprecation warnings
-// these functions are never going away, anyway
-#pragma warning(disable: 4996)
-#endif
-
-#if defined(_WIN32)
-
-#include <windows.h>
-
-typedef volatile LONG atomic_int;
-typedef atomic_int atomic_bool;
-
-static void atomic_store(atomic_int * ptr, LONG val) {
-    InterlockedExchange(ptr, val);
-}
-static LONG atomic_load(atomic_int * ptr) {
-    return InterlockedCompareExchange(ptr, 0, 0);
-}
-static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
-    return InterlockedExchangeAdd(ptr, inc);
-}
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
-    return atomic_fetch_add(ptr, -(dec));
-}
-
-typedef HANDLE pthread_t;
-
-typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
-    (void) unused;
-    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
-    if (handle == NULL)
-    {
-        return EAGAIN;
-    }
-
-    *out = handle;
-    return 0;
-}
-
-static int pthread_join(pthread_t thread, void * unused) {
-    (void) unused;
-    int ret = (int) WaitForSingleObject(thread, INFINITE);
-    CloseHandle(thread);
-    return ret;
-}
-
-static int sched_yield (void) {
-    Sleep (0);
-    return 0;
-}
-#else
-#include <pthread.h>
-#include <stdatomic.h>
-
-typedef void * thread_ret_t;
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#endif
-
-#ifdef GGML_USE_CPU_HBM
-#include <hbwmalloc.h>
-#endif
-
-#if defined(__APPLE__)
-#include <TargetConditionals.h>
-#endif
-
-#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
-    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
-
-#include <sys/wait.h>
-
-void ggml_print_backtrace(void) {
-    /*
-    #include <execinfo.h>
-    #include <dlfcn.h>
-
-    void * trace[100];
-
-    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
-
-    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
-    */
-
-    // backtrack_symbols does not show line numbers, use gdb instead
-    char attach[32];
-    snprintf(attach, sizeof(attach), "attach %d", getpid());
-    int pid = fork();
-    if (pid == 0) {
-        execlp("gdb", "gdb", "--batch",
-            "-ex", "set style enabled on",
-            "-ex", attach,
-            "-ex", "bt -frame-info source-and-location",
-            "-ex", "detach",
-            "-ex", "quit",
-            (char *) NULL);
-    } else {
-        waitpid(pid, NULL, 0);
-    }
-}
-#else
-void ggml_print_backtrace(void) {
-    // platform not supported
-}
-#endif
-
-/*#define GGML_PERF*/
-#define GGML_DEBUG 0
-#define GGML_GELU_FP16
-#define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
-// #define GGML_CROSS_ENTROPY_EXP_FP16
-// #define GGML_FLASH_ATTN_EXP_FP16
-
-#define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL  2
-#define GGML_VEC_MAD_UNROLL  32
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
-#ifdef GGML_USE_ACCELERATE
-// uncomment to use vDSP for soft max computation
-// note: not sure if it is actually faster
-//#define GGML_SOFT_MAX_ACCELERATE
-#endif
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
-#else
-inline static void * ggml_aligned_malloc(size_t size) {
-    if (size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
-        return NULL;
-    }
-    void * aligned_memory = NULL;
-#ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
-#elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
-#else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
-#endif
-    if (result != 0) {
-        // Handle allocation failure
-        const char *error_desc = "unknown allocation error";
-        switch (result) {
-            case EINVAL:
-                error_desc = "invalid alignment value";
-                break;
-            case ENOMEM:
-                error_desc = "insufficient memory";
-                break;
-        }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
-        GGML_ASSERT(false);
-        return NULL;
-    }
-    return aligned_memory;
-}
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
-#else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
-#endif
-
-inline static void * ggml_malloc(size_t size) {
-    if (size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
-        return NULL;
-    }
-    void * result = malloc(size);
-    if (result == NULL) {
-        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
-        GGML_ASSERT(false);
-    }
-    return result;
-}
-
-// calloc
-inline static void * ggml_calloc(size_t num, size_t size) {
-    if (num == 0 || size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
-        return NULL;
-    }
-    void * result = calloc(num, size);
-    if (result == NULL) {
-        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
-        GGML_ASSERT(false);
-    }
-    return result;
-}
-
-#define GGML_MALLOC(size)      ggml_malloc(size)
-#define GGML_CALLOC(num, size) ggml_calloc(num, size)
-
-#define GGML_FREE(ptr) free(ptr)
-
-#define UNUSED GGML_UNUSED
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
-
-#if defined(GGML_USE_ACCELERATE)
-#include <Accelerate/Accelerate.h>
-#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
-#include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
-#endif
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
-#endif
-
-// floating point type used to accumulate sums
-typedef double ggml_float;
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-//
-// global data
-//
-
-// precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
-
-// precomputed quick gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
-
-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
-
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return GGML_FP16_TO_FP32(x);
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return GGML_FP32_TO_FP16(x);
-}
-
-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
-        y[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
-#if defined(__F16C__)
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for(; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-#endif
-    for (; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
-    return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
-}
-
-//
-// timing
-//
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-static int64_t timer_freq, timer_start;
-void ggml_time_init(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceFrequency(&t);
-    timer_freq = t.QuadPart;
-
-    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
-    // and the uptime is high enough.
-    // We subtract the program start time to reduce the likelihood of that happening.
-    QueryPerformanceCounter(&t);
-    timer_start = t.QuadPart;
-}
-int64_t ggml_time_ms(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceCounter(&t);
-    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
-}
-int64_t ggml_time_us(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceCounter(&t);
-    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
-}
-#else
-void ggml_time_init(void) {}
-int64_t ggml_time_ms(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
-}
-
-int64_t ggml_time_us(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
-}
-#endif
-
-int64_t ggml_cycles(void) {
-    return clock();
-}
-
-int64_t ggml_cycles_per_ms(void) {
-    return CLOCKS_PER_SEC/1000;
-}
-
-#ifdef GGML_PERF
-#define ggml_perf_time_ms()       ggml_time_ms()
-#define ggml_perf_time_us()       ggml_time_us()
-#define ggml_perf_cycles()        ggml_cycles()
-#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
-#else
-#define ggml_perf_time_ms()       0
-#define ggml_perf_time_us()       0
-#define ggml_perf_cycles()        0
-#define ggml_perf_cycles_per_ms() 0
-#endif
-
-//
-// cache line
-//
-
-#if defined(__cpp_lib_hardware_interference_size)
-#define CACHE_LINE_SIZE hardware_destructive_interference_size
-#else
-#if defined(__POWER9_VECTOR__)
-#define CACHE_LINE_SIZE 128
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-#endif
-
-static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
-
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
-        .type_name                = "i8",
-        .blck_size                = 1,
-        .type_size                = sizeof(int8_t),
-        .is_quantized             = false,
-    },
-    [GGML_TYPE_I16] = {
-        .type_name                = "i16",
-        .blck_size                = 1,
-        .type_size                = sizeof(int16_t),
-        .is_quantized             = false,
-    },
-    [GGML_TYPE_I32] = {
-        .type_name                = "i32",
-        .blck_size                = 1,
-        .type_size                = sizeof(int32_t),
-        .is_quantized             = false,
-    },
-    [GGML_TYPE_F32] = {
-        .type_name                = "f32",
-        .blck_size                = 1,
-        .type_size                = sizeof(float),
-        .is_quantized             = false,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
-        .vec_dot_type             = GGML_TYPE_F32,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_F16] = {
-        .type_name                = "f16",
-        .blck_size                = 1,
-        .type_size                = sizeof(ggml_fp16_t),
-        .is_quantized             = false,
-        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
-        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
-        .vec_dot_type             = GGML_TYPE_F16,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q4_0] = {
-        .type_name                = "q4_0",
-        .blck_size                = QK4_0,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
-        .from_float               = quantize_row_q4_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
-        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#if defined (__ARM_FEATURE_MATMUL_INT8)
-        .nrows                    = 2,
-#else
-        .nrows                    = 1,
-#endif
-    },
-    [GGML_TYPE_Q4_1] = {
-        .type_name                = "q4_1",
-        .blck_size                = QK4_1,
-        .type_size                = sizeof(block_q4_1),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
-        .from_float               = quantize_row_q4_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
-        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-#if defined (__ARM_FEATURE_MATMUL_INT8)
-        .nrows                    = 2,
-#else
-        .nrows                    = 1,
-#endif
-    },
-    [4] = { // GGML_TYPE_Q4_2
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-        .nrows                    = 1,
-    },
-    [5] = { // GGML_TYPE_Q4_3
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .type_name                = "q5_0",
-        .blck_size                = QK5_0,
-        .type_size                = sizeof(block_q5_0),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
-        .from_float               = quantize_row_q5_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
-        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .type_name                = "q5_1",
-        .blck_size                = QK5_1,
-        .type_size                = sizeof(block_q5_1),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
-        .from_float               = quantize_row_q5_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
-        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .type_name                = "q8_0",
-        .blck_size                = QK8_0,
-        .type_size                = sizeof(block_q8_0),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
-        .from_float               = quantize_row_q8_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
-        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#if defined (__ARM_FEATURE_MATMUL_INT8)
-        .nrows                    = 2,
-#else
-        .nrows                    = 1,
-#endif
-    },
-    [GGML_TYPE_Q8_1] = {
-        .type_name                = "q8_1",
-        .blck_size                = QK8_1,
-        .type_size                = sizeof(block_q8_1),
-        .is_quantized             = true,
-        .from_float               = quantize_row_q8_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q2_K] = {
-        .type_name                = "q2_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q2_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
-        .from_float               = quantize_row_q2_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
-        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q3_K] = {
-        .type_name                = "q3_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q3_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
-        .from_float               = quantize_row_q3_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
-        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q4_K] = {
-        .type_name                = "q4_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q4_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
-        .from_float               = quantize_row_q4_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
-        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q5_K] = {
-        .type_name                = "q5_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q5_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
-        .from_float               = quantize_row_q5_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
-        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q6_K] = {
-        .type_name                = "q6_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q6_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
-        .from_float               = quantize_row_q6_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
-        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ2_XXS] = {
-        .type_name                = "iq2_xxs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_xxs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ2_XS] = {
-        .type_name                = "iq2_xs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_xs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ3_XXS] = {
-        .type_name                = "iq3_xxs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq3_xxs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
-        .from_float               = quantize_row_iq3_xxs,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
-        .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ3_S] = {
-        .type_name                = "iq3_s",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq3_s),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_s,
-        .from_float               = quantize_row_iq3_s,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq3_s_reference,
-        .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ2_S] = {
-        .type_name                = "iq2_s",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_s),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_s,
-        .from_float               = quantize_row_iq2_s,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq2_s_reference,
-        .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ1_S] = {
-        .type_name                = "iq1_s",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq1_s),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ4_NL] = {
-        .type_name                = "iq4_nl",
-        .blck_size                = QK4_NL,
-        .type_size                = sizeof(block_iq4_nl),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
-        .from_float               = quantize_row_iq4_nl,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_nl_reference,
-        .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_IQ4_XS] = {
-        .type_name                = "iq4_xs",
-#if QK_K == 64
-        .blck_size                = QK4_NL,
-#else
-        .blck_size                = QK_K,
-#endif
-        .type_size                = sizeof(block_iq4_xs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
-        .from_float               = quantize_row_iq4_xs,
-        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
-        .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
-#if QK_K == 64
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-#else
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-#endif
-        .nrows                    = 1,
-    },
-    [GGML_TYPE_Q8_K] = {
-        .type_name                = "q8_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q8_K),
-        .is_quantized             = true,
-        .from_float               = quantize_row_q8_K,
-    }
-};
-
-// For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
-    GGML_ASSERT(type < GGML_TYPE_COUNT);
-    return type_traits[type];
-}
-
-//
-// simd mappings
-//
-
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
-// we define a common set of C macros which map to specific intrinsics based on the current architecture
-// we then implement the fundamental computation operations below using only these macros
-// adding support for new architectures requires to define the corresponding SIMD macros
-//
-// GGML_F32_STEP / GGML_F16_STEP
-//   number of elements to process in a single step
-//
-// GGML_F32_EPR / GGML_F16_EPR
-//   number of elements to fit in a single register
-//
-
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
-
-#define GGML_SIMD
-
-// F32 NEON
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              float32x4_t
-#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
-#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
-#define GGML_F32x4_LOAD         vld1q_f32
-#define GGML_F32x4_STORE        vst1q_f32
-#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
-#define GGML_F32x4_ADD          vaddq_f32
-#define GGML_F32x4_MUL          vmulq_f32
-#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);         \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 NEON
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    #define GGML_F16_STEP 32
-    #define GGML_F16_EPR  8
-
-    #define GGML_F16x8              float16x8_t
-    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
-    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
-    #define GGML_F16x8_STORE        vst1q_f16
-    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
-    #define GGML_F16x8_ADD          vaddq_f16
-    #define GGML_F16x8_MUL          vmulq_f16
-    #define GGML_F16x8_REDUCE(res, x)                             \
-    do {                                                          \
-        int offset = GGML_F16_ARR >> 1;                           \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        offset >>= 1;                                             \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        offset >>= 1;                                             \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-        res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
-    } while (0)
-
-    #define GGML_F16_VEC                GGML_F16x8
-    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
-    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
-    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
-    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
-#else
-    // if FP16 vector arithmetic is not supported, we use FP32 instead
-    // and take advantage of the vcvt_ functions to convert to/from FP16
-
-    #define GGML_F16_STEP 16
-    #define GGML_F16_EPR  4
-
-    #define GGML_F32Cx4              float32x4_t
-    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
-    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
-    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
-    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
-    #define GGML_F32Cx4_ADD          vaddq_f32
-    #define GGML_F32Cx4_MUL          vmulq_f32
-    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
-
-    #define GGML_F16_VEC                GGML_F32Cx4
-    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
-    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
-    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
-    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
-#endif
-
-#elif defined(__AVX__)
-
-#define GGML_SIMD
-
-// F32 AVX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    _mm256_setzero_ps()
-#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
-#define GGML_F32x8_LOAD    _mm256_loadu_ps
-#define GGML_F32x8_STORE   _mm256_storeu_ps
-#if defined(__FMA__)
-    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
-#endif
-#define GGML_F32x8_ADD     _mm256_add_ps
-#define GGML_F32x8_MUL     _mm256_mul_ps
-#define GGML_F32x8_REDUCE(res, x)                                 \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
-                                 _mm256_extractf128_ps(x[0], 1)); \
-    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
-    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
-} while (0)
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 AVX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by AVX, so we use F32 instead
-
-#define GGML_F32Cx8             __m256
-#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
-#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
-
-#if defined(__F16C__)
-// the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
-#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
-#else
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return _mm256_loadu_ps(tmp);
-}
-static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
-    float arr[8];
-
-    _mm256_storeu_ps(arr, y);
-
-    for (int i = 0; i < 8; i++)
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-}
-#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
-#endif
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         _mm256_add_ps
-#define GGML_F32Cx8_MUL         _mm256_mul_ps
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_SIMD
-
-// F32 POWER9
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
-#define GGML_F32x4_SET1         vec_splats
-#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
-#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
-#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    res = vec_extract(x[0], 0) +               \
-          vec_extract(x[0], 1) +               \
-          vec_extract(x[0], 2) +               \
-          vec_extract(x[0], 3);                \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 POWER9
-#define GGML_F16_STEP       GGML_F32_STEP
-#define GGML_F16_EPR        GGML_F32_EPR
-#define GGML_F16_VEC        GGML_F32x4
-#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
-// Use vec_xl, not vec_ld, in case the load address is not aligned.
-#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
-  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
-  vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
-#define GGML_F16_VEC_STORE(p, r, i)                             \
-  if (i & 0x1)                                                  \
-    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
-                                   r[i - GGML_ENDIAN_BYTE(0)]), \
-            0, p - GGML_F16_EPR)
-
-#elif defined(__wasm_simd128__)
-
-#define GGML_SIMD
-
-// F32 WASM
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              v128_t
-#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
-#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
-#define GGML_F32x4_LOAD         wasm_v128_load
-#define GGML_F32x4_STORE        wasm_v128_store
-#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
-#define GGML_F32x4_ADD          wasm_f32x4_add
-#define GGML_F32x4_MUL          wasm_f32x4_mul
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F32_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 WASM
-
-#define GGML_F16_STEP 16
-#define GGML_F16_EPR  4
-
-inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(p[0]);
-    tmp[1] = GGML_FP16_TO_FP32(p[1]);
-    tmp[2] = GGML_FP16_TO_FP32(p[2]);
-    tmp[3] = GGML_FP16_TO_FP32(p[3]);
-
-    return wasm_v128_load(tmp);
-}
-
-inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
-    float tmp[4];
-
-    wasm_v128_store(tmp, x);
-
-    p[0] = GGML_FP32_TO_FP16(tmp[0]);
-    p[1] = GGML_FP32_TO_FP16(tmp[1]);
-    p[2] = GGML_FP32_TO_FP16(tmp[2]);
-    p[3] = GGML_FP32_TO_FP16(tmp[3]);
-}
-
-#define GGML_F16x4             v128_t
-#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
-#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
-#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
-#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
-#define GGML_F16x4_FMA         GGML_F32x4_FMA
-#define GGML_F16x4_ADD         wasm_f32x4_add
-#define GGML_F16x4_MUL         wasm_f32x4_mul
-#define GGML_F16x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F16_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F16_VEC                GGML_F16x4
-#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
-#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
-#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
-#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
-
-#elif defined(__SSE3__)
-
-#define GGML_SIMD
-
-// F32 SSE
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    _mm_setzero_ps()
-#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
-#define GGML_F32x4_LOAD    _mm_loadu_ps
-#define GGML_F32x4_STORE   _mm_storeu_ps
-#if defined(__FMA__)
-    // TODO: Does this work?
-    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
-#endif
-#define GGML_F32x4_ADD     _mm_add_ps
-#define GGML_F32x4_MUL     _mm_mul_ps
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
-    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
-}
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 SSE
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_FP16_TO_FP32(x[3]);
-
-    return _mm_loadu_ps(tmp);
-}
-
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
-    float arr[4];
-
-    _mm_storeu_ps(arr, y);
-
-    x[0] = GGML_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_FP32_TO_FP16(arr[3]);
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
-#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
-#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         _mm_add_ps
-#define GGML_F32Cx4_MUL         _mm_mul_ps
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#endif
-
-// GGML_F32_ARR / GGML_F16_ARR
-//   number of registers to use per step
-#ifdef GGML_SIMD
-#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
-#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
-#endif
-
-//
-// fundamental operations
-//
-
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
-
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
-   assert(nrc == 1);
-   UNUSED(nrc);
-   UNUSED(bx);
-   UNUSED(by);
-   UNUSED(bs);
-
-#ifdef GGML_SIMD
-    float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#else
-    // scalar
-    ggml_float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(x[i]*y[i]);
-    }
-#endif
-
-    *s = sumf;
-}
-
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    ggml_float sumf = 0.0;
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#endif
-
-    *s = sumf;
-}
-
-// compute GGML_VEC_DOT_UNROLL dot products at once
-// xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
-    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
-
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
-
-                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
-            }
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#endif
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        s[i] = sumf[i];
-    }
-}
-
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#endif
-}
-
-// xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
-
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
-        x[i] = (const float *) ((const char *) xv + i*xs);
-        v[i] = (const float *) ((const char *) vv + i*vs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
-
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
-    }
-
-    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
-            }
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = np; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#else
-    // scalar
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = 0; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#endif
-}
-
-//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
-inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
-#if defined(GGML_USE_ACCELERATE)
-    vDSP_vsmul(y, 1, &v, y, 1, n);
-#elif defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] *= v;
-    }
-#endif
-}
-
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
-inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
-inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
-inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);   }
-inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
-inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
-inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
-inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
-inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
-inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
-// TODO: optimize performance
-inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
-inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
-
-static const float GELU_COEF_A     = 0.044715f;
-static const float GELU_QUICK_COEF = -1.702f;
-static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
-
-inline static float ggml_gelu_f32(float x) {
-    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
-    }
-}
-
-#ifdef GGML_GELU_FP16
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        if (x[i] <= -10.0f) {
-            y[i] = 0.0f;
-        } else if (x[i] >= 10.0f) {
-            y[i] = x[i];
-        } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-            memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
-        }
-    }
-}
-#else
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-}
-#endif
-
-inline static float ggml_gelu_quick_f32(float x) {
-    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
-}
-
-//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_GELU_QUICK_FP16
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_quick_f32(x[i]);
-    }
-}
-#endif
-
-// Sigmoid Linear Unit (SiLU) function
-inline static float ggml_silu_f32(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_silu_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
-    }
-}
-#endif
-
-inline static float ggml_silu_backward_f32(float x, float dy) {
-    const float s = 1.0f/(1.0f + expf(-x));
-    return dy*s*(1.0f + x*(1.0f - s));
-}
-
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        // we did not use x[i] to compute forward silu but its f16 equivalent
-        // take derivative at f16 of x[i]:
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        float usedx = GGML_FP16_TO_FP32(fp16);
-        dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
-    }
-}
-#else
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
-    }
-}
-#endif
-
-inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-#else
-    vDSP_sve(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
-    float sum = 0.0f;
-    for (int i = 0; i < n; ++i) {
-        sum += GGML_FP16_TO_FP32(x[i]);
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    float max = -INFINITY;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-    }
-    *s = max;
-#else
-    vDSP_maxv(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
-    ggml_vec_norm_f32(n, s, x);
-    *s = 1.f/(*s);
-}
-
-inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
-    float max = -INFINITY;
-    int idx = 0;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-        if (max == x[i]) { idx = i; }
-    }
-    *s = idx;
-}
-
-//
-// data types
-//
-
-static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
-    "NONE",
-
-    "DUP",
-    "ADD",
-    "ADD1",
-    "ACC",
-    "SUB",
-    "MUL",
-    "DIV",
-    "SQR",
-    "SQRT",
-    "LOG",
-    "SUM",
-    "SUM_ROWS",
-    "MEAN",
-    "ARGMAX",
-    "REPEAT",
-    "REPEAT_BACK",
-    "CONCAT",
-    "SILU_BACK",
-    "NORM",
-    "RMS_NORM",
-    "RMS_NORM_BACK",
-    "GROUP_NORM",
-
-    "MUL_MAT",
-    "MUL_MAT_ID",
-    "OUT_PROD",
-
-    "SCALE",
-    "SET",
-    "CPY",
-    "CONT",
-    "RESHAPE",
-    "VIEW",
-    "PERMUTE",
-    "TRANSPOSE",
-    "GET_ROWS",
-    "GET_ROWS_BACK",
-    "DIAG",
-    "DIAG_MASK_INF",
-    "DIAG_MASK_ZERO",
-    "SOFT_MAX",
-    "SOFT_MAX_BACK",
-    "ROPE",
-    "ROPE_BACK",
-    "ALIBI",
-    "CLAMP",
-    "CONV_TRANSPOSE_1D",
-    "IM2COL",
-    "CONV_TRANSPOSE_2D",
-    "POOL_1D",
-    "POOL_2D",
-    "UPSCALE",
-    "PAD",
-    "ARGSORT",
-    "LEAKY_RELU",
-
-    "FLASH_ATTN",
-    "FLASH_FF",
-    "FLASH_ATTN_BACK",
-    "WIN_PART",
-    "WIN_UNPART",
-    "GET_REL_POS",
-    "ADD_REL_POS",
-
-    "UNARY",
-
-    "MAP_UNARY",
-    "MAP_BINARY",
-
-    "MAP_CUSTOM1_F32",
-    "MAP_CUSTOM2_F32",
-    "MAP_CUSTOM3_F32",
-
-    "MAP_CUSTOM1",
-    "MAP_CUSTOM2",
-    "MAP_CUSTOM3",
-
-    "CROSS_ENTROPY_LOSS",
-    "CROSS_ENTROPY_LOSS_BACK",
-};
-
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
-
-static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-    "none",
-
-    "x",
-    "x+y",
-    "x+y",
-    "view(x,nb,offset)+=y->x",
-    "x-y",
-    "x*y",
-    "x/y",
-    "x^2",
-    "√x",
-    "log(x)",
-    "Σx",
-    "Σx_k",
-    "Σx/n",
-    "argmax(x)",
-    "repeat(x)",
-    "repeat_back(x)",
-    "concat(x, y)",
-    "silu_back(x)",
-    "norm(x)",
-    "rms_norm(x)",
-    "rms_norm_back(x)",
-    "group_norm(x)",
-
-    "X*Y",
-    "X[i]*Y",
-    "X*Y",
-
-    "x*v",
-    "y-\\>view(x)",
-    "x-\\>y",
-    "cont(x)",
-    "reshape(x)",
-    "view(x)",
-    "permute(x)",
-    "transpose(x)",
-    "get_rows(x)",
-    "get_rows_back(x)",
-    "diag(x)",
-    "diag_mask_inf(x)",
-    "diag_mask_zero(x)",
-    "soft_max(x)",
-    "soft_max_back(x)",
-    "rope(x)",
-    "rope_back(x)",
-    "alibi(x)",
-    "clamp(x)",
-    "conv_transpose_1d(x)",
-    "im2col(x)",
-    "conv_transpose_2d(x)",
-    "pool_1d(x)",
-    "pool_2d(x)",
-    "upscale(x)",
-    "pad(x)",
-    "argsort(x)",
-    "leaky_relu(x)",
-
-    "flash_attn(x)",
-    "flash_ff(x)",
-    "flash_attn_back(x)",
-    "win_part(x)",
-    "win_unpart(x)",
-    "get_rel_pos(x)",
-    "add_rel_pos(x)",
-
-    "unary(x)",
-
-    "f(x)",
-    "f(x,y)",
-
-    "custom_f32(x)",
-    "custom_f32(x,y)",
-    "custom_f32(x,y,z)",
-
-    "custom(x)",
-    "custom(x,y)",
-    "custom(x,y,z)",
-
-    "cross_entropy_loss(x,y)",
-    "cross_entropy_loss_back(x,y)",
-};
-
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
-
-static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
-
-
-static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
-    "ABS",
-    "SGN",
-    "NEG",
-    "STEP",
-    "TANH",
-    "ELU",
-    "RELU",
-    "GELU",
-    "GELU_QUICK",
-    "SILU",
-    "HARDSWISH",
-    "HARDSIGMOID",
-};
-
-static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
-
-
-static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
-static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
-
-// WARN:
-// Mis-configuration can lead to problem that's hard to reason about:
-// * At best  it crash or talks nosense.
-// * At worst it talks slightly difference but hard to perceive.
-//
-// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
-// Take care about compile options (e.g., GGML_USE_xxx).
-static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
-static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
-
-static void ggml_setup_op_has_task_pass(void) {
-    {   // INIT
-        bool * p = GGML_OP_HAS_INIT;
-
-        p[GGML_OP_ACC                    ] = true;
-        p[GGML_OP_MUL_MAT                ] = true;
-        p[GGML_OP_MUL_MAT_ID             ] = true;
-        p[GGML_OP_OUT_PROD               ] = true;
-        p[GGML_OP_SET                    ] = true;
-        p[GGML_OP_GET_ROWS_BACK          ] = true;
-        p[GGML_OP_DIAG_MASK_INF          ] = true;
-        p[GGML_OP_DIAG_MASK_ZERO         ] = true;
-        p[GGML_OP_CONV_TRANSPOSE_1D      ] = true;
-        p[GGML_OP_CONV_TRANSPOSE_2D      ] = true;
-        p[GGML_OP_FLASH_ATTN_BACK        ] = true;
-        p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
-        p[GGML_OP_ADD_REL_POS            ] = true;
-    }
-
-    {   // FINALIZE
-        bool * p = GGML_OP_HAS_FINALIZE;
-
-        p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
-    }
-}
-
-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool   mem_buffer_owned;
-    bool   no_alloc;
-    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
-
-    int    n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
-//
-// NUMA support
-//
-
-#define GGML_NUMA_MAX_NODES 8
-#define GGML_NUMA_MAX_CPUS 512
-
-struct ggml_numa_node {
-    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
-    uint32_t n_cpus;
-};
-
-struct ggml_numa_nodes {
-    enum ggml_numa_strategy numa_strategy;
-    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
-    uint32_t n_nodes;
-    uint32_t total_cpus; // hardware threads on system
-    uint32_t current_node; // node on which main process is execting
-#if defined(__gnu_linux__)
-    cpu_set_t cpuset; // cpuset from numactl
-#else
-    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
-#endif
-};
-
-//
-// ggml state
-//
-
-struct ggml_state {
-    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
-    struct ggml_numa_nodes numa;
-};
-
-// global state
-static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
-
-// barrier via spin lock
-inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
-    }
-}
-
-// TODO: make this somehow automatically executed
-//       some sort of "sentry" mechanism
-inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
-}
-
-#if defined(__gnu_linux__)
-static cpu_set_t ggml_get_numa_affinity(void) {
-    cpu_set_t cpuset;
-    pthread_t thread;
-    thread = pthread_self();
-    CPU_ZERO(&cpuset);
-    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
-    return cpuset;
-}
-#else
-static uint32_t ggml_get_numa_affinity(void) {
-    return 0; // no NUMA support
-}
-#endif
-
-void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
-    if (g_state.numa.n_nodes > 0) {
-        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
-
-        return;
-    }
-
-#if defined(__gnu_linux__)
-    struct stat st;
-    char path[256];
-    int rv;
-
-    // set numa scheme
-    g_state.numa.numa_strategy = numa_flag;
-
-    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
-
-    g_state.numa.cpuset = ggml_get_numa_affinity();
-
-    // enumerate nodes
-    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++g_state.numa.n_nodes;
-    }
-
-    // enumerate CPUs
-    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++g_state.numa.total_cpus;
-    }
-
-    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
-
-    // figure out which node we're on
-    uint current_cpu;
-    int getcpu_ret = 0;
-#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
-    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
-#else
-    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
-    getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
-#endif
-
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
-        g_state.numa.n_nodes = 0;
-        return;
-    }
-
-    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
-
-    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
-        struct ggml_numa_node * node = &g_state.numa.nodes[n];
-        GGML_PRINT_DEBUG("CPUs on node %u:", n);
-        node->n_cpus = 0;
-        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
-            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
-            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-            if (stat(path, &st) == 0) {
-                node->cpus[node->n_cpus++] = c;
-                GGML_PRINT_DEBUG(" %u", c);
-            }
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    if (ggml_is_numa()) {
-        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
-        if (fptr != NULL) {
-            char buf[42];
-            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
-                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
-            }
-            fclose(fptr);
-        }
-    }
-#else
-    GGML_UNUSED(numa_flag);
-    // TODO
-#endif
-}
-
-bool ggml_is_numa(void) {
-    return g_state.numa.n_nodes > 1;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_print_object(const struct ggml_object * obj) {
-    GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
-            obj->type, obj->offs, obj->size, (const void *) obj->next);
-}
-
-void ggml_print_objects(const struct ggml_context * ctx) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
-
-    while (obj != NULL) {
-        ggml_print_object(obj);
-        obj = obj->next;
-    }
-
-    GGML_PRINT("%s: --- end ---\n", __func__);
-}
-
-GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-}
-
-GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-}
-
-GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    size_t nbytes;
-    size_t blck_size = ggml_blck_size(tensor->type);
-    if (blck_size == 1) {
-        nbytes = ggml_type_size(tensor->type);
-        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
-        }
-    }
-    else {
-        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
-        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
-            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
-        }
-    }
-
-    return nbytes;
-}
-
-size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
-    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
-}
-
-GGML_CALL int ggml_blck_size(enum ggml_type type) {
-    return type_traits[type].blck_size;
-}
-
-GGML_CALL size_t ggml_type_size(enum ggml_type type) {
-    return type_traits[type].type_size;
-}
-
-GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
-    assert(ne % ggml_blck_size(type) == 0);
-    return ggml_type_size(type)*ne/ggml_blck_size(type);
-}
-
-double ggml_type_sizef(enum ggml_type type) {
-    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
-}
-
-GGML_CALL const char * ggml_type_name(enum ggml_type type) {
-    return type_traits[type].type_name;
-}
-
-GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
-    return type_traits[type].is_quantized;
-}
-
-GGML_CALL const char * ggml_op_name(enum ggml_op op) {
-    return GGML_OP_NAME[op];
-}
-
-const char * ggml_op_symbol(enum ggml_op op) {
-    return GGML_OP_SYMBOL[op];
-}
-
-const char * ggml_unary_op_name(enum ggml_unary_op op) {
-    return GGML_UNARY_OP_NAME[op];
-}
-
-GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
-    if (t->op == GGML_OP_UNARY) {
-        enum ggml_unary_op uop = ggml_get_unary_op(t);
-        return ggml_unary_op_name(uop);
-    }
-    else {
-        return ggml_op_name(t->op);
-    }
-}
-
-GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return ggml_type_size(tensor->type);
-}
-
-bool ggml_is_scalar(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-bool ggml_is_vector(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-bool ggml_is_matrix(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-bool ggml_is_3d(const struct ggml_tensor * tensor) {
-    return tensor->ne[3] == 1;
-}
-
-int ggml_n_dims(const struct ggml_tensor * tensor) {
-    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
-        if (tensor->ne[i] > 1) {
-            return i + 1;
-        }
-    }
-    return 1;
-}
-
-static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[0]           == t1->ne[0])  &&
-           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
-           (t1->ne[3]%t0->ne[3] == 0);
-}
-
-static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[1] == t1->ne[1])   &&
-           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
-           (t1->ne[3]%t0->ne[3] == 0);
-}
-
-enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
-    enum ggml_type wtype = GGML_TYPE_COUNT;
-
-    switch (ftype) {
-        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
-        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
-        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
-        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
-        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
-        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
-        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
-        case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
-        case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
-        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
-        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
-        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
-        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS;  break;
-        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
-        case GGML_FTYPE_MOSTLY_IQ3_XXS:       wtype = GGML_TYPE_IQ3_XXS;  break;
-        case GGML_FTYPE_MOSTLY_IQ1_S:         wtype = GGML_TYPE_IQ1_S;    break;
-        case GGML_FTYPE_MOSTLY_IQ4_NL:        wtype = GGML_TYPE_IQ4_NL;   break;
-        case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
-        case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
-        case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
-        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
-        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
-    }
-
-    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
-
-    return wtype;
-}
-
-size_t ggml_tensor_overhead(void) {
-    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
-}
-
-GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
-    return tensor->nb[0] > tensor->nb[1];
-}
-
-GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
-}
-
-static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t0->ne[0] == t1->ne[0] ) &&
-        (t0->ne[1] == t1->ne[1] ) &&
-        (t0->ne[2] == t1->ne[2] ) &&
-        (t0->ne[3] == t1->ne[3] );
-}
-
-// check if t1 can be represented as a repeatition of t0
-static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t1->ne[0]%t0->ne[0] == 0) &&
-        (t1->ne[1]%t0->ne[1] == 0) &&
-        (t1->ne[2]%t0->ne[2] == 0) &&
-        (t1->ne[3]%t0->ne[3] == 0);
-}
-
-static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
-}
-
-static inline int ggml_up32(int n) {
-    return (n + 31) & ~31;
-}
-
-//static inline int ggml_up64(int n) {
-//    return (n + 63) & ~63;
-//}
-
-static inline int ggml_up(int n, int m) {
-    // assert m is a power of 2
-    GGML_ASSERT((m & (m - 1)) == 0);
-    return (n + m - 1) & ~(m - 1);
-}
-
-// assert that pointer is aligned to GGML_MEM_ALIGN
-#define ggml_assert_aligned(ptr) \
-    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
-
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            ggml_fp16_t ii;
-            for (int i = 0; i < (1 << 16); ++i) {
-                uint16_t ui = i;
-                memcpy(&ii, &ui, sizeof(ii));
-                const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
-                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-                ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-                ggml_table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-        // initialize g_state
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            g_state = (struct ggml_state) {
-                /*.contexts =*/ { { 0 } },
-                /*.numa =*/ {
-                    .n_nodes = 0,
-                    .total_cpus = 0,
-                },
-            };
-
-            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
-                g_state.contexts[i].used = false;
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
-        ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
-#elif defined(GGML_USE_SYCL)
-        ggml_init_sycl();
-#endif
-
-        ggml_setup_op_has_task_pass();
-
-        is_first_call = false;
-    }
-
-    // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (!g_state.contexts[i].used) {
-            g_state.contexts[i].used = true;
-            ctx = &g_state.contexts[i].context;
-
-            GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
-            break;
-        }
-    }
-
-    if (ctx == NULL) {
-        GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
-
-        ggml_critical_section_end();
-
-        return NULL;
-    }
-
-    // allow to call ggml_init with 0 size
-    if (params.mem_size == 0) {
-        params.mem_size = GGML_MEM_ALIGN;
-    }
-
-    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
-
-    *ctx = (struct ggml_context) {
-        /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
-        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.no_alloc           =*/ params.no_alloc,
-        /*.no_alloc_save      =*/ params.no_alloc,
-        /*.n_objects          =*/ 0,
-        /*.objects_begin      =*/ NULL,
-        /*.objects_end        =*/ NULL,
-        /*.scratch            =*/ { 0, 0, NULL, },
-        /*.scratch_save       =*/ { 0, 0, NULL, },
-    };
-
-    GGML_ASSERT(ctx->mem_buffer != NULL);
-
-    ggml_assert_aligned(ctx->mem_buffer);
-
-    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
-
-    ggml_critical_section_end();
-
-    return ctx;
-}
-
-void ggml_free(struct ggml_context * ctx) {
-    if (ctx == NULL) {
-        return;
-    }
-
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    bool found = false;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (&g_state.contexts[i].context == ctx) {
-            g_state.contexts[i].used = false;
-
-            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                    __func__, i, ggml_used_mem(ctx));
-
-            if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
-            }
-
-            found = true;
-            break;
-        }
-    }
-
-    if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
-    }
-
-    ggml_critical_section_end();
-}
-
-size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
-}
-
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
-    const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
-
-    ctx->scratch = scratch;
-
-    return result;
-}
-
-bool ggml_get_no_alloc(struct ggml_context * ctx) {
-    return ctx->no_alloc;
-}
-
-void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
-    ctx->no_alloc = no_alloc;
-}
-
-void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
-    return ctx->mem_buffer;
-}
-
-size_t ggml_get_mem_size(const struct ggml_context * ctx) {
-    return ctx->mem_size;
-}
-
-size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
-    size_t max_size = 0;
-
-    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
-        size_t bytes = ggml_nbytes(tensor);
-        max_size = MAX(max_size, bytes);
-    }
-
-    return max_size;
-}
-
-// IMPORTANT:
-// when creating "opt" tensors, always save and load the scratch buffer
-// this is an error prone process, but it is necessary to support inplace
-// operators when using scratch buffers
-// TODO: implement a better way
-static void ggml_scratch_save(struct ggml_context * ctx) {
-    // this is needed to allow opt tensors to store their data
-    // TODO: again, need to find a better way
-    ctx->no_alloc_save = ctx->no_alloc;
-    ctx->no_alloc      = false;
-
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
-}
-
-static void ggml_scratch_load(struct ggml_context * ctx) {
-    ctx->no_alloc = ctx->no_alloc_save;
-
-    ctx->scratch = ctx->scratch_save;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
-    // always insert objects at the end of the context's memory pool
-    struct ggml_object * obj_cur = ctx->objects_end;
-
-    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
-    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
-    const size_t cur_end  = cur_offs + cur_size;
-
-    // align to GGML_MEM_ALIGN
-    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
-
-    char * const mem_buffer = ctx->mem_buffer;
-    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
-
-    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + size_needed, ctx->mem_size);
-        assert(false);
-        return NULL;
-    }
-
-    *obj_new = (struct ggml_object) {
-        .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = size_needed,
-        .next = NULL,
-        .type = type,
-    };
-
-    ggml_assert_aligned(mem_buffer + obj_new->offs);
-
-    if (obj_cur != NULL) {
-        obj_cur->next = obj_new;
-    } else {
-        // this is the first object in this context
-        ctx->objects_begin = obj_new;
-    }
-
-    ctx->objects_end = obj_new;
-
-    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
-
-    return obj_new;
-}
-
-static struct ggml_tensor * ggml_new_tensor_impl(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int                   n_dims,
-        const int64_t       * ne,
-        struct ggml_tensor  * view_src,
-        size_t                view_offs) {
-
-    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
-
-    // find the base tensor and absolute offset
-    if (view_src != NULL && view_src->view_src != NULL) {
-        view_offs += view_src->view_offs;
-        view_src   = view_src->view_src;
-    }
-
-    size_t data_size = ggml_row_size(type, ne[0]);
-    for (int i = 1; i < n_dims; i++) {
-        data_size *= ne[i];
-    }
-
-    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
-
-    void * data = view_src != NULL ? view_src->data : NULL;
-    if (data != NULL) {
-        data = (char *) data + view_offs;
-    }
-
-    size_t obj_alloc_size = 0;
-
-    if (view_src == NULL && !ctx->no_alloc) {
-        if (ctx->scratch.data != NULL) {
-            // allocate tensor data in the scratch buffer
-            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
-                assert(false);
-                return NULL;
-            }
-
-            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
-
-            ctx->scratch.offs += data_size;
-        } else {
-            // allocate tensor data in the context's memory pool
-            obj_alloc_size = data_size;
-        }
-    }
-
-    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
-
-    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
-
-    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
-
-    *result = (struct ggml_tensor) {
-        /*.type         =*/ type,
-        /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
-        /*.buffer       =*/ NULL,
-        /*.ne           =*/ { 1, 1, 1, 1 },
-        /*.nb           =*/ { 0, 0, 0, 0 },
-        /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ { 0 },
-        /*.flags        =*/ 0,
-        /*.grad         =*/ NULL,
-        /*.src          =*/ { NULL },
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-        /*.view_src     =*/ view_src,
-        /*.view_offs    =*/ view_offs,
-        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
-        /*.name         =*/ { 0 },
-        /*.extra        =*/ NULL,
-        /*.padding      =*/ { 0 },
-    };
-
-    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
-    //ggml_assert_aligned(result->data);
-
-    for (int i = 0; i < n_dims; i++) {
-        result->ne[i] = ne[i];
-    }
-
-    result->nb[0] = ggml_type_size(type);
-    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
-    for (int i = 2; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
-    }
-
-    ctx->n_objects++;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_tensor(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int                   n_dims,
-        const int64_t       * ne) {
-    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
-}
-
-struct ggml_tensor * ggml_new_tensor_1d(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int64_t ne0) {
-    return ggml_new_tensor(ctx, type, 1, &ne0);
-}
-
-struct ggml_tensor * ggml_new_tensor_2d(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int64_t ne0,
-        int64_t ne1) {
-    const int64_t ne[2] = { ne0, ne1 };
-    return ggml_new_tensor(ctx, type, 2, ne);
-}
-
-struct ggml_tensor * ggml_new_tensor_3d(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2) {
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-    return ggml_new_tensor(ctx, type, 3, ne);
-}
-
-struct ggml_tensor * ggml_new_tensor_4d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2,
-        int64_t ne3) {
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    return ggml_new_tensor(ctx, type, 4, ne);
-}
-
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-
-    ggml_scratch_load(ctx);
-
-    ggml_set_i32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-
-    ggml_scratch_load(ctx);
-
-    ggml_set_f32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
-}
-
-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
-static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    return ((const int32_t *)(tensor->op_params))[i];
-}
-
-static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    ((int32_t *)(tensor->op_params))[i] = value;
-}
-
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
-    memset(tensor->data, 0, ggml_nbytes(tensor));
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return tensor;
-}
-
-void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
-    const int64_t ne2 = tensor->ne[2];
-    const int64_t ne1 = tensor->ne[1];
-    const int64_t ne0 = tensor->ne[0];
-
-    const int64_t i3_ = (i/(ne2*ne1*ne0));
-    const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0);
-    const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0;
-    const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0);
-
-    if (i0) {
-        * i0 = i0_;
-    }
-    if (i1) {
-        * i1 = i1_;
-    }
-    if (i2) {
-        * i2 = i2_;
-    }
-    if (i3) {
-        * i3 = i3_;
-    }
-}
-
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            }
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
-        return;
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            return ((int8_t *) data)[0];
-        case GGML_TYPE_I16:
-            return ((int16_t *) data)[0];
-        case GGML_TYPE_I32:
-            return ((int32_t *) data)[0];
-        case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
-        case GGML_TYPE_F32:
-            return ((float *) data)[0];
-        default:
-            GGML_ASSERT(false);
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(data))[0] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            }
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
-        return;
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            return ((int8_t *) data)[0];
-        case GGML_TYPE_I16:
-            return ((int16_t *) data)[0];
-        case GGML_TYPE_I32:
-            return ((int32_t *) data)[0];
-        case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
-        case GGML_TYPE_F32:
-            return ((float *) data)[0];
-        default:
-            GGML_ASSERT(false);
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(data))[0] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-void * ggml_get_data(const struct ggml_tensor * tensor) {
-    return tensor->data;
-}
-
-float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
-    assert(tensor->type == GGML_TYPE_F32);
-    return (float *)(tensor->data);
-}
-
-GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
-    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
-}
-
-const char * ggml_get_name(const struct ggml_tensor * tensor) {
-    return tensor->name;
-}
-
-struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
-    tensor->name[sizeof(tensor->name) - 1] = '\0';
-    return tensor;
-}
-
-struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
-    va_end(args);
-    return tensor;
-}
-
-struct ggml_tensor * ggml_view_tensor(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
-    ggml_format_name(result, "%s (view)", src->name);
-
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = src->nb[i];
-    }
-
-    return result;
-}
-
-struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    char * const mem_buffer = ctx->mem_buffer;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
-            return (struct ggml_tensor *)(mem_buffer + obj->offs);
-        }
-
-        obj = obj->next;
-    }
-
-    return NULL;
-}
-
-struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
-    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
-    obj = obj->next;
-
-    char * const mem_buffer = ctx->mem_buffer;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
-            return (struct ggml_tensor *)(mem_buffer + obj->offs);
-        }
-
-        obj = obj->next;
-    }
-
-    return NULL;
-}
-
-struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    char * const mem_buffer = ctx->mem_buffer;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
-            struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
-            if (strcmp(cur->name, name) == 0) {
-                return cur;
-            }
-        }
-
-        obj = obj->next;
-    }
-
-    return NULL;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// ggml_dup
-
-static struct ggml_tensor * ggml_dup_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_DUP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_dup(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_dup_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_dup_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_dup_impl(ctx, a, true);
-}
-
-// ggml_add
-
-static struct ggml_tensor * ggml_add_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_add_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add_impl(ctx, a, b, true);
-}
-
-// ggml_add_cast
-
-static struct ggml_tensor * ggml_add_cast_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        enum   ggml_type     type) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
-
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add_cast(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        enum   ggml_type     type) {
-    return ggml_add_cast_impl(ctx, a, b, type);
-}
-
-// ggml_add1
-
-static struct ggml_tensor * ggml_add1_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_is_scalar(b));
-    GGML_ASSERT(ggml_is_padded_1d(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ADD1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add1(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add1_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_add1_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add1_impl(ctx, a, b, true);
-}
-
-// ggml_acc
-
-static struct ggml_tensor * ggml_acc_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
-    GGML_ASSERT(b->type == GGML_TYPE_F32);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_ACC;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_acc(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset) {
-    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
-}
-
-struct ggml_tensor * ggml_acc_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset) {
-    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
-}
-
-// ggml_sub
-
-static struct ggml_tensor * ggml_sub_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SUB;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sub(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_sub_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_sub_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_sub_impl(ctx, a, b, true);
-}
-
-// ggml_mul
-
-static struct ggml_tensor * ggml_mul_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(!is_node);
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_MUL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_mul(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_mul_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_mul_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_mul_impl(ctx, a, b, true);
-}
-
-// ggml_div
-
-static struct ggml_tensor * ggml_div_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(!is_node);
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_DIV;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_div(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_div_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_div_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_div_impl(ctx, a, b, true);
-}
-
-// ggml_sqr
-
-static struct ggml_tensor * ggml_sqr_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SQR;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sqr(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqr_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sqr_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqr_impl(ctx, a, true);
-}
-
-// ggml_sqrt
-
-static struct ggml_tensor * ggml_sqrt_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SQRT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sqrt(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqrt_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sqrt_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqrt_impl(ctx, a, true);
-}
-
-// ggml_log
-
-static struct ggml_tensor * ggml_log_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_LOG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_log(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_log_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_log_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_log_impl(ctx, a, true);
-}
-
-// ggml_sum
-
-struct ggml_tensor * ggml_sum(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
-
-    result->op   = GGML_OP_SUM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_sum_rows
-
-struct ggml_tensor * ggml_sum_rows(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    int64_t ne[GGML_MAX_DIMS] = { 1 };
-    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
-        ne[i] = a->ne[i];
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
-
-    result->op   = GGML_OP_SUM_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_mean
-
-struct ggml_tensor * ggml_mean(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement
-        is_node = true;
-    }
-
-    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op   = GGML_OP_MEAN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_argmax
-
-struct ggml_tensor * ggml_argmax(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    GGML_ASSERT(ggml_is_matrix(a));
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false);
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
-
-    result->op   = GGML_OP_ARGMAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_repeat
-
-struct ggml_tensor * ggml_repeat(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_repeat(a, b));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
-
-    result->op   = GGML_OP_REPEAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_repeat_back
-
-struct ggml_tensor * ggml_repeat_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
-
-    result->op   = GGML_OP_REPEAT_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_concat
-
-struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b) {
-    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
-
-    result->op = GGML_OP_CONCAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_abs
-
-struct ggml_tensor * ggml_abs(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
-}
-
-struct ggml_tensor * ggml_abs_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
-}
-
-// ggml_sgn
-
-struct ggml_tensor * ggml_sgn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
-}
-
-struct ggml_tensor * ggml_sgn_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
-}
-
-// ggml_neg
-
-struct ggml_tensor * ggml_neg(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
-}
-
-struct ggml_tensor * ggml_neg_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
-}
-
-// ggml_step
-
-struct ggml_tensor * ggml_step(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
-}
-
-struct ggml_tensor * ggml_step_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
-}
-
-// ggml_tanh
-
-struct ggml_tensor * ggml_tanh(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
-}
-
-struct ggml_tensor * ggml_tanh_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
-}
-
-// ggml_elu
-
-struct ggml_tensor * ggml_elu(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
-}
-
-struct ggml_tensor * ggml_elu_inplace(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
-}
-
-// ggml_relu
-
-struct ggml_tensor * ggml_relu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
-}
-
-struct ggml_tensor * ggml_relu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
-}
-
-// ggml_leaky_relu
-
-struct ggml_tensor * ggml_leaky_relu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a, float negative_slope, bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
-
-    result->op   = GGML_OP_LEAKY_RELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_gelu
-
-struct ggml_tensor * ggml_gelu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
-}
-
-struct ggml_tensor * ggml_gelu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
-}
-
-// ggml_gelu_quick
-
-struct ggml_tensor * ggml_gelu_quick(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
-}
-
-struct ggml_tensor * ggml_gelu_quick_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
-}
-
-// ggml_silu
-
-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
-}
-
-struct ggml_tensor * ggml_silu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
-}
-
-// ggml_silu_back
-
-struct ggml_tensor * ggml_silu_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SILU_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml hardswish
-struct ggml_tensor * ggml_hardswish(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
-}
-
-// ggml hardsigmoid
-struct ggml_tensor * ggml_hardsigmoid(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
-}
-
-// ggml_norm
-
-static struct ggml_tensor * ggml_norm_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &eps, sizeof(eps));
-
-    result->op   = GGML_OP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps) {
-    return ggml_norm_impl(ctx, a, eps, false);
-}
-
-struct ggml_tensor * ggml_norm_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps) {
-    return ggml_norm_impl(ctx, a, eps, true);
-}
-
-// ggml_rms_norm
-
-static struct ggml_tensor * ggml_rms_norm_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &eps, sizeof(eps));
-
-    result->op   = GGML_OP_RMS_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float  eps) {
-    return ggml_rms_norm_impl(ctx, a, eps, false);
-}
-
-struct ggml_tensor * ggml_rms_norm_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps) {
-    return ggml_rms_norm_impl(ctx, a, eps, true);
-}
-
-// ggml_rms_norm_back
-
-struct ggml_tensor * ggml_rms_norm_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        float  eps) {
-    bool is_node = false;
-
-    if (a->grad) {
-        // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &eps, sizeof(eps));
-
-    result->op   = GGML_OP_RMS_NORM_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_group_norm
-
-static struct ggml_tensor * ggml_group_norm_impl(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups,
-    bool inplace) {
-
-    bool is_node = false;
-    if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op_params[0] = n_groups;
-
-    result->op = GGML_OP_GROUP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_group_norm(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, false);
-}
-
-struct ggml_tensor * ggml_group_norm_inplace(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, true);
-}
-
-// ggml_mul_mat
-
-struct ggml_tensor * ggml_mul_mat(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_can_mul_mat(a, b));
-    GGML_ASSERT(!ggml_is_transposed(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op   = GGML_OP_MUL_MAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-void ggml_mul_mat_set_prec(
-        struct ggml_tensor * a,
-        enum ggml_prec       prec) {
-    const int32_t prec_i32 = (int32_t) prec;
-
-    ggml_set_op_params_i32(a, 0, prec_i32);
-}
-
-// ggml_mul_mat_id
-
-struct ggml_tensor * ggml_mul_mat_id(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * const as[],
-        int                   n_as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
-    GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
-
-    bool is_node = false;
-
-    if (as[0]->grad || b->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);
-
-    result->op   = GGML_OP_MUL_MAT_ID;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
-    result->src[1] = b;
-
-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
-
-    return result;
-}
-
-// ggml_out_prod
-
-struct ggml_tensor * ggml_out_prod(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_can_out_prod(a, b));
-    GGML_ASSERT(!ggml_is_transposed(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
-    const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op   = GGML_OP_OUT_PROD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_scale
-
-static struct ggml_tensor * ggml_scale_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 s,
-        bool inplace) {
-    GGML_ASSERT(ggml_is_padded_1d(a));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &s, sizeof(s));
-
-    result->op   = GGML_OP_SCALE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_scale(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        float                s) {
-    return ggml_scale_impl(ctx, a, s, false);
-}
-
-struct ggml_tensor * ggml_scale_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        float                s) {
-    return ggml_scale_impl(ctx, a, s, true);
-}
-
-// ggml_set
-
-static struct ggml_tensor * ggml_set_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // make a view of the destination
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_SET;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_set(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
-}
-
-struct ggml_tensor * ggml_set_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
-}
-
-struct ggml_tensor * ggml_set_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
-}
-
-struct ggml_tensor * ggml_set_1d_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
-}
-
-struct ggml_tensor * ggml_set_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
-}
-
-struct ggml_tensor * ggml_set_2d_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
-}
-
-// ggml_cpy
-
-static struct ggml_tensor * ggml_cpy_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // inplace is false and either one have a grad
-        is_node = true;
-    }
-
-    // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
-    if (strlen(b->name) > 0) {
-        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
-    } else {
-        ggml_format_name(result, "%s (copy)", a->name);
-    }
-
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_cpy(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b);
-}
-
-struct ggml_tensor * ggml_cast(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum   ggml_type      type) {
-    bool is_node = false;
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
-    ggml_format_name(result, "%s (copy)", a->name);
-
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = result;
-
-    return result;
-}
-
-// ggml_cont
-
-static struct ggml_tensor * ggml_cont_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    ggml_format_name(result, "%s (cont)", a->name);
-
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_cont(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a);
-}
-
-// make contiguous, with new shape
-GGML_API struct ggml_tensor * ggml_cont_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0) {
-    return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
-}
-
-GGML_API struct ggml_tensor * ggml_cont_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1) {
-    return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
-}
-
-GGML_API struct ggml_tensor * ggml_cont_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2) {
-    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
-}
-
-struct ggml_tensor * ggml_cont_4d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3) {
-    GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
-
-    bool is_node = false;
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
-    ggml_format_name(result, "%s (cont)", a->name);
-
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_reshape
-
-struct ggml_tensor * ggml_reshape(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (b->grad) {
-        // gradient propagation is not supported
-        //GGML_ASSERT(false);
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[1] = { ne0 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[2] = { ne0, ne1 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_4d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-static struct ggml_tensor * ggml_view_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_dims,
-        const int64_t       * ne,
-        size_t                offset) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
-
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_view_1d
-
-struct ggml_tensor * ggml_view_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        size_t                offset) {
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
-
-    return result;
-}
-
-// ggml_view_2d
-
-struct ggml_tensor * ggml_view_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        size_t                nb1,
-        size_t                offset) {
-
-    const int64_t ne[2] = { ne0, ne1 };
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = result->nb[1]*ne1;
-    result->nb[3] = result->nb[2];
-
-    return result;
-}
-
-// ggml_view_3d
-
-struct ggml_tensor * ggml_view_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                offset) {
-
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = nb2;
-    result->nb[3] = result->nb[2]*ne2;
-
-    return result;
-}
-
-// ggml_view_4d
-
-struct ggml_tensor * ggml_view_4d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset) {
-
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = nb2;
-    result->nb[3] = nb3;
-
-    return result;
-}
-
-// ggml_permute
-
-struct ggml_tensor * ggml_permute(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   axis0,
-        int                   axis1,
-        int                   axis2,
-        int                   axis3) {
-    GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
-
-    GGML_ASSERT(axis0 != axis1);
-    GGML_ASSERT(axis0 != axis2);
-    GGML_ASSERT(axis0 != axis3);
-    GGML_ASSERT(axis1 != axis2);
-    GGML_ASSERT(axis1 != axis3);
-    GGML_ASSERT(axis2 != axis3);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    ggml_format_name(result, "%s (permuted)", a->name);
-
-    int ne[GGML_MAX_DIMS];
-    int nb[GGML_MAX_DIMS];
-
-    ne[axis0] = a->ne[0];
-    ne[axis1] = a->ne[1];
-    ne[axis2] = a->ne[2];
-    ne[axis3] = a->ne[3];
-
-    nb[axis0] = a->nb[0];
-    nb[axis1] = a->nb[1];
-    nb[axis2] = a->nb[2];
-    nb[axis3] = a->nb[3];
-
-    result->ne[0] = ne[0];
-    result->ne[1] = ne[1];
-    result->ne[2] = ne[2];
-    result->ne[3] = ne[3];
-
-    result->nb[0] = nb[0];
-    result->nb[1] = nb[1];
-    result->nb[2] = nb[2];
-    result->nb[3] = nb[3];
-
-    result->op   = GGML_OP_PERMUTE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    return result;
-}
-
-// ggml_transpose
-
-struct ggml_tensor * ggml_transpose(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    ggml_format_name(result, "%s (transposed)", a->name);
-
-    result->ne[0] = a->ne[1];
-    result->ne[1] = a->ne[0];
-
-    result->nb[0] = a->nb[1];
-    result->nb[1] = a->nb[0];
-
-    result->op   = GGML_OP_TRANSPOSE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_get_rows
-
-struct ggml_tensor * ggml_get_rows(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(a->ne[2] == b->ne[1]);
-    GGML_ASSERT(b->ne[3] == 1);
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // TODO: implement non F32 return
-    enum ggml_type type = GGML_TYPE_F32;
-    if (a->type == GGML_TYPE_I32) {
-        type = a->type;
-    }
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
-
-    result->op   = GGML_OP_GET_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_get_rows_back
-
-struct ggml_tensor * ggml_get_rows_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        struct ggml_tensor  * c) {
-    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // TODO: implement non F32 return
-    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
-
-    result->op   = GGML_OP_GET_ROWS_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_diag
-
-struct ggml_tensor * ggml_diag(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    GGML_ASSERT(a->ne[1] == 1);
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
-
-    result->op   = GGML_OP_DIAG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_diag_mask_inf
-
-static struct ggml_tensor * ggml_diag_mask_inf_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { n_past };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_DIAG_MASK_INF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_diag_mask_inf(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
-}
-
-struct ggml_tensor * ggml_diag_mask_inf_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
-}
-
-// ggml_diag_mask_zero
-
-static struct ggml_tensor * ggml_diag_mask_zero_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { n_past };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_DIAG_MASK_ZERO;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_diag_mask_zero(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
-}
-
-struct ggml_tensor * ggml_diag_mask_zero_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
-}
-
-// ggml_soft_max
-
-static struct ggml_tensor * ggml_soft_max_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
-        float                 scale,
-        float                 max_bias,
-        bool                  inplace) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-
-    if (mask) {
-        GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(ggml_is_matrix(mask));
-        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
-    }
-
-    if (pos) {
-        GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F32);
-        GGML_ASSERT(pos->ne[0] == a->ne[0]);
-    }
-
-    if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
-    }
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    float params[] = { scale, max_bias };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_SOFT_MAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = mask;
-    result->src[2] = pos;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_soft_max(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
-}
-
-struct ggml_tensor * ggml_soft_max_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
-}
-
-struct ggml_tensor * ggml_soft_max_ext(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
-        float                 scale,
-        float                 max_bias) {
-    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
-}
-
-// ggml_soft_max_back
-
-static struct ggml_tensor * ggml_soft_max_back_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true; // TODO : implement backward pass
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SOFT_MAX_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_soft_max_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_soft_max_back_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, true);
-}
-
-// ggml_rope
-
-static struct ggml_tensor * ggml_rope_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow,
-        float                 xpos_base,
-        bool                  xpos_down,
-        bool                  inplace) {
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    memcpy(params + 11, &xpos_base,    sizeof(float));
-    memcpy(params + 12, &xpos_down,    sizeof(bool));
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_ROPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_rope(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
-    );
-}
-
-struct ggml_tensor * ggml_rope_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
-    );
-}
-
-struct ggml_tensor * ggml_rope_custom(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
-    );
-}
-
-struct ggml_tensor * ggml_rope_custom_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
-    );
-}
-
-struct ggml_tensor * ggml_rope_xpos_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        float                 base,
-        bool                  down) {
-    return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
-}
-
-// ggml_rope_back
-
-struct ggml_tensor * ggml_rope_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow,
-        float                 xpos_base,
-        bool                  xpos_down) {
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
-
-    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = false; // TODO: implement backward
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    memcpy(params + 11, &xpos_base,    sizeof(float));
-    memcpy(params + 12, &xpos_down,    sizeof(bool));
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_ROPE_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head,
-        float                 bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_clamp
-
-struct ggml_tensor * ggml_clamp(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 min,
-        float                 max) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    float params[] = { min, max };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_CLAMP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_conv_1d
-
-static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
-}
-
-GGML_API struct ggml_tensor * ggml_conv_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
-                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC，IC, K] => [OC, IC * K]
-
-    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
-
-    return result;
-}
-
-// ggml_conv_1d_ph
-
-struct ggml_tensor* ggml_conv_1d_ph(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s,
-        int                   d) {
-    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
-}
-
-// ggml_conv_transpose_1d
-
-static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
-}
-
-GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    GGML_ASSERT(ggml_is_matrix(b));
-    GGML_ASSERT(a->ne[2] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
-
-    GGML_ASSERT(p0 == 0);
-    GGML_ASSERT(d0 == 1);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
-        a->ne[1], b->ne[2], 1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_CONV_TRANSPOSE_1D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_conv_depthwise
-struct ggml_tensor * ggml_conv_depthwise_2d(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1) {
-
-    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
-    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
-                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
-    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
-
-    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
-    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
-
-    return result;
-}
-// ggml_conv_2d
-
-// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OH, OW, IC*KH*KW]
-struct ggml_tensor * ggml_im2col(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1,
-    bool                 is_2D,
-    enum ggml_type       dst_type) {
-
-    if(is_2D) {
-        GGML_ASSERT(a->ne[2] == b->ne[2]);
-    } else {
-        GGML_ASSERT(a->ne[1] == b->ne[1]);
-    }
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
-    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-
-    const int64_t ne[4] = {
-        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
-        OW,
-        is_2D ? OH : b->ne[2],
-        is_2D ?      b->ne[3] : 1,
-    };
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_IM2COL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OC, OH, OW]
-struct ggml_tensor * ggml_conv_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                  s0,
-        int                  s1,
-        int                  p0,
-        int                  p1,
-        int                  d0,
-        int                  d1) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
-                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
-
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
-    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
-
-
-    return result;
-}
-
-// ggml_conv_2d_sk_p0
-struct ggml_tensor * ggml_conv_2d_sk_p0(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
-}
-
-// ggml_conv_2d_s1_ph
-
-struct ggml_tensor * ggml_conv_2d_s1_ph(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
-}
-
-// ggml_conv_transpose_2d_p0
-
-static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
-    return (ins - 1) * s - 2 * p + ks;
-}
-
-struct ggml_tensor * ggml_conv_transpose_2d_p0(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   stride) {
-    GGML_ASSERT(a->ne[3] == b->ne[2]);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
-        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
-        a->ne[2], b->ne[3],
-    };
-
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    ggml_set_op_params_i32(result, 0, stride);
-
-    result->op = GGML_OP_CONV_TRANSPOSE_2D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_pool_*
-
-static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
-    return (ins + 2 * p - ks) / s + 1;
-}
-
-// ggml_pool_1d
-
-struct ggml_tensor * ggml_pool_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_op_pool     op,
-        int                   k0,
-        int                   s0,
-        int                   p0) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
-        a->ne[1],
-        a->ne[2],
-        a->ne[3],
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_POOL_1D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_pool_2d
-
-struct ggml_tensor * ggml_pool_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_op_pool     op,
-        int                   k0,
-        int                   k1,
-        int                   s0,
-        int                   s1,
-        float                 p0,
-        float                 p1) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result;
-    const int64_t ne[3] = {
-        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
-        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
-        a->ne[2],
-    };
-    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
-
-    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_POOL_2D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    return result;
-}
-
-// ggml_upscale
-
-static struct ggml_tensor * ggml_upscale_impl(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] * scale_factor,
-            a->ne[1] * scale_factor,
-            a->ne[2], a->ne[3]);
-
-    result->op = GGML_OP_UPSCALE;
-    result->op_params[0] = scale_factor;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_pad(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    int p0, int p1, int p2, int p3) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] + p0,
-            a->ne[1] + p1,
-            a->ne[2] + p2,
-            a->ne[3] + p3);
-
-    result->op = GGML_OP_PAD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_upscale(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
-
-// ggml_argsort
-
-struct ggml_tensor * ggml_argsort(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_sort_order  order) {
-    bool is_node = false;
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
-
-    ggml_set_op_params_i32(result, 0, (int32_t) order);
-
-    result->op   = GGML_OP_ARGSORT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_top_k
-
-struct ggml_tensor * ggml_top_k(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   k) {
-    GGML_ASSERT(a->ne[0] >= k);
-
-    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
-
-    result = ggml_view_4d(ctx, result,
-                k, result->ne[1], result->ne[2], result->ne[3],
-                   result->nb[1], result->nb[2], result->nb[3],
-                0);
-
-    return result;
-}
-
-// ggml_flash_attn
-
-struct ggml_tensor * ggml_flash_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
-
-    int32_t t = masked ? 1 : 0;
-    ggml_set_op_params(result, &t, sizeof(t));
-
-    result->op   = GGML_OP_FLASH_ATTN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = q;
-    result->src[1] = k;
-    result->src[2] = v;
-
-    return result;
-}
-
-// ggml_flash_ff
-
-struct ggml_tensor * ggml_flash_ff(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b0,
-        struct ggml_tensor  * b1,
-        struct ggml_tensor  * c0,
-        struct ggml_tensor  * c1) {
-    GGML_ASSERT(ggml_can_mul_mat(b0, a));
-    // TODO: more checks
-
-    bool is_node = false;
-
-    if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
-
-    result->op   = GGML_OP_FLASH_FF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b0;
-    result->src[2] = b1;
-    result->src[3] = c0;
-    result->src[4] = c1;
-
-    return result;
-}
-
-// ggml_flash_attn_back
-
-struct ggml_tensor * ggml_flash_attn_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        struct ggml_tensor  * d,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    // d shape [D,N,ne2,ne3]
-    // q shape [D,N,ne2,ne3]
-    // k shape [D,M,kvne2,ne3]
-    // v shape [M,D,kvne2,ne3]
-
-    const int64_t     D = q->ne[0];
-    const int64_t     N = q->ne[1];
-    const int64_t     M = k->ne[1];
-    const int64_t   ne2 = q->ne[2];
-    const int64_t   ne3 = q->ne[3];
-    const int64_t kvne2 = k->ne[2];
-
-    GGML_ASSERT(k->ne[0] == D);
-    GGML_ASSERT(v->ne[0] == M);
-    GGML_ASSERT(v->ne[1] == D);
-    GGML_ASSERT(d->ne[0] == D);
-    GGML_ASSERT(d->ne[1] == N);
-    GGML_ASSERT(k->ne[2] == kvne2);
-    GGML_ASSERT(k->ne[3] == ne3);
-    GGML_ASSERT(v->ne[2] == kvne2);
-    GGML_ASSERT(v->ne[3] == ne3);
-    GGML_ASSERT(d->ne[2] == ne2);
-    GGML_ASSERT(d->ne[3] == ne3);
-
-    GGML_ASSERT(ne2 % kvne2 == 0);
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        // when using this operation (in backwards pass) these grads are set.
-        // we don't want to create (big) grad of our result, so is_node is false.
-        is_node = false;
-    }
-
-    // store gradients of q, k and v as continuous tensors concatenated in result.
-    // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
-    const int64_t elem_q = ggml_nelements(q);
-    const int64_t elem_k = ggml_nelements(k);
-    const int64_t elem_v = ggml_nelements(v);
-
-    enum ggml_type result_type = GGML_TYPE_F32;
-    GGML_ASSERT(ggml_blck_size(result_type) == 1);
-    const size_t tsize = ggml_type_size(result_type);
-
-    const size_t offs_q = 0;
-    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-    const size_t end    = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
-
-    const size_t nelements = (end + tsize - 1)/tsize;
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
-
-    int32_t masked_i = masked ? 1 : 0;
-    ggml_set_op_params(result, &masked_i, sizeof(masked_i));
-
-    result->op   = GGML_OP_FLASH_ATTN_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = q;
-    result->src[1] = k;
-    result->src[2] = v;
-    result->src[3] = d;
-
-    return result;
-}
-
-// ggml_win_part
-
-struct ggml_tensor * ggml_win_part(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   w) {
-    GGML_ASSERT(a->ne[3] == 1);
-    GGML_ASSERT(a->type  == GGML_TYPE_F32);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // padding
-    const int px = (w - a->ne[1]%w)%w;
-    const int py = (w - a->ne[2]%w)%w;
-
-    const int npx = (px + a->ne[1])/w;
-    const int npy = (py + a->ne[2])/w;
-    const int np  = npx*npy;
-
-    const int64_t ne[4] = { a->ne[0], w, w, np, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_WIN_PART;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_win_unpart
-
-struct ggml_tensor * ggml_win_unpart(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   w0,
-        int                   h0,
-        int                   w) {
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
-
-    int32_t params[] = { w };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_WIN_UNPART;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_get_rel_pos
-
-struct ggml_tensor * ggml_get_rel_pos(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   qh,
-        int                   kh) {
-    GGML_ASSERT(qh == kh);
-    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
-
-    result->op   = GGML_OP_GET_REL_POS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_add_rel_pos
-
-static struct ggml_tensor * ggml_add_rel_pos_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * pw,
-        struct ggml_tensor  * ph,
-        bool                  inplace) {
-    GGML_ASSERT(ggml_are_same_shape(pw, ph));
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_is_contiguous(pw));
-    GGML_ASSERT(ggml_is_contiguous(ph));
-    GGML_ASSERT(ph->type == GGML_TYPE_F32);
-    GGML_ASSERT(pw->type == GGML_TYPE_F32);
-    GGML_ASSERT(pw->ne[3] == a->ne[2]);
-    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
-    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || pw->grad || ph->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
-
-    result->op   = GGML_OP_ADD_REL_POS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = pw;
-    result->src[2] = ph;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add_rel_pos(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * pw,
-        struct ggml_tensor  * ph) {
-    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
-}
-
-struct ggml_tensor * ggml_add_rel_pos_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * pw,
-        struct ggml_tensor  * ph) {
-    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
-}
-
-// gmml_unary
-
-static struct ggml_tensor * ggml_unary_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        enum ggml_unary_op op,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params_i32(result, 0, (int32_t) op);
-
-    result->op   = GGML_OP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_unary(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_unary_op op) {
-    return ggml_unary_impl(ctx, a, op, false);
-}
-
-struct ggml_tensor * ggml_unary_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_unary_op op) {
-    return ggml_unary_impl(ctx, a, op, true);
-}
-
-// ggml_map_unary
-
-static struct ggml_tensor * ggml_map_unary_impl_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_unary_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, false);
-}
-
-struct ggml_tensor * ggml_map_unary_inplace_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, true);
-}
-
-// ggml_map_binary
-
-static struct ggml_tensor * ggml_map_binary_impl_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun,
-        bool   inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_BINARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_binary_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
-    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
-}
-
-struct ggml_tensor * ggml_map_binary_inplace_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
-    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
-}
-
-// ggml_map_custom1_f32
-
-static struct ggml_tensor * ggml_map_custom1_impl_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_CUSTOM1_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom1_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_f32_t   fun) {
-    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
-}
-
-struct ggml_tensor * ggml_map_custom1_inplace_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_f32_t   fun) {
-    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
-}
-
-// ggml_map_custom2_f32
-
-static struct ggml_tensor * ggml_map_custom2_impl_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_CUSTOM2_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom2_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_f32_t   fun) {
-    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
-}
-
-struct ggml_tensor * ggml_map_custom2_inplace_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_f32_t   fun) {
-    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
-}
-
-// ggml_map_custom3_f32
-
-static struct ggml_tensor * ggml_map_custom3_impl_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad || c->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_CUSTOM3_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom3_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_f32_t   fun) {
-    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
-}
-
-struct ggml_tensor * ggml_map_custom3_inplace_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_f32_t   fun) {
-    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
-}
-
-// ggml_map_custom1
-struct ggml_map_custom1_op_params {
-    ggml_custom1_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
-static struct ggml_tensor * ggml_map_custom1_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
-    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
-
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    struct ggml_map_custom1_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, (const void *) &params, sizeof(params));
-
-    result->op = GGML_OP_MAP_CUSTOM1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom1(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
-}
-
-struct ggml_tensor * ggml_map_custom1_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
-}
-
-// ggml_map_custom2
-
-struct ggml_map_custom2_op_params {
-    ggml_custom2_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
-static struct ggml_tensor * ggml_map_custom2_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
-    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    struct ggml_map_custom2_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, (const void *) &params, sizeof(params));
-
-    result->op = GGML_OP_MAP_CUSTOM2;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom2(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
-}
-
-struct ggml_tensor * ggml_map_custom2_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
-}
-
-// ggml_map_custom3
-
-struct ggml_map_custom3_op_params {
-    ggml_custom3_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
-static struct ggml_tensor * ggml_map_custom3_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
-    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad || c->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    struct ggml_map_custom3_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, (const void *) &params, sizeof(params));
-
-    result->op = GGML_OP_MAP_CUSTOM3;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom3(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
-}
-
-struct ggml_tensor * ggml_map_custom3_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
-}
-
-// ggml_cross_entropy_loss
-
-struct ggml_tensor * ggml_cross_entropy_loss(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
-
-    result->op   = GGML_OP_CROSS_ENTROPY_LOSS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_cross_entropy_loss_back
-
-struct ggml_tensor * ggml_cross_entropy_loss_back(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        struct ggml_tensor          * c) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    GGML_ASSERT(ggml_is_scalar(c));
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
-    result->grad = NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_set_param(
-        struct ggml_context * ctx,
-        struct ggml_tensor * tensor) {
-    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
-
-    GGML_ASSERT(tensor->grad == NULL);
-    tensor->grad = ggml_dup_tensor(ctx, tensor);
-    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
-}
-
-// ggml_compute_forward_dup
-
-static void ggml_compute_forward_dup_same_cont(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb0 = dst->nb[0];
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
-
-    if (ie0 < ie1) {
-        memcpy(
-            ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * ggml_type_size(src0->type));
-    }
-
-}
-static void ggml_compute_forward_dup_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
-
-    if (ggml_is_contiguous(dst)) {
-        if (nb00 == sizeof(ggml_fp16_t)) {
-            if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                            }
-
-                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
-
-                        if (++i10 == ne00) {
-                            i10 = 0;
-                            if (++i11 == ne01) {
-                                i11 = 0;
-                                if (++i12 == ne02) {
-                                    i12 = 0;
-                                    if (++i13 == ne03) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
-
-static void ggml_compute_forward_dup_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        // TODO: simplify
-        if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(float));
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
-
-// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
-static void ggml_compute_forward_dup_bytes(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    const size_t type_size = ggml_type_size(src0->type);
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == type_size && nb0 == type_size) {
-        // copy by rows
-        const size_t rs = ne00 * type_size;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        size_t id = 0;
-        char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
-
-        if (nb00 == type_size) {
-            // src0 is contigous on first dimension, copy by rows
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        memcpy(dst_ptr + id, src0_ptr, rs);
-                        id += rs;
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, type_size);
-
-                            id += type_size;
-                        }
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                    memcpy(dst_ptr, src0_ptr, type_size);
-
-                    if (++i10 == ne0) {
-                        i10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_dup(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (src0->type == dst->type) {
-        ggml_compute_forward_dup_bytes(params, dst);
-        return;
-    }
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_dup_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_dup_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add
-
-static void ggml_compute_forward_add_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-#ifdef GGML_USE_CLBLAST
-    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
-        // TODO: OpenCL kernel support full broadcast
-        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
-        if (ith == 0) {
-            ggml_cl_add(src0, src1, dst);
-        }
-        return;
-    }
-#endif
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_f16_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (dst->type == GGML_TYPE_F32) {
-        GGML_ASSERT( nb0 == sizeof(float));
-    }
-    else {
-        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    }
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F16) {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-                }
-            }
-        } else {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-                }
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_compute_forward_add_f16_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(ggml_fp16_t)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-            for (int i = 0; i < ne0; i++) {
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_compute_forward_add_q_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-    const enum ggml_type dtype = dst->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float;
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        // src1 and dst are same shape as src0 => same indices
-        const int i13 = i03;
-        const int i12 = i02;
-        const int i11 = i01;
-
-        const int i3 = i03;
-        const int i2 = i02;
-        const int i1 = i01;
-
-        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3));
-
-        assert(ne00 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne00);
-        // add src1
-        ggml_vec_acc_f32(ne00, wdata, src1_row);
-        // quantize row to dst
-        if (quantize_row_q != NULL) {
-            quantize_row_q(wdata, dst_row, ne00);
-        } else {
-            memcpy(dst_row, wdata, ne0*nb0);
-        }
-    }
-}
-
-static void ggml_compute_forward_add(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f32(params, dst);
-                }
-                else {
-                    GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add_f16_f16(params, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f16_f32(params, dst);
-                }
-                else {
-                    GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-            {
-                ggml_compute_forward_add_q_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add1
-
-static void ggml_compute_forward_add1_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-#ifdef GGML_USE_ACCELERATE
-        UNUSED(ggml_vec_add1_f32);
-
-        vDSP_vadd(
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                (float *) ((char *) src1->data), 0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                ne0);
-#else
-        ggml_vec_add1_f32(ne0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-               *(float *) src1->data);
-#endif
-    }
-}
-
-static void ggml_compute_forward_add1_f16_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_f16_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // scalar to add
-    const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_q_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
-
-    // we don't support permuted src0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(dst->type == src0->type);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
-        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
-
-        assert(ne0 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne0);
-        // add src1
-        ggml_vec_acc1_f32(ne0, wdata, v);
-        // quantize row to dst
-        quantize_row_q(wdata, dst_row, ne0);
-    }
-}
-
-static void ggml_compute_forward_add1(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add1_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add1_f16_f16(params, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add1_f16_f32(params, dst);
-                }
-                else {
-                    GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-            {
-                ggml_compute_forward_add1_q_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_acc
-
-static void ggml_compute_forward_acc_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-    // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is implicitly element_size because src0 and dst are contiguous
-    size_t nb1     = ((int32_t *) dst->op_params)[0];
-    size_t nb2     = ((int32_t *) dst->op_params)[1];
-    size_t nb3     = ((int32_t *) dst->op_params)[2];
-    size_t offset  = ((int32_t *) dst->op_params)[3];
-    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-    if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
-        if (params->ith != 0) {
-            return;
-        }
-        // memcpy needs to be synchronized across threads to avoid race conditions.
-        // => do it in INIT phase
-        memcpy(
-            ((char *)  dst->data),
-            ((char *) src0->data),
-            ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src1);
-    const int nc = src1->ne[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-    // src0 and dst as viewed during acc
-    const size_t nb0 = ggml_element_size(src0);
-
-    const size_t nb00 = nb0;
-    const size_t nb01 = nb1;
-    const size_t nb02 = nb2;
-    const size_t nb03 = nb3;
-
-    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0  + (ne11 == 0 ? 0 : ne11-1)*nb1  + (ne12 == 0 ? 0 : ne12-1)*nb2  + (ne13 == 0 ? 0 : ne13-1)*nb3  < ggml_nbytes(dst));
-    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are viewed with shape of src1 and offset
-        // => same indices
-        const int i3 = ir/(ne12*ne11);
-        const int i2 = (ir - i3*ne12*ne11)/ne11;
-        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
-
-#ifdef GGML_USE_ACCELERATE
-        vDSP_vadd(
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1  + offset), 1, nc);
-#else
-        ggml_vec_add_f32(nc,
-                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-#endif
-    }
-}
-
-static void ggml_compute_forward_acc(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_acc_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sub
-
-static void ggml_compute_forward_sub_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-#ifdef GGML_USE_ACCELERATE
-            vDSP_vsub(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
-#else
-            ggml_vec_sub_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-#endif
-                // }
-            // }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_sub(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sub_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mul
-
-static void ggml_compute_forward_mul_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-#if defined(GGML_USE_CLBLAST)
-    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
-        // TODO: OpenCL kernel support full broadcast
-        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
-        if (ith == 0) {
-            ggml_cl_mul(src0, src1, dst);
-        }
-        return;
-    }
-#endif
-
-    const int64_t nr = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0 ; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                UNUSED(ggml_vec_mul_f32);
-
-                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mul(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_div
-
-static void ggml_compute_forward_div_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                UNUSED(ggml_vec_div_f32);
-
-                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_div(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_div_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sqr
-
-static void ggml_compute_forward_sqr_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n     = ggml_nrows(src0);
-    const int nc    = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqr_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqr(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqr_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sqrt
-
-static void ggml_compute_forward_sqrt_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqrt_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqrt(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqrt_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_log
-
-static void ggml_compute_forward_log_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_log_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_log(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_log_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sum
-
-static void ggml_compute_forward_sum_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_is_scalar(dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    assert(ggml_is_scalar(dst));
-    assert(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    ggml_float sum     = 0;
-    ggml_float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32_ggf(ne00,
-                        &row_sum,
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((float *) dst->data)[0] = sum;
-}
-
-static void ggml_compute_forward_sum_f16(
-    const struct ggml_compute_params * params,
-          struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_is_scalar(dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    float sum = 0;
-    float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f16_ggf(ne00,
-                    &row_sum,
-                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
-}
-
-static void ggml_compute_forward_sum(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_sum_f16(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sum_rows
-
-static void ggml_compute_forward_sum_rows_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(ne0 == 1);
-    GGML_ASSERT(ne1 == ne01);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    for (int64_t i3 = 0; i3 < ne03; i3++) {
-        for (int64_t i2 = 0; i2 < ne02; i2++) {
-            for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
-                float row_sum = 0;
-                ggml_vec_sum_f32(ne00, &row_sum, src_row);
-                dst_row[0] = row_sum;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_sum_rows(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_rows_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mean
-
-static void ggml_compute_forward_mean_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    assert(ne0 == 1);
-    assert(ne1 == ne01);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    UNUSED(ne0);
-    UNUSED(ne1);
-    UNUSED(ne2);
-    UNUSED(ne3);
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32(ne00,
-                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-
-                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mean(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mean_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_argmax
-
-static void ggml_compute_forward_argmax_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-    assert(dst->nb[0] == sizeof(float));
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb0 = dst->nb[0];
-
-    for (int64_t i1 = 0; i1 < ne01; i1++) {
-        float * src = (float *) ((char *) src0->data + i1*nb01);
-        int32_t * dst_ = (int32_t *) ((char *)  dst->data + i1*nb0);
-        int v = 0;
-        ggml_vec_argmax_f32(ne00, &v, src);
-        dst_[0] = v;
-    }
-}
-
-static void ggml_compute_forward_argmax(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argmax_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_repeat
-
-static void ggml_compute_forward_repeat_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3;  i3++) {
-        for                     (int k3 = 0; k3 < ne03; k3++) {
-            for                 (int i2 = 0; i2 < nr2;  i2++) {
-                for             (int k2 = 0; k2 < ne02; k2++) {
-                    for         (int i1 = 0; i1 < nr1;  i1++) {
-                        for     (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0;  i0++) {
-                                ggml_vec_cpy_f32(ne00,
-                                        (float *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0),
-                                        (float *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3;  i3++) {
-        for                     (int k3 = 0; k3 < ne03; k3++) {
-            for                 (int i2 = 0; i2 < nr2;  i2++) {
-                for             (int k2 = 0; k2 < ne02; k2++) {
-                    for         (int i1 = 0; i1 < nr1;  i1++) {
-                        for     (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0;  i0++) {
-                                ggml_fp16_t * y = (ggml_fp16_t *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0);
-                                ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01);
-                                // ggml_vec_cpy_f16(ne00, y, x)
-                                for (int i = 0; i < ne00; ++i) {
-                                    y[i]  = x[i];
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_I16:
-            {
-                ggml_compute_forward_repeat_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_repeat_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_repeat_back
-
-static void ggml_compute_forward_repeat_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_can_repeat(dst, src0));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne00/ne0);
-    const int nr1 = (int)(ne01/ne1);
-    const int nr2 = (int)(ne02/ne2);
-    const int nr3 = (int)(ne03/ne3);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (ggml_is_contiguous(dst)) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-    } else {
-        for         (int k3 = 0; k3 < ne3; k3++) {
-            for     (int k2 = 0; k2 < ne2; k2++) {
-                for (int k1 = 0; k1 < ne1; k1++) {
-                    ggml_vec_set_f32(ne0,
-                        (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
-                        0);
-                }
-            }
-        }
-    }
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3; i3++) {
-        for                     (int k3 = 0; k3 < ne3; k3++) {
-            for                 (int i2 = 0; i2 < nr2; i2++) {
-                for             (int k2 = 0; k2 < ne2; k2++) {
-                    for         (int i1 = 0; i1 < nr1; i1++) {
-                        for     (int k1 = 0; k1 < ne1; k1++) {
-                            for (int i0 = 0; i0 < nr0; i0++) {
-                                ggml_vec_acc_f32(ne0,
-                                        (float *) ((char *)  dst->data + (         k3)*nb3  + (         k2)*nb2  + (         k1)*nb1),
-                                        (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_repeat_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_concat
-
-static void ggml_compute_forward_concat_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2 += nth) {
-            if (i2 < ne02) { // src0
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_concat(
-    const struct ggml_compute_params* params,
-    struct ggml_tensor* dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_concat_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_abs
-
-static void ggml_compute_forward_abs_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_abs_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_abs(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_abs_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sgn
-
-static void ggml_compute_forward_sgn_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sgn_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sgn(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sgn_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_neg
-
-static void ggml_compute_forward_neg_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_neg_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_neg(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_neg_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_step
-
-static void ggml_compute_forward_step_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_step_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_step(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_step_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_tanh
-
-static void ggml_compute_forward_tanh_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_tanh_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_tanh(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_tanh_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_elu
-
-static void ggml_compute_forward_elu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_elu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_elu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_elu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_relu
-
-static void ggml_compute_forward_relu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_relu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_relu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_gelu
-
-static void ggml_compute_forward_gelu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_gelu_quick
-
-static void ggml_compute_forward_gelu_quick_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_quick_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu_quick(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_quick_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_silu
-
-static void ggml_compute_forward_silu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-// ggml_compute_forward_leaky_relu
-
-static void ggml_compute_forward_leaky_relu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_leaky_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
-    }
-}
-
-static void ggml_compute_forward_leaky_relu(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_leaky_relu_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_silu_back
-
-static void ggml_compute_forward_silu_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * grad = dst->src[1];
-
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, grad));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_backward_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])),
-                (float *) ((char *) grad->data + i1*(grad->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-static void ggml_compute_forward_hardswish_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_hardswish_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-static void ggml_compute_forward_hardswish(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_hardswish_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_hardsigmoid_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_hardsigmoid_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_hardsigmoid(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_hardsigmoid_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-// ggml_compute_forward_norm
-
-static void ggml_compute_forward_norm_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    GGML_ASSERT(eps > 0.0f);
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00];
-                }
-
-                float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_float sum2 = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    float v = x[i00] - mean;
-                    y[i00] = v;
-                    sum2 += (ggml_float)(v*v);
-                }
-
-                float variance = sum2/ne00;
-                const float scale = 1.0f/sqrtf(variance + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_norm(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_norm_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_group_rms_norm
-
-static void ggml_compute_forward_rms_norm_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    GGML_ASSERT(eps > 0.0f);
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)(x[i00] * x[i00]);
-                }
-
-                const float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-                // for (int i00 = 0; i00 < ne00; i00++) {
-                //     y[i00] = x[i00];
-                // }
-
-                const float scale = 1.0f/sqrtf(mean + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_rms_norm_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                // src1 is same shape as src0 => same indices
-                const int64_t i11 = i01;
-                const int64_t i12 = i02;
-                const int64_t i13 = i03;
-
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
-
-                ggml_float sum_xx  = 0.0;
-                ggml_float sum_xdz = 0.0;
-
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum_xx  += (ggml_float)(x[i00] * x[i00]);
-                    sum_xdz += (ggml_float)(x[i00] * dz[i00]);
-                }
-
-                //const float mean     = (float)(sum_xx)/ne00;
-                const float mean_eps = (float)(sum_xx)/ne00 + eps;
-                const float sum_eps  = (float)(sum_xx) + eps*ne00;
-                //const float mean_xdz = (float)(sum_xdz)/ne00;
-                // we could cache rms from forward pass to improve performance.
-                // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
-                //const float rms      = sqrtf(mean_eps);
-                const float rrms     = 1.0f / sqrtf(mean_eps);
-                //const float scale    = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
-
-                {
-                    // z = rms_norm(x)
-                    //
-                    // rms_norm(src0) =
-                    //     scale(
-                    //         src0,
-                    //         div(
-                    //             1,
-                    //             sqrt(
-                    //                 add(
-                    //                     scale(
-                    //                         sum(
-                    //                             sqr(
-                    //                                 src0)),
-                    //                         (1.0/N)),
-                    //                     eps))));
-
-                    // postorder:
-                    // ## op    args         grad
-                    // 00 param src0         grad[#00]
-                    // 01 const 1
-                    // 02 sqr   (#00)        grad[#02]
-                    // 03 sum   (#02)        grad[#03]
-                    // 04 const 1/N
-                    // 05 scale (#03, #04)   grad[#05]
-                    // 06 const eps
-                    // 07 add   (#05, #06)   grad[#07]
-                    // 08 sqrt  (#07)        grad[#08]
-                    // 09 div   (#01,#08)    grad[#09]
-                    // 10 scale (#00,#09)    grad[#10]
-                    //
-                    // backward pass, given grad[#10]
-                    // #10: scale
-                    // grad[#00] += scale(grad[#10],#09)
-                    // grad[#09] += sum(mul(grad[#10],#00))
-                    // #09: div
-                    // grad[#08] += neg(mul(grad[#09], div(#09,#08)))
-                    // #08: sqrt
-                    // grad[#07] += mul(grad[#08], div(0.5, #08))
-                    // #07: add
-                    // grad[#05] += grad[#07]
-                    // #05: scale
-                    // grad[#03] += scale(grad[#05],#04)
-                    // #03: sum
-                    // grad[#02] += repeat(grad[#03], #02)
-                    // #02:
-                    // grad[#00] += scale(mul(#00, grad[#02]), 2.0)
-                    //
-                    // substitute and simplify:
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
-                    // grad[#02] = repeat(grad[#03], #02)
-                    // grad[#02] = repeat(scale(grad[#05],#04), #02)
-                    // grad[#02] = repeat(scale(grad[#07],#04), #02)
-                    // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
-                    // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
-                    // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
-                    // a = b*c + d*e
-                    // a = b*c*f/f + d*e*f/f
-                    // a = (b*c*f + d*e*f)*(1/f)
-                    // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
-                    // a = (b + d*e/c)*c
-                    // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
-                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
-                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
-                    // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
-                    // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
-                    // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
-                    // a = (dz + x*div(-mean_xdz,mean_eps))*rrms
-                    // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
-                    // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                    // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                }
-                // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                // post-order:
-                // dx := x
-                // dx := scale(dx,-mean_xdz/mean_eps)
-                // dx := add(dx, dz)
-                // dx := scale(dx, rrms)
-                float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_vec_cpy_f32  (ne00, dx, x);
-                // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
-                ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
-                ggml_vec_acc_f32  (ne00, dx, dz);
-                ggml_vec_scale_f32(ne00, dx, rrms);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_group_norm
-
-static void ggml_compute_forward_group_norm_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float eps = 1e-6f; // TODO: make this a parameter
-
-    // TODO: optimize
-
-    int n_channels = src0->ne[2];
-    int n_groups = dst->op_params[0];
-    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
-    for (int i = ith; i < n_groups; i+=nth) {
-        int start = i * n_channels_per_group;
-        int end = start + n_channels_per_group;
-        if (end > n_channels) {
-            end = n_channels;
-        }
-        int step = end - start;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            ggml_float sum = 0.0;
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        sum += (ggml_float)x[i00];
-                    }
-                }
-            }
-            float mean = sum / (ne00 * ne01 * step);
-            ggml_float sum2 = 0.0;
-
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        float v = x[i00] - mean;
-                        y[i00] = v;
-                        sum2 += (ggml_float)(v * v);
-                    }
-                }
-            }
-            float variance = sum2 / (ne00 * ne01 * step);
-            const float scale = 1.0f / sqrtf(variance + eps);
-
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-                    ggml_vec_scale_f32(ne00, y, scale);
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_group_norm(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_group_norm_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    //       all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-      //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane      = ne01*ne00;
-        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
-                              float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                              ggml_to_float_t  const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                          ne1, ne01, ne10,
-                         1.0f,    y, ne10,
-                                  x, ne00,
-                         0.0f,    d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        if (src1->type != vec_dot_type) {
-            char * wdata = params->wdata;
-            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-            assert(params->wsize >= ne11*ne12*ne13*row_size);
-            GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-            for (int64_t i13 = 0; i13 < ne13; ++i13) {
-                for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-                        wdata += row_size;
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-    const int64_t nr0 = ne01;          // src0 rows
-    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
-
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
-    // distribute the thread work across the inner or outer loop based on which one is larger
-
-    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-    const int64_t ith0 = ith % nth0;
-    const int64_t ith1 = ith / nth0;
-
-    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-    const int64_t ir010 = dr0*ith0;
-    const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-    const int64_t ir110 = dr1*ith1;
-    const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir010 >= ir011 || ir110 >= ir111) {
-        sched_yield();
-        return;
-    }
-
-    assert(ne12 % ne02 == 0);
-    assert(ne13 % ne03 == 0);
-
-    // block-tiling attempt
-    const int64_t blck_0 = 16;
-    const int64_t blck_1 = 16;
-
-    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t nrc = vec_dot_num_rows;
-    // TODO: currently the mmla kernels support only even numbered rows/cols.
-    // this check can be removed once they are extended to support odd numbered rows/cols too
-    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        nrc = 1;
-    }
-
-    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
-
-    // attempt to reduce false-sharing (does not seem to make a difference)
-    // 16 * 2, accounting for mmla kernels
-    float tmp[32];
-
-    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
-                const int64_t i13 = (ir1/(ne12*ne1));
-                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
-                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
-
-                // broadcast src0 into src1
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const int64_t i1 = i11;
-                const int64_t i2 = i12;
-                const int64_t i3 = i13;
-
-                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
-
-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                //       the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                     : (i11*nb11 + i12*nb12 + i13*nb13));
-                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                //}
-
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
-                }
-
-                for (int cn = 0; cn < nrc; ++cn) {
-                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
-        }
-    }
-}
-
-// ggml_compute_forward_mul_mat_id
-
-static void ggml_compute_forward_mul_mat_id(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
-
-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
-
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
-
-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
-
-   if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        char * wdata = params->wdata;
-        if (src1->type != vec_dot_type) {
-            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-            assert(params->wsize >= ne11*ne12*ne13*row_size);
-            assert(src1->type == GGML_TYPE_F32);
-
-            for (int64_t i13 = 0; i13 < ne13; ++i13) {
-                for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-                        wdata += row_size;
-                    }
-                }
-            }
-        }
-
-        // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
-        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
-
-        // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // compute each matrix multiplication in sequence
-    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
-        const int64_t cne1 = matrix_row_counts[cur_a];
-
-        if (cne1 == 0) {
-            continue;
-        }
-
-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
-
-        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        const int64_t nr0 = ne01;           // src0 rows
-        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
-        // distribute the thread work across the inner or outer loop based on which one is larger
-
-        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-        const int64_t ith0 = ith % nth0;
-        const int64_t ith1 = ith / nth0;
-
-        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-        const int64_t ir010 = dr0*ith0;
-        const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-        const int64_t ir110 = dr1*ith1;
-        const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-        // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
-
-        assert(ne12 % ne02 == 0);
-        assert(ne13 % ne03 == 0);
-
-        // block-tiling attempt
-        const int64_t blck_0 = 16;
-        const int64_t blck_1 = 16;
-
-        // attempt to reduce false-sharing (does not seem to make a difference)
-        float tmp[16];
-
-        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
-                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
-
-                    // broadcast src0 into src1
-                    const int64_t i03 = i13/r3;
-                    const int64_t i02 = i12/r2;
-
-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
-
-                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
-
-                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                    //       the original src1 data pointer, so we should index using the indices directly
-                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                    const char * src1_col = (const char *) wdata +
-                        (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
-
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                    //}
-
-                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
-                    }
-                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
-        }
-    }
-
-    #undef MMID_MATRIX_ROW
-}
-
-// ggml_compute_forward_out_prod
-
-static void ggml_compute_forward_out_prod_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    // int64_t t0 = ggml_perf_time_us();
-    // UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne0  == ne00);
-    GGML_ASSERT(ne1  == ne10);
-    GGML_ASSERT(ne2  == ne02);
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne3  == ne13);
-    GGML_ASSERT(ne03 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_CLBLAST)
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
-        if (ith != 0) {
-            return;
-        }
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst:  (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
-    // dst[:,:,:,:] = 0
-    // for i2,i3:
-    //   for i1:
-    //     for i01:
-    //       for i0:
-    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
-
-    // parallelize by last three dimensions
-
-    // total rows in dst
-    const int64_t nr = ne1*ne2*ne3;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    // block-tiling attempt
-    const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
-    const int64_t blck_1 = 16;
-
-    for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
-        const int64_t bir1 = MIN(bir + blck_1, ir1);
-        for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
-            const int64_t bne01 = MIN(bi01 + blck_0, ne01);
-            for (int64_t ir = bir; ir < bir1; ++ir) {
-                // dst indices
-                const int64_t i3 = ir/(ne2*ne1);
-                const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
-                const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                const int64_t i02 = i2;
-                const int64_t i03 = i3;
-
-                //const int64_t i10 = i1;
-                const int64_t i12 = i2;
-                const int64_t i13 = i3;
-
-#if GGML_VEC_MAD_UNROLL > 2
-                const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL);
-                for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
-                }
-                for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32(ne0, d, s0, *s1);
-                }
-#else
-                for (int64_t i01 = bi01; i01 < bne01; ++i01) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32(ne0, d, s0, *s1);
-                }
-#endif
-            }
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_out_prod_q_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    // int64_t t0 = ggml_perf_time_us();
-    // UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
-    // we don't support permuted src0 dim0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst dim0 cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by last three dimensions
-
-    // total rows in dst
-    const int64_t nr = ne1*ne2*ne3;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    // dst[:,:,:,:] = 0
-    // for i2,i3:
-    //   for i1:
-    //     for i01:
-    //       for i0:
-    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
-
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int64_t ir = ir0; ir < ir1; ++ir) {
-        // dst indices
-        const int64_t i3 = ir/(ne2*ne1);
-        const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
-        const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        const int64_t i02 = i2;
-        const int64_t i03 = i3;
-
-        //const int64_t i10 = i1;
-        const int64_t i12 = i2;
-        const int64_t i13 = i3;
-
-        for (int64_t i01 = 0; i01 < ne01; ++i01) {
-            const int64_t i11 = i01;
-
-            float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-            float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-            float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-            dequantize_row_q(s0, wdata, ne0);
-            ggml_vec_mad_f32(ne0, d, wdata, *s1);
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_out_prod(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-            {
-                ggml_compute_forward_out_prod_q_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(false); // todo
-                // ggml_compute_forward_out_prod_f16_f32(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_out_prod_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_scale
-
-static void ggml_compute_forward_scale_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // scale factor
-    float v;
-    memcpy(&v, dst->op_params, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb1 = dst->nb[1];
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
-        }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
-    }
-}
-
-static void ggml_compute_forward_scale(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_scale_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_set
-
-static void ggml_compute_forward_set_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-    // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is implicitly element_size because src0 and dst are contiguous
-    size_t nb1     = ((int32_t *) dst->op_params)[0];
-    size_t nb2     = ((int32_t *) dst->op_params)[1];
-    size_t nb3     = ((int32_t *) dst->op_params)[2];
-    size_t offset  = ((int32_t *) dst->op_params)[3];
-    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-    if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
-        if (params->ith != 0) {
-            return;
-        }
-        // memcpy needs to be synchronized across threads to avoid race conditions.
-        // => do it in INIT phase
-        memcpy(
-            ((char *)  dst->data),
-            ((char *) src0->data),
-            ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src1);
-    const int nc = src1->ne[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-    // src0 and dst as viewed during set
-    const size_t nb0 = ggml_element_size(src0);
-
-    const int im0 = (ne10 == 0 ? 0 : ne10-1);
-    const int im1 = (ne11 == 0 ? 0 : ne11-1);
-    const int im2 = (ne12 == 0 ? 0 : ne12-1);
-    const int im3 = (ne13 == 0 ? 0 : ne13-1);
-
-    GGML_ASSERT(offset + im0*nb0  + im1*nb1  + im2*nb2  + im3*nb3  <= ggml_nbytes(dst));
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are viewed with shape of src1 and offset
-        // => same indices
-        const int i3 = ir/(ne12*ne11);
-        const int i2 = (ir - i3*ne12*ne11)/ne11;
-        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
-
-        ggml_vec_cpy_f32(nc,
-                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-    }
-}
-
-static void ggml_compute_forward_set(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_set_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_cpy
-
-static void ggml_compute_forward_cpy(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, dst);
-}
-
-// ggml_compute_forward_cont
-
-static void ggml_compute_forward_cont(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, dst);
-}
-
-// ggml_compute_forward_reshape
-
-static void ggml_compute_forward_reshape(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_view
-
-static void ggml_compute_forward_view(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_permute
-
-static void ggml_compute_forward_permute(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_transpose
-
-static void ggml_compute_forward_transpose(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_get_rows
-
-static void ggml_compute_forward_get_rows_q(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == ggml_type_size(type));
-    assert(ggml_nrows(dst) == nr);
-
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-                dequantize_row_q(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_f16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(ggml_fp16_t));
-    assert(ggml_nrows(dst) == nr);
-
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-                ggml_fp16_to_fp32_row(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(float));
-    assert(ggml_nrows(dst) == nr);
-
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-                ggml_vec_cpy_f32(nc,
-                        (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-            {
-                ggml_compute_forward_get_rows_q(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_get_rows_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_get_rows_back
-
-static void ggml_compute_forward_get_rows_back_f32_f16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (params->ith != 0) {
-            return;
-        }
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    GGML_ASSERT( dst->ne[0] == nc);
-    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        for (int j = 0; j < nc; ++j) {
-            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
-            ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_back_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (params->ith != 0) {
-            return;
-        }
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    GGML_ASSERT( dst->ne[0] == nc);
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        ggml_vec_add_f32(nc,
-                (float *) ((char *)  dst->data + r*dst->nb[1]),
-                (float *) ((char *)  dst->data + r*dst->nb[1]),
-                (float *) ((char *) src0->data + i*src0->nb[1]));
-    }
-}
-
-static void ggml_compute_forward_get_rows_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_back_f32_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_get_rows_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_diag
-
-static void ggml_compute_forward_diag_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(ne00 == ne0);
-    GGML_ASSERT(ne00 == ne1);
-    GGML_ASSERT(ne01 == 1);
-    GGML_ASSERT(ne02 == ne2);
-    GGML_ASSERT(ne03 == ne3);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = 0; i2 < ne2; i2++) {
-            for (int i1 = 0; i1 < ne1; i1++) {
-                float * d = (float *)((char *)  dst->data + i3*nb3  + i2*nb2 + i1*nb1);
-                float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
-                for (int i0 = 0; i0 < i1; i0++) {
-                    d[i0] = 0;
-                }
-                d[i1] = s[i1];
-                for (int i0 = i1+1; i0 < ne0; i0++) {
-                    d[i0] = 0;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_diag_mask_inf
-
-static void ggml_compute_forward_diag_mask_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const float value) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int  n_past  = ((int32_t *) dst->op_params)[0];
-    const bool inplace = src0->data == dst->data;
-
-    GGML_ASSERT(n_past >= 0);
-
-    if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
-        if (ith != 0) {
-            return;
-        }
-        // memcpy needs to be synchronized across threads to avoid race conditions.
-        // => do it in INIT phase
-        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-        GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-        memcpy(
-            ((char *)  dst->data),
-            ((char *) src0->data),
-            ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-    const int nr = src0->ne[1];
-    const int nz = n/nr;
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int k = 0; k < nz; k++) {
-        for (int j = ith; j < nr; j += nth) {
-            for (int i = n_past; i < nc; i++) {
-                if (i > n_past + j) {
-                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag_mask_inf(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_diag_mask_zero(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_f32(params, dst, 0);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_soft_max
-
-static void ggml_compute_forward_soft_max_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
-
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    float scale    = 1.0f;
-    float max_bias = 0.0f;
-
-    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int64_t ne11 = src1 ? src1->ne[1] : 1;
-
-    // TODO: is this supposed to be ceil instead of floor?
-    //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head_kv   = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
-
-    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
-
-    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
-
-        // broadcast the mask across rows
-        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
-
-        ggml_vec_cpy_f32  (nc, wp, sp);
-        ggml_vec_scale_f32(nc, wp, scale);
-        if (mp) {
-            ggml_vec_acc_f32(nc, wp, mp);
-        }
-
-        // ALiBi bias
-        if (max_bias > 0.0f) {
-            const uint32_t h  = (i1/ne01)%ne02; // head
-            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
-
-            for (int i = 0; i < nc; i++) {
-                wp[i] = wp[i] + slope*pos[i];
-            }
-        }
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
-
-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
-
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_soft_max_back
-
-static void ggml_compute_forward_soft_max_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src1, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float *dy = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *y  = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float *dx = (float *)((char *) dst->data  + i1*dst->nb[1]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(dy[i]));
-            assert(!isnan(y[i]));
-        }
-#endif
-        // Jii = yi - yi*yi
-        // Jij = -yi*yj
-        // J = diag(y)-y.T*y
-        // dx = J * dy
-        // dxk = sum_i(Jki * dyi)
-        // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
-        // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
-        // dxk = sum_i(-yk*yi * dyi) + yk*dyk
-        // dxk = -yk * sum_i(yi * dyi) + yk*dyk
-        // dxk = -yk * dot(y, dy) + yk*dyk
-        // dxk = yk * (- dot(y, dy) + dyk)
-        // dxk = yk * (dyk - dot(y, dy))
-        //
-        // post-order:
-        // dot_y_dy := dot(y, dy)
-        // dx := dy
-        // dx := dx - dot_y_dy
-        // dx := dx * y
-
-        // linear runtime, no additional memory
-        float dot_y_dy = 0;
-        ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
-        ggml_vec_cpy_f32 (nc, dx, dy);
-        ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
-        ggml_vec_mul_f32 (nc, dx, dx, y);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dx[i]));
-            assert(!isinf(dx[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n  = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src  = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float       *      pdst  =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, dst);
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_clamp
-
-static void ggml_compute_forward_clamp_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    float min;
-    float max;
-    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    for (int j = ith; j < n; j += nth) {
-        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
-        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
-
-        for (int i = 0; i < nc; i++) {
-            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
-        }
-    }
-}
-
-static void ggml_compute_forward_clamp(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_clamp_f32(params, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_rope
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
-    return 1 - MIN(1, MAX(0, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
-}
-
-static void ggml_rope_cache_init(
-     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
-     float * cache, float sin_sign, float theta_scale
-) {
-    float theta = theta_base;
-    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-        rope_yarn(
-            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
-        );
-        cache[i0 + 1] *= sin_sign;
-
-        theta *= theta_scale;
-    }
-}
-
-GGML_CALL void ggml_rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
-    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
-    dims[0] = MAX(0, start);
-    dims[1] = MIN(n_dims - 1, end);
-}
-
-static void ggml_compute_forward_rope_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const bool forward) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool  xpos_down;
-
-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&xpos_base,   (int32_t *) dst->op_params + 11, sizeof(float));
-    memcpy(&xpos_down,   (int32_t *) dst->op_params + 12, sizeof(bool));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    GGML_ASSERT(n_dims <= ne0);
-    GGML_ASSERT(n_dims % 2 == 0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-
-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
-                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-            }
-
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
-                    float block_theta = MAX(p - (n_ctx - 2), 0);
-                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
-                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
-
-                        theta_base *= theta_scale;
-                        block_theta *= theta_scale;
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data +  i3*nb3 + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
-                        const float x2 = src[n_dims];
-                        const float x3 = src[n_dims/2*3];
-
-                        dst_data[0]          = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2]   = x0*sin_theta + x1*cos_theta;
-                        dst_data[n_dims]     = x2*cos_block_theta - x3*sin_block_theta;
-                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
-                    }
-                } else if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[1];
-
-                        dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
-                        dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
-                    }
-                } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
-                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        if (ic < n_dims) {
-                            const int64_t ib = 0;
-
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-
-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-                            sin_theta *= sin_sign;
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float x0 = src[0];
-                            const float x1 = src[n_dims/2];
-
-                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                        } else {
-                            const int64_t i0 = ic;
-
-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            dst_data[0] = src[0];
-                            dst_data[1] = src[1];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const bool forward) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-
-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    GGML_ASSERT(n_dims <= ne0);
-    GGML_ASSERT(n_dims % 2 == 0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-
-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
-                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-            }
-
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
-                    float block_theta = MAX(p - (n_ctx - 2), 0);
-                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
-                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
-
-                        theta_base *= theta_scale;
-                        block_theta *= theta_scale;
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data +  i3*nb3 + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-                        const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
-                        const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
-
-                        dst_data[0]          = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2]   = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                        dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
-                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
-                    }
-                } else if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[1]);
-
-                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                    }
-                } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
-                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        if (ic < n_dims) {
-                            const int64_t ib = 0;
-
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-
-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-                            sin_theta *= sin_sign;
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float x0 = GGML_FP16_TO_FP32(src[0]);
-                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-
-                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                        } else {
-                            const int64_t i0 = ic;
-
-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            dst_data[0] = src[0];
-                            dst_data[1] = src[1];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, dst, true);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, dst, true);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_rope_back
-
-static void ggml_compute_forward_rope_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, dst, false);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, dst, false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_transpose_1d
-
-static void ggml_compute_forward_conv_transpose_1d_f16_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (L x Cin) to (Cin x L)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            ggml_fp16_t * dst_data = wdata;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
-                }
-            }
-        }
-
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
-    // total rows in dst
-    const int nr = ne1;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
-        for (int i10 = 0; i10 < ne10; i10++) {
-            const int i1n = i10*ne11;
-            for (int i00 = 0; i00 < ne00; i00++) {
-                float v = 0;
-                ggml_vec_dot_f16(ne02, &v, 0,
-                        (ggml_fp16_t *)    wdata_src + i1n, 0,
-                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
-                dst_data[i10*s0 + i00] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_transpose_1d_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            float * const wdata = (float *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    float * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            float * const wdata = (float *) params->wdata + nk;
-            float * dst_data = wdata;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = src[i10];
-                }
-            }
-        }
-
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
-    // total rows in dst
-    const int nr = ne1;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * const wdata     = (float *) params->wdata + 0;
-    float * const wdata_src = wdata + nk;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        float * wdata_kernel = wdata + i1*ne02*ne00;
-        for (int i10 = 0; i10 < ne10; i10++) {
-            const int i1n = i10*ne11;
-            for (int i00 = 0; i00 < ne00; i00++) {
-                float v = 0;
-                ggml_vec_dot_f32(ne02, &v, 0,
-                        wdata_src + i1n, 0,
-                        wdata_kernel + i00*ne02, 0, 1);
-                dst_data[i10*s0 + i00] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_transpose_1d(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_transpose_1d_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_im2col_f32(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t N  = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-    const int64_t IH = is_2D ? ne11 : 1;
-    const int64_t IW = ne10;
-
-    const int64_t KH = is_2D ? ne01 : 1;
-    const int64_t KW = ne00;
-
-    const int64_t OH = is_2D ? ne2 : 1;
-    const int64_t OW = ne1;
-
-    int ofs0 = is_2D ? nb13 : nb12;
-    int ofs1 = is_2D ? nb12 : nb11;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        float * const wdata = (float *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic += nth) {
-
-                        // micro kernel
-                        float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
-
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-
-                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
-                                } else {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-
-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_im2col_f16(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t N  = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-    const int64_t IH = is_2D ? ne11 : 1;
-    const int64_t IW = ne10;
-
-    const int64_t KH = is_2D ? ne01 : 1;
-    const int64_t KW = ne00;
-
-    const int64_t OH = is_2D ? ne2 : 1;
-    const int64_t OW = ne1;
-
-    int ofs0 = is_2D ? nb13 : nb12;
-    int ofs1 = is_2D ? nb12 : nb11;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic += nth) {
-
-                        // micro kernel
-                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
-
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-
-                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
-                                } else {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_im2col(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-    switch (dst->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_im2col_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_im2col_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-// ggml_compute_forward_conv_transpose_2d
-
-static void ggml_compute_forward_conv_transpose_2d(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02*ne03;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
-                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
-                        }
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            for (int i12 = 0; i12 < ne12; i12++) {
-                for (int i11 = 0; i11 < ne11; i11++) {
-                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
-                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
-                    for (int i10 = 0; i10 < ne10; i10++) {
-                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
-                    }
-                }
-            }
-        }
-
-        memset(dst->data, 0, ggml_nbytes(dst));
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int32_t stride = ggml_get_op_params_i32(dst, 0);
-
-    // total patches in dst
-    const int np = ne2;
-
-    // patches per thread
-    const int dp = (np + nth - 1)/nth;
-
-    // patch range for this thread
-    const int ip0 = dp*ith;
-    const int ip1 = MIN(ip0 + dp, np);
-
-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
-
-    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
-        float * dst_data = (float *)((char *) dst->data + i2*nb2);
-        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
-        for (int i11 = 0; i11 < ne11; i11++) {
-            for (int i10 = 0; i10 < ne10; i10++) {
-                const int i1n = i11*ne10*ne12 + i10*ne12;
-                for (int i01 = 0; i01 < ne01; i01++) {
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        float v = 0;
-                        ggml_vec_dot_f16(ne03, &v, 0,
-                                wdata_src + i1n, 0,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
-                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
-                    }
-                }
-            }
-        }
-    }
-}
-
-// ggml_compute_forward_pool_1d_sk_p0
-
-static void ggml_compute_forward_pool_1d_sk_p0(
-        const struct ggml_compute_params * params,
-        const enum ggml_op_pool op,
-        const int k,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src = dst->src[0];
-
-    assert(src->type == GGML_TYPE_F32);
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const char * cdata = (const char *)src->data;
-    const char * const data_end = cdata + ggml_nbytes(src);
-    float * drow = (float *)dst->data;
-
-    const int64_t rs = dst->ne[0];
-
-    while (cdata < data_end) {
-        const float * const srow = (const float *)cdata;
-
-        int j = 0;
-
-        for (int64_t i = 0; i < rs; ++i) {
-            switch (op) {
-                case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
-                case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
-                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-            }
-            for (int ki = 0; ki < k; ++ki) {
-                switch (op) {
-                    case GGML_OP_POOL_AVG:                          drow[i] += srow[j]; break;
-                    case GGML_OP_POOL_MAX:   if (srow[j] > drow[i]) drow[i]  = srow[j]; break;
-                    case GGML_OP_POOL_COUNT:                        GGML_ASSERT(false); break;
-                }
-                ++j;
-            }
-            switch (op) {
-                case GGML_OP_POOL_AVG:         drow[i] /= k; break;
-                case GGML_OP_POOL_MAX:                       break;
-                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-            }
-        }
-
-        cdata += src->nb[1];
-        drow  += rs;
-    }
-}
-
-// ggml_compute_forward_pool_1d
-
-static void ggml_compute_forward_pool_1d(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int s0 = opts[2];
-    const int p0 = opts[3];
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(k0 == s0); // only s = k supported
-
-    ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
-}
-
-// ggml_compute_forward_pool_2d
-
-static void ggml_compute_forward_pool_2d(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src = dst->src[0];
-
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-    const char * cdata = (const char*)src->data;
-    const char * const data_end = cdata + ggml_nbytes(src);
-
-    const int64_t px = dst->ne[0];
-    const int64_t py = dst->ne[1];
-    const int64_t pa = px * py;
-
-    float * dplane = (float *)dst->data;
-
-    const int ka = k0 * k1;
-    const int offset0 = -p0;
-    const int offset1 = -p1;
-
-    while (cdata < data_end) {
-        for (int oy = 0; oy < py; ++oy) {
-            float * const drow = dplane + oy * px;
-            for (int ox = 0; ox < px; ++ox) {
-                float * const out =  drow + ox;
-                switch (op) {
-                    case GGML_OP_POOL_AVG:     *out = 0;        break;
-                    case GGML_OP_POOL_MAX:     *out = -FLT_MAX; break;
-                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-                }
-
-                const int ix = offset0 + ox * s0;
-                const int iy = offset1 + oy * s1;
-
-                for (int ky = 0; ky < k1; ++ky) {
-                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
-                    const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
-                    for (int kx = 0; kx < k0; ++kx) {
-                        int j = ix + kx;
-                        if (j < 0 || j >= src->ne[0]) continue;
-                        switch (op) {
-                            case GGML_OP_POOL_AVG:                     *out += srow[j]; break;
-                            case GGML_OP_POOL_MAX: if (srow[j] > *out) *out  = srow[j]; break;
-                            case GGML_OP_POOL_COUNT:                GGML_ASSERT(false); break;
-                        }
-                    }
-                }
-                switch (op) {
-                    case GGML_OP_POOL_AVG:           *out /= ka; break;
-                    case GGML_OP_POOL_MAX:                       break;
-                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-                }
-            }
-        }
-
-        cdata  += src->nb[2];
-        dplane += pa;
-    }
-}
-
-// ggml_compute_forward_upscale
-
-static void ggml_compute_forward_upscale_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int scale_factor = dst->op_params[0];
-
-    // TODO: optimize
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
-        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
-                for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
-
-                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
-
-                    *y = *x;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_upscale(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_upscale_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_pad
-
-static void ggml_compute_forward_pad_f32(
-    const struct ggml_compute_params * params,
-          struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float * dst_ptr = (float *) dst->data;
-
-    // TODO: optimize
-
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-                        dst_ptr[dst_idx] = *src_ptr;
-                    } else {
-                        dst_ptr[dst_idx] = 0;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_pad(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_pad_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_argsort
-
-static void ggml_compute_forward_argsort_f32(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
-
-    for (int64_t i = ith; i < nr; i += nth) {
-        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
-        const float * src_data = (float *)((char *) src0->data + i*nb01);
-
-        for (int64_t j = 0; j < ne0; j++) {
-            dst_data[j] = j;
-        }
-
-        // C doesn't have a functional sort, so we do a bubble sort instead
-        for (int64_t j = 0; j < ne0; j++) {
-            for (int64_t k = j + 1; k < ne0; k++) {
-                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                    int32_t tmp = dst_data[j];
-                    dst_data[j] = dst_data[k];
-                    dst_data[k] = tmp;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_argsort(
-    const struct ggml_compute_params * params,
-    struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argsort_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_attn
-
-static void ggml_compute_forward_flash_attn_f32(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-        for (int64_t ic = 0; ic < masked_begin; ++ic) {
-            // k indices
-            const int ik3 = iq3;
-            const int ik2 = iq2 % nek2;
-            const int ik1 = ic;
-
-            // S indices
-            const int i1 = ik1;
-
-            ggml_vec_dot_f32(neq0,
-                    S + i1, 0,
-                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-        }
-
-        // scale
-        ggml_vec_scale_f32(masked_begin, S, scale);
-
-        for (int64_t i = masked_begin; i < M; i++) {
-            S[i] = -INFINITY;
-        }
-
-        // softmax
-        // exclude known -INF S[..] values from max and loop
-        // dont forget to set their SW values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(masked_begin, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SS[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(masked_begin, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < masked_begin; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        for (int64_t ic = 0; ic < nev1; ++ic) {
-            // dst indices
-            const int i1 = iq1;
-            const int i2 = iq2;
-            const int i3 = iq3;
-
-            // v indices
-            const int iv2 = iq2 % nev2;
-            const int iv3 = iq3;
-
-            ggml_vec_dot_f32(masked_begin,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)), 0,
-                    (float *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                    S, 0, 1);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_f16(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int64_t ic = 0; ic < nek1; ++ic) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16(neq0,
-                        S + i1, 0,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16_unroll(neq0, nbk1,
-                        S + i1,
-                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        }
-
-        // scale
-        ggml_vec_scale_f32(nek1, S, scale);
-
-        if (masked) {
-            for (int64_t i = P; i < M; i++) {
-                if (i > P + iq1) {
-                    S[i] = -INFINITY;
-                }
-            }
-        }
-
-        // softmax
-        // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
-        // dont forget to set their S values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(M, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(M, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < M; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16).
-        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int64_t ic = 0; ic < nev1; ++ic) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16(nev0,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                        S16, 0, 1);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16_unroll(nev0, nbv1,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
-                        ((char *)             v->data + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-
-    switch (q->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_attn_f16(params, masked, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_f32(params, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_ff
-
-static void ggml_compute_forward_flash_ff_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];  // F16
-    const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
-    const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
-    const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
-    const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, nea,  a,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nba,  a,   nb)
-    GGML_TENSOR_LOCALS(int64_t, neb0, b0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb0, b0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, neb1, b1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb1, b1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec0, c0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc0, c0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec1, c1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc1, c1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,   dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,   dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = nea0;
-    //const int64_t N = nea1;
-    const int64_t M = neb01;
-
-    GGML_ASSERT(ne0 == nea0);
-    GGML_ASSERT(ne1 == nea1);
-    GGML_ASSERT(ne2 == nea2);
-
-    GGML_ASSERT(nba0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb10 == sizeof(float));
-    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbc10 == sizeof(float));
-
-    GGML_ASSERT(neb00 == D);
-    GGML_ASSERT(neb01 == M);
-    GGML_ASSERT(neb10 == M);
-    GGML_ASSERT(neb11 == 1);
-
-    GGML_ASSERT(nec00 == M);
-    GGML_ASSERT(nec01 == D);
-    GGML_ASSERT(nec10 == D);
-    GGML_ASSERT(nec11 == 1);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // parallelize by a rows using ggml_vec_dot_f32
-
-    // total rows in a
-    const int nr = nea1*nea2*nea3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // a indices
-        const int ia3 = ir/(nea2*nea1);
-        const int ia2 = (ir - ia3*nea2*nea1)/nea1;
-        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
-
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
-
-        for (int64_t ic = 0; ic < neb01; ++ic) {
-            // b0 indices
-            const int ib03 = ia3;
-            const int ib02 = ia2;
-            const int ib01 = ic;
-
-            // S indices
-            const int i1 = ib01;
-
-            ggml_vec_dot_f16(nea0,
-                    S + i1, 0,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
-                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)), 0, 1);
-        }
-
-        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
-        //ggml_vec_gelu_f32(neb01, S, S);
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        ggml_vec_gelu_f16(neb01, S16, S16);
-
-        {
-            // dst indices
-            const int i1 = ia1;
-            const int i2 = ia2;
-            const int i3 = ia3;
-
-            for (int64_t ic = 0; ic < nec01; ++ic) {
-
-                ggml_vec_dot_f16(neb01,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
-                        S16, 0, 1);
-            }
-
-            ggml_vec_add_f32(nec01,
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) c1->data);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_ff(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * b0 = dst->src[1];
-
-    switch (b0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_ff_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false); // TODO
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_attn_back
-
-static void ggml_compute_forward_flash_attn_back_f32(
-        const struct ggml_compute_params * params,
-        const bool masked,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-    const struct ggml_tensor * k = dst->src[1];
-    const struct ggml_tensor * v = dst->src[2];
-    const struct ggml_tensor * d = dst->src[3];
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ned, d,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbd, d,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup  = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-    const int mxDM = MAX(D, Mup);
-
-    // GGML_ASSERT(ne0 == D);
-    // GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-    GGML_ASSERT(ned0 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-    GGML_ASSERT(ned1 == N);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith == 0) {
-            memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
-        }
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int64_t elem_q = ggml_nelements(q);
-    const int64_t elem_k = ggml_nelements(k);
-
-    enum ggml_type result_type = dst->type;
-    GGML_ASSERT(ggml_blck_size(result_type) == 1);
-    const size_t tsize = ggml_type_size(result_type);
-
-    const size_t offs_q = 0;
-    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-
-    void * grad_q = (char *) dst->data;
-    void * grad_k = (char *) dst->data + offs_k;
-    void * grad_v = (char *) dst->data + offs_v;
-
-    const size_t nbgq1 = nb0*neq0;
-    const size_t nbgq2 = nb0*neq0*neq1;
-    const size_t nbgq3 = nb0*neq0*neq1*neq2;
-
-    const size_t nbgk1 = nb0*nek0;
-    const size_t nbgk2 = nb0*nek0*nek1;
-    const size_t nbgk3 = nb0*nek0*nek1*neq2;
-
-    const size_t nbgv1 = nb0*nev0;
-    const size_t nbgv2 = nb0*nev0*nev1;
-    const size_t nbgv3 = nb0*nev0*nev1*neq2;
-
-    // parallelize by k rows using ggml_vec_dot_f32
-
-    // total rows in k
-    const int nr = nek2*nek3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    // how often k2 (and v2) is repeated in q2
-    int nrep = neq2/nek2;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int ik3 = ir/(nek2);
-        const int ik2 = ir - ik3*nek2;
-
-        const int iq3 = ik3;
-        const int id3 = ik3;
-        const int iv3 = ik3;
-        const int iv2 = ik2;
-
-        for (int irep = 0; irep < nrep; ++irep) {
-            const int iq2 = ik2 + irep*nek2;
-            const int id2 = iq2;
-
-            // (ik2 + irep*nek2) % nek2 == ik2
-            for (int iq1 = 0; iq1 < neq1; ++iq1) {
-                const int id1 = iq1;
-
-                // not sure about CACHE_LINE_SIZE_F32..
-                // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset?
-                float * S  = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32);
-                float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32);
-
-                for (int i = M; i < Mup; ++i) {
-                    S[i] = -INFINITY;
-                }
-
-                const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    // k indices
-                    const int ik1 = ic;
-
-                    // S indices
-                    const int i1 = ik1;
-
-                    ggml_vec_dot_f32(neq0,
-                            S + i1, 0,
-                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
-                }
-
-                // scale
-                ggml_vec_scale_f32(masked_begin, S, scale);
-
-                for (int64_t i = masked_begin; i < M; i++) {
-                    S[i] = -INFINITY;
-                }
-
-                // softmax
-                // exclude known -INF S[..] values from max and loop
-                // dont forget to set their SM values to zero
-                {
-                    float max = -INFINITY;
-                    ggml_vec_max_f32(masked_begin, &max, S);
-
-                    ggml_float sum = 0.0;
-                    {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                        max = -max;
-                        vDSP_vsadd(SM, 1, &max, SM, 1, Mup);
-                        vvexpf(SM, SM, &Mup);
-                        ggml_vec_sum_f32(Mup, &sum, SM);
-#else
-                        uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                        ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                        for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                            if (i >= masked_begin) {
-                                break;
-                            }
-                            float * SR =  S + i;
-                            float * SW = SM + i;
-
-                            for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                                if (i + j >= masked_begin) {
-                                    break;
-                                } else if (SR[j] == -INFINITY) {
-                                    SW[j] = 0.0f;
-                                } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                                    const float val = expf(SR[j] - max);
-#else
-                                    ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                                    memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                                    sump[j] += (ggml_float)val;
-                                    SW[j] = val;
-                                }
-                            }
-                        }
-
-                        for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                            sum += sump[i];
-                        }
-#endif
-                    }
-
-                    assert(sum > 0.0);
-
-                    sum = 1.0/sum;
-                    ggml_vec_scale_f32(masked_begin, SM, sum);
-
-                }
-
-                // step-by-step explanation
-                {
-                    // forward-process                    shape      grads from backward process
-                    // parallel_for ik2,ik3:
-                    //  for irep:
-                    //   iq2 = ik2 + irep*nek2
-                    //   k[:D,:M,:,:]                     [D,M,:,:]  grad[k][:D,:M,ik2,ik3]  += grad[kcur]
-                    //   q[:D,:N,:,:]                     [D,N,:,:]  grad[q][:D,iq1,iq2,iq3] += grad[qcur]
-                    //   v[:M,:D,:,:]                     [M,D,:,:]  grad[v][:M,:D,iv2,iv3]  += grad[vcur]
-                    //   for iq1:
-                    //    kcur   = k[:D,:M,ik2,ik3]       [D,M,1,1]  grad[kcur] = grad[S1].T @ qcur
-                    //    qcur   = q[:D,iq1,iq2,iq3]      [D,1,1,1]  grad[qcur] = grad[S1]   @ kcur
-                    //    vcur   = v[:M,:D,iv2,iv3]       [M,D,1,1]  grad[vcur] = grad[S5].T @ S4
-                    //    S0     = -Inf                   [D,1,1,1]
-                    //   ~S1[i]  = dot(kcur[:D,i], qcur)
-                    //    S1     = qcur @ kcur.T          [M,1,1,1]  grad[S1]   = grad[S2] * scale
-                    //    S2     = S1 * scale             [M,1,1,1]  grad[S2]   = diag_mask_zero(grad[S3], P)
-                    //    S3     = diag_mask_inf(S2, P)   [M,1,1,1]  grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    //    S4     = softmax(S3)            [M,1,1,1]  grad[S4]   = grad[S5] @ vcur
-                    //   ~S5[i]  = dot(vcur[:,i], S4)
-                    //    S5     = S4 @ vcur.T            [D,1,1,1]  grad[S5]   = d[:D,id1,id2,id3]
-                    //   ~dst[i,iq1,iq2,iq3]  = S5[i]              ^
-                    //    dst[:D,iq1,iq2,iq3] = S5                 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3]
-                    // dst                               backward-/ grad[dst]                 = d
-                    //
-                    // output gradients with their dependencies:
-                    //
-                    // grad[kcur] = grad[S1].T @ qcur
-                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
-                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    // grad[S4]   = grad[S5] @ vcur
-                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
-                    // grad[qcur] = grad[S1]   @ kcur
-                    // grad[vcur] = grad[S5].T @ S4
-                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
-                    //
-                    // in post-order:
-                    //
-                    // S1         = qcur @ kcur.T
-                    // S2         = S1 * scale
-                    // S3         = diag_mask_inf(S2, P)
-                    // S4         = softmax(S3)
-                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
-                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
-                    // grad[qcur] = grad[S1]   @ kcur
-                    // grad[kcur] = grad[S1].T @ qcur
-                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
-                    //
-                    // using less variables (SM=S4):
-                    //
-                    // S             = diag_mask_inf(qcur @ kcur.T * scale, P)
-                    // SM            = softmax(S)
-                    // S             = d[:D,iq1,iq2,iq3] @ vcur
-                    // dot_SM_gradSM = dot(SM, S)
-                    // S             = SM * (S - dot(SM, S))
-                    // S             = diag_mask_zero(S, P) * scale
-                    //
-                    // grad[q][:D,iq1,iq2,iq3] += S   @ kcur
-                    // grad[k][:D,:M,ik2,ik3]  += S.T @ qcur
-                    // grad[v][:M,:D,iv2,iv3]  += d[:D,id1,id2,id3].T @ SM
-                }
-
-                // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
-                // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
-                // for ic:
-                //   S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3]
-                // exclude known future zero S[..] values from operation
-                ggml_vec_set_f32(masked_begin, S, 0);
-                for (int64_t ic = 0; ic < D; ++ic) {
-                    ggml_vec_mad_f32(masked_begin,
-                            S,
-                             (float *) ((char *) v->data + (          ic*nbv1  + iv2*nbv2 + iv3*nbv3)),
-                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
-                }
-
-                // S = SM * (S - dot(SM, S))
-                float dot_SM_gradSM = 0;
-                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
-                ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
-                ggml_vec_mul_f32 (masked_begin, S, S, SM);
-
-                // S = diag_mask_zero(S, P) * scale
-                // already done by above ggml_vec_set_f32
-
-                // exclude known zero S[..] values from operation
-                ggml_vec_scale_f32(masked_begin, S, scale);
-
-                // S    shape [M,1]
-                // SM   shape [M,1]
-                // kcur shape [D,M]
-                // qcur shape [D,1]
-                // vcur shape [M,D]
-
-                // grad[q][:D,iq1,iq2,iq3] += S @ kcur
-                // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M]
-                // for ic:
-                //  grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3]
-                // exclude known zero S[..] values from loop
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    ggml_vec_mad_f32(D,
-                            (float *) ((char *) grad_q  + (iq1*nbgq1 + iq2*nbgq2  + iq3*nbgq3)),
-                            (float *) ((char *) k->data + (ic*nbk1   + ik2*nbk2   + ik3*nbk3)),
-                            S[ic]);
-                }
-
-                // grad[k][:D,:M,iq2,iq3] += S.T @ qcur
-                // for ic:
-                //  grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0]
-                //  grad[k][:D,ic,iq2,iq3] += S[ic]     * qcur[:D,0]
-                // exclude known zero S[..] values from loop
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    ggml_vec_mad_f32(D,
-                            (float *) ((char *) grad_k  + (ic*nbgk1  + ik2*nbgk2  + ik3*nbgk3)),
-                            (float *) ((char *) q->data + (iq1*nbq1  + iq2*nbq2   + iq3*nbq3)),
-                            S[ic]);
-                }
-
-                // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T       @ SM
-                // for ic:
-                //  grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M]
-                //  grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3]         * SM[:M]
-                // exclude known zero SM[..] values from mad
-                for (int64_t ic = 0; ic < D; ++ic) {
-                    ggml_vec_mad_f32(masked_begin,
-                            (float *) ((char *) grad_v   + (          ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)),
-                            SM,
-                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2  + id3*nbd3)));
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_back(
-        const struct ggml_compute_params * params,
-        const bool masked,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * q = dst->src[0];
-
-    switch (q->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_win_part
-
-static void ggml_compute_forward_win_part_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
-
-    const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t w    = ((const int32_t *)(dst->op_params))[2];
-
-    assert(ne00 == ne0);
-    assert(ne3  == nep0*nep1);
-
-    // TODO: optimize / multi-thread
-    for (int py = 0; py < nep1; ++py) {
-        for (int px = 0; px < nep0; ++px) {
-            const int64_t i3 = py*nep0 + px;
-            for (int64_t i2 = 0; i2 < ne2; ++i2) {
-                for (int64_t i1 = 0; i1 < ne1; ++i1) {
-                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                        const int64_t i02 = py*w + i2;
-                        const int64_t i01 = px*w + i1;
-                        const int64_t i00 = i0;
-
-                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0    + i1*ne0   + i0;
-                        const int64_t j =                  i02*ne01*ne00 + i01*ne00 + i00;
-
-                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
-                            ((float *) dst->data)[i] = 0.0f;
-                        } else {
-                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_win_part(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_win_part_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_win_unpart
-
-static void ggml_compute_forward_win_unpart_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
-
-    const int32_t w = ((const int32_t *)(dst->op_params))[0];
-
-    // padding
-    const int px = (w - ne1%w)%w;
-    //const int py = (w - ne2%w)%w;
-
-    const int npx = (px + ne1)/w;
-    //const int npy = (py + ne2)/w;
-
-    assert(ne0 == ne00);
-
-    // TODO: optimize / multi-thread
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int ip2 = i2/w;
-                const int ip1 = i1/w;
-
-                const int64_t i02 = i2%w;
-                const int64_t i01 = i1%w;
-                const int64_t i00 = i0;
-
-                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
-                const int64_t j =                                  i2*ne1*ne0    + i1*ne0   + i0;
-
-                ((float *) dst->data)[j] = ((float *) src0->data)[i];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_win_unpart(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_win_unpart_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-//gmml_compute_forward_unary
-
-static void ggml_compute_forward_unary(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const enum ggml_unary_op op = ggml_get_unary_op(dst);
-
-    switch (op) {
-        case GGML_UNARY_OP_ABS:
-            {
-                ggml_compute_forward_abs(params, dst);
-            } break;
-        case GGML_UNARY_OP_SGN:
-            {
-                ggml_compute_forward_sgn(params, dst);
-            } break;
-        case GGML_UNARY_OP_NEG:
-            {
-                ggml_compute_forward_neg(params, dst);
-            } break;
-        case GGML_UNARY_OP_STEP:
-            {
-                ggml_compute_forward_step(params, dst);
-            } break;
-        case GGML_UNARY_OP_TANH:
-            {
-                ggml_compute_forward_tanh(params, dst);
-            } break;
-        case GGML_UNARY_OP_ELU:
-            {
-                ggml_compute_forward_elu(params, dst);
-            } break;
-        case GGML_UNARY_OP_RELU:
-            {
-                ggml_compute_forward_relu(params, dst);
-            } break;
-        case GGML_UNARY_OP_GELU:
-            {
-                ggml_compute_forward_gelu(params, dst);
-            } break;
-        case GGML_UNARY_OP_GELU_QUICK:
-            {
-                ggml_compute_forward_gelu_quick(params, dst);
-            } break;
-        case GGML_UNARY_OP_SILU:
-            {
-                ggml_compute_forward_silu(params, dst);
-            } break;
-        case GGML_UNARY_OP_HARDSWISH:
-            {
-                ggml_compute_forward_hardswish(params, dst);
-            } break;
-        case GGML_UNARY_OP_HARDSIGMOID:
-            {
-                ggml_compute_forward_hardsigmoid(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_get_rel_pos
-
-static void ggml_compute_forward_get_rel_pos_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int64_t w = ne1;
-
-    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
-    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
-
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            const int64_t pos = (w - i1 - 1) + i2;
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rel_pos(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rel_pos_f16(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add_rel_pos
-
-static void ggml_compute_forward_add_rel_pos_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
-
-    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
-    if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
-        if (params->ith != 0) {
-            return;
-        }
-        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
-        return;
-    }
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
-
-    float * src1_data = (float *) src1->data;
-    float * src2_data = (float *) src2->data;
-    float * dst_data  = (float *) dst->data;
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // total patches in dst
-    const int np = ne13;
-
-    // patches per thread
-    const int dp = (np + nth - 1)/nth;
-
-    // patch range for this thread
-    const int ip0 = dp*ith;
-    const int ip1 = MIN(ip0 + dp, np);
-
-    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
-        for (int64_t i12 = 0; i12 < ne12; ++i12) {
-            for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
-                for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                    const int64_t jp0  = jp1 + i10;
-                    const float src1_e = src1_data[jp0];
-                    const float src2_e = src2_data[jp0];
-
-                    const int64_t jdh = jp0 * ne10;
-                    const int64_t jdw = jdh - (ne10 - 1) * i10;
-
-                    for (int64_t j = 0; j < ne10; ++j) {
-                        dst_data[jdh + j     ] += src2_e;
-                        dst_data[jdw + j*ne10] += src1_e;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_rel_pos(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add_rel_pos_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_unary
-
-static void ggml_compute_forward_map_unary_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_map_unary(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_unary_f32(params, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_binary
-
-static void ggml_compute_forward_map_binary_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_map_binary(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_binary_f32(params, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_custom1
-
-static void ggml_compute_forward_map_custom1_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-
-    const struct ggml_tensor * a = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    fun(dst, a);
-}
-
-// ggml_compute_forward_map_custom2
-
-static void ggml_compute_forward_map_custom2_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    fun(dst, a, b);
-}
-
-// ggml_compute_forward_map_custom3
-
-static void ggml_compute_forward_map_custom3_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-    const struct ggml_tensor * c = dst->src[1];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    fun(dst, a, b, c);
-}
-
-// ggml_compute_forward_map_custom1
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    struct ggml_map_custom1_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, a, params->ith, params->nth, p.userdata);
-}
-
-// ggml_compute_forward_map_custom2
-
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    struct ggml_map_custom2_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, a, b, params->ith, params->nth, p.userdata);
-}
-
-// ggml_compute_forward_map_custom3
-
-static void ggml_compute_forward_map_custom3(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * a = dst->src[0];
-    const struct ggml_tensor * b = dst->src[1];
-    const struct ggml_tensor * c = dst->src[2];
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    struct ggml_map_custom3_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
-}
-
-// ggml_compute_forward_cross_entropy_loss
-
-static void ggml_compute_forward_cross_entropy_loss_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_scalar(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, src1));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    float * sums = (float *) params->wdata;
-
-    // TODO: handle transposed/permuted matrices
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
-
-    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith == 0) {
-            memset(sums, 0, sizeof(float) * (nth + nth * nc));
-        }
-        return;
-    }
-
-    if (params->type == GGML_TASK_TYPE_FINALIZE) {
-        if (ith == 0) {
-            float * dp = (float *) dst->data;
-            ggml_vec_sum_f32(nth, dp, sums);
-            dp[0] *= -1.0f / (float) nr;
-        }
-        return;
-    }
-
-    const double eps = 1e-9;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * st = ((float *) params->wdata) + nth + ith*nc;
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(s0[i]));
-            assert(!isnan(s1[i]));
-        }
-#endif
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    st[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    st[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            // sum = 1.0/sum;
-        }
-        // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = (1.0 - eps) / sum;
-        ggml_vec_scale_f32(nc, st, sum);
-        ggml_vec_add1_f32(nc, st, st, eps);
-        ggml_vec_log_f32(nc, st, st);
-        ggml_vec_mul_f32(nc, st, st, s1);
-
-        float st_sum = 0;
-        ggml_vec_sum_f32(nc, &st_sum, st);
-        sums[ith] += st_sum;
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(st[i]));
-            assert(!isinf(st[i]));
-        }
-#endif
-    }
-
-}
-
-static void ggml_compute_forward_cross_entropy_loss(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cross_entropy_loss_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_cross_entropy_loss_back
-
-static void ggml_compute_forward_cross_entropy_loss_back_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * opt0 = dst->src[2];
-
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(opt0));
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int64_t ith = params->ith;
-    const int64_t nth = params->nth;
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    const double eps = 1e-9;
-
-    // TODO: handle transposed/permuted matrices
-    const int64_t nc = src0->ne[0];
-    const int64_t nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    float * d   = (float *) opt0->data;
-
-    for (int64_t i1 = ir0; i1 < ir1; i1++) {
-        float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
-        float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(s0[i]));
-            assert(!isnan(s1[i]));
-        }
-#endif
-
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    ds0[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    ds0[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            sum = (1.0 - eps)/sum;
-        }
-
-        // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
-        ggml_vec_scale_f32(nc, ds0, sum);
-        ggml_vec_add1_f32(nc, ds0, ds0, eps);
-        ggml_vec_sub_f32(nc, ds0, ds0, s1);
-        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(ds0[i]));
-            assert(!isinf(ds0[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_cross_entropy_loss_back(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-/////////////////////////////////
-
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    GGML_ASSERT(params);
-
-    if (tensor->op == GGML_OP_NONE) {
-        return;
-    }
-
-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
-
-#ifdef GGML_USE_SYCL
-    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-#endif // GGML_USE_SYCL
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                ggml_compute_forward_dup(params, tensor);
-            } break;
-        case GGML_OP_ADD:
-            {
-                ggml_compute_forward_add(params, tensor);
-            } break;
-        case GGML_OP_ADD1:
-            {
-                ggml_compute_forward_add1(params, tensor);
-            } break;
-        case GGML_OP_ACC:
-            {
-                ggml_compute_forward_acc(params, tensor);
-            } break;
-        case GGML_OP_SUB:
-            {
-                ggml_compute_forward_sub(params, tensor);
-            } break;
-        case GGML_OP_MUL:
-            {
-                ggml_compute_forward_mul(params, tensor);
-            } break;
-        case GGML_OP_DIV:
-            {
-                ggml_compute_forward_div(params, tensor);
-            } break;
-        case GGML_OP_SQR:
-            {
-                ggml_compute_forward_sqr(params, tensor);
-            } break;
-        case GGML_OP_SQRT:
-            {
-                ggml_compute_forward_sqrt(params, tensor);
-            } break;
-        case GGML_OP_LOG:
-            {
-                ggml_compute_forward_log(params, tensor);
-            } break;
-        case GGML_OP_SUM:
-            {
-                ggml_compute_forward_sum(params, tensor);
-            } break;
-        case GGML_OP_SUM_ROWS:
-            {
-                ggml_compute_forward_sum_rows(params, tensor);
-            } break;
-        case GGML_OP_MEAN:
-            {
-                ggml_compute_forward_mean(params, tensor);
-            } break;
-        case GGML_OP_ARGMAX:
-            {
-                ggml_compute_forward_argmax(params, tensor);
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                ggml_compute_forward_repeat(params, tensor);
-            } break;
-        case GGML_OP_REPEAT_BACK:
-            {
-                ggml_compute_forward_repeat_back(params, tensor);
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                ggml_compute_forward_concat(params, tensor);
-            } break;
-        case GGML_OP_SILU_BACK:
-            {
-                ggml_compute_forward_silu_back(params, tensor);
-            } break;
-        case GGML_OP_NORM:
-            {
-                ggml_compute_forward_norm(params, tensor);
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                ggml_compute_forward_rms_norm(params, tensor);
-            } break;
-        case GGML_OP_RMS_NORM_BACK:
-            {
-                ggml_compute_forward_rms_norm_back(params, tensor);
-            } break;
-        case GGML_OP_GROUP_NORM:
-            {
-                ggml_compute_forward_group_norm(params, tensor);
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                ggml_compute_forward_mul_mat(params, tensor);
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                ggml_compute_forward_mul_mat_id(params, tensor);
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                ggml_compute_forward_out_prod(params, tensor);
-            } break;
-        case GGML_OP_SCALE:
-            {
-                ggml_compute_forward_scale(params, tensor);
-            } break;
-        case GGML_OP_SET:
-            {
-                ggml_compute_forward_set(params, tensor);
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_compute_forward_cpy(params, tensor);
-            } break;
-        case GGML_OP_CONT:
-            {
-                ggml_compute_forward_cont(params, tensor);
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                ggml_compute_forward_reshape(params, tensor);
-            } break;
-        case GGML_OP_VIEW:
-            {
-                ggml_compute_forward_view(params, tensor);
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                ggml_compute_forward_permute(params, tensor);
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                ggml_compute_forward_transpose(params, tensor);
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                ggml_compute_forward_get_rows(params, tensor);
-            } break;
-        case GGML_OP_GET_ROWS_BACK:
-            {
-                ggml_compute_forward_get_rows_back(params, tensor);
-            } break;
-        case GGML_OP_DIAG:
-            {
-                ggml_compute_forward_diag(params, tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                ggml_compute_forward_diag_mask_inf(params, tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-            {
-                ggml_compute_forward_diag_mask_zero(params, tensor);
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                ggml_compute_forward_soft_max(params, tensor);
-            } break;
-        case GGML_OP_SOFT_MAX_BACK:
-            {
-                ggml_compute_forward_soft_max_back(params, tensor);
-            } break;
-        case GGML_OP_ROPE:
-            {
-                ggml_compute_forward_rope(params, tensor);
-            } break;
-        case GGML_OP_ROPE_BACK:
-            {
-                ggml_compute_forward_rope_back(params, tensor);
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                ggml_compute_forward_clamp(params, tensor);
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                ggml_compute_forward_conv_transpose_1d(params, tensor);
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                ggml_compute_forward_im2col(params, tensor);
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                ggml_compute_forward_conv_transpose_2d(params, tensor);
-            } break;
-        case GGML_OP_POOL_1D:
-            {
-                ggml_compute_forward_pool_1d(params, tensor);
-            } break;
-        case GGML_OP_POOL_2D:
-            {
-                ggml_compute_forward_pool_2d(params, tensor);
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                ggml_compute_forward_upscale(params, tensor);
-            } break;
-        case GGML_OP_PAD:
-            {
-                ggml_compute_forward_pad(params, tensor);
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                ggml_compute_forward_argsort(params, tensor);
-            } break;
-        case GGML_OP_LEAKY_RELU:
-            {
-                ggml_compute_forward_leaky_relu(params, tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                const int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                const bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, masked, tensor);
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                ggml_compute_forward_flash_ff(params, tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
-                ggml_compute_forward_flash_attn_back(params, masked, tensor);
-            } break;
-        case GGML_OP_WIN_PART:
-            {
-                ggml_compute_forward_win_part(params, tensor);
-            } break;
-        case GGML_OP_WIN_UNPART:
-            {
-                ggml_compute_forward_win_unpart(params, tensor);
-            } break;
-        case GGML_OP_UNARY:
-            {
-                ggml_compute_forward_unary(params, tensor);
-            } break;
-        case GGML_OP_GET_REL_POS:
-            {
-                ggml_compute_forward_get_rel_pos(params, tensor);
-            } break;
-        case GGML_OP_ADD_REL_POS:
-            {
-                ggml_compute_forward_add_rel_pos(params, tensor);
-            } break;
-        case GGML_OP_MAP_UNARY:
-            {
-                ggml_unary_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_unary(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_BINARY:
-            {
-                ggml_binary_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_binary(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM1_F32:
-            {
-                ggml_custom1_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1_f32(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM2_F32:
-            {
-                ggml_custom2_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2_f32(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM3_F32:
-            {
-                ggml_custom3_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3_f32(params, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM1:
-            {
-                ggml_compute_forward_map_custom1(params, tensor);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM2:
-            {
-                ggml_compute_forward_map_custom2(params, tensor);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                ggml_compute_forward_map_custom3(params, tensor);
-            }
-            break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                ggml_compute_forward_cross_entropy_loss(params, tensor);
-            }
-            break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                ggml_compute_forward_cross_entropy_loss_back(params, tensor);
-            }
-            break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static size_t ggml_hash_size(size_t min_sz) {
-    // next primes after powers of two
-    static const size_t primes[] = {
-        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
-        2053, 4099, 8209, 16411, 32771, 65537, 131101,
-        262147, 524309, 1048583, 2097169, 4194319, 8388617,
-        16777259, 33554467, 67108879, 134217757, 268435459,
-        536870923, 1073741827, 2147483659
-    };
-    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
-
-    // find the smallest prime that is larger or equal to min_sz
-    size_t l = 0;
-    size_t r = n_primes;
-    while (l < r) {
-        size_t m = (l + r)/2;
-        if (primes[m] < min_sz) {
-            l = m + 1;
-        } else {
-            r = m;
-        }
-    }
-    size_t sz = l < n_primes ? primes[l] : min_sz | 1;
-    return sz;
-}
-
-static size_t ggml_hash(const void * p) {
-    return (size_t)p;
-}
-
-size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t h = ggml_hash(key) % hash_set.size;
-
-    // linear probing
-    size_t i = h;
-    while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
-        i = (i + 1) % hash_set.size;
-        if (i == h) {
-            // visited all hash table entries -> not found
-            return GGML_HASHTABLE_FULL;
-        }
-    }
-    return i;
-}
-
-bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-    return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
-}
-
-size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
-    if (hash_set.keys[i] == key) {
-        return GGML_HASHTABLE_ALREADY_EXISTS;
-    }
-
-    // insert
-    GGML_ASSERT(hash_set.keys[i] == NULL);
-    hash_set.keys[i] = key;
-    return i;
-}
-
-size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
-    hash_set.keys[i] = key;
-    return i;
-}
-
-struct ggml_hash_set ggml_hash_set_new(size_t size) {
-    size = ggml_hash_size(size);
-    struct ggml_hash_set result;
-    result.size = size;
-    result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
-    memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
-    return result;
-}
-
-static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
-    GGML_FREE(hash_set.keys);
-}
-
-struct hash_map {
-    struct ggml_hash_set set;
-    struct ggml_tensor ** vals;
-};
-
-static struct hash_map * ggml_new_hash_map(size_t size) {
-    struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
-    result->set = ggml_hash_set_new(size);
-    result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
-    memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
-    return result;
-}
-
-static void ggml_hash_map_free(struct hash_map * map) {
-    ggml_hash_set_free(map->set);
-    GGML_FREE(map->vals);
-    GGML_FREE(map);
-}
-
-// gradient checkpointing
-
-static struct ggml_tensor * ggml_recompute_graph_node(
-        struct ggml_context * ctx,
-        struct ggml_cgraph  * graph,
-        struct hash_map     * replacements,
-        struct ggml_tensor  * node) {
-
-    if (node == NULL) {
-        return NULL;
-    }
-
-    if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-        return node;
-    }
-
-    if (!ggml_hash_contains(graph->visited_hash_table, node)) {
-        return node;
-    }
-
-    int count_children = 0;
-    for (int k = 0; k < GGML_MAX_SRC; ++k) {
-        if (node->src[k]) {
-            ++count_children;
-        }
-    }
-
-    if (count_children == 0) {
-        return node;
-    }
-
-    size_t i = ggml_hash_find(replacements->set, node);
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
-    if (replacements->set.keys[i] == node) {
-        return replacements->vals[i];
-    }
-
-    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
-
-    // insert clone into replacements
-    GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
-    replacements->set.keys[i] = node;
-    replacements->vals[i] = clone;
-
-    clone->op       = node->op;
-    clone->grad     = node->grad;
-    clone->flags    = node->flags;
-    clone->extra    = node->extra;
-    for (int k = 0; k < GGML_MAX_DIMS; ++k) {
-        clone->nb[k] = node->nb[k];
-    }
-    for (int k = 0; k < GGML_MAX_SRC; ++k) {
-        clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]);
-    }
-    if (node->view_src != NULL) {
-        clone->data = (node->view_src->data == NULL)
-                        ? NULL // view_src not yet allocated
-                        : (char *) node->view_src->data // view_src already allocated
-                                 + node->view_offs;
-        clone->view_src  = node->view_src;
-        clone->view_offs = node->view_offs;
-    }
-
-    GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t)));
-    GGML_ASSERT(sizeof(node->name)      == GGML_MAX_NAME);
-    memcpy(clone->op_params, node->op_params, sizeof(node->op_params));
-    ggml_format_name(clone, "%s (clone)", ggml_get_name(node));
-
-    return clone;
-}
-
-void ggml_build_backward_gradient_checkpointing(
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * checkpoints,
-        int                     n_checkpoints) {
-    ggml_graph_cpy(gf, gb_tmp);
-    ggml_build_backward_expand(ctx, gf, gb_tmp, true);
-
-    if (n_checkpoints <= 0) {
-        ggml_graph_cpy(gb_tmp, gb);
-        return;
-    }
-
-    struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
-
-    // insert checkpoints in replacements
-    for (int i = 0; i < n_checkpoints; ++i) {
-        size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
-        GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
-        GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
-        replacements->set.keys[k] = checkpoints[i];
-        replacements->vals[k]     = checkpoints[i];
-    }
-
-    ggml_graph_cpy(gf, gb);
-    // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
-    // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
-    // by recomputing them from checkpoints
-    for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) {
-        struct ggml_tensor * node = gb_tmp->nodes[i];
-        for (int k = 0; k < GGML_MAX_SRC; ++k) {
-            // insert new tensors recomputing src, reusing already made replacements,
-            // remember replacements: remember new tensors with mapping from corresponding gf nodes
-            // recurse for input tensors,
-            // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
-            node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
-        }
-        // insert rewritten backward node with replacements made into resulting backward graph gb
-        ggml_build_forward_expand(gb, node);
-    }
-
-    ggml_hash_map_free(replacements);
-}
-
-// functions to change gradients considering the case that input a might be initial gradient with zero value
-
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        return b;
-    } else {
-        return ggml_add_impl(ctx, a, b, false);
-    }
-}
-
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
-        return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
-    } else {
-        return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
-    }
-}
-
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        return ggml_repeat(ctx, b, a);
-    } else {
-        return ggml_add1_impl(ctx, a, b, false);
-    }
-}
-
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        return ggml_neg(ctx, b);
-    } else {
-        return ggml_sub_impl(ctx, a, b, false);
-    }
-}
-
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
-    struct ggml_tensor * src0 = tensor->src[0];
-    struct ggml_tensor * src1 = tensor->src[1];
-
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_ADD:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_ADD1:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_add_or_set(ctx,
-                        src1->grad,
-                        ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_ACC:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    const size_t nb1     = ((int32_t *) tensor->op_params)[0];
-                    const size_t nb2     = ((int32_t *) tensor->op_params)[1];
-                    const size_t nb3     = ((int32_t *) tensor->op_params)[2];
-                    const size_t offset  = ((int32_t *) tensor->op_params)[3];
-
-                    struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad,
-                        src1->grad->ne[0],
-                        src1->grad->ne[1],
-                        src1->grad->ne[2],
-                        src1->grad->ne[3],
-                        nb1, nb2, nb3, offset);
-
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_reshape(ctx,
-                                ggml_cont(ctx, tensor_grad_view),
-                                src1->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_SUB:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_MUL:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_mul(ctx, src1, tensor->grad),
-                                zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                                src1->grad,
-                                ggml_mul(ctx, src0, tensor->grad),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_DIV:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_div(ctx, tensor->grad, src1),
-                                zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_sub_or_set(ctx,
-                                src1->grad,
-                                ggml_mul(ctx,
-                                    tensor->grad,
-                                    ggml_div(ctx, tensor, src1)),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SQR:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_scale(ctx,
-                                    ggml_mul(ctx, src0, tensor->grad),
-                                    2.0f),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SQRT:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_scale(ctx,
-                                    ggml_div(ctx,
-                                        tensor->grad,
-                                        tensor),
-                                    0.5f),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_LOG:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_div(ctx,
-                                    tensor->grad,
-                                    src0),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SUM:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add1_or_set(ctx,
-                                src0->grad,
-                                tensor->grad,
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SUM_ROWS:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_repeat(ctx,
-                                    tensor->grad,
-                                    src0->grad),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_repeat_back(ctx, tensor->grad, src0->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_REPEAT_BACK:
-            {
-                if (src0->grad) {
-                    // TODO: test this
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_repeat(ctx, tensor->grad, src0->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
-        case GGML_OP_SILU_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_NORM:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    float eps;
-                    memcpy(&eps, tensor->op_params, sizeof(float));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_RMS_NORM_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_GROUP_NORM:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                // https://cs231n.github.io/optimization-2/#staged
-                // # forward pass
-                // s0 = np.random.randn(5, 10)
-                // s1 = np.random.randn(10, 3)
-                // t = s0.dot(s1)
-
-                // # now suppose we had the gradient on t from above in the circuit
-                // dt = np.random.randn(*t.shape) # same shape as t
-                // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
-                // ds1 = t.T.dot(dt)
-
-                // tensor.shape [m,p,qq,rr]
-                // src0.shape   [n,m,q1,r1]
-                // src1.shape   [n,p,qq,rr]
-
-                // necessary for llama
-                if (src0->grad) {
-                    struct ggml_tensor * s1_tg =
-                        ggml_out_prod(ctx, // [n,m,qq,rr]
-                            src1,          // [n,p,qq,rr]
-                            tensor->grad); // [m,p,qq,rr]
-                    const int64_t qq = s1_tg->ne[2];
-                    const int64_t rr = s1_tg->ne[3];
-                    const int64_t q1 = src0->ne[2];
-                    const int64_t r1 = src0->ne[3];
-                    const bool ne2_broadcasted = qq > q1;
-                    const bool ne3_broadcasted = rr > r1;
-                    if (ne2_broadcasted || ne3_broadcasted) {
-                        // sum broadcast repetitions of s1_tg into shape of src0
-                        s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
-                    }
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad, // [n,m,q1,r1]
-                                s1_tg,      // [n,m,q1,r1]
-                                zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                                src1->grad,                            // [n,p,qq,rr]
-                                // ggml_mul_mat(ctx,                   // [n,p,qq,rr]
-                                //     ggml_cont(ctx,                  // [m,n,q1,r1]
-                                //         ggml_transpose(ctx, src0)), // [m,n,q1,r1]
-                                //     tensor->grad),                  // [m,p,qq,rr]
-
-                                // // when src0 is bigger than tensor->grad (this is mostly the case in llama),
-                                // // avoid transpose of src0, rather transpose smaller tensor->grad
-                                // // and then use ggml_out_prod
-                                ggml_out_prod(ctx,                  // [n,p,qq,rr]
-                                    src0,                           // [n,m,q1,r1]
-                                    ggml_transpose(ctx,             // [p,m,qq,rr]
-                                        tensor->grad)),             // [m,p,qq,rr]
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_SCALE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    float s;
-                    memcpy(&s, tensor->op_params, sizeof(float));
-
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, s, false),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_SET:
-            {
-                const size_t nb1     = ((int32_t *) tensor->op_params)[0];
-                const size_t nb2     = ((int32_t *) tensor->op_params)[1];
-                const size_t nb3     = ((int32_t *) tensor->op_params)[2];
-                const size_t offset  = ((int32_t *) tensor->op_params)[3];
-
-                struct ggml_tensor * tensor_grad_view = NULL;
-
-                if (src0->grad || src1->grad) {
-                    GGML_ASSERT(src0->type == tensor->type);
-                    GGML_ASSERT(tensor->grad->type == tensor->type);
-                    GGML_ASSERT(tensor->grad->type == src1->grad->type);
-
-                    tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad,
-                        src1->grad->ne[0],
-                        src1->grad->ne[1],
-                        src1->grad->ne[2],
-                        src1->grad->ne[3],
-                        nb1, nb2, nb3, offset);
-                }
-
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                        src0->grad,
-                        ggml_acc_impl(ctx,
-                            tensor->grad,
-                            ggml_neg(ctx, tensor_grad_view),
-                            nb1, nb2, nb3, offset, false),
-                        zero_table);
-                }
-
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_reshape(ctx,
-                                ggml_cont(ctx, tensor_grad_view),
-                                src1->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                // necessary for llama
-                // cpy overwrites value of src1 by src0 and returns view(src1)
-                // the overwriting is mathematically equivalent to:
-                // tensor = src0 * 1 + src1 * 0
-                if (src0->grad) {
-                    // dsrc0 = dtensor * 1
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    // dsrc1 = dtensor * 0 -> noop
-                }
-            } break;
-        case GGML_OP_CONT:
-            {
-                // same as cpy
-                if (src0->grad) {
-                    GGML_ASSERT(ggml_is_contiguous(src0->grad));
-                    GGML_ASSERT(ggml_is_contiguous(tensor->grad));
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_reshape(ctx,
-                                ggml_is_contiguous(tensor->grad)
-                                    ? tensor->grad
-                                    : ggml_cont(ctx, tensor->grad),
-                                src0->grad),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_VIEW:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    size_t offset;
-
-                    memcpy(&offset, tensor->op_params, sizeof(offset));
-
-                    size_t nb1     = tensor->nb[1];
-                    size_t nb2     = tensor->nb[2];
-                    size_t nb3     = tensor->nb[3];
-
-                    if (src0->type != src0->grad->type) {
-                        // gradient is typically F32, but src0 could be other type
-                        size_t ng = ggml_element_size(src0->grad);
-                        size_t n0 = ggml_element_size(src0);
-                        GGML_ASSERT(offset % n0 == 0);
-                        GGML_ASSERT(nb1 % n0 == 0);
-                        GGML_ASSERT(nb2 % n0 == 0);
-                        GGML_ASSERT(nb3 % n0 == 0);
-                        offset = (offset / n0) * ng;
-                        nb1 = (nb1 / n0) * ng;
-                        nb2 = (nb2 / n0) * ng;
-                        nb3 = (nb3 / n0) * ng;
-                    }
-
-                    src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table);
-                }
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    int32_t * axes = (int32_t *) tensor->op_params;
-                    int axis0 = axes[0] & 0x3;
-                    int axis1 = axes[1] & 0x3;
-                    int axis2 = axes[2] & 0x3;
-                    int axis3 = axes[3] & 0x3;
-                    int axes_backward[4] = {0,0,0,0};
-                    axes_backward[axis0] = 0;
-                    axes_backward[axis1] = 1;
-                    axes_backward[axis2] = 2;
-                    axes_backward[axis3] = 3;
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_permute(ctx,
-                                tensor->grad,
-                                axes_backward[0],
-                                axes_backward[1],
-                                axes_backward[2],
-                                axes_backward[3]),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_transpose(ctx, tensor->grad),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                // necessary for llama (only for tokenizer)
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            // last ggml_get_rows_back argument src0->grad is only
-                            // necessary to setup correct output shape
-                            ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad),
-                        zero_table);
-                }
-                if (src1->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_GET_ROWS_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_DIAG:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    const int n_past = ((int32_t *) tensor->op_params)[0];
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            /* ggml_diag_mask_inf_impl() shouldn't be here */
-                            /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
-                            ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    const int n_past = ((int32_t *) tensor->op_params)[0];
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_soft_max_back(ctx, tensor->grad, tensor),
-                        zero_table);
-                }
-
-            } break;
-        case GGML_OP_SOFT_MAX_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ROPE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rope_back(ctx,
-                                tensor->grad,
-                                src1,
-                                n_dims,
-                                mode,
-                                n_ctx,
-                                n_orig_ctx,
-                                freq_base,
-                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
-                                xpos_base,
-                                xpos_down),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_ROPE_BACK:
-            {
-                if (src0->grad) {
-                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rope_impl(ctx,
-                                tensor->grad,
-                                src1,
-                                n_dims,
-                                mode,
-                                n_ctx,
-                                n_orig_ctx,
-                                freq_base,
-                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
-                                xpos_base,
-                                xpos_down,
-                                false),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_POOL_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_POOL_2D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_PAD:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_LEAKY_RELU:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                struct ggml_tensor * flash_grad = NULL;
-                if (src0->grad || src1->grad || tensor->src[2]->grad) {
-                    int32_t t = ggml_get_op_params_i32(tensor, 0);
-                    GGML_ASSERT(t == 0 || t == 1);
-                    bool masked = t != 0;
-                    flash_grad =
-                        ggml_flash_attn_back(ctx,
-                            src0,
-                            src1,
-                            tensor->src[2],
-                            tensor->grad,
-                            masked);
-                }
-
-                struct ggml_tensor * src2 = tensor->src[2];
-                const int64_t elem_q = ggml_nelements(src0);
-                const int64_t elem_k = ggml_nelements(src1);
-                const int64_t elem_v = ggml_nelements(src2);
-
-                enum ggml_type result_type = flash_grad->type;
-                GGML_ASSERT(ggml_blck_size(result_type) == 1);
-                const size_t tsize = ggml_type_size(result_type);
-
-                const size_t offs_q = 0;
-                const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-                const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-
-                if (src0->grad) {
-                    struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q);
-                    struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0);
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            grad_q,
-                            zero_table);
-                }
-                if (src1->grad) {
-                    struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k);
-                    struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1);
-                    src1->grad = ggml_add_or_set(ctx,
-                            src1->grad,
-                            grad_k,
-                            zero_table);
-                }
-                if (src2->grad) {
-                    struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v);
-                    struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2);
-                    src2->grad = ggml_add_or_set(ctx,
-                            src2->grad,
-                            grad_v,
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_WIN_PART:
-        case GGML_OP_WIN_UNPART:
-        case GGML_OP_UNARY:
-            {
-                switch (ggml_get_unary_op(tensor)) {
-                    case GGML_UNARY_OP_ABS:
-                        {
-                            if (src0->grad) {
-                                src0->grad =
-                                    ggml_add_or_set(ctx,
-                                            src0->grad,
-                                            ggml_mul(ctx,
-                                                ggml_sgn(ctx, src0),
-                                                tensor->grad),
-                                            zero_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_SGN:
-                        {
-                            if (src0->grad) {
-                                // noop
-                            }
-                        } break;
-                    case GGML_UNARY_OP_NEG:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_STEP:
-                        {
-                            if (src0->grad) {
-                                // noop
-                            }
-                        } break;
-                    case GGML_UNARY_OP_TANH:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_ELU:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_RELU:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_mul(ctx,
-                                            ggml_step(ctx, src0),
-                                            tensor->grad),
-                                        zero_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_GELU:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_GELU_QUICK:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_SILU:
-                        {
-                            // necessary for llama
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_silu_back(ctx, src0, tensor->grad),
-                                        zero_table);
-                            }
-                        } break;
-                    default:
-                        GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_OP_GET_REL_POS:
-        case GGML_OP_ADD_REL_POS:
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-        case GGML_OP_MAP_CUSTOM1_F32:
-        case GGML_OP_MAP_CUSTOM2_F32:
-        case GGML_OP_MAP_CUSTOM3_F32:
-        case GGML_OP_MAP_CUSTOM1:
-        case GGML_OP_MAP_CUSTOM2:
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_cross_entropy_loss_back(ctx,
-                                    src0,
-                                    src1,
-                                    tensor->grad),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        if (tensor->src[i] && tensor->src[i]->grad) {
-            GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad));
-        }
-    }
-}
-
-static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
-    if (node->grad == NULL) {
-        // this usually happens when we generate intermediate nodes from constants in the backward pass
-        // it can also happen during forward pass, if the user performs computations with constants
-        if (node->op != GGML_OP_NONE) {
-            //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op);
-        }
-    }
-
-    // check if already visited
-    if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return;
-    }
-
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        const int k =
-            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
-            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
-            /* unknown order, just fall back to using i*/ i;
-        if (node->src[k]) {
-            ggml_visit_parents(cgraph, node->src[k]);
-        }
-    }
-
-    if (node->op == GGML_OP_NONE && node->grad == NULL) {
-        // reached a leaf node, not part of the gradient graph (e.g. a constant)
-        GGML_ASSERT(cgraph->n_leafs < cgraph->size);
-
-        if (strlen(node->name) == 0) {
-            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
-        }
-
-        cgraph->leafs[cgraph->n_leafs] = node;
-        cgraph->n_leafs++;
-    } else {
-        GGML_ASSERT(cgraph->n_nodes < cgraph->size);
-
-        if (strlen(node->name) == 0) {
-            ggml_format_name(node, "node_%d", cgraph->n_nodes);
-        }
-
-        cgraph->nodes[cgraph->n_nodes] = node;
-        if (cgraph->grads) {
-            cgraph->grads[cgraph->n_nodes] = node->grad;
-        }
-        cgraph->n_nodes++;
-    }
-}
-
-static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
-    if (!expand) {
-        // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
-        ggml_graph_clear(cgraph);
-    }
-
-    const int n0 = cgraph->n_nodes;
-    UNUSED(n0);
-
-    ggml_visit_parents(cgraph, tensor);
-
-    const int n_new = cgraph->n_nodes - n0;
-    GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
-
-    if (n_new > 0) {
-        // the last added node should always be starting point
-        GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
-    }
-}
-
-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    ggml_build_forward_impl(cgraph, tensor, true);
-}
-
-void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
-    GGML_ASSERT(gf->n_nodes > 0);
-
-    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
-    if (keep) {
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-
-            if (node->grad) {
-                node->grad = ggml_dup_tensor(ctx, node);
-                gf->grads[i] = node->grad;
-            }
-        }
-    }
-
-    // remember original gradients which start with zero values
-    struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
-    for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->grads[i]) {
-            ggml_hash_insert(zero_table, gf->grads[i]);
-        }
-    }
-
-    for (int i = gf->n_nodes - 1; i >= 0; i--) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        // inplace operations to add gradients are not created by ggml_compute_backward
-        // use allocator to automatically make inplace operations
-        if (node->grad) {
-            ggml_compute_backward(ctx, node, zero_table);
-        }
-    }
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_expand(gb, node->grad);
-        }
-    }
-
-    ggml_hash_set_free(zero_table);
-}
-
-static size_t ggml_graph_nbytes(size_t size, bool grads) {
-    size_t nbytes = sizeof(struct ggml_cgraph);
-    nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
-    if (grads) {
-        nbytes += size * sizeof(struct ggml_tensor *); // grads
-    }
-    nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
-    return nbytes;
-}
-
-size_t ggml_graph_overhead_custom(size_t size, bool grads) {
-    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
-}
-
-size_t ggml_graph_overhead(void) {
-    return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
-}
-
-struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
-    const size_t obj_size = ggml_graph_nbytes(size, grads);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
-
-    size_t hash_size = ggml_hash_size(size * 2);
-    struct ggml_tensor ** nodes_ptr = data_start;
-    struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
-    struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
-    struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
-
-    // check that we allocated the correct amount of memory
-    assert(obj_size == (size_t) (
-        (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
-
-    memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
-
-    *cgraph = (struct ggml_cgraph) {
-        /*.size         =*/ size,
-        /*.n_nodes      =*/ 0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ nodes_ptr,
-        /*.grads        =*/ grads_ptr,
-        /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    return cgraph;
-}
-
-struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
-    return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
-}
-
-struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
-    struct ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL },
-        /*.order        =*/ cgraph0->order,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    return cgraph;
-}
-
-void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
-    GGML_ASSERT(dst->size >= src->n_leafs);
-    GGML_ASSERT(dst->size >= src->n_nodes);
-    GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
-
-    dst->n_leafs = src->n_leafs;
-    dst->n_nodes = src->n_nodes;
-    dst->order   = src->order;
-
-    for (int i = 0; i < src->n_leafs; ++i) {
-        dst->leafs[i] = src->leafs[i];
-    }
-
-    for (int i = 0; i < src->n_nodes; ++i) {
-        dst->nodes[i] = src->nodes[i];
-    }
-
-    if (src->grads) {
-        GGML_ASSERT(dst->grads != NULL);
-        for (int i = 0; i < src->n_nodes; ++i) {
-            dst->grads[i] = src->grads[i];
-        }
-    }
-
-    for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
-        if (src->visited_hash_table.keys[i]) {
-            ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
-        }
-    }
-}
-
-struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
-    ggml_graph_cpy(cgraph, result);
-    return result;
-}
-
-void ggml_graph_reset(struct ggml_cgraph * cgraph) {
-    GGML_ASSERT(cgraph->grads != NULL);
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * grad = cgraph->grads[i];
-
-        if (grad) {
-            ggml_set_zero(grad);
-        }
-    }
-}
-
-void ggml_graph_clear(struct ggml_cgraph * cgraph) {
-    cgraph->n_leafs = 0;
-    cgraph->n_nodes = 0;
-    memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
-}
-
-//
-// thread data
-//
-// synchronization is done via busy loops
-// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
-//
-
-#ifdef __APPLE__
-
-//#include <os/lock.h>
-//
-//typedef os_unfair_lock ggml_lock_t;
-//
-//#define ggml_lock_init(x)    UNUSED(x)
-//#define ggml_lock_destroy(x) UNUSED(x)
-//#define ggml_lock_lock       os_unfair_lock_lock
-//#define ggml_lock_unlock     os_unfair_lock_unlock
-//
-//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x)    UNUSED(x)
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-typedef pthread_t ggml_thread_t;
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#else
-
-//typedef pthread_spinlock_t ggml_lock_t;
-
-//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
-//#define ggml_lock_destroy pthread_spin_destroy
-//#define ggml_lock_lock    pthread_spin_lock
-//#define ggml_lock_unlock  pthread_spin_unlock
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x)    _mm_pause()
-#else
-#define ggml_lock_lock(x)    UNUSED(x)
-#endif
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-typedef pthread_t ggml_thread_t;
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#endif
-
-// Android's libc implementation "bionic" does not support setting affinity
-#if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
-    if (!ggml_is_numa()) {
-        return;
-    }
-
-    int node_num;
-    int rv;
-    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
-
-    switch(g_state.numa.numa_strategy) {
-        case GGML_NUMA_STRATEGY_DISTRIBUTE:
-            // run thread on node_num thread_n / (threads per node)
-            node_num = thread_n % g_state.numa.n_nodes;
-            break;
-        case GGML_NUMA_STRATEGY_ISOLATE:
-            // run thread on current_node
-            node_num = g_state.numa.current_node;
-            break;
-        case GGML_NUMA_STRATEGY_NUMACTL:
-            // use the cpuset that numactl gave us
-            rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
-            if (rv) {
-                fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
-            }
-            return;
-        default:
-            return;
-    }
-
-    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
-
-    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
-    CPU_ZERO_S(setsize, cpus);
-    for (size_t i = 0; i < node->n_cpus; ++i) {
-        CPU_SET_S(node->cpus[i], setsize, cpus);
-    }
-
-    rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
-    if (rv) {
-            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
-    }
-
-    CPU_FREE(cpus);
-}
-
-static void clear_numa_thread_affinity(void) {
-    if (!ggml_is_numa()) {
-        return;
-    }
-
-    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
-
-    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
-    CPU_ZERO_S(setsize, cpus);
-    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
-        CPU_SET_S(i, setsize, cpus);
-    }
-
-    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
-    if (rv) {
-        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
-    }
-
-    CPU_FREE(cpus);
-}
-#else
-// TODO: Windows etc.
-// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n);  }
-static void clear_numa_thread_affinity(void) {}
-#endif
-
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan  * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-};
-
-static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
-    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
-    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
-
-    node->perf_runs++;
-    node->perf_cycles  += cycles_cur;
-    node->perf_time_us += time_us_cur;
-}
-
-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-    int n_tasks = 0;
-
-    switch (node->op) {
-        case GGML_OP_CPY:
-        case GGML_OP_DUP:
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_ACC:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_SUB:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_SUM:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-        case GGML_OP_REPEAT:
-        case GGML_OP_REPEAT_BACK:
-        case GGML_OP_LEAKY_RELU:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(node)) {
-                case GGML_UNARY_OP_ABS:
-                case GGML_UNARY_OP_SGN:
-                case GGML_UNARY_OP_NEG:
-                case GGML_UNARY_OP_STEP:
-                case GGML_UNARY_OP_TANH:
-                case GGML_UNARY_OP_ELU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
-                case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
-                    {
-                        n_tasks = 1;
-                    } break;
-
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_SILU:
-                    {
-                        n_tasks = n_threads;
-                    } break;
-                default:
-                    GGML_ASSERT(false);
-            }
-            break;
-        case GGML_OP_SILU_BACK:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_NORM:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_RMS_NORM_BACK:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_CONCAT:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                n_tasks = n_threads;
-
-                // TODO: use different scheduling for different matrix sizes
-                //const int nr0 = ggml_nrows(node->src[0]);
-                //const int nr1 = ggml_nrows(node->src[1]);
-
-                //n_tasks = MIN(n_threads, MAX(1, nr0/128));
-                //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_SCALE:
-        case GGML_OP_SET:
-        case GGML_OP_CONT:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_GET_ROWS:
-        case GGML_OP_GET_ROWS_BACK:
-        case GGML_OP_DIAG:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX_BACK:
-        case GGML_OP_ROPE:
-        case GGML_OP_ROPE_BACK:
-        case GGML_OP_ADD_REL_POS:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                n_tasks = 1; //TODO
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_POOL_1D:
-        case GGML_OP_POOL_2D:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_PAD:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_WIN_PART:
-        case GGML_OP_WIN_UNPART:
-        case GGML_OP_GET_REL_POS:
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-        case GGML_OP_MAP_CUSTOM1_F32:
-        case GGML_OP_MAP_CUSTOM2_F32:
-        case GGML_OP_MAP_CUSTOM3_F32:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_MAP_CUSTOM1:
-            {
-                struct ggml_map_custom1_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_MAP_CUSTOM2:
-            {
-                struct ggml_map_custom2_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                struct ggml_map_custom3_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_NONE:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                fprintf(stderr, "%s: op not implemented: ", __func__);
-                if (node->op < GGML_OP_COUNT) {
-                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
-                } else {
-                    fprintf(stderr, "%d\n", node->op);
-                }
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    assert(n_tasks > 0);
-
-    return n_tasks;
-}
-
-static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_node_n = * node_n;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
-    }
-}
-
-static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_task_phase = * task_phase;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
-    }
-}
-
-static thread_ret_t ggml_graph_compute_thread(void * data) {
-    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    const struct ggml_cgraph * cgraph = state->shared->cgraph;
-    const struct ggml_cplan  * cplan  = state->shared->cplan;
-
-    const int   n_threads   = state->shared->n_threads;
-
-    set_numa_thread_affinity(state->ith);
-
-    int node_n     = -1;
-    int task_phase = GGML_TASK_TYPE_FINALIZE;
-
-    while (true) {
-        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->node_n += 1;
-            return (thread_ret_t) GGML_EXIT_ABORTED;
-        }
-
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            // all other threads are finished and spinning
-            // do finalize and init here so we don't have synchronize again
-            struct ggml_compute_params params = {
-                /*.type  =*/ GGML_TASK_TYPE_FINALIZE,
-                /*.ith   =*/ 0,
-                /*.nth   =*/ 0,
-                /*.wsize =*/ cplan->work_size,
-                /*.wdata =*/ cplan->work_data,
-            };
-
-            if (node_n != -1) {
-                /* FINALIZE */
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads);
-                    ggml_compute_forward(&params, node);
-                }
-                ggml_graph_compute_perf_stats_node(node, state->shared);
-            }
-
-            // distribute new work or execute it direct if 1T
-            while (++node_n < cgraph->n_nodes) {
-                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
-                state->shared->perf_node_start_time_us = ggml_perf_time_us();
-
-                params.nth = n_tasks;
-
-                if (n_tasks == 1) {
-                    /* INIT */
-                    if (GGML_OP_HAS_INIT[node->op]) {
-                        params.type = GGML_TASK_TYPE_INIT;
-                        ggml_compute_forward(&params, node);
-                    }
-
-                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
-                    // they do something more efficient than spinning (?)
-                    params.type = GGML_TASK_TYPE_COMPUTE;
-                    ggml_compute_forward(&params, node);
-
-                    if (GGML_OP_HAS_FINALIZE[node->op]) {
-                        params.type = GGML_TASK_TYPE_FINALIZE;
-                        ggml_compute_forward(&params, node);
-                    }
-
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                } else {
-                    break;
-                }
-
-                if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-                    break;
-                }
-            }
-
-            task_phase = GGML_TASK_TYPE_INIT;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_n,    node_n);
-            atomic_store(&state->shared->node_task, task_phase);
-        } else {
-            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
-
-        // check if we should stop
-        if (node_n >= cgraph->n_nodes) break;
-
-        /* INIT & COMPUTE */
-        struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_TYPE_INIT,
-            /*.ith   =*/ state->ith,
-            /*.nth   =*/ n_tasks,
-            /*.wsize =*/ cplan->work_size,
-            /*.wdata =*/ cplan->work_data,
-        };
-
-        if (state->ith < n_tasks) {
-            if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
-            }
-        }
-
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_COMPUTE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            //       depending on the workload and the operating system.
-            //       since it is not clear what is the best approach, it should potentially become user-configurable
-            //       ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD:  adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
-        }
-
-        if (state->ith < n_tasks) {
-            params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
-        }
-
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_FINALIZE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
-    }
-
-    return GGML_EXIT_SUCCESS;
-}
-
-struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
-    if (n_threads <= 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
-    }
-
-    size_t work_size = 0;
-
-    struct ggml_cplan cplan;
-    memset(&cplan, 0, sizeof(struct ggml_cplan));
-
-    int max_tasks = 1;
-
-    // thread scheduling for the different operations + work buffer size estimation
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-        max_tasks = MAX(max_tasks, n_tasks);
-
-        size_t cur = 0;
-
-        switch (node->op) {
-            case GGML_OP_CPY:
-            case GGML_OP_DUP:
-                {
-                    if (ggml_is_quantized(node->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_ADD:
-            case GGML_OP_ADD1:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_ACC:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_MUL_MAT:
-                {
-                    const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
-
-#if defined(GGML_USE_CLBLAST)
-                    if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                        cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
-                    } else
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory for fully dequantized matrix from src0
-                            // take into account that src0 can be broadcasted into src1[2,3]
-                            cur = ggml_type_size(GGML_TYPE_F32)
-                                * node->src[0]->ne[0]*node->src[0]->ne[1]
-                                * node->src[1]->ne[2]*node->src[1]->ne[3];
-                        }
-                    } else
-#endif
-                    if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
-                    }
-                } break;
-            case GGML_OP_MUL_MAT_ID:
-                {
-                    cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
-                    const struct ggml_tensor * src1 = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
-                    if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
-                    }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
-                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
-                } break;
-            case GGML_OP_OUT_PROD:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_SOFT_MAX:
-            case GGML_OP_ROPE:
-                {
-                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_1D:
-                {
-                    GGML_ASSERT(node->src[0]->ne[3] == 1);
-                    GGML_ASSERT(node->src[1]->ne[2] == 1);
-                    GGML_ASSERT(node->src[1]->ne[3] == 1);
-
-                    const int64_t ne00 = node->src[0]->ne[0];  // K
-                    const int64_t ne01 = node->src[0]->ne[1];  // Cout
-                    const int64_t ne02 = node->src[0]->ne[2];  // Cin
-
-                    const int64_t ne10 = node->src[1]->ne[0];  // L
-                    const int64_t ne11 = node->src[1]->ne[1];  // Cin
-
-                    if (node->src[0]->type == GGML_TYPE_F16 &&
-                        node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
-                        cur += sizeof(ggml_fp16_t)*ne10*ne11;
-                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                               node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(float)*ne00*ne01*ne02;
-                        cur += sizeof(float)*ne10*ne11;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_2D:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // W
-                    const int64_t ne01 = node->src[0]->ne[1]; // H
-                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
-                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
-
-                    const int64_t ne10 = node->src[1]->ne[0]; // W
-                    const int64_t ne11 = node->src[1]->ne[1]; // H
-                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
-
-                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
-                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
-                } break;
-            case GGML_OP_FLASH_ATTN:
-                {
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-            case GGML_OP_FLASH_FF:
-                {
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-            case GGML_OP_FLASH_ATTN_BACK:
-                {
-                    const int64_t    D = node->src[0]->ne[0];
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-
-            case GGML_OP_CROSS_ENTROPY_LOSS:
-                {
-                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
-                } break;
-            case GGML_OP_COUNT:
-                {
-                    GGML_ASSERT(false);
-                } break;
-            default:
-                break;
-        }
-
-        work_size = MAX(work_size, cur);
-    }
-
-    if (work_size > 0) {
-        work_size += CACHE_LINE_SIZE*(n_threads - 1);
-    }
-
-    cplan.n_threads = MIN(max_tasks, n_threads);
-    cplan.work_size = work_size;
-    cplan.work_data = NULL;
-
-    return cplan;
-}
-
-int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
-    {
-        GGML_ASSERT(cplan);
-        GGML_ASSERT(cplan->n_threads > 0);
-
-        if (cplan->work_size > 0) {
-            GGML_ASSERT(cplan->work_data);
-        }
-    }
-
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
-    const int n_threads = cplan->n_threads;
-
-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.cgraph_plan             =*/ cplan,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-        /*.node_task               =*/ GGML_TASK_TYPE_FINALIZE,
-        /*.abort_callback          =*/ NULL,
-        /*.abort_callback_data     =*/ NULL,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith = j,
-                .shared = &state_shared,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
-
-    // this is a work thread too
-    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-        }
-    }
-
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
-    // performance stats (graph)
-    {
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
-
-        cgraph->perf_runs++;
-        cgraph->perf_cycles  += perf_cycles_cur;
-        cgraph->perf_time_us += perf_time_us_cur;
-
-        GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
-                __func__, cgraph->perf_runs,
-                (double) perf_cycles_cur      / (double) ggml_cycles_per_ms(),
-                (double) cgraph->perf_cycles  / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
-                (double) perf_time_us_cur     / 1000.0,
-                (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
-    }
-
-    return compute_status;
-}
-
-void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    ggml_graph_compute(cgraph, &cplan);
-}
-
-struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        struct ggml_tensor * leaf = cgraph->leafs[i];
-
-        if (strcmp(leaf->name, name) == 0) {
-            return leaf;
-        }
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        if (strcmp(node->name, name) == 0) {
-            return node;
-        }
-    }
-
-    return NULL;
-}
-
-static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
-    const int64_t * ne = tensor->ne;
-    const size_t  * nb = tensor->nb;
-
-    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
-            ggml_type_name(tensor->type),
-            ggml_op_name  (tensor->op),
-            ggml_n_dims(tensor),
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
-            tensor->data,
-            tensor->name);
-}
-
-static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
-    const int64_t * ne = tensor->ne;
-    const size_t  * nb = tensor->nb;
-
-    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
-            arg,
-            ggml_type_name(tensor->type),
-            ggml_op_name  (tensor->op),
-            ggml_n_dims(tensor),
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
-            tensor->data,
-            tensor->name);
-}
-
-void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    uint64_t size_eval = 0;
-
-    // compute size of intermediate results
-    // TODO: does not take into account scratch buffers !!!!
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
-    }
-
-    // print
-    {
-        FILE * fout = stdout;
-
-        fprintf(fout, "\n");
-        fprintf(fout, "%-16s %8x\n", "magic",        GGML_FILE_MAGIC);
-        fprintf(fout, "%-16s %8d\n", "version",      GGML_FILE_VERSION);
-        fprintf(fout, "%-16s %8d\n", "leafs",        cgraph->n_leafs);
-        fprintf(fout, "%-16s %8d\n", "nodes",        cgraph->n_nodes);
-        fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
-
-        // header
-        fprintf(fout, "\n");
-        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
-                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
-
-        for (int i = 0; i < cgraph->n_leafs; ++i) {
-            ggml_graph_export_leaf(cgraph->leafs[i], fout);
-
-            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
-            GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
-            GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
-        }
-
-        // header
-        fprintf(fout, "\n");
-        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
-                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
-
-        for (int i = 0; i < cgraph->n_nodes; ++i) {
-            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
-
-            for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                if (cgraph->nodes[i]->src[j]) {
-                    ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
-                }
-            }
-
-            fprintf(fout, "\n");
-        }
-
-        fprintf(fout, "\n");
-    }
-
-    // write binary data
-    {
-        FILE * fout = fopen(fname, "wb");
-
-        if (!fout) {
-            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
-            return;
-        }
-
-        // header
-        {
-            const uint32_t magic   = GGML_FILE_MAGIC;
-            const uint32_t version = GGML_FILE_VERSION;
-            const uint32_t n_leafs = cgraph->n_leafs;
-            const uint32_t n_nodes = cgraph->n_nodes;
-
-            fwrite(&magic,     sizeof(uint32_t), 1, fout);
-            fwrite(&version,   sizeof(uint32_t), 1, fout);
-            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
-            fwrite(&n_nodes,   sizeof(uint32_t), 1, fout);
-            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
-        }
-
-        // leafs
-        {
-            for (int i = 0; i < cgraph->n_leafs; ++i) {
-                const struct ggml_tensor * tensor = cgraph->leafs[i];
-
-                const uint32_t type   = tensor->type;
-                const uint32_t op     = tensor->op;
-
-                fwrite(&type,   sizeof(uint32_t), 1, fout);
-                fwrite(&op,     sizeof(uint32_t), 1, fout);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    const uint64_t ne = tensor->ne[j];
-                    const uint64_t nb = tensor->nb[j];
-
-                    fwrite(&ne, sizeof(uint64_t), 1, fout);
-                    fwrite(&nb, sizeof(uint64_t), 1, fout);
-                }
-
-                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
-                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
-
-                // dump the data
-                // TODO: pad this to 32 byte boundary
-                {
-                    const size_t size = ggml_nbytes(tensor);
-
-                    fwrite(tensor->data, sizeof(char), size, fout);
-                }
-            }
-        }
-
-        // nodes
-        {
-            for (int i = 0; i < cgraph->n_nodes; ++i) {
-                const struct ggml_tensor * tensor = cgraph->nodes[i];
-
-                const uint32_t type   = tensor->type;
-                const uint32_t op     = tensor->op;
-
-                fwrite(&type,   sizeof(uint32_t), 1, fout);
-                fwrite(&op,     sizeof(uint32_t), 1, fout);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    const uint64_t ne = tensor->ne[j];
-                    const uint64_t nb = tensor->nb[j];
-
-                    fwrite(&ne, sizeof(uint64_t), 1, fout);
-                    fwrite(&nb, sizeof(uint64_t), 1, fout);
-                }
-
-                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
-                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
-
-                // output the op arguments
-                {
-                    struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
-
-                    for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                        args[j] = tensor->src[j];
-                    }
-
-                    for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                        if (args[j]) {
-                            int32_t idx = -1;
-
-                            // check if leaf
-                            {
-                                for (int k = 0; k < cgraph->n_leafs; ++k) {
-                                    if (args[j] == cgraph->leafs[k]) {
-                                        idx = k;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            // check if node
-                            if (idx == -1) {
-                                for (int k = 0; k < cgraph->n_nodes; ++k) {
-                                    if (args[j] == cgraph->nodes[k]) {
-                                        idx = cgraph->n_leafs + k;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            if (idx == -1) {
-                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
-                                fclose(fout);
-                                return;
-                            }
-
-                            fwrite(&idx, sizeof(int32_t), 1, fout);
-                        } else {
-                            const int32_t nul = -1;
-
-                            fwrite(&nul, sizeof(int32_t), 1, fout);
-                        }
-                    }
-                }
-            }
-        }
-
-        fclose(fout);
-    }
-}
-
-struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
-    assert(*ctx_data == NULL);
-    assert(*ctx_eval == NULL);
-
-    struct ggml_cgraph * result = NULL;
-
-    struct ggml_tensor * data = NULL;
-
-    // read file into data
-    {
-        FILE * fin = fopen(fname, "rb");
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
-            return result;
-        }
-
-        size_t fsize = 0;
-
-        fseek(fin, 0, SEEK_END);
-        fsize = ftell(fin);
-        fseek(fin, 0, SEEK_SET);
-
-        // create the data context
-        {
-            const size_t overhead = 1*ggml_tensor_overhead();
-
-            struct ggml_init_params params = {
-                .mem_size   = fsize + overhead,
-                .mem_buffer = NULL,
-                .no_alloc   = false,
-            };
-
-            *ctx_data = ggml_init(params);
-
-            if (!*ctx_data) {
-                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
-                fclose(fin);
-                return result;
-            }
-        }
-
-        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
-
-        {
-            const size_t ret = fread(data->data, sizeof(char), fsize, fin);
-            if (ret != fsize) {
-                fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
-                fclose(fin);
-                return result;
-            }
-        }
-
-        fclose(fin);
-    }
-
-    // populate result
-    {
-        char * ptr = (char *) data->data;
-
-        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
-
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
-            return result;
-        }
-
-        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
-
-        if (version != GGML_FILE_VERSION) {
-            fprintf(stderr, "%s: invalid version number\n", __func__);
-            return result;
-        }
-
-        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
-        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
-        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
-        const int     graph_size = MAX(n_leafs, n_nodes);
-
-        // create the data context
-        {
-            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
-
-            struct ggml_init_params params = {
-                .mem_size   = size_eval + overhead,
-                .mem_buffer = NULL,
-                .no_alloc   = true,
-            };
-
-            *ctx_eval = ggml_init(params);
-
-            if (!*ctx_eval) {
-                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
-                return result;
-            }
-        }
-
-        result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
-
-        result->n_leafs = n_leafs;
-        result->n_nodes = n_nodes;
-
-
-        // leafs
-        {
-            uint32_t type;
-            uint32_t op;
-
-            for (uint32_t i = 0; i < n_leafs; ++i) {
-                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
-                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-
-                int64_t ne[GGML_MAX_DIMS];
-                size_t  nb[GGML_MAX_DIMS];
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    uint64_t ne_cur;
-                    uint64_t nb_cur;
-
-                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
-                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
-
-                    ne[j] = ne_cur;
-                    nb[j] = nb_cur;
-                }
-
-                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
-
-                tensor->op = (enum ggml_op) op;
-
-                memcpy(tensor->name,      ptr, GGML_MAX_NAME);      ptr += GGML_MAX_NAME;
-                memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
-
-                tensor->data = (void *) ptr;
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    tensor->nb[j] = nb[j];
-                }
-
-                result->leafs[i] = tensor;
-
-                ptr += ggml_nbytes(tensor);
-
-                fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
-            }
-        }
-
-        ggml_set_no_alloc(*ctx_eval, false);
-
-        // nodes
-        {
-            uint32_t type;
-            uint32_t op;
-
-            for (uint32_t i = 0; i < n_nodes; ++i) {
-                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
-                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-
-                enum ggml_op eop = (enum ggml_op) op;
-
-                int64_t ne[GGML_MAX_DIMS];
-                size_t  nb[GGML_MAX_DIMS];
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    uint64_t ne_cur;
-                    uint64_t nb_cur;
-
-                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
-                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
-
-                    ne[j] = ne_cur;
-                    nb[j] = nb_cur;
-                }
-
-                const char * ptr_name      = ptr; ptr += GGML_MAX_NAME;
-                const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
-
-                const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
-
-                struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
-
-                // parse args
-                for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                    const int32_t arg_idx = ptr_arg_idx[j];
-
-                    if (arg_idx == -1) {
-                        continue;
-                    }
-
-                    if (arg_idx < result->n_leafs) {
-                        args[j] = result->leafs[arg_idx];
-                    } else {
-                        args[j] = result->nodes[arg_idx - result->n_leafs];
-                    }
-                }
-
-                // create the tensor
-                // "view" operations are handled differently
-                // TODO: handle inplace ops - currently a copy is always made
-
-                struct ggml_tensor * tensor = NULL;
-
-                switch (eop) {
-                    // TODO: implement other view ops
-                    case GGML_OP_RESHAPE:
-                        {
-                            tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
-                        } break;
-                    case GGML_OP_VIEW:
-                        {
-                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
-
-                            size_t offs;
-                            memcpy(&offs, ptr_op_params, sizeof(offs));
-
-                            tensor->data = ((char *) tensor->data) + offs;
-                        } break;
-                    case GGML_OP_TRANSPOSE:
-                        {
-                            tensor = ggml_transpose(*ctx_eval, args[0]);
-                        } break;
-                    case GGML_OP_PERMUTE:
-                        {
-                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
-                        } break;
-                    default:
-                        {
-                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
-
-                            tensor->op = eop;
-                        } break;
-                }
-
-                memcpy(tensor->name,      ptr_name,      GGML_MAX_NAME);
-                memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    tensor->nb[j] = nb[j];
-                }
-
-                for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                    tensor->src[j] = args[j];
-                }
-
-                result->nodes[i] = tensor;
-
-                fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
-            }
-        }
-    }
-
-    return result;
-}
-
-void ggml_graph_print(const struct ggml_cgraph * cgraph) {
-    int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
-
-    GGML_PRINT("=== GRAPH ===\n");
-
-    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
-
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
-                i,
-                node->ne[0], node->ne[1], node->ne[2],
-                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
-                (double) node->perf_time_us / 1000.0,
-                (double) node->perf_time_us / 1000.0 / node->perf_runs);
-    }
-
-    GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        struct ggml_tensor * node = cgraph->leafs[i];
-
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
-                i,
-                node->ne[0], node->ne[1],
-                ggml_op_name(node->op),
-                ggml_get_name(node));
-    }
-
-    for (int i = 0; i < GGML_OP_COUNT; i++) {
-        if (perf_total_per_op_us[i] == 0) {
-            continue;
-        }
-
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
-    }
-
-    GGML_PRINT("========================================\n");
-}
-
-// check if node is part of the graph
-static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
-    if (cgraph == NULL) {
-        return true;
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (cgraph->nodes[i] == node) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * parent = cgraph->nodes[i];
-
-        if (parent->grad == node) {
-            return parent;
-        }
-    }
-
-    return NULL;
-}
-
-static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
-    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
-    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
-    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
-            gparent0 ? (void *) gparent0 : (void *) parent,
-            gparent0 ? "g" : "x",
-            gparent ? (void *) gparent : (void *) node,
-            gparent ? "g" : "x",
-            gparent ? "empty" : "vee",
-            gparent ? "dashed" : "solid",
-            label);
-}
-
-static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
-    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
-            (void *) parent, "x",
-            (void *) node, "x",
-            label);
-}
-
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
-    char color[16];
-
-    FILE * fp = fopen(filename, "w");
-    GGML_ASSERT(fp);
-
-    fprintf(fp, "digraph G {\n");
-    fprintf(fp, "  newrank = true;\n");
-    fprintf(fp, "  rankdir = LR;\n");
-
-    for (int i = 0; i < gb->n_nodes; i++) {
-        struct ggml_tensor * node = gb->nodes[i];
-
-        if (ggml_graph_get_parent(gb, node) != NULL) {
-            continue;
-        }
-
-        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-            snprintf(color, sizeof(color), "yellow");
-        } else if (node->grad) {
-            if (ggml_graph_find(gf, node)) {
-                snprintf(color, sizeof(color), "green");
-            } else {
-                snprintf(color, sizeof(color), "lightblue");
-            }
-        } else {
-            snprintf(color, sizeof(color), "white");
-        }
-
-        fprintf(fp, "  \"%p\" [ "
-                    "style = filled; fillcolor = %s; shape = record; "
-                    "label=\"",
-                (void *) node, color);
-
-        if (strlen(node->name) > 0) {
-            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
-        } else {
-            fprintf(fp, "(%s)|", ggml_type_name(node->type));
-        }
-
-        if (ggml_is_matrix(node)) {
-            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
-        } else {
-            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
-        }
-
-        if (node->grad) {
-            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
-        } else {
-            fprintf(fp, "\"; ]\n");
-        }
-    }
-
-    for (int i = 0; i < gb->n_leafs; i++) {
-        struct ggml_tensor * node = gb->leafs[i];
-
-        snprintf(color, sizeof(color), "pink");
-
-        fprintf(fp, "  \"%p\" [ "
-                    "style = filled; fillcolor = %s; shape = record; "
-                    "label=\"<x>",
-                (void *) node, color);
-
-        if (strlen(node->name) > 0) {
-            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
-        } else {
-            fprintf(fp, "(%s)|", ggml_type_name(node->type));
-        }
-
-        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        if (ggml_nelements(node) < 5) {
-            fprintf(fp, " | (");
-            for (int j = 0; j < ggml_nelements(node); j++) {
-                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
-                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
-                }
-                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
-                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
-                }
-                else {
-                    fprintf(fp, "#");
-                }
-                if (j < ggml_nelements(node) - 1) {
-                    fprintf(fp, ", ");
-                }
-            }
-            fprintf(fp, ")");
-        }
-        fprintf(fp, "\"; ]\n");
-    }
-
-    for (int i = 0; i < gb->n_nodes; i++) {
-        struct ggml_tensor * node = gb->nodes[i];
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j]) {
-                char label[16];
-                snprintf(label, sizeof(label), "src %d", j);
-                ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
-            }
-        }
-    }
-
-    for (int i = 0; i < gb->n_leafs; i++) {
-        struct ggml_tensor * node = gb->leafs[i];
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j]) {
-                char label[16];
-                snprintf(label, sizeof(label), "src %d", j);
-                ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
-            }
-        }
-    }
-
-    fprintf(fp, "}\n");
-
-    fclose(fp);
-
-    GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to set tensor from array
-        for (int64_t j = 0; j < ne; ++j) {
-            ggml_set_f32_1d(ps[p], j, x[i++]);
-        }
-    }
-}
-
-static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            x[i++] = ggml_get_f32_1d(ps[p], j);
-        }
-    }
-}
-
-static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
-    int64_t i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
-        }
-    }
-}
-
-static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) {
-    int64_t i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale;
-        }
-    }
-}
-
-//
-// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
-//
-// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
-//
-
-static enum ggml_opt_result ggml_opt_adam(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    GGML_ASSERT(ggml_is_scalar(f));
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int64_t nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) {
-        int iter = opt->iter;
-        ggml_opt_init(opt->ctx, opt, params, nx);
-        opt->iter = iter;
-    }
-
-    // constants
-    float sched = params.adam.sched;
-    const float alpha = params.adam.alpha;
-    const float decay = params.adam.decay * alpha;
-    const float beta1 = params.adam.beta1;
-    const float beta2 = params.adam.beta2;
-    const float eps   = params.adam.eps;
-    const float gclip = params.adam.gclip;
-    const int decay_min_ndim = params.adam.decay_min_ndim;
-    const int n_accum = MAX(1, params.n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    float * g  = opt->adam.g->data;  // gradients
-    float * m  = opt->adam.m->data;  // first moment
-    float * v  = opt->adam.v->data;  // second moment
-
-    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
-
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    bool cancel = false;
-
-    // compute the function value
-    float fx = 0;
-    ggml_set_zero(opt->adam.g);
-    for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-        if (callback) {
-            callback(callback_data, accum_step, &sched, &cancel);
-            if (cancel) {
-                return GGML_OPT_RESULT_CANCEL;
-            }
-        }
-        // ggml_graph_reset  (gf);
-        ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(gb, &cplan);
-        ggml_opt_acc_grad(np, ps, g, accum_norm);
-        fx += ggml_get_f32_1d(f, 0);
-    }
-    fx *= accum_norm;
-
-    opt->adam.fx_prev = fx;
-    opt->adam.fx_best = opt->adam.fx_prev;
-    if (pf) {
-        pf[opt->iter % params.past] = opt->adam.fx_prev;
-    }
-
-    opt->loss_before = opt->adam.fx_prev;
-    opt->loss_after  = opt->adam.fx_prev;
-
-    // initialize
-    if (opt->just_initialized) {
-        opt->adam.n_no_improvement = 0;
-        opt->just_initialized = false;
-    }
-
-    float * fx_best = &opt->adam.fx_best;
-    float * fx_prev = &opt->adam.fx_prev;
-    int * n_no_improvement = &opt->adam.n_no_improvement;
-
-    int iter0 = opt->iter;
-
-    // run the optimizer
-    for (int t = 0; t < params.adam.n_iter; ++t) {
-        opt->iter = iter0 + t + 1;
-        GGML_PRINT_DEBUG  ("=== iter %d ===\n", t);
-
-        GGML_PRINT_DEBUG  ("f      = %10.6f\n", ggml_get_f32_1d(f, 0));
-        GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0));
-        GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0));
-
-        for (int i = 0; i < np; ++i) {
-            GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i,
-                    ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
-        }
-
-        const int64_t t_start_wall = ggml_time_us();
-        const int64_t t_start_cpu = ggml_cycles();
-        UNUSED(t_start_wall);
-        UNUSED(t_start_cpu);
-
-        {
-            float gnorm = 1.0f;
-            if (gclip > 0.0f) {
-                // gradient clipping
-                ggml_float sum = 0.0;
-                for (int64_t i = 0; i < nx; ++i) {
-                    sum += (ggml_float)(g[i]*g[i]);
-                }
-                ggml_float norm = sqrt(sum);
-                if (norm > (ggml_float) gclip) {
-                    gnorm = (float) ((ggml_float) gclip / norm);
-                }
-            }
-            const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
-            const float beta2h =        1.0f/(1.0f - powf(beta2, opt->iter));
-            int64_t i = 0;
-            for (int p = 0; p < np; ++p) {
-                const int64_t ne = ggml_nelements(ps[p]);
-                const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
-                for (int64_t j = 0; j < ne; ++j) {
-                    float x  = ggml_get_f32_1d(ps[p], j);
-                    float g_ = g[i]*gnorm;
-                    m[i] = m[i]*beta1 +    g_*(1.0f - beta1);
-                    v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2);
-                    float mh = m[i]*beta1h;
-                    float vh = v[i]*beta2h;
-                    vh = sqrtf(vh) + eps;
-                    x  = x*(1.0f - p_decay) - mh/vh;
-                    ggml_set_f32_1d(ps[p], j, x);
-                    ++i;
-                }
-            }
-        }
-
-        fx = 0;
-        ggml_set_zero(opt->adam.g);
-        for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-            if (callback) {
-                callback(callback_data, accum_step, &sched, &cancel);
-                if (cancel) {
-                    return GGML_OPT_RESULT_CANCEL;;
-                }
-            }
-            // ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
-            ggml_opt_acc_grad(np, ps, g, accum_norm);
-            fx += ggml_get_f32_1d(f, 0);
-        }
-        fx *= accum_norm;
-
-        opt->loss_after = fx;
-
-        // check convergence
-        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
-            GGML_PRINT_DEBUG("converged\n");
-
-            return GGML_OPT_RESULT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= iter0 + t) {
-                const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-
-            pf[(iter0 + t)%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx_best[0] > fx) {
-                fx_best[0] = fx;
-                n_no_improvement[0] = 0;
-            } else {
-                ++n_no_improvement[0];
-
-                if (n_no_improvement[0] >= params.max_no_improvement) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-        }
-
-        fx_prev[0] = fx;
-
-        {
-            const int64_t t_end_cpu = ggml_cycles();
-            GGML_PRINT_DEBUG("time iter:      %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
-            UNUSED(t_end_cpu);
-
-            const int64_t t_end_wall = ggml_time_us();
-            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
-            UNUSED(t_end_wall);
-        }
-    }
-
-    return GGML_OPT_RESULT_DID_NOT_CONVERGE;
-}
-
-//
-// L-BFGS
-//
-// the L-BFGS implementation below is based on the following implementation:
-//
-//   https://github.com/chokkan/liblbfgs
-//
-
-struct ggml_lbfgs_iteration_data {
-    float alpha;
-    float ys;
-    float * s;
-    float * y;
-};
-
-static enum ggml_opt_result linesearch_backtracking(
-        const struct ggml_opt_params * params,
-        int nx,
-        float * x,
-        float * fx,
-        float * g,
-        float * d,
-        float * step,
-        const float * xp,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gb,
-        struct ggml_cplan  * cplan,
-        const int np,
-        struct ggml_tensor * ps[],
-        bool * cancel,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    int count = 0;
-
-    float width  = 0.0f;
-    float dg     = 0.0f;
-    float finit  = 0.0f;
-    float dginit = 0.0f;
-    float dgtest = 0.0f;
-
-    const float dec = 0.5f;
-    const float inc = 2.1f;
-
-    const int n_accum = MAX(1, params->n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    if (*step <= 0.f) {
-        return GGML_LINESEARCH_INVALID_PARAMETERS;
-    }
-
-    // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
-
-    // make sure that d points to a descent direction
-    if (0 < dginit) {
-        return GGML_LINESEARCH_FAIL;
-    }
-
-    // initialize local variables
-    finit = *fx;
-    dgtest = params->lbfgs.ftol*dginit;
-
-    while (true) {
-        ggml_vec_cpy_f32(nx, x, xp);
-        ggml_vec_mad_f32(nx, x, d, *step);
-
-        // evaluate the function and gradient values
-        {
-            ggml_opt_set_params(np, ps, x);
-
-            *fx = 0;
-            memset(g, 0, sizeof(float)*nx);
-            for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-                if (callback) {
-                    // LBFG-S does not support learning rate -> ignore learning schedule
-                    float sched = 0;
-                    callback(callback_data, accum_step, &sched, cancel);
-                    if (*cancel) {
-                        return GGML_OPT_RESULT_CANCEL;
-                    }
-                }
-                // ggml_graph_reset  (gf);
-                ggml_set_f32      (f->grad, 1.0f);
-                ggml_graph_compute(gb, cplan);
-                ggml_opt_acc_grad(np, ps, g, accum_norm);
-                *fx += ggml_get_f32_1d(f, 0);
-            }
-            *fx *= accum_norm;
-
-        }
-
-        ++count;
-
-        if (*fx > finit + (*step)*dgtest) {
-            width = dec;
-        } else {
-            // Armijo condition is satisfied
-            if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
-                return count;
-            }
-
-            ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
-
-            // check the Wolfe condition
-            if (dg < params->lbfgs.wolfe * dginit) {
-                width = inc;
-            } else {
-                if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
-                    // regular Wolfe conditions
-                    return count;
-                }
-
-                if(dg > -params->lbfgs.wolfe*dginit) {
-                    width = dec;
-                } else {
-                    // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
-                    return count;
-                }
-            }
-        }
-
-        if (*step < params->lbfgs.min_step) {
-            return GGML_LINESEARCH_MINIMUM_STEP;
-        }
-        if (*step > params->lbfgs.max_step) {
-            return GGML_LINESEARCH_MAXIMUM_STEP;
-        }
-        if (params->lbfgs.max_linesearch <= count) {
-            return GGML_LINESEARCH_MAXIMUM_ITERATIONS;
-        }
-
-        (*step) *= width;
-    }
-
-    GGML_ASSERT(false && "line search failed");
-
-    return GGML_LINESEARCH_FAIL;
-}
-
-static enum ggml_opt_result ggml_opt_lbfgs(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
-        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
-        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
-            return GGML_OPT_RESULT_INVALID_WOLFE;
-        }
-    }
-
-    const int m = params.lbfgs.m;
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) {
-        int iter = opt->iter;
-        ggml_opt_init(ctx, opt, params, nx);
-        opt->iter = iter;
-    }
-
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    float * x  = opt->lbfgs.x->data;  // current parameters
-    float * xp = opt->lbfgs.xp->data; // previous parameters
-    float * g  = opt->lbfgs.g->data;  // current gradient
-    float * gp = opt->lbfgs.gp->data; // previous gradient
-    float * d  = opt->lbfgs.d->data;  // search direction
-
-    float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
-
-    const int n_accum = MAX(1, params.n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    float fx    = 0.0f; // cost function value
-    float xnorm = 0.0f; // ||x||
-    float gnorm = 0.0f; // ||g||
-
-    // initialize x from the graph nodes
-    ggml_opt_get_params(np, ps, x);
-
-    // the L-BFGS memory
-    float * lm_alpha = opt->lbfgs.lmal->data;
-    float * lm_ys    = opt->lbfgs.lmys->data;
-    float * lm_s     = opt->lbfgs.lms->data;
-    float * lm_y     = opt->lbfgs.lmy->data;
-
-    bool cancel = false;
-
-    // evaluate the function value and its gradient
-    {
-        ggml_opt_set_params(np, ps, x);
-
-        fx = 0;
-        memset(g, 0, sizeof(float)*nx);
-        for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-            if (callback) {
-                // LBFG-S does not support learning rate -> ignore learning schedule
-                float sched = 0;
-                callback(callback_data, accum_step, &sched, &cancel);
-                if (cancel) {
-                    return GGML_OPT_RESULT_CANCEL;
-                }
-            }
-            // ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
-            ggml_opt_acc_grad(np, ps, g, accum_norm);
-            fx += ggml_get_f32_1d(f, 0);
-        }
-        fx *= accum_norm;
-
-        opt->loss_before = fx;
-        opt->loss_after  = fx;
-    }
-
-    // search direction = -gradient
-    ggml_vec_neg_f32(nx, d, g);
-
-    // ||x||, ||g||
-    ggml_vec_norm_f32(nx, &xnorm, x);
-    ggml_vec_norm_f32(nx, &gnorm, g);
-
-    if (xnorm < 1.0f) {
-        xnorm = 1.0f;
-    }
-
-    // already optimized
-    if (gnorm/xnorm <= params.lbfgs.eps) {
-        return GGML_OPT_RESULT_OK;
-    }
-
-    if (opt->just_initialized) {
-        if (pf) {
-            pf[0] = fx;
-        }
-        opt->lbfgs.fx_best = fx;
-
-        // initial step
-        ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d);
-        opt->lbfgs.j                = 0;
-        opt->lbfgs.k                = 1;
-        opt->lbfgs.end              = 0;
-        opt->lbfgs.n_no_improvement = 0;
-        opt->just_initialized       = false;
-    }
-
-    float * fx_best        = &opt->lbfgs.fx_best;
-    float * step           = &opt->lbfgs.step;
-    int * j                = &opt->lbfgs.j;
-    int * k                = &opt->lbfgs.k;
-    int * end              = &opt->lbfgs.end;
-    int * n_no_improvement = &opt->lbfgs.n_no_improvement;
-
-    int ls     = 0;
-    int bound  = 0;
-
-    float ys   = 0.0f;
-    float yy   = 0.0f;
-    float beta = 0.0f;
-
-    int it = 0;
-
-    while (true) {
-        // store the current position and gradient vectors
-        ggml_vec_cpy_f32(nx, xp, x);
-        ggml_vec_cpy_f32(nx, gp, g);
-
-        // TODO: instead of passing &cancel here, use the return code of the linesearch
-        //       to determine if the optimization should be cancelled
-        //       this is a simple change, but not doing this atm, since I don't have a nice
-        //       way to test and don't want to break something with so many changes lined up
-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
-        if (cancel) {
-            return GGML_OPT_RESULT_CANCEL;
-        }
-
-        if (ls < 0) {
-            // linesearch failed - go back to the previous point and return
-            ggml_vec_cpy_f32(nx, x, xp);
-            ggml_vec_cpy_f32(nx, g, gp);
-
-            return ls;
-        }
-
-        opt->loss_after = fx;
-
-        ggml_vec_norm_f32(nx, &xnorm, x);
-        ggml_vec_norm_f32(nx, &gnorm, g);
-
-        GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0));
-
-        if (xnorm < 1.0f) {
-            xnorm = 1.0f;
-        }
-        if (gnorm/xnorm <= params.lbfgs.eps) {
-            // converged
-            return GGML_OPT_RESULT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= k[0]) {
-                const float rate = (pf[k[0]%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-
-            pf[k[0]%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx < fx_best[0]) {
-                fx_best[0] = fx;
-                n_no_improvement[0] = 0;
-            } else {
-                n_no_improvement[0]++;
-
-                if (n_no_improvement[0] >= params.max_no_improvement) {
-                    return GGML_OPT_RESULT_OK;
-                }
-            }
-        }
-
-        if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
-            // reached the maximum number of iterations
-            return GGML_OPT_RESULT_DID_NOT_CONVERGE;
-        }
-
-        // update vectors s and y:
-        //   s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
-        //   y_{k+1} = g_{k+1} - g_{k}.
-        //
-        ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp);
-        ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp);
-
-        // compute scalars ys and yy:
-        //     ys = y^t \cdot s    -> 1 / \rho.
-        //     yy = y^t \cdot y.
-        //
-        ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
-        ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
-
-        lm_ys[end[0]] = ys;
-
-        // find new search direction
-        //   ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS
-
-        bound = (m <= k[0]) ? m : k[0];
-        k[0]++;
-        it++;
-        end[0] = (end[0] + 1)%m;
-
-        // initialize search direction with -g
-        ggml_vec_neg_f32(nx, d, g);
-
-        j[0] = end[0];
-        for (int i = 0; i < bound; ++i) {
-            j[0] = (j[0] + m - 1) % m;
-            // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
-            lm_alpha[j[0]] /= lm_ys[j[0]];
-            // q_{i} = q_{i+1} - \alpha_{i} y_{i}
-            ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
-        }
-
-        ggml_vec_scale_f32(nx, d, ys/yy);
-
-        for (int i = 0; i < bound; ++i) {
-            // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
-            beta /= lm_ys[j[0]];
-            // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
-            ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
-            j[0] = (j[0] + 1)%m;
-        }
-
-        step[0] = 1.0;
-    }
-
-    GGML_ASSERT(false && "lbfgs failed");
-
-    return GGML_OPT_RESULT_DID_NOT_CONVERGE;
-}
-
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
-    struct ggml_opt_params result;
-
-    switch (type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                result = (struct ggml_opt_params) {
-                    .type       = GGML_OPT_TYPE_ADAM,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads  = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
-                    .past       = 0,
-                    .delta      = 1e-5f,
-
-                    .max_no_improvement = 100,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .adam = {
-                        .n_iter = 10000,
-                        .sched  = 1.000f,
-                        .decay  = 0.0f,
-                        .decay_min_ndim = 2,
-                        .alpha  = 0.001f,
-                        .beta1  = 0.9f,
-                        .beta2  = 0.999f,
-                        .eps    = 1e-8f,
-                        .eps_f  = 1e-5f,
-                        .eps_g  = 1e-3f,
-                        .gclip  = 0.0f,
-                    },
-                };
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                result = (struct ggml_opt_params) {
-                    .type       = GGML_OPT_TYPE_LBFGS,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads  = 1,
-                    .past       = 0,
-                    .delta      = 1e-5f,
-
-                    .max_no_improvement = 0,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .lbfgs = {
-                        .m              = 6,
-                        .n_iter         = 100,
-                        .max_linesearch = 20,
-
-                        .eps      = 1e-5f,
-                        .ftol     = 1e-4f,
-                        .wolfe    = 0.9f,
-                        .min_step = 1e-20f,
-                        .max_step = 1e+20f,
-
-                        .linesearch = GGML_LINESEARCH_DEFAULT,
-                    },
-                };
-            } break;
-    }
-
-    return result;
-}
-
-GGML_API void ggml_opt_init(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        int64_t nx) {
-    opt->ctx = ctx;
-    opt->params = params;
-    opt->iter = 0;
-    opt->nx = nx;
-    opt->just_initialized = true;
-    if (opt->ctx == NULL) {
-        struct ggml_init_params ctx_opt_params;
-        if (opt->params.type == GGML_OPT_TYPE_ADAM) {
-            ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
-            if (opt->params.past > 0) {
-                ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
-            }
-        } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
-            ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
-            if (opt->params.past > 0) {
-                ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
-            }
-        }
-        ctx_opt_params.mem_buffer = NULL;
-        ctx_opt_params.no_alloc   = false;
-
-        opt->ctx = ggml_init(ctx_opt_params);
-    }
-    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                opt->adam.g  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.m  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.v  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.pf = params.past > 0
-                    ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
-                    : NULL;
-                ggml_set_zero(opt->adam.m);
-                ggml_set_zero(opt->adam.v);
-                if (opt->adam.pf) {
-                    ggml_set_zero(opt->adam.pf);
-                }
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                opt->lbfgs.x  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.g  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.d  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.pf = params.past > 0
-                    ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
-                    : NULL;
-                opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
-                opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
-                opt->lbfgs.lms  = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
-                opt->lbfgs.lmy  = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
-                ggml_set_zero(opt->lbfgs.x);
-                ggml_set_zero(opt->lbfgs.xp);
-                ggml_set_zero(opt->lbfgs.g);
-                ggml_set_zero(opt->lbfgs.gp);
-                ggml_set_zero(opt->lbfgs.d);
-                if (opt->lbfgs.pf) {
-                    ggml_set_zero(opt->lbfgs.pf);
-                }
-                ggml_set_zero(opt->lbfgs.lmal);
-                ggml_set_zero(opt->lbfgs.lmys);
-                ggml_set_zero(opt->lbfgs.lms);
-                ggml_set_zero(opt->lbfgs.lmy);
-            } break;
-    }
-}
-
-enum ggml_opt_result ggml_opt(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f) {
-    bool free_ctx = false;
-    if (ctx == NULL) {
-        struct ggml_init_params params_ctx = {
-            .mem_size   = 16*1024*1024,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
-
-        ctx = ggml_init(params_ctx);
-        if (ctx == NULL) {
-            return GGML_OPT_RESULT_NO_CONTEXT;
-        }
-
-        free_ctx = true;
-    }
-
-    enum ggml_opt_result result = GGML_OPT_RESULT_OK;
-
-    struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
-
-    ggml_opt_init(ctx, opt, params, 0);
-    result = ggml_opt_resume(ctx, opt, f);
-
-    if (free_ctx) {
-        ggml_free(ctx);
-    }
-
-    return result;
-}
-
-enum ggml_opt_result ggml_opt_resume(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_tensor * f) {
-
-    // build forward + backward compute graphs
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
-    ggml_build_forward_expand(gf, f);
-
-    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
-    ggml_build_backward_expand(ctx, gf, gb, true);
-
-    return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
-}
-
-enum ggml_opt_result ggml_opt_resume_g(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-
-    // build forward + backward compute graphs
-    enum ggml_opt_result result = GGML_OPT_RESULT_OK;
-
-    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
-            {
-                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
-            } break;
-        case GGML_OPT_TYPE_LBFGS:
-            {
-                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
-            } break;
-    }
-
-    if (opt->params.print_forward_graph) {
-        ggml_graph_print   (gf);
-        ggml_graph_dump_dot(gf, NULL, "opt-forward.dot");
-    }
-
-    if (opt->params.print_backward_graph) {
-        ggml_graph_print   (gb);
-        ggml_graph_dump_dot(gb, gf, "opt-backward.dot");
-    }
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_set_input(struct ggml_tensor * tensor) {
-    tensor->flags |= GGML_TENSOR_FLAG_INPUT;
-}
-
-void ggml_set_output(struct ggml_tensor * tensor) {
-    tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_quantize_init(enum ggml_type type) {
-    ggml_critical_section_start();
-
-    switch (type) {
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_IQ1_S:   iq2xs_init_impl(type); break;
-        case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
-        case GGML_TYPE_IQ3_S:   iq3xs_init_impl(512); break;
-        default: // nothing
-            break;
-    }
-
-    ggml_critical_section_end();
-}
-
-void ggml_quantize_free(void) {
-    ggml_critical_section_start();
-
-    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
-    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
-    iq2xs_free_impl(GGML_TYPE_IQ1_S);
-    iq3xs_free_impl(256);
-
-    ggml_critical_section_end();
-}
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_0 == 0);
-    const int nb = k / QK4_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
-
-        quantize_row_q4_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK4_0; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_0*sizeof(block_q4_0));
-}
-
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_1 == 0);
-    const int nb = k / QK4_1;
-
-    for (int b = 0; b < n; b += k) {
-        block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
-
-        quantize_row_q4_1_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK4_1; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_1*sizeof(block_q4_1));
-}
-
-size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK5_0 == 0);
-    const int nb = k / QK5_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
-
-        quantize_row_q5_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int j = 0; j < QK5_0; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK5_0*sizeof(block_q5_0));
-}
-
-size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK5_1 == 0);
-    const int nb = k / QK5_1;
-
-    for (int b = 0; b < n; b += k) {
-        block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
-
-        quantize_row_q5_1_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int j = 0; j < QK5_1; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK5_1*sizeof(block_q5_1));
-}
-
-size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
-
-        quantize_row_q8_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK8_0; ++j) {
-                const int8_t vi = y[i].qs[j];
-
-                hist[vi/16 + 8]++;
-            }
-        }
-    }
-
-    return (n/QK8_0*sizeof(block_q8_0));
-}
-
-bool ggml_quantize_requires_imatrix(enum ggml_type type) {
-    return
-        type == GGML_TYPE_IQ2_XXS ||
-        type == GGML_TYPE_IQ2_XS  ||
-        type == GGML_TYPE_IQ1_S;
-}
-
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
-        int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
-    ggml_quantize_init(type); // this is noop if already initialized
-    size_t result = 0;
-    int n = nrows * n_per_row;
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            {
-                GGML_ASSERT(start % QK4_0 == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q4_1:
-            {
-                GGML_ASSERT(start % QK4_1 == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q5_0:
-            {
-                GGML_ASSERT(start % QK5_0 == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q5_1:
-            {
-                GGML_ASSERT(start % QK5_1 == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q8_0:
-            {
-                GGML_ASSERT(start % QK8_0 == 0);
-                block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
-                result = ggml_quantize_q8_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q2_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q3_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q4_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q5_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_Q6_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ2_XXS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                GGML_ASSERT(imatrix);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ2_XS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                GGML_ASSERT(imatrix);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ3_XXS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ3_S:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ2_S:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ1_S:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-        case GGML_TYPE_IQ4_NL:
-#if QK_K == 64
-        case GGML_TYPE_IQ4_XS:
-#endif
-            {
-                GGML_ASSERT(start % QK4_NL == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-#if QK_K != 64
-        case GGML_TYPE_IQ4_XS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                GGML_ASSERT(start % n_per_row == 0);
-                size_t start_row = start / n_per_row;
-                size_t row_size = ggml_row_size(type, n_per_row);
-                result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
-                GGML_ASSERT(result == row_size * nrows);
-            } break;
-#endif
-        case GGML_TYPE_F16:
-            {
-                size_t elemsize = sizeof(ggml_fp16_t);
-                ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
-                result = n * elemsize;
-            } break;
-        case GGML_TYPE_F32:
-            {
-                size_t elemsize = sizeof(float);
-                result = n * elemsize;
-                memcpy((uint8_t *)dst + start * elemsize, src + start, result);
-            } break;
-        default:
-            assert(false);
-    }
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct gguf_str {
-    uint64_t n;  // GGUFv2
-    char * data;
-};
-
-static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
-    [GGUF_TYPE_INT8]    = sizeof(int8_t),
-    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
-    [GGUF_TYPE_INT16]   = sizeof(int16_t),
-    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
-    [GGUF_TYPE_INT32]   = sizeof(int32_t),
-    [GGUF_TYPE_FLOAT32] = sizeof(float),
-    [GGUF_TYPE_BOOL]    = sizeof(bool),
-    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
-    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
-    [GGUF_TYPE_INT64]   = sizeof(int64_t),
-    [GGUF_TYPE_FLOAT64] = sizeof(double),
-    [GGUF_TYPE_ARRAY]   = 0, // undefined
-};
-static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
-
-static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = "u8",
-    [GGUF_TYPE_INT8]    = "i8",
-    [GGUF_TYPE_UINT16]  = "u16",
-    [GGUF_TYPE_INT16]   = "i16",
-    [GGUF_TYPE_UINT32]  = "u32",
-    [GGUF_TYPE_INT32]   = "i32",
-    [GGUF_TYPE_FLOAT32] = "f32",
-    [GGUF_TYPE_BOOL]    = "bool",
-    [GGUF_TYPE_STRING]  = "str",
-    [GGUF_TYPE_ARRAY]   = "arr",
-    [GGUF_TYPE_UINT64]  = "u64",
-    [GGUF_TYPE_INT64]   = "i64",
-    [GGUF_TYPE_FLOAT64] = "f64",
-};
-static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
-
-union gguf_value {
-    uint8_t  uint8;
-    int8_t   int8;
-    uint16_t uint16;
-    int16_t  int16;
-    uint32_t uint32;
-    int32_t  int32;
-    float    float32;
-    uint64_t uint64;
-    int64_t  int64;
-    double   float64;
-    bool     bool_;
-
-    struct gguf_str str;
-
-    struct {
-        enum gguf_type type;
-
-        uint64_t n;  // GGUFv2
-        void * data;
-    } arr;
-};
-
-struct gguf_kv {
-    struct gguf_str key;
-
-    enum  gguf_type  type;
-    union gguf_value value;
-};
-
-struct gguf_header {
-    char magic[4];
-
-    uint32_t version;
-    uint64_t n_tensors; // GGUFv2
-    uint64_t n_kv;      // GGUFv2
-};
-
-struct gguf_tensor_info {
-    struct gguf_str name;
-
-    uint32_t n_dims;
-    uint64_t ne[GGML_MAX_DIMS];
-
-    enum ggml_type type;
-
-    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
-
-    // for writing API
-    const void * data;
-    size_t size;
-};
-
-struct gguf_context {
-    struct gguf_header header;
-
-    struct gguf_kv          * kv;
-    struct gguf_tensor_info * infos;
-
-    size_t alignment;
-    size_t offset;    // offset of `data` from beginning of file
-    size_t size;      // size of `data` in bytes
-
-    //uint8_t * padding;
-    void * data;
-};
-
-static size_t gguf_type_size(enum gguf_type type) {
-    GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
-    return GGUF_TYPE_SIZE[type];
-}
-
-static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
-    GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
-    GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
-
-    for (uint32_t i = 0; i < info->n_dims; ++i) {
-        GGML_ASSERT(info->ne[i] > 0);
-    }
-
-    // prevent overflow for total number of elements
-    GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
-    GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
-    GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
-}
-
-static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
-    const size_t n = fread(dst, 1, size, file);
-    *offset += n;
-    return n == size;
-}
-
-static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
-    p->n    = 0;
-    p->data = NULL;
-
-    bool ok = true;
-
-    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
-
-    // early exit if string length is invalid, prevents from integer overflow
-    if (p->n == SIZE_MAX) {
-        fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
-        return false;
-    }
-
-    p->data = GGML_CALLOC(p->n + 1, 1);
-
-    ok = ok && gguf_fread_el(file,  p->data, p->n, offset);
-
-    return ok;
-}
-
-struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
-
-    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
-    ctx->header.version   = GGUF_VERSION;
-    ctx->header.n_tensors = 0;
-    ctx->header.n_kv      = 0;
-
-    ctx->kv    = NULL;
-    ctx->infos = NULL;
-
-    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
-    ctx->offset    = 0;
-    ctx->size      = 0;
-
-    ctx->data = NULL;
-
-    return ctx;
-}
-
-struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = fopen(fname, "rb");
-    if (!file) {
-        return NULL;
-    }
-
-    // offset from start of file
-    size_t offset = 0;
-
-    char magic[4];
-
-    // check the magic before making allocations
-    {
-        gguf_fread_el(file, &magic, sizeof(magic), &offset);
-
-        for (uint32_t i = 0; i < sizeof(magic); i++) {
-            if (magic[i] != GGUF_MAGIC[i]) {
-                fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
-                fclose(file);
-                return NULL;
-            }
-        }
-    }
-
-    bool ok = true;
-
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
-
-    // read the header
-    {
-        strncpy(ctx->header.magic, magic, 4);
-
-        ctx->kv    = NULL;
-        ctx->infos = NULL;
-        ctx->data  = NULL;
-
-        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
-
-        if (ctx->header.version == 1) {
-            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
-        }
-
-        // sanity-checks to prevent from integer/buffer overflows
-
-        ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
-        ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
-        ok = ok && (ctx->header.n_kv      < (SIZE_MAX/2)/sizeof(struct gguf_kv));
-
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read header\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
-        }
-    }
-
-    // read the kv pairs
-    {
-        ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
-
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
-
-            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
-            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
-
-            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
-
-            switch (kv->type) {
-                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
-                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
-                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
-                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
-                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
-                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
-                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
-                case GGUF_TYPE_UINT64:  ok = ok && gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
-                case GGUF_TYPE_INT64:   ok = ok && gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
-                case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
-                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
-                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
-                case GGUF_TYPE_ARRAY:
-                    {
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
-
-                        switch (kv->value.arr.type) {
-                            case GGUF_TYPE_UINT8:
-                            case GGUF_TYPE_INT8:
-                            case GGUF_TYPE_UINT16:
-                            case GGUF_TYPE_INT16:
-                            case GGUF_TYPE_UINT32:
-                            case GGUF_TYPE_INT32:
-                            case GGUF_TYPE_FLOAT32:
-                            case GGUF_TYPE_UINT64:
-                            case GGUF_TYPE_INT64:
-                            case GGUF_TYPE_FLOAT64:
-                            case GGUF_TYPE_BOOL:
-                                {
-                                    // prevent from integer overflow in the malloc below
-                                    if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
-                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
-                                        gguf_free(ctx);
-                                        return NULL;
-                                    }
-
-                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
-
-                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
-                                } break;
-                            case GGUF_TYPE_STRING:
-                                {
-                                    // prevent from integer overflow in the malloc below
-                                    if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
-                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
-                                        gguf_free(ctx);
-                                        return NULL;
-                                    }
-
-                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
-
-                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
-                                    }
-                                } break;
-                            case GGUF_TYPE_ARRAY:
-                            default: GGML_ASSERT(false && "invalid type"); break;
-                        }
-                    } break;
-                default: GGML_ASSERT(false && "invalid type");
-            }
-
-            if (!ok) {
-                break;
-            }
-        }
-
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
-        }
-    }
-
-    // read the tensor infos
-    {
-        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
-
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                info->ne[j] = 1;
-            }
-
-            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
-            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
-
-            ok = ok && (info->n_dims <= GGML_MAX_DIMS);
-
-            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
-            }
-
-            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
-            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
-
-            gguf_tensor_info_sanitize(info);
-
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
-                gguf_free(ctx);
-                return NULL;
-            }
-        }
-    }
-
-    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
-
-    int alignment_idx = gguf_find_key(ctx, "general.alignment");
-    if (alignment_idx != -1) {
-        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
-    }
-
-    // we require the data section to be aligned, so take into account any padding
-    {
-        const size_t offset_pad = offset % ctx->alignment;
-
-        if (offset_pad != 0) {
-            offset += ctx->alignment - offset_pad;
-            fseek(file, offset, SEEK_SET);
-        }
-    }
-
-    // store the current file offset - this is where the data section starts
-    ctx->offset = offset;
-
-    // compute the total size of the data section, taking into account the alignment
-    {
-        ctx->size = 0;
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            const int64_t ne =
-                (int64_t) info->ne[0] *
-                (int64_t) info->ne[1] *
-                (int64_t) info->ne[2] *
-                (int64_t) info->ne[3];
-
-            if (ne % ggml_blck_size(info->type) != 0) {
-                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                        __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
-                fclose(file);
-                gguf_free(ctx);
-                return NULL;
-            }
-
-            const size_t size_cur = ggml_row_size(info->type, ne);
-
-            ctx->size += GGML_PAD(size_cur, ctx->alignment);
-        }
-    }
-
-    // load the tensor data only if requested
-    if (params.ctx != NULL) {
-        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
-        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
-        // the ggml_tensor structs to the appropriate locations in the binary blob
-
-        // compute the exact size needed for the new ggml_context
-        const size_t mem_size =
-            params.no_alloc ?
-            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
-            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
-
-        struct ggml_init_params pdata = {
-            .mem_size   = mem_size,
-            .mem_buffer = NULL,
-            .no_alloc   = params.no_alloc,
-        };
-
-        *params.ctx = ggml_init(pdata);
-
-        struct ggml_context * ctx_data = *params.ctx;
-
-        struct ggml_tensor * data = NULL;
-
-        if (!params.no_alloc) {
-            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
-
-            ok = ok && data != NULL;
-
-            // read the binary blob with the tensor data
-            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
-
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
-                fclose(file);
-                ggml_free(ctx_data);
-                gguf_free(ctx);
-                return NULL;
-            }
-
-            ctx->data = data->data;
-        }
-
-        ggml_set_no_alloc(ctx_data, true);
-
-        // create the tensors
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            const int64_t ne[GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
-            };
-
-            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
-
-            ok = ok && cur != NULL;
-
-            ggml_set_name(cur, ctx->infos[i].name.data);
-
-            if (!ok) {
-                break;
-            }
-
-            // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (!params.no_alloc) {
-              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
-                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
-            }
-        }
-
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
-            fclose(file);
-            ggml_free(ctx_data);
-            gguf_free(ctx);
-            return NULL;
-        }
-
-        ggml_set_no_alloc(ctx_data, params.no_alloc);
-    }
-
-    fclose(file);
-
-    return ctx;
-}
-
-void gguf_free(struct gguf_context * ctx) {
-    if (ctx == NULL) {
-        return;
-    }
-
-    if (ctx->kv) {
-        // free string memory - not great..
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
-        }
-
-        GGML_FREE(ctx->kv);
-    }
-
-    if (ctx->infos) {
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            if (info->name.data) {
-                GGML_FREE(info->name.data);
-            }
-        }
-
-        GGML_FREE(ctx->infos);
-    }
-
-    GGML_ALIGNED_FREE(ctx);
-}
-
-const char * gguf_type_name(enum gguf_type type) {
-    return GGUF_TYPE_NAME[type];
-}
-
-int gguf_get_version(const struct gguf_context * ctx) {
-    return ctx->header.version;
-}
-
-size_t gguf_get_alignment(const struct gguf_context * ctx) {
-    return ctx->alignment;
-}
-
-size_t gguf_get_data_offset(const struct gguf_context * ctx) {
-    return ctx->offset;
-}
-
-void * gguf_get_data(const struct gguf_context * ctx) {
-    return ctx->data;
-}
-
-int gguf_get_n_kv(const struct gguf_context * ctx) {
-    return ctx->header.n_kv;
-}
-
-int gguf_find_key(const struct gguf_context * ctx, const char * key) {
-    // return -1 if key not found
-    int keyfound = -1;
-
-    const int n_kv = gguf_get_n_kv(ctx);
-
-    for (int i = 0; i < n_kv; ++i) {
-        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
-            keyfound = i;
-            break;
-        }
-    }
-
-    return keyfound;
-}
-
-const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    return ctx->kv[key_id].key.data;
-}
-
-enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    return ctx->kv[key_id].type;
-}
-
-enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.type;
-}
-
-const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.data;
-}
-
-const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    struct gguf_kv * kv = &ctx->kv[key_id];
-    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
-    return str->data;
-}
-
-int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.n;
-}
-
-uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
-    return ctx->kv[key_id].value.uint8;
-}
-
-int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
-    return ctx->kv[key_id].value.int8;
-}
-
-uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
-    return ctx->kv[key_id].value.uint16;
-}
-
-int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
-    return ctx->kv[key_id].value.int16;
-}
-
-uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
-    return ctx->kv[key_id].value.uint32;
-}
-
-int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
-    return ctx->kv[key_id].value.int32;
-}
-
-float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
-    return ctx->kv[key_id].value.float32;
-}
-
-uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
-    return ctx->kv[key_id].value.uint64;
-}
-
-int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
-    return ctx->kv[key_id].value.int64;
-}
-
-double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
-    return ctx->kv[key_id].value.float64;
-}
-
-bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
-    return ctx->kv[key_id].value.bool_;
-}
-
-const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
-    return ctx->kv[key_id].value.str.data;
-}
-
-const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
-    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
-    return &ctx->kv[key_id].value;
-}
-
-int gguf_get_n_tensors(const struct gguf_context * ctx) {
-    return ctx->header.n_tensors;
-}
-
-int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
-    // return -1 if tensor not found
-    int tensorfound = -1;
-
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
-            tensorfound = i;
-            break;
-        }
-    }
-
-    return tensorfound;
-}
-
-size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].offset;
-}
-
-char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].name.data;
-}
-
-enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].type;
-}
-
-// returns the index
-static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
-    const int idx = gguf_find_key(ctx, key);
-    if (idx >= 0) {
-        return idx;
-    }
-
-    const int n_kv = gguf_get_n_kv(ctx);
-
-    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
-    ctx->kv[n_kv].key.n    = strlen(key);
-    ctx->kv[n_kv].key.data = strdup(key);
-    ctx->header.n_kv++;
-
-    return n_kv;
-}
-
-void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
-    ctx->kv[idx].value.uint8 = val;
-}
-
-void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type       = GGUF_TYPE_INT8;
-    ctx->kv[idx].value.int8 = val;
-}
-
-void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
-    ctx->kv[idx].value.uint16 = val;
-}
-
-void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_INT16;
-    ctx->kv[idx].value.int16 = val;
-}
-
-void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
-    ctx->kv[idx].value.uint32 = val;
-}
-
-void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_INT32;
-    ctx->kv[idx].value.int32 = val;
-}
-
-void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
-    ctx->kv[idx].value.float32 = val;
-}
-
-void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type         = GGUF_TYPE_UINT64;
-    ctx->kv[idx].value.uint64 = val;
-}
-
-void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_INT64;
-    ctx->kv[idx].value.int64 = val;
-}
-
-void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type          = GGUF_TYPE_FLOAT64;
-    ctx->kv[idx].value.float64 = val;
-}
-
-void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
-    ctx->kv[idx].value.bool_ = val;
-}
-
-void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type           = GGUF_TYPE_STRING;
-    ctx->kv[idx].value.str.n    = strlen(val);
-    ctx->kv[idx].value.str.data = strdup(val);
-}
-
-void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
-    ctx->kv[idx].value.arr.type = type;
-    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
-    memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
-}
-
-void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
-    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
-    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
-    for (int i = 0; i < n; i++) {
-        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
-        str->n    = strlen(data[i]);
-        str->data = strdup(data[i]);
-    }
-}
-
-// set or add KV pairs from another context
-void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
-    for (uint32_t i = 0; i < src->header.n_kv; i++) {
-        switch (src->kv[i].type) {
-            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
-            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
-            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
-            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
-            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
-            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
-            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
-            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
-            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
-            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
-            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
-            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
-            case GGUF_TYPE_ARRAY:
-                {
-                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
-                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
-                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
-                        }
-                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
-                        GGML_FREE((void *)data);
-                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
-                        GGML_ASSERT(false && "nested arrays not supported");
-                    } else {
-                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
-                    }
-                } break;
-            default: GGML_ASSERT(false && "invalid type"); break;
-        }
-    }
-}
-
-void gguf_add_tensor(
-             struct gguf_context * ctx,
-        const struct ggml_tensor * tensor) {
-    const int idx = ctx->header.n_tensors;
-    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
-
-    ctx->infos[idx].name.n    = strlen(tensor->name);
-    ctx->infos[idx].name.data = strdup(tensor->name);
-
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        ctx->infos[idx].ne[i] = 1;
-    }
-
-    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
-    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
-        ctx->infos[idx].ne[i] = tensor->ne[i];
-    }
-
-    ctx->infos[idx].type   = tensor->type;
-    ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = tensor->data;
-    ctx->infos[idx].size   = ggml_nbytes(tensor);
-
-    if (ctx->header.n_tensors > 0) {
-        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
-    }
-
-    ctx->header.n_tensors++;
-}
-
-void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
-    const int idx = gguf_find_tensor(ctx, name);
-    if (idx < 0) {
-        GGML_ASSERT(false && "tensor not found");
-    }
-
-    ctx->infos[idx].type = type;
-}
-
-void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
-    const int idx = gguf_find_tensor(ctx, name);
-    if (idx < 0) {
-        GGML_ASSERT(false && "tensor not found");
-    }
-
-    ctx->infos[idx].data = data;
-    ctx->infos[idx].size = size;
-
-    // update offsets
-    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
-        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
-    }
-}
-
-//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-//    fwrite(&val->n,   sizeof(val->n),    1, file);
-//    fwrite(val->data, sizeof(char), val->n, file);
-//}
-//
-//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-//    fwrite(val, sizeof(char), size, file);
-//}
-
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-
-static struct gguf_buf gguf_buf_init(size_t size) {
-    struct gguf_buf buf = {
-        /*buf.data   =*/ size == 0 ? NULL : GGML_MALLOC(size),
-        /*buf.size   =*/ size,
-        /*buf.offset =*/ 0,
-    };
-
-    return buf;
-}
-
-static void gguf_buf_free(struct gguf_buf buf) {
-    if (buf.data) {
-        GGML_FREE(buf.data);
-    }
-}
-
-static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
-    if (buf->offset + size > buf->size) {
-        buf->size = 1.5*(buf->offset + size);
-        if (buf->data) {
-            buf->data = realloc(buf->data, buf->size);
-        }
-    }
-}
-
-static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
-    gguf_buf_grow(buf, sizeof(val->n) + val->n);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
-    }
-    buf->offset += sizeof(val->n);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, val->data, val->n);
-    }
-    buf->offset += val->n;
-}
-
-static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
-    gguf_buf_grow(buf, el_size);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, val, el_size);
-    }
-    buf->offset += el_size;
-}
-
-static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
-    // write header
-    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
-
-    // write key-value pairs
-    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
-        struct gguf_kv * kv = &ctx->kv[i];
-
-        gguf_bwrite_str(buf, &kv->key);
-        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
-
-        switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_bwrite_el( buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
-            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
-            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
-            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
-            case GGUF_TYPE_ARRAY:
-                {
-                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
-
-                    switch (kv->value.arr.type) {
-                        case GGUF_TYPE_UINT8:
-                        case GGUF_TYPE_INT8:
-                        case GGUF_TYPE_UINT16:
-                        case GGUF_TYPE_INT16:
-                        case GGUF_TYPE_UINT32:
-                        case GGUF_TYPE_INT32:
-                        case GGUF_TYPE_FLOAT32:
-                        case GGUF_TYPE_UINT64:
-                        case GGUF_TYPE_INT64:
-                        case GGUF_TYPE_FLOAT64:
-                        case GGUF_TYPE_BOOL:
-                            {
-                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
-                            } break;
-                        case GGUF_TYPE_STRING:
-                            {
-                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
-                                }
-                            } break;
-                        case GGUF_TYPE_ARRAY:
-                        default: GGML_ASSERT(false && "invalid type"); break;
-                    }
-                } break;
-            default: GGML_ASSERT(false && "invalid type");
-        }
-    }
-
-    // write tensor infos
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
-
-        gguf_bwrite_str(buf, &info->name);
-        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
-        for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
-        }
-        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
-        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
-    }
-
-    // we require the data section to be aligned, so take into account any padding
-    {
-        const size_t offset     = buf->offset;
-        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
-
-        if (offset_pad != offset) {
-            uint8_t pad = 0;
-            for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
-        }
-    }
-
-    if (only_meta) {
-        return;
-    }
-
-    size_t offset = 0;
-
-    // write tensor data
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
-
-        const size_t size     = info->size;
-        const size_t size_pad = GGML_PAD(size, ctx->alignment);
-
-        gguf_bwrite_el(buf, info->data, size);
-
-        if (size_pad != size) {
-            uint8_t pad = 0;
-            for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
-        }
-
-        GGML_ASSERT(offset == info->offset);
-
-        offset += size_pad;
-    }
-}
-
-void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-    FILE * file = fopen(fname, "wb");
-    if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
-    }
-
-    struct gguf_buf buf = gguf_buf_init(16*1024);
-
-    gguf_write_to_buf(ctx, &buf, only_meta);
-
-    fwrite(buf.data, 1, buf.offset, file);
-
-    gguf_buf_free(buf);
-
-    fclose(file);
-}
-
-size_t gguf_get_meta_size(const struct gguf_context * ctx) {
-    // no allocs - only compute size
-    struct gguf_buf buf = gguf_buf_init(0);
-
-    gguf_write_to_buf(ctx, &buf, true);
-
-    return buf.offset;
-}
-
-void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
-    struct gguf_buf buf = gguf_buf_init(16*1024);
-
-    gguf_write_to_buf(ctx, &buf, true);
-
-    memcpy(data, buf.data, buf.offset);
-
-    gguf_buf_free(buf);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-int ggml_cpu_has_avx(void) {
-#if defined(__AVX__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx_vnni(void) {
-#if defined(__AVXVNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx2(void) {
-#if defined(__AVX2__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512(void) {
-#if defined(__AVX512F__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vbmi(void) {
-#if defined(__AVX512VBMI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vnni(void) {
-#if defined(__AVX512VNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fma(void) {
-#if defined(__FMA__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_arm_fma(void) {
-#if defined(__ARM_FEATURE_FMA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_metal(void) {
-#if defined(GGML_USE_METAL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_f16c(void) {
-#if defined(__F16C__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fp16_va(void) {
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_wasm_simd(void) {
-#if defined(__wasm_simd128__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_clblast(void) {
-#if defined(GGML_USE_CLBLAST)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_vulkan(void) {
-#if defined(GGML_USE_VULKAN)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_kompute(void) {
-#if defined(GGML_USE_KOMPUTE)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_sycl(void) {
-#if defined(GGML_USE_SYCL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
-           ggml_cpu_has_sycl();
-}
-
-int ggml_cpu_has_sse3(void) {
-#if defined(__SSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_ssse3(void) {
-#if defined(__SSSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_vsx(void) {
-#if defined(__POWER9_VECTOR__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/ggml/.gitignore b/ggml/.gitignore
new file mode 100644
index 000000000..c82d8e692
--- /dev/null
+++ b/ggml/.gitignore
@@ -0,0 +1,2 @@
+src/ggml-vulkan-shaders.hpp
+src/ggml-vulkan-shaders.cpp
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
new file mode 100644
index 000000000..75b5ea3b4
--- /dev/null
+++ b/ggml/CMakeLists.txt
@@ -0,0 +1,343 @@
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+project("ggml" C CXX)
+include(CheckIncludeFileCXX)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(GGML_STANDALONE ON)
+
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+    # configure project version
+    # TODO
+else()
+    set(GGML_STANDALONE OFF)
+endif()
+
+if (EMSCRIPTEN)
+    set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
+else()
+    if (MINGW)
+        set(BUILD_SHARED_LIBS_DEFAULT OFF)
+    else()
+        set(BUILD_SHARED_LIBS_DEFAULT ON)
+    endif()
+endif()
+
+# remove the lib prefix on win32 mingw
+if (WIN32)
+    set(CMAKE_STATIC_LIBRARY_PREFIX "")
+    set(CMAKE_SHARED_LIBRARY_PREFIX "")
+    set(CMAKE_SHARED_MODULE_PREFIX  "")
+endif()
+
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+
+#
+# option list
+#
+
+# TODO: mark all options as advanced when not GGML_STANDALONE
+
+if (APPLE)
+    set(GGML_METAL_DEFAULT ON)
+    set(GGML_BLAS_DEFAULT ON)
+    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
+else()
+    set(GGML_METAL_DEFAULT OFF)
+    set(GGML_BLAS_DEFAULT OFF)
+    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
+endif()
+
+if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
+    message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
+
+# defaults
+if (NOT GGML_LLAMAFILE_DEFAULT)
+    set(GGML_LLAMAFILE_DEFAULT OFF)
+endif()
+
+if (NOT GGML_CUDA_GRAPHS_DEFAULT)
+    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
+endif()
+
+# general
+option(GGML_STATIC "ggml: static link libraries"                     OFF)
+option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
+option(GGML_LTO    "ggml: enable link time optimization"             OFF)
+option(GGML_CCACHE "ggml: use ccache if available"                   ON)
+
+# debug
+option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
+option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
+
+# build
+option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
+
+# sanitizers
+option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
+option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
+option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+# instruction set specific
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
+option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
+option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
+option(GGML_AVX512           "ggml: enable AVX512F"          OFF)
+option(GGML_AVX512_VBMI      "ggml: enable AVX512-VBMI"      OFF)
+option(GGML_AVX512_VNNI      "ggml: enable AVX512-VNNI"      OFF)
+option(GGML_AVX512_BF16      "ggml: enable AVX512-BF16"      OFF)
+if (NOT MSVC)
+    # in MSVC F16C and FMA is implied with AVX2/AVX512
+    option(GGML_FMA          "ggml: enable FMA"              ${INS_ENB})
+    option(GGML_F16C         "ggml: enable F16C"             ${INS_ENB})
+    # MSVC does not seem to support AMX
+    option(GGML_AMX_TILE     "ggml: enable AMX-TILE"         OFF)
+    option(GGML_AMX_INT8     "ggml: enable AMX-INT8"         OFF)
+    option(GGML_AMX_BF16     "ggml: enable AMX-BF16"         OFF)
+endif()
+option(GGML_LASX             "ggml: enable lasx"             ON)
+option(GGML_LSX              "ggml: enable lsx"              ON)
+option(GGML_RVV              "ggml: enable rvv"              ON)
+
+option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
+set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+
+
+if (WIN32)
+    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
+endif()
+
+# ggml core
+set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
+
+# 3rd party libs / backends
+option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
+option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
+set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
+                                            "ggml: BLAS library vendor")
+option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             ${GGML_LLAMAFILE_DEFAULT})
+
+option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
+option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
+option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
+option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
+option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
+set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+                                            "ggml: max. batch size for using peer access")
+option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
+option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
+option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
+option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
+
+option(GGML_HIP                             "ggml: use HIP"                                   OFF)
+option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
+option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
+option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
+option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
+option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
+option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
+option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
+option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
+option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
+option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
+option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
+option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
+option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
+option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
+option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
+option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
+set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+                                            "ggml: metal minimum macOS version")
+set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
+option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
+option(GGML_RPC                             "ggml: use RPC"                                   OFF)
+option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
+option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
+set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
+                                            "ggml: sycl target device")
+set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
+                                            "ggml: sycl device architecture")
+
+option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF)
+option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
+option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
+option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
+
+# toolchain for vulkan-shaders-gen
+set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
+
+# extra artifacts
+option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
+option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+#
+# dependencies
+#
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED true)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+find_package(Threads REQUIRED)
+
+#
+# build the library
+#
+
+add_subdirectory(src)
+
+#
+# tests and examples
+#
+
+if (GGML_BUILD_TESTS)
+    enable_testing()
+    add_subdirectory(tests)
+endif ()
+
+if (GGML_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif ()
+
+#
+# install
+#
+
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+# all public headers
+set(GGML_PUBLIC_HEADERS
+    include/ggml.h
+    include/ggml-cpu.h
+    include/ggml-alloc.h
+    include/ggml-backend.h
+    include/ggml-blas.h
+    include/ggml-cann.h
+    include/ggml-cuda.h
+    include/ggml-kompute.h
+    include/ggml-opt.h
+    include/ggml-metal.h
+    include/ggml-rpc.h
+    include/ggml-sycl.h
+    include/ggml-vulkan.h
+    include/gguf.h)
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+#if (GGML_METAL)
+#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
+#endif()
+install(TARGETS ggml LIBRARY PUBLIC_HEADER)
+install(TARGETS ggml-base LIBRARY)
+
+if (GGML_STANDALONE)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        @ONLY)
+
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        DESTINATION share/pkgconfig)
+endif()
+
+#
+# Create CMake package
+#
+
+# Generate version info based on git commit.
+
+if(NOT DEFINED GGML_BUILD_NUMBER)
+    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
+    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_NUMBER
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(GGML_BUILD_NUMBER EQUAL 1)
+        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
+    endif()
+
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+endif()
+
+
+# Capture variables prefixed with GGML_.
+
+set(variable_set_statements
+"
+####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() #######
+####### Any changes to this file will be overwritten by the next CMake run        #######
+
+")
+
+set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
+
+get_cmake_property(all_variables VARIABLES)
+foreach(variable_name IN LISTS all_variables)
+    if(variable_name MATCHES "^GGML_")
+        string(REPLACE ";" "\\;"
+               variable_value "${${variable_name}}")
+
+        set(variable_set_statements
+            "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
+    endif()
+endforeach()
+
+set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
+
+# Create the CMake package and set install location.
+
+set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
+set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
+set(GGML_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
+set(GGML_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
+    PATH_VARS GGML_INCLUDE_INSTALL_DIR
+              GGML_LIB_INSTALL_DIR
+              GGML_BIN_INSTALL_DIR)
+
+write_basic_package_version_file(
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+    VERSION ${GGML_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in
new file mode 100644
index 000000000..bf39f9c00
--- /dev/null
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -0,0 +1,147 @@
+
+@GGML_VARIABLES_EXPANDED@
+
+@PACKAGE_INIT@
+
+set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
+set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
+set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+
+find_package(Threads REQUIRED)
+
+find_library(GGML_LIBRARY ggml
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_LIBRARY}")
+
+find_library(GGML_BASE_LIBRARY ggml-base
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml-base UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml-base
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
+
+if (NOT GGML_SHARED_LIB)
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
+    endif()
+
+    if (GGML_OPENMP)
+        find_package(OpenMP REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
+    endif()
+
+    if (GGML_BLAS)
+        find_package(BLAS REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
+    endif()
+
+    if (GGML_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    endif()
+
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+        find_library(METAL_FRAMEWORK    Metal REQUIRED)
+        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+
+        list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
+                    ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()
+
+    if (GGML_VULKAN)
+        find_package(Vulkan REQUIRED)
+        list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
+    endif()
+
+    if (GGML_HIP)
+        find_package(hip     REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(rocblas REQUIRED)
+        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_package(IntelSYCL REQUIRED)
+            find_package(MKL       REQUIRED)
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
+endif()
+
+set(_ggml_all_targets "")
+foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+    string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
+    string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
+
+    find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+        REQUIRED
+        HINTS ${GGML_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH)
+
+    message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
+
+    add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
+    set_target_properties(ggml::${_ggml_backend}
+        PROPERTIES
+            INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
+            IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+            IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
+            INTERFACE_COMPILE_FEATURES c_std_90
+            POSITION_INDEPENDENT_CODE ON)
+
+    string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+    if(is_cpu_variant)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+           PROPERTIES
+               INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
+
+        if(GGML_CPU_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
+        endif()
+
+    else()
+        list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+
+        if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+        endif()
+    endif()
+
+    list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+endforeach()
+
+add_library(ggml::all INTERFACE IMPORTED)
+set_target_properties(ggml::all
+    PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
+
+check_required_components(ggml)
diff --git a/ggml-alloc.h b/ggml/include/ggml-alloc.h
similarity index 77%
rename from ggml-alloc.h
rename to ggml/include/ggml-alloc.h
index 1d9085d15..23600eea9 100644
--- a/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -7,20 +7,24 @@ extern "C" {
 #endif
 
 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;
 
 // Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
 /*
   Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
 
     // optional: create a worst-case graph and reserve the buffers to avoid reallocations
     ggml_gallocr_reserve(galloc, build_graph(max_batch));
@@ -50,7 +54,11 @@ GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
 
 // automatic reallocation if the topology changes when using a single buffer
 // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
new file mode 100644
index 000000000..fc9571c82
--- /dev/null
+++ b/ggml/include/ggml-backend.h
@@ -0,0 +1,354 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend_event * ggml_backend_event_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+    typedef struct ggml_backend_reg * ggml_backend_reg_t;
+    typedef struct ggml_backend_device * ggml_backend_dev_t;
+
+
+    //
+    // Backend buffer type
+    //
+
+    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+    //
+    // Backend buffer
+    //
+
+    enum ggml_backend_buffer_usage {
+        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
+    };
+
+    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    //
+    // Backend (stream)
+    //
+
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    // "offset" refers to the offset in tensor->data for setting/getting data
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    // NOTE: will be removed, use device version instead
+    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // asynchronous copy
+    // the copy is performed after all the currently queued operations in backend_src
+    // backend_dst will wait for the copy to complete before performing other operations
+    // automatic fallback to sync copy if async is not supported
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
+
+    //
+    // Events
+    //
+
+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
+
+    //
+    // Backend device
+    //
+
+    enum ggml_backend_dev_type {
+        // CPU device using system memory
+        GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
+    };
+
+    // functionality supported by the device
+    struct ggml_backend_dev_caps {
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
+        // event synchronization
+        bool events;
+    };
+
+    // all the device properties
+    struct ggml_backend_dev_props {
+        const char * name;
+        const char * description;
+        size_t memory_free;
+        size_t memory_total;
+        enum ggml_backend_dev_type type;
+        struct ggml_backend_dev_caps caps;
+    };
+
+    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
+    GGML_API const char *                  ggml_backend_dev_description(ggml_backend_dev_t device);
+    GGML_API void                          ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+    GGML_API enum ggml_backend_dev_type    ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+    GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+    GGML_API bool                          ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool                          ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool                          ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+
+    //
+    // Backend (reg)
+    //
+
+    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void                         (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature {
+        const char * name;
+        const char * value;
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
+
+    //
+    // Backend registry
+    //
+
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
+    // Backend (reg) enumeration
+    GGML_API size_t             ggml_backend_reg_count(void);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+    // Device enumeration
+    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+    // Direct backend (stream) initialization
+    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
+    GGML_API ggml_backend_t ggml_backend_init_best(void);
+
+    // Load a backend from a dynamic library and register it
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Unload a backend if loaded dynamically and unregister it
+    GGML_API void               ggml_backend_unload(ggml_backend_reg_t reg);
+    // Load all known backends from dynamic libraries
+    GGML_API void               ggml_backend_load_all(void);
+    GGML_API void               ggml_backend_load_all_from_path(const char * dir_path);
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backend devices to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+        // preferrably to run on the same backend as the buffer
+        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+
+        // initialize buffers from a max size graph (optional)
+        reserve_graph = build_graph(sched, max_batch_size);
+
+        // manually assign nodes to a backend (optional, should not be needed in most cases)
+        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
+
+        ggml_backend_sched_reserve(sched, reserve_graph);
+
+        // compute
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
+
+        // if there are graph inputs:
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
+    }
+    */
+
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
+
+    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
+    // Get the number of splits of the last graph
+    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
+
+    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+
+    // Allocate and compute graph on the backend scheduler
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
+
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
+    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy {
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph;
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
+
+    // CPU buffer types are always available
+    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h
new file mode 100644
index 000000000..87a81b363
--- /dev/null
+++ b/ggml/include/ggml-blas.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
+
+// number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h
new file mode 100644
index 000000000..b469e228d
--- /dev/null
+++ b/ggml/include/ggml-cann.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Maximum number of CANN devices supported.
+ */
+#define GGML_CANN_MAX_DEVICES 16
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+
+/**
+ * @brief Initializes the CANN backend for a specified device.
+ *
+ * This function initializes the CANN backend for the given device.
+ * It verifies the device index, allocates a context, and creates a backend
+ * instance.
+ *
+ * @param device The index of the device to initialize.
+ * @return A pointer to the initialized backend instance, or nullptr on failure.
+ */
+GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+
+/**
+ * @brief Checks if a given backend is a CANN backend.
+ *
+ * This function verifies if the provided backend is a CANN backend by comparing
+ * its GUID with the CANN backend's GUID.
+ *
+ * @param backend The backend instance to check.
+ * @return True if the backend is a CANN backend, false otherwise.
+ */
+GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
+
+/**
+ * @brief Retrieves the CANN buffer type for a specified device.
+ *
+ * This function initializes and returns the buffer type interface associated
+ * with the given device. It ensures thread-safe access using a mutex.
+ *
+ * @param device The device index for which to retrieve the buffer type.
+ * @return A pointer to the buffer type interface for the specified device, or
+ * nullptr if the device index is out of range.
+ */
+GGML_BACKEND_API ggml_backend_buffer_type_t
+ggml_backend_cann_buffer_type(int32_t device);
+
+/**
+ * @brief Retrieves the number of CANN devices available.
+ *
+ * This function returns the number of CANN devices available based on
+ * information obtained from `ggml_cann_info()`.
+ *
+ * @return The number of CANN devices available.
+ */
+GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
+
+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+
+/**
+ * @brief Retrieves the description of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the SoC name,
+ * and writes it into the provided description buffer.
+ *
+ * @param device The device index to retrieve the description for.
+ * @param description Pointer to a buffer where the description will be written.
+ * @param description_size Size of the description buffer.
+ */
+GGML_BACKEND_API void ggml_backend_cann_get_device_description(
+    int32_t device, char* description, size_t description_size);
+
+/**
+ * @brief Retrieves the memory information of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the free and total
+ * memory information of the specified type (ACL_HBM_MEM), and stores them
+ * in the provided pointers.
+ *
+ * @param device The device index to retrieve memory information for.
+ * @param free Pointer to a variable where the free memory size will be stored.
+ * @param total Pointer to a variable where the total memory size will be
+ * stored.
+ */
+GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                  size_t* free,
+                                                  size_t* total);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-cpp.h b/ggml/include/ggml-cpp.h
new file mode 100644
index 000000000..a12342c25
--- /dev/null
+++ b/ggml/include/ggml-cpp.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "gguf.h"
+#include <memory>
+
+// Smart pointers for ggml types
+
+// ggml
+
+struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
+struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
+
+typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
+typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
+
+// ggml-alloc
+
+struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
+
+typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+
+// ggml-backend
+
+struct ggml_backend_deleter        { void operator()(ggml_backend_t backend)       { ggml_backend_free(backend); } };
+struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
+struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t event)   { ggml_backend_event_free(event); } };
+struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t sched)   { ggml_backend_sched_free(sched); } };
+
+typedef std::unique_ptr<ggml_backend,        ggml_backend_deleter>        ggml_backend_ptr;
+typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
+typedef std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>  ggml_backend_event_ptr;
+typedef std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>  ggml_backend_sched_ptr;
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
new file mode 100644
index 000000000..3aa71badb
--- /dev/null
+++ b/ggml/include/ggml-cpu.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+        struct ggml_threadpool * threadpool;
+
+        // abort ggml_graph_compute when true
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
+    };
+
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
+    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
+    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                    struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
+    //
+    // system info
+    //
+
+    // x86
+    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
+    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
+    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
+    // ARM
+    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
+    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
+    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
+    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
+    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
+    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
+    // other
+    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
+
+    // Internal types and functions exposed for tests and benchmarks
+
+    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                       const void * GGML_RESTRICT y, size_t by, int nrc);
+
+    struct ggml_type_traits_cpu {
+        ggml_from_float_t        from_float;
+        ggml_vec_dot_t           vec_dot;
+        enum ggml_type           vec_dot_type;
+        int64_t                  nrows; // number of rows to process simultaneously
+    };
+
+    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+
+    GGML_BACKEND_API void ggml_cpu_init(void);
+
+    //
+    // CPU backend
+    //
+
+    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
new file mode 100644
index 000000000..22ad2c009
--- /dev/null
+++ b/ggml/include/ggml-cuda.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifdef GGML_USE_HIP
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+#define GGML_CUDA_MAX_DEVICES       16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+
+GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml-kompute.h b/ggml/include/ggml-kompute.h
similarity index 70%
rename from ggml-kompute.h
rename to ggml/include/ggml-kompute.h
index 171465456..154aa56a7 100644
--- a/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -11,6 +11,8 @@
 extern "C" {
 #endif
 
+#define GGML_KOMPUTE_MAX_DEVICES 16
+
 struct ggml_vk_device {
     int index;
     int type; // same as VkPhysicalDeviceType
@@ -35,11 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 
-GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
 
-GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
 
 #ifdef __cplusplus
 }
diff --git a/ggml-metal.h b/ggml/include/ggml-metal.h
similarity index 62%
rename from ggml-metal.h
rename to ggml/include/ggml-metal.h
index a5c542189..669c1f84a 100644
--- a/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -1,7 +1,9 @@
+// Note: this description is outdated
+//
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
 //
 // How it works?
 //
@@ -25,9 +27,6 @@
 #include <stddef.h>
 #include <stdbool.h>
 
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-
 struct ggml_tensor;
 struct ggml_cgraph;
 
@@ -40,27 +39,28 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
 
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
+GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+GGML_DEPRECATED(
+        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+        "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
 
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
 
 // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
 
 #ifdef __cplusplus
 }
 #endif
-
diff --git a/ggml/include/ggml-opencl.h b/ggml/include/ggml-opencl.h
new file mode 100644
index 000000000..6b6177135
--- /dev/null
+++ b/ggml/include/ggml-opencl.h
@@ -0,0 +1,26 @@
+#ifndef GGML_OPENCL_H
+#define GGML_OPENCL_H
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+//
+GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
+GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif // GGML_OPENCL_H
diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h
new file mode 100644
index 000000000..eb5eab9de
--- /dev/null
+++ b/ggml/include/ggml-opt.h
@@ -0,0 +1,216 @@
+// This file contains functionality for training models using GGML.
+// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
+// At the bottom of this file especially there are relatively high-level functions that are suitable use or adaptation in user code.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdint.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    struct ggml_opt_dataset;
+    struct ggml_opt_context;
+    struct ggml_opt_result;
+
+    typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
+    typedef struct ggml_opt_context * ggml_opt_context_t;
+    typedef struct ggml_opt_result  * ggml_opt_result_t;
+
+    // ====== Loss ======
+
+    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
+    enum ggml_opt_loss_type {
+        GGML_OPT_LOSS_TYPE_MEAN,
+        GGML_OPT_LOSS_TYPE_SUM,
+        GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
+        GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+    };
+
+    // ====== Dataset ======
+
+    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
+            int64_t ne_datapoint, // number of elements per datapoint
+            int64_t ne_label,     // number of elements per label
+            int64_t ndata,        // total number of datapoints/labels
+            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
+
+    // get underlying tensors that store the data
+    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
+    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label,     ndata]
+
+    // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
+    GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
+
+    // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
+    GGML_API void ggml_opt_dataset_get_batch(
+            ggml_opt_dataset_t   dataset,
+            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
+            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
+            int64_t              ibatch);
+
+    // ====== Model / Context ======
+
+    enum ggml_opt_build_type {
+        GGML_OPT_BUILD_TYPE_FORWARD,
+        GGML_OPT_BUILD_TYPE_GRAD,
+        GGML_OPT_BUILD_TYPE_OPT,
+    };
+
+    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
+    struct ggml_opt_optimizer_params {
+        // AdamW optimizer parameters
+        struct {
+            float alpha; // learning rate
+            float beta1;
+            float beta2;
+            float eps;   // epsilon for numerical stability
+            float wd;    // weight decay for AdamW, use 0.0f to disable
+        } adamw;
+    };
+
+    // callback to calculate optimizer parameters prior to a backward pass
+    // userdata can be used to pass arbitrary data
+    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
+
+    // returns the default optimizer params (constant)
+    // userdata is not used
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
+
+    // parameters for initializing a new optimization context
+    struct ggml_opt_params {
+        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
+
+        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
+
+        // the forward graph is defined by inputs and outputs
+        // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
+        struct ggml_tensor * inputs;
+        struct ggml_tensor * outputs;
+
+        enum ggml_opt_loss_type  loss_type;
+        enum ggml_opt_build_type build_type;
+
+        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+    };
+
+    // get parameters for an optimization context with defaults set where possible
+    // parameters for which no sensible defaults exist are supplied as arguments to this function
+    GGML_API ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t      backend_sched,
+            struct ggml_context     * ctx_compute,
+            struct ggml_tensor      * inputs,
+            struct ggml_tensor      * outputs,
+            enum ggml_opt_loss_type   loss_type);
+
+    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
+    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
+
+    // set gradients to zero, initilize loss, and optionally reset the optimizer
+    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
+
+    // get underlying tensors that store data
+    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
+    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
+    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
+    GGML_API struct ggml_tensor * ggml_opt_loss(    ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
+    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
+    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
+
+    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
+
+    // ====== Optimization Result ======
+
+    GGML_API ggml_opt_result_t ggml_opt_result_init();
+    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
+    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
+
+    // get data from result, uncertainties are optional and can be ignored by passing NULL
+    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                  // writes 1 value, number of datapoints
+    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double  * loss,     double * unc); // writes 1 value
+    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                   // writes ndata values
+    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double  * accuracy, double * unc); // writes 1 value
+
+    // ====== Computation ======
+
+    // do forward pass, increment result if not NULL
+    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // do forward pass, increment result if not NULL, do backward pass
+    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // ############################################################################
+    // ## The high-level functions start here. They do not depend on any private ##
+    // ## functions or structs and can be copied to and adapted for user code.   ##
+    // ############################################################################
+
+    // ====== Intended Usage ======
+    //
+    // 1. Select the appropriate loss for your problem.
+    // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
+    //    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
+    // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
+    //    The first context should contain the model parameters and inputs and be allocated statically in user code.
+    //    The second context should contain all other tensors and will be (re)allocated automatically.
+    //    Due to this automated allocation the data of the second context is not defined when accessed in user code.
+    //    Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
+    // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
+
+    // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
+    typedef void (*ggml_opt_epoch_callback)(
+            bool               train,       // true after training evaluation, false after validation evaluation
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,      // result associated with the dataset subsection
+            int64_t            ibatch,      // number of batches that have been evaluated so far
+            int64_t            ibatch_max,  // total number of batches in this dataset subsection
+            int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started
+
+    // do training on front of dataset, do evaluation only on back of dataset
+    GGML_API void ggml_opt_epoch(
+            ggml_opt_context_t      opt_ctx,
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,   // result to increment during training, ignored if NULL
+            ggml_opt_result_t       result_eval,    // result to increment during evaluation, ignored if NULL
+            int64_t                 idata_split,    // data index at which to split training and evaluation
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    // callback that prints a progress bar on stderr
+    GGML_API void ggml_opt_epoch_callback_progress_bar(
+            bool               train,
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,
+            int64_t            ibatch,
+            int64_t            ibatch_max,
+            int64_t            t_start_us);
+
+    // fit model defined by inputs and outputs to dataset
+    GGML_API void ggml_opt_fit(
+            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
+            ggml_context                  * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
+            ggml_tensor                   * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
+            ggml_tensor                   * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
+            enum ggml_opt_loss_type         loss_type,      // loss to minimize
+            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
+            int64_t                         nepoch,         // how many times the dataset should be iterated over
+            int64_t                         nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
+            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
+            bool                            silent);        // whether or not info prints to stderr should be suppressed
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
new file mode 100644
index 000000000..ade6c3b0e
--- /dev/null
+++ b/ggml/include/ggml-rpc.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_RPC_MAX_SERVERS       16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+
+GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
new file mode 100644
index 000000000..5ce349a88
--- /dev/null
+++ b/ggml/include/ggml-sycl.h
@@ -0,0 +1,49 @@
+//
+//  MIT license
+//  Copyright (C) 2024 Intel Corporation
+//  SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#define GGML_SYCL_NAME "SYCL"
+#define GGML_SYCL_MAX_DEVICES 48
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+
+// devide buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+
+GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
+                                                       char *description,
+                                                       size_t description_size);
+GGML_BACKEND_API int  ggml_backend_sycl_get_device_count();
+GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+
+// SYCL doesn't support registering host memory, keep here for reference
+// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h
new file mode 100644
index 000000000..ed5ea5f79
--- /dev/null
+++ b/ggml/include/ggml-vulkan.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_VK_NAME "Vulkan"
+#define GGML_VK_MAX_DEVICES 16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+
+GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml.h b/ggml/include/ggml.h
similarity index 63%
rename from ggml.h
rename to ggml/include/ggml.h
index 0a6d3c051..5bd8d9c8b 100644
--- a/ggml.h
+++ b/ggml/include/ggml.h
@@ -176,25 +176,15 @@
 #ifdef GGML_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
+#            define GGML_API __declspec(dllexport) extern
 #        else
-#            define GGML_API __declspec(dllimport)
+#            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default")))
+#        define GGML_API __attribute__ ((visibility ("default"))) extern
 #    endif
 #else
-#    define GGML_API
-#endif
-
-#ifdef GGML_MULTIPLATFORM
-#    if defined(_WIN32)
-#        define GGML_CALL
-#    else
-#        define GGML_CALL __attribute__((__ms_abi__))
-#    endif
-#else
-#    define GGML_CALL
+#    define GGML_API extern
 #endif
 
 // TODO: support for clang
@@ -214,26 +204,30 @@
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
-#include <stdint.h>
-#include <stddef.h>
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
 
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
+#define GGML_FILE_VERSION 2
 
 #define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
-#ifndef GGML_MAX_NAME
-#define GGML_MAX_NAME           64
-#endif
+#define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
+
+#ifndef GGML_MAX_NAME
+#   define GGML_MAX_NAME        64
+#endif
+
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
+
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
 #else
@@ -243,36 +237,35 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
+#define GGML_ROPE_TYPE_NEOX   2
+#define GGML_ROPE_TYPE_MROPE  8
+#define GGML_ROPE_TYPE_VISION 24
 
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#   define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
-#define GGML_UNREACHABLE() __builtin_unreachable()
+#   define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
-#define GGML_UNREACHABLE() __assume(0)
+#   define GGML_UNREACHABLE() __assume(0)
 #else
-#define GGML_UNREACHABLE() ((void) 0)
+#   define GGML_UNREACHABLE() ((void) 0)
 #endif
 
+#ifdef __cplusplus
+#   define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#   define GGML_NORETURN __declspec(noreturn)
+#else
+#   define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@@ -311,40 +304,67 @@
     GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
     GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
+#define GGML_TENSOR_BINARY_OP_LOCALS01 \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
+    enum ggml_status {
+        GGML_STATUS_ALLOC_FAILED = -2,
+        GGML_STATUS_FAILED = -1,
+        GGML_STATUS_SUCCESS = 0,
+        GGML_STATUS_ABORTED = 1,
+    };
+
+    // get ggml_status name string
+    GGML_API const char * ggml_status_to_string(enum ggml_status status);
+
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
 
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
+    struct ggml_cgraph;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
+        GGML_TYPE_F32     = 0,
+        GGML_TYPE_F16     = 1,
+        GGML_TYPE_Q4_0    = 2,
+        GGML_TYPE_Q4_1    = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
+        // GGML_TYPE_Q4_3 = 5, support has been removed
+        GGML_TYPE_Q5_0    = 6,
+        GGML_TYPE_Q5_1    = 7,
+        GGML_TYPE_Q8_0    = 8,
+        GGML_TYPE_Q8_1    = 9,
+        GGML_TYPE_Q2_K    = 10,
+        GGML_TYPE_Q3_K    = 11,
+        GGML_TYPE_Q4_K    = 12,
+        GGML_TYPE_Q5_K    = 13,
+        GGML_TYPE_Q6_K    = 14,
+        GGML_TYPE_Q8_K    = 15,
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
@@ -353,10 +373,22 @@ extern "C" {
         GGML_TYPE_IQ3_S   = 21,
         GGML_TYPE_IQ2_S   = 22,
         GGML_TYPE_IQ4_XS  = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_I8      = 24,
+        GGML_TYPE_I16     = 25,
+        GGML_TYPE_I32     = 26,
+        GGML_TYPE_I64     = 27,
+        GGML_TYPE_F64     = 28,
+        GGML_TYPE_IQ1_M   = 29,
+        GGML_TYPE_BF16    = 30,
+        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // GGML_TYPE_Q4_0_4_8 = 32,
+        // GGML_TYPE_Q4_0_8_8 = 33,
+        GGML_TYPE_TQ1_0   = 34,
+        GGML_TYPE_TQ2_0   = 35,
+        // GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_8 = 37,
+        // GGML_TYPE_IQ4_NL_8_8 = 38,
+        GGML_TYPE_COUNT   = 39,
     };
 
     // precision
@@ -365,28 +397,22 @@ extern "C" {
         GGML_PREC_F32,
     };
 
-    enum ggml_backend_type {
-        GGML_BACKEND_TYPE_CPU = 0,
-        GGML_BACKEND_TYPE_GPU = 10,
-        GGML_BACKEND_TYPE_GPU_SPLIT = 20,
-    };
-
     // model file types
     enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_UNKNOWN        = -1,
+        GGML_FTYPE_ALL_F32        = 0,
+        GGML_FTYPE_MOSTLY_F16     = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0    = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1    = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0    = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0    = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1    = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K    = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K    = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K    = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K    = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K    = 14, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
@@ -395,6 +421,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ3_S   = 20, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -411,10 +439,13 @@ extern "C" {
         GGML_OP_SQR,
         GGML_OP_SQRT,
         GGML_OP_LOG,
+        GGML_OP_SIN,
+        GGML_OP_COS,
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
         GGML_OP_ARGMAX,
+        GGML_OP_COUNT_EQUAL,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_CONCAT,
@@ -445,25 +476,32 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
+        GGML_OP_IM2COL_BACK,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
+        GGML_OP_POOL_2D_BACK,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
+        GGML_OP_PAD_REFLECT_1D,
+        GGML_OP_ARANGE,
+        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
-        GGML_OP_FLASH_ATTN,
-        GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_SSM_CONV,
+        GGML_OP_SSM_SCAN,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
         GGML_OP_ADD_REL_POS,
+        GGML_OP_RWKV_WKV6,
+        GGML_OP_GATED_LINEAR_ATTN,
 
         GGML_OP_UNARY,
 
@@ -480,6 +518,7 @@ extern "C" {
 
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+        GGML_OP_OPT_STEP_ADAMW,
 
         GGML_OP_COUNT,
     };
@@ -492,11 +531,13 @@ extern "C" {
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
         GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
         GGML_UNARY_OP_HARDSWISH,
         GGML_UNARY_OP_HARDSIGMOID,
+        GGML_UNARY_OP_EXP,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -508,36 +549,32 @@ extern "C" {
     };
 
     enum ggml_log_level {
-        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_NONE  = 0,
+        GGML_LOG_LEVEL_DEBUG = 1,
+        GGML_LOG_LEVEL_INFO  = 2,
         GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4,
-        GGML_LOG_LEVEL_DEBUG = 5
+        GGML_LOG_LEVEL_ERROR = 4,
+        GGML_LOG_LEVEL_CONT  = 5, // continue previous log
     };
 
+    // this tensor...
     enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  = 1,
-        GGML_TENSOR_FLAG_OUTPUT = 2,
-        GGML_TENSOR_FLAG_PARAM  = 4,
+        GGML_TENSOR_FLAG_INPUT  =  1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT =  2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_PARAM  =  4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
+    struct ggml_init_params {
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
     };
 
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type         type;
-        enum ggml_backend_type backend;
+        enum ggml_type type;
 
         struct ggml_backend_buffer * buffer;
 
@@ -555,14 +592,9 @@ extern "C" {
 
         int32_t flags;
 
-        struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
 
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
+        // source tensor and offset for views
         struct ggml_tensor * view_src;
         size_t               view_offs;
 
@@ -582,95 +614,6 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);
 
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    struct ggml_hash_set {
-        size_t size;
-        struct ggml_tensor ** keys;
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_table;
-
-        enum ggml_cgraph_eval_order order;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-    };
-
-    // scratch buffer
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
-    struct ggml_init_params {
-        // memory pool
-        size_t mem_size;   // bytes
-        void * mem_buffer; // if NULL, memory will be allocated internally
-        bool   no_alloc;   // don't allocate memory for the tensor data
-    };
-
-
-    // compute types
-
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_TYPE_INIT = 0,
-        GGML_TASK_TYPE_COMPUTE,
-        GGML_TASK_TYPE_FINALIZE,
-    };
-
-    struct ggml_compute_params {
-        enum ggml_task_type type;
-
-        // ith = thread index, nth = number of threads
-        int ith, nth;
-
-        // work buffer for all threads
-        size_t wsize;
-        void * wdata;
-    };
-
-    // numa strategies
-    enum ggml_numa_strategy {
-        GGML_NUMA_STRATEGY_DISABLED   = 0,
-        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
-        GGML_NUMA_STRATEGY_ISOLATE    = 2,
-        GGML_NUMA_STRATEGY_NUMACTL    = 3,
-        GGML_NUMA_STRATEGY_MIRROR     = 4,
-        GGML_NUMA_STRATEGY_COUNT
-    };
 
     //
     // GUID
@@ -690,63 +633,71 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);
 
-    GGML_API void    ggml_print_backtrace(void);
-
-    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+    // accepts a UTF-8 path, even on Windows
+    GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
 
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows     (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes    (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
-    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
-    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API int64_t ggml_blck_size(enum ggml_type type);
+    GGML_API size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
 
     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");
 
-    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
-    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
 
-    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
-    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
-    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API bool    ggml_is_quantized(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
-    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_empty     (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
+    GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void                  ggml_free(struct ggml_context * ctx);
+    GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+    GGML_API void                  ggml_reset(struct ggml_context * ctx);
+    GGML_API void                  ggml_free (struct ggml_context * ctx);
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
@@ -786,8 +737,7 @@ extern "C" {
             int64_t ne2,
             int64_t ne3);
 
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+    GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
 
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@@ -797,35 +747,25 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
-    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
     // Converts a flat index into coordinates
-    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+    GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
 
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
     GGML_ATTRIBUTE_FORMAT(2, 3)
     GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
 
+    // Tensor flags
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
+
     //
     // operations on tensors with backpropagation
     //
@@ -940,6 +880,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_sin(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sin_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return scalar
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
@@ -960,6 +916,12 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // count number of equal elements in a and b
+    GGML_API struct ggml_tensor * ggml_count_equal(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
@@ -973,12 +935,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
-    // concat a and b on dim 2
+    // concat a and b along dim
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_concat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            int                   dim);
 
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
@@ -1040,6 +1003,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_sigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -1081,6 +1052,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_exp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_exp_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -1104,16 +1083,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     GGML_API struct ggml_tensor * ggml_group_norm(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_group_norm_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);
 
     // a - x
     // b - dy
@@ -1138,14 +1118,11 @@ extern "C" {
             enum ggml_prec       prec);
 
     // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * as,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);
 
     // A: m columns, n rows,
     // B: p columns, n rows,
@@ -1178,7 +1155,7 @@ extern "C" {
             size_t                nb1,
             size_t                nb2,
             size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1188,19 +1165,19 @@ extern "C" {
             size_t                nb1,
             size_t                nb2,
             size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set_2d(
@@ -1208,7 +1185,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@@ -1216,7 +1193,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
@@ -1351,14 +1328,14 @@ extern "C" {
     // supports 3D: a->ne[2] == b->ne[1]
     GGML_API struct ggml_tensor * ggml_get_rows(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * a,  // data
+            struct ggml_tensor  * b); // row indices
 
     GGML_API struct ggml_tensor * ggml_get_rows_back(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c);
+            struct ggml_tensor  * a,  // gradients of ggml_get_rows result
+            struct ggml_tensor  * b,  // row indices
+            struct ggml_tensor  * c); // data for ggml_get_rows, only used for its shape
 
     GGML_API struct ggml_tensor * ggml_diag(
         struct ggml_context     * ctx,
@@ -1397,33 +1374,34 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+    // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
-    // pos is required when max_bias > 0.0f
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            struct ggml_tensor  * pos,
             float                 scale,
             float                 max_bias);
 
-    GGML_API struct ggml_tensor * ggml_soft_max_back(
+    GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            float                 scale,
+            float                 max_bias);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+    GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            float                 scale,
+            float                 max_bias);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
-    // if mode & 2 == 1, GPT-NeoX style
-    // if mode & 4 == 1, ChatGLM style
+    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
     GGML_API struct ggml_tensor * ggml_rope(
@@ -1431,8 +1409,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             int                   n_dims,
-            int                   mode,
-            int                   n_ctx);
+            int                   mode);
 
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1440,18 +1417,34 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             int                   n_dims,
-            int                   mode,
-            int                   n_ctx);
+            int                   mode);
 
     // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
+    // c is freq factors (e.g. phi3-128k), (optional)
+    GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
+    GGML_API struct ggml_tensor * ggml_rope_multi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[4],
+            int                   mode,
+            int                   n_ctx_orig,
             float                 freq_base,
             float                 freq_scale,
             float                 ext_factor,
@@ -1460,14 +1453,14 @@ extern "C" {
             float                 beta_slow);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+    GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
             int                   n_dims,
             int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
+            int                   n_ctx_orig,
             float                 freq_base,
             float                 freq_scale,
             float                 ext_factor,
@@ -1475,47 +1468,73 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
-    // compute correction dims for YaRN RoPE scaling
-    GGML_CALL void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
-
-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             int                   n_dims,
-            float                 base,
-            bool                  down);
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow),
+        "use ggml_rope_ext instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow),
+        "use ggml_rope_ext_inplace instead");
+
+    // compute correction dims for YaRN RoPE scaling
+    GGML_API void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
-    GGML_API struct ggml_tensor * ggml_rope_back(
+    GGML_API struct ggml_tensor * ggml_rope_ext_back(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * a, // gradients of ggml_rope result
+            struct ggml_tensor  * b, // positions
+            struct ggml_tensor  * c, // freq factors
             int                   n_dims,
             int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
+            int                   n_ctx_orig,
             float                 freq_base,
             float                 freq_scale,
             float                 ext_factor,
             float                 attn_factor,
             float                 beta_fast,
-            float                 beta_slow,
-            float                 xpos_base,
-            bool                  xpos_down);
+            float                 beta_slow);
 
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_rope_multi_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head,
-            float                 bias_max),
-        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[4],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
 
     // clamp
     // in-place, returns view(a)
@@ -1525,34 +1544,38 @@ extern "C" {
             float                 min,
             float                 max);
 
+    // im2col
+    // converts data into a format that effectively results in a convolution when combined with matrix multiplication
     GGML_API struct ggml_tensor * ggml_im2col(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1,
-            bool                 is_2D,
-            enum ggml_type       dst_type);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s0, // stride dimension 0
+            int                   s1, // stride dimension 1
+            int                   p0, // padding dimension 0
+            int                   p1, // padding dimension 1
+            int                   d0, // dilation dimension 0
+            int                   d1, // dilation dimension 1
+            bool                  is_2D,
+            enum ggml_type        dst_type);
 
-    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1);
+    GGML_API struct ggml_tensor * ggml_im2col_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,  // convolution kernel
+        struct ggml_tensor  * b,  // gradient of im2col output
+        int64_t             * ne, // shape of im2col input
+        int                   s0, // stride dimension 0
+        int                   s1, // stride dimension 1
+        int                   p0, // padding dimension 0
+        int                   p1, // padding dimension 1
+        int                   d0, // dilation dimension 0
+        int                   d1, // dilation dimension 1
+        bool                  is_2D);
 
     GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
             int                   s0,  // stride
             int                   p0,  // padding
             int                   d0); // dilation
@@ -1561,30 +1584,46 @@ extern "C" {
     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
     GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s,  // stride
+            int                   d); // dilation
+
+    // depthwise
+    // TODO: this is very likely wrong for some cases! - needs more testing
+    GGML_API struct ggml_tensor * ggml_conv_1d_dw(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
+
+    GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   d0); // dilation
 
     GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   p0,
-            int                   d0);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
     GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1,
-            int                   d0,
-            int                   d1);
-
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
 
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
@@ -1612,6 +1651,18 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // depthwise
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                  s0,  // stride dimension 0
+            int                  s1,  // stride dimension 1
+            int                  p0,  // padding dimension 0
+            int                  p1,  // padding dimension 1
+            int                  d0,  // dilation dimension 0
+            int                  d1); // dilation dimension 1
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1645,13 +1696,37 @@ extern "C" {
             float                 p0,
             float                 p1);
 
+    GGML_API struct ggml_tensor * ggml_pool_2d_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * af, // "a"/input used in forward pass
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            float                 p0,
+            float                 p1);
+
     // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   scale_factor);
 
+    // nearest interpolate
+    // nearest interpolate to specified dimensions
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   ne0,
+            int                   ne1,
+            int                   ne2,
+            int                   ne3);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
@@ -1661,6 +1736,22 @@ extern "C" {
             int                  p2,
             int                  p3);
 
+    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+    GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   p0,
+            int                   p1);
+
+    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+    // timesteps: [N,]
+    // return: [N, dim]
+    GGML_API struct ggml_tensor * ggml_timestep_embedding(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * timesteps,
+            int                   dim,
+            int                   max_period);
+
     // sort rows
     enum ggml_sort_order {
         GGML_SORT_ORDER_ASC,
@@ -1672,19 +1763,43 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_sort_order  order);
 
+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float                 start,
+            float                 stop,
+            float                 step);
+
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   k);
 
-    GGML_API struct ggml_tensor * ggml_flash_attn(
+#define GGML_KQ_MASK_PAD 64
+
+    // q:    [n_embd, n_batch,     n_head,    1]
+    // k:    [n_embd, n_kv,        n_head_kv, 1]
+    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
             struct ggml_tensor  * k,
             struct ggml_tensor  * v,
-            bool                  masked);
+            struct ggml_tensor  * mask,
+            float                 scale,
+            float                 max_bias,
+            float                 logit_softcap);
 
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
+    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+            const struct ggml_tensor * a);
+
+    // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1693,13 +1808,19 @@ extern "C" {
            struct ggml_tensor  * d,
            bool                  masked);
 
-    GGML_API struct ggml_tensor * ggml_flash_ff(
+    GGML_API struct ggml_tensor * ggml_ssm_conv(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
+            struct ggml_tensor  * sx,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_ssm_scan(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * s,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * dt,
+            struct ggml_tensor  * A,
+            struct ggml_tensor  * B,
+            struct ggml_tensor  * C);
 
     // partition into non-overlapping windows with padding if needed
     // example:
@@ -1751,6 +1872,24 @@ extern "C" {
             struct ggml_tensor  * pw,
             struct ggml_tensor  * ph);
 
+    GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * r,
+            struct ggml_tensor  * tf,
+            struct ggml_tensor  * td,
+            struct ggml_tensor  * state);
+
+    GGML_API struct ggml_tensor * ggml_gated_linear_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * g,
+            struct ggml_tensor  * state,
+            float scale);
+
     // custom operators
 
     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1834,7 +1973,8 @@ extern "C" {
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
 
-    #define GGML_N_TASKS_MAX -1
+#define GGML_N_TASKS_MAX (-1)
+    // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
 
     GGML_API struct ggml_tensor * ggml_map_custom1(
             struct ggml_context   * ctx,
@@ -1887,50 +2027,59 @@ extern "C" {
     // loss function
 
     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b);
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // logits
+            struct ggml_tensor  * b); // labels
 
     GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-            struct ggml_tensor          * c);
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // logits
+            struct ggml_tensor  * b,  // labels
+            struct ggml_tensor  * c); // gradients of cross_entropy_loss result
+
+    // AdamW optimizer step
+    // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
+    // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
+    GGML_API struct ggml_tensor * ggml_opt_step_adamw(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * grad,
+            struct ggml_tensor  * m,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * adamw_params); // parameters such a the learning rate
 
     //
     // automatic differentiation
     //
 
-    GGML_API void ggml_set_param(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * tensor);
-
-
-    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_backward_expand(
+        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
+        struct ggml_context * ctx_compute, // context for gradient computation
+        struct ggml_cgraph  * cgraph,
+        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
 
     // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
-    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
+    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
+
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor  (const struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
 
     GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
     GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@@ -1941,197 +2090,14 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
-    // gb_tmp will contain original backward graph with rewritten backward process nodes,
-    // but without the second forward pass nodes.
-    GGML_API void ggml_build_backward_gradient_checkpointing(
-            struct ggml_context   * ctx,
-            struct ggml_cgraph    * gf,
-            struct ggml_cgraph    * gb,
-            struct ggml_cgraph    * gb_tmp,
-            struct ggml_tensor  * * checkpoints,
-            int                     n_checkpoints);
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_TYPE_ADAM,
-        GGML_OPT_TYPE_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_RESULT_OK = 0,
-        GGML_OPT_RESULT_DID_NOT_CONVERGE,
-        GGML_OPT_RESULT_NO_CONTEXT,
-        GGML_OPT_RESULT_INVALID_WOLFE,
-        GGML_OPT_RESULT_FAIL,
-        GGML_OPT_RESULT_CANCEL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
 
-        size_t graph_size;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        int n_gradient_accumulation;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float sched; // schedule multiplier (fixed, decay or warmup)
-            float decay; // weight decay for AdamW, use 0.0f to disable
-            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float eps_f; // epsilon for convergence test
-            float eps_g; // epsilon for convergence test
-            float gclip; // gradient clipping
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    struct ggml_opt_context {
-        struct ggml_context * ctx;
-        struct ggml_opt_params params;
-
-        int iter;
-        int64_t nx; // number of parameter elements
-
-        bool just_initialized;
-
-        float loss_before;
-        float loss_after;
-
-        struct {
-            struct ggml_tensor * g;  // current gradient
-            struct ggml_tensor * m;  // first moment
-            struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * pf; // past function values
-            float fx_best;
-            float fx_prev;
-            int n_no_improvement;
-        } adam;
-
-        struct {
-            struct ggml_tensor * x;    // current parameters
-            struct ggml_tensor * xp;   // previous parameters
-            struct ggml_tensor * g;    // current gradient
-            struct ggml_tensor * gp;   // previous gradient
-            struct ggml_tensor * d;    // search direction
-            struct ggml_tensor * pf;   // past function values
-            struct ggml_tensor * lmal; // the L-BFGS memory alpha
-            struct ggml_tensor * lmys; // the L-BFGS memory ys
-            struct ggml_tensor * lms;  // the L-BFGS memory s
-            struct ggml_tensor * lmy;  // the L-BFGS memory y
-            float fx_best;
-            float step;
-            int j;
-            int k;
-            int end;
-            int n_no_improvement;
-        } lbfgs;
-    };
-
-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    // initialize optimizer context
-    GGML_API void ggml_opt_init(
-            struct ggml_context     * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_opt_params    params,
-            int64_t                   nx);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume_g(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f,
-            struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb,
-            ggml_opt_callback callback,
-            void * callback_data);
-
-    //
-    // tensor flags
-    //
-    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
     //
     // quantization
@@ -2149,207 +2115,78 @@ extern "C" {
     GGML_API void ggml_quantize_init(enum ggml_type type);
     GGML_API void ggml_quantize_free(void);
 
-    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-
     // some quantization type cannot be used without an importance matrix
     GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
 
     // calls ggml_quantize_init internally (i.e. can allocate memory)
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
-            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+    GGML_API size_t ggml_quantize_chunk(
+            enum ggml_type   type,
+               const float * src,
+                      void * dst,
+                   int64_t   start,
+                   int64_t   nrows,
+                   int64_t   n_per_row,
+               const float * imatrix);
 
-    //
-    // gguf
-    //
-
-    enum gguf_type {
-        GGUF_TYPE_UINT8   = 0,
-        GGUF_TYPE_INT8    = 1,
-        GGUF_TYPE_UINT16  = 2,
-        GGUF_TYPE_INT16   = 3,
-        GGUF_TYPE_UINT32  = 4,
-        GGUF_TYPE_INT32   = 5,
-        GGUF_TYPE_FLOAT32 = 6,
-        GGUF_TYPE_BOOL    = 7,
-        GGUF_TYPE_STRING  = 8,
-        GGUF_TYPE_ARRAY   = 9,
-        GGUF_TYPE_UINT64  = 10,
-        GGUF_TYPE_INT64   = 11,
-        GGUF_TYPE_FLOAT64 = 12,
-        GGUF_TYPE_COUNT,       // marks the end of the enum
-    };
-
-    struct gguf_context;
-
-    struct gguf_init_params {
-        bool no_alloc;
-
-        // if not NULL, create a ggml_context and allocate the tensor data in it
-        struct ggml_context ** ctx;
-    };
-
-    GGML_API struct gguf_context * gguf_init_empty(void);
-    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-    GGML_API void gguf_free(struct gguf_context * ctx);
-
-    GGML_API const char * gguf_type_name(enum gguf_type type);
-
-    GGML_API int    gguf_get_version    (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
-    GGML_API void * gguf_get_data       (const struct gguf_context * ctx);
-
-    GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
-    GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
-
-    // will abort if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
-    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
-    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
-
-    // overrides existing values or adds a new one
-    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
-    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
-    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
-    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
-    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
-    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
-    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
-    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
-    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
-    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
-    // set or add KV pairs from another context
-    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
-    // manage tensor info
-    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
-    // writing gguf files can be done in 2 ways:
-    //
-    // - write the entire gguf_context to a binary file in a single pass:
-    //
-    //   gguf_write_to_file(ctx, fname);
-    //
-    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-    //
-    //   FILE * f = fopen(fname, "wb");
-    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
-    //   fwrite(f, ...);
-    //   void * data = gguf_meta_get_meta_data(ctx);
-    //   fseek(f, 0, SEEK_SET);
-    //   fwrite(f, data, gguf_get_meta_size(ctx));
-    //   free(data);
-    //   fclose(f);
-    //
-
-    // write the entire context to a binary file
-    GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx_vnni   (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_metal      (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cublas     (void);
-    GGML_API int ggml_cpu_has_clblast    (void);
-    GGML_API int ggml_cpu_has_vulkan     (void);
-    GGML_API int ggml_cpu_has_kompute    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_sycl       (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-    GGML_API int ggml_cpu_has_matmul_int8(void);
-
-    //
-    // Internal types and functions exposed for tests and benchmarks
-    //
-
-#ifdef  __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
+#ifdef __cplusplus
+    // restrict not standard in C++
+#    if defined(__GNUC__)
+#        define GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define GGML_RESTRICT __restrict
+#    else
+#        define GGML_RESTRICT
+#    endif
 #else
-#define GGML_RESTRICT restrict
+#    define GGML_RESTRICT restrict
 #endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                      const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
 
-    typedef struct {
-        const char      * type_name;
-        int               blck_size;
-        size_t            type_size;
-        bool              is_quantized;
-        ggml_to_float_t   to_float;
-        ggml_from_float_t from_float;
-        ggml_from_float_t from_float_reference;
-        ggml_vec_dot_t    vec_dot;
-        enum ggml_type    vec_dot_type;
-        int64_t           nrows; // number of rows to process simultaneously;
-    } ggml_type_traits_t;
+    struct ggml_type_traits {
+        const char             * type_name;
+        int64_t                  blck_size;
+        int64_t                  blck_size_interleave; // interleave elements in blocks
+        size_t                   type_size;
+        bool                     is_quantized;
+        ggml_to_float_t          to_float;
+        ggml_from_float_t        from_float_ref;
+    };
 
-    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
+
+    // ggml threadpool
+    // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+    // the goal should be to create an API that other backends can use move everything to the ggml base
+
+    // scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
new file mode 100644
index 000000000..79ee20206
--- /dev/null
+++ b/ggml/include/gguf.h
@@ -0,0 +1,202 @@
+// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
+// GGUF files have the following structure:
+//
+// 1. File magic "GGUF" (4 bytes).
+// 2. File version (uint32_t).
+// 3. Number of ggml tensors in file (int64_t).
+// 4. Number of key-value-pairs in file (int64_t).
+// 5. For each KV pair:
+//   1. The key (string).
+//   2. The value type (gguf_type).
+//   3a. If the value type is GGUF_TYPE_ARRAY:
+//     1. The type of the array (gguf_type).
+//     2. The number of elements in the array (uint64_t).
+//     3. The binary representation of each element in the array.
+//   3b. Otherwise:
+//     1. The binary representation of the value.
+// 6. For each ggml tensor:
+//   1. The tensor name (string).
+//   2. The number of dimensions of the tensor (uint32_t).
+//   3. For each dimension:
+//     1. The size of the tensor in the dimension (int64_t).
+//   4. The tensor data type (ggml_type).
+//   5. The tensor data offset in the tensor data binary blob (uint64_t).
+// 7. The tensor data binary blob (optional, aligned).
+//
+// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
+// All enums are stored as int32_t.
+// All bool values are stored as int8_t.
+// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
+//   otherwise GGUF_DEFAULT_ALIGNMENT is used.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#define GGUF_MAGIC   "GGUF"
+#define GGUF_VERSION 3
+
+#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    // types that can be stored as GGUF KV data
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_UINT64  = 10,
+        GGUF_TYPE_INT64   = 11,
+        GGUF_TYPE_FLOAT64 = 12,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API uint32_t gguf_get_version    (const struct gguf_context * ctx);
+    GGML_API size_t   gguf_get_alignment  (const struct gguf_context * ctx);
+    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);
+
+    GGML_API int64_t      gguf_get_n_kv(const struct gguf_context * ctx);
+    GGML_API int64_t      gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
+
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
+
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API size_t       gguf_get_arr_n   (const struct gguf_context * ctx, int64_t key_id);
+
+    // get raw pointer to the first element of the array with the given key_id
+    // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
+
+    // get ith C string from array with given key_id
+    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
+
+    GGML_API int64_t        gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int64_t        gguf_find_tensor      (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API const char *   gguf_get_tensor_name  (const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API size_t         gguf_get_tensor_size  (const struct gguf_context * ctx, int64_t tensor_id);
+
+    // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
+    GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
+
+    // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t      val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t       val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t     val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t      val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t     val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t      val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float        val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t     val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t      val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double       val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool         val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+
+    // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
+
+    // creates a new array with n strings and copies the corresponding strings from data
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
+
+    // add tensor to GGUF context, tensor name must be unique
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+
+    // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
+    //   in such a way that the tensor data remains as one contiguous block (except for padding)
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+
+    // assumes that at least gguf_get_tensor_size bytes can be read from data
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
+
+    // writing gguf files can be done in 3 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
+    //
+    // - write only the meta data to a file, then re-open the file and append the tensor data:
+    //
+    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
+    //   FILE * f = fopen(fname, "ab");
+    //   fwrite(f, ...); // write tensor data
+    //   fclose(f);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   const size_t size_meta = gguf_get_meta_size(ctx);
+    //   fseek(f, size_meta, SEEK_SET);
+    //   fwrite(f, ...); // write tensor data
+    //   void * data = malloc(size_meta);
+    //   gguf_get_meta_data(ctx, data);
+    //   rewind(f);
+    //   fwrite(data, 1, data, f);
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+
+    // writes the meta data to pointer "data"
+    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
new file mode 100644
index 000000000..0002ac18a
--- /dev/null
+++ b/ggml/src/CMakeLists.txt
@@ -0,0 +1,357 @@
+include(CheckCXXCompilerFlag)
+
+add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
+
+# enable libstdc++ assertions for debug builds
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+endif()
+
+if (NOT MSVC)
+    if (GGML_SANITIZE_THREAD)
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (GGML_SANITIZE_ADDRESS)
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (GGML_SANITIZE_UNDEFINED)
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()
+
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
+
+if (GGML_FATAL_WARNINGS)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        list(APPEND C_FLAGS   -Werror)
+        list(APPEND CXX_FLAGS -Werror)
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        add_compile_options(/WX)
+    endif()
+endif()
+
+if (GGML_ALL_WARNINGS)
+    if (NOT MSVC)
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                  -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+        ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+    else()
+        # todo : msvc
+        set(C_FLAGS   "")
+        set(CXX_FLAGS "")
+    endif()
+endif()
+
+if (GGML_LTO)
+    include(CheckIPOSupported)
+    check_ipo_supported(RESULT result OUTPUT output)
+    if (result)
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+    else()
+        message(WARNING "IPO is not supported: ${output}")
+    endif()
+endif()
+
+if (GGML_CCACHE)
+    find_program(GGML_CCACHE_FOUND ccache)
+    find_program(GGML_SCCACHE_FOUND sccache)
+
+    if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND)
+        if(GGML_CCACHE_FOUND)
+            set(GGML_CCACHE_VARIANT ccache)
+        else()
+            set(GGML_CCACHE_VARIANT sccache)
+        endif()
+        # TODO: should not be set globally
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        set(ENV{CCACHE_SLOPPINESS} time_macros)
+        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
+    else()
+        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
+    endif ()
+endif()
+
+# this version of Apple ld64 is buggy
+execute_process(
+    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
+    ERROR_VARIABLE output
+    OUTPUT_QUIET
+)
+
+if (output MATCHES "dyld-1015\.7")
+    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
+endif()
+
+# architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+#       feel free to update the Makefile for your architecture and send a pull request or issue
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
+
+if (NOT MSVC)
+    if (GGML_STATIC)
+        add_link_options(-static)
+        if (MINGW)
+            add_link_options(-static-libgcc -static-libstdc++)
+        endif()
+    endif()
+    if (GGML_GPROF)
+        add_compile_options(-pg)
+    endif()
+endif()
+
+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_XOPEN_SOURCE=700)
+else()
+    add_compile_definitions(_XOPEN_SOURCE=600)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS"    OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS"   OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
+# ggml
+
+if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
+    message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
+endif()
+
+add_library(ggml-base
+            ../include/ggml.h
+            ../include/ggml-alloc.h
+            ../include/ggml-backend.h
+            ../include/ggml-cpp.h
+            ../include/ggml-opt.h
+            ../include/gguf.h
+            ggml.c
+            ggml-alloc.c
+            ggml-backend.cpp
+            ggml-opt.cpp
+            ggml-threading.cpp
+            ggml-threading.h
+            ggml-quants.c
+            ggml-quants.h
+            gguf.cpp)
+
+target_include_directories(ggml-base PRIVATE .)
+
+add_library(ggml
+            ggml-backend-reg.cpp)
+
+target_link_libraries(ggml PUBLIC ggml-base)
+
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(ggml PRIVATE dl)
+endif()
+
+function(ggml_add_backend_library backend)
+    if (GGML_BACKEND_DL)
+        add_library(${backend} MODULE ${ARGN})
+        # write the shared library to the output directory
+        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+        add_dependencies(ggml ${backend})
+    else()
+        add_library(${backend} ${ARGN})
+        target_link_libraries(ggml PUBLIC ${backend})
+        install(TARGETS ${backend} LIBRARY)
+    endif()
+
+    target_link_libraries(${backend} PRIVATE ggml-base)
+    target_include_directories(${backend} PRIVATE ..)
+
+    if (${BUILD_SHARED_LIBS})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
+        target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
+    endif()
+
+    if(NOT GGML_AVAILABLE_BACKENDS)
+        set(GGML_AVAILABLE_BACKENDS "${backend}"
+            CACHE INTERNAL "List of backends for cmake package")
+    else()
+        list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
+        if(has_backend EQUAL -1)
+            set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
+                CACHE INTERNAL "List of backends for cmake package")
+        endif()
+    endif()
+endfunction()
+
+function(ggml_add_backend backend)
+    string(TOUPPER "GGML_${backend}" backend_id)
+    if (${backend_id})
+        string(TOLOWER "ggml-${backend}" backend_target)
+        add_subdirectory(${backend_target})
+        message(STATUS "Including ${backend} backend")
+        if (NOT GGML_BACKEND_DL)
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
+        endif()
+    endif()
+endfunction()
+
+function(ggml_add_cpu_backend_variant tag_name)
+    set(GGML_CPU_TAG_NAME ${tag_name})
+    # other: OPENMP LLAMAFILE CPU_HBM
+    foreach (feat NATIVE
+                  AVX AVX2 AVX_VNNI FMA F16C
+                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                  AMX_TILE AMX_INT8 AMX_BF16)
+        set(GGML_${feat} OFF)
+    endforeach()
+
+    foreach (feat ${ARGN})
+        set(GGML_${feat} ON)
+    endforeach()
+
+    ggml_add_cpu_backend_variant_impl(${tag_name})
+endfunction()
+
+ggml_add_backend(CPU)
+
+if (GGML_CPU_ALL_VARIANTS)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    endif()
+    ggml_add_cpu_backend_variant(sandybridge    AVX)
+    ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
+    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 FMA AVX_VNNI)
+    if (NOT MSVC)
+        # MSVC doesn't support AMX
+        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+    endif()
+elseif (GGML_CPU)
+    ggml_add_cpu_backend_variant_impl("")
+endif()
+
+ggml_add_backend(BLAS)
+ggml_add_backend(CANN)
+ggml_add_backend(CUDA)
+ggml_add_backend(HIP)
+ggml_add_backend(Kompute)
+ggml_add_backend(METAL)
+ggml_add_backend(MUSA)
+ggml_add_backend(RPC)
+ggml_add_backend(SYCL)
+ggml_add_backend(Vulkan)
+ggml_add_backend(OpenCL)
+
+foreach (target ggml-base ggml)
+    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
+    target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
+endforeach()
+
+target_link_libraries(ggml-base PRIVATE Threads::Threads)
+
+find_library(MATH_LIBRARY m)
+if (MATH_LIBRARY)
+    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
+        target_link_libraries(ggml-base PRIVATE m)
+    endif()
+endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "Android")
+    target_link_libraries(ggml-base PRIVATE dl)
+endif()
+
+if (BUILD_SHARED_LIBS)
+    foreach (target ggml-base ggml)
+        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_compile_definitions(${target} PRIVATE GGML_BUILD)
+        target_compile_definitions(${target} PUBLIC  GGML_SHARED)
+    endforeach()
+endif()
diff --git a/ggml-alloc.c b/ggml/src/ggml-alloc.c
similarity index 74%
rename from ggml-alloc.c
rename to ggml/src/ggml-alloc.c
index e675306c8..7244a9cbb 100644
--- a/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -14,7 +14,7 @@
 
 //#define GGML_ALLOCATOR_DEBUG
 
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)
 
 
@@ -37,6 +37,7 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
+// ops that return true for this function must not use restrict pointers for their backend implementations
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -52,8 +53,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_LOG:
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+        case GGML_OP_SILU_BACK:
         case GGML_OP_RMS_NORM:
+        case GGML_OP_RMS_NORM_BACK:
         case GGML_OP_SOFT_MAX:
+        case GGML_OP_SOFT_MAX_BACK:
             return true;
 
         default:
@@ -61,7 +66,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-// TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
@@ -69,25 +73,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
 }
 
 // tallocr
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
-    if (talloc == NULL) {
-        return NULL;
-    }
 
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     void * base = ggml_backend_buffer_get_base(buffer);
     size_t align = ggml_backend_buffer_get_alignment(buffer);
 
     assert(align && !(align & (align - 1))); // power of 2
 
-    *talloc = (struct ggml_tallocr) {
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
         /*.buffer    = */ buffer,
         /*.base      = */ base,
         /*.alignment = */ align,
@@ -96,19 +89,14 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
     return talloc;
 }
 
-void ggml_tallocr_free(ggml_tallocr_t talloc) {
-    free(talloc);
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
     size = GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }
 
     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -149,7 +137,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -158,8 +146,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif
 
@@ -190,10 +177,9 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
             // this should never happen
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
+            GGML_ABORT("not enough space in the buffer");
         }
     }
 
@@ -228,16 +214,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
                 }
             }
         }
-        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                     alloc->allocated_tensors[i].offset,
                     alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                     ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
-        fprintf(stderr, "\n");
+        GGML_LOG_DEBUG("\n");
     }
 #endif
 
@@ -313,6 +299,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     alloc->max_size = 0;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    for (int i = 0; i < 1024; i++) {
+        alloc->allocated_tensors[i].tensor = NULL;
+    }
+#endif
 }
 
 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
@@ -354,14 +346,17 @@ struct hash_node {
     bool allocated;
 };
 
-//
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
+struct leaf_alloc {
+    struct tensor_alloc leaf;
+};
+
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -378,28 +373,39 @@ struct ggml_gallocr {
     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
 
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
     int n_leafs;
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
@@ -417,14 +423,34 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
 
-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -437,7 +463,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }
 
@@ -445,18 +471,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     return ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
-static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
-    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    hn->buffer_id = buffer_id;
-    hn->offset = offset;
-    hn->allocated = true;
-}
-
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0);
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 
     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -519,21 +539,21 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
         hn->offset = offset;
-        return;
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -543,17 +563,28 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
     return node_buffer_ids ? node_buffer_ids[i] : 0;
 }
 
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
+
+    // allocate leafs
+    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
 
     // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
+    // allocate other graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
 
-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -570,26 +601,13 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 
             ggml_gallocr_hash_get(galloc, src)->n_children += 1;
 
-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
     }
 
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
-
     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -640,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                     AT_PRINTF("view_src %s: %d children, %d views\n",
                         view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                        ggml_gallocr_free_node(galloc, view_src);
                     }
                 }
                 else if (p_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, parent, buffer_id);
+                    ggml_gallocr_free_node(galloc, parent);
                 }
             }
             AT_PRINTF("\n");
@@ -652,22 +670,20 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;
 
     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
-        galloc->hash_values   = calloc(sizeof(struct hash_node), hash_size);
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
     }
 
     // reset allocators
@@ -676,34 +692,37 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
 
     // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
 
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
         free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
         GGML_ASSERT(galloc->node_allocs != NULL);
     }
     galloc->n_nodes = graph->n_nodes;
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset   = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset   = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -711,32 +730,50 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
     }
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-        if (new_size > cur_size) {
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
@@ -744,30 +781,31 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 }
 
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL);
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
-    if (node->view_src != NULL) {
-        if (node->buffer == NULL) {
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
             assert(tensor_alloc->offset == SIZE_MAX);
-            if (node->view_src->buffer == NULL) {
+            if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(tensor);
         }
     } else {
-        if (node->data == NULL) {
+        if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
             void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
         } else {
-            if (node->buffer == NULL) {
+            if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
@@ -775,23 +813,26 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    size_t node_size = 0;
+    if (!node->data && !node->view_src) {
+        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    }
     return talloc->size_max >= node_size;
 }
 
 static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
         return true;
     }
 
     if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
         return true;
     }
@@ -800,9 +841,9 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
             return true;
         }
@@ -812,9 +853,9 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
                 return true;
             }
@@ -828,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
             if (!ggml_gallocr_reserve(galloc, graph)) {
                 return false;
             }
         } else {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
             return false;
         }
@@ -843,13 +884,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
 
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
-        // zero size buffers are not allocated
         if (galloc->buffers[i] != NULL) {
             ggml_backend_buffer_reset(galloc->buffers[i]);
         }
     }
 
     // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
+    }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -859,15 +905,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
-    }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
@@ -879,6 +919,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
@@ -891,34 +940,32 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+        GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
+            ggml_backend_buffer_free((*buffers)[i]);
         }
         free(*buffers);
         return false;
     }
 
-    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
+                ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         }
     }
 
-    ggml_tallocr_free(tallocr);
-
     *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
     (*buffers)[(*n_buffers)++] = buffer;
 
@@ -942,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
@@ -975,7 +1010,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     if (n_buffers == 0) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
         return NULL;
     }
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
new file mode 100644
index 000000000..d1c2d76d8
--- /dev/null
+++ b/ggml/src/ggml-backend-impl.h
@@ -0,0 +1,255 @@
+#pragma once
+
+// ggml-backend internal header
+
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    #define GGML_BACKEND_API_VERSION 1
+
+    //
+    // Backend buffer type
+    //
+
+    struct ggml_backend_buffer_type_i {
+        const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
+        ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
+        // tensor alignment
+        size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
+        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+        size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
+        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
+        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
+        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_dev_t device;
+        void * context;
+    };
+
+    //
+    // Backend buffer
+    //
+
+    struct ggml_backend_buffer_i {
+        // (optional) free the buffer
+        void         (*free_buffer)  (ggml_backend_buffer_t buffer);
+        // base address of the buffer
+        void *       (*get_base)     (ggml_backend_buffer_t buffer);
+        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
+        void         (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        // tensor data access
+        void         (*memset_tensor)(ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+        void         (*set_tensor)   (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
+        bool         (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        // clear the entire buffer
+        void         (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
+        // (optional) reset any internal state due to tensor initialization, such as tensor extras
+        void         (*reset)        (ggml_backend_buffer_t buffer);
+    };
+
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
+        void * context;
+        size_t size;
+        enum ggml_backend_buffer_usage usage;
+    };
+
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t buft,
+            struct ggml_backend_buffer_i      iface,
+                   void *                     context,
+                   size_t                     size);
+
+    // do not use directly, use ggml_backend_tensor_copy instead
+    GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // multi-buffer
+    // buffer that contains a collection of buffers
+    GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_API bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_API void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
+    //
+    // Backend (stream)
+    //
+
+    struct ggml_backend_i {
+        const char * (*get_name)(ggml_backend_t backend);
+
+        void (*free)(ggml_backend_t backend);
+
+        // (optional) asynchronous tensor data access
+        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // (optional) complete all pending operations (required if the backend supports async operations)
+        void (*synchronize)(ggml_backend_t backend);
+
+        // (optional) graph plans (not used currently)
+        // compute graph with a plan
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph (always async if supported by the backend)
+        enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // (optional) event synchronization
+        // record an event on this stream
+        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
+        // wait for an event on on a different stream
+        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+    };
+
+    struct ggml_backend {
+        ggml_guid_t guid;
+        struct ggml_backend_i iface;
+        ggml_backend_dev_t device;
+        void * context;
+    };
+
+    struct ggml_backend_event {
+        struct ggml_backend_device * device;
+        void * context;
+    };
+
+    //
+    // Backend device
+    //
+
+    // Note: if additional properties are needed, we should add a struct with all of them
+    //       the current functions to obtain the properties can remain, since they are more convenient for often used properties
+    struct ggml_backend_device_i {
+        // device name: short identifier for this device, such as "CPU" or "CUDA0"
+        const char * (*get_name)(ggml_backend_dev_t dev);
+
+        // device description: short informative description of the device, could be the model name
+        const char * (*get_description)(ggml_backend_dev_t dev);
+
+        // device memory in bytes
+        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
+
+        // device type
+        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
+
+        // device properties
+        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
+
+        // backend (stream) initialization
+        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
+
+        // preferred buffer type
+        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
+        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
+        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
+
+        // check if the backend can compute an operation
+        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
+
+        // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
+        // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
+        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
+        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
+        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+    };
+
+    struct ggml_backend_device {
+        struct ggml_backend_device_i iface;
+        ggml_backend_reg_t reg;
+        void * context;
+    };
+
+    //
+    // Backend (reg)
+    //
+
+    struct ggml_backend_reg_i {
+        const char * (*get_name)(ggml_backend_reg_t reg);
+
+        // enumerate available devices
+        size_t             (*get_device_count)(ggml_backend_reg_t reg);
+        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
+
+        // (optional) get a pointer to a function in the backend
+        // backends can add custom functions that are not part of the standard ggml-backend interface
+        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
+    };
+
+    struct ggml_backend_reg {
+        int api_version; // initialize to GGML_BACKEND_API_VERSION
+        struct ggml_backend_reg_i iface;
+        void * context;
+    };
+
+    // Internal backend registry API
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+
+    // Add backend dynamic loading support to the backend
+
+    // Initialize the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+    // Optional: obtain a score for the backend based on the system configuration
+    // Higher scores are preferred, 0 means the backend is not supported in the current system
+    typedef int                (*ggml_backend_score_t)(void);
+
+#ifdef GGML_BACKEND_DL
+#    ifdef __cplusplus
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+            extern "C" {                                                 \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+            }                                                            \
+            ggml_backend_reg_t ggml_backend_init(void) {                 \
+                return reg_fn();                                         \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)       \
+            extern "C" {                                   \
+            GGML_BACKEND_API int ggml_backend_score(void); \
+            }                                              \
+            int ggml_backend_score(void) {                 \
+                return score_fn();                         \
+            }
+#    else
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                              \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \
+            ggml_backend_reg_t                  ggml_backend_init(void) { \
+                return reg_fn();                                          \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)        \
+            GGML_BACKEND_API int ggml_backend_score(void);  \
+            int                  ggml_backend_score(void) { \
+                return score_fn();                          \
+            }
+#    endif
+#else
+#    define GGML_BACKEND_DL_IMPL(reg_fn)
+#    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
+#endif
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
new file mode 100644
index 000000000..955ed505f
--- /dev/null
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -0,0 +1,582 @@
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include <algorithm>
+#include <codecvt>
+#include <cstring>
+#include <filesystem>
+#include <locale>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#elif defined(__APPLE__)
+#    include <mach-o/dyld.h>
+#    include <dlfcn.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+
+// Backend registry
+#ifdef GGML_USE_CPU
+#include "ggml-cpu.h"
+#endif
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
+#ifdef GGML_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
+#ifdef GGML_USE_BLAS
+#include "ggml-blas.h"
+#endif
+
+#ifdef GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+#include "ggml-kompute.h"
+#endif
+
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+struct dl_handle_deleter {
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle);
+    }
+};
+
+static dl_handle * dl_load_library(const std::wstring & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.c_str());
+
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    void * p = (void *) GetProcAddress(handle, name);
+
+    SetErrorMode(old_mode);
+
+    return p;
+}
+
+#else
+
+using dl_handle = void;
+
+struct dl_handle_deleter {
+    void operator()(void * handle) {
+        dlclose(handle);
+    }
+};
+
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
+
+    return handle;
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    return dlsym(handle, name);
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
+struct ggml_backend_reg_entry {
+    ggml_backend_reg_t reg;
+    dl_handle_ptr handle;
+};
+
+struct ggml_backend_registry {
+    std::vector<ggml_backend_reg_entry> backends;
+    std::vector<ggml_backend_dev_t> devices;
+
+    ggml_backend_registry() {
+#ifdef GGML_USE_CUDA
+        register_backend(ggml_backend_cuda_reg());
+#endif
+#ifdef GGML_USE_METAL
+        register_backend(ggml_backend_metal_reg());
+#endif
+#ifdef GGML_USE_SYCL
+        register_backend(ggml_backend_sycl_reg());
+#endif
+#ifdef GGML_USE_VULKAN
+        register_backend(ggml_backend_vk_reg());
+#endif
+#ifdef GGML_USE_OPENCL
+        register_backend(ggml_backend_opencl_reg());
+#endif
+#ifdef GGML_USE_CANN
+        register_backend(ggml_backend_cann_reg());
+#endif
+#ifdef GGML_USE_BLAS
+        register_backend(ggml_backend_blas_reg());
+#endif
+#ifdef GGML_USE_RPC
+        register_backend(ggml_backend_rpc_reg());
+#endif
+#ifdef GGML_USE_KOMPUTE
+        register_backend(ggml_backend_kompute_reg());
+#endif
+#ifdef GGML_USE_CPU
+        register_backend(ggml_backend_cpu_reg());
+#endif
+    }
+
+    ~ggml_backend_registry() {
+        // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
+        // since backend threads may still be running and accessing resources from the dynamic library
+        for (auto & entry : backends) {
+            if (entry.handle) {
+                entry.handle.release(); // NOLINT
+            }
+        }
+    }
+
+    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
+        if (!reg) {
+            return;
+        }
+
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+#endif
+        backends.push_back({ reg, std::move(handle) });
+        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+            register_device(ggml_backend_reg_dev_get(reg, i));
+        }
+    }
+
+    void register_device(ggml_backend_dev_t device) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+#endif
+        devices.push_back(device);
+    }
+
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+        dl_handle_ptr handle { dl_load_library(path) };
+        if (!handle) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
+            }
+            return nullptr;
+        }
+
+        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+        if (score_fn && score_fn() == 0) {
+            if (!silent) {
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
+            }
+            return nullptr;
+        }
+
+        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
+        if (!backend_init_fn) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
+            }
+            return nullptr;
+        }
+
+        ggml_backend_reg_t reg = backend_init_fn();
+        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
+            if (!silent) {
+                if (!reg) {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
+                } else {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                }
+            }
+            return nullptr;
+        }
+
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+
+        register_backend(reg, std::move(handle));
+
+        return reg;
+    }
+
+    void unload_backend(ggml_backend_reg_t reg, bool silent) {
+        auto it = std::find_if(backends.begin(), backends.end(),
+                               [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
+
+        if (it == backends.end()) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: backend not found\n", __func__);
+            }
+            return;
+        }
+
+        if (!silent) {
+            GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
+        }
+
+        // remove devices
+        devices.erase(
+            std::remove_if(devices.begin(), devices.end(),
+                            [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+            devices.end());
+
+        // remove backend
+        backends.erase(it);
+    }
+};
+
+static ggml_backend_registry & get_reg() {
+    static ggml_backend_registry reg;
+    return reg;
+}
+
+// Internal API
+void ggml_backend_register(ggml_backend_reg_t reg) {
+    get_reg().register_backend(reg);
+}
+
+void ggml_backend_device_register(ggml_backend_dev_t device) {
+    get_reg().register_device(device);
+}
+
+// Backend (reg) enumeration
+static bool striequals(const char * a, const char * b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
+size_t ggml_backend_reg_count() {
+    return get_reg().backends.size();
+}
+
+ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
+    GGML_ASSERT(index < ggml_backend_reg_count());
+    return get_reg().backends[index].reg;
+}
+
+ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
+        if (striequals(ggml_backend_reg_name(reg), name)) {
+            return reg;
+        }
+    }
+    return nullptr;
+}
+
+// Device enumeration
+size_t ggml_backend_dev_count() {
+    return get_reg().devices.size();
+}
+
+ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
+    GGML_ASSERT(index < ggml_backend_dev_count());
+    return get_reg().devices[index];
+}
+
+ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (striequals(ggml_backend_dev_name(dev), name)) {
+            return dev;
+        }
+    }
+    return nullptr;
+}
+
+ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == type) {
+            return dev;
+        }
+    }
+    return nullptr;
+}
+
+// Convenience functions
+ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
+    ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
+    if (!dev) {
+        return nullptr;
+    }
+    return ggml_backend_dev_init(dev, params);
+}
+
+ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
+    if (!dev) {
+        return nullptr;
+    }
+    return ggml_backend_dev_init(dev, params);
+}
+
+ggml_backend_t ggml_backend_init_best(void) {
+    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+    if (!dev) {
+        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    }
+    if (!dev) {
+        return nullptr;
+    }
+    return ggml_backend_dev_init(dev, nullptr);
+}
+
+// Dynamic loading
+ggml_backend_reg_t ggml_backend_load(const char * path) {
+    return get_reg().load_backend(utf8_to_utf16(path), false);
+}
+
+void ggml_backend_unload(ggml_backend_reg_t reg) {
+    get_reg().unload_backend(reg, true);
+}
+
+static std::wstring get_executable_path() {
+#if defined(__APPLE__)
+    // get executable path
+    std::vector<char> path;
+    uint32_t size;
+    while (true) {
+        size = path.size();
+        if (_NSGetExecutablePath(path.data(), &size) == 0) {
+            break;
+        }
+        path.resize(size);
+    }
+    std::string base_path(path.data(), size);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('/');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
+    std::string base_path = ".";
+    std::vector<char> path(1024);
+    while (true) {
+        // get executable path
+#    if defined(__linux__)
+        ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
+        if (len == -1) {
+            break;
+        }
+        if (len < (ssize_t) path.size()) {
+            base_path = std::string(path.data(), len);
+            // remove executable name
+            auto last_slash = base_path.find_last_of('/');
+            if (last_slash != std::string::npos) {
+                base_path = base_path.substr(0, last_slash);
+            }
+            break;
+        }
+        path.resize(path.size() * 2);
+    }
+
+    return utf8_to_utf16(base_path + "/");
+#elif defined(_WIN32)
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
+    if (len == 0) {
+        return {};
+    }
+    std::wstring base_path(path.data(), len);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('\\');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + L"\\";
+#else
+    return {};
+#endif
+}
+
+static std::wstring backend_filename_prefix() {
+#ifdef _WIN32
+    return L"ggml-";
+#else
+    return L"libggml-";
+#endif
+}
+
+static std::wstring backend_filename_suffix() {
+#ifdef _WIN32
+    return L".dll";
+#else
+    return L".so";
+#endif
+}
+
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
+#endif
+}
+
+static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
+    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
+     // TODO: search system paths
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
+    if (user_search_path == nullptr) {
+        search_paths.push_back(L"." + path_separator());
+        search_paths.push_back(get_executable_path());
+    } else {
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
+    }
+
+    int best_score = 0;
+    std::wstring best_path;
+
+    namespace fs = std::filesystem;
+    for (const auto & search_path : search_paths) {
+        if (!fs::exists(search_path)) {
+            continue;
+        }
+        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
+        for (const auto & entry : dir_it) {
+            if (entry.is_regular_file()) {
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
+                if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
+                    if (!handle && !silent) {
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                    }
+                    if (handle) {
+                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+                        if (score_fn) {
+                            int s = score_fn();
+#ifndef NDEBUG
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
+#endif
+                            if (s > best_score) {
+                                best_score = s;
+                                best_path = entry.path().wstring();
+                            }
+                        } else {
+                            if (!silent) {
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (best_score == 0) {
+        // try to load the base backend
+        for (const auto & search_path : search_paths) {
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
+            if (fs::exists(path)) {
+                return get_reg().load_backend(path, silent);
+            }
+        }
+        return nullptr;
+    }
+
+    return get_reg().load_backend(best_path, silent);
+}
+
+void ggml_backend_load_all() {
+    ggml_backend_load_all_from_path(nullptr);
+}
+
+void ggml_backend_load_all_from_path(const char * dir_path) {
+#ifdef NDEBUG
+    bool silent = true;
+#else
+    bool silent = false;
+#endif
+
+    ggml_backend_load_best("blas", silent, dir_path);
+    ggml_backend_load_best("cann", silent, dir_path);
+    ggml_backend_load_best("cuda", silent, dir_path);
+    ggml_backend_load_best("hip", silent, dir_path);
+    ggml_backend_load_best("kompute", silent, dir_path);
+    ggml_backend_load_best("metal", silent, dir_path);
+    ggml_backend_load_best("rpc", silent, dir_path);
+    ggml_backend_load_best("sycl", silent, dir_path);
+    ggml_backend_load_best("vulkan", silent, dir_path);
+    ggml_backend_load_best("opencl", silent, dir_path);
+    ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("cpu", silent, dir_path);
+    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
+    const char * backend_path = std::getenv("GGML_BACKEND_PATH");
+    if (backend_path) {
+        ggml_backend_load(backend_path);
+    }
+}
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
new file mode 100644
index 000000000..dba7be33b
--- /dev/null
+++ b/ggml/src/ggml-backend.cpp
@@ -0,0 +1,2002 @@
+// Note: porting this file to C++ is a work in progress
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-alloc.h"
+#include "ggml-impl.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+
+// backend buffer type
+
+const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name(buft);
+}
+
+ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if (size == 0) {
+        // return a dummy buffer for zero-sized allocations
+        return ggml_backend_buffer_init(buft, {}, NULL, 0);
+    }
+
+    return buft->iface.alloc_buffer(buft, size);
+}
+
+size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_alignment(buft);
+}
+
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
+size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+    // get_alloc_size is optional, defaults to ggml_nbytes
+    if (buft->iface.get_alloc_size) {
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
+    }
+    return ggml_nbytes(tensor);
+}
+
+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
+ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
+    return buft->device;
+}
+
+// backend buffer
+
+ggml_backend_buffer_t ggml_backend_buffer_init(
+               ggml_backend_buffer_type_t buft,
+        struct ggml_backend_buffer_i      iface,
+               void *                     context,
+               size_t                     size) {
+    ggml_backend_buffer_t buffer = new ggml_backend_buffer {
+        /* .interface = */ iface,
+        /* .buft      = */ buft,
+        /* .context   = */ context,
+        /* .size      = */ size,
+        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
+    };
+
+    return buffer;
+}
+
+const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
+}
+
+void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer == NULL) {
+        return;
+    }
+
+    if (buffer->iface.free_buffer != NULL) {
+        buffer->iface.free_buffer(buffer);
+    }
+    delete buffer;
+}
+
+size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
+    return buffer->size;
+}
+
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
+    void * base = buffer->iface.get_base(buffer);
+
+    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
+
+    return base;
+}
+
+void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // init_tensor is optional
+    if (buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    }
+}
+
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // clear is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return;
+    }
+
+    buffer->iface.clear(buffer, value);
+}
+
+size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
+}
+
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
+}
+
+void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
+}
+
+enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
+    return buffer->usage;
+}
+
+ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
+    return buffer->buft;
+}
+
+void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.reset) {
+        buffer->iface.reset(buffer);
+    }
+}
+
+bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
+    if (dst_buf->iface.cpy_tensor) {
+        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
+    }
+    return false;
+}
+
+// backend
+
+ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return NULL;
+    }
+    return backend->guid;
+}
+
+const char * ggml_backend_name(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return "NULL";
+    }
+    return backend->iface.get_name(backend);
+}
+
+void ggml_backend_free(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return;
+    }
+
+    backend->iface.free(backend);
+}
+
+ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_dev_buffer_type(backend->device);
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
+    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
+}
+
+size_t ggml_backend_get_alignment(ggml_backend_t backend) {
+    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
+}
+
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
+void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+    if (backend->iface.set_tensor_async == NULL) {
+        ggml_backend_tensor_set(tensor, data, offset, size);
+    } else {
+        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    }
+}
+
+void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+
+    if (backend->iface.get_tensor_async == NULL) {
+        ggml_backend_tensor_get(tensor, data, offset, size);
+    } else {
+        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    }
+}
+
+void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    if (size == 0) {
+        return;
+    }
+
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
+    buf->iface.set_tensor(buf, tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    if (size == 0) {
+        return;
+    }
+
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+
+    buf->iface.get_tensor(buf, tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    if (size == 0) {
+        return;
+    }
+
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
+
+    buf->iface.memset_tensor(buf, tensor, value, offset, size);
+}
+
+void ggml_backend_synchronize(ggml_backend_t backend) {
+    if (backend->iface.synchronize == NULL) {
+        return;
+    }
+
+    backend->iface.synchronize(backend);
+}
+
+ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend->iface.graph_plan_create != NULL);
+
+    return backend->iface.graph_plan_create(backend, cgraph);
+}
+
+void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_free != NULL);
+
+    backend->iface.graph_plan_free(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
+
+    return backend->iface.graph_plan_compute(backend, plan);
+}
+
+enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+    ggml_backend_synchronize(backend);
+    return err;
+}
+
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_compute(backend, cgraph);
+}
+
+bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return ggml_backend_dev_supports_op(backend->device, op);
+}
+
+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_dev_supports_buft(backend->device, buft);
+}
+
+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return ggml_backend_dev_offload_op(backend->device, op);
+}
+
+ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
+    return backend->device;
+}
+
+// backend copy
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    if (src == dst) {
+        return;
+    }
+
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
+        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+#endif
+        size_t nbytes = ggml_nbytes(src);
+        void * data = malloc(nbytes);
+        ggml_backend_tensor_get(src, data, 0, nbytes);
+        ggml_backend_tensor_set(dst, data, 0, nbytes);
+        free(data);
+    }
+}
+
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    if (src == dst) {
+        return;
+    }
+
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
+        }
+    }
+
+    // an async copy would normally happen after all the queued operations on both backends are completed
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    ggml_backend_synchronize(backend_src);
+    ggml_backend_synchronize(backend_dst);
+    ggml_backend_tensor_copy(src, dst);
+}
+
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
+    // null device is allowed for the transition period to the device interface
+    if (device == NULL || device->iface.event_new == NULL) {
+        return NULL;
+    }
+    return device->iface.event_new(device);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    if (event == NULL) {
+        return;
+    }
+    event->device->iface.event_free(event->device, event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
+    GGML_ASSERT(backend->iface.event_record != NULL);
+
+    backend->iface.event_record(backend, event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event->device->iface.event_synchronize);
+
+    event->device->iface.event_synchronize(event->device, event);
+}
+
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend->iface.event_wait != NULL);
+
+    backend->iface.event_wait(backend, event);
+}
+
+// Backend device
+
+const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
+    return device->iface.get_name(device);
+}
+
+const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
+    return device->iface.get_description(device);
+}
+
+void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+    device->iface.get_memory(device, free, total);
+}
+
+enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
+    return device->iface.get_type(device);
+}
+
+void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    memset(props, 0, sizeof(*props));
+    device->iface.get_props(device, props);
+}
+
+ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
+    return device->reg;
+}
+
+ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
+    return device->iface.init_backend(device, params);
+}
+
+ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+    return device->iface.get_buffer_type(device);
+}
+
+ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+    if (device->iface.get_host_buffer_type == NULL) {
+        return NULL;
+    }
+
+    return device->iface.get_host_buffer_type(device);
+}
+
+ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+    return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
+}
+
+bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+    return device->iface.supports_op(device, op);
+}
+
+bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
+    return device->iface.supports_buft(device, buft);
+}
+
+bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+    if (device->iface.offload_op != NULL) {
+        return device->iface.offload_op(device, op);
+    }
+
+    return false;
+}
+
+// Backend (reg)
+
+const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
+    return reg->iface.get_name(reg);
+}
+
+size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
+    return reg->iface.get_device_count(reg);
+}
+
+ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
+    return reg->iface.get_device(reg, index);
+}
+
+void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (!reg->iface.get_proc_address) {
+        return NULL;
+    }
+    return reg->iface.get_proc_address(reg, name);
+}
+
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
+    /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
+    /* .get_base        = */ NULL,
+    /* .init_tensor     = */ NULL,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ NULL,
+    /* .get_tensor      = */ NULL,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_multi_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    GGML_ASSERT(ctx->buffers != NULL);
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
+}
+
+bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
+}
+
+void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
+
+// scheduler
+
+#ifndef GGML_SCHED_MAX_BACKENDS
+#define GGML_SCHED_MAX_BACKENDS 16
+#endif
+
+#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
+#endif
+
+#ifndef GGML_SCHED_MAX_COPIES
+#define GGML_SCHED_MAX_COPIES 4
+#endif
+
+struct ggml_backend_sched_split {
+    int backend_id;
+    int i_start;
+    int i_end;
+    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_inputs;
+    // graph view of this split
+    struct ggml_cgraph graph;
+};
+
+struct ggml_backend_sched {
+    bool is_reset; // true if the scheduler has been reset since the last graph split
+    bool is_alloc;
+
+    int n_backends;
+
+    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
+    ggml_gallocr_t galloc;
+
+    // hash map of the nodes in the graph
+    struct ggml_hash_set  hash_set;
+    int                 * hv_tensor_backend_ids; // [hash_set.size]
+    struct ggml_tensor ** hv_tensor_copies;      // [hash_set.size][n_backends][n_copies]
+
+    int * node_backend_ids; // [graph_size]
+    int * leaf_backend_ids; // [graph_size]
+
+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
+    // copy of the graph with modified inputs
+    struct ggml_cgraph graph;
+
+    // graph splits
+    struct ggml_backend_sched_split * splits;
+    int n_splits;
+    int splits_capacity;
+
+    // pipeline parallelism support
+    int n_copies;
+    int cur_copy;
+    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
+    int n_graph_inputs;
+
+    struct ggml_context * ctx;
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
+
+    char * context_buffer;
+    size_t context_buffer_size;
+
+    int debug;
+};
+
+#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
+
+// returns the priority of the backend, lower id is higher priority
+static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->backends[i] == backend) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+    if (buffer == NULL) {
+        return -1;
+    }
+
+    // find highest prio backend that supports the buffer type and the op
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
+            return i;
+        }
+    }
+
+#ifndef NDEBUG
+    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+        __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
+
+    return -1;
+}
+
+#if 0
+#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
+#define GET_CAUSE(node) causes[hash_id(node)]
+#else
+#define SET_CAUSE(node, ...)
+#define GET_CAUSE(node) ""
+#endif
+
+// returns the backend that should be used for the node based on the current locations
+static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
+    // assign pre-allocated nodes to their backend
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
+    if (cur_backend_id != -1) {
+        SET_CAUSE(tensor, "1.dst");
+        return cur_backend_id;
+    }
+
+    // view_src
+    if (tensor->view_src != NULL) {
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
+        if (cur_backend_id != -1) {
+            SET_CAUSE(tensor, "1.vsrc");
+            return cur_backend_id;
+        }
+    }
+
+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
+    }
+
+    // graph input
+    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
+        SET_CAUSE(tensor, "1.inp");
+        return cur_backend_id;
+    }
+
+    // operations with weights are preferably run on the same backend as the weights
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        const struct ggml_tensor * src = tensor->src[i];
+        if (src == NULL) {
+            continue;
+        }
+        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+        // not an ideal solution
+        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+                for (int b = 0; b < src_backend_id; b++) {
+                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
+            SET_CAUSE(tensor, "1.wgt%d", i);
+            return src_backend_id;
+        }
+    }
+
+    return -1;
+}
+
+static char * fmt_size(size_t size) {
+    static char buffer[128];
+    if (size >= 1024*1024) {
+        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
+    } else {
+        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
+    }
+    return buffer;
+}
+
+static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
+            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
+                sched->splits[cur_split].n_inputs);
+            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
+                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
+            }
+            GGML_LOG_DEBUG("\n");
+            cur_split++;
+        }
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        if (sched->debug > 1) {
+            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+            }
+            GGML_LOG_DEBUG("\n");
+        }
+    }
+}
+
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id == -1 && t->view_src) {
+            tensor_backend_id = tensor_backend_id(t->view_src);
+        }
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.sup");
+    }
+}
+
+// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
+static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    // reset splits
+    sched->n_splits = 0;
+    sched->n_graph_inputs = 0;
+    sched->is_reset = false;
+
+    struct ggml_init_params params = {
+        /* .mem_size =   */ sched->context_buffer_size,
+        /* .mem_buffer = */ sched->context_buffer,
+        /* .no_alloc =   */ true
+    };
+
+    ggml_free(sched->ctx);
+
+    sched->ctx = ggml_init(params);
+    if (sched->ctx == NULL) {
+        GGML_ABORT("%s: failed to initialize context\n", __func__);
+    }
+
+    // pass 1: assign backends to ops with pre-allocated inputs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        // do not overwrite user assignments
+        if (*leaf_backend_id == -1) {
+            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        }
+    }
+
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        int * node_backend_id = &tensor_backend_id(node);
+        // do not overwrite user assignments
+        if (*node_backend_id == -1) {
+            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+#if 0
+            // src
+            if (node->op == GGML_OP_NONE) {
+                continue;
+            }
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                int * src_backend_id = &tensor_backend_id(src);
+                if (*src_backend_id == -1) {
+                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+                }
+            }
+#endif
+        }
+    }
+
+    // pass 2: expand current backend assignments
+    // assign the same backend to adjacent nodes
+    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+    // expand gpu down
+    {
+        int cur_backend_id = -1;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_backend_id = -1;
+                } else {
+                    cur_backend_id = *node_backend_id;
+                }
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+            }
+        }
+    }
+    // expand gpu up
+    {
+        int cur_backend_id = -1;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_backend_id = -1;
+                } else {
+                    cur_backend_id = *node_backend_id;
+                }
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+            }
+        }
+    }
+    // expand rest down
+    {
+        int cur_backend_id = -1;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+            }
+        }
+    }
+    // expand rest up
+    {
+        int cur_backend_id = -1;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+            }
+        }
+    }
+
+    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id == -1) {
+            // unassigned node: find the backend with the most supported inputs
+            int n_supported_best = -1;
+            for (int b = 0; b < sched->n_backends; b++) {
+                if (ggml_backend_supports_op(sched->backends[b], node)) {
+                    int n_supported = 0;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            n_supported++;
+                        }
+                    }
+                    if (n_supported > n_supported_best) {
+                        n_supported_best = n_supported;
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.best");
+                    }
+                }
+            }
+        } else {
+            // assigned node: upgrade to higher prio backend if possible
+            for (int b = 0; b < *node_backend_id; b++) {
+                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                    bool supported = true;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            supported = false;
+                            break;
+                        }
+                    }
+                    if (supported) {
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.upg");
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    // pass 4: assign backends to remaining src from dst and view_src
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
+            SET_CAUSE(node, "4.vsrc");
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    *src_backend_id = tensor_backend_id(src->view_src);
+                    SET_CAUSE(src, "4.vsrc");
+                } else {
+                    *src_backend_id = *cur_backend_id;
+                    SET_CAUSE(src, "4.cur");
+                }
+            }
+        }
+    }
+
+    // pass 5: split graph, find tensors that need to be copied
+    {
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
+        // find the backend of the first split, skipping view ops
+        int i = 0;
+        for (; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (!ggml_is_view_op(node->op)) {
+                split->backend_id = tensor_backend_id(node);
+                break;
+            }
+        }
+        split->i_start = 0;
+        split->n_inputs = 0;
+        int cur_backend_id = split->backend_id;
+        for (; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+
+            const int node_backend_id = tensor_backend_id(node);
+
+            assert(node_backend_id != -1); // all nodes should be assigned by now
+
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    // check if a weight is on a different and incompatible backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
+                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                        int src_backend_id = tensor_backend_id(src);
+                        if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    // FIXME: count the number of inputs instead of only checking when full
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->hv_tensor_backend_ids[id];
+                        bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                        if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (node_backend_id != cur_backend_id || need_new_split) {
+                split->i_end = i;
+                i_split++;
+                if (i_split >= sched->splits_capacity) {
+                    sched->splits_capacity *= 2;
+                    sched->splits = (ggml_backend_sched_split *)
+                        realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
+                }
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
+            }
+
+            // find inputs that are not on the same backend
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+
+                size_t src_id = hash_id(src);
+                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
+                assert(src_backend_id != -1); // all inputs should be assigned by now
+
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
+                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
+                        ggml_backend_t backend = sched->backends[src_backend_id];
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy;
+                            if (c == sched->cur_copy) {
+                                tensor_copy = src; // use the original tensor as the current copy
+                            } else {
+                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            }
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
+                        int n_graph_inputs = sched->n_graph_inputs++;
+                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                        sched->graph_inputs[n_graph_inputs] = src;
+                    }
+                }
+
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
+                    // create a copy of the input in the split's backend
+                    if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
+                        ggml_backend_t backend = sched->backends[cur_backend_id];
+                        for (int c = 0; c < sched->n_copies; c++) {
+                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
+                            if (sched->n_copies > 1) {
+                                ggml_set_input(tensor_copy);
+                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
+                            }
+                            tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
+                            SET_CAUSE(tensor_copy, "4.cpy");
+                        }
+                        int n_inputs = split->n_inputs++;
+                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
+                        split->inputs[n_inputs] = src;
+                    }
+                    node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
+                }
+            }
+        }
+        split->i_end = graph->n_nodes;
+        sched->n_splits = i_split + 1;
+    }
+
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
+
+    // swap node_backend_ids and leaf _backend_ids with prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }
+
+    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
+    if (sched->graph.size < graph_size) {
+        sched->graph.size = graph_size;
+        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+        GGML_ASSERT(sched->graph.nodes != NULL);
+        GGML_ASSERT(sched->graph.leafs != NULL);
+    }
+    sched->graph.n_nodes = 0;
+    sched->graph.n_leafs = 0;
+
+    struct ggml_cgraph * graph_copy = &sched->graph;
+
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &sched->splits[i];
+        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
+
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
+        for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
+            struct ggml_tensor * input = split->inputs[j];
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
+
+            // add a dependency to the input source so that it is not freed before the copy is done
+            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+            input_dep->src[0] = input;
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
+            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+            // add a dependency to the input copy so that it is allocated at the start of the split
+            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
+            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
+        }
+
+        for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
+            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
+            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
+        }
+    }
+
+    if (sched->n_copies > 1) {
+        // add input copies as leafs so that they are allocated first
+        for (int i = 0; i < sched->n_graph_inputs; i++) {
+            struct ggml_tensor * input = sched->graph_inputs[i];
+            size_t id = hash_id(input);
+            int backend_id = tensor_backend_id(input);
+            for (int c = 0; c < sched->n_copies; c++) {
+                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
+                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
+                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+            }
+        }
+
+        for (int i = 0; i < sched->n_splits; i++) {
+            struct ggml_backend_sched_split * split = &sched->splits[i];
+            int backend_id = split->backend_id;
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                size_t id = hash_id(input);
+                for (int c = 0; c < sched->n_copies; c++) {
+                    struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
+                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                    assert(graph_copy->size > graph_copy->n_leafs);
+                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
+                }
+            }
+        }
+    }
+
+    // add leafs from the original graph
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
+        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
+    }
+}
+
+static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph.n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph.n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
+    // allocate graph
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+        // the re-allocation may cause the split inputs to be moved to a different address
+        ggml_backend_sched_synchronize(sched);
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+#endif
+        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+    struct ggml_backend_sched_split * splits = sched->splits;
+
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+        // copy the input tensors to the split backend
+        for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
+            struct ggml_tensor * input = split->inputs[j];
+            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
+
+            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                ggml_backend_tensor_copy(input, input_cpy);
+            } else {
+                // wait for the split backend to finish using the input before overwriting it
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        ggml_backend_synchronize(split_backend);
+                    }
+                    ggml_backend_tensor_copy(input, input_cpy);
+                }
+            }
+        }
+
+        if (!sched->callback_eval) {
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return ec;
+            }
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+                if (ec != GGML_STATUS_SUCCESS) {
+                    return ec;
+                }
+
+                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                ggml_backend_synchronize(split_backend);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
+
+        // record the event of this copy
+        if (split->n_inputs > 0) {
+            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
+            }
+        }
+    }
+
+    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
+
+    return GGML_STATUS_SUCCESS;
+}
+
+ggml_backend_sched_t ggml_backend_sched_new(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel) {
+    GGML_ASSERT(n_backends > 0);
+    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
+    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
+
+    struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
+
+    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
+    sched->n_backends = n_backends;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+
+    // initialize hash table
+    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
+    sched->hash_set    = ggml_hash_set_new(graph_size);
+    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies      = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+
+    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
+
+    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = (char *) malloc(sched->context_buffer_size);
+
+    const int initial_splits_capacity = 16;
+    sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+    sched->splits_capacity = initial_splits_capacity;
+
+    for (int b = 0; b < n_backends; b++) {
+        sched->backends[b] = backends[b];
+        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
+        if (sched->n_copies > 1) {
+            for (int c = 0; c < sched->n_copies; c++) {
+                sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
+            }
+        }
+    }
+
+    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+
+    ggml_backend_sched_reset(sched);
+
+    return sched;
+}
+
+void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+    if (sched == NULL) {
+        return;
+    }
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int c = 0; c < sched->n_copies; c++) {
+            ggml_backend_event_free(sched->events[b][c]);
+        }
+    }
+    ggml_gallocr_free(sched->galloc);
+    ggml_free(sched->ctx);
+    ggml_hash_set_free(&sched->hash_set);
+    free(sched->splits);
+    free(sched->hv_tensor_backend_ids);
+    free(sched->hv_tensor_copies);
+    free(sched->node_backend_ids);
+    free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
+    free(sched->context_buffer);
+    free(sched->graph.nodes);
+    free(sched->graph.leafs);
+    free(sched);
+}
+
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+    // reset state for the next run
+    if (!sched->is_reset) {
+        ggml_hash_set_reset(&sched->hash_set);
+        memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+        memset(sched->hv_tensor_copies,       0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
+        sched->is_reset = true;
+    }
+    sched->is_alloc = false;
+}
+
+bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
+
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
+    ggml_backend_sched_synchronize(sched);
+
+    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+        return false;
+    }
+
+    ggml_backend_sched_reset(sched);
+
+    return true;
+}
+
+bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+
+    ggml_backend_sched_split_graph(sched, graph);
+
+
+    if (!ggml_backend_sched_alloc_splits(sched)) {
+        return false;
+    }
+
+    sched->is_alloc = true;
+
+    return true;
+}
+
+enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
+    ggml_backend_sched_synchronize(sched);
+    return err;
+}
+
+enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    if (!sched->is_reset && !sched->is_alloc) {
+        ggml_backend_sched_reset(sched);
+    }
+
+    if (!sched->is_alloc) {
+        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+    }
+
+    return ggml_backend_sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_backend_synchronize(sched->backends[i]);
+    }
+}
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
+int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+    return sched->n_splits;
+}
+
+int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    return sched->n_copies;
+}
+
+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
+size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+}
+
+void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+    tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
+    sched->is_reset = false;
+}
+
+ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    int backend_index = tensor_backend_id(node);
+    if (backend_index == -1) {
+        return NULL;
+    }
+    return sched->backends[backend_index];
+}
+
+// utils
+
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->buffer == NULL);
+    GGML_ASSERT(tensor->view_src != NULL);
+    GGML_ASSERT(tensor->view_src->buffer != NULL);
+    GGML_ASSERT(tensor->view_src->data != NULL);
+
+    tensor->buffer = tensor->view_src->buffer;
+    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
+}
+
+void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
+    GGML_ASSERT(tensor->buffer == NULL);
+    GGML_ASSERT(tensor->data == NULL);
+    GGML_ASSERT(tensor->view_src == NULL);
+    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
+    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
+                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
+
+    tensor->buffer = buffer;
+    tensor->data = addr;
+    ggml_backend_buffer_init_tensor(buffer, tensor);
+}
+
+static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
+
+    GGML_ASSERT(src != NULL);
+    GGML_ASSERT(src->data && "graph must be allocated");
+
+    size_t id = ggml_hash_insert(&hash_set, src);
+    if (id == GGML_HASHSET_ALREADY_EXISTS) {
+        return node_copies[ggml_hash_find(&hash_set, src)];
+    }
+
+    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
+    if (src->view_src != NULL) {
+        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+        dst->view_offs = src->view_offs;
+    }
+    dst->op = src->op;
+    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
+    ggml_set_name(dst, src->name);
+
+    // copy src
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        struct ggml_tensor * s = src->src[i];
+        if (s == NULL) {
+            continue;
+        }
+        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+    }
+
+    node_copies[id] = dst;
+    return dst;
+}
+
+static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+    size_t id = ggml_hash_find(hash_set, src);
+    if (node_init[id]) {
+        return;
+    }
+    node_init[id] = true;
+
+    struct ggml_tensor * dst = node_copies[id];
+    if (dst->view_src != NULL) {
+        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
+        ggml_backend_view_init(dst);
+    }
+    else {
+        ggml_backend_tensor_copy(src, dst);
+    }
+
+    // init src
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        struct ggml_tensor * s = src->src[i];
+        if (s == NULL) {
+            continue;
+        }
+        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
+    }
+}
+
+struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
+    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
+    struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
+
+    struct ggml_init_params params = {
+        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true
+    };
+
+    struct ggml_context * ctx_allocated = ggml_init(params);
+    struct ggml_context * ctx_unallocated = ggml_init(params);
+
+    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
+        ggml_hash_set_free(&hash_set);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return {
+            /* .buffer           = */ NULL,
+            /* .ctx_allocated    = */ NULL,
+            /* .ctx_unallocated  = */ NULL,
+            /* .graph            = */ NULL,
+        };
+    }
+
+    // dup nodes
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+    }
+
+    // allocate nodes
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+    if (buffer == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
+        ggml_hash_set_free(&hash_set);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return {
+            /* .buffer           = */ NULL,
+            /* .ctx_allocated    = */ NULL,
+            /* .ctx_unallocated  = */ NULL,
+            /* .graph            = */ NULL,
+        };
+    }
+
+    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
+
+    // copy data and init views
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
+    }
+
+    // build graph copy
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
+        graph_copy->nodes[i] = node_copy;
+    }
+    graph_copy->n_nodes = graph->n_nodes;
+
+    ggml_hash_set_free(&hash_set);
+    free(node_copies);
+    free(node_init);
+
+    return {
+        /* .buffer           = */ buffer,
+        /* .ctx_allocated    = */ ctx_allocated,
+        /* .ctx_unallocated  = */ ctx_unallocated,
+        /* .graph            = */ graph_copy,
+    };
+}
+
+void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
+    ggml_backend_buffer_free(copy.buffer);
+    ggml_free(copy.ctx_allocated);
+    ggml_free(copy.ctx_unallocated);
+}
+
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    if (copy.buffer == NULL) {
+        return false;
+    }
+
+    struct ggml_cgraph * g1 = graph;
+    struct ggml_cgraph * g2 = copy.graph;
+
+    assert(g1->n_nodes == g2->n_nodes);
+
+    for (int i = 0; i < g1->n_nodes; i++) {
+        //printf("eval %d/%d\n", i, g1->n_nodes);
+        struct ggml_tensor * t1 = g1->nodes[i];
+        struct ggml_tensor * t2 = g2->nodes[i];
+
+        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
+
+        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+
+        ggml_backend_graph_compute(backend1, &g1v);
+        ggml_backend_graph_compute(backend2, &g2v);
+
+        if (ggml_is_view_op(t1->op)) {
+            continue;
+        }
+
+        // compare results, calculate rms etc
+        if (!callback(i, t1, t2, user_data)) {
+            break;
+        }
+    }
+
+    ggml_backend_graph_copy_free(copy);
+
+    return true;
+}
+
+// CPU backend - buffer
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+
+    // align the buffer
+    if (data % TENSOR_ALIGNMENT != 0) {
+        data = GGML_PAD(data, TENSOR_ALIGNMENT);
+    }
+
+    return (void *)data;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
+    /* .init_tensor     = */ NULL, // no initialization required
+    /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
+
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+
+    if (data == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+        /* .iface   = */ {
+            /* .get_name         = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+}
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
new file mode 100644
index 000000000..0bf3c05d9
--- /dev/null
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -0,0 +1,87 @@
+if (GGML_STATIC)
+    set(BLA_STATIC ON)
+endif()
+#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+#    set(BLA_SIZEOF_INTEGER 8)
+#endif()
+
+set(BLA_VENDOR ${GGML_BLAS_VENDOR})
+find_package(BLAS)
+
+if (BLAS_FOUND)
+    message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+
+    ggml_add_backend_library(ggml-blas
+                             ggml-blas.cpp
+                            )
+
+    if (${GGML_BLAS_VENDOR} MATCHES "Apple")
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
+    elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_package(PkgConfig REQUIRED)
+        if (${GGML_BLAS_VENDOR} MATCHES "Generic")
+            pkg_check_modules(DepBLAS blas)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
+            # As of openblas v0.3.22, the 64-bit is named openblas64.pc
+            pkg_check_modules(DepBLAS openblas64)
+            if (NOT DepBLAS_FOUND)
+                pkg_check_modules(DepBLAS openblas)
+            endif()
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
+            add_compile_definitions(GGML_BLAS_USE_BLIS)
+            pkg_check_modules(DepBLAS blis)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
+            pkg_check_modules(DepBLAS blas-atlas)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
+            pkg_check_modules(DepBLAS flexiblas_api)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+            add_compile_definitions(GGML_BLAS_USE_MKL)
+            # all Intel* libraries share the same include path
+            pkg_check_modules(DepBLAS mkl-sdl)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
+            # this doesn't provide pkg-config
+            # suggest to assign BLAS_INCLUDE_DIRS on your own
+            if ("${NVHPC_VERSION}" STREQUAL "")
+                message(WARNING "Better to set NVHPC_VERSION")
+            else()
+                set(DepBLAS_FOUND ON)
+                set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
+            endif()
+        endif()
+        if (DepBLAS_FOUND)
+            set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
+        else()
+            message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
+            " detected by pkgconfig, trying to find cblas.h from possible paths...")
+            find_path(BLAS_INCLUDE_DIRS
+                NAMES cblas.h
+                HINTS
+                    /usr/include
+                    /usr/local/include
+                    /usr/include/openblas
+                    /opt/homebrew/opt/openblas/include
+                    /usr/local/opt/openblas/include
+                    /usr/include/x86_64-linux-gnu/openblas/include
+            )
+        endif()
+    endif()
+
+    message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
+    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
+
+    if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+        add_compile_definitions(GGML_BLAS_USE_MKL)
+    endif()
+
+    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
+    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
+else()
+    message(ERROR "BLAS not found, please refer to "
+                  "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                  " to set correct GGML_BLAS_VENDOR")
+endif()
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
new file mode 100644
index 000000000..ec158dfac
--- /dev/null
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -0,0 +1,517 @@
+#include "ggml-impl.h"
+#include "ggml-blas.h"
+#include "ggml-backend-impl.h"
+
+#include <future>
+#include <vector>
+#include <cstring>
+
+#if defined(GGML_BLAS_USE_ACCELERATE)
+#   include <Accelerate/Accelerate.h>
+#elif defined(GGML_BLAS_USE_MKL)
+#   include <mkl.h>
+#elif defined(GGML_BLAS_USE_BLIS)
+#   include <blis.h>
+#elif defined(GGML_BLAS_USE_NVPL)
+#   include <nvpl_blas.h>
+#else
+#   include <cblas.h>
+#endif
+
+struct ggml_backend_blas_context {
+    int n_threads = GGML_DEFAULT_N_THREADS;
+    std::unique_ptr<char[]> work_data;
+    size_t work_size = 0;
+#ifndef GGML_USE_OPENMP
+    std::vector<std::future<void>> tasks;
+#endif
+};
+
+static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const int64_t ne_plane      = ne01*ne00;
+    const size_t  desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
+
+    if (ctx->work_size < desired_wsize) {
+        ctx->work_data.reset(new char[desired_wsize]);
+        ctx->work_size = desired_wsize;
+    }
+    void * wdata = ctx->work_data.get();
+
+    // convert src0 to float
+    if (type != GGML_TYPE_F32) {
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                const void  *       x      = (char *)  src0->data + i02*nb02          + i03*nb03;
+                      float * const wplane = (float *) wdata      + i02*ne_plane      + i03*ne02*ne_plane;
+
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
+                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
+
+#ifdef GGML_USE_OPENMP
+                #pragma omp parallel for num_threads(n_threads)
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                }
+#else
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start =       i*ne01/n_threads;
+                    const int64_t end   = (i + 1)*ne01/n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                            }
+                        }));
+                    }
+                }
+                {
+                    // reuse the current thread for the first task
+                    const int64_t start = 0;
+                    const int64_t end   = ne01/n_threads;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                    }
+                }
+#endif
+            }
+        }
+
+#ifndef GGML_USE_OPENMP
+        // wait for all tasks to finish
+        for (auto & task : ctx->tasks) {
+            task.get();
+        }
+        ctx->tasks.clear();
+#endif
+    }
+
+#if defined(OPENBLAS_VERSION)
+    openblas_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(GGML_BLAS_USE_BLIS)
+    bli_thread_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(GGML_BLAS_USE_NVPL)
+    nvpl_blas_set_num_threads(ctx->n_threads);
+#endif
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            const int64_t i03 = i13/r3;
+            const int64_t i02 = i12/r2;
+
+            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
+            const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                  float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
+
+            if (type != GGML_TYPE_F32) {
+                x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
+            }
+
+            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne1, ne01, ne10,
+                        1.0f,   y, ne10,
+                                x, ne00,
+                        0.0f,   d, ne01);
+        }
+    }
+}
+
+static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(ne0  == ne00);
+    GGML_ASSERT(ne1  == ne10);
+    GGML_ASSERT(ne2  == ne02);
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne3  == ne13);
+    GGML_ASSERT(ne03 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    // GGML_ASSERT(nb0 <= nb1);
+    // GGML_ASSERT(nb1 <= nb2);
+    // GGML_ASSERT(nb2 <= nb3);
+
+    // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+    // src0: (k,n)
+    // src1: (k,m)
+    // dst:  (m,n)
+    //
+    // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+    // Also expressed as (major,minor)
+    // a: (m,k): so src1 transposed
+    // b: (k,n): so src0
+    // c: (m,n)
+    //
+    // However, if ggml_is_transposed(src1) is true, then
+    // src1->data already contains a transposed version, so sgemm mustn't
+    // transpose it further.
+
+    int n = src0->ne[0];
+    int k = src0->ne[1];
+    int m = src1->ne[0];
+
+    CBLAS_TRANSPOSE transposeA;
+    int lda;
+
+    if (!ggml_is_transposed(src1)) {
+        transposeA = CblasTrans;
+        lda = m;
+    } else {
+        transposeA = CblasNoTrans;
+        lda = k;
+    }
+
+    float * a = (float *) ((char *) src1->data);
+    float * b = (float *) ((char *) src0->data);
+    float * c = (float *) ((char *) dst->data);
+
+    cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+
+    GGML_UNUSED(ctx);
+}
+
+// backend interface
+
+static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
+    return "BLAS";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_blas_free(ggml_backend_t backend) {
+    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
+    delete ctx;
+    delete backend;
+}
+
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        switch (node->op) {
+            case GGML_OP_MUL_MAT:
+                ggml_backend_blas_mul_mat(ctx, node);
+                break;
+
+            case GGML_OP_OUT_PROD:
+                ggml_backend_blas_out_prod(ctx, node);
+                break;
+
+            case GGML_OP_NONE:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+                break;
+
+            default:
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+        }
+    }
+
+    return GGML_STATUS_SUCCESS;
+
+    GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i blas_backend_i = {
+    /* .get_name                = */ ggml_backend_blas_get_name,
+    /* .free                    = */ ggml_backend_blas_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_blas_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_blas_guid(void) {
+    static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_blas_init(void) {
+    ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
+
+    ggml_backend_t backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_blas_guid(),
+        /* .interface = */ blas_backend_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
+        /* .context   = */ ctx,
+    };
+
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
+        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+    }
+#endif
+
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#endif
+
+    return backend;
+}
+
+bool ggml_backend_is_blas(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
+}
+
+void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_blas(backend_blas));
+
+    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
+    ctx->n_threads = n_threads;
+}
+
+// device interface
+
+static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
+    return "BLAS";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
+    #if defined(GGML_BLAS_USE_ACCELERATE)
+        return "Accelerate";
+    #elif defined(GGML_BLAS_USE_MKL)
+        return "MKL";
+    #elif defined(GGML_BLAS_USE_BLIS)
+        return "BLIS";
+    #elif defined(GGML_BLAS_USE_NVPL)
+        return "NVPL";
+    #elif defined(OPENBLAS_VERSION)
+        return "OpenBLAS";
+    #else
+        return "BLAS";
+    #endif
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_blas_device_get_name(dev);
+    props->description = ggml_backend_blas_device_get_description(dev);
+    props->type        = ggml_backend_blas_device_get_type(dev);
+    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_blas_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT:
+        {
+            // BLAS usually is only faster for large matrices
+            const struct ggml_tensor * src0 = op->src[0];
+            const struct ggml_tensor * src1 = op->src[1];
+
+            const int64_t ne10 = src1->ne[0];
+
+            const int64_t ne0 = op->ne[0];
+            const int64_t ne1 = op->ne[1];
+
+            // TODO: find the optimal value
+            const int64_t min_batch = 32;
+
+            return ggml_is_contiguous(src0) &&
+                   ggml_is_contiguous(src1) &&
+                   src1->type == GGML_TYPE_F32 &&
+                   (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
+        }
+
+        case GGML_OP_OUT_PROD:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   ggml_is_matrix(src0) &&
+                   ggml_is_matrix(src1) &&
+                   ggml_is_contiguous(src0) &&
+                   (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
+
+        default:
+            return false;
+
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
+    /* .get_name             = */ ggml_backend_blas_device_get_name,
+    /* .get_description      = */ ggml_backend_blas_device_get_description,
+    /* .get_memory           = */ ggml_backend_blas_device_get_memory,
+    /* .get_type             = */ ggml_backend_blas_device_get_type,
+    /* .get_props            = */ ggml_backend_blas_device_get_props,
+    /* .init_backend         = */ ggml_backend_blas_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_blas_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
+    return "BLAS";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device ggml_backend_blas_device = {
+        /* .iface   = */ ggml_backend_blas_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_blas_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_blas_set_n_threads;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
+    /* .get_name         = */ ggml_backend_blas_reg_get_name,
+    /* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_blas_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_blas_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_blas_reg(void) {
+    static struct ggml_backend_reg ggml_backend_blas_reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_blas_reg_i,
+        /* .context     = */ NULL,
+    };
+
+    return &ggml_backend_blas_reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
diff --git a/ggml/src/ggml-cann/.clang-format b/ggml/src/ggml-cann/.clang-format
new file mode 100644
index 000000000..2ad03d739
--- /dev/null
+++ b/ggml/src/ggml-cann/.clang-format
@@ -0,0 +1,168 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveMacros: false
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: WithoutElse
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: true
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: true
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+ReflowComments:  true
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        Auto
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseCRLF:         false
+UseTab:          Never
+...
+
diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt
new file mode 100644
index 000000000..05cf06bfa
--- /dev/null
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -0,0 +1,76 @@
+if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
+    set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
+    message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
+endif()
+
+# Auto-detech Soc type and Soc version, if detect failed, will abort build
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info
+        RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR ${npu_result})
+        message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
+endif()
+
+string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
+
+# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
+
+if (CANN_INSTALL_DIR)
+    # Only Support Linux.
+    if (NOT UNIX)
+        message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}")
+    endif()
+
+    # Supported platforms: x86-64, arm64
+    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
+    else()
+        message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
+
+    # Set header and libs
+    set(CANN_INCLUDE_DIRS
+        ${CANN_INSTALL_DIR}/include
+        ${CANN_INSTALL_DIR}/include/aclnn
+        ${CANN_INSTALL_DIR}/acllib/include
+    )
+
+    add_subdirectory(kernels)
+    list(APPEND CANN_LIBRARIES
+        ascendcl
+        nnopbase
+        opapi
+        acl_op_compiler
+        ascendc_kernels
+    )
+
+    file(GLOB GGML_SOURCES_CANN "*.cpp")
+
+    ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
+    target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
+    target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
+    target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
+
+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
+    message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
+    message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
+else()
+    message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
+endif()
diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile
new file mode 100644
index 000000000..3290a4859
--- /dev/null
+++ b/ggml/src/ggml-cann/Doxyfile
@@ -0,0 +1,2579 @@
+# Doxyfile 1.8.17
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "ggml"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "Tensor library for machine learning"
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = docs
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is
+# Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = YES
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# (including Cygwin) ands Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  =
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen
+# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.doc \
+                         *.txt \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f \
+                         *.for \
+                         *.tcl \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.ice
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
+# were built. This is equivalent to specifying the "-p" option to a clang tool,
+# such as clang-check. These options will then be passed to the parser.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: https://developer.apple.com/xcode/), introduced with OSX
+# 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using JavaScript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         =
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: YES.
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
new file mode 100644
index 000000000..d120ce6ac
--- /dev/null
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "acl_tensor.h"
+
+#include <algorithm>
+#include <cstring>
+
+aclDataType ggml_cann_type_mapping(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return ACL_FLOAT;
+        case GGML_TYPE_F16:
+            return ACL_FLOAT16;
+        case GGML_TYPE_I8:
+            return ACL_INT8;
+        case GGML_TYPE_I16:
+            return ACL_INT16;
+        case GGML_TYPE_I32:
+            return ACL_INT32;
+        case GGML_TYPE_Q4_0:
+            return ACL_INT4;
+        case GGML_TYPE_Q8_0:
+            return ACL_INT8;
+        default:
+            return ACL_DT_UNDEFINED;
+    }
+    return ACL_DT_UNDEFINED;
+}
+
+aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
+                                   size_t* nb, int64_t dims, aclFormat format,
+                                   size_t offset) {
+    // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
+    // added.
+    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
+
+    int64_t acl_storage_len = 0;
+    if (ne == nullptr) {
+        acl_storage_len = ggml_nbytes(tensor);
+        for (int i = 0; i < GGML_MAX_DIMS; i++) {
+            acl_ne[i] = tensor->ne[i];
+            // The step size of acl is in elements.
+            acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
+        }
+    } else {
+        // With bcast
+        for (int i = 0; i < dims; i++) {
+            acl_storage_len += (ne[i] - 1) * nb[i];
+            acl_ne[i] = ne[i];
+            acl_stride[i] = nb[i] / ggml_element_size(tensor);
+        }
+    }
+
+    // Reverse ne and stride.
+    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    std::reverse(acl_ne, acl_ne + final_dims);
+    std::reverse(acl_stride, acl_stride + final_dims);
+
+    aclTensor* acl_tensor = aclCreateTensor(
+        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
+        offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
+        tensor->data);
+
+    return acl_tensor;
+}
+
+bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
+                                  const ggml_tensor* src1,
+                                  int64_t* bcast_src0_ne,
+                                  int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
+                                  size_t* bcast_src1_nb) {
+    GGML_ASSERT(ggml_can_repeat(src1, src0));
+    int bcast_dim_cnt = 0;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        int64_t nr = src0->ne[i] / src1->ne[i];
+        bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
+        bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
+        bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
+        bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
+        bcast_dim_cnt++;
+        if (nr != 1) {
+            // Need to add an extra dim.
+            bcast_src0_ne[bcast_dim_cnt] = nr;
+            bcast_src1_ne[bcast_dim_cnt] = 1;
+            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
+                                           bcast_src0_ne[bcast_dim_cnt - 1];
+            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
+                                           bcast_src1_ne[bcast_dim_cnt - 1];
+            bcast_dim_cnt++;
+        }
+    }
+    return bcast_dim_cnt;
+}
+
+int64_t ggml_cann_get_mulmat_bcast_shape(
+    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
+    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
+    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
+    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
+    // input and dst shoule in same shape, except first two dims.
+    GGML_ASSERT(input_ne[2] == dst_ne[2]);
+    GGML_ASSERT(input_ne[3] == dst_ne[3]);
+
+    int bcast_dim_cnt = 0;
+
+    // For mul_mat, a dimension needs to be added before the dimension that
+    // weight needs to be expanded to satisfy the bcast rule of matrix
+    // multiplication.
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        int64_t nr = input_ne[i] / weight_ne[i];
+        // Do not use bcast in the first two dimensions because we only support
+        // the bcast batch dimension. Just copy them.
+        if (i < 2 || nr == 1) {
+            bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
+            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+
+            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
+            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_dim_cnt++;
+        } else {
+            // Need to add an extra dim.
+            bcast_input_ne[bcast_dim_cnt] = nr;
+            bcast_dst_ne[bcast_dim_cnt] = nr;
+            bcast_weight_ne[bcast_dim_cnt] = 1;
+            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
+            bcast_dim_cnt++;
+
+            bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
+            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+            bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
+            bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
+                                            bcast_input_ne[bcast_dim_cnt - 1];
+            bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
+                                          bcast_dst_ne[bcast_dim_cnt - 1];
+            bcast_weight_nb[bcast_dim_cnt] =
+                bcast_weight_nb[bcast_dim_cnt - 1] *
+                bcast_weight_ne[bcast_dim_cnt - 1];
+            bcast_dim_cnt++;
+        }
+    }
+    return bcast_dim_cnt;
+}
diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h
new file mode 100644
index 000000000..4734a9cb8
--- /dev/null
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CANN_ACL_TENSOR_H
+#define CANN_ACL_TENSOR_H
+
+#include <algorithm>
+#include <cstring>
+
+#include <aclnn/aclnn_base.h>
+#include "common.h"
+
+/**
+ * @brief	Maps a ggml_type to its corresponding aclDataType.
+ *
+ * @details	This function takes a ggml_type as input and returns the corresponding
+ *			aclDataType. It supports mapping for various ggml_types. If the input type
+ *			does not match any of the predefined ggml_types, the function returns
+ *          ACL_DT_UNDEFINED.
+ *
+ * @param	type    The ggml_type to be mapped.
+ * @return	The corresponding aclDataType. If the input type is not recognized,
+ *			ACL_DT_UNDEFINED is returned.
+ */
+aclDataType ggml_cann_type_mapping(ggml_type type);
+
+/**
+ * @brief   Creates an ACL tensor from a ggml_tensor with optional shape.
+ *
+ * @details This function creates an ACL tensor based on the properties of the
+ *          provided ggml_tensor. It supports customer shape by adjusting dimensions
+ *          and strides accordingly. If customer shape is applied, additional
+ *          dimensions and strides are calculated based on the provided parameters.
+ *
+ * @param   tensor      Pointer to the ggml_tensor to be converted to ACL tensor.
+ * @param   ne          Pointer to an array containing dimensions. Defaults to nullptr
+ *                      if no customer shape is applied.
+ * @param   nb          Pointer to an array containing strides. Defaults to nullptr
+ *                      if no customer shape is applied.
+ * @param   dims        Number of dimensions in the tensor. Defaults to 0 if no customer
+ *                      shape is applied.
+ * @param   format      ACL tensor format. Defaults to ACL_FORMAT_ND.
+ * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
+ * @return  Pointer to the created ACL tensor.
+ */
+aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
+                             size_t* nb = nullptr, int64_t dims = 0,
+                             aclFormat format = ACL_FORMAT_ND,
+                             size_t offset = 0);
+
+/**
+ * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
+ *          should be size_t or float.
+ *
+ * @details This function creates an ACL tensor using the provided data pointer,
+ *          data type, dimensions, strides, format, offset, and additional parameters.
+ *          It calculates necessary dimensions and strides based on the provided ne and nb
+ *          arrays, adjusting them for the ACL tensor creation. The ACL storage length
+ *          is also calculated based on the provided dimensions and strides.
+ *
+ * @param   data_ptr    Pointer to the data buffer for the ACL tensor.
+ * @param   dtype       ACL data type of the tensor.
+ * @param   type_size   Size of each element in the tensor data buffer.
+ * @param   ne          Pointer to an array containing tensor dimensions.
+ * @param   nb          Pointer to an array containing tensor strides.
+ * @param   dims        Number of dimensions of the tensor.
+ * @param   format      ACL tensor format. Defaults to ACL_FORMAT_ND.
+ * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
+ * @return  Pointer to the created ACL tensor.
+ */
+template<typename TYPE>
+aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
+                                   TYPE type_size, int64_t* ne, TYPE* nb,
+                                   int64_t dims,
+                                   aclFormat format = ACL_FORMAT_ND,
+                                   size_t offset = 0) {
+    int64_t tmp_ne[GGML_MAX_DIMS * 2];
+    int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+    for (int i = 0; i < dims; i++) {
+        tmp_stride[i] = nb[i] / type_size;
+    }
+
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
+    int64_t acl_storage_len = 0;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (ne[i] - 1) * nb[i];
+    }
+
+    aclTensor* acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+                        format, &acl_storage_len, 1, data_ptr);
+
+    return acl_tensor;
+}
+
+/**
+ * @brief   Checks if tensors require broadcasting based on their shapes.
+ *
+ * @details This function determines if two ggml_tensors need to be broadcasted for
+ *          element-wise operations. Broadcasting is necessary if the shapes of the
+ *          tensors are not identical and no dimension in either tensor equals 1.
+ *
+ * @param   t0      Pointer to the first ggml_tensor.
+ * @param   t1      Pointer to the second ggml_tensor.
+ * @return  True if broadcasting is needed, False otherwise.
+ *
+ * @remarks This function iterates over the dimensions of t0 and t1. It checks if each
+ *          dimension in t1 differs from t0's corresponding dimension and is not equal
+ *          to 1. If such a dimension is found, broadcasting is required to align t1
+ *          with t0 for element-wise operations.
+ */
+bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
+
+/**
+ * @brief   Computes broadcast shapes and strides for two ggml_tensors.
+ *
+ * @details This function calculates the broadcast shapes and strides for two ggml_tensors,
+ *          following the broadcasting rules similar to numpy. It adjusts dimensions and
+ *          strides to ensure compatibility for element-wise operations where one tensor
+ *          can be broadcasted to match the shape of another tensor.
+ *
+ * @param   src0                Pointer to the first ggml_tensor.
+ * @param   src1                Pointer to the second ggml_tensor.
+ * @param   bcast_ne_src0       Output array to store broadcasted dimensions for src0.
+ * @param   bcast_ne_src1       Output array to store broadcasted dimensions for src1.
+ * @param   bcast_nb_src0       Output array to store broadcasted strides for src0.
+ * @param   bcast_nb_src1       Output array to store broadcasted strides for src1.
+ * @return  Number of dimensions in the broadcasted shape.
+ *
+ * @pre     ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
+ *          to match src0.
+ *
+ * @remarks This function iterates over the dimensions of src0 and src1, calculating the
+ *          necessary broadcast dimensions and strides. If a dimension requires broadcasting
+ *          (i.e., its size in src1 is smaller than in src0), an additional dimension is
+ *          added with size calculated to match src0's dimension. This adjustment ensures
+ *          that src1 can be element-wise broadcasted to src0's shape.
+ *
+ *  How it works:
+ *
+ *  if dim0 has padding.
+ *  a -> (2, 2) padding = 2
+ *   a: [[1, 2, *, *]
+ *       [2, 3, *, *]]
+ *  nb = (8, 4, 2)
+ *
+ *  if a should bcast with b -> (2, 4)
+ *  b' -> (2, 2, 2)
+ *  b : [[1, 2, 3, 4, *, *]
+ *       [5, 6, 7, 8, *, *]]
+ *  nb = (12, 6, 1)
+ *
+ *  after bcast:
+ *  a' -> (2, 1, 2)
+ *  a': [[[1, 2], *, *]
+ *       [[2, 3], *, *]]
+ *  nb = (8, 4, 2, 1)
+ *
+ *  b' : [[[1, 2], [3, 4], *, *]
+ *        [[5, 6], [7, 8], *, *]]
+ *  nb = (12, 6, 2, 1)
+ *  \endcode
+ *
+ *  dim1 in a inserted dim, should add nb for dim1,
+ *  and all other nb moves to next in order.
+ */
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
+                        int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
+                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
+
+// Bcast macro to avoid duplicate code.
+#define BCAST_SHAPE(src0, src1)                                              \
+    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
+    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
+    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
+    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
+    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
+        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
+        bcast_##src1##_nb);
+
+#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+
+/**
+ * @brief Calculates broadcast shapes for matrix multiplication.
+ *
+ * @details This function computes the broadcast shapes required for matrix multiplication
+ *          based on the input, weight, and destination tensor shapes. It ensures that the
+ *          dimensions of weight tensors are expanded appropriately to satisfy matrix
+ *          multiplication broadcast rules.
+ *
+ * @param input_ne      Array containing the dimensions of the input tensor.
+ * @param weight_ne     Array containing the dimensions of the weight tensor.
+ * @param dst_ne        Array containing the dimensions of the destination tensor.
+ * @param input_nb      Array containing the strides of the input tensor.
+ * @param weight_nb     Array containing the strides of the weight tensor.
+ * @param dst_nb        Array containing the strides of the destination tensor.
+ * @param bcast_input_ne    Output array for broadcasted input tensor dimensions.
+ * @param bcast_weight_ne   Output array for broadcasted weight tensor dimensions.
+ * @param bcast_dst_ne      Output array for broadcasted destination tensor dimensions.
+ * @param bcast_input_nb    Output array for broadcasted input tensor strides.
+ * @param bcast_weight_nb   Output array for broadcasted weight tensor strides.
+ * @param bcast_dst_nb      Output array for broadcasted destination tensor strides.
+ * @return The number of dimensions in the broadcasted tensors.
+ *
+ * @remarks This function iterates over the tensor dimensions and calculates the broadcast
+ *          shapes needed for matrix multiplication. It ensures that dimensions where
+ *          weight tensor requires expansion are appropriately handled to conform with
+ *          broadcasting rules.
+ * @note compare with ggml_cann_get_bcast_shape, mul_mat broadcast need add this new dim
+ *       before cast dim.
+ * @sa ggml_cann_get_bcast_shape
+ */
+int64_t ggml_cann_get_mulmat_bcast_shape(
+    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
+    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
+    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
+    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
+
+// Bcast macro to avoid duplicate code.
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
+    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
+    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
+    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
+    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
+    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
+    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
+    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
+        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
+        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
+        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
+
+#define BCAST_MUL_MAT_PARAM(tensor) \
+    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+
+#endif  // CANN_ACL_TENSOR_H
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
new file mode 100644
index 000000000..b2d857e1e
--- /dev/null
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -0,0 +1,3427 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "aclnn_ops.h"
+
+#include <aclnnop/aclnn_addcdiv.h>
+#include <aclnnop/aclnn_avgpool2d.h>
+#include <aclnnop/aclnn_batch_matmul.h>
+#include <aclnnop/aclnn_cast.h>
+#include <aclnnop/aclnn_constant_pad_nd.h>
+#include <aclnnop/aclnn_copy.h>
+#include <aclnnop/aclnn_cos.h>
+#include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_exp.h>
+#include <aclnnop/aclnn_fill_scalar.h>
+#include <aclnnop/aclnn_group_norm.h>
+#include <aclnnop/aclnn_index_fill_tensor.h>
+#include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_matmul.h>
+#include <aclnnop/aclnn_max_pool.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_permute.h>
+#include <aclnnop/aclnn_pow_tensor_tensor.h>
+#include <aclnnop/aclnn_reduce_sum.h>
+#include <aclnnop/aclnn_repeat.h>
+#include <aclnnop/aclnn_repeat_interleave.h>
+#include <aclnnop/aclnn_roll.h>
+#include <aclnnop/aclnn_sin.h>
+#include <aclnnop/aclnn_softmax.h>
+#include <aclnnop/aclnn_tril.h>
+#include <aclnnop/aclnn_triu.h>
+#include <aclnnop/aclnn_upsample_nearest_2d.h>
+#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+#include <float.h>
+
+#include <cmath>
+#include <cstring>
+#include <exception>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "kernels/ascendc_kernels.h"
+
+#define GGML_COMMON_DECL_C
+
+#include "../ggml-common.h"
+
+/**
+ * @brief Repeats elements of a tensor along each dimension according to the
+ * specified repeat array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be repeated.
+ * @param acl_dst The destination tensor after repeating.
+ * @param repeat_array The array specifying the number of repetitions along each
+ * dimension.
+ */
+static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                         aclTensor* acl_dst, int64_t* repeat_array) {
+    // repeat tensor along each dim with repeat_array
+    aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
+                                          &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        // Memory from allocator will "free" immediately, and this memory
+        // will be alloced to other pointers, but it won't access before
+        // this async task end because all tasks in same stream will execute
+        // in queue.
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(
+        aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    ACL_CHECK(aclDestroyIntArray(repeats));
+}
+
+void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(ggml_can_repeat(src, dst));
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
+                              dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
+
+    aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Adds two tensors element-wise and stores the result in a destination
+ * tensor.
+ *
+ * This function performs the operation:
+ * \f[
+ *    dst = acl\_src0 + alpha \times acl\_src1
+ * \f]
+ * where alpha is a scalar value and defaults to 1.0f.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src0 The first source tensor.
+ * @param acl_src1 The second source tensor.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
+                      aclTensor* acl_src1, aclTensor* acl_dst) {
+    aclScalar* alpha = nullptr;
+    float alphaValue = 1.0f;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
+                                       &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(alpha));
+}
+
+void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    aclTensor* acl_src0;
+    aclTensor* acl_src1;
+    aclTensor* acl_dst;
+
+    // Need bcast
+    if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
+        BCAST_SHAPE(src0, src1)
+        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
+        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
+        acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
+    } else {
+        acl_src0 = ggml_cann_create_tensor(src0);
+        acl_src1 = ggml_cann_create_tensor(src1);
+        acl_dst = ggml_cann_create_tensor(dst);
+    }
+
+    aclnn_add(ctx, acl_src0, acl_src1, acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_src1));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+    aclScalar* acl_negative_slope =
+        aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
+        acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(acl_negative_slope));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Concatenates a list of tensors along a specified dimension and stores
+ * the result in a destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param tensorList The list of tensors to be concatenated.
+ * @param acl_dst The destination tensor where the concatenated result will be
+ * stored.
+ * @param concat_dim The dimension along which the tensors will be concatenated.
+ */
+static void aclnn_concat(ggml_backend_cann_context& ctx,
+                         aclTensorList* tensorList, aclTensor* acl_dst,
+                         int64_t concat_dim) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst,
+                                       &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
+    aclTensor* tensors[] = {acl_src0, acl_src1};
+    aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
+
+    ACL_CHECK(aclDestroyTensorList(tensorList));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Creates a tensor with values starting from `start`, incremented by
+ * `step`, and ending before `stop`.
+ *
+ * This function performs the operation:
+ * \f[
+ *    \text {out }_{i+1}=\text {out }_i+\text {step}
+ * \f]
+ * the range is [start, stop).
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_dst The destination tensor where the values will be stored.
+ * @param start The starting value of the range.
+ * @param stop The ending value of the range (exclusive).
+ * @param step The step size between consecutive values.
+ * @param n_elements The number of elements in the destination tensor.
+ */
+static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
+                         float start, float stop, float step,
+                         int64_t n_elements) {
+    int64_t steps = (int64_t)std::ceil((stop - start) / step);
+    GGML_ASSERT(n_elements == steps);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
+    aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
+    aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
+                                          &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(acl_start));
+    ACL_CHECK(aclDestroyScalar(acl_end));
+    ACL_CHECK(aclDestroyScalar(acl_step));
+}
+
+void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    int64_t n_elements = ggml_nelements(dst);
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
+    memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
+    memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
+
+    aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    dst->src[1] = dst->src[0];
+    ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
+}
+
+void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
+    aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
+                                         &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(acl_min));
+    ACL_CHECK(aclDestroyScalar(acl_max));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    // scale factor
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));
+
+    aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
+                                        &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(scale));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    ggml_cann_pool_alloc temp_buffer_allocator(
+        ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
+    void* buffer = temp_buffer_allocator.get();
+    aclTensor* tmp_tensor =
+        ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
+                                dst->ne, dst->nb, GGML_MAX_DIMS);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnArgsortGetWorkspaceSize(
+        acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
+        &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    workspaceSize = 0;
+    ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor,
+                                        ggml_cann_type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(tmp_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    std::vector<int64_t> normData = {dst->ne[0]};
+    aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
+    ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr,
+                                             eps, acl_dst, nullptr, nullptr,
+                                             &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(norm));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    int n_groups = dst->op_params[0];
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    int64_t N = src->ne[3];
+    int64_t C = src->ne[2];
+    int64_t HxW = src->ne[1] * src->ne[0];
+
+    size_t type_size = ggml_type_size(src->type);
+    int64_t ne[] = {n_groups, N};
+    size_t nb[] = {type_size, type_size * n_groups};
+    size_t n_bytes = N * n_groups;
+
+    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
+    void* buffer = temp_buffer_allocator.get();
+    aclTensor* acl_mean_out = ggml_cann_create_tensor(
+        buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+    aclTensor* acl_rstd_out = ggml_cann_create_tensor(
+        (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+
+    ACL_CHECK(aclnnGroupNormGetWorkspaceSize(
+        acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst,
+        acl_mean_out, acl_rstd_out, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(acl_mean_out));
+    ACL_CHECK(aclDestroyTensor(acl_rstd_out));
+}
+
+void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+
+    size_t nb1 = ((int32_t*)dst->op_params)[0];
+    size_t nb2 = ((int32_t*)dst->op_params)[1];
+    size_t nb3 = ((int32_t*)dst->op_params)[2];
+    size_t offset = ((int32_t*)dst->op_params)[3];
+    bool inplace = (bool)((int32_t*)dst->op_params)[4];
+
+    size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
+
+    aclTensor* acl_dst = ggml_cann_create_tensor(
+        dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+
+    aclScalar* alpha = nullptr;
+    float alphaValue = 1.0f;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    if (!inplace) {
+        size_t cpy_size = ggml_nbytes(dst);
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+        aclTensor* acl_src0 = ggml_cann_create_tensor(
+            src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+        ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
+                                           &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+        ACL_CHECK(
+            aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+        ACL_CHECK(aclDestroyTensor(acl_src0));
+    } else {
+        ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha,
+                                                  &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+        ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+    }
+
+    ACL_CHECK(aclDestroyTensor(acl_src1));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+
+    GGML_ASSERT(dst->ne[0] == 1);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    int64_t reduce_dims_host[] = {3};
+    aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnReduceSumGetWorkspaceSize(
+        acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst,
+        &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    aclTensor* acl_src =
+        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+    std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
+    auto output_size_array = aclCreateIntArray(output_size.data(), 2);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize(
+        acl_src, output_size_array, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(output_size_array));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Pads a tensor with a specified value along each dimension.
+ *
+ * This function performs padding of the source tensor `acl_src` and stores the
+ * result in the destination tensor `acl_dst`. The padding values for each
+ * dimension are specified in the `paddings` array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be padded.
+ * @param acl_dst The destination tensor where the padded result will be stored.
+ * @param paddings An array specifying the padding values for each dimension.
+ * The size of the array should be twice the number of dimensions of the tensor.
+ * @param value The value to be used for padding. The default value is 0.0.
+ */
+static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_dst, int64_t* paddings,
+                      float value = 0.0f) {
+    aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize(
+        acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor,
+                                 ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(acl_pad));
+    ACL_CHECK(aclDestroyScalar(acl_value));
+}
+
+void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    // padding: value in the array means how much distance will be padding.
+    // the position of elements in the array means which dirction to padding,
+    // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
+    //                       dim2.front, dim2.behind, dim3.front, dim3.behind]
+    int64_t paddings[] = {
+        0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
+        0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
+    aclnn_pad(ctx, acl_src, acl_dst, paddings);
+
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+}
+
+/**
+ * @brief Performs 2D average pooling on the input tensor and stores the result
+ * in the destination tensor.
+ *
+ * This function performs average pooling on the source tensor and stores the
+ * result in the destination tensor. The pooling parameters (kernel size,
+ * strides, padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
+static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
+                                 ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src =
+        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    std::vector<int64_t> kernel_dims = {k1, k0};
+    std::vector<int64_t> stride_dims = {s1, s0};
+    std::vector<int64_t> padding_avg_dims = {p1, p0};  // (padH, padW)
+
+    auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+    auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+    auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
+
+    bool ceil_mode = false;
+    bool count_include_pad = true;
+    int64_t divisor_override = 0;
+    int8_t cube_math_type = 0;
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize(
+        acl_src, kernel_size, strides, paddings_avg, ceil_mode,
+        count_include_pad, divisor_override, cube_math_type, acl_dst,
+        &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(
+        aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyIntArray(kernel_size));
+    ACL_CHECK(aclDestroyIntArray(strides));
+    ACL_CHECK(aclDestroyIntArray(paddings_avg));
+}
+
+/**
+ * @brief Performs 2D max pooling on the input tensor and stores the result in
+ * the destination tensor.
+ *
+ * This function performs max pooling on the source tensor and stores the result
+ * in the destination tensor. The pooling parameters (kernel size, strides,
+ * padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
+static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
+                                 ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src =
+        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
+                         src->ne[3]};
+    size_t temp_nb[GGML_MAX_DIMS];
+
+    temp_nb[0] = ggml_element_size(src);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
+    }
+
+    ggml_cann_pool_alloc temp_buffer_allocator(
+        ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
+    void* buffer = temp_buffer_allocator.get();
+    aclTensor* tmp_tensor = ggml_cann_create_tensor(
+        buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_NCHW);
+
+    // pad: see padding in ggml_cann_pad()
+    int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
+    float value = -FLT_MAX;
+    aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
+
+    // max_pool
+    std::vector<int64_t> kernel_dims = {k1, k0};
+    std::vector<int64_t> stride_dims = {s1, s0};
+    // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
+    std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
+    std::vector<int64_t> dilation_size = {1, 1};
+    auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+    auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+    auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
+    auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
+
+    bool ceil_mode = false;
+    int64_t auto_pads = 0;
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMaxPoolGetWorkspaceSize(
+        tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
+        ceil_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(tmp_tensor));
+    ACL_CHECK(aclDestroyIntArray(kernel_size));
+    ACL_CHECK(aclDestroyIntArray(strides));
+    ACL_CHECK(aclDestroyIntArray(paddings_max));
+    ACL_CHECK(aclDestroyIntArray(dilations));
+}
+
+void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const int32_t* opts = (const int32_t*)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    switch (op) {
+        case GGML_OP_POOL_AVG:
+            ggml_cann_avg_pool2d(ctx, dst);
+            break;
+        case GGML_OP_POOL_MAX:
+            ggml_cann_max_pool2d(ctx, dst);
+            break;
+        case GGML_OP_POOL_COUNT:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+/**
+ * @brief Copies data from the source tensor to the destination tensor.
+ *
+ * This function copies data from the source tensor `acl_src` to the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor from which data will be copied.
+ * @param acl_dst The destination tensor where the data will be copied to.
+ */
+static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
+    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
+    src->extra = src_extra_allocator.get();
+    dst->extra = dst_extra_allocator.get();
+    ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
+                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
+                               ctx.stream()));
+    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
+                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
+                               ctx.stream()));
+
+    if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
+        ggml_are_same_shape(src, dst)) {
+        cann_copy(ctx, acl_src, acl_dst);
+        ACL_CHECK(aclDestroyTensor(acl_src));
+        ACL_CHECK(aclDestroyTensor(acl_dst));
+        return;
+    }
+    // TODO: simplify
+    if (src->type == GGML_TYPE_F16) {
+        if (dst->type == GGML_TYPE_Q8_0) {
+            aclrtlaunch_ascendc_quantize_f16_q8_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
+        if (dst->type == GGML_TYPE_F16) {
+            if (ggml_are_same_shape(src, dst)) {
+                cann_copy(ctx, acl_src, acl_dst);
+                ACL_CHECK(aclDestroyTensor(acl_src));
+                ACL_CHECK(aclDestroyTensor(acl_dst));
+                return;
+            }
+            if (ggml_is_contiguous(dst)) {
+                const size_t src_type_size = ggml_type_size(src->type);
+                if (src->nb[0] == src_type_size) {
+                    // src0 is contigous on first dimension, copy by rows
+                    int64_t rows_num = ggml_nrows(src);
+
+                    aclrtlaunch_ascendc_dup_by_rows_fp16(
+                        rows_num, ctx.stream(), src->data, dst->data,
+                        ((ggml_tensor*)src->extra)->ne,
+                        ((ggml_tensor*)src->extra)->nb,
+                        ((ggml_tensor*)dst->extra)->ne,
+                        ((ggml_tensor*)dst->extra)->nb);
+                    return;
+                }
+                GGML_ABORT("fatal error");
+            }
+            GGML_ABORT("fatal error");
+        }
+        if (dst->type == GGML_TYPE_F32) {
+            if (ggml_are_same_shape(src, dst)) {
+                cann_copy(ctx, acl_src, acl_dst);
+                ACL_CHECK(aclDestroyTensor(acl_src));
+                ACL_CHECK(aclDestroyTensor(acl_dst));
+                return;
+            }
+            if (ggml_is_contiguous(dst)) {
+                const size_t src_type_size = ggml_type_size(src->type);
+                if (src->nb[0] == src_type_size) {
+                    // src0 is contigous on first dimension, copy by rows
+                    int64_t rows_num = ggml_nrows(src);
+                    aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
+                        rows_num, ctx.stream(), src->data, dst->data,
+                        ((ggml_tensor*)src->extra)->ne,
+                        ((ggml_tensor*)src->extra)->nb,
+                        ((ggml_tensor*)dst->extra)->ne,
+                        ((ggml_tensor*)dst->extra)->nb);
+                    return;
+                }
+                GGML_ABORT("fatal error");
+            }
+            GGML_ABORT("fatal error");
+        }
+        // TODO
+        GGML_ABORT("fatal error");
+    } else if (src->type == GGML_TYPE_F32) {
+        // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
+        //          && nb0 == type_size)
+        if (dst->type == GGML_TYPE_Q8_0) {
+            aclrtlaunch_ascendc_quantize_f32_q8_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
+        if (dst->type == GGML_TYPE_F32) {
+            if (ggml_are_same_shape(src, dst)) {
+                cann_copy(ctx, acl_src, acl_dst);
+                ACL_CHECK(aclDestroyTensor(acl_src));
+                ACL_CHECK(aclDestroyTensor(acl_dst));
+                return;
+            }
+            if (ggml_is_contiguous(dst)) {
+                const size_t src_type_size = ggml_type_size(src->type);
+                if (src->nb[0] == src_type_size) {
+                    // src0 is contigous on first dimension, copy by rows
+                    int64_t rows_num = ggml_nrows(src);
+                    aclrtlaunch_ascendc_dup_by_rows_fp32(
+                        rows_num, ctx.stream(), src->data, dst->data,
+                        ((ggml_tensor*)src->extra)->ne,
+                        ((ggml_tensor*)src->extra)->nb,
+                        ((ggml_tensor*)dst->extra)->ne,
+                        ((ggml_tensor*)dst->extra)->nb);
+                    return;
+                }
+                GGML_ABORT("fatal error");
+            } else {
+                // TODO: dst not contiguous
+                GGML_ABORT("fatal error");
+            }
+        }
+        if (dst->type == GGML_TYPE_F16) {
+            if (ggml_are_same_shape(src, dst)) {
+                cann_copy(ctx, acl_src, acl_dst);
+                ACL_CHECK(aclDestroyTensor(acl_src));
+                ACL_CHECK(aclDestroyTensor(acl_dst));
+                return;
+            }
+            if (ggml_is_contiguous(dst)) {
+                const size_t src_type_size = ggml_type_size(src->type);
+                if (src->nb[0] == src_type_size) {
+                    // src0 is contigous on first dimension, copy by rows
+                    int64_t rows_num = ggml_nrows(src);
+                    aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
+                        rows_num, ctx.stream(), src->data, dst->data,
+                        ((ggml_tensor*)src->extra)->ne,
+                        ((ggml_tensor*)src->extra)->nb,
+                        ((ggml_tensor*)dst->extra)->ne,
+                        ((ggml_tensor*)dst->extra)->nb);
+                    return;
+                }
+                GGML_ABORT("fatal error");
+            }
+        }
+        // TODO
+        GGML_ABORT("fatal error");
+    } else {
+        if (ggml_are_same_shape(src, dst)) {
+            cann_copy(ctx, acl_src, acl_dst);
+            ACL_CHECK(aclDestroyTensor(acl_src));
+            ACL_CHECK(aclDestroyTensor(acl_dst));
+            return;
+        }
+        GGML_ABORT("fatal error");
+    }
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x,
+                                         const aclTensor* gamma, double epsilon,
+                                         const aclTensor* yOut,
+                                         const aclTensor* rstdOout,
+                                         uint64_t* workspaceSize,
+                                         aclOpExecutor** executor);
+aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
+                         aclOpExecutor* executor, aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
+/**
+ * @brief Creates an ACL tensor initialized with zeros using a provided buffer.
+ *
+ * This function initializes a tensor with zeros using the specified buffer and
+ * tensor parameters.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param buffer The buffer to be used for the tensor data.
+ * @param n_bytes The size of the buffer in bytes.
+ * @param ne An array specifying the extents (sizes) of each dimension of the
+ * tensor.
+ * @param dims The number of dimensions of the tensor.
+ * @param type The data type of the tensor.
+ * @param type_size The size of each element in the tensor data type.
+ * @return An ACL tensor initialized with zeros.
+ */
+static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
+                             size_t n_bytes, int64_t* ne, int64_t dims,
+                             aclDataType type, size_t type_size) {
+    size_t nb[GGML_MAX_DIMS];
+    nb[0] = type_size;
+    for (int i = 1; i < dims; i++) {
+        nb[i] = nb[i - 1] * ne[i - 1];
+    }
+
+    ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream()));
+    aclTensor* zero =
+        ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
+    return zero;
+}
+
+/**
+ * @brief Creates an ACL tensor initialized with value using a provided buffer.
+ *
+ * This function initializes a tensor with value using the specified buffer and
+ * tensor parameters.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param buffer The buffer to be used for the tensor data.
+ * @param n_bytes The size of the buffer in bytes.
+ * @param ne An array specifying the extents (sizes) of each dimension of the
+ * tensor.
+ * @param dims The number of dimensions of the tensor.
+ * @param type The data type of the tensor.
+ * @param type_size The size of each element in the tensor data type.
+ * @param value The value to be used for initializing the tensor (default
+ * is 1.0).
+ * @return An ACL tensor initialized with value.
+ */
+static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
+                               size_t n_bytes, int64_t* ne, int64_t dims,
+                               aclDataType type, size_t type_size,
+                               float value = 1.0f) {
+    aclTensor* acl_tensor =
+        aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
+    float alpha_host = 1.0f;
+    aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
+    aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha,
+                                               &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(
+        aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    return acl_tensor;
+}
+
+void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    GGML_ASSERT(eps > 0.0f);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
+    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
+
+    aclTensor* acl_gamma = aclnn_values(
+        ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
+        ggml_cann_type_mapping(src->type), ggml_element_size(src));
+
+    size_t zero_tensor_n_bytes =
+        src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
+    ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
+    aclTensor* acl_rstd =
+        aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
+                   src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
+                   ggml_element_size(src));
+
+    ACL_CHECK(aclnnRmsNormGetWorkspaceSize(
+        acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(acl_gamma));
+    ACL_CHECK(aclDestroyTensor(acl_rstd));
+}
+
+// TODO: performace is low.
+void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+                         float value) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    const int n_past = ((int32_t*)dst->op_params)[0];
+
+    size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
+                                src->ne[3] * ggml_element_size(src);
+    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
+
+    aclTensor* mask_tensor =
+        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
+                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
+                     ggml_element_size(src), value);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
+                                               &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
+                                        &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    aclScalar* alpha = nullptr;
+    float alphaValue = 1.0f;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha,
+                                              &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(
+        aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(alpha));
+    ACL_CHECK(aclDestroyTensor(mask_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Casts the data type of a source tensor to a destination tensor.
+ *
+ * This function casts the data type of the source tensor `acl_src` to the
+ * specified data type `cast_data_type` and stores the result in the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose data type will be casted.
+ * @param acl_dst The destination tensor where the casted result will be stored.
+ * @param cast_data_type The target data type to which the source tensor will be
+ * casted.
+ */
+static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst, aclDataType cast_data_type) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
+                                        &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Permutes the dimensions of a tensor according to a specified order.
+ *
+ * This function permutes the dimensions of the source tensor `acl_src`
+ * according to the order specified in the `new_dim` array and stores the result
+ * in the destination tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose dimensions will be permuted.
+ * @param acl_dst The destination tensor where the permuted result will be
+ * stored.
+ * @param new_dim An array specifying the new order of dimensions for the
+ * tensor.
+ * @param dims The number of dimensions in the tensor.
+ */
+static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                          aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
+    aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
+                                           &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(acl_dims));
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
+                                        const aclIntArray* kernelSize,
+                                        const aclIntArray* dilation,
+                                        const aclIntArray* padding,
+                                        const aclIntArray* stride,
+                                        aclTensor* out, uint64_t* workspaceSize,
+                                        aclOpExecutor** executor);
+aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
+                        aclOpExecutor* executor, aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
+                                             ggml_tensor* dst,
+                                             ggml_tensor* src1,
+                                             aclTensor* tmp_cast_tensor,
+                                             aclTensor* tmp_im2col_tensor) {
+    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+static void ggml_cann_im2col_1d_post_process(
+    ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
+    aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
+    const std::vector<int64_t>& im2col_op_params) {
+    // get params
+    const int64_t KH = im2col_op_params[0];
+    const int64_t KW = im2col_op_params[1];
+    const int64_t IW = im2col_op_params[2];
+    const int64_t IC = im2col_op_params[3];
+    const int64_t N = im2col_op_params[4];
+    const int64_t OH = im2col_op_params[5];
+    const int64_t OW = im2col_op_params[6];
+    const int64_t s0 = im2col_op_params[7];
+    const int64_t p0 = im2col_op_params[8];
+    const int64_t d0 = im2col_op_params[9];
+    const int64_t n_bytes_factor = im2col_op_params[10];
+
+    // Permute: [N, IC * KH * KW, OW * OH] ->
+    // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+    aclTensor* tmp_permute_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+    tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+    void* tmp_permute_buffer = tmp_permute_allocator.get();
+
+    int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
+    size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+    tmp_permute_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+    }
+
+    tmp_permute_tensor = ggml_cann_create_tensor(
+        tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
+                      3);
+    }
+
+    // number of times the kernel moves in W dimension
+    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+    size_t offset;
+    void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+    // memory copy with offset to restore 1D im2col from 2d
+    if (IC > 1) {
+        offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+        size_t size_cpy = KH * KW * ggml_type_size(dst->type);
+
+        for (int c = 0; c < IC; c++) {
+            cur_permute_buffer = (char*)tmp_permute_buffer + offset +
+                                 KH * KW * c * ggml_type_size(dst->type);
+            cur_dst_buffer = (char*)dst->data +
+                             c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+            for (int i = 0; i < n_step_w; i++) {
+                ACL_CHECK(aclrtMemcpyAsync(
+                    cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                cur_dst_buffer =
+                    (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+                cur_permute_buffer = (char*)cur_permute_buffer +
+                                     KH * KW * IC * ggml_type_size(dst->type);
+            }
+        }
+    } else {
+        offset = KH * KW * n_step_w *
+                 ggml_type_size(dst->type);  // equal to ggml_nbytes(dst)
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
+                                   (char*)tmp_permute_buffer + offset, offset,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+}
+
+void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];  // kernel
+    ggml_tensor* src1 = dst->src[1];  // input
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
+    // im2col and do post-processing to restore it to 1D.
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
+
+    const int64_t N = ne13;
+    const int64_t IC = ne12;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+    const int64_t IW = ne10;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    // memory allocated increased to 3x when is_2D == false
+    const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
+    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+    int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
+    size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
+
+    tmp_im2col_nb[0] = ggml_type_size(src1->type);
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
+    }
+
+    // Calculate im2col.
+    // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
+    // dst.elemcount.
+    ggml_cann_pool_alloc im2col_allocator(
+        ctx.pool(),
+        ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
+    void* tmp_im2col_buffer = im2col_allocator.get();
+
+    aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
+        tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
+        ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    std::vector<int64_t> kernel_dims = {KH, KW};
+    std::vector<int64_t> dilation_size = {d1, d0};
+    std::vector<int64_t> padding_dims = {p1, p0};
+    std::vector<int64_t> stride_dims = {s1, s0};
+    auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+    auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
+    auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
+    auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
+                                          paddings, strides, tmp_im2col_tensor,
+                                          &workspaceSize, &executor));
+
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    if (workspaceSize > 0) {
+        workspace_allocator.alloc(workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    // Cast if dst is f16.
+    aclTensor* tmp_cast_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+    void* tmp_cast_buffer = nullptr;
+    if (src1->type != dst->type) {
+        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+        tmp_cast_buffer = tmp_cast_allocator.get();
+        size_t temp_cast_nb[GGML_MAX_DIMS - 1];
+        temp_cast_nb[0] = ggml_type_size(dst->type);
+        for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+            temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
+        }
+
+        tmp_cast_tensor = ggml_cann_create_tensor(
+            tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
+            ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
+            GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+        aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
+                   ggml_cann_type_mapping(dst->type));
+    }
+
+    // post-processing
+    if (is_2D) {
+        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor);
+    } else {
+        std::vector<int64_t> im2col_op_params = {
+            KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
+        ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor, im2col_op_params);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_src1));
+    ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
+    ACL_CHECK(aclDestroyIntArray(kernel_size));
+    ACL_CHECK(aclDestroyIntArray(dilations));
+    ACL_CHECK(aclDestroyIntArray(paddings));
+    ACL_CHECK(aclDestroyIntArray(strides));
+}
+
+/**
+ * @brief Applies element-wise exponential function to the elements of a tensor.
+ *
+ * This function computes the exponential of each element in the source tensor
+ * `acl_src` and stores the result back into the same tensor.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_src }_i=e^{acl\_src_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The tensor on which the exponential function will be applied.
+ */
+static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(
+        aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Multiplies elements of a tensor by a scalar value, optionally
+ * in-place.
+ *
+ * This function multiplies each element of the source tensor `acl_src` by the
+ * scalar `scale` and stores the result in the destination tensor `acl_dst`. If
+ * `inplace` is true, `acl_dst` will not be used and the operation is performed
+ *  in-place on `acl_src`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst }_i=\text {acl_src }_i \times \text {scale}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be multiplied.
+ * @param scale The scalar value by which each element of `acl_src` will be
+ * multiplied.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       float scale, aclTensor* acl_dst, bool inplace) {
+    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    if (inplace) {
+        ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale,
+                                                   &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor,
+                                   ctx.stream()));
+    } else {
+        ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
+                                            &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(
+            aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    }
+
+    ACL_CHECK(aclDestroyScalar(acl_scale));
+}
+
+/**
+ * @brief Performs an in-place element-wise multiplication of two tensors.
+ *
+ * This function performs an element-wise multiplication of the tensors
+ * `acl_src` and `acl_other` and stores the result in `acl_src`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_src }_i=\text {acl_src }_i \times \text {acl_other }_i
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor where the multiplication result will be
+ * stored.
+ * @param acl_other The tensor whose elements will be multiplied with `acl_src`.
+ */
+static void aclnn_inplace_mul(ggml_backend_cann_context& ctx,
+                              aclTensor* acl_src, aclTensor* acl_other) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other,
+                                              &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs element-wise multiplication of two tensors and stores the
+ * result in a destination tensor.
+ *
+ * This function performs element-wise multiplication of the tensors `acl_src`
+ * and `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The first tensor for element-wise multiplication.
+ * @param acl_other The second tensor for element-wise multiplication.
+ * @param acl_dst The destination tensor where the result will be stored.
+ */
+static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_other, aclTensor* acl_dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst,
+                                       &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Applies element-wise cosine function to the elements of a tensor.
+ *
+ * This function computes the cosine of each element in the source tensor
+ * `acl_src` and stores the result in the destination tensor `acl_dst`. The
+ * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
+ * }_i\right) \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the cosine function will be
+ * applied.
+ * @param acl_dst The destination tensor where the cosine results will be
+ * stored.
+ */
+static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(
+        aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Applies element-wise sine function to the elements of a tensor.
+ *
+ * This function computes the sine of each element in the source tensor
+ `acl_src`
+ * and stores the result in the destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
+ * \f]
+
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the sine function will be applied.
+ * @param acl_dst The destination tensor where the sine results will be stored.
+ */
+static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(
+        aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs element-wise division of tensor1 by tensor2 , multiplies the
+ result by the scalar value and adds it to self .
+ *
+ * Performs element-wise division of tensor1 by tensor2,
+ * multiplies the result by the scalar value and adds it to self .
+ * The operation is defined as:
+ * \f[
+ *     \text{out}_i = \text{selft}_i + \text{value} \times
+ \frac{\text{tensor1}_i}{\text{tensor2}_i}
+ * \f]
+
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_self The source tensor on which the addcdiv function will be
+ applied.
+ * @param tensor1 Numerator tensor.
+ * @param tensor2 Denominator tensor.
+ * @param value The value to be used for coefficient.
+ */
+static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
+                                  aclTensor* acl_self, aclTensor* tensor1,
+                                  aclTensor* tensor2, float value) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
+        acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+}
+
+/**
+ * @brief Matrix division, optionally in-place.
+ *
+ * This function division each element of the source tensor `acl_src` by the
+ * tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
+ * If `inplace` is true, `acl_dst` will not be used and the operation is
+ * performed in-place on `acl_src`. The operation is defined as: \f[
+ *     \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor..
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
+ */
+static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                             aclTensor* acl_other, aclTensor* acl_dst,
+                             bool inplace) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    if (inplace) {
+        ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
+                                                  &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+    } else {
+        ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
+                                           &workspaceSize, &executor));
+        if (workspaceSize > 0) {
+            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+            workspaceAddr = workspace_allocator.get();
+        }
+
+        ACL_CHECK(
+            aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
+    }
+}
+
+void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst) {
+    const ggml_tensor* src = dst->src[0];
+
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+    int half = dim / 2;
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+
+    // arange: [0, ..., half)
+    float start = 0;
+    float stop = half;
+    float step = 1;
+    int64_t n_elements_arange = half;
+    int64_t tmp_arange_ne[] = {half};
+    size_t tmp_arange_nb[] = {sizeof(dst->type)};
+
+    ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
+    void* tmp_arange_buffer = arange_allocator.get();
+    aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
+        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+    aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
+
+    // freq
+    float freq_param = -logf(max_period) / half;
+    bool inplace = true;
+    aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
+    aclnn_exp(ctx, tmp_arange_tensor);
+
+    // permute: src [0,1,2,3]->[0,1,3,2]
+    int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
+    size_t tmp_permute_nb[GGML_MAX_DIMS];
+    tmp_permute_nb[0] = ggml_type_size(src->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+    }
+
+    ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
+    void* tmp_permute_buffer = permute_allocator.get();
+    aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor(
+        tmp_permute_buffer, ggml_cann_type_mapping(src->type),
+        ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_ND);
+    int64_t permute_dim[] = {0, 1, 3, 2};
+    int64_t num_dims = 4;
+    aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims);
+
+    // timestep * freq
+    int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
+                            src->ne[3]};
+    size_t tmp_mul_nb[GGML_MAX_DIMS];
+    tmp_mul_nb[0] = ggml_type_size(src->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
+    }
+
+    int mul_nelements =
+        src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
+
+    ggml_cann_pool_alloc mul_allocator(
+        ctx.pool(), mul_nelements * ggml_type_size(src->type));
+    void* tmp_mul_buffer = mul_allocator.get();
+    aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
+        tmp_mul_buffer, ggml_cann_type_mapping(src->type),
+        ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+    aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor);
+
+    // cos
+    ggml_cann_pool_alloc cos_allocator(
+        ctx.pool(), mul_nelements * ggml_type_size(src->type));
+    void* tmp_cos_buffer = cos_allocator.get();
+    aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
+        tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+
+    aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
+
+    // sin
+    ggml_cann_pool_alloc sin_allocator(
+        ctx.pool(), mul_nelements * ggml_type_size(src->type));
+    void* tmp_sin_buffer = sin_allocator.get();
+    aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
+        tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+
+    aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
+
+    // concat
+    int64_t concat_dim = 3;
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
+    aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
+    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+
+    // release
+    // segmentation fault when delete both tensorList and his elements.
+    ACL_CHECK(aclDestroyTensorList(tensorList));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr));
+    ACL_CHECK(aclDestroyTensor(tmp_mul_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Fills a tensor with a scalar value.
+ *
+ * This function fills the destination tensor `acl_dst` with the scalar value
+ * `scalar`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param scalar The scalar value used to fill the tensor.
+ * @param acl_dst The destination tensor to be filled with the scalar value.
+ */
+static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
+                              aclTensor* acl_dst) {
+    auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize(
+        acl_dst, acl_scalar, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+    ACL_CHECK(aclDestroyScalar(acl_scalar));
+}
+
+/**
+ * @brief Raises each element of a tensor to the power of the corresponding
+ * element in another tensor.
+ *
+ * This function computes the element-wise power of the destination tensor
+ * `acl_dst` raised to the power of the exponent tensor `acl_exp`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_dst The destination tensor, which also serves as the base tensor.
+ * @param acl_exp The exponent tensor, each element of which is used to raise
+ * the corresponding element in the destination tensor.
+ */
+static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
+                                    aclTensor* acl_dst, aclTensor* acl_exp) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize(
+        acl_dst, acl_exp, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize,
+                                          executor, ctx.stream()));
+}
+
+/**
+ * @brief   Applies the Alibi (Attention with Linear Biases) mechanism to the
+ * @details This function implements the Alibi mechanism, which introduces
+ *          learnable biases into the attention scores to simulate relative
+ *          position encoding without the need for explicit positional
+ *          embeddings.
+ *
+ * @param ctx          The backend CANN context for executing operations.
+ * @param acl_src      The source tensor representing the query or key.
+ * @param acl_position The position tensor containing relative positions.
+ * @param acl_dst      The destination tensor where the result will be stored.
+ * @param n_head       The number of attention heads.
+ * @param src_ne       The dimensions of the source tensor.
+ * @param src_nb0      The byte size of the first dimension of the source
+ tensor.
+ * @param max_bias     The maximum bias value used in the Alibi mechanism.
+ * @param dst          The destination tensor object for additional metadata.
+ *
+ * The function performs the following steps:
+ * 1. Calculates the logarithm floor of the number of heads to determine the
+      base for bias calculation.
+ * 2. Initializes arrays with arithmetic sequences and fills them with bias
+      values.
+ * 3. Computes the bias tensor based on the calculated biases and arithmetic
+      sequences.
+ * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
+ * 5. Multiplies the position tensor by the bias tensor.
+ * 6. Adds the result of the multiplication to the source tensor to produce the
+      final output.
+ */
+static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                        aclTensor* acl_position, aclTensor* acl_dst,
+                        const int n_head, int64_t* src_ne, const size_t src_nb0,
+                        float max_bias, ggml_tensor* dst) {
+    const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
+    GGML_ASSERT(src_nb0 == sizeof(float));
+    GGML_ASSERT(n_head == src_ne[2]);
+
+    const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
+
+    float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // init arange
+    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
+                                          ne2_ne3 * ggml_type_size(dst->type));
+    void* tmp_arange_buffer = arange_allocator.get();
+
+    // arange1: [1, ..., n_heads_log2_floor+1)
+    float start = 1;
+    float stop = n_heads_log2_floor + 1;
+    float step = 1;
+    int64_t n_elements_arange = n_heads_log2_floor;
+
+    int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
+    size_t tmp_arange1_nb[] = {sizeof(dst->type)};
+    aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
+        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+    aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
+
+    aclTensor* tmp_arange2_tensor = nullptr;
+    if (n_heads_log2_floor < ne2_ne3) {
+        // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
+        start = 1;
+        stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
+        step = 2;
+        n_elements_arange = ne2_ne3 - n_heads_log2_floor;
+        int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+        size_t tmp_arange2_nb[] = {sizeof(dst->type)};
+
+        aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
+            (char*)tmp_arange_buffer +
+                n_heads_log2_floor * ggml_type_size(dst->type),
+            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+            tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+        aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
+                     n_elements_arange);
+    }
+
+    // init mk_base
+    ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
+                                           ne2_ne3 * ggml_type_size(dst->type));
+    void* tmp_mk_base_buffer = mk_base_allocator.get();
+    int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
+    size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
+    aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
+        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+
+    aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
+
+    aclTensor* tmp_mk_base2_tensor = nullptr;
+    if (n_heads_log2_floor < ne2_ne3) {
+        int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
+        size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
+        aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
+            (char*)tmp_mk_base_buffer +
+                n_heads_log2_floor * ggml_type_size(dst->type),
+            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+            tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+        aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
+    }
+
+    // init mk
+    int64_t tmp_mk_base_ne[] = {ne2_ne3};
+    size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
+    aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
+        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+    aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
+        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
+        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+    aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
+
+    // reshape mk
+    int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
+    size_t tmp_mk_nb[GGML_MAX_DIMS];
+    tmp_mk_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
+    }
+    aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
+        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+
+    // acl_position * mk
+    int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
+    size_t tmp_output_nb[GGML_MAX_DIMS];
+    tmp_output_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
+    }
+    ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
+    void* tmp_output_buffer = output_allocator.get();
+    aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
+        tmp_output_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
+        ACL_FORMAT_ND);
+    aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
+
+    // add
+    aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
+
+    ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_arange_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
+}
+
+void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_cann_dup(ctx, dst);
+}
+
+/**
+ * @brief Performs element-wise addition of two tensors in place.
+ *
+ * This function adds the source tensor `acl_src` to the destination tensor
+ * `acl_dst` element-wise and stores the result in the destination tensor
+ * `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be added.
+ * @param acl_dst The destination tensor which will hold the result of the
+ * addition.
+ */
+static void aclnn_inplace_add(ggml_backend_cann_context& ctx,
+                              aclTensor* acl_src, aclTensor* acl_dst) {
+    aclScalar* alpha = nullptr;
+    float alphaValue = 1.0f;
+    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
+                                              &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyScalar(alpha));
+}
+
+/**
+ * @brief Applies the softmax function to a tensor along a specified dimension.
+ *
+ * This function computes the softmax of the source tensor `acl_src` along the
+ * specified dimension `dim` and stores the result in the destination tensor
+ * `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor on which the softmax function will be
+ * applied.
+ * @param dim The dimension along which the softmax function will be computed.
+ * @param acl_dst The destination tensor where the softmax results will be
+ * stored.
+ */
+static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                          int64_t dim, aclTensor* acl_dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst,
+                                           &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    aclrtStream stream = ctx.stream();
+    ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
+}
+
+void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];  // mask
+
+    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
+
+    // input mul scale
+    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+
+    size_t n_bytes = ggml_nbytes(src0);
+    ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
+    void* input_mul_scale_buffer = mul_scale_allocator.get();
+    aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
+        input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
+        src0->nb, GGML_MAX_DIMS);
+
+    bool inplace = false;
+    aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
+
+    // mask
+    aclTensor* acl_src1_fp32_tensor = nullptr;
+    aclTensor* tmp_mask_tensor = nullptr;
+    ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
+    if (src1) {
+        const bool use_f16 = src1->type == GGML_TYPE_F16;
+        if (use_f16) {
+            // cast to fp32
+            size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
+            size_t src1_fp32_nb[GGML_MAX_DIMS];
+            src1_fp32_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
+            }
+            src1_fp32_allocator.alloc(n_bytes);
+            void* src1_fp32_buffer = src1_fp32_allocator.get();
+            acl_src1_fp32_tensor = ggml_cann_create_tensor(
+                src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
+                src1_fp32_nb, GGML_MAX_DIMS);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+            aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
+
+            ACL_CHECK(aclDestroyTensor(acl_src1));
+        } else {
+            acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
+        }
+
+        // broadcast the mask across rows, only use ne11 of ne01 in mask
+        if (src1->ne[1] != src0->ne[1]) {
+            // mask shape: [1,1,ne11,ne10]
+            int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
+            size_t tmp_mask_nb[GGML_MAX_DIMS];
+            tmp_mask_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
+            }
+            tmp_mask_tensor = ggml_cann_create_tensor(
+                src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
+                GGML_MAX_DIMS, ACL_FORMAT_ND);
+        }
+
+        // alibi
+        const int n_head = src0->ne[2];
+        const size_t src_nb0 = src0->nb[0];
+
+        n_bytes = ggml_nbytes(dst);
+        ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
+        void* output_buffer = output_allocator.get();
+        aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
+            output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
+            dst->nb, GGML_MAX_DIMS);
+        if (max_bias <= 0.0f) {
+            // slope = 1.0
+            if (tmp_mask_tensor) {
+                aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
+                          alibi_output_tensor);
+            } else {
+                aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
+                          alibi_output_tensor);
+            }
+        } else {
+            // slope != 1.0
+            if (tmp_mask_tensor) {
+                aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
+                            alibi_output_tensor, n_head, src0->ne, src_nb0,
+                            max_bias, dst);
+            } else {
+                aclnn_alibi(ctx, acl_input_mul_scale_tensor,
+                            acl_src1_fp32_tensor, alibi_output_tensor, n_head,
+                            src0->ne, src_nb0, max_bias, dst);
+            }
+        }
+
+        // softmax
+        aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
+        ACL_CHECK(aclDestroyTensor(alibi_output_tensor));
+    } else {
+        aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
+    }
+
+    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyScalar(acl_scale));
+    ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
+}
+
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+
+    ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
+    ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
+    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
+    src0->extra = src0_extra_allocator.get();
+    src1->extra = src1_extra_allocator.get();
+    dst->extra = dst_extra_allocator.get();
+    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
+                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
+                               ctx.stream()));
+    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
+                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
+                               ctx.stream()));
+    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
+                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
+                               ctx.stream()));
+
+    switch (src0->type) {
+        case GGML_TYPE_F32: {
+#ifdef ASCEND_310P
+            // Special operation for get_row_f32 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
+                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
+            aclrtlaunch_ascendc_get_row_f32(
+                24, ctx.stream(), src0->data, src1->data, dst->data,
+                ((ggml_tensor*)src0->extra)->ne,
+                ((ggml_tensor*)src0->extra)->nb,
+                ((ggml_tensor*)src1->extra)->ne,
+                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
+            break;
+        }
+        case GGML_TYPE_F16: {
+#ifdef ASCEND_310P
+            // Special operation for get_row_f16 kernel of 310P: clear the
+            // content of dest data buffer when row is not aligned to 32 bytes
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len =
+                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
+                    ggml_type_size(
+                        GGML_TYPE_F32);  // out is also f32, even input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
+            aclrtlaunch_ascendc_get_row_f16(
+                24, ctx.stream(), src0->data, src1->data, dst->data,
+                ((ggml_tensor*)src0->extra)->ne,
+                ((ggml_tensor*)src0->extra)->nb,
+                ((ggml_tensor*)src1->extra)->ne,
+                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
+            break;
+        }
+        case GGML_TYPE_Q4_0:
+            aclrtlaunch_ascendc_get_row_q4_0(
+                24, ctx.stream(), src0->data, src1->data, dst->data,
+                ((ggml_tensor*)src0->extra)->ne,
+                ((ggml_tensor*)src1->extra)->ne,
+                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
+            break;
+        case GGML_TYPE_Q8_0:
+            aclrtlaunch_ascendc_get_row_q8_0(
+                24, ctx.stream(), src0->data, src1->data, dst->data,
+                ((ggml_tensor*)src0->extra)->ne,
+                ((ggml_tensor*)src1->extra)->ne,
+                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+/**
+ * @brief Repeats elements of a tensor along a specified dimension.
+ *
+ * This function repeats each element of the source tensor `acl_src` a specified
+ * number of times (`repeats`) along the specified dimension `dim` and stores
+ * the result in the destination tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be repeated.
+ * @param acl_dst The destination tensor where the repeated elements will be
+ * stored.
+ * @param dim The dimension along which the elements will be repeated.
+ * @param repeats The number of times each element will be repeated.
+ * @param output_size The size of the output tensor.
+ */
+static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
+                                    aclTensor* acl_src, aclTensor* acl_dst,
+                                    int64_t dim, int64_t repeats,
+                                    int64_t output_size) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(
+        acl_src, repeats, dim, output_size, acl_dst, &workspaceSize,
+        &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize,
+                                              executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                          aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
+                                // fp32, atlas a2 will transpose it to HFLOAT32.
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                          cube_math_type, &workspaceSize,
+                                          &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text {acl_dst}=\text {acl_input@acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
+                             aclTensor* acl_input, aclTensor* acl_weight,
+                             aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication with floating-point precision on
+ * tensors using the CANN backend.
+ *
+ * This function performs matrix multiplication of the input tensor and the
+ * weight tensor, handling broadcasting and transposing as needed, and stores
+ * the result in the destination tensor `dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
+                                 ggml_tensor* dst) {
+    ggml_tensor* weight = dst->src[0];  // weight
+    ggml_tensor* input = dst->src[1];   // input
+
+    // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto
+    // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
+    BCAST_MUL_MAT_SHAPE(input, weight, dst);
+
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
+
+    aclTensor* acl_input_tensor =
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
+                              bcast_weight_ne[2], bcast_weight_ne[3],
+                              bcast_weight_ne[4], bcast_weight_ne[5]};
+    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
+                             bcast_weight_nb[2], bcast_weight_nb[3],
+                             bcast_weight_nb[4], bcast_weight_nb[5]};
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+    }
+
+    ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Performs matrix multiplication with quantized weights and
+ * floating-point inputs using the CANN backend.
+ *
+ * This function performs matrix multiplication of the input tensor `src1` and
+ * the weight tensor `src0`, handling broadcasting, transposing, and
+ * quantization as needed, and stores the result in the destination tensor
+ * `dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
+    ggml_tensor* src0 = dst->src[0];  // weight
+    ggml_tensor* src1 = dst->src[1];  // input
+
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    } else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    } else {
+        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+    }
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
+    size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
+
+    // scale stored at the end of weight. Also need transpose.
+    size_t scale_elem_size = sizeof(uint16_t);
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
+                         scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+    char* scale_offset = (char*)src0->data + weight_size;
+
+    // input
+    size_t input_elem_size = sizeof(uint16_t);
+    int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
+    ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
+    if (src1->type != GGML_TYPE_F16) {
+        aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
+        input_buffer =
+            input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
+
+        int64_t* input_cast_ne = src1->ne;
+        size_t input_cast_nb[GGML_MAX_DIMS];
+        input_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
+        }
+
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
+            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
+            input_cast_nb, GGML_MAX_DIMS);
+        aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
+
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
+    }
+
+    // output
+    size_t output_elem_size = sizeof(uint16_t);
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer =
+        output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
+
+    // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
+    uint64_t workspaceSize = 0;
+    void* workspaceAddr = nullptr;
+    for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
+        for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
+            int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
+            int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
+
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
+
+            aclTensor* acl_input_tensor = ggml_cann_create_tensor(
+                (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
+                input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {
+                max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
+                src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
+            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
+                (char*)src0->data + batch0 * weight_stride,
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
+                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                scale_ne_offset);
+            aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                output_ne_offset);
+
+            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
+                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
+                &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
+            }
+            ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+            ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset +=
+                    weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
+                                   ? src0->ne[1] - (max_elem_size * split)
+                                   : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset +=
+                    output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                    weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride, ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
+                    scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
+                    output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+        }
+    }
+
+    // cast out
+    if (dst->type != GGML_TYPE_F16) {
+        int64_t* output_cast_ne = dst->ne;
+        size_t output_cast_nb[GGML_MAX_DIMS];
+        output_cast_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
+        }
+
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
+            output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
+                   ggml_cann_type_mapping(dst->type));
+
+        ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+    }
+}
+
+void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mat_mul_fp(ctx, dst);
+            break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q8_0:
+            ggml_cann_mul_mat_quant(ctx, dst, type);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+/**
+ * @brief Rolls the elements of a tensor along a specified dimension.
+ *
+ * This function rolls the elements of the source tensor `acl_src` by the
+ * specified shifts `shifts` along the specified dimensions `dims`, and stores
+ * the result in the destination tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be rolled.
+ * @param acl_dst The destination tensor where the rolled elements will be
+ * stored.
+ * @param shifts An array specifying the number of positions by which elements
+ * are shifted.
+ * @param dims An array specifying the dimensions along which elements are
+ * shifted.
+ */
+static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
+    aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
+    aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst,
+                                        &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(acl_shifts));
+    ACL_CHECK(aclDestroyIntArray(acl_dims));
+}
+
+/**
+ * @brief Fills specified positions of a tensor with a scalar value.
+ *
+ * This function fills the positions in the source tensor `acl_src` specified by
+ * `index` along the dimension `dim` with the scalar value `value`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor where the positions will be filled.
+ * @param dim The dimension along which the positions are specified.
+ * @param index An array specifying the positions to be filled.
+ * @param index_num The number of positions specified in the index array.
+ * @param value The scalar value used to fill the specified positions.
+ */
+static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
+                                    aclTensor* acl_src, int64_t dim,
+                                    int64_t* index, int64_t index_num,
+                                    float value) {
+    aclIntArray* acl_index = aclCreateIntArray(index, index_num);
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(
+        acl_src, dim, acl_index, acl_value, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize,
+                                          executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(acl_index));
+    ACL_CHECK(aclDestroyScalar(acl_value));
+}
+
+static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+                             aclTensor* acl_cos_repeat_tensor,
+                             aclTensor* acl_sin_repeat_tensor,
+                             float theta_scale, float freq_scale,
+                             float attn_factor, bool is_neox) {
+    // int sin/cos cache, cache has different repeat method depond on
+    // @param.is_neox
+
+    ggml_tensor* src0 = dst->src[0];  // input
+    ggml_tensor* src1 = dst->src[1];  // position
+    ggml_tensor* src2 = dst->src[2];  // freq_factors
+
+    // arange, [0,1,...,ne0/2]
+    int64_t arange_length = src0->ne[0] / 2;
+    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
+                                          arange_length * sizeof(float_t));
+    void* arange_buffer = arange_allocator.get();
+    int64_t arange_ne[] = {arange_length, 1, 1, 1};
+    size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
+                          arange_length * sizeof(float_t)};
+
+    aclTensor* acl_arange_tensor =
+        ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t),
+                                arange_ne, arange_nb, GGML_MAX_DIMS);
+    float start = 0;
+    float step = 1;
+    float stop = src0->ne[0] / 2;
+    float n_elements = src0->ne[0] / 2;
+    aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements);
+
+    // power
+    // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so
+    // use aclnn_pow_tensor_tensor() until fixed. aclScalar* acl_theta_scale =
+    // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
+    // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
+    // acl_power_tensor);
+    ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
+                                               arange_length * sizeof(float_t));
+    void* theta_scale_buffer = theta_scale_allocator.get();
+    aclTensor* acl_theta_scale_tensor = aclnn_values(
+        ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
+        GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
+    aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
+
+    // freq_scale
+    if (freq_scale != 1) {
+        aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
+    }
+
+    // freq_factors
+    if (src2) {
+        aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
+            src2->data, ggml_cann_type_mapping(src2->type),
+            ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
+        aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
+                         nullptr, true);
+        ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
+    }
+
+    // position
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    int64_t position_length = src1->ne[0];
+    int64_t position_ne[] = {1, position_length, 1, 1};
+    size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t),
+                            sizeof(int32_t) * position_length,
+                            sizeof(int32_t) * position_length};
+    aclTensor* acl_position_tensor = ggml_cann_create_tensor(
+        src1->data, ggml_cann_type_mapping(src1->type),
+        ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
+
+    // power * position
+    int64_t theta_length = arange_length * position_length;
+    ggml_cann_pool_alloc theta_allocator(ctx.pool(),
+                                         theta_length * sizeof(float_t));
+    void* theta_buffer = theta_allocator.get();
+    int64_t theta_ne[] = {arange_length, position_length, 1, 1};
+    size_t theta_nb[GGML_MAX_DIMS];
+    theta_nb[0] = sizeof(float_t);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
+    }
+    aclTensor* acl_theta_tensor =
+        ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
+                                theta_ne, theta_nb, GGML_MAX_DIMS);
+    aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
+              acl_theta_tensor);
+
+    // permute: [0,1,2,3]->[0,2,1,3]
+    int64_t permute_ne[] = {arange_length, 1, position_length, 1};
+    size_t permute_nb[GGML_MAX_DIMS];
+    permute_nb[0] = sizeof(float_t);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1];
+    }
+    ggml_cann_pool_alloc permute_allocator(ctx.pool(),
+                                           theta_length * sizeof(float_t));
+    void* permute_buffer = permute_allocator.get();
+    aclTensor* acl_permute_tensor = ggml_cann_create_tensor(
+        permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_ND);
+    int64_t permute_dim[] = {0, 2, 1, 3};
+    int64_t num_dims = 4;
+    aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
+                  num_dims);
+
+    // sin/cos
+    ggml_cann_pool_alloc sin_allocator(ctx.pool(),
+                                       theta_length * sizeof(float_t));
+    void* sin_buffer = sin_allocator.get();
+    aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
+        sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_ND);
+    aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor);
+
+    ggml_cann_pool_alloc cos_allocator(ctx.pool(),
+                                       theta_length * sizeof(float_t));
+    void* cos_buffer = cos_allocator.get();
+    aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
+        cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_ND);
+    aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
+
+    // attn_factor
+    if (attn_factor != 1) {
+        aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
+        aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
+    }
+
+    // repeat
+    if (is_neox) {
+        int64_t repeatsArray[] = {1, 1, 1, 2};
+        aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
+        aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
+    } else {
+        int64_t num_repeats = 2;
+        int64_t dim = 3;
+        int64_t output_size = arange_length * num_repeats;
+        aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
+                                num_repeats, output_size);
+        aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
+                                num_repeats, output_size);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_arange_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_position_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_permute_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
+void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // TODO: use ascendc
+    // Only test with LLAMA model.
+    ggml_tensor* src0 = dst->src[0];  // input
+    ggml_tensor* src2 = dst->src[2];  // freq_factors
+
+    // param
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    // const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t*)dst->op_params)[1];
+    const int mode = ((int32_t*)dst->op_params)[2];
+    // const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
+
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+    // TODO: ext_factor != 0
+    GGML_ASSERT(ext_factor == 0);
+
+    const float theta_scale = powf(freq_base, -2.0f / n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
+                             beta_slow, corr_dims);
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+    // init cos/sin cache
+    ggml_cann_pool_alloc sin_allocator(
+        ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
+    ggml_cann_pool_alloc cos_allocator(
+        ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
+    void* sin_buffer = sin_allocator.get();
+    void* cos_buffer = cos_allocator.get();
+
+    int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+    size_t sin_reshape_nb[GGML_MAX_DIMS];
+    sin_reshape_nb[0] = sizeof(float_t);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
+    }
+    aclTensor* acl_sin_reshape_tensor =
+        ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t),
+                                sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+    aclTensor* acl_cos_reshape_tensor =
+        ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
+                                sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+    aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
+                     theta_scale, freq_scale, attn_factor, is_neox);
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+#ifdef ASCEND_310P
+    // Special ROPE operation for 310P
+
+    // roll input
+    void* input_roll_buffer;
+    aclTensor* acl_minus_one_tensor;
+    void* minus_one_scale_buffer = nullptr;
+    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
+    ggml_cann_pool_alloc minus_one_scale_allocator(
+        ctx.pool(), sizeof(float_t) * src0->ne[0]);
+    if (!is_neox) {
+        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
+        input_roll_buffer = roll_allocator.get();
+        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
+                                    src0->ne[2], src0->ne[3]};
+        size_t input_roll_nb[GGML_MAX_DIMS];
+        input_roll_nb[0] = ggml_type_size(src0->type);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
+        }
+        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+            input_roll_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+            GGML_MAX_DIMS);
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
+            src0->data, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
+            GGML_MAX_DIMS);
+
+        int64_t shifts[] = {1};
+        int64_t dims[] = {3};
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+
+        // init [-1, 1, -1, 1, ...]
+        minus_one_scale_buffer = minus_one_scale_allocator.get();
+
+        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+        size_t minus_one_nb[GGML_MAX_DIMS];
+        minus_one_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+        }
+        acl_minus_one_tensor = aclnn_values(
+            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+        int64_t dim = 3;
+        int64_t* index = new int64_t[src0->ne[0]];
+        for (int i = 0; i < src0->ne[0]; i++) {
+            index[i] = i / 2 * 2;
+        }
+        int64_t index_num = src0->ne[0];
+        float value = -1;
+        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
+                                index_num, value);
+    } else {
+        // roll input: [q0,q1,q2,...] ->
+        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
+        input_roll_buffer = roll_allocator.get();
+        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
+            input_roll_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
+        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
+
+        int64_t shifts[] = {src0->ne[0] / 2};
+        int64_t dims[] = {3};
+        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
+
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+        // init [-1, -1, -1, 1, 1，1，...]
+        minus_one_scale_buffer = minus_one_scale_allocator.get();
+        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
+        size_t minus_one_nb[GGML_MAX_DIMS];
+        minus_one_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
+        }
+        acl_minus_one_tensor = aclnn_values(
+            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
+            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
+        // -1 * first half
+        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
+        size_t first_half_nb[GGML_MAX_DIMS];
+        first_half_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
+        }
+        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
+            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
+            first_half_nb, GGML_MAX_DIMS);
+        bool inplace = true;
+        float scale = -1;
+        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
+        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
+    }
+
+    // TODO: n_dims < ne0
+    GGML_ASSERT(n_dims == src0->ne[0]);
+
+    // input * scale
+    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
+                                                  ggml_nbytes(src0));
+    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
+    size_t input_nb[GGML_MAX_DIMS];
+    input_nb[0] = ggml_type_size(src0->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    }
+    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
+        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
+        input_roll_buffer, ggml_cann_type_mapping(src0->type),
+        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+
+    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
+              acl_input_roll_mul_scale_tensor);
+
+    // output
+    void* output_fp32_buffer;
+    if (src0->type == GGML_TYPE_F32) {
+        aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
+        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
+                          acl_sin_reshape_tensor);
+        aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
+        // TODO: ne0 != n_dims in mode2
+    } else if (src0->type == GGML_TYPE_F16) {
+        size_t input_fp32_nb[GGML_MAX_DIMS];
+        input_fp32_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
+        }
+        ggml_cann_pool_alloc fp32_allocator1(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        void* input_fp32_buffer1 = fp32_allocator1.get();
+        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
+            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+        ggml_cann_pool_alloc fp32_allocator2(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        void* input_fp32_buffer2 = fp32_allocator2.get();
+        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
+            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+
+        ggml_cann_pool_alloc fp32_allocator(
+            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
+        output_fp32_buffer = fp32_allocator.get();
+        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
+            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
+            input_fp32_nb, GGML_MAX_DIMS);
+        aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
+        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
+                  input_fp32_tensor2);
+        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
+                  output_fp32_tensor);
+        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
+
+        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
+        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
+        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_src));
+    }
+    return;
+#endif
+
+    // src0 == GGML_TYPE_F16
+    // TODO: optimization this `if` code
+    if (src0->type == GGML_TYPE_F16) {
+        ggml_cann_pool_alloc sin_final_allocator(
+            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+        ggml_cann_pool_alloc cos_final_allocator(
+            ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
+        void* sin_final_buffer = sin_final_allocator.get();
+        void* cos_final_buffer = cos_final_allocator.get();
+
+        int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+        size_t sin_final_nb[GGML_MAX_DIMS];
+        sin_final_nb[0] = ggml_type_size(src0->type);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
+        }
+        aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
+            sin_final_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+            GGML_MAX_DIMS);
+        aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
+            cos_final_buffer, ggml_cann_type_mapping(src0->type),
+            ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
+            GGML_MAX_DIMS);
+
+        aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
+                   ggml_cann_type_mapping(src0->type));
+        aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
+                   ggml_cann_type_mapping(src0->type));
+        ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
+        ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+        acl_sin_reshape_tensor = acl_sin_final_tensor;
+        acl_cos_reshape_tensor = acl_cos_final_tensor;
+    }
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+
+    void* workspaceAddr = nullptr;
+
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
+    }
+
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
new file mode 100644
index 000000000..680129c76
--- /dev/null
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -0,0 +1,592 @@
+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
+/**
+ * @file    acl_tensor
+ * @brief   This file contains related functions of ggml_tensor and acl_tensor.
+ *          Contains conversion from ggml_tensor to acl_tensor, broadcast and other
+ *          functions.
+ * @author  hipudding <huafengchun@gmail.com>
+ * @author  wangshuai09 <391746016@qq.com>
+ * @date    July 15, 2024
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <aclnnop/aclnn_add.h>
+#include <aclnnop/aclnn_arange.h>
+#include <aclnnop/aclnn_argsort.h>
+#include <aclnnop/aclnn_cat.h>
+#include <aclnnop/aclnn_clamp.h>
+#include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_gelu.h>
+#include <aclnnop/aclnn_hardsigmoid.h>
+#include <aclnnop/aclnn_hardswish.h>
+#include <aclnnop/aclnn_leaky_relu.h>
+#include <aclnnop/aclnn_mul.h>
+#include <aclnnop/aclnn_relu.h>
+#include <aclnnop/aclnn_silu.h>
+#include <aclnnop/aclnn_tanh.h>
+#include "acl_tensor.h"
+#include "common.h"
+
+/**
+ * @brief   Repeats a ggml tensor along each dimension to match the dimensions
+ *          of another tensor.
+ *
+ * @details This function repeats the elements of a source ggml tensor along
+ *          each dimension to create a destination tensor with the specified
+ *          dimensions. The operation is performed using the ACL backend and
+ *          executed asynchronously on the device.
+ *
+ * @param   ctx The CANN context used for operations.
+ * @param   dst The ggml tensor representing the destination, which op is
+ *              GGML_OP_REPEAT and specifies the desired dimensions.
+ */
+void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Adds two ggml tensors using the CANN backend.
+ *
+ * @details This function performs an element-wise addition of two tensors. In
+ *          case the tensors do not have the same shape, one or both tensors
+ *          will be broadcasted to match the shape of the other before the
+ *          addition is performed.The formula for the operation is given by:
+ *          \f[
+ *              \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
+ *          \f]
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The ggml tensor representing the destination, result of the
+ *            addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
+ */
+void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
+ *          backend.
+ *
+ * @details This function computes the Leaky ReLU activation for each element of
+ *          the input tensor. The Leaky ReLU function allows a small gradient
+ *          when the unit is not active (i.e., when the input is negative). The
+ *          Leaky ReLU function is defined as:
+ *          \f[
+ *              \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
+ *               src)
+ *          \f]
+ *          `negativeSlope` is in dst->params.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result of the Leaky ReLU
+ *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
+ */
+void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief    Concatenates multiple tensors along a specified dimension using the
+ *           CANN backend.
+ *
+ * @param ctx        The CANN context used for operations.
+ * @param tensorList A pointer to the list of tensors to be concatenated.
+ * @param dst        The destination tensor where the result of the
+ *                   concatenation is stored. dst->op is `GGML_OP_CONCAT`.
+ * @param concat_dim The dimension along which the tensors are concatenated.
+ *
+ * @attention tensorList length should be 2 and the dimension using for concat
+ *            default to 1.
+ */
+void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Generates a sequence of evenly spaced values within a specified
+ *          interval for a ggml tensor using the CANN backend.
+ *
+ * @details This function creates a sequence of numbers over a specified i
+ *          nterval, starting from `start`, ending before `stop`, and
+ *          incrementing by `step`. The sequence is stored in the destination
+ *          tensor `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the generated sequence will be stored.
+ *            `start`, 'stop' and 'step' are in dst->op_params and dst->op is
+ *            `GGML_OP_ARANGE`.
+ */
+void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the square of the elements of a ggml tensor using the CANN
+ *          backend.
+ * @details The function sets the second source tensor of the destination
+ *          tensor `dst` to be equal to the first source tensor. This is
+ *          effectively squaring the elements since the multiplication becomes
+ *          `element * element`.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the squared values will be stored，
+ *            which dst->op is `GGML_OP_SQR`.
+ */
+void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Applies a clamp operation to the elements of a ggml tensor using the
+ *          CANN backend.
+ *
+ * @details This function clamps the elements of the input tensor `src` to a
+ *          specified range defined by `min` and `max` values. The result is
+ *          stored in the destination tensor `dst`. The operation is defined as:
+ *          \f[
+ *              y = \max(\min(x, max\_value), min\_value)
+ *           \f]
+ *          where `x` is an element of the input tensor, and `y` is the
+ *          corresponding element in the output tensor.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the clamped values will be stored.
+ *            dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
+ */
+void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Scales the elements of a ggml tensor by a constant factor using the
+ *          CANN backend.
+ *
+ * @details This function multiplies each element of the input tensor `src` by
+ *          a scaling factor `scale`, storing the result in the destination
+ *          tensor `dst`. The operation is defined as:
+ *          \f[
+ *             dst = src \times scale
+ *          \f]
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the scaled values will be stored.
+ *            dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
+ */
+void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Sorts the elements of a ggml tensor and returns the indices that
+ *          would sort the tensor using the CANN backend.
+ *
+ * @details This function performs an argsort operation on the input tensor
+ *          `src`. It sorts the elements of `src` in either ascending or
+ *          descending order, depending on the `GGML_SORT_ORDER_DESC`,
+ *          and returns the indices that would sort the original tensor.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the sorted indices will be stored.
+ *            dst->op is `GGML_OP_ARGSORT`.
+ */
+void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
+ *          backend.
+ *
+ * @details This function applies the Layer Normalization operation on the
+ *          input tensor `src` and stores the result in the destination tensor
+ *          `dst`. Layer Normalization normalizes the features at each sample in
+ *          a mini-batch independently. It is commonly used in neural networks
+ *          to normalize the activations of a layer by adjusting and scaling
+ *          the outputs.
+ *          The operation is defined as:
+ *          \f[
+ *              \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
+ *          \f]
+ *          `Var` defaults dst->ne[0]. `eps` is in dst->params.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ * @attention `Var` defaults to dst->ne[0].
+ */
+void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief  Computes the Group Normalization for a ggml tensor using the CANN
+ *         backend.
+ *
+ * @brief  This function applies the Group Normalization operation on the input
+ *         tensor `src` and stores the result in the destination tensor `dst`.
+ *         Group Normalization divides the channels into groups and normalizes
+ *         the features within each group across spatial locations.
+ *         It is commonly used in convolutional neural networks to improve
+ *         training stability and performance.
+ *         The operation is defined as:
+ *         \f[
+ *             \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
+ *         \f]
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ *            `n_groups` is in dst->params, which split C channel to `n_groups`.
+ *            dst->op is `GGML_OP_GROUP_NORM`.
+ *
+ * @attention eps defaults to 1e-6f.
+ */
+void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the accumulation of tensors using the CANN backend.
+ *
+ * @details This function performs an accumulation operation on two tensors.
+ *          Depending on the `inplace` flag, it either updates the destination
+ *          tensor `dst` in place by adding `alpha * src1` to it, or it creates
+ *          a new tensor as the result of `src0 + alpha * src1` and stores it in
+ *          `dst`.
+ *          The operation is defined as:
+ *          \f[
+ *               dst = src0 + alpha \times src1
+ *          \f]
+ *          if `inplace` is `true`, `src0` is equal to 'dst'.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the accumulated values will be stored.
+ *            `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
+ */
+void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the sum of elements along the last dimension of a ggml tensor
+ *          using the CANN backend.
+ *
+ * @details This function performs a reduction sum operation along the last
+ *          dimension of the input tensor `src`. The result of the sum is stored
+ *          in the destination tensor `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the reduced values will be stored。
+ *            dst->op is `GGML_OP_SUM_ROWS`.
+ *
+ * @attention `reduce_dims` defaults to 3, which means the last dimension.
+ */
+void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
+ *          the CANN backend.
+ *
+ * @details This function performs upsampling of the input tensor `src` using
+ *          nearest neighbor interpolation. The upsampling is applied to the
+ *          height and width dimensions (last two dimensions) of the tensor. The
+ *          result is stored in the destination tensor `dst`, which must have
+ *          the appropriate dimensions for the upsampled output.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the upsampled values will be stored.
+ *            dst->op is `GGML_OP_UPSCALE`.
+ */
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst);
+
+/**
+ * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
+ *          using the CANN backend.
+ *
+ * @details This function pads the input tensor `src` so that it matches the
+ *          dimensions of the destination tensor `dst`. The amount of padding
+ *          is calculated based on the difference in sizes between `src` and
+ *          `dst` along each dimension. The padded tensor is stored in `dst`.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor, which specifies the target dimensions for
+ *            padding. dst->op is `GGML_OP_PAD`.
+ */
+void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
+ *          backend.
+ *
+ * @details This function dispatches the execution of a 2D pooling operation on
+ *          the input tensor `dst`. The type of pooling (average or max) is
+ *          determined by the `op` parameter, which is read from the operation
+ *          parameters of `dst`. The function supports average pooling
+ *          (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
+ *          invalid operation is encountered, the function asserts a failure.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor on which the pooling operation is to be
+ *            performed. dst->op is `GGML_OP_POOL_2D`.
+ */
+void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Duplicates a ggml tensor using the CANN backend.
+ *
+ * @details This function duplicates the contents of the source tensor `src` to
+ *          the destination tensor `dst`. The function supports various tensor
+ *          types and configurations, including handling of extra data, type
+ *          conversions, and special cases for contiguous and non-contiguous
+ *          tensors.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the duplicated data will be stored.
+ *            dst->op is `GGML_OP_DUP`
+ *
+ * @attention Only support Fp16/FP32. Not support when src and dst have
+ *            different shape and dst is no-contiguous.
+ * @note:     This func need to simplify.
+ */
+void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
+ *          using the CANN backend.
+ *
+ * @details This function applies RMS normalization to the input tensor `src`
+ *          and stores the result in the destination tensor `dst`. RMS
+ *          normalization involves computing the root mean square of the input
+ *          tensor along a specified dimension and then dividing each element of
+ *          the tensor by this value, adjusted by a small epsilon value to
+ *          prevent division by zero.
+ *          The operation is defined as:
+ *          \f[
+ *               \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
+ *               \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
+ *          \f]
+ *          `eps` is in dst->op_params.
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the normalized values will be stored.
+ *            dst->op is `GGML_OP_RMS_NORM`.
+ */
+void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Applies a diagonal mask to the tensor with a specified value.
+ *
+ * @details This function creates a mask tensor filled with ones, then applies
+ *          an upper triangular and lower triangular operation to it based on
+ *          the number of past elements specified. Afterward, it adds the masked
+ *          tensor to the destination tensor in-place.
+ *
+ * @param ctx The backend CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored. dst->op is
+ *            `GGML_OP_DIAG_MASK`
+ * @param value The value to use for masking.
+ */
+void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
+
+/**
+ * @brief   Performs an image-to-column transformation on the input tensor.
+ *
+ * @details This function takes an input tensor and applies an image-to-column
+ *          operation, converting spatial dimensions into column-like
+ *          structures suitable for convolutional operations. It supports both
+ *          half-precision (F16) and single-precision (F32) floating-point data
+ *          types.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor that stores the result of the operation.
+ *            dst->op is `GGML_OP_IM2COL`.
+ */
+void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes time step embeddings using sine and cosine functions.
+ *
+ * @details This function calculates time step embeddings by applying sine and
+ *          cosine transformations to a given input tensor, which is typically
+ *          used in temporal models like diffusion models or transformers to
+ *          encode time information effectively.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the result of the embedding operation
+ *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
+ */
+void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+// @see ggml_cann_dup.
+void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the softmax activation with optional masking.
+ *
+ * @details This function computes the softmax activation over the input tensor,
+ *          optionally applying a mask and scaling factor. It supports both FP16
+ *          and FP32 data types and can handle masking by broadcasting the mask
+ *          across rows if necessary.
+ *          The function performs the following steps:
+ *          1. Multiplies the input tensor by a scale factor.
+ *          2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
+ *          3. Broadcasts the mask tensor if its dimensions do not match the
+ *             input tensor's dimensions.
+ *          4. Adds the mask to the scaled input tensor.
+ *          5. Applies the softmax activation function along the specified
+ *             dimension.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the result will be stored. dst->op is
+ *            `GGML_OP_SOFTMAX`.
+ */
+void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Extracts specific rows from a tensor based on indices.
+ *
+ * @details This function retrieves rows from a source tensor src0 according to
+ *          the indices provided in another tensor src1 and stores the result in
+ *          a destination tensor (\p dst). It supports different data types
+ *          including F32, F16, Q4_0, and Q8_0.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the extracted rows will be stored.
+ *            dst->op is `GGML_OP_GET_ROWS`.
+ */
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Executes matrix multiplication for the given tensor.
+ *
+ * @details This function performs matrix multiplication on the source tensors
+ *          associated with the destination tensor. It supports matrix
+ *          multiplication F32, F16, and Q8_0.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor for storing the result of the matrix
+ *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
+ */
+void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
+ *
+ * @details This function implements the RoPE mechanism, which is a method to
+ *          encode positional information into sequence data, particularly
+ *          useful in transformer models. It supports both F32 and F16 data
+ *          types.
+ *
+ * @param ctx The backend CANN context for executing operations.
+ * @param dst The destination tensor where the RoPE-transformed data will be
+ *            stored. dst->op is `GGML_OP_ROPE`.
+ *
+ * @note The function currently does not support cases where the n_dims is less
+ *       than the input tensor's first dimension.
+ * @note The function currently does not support cases where the freq_factors is
+ *       not NULL.
+ * @note The function currently does not support cases where the ext_factor is
+ *       not equal 0.
+ * @note The function currently does not support cases where the freq_scale is
+ *       not equal 1.
+ */
+void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
+                                       aclTensor*, uint64_t*, aclOpExecutor**),
+          aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
+void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    aclTensor* acl_src0;
+    aclTensor* acl_src1;
+    aclTensor* acl_dst;
+
+    // Need bcast
+    if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
+        BCAST_SHAPE(src0, src1)
+        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
+        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
+        acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
+    } else {
+        acl_src0 = ggml_cann_create_tensor(src0);
+        acl_src1 = ggml_cann_create_tensor(src1);
+        acl_dst = ggml_cann_create_tensor(dst);
+    }
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
+                               &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    aclrtStream main_stream = ctx.stream();
+    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
+
+    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_src1));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+// Activation functions template.
+template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
+                                       aclOpExecutor**),
+          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
+                              const aclrtStream)>
+void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    aclrtStream main_stream = ctx.stream();
+    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+// Activation functions template for const aclTensors.
+template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
+                                       uint64_t*, aclOpExecutor**),
+          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
+                              const aclrtStream)>
+void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    aclrtStream main_stream = ctx.stream();
+    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+#endif  // CANN_ACLNN_OPS
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
new file mode 100644
index 000000000..5164cb74e
--- /dev/null
+++ b/ggml/src/ggml-cann/common.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CANN_COMMON_H
+#define CANN_COMMON_H
+
+#include <acl/acl.h>
+
+#include <cstdio>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "../include/ggml-cann.h"
+#include "../include/ggml.h"
+
+#define MATRIX_ROW_PADDING 512
+#define GGML_CANN_MAX_STREAMS 8
+
+/**
+ * @brief Handles CANN-related errors by printing an error message and
+ *        terminating the program.
+ * @param stmt The statement that caused the error.
+ * @param func The function in which the error occurred.
+ * @param file The file in which the error occurred.
+ * @param line The line number at which the error occurred.
+ * @param msg The error message.
+ */
+[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
+                                  const char* file, int line, const char* msg);
+
+/**
+ * @brief Checks the result of a CANN function call and invokes the error
+ *        handler if the call fails.
+ * @param stmt The CANN function call to check.
+ * @param success The success code that indicates the call was successful.
+ * @param error_fn The function to call to retrieve the error message.
+ */
+#define ACL_CHECK_GEN(stmt, success, error_fn)                                \
+    do {                                                                      \
+        int err_code = (stmt);                                                \
+        if (err_code != (success)) {                                          \
+            ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
+        }                                                                     \
+    } while (0);
+
+#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
+
+/**
+ * @brief Contains information about CANN devices.
+ */
+struct ggml_cann_device_info {
+    /**
+     * @brief Number of CANN devices available.
+     */
+    int32_t device_count;
+
+    /**
+     * @brief Information about a single CANN device.
+     */
+    struct cann_device_info {
+        int cc;                 /**< Compute capability.                   */
+        size_t smpb;            /**< Maximum shared memory per block.      */
+        bool vmm;               /**< Virtual memory support.               */
+        size_t vmm_granularity; /**< Granularity of virtual memory.        */
+        size_t total_vram;      /**< Total video RAM available on the device. */
+    };
+
+    cann_device_info devices[GGML_CANN_MAX_DEVICES] =
+        {}; /**< Array of CANN device information. */
+};
+
+const ggml_cann_device_info& ggml_cann_info();
+
+void ggml_cann_set_device(int32_t device);
+int32_t ggml_cann_get_device();
+
+/**
+ * @brief Abstract base class for memory pools used by CANN.
+ */
+struct ggml_cann_pool {
+    /**
+     * @brief Virtual destructor for the memory pool.
+     */
+    virtual ~ggml_cann_pool() = default;
+
+    /**
+     * @brief Allocates memory from the pool.
+     *
+     * @param size         The size of the memory block to allocate.
+     * @param actual_size  Pointer to a variable where the actual allocated size
+     *                     will be stored.
+     * @return             Pointer to the allocated memory block.
+     */
+    virtual void* alloc(size_t size, size_t* actual_size) = 0;
+
+    /**
+     * @brief Frees a previously allocated memory block.
+     *
+     * @param ptr   Pointer to the memory block to free.
+     * @param size  Size of the memory block to free.
+     * @note Note that all CANN opertors are running async. Make sure memory is
+     *       still avaiable before this operator finished.
+     */
+    virtual void free(void* ptr, size_t size) = 0;
+};
+
+/**
+ * @brief RAII wrapper for managing memory allocations from a CANN memory pool.
+ */
+struct ggml_cann_pool_alloc {
+    ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
+    void* ptr = nullptr;    /**< Pointer to the allocated memory block. */
+    size_t actual_size = 0; /**< Actual size of the allocated memory block. */
+
+    /**
+     * @brief Default constructor.
+     */
+    ggml_cann_pool_alloc() = default;
+
+    /**
+     * @brief Constructor that initializes the memory pool.
+     * @param pool Reference to the memory pool.
+     */
+    explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
+
+    /**
+     * @brief Constructor that initializes the memory pool and allocates memory.
+     * @param pool Reference to the memory pool.
+     * @param size Size of the memory block to allocate.
+     */
+    ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    /**
+     * @brief Destructor that frees the allocated memory block.
+     */
+    ~ggml_cann_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    /**
+     * @brief Allocates memory from the pool.
+     * @param size Size of the memory block to allocate.
+     * @return Pointer to the allocated memory block.
+     */
+    void* alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = pool->alloc(size, &this->actual_size);
+        return ptr;
+    }
+
+    /**
+     * @brief Allocates memory from a specific memory pool.
+     * @param pool Reference to the memory pool.
+     * @param size Size of the memory block to allocate.
+     * @return Pointer to the allocated memory block.
+     */
+    void* alloc(ggml_cann_pool& pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    /**
+     * @brief Gets the pointer to the allocated memory block.
+     * @return Pointer to the allocated memory block.
+     */
+    void* get() { return ptr; }
+
+    // Deleted copy constructor
+    ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
+
+    // Deleted move constructor
+    ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
+
+    // Deleted copy assignment operator
+    ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
+
+    // Deleted move assignment operator
+    ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
+};
+
+/**
+ * @brief Context for managing CANN backend operations.
+ */
+struct ggml_backend_cann_context {
+    int32_t device;                  /**< Device ID. */
+    std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
+    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
+
+    /**
+     * @brief Constructor for initializing the context with a given device.
+     * @param device Device ID.
+     */
+    explicit ggml_backend_cann_context(int device)
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
+
+    /**
+     * @brief Destructor for cleaning up resources.
+     */
+    ~ggml_backend_cann_context() {
+        ggml_cann_set_device(device);
+        if (copy_event != nullptr) {
+            ACL_CHECK(aclrtDestroyEvent(copy_event));
+        }
+        for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
+            if (streams[i] != nullptr) {
+                ACL_CHECK(aclrtDestroyStream(streams[i]));
+            }
+        }
+    }
+
+    /**
+     * @brief Get or create a stream for a given index.
+     * @param stream Index of the stream.
+     * @return The stream corresponding to the given index.
+     */
+    aclrtStream stream(int stream) {
+        if (streams[stream] == nullptr) {
+            ggml_cann_set_device(device);
+            ACL_CHECK(aclrtCreateStream(&streams[stream]));
+        }
+        return streams[stream];
+    }
+
+    /**
+     * @brief Get or create the default stream (index 0).
+     * @return The default stream.
+     */
+    aclrtStream stream() { return stream(0); }
+
+    // TODO: each stream should have a memory pool.
+    std::unique_ptr<ggml_cann_pool>
+        mem_pool; /**< Memory pool for the device. */
+
+    /**
+     * @brief Create a new memory pool for a given device.
+     * @param device Device ID.
+     * @return A unique pointer to the new memory pool.
+     */
+    static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
+
+    /**
+     * @brief Get or create the memory pool for the context.
+     * @return Reference to the memory pool.
+     */
+    ggml_cann_pool& pool() {
+        if (mem_pool == nullptr) {
+            mem_pool = new_pool_for_device(device);
+        }
+        return *mem_pool;
+    }
+};
+
+#endif  // CANN_COMMON_H
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
new file mode 100644
index 000000000..d410c0244
--- /dev/null
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -0,0 +1,2188 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ggml-cann.h"
+
+#include <acl/acl.h>
+#include <stdarg.h>
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <mutex>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cann/aclnn_ops.h"
+#include "ggml-cann/common.h"
+
+#define GGML_COMMON_DECL_C
+
+#include "ggml-common.h"
+
+#define GGML_CANN_NAME "CANN"
+
+/**
+ * @brief Handles CANN errors by printing an error message and aborting.
+ *
+ * @param stmt The statement that caused the error.
+ * @param func The function in which the error occurred.
+ * @param file The file in which the error occurred.
+ * @param line The line number where the error occurred.
+ * @param msg The error message.
+ */
+[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
+                                  const char* file, int line, const char* msg) {
+    int32_t id = -1;
+    aclrtGetDevice(&id);
+
+    GGML_LOG_ERROR("CANN error: %s\n", msg);
+    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func,
+            file, line);
+    GGML_LOG_ERROR("  %s\n", stmt);
+    // abort with GGML_ASSERT to get a stack trace
+    GGML_ABORT("CANN error");
+}
+
+/**
+ * @brief Sets the device to be used by CANN.
+ *
+ * @param device The device ID to set.
+ */
+void ggml_cann_set_device(const int32_t device) {
+    // TODO: uncomment these lines after empty context has fixed.
+    // int current_device;
+    // ACL_CHECK(aclrtGetDevice(&current_device));
+
+    // if (device == current_device) {
+    //   return;
+    // }
+    ACL_CHECK(aclrtSetDevice(device));
+}
+
+/**
+ * @brief Retrieves the current device ID.
+ *
+ * @return The current device ID.
+ */
+int32_t ggml_cann_get_device() {
+    int32_t id;
+    ACL_CHECK(aclrtGetDevice(&id));
+    return id;
+}
+
+/**
+ * @brief Initialize the CANN device information.
+ *
+ * This function initializes the CANN device information by obtaining the
+ * device count and setting the memory allocation granularity for each device.
+ *
+ * @return A structure containing the device information.
+ */
+static ggml_cann_device_info ggml_cann_init() {
+    ggml_cann_device_info info = {};
+
+    aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
+
+    if (err != ACL_SUCCESS) {
+        GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
+                __func__, aclGetRecentErrMsg());
+        return info;
+    }
+
+    GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES);
+
+    for (int id = 0; id < info.device_count; ++id) {
+        aclrtPhysicalMemProp prop = {};
+        prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+        prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+        prop.memAttr = ACL_HBM_MEM_HUGE;
+        prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = id;
+        prop.reserve = 0;
+        ACL_CHECK(aclrtMemGetAllocationGranularity(
+            &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
+            &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
+    }
+
+    // TODO: add more device info later.
+    return info;
+}
+
+/**
+ * @brief Retrieve the CANN device information.
+ *
+ * This function returns a reference to a structure containing the CANN device
+ * information. The device information is initialized once and reused on
+ * subsequent calls.
+ *
+ * @return A reference to the structure containing the device information.
+ */
+const ggml_cann_device_info& ggml_cann_info() {
+    static ggml_cann_device_info info = ggml_cann_init();
+    return info;
+}
+
+//#define DEBUG_CANN_MALLOC
+/**
+ * @brief A pool of CANN buffers(legacy).
+ *
+ * This class manages a pool of CANN buffers for a specific device.
+ */
+struct ggml_cann_pool_leg : public ggml_cann_pool {
+    /**
+     * @brief The maximum number of buffers in the pool.
+     */
+    static const int MAX_BUFFERS = 256;
+
+    /**
+     * @brief The device ID associated with this buffer pool.
+     */
+    int device;
+
+    /**
+     * @brief Structure representing a CANN buffer.
+     */
+    struct ggml_cann_buffer {
+        void* ptr = nullptr;  ///< Pointer to the buffer memory.
+        size_t size = 0;      ///< Size of the buffer.
+    };
+
+    /**
+     * @brief Array of CANN buffers in the pool.
+     */
+    ggml_cann_buffer buffer_pool[MAX_BUFFERS] = {};
+
+    /**
+     * @brief Total size of all buffers in the pool.
+     */
+    size_t pool_size = 0;
+
+    /**
+     * @brief Constructor to initialize the buffer pool for a specific device.
+     *
+     * @param device The device ID to associate with this buffer pool.
+     */
+    explicit ggml_cann_pool_leg(int device) : device(device) {}
+
+    /**
+     * @brief Destructor to free all buffers in the pool.
+     */
+    ~ggml_cann_pool_leg() {
+        ggml_cann_set_device(device);
+        for (int i = 0; i < MAX_BUFFERS; ++i) {
+            ggml_cann_buffer& b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+                ACL_CHECK(aclrtFree(b.ptr));
+                pool_size -= b.size;
+            }
+        }
+        GGML_ASSERT(pool_size == 0);
+    }
+
+    /**
+     * @brief Allocate a buffer of the given size.
+     *
+     * @param size The size of the buffer to allocate.
+     * @param actual_size A pointer to a variable to receive the actual size of
+     * the allocated buffer.
+     * @return A pointer to the allocated buffer.
+     */
+    void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
+#ifdef DEBUG_CANN_MALLOC
+        int nnz = 0;
+        size_t max_size = 0;
+#endif
+        size_t best_diff = 1ull << 36;
+        int ibest = -1;
+        for (int i = 0; i < MAX_BUFFERS; ++i) {
+            ggml_cann_buffer& b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+#ifdef DEBUG_CANN_MALLOC
+                ++nnz;
+                if (b.size > max_size) max_size = b.size;
+#endif
+                if (b.size >= size) {
+                    size_t diff = b.size - size;
+                    if (diff < best_diff) {
+                        best_diff = diff;
+                        ibest = i;
+                        if (!best_diff) {
+                            void* ptr = b.ptr;
+                            *actual_size = b.size;
+                            b.ptr = nullptr;
+                            b.size = 0;
+                            return ptr;
+                        }
+                    }
+                }
+            }
+        }
+        if (ibest >= 0) {
+            ggml_cann_buffer& b = buffer_pool[ibest];
+            void* ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
+        void* ptr;
+        ggml_cann_set_device(device);
+        ACL_CHECK(
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        *actual_size = size;
+        pool_size += size;
+#ifdef DEBUG_CANN_MALLOC
+        GGML_LOG_INFO(
+            "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
+            "requested %u MB\n",
+            __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
+            (uint32_t)(pool_size / 1024 / 1024),
+            (uint32_t)(size / 1024 / 1024));
+#endif
+        return ptr;
+    }
+
+    /**
+     * @brief Free a buffer and return it to the pool.
+     *
+     * @param ptr Pointer to the buffer to free.
+     * @param size Size of the buffer to free.
+     */
+    void free(void* ptr, size_t size) override {
+        for (int i = 0; i < MAX_BUFFERS; ++i) {
+            ggml_cann_buffer& b = buffer_pool[i];
+            if (b.ptr == nullptr) {
+                b.ptr = ptr;
+                b.size = size;
+                return;
+            }
+        }
+        // memory should always buffered. these memory may still needed by
+        // tasks in stream.
+        // TODO, fix me.
+        GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
+    }
+};
+
+/**
+ * @brief A pool of CANN buffers with virtual memory.
+ *
+ * This class manages a pool of CANN buffers with virtual memory for a specific
+ * device.
+ */
+struct ggml_cann_pool_vmm : public ggml_cann_pool {
+    /**
+     * @brief The maximum size of the virtual memory pool (32 GB).
+     */
+    size_t max_size;
+
+    /**
+     * @brief The device ID associated with this buffer pool.
+     */
+    int device;
+
+    /**
+     * @brief Pointer to the start of the virtual memory pool.
+     */
+    void* pool_addr = 0;
+
+    /**
+     * @brief Amount of virtual memory used in the pool.
+     */
+    size_t pool_used = 0;
+
+    /**
+     * @brief Total size of the virtual memory pool.
+     */
+    size_t pool_size = 0;
+
+    /**
+     * @brief Allocation granularity for the virtual memory pool.
+     */
+    size_t granularity;
+
+    /**
+     * @brief Handles for the physical memory allocated.
+     */
+    std::vector<aclrtDrvMemHandle> handles;
+
+    /**
+     * @brief Offsets for the mapped memory regions.
+     */
+    std::vector<void*> map_offsets;
+
+    /**
+     * @brief Constructor to initialize the buffer pool with virtual memory for
+     * a specific device.
+     *
+     * @param device The device ID to associate with this buffer pool.
+     */
+    explicit ggml_cann_pool_vmm(int device)
+        : device(device),
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }
+
+    /**
+     * @brief Destructor to free all buffers in the virtual memory pool.
+     */
+    ~ggml_cann_pool_vmm() {
+        if (pool_addr != 0) {
+            for (auto& offset : map_offsets) {
+                ACL_CHECK(aclrtUnmapMem(offset));
+            }
+            for (auto& handle : handles) {
+                ACL_CHECK(aclrtFreePhysical(handle));
+            }
+            ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
+        }
+    }
+
+    /**
+     * @brief Allocate a buffer of the given size in the virtual memory pool.
+     *
+     * @param size The size of the buffer to allocate.
+     * @param actual_size A pointer to a variable to receive the actual size of
+     * the allocated buffer.
+     * @return A pointer to the allocated buffer.
+     */
+    void* alloc(size_t size, size_t* actual_size) override {
+        // round up the allocation size to the alignment to ensure that all
+        // allocations are aligned for all data types
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
+
+        size_t avail = pool_size - pool_used;
+
+        if (size > avail) {
+            // round up to the next multiple of the granularity
+            size_t reserve_size = size - avail;
+            reserve_size = GGML_PAD(reserve_size, granularity);
+
+            GGML_ASSERT(pool_size + reserve_size <= max_size);
+
+            // allocate more physical memory
+            aclrtPhysicalMemProp prop = {};
+            prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+            prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+            prop.memAttr = ACL_HBM_MEM_HUGE;
+            prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id = device;
+            prop.reserve = 0;
+            aclrtDrvMemHandle handle;
+            ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
+
+            // reserve virtual address space (if not already reserved)
+            if (pool_addr == 0) {
+                ACL_CHECK(aclrtReserveMemAddress(
+                    &pool_addr, max_size, 0, NULL, 1));
+            }
+
+            // map at the end of the pool
+            ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
+                                  handle, 0));
+
+            handles.push_back(handle);
+            map_offsets.push_back((char*)pool_addr + pool_size);
+
+            // add to the pool
+            pool_size += reserve_size;
+
+#ifdef DEBUG_CANN_MALLOC
+             GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                   device, (unsigned long long) (pool_size/1024/1024),
+                   (unsigned long long) (reserve_size/1024/1024));
+#endif
+        }
+
+        GGML_ASSERT(pool_addr != 0);
+
+        void* ptr = (void*)((char*)pool_addr + pool_used);
+        *actual_size = size;
+        pool_used += size;
+
+#ifdef DEBUG_CANN_MALLOC
+        GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
+               (unsigned long long)size, (unsigned long long)ptr);
+#endif
+        return ptr;
+    }
+
+    /**
+     * @brief Free a buffer and return it to the virtual memory pool.
+     *
+     * @param ptr Pointer to the buffer to free.
+     * @param size Size of the buffer to free.
+     */
+    void free(void* ptr, size_t size) override {
+#ifdef DEBUG_CANN_MALLOC
+        GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
+               (unsigned long long)size, (unsigned long long)ptr);
+#endif
+
+        pool_used -= size;
+
+        // all deallocations must be in reverse order of the allocations
+        GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
+    }
+};
+
+/**
+ * @brief Create a new CANN pool for a specific device.
+ *
+ * Factory method to create a new CANN pool object based on the device type.
+ *
+ * @param device The device ID for which to create the pool.
+ * @return A unique pointer to the created CANN pool.
+ */
+std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
+    int device) {
+    return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+}
+
+// cann buffer
+/**
+ * @brief Context for managing a CANN buffer associated with a specific device.
+ *
+ * This structure holds information about a CANN buffer, including the device
+ * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
+ */
+struct ggml_backend_cann_buffer_context {
+    int32_t device;  ///< The device ID associated with this buffer context.
+    void* dev_ptr =
+        nullptr;  ///< Pointer to the device memory allocated for the buffer.
+
+    /**
+     * @brief Constructor to initialize the CANN buffer context.
+     *
+     * @param device The device ID associated with this buffer context.
+     * @param dev_ptr Pointer to the device memory allocated for the buffer.
+     */
+    ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
+        : device(device),
+          dev_ptr(dev_ptr) {}
+
+    /**
+     * @brief Destructor to free the device memory allocated for the buffer.
+     */
+    ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
+};
+
+/**
+ * @brief Check if a buffer is a CANN buffer.
+ *
+ * This function checks if a given buffer is a CANN buffer by comparing its
+ * `get_name` function pointer to `ggml_backend_cann_buffer_get_name`.
+ *
+ * @param buffer The buffer to check.
+ * @return true if the buffer is a CANN buffer, false otherwise.
+ */
+static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
+static bool ggml_backend_buffer_is_cann(
+    ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_cann(buffer->buft);
+}
+
+/**
+ * @brief Free resources associated with a CANN buffer.
+ *
+ * This function frees the resources associated with a CANN buffer, including
+ * its context.
+ *
+ * @param buffer The CANN buffer to free.
+ */
+static void ggml_backend_cann_buffer_free_buffer(
+    ggml_backend_buffer_t buffer) {
+    ggml_backend_cann_buffer_context* ctx =
+        (ggml_backend_cann_buffer_context*)buffer->context;
+    delete ctx;
+}
+
+/**
+ * @brief Retrieve the base pointer of a CANN buffer.
+ *
+ * This function returns the base pointer of a CANN buffer, which points to the
+ * device memory allocated for the buffer.
+ *
+ * @param buffer The CANN buffer whose base pointer is to be retrieved.
+ * @return A pointer to the base of the device memory allocated for the buffer.
+ */
+static void* ggml_backend_cann_buffer_get_base(
+    ggml_backend_buffer_t buffer) {
+    ggml_backend_cann_buffer_context* ctx =
+        (ggml_backend_cann_buffer_context*)buffer->context;
+    return ctx->dev_ptr;
+}
+
+/**
+ * @brief Transform quantized Q4.0 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * This function transforms quantized Q4.0 tensor data into a format suitable
+ * for CANN processing. It extracts quantization values and scales from the
+ * source data and prepares them in a format expected by CANN operations.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q4.0 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK4_0;
+    size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        const block_q4_0* group =
+            (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
+        *scale_offset = group->d;
+        scale_offset++;
+
+        // 0-15
+        for (int j = 0; j < QK4_0 / 2; j += 2) {
+            (*quant_offset) = (group->qs[j] & 0x0F);
+            (*quant_offset) |= ((group->qs[j + 1] << 4));
+            quant_offset++;
+        }
+
+        // 16-31
+        for (int j = 0; j < QK4_0 / 2; j += 2) {
+            (*quant_offset) = (group->qs[j] >> 4);
+            (*quant_offset) |= (group->qs[j + 1] & 0xF0);
+            quant_offset++;
+        }
+    }
+
+    // put (uint4b_t -8) into int4b_t
+    for (quant_offset = (uint8_t*)dst;
+         quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
+        (*quant_offset) ^= 0x88;
+    }
+}
+
+/**
+ * @brief Transform CANN processed data back into quantized Q4.0 format.
+ *
+ * This function transforms CANN processed data back into quantized Q4.0 format.
+ * It reverses the transformation performed by
+ * ggml_backend_cann_transform_q4_0(), converting the data back into its
+ * original quantized form.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source buffer containing transformed data.
+ * @param dst Pointer to the destination buffer where the Q4.0 formatted data
+ * will be stored.
+ */
+static void ggml_backend_cann_transform_back_q4_0(
+    const ggml_tensor* tensor, void* src, void* dst) {
+
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK4_0;
+    size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+
+    uint8_t* quant_offset = (uint8_t*)src;
+    uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
+
+    for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
+        (*quant_offset) ^= 0x88;
+    }
+    quant_offset = (uint8_t*)src;
+
+    for (int i = 0; i < groups; i++) {
+        block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
+        group->d = *scale_offset;
+        scale_offset++;
+
+        // 0-15
+        for (int j = 0; j < QK4_0 / 2; j += 2) {
+            group->qs[j] = ((*quant_offset) & 0x0F);
+            group->qs[j + 1] = ((*quant_offset) >> 4);
+            quant_offset++;
+        }
+
+        // 16-31
+        for (int j = 0; j < QK4_0 / 2; j += 2) {
+            group->qs[j] |= ((*quant_offset) << 4);
+            group->qs[j + 1] |= ((*quant_offset) & 0xF0);
+            quant_offset++;
+        }
+    }
+}
+
+/**
+ * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN
+ * processing.
+ *
+ * This function transforms quantized Q8.0 tensor data into a format suitable
+ * for CANN processing. It extracts quantization values and scales from the
+ * source data and prepares them in a format expected by CANN operations.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data in Q8.0 format.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
+                                             const void* src,
+                                             void* dst) {
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK8_0;
+    size_t quant_bytes = n_elems * sizeof(uint8_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        const block_q8_0* group =
+            (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
+        *scale_offset = group->d;
+        scale_offset++;
+        size_t group_quant_size = QK8_0 * sizeof(uint8_t);
+        memcpy(quant_offset, group->qs, group_quant_size);
+        quant_offset += group_quant_size;
+    }
+}
+
+/**
+ * @brief Transform CANN processed data back into quantized Q8.0 format.
+ *
+ * This function transforms CANN processed data back into quantized Q8.0 format.
+ * It reverses the transformation performed by
+ * ggml_backend_cann_transform_q8_0(), converting the data back into its
+ * original quantized form.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source buffer containing transformed data.
+ * @param dst Pointer to the destination buffer where the Q8.0 formatted data
+ * will be stored.
+ */
+static void ggml_backend_cann_transform_back_q8_0(
+    const ggml_tensor* tensor, const void* src, void* dst) {
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK8_0;
+    size_t quant_bytes = n_elems * sizeof(uint8_t);
+
+    const uint8_t* quant_offset = (const uint8_t*)src;
+    const uint16_t* scale_offset =
+        (const uint16_t*)((const char*)src + quant_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
+        group->d = *scale_offset;
+        scale_offset++;
+        size_t group_quant_size = QK8_0 * sizeof(uint8_t);
+        memcpy(group->qs, quant_offset, group_quant_size);
+        quant_offset += group_quant_size;
+    }
+}
+
+/**
+ * @brief Transform tensor data based on its type for CANN processing.
+ *
+ * This function transforms tensor data based on its quantization type for CANN
+ * processing. It dispatches the transformation based on the tensor's type to
+ * specialized functions handling Q4.0 and Q8.0 formats.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data to be transformed.
+ * @param dst Pointer to the destination buffer where transformed data will be
+ * stored.
+ */
+static void ggml_backend_cann_transform(ggml_tensor* tensor,
+                                        const void* src, void* dst) {
+    switch (tensor->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_backend_cann_transform_q4_0(tensor, src, dst);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_backend_cann_transform_q8_0(tensor, src, dst);
+            break;
+        default:
+            break;
+    }
+}
+
+/**
+ * @brief Transform CANN processed data back into tensor data based on its type.
+ *
+ * This function transforms CANN processed data back into tensor data based on
+ * its quantization type for Q4.0 and Q8.0 formats. It dispatches the
+ * transformation based on the tensor's type to specialized functions.
+ *
+ * @param tensor Pointer to the tensor information.
+ * @param src Pointer to the source data containing CANN processed data.
+ * @param dst Pointer to the destination buffer where transformed tensor data
+ * will be stored.
+ */
+static void ggml_backend_cann_transform_back(
+    const ggml_tensor* tensor, void* src, void* dst) {
+    switch (tensor->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_backend_cann_transform_back_q8_0(tensor, src, dst);
+            break;
+        default:
+            break;
+    }
+}
+
+/**
+ * @brief Check if transformation is needed for a given tensor type.
+ *
+ * This function checks if transformation is needed for a given tensor type
+ * to prepare data for CANN processing.
+ *
+ * @param type The tensor type to check.
+ * @return true if transformation is needed, false otherwise.
+ */
+static bool need_transform(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q8_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/**
+ * @brief Initialize a tensor using data from a CANN buffer.
+ *
+ * This function initializes a tensor using data from a CANN buffer.
+ * It handles special cases such as views and quantization.
+ *
+ * @param buffer The CANN buffer from which to initialize the tensor.
+ * @param tensor Pointer to the tensor to be initialized.
+ */
+static void ggml_backend_cann_buffer_init_tensor(
+    ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+        return;
+    }
+
+    // TODO: can backend doesn't support quantized yet. Just leave the code
+    // here.
+    if (ggml_is_quantized(tensor->type)) {
+        // Initialize padding to 0 to avoid possible NaN values
+        size_t original_size = ggml_nbytes(tensor);
+        size_t padded_size =
+            ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            size_t memset_size = padded_size - original_size;
+            ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
+                                  memset_size, 0, memset_size));
+        }
+    }
+}
+
+// TODO: need handle tensor which has paddings.
+/**
+ * @brief Set tensor data in a CANN buffer.
+ *
+ * This function sets tensor data in a CANN buffer, handling transformations
+ * if needed based on the tensor's type.
+ *
+ * @param buffer The CANN buffer where the tensor data will be set.
+ * @param tensor Pointer to the tensor whose data will be set.
+ * @param data Pointer to the source data to be copied into the tensor.
+ * @param offset Offset in the source data from where to start copying.
+ * @param size Size of the data to be copied, in bytes.
+ */
+static void ggml_backend_cann_buffer_set_tensor(
+    ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
+    size_t offset, size_t size) {
+    ggml_backend_cann_buffer_context *ctx =
+        (ggml_backend_cann_buffer_context *)buffer->context;
+
+    ggml_cann_set_device(ctx->device);
+    // TODO: refer to cann(#6017), it use thread's default stream.
+    // For acl, synchronous functions use this default stream.
+    // Why aclrtSynchronizeDevice?
+
+    if (!need_transform(tensor->type)) {
+        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
+                              ACL_MEMCPY_HOST_TO_DEVICE));
+    } else {
+        void *transform_buffer = malloc(size);
+        ggml_backend_cann_transform(tensor, data, transform_buffer);
+
+        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
+                              transform_buffer, size,
+                              ACL_MEMCPY_HOST_TO_DEVICE));
+        free(transform_buffer);
+    }
+}
+
+/**
+ * @brief Get tensor data from a CANN buffer.
+ *
+ * This function retrieves tensor data from a CANN buffer, handling
+ * transformations if needed based on the tensor's type.
+ *
+ * @param buffer The CANN buffer from which to retrieve tensor data.
+ * @param tensor Pointer to the tensor whose data will be retrieved.
+ * @param data Pointer to the destination buffer where the tensor data will be
+ * copied.
+ * @param offset Offset in the destination buffer where to start copying.
+ * @param size Size of the data to be copied, in bytes.
+ */
+static void ggml_backend_cann_buffer_get_tensor(
+    ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
+    size_t offset, size_t size) {
+    ggml_backend_cann_buffer_context* ctx =
+        (ggml_backend_cann_buffer_context*)buffer->context;
+
+    ggml_cann_set_device(ctx->device);
+
+    if (!need_transform(tensor->type)) {
+        ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
+                              ACL_MEMCPY_DEVICE_TO_HOST));
+    } else {
+        void* transform_buffer = malloc(size);
+        ACL_CHECK(aclrtMemcpy(transform_buffer, size,
+                              (char*)tensor->data + offset, size,
+                              ACL_MEMCPY_DEVICE_TO_HOST));
+        ggml_backend_cann_transform_back(tensor, transform_buffer, data);
+        free(transform_buffer);
+    }
+}
+
+/**
+ * @brief Copy tensor data between CANN buffers if possible.
+ *
+ * This function copies tensor data between CANN buffers if the source and
+ * destination buffers are CANN buffers and they meet the necessary conditions
+ * (same device or devices can access each other).
+ *
+ * @param buffer The destination CANN buffer where the tensor data will be
+ * copied.
+ * @param src Pointer to the source tensor whose data will be copied.
+ * @param dst Pointer to the destination tensor where the data will be copied.
+ * @return true if the copy operation succeeded, false otherwise.
+ */
+static bool ggml_backend_cann_buffer_cpy_tensor(
+    ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
+    if (ggml_backend_buffer_is_cann(src->buffer)) {
+        ggml_backend_cann_buffer_context* src_ctx =
+            (ggml_backend_cann_buffer_context*)src->buffer->context;
+        ggml_backend_cann_buffer_context* dst_ctx =
+            (ggml_backend_cann_buffer_context*)buffer->context;
+
+        size_t memcpy_size = ggml_nbytes(src);
+        // Same device.
+        if (src_ctx->device == dst_ctx->device) {
+            ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
+                                  (const char*)src->data, memcpy_size,
+                                  ACL_MEMCPY_DEVICE_TO_DEVICE));
+            return true;
+        } else {
+            // Different device but can access by peer.
+            int32_t canAccessPeer = 0;
+            ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
+                                               dst_ctx->device));
+            if (canAccessPeer) {
+                ggml_cann_set_device(src_ctx->device);
+                ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
+                ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
+                                      (const char*)src->data, memcpy_size,
+                                      ACL_MEMCPY_DEVICE_TO_DEVICE));
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/**
+ * @brief Clear a CANN buffer by setting all its memory to a specified value.
+ *
+ * This function clears a CANN buffer by setting all its memory to a specified
+ * value.
+ *
+ * @param buffer The CANN buffer to be cleared.
+ * @param value The value to which each byte in the buffer will be set.
+ */
+static void ggml_backend_cann_buffer_clear(
+    ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_cann_buffer_context* ctx =
+        (ggml_backend_cann_buffer_context*)buffer->context;
+
+    ggml_cann_set_device(ctx->device);
+    ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
+}
+
+/**
+ * @brief Interface for a CANN buffer in the backend.
+ *
+ * This structure defines function pointers to operations that can be performed
+ * on a CANN buffer within the backend.
+ */
+static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_cann_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cann_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cann_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_cann_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cann_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cann_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cann_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// cann buffer type
+/**
+ * @brief Structure representing context information for a specific backend
+ * buffer type.
+ */
+struct ggml_backend_cann_buffer_type_context {
+    int32_t
+        device; /**< Device identifier associated with the buffer context. */
+    std::string name; /**< Name associated with the buffer context. */
+};
+
+/**
+ * @brief Retrieves the name associated with a CANN buffer type.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN buffer type context.
+ *
+ * @param buft Pointer to the buffer type context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+static const char* ggml_backend_cann_buffer_type_name(
+    ggml_backend_buffer_type_t buft) {
+    ggml_backend_cann_buffer_type_context* buft_ctx =
+        (ggml_backend_cann_buffer_type_context*)buft->context;
+
+    return buft_ctx->name.c_str();
+}
+
+/**
+ * @brief Allocates a new CANN buffer of the specified type and size.
+ *
+ * This function allocates a new CANN buffer on the specified device with the
+ * given size.
+ *
+ * @param buft Pointer to the buffer type context.
+ * @param size Size in bytes of the buffer to allocate.
+ * @return Pointer to the allocated buffer, or nullptr if allocation fails.
+ */
+static ggml_backend_buffer_t
+ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                           size_t size) {
+    ggml_backend_cann_buffer_type_context* buft_ctx =
+        (ggml_backend_cann_buffer_type_context*)buft->context;
+
+    ggml_cann_set_device(buft_ctx->device);
+
+    size = std::max(size, (size_t)1);
+
+    void* dev_ptr;
+    aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    if (err != ACL_SUCCESS) {
+        GGML_LOG_ERROR(
+            "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
+            __func__, size / 1024.0 / 1024.0, buft_ctx->device,
+            aclGetRecentErrMsg());
+        return nullptr;
+    }
+
+    ggml_backend_cann_buffer_context* ctx =
+        new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
+                                    ctx, size);
+}
+
+/**
+ * @brief Retrieves the memory alignment requirement for CANN buffers of this
+ * type.
+ *
+ * This function returns the alignment requirement in bytes for memory allocated
+ * by the CANN buffer type.
+ *
+ * @param buft Pointer to the buffer type context (unused in this
+ * implementation).
+ * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
+ * buffers).
+ */
+static size_t ggml_backend_cann_buffer_type_get_alignment(
+    ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Calculates the allocation size required for a tensor in a CANN buffer.
+ *
+ * Computes the total allocation size needed for storing the tensor's data in a
+ * CANN buffer, considering any necessary padding or adjustments for quantized
+ * types.
+ *
+ * @param buft Pointer to the buffer type context (unused in this
+ * implementation).
+ * @param tensor Pointer to the tensor for which the allocation size is
+ * calculated.
+ * @return The total allocation size in bytes required for the tensor in the
+ * CANN buffer.
+ */
+static size_t ggml_backend_cann_buffer_type_get_alloc_size(
+    ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+    size_t size = ggml_nbytes(tensor);
+    int64_t ne0 = tensor->ne[0];
+
+    // last line must bigger than 32, because every single op deal at
+    // least 32 bytes.
+    // TODO: quantized type?
+    // int64_t line_size = ne0 * ggml_element_size(tensor);
+    // int64_t line_size_align_32 = (line_size + 31) & ~31;
+    // size += (line_size_align_32 - line_size);
+
+    // TODO: not support quantized yet.
+    // TODO: consider un-continue tensor.
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(
+                tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Interface for managing CANN buffer types in the GGML backend.
+ *
+ * Provides function pointers for allocating, querying properties, and managing
+ * memory for CANN buffer types in the GGML backend.
+ */
+static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_cann_buffer_type_name,
+    /* .alloc_buffer     = */ ggml_backend_cann_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cann_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL,  // defaults to SIZE_MAX
+    /* .get_alloc_size   = */ ggml_backend_cann_buffer_type_get_alloc_size,
+    /* .is_host          = */ ggml_backend_cann_buffer_type_is_host,
+};
+
+/**
+ * @brief Retrieves the CANN buffer type for a specified device.
+ *
+ * This function initializes and returns the buffer type interface associated
+ * with the given device. It ensures thread-safe access using a mutex.
+ *
+ * @param device The device index for which to retrieve the buffer type.
+ * @return A pointer to the buffer type interface for the specified device, or
+ * nullptr if the device index is out of range.
+ */
+ggml_backend_buffer_type_t
+ggml_backend_cann_buffer_type(int32_t device) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (device >= ggml_backend_cann_get_device_count()) {
+        return nullptr;
+    }
+
+    static ggml_backend_buffer_type
+        ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
+
+    static bool ggml_backend_cann_buffer_type_initialized = false;
+
+    if (!ggml_backend_cann_buffer_type_initialized) {
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
+            ggml_backend_cann_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_cann_buffer_type_interface,
+                /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
+                /* .context  = */
+                 new ggml_backend_cann_buffer_type_context{
+                    i, "CANN" + std::to_string(i)},
+            };
+        }
+        ggml_backend_cann_buffer_type_initialized = true;
+    }
+
+    return &ggml_backend_cann_buffer_types[device];
+}
+
+/**
+ * @brief Retrieves the name associated with a CANN host buffer type.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN host buffer type context.
+ *
+ * @param buft Pointer to the host buffer type context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return "CANN_Host";
+
+    GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Retrieves the name associated with a CANN host buffer.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN host buffer context.
+ *
+ * @param buft Pointer to the host buffer context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CANN_Host";
+
+    GGML_UNUSED(buffer);
+}
+
+/**
+ * @brief Free resources associated with a CANN host buffer.
+ *
+ * This function frees the resources associated with a CANN host buffer, including
+ * its context.
+ *
+ * @param buffer The CANN host buffer to free.
+ */
+static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
+    ACL_CHECK(aclrtFreeHost(buffer->context));
+}
+
+/**
+ * @brief Allocates a new CANN host buffer of the specified size.
+ *
+ * This function allocates a new CANN host buffer with the given size.
+ * @param size Size in bytes of the host buffer to allocate.
+ * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
+ */
+static void * ggml_cann_host_malloc(size_t size) {
+    if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
+    void * hostPtr = nullptr;
+    aclError err = aclrtMallocHost((void **) &hostPtr, size);
+    if (err != ACL_SUCCESS) {
+        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, aclGetRecentErrMsg());
+        return nullptr;
+    }
+    return hostPtr;
+}
+
+/**
+ * @brief Allocates a new CANN host buffer of the specified type and size.
+ *
+ * @param buft Pointer to the host buffer type context.
+ * @param size Size in bytes of the host buffer to allocate.
+ * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
+ */
+static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * hostPtr = ggml_cann_host_malloc(size);
+
+    if (hostPtr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
+
+    return buffer;
+}
+
+/**
+ * @brief Interface for managing CANN host buffer types in the GGML backend.
+ *
+ * Provides function pointers for allocating, querying properties, and managing
+ * memory for CANN buffer types in the GGML backend.
+ */
+ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cann_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cann_buffer_type_host;
+}
+
+/**
+ * @brief Computes the forward operation for a given tensor using CANN
+ * operations.
+ *
+ * This function selects the appropriate CANN operation based on the type of
+ * operation specified in the tensor and performs the computation.
+ *
+ * @param ctx The CANN context containing necessary resources and
+ * configurations.
+ * @param dst The destination tensor where the result of the computation will be
+ * stored.
+ * @return true if the computation was successful; false otherwise.
+ */
+static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
+                                      struct ggml_tensor* dst) {
+    switch (dst->op) {
+        case GGML_OP_REPEAT:
+            ggml_cann_repeat(ctx, dst);
+            break;
+        case GGML_OP_GET_ROWS:
+            ggml_cann_get_rows(ctx, dst);
+            break;
+        case GGML_OP_DUP:
+            ggml_cann_dup(ctx, dst);
+            break;
+        case GGML_OP_ADD:
+            ggml_cann_add(ctx, dst);
+            break;
+        case GGML_OP_ACC:
+            ggml_cann_acc(ctx, dst);
+            break;
+        case GGML_OP_MUL:
+            ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
+            break;
+        case GGML_OP_DIV:
+            ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_GELU:
+                    ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
+                        ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    ggml_cann_activation<aclnnSiluGetWorkspaceSize, aclnnSilu>(
+                        ctx, dst);
+                    break;
+                // TODO: Use faster gelu??
+                case GGML_UNARY_OP_GELU_QUICK:
+                    ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
+                        ctx, dst);
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    ggml_cann_activation<aclnnTanhGetWorkspaceSize, aclnnTanh>(
+                        ctx, dst);
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(
+                        ctx, dst);
+                    break;
+                case GGML_UNARY_OP_HARDSIGMOID:
+                    ggml_cann_activation<aclnnHardsigmoidGetWorkspaceSize,
+                                         aclnnHardsigmoid>(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_HARDSWISH:
+                    ggml_cann_activation<aclnnHardswishGetWorkspaceSize,
+                                         aclnnHardswish>(ctx, dst);
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            ggml_cann_norm(ctx, dst);
+            break;
+        case GGML_OP_GROUP_NORM:
+            ggml_cann_group_norm(ctx, dst);
+            break;
+        case GGML_OP_CONCAT:
+            ggml_cann_concat(ctx, dst);
+            break;
+        case GGML_OP_UPSCALE:
+            ggml_cann_upsample_nearest2d(ctx, dst);
+            break;
+        case GGML_OP_PAD:
+            ggml_cann_pad(ctx, dst);
+            break;
+        case GGML_OP_ARANGE:
+            ggml_cann_arange(ctx, dst);
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            ggml_cann_timestep_embedding(ctx, dst);
+            break;
+        case GGML_OP_LEAKY_RELU:
+            ggml_cann_leaky_relu(ctx, dst);
+            break;
+        case GGML_OP_RMS_NORM:
+            ggml_cann_rms_norm(ctx, dst);
+            break;
+        case GGML_OP_MUL_MAT:
+            ggml_cann_mul_mat(ctx, dst);
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            return false;
+        case GGML_OP_SCALE:
+            ggml_cann_scale(ctx, dst);
+            break;
+        case GGML_OP_SQR:
+            ggml_cann_sqr(ctx, dst);
+            break;
+        case GGML_OP_CLAMP:
+            ggml_cann_clamp(ctx, dst);
+            break;
+        case GGML_OP_CPY:
+            ggml_cann_cpy(ctx, dst);
+            break;
+        case GGML_OP_CONT:
+            ggml_cann_dup(ctx, dst);
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            ggml_cann_diag_mask(ctx, dst, -INFINITY);
+            break;
+        case GGML_OP_SOFT_MAX:
+            ggml_cann_softmax(ctx, dst);
+            break;
+        case GGML_OP_ROPE:
+            ggml_cann_rope(ctx, dst);
+            break;
+        case GGML_OP_IM2COL:
+            ggml_cann_im2col(ctx, dst);
+            break;
+        case GGML_OP_POOL_2D:
+            ggml_cann_pool2d(ctx, dst);
+            break;
+        case GGML_OP_SUM_ROWS:
+            ggml_cann_sum_rows(ctx, dst);
+            break;
+        case GGML_OP_ARGSORT:
+            ggml_cann_argsort(ctx, dst);
+            break;
+        default:
+            return false;
+    }
+
+    return true;
+}
+
+// backend
+/**
+ * @brief Retrieves the name associated with the CANN backend.
+ *
+ * This function returns the name assigned to the CANN backend, which is stored
+ * in the context of the provided backend structure.
+ *
+ * @param backend Pointer to the CANN backend structure.
+ * @return A pointer to a constant string representing the backend name.
+ */
+static const char* ggml_backend_cann_name(ggml_backend_t backend) {
+    ggml_backend_cann_context* cann_ctx =
+        (ggml_backend_cann_context*)backend->context;
+
+    return cann_ctx->name.c_str();
+}
+
+/**
+ * @brief Frees resources associated with the CANN backend.
+ *
+ * This function releases resources associated with the CANN backend context
+ * and resets the device associated with the backend to its initial state.
+ *
+ * @param backend Pointer to the CANN backend structure to be freed.
+ */
+static void ggml_backend_cann_free(ggml_backend_t backend) {
+    ggml_backend_cann_context* cann_ctx =
+        (ggml_backend_cann_context*)backend->context;
+    ACL_CHECK(aclrtSynchronizeDevice());
+    ACL_CHECK(aclrtResetDevice(cann_ctx->device));
+
+    // finalize when last backend freed.
+    if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
+        ACL_CHECK(aclFinalize());
+    }
+
+    delete cann_ctx;
+    delete backend;
+}
+
+/**
+ * @brief Sets tensor data asynchronously in the CANN backend.
+ *
+ * This function asynchronously sets tensor data in the CANN backend. Depending
+ * on the tensor type, it may perform data transformations before copying data
+ * to the device.
+ *
+ * @param backend Pointer to the CANN backend structure.
+ * @param tensor Pointer to the tensor structure to set data for.
+ * @param data Pointer to the host data to copy to the tensor.
+ * @param offset Offset in bytes within the host data.
+ * @param size Size of the data to copy in bytes.
+ */
+static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
+                                               ggml_tensor *tensor,
+                                               const void *data,
+                                               size_t offset,
+                                               size_t size) {
+    ggml_backend_cann_context *cann_ctx =
+        (ggml_backend_cann_context *)backend->context;
+
+    if (!need_transform(tensor->type)) {
+        ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
+                                   size, ACL_MEMCPY_HOST_TO_DEVICE,
+                                   cann_ctx->stream()));
+    } else {
+        void *transform_buffer = malloc(size);
+        ggml_backend_cann_transform(tensor, data, transform_buffer);
+
+        ACL_CHECK(aclrtMemcpyAsync(
+            (char *)tensor->data + offset, size, transform_buffer, size,
+            ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
+        ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
+        free(transform_buffer);
+    }
+}
+
+static void ggml_backend_cann_get_tensor_async(
+    ggml_backend_t backend, const ggml_tensor *tensor, void *data,
+    size_t offset, size_t size) {
+    ggml_backend_cann_context *cann_ctx =
+        (ggml_backend_cann_context *)backend->context;
+    ggml_backend_buffer_t buf =
+        tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
+                "unsupported buffer type");
+
+    if (!need_transform(tensor->type)) {
+        ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
+                                   size, ACL_MEMCPY_DEVICE_TO_HOST,
+                                   cann_ctx->stream()));
+    } else {
+        void *transform_buffer = malloc(size);
+        ACL_CHECK(aclrtMemcpyAsync(
+            transform_buffer, size, (char *)tensor->data + offset, size,
+            ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
+        ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
+        ggml_backend_cann_transform_back(tensor, transform_buffer, data);
+        free(transform_buffer);
+    }
+}
+
+/**
+ * @brief Asynchronously copies tensor data between CANN backends.
+ *
+ * This function copies tensor data asynchronously between two CANN backends. It
+ * checks if both tensors reside in CANN buffers and whether the devices support
+ * peer-to-peer access for direct copying. If not, it returns false.
+ *
+ * @param backend_src Pointer to the source CANN backend structure.
+ * @param backend_dst Pointer to the destination CANN backend structure.
+ * @param src Pointer to the source tensor to copy data from.
+ * @param dst Pointer to the destination tensor to copy data to.
+ * @return true if the copy operation succeeds, false otherwise.
+ */
+static bool ggml_backend_cann_cpy_tensor_async(
+    ggml_backend_t backend_src, ggml_backend_t backend_dst,
+    const ggml_tensor* src, ggml_tensor* dst) {
+    GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
+                ggml_backend_is_cann(backend_dst));
+
+    if (!ggml_backend_buffer_is_cann(src->buffer) ||
+        !ggml_backend_buffer_is_cann(dst->buffer)) {
+        return false;
+    }
+
+    ggml_backend_buffer_t buf_src =
+        src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst =
+        dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+    ggml_backend_cann_context* cann_ctx_src =
+        (ggml_backend_cann_context*)backend_src->context;
+    ggml_backend_cann_context* cann_ctx_dst =
+        (ggml_backend_cann_context*)backend_dst->context;
+
+    size_t copy_size = ggml_nbytes(dst);
+    if (backend_src != backend_dst) {
+        ggml_backend_cann_buffer_context* buf_ctx_src =
+            (ggml_backend_cann_buffer_context*)buf_src->context;
+        ggml_backend_cann_buffer_context* buf_ctx_dst =
+            (ggml_backend_cann_buffer_context*)buf_dst->context;
+
+        GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
+
+        int32_t canAccessPeer = 0;
+        ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
+                                           cann_ctx_dst->device));
+        if (!canAccessPeer) {
+            return false;
+        }
+
+        // need open both directions for memcpyasync between devices.
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
+        ggml_cann_set_device(cann_ctx_src->device);
+        ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
+
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                   cann_ctx_src->stream()));
+
+        //TODO: workaround for Event didn`t work here.
+        aclrtSynchronizeStream(cann_ctx_src->stream());
+    } else {
+        // src and dst are on the same backend
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                   cann_ctx_dst->stream()));
+    }
+
+    return true;
+}
+
+/**
+ * @brief Synchronizes a CANN backend.
+ *
+ * This function synchronizes the specified CANN backend by waiting for all
+ * operations in its associated stream to complete.
+ *
+ * @param backend Pointer to the CANN backend structure to synchronize.
+ */
+static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
+    ggml_backend_cann_context* cann_ctx =
+        (ggml_backend_cann_context*)backend->context;
+
+    ggml_cann_set_device(cann_ctx->device);
+
+    ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
+}
+
+/**
+ * @brief Computes a computational graph using a CANN backend.
+ *
+ * This function computes the operations defined in the computational graph
+ * using the specified CANN backend.
+ *
+ * @param backend Pointer to the CANN backend structure to use for computation.
+ * @param cgraph Pointer to the computational graph structure containing nodes
+ *               representing operations to be computed.
+ * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
+ *         completes successfully, otherwise an appropriate error status.
+ */
+static enum ggml_status ggml_backend_cann_graph_compute(
+    ggml_backend_t backend, ggml_cgraph* cgraph) {
+    ggml_backend_cann_context* cann_ctx =
+        (ggml_backend_cann_context*)backend->context;
+
+    ggml_cann_set_device(cann_ctx->device);
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor* node = cgraph->nodes[i];
+
+        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
+            continue;
+        }
+
+        bool ok = ggml_cann_compute_forward(*cann_ctx, node);
+
+        if (!ok) {
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
+                    node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+/**
+ * @brief Checks if the CANN backend supports a specific operation.
+ *
+ * This function checks whether the specified operation is supported by the
+ * CANN backend.
+ *
+ * @param backend Pointer to the CANN backend structure to check support for
+ *                the operation.
+ * @param op Pointer to the tensor representing the operation to check.
+ * @return bool Returns true if the operation is supported by the backend,
+ *              otherwise false.
+ */
+static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
+                                                    const ggml_tensor* op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+        case GGML_OP_MUL_MAT: {
+            switch (op->src[0]->type) {
+                case GGML_TYPE_Q8_0:
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                case GGML_TYPE_Q4_0:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_MUL_MAT_ID:
+            return false;
+        // embedding
+        case GGML_OP_GET_ROWS: {
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q8_0:
+                    return true;
+                default:
+                    return false;
+            }
+        } break;
+        case GGML_OP_CPY: {
+            switch (op->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
+
+            return true;
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
+        case GGML_OP_DUP:
+        case GGML_OP_REPEAT:
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+/**
+ * @brief Checks if the backend buffer type is associated with the CANN backend.
+ *
+ * This function checks whether the provided backend buffer type is associated
+ * with the CANN backend based on the comparison of its name retrieval function
+ * pointer.
+ *
+ * @param buft Pointer to the backend buffer type to check.
+ * @return bool Returns true if the buffer type is associated with the CANN
+ * backend, otherwise false.
+ */
+static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
+}
+
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
+                                                   const ggml_tensor* op) {
+    const int min_batch_size = 32;
+    GGML_UNUSED(dev);
+
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
+/**
+ * @brief Records an event on the CANN backend stream.
+ *
+ * This function records the given event on the ACL runtime stream associated
+ * with the backend context.
+ *
+ * @param event Pointer to the event structure to be recorded.
+ */
+static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cann_context* cann_ctx =
+        (ggml_backend_cann_context*)backend->context;
+    ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
+}
+
+/**
+ * @brief Waits for a recorded event to complete on the CANN backend stream.
+ *
+ * This function makes the given backend wait for the event to complete on its
+ * ACL runtime stream.
+ *
+ * @param backend Pointer to the backend structure.
+ * @param event Pointer to the event structure that the backend needs to wait
+ * for.
+ */
+static void ggml_backend_cann_event_wait(ggml_backend_t backend,
+                                         ggml_backend_event_t event) {
+    ggml_backend_cann_context* cann_ctx =
+        (ggml_backend_cann_context*)backend->context;
+    if (ggml_backend_is_cann(backend)) {
+        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
+                                       (aclrtEvent)event->context));
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+/**
+ * @brief Structure defining the interface for the CANN backend.
+ *
+ * This structure contains function pointers for various operations
+ * supported by the CANN backend, including name retrieval, memory
+ * management, tensor operations, synchronization, and event handling.
+ */
+static const ggml_backend_i ggml_backend_cann_interface = {
+    /* .get_name                = */ ggml_backend_cann_name,
+    /* .free                    = */ ggml_backend_cann_free,
+    /* .set_tensor_async        = */ ggml_backend_cann_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cann_get_tensor_async,
+    /* .cpy_tensor_async        = */ ggml_backend_cann_cpy_tensor_async,
+    /* .synchronize             = */ ggml_backend_cann_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_cann_graph_compute,
+    /* .event_record            = */ ggml_backend_cann_event_record,
+    /* .event_wait              = */ ggml_backend_cann_event_wait,
+};
+
+/**
+ * @brief Return the hardcoded GUID for the CANN backend.
+ *
+ * This function returns a static GUID which uniquely identifies the CANN
+ * backend.
+ *
+ * @return A pointer to the static GUID.
+ */
+static ggml_guid_t ggml_backend_cann_guid() {
+    static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
+                             0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
+    return &guid;
+}
+
+// backend device
+struct ggml_backend_cann_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    ggml_backend_cann_get_device_memory(ctx->device, free, total);
+}
+
+static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_cann_device_get_name(dev);
+    props->description = ggml_backend_cann_device_get_description(dev);
+    props->type        = ggml_backend_cann_device_get_type(dev);
+    ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
+
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ true,
+    };
+}
+
+static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ggml_backend_cann_init(ctx->device);
+}
+
+/**
+ * @brief Checks if the CANN backend supports a specific backend buffer type.
+ *
+ * This function determines whether the CANN backend supports the given backend
+ * buffer type by comparing the device context of the backend and buffer type.
+ * It returns true if the devices are same between the backend context and
+ * buffer type context.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @param buft Pointer to the backend buffer type to check.
+ * @return bool Returns true if the CANN backend supports the buffer type,
+ *              otherwise false.
+ */
+static bool ggml_backend_cann_supports_buft(
+    ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cann(buft)) {
+        ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+        ggml_backend_cann_buffer_type_context * buft_ctx =
+                        (ggml_backend_cann_buffer_type_context *)buft->context;
+        return buft_ctx->device == dev_ctx->device;
+    }
+    return false;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+    return ggml_backend_cann_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_cann_host_buffer_type();
+}
+
+/**
+ * @brief Creates a new event for the CANN backend device.
+ *
+ * This function initializes a new event for the CANN backend by setting the
+ * device and creating an ACL runtime event. The created event is then wrapped
+ * in a ggml_backend_event structure and returned.
+ *
+ * @param backend Pointer to the CANN backend.
+ * @return ggml_backend_event_t Returns a pointer to the new event structure.
+ */
+static ggml_backend_event_t ggml_backend_cann_device_event_new(
+    ggml_backend_dev_t dev) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    ggml_cann_set_device(dev_ctx->device);
+
+    aclrtEvent event;
+    ACL_CHECK(aclrtCreateEvent(&event));
+
+    return new ggml_backend_event{
+        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
+        /* .context = */ event,
+    };
+}
+
+/**
+ * @brief Frees a CANN backend event.
+ *
+ * This function destroys the ACL runtime event associated with the given CANN
+ * backend event and then deletes the event structure itself.
+ *
+ * @param event Pointer to the event structure to be freed.
+ */
+static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
+
+    delete event;
+    GGML_UNUSED(dev);
+}
+
+/**
+ * @brief Synchronizes the given event on the CANN backend.
+ *
+ * This function waits for the specified event to complete on the ACL runtime.
+ *
+ * @param event Pointer to the event structure to be synchronized.
+ */
+static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
+
+    GGML_UNUSED(dev);
+}
+
+static const ggml_backend_device_i ggml_backend_cann_device_interface = {
+    /* .get_name                = */ ggml_backend_cann_device_get_name,
+    /* .get_description         = */ ggml_backend_cann_device_get_description,
+    /* .get_memory              = */ ggml_backend_cann_device_get_memory,
+    /* .get_type                = */ ggml_backend_cann_device_get_type,
+    /* .get_props               = */ ggml_backend_cann_device_get_props,
+    /* .init_backend            = */ ggml_backend_cann_device_init,    // called for every card
+    /* .get_buffer_type         = */ ggml_backend_cann_device_get_buffer_type,
+    /* .get_host_buffer_type    = */ ggml_backend_cann_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr    = */ NULL, // not supported for CANN
+    /* .supports_op             = */ ggml_backend_cann_supports_op,
+    /* .supports_buft           = */ ggml_backend_cann_supports_buft,
+    /* .offload_op              = */ ggml_backend_cann_offload_op,
+    /* .event_new               = */ ggml_backend_cann_device_event_new,
+    /* .event_free              = */ ggml_backend_cann_device_event_free,
+    /* .event_synchronize       = */ ggml_backend_cann_device_event_synchronize,
+};
+
+
+// backend reg
+struct ggml_backend_cann_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return GGML_CANN_NAME;
+}
+
+static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
+    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+    return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+}
+
+static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+    // reserved for future use
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
+    /* .get_name          = */ ggml_backend_cann_reg_get_name,
+    /* .get_device_count  = */ ggml_backend_cann_reg_get_device_count,
+    /* .get_device        = */ ggml_backend_cann_reg_get_device,
+    /* .get_proc_address  = */ ggml_backend_cann_reg_get_proc_address,
+};
+
+// backend registry, called only once for cann backend
+ggml_backend_reg_t ggml_backend_cann_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            aclInit(nullptr);
+            ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+
+            for (int i = 0; i < ggml_cann_info().device_count; i++) {
+                ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
+                dev_ctx->description = aclrtGetSocName();
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+                ggml_cann_set_device(i);
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .iface   = */ ggml_backend_cann_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_cann_reg_interface,
+                /* .context     = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}
+
+ggml_backend_t ggml_backend_cann_init(int32_t device) {
+    aclInit(nullptr);
+    if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
+        GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
+    if (ctx == nullptr) {
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    }
+    ggml_cann_set_device(ctx->device);
+    ggml_backend_t cann_backend =
+        new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
+                         /* .interface = */ ggml_backend_cann_interface,
+                         /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                         /* .context   = */ ctx};
+
+    return cann_backend;
+}
+
+bool ggml_backend_is_cann(ggml_backend_t backend) {
+    return backend != NULL &&
+           ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
+}
+
+int32_t ggml_backend_cann_get_device_count() {
+    return ggml_cann_info().device_count;
+}
+
+void ggml_backend_cann_get_device_description(
+    int32_t device, char* description, size_t description_size) {
+    ggml_cann_set_device(device);
+    const char* soc_name = aclrtGetSocName();
+    snprintf(description, description_size, "%s", soc_name);
+}
+
+void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
+                                         size_t* total) {
+    ggml_cann_set_device(device);
+    ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt
new file mode 100644
index 000000000..d687220c3
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -0,0 +1,30 @@
+file(GLOB SRC_FILES
+    get_row_f32.cpp
+    get_row_f16.cpp
+    get_row_q4_0.cpp
+    get_row_q8_0.cpp
+    quantize_f32_q8_0.cpp
+    quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
+    dup.cpp
+)
+
+set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
+set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
+
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
+endif()
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ascendc_library(ascendc_kernels STATIC
+    ${SRC_FILES}
+)
+
+message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
new file mode 100644
index 000000000..7e153208c
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -0,0 +1,19 @@
+#ifndef ASCENDC_KERNELS_H
+#define ASCENDC_KERNELS_H
+
+#include "aclrtlaunch_ascendc_get_row_f32.h"
+#include "aclrtlaunch_ascendc_get_row_f16.h"
+#include "aclrtlaunch_ascendc_get_row_q8_0.h"
+#include "aclrtlaunch_ascendc_get_row_q4_0.h"
+
+#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
+
+#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
+#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
+#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
+#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
+
+#endif  // ASCENDC_KERNELS_H
diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp
new file mode 100644
index 000000000..c7ba38d10
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -0,0 +1,236 @@
+#include "kernel_operator.h"
+
+#include <cmath>
+
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
+
+template <typename SRC_T, typename DST_T>
+class DupByRows {
+   public:
+    __aicore__ inline DupByRows() {}
+    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
+                                size_t *input_nb_ub) {
+        /* Dup by rows when src is contigous on first dimension and dst is
+        contiguous, each kernel process one row.
+        */
+
+        // Input has four dims.
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        // param
+        num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
+        num_elem = input_ne_ub[0];
+
+        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
+        idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
+        idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
+                  / (input_ne_ub[1]);
+        idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
+                - idx_ne2 * input_ne_ub[1];
+
+        // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
+        src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
+                     + input_nb_ub[1] * idx_ne1;
+
+        // dst is contiguous
+        dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
+
+        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
+                                                                src_stride));
+        dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
+                                                                dst_stride));
+
+        pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
+                                                32 - 1) / 32 * 32);
+        pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
+                                                32 - 1) / 32 * 32);
+    }
+
+    __aicore__ inline void copy_in() {
+        LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
+        src_queue.EnQue(src_local);
+    }
+
+    __aicore__ inline void copy_out() {
+        LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
+        DataCopyExtParams dataCopyParams;
+        dataCopyParams.blockCount = 1;
+        dataCopyParams.blockLen = num_elem * sizeof(DST_T);
+        DataCopyPad(dst_gm, dst_local, dataCopyParams);
+#endif
+        dst_queue.FreeTensor(dst_local);
+    }
+
+    __aicore__ inline void dup() {
+        // main process, copy one row data from src to dst.
+        copy_in();
+
+        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
+        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
+
+        int32_t BLOCK_NUM = 32 / sizeof(DST_T);
+        DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
+                                        / BLOCK_NUM * BLOCK_NUM);
+        dst_queue.EnQue<DST_T>(dst_local);
+
+        src_queue.FreeTensor(src_local);
+        copy_out();
+    }
+
+    __aicore__ inline void dup_with_cast() {
+        // main process, copy one row data from src to dst.
+        // cast dtype from src to dst.
+        copy_in();
+
+        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
+        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
+
+        Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
+        dst_queue.EnQue<DST_T>(dst_local);
+
+        src_queue.FreeTensor(src_local);
+        copy_out();
+    }
+
+   private:
+
+    TPipe pipe;
+    GlobalTensor<SRC_T> src_gm;
+    GlobalTensor<DST_T> dst_gm;
+
+    int64_t num_rows;
+    int64_t num_elem;
+    int64_t idx_ne3;
+    int64_t idx_ne2;
+    int64_t idx_ne1;
+    int64_t src_stride;
+    int64_t dst_stride;
+
+    TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
+                                                        GM_ADDR src_gm,
+                                                        GM_ADDR dst_gm,
+                                                        GM_ADDR input_ne_gm,
+                                                        GM_ADDR input_nb_gm,
+                                                        GM_ADDR output_ne_gm,
+                                                        GM_ADDR output_nb_gm) {
+
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    DupByRows<half, half> op;
+    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+    op.dup();
+}
+
+extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
+                                                        GM_ADDR src_gm,
+                                                        GM_ADDR dst_gm,
+                                                        GM_ADDR input_ne_gm,
+                                                        GM_ADDR input_nb_gm,
+                                                        GM_ADDR output_ne_gm,
+                                                        GM_ADDR output_nb_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    DupByRows<float_t, float_t> op;
+    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+    op.dup();
+}
+
+extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
+                                                        GM_ADDR src_gm,
+                                                        GM_ADDR dst_gm,
+                                                        GM_ADDR input_ne_gm,
+                                                        GM_ADDR input_nb_gm,
+                                                        GM_ADDR output_ne_gm,
+                                                        GM_ADDR output_nb_gm) {
+
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    DupByRows<float_t, half> op;
+    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+    op.dup_with_cast();
+}
+
+extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
+                                                        GM_ADDR src_gm,
+                                                        GM_ADDR dst_gm,
+                                                        GM_ADDR input_ne_gm,
+                                                        GM_ADDR input_nb_gm,
+                                                        GM_ADDR output_ne_gm,
+                                                        GM_ADDR output_nb_gm) {
+
+    // copy params from gm to ub.
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    DupByRows<half, float_t> op;
+    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
+    op.dup_with_cast();
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
new file mode 100644
index 000000000..416b45104
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -0,0 +1,197 @@
+#include "kernel_operator.h"
+
+// optimize me. Use template to avoid copy code.
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+
+class GET_ROW_F16 {
+   public:
+    __aicore__ inline GET_ROW_F16() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
+                                int64_t *output_ne_ub, size_t *output_nb_ub) {
+        // TODO, use template for F16/f32
+        int64_t op_block_num = GetBlockNum();
+        op_block_idx = GetBlockIdx();
+
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+            indices_ne[i] = indices_ne_ub[i];
+            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+
+            output_ne[i] = output_ne_ub[i];
+            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+        }
+
+        // Indices has two dims. n_elements = all rows should get.
+        // dr, all rows should this thread get.
+        uint64_t n_elements =
+            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+        dr = n_elements / op_block_num;
+
+        uint64_t tails = n_elements % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        input_gm.SetGlobalBuffer((__gm__ half *)input);
+        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+        output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+        uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
+                                             & ~31);
+        uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
+                                              & ~31);
+
+        local_buffer_elems = input_local_buffer_size / sizeof(half);
+
+        // TODO, consider long row that can't put in UB.
+        // All data should asign to 32. It's ok because all data is align to 32.
+        pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
+        pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
+        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if(tail != 0) {
+            len += elem_per_block;
+        }
+        DataCopy(input_local, input_gm[offset], len);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
+        LocalTensor<float> output_local = output_queue.DeQue<float>();
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
+        if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = tail * sizeof(float);
+            DataCopyPad(output_gm[offset + len], output_local[len],
+                        dataCopyParams);
+#endif
+        }
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void calculate_row(int64_t idx) {
+        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+        const int64_t indices_ne1_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+            indices_ne[0];
+        const int64_t indices_ne0_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+             indices_ne1_idx * indices_ne[0]);
+
+        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                       indices_ne1_idx * indices_stride[1] +
+                                       indices_ne2_idx * indices_stride[2];
+        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+        const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                     indices_ne1_idx * input_stride[2] +
+                                     indices_ne2_idx * input_stride[3];
+
+        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                      indices_ne1_idx * output_stride[2] +
+                                      indices_ne2_idx * output_stride[3];
+
+        copy_in(input_offset, input_ne[0]);
+        LocalTensor<half> input_local = input_queue.DeQue<half>();
+        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+        Cast(output_local, input_local, RoundMode::CAST_NONE,
+             local_buffer_elems);
+        output_queue.EnQue(output_local);
+        copy_out(output_offset, input_ne[0]);
+
+        input_queue.FreeTensor(input_local);
+    }
+
+    __aicore__ inline void calculate() {
+        for (int64_t i = ir; i < ir + dr; i++) {
+            calculate_row(i);
+        }
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t indices_ne[4];
+    size_t indices_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    size_t local_buffer_elems;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<half> input_gm;
+    GlobalTensor<int32_t> indices_gm;
+    GlobalTensor<float> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_get_row_f16(
+    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
+    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t indices_ne_ub[4];
+    size_t indices_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    GET_ROW_F16 op;
+    op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
+            indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
+    op.calculate();
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
new file mode 100644
index 000000000..02116905b
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@@ -0,0 +1,190 @@
+#include "kernel_operator.h"
+
+// optimize me. Use template to avoid copy code.
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+
+class GET_ROW_F32 {
+   public:
+    __aicore__ inline GET_ROW_F32() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
+                                int64_t *output_ne_ub, size_t *output_nb_ub) {
+        int64_t op_block_num = GetBlockNum();
+        op_block_idx = GetBlockIdx();
+
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+            indices_ne[i] = indices_ne_ub[i];
+            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+
+            output_ne[i] = output_ne_ub[i];
+            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+        }
+
+        // Indices has two dims. n_elements = all rows should get.
+        // dr, all rows should this thread get.
+        uint64_t n_elements =
+            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+        dr = n_elements / op_block_num;
+
+        uint64_t tails = n_elements % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        input_gm.SetGlobalBuffer((__gm__ float *)input);
+        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+        output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+        uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
+        local_buffer_elems = local_buffer_size / sizeof(float);
+
+        // TODO, consider long row that can't put in UB.
+        // All data should asign to 32. It's ok because all data is align to 32.
+        pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
+        pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if(tail != 0) {
+            len += elem_per_block;
+        }
+        DataCopy(input_local, input_gm[offset], len);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
+        LocalTensor<float> output_local = output_queue.DeQue<float>();
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
+        if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = tail * sizeof(float);
+            DataCopyPad(output_gm[offset + len], output_local[len],
+                        dataCopyParams);
+#endif
+        }
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void calculate_row(int64_t idx) {
+        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+        const int64_t indices_ne1_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+            indices_ne[0];
+        const int64_t indices_ne0_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+             indices_ne1_idx * indices_ne[0]);
+
+        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                       indices_ne1_idx * indices_stride[1] +
+                                       indices_ne2_idx * indices_stride[2];
+        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+        const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                     indices_ne1_idx * input_stride[2] +
+                                     indices_ne2_idx * input_stride[3];
+
+        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                      indices_ne1_idx * output_stride[2] +
+                                      indices_ne2_idx * output_stride[3];
+
+        copy_in(input_offset, input_ne[0]);
+        LocalTensor<float> input_local = input_queue.DeQue<float>();
+        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+        DataCopy(output_local, input_local, local_buffer_elems);
+        output_queue.EnQue(output_local);
+        copy_out(output_offset, input_ne[0]);
+
+        input_queue.FreeTensor(input_local);
+    }
+
+    __aicore__ inline void calculate() {
+        for (int64_t i = ir; i < ir + dr; i++) {
+            calculate_row(i);
+        }
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t indices_ne[4];
+    size_t indices_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    size_t local_buffer_elems;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<float> input_gm;
+    GlobalTensor<int32_t> indices_gm;
+    GlobalTensor<float> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_get_row_f32(
+    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
+    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t indices_ne_ub[4];
+    size_t indices_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    GET_ROW_F32 op;
+    op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
+            indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
+    op.calculate();
+}
diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
new file mode 100644
index 000000000..4fbe72208
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -0,0 +1,204 @@
+#include "kernel_operator.h"
+
+// optimize me. Use template to avoid copy code.
+using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support 4bit get row
+    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support 4bit get row.\n");
+    }
+#else
+
+#define BUFFER_NUM 2
+
+#define QK4_0 32
+
+class GET_ROW_Q4_0 {
+   public:
+    __aicore__ inline GET_ROW_Q4_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
+                                size_t *indices_nb_ub, int64_t *output_ne_ub,
+                                size_t *output_nb_ub) {
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            indices_ne[i] = indices_ne_ub[i];
+            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+            scale_ne[i] = input_ne_ub[i];
+            output_ne[i] = output_ne_ub[i];
+            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+        }
+
+        // one scale for a group.
+        scale_ne[0] /= QK4_0;
+
+        input_stride[0] = 1;
+        scale_stride[0] = 1;
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        group_size_in_row = input_ne[0] / QK4_0;
+        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
+                               input_ne[3] / 2;
+
+        // Indices has two dims. n_elements = all rows should get.
+        // dr, all rows should this thread get.
+        uint64_t n_elements =
+            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+        dr = n_elements / op_block_num;
+
+        uint64_t tails = n_elements % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
+        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+        output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
+        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
+        pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
+        // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
+        DataCopy(input_local, input_gm[offset], QK4_0);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        LocalTensor<float> output_local = output_queue.DeQue<float>();
+        DataCopy(output_gm[offset], output_local, QK4_0);
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
+        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+        const int64_t indices_ne1_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+            indices_ne[0];
+        const int64_t indices_ne0_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+             indices_ne1_idx * indices_ne[0]);
+
+        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                       indices_ne1_idx * indices_stride[1] +
+                                       indices_ne2_idx * indices_stride[2];
+        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+        const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                     indices_ne1_idx * input_stride[2] +
+                                     indices_ne2_idx * input_stride[3] +
+                                     group * QK4_0;
+        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
+                                     indices_ne1_idx * scale_stride[2] +
+                                     indices_ne2_idx * scale_stride[3] + group;
+        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                      indices_ne1_idx * output_stride[2] +
+                                      indices_ne2_idx * output_stride[3] +
+                                      group * QK4_0;
+
+        copy_in(input_offset);
+        LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
+        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
+        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+        // TODO: cast more data to speed up.
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
+        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
+
+        // Only mul need compile by group.
+        half scale = scale_gm.GetValue(scale_offset);
+
+        Muls(output_local, output_local, (float)scale, QK4_0);
+
+        input_queue.FreeTensor(input_local);
+        cast_queue.FreeTensor(cast_local);
+        output_queue.EnQue(output_local);
+
+        copy_out(output_offset);
+    }
+
+    __aicore__ inline void calculate() {
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                calculate_group(i, j);
+            }
+        }
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t scale_ne[4];
+    size_t scale_stride[4];
+
+    int64_t indices_ne[4];
+    size_t indices_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t ir;
+    int64_t dr;
+
+    int64_t group_size_in_row;
+
+    TPipe pipe;
+    GlobalTensor<int4b_t> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int32_t> indices_gm;
+    GlobalTensor<float> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
+    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+    int64_t input_ne_ub[4];
+    int64_t indices_ne_ub[4];
+    size_t indices_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    GET_ROW_Q4_0 op;
+    op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
+            indices_nb_ub, output_ne_ub, output_nb_ub);
+    op.calculate();
+}
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
new file mode 100644
index 000000000..ba9ab3c04
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
@@ -0,0 +1,191 @@
+#include "kernel_operator.h"
+
+// optimize me. Use template to avoid copy code.
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+
+#define QK8_0 32
+
+class GET_ROW_Q8_0 {
+   public:
+    __aicore__ inline GET_ROW_Q8_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
+                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
+                                size_t *indices_nb_ub, int64_t *output_ne_ub,
+                                size_t *output_nb_ub) {
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            indices_ne[i] = indices_ne_ub[i];
+            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
+            scale_ne[i] = input_ne_ub[i];
+            output_ne[i] = output_ne_ub[i];
+            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
+        }
+
+        // one scale for a group.
+        scale_ne[0] /= QK8_0;
+
+        input_stride[0] = 1;
+        scale_stride[0] = 1;
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        group_size_in_row = input_ne[0] / QK8_0;
+        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
+                               input_ne[3] * sizeof(int8_t);
+
+        // Indices has two dims. n_elements = all rows should get.
+        // dr, all rows should this thread get.
+        uint64_t n_elements =
+            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
+        dr = n_elements / op_block_num;
+
+        uint64_t tails = n_elements % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
+        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
+        output_gm.SetGlobalBuffer((__gm__ float *)output);
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
+        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
+        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
+        DataCopy(input_local, input_gm[offset], QK8_0);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        LocalTensor<float> output_local = output_queue.DeQue<float>();
+        DataCopy(output_gm[offset], output_local, QK8_0);
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
+        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
+        const int64_t indices_ne1_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
+            indices_ne[0];
+        const int64_t indices_ne0_idx =
+            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
+             indices_ne1_idx * indices_ne[0]);
+
+        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
+                                       indices_ne1_idx * indices_stride[1] +
+                                       indices_ne2_idx * indices_stride[2];
+        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
+
+        const int64_t input_offset = selected_row_idx * input_stride[1] +
+                                     indices_ne1_idx * input_stride[2] +
+                                     indices_ne2_idx * input_stride[3] +
+                                     group * QK8_0;
+        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
+                                     indices_ne1_idx * scale_stride[2] +
+                                     indices_ne2_idx * scale_stride[3] + group;
+        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
+                                      indices_ne1_idx * output_stride[2] +
+                                      indices_ne2_idx * output_stride[3] +
+                                      group * QK8_0;
+
+        copy_in(input_offset);
+        LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
+        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
+        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
+
+        // TODO: cast more data to speed up.
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
+        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
+
+        // Only mul need compile by group.
+        half scale = scale_gm.GetValue(scale_offset);
+        Muls(output_local, output_local, (float)scale, QK8_0);
+
+        input_queue.FreeTensor(input_local);
+        cast_queue.FreeTensor(cast_local);
+        output_queue.EnQue(output_local);
+
+        copy_out(output_offset);
+    }
+
+    __aicore__ inline void calculate() {
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                calculate_group(i, j);
+            }
+        }
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t scale_ne[4];
+    size_t scale_stride[4];
+
+    int64_t indices_ne[4];
+    size_t indices_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t ir;
+    int64_t dr;
+
+    int64_t group_size_in_row;
+
+    TPipe pipe;
+    GlobalTensor<int8_t> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int32_t> indices_gm;
+    GlobalTensor<float> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
+    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
+    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
+    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
+    int64_t input_ne_ub[4];
+    int64_t indices_ne_ub[4];
+    size_t indices_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
+    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);
+
+    GET_ROW_Q8_0 op;
+    op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
+            indices_nb_ub, output_ne_ub, output_nb_ub);
+    op.calculate();
+}
diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
new file mode 100644
index 000000000..504b43afa
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@@ -0,0 +1,218 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+#ifdef ASCEND_310P
+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f16->8bit quantization.\n");
+    }
+#else
+
+#define BUFFER_NUM 2
+#define QK8_0 32
+
+class QUANTIZE_F16_Q8_0 {
+   public:
+    __aicore__ inline QUANTIZE_F16_Q8_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *output_ne_ub) {
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+            output_ne[i] = output_ne_ub[i];
+        }
+
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+        }
+
+        scale_ne = input_ne;
+        scale_stride[0] = 1;
+        scale_stride[1] = input_ne[0] / QK8_0;
+        for (int i = 2; i < 4; i++) {
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        // split input tensor by rows.
+        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+        dr = nr / op_block_num;
+
+        uint64_t tails = nr % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        group_size_in_row = scale_stride[1];
+        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
+                              output_ne[3] * sizeof(uint8_t);
+
+        input_gm.SetGlobalBuffer((__gm__ half *)input);
+        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
+                                                 group_size_in_row *
+                                                 sizeof(half)));
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
+        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
+        pipe.InitBuffer(work_queue, 1, 32);
+        pipe.InitBuffer(max_queue, 1, 32);
+        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, 32);
+        pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
+        DataCopy(input_local, input_gm[offset], QK8_0);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
+        DataCopy(output_gm[offset], output_local, QK8_0);
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+        const int64_t i1 =
+            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+        const int64_t input_offset = i1 * input_stride[1] +
+                                     i2 * input_stride[2] +
+                                     i3 * input_stride[3] + QK8_0 * group;
+
+        const int64_t output_offset = i1 * output_stride[1] +
+                                      i2 * output_stride[2] +
+                                      i3 * output_stride[3] + QK8_0 * group;
+
+        copy_in(input_offset);
+        LocalTensor<half> input_local = input_queue.DeQue<half>();
+        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
+        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
+        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
+
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
+        Abs(abs_local, cast_local, QK8_0);
+        ReduceMax(max_local, abs_local, work_local, QK8_0);
+
+        pipe_barrier(PIPE_ALL);
+        float d = max_local.GetValue(0);
+        d = d / ((1 << 7) - 1);
+        if (d != 0) {
+            Muls(cast_local, cast_local, 1.0f / d, QK8_0);
+        }
+
+        Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
+        Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
+        Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
+        output_queue.EnQue(output_local);
+        copy_out(output_offset);
+
+        input_queue.FreeTensor(input_local);
+        work_queue.FreeTensor(work_local);
+        abs_queue.FreeTensor(abs_local);
+        max_queue.FreeTensor(max_local);
+        cast_queue.FreeTensor(cast_local);
+        return (half)d;
+    }
+
+    __aicore__ inline void calculate() {
+        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+        uint32_t scale_local_offset = 0;
+        uint32_t scale_global_offset = 0;
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                half scale = calculate_group(i, j);
+                scale_local.SetValue(scale_local_offset++, scale);
+                if (scale_local_offset == 16) {
+                    scale_local_offset = 0;
+                    // TODO: OPTIMIZE ME
+                    pipe_barrier(PIPE_ALL);
+                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
+                    pipe_barrier(PIPE_ALL);
+                    scale_global_offset += 16;
+                }
+            }
+        }
+
+        if (scale_local_offset != 0) {
+            pipe_barrier(PIPE_ALL);
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+            DataCopyPad(scale_gm[scale_global_offset], scale_local,
+                        dataCopyParams);
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t *scale_ne;
+    size_t scale_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t group_size_in_row;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<half> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int8_t> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, 1> work_queue;
+    TQue<QuePosition::VECOUT, 1> max_queue;
+    TQue<QuePosition::VECIN, 1> abs_queue;
+    TQue<QuePosition::VECOUT, 1> scale_queue;
+    TQue<QuePosition::VECOUT, 1> cast_queue;
+
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_F16_Q8_0 op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
new file mode 100644
index 000000000..05b0bc1df
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@@ -0,0 +1,216 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support f32->8bit quantization
+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f32->8bit quantization.\n");
+    }
+#else
+
+#define BUFFER_NUM 2
+#define QK8_0 32
+
+class QUANTIZE_F32_Q8_0 {
+   public:
+    __aicore__ inline QUANTIZE_F32_Q8_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *output_ne_ub) {
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+
+            output_ne[i] = output_ne_ub[i];
+        }
+
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+        }
+
+        scale_ne = input_ne;
+        scale_stride[0] = 1;
+        scale_stride[1] = input_ne[0] / QK8_0;
+        for (int i = 2; i < 4; i++) {
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        // split input tensor by rows.
+        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+        dr = nr / op_block_num;
+
+        uint64_t tails = nr % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        group_size_in_row = scale_stride[1];
+        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
+                              output_ne[3] * sizeof(uint8_t);
+
+        input_gm.SetGlobalBuffer((__gm__ float *)input);
+        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
+                                                 ir * group_size_in_row *
+                                                 sizeof(half)));
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
+        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
+        pipe.InitBuffer(work_queue, 1, 32);
+        pipe.InitBuffer(max_queue, 1, 32);
+        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
+        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
+        pipe.InitBuffer(scale_queue, 1, 32);
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
+        DataCopy(input_local, input_gm[offset], QK8_0);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
+        DataCopy(output_gm[offset], output_local, QK8_0);
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+        const int64_t i1 =
+            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+        const int64_t input_offset = i1 * input_stride[1] +
+                                     i2 * input_stride[2] +
+                                     i3 * input_stride[3] + QK8_0 * group;
+
+        const int64_t output_offset = i1 * output_stride[1] +
+                                      i2 * output_stride[2] +
+                                      i3 * output_stride[3] + QK8_0 * group;
+
+        copy_in(input_offset);
+        LocalTensor<float> input_local = input_queue.DeQue<float>();
+        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
+        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
+        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
+
+        Abs(abs_local, input_local, QK8_0);
+        ReduceMax(max_local, abs_local, work_local, QK8_0);
+        pipe_barrier(PIPE_ALL);
+        float d = max_local.GetValue(0);
+        d = d / ((1 << 7) - 1);
+        if (d != 0) {
+            Muls(input_local, input_local, 1.0f / d, QK8_0);
+        }
+
+        Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
+        Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
+        Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
+        output_queue.EnQue(output_local);
+        copy_out(output_offset);
+
+        input_queue.FreeTensor(input_local);
+        work_queue.FreeTensor(work_local);
+        abs_queue.FreeTensor(abs_local);
+        max_queue.FreeTensor(max_local);
+        cast_queue.FreeTensor(cast_local);
+
+        return (half)d;
+    }
+
+    __aicore__ inline void calculate() {
+        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+        uint32_t scale_local_offset = 0;
+        uint32_t scale_global_offset = 0;
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                half scale = calculate_group(i, j);
+                scale_local.SetValue(scale_local_offset++, scale);
+                if (scale_local_offset == 16) {
+                    scale_local_offset = 0;
+                    // TODO: OPTIMIZE ME
+                    pipe_barrier(PIPE_ALL);
+                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
+                    pipe_barrier(PIPE_ALL);
+                    scale_global_offset += 16;
+                }
+            }
+        }
+
+        if (scale_local_offset != 0) {
+            pipe_barrier(PIPE_ALL);
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+            DataCopyPad(scale_gm[scale_global_offset], scale_local,
+                        dataCopyParams);
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t *scale_ne;
+    size_t scale_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t group_size_in_row;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<float> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int8_t> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, 1> work_queue;
+    TQue<QuePosition::VECOUT, 1> max_queue;
+    TQue<QuePosition::VECIN, 1> abs_queue;
+    TQue<QuePosition::VECIN, 1> cast_queue;
+    TQue<QuePosition::VECOUT, 1> scale_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_F32_Q8_0 op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
new file mode 100644
index 000000000..1188937b7
--- /dev/null
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -0,0 +1,295 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+#ifdef ASCEND_310P // 310P not support float->4bit quantization
+    extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f32->4bit quantization.\n");
+    }
+
+    extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
+        printf("Ascend310P not support f16->4bit quantization.\n");
+    }
+#else
+
+#define BUFFER_NUM 2
+#define Group_Size 32
+
+template <typename SRC_T>
+class QUANTIZE_FLOAT_TO_Q4_0 {
+   public:
+    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *output_ne_ub) {
+        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+        //                         permute=[0,0,0,0]):
+        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        // input stride of data elements
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+            output_ne[i] = output_ne_ub[i];
+        }
+
+        // output stride of data elements
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+        }
+
+        // scale saved one by one after data:. [group1_scale, group2_scale, ...]
+        scale_ne = input_ne;
+        scale_stride[0] = 1;
+        scale_stride[1] = input_ne[0] / Group_Size;
+        for (int i = 2; i < 4; i++) {
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        // split input tensor by rows.
+        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+        dr = nr / op_block_num;
+
+        uint64_t tails = nr % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        group_size_in_row = scale_stride[1];
+        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
+                              output_ne[3] * sizeof(uint8_t) / 2;
+
+        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
+        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
+                                                 group_size_in_row *
+                                                 sizeof(half)));
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
+        pipe.InitBuffer(output_queue, BUFFER_NUM,
+                            Group_Size * sizeof(int8_t) / 2);
+        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
+        DataCopy(input_local, input_gm[offset], Group_Size);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
+        // and using DataCopyPad to avoid 32 bits align.
+        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
+        LocalTensor<int8_t> output_int8_local =
+                                    output_local.ReinterpretCast<int8_t>();
+
+        DataCopyExtParams dataCopyParams;
+        dataCopyParams.blockCount = 1;
+        dataCopyParams.blockLen = Group_Size / 2  * sizeof(int8_t);
+        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
+
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<float> input_local) {
+        DataCopy(cast_local, input_local, Group_Size);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<half> input_local) {
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
+    }
+
+    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+        const int64_t i1 =
+            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+        const int64_t input_offset = i1 * input_stride[1] +
+                                     i2 * input_stride[2] +
+                                     i3 * input_stride[3] + Group_Size * group;
+
+        // output_offset is stride for output_gm which datatype is int8_t and
+        // divided by 2 is needed for int4b_t.
+        const int64_t output_offset = (i1 * output_stride[1] +
+                                       i2 * output_stride[2] +
+                                       i3 * output_stride[3] +
+                                       Group_Size * group) / 2;
+        copy_in(input_offset);
+
+        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
+        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
+        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
+        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
+        LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
+        LocalTensor<half> half_local = half_queue.AllocTensor<half>();
+
+        input_to_cast(cast_local, input_local);
+
+        ReduceMax(max_local, cast_local, work_local, Group_Size);
+        ReduceMin(min_local, cast_local, work_local, Group_Size);
+        const float max_value = max_local.GetValue(0);
+        const float min_value = min_local.GetValue(0);
+        float d = max_value;
+        if (min_value < 0 && (-1 * min_value) > max_value) {
+            d = min_value;
+        }
+
+        d = d / (-8);
+        if (d != 0) {
+            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
+        }
+
+        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
+        float scalar = 8.5f;
+        Adds(cast_local, cast_local, scalar, Group_Size);
+        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
+        scalar = 15.0f;
+        Mins(cast_local, cast_local, scalar, Group_Size);
+        scalar = -8.0f;
+        Adds(cast_local, cast_local, scalar, Group_Size);
+
+        // float->half->int4b
+        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
+        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
+
+        output_queue.EnQue(output_local);
+        copy_out(output_offset);
+
+        input_queue.FreeTensor(input_local);
+        work_queue.FreeTensor(work_local);
+        max_queue.FreeTensor(max_local);
+        min_queue.FreeTensor(min_local);
+        int8_queue.FreeTensor(int8_local);
+        half_queue.FreeTensor(half_local);
+        cast_queue.FreeTensor(cast_local);
+        return (half)d;
+    }
+
+    __aicore__ inline void calculate() {
+        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+        uint32_t scale_local_offset = 0;
+        uint32_t scale_global_offset = 0;
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                half scale = calculate_group(i, j);
+                scale_local.SetValue(scale_local_offset++, scale);
+                // Copy Group_Size/2 length data each time.
+                if (scale_local_offset == Group_Size / 2) {
+                    scale_local_offset = 0;
+                    // TODO: OPTIMIZE ME
+                    pipe_barrier(PIPE_ALL);
+                    DataCopy(scale_gm[scale_global_offset], scale_local,
+                                      Group_Size / 2);
+                    pipe_barrier(PIPE_ALL);
+                    scale_global_offset += Group_Size / 2;
+                }
+            }
+        }
+
+        if (scale_local_offset != 0) {
+            pipe_barrier(PIPE_ALL);
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+            DataCopyPad(scale_gm[scale_global_offset], scale_local,
+                        dataCopyParams);
+            pipe_barrier(PIPE_ALL);
+        }
+        scale_queue.FreeTensor(scale_local);
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t *scale_ne;
+    size_t scale_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t group_size_in_row;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<SRC_T> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int8_t> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_FLOAT_TO_Q4_0<half> op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_FLOAT_TO_Q4_0<float> op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
+
+#endif // #ifdef ASCEND_310P
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
new file mode 100644
index 000000000..f13fd4dea
--- /dev/null
+++ b/ggml/src/ggml-common.h
@@ -0,0 +1,1853 @@
+#ifndef GGML_COMMON_DECL
+
+#if defined(GGML_COMMON_DECL_C)
+#include <stdint.h>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allow anonymous unions but some compiler warn on it
+#define GGML_COMMON_AGGR_U data
+// std-c++ do not allow it.
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_METAL)
+#include <metal_stdlib>
+
+typedef half  ggml_half;
+typedef half2 ggml_half2;
+
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CUDA)
+#if defined(GGML_COMMON_DECL_MUSA)
+#include <musa_fp16.h>
+#else
+#include <cuda_fp16.h>
+#endif
+#include <cstdint>
+
+typedef half  ggml_half;
+typedef half2 ggml_half2;
+
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_HIP)
+#include <hip/hip_fp16.h>
+#include <cstdint>
+
+typedef half  ggml_half;
+typedef half2 ggml_half2;
+
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_SYCL)
+#include <sycl/half_type.hpp>
+#include <cstdint>
+
+typedef sycl::half  ggml_half;
+typedef sycl::half2 ggml_half2;
+
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S data
+
+#define GGML_COMMON_DECL
+#endif
+
+#if defined(GGML_COMMON_DECL)
+
+#ifndef __cplusplus
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+#endif // __cplusplus
+
+// QK = number of values after dequantization
+// QK_K = super-block size
+
+#define QK_K 256
+#define K_SCALE_SIZE 12
+
+#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
+// QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
+
+#define QI4_0 (QK4_0 / (4 * QR4_0))
+#define QR4_0 2
+
+#define QI4_1 (QK4_1 / (4 * QR4_1))
+#define QR4_1 2
+
+#define QI5_0 (QK5_0 / (4 * QR5_0))
+#define QR5_0 2
+
+#define QI5_1 (QK5_1 / (4 * QR5_1))
+#define QR5_1 2
+
+#define QI8_0 (QK8_0 / (4 * QR8_0))
+#define QR8_0 1
+
+#define QI8_1 (QK8_1 / (4 * QR8_1))
+#define QR8_1 1
+
+#define QI2_K (QK_K / (4*QR2_K))
+#define QR2_K 4
+
+#define QI3_K (QK_K / (4*QR3_K))
+#define QR3_K 4
+
+#define QI4_K (QK_K / (4*QR4_K))
+#define QR4_K 2
+
+#define QI5_K (QK_K / (4*QR5_K))
+#define QR5_K 2
+
+#define QI6_K (QK_K / (4*QR6_K))
+#define QR6_K 2
+
+#define QI2_XXS (QK_K / (4*QR2_XXS))
+#define QR2_XXS 4
+
+#define QI2_XS (QK_K / (4*QR2_XS))
+#define QR2_XS 4
+
+#define QI2_S (QK_K / (4*QR2_S))
+#define QR2_S 4
+
+#define QI3_XXS (QK_K / (4*QR3_XXS))
+#define QR3_XXS 4
+
+#define QI3_XS (QK_K / (4*QR3_XS))
+#define QR3_XS 4
+
+#define QI1_S (QK_K / (4*QR1_S))
+#define QR1_S 8
+
+#define QI1_M (QK_K / (4*QR1_M))
+#define QR1_M 8
+
+#define QI4_NL (QK4_NL / (4*QR4_NL))
+#define QR4_NL 2
+
+#define QI4_XS (QK_K / (4*QR4_XS))
+#define QR4_XS 2
+
+#define QI3_S (QK_K / (4*QR3_S))
+#define QR3_S 4
+
+#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
+
+#define QK4_0 32
+typedef struct {
+    ggml_half d;           // delta
+    uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    union {
+        struct {
+            ggml_half d; // delta
+            ggml_half m; // min
+        } GGML_COMMON_AGGR_S;
+        ggml_half2 dm;
+    } GGML_COMMON_AGGR_U;
+    uint8_t qs[QK4_1 / 2]; // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+typedef struct {
+    ggml_half d;           // delta
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2]; // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    union {
+        struct {
+            ggml_half d; // delta
+            ggml_half m; // min
+        } GGML_COMMON_AGGR_S;
+        ggml_half2 dm;
+    } GGML_COMMON_AGGR_U;
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2]; // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    ggml_half d;       // delta
+    int8_t  qs[QK8_0]; // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+typedef struct {
+    union {
+        struct {
+            ggml_half d; // delta
+            ggml_half s; // d * sum(qs[i])
+        } GGML_COMMON_AGGR_S;
+        ggml_half2 ds;
+    } GGML_COMMON_AGGR_U;
+    int8_t qs[QK8_1]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
+
+//
+// Ternary quantization
+//
+
+// 1.6875 bpw
+typedef struct {
+    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
+    uint8_t qh[QK_K/64]; // 4 elements per byte
+    ggml_half d;
+} block_tq1_0;
+static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
+
+// 2.0625 bpw
+typedef struct {
+    uint8_t qs[QK_K/4]; // 2 bits per element
+    ggml_half d;
+} block_tq2_0;
+static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
+
+//
+// Super-block quantization structures
+//
+
+// 2-bit quantization
+// weight is represented as x = a * q + b
+// 16 blocks of 16 elements each
+// Effectively 2.625 bits per weight
+typedef struct {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    union {
+        struct {
+            ggml_half d;    // super-block scale for quantized scales
+            ggml_half dmin; // super-block scale for quantized mins
+        } GGML_COMMON_AGGR_S;
+        ggml_half2 dm;
+    } GGML_COMMON_AGGR_U;
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+// 3-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 3.4375 bits per weight
+typedef struct {
+    uint8_t hmask[QK_K/8]; // quants - high bit
+    uint8_t qs[QK_K/4];    // quants - low 2 bits
+    uint8_t scales[12];    // scales, quantized with 6 bits
+    ggml_half d;           // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+
+// 4-bit quantization
+// 8 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 4.5 bits per weight
+typedef struct {
+    union {
+        struct {
+            ggml_half d;    // super-block scale for quantized scales
+            ggml_half dmin; // super-block scale for quantized mins
+        } GGML_COMMON_AGGR_S;
+        ggml_half2 dm;
+    } GGML_COMMON_AGGR_U;
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];           // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+
+// 5-bit quantization
+// 8 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 5.5 bits per weight
+typedef struct {
+    union {
+        struct {
+            ggml_half d;    // super-block scale for quantized scales
+            ggml_half dmin; // super-block scale for quantized mins
+        } GGML_COMMON_AGGR_S;
+        ggml_half2 dm;
+    } GGML_COMMON_AGGR_U;
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    ggml_half d;             // super-block scale
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
+
+// This is only used for intermediate quantization and dot products
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK_K];       // quants
+    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
+} block_q8_K;
+static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
+
+// (Almost) "true" 2-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 2.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_half d;
+    uint16_t qs[QK_K/8];
+} block_iq2_xxs;
+static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+// 2.3125 bpw quants
+typedef struct {
+    ggml_half d;
+    uint16_t qs[QK_K/8];
+    uint8_t  scales[QK_K/32];
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
+
+// 2.5625 bpw quants
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK_K/4];
+    uint8_t qh[QK_K/32];
+    uint8_t scales[QK_K/32];
+} block_iq2_s;
+static_assert(sizeof(block_iq2_s) == sizeof(ggml_half) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
+
+// (Almost) "true" 3-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 3.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_half d;
+    uint8_t qs[3*QK_K/8];
+} block_iq3_xxs;
+static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
+
+// 3.4375 bpw
+#define IQ3S_N_SCALE QK_K/64
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK_K/4];
+    uint8_t qh[QK_K/32];
+    uint8_t signs[QK_K/8];
+    uint8_t scales[IQ3S_N_SCALE];
+} block_iq3_s;
+static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
+
+// 1.5625 bpw
+typedef struct {
+    ggml_half d;
+    uint8_t  qs[QK_K/8];
+    uint16_t qh[QK_K/32];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
+// 1.75 bpw
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
+// Non-linear quants
+#define QK4_NL 32
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
+typedef struct {
+    ggml_half d;
+    uint16_t scales_h;
+    uint8_t  scales_l[QK_K/64];
+    uint8_t  qs[QK_K/2];
+} block_iq4_xs;
+static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
+
+#endif // GGML_COMMON_DECL
+#endif // GGML_COMMON_DECL
+
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef GGML_COMMON_IMPL
+
+#if defined(GGML_COMMON_IMPL_C)
+#include <stdint.h>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_METAL)
+#include <metal_stdlib>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const constant type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_SYCL)
+
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#endif
+
+#if defined(GGML_COMMON_IMPL)
+
+GGML_TABLE_BEGIN(uint8_t, kmask_iq2xs, 8)
+    1, 2, 4, 8, 16, 32, 64, 128
+GGML_TABLE_END()
+
+GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
+      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
+    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
+    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
+     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
+    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
+     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
+     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
+    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
+GGML_TABLE_END()
+
+//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
+GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
+    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
+    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
+    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
+    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
+    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
+    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
+    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
+    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
+    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
+    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
+    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
+    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
+    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
+    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
+    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
+    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
+    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
+    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
+    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
+    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
+    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
+    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
+    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
+    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
+    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
+    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
+    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
+    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
+    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
+    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
+    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
+    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
+GGML_TABLE_END()
+//#endif
+
+
+GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
+    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
+    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
+    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
+    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
+    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
+    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
+    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
+    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
+    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
+    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
+    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
+    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
+    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
+    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
+    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
+    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
+    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
+    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
+    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
+    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
+    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
+    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
+    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
+    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
+    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
+    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
+    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
+    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
+    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
+    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
+    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
+    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
+    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
+    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
+    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
+    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
+    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
+    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
+    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
+    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
+    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
+    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
+    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
+    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
+    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
+    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
+    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
+    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
+    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
+    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
+    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
+    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
+    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
+    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
+    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
+    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
+    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
+    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
+    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
+    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
+    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
+    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
+GGML_TABLE_END()
+
+GGML_TABLE_BEGIN(uint64_t, iq2xs_grid, 512)
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
+    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
+    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
+    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
+    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
+    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
+    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
+    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
+    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
+    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
+    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
+    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
+    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
+    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
+    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
+    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
+    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
+    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
+    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
+    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
+    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
+    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
+    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
+    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
+    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
+    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
+    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
+    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
+    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
+    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
+    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
+    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
+    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
+    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
+    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
+    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
+    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
+    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
+    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
+    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
+    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
+    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
+    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
+    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
+    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
+    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
+    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
+    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
+    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
+    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
+    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
+    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
+    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
+    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
+    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
+    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
+    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
+    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
+    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
+    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
+    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
+    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
+    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
+    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
+    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
+    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
+    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
+    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
+    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
+    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
+    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
+    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
+    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
+    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
+    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
+    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
+    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
+    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
+    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
+    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
+    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
+    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
+    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
+    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
+    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
+    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
+    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
+    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
+    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
+    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
+    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
+    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
+    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
+    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
+    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
+    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
+    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
+    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
+    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
+    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
+    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
+    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
+    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
+    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
+    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
+    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
+    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
+    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
+    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
+    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
+    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
+    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
+    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
+    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
+    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
+    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
+    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
+    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
+    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
+    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
+    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
+    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
+    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
+    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
+    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
+    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
+    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
+GGML_TABLE_END()
+
+GGML_TABLE_BEGIN(uint64_t, iq2s_grid, 1024)
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
+    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
+    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
+    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
+    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
+    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
+    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
+    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
+    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
+    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
+    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
+    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
+    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
+    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
+    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
+    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
+    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
+    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
+    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
+    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
+    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
+    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
+    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
+    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
+    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
+    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
+    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
+    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
+    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
+    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
+    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
+    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
+    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
+    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
+    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
+    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
+    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
+    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
+    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
+    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
+    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
+    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
+    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
+    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
+    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
+    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
+    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
+    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
+    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
+    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
+    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
+    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
+    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
+    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
+    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
+    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
+    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
+    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
+    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
+    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
+    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
+    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
+    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
+    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
+    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
+    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
+    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
+    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
+    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
+    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
+    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
+    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
+    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
+    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
+    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
+    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
+    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
+    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
+    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
+    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
+    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
+    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
+    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
+    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
+    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
+    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
+    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
+    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
+    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
+    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
+    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
+    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
+    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
+    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
+    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
+    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
+    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
+    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
+    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
+    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
+    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
+    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
+    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
+    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
+    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
+    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
+    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
+    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
+    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
+    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
+    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
+    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
+    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
+    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
+    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
+    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
+    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
+    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
+    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
+    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
+    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
+    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
+    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
+    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
+    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
+    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
+    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
+    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
+    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
+    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
+    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
+    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
+    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
+    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
+    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
+    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
+    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
+    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
+    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
+    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
+    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
+    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
+    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
+    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
+    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
+    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
+    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
+    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
+    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
+    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
+    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
+    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
+    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
+    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
+    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
+    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
+    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
+    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
+    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
+    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
+    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
+    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
+    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
+    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
+    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
+    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
+    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
+    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
+    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
+    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
+    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
+    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
+    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
+    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
+    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
+    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
+    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
+    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
+    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
+    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
+    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
+    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
+    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
+    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
+    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
+    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
+    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
+    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
+    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
+    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
+    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
+    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
+    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
+    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
+    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
+    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
+    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
+    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
+    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
+    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
+    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
+    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
+    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
+    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
+    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
+    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
+    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
+    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
+    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
+    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
+    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
+    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
+    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
+    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
+    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
+    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
+    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
+    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
+    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
+    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
+    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
+    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
+    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
+    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
+    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
+    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
+    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
+    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
+    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
+    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
+    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
+    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
+    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
+    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
+    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
+    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
+    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
+    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
+    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
+    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
+    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
+    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
+    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
+    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
+    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
+    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
+    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
+    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
+    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
+    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
+    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
+    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
+    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
+    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
+    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
+GGML_TABLE_END()
+
+GGML_TABLE_BEGIN(uint32_t, iq3xxs_grid, 256)
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+GGML_TABLE_END()
+
+GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
+    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
+    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
+    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
+    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
+    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
+    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
+    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
+    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
+    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
+    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
+    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
+    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
+    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
+    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
+    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
+    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
+    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
+    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
+    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
+    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
+    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
+    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
+    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
+    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
+    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
+    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
+    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
+    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
+    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
+    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
+    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
+    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
+    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
+    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
+    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
+    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
+    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
+    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
+    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
+    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
+    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
+    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
+    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
+    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
+    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
+    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
+    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
+    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
+    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
+    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
+    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
+    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
+    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
+    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
+    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
+    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
+    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
+    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
+    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
+    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
+    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
+    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
+    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
+    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
+GGML_TABLE_END()
+
+#define NGRID_IQ1S 2048
+#define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
+#if defined(GGML_COMMON_IMPL_C)
+GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
+    0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
+    0xffffffffffff0101, 0xffffffffff00ff00, 0xffffffffff000000, 0xffffffffff01ffff,
+    0xffffffffff01ff01, 0xffffffffff0101ff, 0xffffffffff010101, 0xffffffff00ff0000,
+    0xffffffff0000ff00, 0xffffffff000000ff, 0xffffffff00000001, 0xffffffff00010000,
+    0xffffffff01ffffff, 0xffffffff01ffff01, 0xffffffff01ff01ff, 0xffffffff01ff0101,
+    0xffffffff01000000, 0xffffffff0101ffff, 0xffffffff0101ff01, 0xffffffff010101ff,
+    0xffffffff01010101, 0xffffff00ffff00ff, 0xffffff00ffff0000, 0xffffff00ff00ff00,
+    0xffffff00ff0000ff, 0xffffff00ff000001, 0xffffff00ff000100, 0xffffff00ff000101,
+    0xffffff00ff010000, 0xffffff0000ffff00, 0xffffff0000ff0001, 0xffffff0000ff0100,
+    0xffffff000000ff01, 0xffffff0000000000, 0xffffff0000000101, 0xffffff000001ff00,
+    0xffffff00000100ff, 0xffffff0000010001, 0xffffff00000101ff, 0xffffff0001ff0000,
+    0xffffff000100ff00, 0xffffff00010000ff, 0xffffff0001000001, 0xffffff0001010000,
+    0xffffff01ffffffff, 0xffffff01ffffff01, 0xffffff01ffff01ff, 0xffffff01ffff0101,
+    0xffffff01ff000000, 0xffffff01ff01ffff, 0xffffff01ff01ff01, 0xffffff01ff0101ff,
+    0xffffff01ff010101, 0xffffff0100ff0000, 0xffffff010000ff00, 0xffffff0100000100,
+    0xffffff01000100ff, 0xffffff0100010100, 0xffffff0101ffffff, 0xffffff0101ffff01,
+    0xffffff0101ff01ff, 0xffffff0101ff0101, 0xffffff010100ff00, 0xffffff0101000000,
+    0xffffff0101000100, 0xffffff010101ffff, 0xffffff010101ff01, 0xffffff01010101ff,
+    0xffffff0101010101, 0xffff00ffff00ff00, 0xffff00ffff0000ff, 0xffff00ffff000001,
+    0xffff00ffff010000, 0xffff00ff00ffff00, 0xffff00ff00ff0100, 0xffff00ff00000000,
+    0xffff00ff00000101, 0xffff00ff000100ff, 0xffff00ff00010000, 0xffff00ff0100ff00,
+    0xffff00ff01000100, 0xffff00ff01010000, 0xffff0000ffffff00, 0xffff0000ffff00ff,
+    0xffff0000ffff0000, 0xffff0000ffff0001, 0xffff0000ff000000, 0xffff0000ff0001ff,
+    0xffff0000ff000101, 0xffff0000ff010100, 0xffff000000ffffff, 0xffff000000ff0000,
+    0xffff000000ff0101, 0xffff00000000ffff, 0xffff00000000ff00, 0xffff0000000000ff,
+    0xffff000000000000, 0xffff000000000001, 0xffff000000000100, 0xffff00000001ffff,
+    0xffff00000001ff01, 0xffff000000010000, 0xffff0000000101ff, 0xffff000000010101,
+    0xffff000001ffff00, 0xffff00000100ff00, 0xffff000001000000, 0xffff0000010001ff,
+    0xffff000001000101, 0xffff00000101ff00, 0xffff0000010100ff, 0xffff000001010000,
+    0xffff000001010001, 0xffff000001010100, 0xffff0001ff0000ff, 0xffff0001ff000100,
+    0xffff000100ffff00, 0xffff000100ff00ff, 0xffff00010000ffff, 0xffff00010000ff01,
+    0xffff000100000000, 0xffff0001000001ff, 0xffff00010001ffff, 0xffff00010001ff00,
+    0xffff000100010001, 0xffff000100010100, 0xffff000101ff0000, 0xffff00010100ff00,
+    0xffff0001010000ff, 0xffff000101000100, 0xffff01ffffffffff, 0xffff01ffffffff01,
+    0xffff01ffffff01ff, 0xffff01ffffff0101, 0xffff01ffff000000, 0xffff01ffff01ffff,
+    0xffff01ffff01ff01, 0xffff01ffff0101ff, 0xffff01ffff010101, 0xffff01ff00ff0000,
+    0xffff01ff0000ff00, 0xffff01ff00000001, 0xffff01ff00010000, 0xffff01ff01ffffff,
+    0xffff01ff01ffff01, 0xffff01ff01ff01ff, 0xffff01ff01ff0101, 0xffff01ff01000000,
+    0xffff01ff0101ffff, 0xffff01ff0101ff01, 0xffff01ff010101ff, 0xffff01ff01010101,
+    0xffff0100ffff0000, 0xffff0100ff00ff00, 0xffff0100ff0000ff, 0xffff0100ff000100,
+    0xffff0100ff0100ff, 0xffff0100ff010000, 0xffff010000ffff00, 0xffff01000000ffff,
+    0xffff01000000ff00, 0xffff010000000000, 0xffff01000001ff00, 0xffff0100000100ff,
+    0xffff010000010100, 0xffff01000100ff00, 0xffff0100010000ff, 0xffff010001000001,
+    0xffff010001000100, 0xffff010001010000, 0xffff0101ffffffff, 0xffff0101ffffff01,
+    0xffff0101ffff01ff, 0xffff0101ffff0101, 0xffff0101ff000000, 0xffff0101ff01ffff,
+    0xffff0101ff01ff01, 0xffff0101ff0101ff, 0xffff0101ff010101, 0xffff010100ff0000,
+    0xffff01010000ff00, 0xffff010100000100, 0xffff01010001ff00, 0xffff010100010000,
+    0xffff010101ffffff, 0xffff010101ffff01, 0xffff010101ff0000, 0xffff010101ff01ff,
+    0xffff010101ff0101, 0xffff010101000000, 0xffff01010101ffff, 0xffff01010101ff01,
+    0xffff0101010101ff, 0xffff010101010101, 0xff00ffffff00ffff, 0xff00ffffff00ff00,
+    0xff00ffffff0000ff, 0xff00ffffff000100, 0xff00ffffff0100ff, 0xff00ffffff010000,
+    0xff00ffff00ffff00, 0xff00ffff00ff00ff, 0xff00ffff0000ffff, 0xff00ffff00000000,
+    0xff00ffff000001ff, 0xff00ffff0001ff00, 0xff00ffff000100ff, 0xff00ffff00010000,
+    0xff00ffff00010100, 0xff00ffff0100ff00, 0xff00ffff010000ff, 0xff00ffff01000001,
+    0xff00ffff0101ff00, 0xff00ffff01010000, 0xff00ff00ffffff00, 0xff00ff00ffff00ff,
+    0xff00ff00ffff0001, 0xff00ff00ffff0100, 0xff00ff00ff00ffff, 0xff00ff00ff00ff01,
+    0xff00ff00ff000000, 0xff00ff00ff0001ff, 0xff00ff00ff01ff00, 0xff00ff00ff0100ff,
+    0xff00ff00ff010100, 0xff00ff0000ff0000, 0xff00ff0000ff0101, 0xff00ff000000ffff,
+    0xff00ff000000ff00, 0xff00ff000000ff01, 0xff00ff00000000ff, 0xff00ff0000000000,
+    0xff00ff0000000001, 0xff00ff0000000100, 0xff00ff000001ffff, 0xff00ff0000010000,
+    0xff00ff0001ff00ff, 0xff00ff000100ff01, 0xff00ff0001000000, 0xff00ff000101ff00,
+    0xff00ff00010100ff, 0xff00ff01ff00ff00, 0xff00ff01ff0000ff, 0xff00ff01ff000001,
+    0xff00ff01ff010000, 0xff00ff0100ffffff, 0xff00ff0100ff0001, 0xff00ff0100ff0100,
+    0xff00ff010000ff01, 0xff00ff0100000000, 0xff00ff01000001ff, 0xff00ff0100000101,
+    0xff00ff01000100ff, 0xff00ff0100010001, 0xff00ff0101ff0000, 0xff00ff010100ff00,
+    0xff00ff01010000ff, 0xff00ff0101000001, 0xff00ff0101010000, 0xff0000ffffffff00,
+    0xff0000ffffff0001, 0xff0000ffffff0100, 0xff0000ffff0000ff, 0xff0000ffff000000,
+    0xff0000ffff0001ff, 0xff0000ffff000100, 0xff0000ffff01ff00, 0xff0000ffff010001,
+    0xff0000ff00ffff00, 0xff0000ff00ff0000, 0xff0000ff00ff0001, 0xff0000ff00ff01ff,
+    0xff0000ff00ff0101, 0xff0000ff0000ff00, 0xff0000ff000000ff, 0xff0000ff00000000,
+    0xff0000ff00000001, 0xff0000ff00000100, 0xff0000ff0001ff01, 0xff0000ff00010000,
+    0xff0000ff000101ff, 0xff0000ff01ff00ff, 0xff0000ff01ff0100, 0xff0000ff0100ffff,
+    0xff0000ff010000ff, 0xff0000ff01000000, 0xff0000ff010001ff, 0xff0000ff01000100,
+    0xff0000ff01000101, 0xff0000ff0101ff00, 0xff0000ff010100ff, 0xff0000ff01010000,
+    0xff0000ff01010100, 0xff000000ffffff01, 0xff000000ffff0000, 0xff000000ffff0101,
+    0xff000000ff00ff00, 0xff000000ff0000ff, 0xff000000ff000000, 0xff000000ff000001,
+    0xff000000ff000100, 0xff000000ff01ffff, 0xff000000ff01ff01, 0xff000000ff010000,
+    0xff000000ff0101ff, 0xff000000ff010101, 0xff00000000ffff00, 0xff00000000ff00ff,
+    0xff00000000ff0000, 0xff00000000ff0001, 0xff0000000000ff00, 0xff0000000000ff01,
+    0xff000000000000ff, 0xff00000000000000, 0xff00000000000001, 0xff00000000000100,
+    0xff00000000000101, 0xff0000000001ff00, 0xff000000000100ff, 0xff00000000010000,
+    0xff00000000010001, 0xff00000000010100, 0xff00000001ffffff, 0xff00000001ffff01,
+    0xff00000001ff00ff, 0xff00000001ff0000, 0xff00000001ff01ff, 0xff00000001ff0101,
+    0xff0000000100ffff, 0xff0000000100ff00, 0xff000000010000ff, 0xff00000001000000,
+    0xff00000001000001, 0xff00000001000100, 0xff00000001000101, 0xff0000000101ffff,
+    0xff0000000101ff01, 0xff00000001010000, 0xff000001ffffff00, 0xff000001ffff00ff,
+    0xff000001ffff0000, 0xff000001ffff0001, 0xff000001ff000000, 0xff000001ff000001,
+    0xff000001ff0001ff, 0xff000001ff000101, 0xff000001ff01ff00, 0xff000001ff010001,
+    0xff00000100ffffff, 0xff00000100ffff01, 0xff00000100ff00ff, 0xff00000100ff0000,
+    0xff00000100ff01ff, 0xff00000100ff0101, 0xff0000010000ff00, 0xff00000100000000,
+    0xff00000100000001, 0xff000001000001ff, 0xff00000100000100, 0xff0000010001ff00,
+    0xff000001000100ff, 0xff00000100010000, 0xff000001000101ff, 0xff00000100010100,
+    0xff00000100010101, 0xff00000101ff0001, 0xff00000101ff0101, 0xff0000010100ff01,
+    0xff00000101000000, 0xff000001010100ff, 0xff00000101010100, 0xff0001ffff00ff00,
+    0xff0001ffff000001, 0xff0001ffff010000, 0xff0001ff00ffff00, 0xff0001ff00ff00ff,
+    0xff0001ff00ff0001, 0xff0001ff00ff0100, 0xff0001ff0000ffff, 0xff0001ff00000000,
+    0xff0001ff000001ff, 0xff0001ff00000101, 0xff0001ff0001ffff, 0xff0001ff0001ff00,
+    0xff0001ff000100ff, 0xff0001ff00010001, 0xff0001ff00010100, 0xff0001ff01ff0000,
+    0xff0001ff0100ff00, 0xff0001ff010000ff, 0xff0001ff01010000, 0xff000100ff00ffff,
+    0xff000100ff00ff01, 0xff000100ff000000, 0xff000100ff000101, 0xff000100ff01ff00,
+    0xff000100ff010000, 0xff00010000ffff01, 0xff00010000ff00ff, 0xff00010000ff0000,
+    0xff00010000ff01ff, 0xff0001000000ff00, 0xff000100000000ff, 0xff00010000000000,
+    0xff00010000000001, 0xff00010000000100, 0xff00010000000101, 0xff0001000001ffff,
+    0xff00010000010000, 0xff00010000010101, 0xff00010001ff0100, 0xff0001000100ff00,
+    0xff0001000100ff01, 0xff00010001000000, 0xff000100010001ff, 0xff0001000101ff00,
+    0xff00010001010001, 0xff00010001010100, 0xff000101ffff0100, 0xff000101ff000001,
+    0xff000101ff0100ff, 0xff000101ff010001, 0xff00010100ff00ff, 0xff00010100ff0001,
+    0xff00010100ff0100, 0xff0001010000ffff, 0xff0001010000ff01, 0xff00010100000000,
+    0xff000101000001ff, 0xff0001010001ff00, 0xff00010100010001, 0xff00010100010100,
+    0xff00010101ff0000, 0xff0001010100ff00, 0xff00010101000001, 0xff00010101000101,
+    0xff01ffffffffffff, 0xff01ffffffffff01, 0xff01ffffffff01ff, 0xff01ffffffff0101,
+    0xff01ffffff000000, 0xff01ffffff01ffff, 0xff01ffffff01ff01, 0xff01ffffff010000,
+    0xff01ffffff0101ff, 0xff01ffffff010101, 0xff01ffff00ff0000, 0xff01ffff0000ff00,
+    0xff01ffff00000100, 0xff01ffff0001ff00, 0xff01ffff00010000, 0xff01ffff01ffffff,
+    0xff01ffff01ffff01, 0xff01ffff01ff01ff, 0xff01ffff01ff0101, 0xff01ffff01000000,
+    0xff01ffff0101ffff, 0xff01ffff0101ff01, 0xff01ffff01010000, 0xff01ffff010101ff,
+    0xff01ffff01010101, 0xff01ff00ffff0000, 0xff01ff00ff00ff00, 0xff01ff00ff0000ff,
+    0xff01ff00ff000100, 0xff01ff00ff010000, 0xff01ff0000ffff01, 0xff01ff0000ff00ff,
+    0xff01ff0000ff0100, 0xff01ff0000000000, 0xff01ff00000001ff, 0xff01ff0000000101,
+    0xff01ff000001ff00, 0xff01ff00000100ff, 0xff01ff0000010000, 0xff01ff0000010001,
+    0xff01ff0001ff0000, 0xff01ff000100ffff, 0xff01ff0001000001, 0xff01ff0001000100,
+    0xff01ff0001010000, 0xff01ff01ffffff00, 0xff01ff01ffff01ff, 0xff01ff01ffff0101,
+    0xff01ff01ff00ff00, 0xff01ff01ff000000, 0xff01ff01ff01ffff, 0xff01ff01ff01ff01,
+    0xff01ff01ff0101ff, 0xff01ff01ff010101, 0xff01ff0100ff0000, 0xff01ff010000ff00,
+    0xff01ff0100000001, 0xff01ff0100000100, 0xff01ff0100010000, 0xff01ff0101ffff00,
+    0xff01ff0101ff01ff, 0xff01ff0101ff0101, 0xff01ff010100ff00, 0xff01ff0101000000,
+    0xff01ff010101ffff, 0xff01ff010101ff01, 0xff01ff01010101ff, 0xff01ff0101010101,
+    0xff0100ffffff0000, 0xff0100ffff0000ff, 0xff0100ffff000001, 0xff0100ffff000100,
+    0xff0100ffff010000, 0xff0100ff00ff00ff, 0xff0100ff00ff0000, 0xff0100ff00ff0001,
+    0xff0100ff00ff0100, 0xff0100ff0000ff01, 0xff0100ff00000000, 0xff0100ff000001ff,
+    0xff0100ff00000101, 0xff0100ff00010001, 0xff0100ff01ff0000, 0xff0100ff0100ff00,
+    0xff0100ff010000ff, 0xff0100ff01000100, 0xff0100ff0101ff00, 0xff0100ff01010000,
+    0xff010000ffff0100, 0xff010000ff000000, 0xff010000ff01ff00, 0xff010000ff010100,
+    0xff01000000ffffff, 0xff01000000ff0000, 0xff01000000ff01ff, 0xff0100000000ff00,
+    0xff010000000000ff, 0xff01000000000000, 0xff01000000000100, 0xff0100000001ff01,
+    0xff01000000010000, 0xff010000000101ff, 0xff01000001ff0100, 0xff0100000100ffff,
+    0xff010000010000ff, 0xff01000001000000, 0xff010000010001ff, 0xff01000001000101,
+    0xff0100000101ff00, 0xff010000010100ff, 0xff01000001010001, 0xff01000001010100,
+    0xff010001ffff0000, 0xff010001ff00ffff, 0xff010001ff00ff01, 0xff010001ff000100,
+    0xff010001ff010000, 0xff01000100ffff00, 0xff01000100ff0100, 0xff01000100000000,
+    0xff0100010001ffff, 0xff0100010001ff00, 0xff01000100010100, 0xff01000101ff00ff,
+    0xff01000101ff0001, 0xff0100010100ffff, 0xff01000101000101, 0xff0101ffffffffff,
+    0xff0101ffffffff01, 0xff0101ffffff01ff, 0xff0101ffffff0101, 0xff0101ffff000000,
+    0xff0101ffff01ffff, 0xff0101ffff01ff01, 0xff0101ffff0101ff, 0xff0101ffff010101,
+    0xff0101ff00ff0000, 0xff0101ff0000ff00, 0xff0101ff000000ff, 0xff0101ff00010000,
+    0xff0101ff01ffffff, 0xff0101ff01ffff01, 0xff0101ff01ff01ff, 0xff0101ff01ff0101,
+    0xff0101ff0101ffff, 0xff0101ff0101ff01, 0xff0101ff010101ff, 0xff0101ff01010101,
+    0xff010100ffff0100, 0xff010100ff00ff00, 0xff010100ff0000ff, 0xff010100ff000100,
+    0xff010100ff010000, 0xff01010000ff0001, 0xff01010000ff0100, 0xff0101000000ff01,
+    0xff01010000000000, 0xff0101000001ff00, 0xff010100000100ff, 0xff01010000010001,
+    0xff01010000010100, 0xff01010001ff0000, 0xff0101000100ffff, 0xff01010001000001,
+    0xff01010001000100, 0xff010100010100ff, 0xff01010001010000, 0xff010101ffffffff,
+    0xff010101ffffff01, 0xff010101ffff01ff, 0xff010101ffff0101, 0xff010101ff01ffff,
+    0xff010101ff01ff01, 0xff010101ff0101ff, 0xff010101ff010101, 0xff01010100ff0000,
+    0xff0101010000ff00, 0xff01010100000001, 0xff01010100000100, 0xff01010100010000,
+    0xff01010101ffffff, 0xff01010101ffff01, 0xff01010101ff01ff, 0xff01010101ff0101,
+    0xff01010101000000, 0xff0101010101ffff, 0xff0101010101ff01, 0xff010101010101ff,
+    0xff01010101010101, 0x00ffffffffff0000, 0x00ffffffff00ff00, 0x00ffffffff000001,
+    0x00ffffffff010000, 0x00ffffff00ff0100, 0x00ffffff0000ff01, 0x00ffffff00000000,
+    0x00ffffff000001ff, 0x00ffffff00000101, 0x00ffffff0001ff00, 0x00ffffff000100ff,
+    0x00ffffff00010001, 0x00ffffff010000ff, 0x00ffffff01000100, 0x00ffffff0101ff00,
+    0x00ffffff01010001, 0x00ffff00ffffffff, 0x00ffff00ffffff00, 0x00ffff00ffff00ff,
+    0x00ffff00ffff0001, 0x00ffff00ffff0100, 0x00ffff00ff00ff01, 0x00ffff00ff000000,
+    0x00ffff00ff000001, 0x00ffff00ff0001ff, 0x00ffff00ff000101, 0x00ffff00ff01ff00,
+    0x00ffff00ff010001, 0x00ffff00ff010100, 0x00ffff0000ff0000, 0x00ffff0000ff01ff,
+    0x00ffff0000ff0101, 0x00ffff000000ff00, 0x00ffff00000000ff, 0x00ffff0000000000,
+    0x00ffff0000000001, 0x00ffff0000000100, 0x00ffff0000000101, 0x00ffff0000010000,
+    0x00ffff00000101ff, 0x00ffff0000010101, 0x00ffff0001ffff00, 0x00ffff0001ff00ff,
+    0x00ffff0001ff0001, 0x00ffff000100ffff, 0x00ffff000100ff01, 0x00ffff0001000000,
+    0x00ffff000101ffff, 0x00ffff000101ff00, 0x00ffff000101ff01, 0x00ffff01ffff0000,
+    0x00ffff01ff00ff00, 0x00ffff01ff0000ff, 0x00ffff01ff000001, 0x00ffff01ff010000,
+    0x00ffff0100ffff00, 0x00ffff010000ff01, 0x00ffff0100000000, 0x00ffff0100000101,
+    0x00ffff01000100ff, 0x00ffff0100010100, 0x00ffff0101ff0100, 0x00ffff01010000ff,
+    0x00ffff0101010000, 0x00ff00ffffffff00, 0x00ff00ffff000000, 0x00ff00ffff000100,
+    0x00ff00ffff010100, 0x00ff00ff00ff0000, 0x00ff00ff00ff01ff, 0x00ff00ff00ff0101,
+    0x00ff00ff0000ff00, 0x00ff00ff000000ff, 0x00ff00ff00000000, 0x00ff00ff00000001,
+    0x00ff00ff0001ff00, 0x00ff00ff0001ff01, 0x00ff00ff00010000, 0x00ff00ff000101ff,
+    0x00ff00ff00010101, 0x00ff00ff01ffff00, 0x00ff00ff01ff0001, 0x00ff00ff01ff0100,
+    0x00ff00ff0100ffff, 0x00ff00ff0100ff01, 0x00ff00ff01000000, 0x00ff00ff0101ffff,
+    0x00ff00ff0101ff00, 0x00ff00ff01010100, 0x00ff0000ffffff00, 0x00ff0000ffffff01,
+    0x00ff0000ffff0000, 0x00ff0000ffff0101, 0x00ff0000ff00ff00, 0x00ff0000ff0000ff,
+    0x00ff0000ff000000, 0x00ff0000ff000001, 0x00ff0000ff000100, 0x00ff0000ff01ffff,
+    0x00ff0000ff010000, 0x00ff0000ff010101, 0x00ff000000ffff00, 0x00ff000000ff00ff,
+    0x00ff000000ff0000, 0x00ff000000ff0001, 0x00ff000000ff0100, 0x00ff00000000ffff,
+    0x00ff00000000ff00, 0x00ff0000000000ff, 0x00ff000000000000, 0x00ff000000000001,
+    0x00ff0000000001ff, 0x00ff000000000100, 0x00ff00000001ff00, 0x00ff0000000100ff,
+    0x00ff000000010000, 0x00ff000000010001, 0x00ff000000010100, 0x00ff000001ffff01,
+    0x00ff000001ff00ff, 0x00ff000001ff0000, 0x00ff000001ff01ff, 0x00ff00000100ff00,
+    0x00ff0000010000ff, 0x00ff000001000000, 0x00ff000001000001, 0x00ff000001000100,
+    0x00ff000001000101, 0x00ff000001010000, 0x00ff0000010101ff, 0x00ff000001010101,
+    0x00ff0001ffffff00, 0x00ff0001ffff0000, 0x00ff0001ffff0100, 0x00ff0001ff0000ff,
+    0x00ff0001ff000000, 0x00ff0001ff0001ff, 0x00ff0001ff000101, 0x00ff0001ff01ff00,
+    0x00ff0001ff0100ff, 0x00ff0001ff010100, 0x00ff000100ffffff, 0x00ff000100ffff01,
+    0x00ff000100ff0000, 0x00ff000100ff01ff, 0x00ff00010000ffff, 0x00ff00010000ff00,
+    0x00ff00010000ff01, 0x00ff000100000000, 0x00ff000100000001, 0x00ff000100000100,
+    0x00ff00010001ff01, 0x00ff000100010000, 0x00ff0001000101ff, 0x00ff000101ffff00,
+    0x00ff000101ff0000, 0x00ff000101ff0101, 0x00ff0001010000ff, 0x00ff000101000000,
+    0x00ff00010101ff00, 0x00ff0001010100ff, 0x00ff000101010001, 0x00ff01ffffff0000,
+    0x00ff01ffff00ff00, 0x00ff01ffff000000, 0x00ff01ffff000101, 0x00ff01ffff010000,
+    0x00ff01ff00ffff01, 0x00ff01ff00ff0100, 0x00ff01ff0000ffff, 0x00ff01ff00000000,
+    0x00ff01ff000001ff, 0x00ff01ff0001ff00, 0x00ff01ff000100ff, 0x00ff01ff00010001,
+    0x00ff01ff00010100, 0x00ff01ff01ff0000, 0x00ff01ff0100ff00, 0x00ff01ff010000ff,
+    0x00ff01ff01000001, 0x00ff01ff01000100, 0x00ff01ff01010000, 0x00ff0100ffffff00,
+    0x00ff0100ffff0000, 0x00ff0100ffff0001, 0x00ff0100ffff0101, 0x00ff0100ff00ffff,
+    0x00ff0100ff0000ff, 0x00ff0100ff000000, 0x00ff0100ff0001ff, 0x00ff0100ff01ff00,
+    0x00ff0100ff0100ff, 0x00ff0100ff010001, 0x00ff010000ffffff, 0x00ff010000ff0000,
+    0x00ff010000ff0101, 0x00ff01000000ff00, 0x00ff01000000ff01, 0x00ff0100000000ff,
+    0x00ff010000000000, 0x00ff010000000001, 0x00ff010000000100, 0x00ff01000001ffff,
+    0x00ff01000001ff01, 0x00ff010000010000, 0x00ff010000010001, 0x00ff010000010101,
+    0x00ff010001ff0001, 0x00ff010001ff0100, 0x00ff01000100ff01, 0x00ff010001000000,
+    0x00ff010001000001, 0x00ff0100010001ff, 0x00ff01000101ff00, 0x00ff0100010100ff,
+    0x00ff010001010001, 0x00ff010001010100, 0x00ff0101ff000001, 0x00ff010100ff00ff,
+    0x00ff010100ff0001, 0x00ff010100ff0100, 0x00ff010100000000, 0x00ff0101000001ff,
+    0x00ff010100000101, 0x00ff0101000100ff, 0x00ff010100010100, 0x00ff0101010000ff,
+    0x00ff010101010000, 0x0000ffffffffff00, 0x0000ffffffff00ff, 0x0000ffffffff0000,
+    0x0000ffffffff0001, 0x0000ffffffff0100, 0x0000ffffff00ff01, 0x0000ffffff000000,
+    0x0000ffffff000101, 0x0000ffffff01ff00, 0x0000ffffff0100ff, 0x0000ffffff010100,
+    0x0000ffff00ffffff, 0x0000ffff00ff0000, 0x0000ffff00ff01ff, 0x0000ffff0000ff00,
+    0x0000ffff000000ff, 0x0000ffff00000000, 0x0000ffff00000001, 0x0000ffff00000100,
+    0x0000ffff00010000, 0x0000ffff000101ff, 0x0000ffff01ff0001, 0x0000ffff01ff0100,
+    0x0000ffff01000000, 0x0000ffff010001ff, 0x0000ffff0101ffff, 0x0000ffff0101ff00,
+    0x0000ffff01010001, 0x0000ffff01010100, 0x0000ff00ffff0000, 0x0000ff00ffff01ff,
+    0x0000ff00ffff0100, 0x0000ff00ffff0101, 0x0000ff00ff00ff00, 0x0000ff00ff0000ff,
+    0x0000ff00ff000000, 0x0000ff00ff000001, 0x0000ff00ff0001ff, 0x0000ff00ff000100,
+    0x0000ff00ff01ffff, 0x0000ff00ff010000, 0x0000ff00ff010001, 0x0000ff00ff0101ff,
+    0x0000ff00ff010101, 0x0000ff0000ffff00, 0x0000ff0000ff00ff, 0x0000ff0000ff0000,
+    0x0000ff0000ff0001, 0x0000ff0000ff0100, 0x0000ff000000ffff, 0x0000ff000000ff00,
+    0x0000ff000000ff01, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff00000001ff, 0x0000ff0000000100, 0x0000ff0000000101, 0x0000ff000001ff00,
+    0x0000ff00000100ff, 0x0000ff0000010000, 0x0000ff0000010001, 0x0000ff0000010100,
+    0x0000ff0001ffff01, 0x0000ff0001ff0000, 0x0000ff000100ff00, 0x0000ff00010000ff,
+    0x0000ff0001000000, 0x0000ff0001000001, 0x0000ff0001000100, 0x0000ff000101ffff,
+    0x0000ff0001010000, 0x0000ff0001010101, 0x0000ff01ffffff00, 0x0000ff01ffff0001,
+    0x0000ff01ff00ff01, 0x0000ff01ff000000, 0x0000ff01ff000101, 0x0000ff01ff01ff00,
+    0x0000ff01ff0100ff, 0x0000ff0100ffff01, 0x0000ff0100ff0000, 0x0000ff0100ff0101,
+    0x0000ff010000ff00, 0x0000ff01000000ff, 0x0000ff0100000000, 0x0000ff0100000001,
+    0x0000ff0100000100, 0x0000ff010001ff01, 0x0000ff0100010000, 0x0000ff0101ff0000,
+    0x0000ff010100ffff, 0x0000ff010100ff01, 0x0000ff0101000000, 0x0000ff0101000100,
+    0x0000ff0101000101, 0x0000ff01010100ff, 0x000000ffffff00ff, 0x000000ffffff0000,
+    0x000000ffff00ff00, 0x000000ffff0000ff, 0x000000ffff000000, 0x000000ffff000001,
+    0x000000ffff0001ff, 0x000000ffff000100, 0x000000ffff01ff00, 0x000000ffff010000,
+    0x000000ffff0101ff, 0x000000ffff010101, 0x000000ff00ffff00, 0x000000ff00ff00ff,
+    0x000000ff00ff0000, 0x000000ff00ff0001, 0x000000ff00ff0100, 0x000000ff00ff0101,
+    0x000000ff0000ffff, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff000001ff, 0x000000ff00000100, 0x000000ff00000101,
+    0x000000ff0001ff00, 0x000000ff0001ff01, 0x000000ff000100ff, 0x000000ff00010000,
+    0x000000ff00010001, 0x000000ff00010100, 0x000000ff01ffffff, 0x000000ff01ff01ff,
+    0x000000ff01ff0101, 0x000000ff0100ff00, 0x000000ff010000ff, 0x000000ff01000000,
+    0x000000ff01000001, 0x000000ff01000100, 0x000000ff0101ff00, 0x000000ff010100ff,
+    0x000000ff01010000, 0x000000ff01010101, 0x00000000ffffff00, 0x00000000ffffff01,
+    0x00000000ffff00ff, 0x00000000ffff0000, 0x00000000ffff0001, 0x00000000ffff0100,
+    0x00000000ff00ffff, 0x00000000ff00ff00, 0x00000000ff00ff01, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff000101,
+    0x00000000ff01ff00, 0x00000000ff0100ff, 0x00000000ff010000, 0x00000000ff010001,
+    0x00000000ff010100, 0x0000000000ffffff, 0x0000000000ffff00, 0x0000000000ffff01,
+    0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, 0x0000000000ff01ff,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000000, 0x0000000000000001, 0x00000000000001ff,
+    0x0000000000000100, 0x0000000000000101, 0x000000000001ffff, 0x000000000001ff00,
+    0x00000000000100ff, 0x0000000000010000, 0x0000000000010001, 0x00000000000101ff,
+    0x0000000000010100, 0x0000000000010101, 0x0000000001ffff00, 0x0000000001ff00ff,
+    0x0000000001ff0000, 0x0000000001ff0100, 0x0000000001ff0101, 0x000000000100ffff,
+    0x000000000100ff00, 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001,
+    0x00000000010001ff, 0x0000000001000100, 0x000000000101ff00, 0x00000000010100ff,
+    0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x00000001ffffffff,
+    0x00000001ffffff00, 0x00000001ffffff01, 0x00000001ffff00ff, 0x00000001ffff0001,
+    0x00000001ffff01ff, 0x00000001ffff0100, 0x00000001ff00ff00, 0x00000001ff0000ff,
+    0x00000001ff000000, 0x00000001ff0001ff, 0x00000001ff000100, 0x00000001ff01ffff,
+    0x00000001ff01ff00, 0x00000001ff01ff01, 0x00000001ff0100ff, 0x00000001ff010000,
+    0x00000001ff010001, 0x00000001ff0101ff, 0x00000001ff010100, 0x0000000100ffff00,
+    0x0000000100ff0000, 0x0000000100ff0001, 0x0000000100ff01ff, 0x0000000100ff0100,
+    0x0000000100ff0101, 0x000000010000ffff, 0x000000010000ff00, 0x000000010000ff01,
+    0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, 0x00000001000001ff,
+    0x0000000100000100, 0x0000000100000101, 0x000000010001ff00, 0x00000001000100ff,
+    0x0000000100010000, 0x0000000100010100, 0x0000000101ffff01, 0x0000000101ff0000,
+    0x0000000101ff0001, 0x0000000101ff01ff, 0x0000000101ff0100, 0x0000000101ff0101,
+    0x000000010100ff00, 0x0000000101000000, 0x0000000101000101, 0x000000010101ff01,
+    0x0000000101010000, 0x0000000101010001, 0x00000001010101ff, 0x0000000101010100,
+    0x000001ffffff00ff, 0x000001ffffff0000, 0x000001ffffff0001, 0x000001ffffff0100,
+    0x000001ffff00ffff, 0x000001ffff000000, 0x000001ffff0001ff, 0x000001ffff01ff00,
+    0x000001ffff010101, 0x000001ff00ff0000, 0x000001ff00ff01ff, 0x000001ff00ff0101,
+    0x000001ff0000ff00, 0x000001ff000000ff, 0x000001ff00000000, 0x000001ff00000001,
+    0x000001ff000001ff, 0x000001ff00000100, 0x000001ff0001ffff, 0x000001ff0001ff01,
+    0x000001ff000100ff, 0x000001ff00010000, 0x000001ff01ffff01, 0x000001ff01ff0100,
+    0x000001ff0100ffff, 0x000001ff0100ff01, 0x000001ff01000000, 0x000001ff010001ff,
+    0x000001ff0101ff00, 0x000001ff01010100, 0x00000100ffffff00, 0x00000100ffffff01,
+    0x00000100ffff0000, 0x00000100ffff0101, 0x00000100ff00ff00, 0x00000100ff0000ff,
+    0x00000100ff000000, 0x00000100ff000001, 0x00000100ff000100, 0x00000100ff010000,
+    0x0000010000ffff00, 0x0000010000ff00ff, 0x0000010000ff0000, 0x0000010000ff0001,
+    0x0000010000ff0100, 0x000001000000ffff, 0x000001000000ff00, 0x000001000000ff01,
+    0x00000100000000ff, 0x0000010000000000, 0x0000010000000001, 0x00000100000001ff,
+    0x0000010000000100, 0x0000010000000101, 0x000001000001ff00, 0x00000100000100ff,
+    0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010001ffff00,
+    0x0000010001ff0000, 0x0000010001ff0100, 0x000001000100ff00, 0x00000100010000ff,
+    0x0000010001000000, 0x0000010001000001, 0x00000100010001ff, 0x0000010001000100,
+    0x0000010001010000, 0x00000101ffff00ff, 0x00000101ffff01ff, 0x00000101ff000000,
+    0x00000101ff000101, 0x00000101ff01ffff, 0x00000101ff010000, 0x00000101ff010001,
+    0x00000101ff010100, 0x0000010100ff0000, 0x0000010100ff01ff, 0x0000010100ff0100,
+    0x000001010000ff00, 0x0000010100000000, 0x0000010100000001, 0x00000101000001ff,
+    0x0000010100000100, 0x000001010001ff01, 0x0000010100010000, 0x00000101000101ff,
+    0x0000010100010101, 0x0000010101ffff00, 0x0000010101ff0101, 0x000001010100ff01,
+    0x0000010101000000, 0x0000010101000001, 0x00000101010001ff, 0x0000010101000101,
+    0x000001010101ff00, 0x0001ffffffff0000, 0x0001ffffff0000ff, 0x0001ffffff000001,
+    0x0001ffffff000100, 0x0001ffffff010000, 0x0001ffff00ff00ff, 0x0001ffff0000ffff,
+    0x0001ffff00000000, 0x0001ffff00000001, 0x0001ffff000001ff, 0x0001ffff00000101,
+    0x0001ffff0001ff00, 0x0001ffff000100ff, 0x0001ffff00010001, 0x0001ffff00010100,
+    0x0001ffff01ffff00, 0x0001ffff01000001, 0x0001ffff01010000, 0x0001ff00ffffff00,
+    0x0001ff00ffff00ff, 0x0001ff00ffff0001, 0x0001ff00ffff0100, 0x0001ff00ff00ff01,
+    0x0001ff00ff000000, 0x0001ff00ff01ff00, 0x0001ff00ff01ff01, 0x0001ff00ff010001,
+    0x0001ff00ff010100, 0x0001ff0000ff0000, 0x0001ff0000ff0100, 0x0001ff000000ff00,
+    0x0001ff0000000000, 0x0001ff0000000001, 0x0001ff0000000100, 0x0001ff0000010000,
+    0x0001ff0000010001, 0x0001ff0000010101, 0x0001ff0001ff00ff, 0x0001ff0001ff0101,
+    0x0001ff000100ff01, 0x0001ff0001000000, 0x0001ff000101ff00, 0x0001ff0001010001,
+    0x0001ff0001010100, 0x0001ff01ff00ff00, 0x0001ff01ff000001, 0x0001ff01ff000100,
+    0x0001ff0100ffffff, 0x0001ff0100ffff00, 0x0001ff0100ff0001, 0x0001ff0100000000,
+    0x0001ff0100000001, 0x0001ff01000001ff, 0x0001ff010001ffff, 0x0001ff0101ff0000,
+    0x0001ff010100ff00, 0x0001ff0101000001, 0x0001ff0101010000, 0x000100ffff00ff00,
+    0x000100ffff00ff01, 0x000100ffff000000, 0x000100ffff000001, 0x000100ffff000101,
+    0x000100ffff01ff00, 0x000100ffff010001, 0x000100ffff010100, 0x000100ff00ffffff,
+    0x000100ff00ffff01, 0x000100ff00ff0000, 0x000100ff00ff01ff, 0x000100ff00ff0101,
+    0x000100ff0000ff00, 0x000100ff000000ff, 0x000100ff00000000, 0x000100ff00000001,
+    0x000100ff00000100, 0x000100ff00000101, 0x000100ff0001ffff, 0x000100ff0001ff01,
+    0x000100ff00010000, 0x000100ff01ff00ff, 0x000100ff01ff0000, 0x000100ff01ff0100,
+    0x000100ff0100ffff, 0x000100ff0100ff01, 0x000100ff010000ff, 0x000100ff01000000,
+    0x000100ff01000001, 0x000100ff010001ff, 0x000100ff01000101, 0x000100ff0101ff00,
+    0x000100ff010100ff, 0x000100ff01010100, 0x00010000ffff0000, 0x00010000ffff01ff,
+    0x00010000ffff0101, 0x00010000ff00ff00, 0x00010000ff000000, 0x00010000ff000001,
+    0x00010000ff000100, 0x0001000000ff00ff, 0x0001000000ff0000, 0x0001000000ff0001,
+    0x0001000000ff0100, 0x000100000000ffff, 0x000100000000ff00, 0x00010000000000ff,
+    0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x000100000001ff00,
+    0x00010000000100ff, 0x0001000000010000, 0x0001000000010001, 0x0001000000010100,
+    0x0001000001ff0001, 0x0001000001ff0100, 0x0001000001ff0101, 0x000100000100ff00,
+    0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101,
+    0x000100000101ff01, 0x0001000001010000, 0x0001000001010001, 0x00010000010101ff,
+    0x00010001ffffff01, 0x00010001ffff0100, 0x00010001ff000000, 0x00010001ff01ffff,
+    0x00010001ff010001, 0x00010001ff0101ff, 0x00010001ff010100, 0x0001000100ffffff,
+    0x0001000100ff0000, 0x0001000100ff01ff, 0x0001000100ff0101, 0x000100010000ff00,
+    0x00010001000000ff, 0x0001000100000000, 0x0001000100000001, 0x00010001000001ff,
+    0x0001000100000101, 0x000100010001ffff, 0x0001000100010000, 0x00010001000101ff,
+    0x0001000101ffffff, 0x0001000101ffff01, 0x0001000101ff0000, 0x0001000101ff0101,
+    0x00010001010000ff, 0x0001000101000001, 0x00010001010001ff, 0x0001000101000100,
+    0x000100010101ffff, 0x00010001010100ff, 0x0001000101010001, 0x0001000101010101,
+    0x000101ffff000001, 0x000101ffff000100, 0x000101ffff010000, 0x000101ff00ffff00,
+    0x000101ff0000ff01, 0x000101ff00000000, 0x000101ff00000101, 0x000101ff0001ff00,
+    0x000101ff00010100, 0x000101ff01ff0000, 0x000101ff0100ff00, 0x000101ff010001ff,
+    0x000101ff01010001, 0x00010100ffffff00, 0x00010100ffff00ff, 0x00010100ff00ffff,
+    0x00010100ff000000, 0x00010100ff01ff00, 0x00010100ff0100ff, 0x00010100ff010001,
+    0x00010100ff010100, 0x0001010000ffffff, 0x0001010000ffff00, 0x0001010000ff0000,
+    0x0001010000ff0001, 0x0001010000ff01ff, 0x000101000000ff00, 0x00010100000000ff,
+    0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x000101000001ffff,
+    0x0001010000010000, 0x0001010000010101, 0x0001010001ffff01, 0x0001010001ff00ff,
+    0x0001010001ff0101, 0x0001010001000000, 0x000101000101ff00, 0x00010100010100ff,
+    0x0001010001010000, 0x0001010001010100, 0x00010101ff00ff00, 0x00010101ff000001,
+    0x00010101ff0001ff, 0x0001010100ffff00, 0x0001010100ff00ff, 0x0001010100ff0100,
+    0x000101010000ffff, 0x0001010100000000, 0x00010101000001ff, 0x0001010100000101,
+    0x00010101000100ff, 0x0001010100010000, 0x0001010100010100, 0x0001010101ff0001,
+    0x00010101010000ff, 0x00010101010001ff, 0x0001010101000101, 0x0001010101010001,
+    0x01ffffffffffffff, 0x01ffffffffffff01, 0x01ffffffffff01ff, 0x01ffffffffff0101,
+    0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, 0x01ffffffff010101,
+    0x01ffffff00ff0000, 0x01ffffff0000ffff, 0x01ffffff0000ff00, 0x01ffffff000000ff,
+    0x01ffffff00000001, 0x01ffffff00000100, 0x01ffffff00010000, 0x01ffffff01ffffff,
+    0x01ffffff01ffff01, 0x01ffffff01ff01ff, 0x01ffffff01ff0101, 0x01ffffff01000000,
+    0x01ffffff0101ffff, 0x01ffffff0101ff01, 0x01ffffff010101ff, 0x01ffffff01010101,
+    0x01ffff00ffff0000, 0x01ffff00ff00ff00, 0x01ffff00ff0000ff, 0x01ffff00ff000001,
+    0x01ffff00ff000100, 0x01ffff00ff010000, 0x01ffff0000ffff00, 0x01ffff0000ff00ff,
+    0x01ffff0000ff0100, 0x01ffff000000ffff, 0x01ffff000000ff01, 0x01ffff0000000000,
+    0x01ffff0000000001, 0x01ffff00000001ff, 0x01ffff0000000100, 0x01ffff00000100ff,
+    0x01ffff0000010001, 0x01ffff0000010100, 0x01ffff0001ff0000, 0x01ffff0001ff0100,
+    0x01ffff00010000ff, 0x01ffff0001000001, 0x01ffff0001000100, 0x01ffff0001010000,
+    0x01ffff01ffffffff, 0x01ffff01ffffff01, 0x01ffff01ffff01ff, 0x01ffff01ffff0101,
+    0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff01ff01, 0x01ffff01ff0101ff,
+    0x01ffff01ff010101, 0x01ffff010000ff00, 0x01ffff01000000ff, 0x01ffff0100000100,
+    0x01ffff0100010000, 0x01ffff0101ffffff, 0x01ffff0101ffff01, 0x01ffff0101ff01ff,
+    0x01ffff0101ff0101, 0x01ffff0101000000, 0x01ffff010101ffff, 0x01ffff010101ff01,
+    0x01ffff01010101ff, 0x01ffff0101010101, 0x01ff00ffff0000ff, 0x01ff00ffff000100,
+    0x01ff00ff00ffff00, 0x01ff00ff00ff00ff, 0x01ff00ff0000ff00, 0x01ff00ff00000000,
+    0x01ff00ff00000101, 0x01ff00ff0001ff00, 0x01ff00ff000100ff, 0x01ff00ff00010100,
+    0x01ff00ff010000ff, 0x01ff00ff01000100, 0x01ff0000ffffff00, 0x01ff0000ffff0100,
+    0x01ff0000ff00ff01, 0x01ff0000ff000000, 0x01ff0000ff000101, 0x01ff0000ff010001,
+    0x01ff0000ff010100, 0x01ff000000ffffff, 0x01ff000000ffff00, 0x01ff000000ff0000,
+    0x01ff000000ff01ff, 0x01ff00000000ff00, 0x01ff0000000000ff, 0x01ff000000000000,
+    0x01ff000000000001, 0x01ff000000000100, 0x01ff000000000101, 0x01ff000000010000,
+    0x01ff000000010001, 0x01ff0000000101ff, 0x01ff000000010101, 0x01ff000001ffff00,
+    0x01ff000001ff00ff, 0x01ff000001ff0001, 0x01ff000001ff0100, 0x01ff00000100ffff,
+    0x01ff00000100ff01, 0x01ff000001000000, 0x01ff0000010001ff, 0x01ff000001010001,
+    0x01ff0001ff00ff00, 0x01ff0001ff000001, 0x01ff0001ff000100, 0x01ff0001ff010000,
+    0x01ff000100ffff00, 0x01ff000100ff00ff, 0x01ff000100ff0100, 0x01ff000100ff0101,
+    0x01ff00010000ffff, 0x01ff000100000000, 0x01ff000100000100, 0x01ff000100000101,
+    0x01ff00010001ff00, 0x01ff000100010001, 0x01ff000100010101, 0x01ff000101ff0000,
+    0x01ff00010100ff00, 0x01ff000101000101, 0x01ff0001010100ff, 0x01ff01ffffffffff,
+    0x01ff01ffffffff01, 0x01ff01ffffff01ff, 0x01ff01ffffff0101, 0x01ff01ffff000000,
+    0x01ff01ffff01ffff, 0x01ff01ffff01ff01, 0x01ff01ffff0101ff, 0x01ff01ffff010101,
+    0x01ff01ff00ffff00, 0x01ff01ff00ff0000, 0x01ff01ff0000ff00, 0x01ff01ff000000ff,
+    0x01ff01ff00000100, 0x01ff01ff00010000, 0x01ff01ff00010100, 0x01ff01ff01ffffff,
+    0x01ff01ff01ffff01, 0x01ff01ff01ff01ff, 0x01ff01ff01ff0101, 0x01ff01ff01000000,
+    0x01ff01ff0101ffff, 0x01ff01ff0101ff01, 0x01ff01ff010101ff, 0x01ff01ff01010101,
+    0x01ff0100ffff0000, 0x01ff0100ffff0001, 0x01ff0100ff00ff00, 0x01ff0100ff0000ff,
+    0x01ff0100ff000001, 0x01ff0100ff010000, 0x01ff010000ffff00, 0x01ff010000ff00ff,
+    0x01ff010000ff0001, 0x01ff010000ff0100, 0x01ff01000000ffff, 0x01ff01000000ff01,
+    0x01ff010000000000, 0x01ff010000000101, 0x01ff01000001ff00, 0x01ff0100000100ff,
+    0x01ff010001ff0000, 0x01ff010001000001, 0x01ff010001000100, 0x01ff010001010000,
+    0x01ff0101ffffffff, 0x01ff0101ffffff01, 0x01ff0101ffff01ff, 0x01ff0101ffff0101,
+    0x01ff0101ff000000, 0x01ff0101ff01ffff, 0x01ff0101ff01ff01, 0x01ff0101ff0101ff,
+    0x01ff0101ff010101, 0x01ff010100ff0000, 0x01ff01010000ff00, 0x01ff0101000000ff,
+    0x01ff010100000001, 0x01ff010101ffffff, 0x01ff010101ffff01, 0x01ff010101ff01ff,
+    0x01ff010101ff0101, 0x01ff010101000000, 0x01ff01010101ffff, 0x01ff01010101ff01,
+    0x01ff0101010101ff, 0x01ff010101010101, 0x0100ffffffff0000, 0x0100ffffff00ff00,
+    0x0100ffffff000001, 0x0100ffffff0001ff, 0x0100ffffff000100, 0x0100ffffff010000,
+    0x0100ffff00ffff00, 0x0100ffff00ff0001, 0x0100ffff00ff0100, 0x0100ffff00000000,
+    0x0100ffff000001ff, 0x0100ffff00000101, 0x0100ffff00010100, 0x0100ffff00010101,
+    0x0100ffff01ff0000, 0x0100ffff0100ff00, 0x0100ffff010000ff, 0x0100ffff01000001,
+    0x0100ffff01000100, 0x0100ffff01010000, 0x0100ff00ffffff00, 0x0100ff00ffff00ff,
+    0x0100ff00ffff0001, 0x0100ff00ffff0100, 0x0100ff00ff00ffff, 0x0100ff00ff000000,
+    0x0100ff00ff0001ff, 0x0100ff00ff000101, 0x0100ff00ff01ff00, 0x0100ff00ff0100ff,
+    0x0100ff00ff010001, 0x0100ff00ff010100, 0x0100ff0000ffffff, 0x0100ff0000ff0000,
+    0x0100ff000000ffff, 0x0100ff000000ff00, 0x0100ff00000000ff, 0x0100ff0000000000,
+    0x0100ff0000000001, 0x0100ff0000000100, 0x0100ff000001ff01, 0x0100ff0000010000,
+    0x0100ff0001ff00ff, 0x0100ff0001ff0001, 0x0100ff000100ff01, 0x0100ff0001000000,
+    0x0100ff00010001ff, 0x0100ff000101ff00, 0x0100ff00010100ff, 0x0100ff0001010001,
+    0x0100ff0001010100, 0x0100ff01ffff0000, 0x0100ff01ff00ff00, 0x0100ff01ff0000ff,
+    0x0100ff01ff000100, 0x0100ff01ff010000, 0x0100ff0100ff00ff, 0x0100ff0100ff0001,
+    0x0100ff0100ff0100, 0x0100ff010000ffff, 0x0100ff010000ff01, 0x0100ff0100000000,
+    0x0100ff01000001ff, 0x0100ff0100010001, 0x0100ff0100010100, 0x0100ff0101ff0000,
+    0x0100ff01010000ff, 0x0100ff0101000001, 0x0100ff0101010100, 0x010000ffffffff00,
+    0x010000ffffff00ff, 0x010000ffffff0001, 0x010000ffff00ffff, 0x010000ffff000000,
+    0x010000ffff0001ff, 0x010000ffff010001, 0x010000ff00ffffff, 0x010000ff00ff0101,
+    0x010000ff0000ff00, 0x010000ff000000ff, 0x010000ff00000000, 0x010000ff00000001,
+    0x010000ff000001ff, 0x010000ff00000100, 0x010000ff0001ffff, 0x010000ff0001ff00,
+    0x010000ff0001ff01, 0x010000ff00010000, 0x010000ff01ff00ff, 0x010000ff01ff0001,
+    0x010000ff0100ff01, 0x010000ff010000ff, 0x010000ff01000000, 0x010000ff010001ff,
+    0x010000ff0101ff00, 0x010000ff01010100, 0x01000000ffffffff, 0x01000000ffff0000,
+    0x01000000ffff01ff, 0x01000000ffff0101, 0x01000000ff00ffff, 0x01000000ff00ff00,
+    0x01000000ff0000ff, 0x01000000ff000000, 0x01000000ff000001, 0x01000000ff000100,
+    0x01000000ff01ff00, 0x01000000ff010000, 0x01000000ff010100, 0x01000000ff010101,
+    0x0100000000ffff00, 0x0100000000ff00ff, 0x0100000000ff0000, 0x0100000000ff0001,
+    0x0100000000ff0100, 0x010000000000ffff, 0x010000000000ff00, 0x010000000000ff01,
+    0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, 0x01000000000001ff,
+    0x0100000000000100, 0x0100000000000101, 0x010000000001ff00, 0x01000000000100ff,
+    0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000001ffff00,
+    0x0100000001ff0000, 0x0100000001ff01ff, 0x010000000100ff00, 0x010000000100ff01,
+    0x01000000010000ff, 0x0100000001000000, 0x0100000001000001, 0x0100000001000100,
+    0x0100000001000101, 0x010000000101ffff, 0x010000000101ff01, 0x0100000001010000,
+    0x01000000010101ff, 0x0100000001010101, 0x01000001ffffff00, 0x01000001ffff00ff,
+    0x01000001ff00ffff, 0x01000001ff000000, 0x01000001ff000100, 0x01000001ff01ffff,
+    0x01000001ff010001, 0x01000001ff010100, 0x0100000100ff0000, 0x0100000100ff01ff,
+    0x0100000100ff0100, 0x010000010000ff00, 0x010000010000ff01, 0x0100000100000000,
+    0x0100000100000001, 0x0100000100000100, 0x0100000100010000, 0x01000001000101ff,
+    0x0100000101ffff01, 0x0100000101ff00ff, 0x0100000101ff0100, 0x0100000101ff0101,
+    0x010000010100ff01, 0x01000001010000ff, 0x0100000101000000, 0x01000001010100ff,
+    0x0100000101010001, 0x0100000101010100, 0x010001ffffff0000, 0x010001ffff000001,
+    0x010001ffff000100, 0x010001ffff010000, 0x010001ff00ffff00, 0x010001ff00ff0001,
+    0x010001ff0000ffff, 0x010001ff0000ff01, 0x010001ff00000000, 0x010001ff00000001,
+    0x010001ff00000101, 0x010001ff000100ff, 0x010001ff00010000, 0x010001ff01ff0000,
+    0x010001ff0100ff00, 0x010001ff01000001, 0x010001ff01000100, 0x010001ff01010000,
+    0x01000100ffff00ff, 0x01000100ffff0001, 0x01000100ffff0100, 0x01000100ff00ffff,
+    0x01000100ff00ff01, 0x01000100ff000000, 0x01000100ff0001ff, 0x01000100ff000101,
+    0x01000100ff01ffff, 0x01000100ff01ff00, 0x01000100ff0100ff, 0x01000100ff010001,
+    0x0100010000ffffff, 0x0100010000ffff01, 0x0100010000ff0000, 0x0100010000ff01ff,
+    0x0100010000ff0101, 0x010001000000ff00, 0x01000100000000ff, 0x0100010000000000,
+    0x0100010000000001, 0x0100010000000100, 0x010001000001ff01, 0x0100010000010000,
+    0x0100010000010001, 0x0100010000010101, 0x0100010001ffff00, 0x0100010001ff00ff,
+    0x010001000100ffff, 0x010001000100ff01, 0x0100010001000000, 0x0100010001000101,
+    0x010001000101ff00, 0x0100010001010001, 0x01000101ffff0000, 0x01000101ff000000,
+    0x01000101ff010000, 0x0100010100ff00ff, 0x0100010100ff0001, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010100000000, 0x01000101000001ff, 0x010001010001ff00,
+    0x0100010101ff0000, 0x010001010100ff00, 0x01000101010000ff, 0x0100010101000000,
+    0x0100010101000001, 0x0101ffffffffffff, 0x0101ffffffffff01, 0x0101ffffffff01ff,
+    0x0101ffffffff0101, 0x0101ffffff000000, 0x0101ffffff01ffff, 0x0101ffffff01ff01,
+    0x0101ffffff0101ff, 0x0101ffffff010101, 0x0101ffff00ff0000, 0x0101ffff0000ff00,
+    0x0101ffff000000ff, 0x0101ffff00000001, 0x0101ffff00000100, 0x0101ffff01ffffff,
+    0x0101ffff01ffff01, 0x0101ffff01ff01ff, 0x0101ffff01ff0101, 0x0101ffff01000000,
+    0x0101ffff0101ffff, 0x0101ffff0101ff01, 0x0101ffff010101ff, 0x0101ffff01010101,
+    0x0101ff00ffff0000, 0x0101ff00ffff0100, 0x0101ff00ff00ff00, 0x0101ff00ff0000ff,
+    0x0101ff00ff000001, 0x0101ff00ff000100, 0x0101ff00ff000101, 0x0101ff0000ff0001,
+    0x0101ff0000ff0100, 0x0101ff000000ff00, 0x0101ff0000000000, 0x0101ff00000001ff,
+    0x0101ff0000000101, 0x0101ff000001ff00, 0x0101ff00000100ff, 0x0101ff0001ff0000,
+    0x0101ff000100ffff, 0x0101ff000100ff01, 0x0101ff0001000001, 0x0101ff0001000100,
+    0x0101ff01ffffff01, 0x0101ff01ffff01ff, 0x0101ff01ffff0101, 0x0101ff01ff00ffff,
+    0x0101ff01ff000100, 0x0101ff01ff01ff01, 0x0101ff01ff0101ff, 0x0101ff01ff010101,
+    0x0101ff0100ff0000, 0x0101ff010000ff00, 0x0101ff0100000001, 0x0101ff0100000100,
+    0x0101ff0100010000, 0x0101ff0101ffffff, 0x0101ff0101ffff01, 0x0101ff0101ff01ff,
+    0x0101ff0101ff0101, 0x0101ff0101000000, 0x0101ff010101ffff, 0x0101ff010101ff01,
+    0x0101ff01010101ff, 0x0101ff0101010101, 0x010100ffff000100, 0x010100ffff010000,
+    0x010100ff00ffff00, 0x010100ff00ff00ff, 0x010100ff0000ffff, 0x010100ff000000ff,
+    0x010100ff00000000, 0x010100ff000001ff, 0x010100ff00000101, 0x010100ff0001ff00,
+    0x010100ff00010000, 0x010100ff00010001, 0x010100ff000101ff, 0x010100ff00010100,
+    0x010100ff01ff0000, 0x01010000ffff0001, 0x01010000ffff0100, 0x01010000ff00ffff,
+    0x01010000ff00ff01, 0x01010000ff000000, 0x01010000ff0001ff, 0x01010000ff010001,
+    0x01010000ff010100, 0x0101000000ffff01, 0x0101000000ff0000, 0x010100000000ff00,
+    0x01010000000000ff, 0x0101000000000000, 0x0101000000000001, 0x0101000000000100,
+    0x0101000000010000, 0x0101000000010101, 0x0101000001ffff00, 0x0101000001ff00ff,
+    0x0101000001ff0000, 0x0101000001ff0001, 0x0101000001ff0100, 0x010100000100ff01,
+    0x0101000001000000, 0x01010000010001ff, 0x01010001ffff0000, 0x01010001ff00ff00,
+    0x01010001ff000001, 0x01010001ff000101, 0x01010001ff01ff00, 0x01010001ff010000,
+    0x0101000100ff00ff, 0x0101000100ff0001, 0x0101000100ff0101, 0x010100010000ff01,
+    0x0101000100000000, 0x0101000100000001, 0x01010001000001ff, 0x010100010001ffff,
+    0x010100010001ff01, 0x0101000101ff0001, 0x010100010100ffff, 0x0101000101000000,
+    0x0101000101000001, 0x0101000101000100, 0x010100010101ff00, 0x01010001010100ff,
+    0x0101000101010001, 0x010101ffffffffff, 0x010101ffffffff01, 0x010101ffffff01ff,
+    0x010101ffffff0101, 0x010101ffff01ffff, 0x010101ffff01ff01, 0x010101ffff0101ff,
+    0x010101ffff010101, 0x010101ff0000ff00, 0x010101ff000000ff, 0x010101ff00000001,
+    0x010101ff00000100, 0x010101ff01ffffff, 0x010101ff01ffff01, 0x010101ff01ff01ff,
+    0x010101ff01ff0101, 0x010101ff01000000, 0x010101ff0101ffff, 0x010101ff0101ff01,
+    0x010101ff010101ff, 0x010101ff01010101, 0x01010100ffff0000, 0x01010100ff0000ff,
+    0x01010100ff000100, 0x01010100ff01ff00, 0x01010100ff010000, 0x0101010000ffff00,
+    0x010101000000ffff, 0x0101010000000000, 0x0101010000000101, 0x010101000001ff00,
+    0x0101010000010001, 0x0101010000010100, 0x010101000100ffff, 0x0101010001000001,
+    0x01010101ffffffff, 0x01010101ffffff01, 0x01010101ffff01ff, 0x01010101ffff0101,
+    0x01010101ff01ffff, 0x01010101ff01ff01, 0x01010101ff0101ff, 0x01010101ff010101,
+    0x010101010000ff00, 0x01010101000000ff, 0x0101010100000001, 0x0101010101ffffff,
+    0x0101010101ffff01, 0x0101010101ff01ff, 0x0101010101ff0101, 0x0101010101000000,
+    0x010101010101ffff, 0x010101010101ff01, 0x01010101010101ff, 0x0101010101010101,
+GGML_TABLE_END()
+#else
+GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S)
+    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
+    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
+    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
+    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
+    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
+    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
+    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
+    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
+    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
+    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
+    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
+    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
+    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
+    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
+    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
+    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
+    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
+    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
+    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
+    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
+    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
+    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
+    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
+    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
+    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
+    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
+    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
+    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
+    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
+    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
+    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
+    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
+    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
+    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
+    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
+    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
+    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
+    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
+    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
+    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
+    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
+    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
+    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
+    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
+    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
+    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
+    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
+    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
+    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
+    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
+    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
+    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
+    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
+    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
+    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
+    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
+    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
+    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
+    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
+    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
+    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
+    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
+    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
+    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
+    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
+    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
+    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
+    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
+    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
+    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
+    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
+    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
+    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
+    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
+    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
+    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
+    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
+    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
+    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
+    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
+    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
+    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
+    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
+    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
+    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
+    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
+    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
+    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
+    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
+    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
+    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
+    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
+    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
+    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
+    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
+    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
+    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
+    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
+    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
+    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
+    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
+    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
+    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
+    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
+    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
+    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
+    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
+    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
+    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
+    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
+    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
+    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
+    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
+    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
+    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
+    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
+    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
+    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
+    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
+    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
+    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
+    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
+    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
+    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
+    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
+    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
+    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
+    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
+    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
+    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
+    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
+    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
+    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
+    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
+    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
+    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
+    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
+    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
+    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
+    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
+    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
+    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
+    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
+    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
+    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
+    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
+    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
+    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
+    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
+    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
+    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
+    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
+    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
+    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
+    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
+    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
+    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
+    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
+    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
+    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
+    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
+    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
+    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
+    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
+    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
+    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
+    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
+    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
+    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
+    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
+    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
+    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
+    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
+    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
+    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
+    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
+    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
+    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
+    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
+    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
+    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
+    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
+    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
+    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
+    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
+    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
+    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
+    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
+    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
+    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
+    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
+    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
+    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
+    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
+    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
+    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
+    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
+    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
+    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
+    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
+    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
+    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
+    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
+    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
+    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
+    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
+    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
+    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
+    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
+    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
+    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
+    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
+    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
+    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
+    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
+    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
+    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
+    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
+    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
+    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
+    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
+    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
+    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
+    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
+    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
+    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
+    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
+    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
+    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
+    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
+    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
+    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
+    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
+    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
+    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
+    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
+    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
+    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
+    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
+    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
+    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
+    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
+    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
+    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
+    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
+    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
+    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
+    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
+    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
+    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
+    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
+    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
+    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
+    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
+    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
+    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
+GGML_TABLE_END()
+#endif
+
+#endif // GGML_COMMON_IMPL
+#endif // GGML_COMMON_IMPL
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
new file mode 100644
index 000000000..6b3641c42
--- /dev/null
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -0,0 +1,346 @@
+function(ggml_add_cpu_backend_variant_impl tag_name)
+    if (tag_name)
+        set(GGML_CPU_NAME ggml-cpu-${tag_name})
+    else()
+        set(GGML_CPU_NAME ggml-cpu)
+    endif()
+
+    ggml_add_backend_library(${GGML_CPU_NAME})
+
+    list (APPEND GGML_CPU_SOURCES
+        ggml-cpu/ggml-cpu.c
+        ggml-cpu/ggml-cpu.cpp
+        ggml-cpu/ggml-cpu-aarch64.cpp
+        ggml-cpu/ggml-cpu-aarch64.h
+        ggml-cpu/ggml-cpu-hbm.cpp
+        ggml-cpu/ggml-cpu-hbm.h
+        ggml-cpu/ggml-cpu-quants.c
+        ggml-cpu/ggml-cpu-quants.h
+        ggml-cpu/ggml-cpu-traits.cpp
+        ggml-cpu/ggml-cpu-traits.h
+        ggml-cpu/amx/amx.cpp
+        ggml-cpu/amx/amx.h
+        ggml-cpu/amx/mmq.cpp
+        ggml-cpu/amx/mmq.h
+        ggml-cpu/ggml-cpu-impl.h
+        )
+
+    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
+    target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
+
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)
+        if (ACCELERATE_FRAMEWORK)
+            message(STATUS "Accelerate framework found")
+
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
+
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
+        else()
+            message(WARNING "Accelerate framework not found")
+        endif()
+    endif()
+
+    if (GGML_OPENMP)
+        find_package(OpenMP)
+        if (OpenMP_FOUND)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
+
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        else()
+            message(WARNING "OpenMP not found")
+        endif()
+    endif()
+
+    if (GGML_LLAMAFILE)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
+
+        list(APPEND GGML_CPU_SOURCES
+                    ggml-cpu/llamafile/sgemm.cpp
+                    ggml-cpu/llamafile/sgemm.h)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+
+        message(STATUS "Using memkind for CPU HBM")
+
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
+
+        target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
+    endif()
+
+    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+        message(STATUS "ARM detected")
+
+        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
+        else()
+            check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+            if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+            endif()
+
+            if (GGML_NATIVE)
+                # -mcpu=native does not always enable all the features in some compilers,
+                # so we check for them manually and enable them if available
+
+                execute_process(
+                    COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
+                    INPUT_FILE "/dev/null"
+                    OUTPUT_QUIET
+                    ERROR_VARIABLE ARM_MCPU
+                    RESULT_VARIABLE ARM_MCPU_RESULT
+                )
+                if (NOT ARM_MCPU_RESULT)
+                    string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
+                endif()
+                if ("${ARM_MCPU_FLAG}" STREQUAL "")
+                    set(ARM_MCPU_FLAG -mcpu=native)
+                    message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
+                endif()
+
+                include(CheckCXXSourceRuns)
+
+                function(check_arm_feature tag code)
+                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                    set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
+                    check_cxx_source_runs(
+                        "${code}"
+                        GGML_MACHINE_SUPPORTS_${tag}
+                    )
+                    if (GGML_MACHINE_SUPPORTS_${tag})
+                        set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
+                    else()
+                        set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
+                    endif()
+                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+                endfunction()
+
+                check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(i8mm    "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(sve     "#include <arm_sve.h>\nint main()  { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+
+                list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
+            else()
+                if (GGML_CPU_ARM_ARCH)
+                    list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
+                endif()
+            endif()
+
+            # show enabled features
+            if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+                set(FEAT_INPUT_FILE "NUL")
+            else()
+                set(FEAT_INPUT_FILE "/dev/null")
+            endif()
+
+            execute_process(
+                COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
+                INPUT_FILE ${FEAT_INPUT_FILE}
+                OUTPUT_VARIABLE ARM_FEATURE
+                RESULT_VARIABLE ARM_FEATURE_RESULT
+            )
+            if (ARM_FEATURE_RESULT)
+                message(WARNING "Failed to get ARM features")
+            else()
+                foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
+                    string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
+                    if (NOT ${feature_pos} EQUAL -1)
+                        message(STATUS "ARM feature ${feature} enabled")
+                    endif()
+                endforeach()
+            endif()
+        endif()
+    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
+
+        message(STATUS "x86 detected")
+
+        if (MSVC)
+            # instruction set detection for MSVC only
+            if (GGML_NATIVE)
+                include(ggml-cpu/cmake/FindSIMD.cmake)
+            endif ()
+            if (GGML_AVX512)
+                list(APPEND ARCH_FLAGS /arch:AVX512)
+                # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
+                # MSVC has no compile-time flags enabling specific
+                # AVX512 extensions, neither it defines the
+                # macros corresponding to the extensions.
+                # Do it manually.
+                list(APPEND ARCH_DEFINITIONS GGML_AVX512)
+                if (GGML_AVX512_VBMI)
+                    list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512vbmi)
+                    endif()
+                endif()
+                if (GGML_AVX512_VNNI)
+                    list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512vnni)
+                    endif()
+                endif()
+                if (GGML_AVX512_BF16)
+                    list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512bf16)
+                    endif()
+                endif()
+                if (GGML_AMX_TILE)
+                    list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
+                endif()
+                if (GGML_AMX_INT8)
+                    list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
+                endif()
+                if (GGML_AMX_BF16)
+                    list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
+                endif()
+            elseif (GGML_AVX2)
+                list(APPEND ARCH_FLAGS /arch:AVX2)
+                list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
+            elseif (GGML_AVX)
+                list(APPEND ARCH_FLAGS /arch:AVX)
+                list(APPEND ARCH_DEFINITIONS GGML_AVX)
+            else ()
+                list(APPEND ARCH_FLAGS /arch:SSE4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            endif()
+            if (GGML_AVX_VNNI)
+                list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
+            endif()
+        else ()
+            if (GGML_NATIVE)
+                list(APPEND ARCH_FLAGS -march=native)
+            else ()
+                list(APPEND ARCH_FLAGS -msse4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+                if (GGML_F16C)
+                    list(APPEND ARCH_FLAGS -mf16c)
+                    list(APPEND ARCH_DEFINITIONS GGML_F16C)
+                endif()
+                if (GGML_FMA)
+                    list(APPEND ARCH_FLAGS -mfma)
+                    list(APPEND ARCH_DEFINITIONS GGML_FMA)
+                endif()
+                if (GGML_AVX)
+                    list(APPEND ARCH_FLAGS -mavx)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX)
+                endif()
+                if (GGML_AVX2)
+                    list(APPEND ARCH_FLAGS -mavx2)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX2)
+                endif()
+                if (GGML_AVX_VNNI)
+                    list(APPEND ARCH_FLAGS -mavxvnni)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
+                endif()
+                if (GGML_AVX512)
+                    list(APPEND ARCH_FLAGS -mavx512f)
+                    list(APPEND ARCH_FLAGS -mavx512cd)
+                    list(APPEND ARCH_FLAGS -mavx512vl)
+                    list(APPEND ARCH_FLAGS -mavx512dq)
+                    list(APPEND ARCH_FLAGS -mavx512bw)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512)
+                endif()
+                if (GGML_AVX512_VBMI)
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
+                endif()
+                if (GGML_AVX512_VNNI)
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
+                endif()
+                if (GGML_AVX512_BF16)
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
+                endif()
+                if (GGML_AMX_TILE)
+                    list(APPEND ARCH_FLAGS -mamx-tile)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
+                endif()
+                if (GGML_AMX_INT8)
+                    list(APPEND ARCH_FLAGS -mamx-int8)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
+                endif()
+                if (GGML_AMX_BF16)
+                    list(APPEND ARCH_FLAGS -mamx-bf16)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
+                endif()
+            endif()
+        endif()
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+        message(STATUS "PowerPC detected")
+        execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+        string(FIND "${POWER10_M}" "POWER10" substring_index)
+        if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+            set(substring_index -1)
+        endif()
+
+        if (${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+        elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+        else()
+            list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+            # TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        endif()
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        message(STATUS "loongarch64 detected")
+
+        list(APPEND ARCH_FLAGS -march=loongarch64)
+        if (GGML_LASX)
+            list(APPEND ARCH_FLAGS -mlasx)
+        endif()
+        if (GGML_LSX)
+            list(APPEND ARCH_FLAGS -mlsx)
+        endif()
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
+        message(STATUS "RISC-V detected")
+        if (GGML_RVV)
+            list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+        endif()
+    else()
+        message(STATUS "Unknown architecture")
+    endif()
+
+    if (GGML_CPU_AARCH64)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
+    endif()
+
+    message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
+    target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
+    target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
+    target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
+
+    if (GGML_BACKEND_DL)
+        if (GGML_NATIVE)
+            # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
+            message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
+        endif()
+
+        # The feature detection code is compiled as a separate target so that
+        # it can be built without the architecture flags
+        # Since multiple variants of the CPU backend may be included in the same
+        # build, using set_source_files_properties() to set the arch flags is not possible
+        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
+        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
+        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
+        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
+    endif()
+
+    if (EMSCRIPTEN)
+        set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
+    endif()
+endfunction()
diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
new file mode 100644
index 000000000..5ec5263ce
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -0,0 +1,220 @@
+#include "amx.h"
+#include "common.h"
+#include "mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-traits.h"
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+// AMX type_trais
+namespace ggml::cpu::amx {
+class tensor_traits : public ggml::cpu::tensor_traits {
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        size = ggml_backend_amx_desired_wsize(op);
+        return true;
+    }
+
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT) {
+            ggml_backend_amx_mul_mat(params, op);
+            return true;
+        }
+        return false;
+    }
+};
+
+static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
+    static tensor_traits traits;
+    return &traits;
+}
+}  // namespace ggml::cpu::amx
+
+// AMX buffer interface
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+}
+
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *) (buffer->context);
+}
+
+static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
+    memset((char *) tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
+    if (qtype_has_amx_kernels(tensor->type)) {
+        GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
+        ggml_backend_amx_convert_weight(tensor, data, offset, size);
+    } else {
+        memcpy((char *) tensor->data + offset, data, size);
+    }
+
+    GGML_UNUSED(buffer);
+}
+
+/*
+// need to figure what we need to do with buffer->extra.
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        if (qtype_has_amx_kernels(src->type)) {
+            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
+        } else {
+            memcpy(dst->data, src->data, ggml_nbytes(src));
+        }
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+*/
+
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_amx_buffer_init_tensor,
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ nullptr,
+    /* .cpy_tensor      = */ nullptr,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ nullptr,
+};
+
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "AMX";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+    if (data == NULL) {
+        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
+}
+
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+namespace ggml::cpu::amx {
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        // handle only 2d gemm for now
+        auto is_contiguous_2d = [](const struct ggml_tensor * t) {
+            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+        };
+
+        if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
+            is_contiguous_2d(op->src[1]) &&                               // src1 must be contiguous
+            op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
+            op->ne[0] % (TILE_N * 2) == 0 &&                              // out_features is 32x
+            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
+            // src1 must be host buffer
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            // src1 must be float32
+            if (op->src[1]->type == GGML_TYPE_F32) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
+            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
+        }
+
+        return nullptr;
+    }
+};
+}  // namespace ggml::cpu::amx
+
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    return ggml_backend_amx_get_alloc_size(tensor);
+
+    GGML_UNUSED(buft);
+}
+
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+
+static bool ggml_amx_init() {
+#if defined(__gnu_linux__)
+    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
+        fprintf(stderr, "AMX is not ready to be used!\n");
+        return false;
+    }
+    return true;
+#elif defined(_WIN32)
+    return true;
+#endif
+}
+
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+                        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+                        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+                        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+                        /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+                        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+                        /* .is_host          = */ nullptr,
+                        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
+    };
+
+    if (!ggml_amx_init()) {
+        return nullptr;
+    }
+
+    return &ggml_backend_buffer_type_amx;
+}
+
+#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h
new file mode 100644
index 000000000..5b65d76bd
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/amx.h
@@ -0,0 +1,8 @@
+#include "ggml-backend.h"
+#include "ggml-cpu-impl.h"
+
+// GGML internal header
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+#endif
diff --git a/ggml/src/ggml-cpu/amx/common.h b/ggml/src/ggml-cpu/amx/common.h
new file mode 100644
index 000000000..f392e8985
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/common.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-impl.h"
+
+#include <algorithm>
+#include <memory>
+#include <type_traits>
+
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 32
+#define VNNI_BLK 4
+
+#define AMX_BLK_SIZE 32
+
+#define TMM0 0
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+#define TMM4 4
+#define TMM5 5
+#define TMM6 6
+#define TMM7 7
+
+// parallel routines
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T div_up(T x, T y) { return (x + y - 1) / y; }
+
+template <typename T>
+inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
+#if 0
+    // onednn partition pattern
+    T& n_my = n_end;
+    if (nth <= 1 || n == 0) {
+        n_start = 0;
+        n_my = n;
+    } else {
+        T n1 = div_up(n, nth);
+        T n2 = n1 - 1;
+        T T1 = n - n2 * nth;
+        n_my = ith < T1 ? n1 : n2;
+        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
+    }
+    n_end += n_start;
+#else
+    // pytorch aten partition pattern
+    T n_my = div_up(n, nth);
+    n_start = ith * n_my;
+    n_end = std::min(n_start + n_my, n);
+#endif
+}
+
+template <typename func_t>
+inline void parallel_for(int n, const func_t& f) {
+#if defined(GGML_USE_OPENMP)
+#pragma omp parallel
+{
+    int nth = omp_get_num_threads();
+    int ith = omp_get_thread_num();
+    int tbegin, tend;
+    balance211(n, nth, ith, tbegin, tend);
+    f(tbegin, tend);
+}
+#else
+    f(0, n);
+#endif
+}
+
+template <typename func_t>
+inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
+    int tbegin, tend;
+    balance211(n, params->nth, params->ith, tbegin, tend);
+    f(tbegin, tend);
+}
+
+// quantized types that have AMX support
+inline bool qtype_has_amx_kernels(const enum ggml_type type) {
+    // TODO: fix padding for vnni format
+    return (type == GGML_TYPE_Q4_0) ||
+        (type == GGML_TYPE_Q4_1) ||
+        (type == GGML_TYPE_Q8_0) ||
+        (type == GGML_TYPE_Q4_K) ||
+        (type == GGML_TYPE_Q5_K) ||
+        (type == GGML_TYPE_Q6_K) ||
+        (type == GGML_TYPE_IQ4_XS);
+}
diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp
new file mode 100644
index 000000000..0ea91596b
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -0,0 +1,2511 @@
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wpedantic"
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+#endif
+
+#include "amx.h"
+#include "mmq.h"
+#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu-quants.h"
+#include "ggml-quants.h"
+#include <algorithm>
+#include <type_traits>
+
+#if defined(__gnu_linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#if (defined(_WIN32) || defined(_WIN64))
+#define RESTRICT __restrict
+#else
+#define RESTRICT __restrict__
+#endif
+
+#if (defined(_WIN32) || defined(_WIN64))
+#define ALWAYS_INLINE __forceinline
+#elif __has_attribute(always_inline) || defined(__GNUC__)
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+#else
+#define ALWAYS_INLINE inline
+#endif
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+namespace {
+
+// Forced unrolling
+template <int n>
+struct Unroll {
+    template <typename Func, typename... Args>
+    ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
+        Unroll<n - 1>{}(f, args...);
+        f(std::integral_constant<int, n - 1>{}, args...);
+    }
+};
+
+template <>
+struct Unroll<1> {
+    template <typename Func, typename... Args>
+    ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
+        f(std::integral_constant<int, 0>{}, args...);
+    }
+};
+
+// type traits
+template <typename T> struct PackedTypes {};
+template <> struct PackedTypes<block_q4_0> { using type = int8_t; };
+template <> struct PackedTypes<block_q4_1> { using type = uint8_t; };
+template <> struct PackedTypes<block_q8_0> { using type = int8_t; };
+template <typename T> using packed_B_type = typename PackedTypes<T>::type;
+
+template <typename T>
+struct do_compensate : std::integral_constant<bool,
+    std::is_same<T, block_q8_0>::value> {};
+
+template <typename T>
+struct do_unpack : std::integral_constant<bool,
+    std::is_same<T, block_q4_0>::value ||
+    std::is_same<T, block_q4_1>::value> {};
+
+template <typename T>
+struct is_type_qkk : std::integral_constant<bool,
+    std::is_same<T, block_q4_K>::value ||
+    std::is_same<T, block_q5_K>::value ||
+    std::is_same<T, block_q6_K>::value ||
+    std::is_same<T, block_iq4_xs>::value> {};
+
+#define GGML_DISPATCH_FLOATING_TYPES(TYPE, ...)                                        \
+    [&] {                                                                              \
+        switch (TYPE) {                                                                \
+            case GGML_TYPE_F16: {                                                      \
+                using type = ggml_fp16_t;                                              \
+                constexpr int blck_size = 16;                                          \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_BF16: {                                                     \
+                using type = ggml_bf16_t;                                              \
+                constexpr int blck_size = 32;                                          \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            default:                                                                   \
+                fprintf(stderr, "Unsupported floating data type\n");                   \
+        }                                                                              \
+    }()
+
+#define GGML_DISPATCH_QTYPES(QT, ...)                                                  \
+    [&] {                                                                              \
+        switch (QT) {                                                                  \
+            case GGML_TYPE_Q4_0: {                                                     \
+                using type = block_q4_0;                                               \
+                using vec_dot_type = block_q8_0;                                       \
+                constexpr int blck_size = QK4_0;                                       \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q4_1: {                                                     \
+                using type = block_q4_1;                                               \
+                using vec_dot_type = block_q8_1;                                       \
+                constexpr int blck_size = QK4_1;                                       \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q8_0: {                                                     \
+                using type = block_q8_0;                                               \
+                using vec_dot_type = block_q8_0;                                       \
+                constexpr int blck_size = QK8_0;                                       \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q4_K: {                                                     \
+                using type = block_q4_K;                                               \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q5_K: {                                                     \
+                using type = block_q5_K;                                               \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_Q6_K: {                                                     \
+                using type = block_q6_K;                                               \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            case GGML_TYPE_IQ4_XS: {                                                   \
+                using type = block_iq4_xs;                                             \
+                using vec_dot_type = block_q8_K;                                       \
+                constexpr int blck_size = QK_K;                                        \
+                return __VA_ARGS__();                                                  \
+            }                                                                          \
+            default:                                                                   \
+                fprintf(stderr, "Unsupported quantized data type: %d\n", int(TYPE));   \
+        }                                                                              \
+    }()
+
+#define GGML_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...)                                     \
+    [&] {                                                                              \
+        if (BOOL_V) {                                                                  \
+            constexpr bool BOOL_NAME = true;                                           \
+            return __VA_ARGS__();                                                      \
+        } else {                                                                       \
+            constexpr bool BOOL_NAME = false;                                          \
+            return __VA_ARGS__();                                                      \
+        }                                                                              \
+    }()
+
+// define amx tile config data structure
+struct tile_config_t{
+    uint8_t palette_id = 0;
+    uint8_t start_row = 0;
+    uint8_t reserved_0[14] = {0};
+    uint16_t colsb[16] = {0};
+    uint8_t rows[16] = {0};
+};
+
+// Notes: amx tile config
+//
+// Typically, TMUL calculates A and B of size 16 x 64 containing INT8 values,
+// and accumulate the result to a 16 x 16 matrix C containing INT32 values,
+//
+// As many GGUF quantized types as `block_size` of 32, so a 16-16-32 config is used
+// instead of the normally used 16-16-64 config.
+//
+//    Block A: {16, 32}, dtype = int8_t
+//    Block B: {16, 32}, dtype = uint8_t/int8_t
+//    Block C: {16, 16}, dtype = int32_t
+//
+// Block B needs to be prepacked to vnni format before feeding into  TMUL:
+//    packed_B: from {n, k} to {k/vnni_blk, n, vnni_blck}, viewed in 2d, we get {8, 64}
+//
+// Therefore, we get tileconfig:
+//             A    B    C
+//    rows    16    8   16
+//    colsb   32   64   16
+//
+// For tile distribution, follow a 2-2-4 pattern, e.g. A used TMM2-TMM3, B used TMM0-TMM1,
+// C used TMM4-TMM7:
+//            B TMM0  B TMM1
+//    A TMM2  C TMM4  C TMM6
+//    A TMM3  C TMM5  C TMM7
+//
+// Each `amx` kernel handles 4 blocks at a time: 2MB * 2NB, when m < 2 * BLOCK_M, unpack A
+// will be needed.
+//
+// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
+// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
+//
+// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
+//    advanced-matrix-extensions-intrinsics-functions.html
+//
+
+#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb
+void ggml_tile_config_init(void) {
+    static thread_local bool is_first_time = true;
+
+    if (!is_first_time) {
+        return;
+    }
+
+    static thread_local tile_config_t tc;
+    tile_config_t current_tc;
+    _tile_storeconfig(&current_tc);
+
+    // load only when config changes
+    if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
+                               memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
+        tc.palette_id = 1;
+        tc.start_row = 0;
+        TC_CONFIG_TILE(TMM0, 8, 64);
+        TC_CONFIG_TILE(TMM1, 8, 64);
+        TC_CONFIG_TILE(TMM2, 16, 32);
+        TC_CONFIG_TILE(TMM3, 16, 32);
+        TC_CONFIG_TILE(TMM4, 16, 64);
+        TC_CONFIG_TILE(TMM5, 16, 64);
+        TC_CONFIG_TILE(TMM6, 16, 64);
+        TC_CONFIG_TILE(TMM7, 16, 64);
+        _tile_loadconfig(&tc);
+    }
+
+    is_first_time = false;
+}
+
+// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
+// See the notes `s8s8 igemm compensation in avx512-vnni` for detail.
+template <typename TB>
+int get_tile_size() {
+    int tile_size = TILE_N * sizeof(TB);
+    if (do_compensate<TB>::value) {
+        tile_size += TILE_N * sizeof(int32_t);
+    }
+    if (std::is_same<TB, block_q4_K>::value ||
+        std::is_same<TB, block_q5_K>::value) {
+        tile_size += TILE_N * 4;
+    }
+    if (std::is_same<TB, block_iq4_xs>::value) {
+        tile_size += TILE_N * 2;
+    }
+    return tile_size;
+}
+
+template <typename TB, int BLOCK_K>
+int get_row_size(int K) {
+    int KB = K / BLOCK_K;
+    int row_size = KB * sizeof(TB);
+    if (do_compensate<TB>::value) {
+        row_size += KB * sizeof(int32_t);
+    }
+    if (std::is_same<TB, block_q4_K>::value ||
+        std::is_same<TB, block_q5_K>::value) {
+        row_size += KB * 4;
+    }
+    if (std::is_same<TB, block_iq4_xs>::value) {
+        row_size += KB * 2;
+    }
+    return row_size;
+}
+
+// vectorized dtype conversion
+inline float FP16_TO_FP32(ggml_half val) {
+    __m256i v = _mm256_setr_epi16(
+        val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m512 o = _mm512_cvtph_ps(v);
+    return _mm512_cvtss_f32(o);
+}
+
+inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
+    __m256i v = _mm256_set1_epi16(val);
+    return _mm512_cvtph_ps(v);
+}
+
+// horizontal reduce
+inline float _mm512_reduce_max_ps(const __m512 x) {
+    __m512 v = x;
+    __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
+    v = _mm512_max_ps(v, v1);
+    v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
+    v = _mm512_max_ps(v, v1);
+    v1 = _mm512_shuffle_ps(v, v, 0x4E);
+    v = _mm512_max_ps(v, v1);
+    v1 = _mm512_shuffle_ps(v, v, 0xB1);
+    v = _mm512_max_ps(v, v1);
+    return _mm512_cvtss_f32(v);
+}
+
+// transpose utils
+#define SHUFFLE_EPI32(a, b, mask) \
+    _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))
+inline void transpose_8x8_32bit(__m256i * v, __m256i * v1) {
+    // unpacking and 32-bit elements
+    v1[0] = _mm256_unpacklo_epi32(v[0], v[1]);
+    v1[1] = _mm256_unpackhi_epi32(v[0], v[1]);
+    v1[2] = _mm256_unpacklo_epi32(v[2], v[3]);
+    v1[3] = _mm256_unpackhi_epi32(v[2], v[3]);
+    v1[4] = _mm256_unpacklo_epi32(v[4], v[5]);
+    v1[5] = _mm256_unpackhi_epi32(v[4], v[5]);
+    v1[6] = _mm256_unpacklo_epi32(v[6], v[7]);
+    v1[7] = _mm256_unpackhi_epi32(v[6], v[7]);
+
+    // shuffling the 32-bit elements
+    v[0] = SHUFFLE_EPI32(v1[0], v1[2], 0x44);
+    v[1] = SHUFFLE_EPI32(v1[0], v1[2], 0xee);
+    v[2] = SHUFFLE_EPI32(v1[4], v1[6], 0x44);
+    v[3] = SHUFFLE_EPI32(v1[4], v1[6], 0xee);
+    v[4] = SHUFFLE_EPI32(v1[1], v1[3], 0x44);
+    v[5] = SHUFFLE_EPI32(v1[1], v1[3], 0xee);
+    v[6] = SHUFFLE_EPI32(v1[5], v1[7], 0x44);
+    v[7] = SHUFFLE_EPI32(v1[5], v1[7], 0xee);
+
+    // shuffling 128-bit elements
+    v1[0] = _mm256_permute2f128_si256(v[2], v[0], 0x02);
+    v1[1] = _mm256_permute2f128_si256(v[3], v[1], 0x02);
+    v1[2] = _mm256_permute2f128_si256(v[6], v[4], 0x02);
+    v1[3] = _mm256_permute2f128_si256(v[7], v[5], 0x02);
+    v1[4] = _mm256_permute2f128_si256(v[2], v[0], 0x13);
+    v1[5] = _mm256_permute2f128_si256(v[3], v[1], 0x13);
+    v1[6] = _mm256_permute2f128_si256(v[6], v[4], 0x13);
+    v1[7] = _mm256_permute2f128_si256(v[7], v[5], 0x13);
+}
+
+inline void transpose_16x4_32bit(__m512i * r, __m512i * d) {
+
+    static const __m512i index1 = _mm512_set_epi32(
+        0x0f, 0x0b, 0x07, 0x03,
+        0x0e, 0x0a, 0x06, 0x02,
+        0x0d, 0x09, 0x05, 0x01,
+        0x0c, 0x08, 0x04, 0x00);
+
+    d[0] = _mm512_permutexvar_epi32(index1, r[0]);
+    d[1] = _mm512_permutexvar_epi32(index1, r[1]);
+    d[2] = _mm512_permutexvar_epi32(index1, r[2]);
+    d[3] = _mm512_permutexvar_epi32(index1, r[3]);
+
+    r[0] = _mm512_shuffle_i32x4(d[0], d[1], 0x44);
+    r[1] = _mm512_shuffle_i32x4(d[0], d[1], 0xee);
+    r[2] = _mm512_shuffle_i32x4(d[2], d[3], 0x44);
+    r[3] = _mm512_shuffle_i32x4(d[2], d[3], 0xee);
+
+    d[0] = _mm512_shuffle_i32x4(r[0], r[2], 0x88);
+    d[1] = _mm512_shuffle_i32x4(r[0], r[2], 0xdd);
+    d[2] = _mm512_shuffle_i32x4(r[1], r[3], 0x88);
+    d[3] = _mm512_shuffle_i32x4(r[1], r[3], 0xdd);
+}
+
+inline void transpose_16x16_32bit(__m512i * v) {
+    __m512i v1[16];
+    v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
+    v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
+    v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
+    v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
+    v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
+    v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
+    v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
+    v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
+    v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
+    v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
+    v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
+    v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
+    v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
+    v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
+    v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
+    v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);
+
+    v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
+    v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
+    v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
+    v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
+    v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
+    v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
+    v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
+    v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
+    v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
+    v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
+    v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
+    v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
+    v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
+    v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
+    v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
+    v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);
+
+    v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
+    v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
+    v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
+    v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
+    v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
+    v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
+    v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
+    v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
+    v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
+    v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
+    v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
+    v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
+    v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
+    v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
+    v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
+    v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);
+
+    v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
+    v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
+    v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
+    v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
+    v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
+    v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
+    v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
+    v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
+    v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
+    v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
+    v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
+    v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
+    v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
+    v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
+    v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
+    v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
+}
+
+void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    const int KB = k / QK_K;
+    constexpr int kVecs = QK_K / 16;
+
+    block_q8_K * y = reinterpret_cast<block_q8_K *>(vy);
+
+    // hold 16 float vecs from x
+    __m512  v[kVecs];
+
+    // hold the quants vecs
+    __m512i vq[kVecs / 4];
+
+    // hold the packed quants vecs
+    __m512i vq_packed[kVecs / 4];
+
+    const __m512 signBit = _mm512_set1_ps(-0.f);
+
+    for (int i = 0; i < KB; ++i) {
+        // Compute max(abs(e)) for the block
+        __m512 vamax = _mm512_set1_ps(0.f);
+        for (int j = 0; j < kVecs; ++j) {
+            v[j] = _mm512_loadu_ps(x); x += 16;
+            vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(signBit, v[j]));
+        }
+        const float amax = _mm512_reduce_max_ps(vamax);
+
+        // Quantize these floats
+        const float iscale = 127.f / amax;
+        y[i].d = GGML_FP32_TO_FP16(1 / iscale);
+        const float id = ( amax != 0.0f ) ? iscale : 0.f;
+        const __m512 vscale = _mm512_set1_ps(id);
+
+        // Apply multiplier and round to nearest integer
+        for (int j = 0; j < kVecs; ++j) {
+            v[j] = _mm512_mul_ps(v[j], vscale);
+            v[j] = _mm512_roundscale_ps(v[j], (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        }
+
+        // Pack to epi8 vecs
+        for (int j = 0; j < kVecs / 4; ++j) {
+            __m128i q8_0 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 0]));
+            __m128i q8_1 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 1]));
+            __m128i q8_2 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 2]));
+            __m128i q8_3 = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(v[j * 4 + 3]));
+
+            __m256i q8_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_0), (q8_1), 1);
+            __m256i q8_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(q8_2), (q8_3), 1);
+
+            vq[j] = _mm512_inserti32x8(_mm512_castsi256_si512(q8_01), q8_23, 1);
+            _mm512_storeu_si512((__m512i *)(y[i].qs + j * 64), vq[j]);
+        }
+
+        // Compute the bsums with vnni
+        transpose_16x4_32bit(vq, vq_packed);
+
+        const __m512i one = _mm512_set1_epi8(1);
+        __m512i sum = _mm512_setzero_si512();
+        for (int k = 0; k < 4; ++k) {
+            sum = _mm512_dpbusd_epi32(sum, one, vq_packed[k]);
+        }
+        _mm256_storeu_si256((__m256i *)(y[i].bsums), _mm512_cvtepi32_epi16(sum));
+    }
+}
+
+// quantize A from float to `vec_dot_type`
+template <typename T>
+inline void from_float(const float * x, char * vy, int64_t k);
+
+template <>
+inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
+    quantize_row_q8_0(x, (block_q8_0 *)vy, k);
+}
+
+template <>
+inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
+    quantize_row_q8_1(x, (block_q8_1 *)vy, k);
+}
+
+template <>
+inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
+#if 1
+    // TODO: this is reference impl!
+    quantize_row_q8_K_ref(x, (block_q8_K *)vy, k);
+#else
+    quantize_row_q8_K_vnni(x, vy, k);
+#endif
+}
+
+// load A from memory to array when nrows can not fill in whole tile
+void unpack_A(int8_t * RESTRICT tile, const block_q8_0 * RESTRICT A, int lda, int nr) {
+    assert(nr != TILE_M);
+    for (int m = 0; m < nr; ++m) {
+        const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs));
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v);
+    }
+}
+
+void unpack_A(int8_t * RESTRICT tile, const block_q8_1 * RESTRICT A, int lda, int nr) {
+    assert(nr != TILE_M);
+    for (int m = 0; m < nr; ++m) {
+        const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs));
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v);
+    }
+}
+
+template <typename TB>
+void unpack_A(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) {
+    assert(nr <= TILE_M);
+    for (int m = 0; m < nr; ++m) {
+        const __m256i v = _mm256_loadu_si256((const __m256i *)(A[m * lda].qs + k * 32));
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), v);
+    }
+}
+
+template <>
+void unpack_A<block_q6_K>(int8_t * RESTRICT tile, const block_q8_K * RESTRICT A, int lda, int k, int nr) {
+    assert(nr <= TILE_M);
+    // zero padding k from 16 to 32, so that we don't have to re-config amx
+    const __m128i zero = _mm_setzero_si128();
+    for (int m = 0; m < nr; ++m) {
+        const __m128i v = _mm_loadu_si128((const __m128i *)(A[m * lda].qs + k * 16));
+        const __m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(v), zero, 1);
+        _mm256_storeu_si256((__m256i *)(tile + m * TILE_K), r);
+    }
+}
+
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i lowMask = _mm256_set1_epi8(0xF);
+    return _mm256_and_si256(lowMask, bytes);
+}
+
+// used for block_q4_K
+inline __m512i bytes_from_nibbles_64(const uint8_t * rsi) {
+    const __m256i tmp = _mm256_loadu_si256((const __m256i *)rsi);
+    const __m256i lowMask = _mm256_set1_epi8(0xF);
+    const __m256i q4l = _mm256_and_si256(tmp, lowMask);
+    const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(tmp, 4), lowMask);
+    return _mm512_inserti32x8(_mm512_castsi256_si512(q4l), q4h, 1);
+}
+
+// used for block_q5_K
+inline __m512i bytes_from_nibbles_64(const uint8_t * qs, const uint8_t * qh, int k) {
+    const __m256i lowMask = _mm256_set1_epi8(0xF);
+    __m256i hmask = _mm256_set1_epi8(1);
+    hmask = _mm256_slli_epi16(hmask, k);
+
+    const __m256i q5bits = _mm256_loadu_si256((const __m256i *)qs);
+    const __m256i hbits = _mm256_loadu_si256((const __m256i *)qh);
+
+    const __m256i q5l_0 = _mm256_and_si256(q5bits, lowMask);
+    const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 0), 4);
+    const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
+    hmask = _mm256_slli_epi16(hmask, 1);
+
+    const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), lowMask);
+    const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), k + 1), 4);
+    const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
+
+    return _mm512_inserti32x8(_mm512_castsi256_si512(q5_0), q5_1, 1);
+}
+
+// used for block_q6_K
+inline void bytes_from_nibbles_128(__m512i& r0, __m512i& r1, const uint8_t * qs, const uint8_t * qh) {
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+    const __m256i m2 = _mm256_set1_epi8(0x3);
+
+    const __m256i q6bits1 = _mm256_loadu_si256((const __m256i *)qs);
+    const __m256i q6bits2 = _mm256_loadu_si256((const __m256i *)(qs + 32));
+    const __m256i q6bitsH = _mm256_loadu_si256((const __m256i *)qh);
+
+    const __m256i q6h_0 = _mm256_slli_epi16(_mm256_and_si256(                  q6bitsH,     m2), 4);
+    const __m256i q6h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 2), m2), 4);
+    const __m256i q6h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 4), m2), 4);
+    const __m256i q6h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q6bitsH, 6), m2), 4);
+
+    const __m256i q6_0 = _mm256_or_si256(_mm256_and_si256(q6bits1, m4), q6h_0);
+    const __m256i q6_1 = _mm256_or_si256(_mm256_and_si256(q6bits2, m4), q6h_1);
+    const __m256i q6_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits1, 4), m4), q6h_2);
+    const __m256i q6_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q6bits2, 4), m4), q6h_3);
+
+    r0 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_0), q6_1, 1);
+    r1 = _mm512_inserti32x8(_mm512_castsi256_si512(q6_2), q6_3, 1);
+}
+
+inline __m512i packNibbles(__m512i r0, __m512i r1) {
+    return _mm512_or_si512(r0, _mm512_slli_epi16(r1, 4));
+}
+
+template <typename TB>
+inline void pack_qs(void * RESTRICT packed_B, const TB * RESTRICT B, int KB) {
+    int8_t tmp[8 * 64];
+    __m256i v[8], v2[8];
+    for (int n = 0; n < 8; ++n) {
+        v[n] = bytes_from_nibbles_32(B[n * KB].qs);
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)(tmp + n * 64), v2[n]);
+    }
+    for (int n = 0; n < 8; ++n) {
+        v[n] = bytes_from_nibbles_32(B[(n + 8) * KB].qs);
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)(tmp + n * 64 + 32), v2[n]);
+    }
+
+    // pack again with 128 to fully utilize vector length
+    for (int n = 0; n < 8; n += 2) {
+        __m512i r0 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64));
+        __m512i r1 = _mm512_loadu_si512((const __m512i *)(tmp + n * 64 + 64));
+        __m512i r1r0 = packNibbles(r0, r1);
+        _mm512_storeu_si512((__m512i *)((char *)packed_B + n * 32), r1r0);
+    }
+}
+
+template <>
+inline void pack_qs<block_q8_0>(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) {
+    __m256i v[8], v2[8];
+    for (int n = 0; n < 8; ++n) {
+        v[n] = _mm256_loadu_si256((const __m256i *)(B[n * KB].qs));
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64), v2[n]);
+    }
+    for (int n = 0; n < 8; ++n) {
+        v[n] = _mm256_loadu_si256((const __m256i *)(B[(n + 8) * KB].qs));
+    }
+    transpose_8x8_32bit(v, v2);
+    for (int n = 0; n < 8; ++n) {
+        _mm256_storeu_si256((__m256i *)((char *)packed_B + n * 64 + 32), v2[n]);
+    }
+}
+
+template <>
+inline void pack_qs<block_q4_K>(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) {
+    __m512i v[16];
+    // QK_K 256 with 8 groups, handle 2 groups at a time
+    char * pb = (char *)packed_B;
+    for (int k = 0; k < QK_K / 64; ++k) {
+        // pack 2 groups { n, g,  k} to {g, k/4, 4n}
+        //          e.g. {16, 2, 32} to {2,   8, 64}
+        for (int n = 0; n < TILE_N; ++n) {
+            v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32);
+        }
+
+        transpose_16x16_32bit(v);
+
+        // pack again with 128 to fully utilize vector length
+        for (int n = 0; n < TILE_N; n += 2) {
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1]));
+            pb += 64;
+        }
+    }
+}
+
+template <>
+inline void pack_qs<block_q5_K>(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) {
+    __m512i v[16];
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    // QK_K 256 with 8 groups, handle 2 groups at a time
+    char * pb = (char *)packed_B;
+    char * ph = (char *)packed_B + (QK_K / 2) * TILE_N;
+    for (int k = 0; k < QK_K / 64; ++k) {
+        // pack 2 groups { n, g,  k} to {g, k/4, 4n}
+        //          e.g. {16, 2, 32} to {2,   8, 64}
+        for (int n = 0; n < TILE_N; ++n) {
+            v[n] = bytes_from_nibbles_64(B[n * KB].qs + k * 32, B[n * KB].qh, /* group */2 * k);
+        }
+
+        transpose_16x16_32bit(v);
+
+        // 1. pack lower 4bits with 2 groups
+        for (int n = 0; n < TILE_N; n += 2) {
+            // get lower 4 bits
+            const __m512i r0 = _mm512_and_si512(v[n], lowMask);
+            const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64;
+        }
+
+        // 2. pack higher 1bit with 2 groups
+        const __m512i hmask = _mm512_set1_epi8(0x10);
+        for (int g = 0; g < 2; ++g) {
+            __m512i hbits = _mm512_setzero_si512();
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 0], hmask), 4));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 1], hmask), 3));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 2], hmask), 2));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 8 + 3], hmask), 1));
+            hbits = _mm512_add_epi8(hbits,                   _mm512_and_si512(v[g * 8 + 4], hmask)    );
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 5], hmask), 1));
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 6], hmask), 2));
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 8 + 7], hmask), 3));
+            _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64;
+        }
+    }
+}
+
+template <>
+inline void pack_qs<block_q6_K>(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) {
+    __m512i v[32];
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    // QK_K 256 with 8 groups, handle 4 groups at a time
+    char * pb = (char *)packed_B;
+    char * ph = (char *)packed_B + (QK_K / 2) * TILE_N;
+    for (int k = 0; k < QK_K / 128; ++k) {
+        for (int n = 0; n < TILE_N; ++n) {
+            bytes_from_nibbles_128(v[n], v[n + 16], B[n * KB].ql + k * 64, B[n * KB].qh + k * 32);
+        }
+
+        // top half: group 0,1 or 4,5; bottom half: group 2,3 or 6,7
+        transpose_16x16_32bit(v);
+        transpose_16x16_32bit(v + 16);
+
+        // 1. pack lower 4bits with 4 groups
+        for (int n = 0; n < 32; n += 2) {
+            const __m512i r0 = _mm512_and_si512(v[n], lowMask);
+            const __m512i r1 = _mm512_and_si512(v[n + 1], lowMask);
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(r0, r1)); pb += 64;
+        }
+
+        // 2. pack higher 2bit with 4 groups
+        const __m512i hmask = _mm512_set1_epi8(0x30);
+        for (int g = 0; g < 8; ++g) {
+            __m512i hbits = _mm512_setzero_si512();
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 0], hmask), 4));
+            hbits = _mm512_add_epi8(hbits, _mm512_srli_epi16(_mm512_and_si512(v[g * 4 + 1], hmask), 2));
+            hbits = _mm512_add_epi8(hbits,                   _mm512_and_si512(v[g * 4 + 2], hmask)    );
+            hbits = _mm512_add_epi8(hbits, _mm512_slli_epi16(_mm512_and_si512(v[g * 4 + 3], hmask), 2));
+            _mm512_storeu_si512((__m512i *)ph, hbits); ph += 64;
+        }
+    }
+}
+
+template <>
+inline void pack_qs<block_iq4_xs>(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) {
+    __m512i v[16];
+    char * pb = (char *)packed_B;
+    for (int k = 0; k < QK_K / 64; ++k) {
+        for (int n = 0; n < TILE_N; ++n) {
+            __m256i r0 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 +  0);
+            __m256i r1 = bytes_from_nibbles_32(B[n * KB].qs + k * 32 + 16);
+            v[n] = _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+        }
+
+        transpose_16x16_32bit(v);
+
+        // pack again with 128 to fully utilize vector length
+        for (int n = 0; n < TILE_N; n += 2) {
+            _mm512_storeu_si512((__m512i *)pb, packNibbles(v[n], v[n + 1]));
+            pb += 64;
+        }
+    }
+}
+
+// pack B to vnni formats in 4bits or 8 bits
+void pack_B(void * RESTRICT packed_B, const block_q4_0 * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+    ggml_half * d0 = reinterpret_cast<ggml_half *>((char *)packed_B + TILE_N * TILE_K / 2);
+    for (int n = 0; n < TILE_N; ++n) {
+        d0[n] = B[n * KB].d;
+    }
+}
+
+void pack_B(void * RESTRICT packed_B, const block_q4_1 * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+    ggml_half * d0 = reinterpret_cast<ggml_half *>((char *)packed_B + TILE_N * TILE_K / 2);
+    ggml_half * m0 = d0 + TILE_N;
+    for (int n = 0; n < TILE_N; ++n) {
+        d0[n] = B[n * KB].d;
+        m0[n] = B[n * KB].m;
+    }
+}
+
+inline void s8s8_compensation(void * RESTRICT packed_B) {
+    // packed_B layout:
+    //   quants {TILE_N, TILEK}  int8_t
+    //   d0     {TILE_N}      ggml_half
+    //   comp   {TILE_N}        int32_t
+    const int offset = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
+    __m512i vcomp = _mm512_setzero_si512();
+    const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+    for (int k = 0; k < 8; ++k) {
+        __m512i vb = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + k * 64));
+        vcomp = _mm512_dpbusd_epi32(vcomp, off, vb);
+    }
+    _mm512_storeu_si512((__m512i *)((char *)(packed_B) + offset), vcomp);
+}
+
+void pack_B(void * RESTRICT packed_B, const block_q8_0 * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+    ggml_half * d0 = reinterpret_cast<ggml_half *>((char *)packed_B + TILE_N * TILE_K);
+    for (int n = 0; n < TILE_N; ++n) {
+        d0[n] = B[n * KB].d;
+    }
+    s8s8_compensation(packed_B);
+}
+
+// convert 8 * {min, scale} from int6 to int8
+inline void unpack_mins_and_scales(const uint8_t * scales, uint32_t * utmp) {
+    const uint32_t kmask1 = 0x3f3f3f3f;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+    const uint32_t kmask3 = 0x03030303;
+
+    memcpy(utmp, scales, 12);
+    utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+    const uint32_t uaux = utmp[1] & kmask1;
+    utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+    utmp[2] = uaux;
+    utmp[0] &= kmask1;
+}
+
+// packed_B layout:
+//   quants {8, TILE_N, 16}  uint8
+//   scales {8, TILE_N}      uint8
+//   mins   {8, TILE_N}      uint8
+//   d      {TILE_N}     ggml_half
+//   dmin   {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_q4_K * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    uint8_t * scales = reinterpret_cast<uint8_t *>((char *)packed_B + (QK_K / 2) * TILE_N);
+    uint8_t * mins = scales + 8 * TILE_N;
+    ggml_half * d = reinterpret_cast<ggml_half *>(mins + 8 * TILE_N);
+    ggml_half * dmin = d + TILE_N;
+
+    union {
+        uint32_t u32[4];
+        uint8_t  u8[16];
+    } s;
+
+    for (int n = 0; n < TILE_N; ++n) {
+        unpack_mins_and_scales(B[n * KB].scales, s.u32);
+        for (int k = 0; k < 8; ++k) {
+            scales[k * TILE_N + n] = s.u8[k];
+            mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
+        }
+        d[n] = B[n * KB].d;
+        dmin[n] = B[n * KB].dmin;
+    }
+}
+
+// packed_B layout:
+//   quants {8, TILE_N, 16}  uint8
+//   qh     {8, TILE_N,  4}  uint8
+//   scales {8, TILE_N}      uint8
+//   mins   {8, TILE_N}      uint8
+//   d      {TILE_N}     ggml_half
+//   dmin   {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_q5_K * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    uint8_t * scales = reinterpret_cast<uint8_t *>((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
+    uint8_t * mins = scales + 8 * TILE_N;
+    ggml_half * d = reinterpret_cast<ggml_half *>(mins + 8 * TILE_N);
+    ggml_half * dmin = d + TILE_N;
+
+    union {
+        uint32_t u32[4];
+        uint8_t  u8[16];
+    } s;
+
+    for (int n = 0; n < TILE_N; ++n) {
+        unpack_mins_and_scales(B[n * KB].scales, s.u32);
+        for (int k = 0; k < 8; ++k) {
+            scales[k * TILE_N + n] = s.u8[k];
+            mins[(k >> 1) * TILE_N * 2 + n * 2 + (k & 0x1)] = s.u8[k + 8];
+        }
+        d[n] = B[n * KB].d;
+        dmin[n] = B[n * KB].dmin;
+    }
+}
+
+// packed_B layout:
+//   quants {16, TILE_N, 8}  uint8
+//   qh     {16, TILE_N, 4}  uint8
+//   scales {16, TILE_N}      uint8
+//   d      {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_q6_K * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    uint8_t * scales = reinterpret_cast<uint8_t *>((char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
+    ggml_half * d = reinterpret_cast<ggml_half *>(scales + 16 * TILE_N);
+    for (int n = 0; n < TILE_N; ++n) {
+        const int8_t * ps = B[n * KB].scales;
+        for (int k = 0; k < 16; ++k) {
+            scales[k * TILE_N + n] = ps[k];
+        }
+        d[n] = B[n * KB].d;
+    }
+}
+
+// packed_B layout:
+//   quants {8, TILE_N, 16}  uint8
+//   scales {8, TILE_N}       int8
+//   d      {TILE_N}     ggml_half
+void pack_B(void * RESTRICT packed_B, const block_iq4_xs * RESTRICT B, int KB) {
+    pack_qs(packed_B, B, KB);
+
+    int8_t * scales = reinterpret_cast<int8_t *>((char *)packed_B + (QK_K / 2) * TILE_N);
+    ggml_half * d = reinterpret_cast<ggml_half *>(scales + 8 * TILE_N);
+
+    // pack the scales
+    for (int n = 0; n < TILE_N; ++n) {
+        uint16_t sh = B[n * KB].scales_h;
+        for (int k = 0; k < 8; k += 2) {
+            const int16_t ls1 = ((B[n * KB].scales_l[k / 2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((B[n * KB].scales_l[k / 2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            scales[(k + 0) * TILE_N + n] = ls1;
+            scales[(k + 1) * TILE_N + n] = ls2;
+            sh >>= 4;
+        }
+        d[n] = B[n * KB].d;
+    }
+}
+
+template<typename TB, typename packed_B_t = packed_B_type<TB>>
+void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) {
+    GGML_UNUSED(tile);
+    GGML_UNUSED(packed_B);
+}
+
+template <>
+void unpack_B<block_q4_0>(int8_t * RESTRICT tile, const void * RESTRICT packed_B) {
+  const __m512i off = _mm512_set1_epi8(8);
+  const __m512i lowMask = _mm512_set1_epi8(0xF);
+  for (int n = 0; n < 8; n += 2) {
+    __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32));
+    const __m512i r0 = _mm512_sub_epi8(_mm512_and_si512(bytes, lowMask), off);
+    const __m512i r1 = _mm512_sub_epi8(_mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask), off);
+    _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+    _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+  }
+}
+
+template <>
+void unpack_B<block_q4_1>(uint8_t * RESTRICT tile, const void * RESTRICT packed_B) {
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512((const __m512i *)((const char *)packed_B + n * 32));
+        const __m512i r0 = _mm512_and_si512(bytes, lowMask);
+        const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+// packed_B_t for QKK is int8_t
+template <typename TB>
+void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
+    const char * packed_B_group = (const char *)packed_B + k * packed_B_group_size;
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512(packed_B_group + n * 32);
+        const __m512i r0 = _mm512_and_si512(bytes, lowMask);
+        const __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+template <>
+void unpack_B<block_q5_K>(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    // lower 4bits, stride 256 bytes
+    const int packed_l4_group_size = QK_K / 2 * TILE_N / 8;
+    const char * pb = (const char *)packed_B + k * packed_l4_group_size;
+
+    // higher 1bit, stride 64 bytes
+    const int packed_h1_group_size = QK_K / 8 * TILE_N / 8;
+    const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h1_group_size;
+    const __m512i hbits = _mm512_loadu_si512(ph);
+
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    __m512i hmask0 = _mm512_set1_epi8(0x1);
+    __m512i hmask1 = _mm512_set1_epi8(0x2);
+
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512(pb + n * 32);
+        __m512i r0 = _mm512_and_si512(bytes, lowMask);
+        __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+        __m512i h0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), n), 4);
+        __m512i h1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), n + 1), 4);
+
+        hmask0 = _mm512_slli_epi16(hmask0, 2);
+        hmask1 = _mm512_slli_epi16(hmask1, 2);
+        r0 = _mm512_add_epi8(r0, h0);
+        r1 = _mm512_add_epi8(r1, h1);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+template <>
+void unpack_B<block_q6_K>(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    // lower 4bits, stride 128 bytes
+    const int packed_l4_group_size = QK_K / 2 * TILE_N / 16;
+    const char * pb = (const char *)packed_B + k * packed_l4_group_size;
+
+    // higher 2bits, stride 64 bytes
+    const int packed_h2_group_size = QK_K / 4 * TILE_N / 16;
+    const char * ph = (const char *)packed_B + (QK_K / 2) * TILE_N + k * packed_h2_group_size;
+    const __m512i hbits = _mm512_loadu_si512(ph);
+
+    const __m512i off = _mm512_set1_epi8(32);
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+    __m512i hmask0 = _mm512_set1_epi8(0x3); // 0011
+    __m512i hmask1 = _mm512_set1_epi8(0xC); // 1100
+
+    // notes: skip zero padding from row4 to row7 as we have done so in `unpack_A`
+    __m512i bytes = _mm512_loadu_si512(pb);
+    __m512i r0 = _mm512_and_si512(bytes, lowMask);
+    __m512i r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+    __m512i h0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask0), 4);
+    __m512i h1 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask1), 2);
+    _mm512_storeu_si512((__m512i *)(tile +  0), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
+    _mm512_storeu_si512((__m512i *)(tile + 64), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));
+
+    hmask0 = _mm512_slli_epi16(hmask0, 4);
+    hmask1 = _mm512_slli_epi16(hmask1, 4);
+
+    bytes = _mm512_loadu_si512(pb + 64);
+    r0 = _mm512_and_si512(bytes, lowMask);
+    r1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+    h0 =                   _mm512_and_si512(hbits, hmask0);
+    h1 = _mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), 2);
+    _mm512_storeu_si512((__m512i *)(tile + 128), _mm512_sub_epi8(_mm512_add_epi8(r0, h0), off));
+    _mm512_storeu_si512((__m512i *)(tile + 192), _mm512_sub_epi8(_mm512_add_epi8(r1, h1), off));
+}
+
+template <>
+void unpack_B<block_iq4_xs>(int8_t * RESTRICT tile, const void * RESTRICT packed_B, int k) {
+    static const __m512i values128 = _mm512_set_epi8(
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+        113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127
+    );
+
+    const int packed_B_group_size = QK_K / 2 * TILE_N / 8;
+    const char * pb = (const char *)packed_B + k * packed_B_group_size;
+    const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+    for (int n = 0; n < 8; n += 2) {
+        __m512i bytes = _mm512_loadu_si512(pb + n * 32);
+        const __m512i r0 = _mm512_shuffle_epi8(values128, _mm512_and_si512(bytes, lowMask));
+        const __m512i r1 = _mm512_shuffle_epi8(values128, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 +  0), r0);
+        _mm512_storeu_si512((__m512i *)(tile + n * 64 + 64), r1);
+    }
+}
+
+template <typename TA, typename TB, bool is_acc>
+struct acc_C {};
+
+template <bool is_acc>
+struct acc_C<block_q8_0, block_q4_0, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * A, int lda, const void * packed_B, int nr) {
+        const int offset = TILE_N * TILE_K / 2;
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
+
+        for (int m = 0; m < nr; ++m) {
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+            vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_1, block_q4_1, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_1 * A, int lda, const void * packed_B, int nr) {
+        const int offset = TILE_N * TILE_K / 2;
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
+        const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
+
+        for (int m = 0; m < nr; ++m) {
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+            vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
+            vsum = _mm512_fmadd_ps(vm0, vs1, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_0, block_q8_0, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_0 * A, int lda, const void * packed_B, int nr) {
+        const int offset = TILE_N * TILE_K;
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
+
+        for (int m = 0; m < nr; ++m) {
+            const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+            vsum = _mm512_fmadd_ps(vtile, _mm512_mul_ps(vd0, vd1), vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_q4_K, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N);
+        const uint8_t * mins = scales + 8 * TILE_N;
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(mins + 8 * TILE_N);
+        const ggml_half * dmin = d0 + TILE_N;
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+        const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums);
+            const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_q5_K, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N);
+        const uint8_t * mins = scales + 8 * TILE_N;
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(mins + 8 * TILE_N);
+        const ggml_half * dmin = d0 + TILE_N;
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+        const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dmin));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vdm = _mm512_mul_ps(_mm512_set1_ps(-d1), vdmin);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[m * lda].bsums);
+            const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, _mm512_castsi128_si512(q8s));
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            vsum = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc_m), vdm, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_q6_K, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N);
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(scales + 16 * TILE_N);
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <bool is_acc>
+struct acc_C<block_q8_K, block_iq4_xs, is_acc> {
+    static void apply(float * RESTRICT C, int ldc, const int32_t * RESTRICT tile, const block_q8_K * A, int lda, const void * packed_B, int nr) {
+        const int8_t * scales = reinterpret_cast<const int8_t *>((const char *)packed_B + (QK_K / 2) * TILE_N);
+        const ggml_half * d0 = reinterpret_cast<const ggml_half *>(scales + 8 * TILE_N);
+
+        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)d0));
+
+        for (int m = 0; m < nr; ++m) {
+            const float d1 = A[m * lda].d;
+            const __m512 vd = _mm512_mul_ps(_mm512_set1_ps(d1), vd0);
+            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
+
+            __m512 vsum;
+            if (is_acc) {
+                vsum = _mm512_loadu_ps(C + m * ldc);
+            } else {
+                vsum = _mm512_set1_ps(0.f);
+            }
+
+            vsum = _mm512_fmadd_ps(vtile, vd, vsum);
+            _mm512_storeu_ps(C + m * ldc, vsum);
+        }
+    }
+};
+
+template <typename TB> constexpr int get_quants_size();
+template <> constexpr int get_quants_size<block_q4_K>() { return (QK_K / 2) * TILE_N; }
+template <> constexpr int get_quants_size<block_q5_K>() { return (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N; }
+template <> constexpr int get_quants_size<block_q6_K>() { return (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N; }
+template <> constexpr int get_quants_size<block_iq4_xs>() { return (QK_K / 2) * TILE_N; }
+
+// used for QKK format
+template <typename TB, bool is_acc,
+          typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
+inline void scale_C(const int32_t * RESTRICT tile, int32_t * RESTRICT sumi, const void * packed_B, int k, int nr) {
+    const uint8_t * scales = reinterpret_cast<const uint8_t *>((const char *)packed_B + get_quants_size<TB>());
+    const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(scales + k * TILE_N)));
+
+    for (int m = 0; m < nr; ++m) {
+        __m512i vsumi;
+        if (is_acc) {
+            vsumi = _mm512_loadu_si512(sumi + m * TILE_N);
+        } else {
+            vsumi = _mm512_setzero_si512();
+        }
+        __m512i vtile = _mm512_loadu_si512(tile + m * TILE_N);
+        vsumi = _mm512_add_epi32(vsumi, _mm512_mullo_epi32(vtile, vscale));
+        _mm512_storeu_si512((__m512i *)(sumi + m * TILE_N), vsumi);
+    }
+}
+
+template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_avx {
+    static void apply(int K, const TA * RESTRICT A, const TB * RESTRICT B, TC * RESTRICT C, int ldc) {
+        GGML_UNUSED(K);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        GGML_UNUSED(C);
+        GGML_UNUSED(ldc);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int K, const float * RESTRICT A, const ggml_fp16_t * RESTRICT B, float * RESTRICT C, int ldc) {
+        constexpr int ROWS = BLOCK_M;
+        constexpr int COLS = BLOCK_N;
+        assert(BLOCK_K == 16);
+
+        __m512 va;
+        __m512 vb[COLS];
+        __m512 vc[ROWS * COLS];
+
+        auto loadc = [&](auto idx) {
+            vc[idx] = _mm512_setzero_ps();
+        };
+        Unroll<ROWS * COLS>{}(loadc);
+
+        auto compute = [&](auto idx, auto k) {
+            constexpr int row = idx / COLS;
+            constexpr int col = idx % COLS;
+
+            if constexpr (col == 0) {
+                va = _mm512_loadu_ps(A + row * K + k);
+            }
+            if constexpr (row == 0) {
+                vb[col] =  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
+            }
+            vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
+        };
+
+        for (int k = 0; k < K; k += 16) {
+            Unroll<ROWS * COLS>{}(compute, k);
+        }
+
+        auto storec = [&](auto idx) {
+            constexpr int row = idx / COLS;
+            constexpr int col = idx % COLS;
+            C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
+        };
+        Unroll<ROWS * COLS>{}(storec);
+    }
+};
+
+#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE)                                \
+    tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply(    \
+        K, (const float *)src1->data + mb_start * K,                                \
+        (const type *)src0->data + nb_start * K,                                    \
+        (float *)dst->data + mb_start * ldc + nb_start, ldc);
+
+
+// re-organize in the format {NB, KB, TILE_SIZE}:
+#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
+
+template<typename TB, int BLOCK_K>
+void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
+    const int NB = N / TILE_N;
+    const int KB = K / BLOCK_K;
+    const int TILE_SIZE = get_tile_size<TB>();
+
+    // parallel on NB should be enough
+    parallel_for(NB, [&](int begin, int end) {
+        for (int n = begin; n < end; ++n) {
+            for (int k = 0; k < KB; ++k) {
+                int n0 = n * TILE_N;
+                pack_B((char *)packed_B + PACKED_INDEX(n, k, KB, TILE_SIZE), &B[n0 * KB + k], KB);
+            }
+        }
+    });
+}
+
+template <typename TA, typename TB, typename TC, int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni {};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q4_0);
+
+        const block_q8_0 * RESTRICT A = static_cast<const block_q8_0 *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        __m512i va[8];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // sum of offsets, shared across COLS
+        //
+        // avx512-vnni does not have `_mm512_dpbssd_epi32`,
+        // need to transfrom ss to us:
+        //   a * (b - 8) is equavilent to b * a - 8 * a
+        //   s    u   u                   u   s   u   s
+        //
+        __m512i vcomp;
+
+        const __m512i off = _mm512_set1_epi8(8);
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](auto col, auto i) {
+            // load a and compute compensation
+            if constexpr (col == 0) {
+                const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
+                vcomp = _mm512_setzero_si512();
+                for (int k = 0; k < 8; ++k) {
+                    va[k] = _mm512_set1_epi32(a_ptr[k]);
+                    vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
+                }
+                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            }
+
+            // load b
+            __m512i vsum = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            for (int k = 0; k < 8; k += 2) {
+                __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32));
+                __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                vsum = _mm512_dpbusd_epi32(vsum, vb0, va[k + 0]);
+                __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                vsum = _mm512_dpbusd_epi32(vsum, vb1, va[k + 1]);
+            }
+            const int offset = TILE_N * TILE_K / 2;
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+            vsum = _mm512_sub_epi32(vsum, vcomp);
+
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](auto col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q4_1);
+
+        const block_q8_1 * RESTRICT A = static_cast<const block_q8_1 *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        __m512i va[8];
+        __m512i vb[8];
+        __m512 vc[COLS];
+        __m512 vd1, vs1;
+
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](auto col, auto i) {
+            // load a
+            if constexpr (col == 0) {
+                const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
+                for (int k = 0; k < 8; ++k) {
+                    va[k] = _mm512_set1_epi32(a_ptr[k]);
+                }
+                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+                vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
+            }
+
+            // load b
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            for (int k = 0; k < 8; k += 2) {
+                __m512i bytes = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 32));
+                vb[k + 0] = _mm512_and_si512(bytes, lowMask);
+                vb[k + 1] = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+            }
+            const int offset = TILE_N * TILE_K / 2;
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+            const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset + TILE_N * sizeof(ggml_half))));
+
+            __m512i vsum = _mm512_setzero_si512();
+            for (int k = 0; k < 8; ++k) {
+                vsum = _mm512_dpbusd_epi32(vsum, vb[k], va[k]);
+            }
+
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
+            vc[col] = _mm512_fmadd_ps(vm0, vs1, vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](auto col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q8_0) + TILE_N * sizeof(int32_t);
+
+        const block_q8_0 * RESTRICT A = static_cast<const block_q8_0 *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        __m512i va[8];
+        __m512i vb[8];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // Notes: s8s8 igemm compensation in avx512-vnni
+        // change s8s8 to u8s8 with compensate
+        //   a * b = (a + 128) * b - 128 * b
+        //   s   s       u       s    u    s
+        //
+        // (128 * b is pre-computed when packing B to vnni formats)
+        //
+        const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](auto col, auto i) {
+            // load a and add offset 128
+            if constexpr (col == 0) {
+                const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
+                for (int k = 0; k < 8; ++k) {
+                    va[k] = _mm512_set1_epi32(a_ptr[k]);
+                    va[k] = _mm512_add_epi8(va[k], off);
+                }
+                vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            }
+
+            // load b
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            for (int k = 0; k < 8; ++k) {
+                vb[k] = _mm512_loadu_si512((const __m512i *)(b_ptr + k * 64));
+            }
+            const int offset = TILE_N * TILE_K;
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset)));
+            const int offset2 = TILE_N * TILE_K + TILE_N * sizeof(ggml_half);
+            const __m512i vcomp = _mm512_loadu_si512((const __m512i *)(b_ptr + offset2));
+
+            __m512i vsum = _mm512_setzero_si512();
+            for (int k = 0; k < 8; ++k) {
+                vsum = _mm512_dpbusd_epi32(vsum, va[k], vb[k]);
+            }
+            vsum = _mm512_sub_epi32(vsum, vcomp);
+
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(vsum), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](auto col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q4_K) + TILE_N * 4;
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // a.qs:   8 groups, 32 bytes each group (m256i)
+        __m512i va[8];
+        // a.bsum: 8 groups,  2 bytes each group (m128i)
+        __m512i va_bsum;
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_scales = (QK_K / 2) * TILE_N;
+        const int offset_mins   = (QK_K / 2) * TILE_N +  8 * TILE_N;
+        const int offset_d0     = (QK_K / 2) * TILE_N + 16 * TILE_N;
+        const int offset_dmin   = (QK_K / 2) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);
+
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        // Notes: vnni formats in QK_K
+        //   a) quants vnni format
+        //     int8  {k/4, n, 4}, viewed as 2d {k/4, 4n}, k = 32
+        //     from {16, 32} to {8, 64}
+        //
+        //   b) min vnni format
+        //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
+        //     from {16,  8} to {4, 32}
+        //
+        auto compute = [&](auto col, auto i) {
+            // load a
+            if constexpr (col == 0) {
+                for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                    va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
+                }
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+                va_bsum = _mm512_castsi128_si512(q8s);
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // step 1: accumultate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs  = b_ptr;
+            for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                __m512i vsum = _mm512_setzero_si512();
+                for (int k = 0; k < 8; k += 2) {
+                    __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
+                    __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);
+
+                    __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs);
+                    __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                    __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+
+                    b_qs += 64;
+                }
+                // vacc += scale * (q8 @ q4)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+
+            // step 2: accumulate the mins
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+            const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin)));
+            vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](auto col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q5_K) + TILE_N * 4;
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // a.qs:   8 groups, 32 bytes each group (m256i)
+        __m512i va[8];
+        // a.bsum: 8 groups,  2 bytes each group (m128i)
+        __m512i va_bsum;
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_qh     = (QK_K / 2) * TILE_N;
+        const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N;
+        const int offset_mins   = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N +  8 * TILE_N;
+        const int offset_d0     = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N;
+        const int offset_dmin   = (QK_K / 2) * TILE_N + (QK_K / 8) * TILE_N + 16 * TILE_N + TILE_N * sizeof(ggml_half);
+
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        // Q5_K and Q4_K shares the same vnni formats, refer to notes above.
+        auto compute = [&](auto col, auto i) {
+            // load a
+            if constexpr (col == 0) {
+                for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                    va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
+                }
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+                va_bsum = _mm512_castsi128_si512(q8s);
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // step 1: accumultate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs  = b_ptr;
+            const char * b_qh  = b_ptr + offset_qh;
+            for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                __m512i vsum = _mm512_setzero_si512();
+                __m512i hmask0 = _mm512_set1_epi8(0x1);
+                __m512i hmask1 = _mm512_set1_epi8(0x2);
+                __m512i hbits = _mm512_loadu_si512((const __m512i *)(b_qh + k_group * 64));
+                for (int k = 0; k < 8; k += 2) {
+                    __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 0), va[k_group]);
+                    __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(k + 1), va[k_group]);
+
+                    __m512i bytes = _mm512_loadu_si512((const __m512i *)b_qs);
+                    __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                    __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+
+                    __m512i vh0 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask0), k), 4);
+                    __m512i vh1 = _mm512_slli_epi16(_mm512_srli_epi16(_mm512_and_si512(hbits, hmask1), k + 1), 4);
+
+                    hmask0 = _mm512_slli_epi16(hmask0, 2);
+                    hmask1 = _mm512_slli_epi16(hmask1, 2);
+                    vb0 = _mm512_add_epi8(vb0, vh0);
+                    vb1 = _mm512_add_epi8(vb1, vh1);
+
+                    vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+
+                    b_qs += 64;
+                }
+                // vacc += scale * (q8 @ q5)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+
+            // step 2: accumulate the mins
+            __m512i acc_m = _mm512_setzero_si512();
+            for (int k = 0; k < 4; ++k) {
+                __m512i vmask = _mm512_set1_epi32(k);
+                __m512i va = _mm512_permutexvar_epi32(vmask, va_bsum);
+                __m512i vb = _mm512_cvtepi8_epi16(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_mins + k * 32)));
+                acc_m = _mm512_dpwssds_epi32(acc_m, va, vb);
+            }
+            const __m512 vdmin = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_dmin)));
+            vc[col] = _mm512_fnmadd_ps(_mm512_cvtepi32_ps(acc_m), _mm512_mul_ps(vdmin, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](auto col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_q6_K);
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // load the 256 bytes from A to 4 avx512 vectors
+        __m512i va[4];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_qh     = (QK_K / 2) * TILE_N;
+        const int offset_scales = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N;
+        const int offset_d0     = (QK_K / 2) * TILE_N + (QK_K / 4) * TILE_N + 16 * TILE_N;
+
+        // compensation
+        __m512i vcomp;
+
+        const __m512i m32s = _mm512_set1_epi32(32);
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](auto col, auto i) {
+            if constexpr (col == 0) {
+                // load a
+                va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0));
+                va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64));
+                va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128));
+                va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192));
+
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                vcomp = _mm512_mullo_epi32(_mm512_cvtepi16_epi32(q8sums), m32s);
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // accmulate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs = b_ptr;
+            const char * b_qh = b_ptr + offset_qh;
+            int mask = 0;
+            for (int k_group = 0; k_group < QK_K / 16; ++k_group) {
+                int r = k_group >> 2;
+                __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+                __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+
+                __m512i vsum = _mm512_setzero_si512();
+                __m512i hmask = _mm512_set1_epi8(0x3);
+
+                __m512i bytes = _mm512_loadu_si512(b_qs);
+                __m512i hbits = _mm512_loadu_si512(b_qh);
+                __m512i vb0 = _mm512_and_si512(bytes, lowMask);
+                __m512i vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                __m512i vh0 = _mm512_slli_epi16(_mm512_and_si512(hbits, hmask), 4);
+                __m512i vh1 = _mm512_slli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 2)), 2);
+
+                vb0 = _mm512_add_epi8(vb0, vh0);
+                vb1 = _mm512_add_epi8(vb1, vh1);
+                vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+                b_qs += 64;
+
+                va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+                va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+
+                bytes = _mm512_loadu_si512(b_qs);
+                vb0 = _mm512_and_si512(bytes, lowMask);
+                vb1 = _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask);
+                vh0 =                   _mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 4));
+                vh1 = _mm512_srli_epi16(_mm512_and_si512(hbits, _mm512_slli_epi16(hmask, 6)), 2);
+                vb0 = _mm512_add_epi8(vb0, vh0);
+                vb1 = _mm512_add_epi8(vb1, vh1);
+                vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+                b_qs += 64;
+                b_qh += 64;
+
+                // B * A - 32 * A
+                __m512i vmask = _mm512_set1_epi32(k_group);
+                vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));
+
+                // vacc += scale * (q8 @ q6)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](int col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+template <int BLOCK_M, int BLOCK_N, int BLOCK_K>
+struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, BLOCK_K> {
+    static void apply(int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+
+        constexpr int COLS = BLOCK_N / 16;
+        const int TILE_SIZE = TILE_N * sizeof(block_iq4_xs) + TILE_N * 2;
+
+        const block_q8_K * RESTRICT A = static_cast<const block_q8_K *>(_A);
+        const char * RESTRICT B = static_cast<const char *>(_B);
+
+        // load the 256 bytes from A to 4 avx512 vectors
+        __m512i va[4];
+        __m512 vc[COLS];
+        __m512 vd1;
+
+        // packed_B:
+        const int offset_scales = (QK_K / 2) * TILE_N ;
+        const int offset_d0     = (QK_K / 2) * TILE_N + 8 * TILE_N;
+
+        // compensation
+        __m512i vcomp;
+
+        const __m256i m128s = _mm256_set1_epi16(128);
+        const __m512i lowMask = _mm512_set1_epi8(0xF);
+
+        const __m512i values128 = _mm512_set_epi8(
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127,
+            113, 89, 69, 53, 38, 25, 13, 1, -10, -22, -35, -49, -65, -83, -104, -127
+        );
+        const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+        const __m512i values256 = _mm512_add_epi8(values128, off);
+
+        auto loadc = [&](auto col) {
+            vc[col] = _mm512_setzero_ps();
+        };
+        Unroll<COLS>{}(loadc);
+
+        auto compute = [&](auto col, auto i) {
+            if constexpr (col == 0) {
+                // load a
+                va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0));
+                va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64));
+                va[2] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 128));
+                va[3] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 192));
+
+                // compensation: 128 * A
+                const __m256i q8sums = _mm256_loadu_si256((const __m256i *)A[0 * KB + i].bsums);
+                vcomp = _mm512_castsi256_si512(_mm256_madd_epi16(q8sums, m128s));
+                vd1 = _mm512_set1_ps(A[0 * KB + i].d);
+            }
+
+            // accmulate the quants
+            __m512i acc = _mm512_setzero_si512();
+            const char * b_ptr = B + PACKED_INDEX(col, i, KB, TILE_SIZE);
+            const char * b_qs = b_ptr;
+            int mask = 0;
+            for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
+                int r = k_group >> 1;
+                __m512i vmask = _mm512_set1_epi32(k_group);
+                __m512i vsum = _mm512_setzero_si512();
+                for (int k = 0; k < 8; k += 2) {
+                    __m512i va0 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+                    __m512i va1 = _mm512_permutexvar_epi32(_mm512_set1_epi32(mask++), va[r]);
+
+                    __m512i bytes = _mm512_loadu_si512(b_qs);
+                    __m512i vb0 = _mm512_shuffle_epi8(values256, _mm512_and_si512(bytes, lowMask));
+                    __m512i vb1 = _mm512_shuffle_epi8(values256, _mm512_and_si512(_mm512_srli_epi16(bytes, 4), lowMask));
+
+                    vsum = _mm512_dpbusd_epi32(vsum, vb0, va0);
+                    vsum = _mm512_dpbusd_epi32(vsum, vb1, va1);
+                    b_qs += 64;
+                }
+                // (B + 128) * A - 128 * A
+                vsum = _mm512_sub_epi32(vsum, _mm512_permutexvar_epi32(vmask, vcomp));
+
+                // vacc += scale * (q8 @ q4)
+                const __m512i vscale = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i *)(b_ptr + offset_scales + k_group * TILE_N)));
+                acc = _mm512_add_epi32(acc, _mm512_mullo_epi32(vsum, vscale));
+            }
+            const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(b_ptr + offset_d0)));
+            vc[col] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(acc), _mm512_mul_ps(vd0, vd1), vc[col]);
+        };
+
+        for (int i = 0; i < KB; ++i) {
+            Unroll<COLS>{}(compute, i);
+        }
+
+        //store to C
+        auto storec = [&](auto col) {
+            _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
+        };
+        Unroll<COLS>{}(storec);
+    }
+};
+
+#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE)                                         \
+    tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply(   \
+        KB, (const char *)wdata + 0 * row_size_A,                                    \
+        (const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE),     \
+        (float *) dst->data + 0 * N + nb_start, ldc)
+
+template <typename TA, typename TB, typename TC, int BLOCK_K,
+          typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
+void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, TC * RESTRICT C, int ldc) {
+    using packed_B_t = packed_B_type<TB>;
+    const int TILE_SIZE = get_tile_size<TB>();
+    const bool need_unpack = do_unpack<TB>::value;
+
+    GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
+    const TA * RESTRICT A = static_cast<const TA *>(_A);
+    const char * RESTRICT B = static_cast<const char *>(_B);
+
+    const int m0 = std::min(M, TILE_M);
+    const int m1 = std::max(M - TILE_M, 0);
+    const int lda = KB * sizeof(TA);
+    //const int ldb = KB * sizeof(TB);
+
+    static thread_local packed_B_t Tile0[TILE_N * TILE_K];
+    static thread_local packed_B_t Tile1[TILE_N * TILE_K];
+    static thread_local int8_t Tile23[TILE_M * TILE_K];
+
+    static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
+    static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
+
+    // double buffering C to interleave avx512 and amx
+    int32_t * C_cur = TileC0;
+    int32_t * C_pre = TileC1;
+
+    auto Tile4 = [&](int32_t * base) { return base; };
+    auto Tile5 = [&](int32_t * base) { return base + TILE_M * TILE_N; };
+    auto Tile6 = [&](int32_t * base) { return base + 2 * TILE_M * TILE_N; };
+    auto Tile7 = [&](int32_t * base) { return base + 3 * TILE_M * TILE_N; };
+
+    if (M == 2 * TILE_M) {
+        // i = 0
+        const char * B_blk0 = B + PACKED_INDEX(0, 0, KB, TILE_SIZE);
+        const char * B_blk1 = B + PACKED_INDEX(1, 0, KB, TILE_SIZE);
+        if (need_unpack) {
+            unpack_B<TB>(Tile0, B_blk0);
+            _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+        } else {
+            _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
+        }
+
+        _tile_zero(TMM4);
+        _tile_loadd(TMM2, A[0].qs, lda);
+        _tile_dpbssd(TMM4, TMM2, TMM0);
+        _tile_stored(TMM4, Tile4(C_pre), TILE_N * sizeof(int32_t));
+
+        _tile_zero(TMM5);
+        _tile_loadd(TMM3, A[TILE_M * KB + 0].qs, lda);
+        _tile_dpbssd(TMM5, TMM3, TMM0);
+        _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));
+
+        if (need_unpack) {
+            unpack_B<TB>(Tile1, B_blk0);
+            _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+        } else {
+            _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
+        }
+
+        _tile_zero(TMM6);
+        _tile_dpbssd(TMM6, TMM2, TMM1);
+        _tile_stored(TMM6, Tile6(C_pre), TILE_N * sizeof(int32_t));
+
+        _tile_zero(TMM7);
+        _tile_dpbssd(TMM7, TMM3, TMM1);
+        _tile_stored(TMM7, Tile7(C_pre), TILE_N * sizeof(int32_t));
+
+        for (int i = 1; i < KB; ++i) {
+            // index of previous iter
+            const int ii = i - 1;
+            const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
+            const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
+            GGML_DISPATCH_BOOL(ii > 0, is_acc, [&] {
+                if (need_unpack) {
+                    unpack_B<TB>(Tile0, B_blk0);
+                    _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+                } else {
+                    _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
+                }
+                _tile_zero(TMM4);
+                _tile_loadd(TMM2, A[i].qs, lda);
+                acc_C<TA, TB, is_acc>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM4, TMM2, TMM0);
+                _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));
+
+                _tile_zero(TMM5);
+                _tile_loadd(TMM3, A[TILE_M * KB + i].qs, lda);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM5, TMM3, TMM0);
+                _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));
+
+                if (need_unpack) {
+                    unpack_B<TB>(Tile1, B_blk1);
+                    _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+                } else {
+                    _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
+                }
+                _tile_zero(TMM6);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM6, TMM2, TMM1);
+                _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));
+
+                _tile_zero(TMM7);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+
+                _tile_dpbssd(TMM7, TMM3, TMM1);
+                _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));
+
+                std::swap(C_cur, C_pre);
+            });
+        }
+        // final accumulation
+        {
+            int ii = KB - 1;
+            acc_C<TA, TB, true>::apply(C, ldc, Tile4(C_pre), &A[ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+            acc_C<TA, TB, true>::apply(C + TILE_M * ldc, ldc, Tile5(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(0, ii, KB, TILE_SIZE), TILE_M);
+            acc_C<TA, TB, true>::apply(C + TILE_N, ldc, Tile6(C_pre), &A[ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+            acc_C<TA, TB, true>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_pre), &A[TILE_M * KB + ii], KB, B + PACKED_INDEX(1, ii, KB, TILE_SIZE), TILE_M);
+        }
+    } else {
+        for (int i = 0; i < KB; ++i) {
+            _tile_zero(TMM4);
+            _tile_zero(TMM6);
+            if (m1 != 0) {
+                _tile_zero(TMM5);
+                _tile_zero(TMM7);
+            }
+
+            const char * B_blk0 = B + PACKED_INDEX(0, i, KB, TILE_SIZE);
+            const char * B_blk1 = B + PACKED_INDEX(1, i, KB, TILE_SIZE);
+            if (need_unpack) {
+                unpack_B<TB>(Tile0, B_blk0);
+                _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+            } else {
+                _tile_loadd(TMM0, B_blk0, TILE_N * VNNI_BLK);
+            }
+
+            if (need_unpack) {
+                unpack_B<TB>(Tile1, B_blk1);
+                _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+            } else {
+                _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
+            }
+
+            if (m0 == TILE_M) {
+                _tile_loadd(TMM2, A[i].qs, lda);
+            } else {
+                unpack_A(Tile23, &A[i], KB, m0);
+                _tile_loadd(TMM2, Tile23, TILE_K);
+            }
+
+            _tile_dpbssd(TMM4, TMM2, TMM0);
+            _tile_dpbssd(TMM6, TMM2, TMM1);
+
+            _tile_stored(TMM4, Tile4(C_cur), TILE_N * sizeof(int32_t));
+            _tile_stored(TMM6, Tile6(C_cur), TILE_N * sizeof(int32_t));
+
+            GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
+                acc_C<TA, TB, is_acc>::apply(C,          ldc, Tile4(C_cur), &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Tile6(C_cur), &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0);
+            });
+
+            if (m1 != 0) {
+                unpack_A(Tile23, &A[TILE_M * KB + i], KB, m1);
+                _tile_loadd(TMM3, Tile23, TILE_K);
+
+                _tile_dpbssd(TMM5, TMM3, TMM0);
+                _tile_dpbssd(TMM7, TMM3, TMM1);
+                _tile_stored(TMM5, Tile5(C_cur), TILE_N * sizeof(int32_t));
+                _tile_stored(TMM7, Tile7(C_cur), TILE_N * sizeof(int32_t));
+                GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
+                    acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc,          ldc, Tile5(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
+                    acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Tile7(C_cur), &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
+                });
+            }
+        }
+    }
+    return;
+}
+
+template <typename TA, typename TB, typename TC, int BLOCK_K,
+          typename std::enable_if<is_type_qkk<TB>::value, int>::type = 0>
+void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const void * RESTRICT _B, float * RESTRICT C, int ldc) {
+    static_assert(std::is_same<TA, block_q8_K>::value);
+    const int TILE_SIZE = get_tile_size<TB>();
+
+    GGML_ASSERT(M <= 2 * TILE_M && N == 2 * TILE_N);
+    const TA * RESTRICT A = static_cast<const TA *>(_A);
+    const char * RESTRICT B = static_cast<const char *>(_B);
+
+    const int m0 = std::min(M, TILE_M);
+    const int m1 = std::max(M - TILE_M, 0);
+    //const int lda = KB * sizeof(TA);
+
+    static thread_local int8_t Tile0[TILE_N * TILE_K];
+    static thread_local int8_t Tile1[TILE_N * TILE_K];
+    static thread_local int8_t Tile23[TILE_M * TILE_K];
+
+    // mat mul result for each group
+    static thread_local int32_t Tile4[TILE_M * TILE_N];
+    static thread_local int32_t Tile5[TILE_M * TILE_N];
+    static thread_local int32_t Tile6[TILE_M * TILE_N];
+    static thread_local int32_t Tile7[TILE_M * TILE_N];
+
+    // sum of each QK_K block, contains 8 groups, int32
+    static thread_local int32_t Sumi4[TILE_M * TILE_N];
+    static thread_local int32_t Sumi5[TILE_M * TILE_N];
+    static thread_local int32_t Sumi6[TILE_M * TILE_N];
+    static thread_local int32_t Sumi7[TILE_M * TILE_N];
+
+    const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
+    for (int i = 0; i < KB; ++i) {
+        // step 1: accumulate the quants across 8 groups, each group with 32
+        for (int k = 0; k < QK_K / k_group_size; ++k) {
+            GGML_DISPATCH_BOOL(k > 0, is_acc, [&] {
+                _tile_zero(TMM4);
+                _tile_zero(TMM6);
+
+                unpack_B<TB>(Tile0, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k);
+                _tile_loadd(TMM0, Tile0, TILE_N * VNNI_BLK);
+
+                unpack_B<TB>(Tile1, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k);
+                _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
+
+                unpack_A<TB>(Tile23, &A[i], KB, k, m0);
+                _tile_loadd(TMM2, Tile23, TILE_K);
+
+                _tile_dpbssd(TMM4, TMM2, TMM0);
+                _tile_dpbssd(TMM6, TMM2, TMM1);
+
+                _tile_stored(TMM4, Tile4, TILE_N * sizeof(int32_t));
+                _tile_stored(TMM6, Tile6, TILE_N * sizeof(int32_t));
+
+                scale_C<TB, is_acc>(Tile4, Sumi4, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m0);
+                scale_C<TB, is_acc>(Tile6, Sumi6, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m0);
+
+                if (m1 != 0) {
+                    _tile_zero(TMM5);
+                    _tile_zero(TMM7);
+
+                    unpack_A<TB>(Tile23, &A[TILE_M * KB + i], KB, k, m1);
+                    _tile_loadd(TMM3, Tile23, TILE_K);
+
+                    _tile_dpbssd(TMM5, TMM3, TMM0);
+                    _tile_dpbssd(TMM7, TMM3, TMM1);
+
+                    _tile_stored(TMM5, Tile5, TILE_N * sizeof(int32_t));
+                    _tile_stored(TMM7, Tile7, TILE_N * sizeof(int32_t));
+
+                    scale_C<TB, is_acc>(Tile5, Sumi5, B + PACKED_INDEX(0, i, KB, TILE_SIZE), k, m1);
+                    scale_C<TB, is_acc>(Tile7, Sumi7, B + PACKED_INDEX(1, i, KB, TILE_SIZE), k, m1);
+                }
+            });
+        }
+
+        // step 2: accmulate the mins
+        GGML_DISPATCH_BOOL(i > 0, is_acc, [&] {
+            acc_C<TA, TB, is_acc>::apply(C,          ldc, Sumi4, &A[i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m0);
+            acc_C<TA, TB, is_acc>::apply(C + TILE_N, ldc, Sumi6, &A[i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m0);
+            if (m1 != 0) {
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc,          ldc, Sumi5, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(0, i, KB, TILE_SIZE), m1);
+                acc_C<TA, TB, is_acc>::apply(C + TILE_M * ldc + TILE_N, ldc, Sumi7, &A[TILE_M * KB + i], KB, B + PACKED_INDEX(1, i, KB, TILE_SIZE), m1);
+            }
+        });
+    }
+    return;
+}
+
+} // anonymous namespace
+
+// get the packed tensor size for quantized weights
+size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) {
+    const enum ggml_type TYPE = tensor->type;
+
+    const int K = tensor->ne[0]; // ne0: in_features
+    const int N = tensor->ne[1]; // ne1: out_features
+
+    auto get_tensor_size = [&] {
+        size_t row_size_B{0};
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            row_size_B = get_row_size<type, blck_size>(K);
+        });
+        return N * row_size_B;
+    };
+
+    if (qtype_has_amx_kernels(TYPE)) {
+        return get_tensor_size();
+    } else {
+        // for f16, bf16 we don't do packing
+        return ggml_nbytes(tensor);
+    }
+}
+
+// pack weight to vnni format
+void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now
+
+    const enum ggml_type TYPE = tensor->type;
+
+    const int K = tensor->ne[0]; // ne0: in_features
+    const int N = tensor->ne[1]; // ne1: out_features
+
+    GGML_DISPATCH_QTYPES(TYPE, [&] {
+        convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
+    });
+}
+
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
+    struct ggml_tensor * src0 = dst->src[0];
+
+    const enum ggml_type TYPE = src0->type;
+
+    const bool is_floating_type = TYPE == GGML_TYPE_F16;
+    if (is_floating_type) {
+        return 0;
+    }
+
+    const int M = dst->ne[1];
+    const int K = src0->ne[0];
+
+    size_t desired_wsize = 0;
+
+    GGML_DISPATCH_QTYPES(TYPE, [&] {
+        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+        desired_wsize = M * row_size_A;
+    });
+
+    return desired_wsize;
+}
+
+// NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX)
+//
+// src0: weight in shape of {N, K}, quantized
+// src1: input  in shape of {M, K}, float32
+// dst:  output in shape of {M, N}, float32
+//
+// the function performs: dst = src1 @ src0.T
+//
+void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
+    struct ggml_tensor * src0 = dst->src[0];
+    struct ggml_tensor * src1 = dst->src[1];
+
+    const enum ggml_type TYPE = src0->type;
+
+    // f16 only has avx512 kernels for now,
+    // amx kernels will be added once 6th gen xeon is released.
+    const bool is_floating_type = TYPE == GGML_TYPE_F16;
+
+    const int M = dst->ne[1];
+    const int N = dst->ne[0];
+    const int K = src0->ne[0];
+    const int ldc = dst->nb[1] / dst->nb[0];
+
+    if (is_floating_type) {
+        constexpr int BLOCK_M = 4;
+        constexpr int BLOCK_N = 6;
+        const int MB = div_up(M, BLOCK_M);
+        const int NB = div_up(N, BLOCK_N);
+
+        parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
+            GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
+                for (int i = begin; i < end; ++i) {
+                    int mb = i / NB;
+                    int nb = i % NB;
+
+                    int mb_start = mb * BLOCK_M;
+                    int mb_size = std::min(BLOCK_M, M - mb_start);
+                    int nb_start = nb * BLOCK_N;
+                    int nb_size = std::min(BLOCK_N, N - nb_start);
+
+                    switch (mb_size << 4 | nb_size) {
+                        case 0x12: LAUNCH_TINYGEMM_KERNEL_AVX(1, 2); break;
+                        case 0x14: LAUNCH_TINYGEMM_KERNEL_AVX(1, 4); break;
+                        case 0x16: LAUNCH_TINYGEMM_KERNEL_AVX(1, 6); break;
+                        case 0x22: LAUNCH_TINYGEMM_KERNEL_AVX(2, 2); break;
+                        case 0x24: LAUNCH_TINYGEMM_KERNEL_AVX(2, 4); break;
+                        case 0x26: LAUNCH_TINYGEMM_KERNEL_AVX(2, 6); break;
+                        case 0x32: LAUNCH_TINYGEMM_KERNEL_AVX(3, 2); break;
+                        case 0x34: LAUNCH_TINYGEMM_KERNEL_AVX(3, 4); break;
+                        case 0x36: LAUNCH_TINYGEMM_KERNEL_AVX(3, 6); break;
+                        case 0x42: LAUNCH_TINYGEMM_KERNEL_AVX(4, 2); break;
+                        case 0x44: LAUNCH_TINYGEMM_KERNEL_AVX(4, 4); break;
+                        case 0x46: LAUNCH_TINYGEMM_KERNEL_AVX(4, 6); break;
+                        default: fprintf(stderr, "Unexpected block size!\n");
+                    }
+                }
+            });
+        });
+        return;
+    }
+
+    // pointer to work space, used convert A from float to quantized type
+    void * wdata = params->wdata;
+
+    //TODO: performance improvement: merge quant A
+    if (params->ith == 0) {
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+            const size_t desired_wsize = M * row_size_A;
+            if (params->wsize < desired_wsize) {
+                GGML_ABORT("insufficient work space size");
+            }
+
+            // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
+            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
+            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
+
+            const float * A_data = static_cast<const float *>(src1->data);
+            for (int m = 0; m < M; ++m) {
+                from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
+            }
+        });
+    }
+
+    ggml_barrier(params->threadpool);
+
+    if (M == 1) {
+        // MB = 1 and handle 8 tiles in each block
+        constexpr int kTilesN = 4;
+        constexpr int BLOCK_N = TILE_N * kTilesN;
+        const int NB = div_up(N, BLOCK_N);
+
+        parallel_for_ggml(params, NB, [&](int begin, int end) {
+            GGML_DISPATCH_QTYPES(TYPE, [&] {
+                const int KB = K / blck_size;
+                const int TILE_SIZE = get_tile_size<type>();
+                const int row_size_A = KB * sizeof(vec_dot_type);
+                for (int i = begin; i < end; ++i) {
+                    int nb = i;
+                    int nb_start = nb * BLOCK_N;
+                    int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96
+
+                    switch (nb_size) {
+                        //case 160: LAUNCH_TINYGEMM_KERNEL_VNNI(160); break;
+                        case 128: LAUNCH_TINYGEMM_KERNEL_VNNI(128); break;
+                        case 96: LAUNCH_TINYGEMM_KERNEL_VNNI(96); break;
+                        case 64: LAUNCH_TINYGEMM_KERNEL_VNNI(64); break;
+                        case 32: LAUNCH_TINYGEMM_KERNEL_VNNI(32); break;
+                        default: fprintf(stderr, "Unexpected n block size!\n");
+                    }
+                }
+            });
+        });
+        return;
+    }
+
+    // handle 4 tiles at a tile
+    constexpr int BLOCK_M = TILE_M * 2;
+    constexpr int BLOCK_N = TILE_N * 2;
+    const int MB = div_up(M, BLOCK_M);
+    const int NB = div_up(N, BLOCK_N);
+
+    parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
+        // init tile config for each thread
+        ggml_tile_config_init();
+
+        GGML_DISPATCH_QTYPES(TYPE, [&] {
+            const int KB = K / blck_size;
+            const int TILE_SIZE = get_tile_size<type>();
+            const int row_size_A = KB * sizeof(vec_dot_type);
+
+            for (int i = begin; i < end; ++i) {
+                int mb = i / NB;
+                int nb = i % NB;
+
+                int mb_start = mb * BLOCK_M;
+                int mb_size = std::min(BLOCK_M, M - mb_start);
+                int nb_start = nb * BLOCK_N;
+                int nb_size = BLOCK_N;
+
+                tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
+                    mb_size, nb_size, KB,
+                    (const char *)wdata + mb_start * row_size_A,
+                    (const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
+                    (float *) dst->data + mb_start * N + nb_start, ldc);
+            }
+        });
+    });
+}
+
+#endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
diff --git a/ggml/src/ggml-cpu/amx/mmq.h b/ggml/src/ggml-cpu/amx/mmq.h
new file mode 100644
index 000000000..baf768477
--- /dev/null
+++ b/ggml/src/ggml-cpu/amx/mmq.h
@@ -0,0 +1,10 @@
+#pragma once
+#include "common.h"
+
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
+
+size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
+
+void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/cmake/FindSIMD.cmake b/ggml/src/ggml-cpu/cmake/FindSIMD.cmake
similarity index 94%
rename from cmake/FindSIMD.cmake
rename to ggml/src/ggml-cpu/cmake/FindSIMD.cmake
index 33377ec44..5533668ec 100644
--- a/cmake/FindSIMD.cmake
+++ b/ggml/src/ggml-cpu/cmake/FindSIMD.cmake
@@ -79,22 +79,22 @@ endmacro()
 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
-    set(LLAMA_AVX OFF)
+    set(GGML_AVX OFF)
 else()
-    set(LLAMA_AVX ON)
+    set(GGML_AVX ON)
 endif()
 
 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-    set(LLAMA_AVX2 OFF)
+    set(GGML_AVX2 OFF)
 else()
-    set(LLAMA_AVX2 ON)
+    set(GGML_AVX2 ON)
 endif()
 
 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
-    set(LLAMA_AVX512 OFF)
+    set(GGML_AVX512 OFF)
 else()
-    set(LLAMA_AVX512 ON)
+    set(GGML_AVX512 ON)
 endif()
diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
new file mode 100644
index 000000000..e8133d411
--- /dev/null
+++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@@ -0,0 +1,323 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstring>
+#include <vector>
+#include <bitset>
+#include <array>
+#include <string>
+
+// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
+struct cpuid_x86 {
+    bool SSE3(void) { return f_1_ecx[0]; }
+    bool PCLMULQDQ(void) { return f_1_ecx[1]; }
+    bool MONITOR(void) { return f_1_ecx[3]; }
+    bool SSSE3(void) { return f_1_ecx[9]; }
+    bool FMA(void) { return f_1_ecx[12]; }
+    bool CMPXCHG16B(void) { return f_1_ecx[13]; }
+    bool SSE41(void) { return f_1_ecx[19]; }
+    bool SSE42(void) { return f_1_ecx[20]; }
+    bool MOVBE(void) { return f_1_ecx[22]; }
+    bool POPCNT(void) { return f_1_ecx[23]; }
+    bool AES(void) { return f_1_ecx[25]; }
+    bool XSAVE(void) { return f_1_ecx[26]; }
+    bool OSXSAVE(void) { return f_1_ecx[27]; }
+    bool AVX(void) { return f_1_ecx[28]; }
+    bool F16C(void) { return f_1_ecx[29]; }
+    bool RDRAND(void) { return f_1_ecx[30]; }
+
+    bool MSR(void) { return f_1_edx[5]; }
+    bool CX8(void) { return f_1_edx[8]; }
+    bool SEP(void) { return f_1_edx[11]; }
+    bool CMOV(void) { return f_1_edx[15]; }
+    bool CLFSH(void) { return f_1_edx[19]; }
+    bool MMX(void) { return f_1_edx[23]; }
+    bool FXSR(void) { return f_1_edx[24]; }
+    bool SSE(void) { return f_1_edx[25]; }
+    bool SSE2(void) { return f_1_edx[26]; }
+
+    bool FSGSBASE(void) { return f_7_ebx[0]; }
+    bool BMI1(void) { return f_7_ebx[3]; }
+    bool HLE(void) { return is_intel && f_7_ebx[4]; }
+    bool AVX2(void) { return f_7_ebx[5]; }
+    bool BMI2(void) { return f_7_ebx[8]; }
+    bool ERMS(void) { return f_7_ebx[9]; }
+    bool INVPCID(void) { return f_7_ebx[10]; }
+    bool RTM(void) { return is_intel && f_7_ebx[11]; }
+    bool AVX512F(void) { return f_7_ebx[16]; }
+    bool AVX512DQ(void) { return f_7_ebx[17]; }
+    bool RDSEED(void) { return f_7_ebx[18]; }
+    bool ADX(void) { return f_7_ebx[19]; }
+    bool AVX512PF(void) { return f_7_ebx[26]; }
+    bool AVX512ER(void) { return f_7_ebx[27]; }
+    bool AVX512CD(void) { return f_7_ebx[28]; }
+    bool AVX512BW(void) { return f_7_ebx[30]; }
+    bool AVX512VL(void) { return f_7_ebx[31]; }
+
+    bool SHA(void) { return f_7_ebx[29]; }
+
+    bool PREFETCHWT1(void) { return f_7_ecx[0]; }
+
+    bool LAHF(void) { return f_81_ecx[0]; }
+    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
+    bool ABM(void) { return is_amd && f_81_ecx[5]; }
+    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
+    bool XOP(void) { return is_amd && f_81_ecx[11]; }
+    bool TBM(void) { return is_amd && f_81_ecx[21]; }
+
+    bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
+    bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
+    bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
+    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
+    bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
+
+    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
+    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
+    bool AVX512_FP16(void) { return f_7_edx[23]; }
+    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
+    bool AVX_VNNI(void) { return f_7_1_eax[4]; }
+
+    bool AMX_TILE(void) { return f_7_edx[24]; }
+    bool AMX_INT8(void) { return f_7_edx[25]; }
+    bool AMX_FP16(void) { return f_7_1_eax[21]; }
+    bool AMX_BF16(void) { return f_7_edx[22]; }
+
+#ifdef _MSC_VER
+    static void cpuid(int cpu_info[4], int eax) {
+        __cpuid(cpu_info, eax);
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __cpuidex(cpu_info, eax, ecx);
+    }
+#else
+    static void cpuid(int cpu_info[4], int eax) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(0));
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(ecx));
+    }
+#endif
+
+    cpuid_x86() {
+        std::array<int, 4> cpui;
+        std::vector<std::array<int, 4>> data;
+
+        // calling __cpuid with 0x0 as the function_id argument
+        // gets the number of the highest valid function ID.
+        cpuid(cpui.data(), 0);
+        int n_ids = cpui[0];
+
+        for (int i = 0; i <= n_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            data.push_back(cpui);
+        }
+
+        // capture vendor string
+        char vendor[0x20] = {};
+        *reinterpret_cast<int *>(vendor)     = data[0][1];
+        *reinterpret_cast<int *>(vendor + 4) = data[0][3];
+        *reinterpret_cast<int *>(vendor + 8) = data[0][2];
+        this->vendor = vendor;
+        if (this->vendor == "GenuineIntel") {
+            is_intel = true;
+        } else if (this->vendor == "AuthenticAMD") {
+            is_amd = true;
+        }
+
+        // load bitset with flags for function 0x00000001
+        if (n_ids >= 1) {
+            f_1_ecx = data[1][2];
+            f_1_edx = data[1][3];
+        }
+
+        // load bitset with flags for function 0x00000007
+        if (n_ids >= 7) {
+            f_7_ebx = data[7][1];
+            f_7_ecx = data[7][2];
+            f_7_edx = data[7][3];
+            cpuidex(cpui.data(), 7, 1);
+            f_7_1_eax = cpui[0];
+        }
+
+        // calling __cpuid with 0x80000000 as the function_id argument
+        // gets the number of the highest valid extended ID.
+        cpuid(cpui.data(), 0x80000000);
+        unsigned int n_ex_ids = cpui[0];
+
+        std::vector<std::array<int, 4>> ext_data;
+        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            ext_data.push_back(cpui);
+        }
+
+        // load bitset with flags for function 0x80000001
+        if (n_ex_ids >= 0x80000001) {
+            f_81_ecx = ext_data[1][2];
+            f_81_edx = ext_data[1][3];
+        }
+
+        // interpret CPU brand string if reported
+        char brand[0x40] = {};
+        if (n_ex_ids >= 0x80000004) {
+            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
+            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
+            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
+            this->brand = brand;
+        }
+    }
+
+    bool is_intel = false;
+    bool is_amd = false;
+    std::string vendor;
+    std::string brand;
+    std::bitset<32> f_1_ecx;
+    std::bitset<32> f_1_edx;
+    std::bitset<32> f_7_ebx;
+    std::bitset<32> f_7_ecx;
+    std::bitset<32> f_7_edx;
+    std::bitset<32> f_7_1_eax;
+    std::bitset<32> f_81_ecx;
+    std::bitset<32> f_81_edx;
+};
+
+#if 0
+void test_x86_is() {
+    cpuid_x86 is;
+    printf("CPU Vendor: %s\n", is.vendor.c_str());
+    printf("Brand: %s\n", is.brand.c_str());
+    printf("is_intel: %d\n", is.is_intel);
+    printf("is_amd: %d\n", is.is_amd);
+    printf("sse3: %d\n", is.SSE3());
+    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
+    printf("ssse3: %d\n", is.SSSE3());
+    printf("fma: %d\n", is.FMA());
+    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
+    printf("sse41: %d\n", is.SSE41());
+    printf("sse42: %d\n", is.SSE42());
+    printf("movbe: %d\n", is.MOVBE());
+    printf("popcnt: %d\n", is.POPCNT());
+    printf("aes: %d\n", is.AES());
+    printf("xsave: %d\n", is.XSAVE());
+    printf("osxsave: %d\n", is.OSXSAVE());
+    printf("avx: %d\n", is.AVX());
+    printf("f16c: %d\n", is.F16C());
+    printf("rdrand: %d\n", is.RDRAND());
+    printf("msr: %d\n", is.MSR());
+    printf("cx8: %d\n", is.CX8());
+    printf("sep: %d\n", is.SEP());
+    printf("cmov: %d\n", is.CMOV());
+    printf("clflush: %d\n", is.CLFSH());
+    printf("mmx: %d\n", is.MMX());
+    printf("fxsr: %d\n", is.FXSR());
+    printf("sse: %d\n", is.SSE());
+    printf("sse2: %d\n", is.SSE2());
+    printf("fsgsbase: %d\n", is.FSGSBASE());
+    printf("bmi1: %d\n", is.BMI1());
+    printf("hle: %d\n", is.HLE());
+    printf("avx2: %d\n", is.AVX2());
+    printf("bmi2: %d\n", is.BMI2());
+    printf("erms: %d\n", is.ERMS());
+    printf("invpcid: %d\n", is.INVPCID());
+    printf("rtm: %d\n", is.RTM());
+    printf("avx512f: %d\n", is.AVX512F());
+    printf("rdseed: %d\n", is.RDSEED());
+    printf("adx: %d\n", is.ADX());
+    printf("avx512pf: %d\n", is.AVX512PF());
+    printf("avx512er: %d\n", is.AVX512ER());
+    printf("avx512cd: %d\n", is.AVX512CD());
+    printf("sha: %d\n", is.SHA());
+    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
+    printf("lahf: %d\n", is.LAHF());
+    printf("lzcnt: %d\n", is.LZCNT());
+    printf("abm: %d\n", is.ABM());
+    printf("sse4a: %d\n", is.SSE4a());
+    printf("xop: %d\n", is.XOP());
+    printf("tbm: %d\n", is.TBM());
+    printf("syscall: %d\n", is.SYSCALL());
+    printf("mmxext: %d\n", is.MMXEXT());
+    printf("rdtscp: %d\n", is.RDTSCP());
+    printf("3dnowext: %d\n", is._3DNOWEXT());
+    printf("3dnow: %d\n", is._3DNOW());
+    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
+    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
+    printf("avx512_fp16: %d\n", is.AVX512_FP16());
+    printf("avx512_bf16: %d\n", is.AVX512_BF16());
+    printf("amx_tile: %d\n", is.AMX_TILE());
+    printf("amx_int8: %d\n", is.AMX_INT8());
+    printf("amx_fp16: %d\n", is.AMX_FP16());
+    printf("amx_bf16: %d\n", is.AMX_BF16());
+}
+#endif
+
+static int ggml_backend_cpu_x86_score() {
+    // FIXME: this does not check for OS support
+
+    int score = 0;
+    cpuid_x86 is;
+
+#ifdef GGML_FMA
+    if (!is.FMA()) { return 0; }
+    score += 1;
+#endif
+#ifdef GGML_F16C
+    if (!is.F16C()) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef GGML_SSE42
+    if (!is.SSE42()) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef GGML_AVX
+    if (!is.AVX()) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef GGML_AVX2
+    if (!is.AVX2()) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef GGML_AVX_VNNI
+    if (!is.AVX_VNNI()) { return 0; }
+    score += 1<<6;
+#endif
+#ifdef GGML_AVX512
+    if (!is.AVX512F()) { return 0; }
+    if (!is.AVX512CD()) { return 0; }
+    if (!is.AVX512VL()) { return 0; }
+    if (!is.AVX512DQ()) { return 0; }
+    if (!is.AVX512BW()) { return 0; }
+    score += 1<<7;
+#endif
+#ifdef GGML_AVX512_VBMI
+    if (!is.AVX512_VBMI()) { return 0; }
+    score += 1<<8;
+#endif
+#ifdef GGML_AVX512_BF16
+    if (!is.AVX512_BF16()) { return 0; }
+    score += 1<<9;
+#endif
+#ifdef GGML_AVX512_VNNI
+    if (!is.AVX512_VNNI()) { return 0; }
+    score += 1<<10;
+#endif
+#ifdef GGML_AMX_INT8
+    if (!is.AMX_INT8()) { return 0; }
+    score += 1<<11;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
+
+#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
new file mode 100644
index 000000000..b311a5b1c
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -0,0 +1,4247 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu-traits.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cfloat>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#include "ggml-cpu-aarch64.h"
+
+// TODO: move to include file?
+template <int K> constexpr int QK_0() {
+    if constexpr (K == 4) {
+        return QK4_0;
+    }
+    if constexpr (K == 8) {
+        return QK8_0;
+    }
+    return -1;
+}
+
+template <int K, int N> struct block {
+    ggml_half d[N];                         // deltas for N qK_0 blocks
+    int8_t    qs[(QK_0<K>() * N * K) / 8];  // quants for N qK_0 blocks
+};
+
+// control size
+static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
+static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
+static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
+static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
+
+using block_q4_0x4 = block<4, 4>;
+using block_q4_0x8 = block<4, 8>;
+using block_q8_0x4 = block<8, 4>;
+using block_q8_0x8 = block<8, 8>;
+
+struct block_iq4_nlx4 {
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// Functions to create the interleaved data layout formats
+
+// interleave 4 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x4
+// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
+// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
+//
+// - in                  : an array of block_q4_0 pointers
+// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
+//                         blck_size_interleave bytes
+// - xor_mask            : the mask to convert the nibbles in block_q4_0 quants bytes
+//                         from bias offset form to pure sign form (this saves subtract
+//                         operations durin unpacking)
+//
+#if defined(__AVX__)
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+#define GGML_F32Cx8x2_LOAD(x, y)     _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
+#define GGML_F32Cx16_REPEAT_LOAD(x)  _mm512_cvtph_ps(_mm256_set_m128i(x, x))
+#endif
+// the  _mm256_cvt intrinsics require F16C
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
+#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
+#else
+#if defined(__AVX512F__)
+static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
+    float tmp[16];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
+    }
+
+    return _mm512_loadu_ps(tmp);
+}
+static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
+    float tmp[16];
+    uint16_t tmphalf[8];
+    _mm_storeu_si128((__m128i*)tmphalf, x);
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
+    }
+
+    return _mm512_loadu_ps(tmp);
+}
+#endif
+static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+        tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
+    uint16_t tmphalf[8];
+    float tmp[8];
+
+    _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+
+#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
+#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask)     __avx_repeat_f32cx8_load(x)
+#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask)     __avx_rearranged_f32cx8_load(x, arrangeMask)
+#if defined(__AVX512F__)
+#define GGML_F32Cx8x2_LOAD(x, y)     __avx512_f32cx8x2_load(x, y)
+#define GGML_F32Cx16_REPEAT_LOAD(x)  __avx512_repeat_f32cx16_load(x)
+#endif
+#endif
+#endif
+
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+#if defined(__AVX512F__)
+// add int16_t pairwise and return as 512 bit int vector
+static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
+    const __m512i ones = _mm512_set1_epi16(1);
+    return _mm512_madd_epi16(ones, x);
+}
+
+static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
+#if defined(__AVX512VNNI__)
+    const __m512i zero = _mm512_setzero_si512();
+    return _mm512_dpbusd_epi32(zero, ax, sy);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m512i dot = _mm512_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_int_32x16(dot);
+#endif
+}
+
+// multiply int8_t, add results pairwise twice and return as 512 bit int vector
+static inline __m512i mul_sum_i8_pairs_int32x16(const __m512i x, const __m512i y) {
+    const __m512i zero = _mm512_setzero_si512();
+    // Get absolute values of x vectors
+    const __m512i ax = _mm512_abs_epi8(x);
+    // Sign the values of the y vectors
+    __mmask64 blt0 = _mm512_movepi8_mask(x);
+    const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
+    return mul_sum_us8_pairs_int32x16(ax, sy);
+}
+#endif
+
+// add int16_t pairwise and return as 256 bit int vector
+static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    return _mm256_madd_epi16(ones, x);
+}
+
+static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_int32x8(dot);
+#endif
+}
+
+// Integer variant of the function defined in ggml-quants.c
+// multiply int8_t, add results pairwise twice and return as 256 bit int vector
+static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
+#if __AVXVNNIINT8__
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbssd_epi32(zero, x, y);
+#else
+    // Get absolute values of x vectors
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return mul_sum_us8_pairs_int32x8(ax, sy);
+#endif
+}
+#endif
+
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t srcv[4][8];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+            for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
+
+            const float amax = vmaxvq_f32(amaxv[0]);
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < 8; j++) {
+            float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]);
+            int32x4_t vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[1][j], id[1]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[2][j], id[2]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[3][j], id[3]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1);
+            y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#else
+    // scalar
+    const int blck_size_interleave = 4;
+    float srcv[4][QK8_0];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+
+            for (int j = 0; j < QK8_0; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                amax = MAX(amax, fabsf(srcv[row_iter][j]));
+            }
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < QK8_0 * 4; j++) {
+            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+            src_offset += (j % blck_size_interleave);
+
+            float x0 = srcv[src_id][src_offset] * id[src_id];
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+#endif
+}
+
+static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t srcv[4][8];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
+            for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
+
+            for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
+            for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
+            for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
+
+            const float amax = vmaxvq_f32(amaxv[0]);
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < 4; j++) {
+            float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]);
+            int32x4_t vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[1][2 * j], id[1]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[2][2 * j], id[2]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3);
+
+            v = vmulq_n_f32(srcv[3][2 * j], id[3]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3);
+            v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]);
+            vi = vcvtnq_s32_f32(v);
+            y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0);
+            y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1);
+            y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2);
+            y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    float id[4];
+    __m256 srcv[4][4];
+    __m256 idvec[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            // Load elements into 4 AVX vectors
+            __m256 v0 = _mm256_loadu_ps( x + row_iter * k + i * 32 );
+            __m256 v1 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 8 );
+            __m256 v2 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 16 );
+            __m256 v3 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 24 );
+
+            // Compute max(abs(e)) for the block
+            const __m256 signBit = _mm256_set1_ps( -0.0f );
+            __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+            maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+            __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+            max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+            max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+            const float maxScalar = _mm_cvtss_f32( max4 );
+
+            // Divided by 127.f to mirror results in quantize_row_q8_0
+            const float d = maxScalar  / 127.f;
+            id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
+
+            // Store the scale for the individual block
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+
+            // Store the values in blocks of eight values - Aim is to use these later for block interleaving
+            srcv[row_iter][0] = v0;
+            srcv[row_iter][1] = v1;
+            srcv[row_iter][2] = v2;
+            srcv[row_iter][3] = v3;
+            idvec[row_iter] = _mm256_set1_ps(id[row_iter]);
+        }
+
+        // The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
+        for (int j = 0; j < 4; j++) {
+            // Apply the multiplier
+            __m256 v0 = _mm256_mul_ps(srcv[0][j], idvec[0]);
+            __m256 v1 = _mm256_mul_ps(srcv[1][j], idvec[1]);
+            __m256 v2 = _mm256_mul_ps(srcv[2][j], idvec[2]);
+            __m256 v3 = _mm256_mul_ps(srcv[3][j], idvec[3]);
+
+            // Round to nearest integer
+            v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+            v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+            v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+            v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+            // Convert floats to integers
+            __m256i i0 = _mm256_cvtps_epi32( v0 );
+            __m256i i1 = _mm256_cvtps_epi32( v1 );
+            __m256i i2 = _mm256_cvtps_epi32( v2 );
+            __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+            // Convert int32 to int16
+            i0 = _mm256_packs_epi32( i0, i1 );
+            i2 = _mm256_packs_epi32( i2, i3 );
+            // Convert int16 to int8
+            i0 = _mm256_packs_epi16( i0, i2 );
+
+            //  Permute and store the quantized weights in the required order after the pack instruction
+            const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+            i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+            _mm256_storeu_si256((__m256i *)(y[i].qs + 32 * j), i0);
+#else
+            // Since we don't have in AVX some necessary functions,
+            // we split the registers in half and call AVX2 analogs from SSE
+            __m128i ni0 = _mm256_castsi256_si128( i0 );
+            __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+            __m128i ni2 = _mm256_castsi256_si128( i1 );
+            __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+            __m128i ni4 = _mm256_castsi256_si128( i2 );
+            __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+            __m128i ni6 = _mm256_castsi256_si128( i3 );
+            __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+            // Convert int32 to int16
+            ni0 = _mm_packs_epi32( ni0, ni1 );
+            ni2 = _mm_packs_epi32( ni2, ni3 );
+            ni4 = _mm_packs_epi32( ni4, ni5 );
+            ni6 = _mm_packs_epi32( ni6, ni7 );
+            // Convert int16 to int8
+            ni0 = _mm_packs_epi16( ni0, ni2 );
+            ni4 = _mm_packs_epi16( ni4, ni6 );
+            _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
+            _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
+#endif
+        }
+    }
+#else
+    // scalar
+    const int blck_size_interleave = 8;
+    float srcv[4][QK8_0];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+
+            for (int j = 0; j < QK8_0; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                amax = MAX(amax, fabsf(srcv[row_iter][j]));
+            }
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ? 1.0f / d : 0.0f;
+
+            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        }
+
+        for (int j = 0; j < QK8_0 * 4; j++) {
+            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
+            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
+            src_offset += (j % blck_size_interleave);
+
+            float x0 = srcv[src_id][src_offset] * id[src_id];
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+#endif
+}
+
+static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
+    assert(nrow == 4);
+    UNUSED(nrow);
+    if (blck_size_interleave == 4) {
+        quantize_q8_0_4x4(x, vy, n_per_row);
+    } else if (blck_size_interleave == 8) {
+        quantize_q8_0_4x8(x, vy, n_per_row);
+    } else {
+        assert(false);
+    }
+}
+
+static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
+
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = vld1q_s8(a_ptr->qs);
+                int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret = vdupq_n_s32(0);
+
+                ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
+                ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
+                ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
+                ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
+
+                ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
+                ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
+                ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
+                ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                                vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
+
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                        vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+
+        __asm__ __volatile__(
+            "ptrue p0.b\n"
+            "add %x[b_ptr], %x[b_ptr], #0x10\n"
+            "1:"  // Column loop
+            "add x22, %x[a_ptr], #0x2\n"
+            "mov z31.b, #0x0\n"
+            "mov x21, %x[nb]\n"
+            "2:"  // Block loop
+            "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n"
+            "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n"
+            "mov z28.s, #0x0\n"
+            "mov z27.s, #0x0\n"
+            "ld1rd { z26.d }, p0/Z, [x22]\n"
+            "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n"
+            "sub x20, x22, #0x2\n"
+            "sub x21, x21, #0x1\n"
+            "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n"
+            "ld1rd { z23.d }, p0/Z, [x22, #8]\n"
+            "lsl z22.b, z30.b, #0x4\n"
+            "lsl z16.b, z29.b, #0x4\n"
+            "and z30.b, z30.b, #0xf0\n"
+            "and z29.b, z29.b, #0xf0\n"
+            "ld1rd { z21.d }, p0/Z, [x22, #16]\n"
+            "ld1rd { z20.d }, p0/Z, [x22, #24]\n"
+            "lsl z19.b, z25.b, #0x4\n"
+            "and z25.b, z25.b, #0xf0\n"
+            "ld1rh { z17.h }, p0/Z, [x20]\n"
+            "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n"
+            "sdot z28.s, z22.b, z26.b\n"
+            "sdot z27.s, z16.b, z26.b\n"
+            "lsl z16.b, z24.b, #0x4\n"
+            "add x22, x22, #0x22\n"
+            "and z24.b, z24.b, #0xf0\n"
+            "add %x[b_ptr], %x[b_ptr], #0x90\n"
+            "fcvt z17.s, p0/m, z17.h\n"
+            "fcvt z18.s, p0/m, z18.h\n"
+            "sdot z28.s, z19.b, z23.b\n"
+            "sdot z27.s, z16.b, z23.b\n"
+            "fmul z18.s, z18.s, z17.s\n"
+            "sdot z28.s, z30.b, z21.b\n"
+            "sdot z27.s, z29.b, z21.b\n"
+            "sdot z28.s, z25.b, z20.b\n"
+            "sdot z27.s, z24.b, z20.b\n"
+            "uzp1 z17.s, z28.s, z27.s\n"
+            "uzp2 z16.s, z28.s, z27.s\n"
+            "add z17.s, z17.s, z16.s\n"
+            "asr z17.s, z17.s, #0x4\n"
+            "scvtf z17.s, p0/m, z17.s\n"
+            "fmla z31.s, p0/M, z17.s, z18.s\n"
+            "cbnz x21, 2b\n"
+            "sub %x[nc], %x[nc], #0x8\n"
+            "st1w { z31.s }, p0, [%x[res_ptr]]\n"
+            "add %x[res_ptr], %x[res_ptr], #0x20\n"
+            "cbnz %x[nc], 1b\n"
+            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+            : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+        );
+        return;
+    }
+#endif // #if defined(__ARM_FEATURE_SVE)
+#elif defined(__AVX2__)
+    // Lookup table to convert signed nibbles to signed bytes
+    __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+    signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+    __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+    __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+
+    // Permute mask used for easier vector processing at later stages
+    const __m256i m4b = _mm256_set1_epi8(0x0F);
+
+    int64_t b_nb = n / QK4_0;
+
+    const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+    const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
+
+    // Process Q8_0 blocks one by one
+    for (int64_t y = 0; y < nr; y++) {
+
+        // Pointers to LHS blocks of block_q8_0 format
+        const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
+
+        // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
+        for (int64_t x = 0; x < nc / 8; x++) {
+
+            // Pointers to RHS blocks
+            const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+            // Master FP accumulator
+            __m256 acc_row = _mm256_setzero_ps();
+
+            for (int64_t b = 0; b < nb; b++) {
+                // Load 8 blocks of Q4_0 interleaved as 8 bytes (B0 - B7)
+                const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 1);
+                const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 2);
+                const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 3);
+
+                // 4-bit -> 8-bit - Sign is maintained
+                const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_0, m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
+                const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_0, m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
+                const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
+                const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
+
+                const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
+                const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
+                const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
+                const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
+
+                // Load the scale values for the 8 blocks interleaved in block_q4_0x8
+                const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
+
+                // Load and convert to FP32 scale from block_q8_0
+                const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
+
+                // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
+                __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
+                __m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16)));
+
+                lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0 (0-15) A0(0-15)
+                lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0 (16-31) A0(16-31))
+
+                __m256i iacc = _mm256_setzero_si256();
+
+                // Dot product done within 32 bit lanes and accumulated in the same vector
+                // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
+                // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
+                // ...........................................................................
+                // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
+
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
+                iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
+
+                // Accumulated values multipled with appropriate scales
+                acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
+            }
+
+            // Accumulated output values permuted so as to be stored in appropriate order post accumulation
+            acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
+            _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
+        }
+    }
+    return;
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+            for (int l = 0; l < nb; l++) {
+                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+                const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+                const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+                const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
+                const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                // vector version needs Zvfhmin extension
+                const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
+                const float b_scales[8] = {
+                    GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                    GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                };
+                const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
+                sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
+            }
+            __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+    {
+        float sumf[8];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
+static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float * res_ptr = s;
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            float32x4_t sumf = vdupq_n_f32(0);
+            for (int l = 0; l < nb; l++) {
+                uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+                uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+                uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+                uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+                int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+                int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+                int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+                int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+                int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+                int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+                int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+                int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+                int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+                int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+                int32x4_t sumi = vdupq_n_s32(0);
+                sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+                sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+                sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+                sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+                sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+                sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+                sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+                sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+                float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+                float32x4_t d = a_d * b_d;
+
+                sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
+            }
+
+            vst1q_f32(res_ptr + x * 4, sumf);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
+static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x10, %x[nr]\n"
+            "mov x9, #0x88\n"
+            "cmp x10, #0x10\n"
+            "mul x9, %x[nb], x9\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x28, %x[b_ptr], #0x8\n"
+            "mov x27, %x[nc]\n"
+            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x25, %x[a_ptr], #0x8\n"
+            "movi v15.16b, #0x0\n"
+            "movi v19.16b, #0x0\n"
+            "mov x24, %x[nb]\n"
+            "add x23, x25, x9\n"
+            "movi v18.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "add x22, x23, x9\n"
+            "movi v11.16b, #0x0\n"
+            "movi v13.16b, #0x0\n"
+            "add x21, x22, x9\n"
+            "movi v23.16b, #0x0\n"
+            "movi v16.16b, #0x0\n"
+            "movi v25.16b, #0x0\n"
+            "movi v7.16b, #0x0\n"
+            "movi v0.16b, #0x0\n"
+            "movi v4.16b, #0x0\n"
+            "movi v5.16b, #0x0\n"
+            "movi v21.16b, #0x0\n"
+            "movi v8.16b, #0x0\n"
+            "movi v1.16b, #0x0\n"
+            "3:"  // Block loop
+            "ldr q3, [x28, #0x0]\n"
+            "ldr q31, [x25, #0x0]\n"
+            "movi v28.16b, #0x4\n"
+            "movi v10.4s, #0x0\n"
+            "ldr q22, [x28, #0x10]\n"
+            "ldr q6, [x25, #0x10]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v9.4s, #0x0\n"
+            "ldr q27, [x28, #0x20]\n"
+            "ldr q30, [x28, #0x30]\n"
+            "movi v20.4s, #0x0\n"
+            "movi v24.16b, #0xf0\n"
+            "ldr d2, [x25, #-0x8]\n"
+            "ldr d26, [x23, #-0x8]\n"
+            "sshl v12.16b, v3.16b, v28.16b\n"
+            "sub x20, x28, #0x8\n"
+            "ldr d17, [x20, #0x0]\n"
+            "and v3.16b, v3.16b, v24.16b\n"
+            "subs x24, x24, #0x1\n"
+            "add x28, x28, #0x48\n"
+            ".inst 0x4f9fe18a  // sdot v10.4s, v12.16b, v31.4b[0]\n"
+            ".inst 0x4fbfe19d  // sdot v29.4s, v12.16b, v31.4b[1]\n"
+            ".inst 0x4f9fe989  // sdot v9.4s, v12.16b, v31.4b[2]\n"
+            ".inst 0x4fbfe994  // sdot v20.4s, v12.16b, v31.4b[3]\n"
+            "sshl v31.16b, v22.16b, v28.16b\n"
+            "and v22.16b, v22.16b, v24.16b\n"
+            "fcvtl v17.4s, v17.4h\n"
+            "fcvtl v2.4s, v2.4h\n"
+            "fcvtl v26.4s, v26.4h\n"
+            ".inst 0x4f86e3ea  // sdot v10.4s, v31.16b, v6.4b[0]\n"
+            ".inst 0x4fa6e3fd  // sdot v29.4s, v31.16b, v6.4b[1]\n"
+            ".inst 0x4f86ebe9  // sdot v9.4s, v31.16b, v6.4b[2]\n"
+            ".inst 0x4fa6ebf4  // sdot v20.4s, v31.16b, v6.4b[3]\n"
+            "sshl v6.16b, v27.16b, v28.16b\n"
+            "sshl v28.16b, v30.16b, v28.16b\n"
+            "and v27.16b, v27.16b, v24.16b\n"
+            "and v30.16b, v30.16b, v24.16b\n"
+            "ldr q24, [x25, #0x20]\n"
+            ".inst 0x4f98e0ca  // sdot v10.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8c9  // sdot v9.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8d4  // sdot v20.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x30]\n"
+            ".inst 0x4f98e38a  // sdot v10.4s, v28.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e39d  // sdot v29.4s, v28.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb89  // sdot v9.4s, v28.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb94  // sdot v20.4s, v28.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x40]\n"
+            ".inst 0x4f98e06a  // sdot v10.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e869  // sdot v9.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e874  // sdot v20.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x50]\n"
+            ".inst 0x4f98e2ca  // sdot v10.4s, v22.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e2dd  // sdot v29.4s, v22.16b, v24.4b[1]\n"
+            ".inst 0x4f98eac9  // sdot v9.4s, v22.16b, v24.4b[2]\n"
+            ".inst 0x4fb8ead4  // sdot v20.4s, v22.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x60]\n"
+            ".inst 0x4f98e36a  // sdot v10.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb69  // sdot v9.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb74  // sdot v20.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x25, #0x70]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x4f98e3ca  // sdot v10.4s, v30.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e3dd  // sdot v29.4s, v30.16b, v24.4b[1]\n"
+            ".inst 0x4f98ebc9  // sdot v9.4s, v30.16b, v24.4b[2]\n"
+            ".inst 0x4fb8ebd4  // sdot v20.4s, v30.16b, v24.4b[3]\n"
+            "fmul v24.4s, v17.4s, v2.s[0]\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v15.4s, v10.4s, v24.4s\n"
+            "ldr q24, [x23, #0x0]\n"
+            "fmul v10.4s, v17.4s, v2.s[1]\n"
+            "fmla v19.4s, v29.4s, v10.4s\n"
+            "ldr q10, [x23, #0x10]\n"
+            "fmul v29.4s, v17.4s, v2.s[2]\n"
+            "fmul v2.4s, v17.4s, v2.s[3]\n"
+            "fmla v18.4s, v9.4s, v29.4s\n"
+            "movi v9.4s, #0x0\n"
+            "movi v29.4s, #0x0\n"
+            ".inst 0x4f98e189  // sdot v9.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e19d  // sdot v29.4s, v12.16b, v24.4b[1]\n"
+            "fmla v14.4s, v20.4s, v2.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v2.4s, #0x0\n"
+            ".inst 0x4f98e994  // sdot v20.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x20]\n"
+            ".inst 0x4f8ae3e9  // sdot v9.4s, v31.16b, v10.4b[0]\n"
+            ".inst 0x4faae3fd  // sdot v29.4s, v31.16b, v10.4b[1]\n"
+            ".inst 0x4f8aebf4  // sdot v20.4s, v31.16b, v10.4b[2]\n"
+            ".inst 0x4faaebe2  // sdot v2.4s, v31.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x30]\n"
+            ".inst 0x4f98e0c9  // sdot v9.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0dd  // sdot v29.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8d4  // sdot v20.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x40]\n"
+            ".inst 0x4f8ae389  // sdot v9.4s, v28.16b, v10.4b[0]\n"
+            ".inst 0x4faae39d  // sdot v29.4s, v28.16b, v10.4b[1]\n"
+            ".inst 0x4f8aeb94  // sdot v20.4s, v28.16b, v10.4b[2]\n"
+            ".inst 0x4faaeb82  // sdot v2.4s, v28.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x50]\n"
+            ".inst 0x4f98e069  // sdot v9.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e07d  // sdot v29.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e874  // sdot v20.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x23, #0x60]\n"
+            ".inst 0x4f8ae2c9  // sdot v9.4s, v22.16b, v10.4b[0]\n"
+            ".inst 0x4faae2dd  // sdot v29.4s, v22.16b, v10.4b[1]\n"
+            ".inst 0x4f8aead4  // sdot v20.4s, v22.16b, v10.4b[2]\n"
+            ".inst 0x4faaeac2  // sdot v2.4s, v22.16b, v10.4b[3]\n"
+            "ldr q10, [x23, #0x70]\n"
+            "add x23, x23, #0x88\n"
+            ".inst 0x4f98e369  // sdot v9.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e37d  // sdot v29.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb74  // sdot v20.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x0]\n"
+            ".inst 0x4f8ae3c9  // sdot v9.4s, v30.16b, v10.4b[0]\n"
+            ".inst 0x4faae3dd  // sdot v29.4s, v30.16b, v10.4b[1]\n"
+            ".inst 0x4f8aebd4  // sdot v20.4s, v30.16b, v10.4b[2]\n"
+            ".inst 0x4faaebc2  // sdot v2.4s, v30.16b, v10.4b[3]\n"
+            "fmul v10.4s, v17.4s, v26.s[0]\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "fmla v11.4s, v9.4s, v10.4s\n"
+            "ldr q9, [x22, #0x10]\n"
+            "fmul v10.4s, v17.4s, v26.s[1]\n"
+            "fmla v13.4s, v29.4s, v10.4s\n"
+            "ldr d29, [x22, #-0x8]\n"
+            "fmul v10.4s, v17.4s, v26.s[2]\n"
+            "fmul v26.4s, v17.4s, v26.s[3]\n"
+            "fcvtl v29.4s, v29.4h\n"
+            "fmla v23.4s, v20.4s, v10.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v10.4s, #0x0\n"
+            "fmla v16.4s, v2.4s, v26.4s\n"
+            "movi v26.4s, #0x0\n"
+            "movi v2.4s, #0x0\n"
+            ".inst 0x4f98e194  // sdot v20.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+            ".inst 0x4f98e99a  // sdot v26.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e982  // sdot v2.4s, v12.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x20]\n"
+            ".inst 0x4f89e3f4  // sdot v20.4s, v31.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebfa  // sdot v26.4s, v31.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebe2  // sdot v2.4s, v31.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x30]\n"
+            ".inst 0x4f98e0d4  // sdot v20.4s, v6.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e0ca  // sdot v10.4s, v6.16b, v24.4b[1]\n"
+            ".inst 0x4f98e8da  // sdot v26.4s, v6.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e8c2  // sdot v2.4s, v6.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x40]\n"
+            ".inst 0x4f89e394  // sdot v20.4s, v28.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+            ".inst 0x4f89eb9a  // sdot v26.4s, v28.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eb82  // sdot v2.4s, v28.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x50]\n"
+            ".inst 0x4f98e074  // sdot v20.4s, v3.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e06a  // sdot v10.4s, v3.16b, v24.4b[1]\n"
+            ".inst 0x4f98e87a  // sdot v26.4s, v3.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e862  // sdot v2.4s, v3.16b, v24.4b[3]\n"
+            "ldr q24, [x22, #0x60]\n"
+            ".inst 0x4f89e2d4  // sdot v20.4s, v22.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+            ".inst 0x4f89eada  // sdot v26.4s, v22.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eac2  // sdot v2.4s, v22.16b, v9.4b[3]\n"
+            "ldr q9, [x22, #0x70]\n"
+            "add x22, x22, #0x88\n"
+            ".inst 0x4f98e374  // sdot v20.4s, v27.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e36a  // sdot v10.4s, v27.16b, v24.4b[1]\n"
+            ".inst 0x4f98eb7a  // sdot v26.4s, v27.16b, v24.4b[2]\n"
+            ".inst 0x4fb8eb62  // sdot v2.4s, v27.16b, v24.4b[3]\n"
+            "ldr q24, [x21, #0x0]\n"
+            ".inst 0x4f89e3d4  // sdot v20.4s, v30.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ca  // sdot v10.4s, v30.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebda  // sdot v26.4s, v30.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebc2  // sdot v2.4s, v30.16b, v9.4b[3]\n"
+            "fmul v9.4s, v17.4s, v29.s[0]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "fmla v25.4s, v20.4s, v9.4s\n"
+            "ldr q9, [x21, #0x10]\n"
+            "fmul v20.4s, v17.4s, v29.s[1]\n"
+            "fmla v7.4s, v10.4s, v20.4s\n"
+            "ldr d20, [x21, #-0x8]\n"
+            "fmul v10.4s, v17.4s, v29.s[2]\n"
+            "fmul v29.4s, v17.4s, v29.s[3]\n"
+            "fcvtl v20.4s, v20.4h\n"
+            "fmla v0.4s, v26.4s, v10.4s\n"
+            "movi v26.4s, #0x0\n"
+            "movi v10.4s, #0x0\n"
+            "fmla v4.4s, v2.4s, v29.4s\n"
+            "movi v2.4s, #0x0\n"
+            "movi v29.4s, #0x0\n"
+            ".inst 0x4f98e19a  // sdot v26.4s, v12.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e18a  // sdot v10.4s, v12.16b, v24.4b[1]\n"
+            ".inst 0x4f98e982  // sdot v2.4s, v12.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e99d  // sdot v29.4s, v12.16b, v24.4b[3]\n"
+            "ldr q12, [x21, #0x20]\n"
+            "fmul v24.4s, v17.4s, v20.s[0]\n"
+            ".inst 0x4f89e3fa  // sdot v26.4s, v31.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e3ea  // sdot v10.4s, v31.16b, v9.4b[1]\n"
+            ".inst 0x4f89ebe2  // sdot v2.4s, v31.16b, v9.4b[2]\n"
+            ".inst 0x4fa9ebfd  // sdot v29.4s, v31.16b, v9.4b[3]\n"
+            "ldr q9, [x21, #0x30]\n"
+            "fmul v31.4s, v17.4s, v20.s[1]\n"
+            ".inst 0x4f8ce0da  // sdot v26.4s, v6.16b, v12.4b[0]\n"
+            ".inst 0x4face0ca  // sdot v10.4s, v6.16b, v12.4b[1]\n"
+            ".inst 0x4f8ce8c2  // sdot v2.4s, v6.16b, v12.4b[2]\n"
+            ".inst 0x4face8dd  // sdot v29.4s, v6.16b, v12.4b[3]\n"
+            "ldr q12, [x21, #0x40]\n"
+            "fmul v6.4s, v17.4s, v20.s[2]\n"
+            "fmul v20.4s, v17.4s, v20.s[3]\n"
+            ".inst 0x4f89e39a  // sdot v26.4s, v28.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e38a  // sdot v10.4s, v28.16b, v9.4b[1]\n"
+            ".inst 0x4f89eb82  // sdot v2.4s, v28.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eb9d  // sdot v29.4s, v28.16b, v9.4b[3]\n"
+            "ldr q9, [x21, #0x50]\n"
+            ".inst 0x4f8ce07a  // sdot v26.4s, v3.16b, v12.4b[0]\n"
+            ".inst 0x4face06a  // sdot v10.4s, v3.16b, v12.4b[1]\n"
+            ".inst 0x4f8ce862  // sdot v2.4s, v3.16b, v12.4b[2]\n"
+            ".inst 0x4face87d  // sdot v29.4s, v3.16b, v12.4b[3]\n"
+            "ldr q12, [x21, #0x60]\n"
+            ".inst 0x4f89e2da  // sdot v26.4s, v22.16b, v9.4b[0]\n"
+            ".inst 0x4fa9e2ca  // sdot v10.4s, v22.16b, v9.4b[1]\n"
+            ".inst 0x4f89eac2  // sdot v2.4s, v22.16b, v9.4b[2]\n"
+            ".inst 0x4fa9eadd  // sdot v29.4s, v22.16b, v9.4b[3]\n"
+            "ldr q17, [x21, #0x70]\n"
+            "add x21, x21, #0x88\n"
+            ".inst 0x4f8ce37a  // sdot v26.4s, v27.16b, v12.4b[0]\n"
+            ".inst 0x4face36a  // sdot v10.4s, v27.16b, v12.4b[1]\n"
+            ".inst 0x4f8ceb62  // sdot v2.4s, v27.16b, v12.4b[2]\n"
+            ".inst 0x4faceb7d  // sdot v29.4s, v27.16b, v12.4b[3]\n"
+            ".inst 0x4f91e3da  // sdot v26.4s, v30.16b, v17.4b[0]\n"
+            ".inst 0x4fb1e3ca  // sdot v10.4s, v30.16b, v17.4b[1]\n"
+            ".inst 0x4f91ebc2  // sdot v2.4s, v30.16b, v17.4b[2]\n"
+            ".inst 0x4fb1ebdd  // sdot v29.4s, v30.16b, v17.4b[3]\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "scvtf v10.4s, v10.4s, #0x4\n"
+            "fmla v5.4s, v26.4s, v24.4s\n"
+            "scvtf v2.4s, v2.4s, #0x4\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "fmla v21.4s, v10.4s, v31.4s\n"
+            "fmla v8.4s, v2.4s, v6.4s\n"
+            "fmla v1.4s, v29.4s, v20.4s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x27, x27, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "str q15, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q19, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q18, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q14, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q11, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q13, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q23, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q16, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q25, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q7, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q0, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q4, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q5, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q21, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q8, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q1, [x20, #0x0]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x10, x10, #0x10\n"
+            "cmp x10, #0x10\n"
+            "mov %x[res_ptr], x26\n"
+            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x10, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x24, %x[b_ptr], #0x8\n"
+            "mov x23, %x[nc]\n"
+            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "movi v15.16b, #0x0\n"
+            "movi v19.16b, #0x0\n"
+            "add x25, %x[a_ptr], #0x8\n"
+            "mov x21, %x[nb]\n"
+            "movi v18.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ldr q7, [x24, #0x0]\n"
+            "ldr q5, [x25, #0x0]\n"
+            "movi v9.16b, #0x4\n"
+            "movi v4.4s, #0x0\n"
+            "ldr q3, [x24, #0x10]\n"
+            "ldr q2, [x25, #0x10]\n"
+            "movi v1.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            "ldr q13, [x24, #0x20]\n"
+            "ldr q31, [x25, #0x20]\n"
+            "movi v30.4s, #0x0\n"
+            "movi v29.16b, #0xf0\n"
+            "ldr q28, [x24, #0x30]\n"
+            "ldr q27, [x25, #0x30]\n"
+            "sshl v20.16b, v7.16b, v9.16b\n"
+            "sub x20, x24, #0x8\n"
+            "ldr q26, [x25, #0x40]\n"
+            "ldr q25, [x25, #0x50]\n"
+            "sshl v17.16b, v3.16b, v9.16b\n"
+            "and v7.16b, v7.16b, v29.16b\n"
+            "ldr q24, [x25, #0x60]\n"
+            "ldr q16, [x25, #0x70]\n"
+            "sshl v22.16b, v13.16b, v9.16b\n"
+            "and v3.16b, v3.16b, v29.16b\n"
+            "ldr d21, [x20, #0x0]\n"
+            "ldr d12, [x25, #-0x8]\n"
+            ".inst 0x4f85e284  // sdot v4.4s, v20.16b, v5.4b[0]\n"
+            ".inst 0x4fa5e281  // sdot v1.4s, v20.16b, v5.4b[1]\n"
+            ".inst 0x4f85ea80  // sdot v0.4s, v20.16b, v5.4b[2]\n"
+            ".inst 0x4fa5ea9e  // sdot v30.4s, v20.16b, v5.4b[3]\n"
+            "sshl v9.16b, v28.16b, v9.16b\n"
+            "subs x21, x21, #0x1\n"
+            "and v13.16b, v13.16b, v29.16b\n"
+            "and v28.16b, v28.16b, v29.16b\n"
+            "add x25, x25, #0x88\n"
+            "add x24, x24, #0x48\n"
+            "fcvtl v21.4s, v21.4h\n"
+            "fcvtl v12.4s, v12.4h\n"
+            ".inst 0x4f82e224  // sdot v4.4s, v17.16b, v2.4b[0]\n"
+            ".inst 0x4fa2e221  // sdot v1.4s, v17.16b, v2.4b[1]\n"
+            ".inst 0x4f82ea20  // sdot v0.4s, v17.16b, v2.4b[2]\n"
+            ".inst 0x4fa2ea3e  // sdot v30.4s, v17.16b, v2.4b[3]\n"
+            "fmul v11.4s, v21.4s, v12.s[0]\n"
+            "fmul v23.4s, v21.4s, v12.s[1]\n"
+            "fmul v17.4s, v21.4s, v12.s[2]\n"
+            ".inst 0x4f9fe2c4  // sdot v4.4s, v22.16b, v31.4b[0]\n"
+            "fmul v6.4s, v21.4s, v12.s[3]\n"
+            ".inst 0x4fbfe2c1  // sdot v1.4s, v22.16b, v31.4b[1]\n"
+            ".inst 0x4f9feac0  // sdot v0.4s, v22.16b, v31.4b[2]\n"
+            ".inst 0x4fbfeade  // sdot v30.4s, v22.16b, v31.4b[3]\n"
+            ".inst 0x4f9be124  // sdot v4.4s, v9.16b, v27.4b[0]\n"
+            ".inst 0x4fbbe121  // sdot v1.4s, v9.16b, v27.4b[1]\n"
+            ".inst 0x4f9be920  // sdot v0.4s, v9.16b, v27.4b[2]\n"
+            ".inst 0x4fbbe93e  // sdot v30.4s, v9.16b, v27.4b[3]\n"
+            ".inst 0x4f9ae0e4  // sdot v4.4s, v7.16b, v26.4b[0]\n"
+            ".inst 0x4fbae0e1  // sdot v1.4s, v7.16b, v26.4b[1]\n"
+            ".inst 0x4f9ae8e0  // sdot v0.4s, v7.16b, v26.4b[2]\n"
+            ".inst 0x4fbae8fe  // sdot v30.4s, v7.16b, v26.4b[3]\n"
+            ".inst 0x4f99e064  // sdot v4.4s, v3.16b, v25.4b[0]\n"
+            ".inst 0x4fb9e061  // sdot v1.4s, v3.16b, v25.4b[1]\n"
+            ".inst 0x4f99e860  // sdot v0.4s, v3.16b, v25.4b[2]\n"
+            ".inst 0x4fb9e87e  // sdot v30.4s, v3.16b, v25.4b[3]\n"
+            ".inst 0x4f98e1a4  // sdot v4.4s, v13.16b, v24.4b[0]\n"
+            ".inst 0x4fb8e1a1  // sdot v1.4s, v13.16b, v24.4b[1]\n"
+            ".inst 0x4f98e9a0  // sdot v0.4s, v13.16b, v24.4b[2]\n"
+            ".inst 0x4fb8e9be  // sdot v30.4s, v13.16b, v24.4b[3]\n"
+            ".inst 0x4f90e384  // sdot v4.4s, v28.16b, v16.4b[0]\n"
+            ".inst 0x4fb0e381  // sdot v1.4s, v28.16b, v16.4b[1]\n"
+            ".inst 0x4f90eb80  // sdot v0.4s, v28.16b, v16.4b[2]\n"
+            ".inst 0x4fb0eb9e  // sdot v30.4s, v28.16b, v16.4b[3]\n"
+            "scvtf v4.4s, v4.4s, #0x4\n"
+            "scvtf v1.4s, v1.4s, #0x4\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "fmla v15.4s, v4.4s, v11.4s\n"
+            "scvtf v30.4s, v30.4s, #0x4\n"
+            "fmla v19.4s, v1.4s, v23.4s\n"
+            "fmla v18.4s, v0.4s, v17.4s\n"
+            "fmla v14.4s, v30.4s, v6.4s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x10, #0x1\n"
+            "str q15, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x2\n"
+            "str q19, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x3\n"
+            "str q18, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "str q14, [x20, #0x0]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x23, x23, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "bne 6b\n"
+            "subs x10, x10, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x9\n"
+            "mov %x[res_ptr], x22\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
+static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x10, %x[nr]\n"
+            "mov x9, #0x88\n"
+            "cmp x10, #0x10\n"
+            "mul x9, %x[nb], x9\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x28, %x[b_ptr], #0x8\n"
+            "mov x27, %x[nc]\n"
+            "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x25, %x[a_ptr], #0x8\n"
+            "movi v2.16b, #0x0\n"
+            "movi v10.16b, #0x0\n"
+            "mov x24, %x[nb]\n"
+            "add x23, x25, x9\n"
+            "movi v12.16b, #0x0\n"
+            "movi v28.16b, #0x0\n"
+            "add x22, x23, x9\n"
+            "movi v11.16b, #0x0\n"
+            "movi v13.16b, #0x0\n"
+            "add x21, x22, x9\n"
+            "movi v22.16b, #0x0\n"
+            "movi v23.16b, #0x0\n"
+            "movi v25.16b, #0x0\n"
+            "movi v5.16b, #0x0\n"
+            "movi v7.16b, #0x0\n"
+            "movi v4.16b, #0x0\n"
+            "movi v6.16b, #0x0\n"
+            "movi v30.16b, #0x0\n"
+            "movi v24.16b, #0x0\n"
+            "movi v14.16b, #0x0\n"
+            "3:"  // Block loop
+            "ldr q21, [x28, #0x0]\n"
+            "ldr q16, [x28, #0x10]\n"
+            "movi v1.16b, #0x4\n"
+            "movi v19.4s, #0x0\n"
+            "ldr q27, [x25, #0x0]\n"
+            "ldr q15, [x25, #0x10]\n"
+            "movi v26.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            "ldr q29, [x28, #0x20]\n"
+            "ldr q3, [x28, #0x30]\n"
+            "movi v17.4s, #0x0\n"
+            "movi v0.16b, #0xf0\n"
+            "ldr d20, [x25, #-0x8]\n"
+            "ldr d9, [x23, #-0x8]\n"
+            "sshl v8.16b, v21.16b, v1.16b\n"
+            "sshl v31.16b, v16.16b, v1.16b\n"
+            "and v21.16b, v21.16b, v0.16b\n"
+            "and v16.16b, v16.16b, v0.16b\n"
+            "sub x20, x28, #0x8\n"
+            "subs x24, x24, #0x1\n"
+            "add x28, x28, #0x48\n"
+            ".inst 0x4e88a773  // smmla v19.4s, v27.16b, v8.16b\n"
+            ".inst 0x4e9fa77a  // smmla v26.4s, v27.16b, v31.16b\n"
+            "ldr q27, [x25, #0x20]\n"
+            ".inst 0x4e88a5f2  // smmla v18.4s, v15.16b, v8.16b\n"
+            ".inst 0x4e9fa5f1  // smmla v17.4s, v15.16b, v31.16b\n"
+            "sshl v15.16b, v29.16b, v1.16b\n"
+            "sshl v1.16b, v3.16b, v1.16b\n"
+            "and v29.16b, v29.16b, v0.16b\n"
+            "and v3.16b, v3.16b, v0.16b\n"
+            "ldr q0, [x25, #0x30]\n"
+            "fcvtl v20.4s, v20.4h\n"
+            ".inst 0x4e8fa773  // smmla v19.4s, v27.16b, v15.16b\n"
+            "fcvtl v9.4s, v9.4h\n"
+            ".inst 0x4e81a77a  // smmla v26.4s, v27.16b, v1.16b\n"
+            "ldr q27, [x25, #0x40]\n"
+            ".inst 0x4e8fa412  // smmla v18.4s, v0.16b, v15.16b\n"
+            ".inst 0x4e81a411  // smmla v17.4s, v0.16b, v1.16b\n"
+            "ldr q0, [x25, #0x50]\n"
+            ".inst 0x4e95a773  // smmla v19.4s, v27.16b, v21.16b\n"
+            ".inst 0x4e90a77a  // smmla v26.4s, v27.16b, v16.16b\n"
+            "ldr q27, [x25, #0x60]\n"
+            ".inst 0x4e95a412  // smmla v18.4s, v0.16b, v21.16b\n"
+            ".inst 0x4e90a411  // smmla v17.4s, v0.16b, v16.16b\n"
+            "ldr q0, [x25, #0x70]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x4e9da773  // smmla v19.4s, v27.16b, v29.16b\n"
+            ".inst 0x4e83a77a  // smmla v26.4s, v27.16b, v3.16b\n"
+            "ldr d27, [x20, #0x0]\n"
+            ".inst 0x4e9da412  // smmla v18.4s, v0.16b, v29.16b\n"
+            ".inst 0x4e83a411  // smmla v17.4s, v0.16b, v3.16b\n"
+            "fcvtl v27.4s, v27.4h\n"
+            "uzp1 v0.2d, v19.2d, v26.2d\n"
+            "uzp2 v26.2d, v19.2d, v26.2d\n"
+            "fmul v19.4s, v27.4s, v20.s[0]\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "scvtf v26.4s, v26.4s, #0x4\n"
+            "fmla v2.4s, v0.4s, v19.4s\n"
+            "ldr q19, [x23, #0x0]\n"
+            "uzp1 v0.2d, v18.2d, v17.2d\n"
+            "uzp2 v18.2d, v18.2d, v17.2d\n"
+            "fmul v17.4s, v27.4s, v20.s[1]\n"
+            "scvtf v0.4s, v0.4s, #0x4\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "fmla v10.4s, v26.4s, v17.4s\n"
+            "ldr q17, [x23, #0x10]\n"
+            "fmul v26.4s, v27.4s, v20.s[2]\n"
+            "fmul v20.4s, v27.4s, v20.s[3]\n"
+            "fmla v12.4s, v0.4s, v26.4s\n"
+            "ldr d0, [x22, #-0x8]\n"
+            "ldr d26, [x21, #-0x8]\n"
+            "fcvtl v0.4s, v0.4h\n"
+            "fmla v28.4s, v18.4s, v20.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
+            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
+            "ldr q19, [x23, #0x20]\n"
+            "fcvtl v26.4s, v26.4h\n"
+            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
+            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
+            "ldr q19, [x23, #0x40]\n"
+            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
+            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
+            "ldr q19, [x23, #0x60]\n"
+            ".inst 0x4e9da674  // smmla v20.4s, v19.16b, v29.16b\n"
+            ".inst 0x4e83a672  // smmla v18.4s, v19.16b, v3.16b\n"
+            "uzp1 v19.2d, v20.2d, v18.2d\n"
+            "scvtf v19.4s, v19.4s, #0x4\n"
+            "uzp2 v20.2d, v20.2d, v18.2d\n"
+            "fmul v18.4s, v27.4s, v9.s[0]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v11.4s, v19.4s, v18.4s\n"
+            "ldr q18, [x22, #0x0]\n"
+            "fmul v19.4s, v27.4s, v9.s[1]\n"
+            "fmla v13.4s, v20.4s, v19.4s\n"
+            "movi v19.4s, #0x0\n"
+            "movi v20.4s, #0x0\n"
+            ".inst 0x4e88a633  // smmla v19.4s, v17.16b, v8.16b\n"
+            ".inst 0x4e9fa634  // smmla v20.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x23, #0x30]\n"
+            ".inst 0x4e8fa633  // smmla v19.4s, v17.16b, v15.16b\n"
+            ".inst 0x4e81a634  // smmla v20.4s, v17.16b, v1.16b\n"
+            "ldr q17, [x23, #0x50]\n"
+            ".inst 0x4e95a633  // smmla v19.4s, v17.16b, v21.16b\n"
+            ".inst 0x4e90a634  // smmla v20.4s, v17.16b, v16.16b\n"
+            "ldr q17, [x23, #0x70]\n"
+            "add x23, x23, #0x88\n"
+            ".inst 0x4e9da633  // smmla v19.4s, v17.16b, v29.16b\n"
+            ".inst 0x4e83a634  // smmla v20.4s, v17.16b, v3.16b\n"
+            "uzp1 v17.2d, v19.2d, v20.2d\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "uzp2 v20.2d, v19.2d, v20.2d\n"
+            "fmul v19.4s, v27.4s, v9.s[2]\n"
+            "fmul v9.4s, v27.4s, v9.s[3]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v22.4s, v17.4s, v19.4s\n"
+            "ldr q17, [x22, #0x10]\n"
+            "movi v19.4s, #0x0\n"
+            ".inst 0x4e88a653  // smmla v19.4s, v18.16b, v8.16b\n"
+            "fmla v23.4s, v20.4s, v9.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v9.4s, #0x0\n"
+            ".inst 0x4e9fa654  // smmla v20.4s, v18.16b, v31.16b\n"
+            "ldr q18, [x22, #0x20]\n"
+            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
+            ".inst 0x4e8fa653  // smmla v19.4s, v18.16b, v15.16b\n"
+            ".inst 0x4e81a654  // smmla v20.4s, v18.16b, v1.16b\n"
+            "ldr q18, [x22, #0x40]\n"
+            ".inst 0x4e95a653  // smmla v19.4s, v18.16b, v21.16b\n"
+            ".inst 0x4e90a654  // smmla v20.4s, v18.16b, v16.16b\n"
+            "ldr q18, [x22, #0x60]\n"
+            ".inst 0x4e9da653  // smmla v19.4s, v18.16b, v29.16b\n"
+            ".inst 0x4e83a654  // smmla v20.4s, v18.16b, v3.16b\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e9fa632  // smmla v18.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x22, #0x30]\n"
+            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
+            ".inst 0x4e81a632  // smmla v18.4s, v17.16b, v1.16b\n"
+            "ldr q17, [x22, #0x50]\n"
+            ".inst 0x4e95a629  // smmla v9.4s, v17.16b, v21.16b\n"
+            ".inst 0x4e90a632  // smmla v18.4s, v17.16b, v16.16b\n"
+            "ldr q17, [x22, #0x70]\n"
+            "add x22, x22, #0x88\n"
+            ".inst 0x4e9da629  // smmla v9.4s, v17.16b, v29.16b\n"
+            ".inst 0x4e83a632  // smmla v18.4s, v17.16b, v3.16b\n"
+            "uzp1 v17.2d, v19.2d, v20.2d\n"
+            "uzp2 v20.2d, v19.2d, v20.2d\n"
+            "fmul v19.4s, v27.4s, v0.s[0]\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "fmla v25.4s, v17.4s, v19.4s\n"
+            "ldr q19, [x21, #0x0]\n"
+            "fmul v17.4s, v27.4s, v0.s[1]\n"
+            "fmla v5.4s, v20.4s, v17.4s\n"
+            "ldr q17, [x21, #0x10]\n"
+            "uzp1 v20.2d, v9.2d, v18.2d\n"
+            "uzp2 v9.2d, v9.2d, v18.2d\n"
+            "fmul v18.4s, v27.4s, v0.s[2]\n"
+            "fmul v0.4s, v27.4s, v0.s[3]\n"
+            "scvtf v20.4s, v20.4s, #0x4\n"
+            "scvtf v9.4s, v9.4s, #0x4\n"
+            "fmla v7.4s, v20.4s, v18.4s\n"
+            "movi v20.4s, #0x0\n"
+            "movi v18.4s, #0x0\n"
+            ".inst 0x4e88a674  // smmla v20.4s, v19.16b, v8.16b\n"
+            ".inst 0x4e9fa672  // smmla v18.4s, v19.16b, v31.16b\n"
+            "ldr q19, [x21, #0x20]\n"
+            "fmla v4.4s, v9.4s, v0.4s\n"
+            "movi v9.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            ".inst 0x4e88a629  // smmla v9.4s, v17.16b, v8.16b\n"
+            "fmul v8.4s, v27.4s, v26.s[0]\n"
+            ".inst 0x4e9fa620  // smmla v0.4s, v17.16b, v31.16b\n"
+            "ldr q17, [x21, #0x30]\n"
+            ".inst 0x4e8fa674  // smmla v20.4s, v19.16b, v15.16b\n"
+            "fmul v31.4s, v27.4s, v26.s[1]\n"
+            ".inst 0x4e81a672  // smmla v18.4s, v19.16b, v1.16b\n"
+            "ldr q19, [x21, #0x40]\n"
+            ".inst 0x4e8fa629  // smmla v9.4s, v17.16b, v15.16b\n"
+            "fmul v15.4s, v27.4s, v26.s[2]\n"
+            "fmul v27.4s, v27.4s, v26.s[3]\n"
+            ".inst 0x4e81a620  // smmla v0.4s, v17.16b, v1.16b\n"
+            "ldr q1, [x21, #0x50]\n"
+            ".inst 0x4e95a674  // smmla v20.4s, v19.16b, v21.16b\n"
+            ".inst 0x4e90a672  // smmla v18.4s, v19.16b, v16.16b\n"
+            "ldr q26, [x21, #0x60]\n"
+            ".inst 0x4e95a429  // smmla v9.4s, v1.16b, v21.16b\n"
+            ".inst 0x4e90a420  // smmla v0.4s, v1.16b, v16.16b\n"
+            "ldr q21, [x21, #0x70]\n"
+            "add x21, x21, #0x88\n"
+            ".inst 0x4e9da754  // smmla v20.4s, v26.16b, v29.16b\n"
+            ".inst 0x4e83a752  // smmla v18.4s, v26.16b, v3.16b\n"
+            ".inst 0x4e9da6a9  // smmla v9.4s, v21.16b, v29.16b\n"
+            ".inst 0x4e83a6a0  // smmla v0.4s, v21.16b, v3.16b\n"
+            "uzp1 v29.2d, v20.2d, v18.2d\n"
+            "uzp2 v21.2d, v20.2d, v18.2d\n"
+            "scvtf v29.4s, v29.4s, #0x4\n"
+            "uzp1 v18.2d, v9.2d, v0.2d\n"
+            "uzp2 v16.2d, v9.2d, v0.2d\n"
+            "scvtf v21.4s, v21.4s, #0x4\n"
+            "fmla v6.4s, v29.4s, v8.4s\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "scvtf v16.4s, v16.4s, #0x4\n"
+            "fmla v30.4s, v21.4s, v31.4s\n"
+            "fmla v24.4s, v18.4s, v15.4s\n"
+            "fmla v14.4s, v16.4s, v27.4s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x27, x27, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "str q2, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q10, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q12, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q28, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q11, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q13, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q22, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q23, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q25, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q5, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q7, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q4, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q6, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q30, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q24, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "str q14, [x20, #0x0]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x10, x10, #0x10\n"
+            "cmp x10, #0x10\n"
+            "mov %x[res_ptr], x26\n"
+            "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x10, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x24, %x[b_ptr], #0x8\n"
+            "mov x23, %x[nc]\n"
+            "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "movi v2.16b, #0x0\n"
+            "movi v10.16b, #0x0\n"
+            "add x25, %x[a_ptr], #0x8\n"
+            "mov x21, %x[nb]\n"
+            "movi v12.16b, #0x0\n"
+            "movi v28.16b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ldr q6, [x24, #0x0]\n"
+            "ldr q5, [x24, #0x10]\n"
+            "movi v17.16b, #0x4\n"
+            "movi v8.4s, #0x0\n"
+            "ldr q4, [x25, #0x0]\n"
+            "ldr q13, [x25, #0x10]\n"
+            "movi v27.4s, #0x0\n"
+            "movi v0.4s, #0x0\n"
+            "ldr q31, [x24, #0x20]\n"
+            "ldr q14, [x24, #0x30]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v22.16b, #0xf0\n"
+            "ldr q11, [x25, #0x20]\n"
+            "ldr q23, [x25, #0x30]\n"
+            "sshl v21.16b, v6.16b, v17.16b\n"
+            "sshl v16.16b, v5.16b, v17.16b\n"
+            "ldr q20, [x25, #0x40]\n"
+            "ldr q26, [x25, #0x50]\n"
+            "and v6.16b, v6.16b, v22.16b\n"
+            "and v5.16b, v5.16b, v22.16b\n"
+            "ldr q25, [x25, #0x60]\n"
+            "ldr q3, [x25, #0x70]\n"
+            "sshl v19.16b, v31.16b, v17.16b\n"
+            "sshl v18.16b, v14.16b, v17.16b\n"
+            "ldr d17, [x25, #-0x8]\n"
+            ".inst 0x4e95a488  // smmla v8.4s, v4.16b, v21.16b\n"
+            ".inst 0x4e90a49b  // smmla v27.4s, v4.16b, v16.16b\n"
+            "and v31.16b, v31.16b, v22.16b\n"
+            ".inst 0x4e95a5a0  // smmla v0.4s, v13.16b, v21.16b\n"
+            ".inst 0x4e90a5bd  // smmla v29.4s, v13.16b, v16.16b\n"
+            "and v14.16b, v14.16b, v22.16b\n"
+            "sub x20, x24, #0x8\n"
+            "ldr d16, [x20, #0x0]\n"
+            "subs x21, x21, #0x1\n"
+            "add x25, x25, #0x88\n"
+            "fcvtl v17.4s, v17.4h\n"
+            "add x24, x24, #0x48\n"
+            ".inst 0x4e93a568  // smmla v8.4s, v11.16b, v19.16b\n"
+            ".inst 0x4e92a57b  // smmla v27.4s, v11.16b, v18.16b\n"
+            ".inst 0x4e93a6e0  // smmla v0.4s, v23.16b, v19.16b\n"
+            ".inst 0x4e92a6fd  // smmla v29.4s, v23.16b, v18.16b\n"
+            "fcvtl v16.4s, v16.4h\n"
+            ".inst 0x4e86a688  // smmla v8.4s, v20.16b, v6.16b\n"
+            ".inst 0x4e85a69b  // smmla v27.4s, v20.16b, v5.16b\n"
+            "fmul v23.4s, v16.4s, v17.s[0]\n"
+            "fmul v21.4s, v16.4s, v17.s[1]\n"
+            "fmul v1.4s, v16.4s, v17.s[2]\n"
+            "fmul v20.4s, v16.4s, v17.s[3]\n"
+            ".inst 0x4e86a740  // smmla v0.4s, v26.16b, v6.16b\n"
+            ".inst 0x4e85a75d  // smmla v29.4s, v26.16b, v5.16b\n"
+            ".inst 0x4e9fa728  // smmla v8.4s, v25.16b, v31.16b\n"
+            ".inst 0x4e8ea73b  // smmla v27.4s, v25.16b, v14.16b\n"
+            ".inst 0x4e9fa460  // smmla v0.4s, v3.16b, v31.16b\n"
+            ".inst 0x4e8ea47d  // smmla v29.4s, v3.16b, v14.16b\n"
+            "uzp1 v19.2d, v8.2d, v27.2d\n"
+            "uzp2 v18.2d, v8.2d, v27.2d\n"
+            "scvtf v19.4s, v19.4s, #0x4\n"
+            "uzp1 v17.2d, v0.2d, v29.2d\n"
+            "uzp2 v16.2d, v0.2d, v29.2d\n"
+            "scvtf v18.4s, v18.4s, #0x4\n"
+            "fmla v2.4s, v19.4s, v23.4s\n"
+            "scvtf v17.4s, v17.4s, #0x4\n"
+            "scvtf v16.4s, v16.4s, #0x4\n"
+            "fmla v10.4s, v18.4s, v21.4s\n"
+            "fmla v12.4s, v17.4s, v1.4s\n"
+            "fmla v28.4s, v16.4s, v20.4s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x10, #0x1\n"
+            "str q2, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x2\n"
+            "str q10, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x10, #0x3\n"
+            "str q12, [x20, #0x0]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "str q28, [x20, #0x0]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x23, x23, #0x4\n"
+            "add %x[res_ptr], %x[res_ptr], #0x10\n"
+            "bne 6b\n"
+            "subs x10, x10, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x9\n"
+            "mov %x[res_ptr], x22\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+        );
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                        (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                            }
+                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
+static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        const void * b_ptr = vx;
+        const void * a_ptr = vy;
+        float * res_ptr = s;
+        size_t res_stride = bs * sizeof(float);
+
+        __asm__ __volatile__(
+            "mov x20, #0x4\n"
+            "mov x13, %x[nr]\n"
+            "mov z28.s, #-0x4\n"
+            "mov x12, #0x88\n"
+            "ptrue p1.b\n"
+            "whilelt p0.s, XZR, x20\n"
+            "cmp x13, #0x10\n"
+            "mul x12, %x[nb], x12\n"
+            "blt 4f\n"
+            "1:"  // Row loop
+            "add x11, %x[b_ptr], #0x10\n"
+            "mov x10, %x[nc]\n"
+            "add x9, %x[res_ptr], %x[res_stride], LSL #4\n"
+            "2:"  // Column loop
+            "add x28, %x[a_ptr], #0x8\n"
+            "mov z24.b, #0x0\n"
+            "mov z15.b, #0x0\n"
+            "mov x27, %x[nb]\n"
+            "add x26, x28, x12\n"
+            "mov z12.b, #0x0\n"
+            "mov z0.b, #0x0\n"
+            "add x25, x26, x12\n"
+            "mov z13.b, #0x0\n"
+            "mov z1.b, #0x0\n"
+            "add x24, x25, x12\n"
+            "mov z20.b, #0x0\n"
+            "mov z25.b, #0x0\n"
+            "mov z11.b, #0x0\n"
+            "mov z16.b, #0x0\n"
+            "mov z19.b, #0x0\n"
+            "mov z26.b, #0x0\n"
+            "mov z8.b, #0x0\n"
+            "mov z29.b, #0x0\n"
+            "mov z27.b, #0x0\n"
+            "mov z10.b, #0x0\n"
+            "3:"  // Block loop
+            "ld1b { z30.b }, p1/Z, [x11]\n"
+            "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n"
+            "mov z18.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            "ld1rqb { z3.b }, p1/Z, [x28]\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #16]\n"
+            "mov z9.s, #0x0\n"
+            "mov z22.s, #0x0\n"
+            "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n"
+            "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n"
+            "sub x20, x11, #0x10\n"
+            "sub x23, x28, #0x8\n"
+            "lsl z31.b, z30.b, #0x4\n"
+            "lsl z6.b, z21.b, #0x4\n"
+            "ld1h { z23.s }, p1/Z, [x20]\n"
+            "sub x22, x26, #0x8\n"
+            "and z30.b, z30.b, #0xf0\n"
+            "and z21.b, z21.b, #0xf0\n"
+            "sub x21, x25, #0x8\n"
+            "sub x20, x24, #0x8\n"
+            "lsl z14.b, z4.b, #0x4\n"
+            "lsl z2.b, z17.b, #0x4\n"
+            "subs x27, x27, #0x1\n"
+            "add x11, x11, #0x90\n"
+            ".inst 0x451f9872  // smmla z18.s, z3.b, z31.b\n"
+            ".inst 0x45069867  // smmla z7.s, z3.b, z6.b\n"
+            "ld1rqb { z3.b }, p1/Z, [x28, #32]\n"
+            "and z4.b, z4.b, #0xf0\n"
+            ".inst 0x451f98a9  // smmla z9.s, z5.b, z31.b\n"
+            ".inst 0x450698b6  // smmla z22.s, z5.b, z6.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #48]\n"
+            "and z17.b, z17.b, #0xf0\n"
+            "fcvt z23.s, p1/m, z23.h\n"
+            ".inst 0x450e9872  // smmla z18.s, z3.b, z14.b\n"
+            ".inst 0x45029867  // smmla z7.s, z3.b, z2.b\n"
+            "ld1rqb { z3.b }, p1/Z, [x28, #64]\n"
+            ".inst 0x450e98a9  // smmla z9.s, z5.b, z14.b\n"
+            ".inst 0x450298b6  // smmla z22.s, z5.b, z2.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #80]\n"
+            "fscale z23.s, p1/m, z23.s, z28.s\n"
+            ".inst 0x451e9872  // smmla z18.s, z3.b, z30.b\n"
+            ".inst 0x45159867  // smmla z7.s, z3.b, z21.b\n"
+            "ld1rqb { z3.b }, p1/Z, [x28, #96]\n"
+            ".inst 0x451e98a9  // smmla z9.s, z5.b, z30.b\n"
+            ".inst 0x451598b6  // smmla z22.s, z5.b, z21.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x28, #112]\n"
+            "add x28, x28, #0x88\n"
+            ".inst 0x45049872  // smmla z18.s, z3.b, z4.b\n"
+            ".inst 0x45119867  // smmla z7.s, z3.b, z17.b\n"
+            "ld1h { z3.s }, p0/Z, [x23]\n"
+            ".inst 0x450498a9  // smmla z9.s, z5.b, z4.b\n"
+            ".inst 0x451198b6  // smmla z22.s, z5.b, z17.b\n"
+            "fcvt z3.s, p1/m, z3.h\n"
+            "uzp1 z5.d, z18.d, z7.d\n"
+            "uzp2 z18.d, z18.d, z7.d\n"
+            "mov z3.q, z3.q[0]\n"
+            "uzp1 z7.d, z9.d, z22.d\n"
+            "uzp2 z22.d, z9.d, z22.d\n"
+            "fmul z9.s, z23.s, z3.s[0]\n"
+            "scvtf z5.s, p1/m, z5.s\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "scvtf z7.s, p1/m, z7.s\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z24.s, p1/M, z5.s, z9.s\n"
+            "ld1rqb { z5.b }, p1/Z, [x26]\n"
+            "fmul z9.s, z23.s, z3.s[1]\n"
+            "fmla z15.s, p1/M, z18.s, z9.s\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #16]\n"
+            "fmul z9.s, z23.s, z3.s[2]\n"
+            "fmul z3.s, z23.s, z3.s[3]\n"
+            "fmla z12.s, p1/M, z7.s, z9.s\n"
+            "mov z9.s, #0x0\n"
+            "ld1h { z7.s }, p0/Z, [x22]\n"
+            ".inst 0x451f98a9  // smmla z9.s, z5.b, z31.b\n"
+            "fmla z0.s, p1/M, z22.s, z3.s\n"
+            "mov z22.s, #0x0\n"
+            "ld1h { z3.s }, p0/Z, [x21]\n"
+            ".inst 0x450698b6  // smmla z22.s, z5.b, z6.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x26, #32]\n"
+            "fcvt z7.s, p1/m, z7.h\n"
+            "fcvt z3.s, p1/m, z3.h\n"
+            ".inst 0x450e98a9  // smmla z9.s, z5.b, z14.b\n"
+            ".inst 0x450298b6  // smmla z22.s, z5.b, z2.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x26, #64]\n"
+            "mov z7.q, z7.q[0]\n"
+            "mov z3.q, z3.q[0]\n"
+            ".inst 0x451e98a9  // smmla z9.s, z5.b, z30.b\n"
+            ".inst 0x451598b6  // smmla z22.s, z5.b, z21.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x26, #96]\n"
+            ".inst 0x450498a9  // smmla z9.s, z5.b, z4.b\n"
+            ".inst 0x451198b6  // smmla z22.s, z5.b, z17.b\n"
+            "uzp1 z5.d, z9.d, z22.d\n"
+            "scvtf z5.s, p1/m, z5.s\n"
+            "uzp2 z22.d, z9.d, z22.d\n"
+            "fmul z9.s, z23.s, z7.s[0]\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z13.s, p1/M, z5.s, z9.s\n"
+            "ld1rqb { z9.b }, p1/Z, [x25]\n"
+            "fmul z5.s, z23.s, z7.s[1]\n"
+            "fmla z1.s, p1/M, z22.s, z5.s\n"
+            "mov z5.s, #0x0\n"
+            "mov z22.s, #0x0\n"
+            ".inst 0x451f9a45  // smmla z5.s, z18.b, z31.b\n"
+            ".inst 0x45069a56  // smmla z22.s, z18.b, z6.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #48]\n"
+            ".inst 0x450e9a45  // smmla z5.s, z18.b, z14.b\n"
+            ".inst 0x45029a56  // smmla z22.s, z18.b, z2.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #80]\n"
+            ".inst 0x451e9a45  // smmla z5.s, z18.b, z30.b\n"
+            ".inst 0x45159a56  // smmla z22.s, z18.b, z21.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x26, #112]\n"
+            "add x26, x26, #0x88\n"
+            ".inst 0x45049a45  // smmla z5.s, z18.b, z4.b\n"
+            ".inst 0x45119a56  // smmla z22.s, z18.b, z17.b\n"
+            "uzp1 z18.d, z5.d, z22.d\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "uzp2 z22.d, z5.d, z22.d\n"
+            "fmul z5.s, z23.s, z7.s[2]\n"
+            "fmul z7.s, z23.s, z7.s[3]\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z20.s, p1/M, z18.s, z5.s\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #16]\n"
+            "ld1h { z5.s }, p0/Z, [x20]\n"
+            "fcvt z5.s, p1/m, z5.h\n"
+            "fmla z25.s, p1/M, z22.s, z7.s\n"
+            "mov z22.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            ".inst 0x451f9936  // smmla z22.s, z9.b, z31.b\n"
+            ".inst 0x45069927  // smmla z7.s, z9.b, z6.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x25, #32]\n"
+            "mov z5.q, z5.q[0]\n"
+            ".inst 0x450e9936  // smmla z22.s, z9.b, z14.b\n"
+            ".inst 0x45029927  // smmla z7.s, z9.b, z2.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x25, #64]\n"
+            ".inst 0x451e9936  // smmla z22.s, z9.b, z30.b\n"
+            ".inst 0x45159927  // smmla z7.s, z9.b, z21.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x25, #96]\n"
+            ".inst 0x45049936  // smmla z22.s, z9.b, z4.b\n"
+            ".inst 0x45119927  // smmla z7.s, z9.b, z17.b\n"
+            "uzp1 z9.d, z22.d, z7.d\n"
+            "scvtf z9.s, p1/m, z9.s\n"
+            "uzp2 z22.d, z22.d, z7.d\n"
+            "fmul z7.s, z23.s, z3.s[0]\n"
+            "scvtf z22.s, p1/m, z22.s\n"
+            "fmla z11.s, p1/M, z9.s, z7.s\n"
+            "ld1rqb { z9.b }, p1/Z, [x24]\n"
+            "fmul z7.s, z23.s, z3.s[1]\n"
+            "fmla z16.s, p1/M, z22.s, z7.s\n"
+            "mov z22.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            ".inst 0x451f9a56  // smmla z22.s, z18.b, z31.b\n"
+            ".inst 0x45069a47  // smmla z7.s, z18.b, z6.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #48]\n"
+            ".inst 0x450e9a56  // smmla z22.s, z18.b, z14.b\n"
+            ".inst 0x45029a47  // smmla z7.s, z18.b, z2.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #80]\n"
+            ".inst 0x451e9a56  // smmla z22.s, z18.b, z30.b\n"
+            ".inst 0x45159a47  // smmla z7.s, z18.b, z21.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x25, #112]\n"
+            "add x25, x25, #0x88\n"
+            ".inst 0x45049a56  // smmla z22.s, z18.b, z4.b\n"
+            ".inst 0x45119a47  // smmla z7.s, z18.b, z17.b\n"
+            "uzp1 z18.d, z22.d, z7.d\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "uzp2 z7.d, z22.d, z7.d\n"
+            "fmul z22.s, z23.s, z3.s[2]\n"
+            "fmul z3.s, z23.s, z3.s[3]\n"
+            "scvtf z7.s, p1/m, z7.s\n"
+            "fmla z19.s, p1/M, z18.s, z22.s\n"
+            "ld1rqb { z18.b }, p1/Z, [x24, #16]\n"
+            "fmul z22.s, z23.s, z5.s[0]\n"
+            "fmla z26.s, p1/M, z7.s, z3.s\n"
+            "mov z3.s, #0x0\n"
+            "mov z7.s, #0x0\n"
+            ".inst 0x451f9923  // smmla z3.s, z9.b, z31.b\n"
+            ".inst 0x45069927  // smmla z7.s, z9.b, z6.b\n"
+            "ld1rqb { z9.b }, p1/Z, [x24, #32]\n"
+            ".inst 0x450e9923  // smmla z3.s, z9.b, z14.b\n"
+            ".inst 0x45029927  // smmla z7.s, z9.b, z2.b\n"
+            "mov z9.s, #0x0\n"
+            ".inst 0x451f9a49  // smmla z9.s, z18.b, z31.b\n"
+            "mov z31.s, #0x0\n"
+            ".inst 0x45069a5f  // smmla z31.s, z18.b, z6.b\n"
+            "ld1rqb { z6.b }, p1/Z, [x24, #48]\n"
+            "ld1rqb { z18.b }, p1/Z, [x24, #64]\n"
+            ".inst 0x450e98c9  // smmla z9.s, z6.b, z14.b\n"
+            "fmul z14.s, z23.s, z5.s[1]\n"
+            ".inst 0x450298df  // smmla z31.s, z6.b, z2.b\n"
+            "ld1rqb { z6.b }, p1/Z, [x24, #80]\n"
+            "fmul z2.s, z23.s, z5.s[2]\n"
+            "fmul z23.s, z23.s, z5.s[3]\n"
+            ".inst 0x451e9a43  // smmla z3.s, z18.b, z30.b\n"
+            ".inst 0x45159a47  // smmla z7.s, z18.b, z21.b\n"
+            "ld1rqb { z5.b }, p1/Z, [x24, #96]\n"
+            ".inst 0x451e98c9  // smmla z9.s, z6.b, z30.b\n"
+            ".inst 0x451598df  // smmla z31.s, z6.b, z21.b\n"
+            "ld1rqb { z18.b }, p1/Z, [x24, #112]\n"
+            "add x24, x24, #0x88\n"
+            ".inst 0x450498a3  // smmla z3.s, z5.b, z4.b\n"
+            ".inst 0x451198a7  // smmla z7.s, z5.b, z17.b\n"
+            ".inst 0x45049a49  // smmla z9.s, z18.b, z4.b\n"
+            ".inst 0x45119a5f  // smmla z31.s, z18.b, z17.b\n"
+            "uzp1 z18.d, z3.d, z7.d\n"
+            "uzp2 z5.d, z3.d, z7.d\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "uzp1 z6.d, z9.d, z31.d\n"
+            "uzp2 z9.d, z9.d, z31.d\n"
+            "scvtf z5.s, p1/m, z5.s\n"
+            "fmla z8.s, p1/M, z18.s, z22.s\n"
+            "scvtf z6.s, p1/m, z6.s\n"
+            "scvtf z9.s, p1/m, z9.s\n"
+            "fmla z29.s, p1/M, z5.s, z14.s\n"
+            "fmla z27.s, p1/M, z6.s, z2.s\n"
+            "fmla z10.s, p1/M, z9.s, z23.s\n"
+            "bgt 3b\n"
+            "mov x20, %x[res_ptr]\n"
+            "subs x10, x10, #0x8\n"
+            "add %x[res_ptr], %x[res_ptr], #0x20\n"
+            "st1w { z24.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z15.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z12.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z0.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z13.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z1.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z20.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z25.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z11.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z16.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z19.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z26.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z8.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z29.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z27.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "st1w { z10.s }, p1, [x20]\n"
+            "bne 2b\n"
+            "mov x20, #0x4\n"
+            "sub x13, x13, #0x10\n"
+            "cmp x13, #0x10\n"
+            "mov %x[res_ptr], x9\n"
+            "madd %x[a_ptr], x20, x12, %x[a_ptr]\n"
+            "bge 1b\n"
+            "4:"  // Row loop skip
+            "cbz x13, 9f\n"
+            "5:"  // Row tail: Row loop
+            "add x25, %x[b_ptr], #0x10\n"
+            "mov x24, %x[nc]\n"
+            "add x23, %x[res_ptr], %x[res_stride], LSL #2\n"
+            "6:"  // Row tail: Column loop
+            "mov z24.b, #0x0\n"
+            "mov z15.b, #0x0\n"
+            "add x28, %x[a_ptr], #0x8\n"
+            "mov x22, %x[nb]\n"
+            "mov z12.b, #0x0\n"
+            "mov z0.b, #0x0\n"
+            "7:"  // Row tail: Block loop
+            "ld1b { z3.b }, p1/Z, [x25]\n"
+            "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
+            "mov z2.s, #0x0\n"
+            "mov z25.s, #0x0\n"
+            "ld1rqb { z26.b }, p1/Z, [x28]\n"
+            "ld1rqb { z21.b }, p1/Z, [x28, #16]\n"
+            "mov z27.s, #0x0\n"
+            "mov z19.s, #0x0\n"
+            "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n"
+            "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n"
+            "sub x21, x25, #0x10\n"
+            "sub x20, x28, #0x8\n"
+            "lsl z20.b, z3.b, #0x4\n"
+            "lsl z4.b, z6.b, #0x4\n"
+            "ld1rqb { z10.b }, p1/Z, [x28, #32]\n"
+            "ld1rqb { z23.b }, p1/Z, [x28, #48]\n"
+            "and z3.b, z3.b, #0xf0\n"
+            "and z6.b, z6.b, #0xf0\n"
+            "ld1rqb { z11.b }, p1/Z, [x28, #64]\n"
+            "ld1rqb { z7.b }, p1/Z, [x28, #80]\n"
+            "lsl z8.b, z29.b, #0x4\n"
+            "lsl z14.b, z16.b, #0x4\n"
+            "ld1rqb { z18.b }, p1/Z, [x28, #96]\n"
+            "ld1rqb { z30.b }, p1/Z, [x28, #112]\n"
+            ".inst 0x45149b42  // smmla z2.s, z26.b, z20.b\n"
+            ".inst 0x45049b59  // smmla z25.s, z26.b, z4.b\n"
+            "and z29.b, z29.b, #0xf0\n"
+            "ld1h { z17.s }, p1/Z, [x21]\n"
+            ".inst 0x45149abb  // smmla z27.s, z21.b, z20.b\n"
+            ".inst 0x45049ab3  // smmla z19.s, z21.b, z4.b\n"
+            "and z16.b, z16.b, #0xf0\n"
+            "ld1h { z4.s }, p0/Z, [x20]\n"
+            "subs x22, x22, #0x1\n"
+            "add x28, x28, #0x88\n"
+            "fcvt z17.s, p1/m, z17.h\n"
+            "add x25, x25, #0x90\n"
+            ".inst 0x45089942  // smmla z2.s, z10.b, z8.b\n"
+            ".inst 0x450e9959  // smmla z25.s, z10.b, z14.b\n"
+            "fcvt z4.s, p1/m, z4.h\n"
+            ".inst 0x45089afb  // smmla z27.s, z23.b, z8.b\n"
+            ".inst 0x450e9af3  // smmla z19.s, z23.b, z14.b\n"
+            "fscale z17.s, p1/m, z17.s, z28.s\n"
+            "mov z4.q, z4.q[0]\n"
+            ".inst 0x45039962  // smmla z2.s, z11.b, z3.b\n"
+            ".inst 0x45069979  // smmla z25.s, z11.b, z6.b\n"
+            "fmul z23.s, z17.s, z4.s[0]\n"
+            "fmul z9.s, z17.s, z4.s[1]\n"
+            "fmul z21.s, z17.s, z4.s[2]\n"
+            "fmul z4.s, z17.s, z4.s[3]\n"
+            ".inst 0x450398fb  // smmla z27.s, z7.b, z3.b\n"
+            ".inst 0x450698f3  // smmla z19.s, z7.b, z6.b\n"
+            ".inst 0x451d9a42  // smmla z2.s, z18.b, z29.b\n"
+            ".inst 0x45109a59  // smmla z25.s, z18.b, z16.b\n"
+            ".inst 0x451d9bdb  // smmla z27.s, z30.b, z29.b\n"
+            ".inst 0x45109bd3  // smmla z19.s, z30.b, z16.b\n"
+            "uzp1 z31.d, z2.d, z25.d\n"
+            "uzp2 z13.d, z2.d, z25.d\n"
+            "scvtf z31.s, p1/m, z31.s\n"
+            "uzp1 z17.d, z27.d, z19.d\n"
+            "uzp2 z18.d, z27.d, z19.d\n"
+            "scvtf z13.s, p1/m, z13.s\n"
+            "fmla z24.s, p1/M, z31.s, z23.s\n"
+            "scvtf z17.s, p1/m, z17.s\n"
+            "scvtf z18.s, p1/m, z18.s\n"
+            "fmla z15.s, p1/M, z13.s, z9.s\n"
+            "fmla z12.s, p1/M, z17.s, z21.s\n"
+            "fmla z0.s, p1/M, z18.s, z4.s\n"
+            "bgt 7b\n"
+            "mov x20, %x[res_ptr]\n"
+            "cmp x13, #0x1\n"
+            "st1w { z24.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x13, #0x2\n"
+            "st1w { z15.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "cmp x13, #0x3\n"
+            "st1w { z12.s }, p1, [x20]\n"
+            "add x20, x20, %x[res_stride]\n"
+            "ble 8f\n"
+            "st1w { z0.s }, p1, [x20]\n"
+            "8:"  // Row tail: Accumulator store skip
+            "subs x24, x24, #0x8\n"
+            "add %x[res_ptr], %x[res_ptr], #0x20\n"
+            "bne 6b\n"
+            "subs x13, x13, #0x4\n"
+            "add %x[a_ptr], %x[a_ptr], x12\n"
+            "mov %x[res_ptr], x23\n"
+            "bgt 5b\n"
+            "9:"  // Row tail: Row loop skip
+            : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
+            : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
+            : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+        );
+        return;
+    }
+#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#elif defined(__AVX2__) || defined(__AVX512F__)
+    {
+        const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
+        const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
+        int64_t b_nb = n / QK4_0;
+        int64_t y = 0;
+        // Mask to mask out nibbles from packed bytes
+        const __m256i m4b = _mm256_set1_epi8(0x0F);
+        const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
+        // Lookup table to convert signed nibbles to signed bytes
+        __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
+        signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
+        // Permute mask used for easier vector processing at later stages
+        __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
+        int64_t xstart = 0;
+        int anr = nr - nr%16; // Used to align nr with boundary of 16
+    #ifdef __AVX512F__
+        int anc = nc - nc%16; // Used to align nc with boundary of 16
+        // Mask to mask out nibbles from packed bytes expanded to 512 bit length
+        const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
+        // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
+        __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
+
+        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < anr / 4; y += 4) {
+
+            const block_q8_0x4 * a_ptrs[4];
+
+            a_ptrs[0] = a_ptr_start + (y * nb);
+            for (int i = 0; i < 3; ++i) {
+                a_ptrs[i + 1] = a_ptrs[i] + nb;
+            }
+
+            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = 0; x < anc / 8; x += 2) {
+
+                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+                // Master FP accumulators
+                __m512 acc_rows[16];
+                for (int i = 0; i < 16; i++) {
+                    acc_rows[i] = _mm512_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+                    // Scale values - Load the weight scale values of two block_q4_0x8
+                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                    // Process LHS in pairs of rows
+                    for (int rp = 0; rp < 4; rp++) {
+
+                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                        // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                        __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                        __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                        __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                        __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                        __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                        __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                        __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                        __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                        __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                        __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                        __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                        __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                        __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                        __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                        __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                        __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                        __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                        __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                        __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                        __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                        // Shuffle pattern one - left side input
+
+                        const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                        const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                        const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                        const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                        const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                        const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                        const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                        const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                        // Shuffle pattern two - left side input
+
+                        const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                        const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                        const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                        const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                        const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                        const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                        const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                        const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                        // Resembles MMLAs into 2x2 matrices in ARM Version
+                        __m512i iacc_mat_00_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                        __m512i iacc_mat_01_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                        __m512i iacc_mat_10_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                        __m512i iacc_mat_11_sp1 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                        __m512i iacc_mat_00_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                        __m512i iacc_mat_01_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                        __m512i iacc_mat_10_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                        __m512i iacc_mat_11_sp2 =
+                            _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                        __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                        __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                        __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                        __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                        // Straighten out to make 4 row vectors
+                        __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
+                        __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
+                        __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
+                        __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
+
+                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                        const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
+                        const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                        // Multiply with appropiate scales and accumulate
+                        acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
+                        acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
+                        acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                        acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+                    }
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 16; i++) {
+                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < nr / 4; y ++) {
+
+            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+            // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = 0; x < anc / 8; x += 2) {
+
+                const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x)     * b_nb);
+                const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
+
+                // Master FP accumulators
+                __m512 acc_rows[4];
+                for (int i = 0; i < 4; i++) {
+                    acc_rows[i] = _mm512_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
+
+                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
+                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
+                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
+                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
+                    const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
+
+                    const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
+                    const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
+                    const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
+                    const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
+
+                    const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
+                    const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
+
+                    const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
+                    const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
+
+                    const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
+                    const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
+                    const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
+
+                    const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
+                    const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
+
+                    const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
+                    const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
+
+                    const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
+                    const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
+                    const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
+
+                    const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
+                    const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
+
+                    const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
+                    const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
+
+                    const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
+                    const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
+
+
+                    // Scale values - Load the weight scale values of two block_q4_0x8
+                    const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
+
+                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                    // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
+                    __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
+                    __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
+                    __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
+                    __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
+                    __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
+                    __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
+                    __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
+                    __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
+                    __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
+                    __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
+                    __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
+                    __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
+
+                    __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
+                    __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
+                    __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
+                    __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
+                    __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
+                    __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
+                    __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
+                    __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
+
+                    // Shuffle pattern one - left side input
+
+                    const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                    const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                    const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                    const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                    const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                    const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                    const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                    const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                    // Shuffle pattern two - left side input
+
+                    const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                    const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                    const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                    const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                    const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                    const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                    const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                    const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                    // Resembles MMLAs into 2x2 matrices in ARM Version
+                    __m512i iacc_mat_00_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
+                    __m512i iacc_mat_01_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                    __m512i iacc_mat_10_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
+                    __m512i iacc_mat_11_sp1 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
+                    __m512i iacc_mat_00_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
+                    __m512i iacc_mat_01_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
+                    __m512i iacc_mat_10_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
+                    __m512i iacc_mat_11_sp2 =
+                        _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
+
+                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                    __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                    __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                    __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                    __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                    // Straighten out to make 4 row vectors
+                    __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
+                    __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
+                    __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
+                    __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
+
+                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                    const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
+                    const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
+
+                    // Multiply with appropiate scales and accumulate
+                    acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
+                    acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
+                    acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                    acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 4; i++) {
+                    _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        if (anc != nc) {
+            xstart = anc/8;
+            y = 0;
+        }
+    #endif // __AVX512F__
+
+        // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
+
+        for (; y < anr / 4; y += 4) {
+            const block_q8_0x4 * a_ptrs[4];
+
+            a_ptrs[0] = a_ptr_start + (y * nb);
+            for (int i = 0; i < 3; ++i) {
+                a_ptrs[i + 1] = a_ptrs[i] + nb;
+            }
+
+            // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
+            for (int64_t x = xstart; x < nc / 8; x++) {
+
+                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+                // Master FP accumulators
+                __m256 acc_rows[16];
+                for (int i = 0; i < 16; i++) {
+                    acc_rows[i] = _mm256_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
+
+                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
+
+                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
+
+                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
+
+                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
+
+                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
+
+                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+
+                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+
+                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+
+                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+
+                    // Scale values - Load the wight scale values of block_q4_0x8
+                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
+
+                    // Process LHS in groups of four
+                    for (int rp = 0; rp < 4; rp++) {
+                        // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                        // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
+                        __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+                        __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
+                        __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
+                        __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+                        __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
+                        __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
+                        __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+                        __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
+                        __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
+                        __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+                        __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
+                        __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
+
+                        // Shuffle pattern one - left side input
+                        const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                        const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                        const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                        const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                        const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                        const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                        const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                        const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                        // Shuffle pattern two - left side input
+                        const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                        const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                        const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                        const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                        const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                        const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                        const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                        const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                        // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                        // Resembles MMLAs into 2x2 matrices in ARM Version
+                        __m256i iacc_mat_00_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                        __m256i iacc_mat_01_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                        __m256i iacc_mat_10_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                        __m256i iacc_mat_11_sp1 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                        __m256i iacc_mat_00_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                        __m256i iacc_mat_01_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                        __m256i iacc_mat_10_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                        __m256i iacc_mat_11_sp2 =
+                            _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+
+                        // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                        __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                        __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                        __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                        __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+                        // Straighten out to make 4 row vectors
+                        __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
+                        __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
+                        __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
+                        __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
+
+                        // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                        const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
+
+                        // Multiply with appropiate scales and accumulate
+                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
+                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
+                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
+                        acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32,  255)), acc_rows[rp * 4 + 3]);
+                    }
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 16; i++) {
+                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+
+        // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
+        for (; y < nr / 4; y ++) {
+
+            const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+            // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+            for (int64_t x = xstart; x < nc / 8; x++) {
+
+                const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+                // Master FP accumulators
+                __m256 acc_rows[4];
+                for (int i = 0; i < 4; i++) {
+                    acc_rows[i] = _mm256_setzero_ps();
+                }
+
+                for (int64_t b = 0; b < nb; b++) {
+                    // Load the eight block_q8_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+                    // 4-bit -> 8-bit - Sign is maintained
+                    const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b));  //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+                    const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b));  //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
+
+                    const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b));  //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+                    const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b));  //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
+
+                    const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b));  //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+                    const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b));  //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
+
+                    const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b));  //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+                    const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b));  //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
+
+                    // Shuffle pattern one - right side input
+                    const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136);  //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+                    const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136);  //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
+
+                    const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136);  //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+                    const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136);  //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
+
+                    const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136);  //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+                    const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136);  //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
+
+                    const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136);  //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+                    const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136);  //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
+
+                    // Shuffle pattern two - right side input
+
+                    const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221);  //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+                    const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221);  //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+
+                    const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221);  //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+                    const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221);  //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+
+                    const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221);  //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+                    const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221);  //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+
+                    const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221);  //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+                    const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221);  //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+
+                    // Scale values - Load the wight scale values of block_q4_0x8
+                    const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
+
+                    // Load the four block_q4_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
+                    // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
+                    __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
+                    __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
+                    __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
+                    __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
+                    __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
+                    __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
+                    __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
+                    __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
+                    __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
+                    __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
+                    __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
+                    __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
+
+                    // Shuffle pattern one - left side input
+
+                    const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160);  //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
+                    const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160);  //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
+
+                    const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160);  //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
+                    const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160);  //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
+
+                    const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160);  //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
+                    const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160);  //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
+
+                    const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160);  //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
+                    const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160);  //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
+
+                    // Shuffle pattern two - left side input
+
+                    const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245);  //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
+                    const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245);  //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
+
+                    const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245);  //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
+                    const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245);  //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
+
+                    const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245);  //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
+                    const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245);  //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
+
+                    const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245);  //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
+                    const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245);  //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
+
+                    // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
+                    // Resembles MMLAs into 2x2 matrices in ARM Version
+                    __m256i iacc_mat_00_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+                    __m256i iacc_mat_01_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+                    __m256i iacc_mat_10_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+                    __m256i iacc_mat_11_sp1 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+                    __m256i iacc_mat_00_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+                    __m256i iacc_mat_01_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+                    __m256i iacc_mat_10_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+                    __m256i iacc_mat_11_sp2 =
+                        _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+
+                    // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
+                    __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
+                    __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
+                    __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
+                    __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
+
+
+                    // Straighten out to make 4 row vectors
+                    __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
+                    __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
+                    __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
+                    __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
+
+                    // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
+                    const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
+
+                    // Multiply with appropiate scales and accumulate
+                    acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
+                    acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
+                    acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+                    acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
+                }
+
+                // Store the accumulated values
+                for (int i = 0; i < 4; i++) {
+                    _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+                }
+            }
+        }
+        return;
+    }
+#elif defined(__riscv_v_intrinsic)
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+                vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+                for (int l = 0; l < nb; l++) {
+                    const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                    const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                    const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                    const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+                    const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+                    const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+                    const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+                    // vector version needs Zvfhmin extension
+                    const float a_scales[4] = {
+                        GGML_FP16_TO_FP32(a_ptr[l].d[0]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[1]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[2]),
+                        GGML_FP16_TO_FP32(a_ptr[l].d[3])
+                    };
+                    const float b_scales[8] = {
+                        GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+                        GGML_FP16_TO_FP32(b_ptr[l].d[7])
+                    };
+                    const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+
+                    const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
+                    const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
+                    const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
+                    const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l0;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l0 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
+                        sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
+                    const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
+                    const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
+                    const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l1;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l1 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
+                        sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
+                    const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
+                    const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
+                    const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l2;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l2 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
+                        sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
+                    }
+
+                    const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
+                    const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
+                    const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
+                    const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
+                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+                    vint16m4_t sumi_l3;
+                    {
+                        const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
+                        const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
+                        const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
+                        const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
+                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l3 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+                    }
+                }
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+            }
+        }
+
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                            }
+                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
+static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+                float32x4_t sumf[4];
+                for (int m = 0; m < 4; m++) {
+                    sumf[m] = vdupq_n_f32(0);
+                }
+
+                for (int l = 0; l < nb; l++) {
+                    float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+                    float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+
+                    int32x4_t sumi_0 = vdupq_n_s32(0);
+                    int32x4_t sumi_1 = vdupq_n_s32(0);
+                    int32x4_t sumi_2 = vdupq_n_s32(0);
+                    int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                    for (int k = 0; k < 4; k++) {
+                        int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+                        int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
+                        uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+                        int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+                        int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
+                    }
+
+                    sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                    sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                    sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                    sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+                }
+
+                for (int m = 0; m < 4; m++) {
+                    vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+                }
+            }
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                            (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
+    block_q4_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_0 * 2 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        const uint64_t xor_mask = 0x8888888888888888ULL;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint64_t elems;
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        const uint32_t xor_mask = 0x88888888;
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            uint32_t elems;
+            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+            elems ^= xor_mask;
+            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+// interleave 8 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x8
+// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
+    block_q4_0x8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_0 * 4 / blck_size_interleave;
+    const uint64_t xor_mask = 0x8888888888888888ULL;
+
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 8;
+        int src_offset = (i / 8) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+
+        uint64_t elems;
+        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+        elems ^= xor_mask;
+        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+    }
+
+    return out;
+}
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+    constexpr int nrows_interleaved = 4;
+
+    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
+    block_q4_0 dst_tmp[4];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 8);
+    constexpr int nrows_interleaved = 8;
+
+    block_q4_0x8 * dst = (block_q4_0x8*)t->data;
+    const block_q4_0 * src = (const block_q4_0*) data;
+    block_q4_0 dst_tmp[8];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i  = 0; i < nrows_interleaved; i++ ) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 2 / blck_size_interleave;
+
+    // TODO: this branch seems wrong
+    //if (blck_size_interleave == 8) {
+    //    for (int i = 0; i < end; ++i) {
+    //        int src_id = i % 4;
+    //        int src_offset = (i / 4) * blck_size_interleave;
+    //        int dst_offset = i * blck_size_interleave;
+
+    //        // Using memcpy to avoid unaligned memory accesses
+    //        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+    //    }
+    //} else
+    if (blck_size_interleave == 4) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+    GGML_ASSERT(interleave_block == 4);
+
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nl dst_tmp[4];
+    int nrow = ggml_nrows(t);
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+namespace ggml::cpu::aarch64 {
+// repack
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+int repack(struct ggml_tensor *, const void *, size_t);
+
+// TODO: generalise.
+template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
+}
+
+template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
+}
+
+template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
+}
+
+template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
+}
+
+// TODO: needs to be revisited
+//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
+//}
+
+// gemv
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemv(int, float *, size_t, const void *, const void *, int, int);
+
+template <> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <>
+void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+// gemm
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemm(int, float *, size_t, const void *, const void *, int, int);
+
+template <> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <>
+void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+class tensor_traits_base : public ggml::cpu::tensor_traits {
+  public:
+    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
+};
+
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_traits : public tensor_traits_base {
+
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        // not realy a GGML_TYPE_Q8_0 but same size.
+        switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
+            return true;
+        case GGML_OP_MUL_MAT_ID:
+            size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
+            size = GGML_PAD(size, sizeof(int64_t));  // + padding for next bloc.
+            size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
+            return true;
+        default:
+            // GGML_ABORT("fatal error");
+            break;
+        }
+        return false;
+    }
+
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            forward_mul_mat(params, op);
+            return true;
+        case GGML_OP_MUL_MAT_ID:
+            forward_mul_mat_id(params, op);
+            return true;
+        default:
+            // GGML_ABORT("fatal error");
+            break;
+        }
+        return false;
+    }
+
+    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
+        const ggml_tensor * src0 = op->src[0];
+        const ggml_tensor * src1 = op->src[1];
+        ggml_tensor *       dst  = op;
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        GGML_ASSERT(ne0 == ne01);
+        GGML_ASSERT(ne1 == ne11);
+        GGML_ASSERT(ne2 == ne12);
+        GGML_ASSERT(ne3 == ne13);
+
+        // dst cannot be transposed or permuted
+        GGML_ASSERT(nb0 == sizeof(float));
+        GGML_ASSERT(nb0 <= nb1);
+        GGML_ASSERT(nb1 <= nb2);
+        GGML_ASSERT(nb2 <= nb3);
+
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
+        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
+
+        char *       wdata = static_cast<char *>(params->wdata);
+        const size_t nbw1  = ggml_row_size(GGML_TYPE_Q8_0, ne10);
+
+        assert(params->wsize >= nbw1 * ne11);
+
+        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
+
+        int64_t i11_processed = 0;
+        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+            quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10,
+                              INTER_SIZE);
+        }
+        i11_processed = ne11 - ne11 % 4;
+        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        const void * src1_wdata      = params->wdata;
+        const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10);
+        int64_t      src0_start      = (ith * ne01) / nth;
+        int64_t      src0_end        = ((ith + 1) * ne01) / nth;
+        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+        src0_end   = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+        if (src0_start >= src0_end) {
+            return;
+        }
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (ne11 > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
+                                                 (const char *) src0->data + src0_start * nb01,
+                                                 (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+                                                 (const char *) src0->data + src0_start * nb01,
+                                                 (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                                                 src0_end - src0_start);
+        }
+    }
+
+    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
+        const ggml_tensor * src0 = op->src[0];
+        const ggml_tensor * src1 = op->src[1];
+        const ggml_tensor * ids  = op->src[2];
+        ggml_tensor *       dst  = op;
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
+
+        // we don't support permuted src0 or src1
+        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
+        GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+        // dst cannot be transposed or permuted
+        GGML_ASSERT(nb0 == sizeof(float));
+        GGML_ASSERT(nb0 <= nb1);
+        GGML_ASSERT(nb1 <= nb2);
+        GGML_ASSERT(nb2 <= nb3);
+
+        GGML_ASSERT(ne03 == 1);
+        GGML_ASSERT(ne13 == 1);
+        GGML_ASSERT(ne3  == 1);
+
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        // row groups
+        const int n_ids = ids->ne[0]; // n_expert_used
+        const int n_as  = ne02;       // n_expert
+
+        const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);
+        const size_t nbw2 = nbw1*ne11;
+        const size_t nbw3 = nbw2*ne12;
+
+        struct mmid_row_mapping {
+            int32_t i1;
+            int32_t i2;
+        };
+
+        GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
+                                      n_as * ne12 * sizeof(mmid_row_mapping)));
+
+        auto                      wdata             = (char *) params->wdata;
+        auto                      wdata_src1_end    = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
+        int64_t *                 matrix_row_counts = (int64_t *) (wdata_src1_end);                      // [n_as]
+        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as);  // [n_as][ne12]
+
+        // src1: float32 => block_q8_0
+        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
+                           (void *)               (wdata + i12 * nbw2 + i11 * nbw1),
+                           ne10);
+            }
+        }
+
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
+
+        if (ith == 0) {
+            // initialize matrix_row_counts
+            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
+
+            // group rows by src0 matrix
+            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+                for (int32_t id = 0; id < n_ids; ++id) {
+                    const int32_t i02 =
+                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
+
+                    GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                    MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
+                    matrix_row_counts[i02] += 1;
+                }
+            }
+        }
+
+        ggml_barrier(params->threadpool);
+
+        // compute each matrix multiplication in sequence
+        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+            const int64_t cne1 = matrix_row_counts[cur_a];
+
+            if (cne1 == 0) {
+                continue;
+            }
+
+            auto src0_cur = (const char *) src0->data + cur_a*nb02;
+
+            //const int64_t nr0 = ne01; // src0 rows
+            const int64_t nr1 = cne1; // src1 rows
+
+            int64_t src0_cur_start = (ith * ne01) / nth;
+            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
+            src0_cur_start =
+                (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
+            src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
+
+            if (src0_cur_start >= src0_cur_end) return;
+
+            for (int ir1 = 0; ir1 < nr1; ir1++) {
+                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
+                const int id       = row_mapping.i1; // selected expert index
+
+                const int64_t  i11 = id % ne11;
+                const int64_t  i12 = row_mapping.i2; // row index in src1
+
+                const int64_t  i1 = id;  // selected expert index
+                const int64_t  i2 = i12; // row
+
+                auto src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
+
+                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(
+                        ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start,
+                        ne01,                    src0_cur + src0_cur_start * nb01,
+                        src1_col, 1, src0_cur_end - src0_cur_start);
+            }
+        }
+#undef MMID_MATRIX_ROW
+    }
+
+    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
+        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
+                       (int) NB_COLS, (int) INTER_SIZE);
+        return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+    }
+};
+
+// instance for Q4
+static const tensor_traits<block_q4_0, 4, 4> q4_0_4x4_q8_0;
+static const tensor_traits<block_q4_0, 8, 4> q4_0_4x8_q8_0;
+static const tensor_traits<block_q4_0, 8, 8> q4_0_8x8_q8_0;
+
+// instance for IQ4
+static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;
+
+}  // namespace ggml::cpu::aarch64
+
+static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
+    if (cur->type == GGML_TYPE_Q4_0) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+            if (cur->ne[1] % 8 == 0) {
+                return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
+            }
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &ggml::cpu::aarch64::q4_0_4x8_q8_0;
+            }
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &ggml::cpu::aarch64::q4_0_4x4_q8_0;
+            }
+        }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0;
+            }
+        }
+    }
+
+    return nullptr;
+}
+
+static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                       const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra;
+    auto OK            = tensor_traits->repack(tensor, data, size);
+
+    GGML_ASSERT(OK == 0);
+    GGML_UNUSED(buffer);
+}
+
+static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_AARCH64";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+
+    if (buffer == nullptr) {
+        return nullptr;
+    }
+
+    buffer->buft              = buft;
+    buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
+    buffer->iface.set_tensor  = ggml_backend_cpu_aarch64_buffer_set_tensor;
+    buffer->iface.get_tensor  = nullptr;
+    buffer->iface.cpy_tensor  = nullptr;
+    return buffer;
+}
+
+static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return TENSOR_ALIGNMENT;
+
+    GGML_UNUSED(buft);
+}
+
+namespace ggml::cpu::aarch64 {
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        if (    op->op == GGML_OP_MUL_MAT &&
+                op->src[0]->buffer &&
+                (ggml_n_dims(op->src[0]) == 2) &&
+                op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
+                ggml_aarch64_get_optimal_repack_type(op->src[0])
+                ) {
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            if (op->src[1]->type == GGML_TYPE_F32) {
+                return true;
+            }
+            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
+            //    return true;
+            //}
+            // may be possible if Q8_0 packed...
+        } else if (op->op == GGML_OP_MUL_MAT_ID
+                && op->src[0]->buffer
+                && (ggml_n_dims(op->src[0]) == 3)
+                && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
+                && ggml_aarch64_get_optimal_repack_type(op->src[0])
+                ) {
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
+            }
+            if (op->src[1]->type == GGML_TYPE_F32) {
+                return true;
+            }
+            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
+            //    return true;
+            //}
+        }
+        return false;
+    }
+
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
+            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
+                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
+            }
+        }
+        return nullptr;
+    }
+};
+}  // namespace ggml::cpu::aarch64
+
+ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
+        /* .iface    = */ {
+                           /* .get_name         = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
+                           /* .alloc_buffer     = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
+                           /* .get_alignment    = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment,
+                           /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+                           /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
+                           /* .is_host          = */ nullptr,
+                           },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
+    };
+
+    return &ggml_backend_cpu_buffer_type_aarch64;
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
new file mode 100644
index 000000000..6e84c826b
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "ggml-cpu-traits.h"
+#include "ggml.h"
+
+// GGML internal header
+
+ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
new file mode 100644
index 000000000..fa8dea2af
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
@@ -0,0 +1,55 @@
+#ifdef GGML_USE_CPU_HBM
+
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+
+#include "ggml-cpu-hbm.h"
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                           size_t                     size) {
+    void * ptr;
+    int    result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft                 = buft;
+    buffer->iface.free_buffer    = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+                           /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+                           /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+                           /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+                           /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+                           /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
+                           /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+                           },
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ggml/src/ggml-cpu/ggml-cpu-hbm.h
new file mode 100644
index 000000000..09a1f09d7
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+// GGML CPU internal header
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
new file mode 100644
index 000000000..9ddd972a5
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -0,0 +1,380 @@
+#pragma once
+
+// GGML CPU internal header
+
+#include "ggml.h"
+#include "ggml-impl.h"
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
+//#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_compute_params {
+    // ith = thread index, nth = number of threads
+    int ith, nth;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+
+#if defined(_MSC_VER)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#include <sys/prctl.h>
+#endif
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+
+typedef uint16_t ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+typedef __fp16 ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddlvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+    return vcombine_s32(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8
+#define ggml_vqtbl1q_u8   vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
+}
+
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
+}
+#endif
+
+// TODO: move to ggml-threading
+void ggml_barrier(struct ggml_threadpool * tp);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
new file mode 100644
index 000000000..27ec14935
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -0,0 +1,10910 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-quants.h"
+#include "ggml-cpu-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = _mm_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = _mm_sign_epi8(y, x);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = _mm_maddubs_epi16(ax, sy);
+    const __m128i ones = _mm_set1_epi16(1);
+    return _mm_madd_epi16(ones, dot);
+}
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i lowMask = _mm256_set1_epi8( 0xF );
+    return _mm256_and_si256(lowMask, bytes);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#else
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_float(dot);
+#endif
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+#if __AVXVNNIINT8__
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#else
+    // Get absolute values of x vectors
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return mul_sum_us8_pairs_float(ax, sy);
+#endif
+}
+
+static inline __m128i packNibbles( __m256i bytes )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+#if __AVX512F__
+    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
+    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
+    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
+#else
+    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
+    __m256i high = _mm256_andnot_si256( lowByte, bytes );
+    __m256i low = _mm256_and_si256( lowByte, bytes );
+    high = _mm256_srli_epi16( high, 4 );
+    bytes = _mm256_or_si256( low, high );
+
+    // Compress uint16_t lanes into bytes
+    __m128i r0 = _mm256_castsi256_si128( bytes );
+    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
+    return _mm_packus_epi16( r0, r1 );
+#endif
+}
+#elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
+    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
+    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
+    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytesl = _mm_or_si128(bytesl, bit_mask);
+    bytesh = _mm_or_si128(bytesh, bit_mask);
+    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
+    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
+    return MM256_SET_M128I(bytesh, bytesl);
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    // Load 16 bytes from memory
+    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
+    __m128i tmph = _mm_srli_epi16(tmpl, 4);
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    tmpl = _mm_and_si128(lowMask, tmpl);
+    tmph = _mm_and_si128(lowMask, tmph);
+    return MM256_SET_M128I(tmph, tmpl);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
+    const __m128i ones = _mm_set1_epi16(1);
+    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
+    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
+    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+    const __m128i axl = _mm256_castsi256_si128(ax);
+    const __m128i axh = _mm256_extractf128_si256(ax, 1);
+    const __m128i syl = _mm256_castsi256_si128(sy);
+    const __m128i syh = _mm256_extractf128_si256(sy, 1);
+    // Perform multiplication and create 16-bit values
+    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
+    const __m128i doth = _mm_maddubs_epi16(axh, syh);
+    return sum_i16_pairs_float(doth, dotl);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+    const __m128i xl = _mm256_castsi256_si128(x);
+    const __m128i xh = _mm256_extractf128_si256(x, 1);
+    const __m128i yl = _mm256_castsi256_si128(y);
+    const __m128i yh = _mm256_extractf128_si256(y, 1);
+    // Get absolute values of x vectors
+    const __m128i axl = _mm_sign_epi8(xl, xl);
+    const __m128i axh = _mm_sign_epi8(xh, xh);
+    // Sign the values of the y vectors
+    const __m128i syl = _mm_sign_epi8(yl, xl);
+    const __m128i syh = _mm_sign_epi8(yh, xh);
+    // Perform multiplication and create 16-bit values
+    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
+    const __m128i doth = _mm_maddubs_epi16(axh, syh);
+    return sum_i16_pairs_float(doth, dotl);
+}
+
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
+
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
+}
+
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
+}
+#endif
+#elif defined(__SSSE3__)
+// horizontally add 4x4 floats
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =_mm_hadd_ps(a, b);
+    __m128 res_1 =_mm_hadd_ps(c, d);
+    __m128 res =_mm_hadd_ps(res_0, res_1);
+    res =_mm_hadd_ps(res, res);
+    res =_mm_hadd_ps(res, res);
+
+    return _mm_cvtss_f32(res);
+}
+#endif // __AVX__ || __AVX2__ || __AVX512F__
+#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+
+#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
+#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
+
+// precomputed tables for expanding 8bits to 8 bytes:
+static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
+static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
+#endif
+
+#if defined(__loongarch_sx)
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or  with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return __lsx_vshuf_b(a, zero, tmp2);
+}
+
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_h(b, a);
+    __m128i tmp2 = __lsx_vpickod_h(b, a);
+    return __lsx_vadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_w(b, a);
+    __m128i tmp2 = __lsx_vpickod_w(b, a);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
+    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
+    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
+
+    return __lsx_vfadd_s(tmp1, tmp2);
+}
+
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =lsx_hadd_s(a, b);
+    __m128 res_1 =lsx_hadd_s(c, d);
+    __m128 res =lsx_hadd_s(res_0, res_1);
+    res =lsx_hadd_s(res, res);
+    res =lsx_hadd_s(res, res);
+
+    return ((v4f32)res)[0];
+}
+#endif
+
+#if defined(__loongarch_asx)
+
+#ifdef __clang__
+#define VREGS_PREFIX "$vr"
+#define XREGS_PREFIX "$xr"
+#else // GCC
+#define VREGS_PREFIX "$f"
+#define XREGS_PREFIX "$f"
+#endif
+#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+// Convert __m128i to __m256i
+static inline __m256i ____m256i(__m128i in) {
+    __m256i out = __lasx_xvldi(0);
+    __asm__ volatile (
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " XREGS_PREFIX"\\i    \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[in], " VREGS_PREFIX "\\j  \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        : [out] "+f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+// Convert two __m128i to __m256i
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
+    __m256i out;
+    __asm__ volatile (
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[hi], " VREGS_PREFIX "\\i    \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[lo], " VREGS_PREFIX "\\j  \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        ".ifnc %[out], %[hi]                 \n\t"
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " XREGS_PREFIX "\\i   \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[hi], " VREGS_PREFIX "\\j  \n\t"
+        "    xvori.b $xr\\i, $xr\\j, 0       \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        ".endif                              \n\t"
+        : [out] "=f" (out), [hi] "+f" (inhi)
+        : [lo] "f" (inlo)
+    );
+    return out;
+}
+// Convert __m256i low part to __m128i
+static inline __m128i lasx_extracti128_lo(__m256i in) {
+    __m128i out;
+    __asm__ volatile (
+        ".ifnc %[out], %[in]                 \n\t"
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
+        "    vori.b $vr\\i, $vr\\j, 0        \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        ".endif                              \n\t"
+        : [out] "=f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+// Convert __m256i high part to __m128i
+static inline __m128i lasx_extracti128_hi(__m256i in) {
+    __m128i out;
+    __asm__ volatile (
+        ".irp i," __ALL_REGS                "\n\t"
+        " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
+        "  .irp j," __ALL_REGS              "\n\t"
+        "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x11  \n\t"
+        "   .endif                           \n\t"
+        "  .endr                             \n\t"
+        " .endif                             \n\t"
+        ".endr                               \n\t"
+        : [out] "=f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+
+static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) {
+    v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7};
+    return (__m256i)__ret;
+}
+
+static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
+    v4i64 __ret = {d, c, b, a};
+    return (__m256i)__ret;
+}
+
+static __m256i lasx_insertf128( __m128i x, __m128i y) {
+    return lasx_set_q(x, y);
+}
+
+static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
+    __m256i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lasx_xvreplgr2vr_b(f);
+    zero = __lasx_xvldi(0);
+    tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or  with 0x10 prepare for positive
+    mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones
+    return __lasx_xvshuf_b(a, zero, tmp2);
+}
+
+static __m256i lasx_extu8_16(__m128i a) {
+    return __lasx_vext2xv_hu_bu(____m256i(a));
+}
+
+static __m256i lasx_ext8_16(__m128i a) {
+    return __lasx_vext2xv_h_b(____m256i(a));
+}
+
+static __m256i lasx_ext16_32(__m128i a) {
+    return __lasx_vext2xv_w_h(____m256i(a));
+}
+
+static __m128i lasx_extracti128( __m256i a, int pos) {
+    __m128i ret;
+    if( pos == 0)
+    {
+       ret = lasx_extracti128_lo(a);
+    } else {
+       ret = lasx_extracti128_hi(a);
+    }
+    return ret;
+}
+
+static __m128 lasx_extractf128( __m256 a, int pos) {
+    __m128 ret;
+    if( pos == 0)
+    {
+       ret = (__m128)lasx_extracti128_lo((__m256i)a);
+    } else {
+       ret = (__m128)lasx_extracti128_hi((__m256i)a);
+    }
+    return ret;
+}
+
+static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
+    __m256i tmp1, tmp2;
+    tmp1 = __lasx_xvmulwev_h_b(a, b);
+    tmp2 = __lasx_xvmulwod_h_b(a, b);
+    return __lasx_xvsadd_h(tmp1, tmp2);
+}
+
+static __m256i lasx_madd_h(__m256i a, __m256i b) {
+    __m256i tmp1, tmp2;
+    tmp1 = __lasx_xvmulwev_w_h(a, b);
+    tmp2 = __lasx_xvmulwod_w_h(a, b);
+    return __lasx_xvadd_w(tmp1, tmp2);
+}
+
+static __m256i lasx_packs_w(__m256i a, __m256i b) {
+    __m256i tmp, tmp1;
+    tmp = __lasx_xvsat_w(a, 15);
+    tmp1 = __lasx_xvsat_w(b, 15);
+    return __lasx_xvpickev_h(tmp1, tmp);
+}
+
+static __m256i lasx_packs_h(__m256i a, __m256i b) {
+    __m256i tmp, tmp1;
+    tmp = __lasx_xvsat_h(a, 7);
+    tmp1 = __lasx_xvsat_h(b, 7);
+    return __lasx_xvpickev_b(tmp1, tmp);
+}
+
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = __lsx_vsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = __lsx_vsigncov_b(x, y);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = lsx_maddubs_h(ax, sy);
+    const __m128i ones = __lsx_vreplgr2vr_h(1);
+    return lsx_madd_h(ones, dot);
+}
+
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = lasx_extractf128(x, 1);
+    res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
+    res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
+    res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
+    return ((v4f32)res)[0];
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+
+    __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11);
+    __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00);
+
+    __m128i  tmp1_128 = lasx_extracti128_lo(tmp1);
+    __m128i  tmp2_128 = lasx_extracti128_lo(tmp2);
+
+    __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128);
+
+    __m128i ev = __lsx_vpickev_w(sum128, sum128);
+    __m128i od = __lsx_vpickod_w(sum128, sum128);
+    __m128i sum64 = __lsx_vadd_w(ev, od);
+
+    int sum64_1, sum64_2;
+    sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
+    sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
+
+    return  sum64_1 + sum64_2;
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    __m128i ev = __lsx_vpickev_w(a, a);
+    __m128i od = __lsx_vpickod_w(a, a);
+    __m128i sum64 = __lsx_vadd_w(ev, od);
+
+    int sum64_1, sum64_2;
+    sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
+    sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
+
+    return  sum64_1 + sum64_2;
+}
+
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = lasx_set_d(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+
+    __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask);
+    const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe);
+    bytes = __lasx_xvor_v(bytes, bit_mask);
+    return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
+    const __m128i lo = __lsx_vld((const __m128i *)rsi, 0);
+    __m128i hi = __lsx_vsrli_h(lo, 4);
+    return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf);
+}
+
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    __m256i v = __lasx_xvpackod_h(x, x);
+    __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v);
+    return __lasx_xvffint_s_w(summed_pairs);
+}
+
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = lasx_maddubs_h(ax, sy);
+    return sum_i16_pairs_float(dot);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+
+    // Get absolute values of x vectors
+    const __m256i ax = __lasx_xvsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = __lasx_xvsigncov_b(x, y);
+
+    return mul_sum_us8_pairs_float(ax, sy);
+}
+
+static inline __m128i packNibbles( __m256i bytes ) {
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF);
+     __m256i high = __lasx_xvandn_v(lowByte, bytes);
+    __m256i low = __lasx_xvand_v(lowByte, bytes);
+    high = __lasx_xvsrli_h(high, 4);
+    bytes = __lasx_xvor_v(low, high);
+    // Compress uint16_t lanes into bytes
+    __m128i *r0 = (__m128i *)&bytes;
+    __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11);
+    __m128i *r1 = (__m128i *)&tmp_h128;
+
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp, tmp2, tmp3;
+
+    tmp = __lsx_vmax_h(zero, *r0);
+    tmp2 = __lsx_vsat_hu(tmp, 7);
+
+    tmp = __lsx_vmax_h(zero, *r1);
+    tmp3 = __lsx_vsat_hu(tmp, 7);
+    return  __lsx_vpickev_b(tmp3, tmp2);
+}
+#endif  //__loongarch_asx
+
+void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q4_0_ref(x, y, k);
+}
+
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q4_1_ref(x, y, k);
+}
+
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q5_0_ref(x, y, k);
+}
+
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q5_1_ref(x, y, k);
+}
+
+void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+                                            // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v  = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+    }
+
+#elif defined(__loongarch_asx)
+    for (int i = 0; i < nb; i++) {
+        __m256 v0 = (__m256)__lasx_xvld( x , 0);
+        __m256 v1 = (__m256)__lasx_xvld( x , 32);
+        __m256 v2 = (__m256)__lasx_xvld( x , 64);
+        __m256 v3 = (__m256)__lasx_xvld( x , 96);
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
+
+        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) );
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
+        __m128 tmp = max4;
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
+        const float max_scalar = ((v4f32)max4)[0];
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
+
+        // Apply the multiplier
+        v0 = __lasx_xvfmul_s( v0, mul );
+        v1 = __lasx_xvfmul_s( v1, mul );
+        v2 = __lasx_xvfmul_s( v2, mul );
+        v3 = __lasx_xvfmul_s( v3, mul );
+
+        // Round to nearest integer
+        __m256i i0 = __lasx_xvftintrne_w_s( v0 );
+        __m256i i1 = __lasx_xvftintrne_w_s( v1 );
+        __m256i i2 = __lasx_xvftintrne_w_s( v2 );
+        __m256i i3 = __lasx_xvftintrne_w_s( v3 );
+
+        __m128i ni0 = lasx_extracti128( i0, 0 );
+        __m128i ni1 = lasx_extracti128( i0, 1);
+        __m128i ni2 = lasx_extracti128( i1, 0);
+        __m128i ni3 = lasx_extracti128( i1, 1);
+        __m128i ni4 = lasx_extracti128( i2, 0);
+        __m128i ni5 = lasx_extracti128( i2, 1);
+        __m128i ni6 = lasx_extracti128( i3, 0);
+        __m128i ni7 = lasx_extracti128( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = lsx_packs_w( ni0, ni1 );
+        ni2 = lsx_packs_w( ni2, ni3 );
+        ni4 = lsx_packs_w( ni4, ni5 );
+        ni6 = lsx_packs_w( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = lsx_packs_h( ni0, ni2 );
+        ni4 = lsx_packs_h( ni4, ni6 );
+
+        __lsx_vst(ni0, (__m128i *)(y[i].qs +  0), 0);
+        __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
+
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_ref(x, y, k);
+#endif
+}
+
+void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK8_1 == 0);
+    const int nb = k / QK8_1;
+
+    block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        int32x4_t accv = vdupq_n_s32(0);
+
+        for (int j = 0; j < 8; j++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+
+            accv = vaddq_s32(accv, vi);
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
+    }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        v128_t accv = wasm_i32x4_splat(0);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+
+            accv = wasm_i32x4_add(accv, vi);
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(
+                d * (wasm_i32x4_extract_lane(accv, 0) +
+                     wasm_i32x4_extract_lane(accv, 1) +
+                     wasm_i32x4_extract_lane(accv, 2) +
+                     wasm_i32x4_extract_lane(accv, 3)));
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float max_scalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Compute the sum of the quants and set y[i].s
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+                                            // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Compute the sum of the quants and set y[i].s
+        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
+        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0, vl);
+        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d  = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+
+        // compute sum for y[i].s
+        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
+        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+
+        // set y[i].s
+        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
+        y[i].s = GGML_FP32_TO_FP16(sum*d);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vector int accv = vec_splats(0);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v  = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+
+            accv = vec_add(accv, vi[j]);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+
+        accv = vec_add(accv, vec_sld(accv, accv, 4));
+        accv = vec_add(accv, vec_sld(accv, accv, 8));
+        y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+    }
+
+#elif defined(__loongarch_asx)
+    for (int i = 0; i < nb; i++) {
+        __m256 v0 = (__m256)__lasx_xvld( x , 0 );
+        __m256 v1 = (__m256)__lasx_xvld( x , 32 );
+        __m256 v2 = (__m256)__lasx_xvld( x , 64 );
+        __m256 v3 = (__m256)__lasx_xvld( x , 96 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
+
+        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
+        __m128 tmp = max4;
+        max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
+        const float max_scalar = ((v4f32)max4)[0];
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = __lasx_xvreplfr2vr_s( id );
+
+        // Apply the multiplier
+        v0 = __lasx_xvfmul_s( v0, mul );
+        v1 = __lasx_xvfmul_s( v1, mul );
+        v2 = __lasx_xvfmul_s( v2, mul );
+        v3 = __lasx_xvfmul_s( v3, mul );
+
+        // Round to nearest integer
+        __m256i i0 = __lasx_xvftintrne_w_s( v0 );
+        __m256i i1 = __lasx_xvftintrne_w_s( v1 );
+        __m256i i2 = __lasx_xvftintrne_w_s( v2 );
+        __m256i i3 = __lasx_xvftintrne_w_s( v3 );
+
+        __m128i ni0 = lasx_extracti128(i0, 0);
+        __m128i ni1 = lasx_extracti128( i0, 1);
+        __m128i ni2 = lasx_extracti128( i1, 0);
+        __m128i ni3 = lasx_extracti128( i1, 1);
+        __m128i ni4 = lasx_extracti128( i2, 0 );
+        __m128i ni5 = lasx_extracti128( i2, 1);
+        __m128i ni6 = lasx_extracti128( i3, 0);
+        __m128i ni7 = lasx_extracti128( i3, 1);
+
+        // Compute the sum of the quants and set y[i].s
+        const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
+        const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
+        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
+
+        // Convert int32 to int16
+        ni0 = lsx_packs_w( ni0, ni1 );
+        ni2 = lsx_packs_w( ni2, ni3 );
+        ni4 = lsx_packs_w( ni4, ni5 );
+        ni6 = lsx_packs_w( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = lsx_packs_h( ni0, ni2 );
+        ni4 = lsx_packs_h( ni4, ni6 );
+
+        __lsx_vst(ni0, (__m128i *)(y[i].qs +  0), 0);
+        __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_1_ref(x, y, k);
+#endif
+}
+
+//
+// 2-6 bit quantization in super-blocks
+//
+
+//
+// ===================== Helper functions
+//
+static inline int nearest_int(float fval) {
+    assert(fabsf(fval) <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
+        const float * restrict qw) {
+    float max = 0;
+    float amax = 0;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.f;
+    }
+    float iscale = -nmax / max;
+    if (rmse_type == 0) {
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
+        }
+        return 1/iscale;
+    }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
+    float sumlx = 0;
+    float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 0; i < n; ++i) {
+#else
+    for (int i = 0; i < n; ++i) {
+#endif
+        int l = nearest_int(iscale * x[i]);
+        l = MAX(-nmax, MIN(nmax-1, l));
+        L[i] = l + nmax;
+        float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
+        sumlx += w*x[i]*l;
+        suml2 += w*l*l;
+    }
+    float scale = suml2 ? sumlx/suml2 : 0.0f;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
+    float best = scale * sumlx;
+    for (int is = -9; is <= 9; ++is) {
+        if (is == 0) {
+            continue;
+        }
+        iscale = -(nmax + 0.1f*is) / max;
+        sumlx = suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = MAX(-nmax, MIN(nmax-1, l));
+            float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
+            sumlx += w*x[i]*l;
+            suml2 += w*l*l;
+        }
+        if (suml2 > 0 && sumlx*sumlx > best*suml2) {
+            for (int i = 0; i < n; ++i) {
+                int l = nearest_int(iscale * x[i]);
+                L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
+            }
+            scale = sumlx/suml2; best = scale*sumlx;
+        }
+    }
+    return scale;
+}
+
+static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+    float max = 0;
+    float amax = 0;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) { L[i] = 0; }
+        return 0.f;
+    }
+    float iscale = -nmax / max;
+    if (do_rmse) {
+        float sumlx = 0;
+        float suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = MAX(-nmax, MIN(nmax-1, l));
+            L[i] = l;
+            float w = x[i]*x[i];
+            sumlx += w*x[i]*l;
+            suml2 += w*l*l;
+        }
+        for (int itry = 0; itry < 5; ++itry) {
+            int n_changed = 0;
+            for (int i = 0; i < n; ++i) {
+                float w = x[i]*x[i];
+                float slx = sumlx - w*x[i]*L[i];
+                if (slx > 0) {
+                    float sl2 = suml2 - w*L[i]*L[i];
+                    int new_l = nearest_int(x[i] * sl2 / slx);
+                    new_l = MAX(-nmax, MIN(nmax-1, new_l));
+                    if (new_l != L[i]) {
+                        slx += w*x[i]*new_l;
+                        sl2 += w*new_l*new_l;
+                        if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
+                            L[i] = new_l; sumlx = slx; suml2 = sl2;
+                            ++n_changed;
+                        }
+                    }
+                }
+            }
+            if (!n_changed) {
+                break;
+            }
+        }
+        for (int i = 0; i < n; ++i) {
+            L[i] += nmax;
+        }
+        return sumlx / suml2;
+    }
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale * x[i]);
+        l = MAX(-nmax, MIN(nmax-1, l));
+        L[i] = l + nmax;
+    }
+    return 1/iscale;
+}
+
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
+    float min = x[0];
+    float max = x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+    }
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = 0;
+        return 0.f;
+    }
+    if (min > 0) min = 0;
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    for (int itry = 0; itry < ntry; ++itry) {
+        float sumlx = 0; int suml2 = 0;
+        bool did_change = false;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            if (l != L[i]) {
+                L[i] = l;
+                did_change = true;
+            }
+            sumlx += (x[i] - min)*l;
+            suml2 += l*l;
+        }
+        scale = sumlx/suml2;
+        float sum = 0;
+        for (int i = 0; i < n; ++i) {
+            sum += x[i] - scale*L[i];
+        }
+        min = alpha*min + (1 - alpha)*sum/n;
+        if (min > 0) min = 0;
+        iscale = 1/scale;
+        if (!did_change) break;
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
+    for (int i = 1; i < n; ++i) {
+#endif
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+    if (j < 4) {
+        *d = q[j] & 63; *m = q[j + 4] & 63;
+    } else {
+        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+
+//========================- 2-bit (de)-quantization
+
+void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
+    quantize_row_q2_K_ref(x, vy, k);
+}
+
+//========================= 3-bit (de)-quantization
+
+void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
+    quantize_row_q3_K_ref(x, vy, k);
+}
+
+// ====================== 4-bit (de)-quantization
+
+void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q4_K * restrict y = vy;
+    quantize_row_q4_K_ref(x, y, k);
+}
+
+// ====================== 5-bit (de)-quantization
+
+void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q5_K * restrict y = vy;
+    quantize_row_q5_K_ref(x, y, k);
+}
+
+// ====================== 6-bit (de)-quantization
+
+void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_q6_K * restrict y = vy;
+    quantize_row_q6_K_ref(x, y, k);
+}
+
+// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
+
+void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq1_0 * restrict y = vy;
+    quantize_row_tq1_0_ref(x, y, k);
+}
+
+void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq2_0 * restrict y = vy;
+    quantize_row_tq2_0_ref(x, y, k);
+}
+
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+//===================================== Q8_K ==============================================
+
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q8_K_ref(x, y, k);
+}
+
+//===================================== Dot products =================================
+
+//
+// Helper functions
+//
+#if __AVX__ || __AVX2__ || __AVX512F__
+
+// shuffles to pick the required scales in dot products
+static inline __m256i get_scale_shuffle_q3k(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+    };
+    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
+}
+static inline __m256i get_scale_shuffle_k4(int i) {
+    static const uint8_t k_shuffle[256] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
+        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
+        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
+    };
+    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
+}
+static inline __m128i get_scale_shuffle(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
+        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
+        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
+    };
+    return _mm_loadu_si128((const __m128i*)k_shuffle + i);
+}
+#elif defined(__loongarch_asx)
+// shuffles to pick the required scales in dot products
+static inline __m256i get_scale_shuffle_q3k(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+    };
+    return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
+}
+static inline __m256i get_scale_shuffle_k4(int i) {
+    static const uint8_t k_shuffle[256] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
+        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
+        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
+    };
+    return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
+}
+static inline __m128i get_scale_shuffle(int i) {
+    static const uint8_t k_shuffle[128] = {
+         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
+        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
+        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
+    };
+    return __lsx_vld((const __m128i*)k_shuffle + i, 0);
+}
+#endif
+
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_0 * restrict vx0 = vx;
+        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_0 * restrict b_x0 = &vx0[i];
+            const block_q4_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+            const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // sub 8
+            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
+            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
+            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
+            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
+            float32x4_t scale = vld1q_f32(_scale);
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s,      vget_low_f32 (sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+
+        return;
+    }
+#endif
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__ARM_FEATURE_SVE)
+    svfloat32_t sumv0 = svdup_n_f32(0.0f);
+    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
+
+    // VLA Implementation using switch case
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating higher lanes for 4 float32 elements
+                const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
+                    const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
+                    const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
+                    const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
+
+                    // sub 8
+                    const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
+                    const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
+                    const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
+                    const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
+
+                    // load y
+                    const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
+                    const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
+                    const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+                                    svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
+                                    svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+                                    svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
+                                    svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                // predicate for activating higher lanes for 16 int8 elements
+                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+                // predicate for activating lower lanes for  16 int8 elements
+                const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+                    // sub 8
+                    const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
+                    const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating higher lanes for 32 int8 elements
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+
+                // predicate for activating higher lanes for 16 int8 elements
+                const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+                // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
+                const svbool_t pl16 = svnot_b_z(ph32, ph16);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q4_0 * restrict x0 = &x[ib + 0];
+                    const block_q4_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
+                    const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
+
+                    // 4-bit -> 8-bit
+                    const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+                    const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+                    // sub 8
+                    const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
+                    const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(ph32, y0->qs);
+                    const svint8_t qy1 = svld1_s8(ph32, y1->qs);
+
+                    // dot product
+                    sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
+                                svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
+                                svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
+            } break;
+        default:
+            assert(false && "Unsupported vector length");
+            break;
+    }
+
+#elif defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q4_0 * restrict x0 = &x[ib + 0];
+        const block_q4_0 * restrict x1 = &x[ib + 1];
+        const block_q8_0 * restrict y0 = &y[ib + 0];
+        const block_q8_0 * restrict y1 = &y[ib + 1];
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+        const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // sub 8
+        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
+        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
+        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
+        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        // dot product into int32x4_t
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+        const __m256i off = _mm256_set1_epi8( 8 );
+        qx = _mm256_sub_epi8( qx, off );
+
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
+        const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
+        const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
+        const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
+        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p =  sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
+#elif defined(__SSSE3__)
+    // set constants
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    const __m128i off = _mm_set1_epi8(8);
+
+    // Initialize accumulator with zeros
+    __m128 acc_0 = _mm_setzero_ps();
+    __m128 acc_1 = _mm_setzero_ps();
+    __m128 acc_2 = _mm_setzero_ps();
+    __m128 acc_3 = _mm_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
+        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
+        bx_1 = _mm_sub_epi8(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+
+        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+
+        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
+        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        bx_2 = _mm_sub_epi8(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
+        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
+        bx_3 = _mm_sub_epi8(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = _mm_cvtepi32_ps(i32_0);
+        __m128 p1 = _mm_cvtepi32_ps(i32_1);
+        __m128 p2 = _mm_cvtepi32_ps(i32_2);
+        __m128 p3 = _mm_cvtepi32_ps(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
+        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
+        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
+        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
+
+        // Acummulate
+        acc_0 = _mm_add_ps(p0_d, acc_0);
+        acc_1 = _mm_add_ps(p1_d, acc_1);
+        acc_2 = _mm_add_ps(p2_d, acc_2);
+        acc_3 = _mm_add_ps(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__riscv_v_intrinsic)
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (; ib < nb; ++ib) {
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        // subtract offset
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector signed char v8 = vec_splats((signed char)0x8);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_sub(q4x0, v8);
+        q4x1 = vec_sub(q4x1, v8);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi0 = vec_sum4s(qv1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+        const __m256i off = __lasx_xvreplgr2vr_b( 8 );
+        qx = __lasx_xvsub_b( qx, off );
+
+        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = __lasx_xvfmadd_s( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+
+#elif defined(__loongarch_sx)
+    // set constants
+    const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
+    const __m128i off = __lsx_vreplgr2vr_b(8);
+
+    // Initialize accumulator with zeros
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+    __m128 acc_2 = (__m128)__lsx_vldi(0);
+    __m128 acc_3 = (__m128)__lsx_vldi(0);
+
+    for (; ib + 1 < nb; ib += 2) {
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+
+        const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
+
+        __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
+        __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
+        bx_0 = __lsx_vsub_b(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
+        __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0);
+        bx_1 = __lsx_vsub_b(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+
+        const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
+
+        __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
+        __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0);
+        bx_2 = __lsx_vsub_b(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
+        __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0);
+        bx_3 = __lsx_vsub_b(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = __lsx_vffint_s_w(i32_0);
+        __m128 p1 = __lsx_vffint_s_w(i32_1);
+        __m128 p2 = __lsx_vffint_s_w(i32_2);
+        __m128 p3 = __lsx_vffint_s_w(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 );
+        __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
+        __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
+        __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
+
+        // Acummulate
+        acc_0 = __lsx_vfadd_s(p0_d, acc_0);
+        acc_1 = __lsx_vfadd_s(p1_d, acc_1);
+        acc_2 = __lsx_vfadd_s(p2_d, acc_2);
+        acc_3 = __lsx_vfadd_s(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >>   4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_1 * restrict vx0 = vx;
+        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
+        const block_q8_1 * restrict vy0 = vy;
+        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+        float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_1 * restrict b_x0 = &vx0[i];
+            const block_q4_1 * restrict b_x1 = &vx1[i];
+            const block_q8_1 * restrict b_y0 = &vy0[i];
+            const block_q8_1 * restrict b_y1 = &vy1[i];
+
+            float32_t summs_t[4] = {
+                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
+            };
+            summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            // mmla into int32x4_t
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
+            float32x4_t scale = vld1q_f32(_scale);
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        sumv2 = vaddq_f32(sumv2, summs0);
+
+        vst1_f32(s,      vget_low_f32 (sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+
+        return;
+    }
+#endif
+
+    int ib = 0;
+    float sumf = 0;
+
+    // TODO: add WASM SIMD
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    float summs = 0;
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q4_1 * restrict x0 = &x[ib + 0];
+        const block_q4_1 * restrict x1 = &x[ib + 1];
+        const block_q8_1 * restrict y0 = &y[ib + 0];
+        const block_q8_1 * restrict y1 = &y[ib + 1];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        // dot product into int32x4_t
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
+#elif defined(__AVX2__) || defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        const __m256 d0v = _mm256_set1_ps( d0 );
+        const __m256 d1v = _mm256_set1_ps( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs );
+
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+        // Accumulate d0*d1*x*y
+#if defined(__AVX2__)
+        acc = _mm256_fmadd_ps( d0d1, xy, acc );
+#else
+        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
+#endif
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (; ib < nb; ++ib) {
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
+        vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
+
+        vector signed int vsumi0 = v0;
+
+        vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
+        vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
+        const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0);
+
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+        // Accumulate d0*d1*x*y
+        acc = __lasx_xvfmadd_s( d0d1, xy, acc );
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F);
+            const int v1 = (x[ib].qs[j] >>   4);
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    uint32_t qh0;
+    uint32_t qh1;
+
+    uint64_t tmp0[4];
+    uint64_t tmp1[4];
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q5_0 * restrict x0 = &x[ib];
+        const block_q5_0 * restrict x1 = &x[ib + 1];
+        const block_q8_0 * restrict y0 = &y[ib];
+        const block_q8_0 * restrict y1 = &y[ib + 1];
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+        // extract the 5th bit via lookup table ((!b) << 4)
+        memcpy(&qh0, x0->qh, sizeof(qh0));
+        memcpy(&qh1, x1->qh, sizeof(qh1));
+
+        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
+        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
+        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
+        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
+
+        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
+        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
+        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
+        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
+
+        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
+        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
+        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
+        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
+        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
+        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
+        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
+        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    uint32_t qh;
+    uint64_t tmp[4];
+
+    // TODO: check if unrolling this is better
+    for (; ib < nb; ++ib) {
+        const block_q5_0 * restrict x0 = &x[ib];
+        const block_q8_0 * restrict y0 = &y[ib];
+
+        const v128_t m4b  = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_1[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
+        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
+        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+    }
+
+    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
+        qx = _mm256_or_si256(qx, bxhi);
+
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps(d, q, acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8((char)0xF0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+
+        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_andnot_si128(bxhil, mask);
+        bxhih = _mm_andnot_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
+
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    uint32_t qh;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    // These temporary registers are for masking and shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
+    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
+    for (; ib < nb; ++ib) {
+        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
+
+        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+
+        // ((qh & (1u << (j + 16))) >> (j + 12));
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
+        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
+
+        // narrowing
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
+        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
+
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
+        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
+
+        // load
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
+
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
+        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+
+        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
+        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
+        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
+        qx = __lasx_xvor_v(qx, bxhi);
+
+        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = __lasx_xvfmadd_s(d, q, acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#endif
+    for (; ib < nb; ++ib) {
+        uint32_t qh;
+        memcpy(&qh, x[ib].qh, sizeof(qh));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
+
+            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
+            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
+
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    float summs0 = 0.0f;
+    float summs1 = 0.0f;
+
+    uint32_t qh0;
+    uint32_t qh1;
+
+    uint64_t tmp0[4];
+    uint64_t tmp1[4];
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q5_1 * restrict x0 = &x[ib];
+        const block_q5_1 * restrict x1 = &x[ib + 1];
+        const block_q8_1 * restrict y0 = &y[ib];
+        const block_q8_1 * restrict y1 = &y[ib + 1];
+
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+        summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+        summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
+
+        // extract the 5th bit via lookup table ((b) << 4)
+        memcpy(&qh0, x0->qh, sizeof(qh0));
+        memcpy(&qh1, x1->qh, sizeof(qh1));
+
+        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
+        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
+        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
+        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
+
+        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
+        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
+        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
+        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
+
+        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
+        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
+        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
+        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
+        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+        // add high bit
+        const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
+        const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
+        const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
+        const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
+
+        // load y
+        const int8x16_t v1_0l = vld1q_s8(y0->qs);
+        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
+        const int8x16_t v1_1l = vld1q_s8(y1->qs);
+        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    float summs = 0.0f;
+
+    uint32_t qh;
+    uint64_t tmp[4];
+
+    // TODO: check if unrolling this is better
+    for (; ib < nb; ++ib) {
+        const block_q5_1 * restrict x0 = &x[ib];
+        const block_q8_1 * restrict y0 = &y[ib];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // add high bit
+        const v128_t v0lf = wasm_v128_or(v0l, qhl);
+        const v128_t v0hf = wasm_v128_or(v0h, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv,
+                wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+    }
+
+    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
+        qx = _mm256_or_si256(qx, bxhi);
+
+        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
+
+        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#elif defined(__AVX__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    __m128i mask = _mm_set1_epi8(0x10);
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        __m128i bxhil = _mm256_castsi256_si128(bxhi);
+        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
+        bxhil = _mm_and_si128(bxhil, mask);
+        bxhih = _mm_and_si128(bxhih, mask);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
+        bxl = _mm_or_si128(bxl, bxhil);
+        bxh = _mm_or_si128(bxh, bxhih);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
+
+        const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
+
+        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    uint32_t qh;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    // temporary registers for shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
+    for (; ib < nb; ++ib) {
+        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
+
+        // load qh
+        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
+
+        // ((qh >> (j +  0)) << 4) & 0x10;
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
+
+        // ((qh >> (j + 12))     ) & 0x10;
+        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
+
+        // narrowing
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
+        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
+
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
+        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
+
+        // load
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
+
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
+        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+
+        vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
+        vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
+        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
+
+        vector signed int vsumi0 = v0;
+
+        vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
+        vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0.0f;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d));
+
+        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
+        qx = __lasx_xvor_v(qx, bxhi);
+
+        const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
+
+        acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc);
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+#endif
+    for (; ib < nb; ++ib) {
+        uint32_t qh;
+        memcpy(&qh, x[ib].qh, sizeof(qh));
+
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
+            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
+
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q8_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q8_0 * restrict vx0 = vx;
+        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q8_0 * restrict b_x0 = &vx0[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+
+            const block_q8_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
+            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
+            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
+            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32_t _scale[4] = {
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
+            };
+            float32x4_t scale = vld1q_f32(_scale);
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+
+        float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s,      vget_low_f32 (sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+
+        return;
+    }
+#endif
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__ARM_FEATURE_SVE)
+    svfloat32_t sumv0 = svdup_n_f32(0.0f);
+    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
+
+    //VLA Implemenation for SVE
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating lanes for 16 Int8 elements
+                const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
+                const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
+                    const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
+                    const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
+                    const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
+
+                    // load y
+                    const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
+                    const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
+                    const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
+                    const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
+
+                    sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
+                                    svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
+                                    svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                //printf("sve256");
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+                    const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating high 256 bit
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+                // predicate for activating low 256 bit
+                const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
+
+                // predicate for activating high lanes for 8 float32 elements
+                const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
+                // predicate for activating low lanes for 8 float32 elements
+                const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
+
+                svfloat32_t sumv00 = svdup_n_f32(0.0f);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
+                    // and add them to make one 64 element vector
+                    // load x
+                    const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
+                          svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
+
+                    qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
+
+                    // load y
+                    const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
+                          svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
+
+                    qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
+
+                    // scale creation
+                    const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
+                    const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
+
+                    // duplicate deq1 in first half of vector and deq2 in second half of vector
+                    const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
+
+                    const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
+
+                    sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), sumv00);
+                break;
+            }
+        default:
+            assert(false && "Unsupported vector length");
+            break;
+    }
+#elif defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (; ib + 1 < nb; ib += 2) {
+        const block_q8_0 * restrict x0 = &x[ib + 0];
+        const block_q8_0 * restrict x1 = &x[ib + 1];
+        const block_q8_0 * restrict y0 = &y[ib + 0];
+        const block_q8_0 * restrict y1 = &y[ib + 1];
+
+        const int8x16_t x0_0 = vld1q_s8(x0->qs);
+        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
+        const int8x16_t x1_0 = vld1q_s8(x1->qs);
+        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
+
+        // load y
+        const int8x16_t y0_0 = vld1q_s8(y0->qs);
+        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
+        const int8x16_t y1_0 = vld1q_s8(y1->qs);
+        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+    }
+
+    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+        __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        // Multiply q with scale and accumulate
+        acc = _mm256_fmadd_ps( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
+#elif defined(__riscv_v_intrinsic)
+    size_t vl = __riscv_vsetvl_e8m1(qk);
+
+    for (; ib < nb; ++ib) {
+        // load elements
+        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[ib].qs, vl);
+        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
+
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
+
+        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+    }
+#elif defined(__POWER9_VECTOR__)
+    const vector signed int v0 = vec_splats((int32_t)0);
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char q8x0 = vec_xl( 0, x[ib].qs);
+        vector signed char q8x1 = vec_xl(16, x[ib].qs);
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed short qv0 = vec_mule(q8x0, q8y0);
+        vector signed short qv1 = vec_mulo(q8x0, q8y0);
+        vector signed short qv2 = vec_mule(q8x1, q8y1);
+        vector signed short qv3 = vec_mulo(q8x1, q8y1);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi1 = vec_sum4s(qv1, vsumi1);
+        vsumi0 = vec_sum4s(qv2, vsumi0);
+        vsumi1 = vec_sum4s(qv3, vsumi1);
+
+        vsumi0 = vec_add(vsumi0, vsumi1);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        // Compute combined scale for the block
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+        __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
+        __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        // Multiply q with scale and accumulate
+        acc = __lasx_xvfmadd_s( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi = 0;
+
+        for (int j = 0; j < qk; j++) {
+            sumi += x[ib].qs[j]*y[ib].qs[j];
+        }
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq1_0 * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+    float sumf = 0.0f;
+
+    uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
+
+    const uint8x16_t shift = vld1q_u8(k_shift);
+
+    for (int i = 0; i < nb; ++i) {
+#if defined(__ARM_FEATURE_DOTPROD)
+        int32x4_t sumi0 = vdupq_n_s32(0);
+        int32x4_t sumi1 = vdupq_n_s32(0);
+#else
+        int16x8_t sumi0 = vdupq_n_s16(0);
+        int16x8_t sumi1 = vdupq_n_s16(0);
+#endif
+
+        // first 32 bytes of 5 elements
+        {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
+            uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
+            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
+            uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
+            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
+            uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9));
+            uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27));
+            uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27));
+            uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81));
+            uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81));
+
+            // multiply by 3 and keep the 2 bits above 8 bits
+            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
+            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
+            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
+            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
+            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
+            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
+            int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6));
+            int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6));
+            int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6));
+            int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6));
+
+            const int8x16_t qy0 = vld1q_s8(y[i].qs +   0);
+            const int8x16_t qy1 = vld1q_s8(y[i].qs +  16);
+            const int8x16_t qy2 = vld1q_s8(y[i].qs +  32);
+            const int8x16_t qy3 = vld1q_s8(y[i].qs +  48);
+            const int8x16_t qy4 = vld1q_s8(y[i].qs +  64);
+            const int8x16_t qy5 = vld1q_s8(y[i].qs +  80);
+            const int8x16_t qy6 = vld1q_s8(y[i].qs +  96);
+            const int8x16_t qy7 = vld1q_s8(y[i].qs + 112);
+            const int8x16_t qy8 = vld1q_s8(y[i].qs + 128);
+            const int8x16_t qy9 = vld1q_s8(y[i].qs + 144);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
+            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
+            sumi0 = vdotq_s32(sumi0, sqx8, qy8);
+            sumi1 = vdotq_s32(sumi1, sqx9, qy9);
+#else
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9));
+#endif
+        }
+
+        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
+        {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + 32);
+            uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3));
+            uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9));
+            uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27));
+            uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh));
+            qx5 = vmulq_u8(qx5, shift);
+
+            // multiply by 3 and keep the 2 bits above 8 bits
+            int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
+            int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
+            int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
+            int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
+            int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
+            int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
+
+            const int8x16_t qy0 = vld1q_s8(y[i].qs + 160);
+            const int8x16_t qy1 = vld1q_s8(y[i].qs + 176);
+            const int8x16_t qy2 = vld1q_s8(y[i].qs + 192);
+            const int8x16_t qy3 = vld1q_s8(y[i].qs + 208);
+            const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
+            const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+#else
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+#endif
+        }
+
+        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
+        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumi0 = vaddq_s32(sumi0, sumi1);
+        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
+
+        sumf += d * (float) vaddvq_s32(sumi0);
+#else
+        sumi0 = vaddq_s16(sumi0, sumi1);
+        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
+
+        sumf += d * (float) vaddlvq_s16(sumi0);
+#endif
+    }
+
+    *s = sumf;
+
+#elif defined(__AVX2__)
+    __m256 sumf = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+        // 16-bit sums
+        __m256i sumi0 = _mm256_setzero_si256();
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+
+        // first 32 bytes of 5 elements
+        {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
+            // 8-bit multiplies with shifts, masks and adds
+            __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
+            __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
+            __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
+            __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
+
+            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
+
+            // Cancel the +1 from avg so that it behaves like a halving add
+            qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
+            qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
+            qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
+            qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
+            qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
+            // Multiply by 3 and get the top 2 bits
+            qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
+            qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
+            qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
+            qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
+            qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
+            qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
+            qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
+            qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
+            qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
+            qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
+
+            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs +   0));
+            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  32));
+            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  64));
+            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  96));
+            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
+
+            qx0 = _mm256_maddubs_epi16(qx0, qy0);
+            qx1 = _mm256_maddubs_epi16(qx1, qy1);
+            qx2 = _mm256_maddubs_epi16(qx2, qy2);
+            qx3 = _mm256_maddubs_epi16(qx3, qy3);
+            qx4 = _mm256_maddubs_epi16(qx4, qy4);
+
+            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
+            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
+            sumi2 = _mm256_add_epi16(sumi2, qx4);
+        }
+
+        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
+        {
+            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
+            __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
+            __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
+            __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
+            __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
+            __m256i qx01 = MM256_SET_M128I(qx1, qx0);
+            __m256i qx23 = MM256_SET_M128I(qx3, qx2);
+
+            // avx2 does not have 8-bit multiplies, so 16-bit it is.
+            qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
+            qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
+            __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
+
+            __m256i qx45 = MM256_SET_M128I(qx5, qx4);
+
+            // Cancel the +1 from avg so that it behaves like a halving add
+            qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
+            qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
+            qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
+            // Multiply by 3 and get the top 2 bits
+            qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
+            qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
+            qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
+            qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
+            qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
+            qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
+
+            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
+            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
+            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
+
+            qx01 = _mm256_maddubs_epi16(qx01, qy01);
+            qx23 = _mm256_maddubs_epi16(qx23, qy23);
+            qx45 = _mm256_maddubs_epi16(qx45, qy45);
+
+            sumi0 = _mm256_add_epi16(sumi0, qx01);
+            sumi1 = _mm256_add_epi16(sumi1, qx23);
+            sumi2 = _mm256_add_epi16(sumi2, qx45);
+        }
+
+        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+
+        sumi0 = _mm256_sub_epi16(sumi0, ysum);
+        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
+        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
+
+        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
+    }
+
+    *s = hsum_float_8(sumf);
+
+#else
+    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
+
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        int sum = 0;
+
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
+            for (size_t l = 0; l < 5; ++l) {
+                for (size_t m = 0; m < 32; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
+                    uint16_t xi = ((uint16_t) q * 3) >> 8;
+                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
+                }
+            }
+        }
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
+            for (size_t l = 0; l < 5; ++l) {
+                for (size_t m = 0; m < 16; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[l];
+                    uint16_t xi = ((uint16_t) q * 3) >> 8;
+                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
+                }
+            }
+        }
+
+        for (size_t l = 0; l < 4; ++l) {
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[l];
+                uint16_t xi = ((uint16_t) q * 3) >> 8;
+                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
+            }
+        }
+
+        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+    }
+
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq2_0 * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+    float sumf = 0.0f;
+
+    const uint8x16_t m3 = vdupq_n_u8(3);
+
+    for (int i = 0; i < nb; ++i) {
+#if defined(__ARM_FEATURE_DOTPROD)
+        int32x4_t sumi0 = vdupq_n_s32(0);
+        int32x4_t sumi1 = vdupq_n_s32(0);
+#else
+        int16x8_t sumi0 = vdupq_n_s16(0);
+        int16x8_t sumi1 = vdupq_n_s16(0);
+#endif
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
+            uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
+            uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
+            uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
+            uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
+            uint8x16_t qx5 = vshrq_n_u8(qx1, 4);
+            uint8x16_t qx6 = vshrq_n_u8(qx0, 6);
+            uint8x16_t qx7 = vshrq_n_u8(qx1, 6);
+
+            int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3));
+            int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3));
+            int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3));
+            int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3));
+            int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3));
+            int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3));
+            int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3));
+            int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3));
+
+            const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 +   0);
+            const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 +  16);
+            const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 +  32);
+            const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 +  48);
+            const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 +  64);
+            const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 +  80);
+            const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 +  96);
+            const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
+            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
+            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
+            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
+            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
+            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
+            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
+            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
+#else
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
+            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
+            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
+#endif
+        }
+
+        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
+        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumi0 = vaddq_s32(sumi0, sumi1);
+        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
+
+        sumf += d * (float) vaddvq_s32(sumi0);
+#else
+        sumi0 = vaddq_s16(sumi0, sumi1);
+        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
+
+        sumf += d * (float) vaddlvq_s16(sumi0);
+#endif
+    }
+
+    *s = sumf;
+
+#elif defined(__AVX2__)
+    __m256 sumf = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+        // 16-bit sums, because 256*127 still fits
+        __m256i sumi0 = _mm256_setzero_si256();
+        __m256i sumi1 = _mm256_setzero_si256();
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
+            __m256i qx1 = _mm256_srli_epi16(qx0, 2);
+            __m256i qx2 = _mm256_srli_epi16(qx0, 4);
+            __m256i qx3 = _mm256_srli_epi16(qx0, 6);
+
+            // 0, 1, 2 (should not be 3)
+            qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
+            qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
+            qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
+            qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
+
+            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 +  0));
+            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
+            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
+            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
+
+            qx0 = _mm256_maddubs_epi16(qx0, qy0);
+            qx1 = _mm256_maddubs_epi16(qx1, qy1);
+            qx2 = _mm256_maddubs_epi16(qx2, qy2);
+            qx3 = _mm256_maddubs_epi16(qx3, qy3);
+
+            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
+            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
+        }
+
+        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+
+        sumi0 = _mm256_add_epi16(sumi0, sumi1);
+        sumi0 = _mm256_sub_epi16(sumi0, ysum);
+        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
+
+        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
+    }
+
+    *s = hsum_float_8(sumf);
+
+#else
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        int32_t sumi = 0;
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            for (size_t l = 0; l < 4; ++l) {
+                for (size_t k = 0; k < 32; ++k) {
+                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
+                }
+            }
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        sumf += (float) sumi * d;
+    }
+
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q2_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#ifdef __ARM_NEON
+    const uint8x16_t m3 = vdupq_n_u8(0x3);
+    const uint8x16_t m4 = vdupq_n_u8(0xF);
+
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    ggml_int8x16x2_t q2bytes;
+    uint8_t aux[16];
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * restrict sc = x[i].scales;
+
+        const uint8x16_t mins_and_scales = vld1q_u8(sc);
+        const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
+        vst1q_u8(aux, scales);
+
+        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
+        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
+                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
+        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
+                                       vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1])));
+        sum += dmin * vaddvq_s32(vaddq_s32(s0, s1));
+
+        int isum = 0;
+        int is = 0;
+
+// We use this macro instead of a function call because for some reason
+// the code runs 2-3% slower, even if the function is declared inline
+#define MULTIPLY_ACCUM_WITH_SCALE(index)\
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
+
+#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
+        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
+        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
+        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
+        MULTIPLY_ACCUM_WITH_SCALE((index));
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
+
+            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
+            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
+            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
+
+            MULTIPLY_ACCUM_WITH_SCALE(0);
+
+            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
+            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
+            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
+
+            is += 8;
+        }
+
+        sum += d * isum;
+    }
+
+    *s = sum;
+
+#elif defined __AVX2__
+
+    const __m256i m3 = _mm256_set1_epi8(3);
+    const __m128i m4 = _mm_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
+        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
+        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
+        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
+
+        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
+        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
+            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
+            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
+            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
+
+            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
+            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
+            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
+            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
+
+            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
+            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
+            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
+            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
+
+            p0 = _mm256_add_epi32(p0, p1);
+            p2 = _mm256_add_epi32(p2, p3);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
+        }
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(0x3);
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(0x2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // load mins and scales from block_q2_K.scales[QK_K/16]
+        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
+        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
+        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
+        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
+
+        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
+        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
+        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
+
+        // sumf += -dmin * summs in 32bits*8
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
+
+        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
+        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
+        const __m128i scales[2] = { scales_0, scales_1 };
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
+            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
+            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
+            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
+            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
+            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
+            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
+            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
+            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
+            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
+            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
+            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
+            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
+
+            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
+            __m128i shuffle = _mm_set1_epi16(0x0100);
+            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
+
+            p0 = _mm_add_epi32(p0, p1);
+            p2 = _mm_add_epi32(p2, p3);
+            p4 = _mm_add_epi32(p4, p5);
+            p6 = _mm_add_epi32(p6, p7);
+
+            // isum in 32bits*4*2
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
+        }
+
+        // sumf += dall * isum - dmin * summs in 32bits
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const  int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        size_t vl = 16;
+
+        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
+        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
+
+        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
+
+        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
+        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
+        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
+        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+
+        sumf  += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
+
+        vl = 32;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
+
+        uint8_t is=0;
+        int isum=0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load Q2
+            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
+
+            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
+            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
+
+            // duplicate scale elements for product
+            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
+            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
+            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
+            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
+
+            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
+            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
+            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
+            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
+
+            // load Q8
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
+            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
+
+            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
+            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
+            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
+            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
+
+            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
+
+            q2+=32;  q8+=128;  is=8;
+
+        }
+
+        sumf += dall * isum;
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
+        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
+
+        q2xmins = vec_sr(q2xmins, v4);
+        vector signed short q2xmins0 = vec_unpackh(q2xmins);
+        vector signed short q2xmins1 = vec_unpackl(q2xmins);
+
+        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
+        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
+        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+        vector signed int vsumi4 = v0;
+        vector signed int vsumi5 = v0;
+        vector signed int vsumi6 = v0;
+        vector signed int vsumi7 = v0;
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
+            q2 += 32;
+
+            vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+            vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
+            vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
+            vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
+            vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+            vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
+            vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
+            vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
+            vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
+            vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
+            vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
+            vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
+            vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
+            vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
+            vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
+
+            vector signed short vscales_07 = vec_unpackh(vscales);
+            vector signed int vscales_03 = vec_unpackh(vscales_07);
+            vector signed int vscales_47 = vec_unpackl(vscales_07);
+            vector signed int vs0 = vec_splat(vscales_03, 0);
+            vector signed int vs1 = vec_splat(vscales_03, 1);
+            vector signed int vs2 = vec_splat(vscales_03, 2);
+            vector signed int vs3 = vec_splat(vscales_03, 3);
+            vector signed int vs4 = vec_splat(vscales_47, 0);
+            vector signed int vs5 = vec_splat(vscales_47, 1);
+            vector signed int vs6 = vec_splat(vscales_47, 2);
+            vector signed int vs7 = vec_splat(vscales_47, 3);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
+            vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
+            vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
+            vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
+            vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+
+    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+    const __m128i m4 = __lsx_vreplgr2vr_b(0xF);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
+        const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
+        const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
+        const __m256i mins = lasx_ext8_16(mins8);
+        const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
+
+        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
+
+        const __m256i all_scales = lasx_ext8_16(scales8);
+        const __m128i l_scales = lasx_extracti128(all_scales, 0);
+        const __m128i h_scales = lasx_extracti128(all_scales, 1);
+        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32;
+
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            const __m256i q2_0 = __lasx_xvand_v(q2bits, m3);
+            const __m256i q2_1 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 2), m3);
+            const __m256i q2_2 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 4), m3);
+            const __m256i q2_3 = __lasx_xvand_v(__lasx_xvsrli_h(q2bits, 6), m3);
+
+            __m256i p0 = lasx_maddubs_h(q2_0, q8_0);
+            __m256i p1 = lasx_maddubs_h(q2_1, q8_1);
+            __m256i p2 = lasx_maddubs_h(q2_2, q8_2);
+            __m256i p3 = lasx_maddubs_h(q2_3, q8_3);
+
+            p0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(0)), p0);
+            p1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(1)), p1);
+            p2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(2)), p2);
+            p3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(3)), p3);
+
+            p0 = __lasx_xvadd_w(p0, p1);
+            p2 = __lasx_xvadd_w(p2, p3);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2));
+        }
+
+        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const  int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        int summs = 0;
+        for (int j = 0; j < 16; ++j) {
+            summs += y[i].bsums[j] * (sc[j] >> 4);
+        }
+
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        int isum = 0;
+        int is = 0;
+        int d;
+        for (int k = 0; k < QK_K/128; ++k) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                d = sc[is++] & 0xF;
+                int isuml = 0;
+                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
+                isum += d * isuml;
+                d = sc[is++] & 0xF;
+                isuml = 0;
+                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
+                isum += d * isuml;
+                shift += 2;
+                q8 += 32;
+            }
+            q2 += 32;
+        }
+        sumf += dall * isum - dmin * summs;
+    }
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    const block_q3_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#ifdef __ARM_NEON
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const uint8x16_t m3b = vdupq_n_u8(0x3);
+    const int32x4_t  vzero = vdupq_n_s32(0);
+
+    const uint8x16_t m0 = vdupq_n_u8(1);
+    const uint8x16_t m1 = vshlq_n_u8(m0, 1);
+    const uint8x16_t m2 = vshlq_n_u8(m0, 2);
+    const uint8x16_t m3 = vshlq_n_u8(m0, 3);
+    const int8_t m32 = 32;
+
+    ggml_int8x16x4_t q3bytes;
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
+
+        ggml_uint8x16x4_t q3h;
+
+        int32_t isum = 0;
+
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= m32;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
+            q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
+            q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1);
+            q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1);
+
+            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0]));
+            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1]));
+            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
+            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
+
+            scale += 4;
+
+            q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
+            q3h.val[1] = vbicq_u8(m2, qhbits.val[1]);
+            q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1);
+            q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1);
+
+            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0]));
+            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1]));
+            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
+            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
+
+            scale += 4;
+
+            if (j == 0) {
+                qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4);
+                qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4);
+            }
+
+        }
+        sum += d * isum;
+
+    }
+
+    *s = sum;
+
+#elif defined __AVX2__
+
+    const __m256i m3 = _mm256_set1_epi8(3);
+    const __m256i mone = _mm256_set1_epi8(1);
+    const __m128i m32 = _mm_set1_epi8(32);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    uint32_t aux[3];
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        __m128i scales128 = _mm_set_epi32(
+                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+        scales128 = _mm_sub_epi8(scales128, m32);
+        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
+        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
+
+        // high bit
+        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
+
+        // integer accumulator
+        __m256i sumi = _mm256_setzero_si256();
+
+        int bit = 0;
+        int is  = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load low 2 bits
+            const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
+
+            // prepare low and high bits
+            const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
+            const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
+            const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
+            const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
+            const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+            ++bit;
+
+            // load Q8 quants
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
+            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+            // and 2 if the high bit was set)
+            __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
+            __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
+            __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
+            __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
+
+            __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
+            __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
+            __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
+            __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
+
+            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
+            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
+            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+
+            // multiply with scales
+            p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
+            p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
+            p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
+            p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
+
+            // accumulate
+            p16_0 = _mm256_add_epi32(p16_0, p16_1);
+            p16_2 = _mm256_add_epi32(p16_2, p16_3);
+            sumi  = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
+
+        }
+
+        // multiply with block scale and accumulate
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i mone = _mm_set1_epi8(1);
+    const __m128i m32 = _mm_set1_epi8(32);
+    const __m128i m2 = _mm_set1_epi8(2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    const uint32_t *aux;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // Set up scales
+        aux = (const uint32_t *)x[i].scales;
+        __m128i scales128 = _mm_set_epi32(
+                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+        scales128 = _mm_sub_epi8(scales128, m32);
+        const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
+        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
+        const __m128i scales[2] = { scales_0, scales_1 };
+
+        // high bit *128*2 from block_q3_K.hmask[QK_K/8]
+        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
+        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
+
+        // integer accumulator
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
+            const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
+            const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
+
+            // prepare low and high bits
+            const int bit = j << 2;
+
+            const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
+            const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
+            const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
+            const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
+
+            const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
+            const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
+            const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
+            const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
+
+            const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
+            const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
+            const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
+            const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
+
+            const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
+            const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
+            const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
+            const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
+
+            // load Q8 quants from block_q8_K.qs[QK_K]
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
+            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+            // and 2 if the high bit was set)
+            __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
+            __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
+            __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
+            __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
+            __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
+            __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
+            __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
+            __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
+
+            __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
+            __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
+            __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
+            __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
+            __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
+            __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
+            __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
+
+            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
+            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
+            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
+            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
+
+            // multiply with scales
+            __m128i shuffle = _mm_set1_epi16(0x0100);
+            p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
+
+            // accumulate
+            p16_0 = _mm_add_epi32(p16_0, p16_1);
+            p16_2 = _mm_add_epi32(p16_2, p16_3);
+            p16_4 = _mm_add_epi32(p16_4, p16_5);
+            p16_6 = _mm_add_epi32(p16_6, p16_7);
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
+
+        }
+
+        // multiply with block scale and accumulate
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const  int8_t * restrict q8 = y[i].qs;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+
+        size_t vl = 32;
+        uint8_t m =  1;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
+
+        int sum_t = 0;
+
+        for (int j = 0; j < QK_K; j += 128) {
+
+            vl = 32;
+
+            // load Q3
+            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
+
+            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
+            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
+            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
+            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
+
+            // compute mask for subtraction
+            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
+            m <<= 1;
+
+            // load Q8 and take product with Q3
+            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            // retrieve lane to multiply with scale
+            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
+            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
+            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
+            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
+            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
+            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
+            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
+            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
+
+            sum_t +=  __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q3 += 32;    q8 += 128;   scale += 8;
+
+        }
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        sumf += d*sum_t;
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowMask1 = vec_splats((int8_t)0xf);
+    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector signed char v1 = vec_splats((signed char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        UNUSED(kmask1);
+        UNUSED(kmask2);
+
+        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+        vector signed char u1 = vec_and(u0, lowMask1);
+        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+        vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
+        vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
+        vector signed char u31 = vec_and(u3, lowMask2);
+
+        u1 = vec_or(u1, u30);
+        u2 = vec_or(vec_sr(u0, v4), u31);
+
+        vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
+
+        vscales = vec_sub(vscales, off);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+        vector signed int vsumi4 = v0;
+        vector signed int vsumi5 = v0;
+        vector signed int vsumi6 = v0;
+        vector signed int vsumi7 = v0;
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
+            q3 += 32;
+
+            //the low 2 bits
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
+            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
+            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
+            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
+            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
+
+            //the 3rd bit
+            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
+            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
+            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
+            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
+            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
+            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
+            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
+            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
+            qxhs0 = vec_sr(qxhs0, v4);
+            qxhs1 = vec_sr(qxhs1, v4);
+
+            vector signed char q3x00 = vec_sub(qxs00, qxh00);
+            vector signed char q3x01 = vec_sub(qxs01, qxh01);
+            vector signed char q3x02 = vec_sub(qxs02, qxh02);
+            vector signed char q3x03 = vec_sub(qxs03, qxh03);
+            vector signed char q3x10 = vec_sub(qxs10, qxh10);
+            vector signed char q3x11 = vec_sub(qxs11, qxh11);
+            vector signed char q3x12 = vec_sub(qxs12, qxh12);
+            vector signed char q3x13 = vec_sub(qxs13, qxh13);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short vscales_h = vec_unpackh(vscales);
+            vector signed short vs0 = vec_splat(vscales_h, 0);
+            vector signed short vs1 = vec_splat(vscales_h, 1);
+            vector signed short vs2 = vec_splat(vscales_h, 2);
+            vector signed short vs3 = vec_splat(vscales_h, 3);
+            vector signed short vs4 = vec_splat(vscales_h, 4);
+            vector signed short vs5 = vec_splat(vscales_h, 5);
+            vector signed short vs6 = vec_splat(vscales_h, 6);
+            vector signed short vs7 = vec_splat(vscales_h, 7);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
+            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
+            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
+            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
+            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
+            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
+
+            vsumi0 = vec_msum(qv00, vs0, vsumi0);
+            vsumi1 = vec_msum(qv01, vs2, vsumi1);
+            vsumi2 = vec_msum(qv02, vs4, vsumi2);
+            vsumi3 = vec_msum(qv03, vs6, vsumi3);
+            vsumi4 = vec_msum(qv10, vs1, vsumi4);
+            vsumi5 = vec_msum(qv11, vs3, vsumi5);
+            vsumi6 = vec_msum(qv12, vs5, vsumi6);
+            vsumi7 = vec_msum(qv13, vs7, vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+
+    const __m256i m3 = __lasx_xvreplgr2vr_b(3);
+    const __m256i mone = __lasx_xvreplgr2vr_b(1);
+    const __m128i m32 = __lsx_vreplgr2vr_b(32);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    uint32_t aux[3];
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+        // Set up scales
+        memcpy(aux, x[i].scales, 12);
+        __m128i scales128 = lsx_set_w(
+                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+        scales128 = __lsx_vsub_b(scales128, m32);
+        const __m256i all_scales = lasx_ext8_16(scales128);
+        const __m128i l_scales = lasx_extracti128(all_scales, 0);
+        const __m128i h_scales = lasx_extracti128(all_scales, 1);
+        const __m256i scales[2] = {lasx_insertf128(l_scales, l_scales), lasx_insertf128(h_scales, h_scales)};
+
+        // high bit
+        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
+
+        // integer accumulator
+        __m256i sumi = __lasx_xvldi(0);
+
+        int bit = 0;
+        int is  = 0;
+        __m256i xvbit;
+
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load low 2 bits
+            const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            // prepare low and high bits
+            const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
+            const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
+            const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
+            const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit);
+            const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
+            const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
+            ++bit;
+
+            // load Q8 quants
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h,
+            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+            // and 2 if the high bit was set)
+            __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0);
+            __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1);
+            __m256i q8s_2 = lasx_maddubs_h(q3h_2, q8_2);
+            __m256i q8s_3 = lasx_maddubs_h(q3h_3, q8_3);
+
+            __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0);
+            __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1);
+            __m256i p16_2 = lasx_maddubs_h(q3l_2, q8_2);
+            __m256i p16_3 = lasx_maddubs_h(q3l_3, q8_3);
+
+            p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+            p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+            p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
+            p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
+
+            // multiply with scales
+            p16_0 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
+            p16_1 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
+            p16_2 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
+            p16_3 = lasx_madd_h(lasx_shuffle_b(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
+
+            // accumulate
+            p16_0 = __lasx_xvadd_w(p16_0, p16_1);
+            p16_2 = __lasx_xvadd_w(p16_2, p16_3);
+            sumi  = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
+        }
+        // multiply with block scale and accumulate
+        acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+    // scalar version
+    // This function is written like this so the compiler can manage to vectorize most of it
+    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
+    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
+    // The ideal situation would be if we could just write the code once, and the compiler would
+    // automatically produce the best possible set of machine instructions, instead of us having to manually
+    // write vectorized versions for AVX, ARM_NEON, etc.
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    uint32_t auxs[4];
+    const int8_t * scales = (const int8_t*)auxs;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict hm = x[i].hmask;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
+            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
+            a += 32; m <<= 1;
+            q3 += 32;
+        }
+        a = aux8;
+
+        memcpy(auxs, x[i].scales, 12);
+        uint32_t tmp = auxs[2];
+        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+        for (int j = 0; j < QK_K/16; ++j) {
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+
+#endif
+
+}
+
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    uint32_t utmp[4];
+
+#ifdef __ARM_FEATURE_SVE
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
+
+        memcpy(utmp, x[i].scales, K_SCALE_SIZE);
+
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[0] &= kmask1;
+
+        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
+        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
+                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
+        sumf -= dmin * vaddvq_s32(prod);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const int vector_length = ggml_cpu_get_sve_cnt()*8;
+        const svuint8_t m4b = svdup_n_u8(0xf);
+        const svint32_t mzero = svdup_n_s32(0);
+        svint32_t sumi1 = svdup_n_s32(0);
+        svint32_t sumi1_1 = svdup_n_s32(0);
+        svint32_t sumi1_2 = svdup_n_s32(0);
+        svint32_t sumi2 = svdup_n_s32(0);
+        svint32_t sumi2_1 = svdup_n_s32(0);
+        svint32_t sumi2_2 = svdup_n_s32(0);
+        switch (vector_length) {
+            case 128:
+                {
+                    for (int j = 0; j < QK_K/64; ++j) {
+                        svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b));
+                        svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
+                        q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b));
+                        q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
+
+                        q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4));
+                        q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
+                        q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4));
+                        q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
+                        sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
+                        q4 += 32;
+                    }
+                    sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2);
+                    sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2);
+                    sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2)));
+                } break;
+            case 256:
+            case 512:
+                {
+                    for (int j = 0; j < QK_K/64; ++j) {
+                        const svuint8_t q4bits  = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32;
+                        svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b));
+                        svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
+                        sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
+
+                        q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4));
+                        q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
+                        sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
+                    }
+                    sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2)));
+                } break;
+            default:
+                assert(false && "Unsupported vector length");
+                break;
+        }
+    }
+    *s = sumf;
+#elif __ARM_NEON
+    const uint8x16_t m4b = vdupq_n_u8(0xf);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x2_t q4bytes;
+    ggml_int8x16x2_t q8bytes;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
+
+        memcpy(utmp, x[i].scales, 12);
+
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[0] &= kmask1;
+
+        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
+        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
+                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
+        sumf -= dmin * vaddvq_s32(prod);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        int32_t sumi1 = 0;
+        int32_t sumi2 = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
+
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
+            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
+            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
+
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
+
+            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
+            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
+            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
+
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+
+            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
+        }
+
+        sumf += d * (sumi1 + sumi2);
+
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+    __m128 acc_m = _mm_setzero_ps();
+
+   for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
+        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
+        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
+
+        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+            const __m256i q4l = _mm256_and_si256(q4bits, m4);
+            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
+
+            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
+            p16l = _mm256_madd_epi16(scale_l, p16l);
+
+            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
+            p16h = _mm256_madd_epi16(scale_h, p16h);
+            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
+
+            sumi = _mm256_add_epi32(sumi, sumj);
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
+    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
+
+    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
+
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(0x2);
+
+    __m256 acc = _mm256_setzero_ps();
+    __m128 acc_m = _mm_setzero_ps();
+
+   for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
+        const __m128i scales = _mm_cvtepu8_epi16(utmps);
+        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
+
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
+        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
+        const __m128i prod = _mm_madd_epi16(mins, q8s);
+        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        __m128i shuffle = _mm_set1_epi16(0x0100);
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+
+            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
+            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
+            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
+            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
+
+            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
+            p16l = _mm_madd_epi16(scale_l, p16l);
+            sumi_0 = _mm_add_epi32(sumi_0, p16l);
+            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
+            p16l = _mm_madd_epi16(scale_l, p16l);
+            sumi_1 = _mm_add_epi32(sumi_1, p16l);
+
+            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
+            p16h = _mm_madd_epi16(scale_h, p16h);
+            sumi_0 = _mm_add_epi32(sumi_0, p16h);
+            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
+            p16h = _mm_madd_epi16(scale_h, p16h);
+            sumi_1 = _mm_add_epi32(sumi_1, p16h);
+
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
+
+    }
+
+    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
+    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
+
+    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
+
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        size_t vl = 8;
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums   = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8  = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t  prod   = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        vl = 32;
+
+        int32_t sum_1 = 0;
+        int32_t sum_2 = 0;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q4
+            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+            // load Q8 and multiply it with lower Q4 nibble
+            vint8m1_t  q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t  q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
+            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
+
+            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
+
+            // load Q8 and multiply it with upper Q4 nibble
+            vint8m1_t  q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t  q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
+            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
+
+            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
+
+            q4 += 32;    q8 += 64;
+
+        }
+
+        sumf += d*(sum_1 + sum_2);
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v2 = vec_splats((uint8_t)2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        UNUSED(kmask1);
+        UNUSED(kmask2);
+        UNUSED(kmask3);
+        UNUSED(utmp);
+
+        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+        vector signed char u3 = vec_sr(u2, v4);
+
+        vector signed char u30 = u1;
+        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+        u1 = vec_and(u0, lowMask1);
+        u2 = vec_or(u30, u31);
+
+        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
+
+        vector signed short vscales = vec_unpackh(utmps);
+        vector signed short q4xmins = vec_unpackl(utmps);
+        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
+        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
+
+        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; j+=2) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
+            q4 += 64;
+
+            vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+            vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
+            vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+            vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
+            vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
+            vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
+            vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
+            vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y20 = vec_xl( 64, q8);
+            vector signed char q8y30 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
+            vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
+            vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
+            vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
+            vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
+            vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
+            vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
+            vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
+
+            vector signed int vscales_h = vec_unpackh(vscales);
+            vector signed int vs0 = vec_splat(vscales_h, 0);
+            vector signed int vs1 = vec_splat(vscales_h, 1);
+            vector signed int vs2 = vec_splat(vscales_h, 2);
+            vector signed int vs3 = vec_splat(vscales_h, 3);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
+
+            vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+    GGML_UNUSED(kmask1);
+    GGML_UNUSED(kmask2);
+    GGML_UNUSED(kmask3);
+
+    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+    __m128 acc_m = (__m128)__lsx_vldi(0);
+
+   for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
+        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
+        const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
+        acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
+
+        const __m128i sc128  = lasx_extracti128(mins_and_scales, 0);
+        const __m256i scales = lasx_insertf128(sc128, sc128);
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_l = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_h = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+            const __m256i q4l = __lasx_xvand_v(q4bits, m4);
+            const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4);
+
+            const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            __m256i p16l = lasx_maddubs_h(q4l, q8l);
+            p16l = lasx_madd_h(scale_l, p16l);
+
+            const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            __m256i p16h = lasx_maddubs_h(q4h, q8h);
+            p16h = lasx_madd_h(scale_h, p16h);
+            const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
+
+            sumi = __lasx_xvadd_w(sumi, sumj);
+        }
+
+        __m256 vd = __lasx_xvreplfr2vr_s(d);
+        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
+    }
+
+    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
+    __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
+    acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
+
+
+    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
+#else
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].qs;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        for (int j = 0; j < QK_K/64; ++j) {
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
+            a += 32;
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
+            a += 32; q4 += 32;
+        }
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            int32_t scale = scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf -= dmin * sumi;
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy,  size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    uint32_t utmp[4];
+
+#ifdef __ARM_NEON
+    const uint8x16_t m4b = vdupq_n_u8(0xf);
+    const uint8x16_t mone = vdupq_n_u8(1);
+    const uint8x16_t mtwo = vdupq_n_u8(2);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x4_t q5bytes;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
+        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
+        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
+                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
+        int32_t sumi_mins = vaddvq_s32(prod);
+
+        const uint8_t * scales = (const uint8_t *)utmp;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
+
+        ggml_uint8x16x4_t q5h;
+
+        int32_t sumi = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
+            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
+            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
+            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
+            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
+            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
+
+            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
+            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
+            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
+            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
+
+            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
+            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
+        }
+
+        sumf += d * sumi - dmin * sumi_mins;
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+    const __m128i mzero = _mm_setzero_si128();
+    const __m256i mone  = _mm256_set1_epi8(1);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0.f;
+
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
+        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
+        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
+        summs += dmin * _mm_extract_epi32(hsum, 0);
+
+        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
+
+        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
+        __m256i hmask = mone;
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        int bit = 0;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
+
+            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
+            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
+            const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
+            hmask = _mm256_slli_epi16(hmask, 1);
+
+            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
+            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
+            const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
+            hmask = _mm256_slli_epi16(hmask, 1);
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
+            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
+
+            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
+            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
+
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
+#elif defined __AVX__
+
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i mzero = _mm_setzero_si128();
+    const __m128i mone  = _mm_set1_epi8(1);
+    const __m128i m2 = _mm_set1_epi8(2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    float summs = 0.f;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
+        const __m128i scales = _mm_cvtepu8_epi16(utmps);
+        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
+
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
+        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
+        const __m128i prod = _mm_madd_epi16(mins, q8s);
+        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
+        summs += dmin * _mm_extract_epi32(hsum, 0);
+
+        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
+        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
+        __m128i hmask = mone;
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        int bit = 0;
+
+        __m128i shuffle = _mm_set1_epi16(0x0100);
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi16(shuffle, m2);
+
+            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
+            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
+
+            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
+            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
+            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
+            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
+            __m128i q5_0  = _mm_add_epi8(q5l_0, q5h_0);
+            __m128i q5_1  = _mm_add_epi8(q5l_1, q5h_1);
+            hmask = _mm_slli_epi16(hmask, 1);
+
+            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
+            p16_0 = _mm_madd_epi16(scale_0, p16_0);
+            p16_1 = _mm_madd_epi16(scale_0, p16_1);
+
+            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
+            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
+            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
+            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
+            q5_0  = _mm_add_epi8(q5l_0, q5h_0);
+            q5_1  = _mm_add_epi8(q5l_1, q5h_1);
+            hmask = _mm_slli_epi16(hmask, 1);
+
+            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
+            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
+            p16_2 = _mm_madd_epi16(scale_1, p16_2);
+            p16_3 = _mm_madd_epi16(scale_1, p16_3);
+
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+        }
+
+        __m256 vd = _mm256_set1_ps(d);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
+
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+    float sums = 0.0;
+
+    size_t vl;
+
+    for (int i = 0; i < nb; ++i) {
+
+        vl = 8;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        vl = 32;
+        int32_t aux32 = 0;
+        int is = 0;
+
+        uint8_t m = 1;
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q5 and Q8
+            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
+            vint8m1_t  q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t  q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
+
+            // compute mask for addition
+            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
+            m <<= 1;
+
+            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
+            m <<= 1;
+
+            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
+            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
+
+            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
+            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
+
+            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
+            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
+
+            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
+            q5 += 32;    q8 += 64;
+
+        }
+
+        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
+        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
+
+    }
+
+    *s = sumf+sums;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        UNUSED(kmask1);
+        UNUSED(kmask2);
+        UNUSED(kmask3);
+        UNUSED(utmp);
+
+        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+        vector signed char u3 = vec_sr(u2, v4);
+
+        vector signed char u30 = u1;
+        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+        u1 = vec_and(u0, lowMask1);
+        u2 = vec_or(u30, u31);
+
+        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed short vscales = vec_unpackh(utmps);
+
+        vector signed short q5xmins = vec_unpackl(utmps);
+        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
+        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
+
+        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q5, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
+            q5 += 32;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+
+            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
+            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
+            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
+            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
+            qxhs0 = vec_sr(qxhs0, v2);
+            qxhs1 = vec_sr(qxhs1, v2);
+
+            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
+            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
+            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
+            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl(16, q8);
+            vector signed char q8y01 = vec_xl(32, q8);
+            vector signed char q8y11 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
+            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
+            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
+            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
+
+            vector signed int vscales_h = vec_unpackh(vscales);
+            vector signed int vs0 = vec_splat(vscales_h, 0);
+            vector signed int vs1 = vec_splat(vscales_h, 1);
+            vscales = vec_sld(vscales, vscales, 12);
+
+            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
+            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+    GGML_UNUSED(kmask1);
+    GGML_UNUSED(kmask2);
+    GGML_UNUSED(kmask3);
+
+    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+    const __m128i mzero = __lsx_vldi(0);
+    const __m256i mone  = __lasx_xvreplgr2vr_b(1);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0.f;
+
+   for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
+
+        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
+        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
+        const __m128i prod = lsx_madd_h(lasx_extracti128(mins_and_scales, 1), q8s);
+        const __m128i hsum = lsx_hadd_w(lsx_hadd_w(prod, mzero), mzero);
+        summs += dmin * __lsx_vpickve2gr_w(hsum, 0);    //TODO check
+
+        const __m128i sc128  = lasx_extracti128(mins_and_scales, 0);
+        const __m256i scales = lasx_insertf128(sc128, sc128);
+
+        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
+        __m256i hmask = mone;
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        int bit = 0;
+        __m256i xvbit;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+
+            const __m256i scale_0 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+0));
+            const __m256i scale_1 = lasx_shuffle_b(scales, get_scale_shuffle_k4(2*j+1));
+
+            const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
+
+            xvbit = __lasx_xvreplgr2vr_h(bit++);
+            const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
+            const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
+            const __m256i q5_0  = __lasx_xvadd_b(q5l_0, q5h_0);
+            hmask = __lasx_xvslli_h(hmask, 1);
+
+            xvbit = __lasx_xvreplgr2vr_h(bit++);
+            const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
+            const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
+            const __m256i q5_1  = __lasx_xvadd_b(q5l_1, q5h_1);
+            hmask = __lasx_xvslli_h(hmask, 1);
+
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            __m256i p16_0 = lasx_maddubs_h(q5_0, q8_0);
+            __m256i p16_1 = lasx_maddubs_h(q5_1, q8_1);
+
+            p16_0 = lasx_madd_h(scale_0, p16_0);
+            p16_1 = lasx_madd_h(scale_1, p16_1);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+
+        }
+
+        __m256 vd = __lasx_xvreplfr2vr_s(d);
+        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc) + summs;
+
+#else
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins   = (const uint8_t*)&utmp[2];
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        uint8_t m = 1;
+        for (int j = 0; j < QK_K/64; ++j) {
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
+            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
+            a += 32; m <<= 1;
+            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
+            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
+            a += 32; m <<= 1;
+            q4 += 32;
+        }
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            int32_t scale = scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        sumf -= dmin * sumi;
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q6_K * restrict x = vx;
+    const block_q8_K * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#ifdef __ARM_NEON
+    float sum = 0;
+
+    const uint8x16_t m4b = vdupq_n_u8(0xF);
+    const int32x4_t  vzero = vdupq_n_s32(0);
+    //const int8x16_t  m32s = vdupq_n_s8(32);
+
+    const uint8x16_t mone = vdupq_n_u8(3);
+
+    ggml_int8x16x4_t q6bytes;
+    ggml_uint8x16x4_t q6h;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+        const int8x16_t scales = vld1q_s8(scale);
+        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
+
+        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
+                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
+                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
+                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
+        int32_t isum_mins = vaddvq_s32(prod);
+
+        int32_t isum = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
+            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
+            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
+            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[1], 2);
+            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+
+            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
+            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
+            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
+            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
+            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
+            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
+            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
+            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+
+            scale += 4;
+
+            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            shifted = vshrq_n_u8(qhbits.val[0], 4);
+            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[1], 4);
+            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[0], 6);
+            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+            shifted = vshrq_n_u8(qhbits.val[1], 6);
+            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
+
+            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
+            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
+            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
+            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
+            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
+            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
+            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
+            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
+
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+            scale += 4;
+        }
+        //sum += isum * d_all * y[i].d;
+        sum += d_all * y[i].d * (isum - 32 * isum_mins);
+
+    }
+    *s = sum;
+
+#elif defined __AVX2__
+
+    const __m256i m4 = _mm256_set1_epi8(0xF);
+    const __m256i m2 = _mm256_set1_epi8(3);
+    const __m256i m32s = _mm256_set1_epi8(32);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;
+
+            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
+
+            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
+            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
+            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
+            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
+
+            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
+            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
+            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
+            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
+            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
+            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
+            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
+
+            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
+            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
+            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
+            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
+
+            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
+            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
+            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+
+            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
+            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
+            p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
+            p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
+
+        }
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i m15 = _mm_set1_epi8(15);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        // handle the q6_k -32 offset separately using bsums
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
+        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
+        const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
+        const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
+        const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
+            const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
+
+            const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
+            const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
+            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
+            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
+            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
+
+            const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+
+            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
+            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
+            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
+            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
+            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
+            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
+            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
+            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
+
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
+            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
+            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
+            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
+            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
+            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
+            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
+
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;
+
+            p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
+            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
+            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
+            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
+            p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
+            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
+
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
+
+        }
+
+        sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
+        sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
+        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        size_t vl;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        int sum_t = 0;
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            vl = 32;
+
+            // load qh
+            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
+
+            // load Q6
+            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
+            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
+
+            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
+            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
+            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
+            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
+
+            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
+            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
+
+            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
+            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
+            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
+            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
+
+            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
+            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
+            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
+            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
+
+            // load Q8 and take product
+            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
+            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
+            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
+            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
+            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
+            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
+            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
+            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
+
+            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q6 += 64;   qh += 32;   q8 += 128;   is=8;
+
+        }
+
+        sumf += d * sum_t;
+
+    }
+
+    *s = sumf;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+        vector signed int vsumi4 = v0;
+        vector signed int vsumi5 = v0;
+        vector signed int vsumi6 = v0;
+        vector signed int vsumi7 = v0;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict qs = x[i].scales;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q6, 0, 0);
+            __builtin_prefetch(qh, 0, 0);
+            __builtin_prefetch(q8, 0, 0);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
+            q6 += 64;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+            vector signed char qxs20 = vec_and(qxs2, lowMask);
+            vector signed char qxs21 = vec_sr(qxs2, v4);
+            vector signed char qxs30 = vec_and(qxs3, lowMask);
+            vector signed char qxs31 = vec_sr(qxs3, v4);
+
+            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
+            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
+            qh += 32;
+
+            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
+            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
+            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
+            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
+            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
+            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
+            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
+            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
+
+            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
+            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
+            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
+            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
+            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
+            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
+            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
+            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y20 = vec_xl( 32, q8);
+            vector signed char q8y30 = vec_xl( 48, q8);
+            vector signed char q8y01 = vec_xl( 64, q8);
+            vector signed char q8y11 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
+            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
+            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
+            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
+            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
+            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
+            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
+            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
+
+            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
+            qs += 8;
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vector signed short vs2 = vec_splat(vscales, 2);
+            vector signed short vs3 = vec_splat(vscales, 3);
+            vector signed short vs4 = vec_splat(vscales, 4);
+            vector signed short vs5 = vec_splat(vscales, 5);
+            vector signed short vs6 = vec_splat(vscales, 6);
+            vector signed short vs7 = vec_splat(vscales, 7);
+
+            vsumi0 = vec_msum(qv00, vs0, vsumi0);
+            vsumi1 = vec_msum(qv01, vs4, vsumi1);
+            vsumi2 = vec_msum(qv10, vs1, vsumi2);
+            vsumi3 = vec_msum(qv11, vs5, vsumi3);
+            vsumi4 = vec_msum(qv20, vs2, vsumi4);
+            vsumi5 = vec_msum(qv21, vs6, vsumi5);
+            vsumi6 = vec_msum(qv30, vs3, vsumi6);
+            vsumi7 = vec_msum(qv31, vs7, vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined __loongarch_asx
+
+    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
+    const __m256i m2 = __lasx_xvreplgr2vr_b(3);
+    const __m256i m32s = __lasx_xvreplgr2vr_b(32);
+
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        const __m128i scales = __lsx_vld((const __m128i*)x[i].scales, 0);
+
+        __m256i sumi = __lasx_xvldi(0);
+
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i scale_0 = lsx_shuffle_b(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = lsx_shuffle_b(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = lsx_shuffle_b(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = lsx_shuffle_b(scales, get_scale_shuffle(is + 3));
+            is += 4;
+
+            const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+            const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
+            const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
+
+            const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(q4bitsH, m2), 4);
+            const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 2), m2), 4);
+            const __m256i q4h_2 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 4), m2), 4);
+            const __m256i q4h_3 = __lasx_xvslli_h(__lasx_xvand_v(__lasx_xvsrli_h(q4bitsH, 6), m2), 4);
+
+            const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0);
+            const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(q4bits2, m4), q4h_1);
+            const __m256i q4_2 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_2);
+            const __m256i q4_3 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits2, 4), m4), q4h_3);
+
+            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0);
+            __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1);
+            __m256i q8s_2 = lasx_maddubs_h(m32s, q8_2);
+            __m256i q8s_3 = lasx_maddubs_h(m32s, q8_3);
+
+            __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0);
+            __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1);
+            __m256i p16_2 = lasx_maddubs_h(q4_2, q8_2);
+            __m256i p16_3 = lasx_maddubs_h(q4_3, q8_3);
+
+            p16_0 = __lasx_xvsub_h(p16_0, q8s_0);
+            p16_1 = __lasx_xvsub_h(p16_1, q8s_1);
+            p16_2 = __lasx_xvsub_h(p16_2, q8s_2);
+            p16_3 = __lasx_xvsub_h(p16_3, q8s_3);
+
+            p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0);
+            p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1);
+            p16_2 = lasx_madd_h(lasx_ext8_16(scale_2), p16_2);
+            p16_3 = lasx_madd_h(lasx_ext8_16(scale_3), p16_3);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
+        }
+
+        acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const  int8_t * restrict q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * restrict a = aux8;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            }
+            a  += 128;
+            q4 += 64;
+            qh += 32;
+        }
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            int scale = x[i].scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
+#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
+static const int8_t keven_signs_q2xs[1024] = {
+     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
+     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
+     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
+     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
+     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
+     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
+     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
+     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
+     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
+     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
+     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
+     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
+     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
+     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
+     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
+     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
+     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
+     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
+     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
+     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
+     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
+     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
+     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
+     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
+     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
+     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
+     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
+     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
+     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
+     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
+     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
+     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+#endif
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_xxs * restrict x = vx;
+    const block_q8_K    * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    ggml_int8x16x4_t q2u;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        float sumf1 = 0, sumf2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
+            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
+            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
+            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
+            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >>  7) & 127))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
+            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
+            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
+            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
+            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
+            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
+            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
+        }
+        sumf += d*(sumf1 + sumf2);
+    }
+    *s = 0.25f * sumf;
+
+#elif defined(__AVX2__)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
+                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[1] >> 28;
+            const uint16_t ls2 = aux32[3] >> 28;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
+            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = aux32[1] >> 28;
+            const uint16_t ls2 = aux32[3] >> 28;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    const vector int v0 = vec_splats((int32_t)0);
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            uint32_t aux32[4];
+            const uint8_t * aux8 = (const uint8_t *)aux32;
+
+            memcpy(aux32, q2, 4*sizeof(uint32_t));
+            q2 += 8;
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = aux32[1] >> 28;
+            const uint16_t ls1 = aux32[3] >> 28;
+
+            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+
+            const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+            const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+            const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
+                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
+            const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
+            const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[1] >> 28;
+            const uint16_t ls2 = aux32[3] >> 28;
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#else
+
+    uint32_t aux32[2];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(aux32, q2, 2*sizeof(uint32_t));
+            q2 += 4;
+            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
+                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.125f * sumf;
+#endif
+}
+
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_xs * restrict x = vx;
+    const block_q8_K   * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    ggml_int8x16x4_t q2u;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
+
+    int32x4x4_t scales32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+        const uint8x8_t scales8 = vld1_u8(x[i].scales);
+        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
+        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
+        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
+        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
+        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
+        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
+        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
+        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
+        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
+        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
+        int32x4_t sumi = vdupq_n_s32(0);
+        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
+            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
+            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
+            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
+            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
+            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
+            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
+            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
+            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
+            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
+            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
+            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
+            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
+            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
+            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
+            q2 += 8;
+        }
+        sumf += d*vaddvq_s32(sumi);
+    }
+    *s = 0.125f * sumf;
+
+#elif defined(__AVX2__)
+
+    const __m256i mone = _mm256_set1_epi8(1);
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
+    const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
+    const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
+    const __m256i m511 = _mm256_set1_epi16(511);
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    uint64_t aux64;
+
+    // somewhat hacky, but gives a significant boost in performance
+    __m256i aux_gindex;
+    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        __m128i stmp = _mm_set1_epi64x(aux64);
+        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2);  q2 += 16;
+            aux_gindex = _mm256_and_si256(q2_data, m511);
+
+            const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
+            const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
+            const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
+
+            const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
+            const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
+
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+
+            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+            const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+            const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+
+            const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
+            const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
+            const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+            const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
+
+            __m256i signs;
+            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
+
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const __m256i dot3  = _mm256_maddubs_epi16(q2_3, q8s_3);
+            const __m256i dot4  = _mm256_maddubs_epi16(q2_4, q8s_4);
+
+            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
+            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
+            const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
+            const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
+
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    const __m128i mone = _mm_set1_epi8(1);
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+    const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+    const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+    const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+    const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+    const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+    const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+    const __m128i m511 = _mm_set1_epi16(511);
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    uint64_t aux64;
+
+    // somewhat hacky, but gives a significant boost in performance
+    __m256i aux_gindex;
+    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        __m128i stmp = _mm_set1_epi64x(aux64);
+        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+            const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1);  q2 += 16;
+            aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+
+            const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+            const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+            const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+            const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+            const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+            const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+
+            const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+            const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+            const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+            const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+            const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+            const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+            const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+            const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+            // AVX2 full_signs_1 is full_sign_bits_0 here
+            // AVX2 full_signs_2 is full_sign_bits_1 here
+            __m128i signs_0, signs_1;
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+            signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+            signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+            signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+            signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+            const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+            const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const __m128i dot3_0  = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+            const __m128i dot3_1  = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+            const __m128i dot4_0  = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+            const __m128i dot4_1  = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+
+            __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+            const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+            const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+            const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+            sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+            const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+            const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__loongarch_asx)
+
+    const __m256i mone = __lasx_xvreplgr2vr_b(1);
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0);
+    const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0);
+    const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0);
+
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0);
+    const __m256i m511 = __lasx_xvreplgr2vr_h(511);
+    const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
+    const __m128i m1 = __lsx_vreplgr2vr_b(1);
+
+    uint64_t aux64;
+
+    // somewhat hacky, but gives a significant boost in performance
+    __m256i aux_gindex;
+    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        __m128i stmp = __lsx_vreplgr2vr_d(aux64);
+        stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4));
+        const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1);
+
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0);  q2 += 16;
+            aux_gindex = __lasx_xvand_v(q2_data, m511);
+
+            const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9);
+            const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13);
+            const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper);
+
+            const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting);
+            const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits);
+
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+
+            const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+            const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+            const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+            const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+
+            const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0);
+            const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1);
+            const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l);
+            const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h);
+
+            __m256i signs;
+            signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1);
+
+            signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2);
+
+            signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3);
+
+            signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2);
+            signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4);
+
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const __m256i dot3  = lasx_maddubs_h(q2_3, q8s_3);
+            const __m256i dot4  = lasx_maddubs_h(q2_4, q8s_4);
+
+            const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0)));
+            const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1)));
+            const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2)));
+            const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3)));
+
+            sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1));
+            sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2));
+            sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3));
+            sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4));
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+#elif defined(__POWER9_VECTOR__)
+    const vector int v0 = vec_splats((int32_t)0);
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const uint8_t  * restrict sc = x[i].scales;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
+            q2 += 8;
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
+            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
+            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
+            sc += 2;
+
+            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
+            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
+            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
+
+            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
+#else
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint16_t * restrict q2 = x[i].qs;
+        const uint8_t  * restrict sc = x[i].scales;
+        const int8_t   * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
+            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 2; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
+                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls1;
+            sumi = 0;
+            for (int l = 2; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
+                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += sumi * ls2;
+            q2 += 4;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.125f * sumf;
+#endif
+}
+
+void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq2_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
+    const uint8x16_t        mask2 = vld1q_u8(k_mask2);
+    const uint8x16_t m1 = vdupq_n_u8(1);
+    const int32x4_t vzero = vdupq_n_s32(0);
+
+    uint8x16x2_t vs;
+    ggml_int8x16x4_t q2s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300)))));
+            q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300)))));
+            q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300)))));
+            q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))),
+                                     vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300)))));
+            qs += 8;
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vceqq_u8(vs.val[0], mask2);
+            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+
+            q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]);
+            q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]);
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vceqq_u8(vs.val[0], mask2);
+            vs.val[1] = vceqq_u8(vs.val[1], mask2);
+
+            signs += 4;
+
+            q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]);
+            q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]);
+
+            const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]);
+            const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]);
+            const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]);
+            const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]);
+
+            sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf));
+            sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >>  4));
+            sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf));
+            sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >>  4));
+        }
+        sumf += d*(sumi1 + sumi2);
+    }
+
+    *s = 0.125f * sumf;
+
+#elif defined(__AVX2__)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+    uint64_t aux64;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            qs += 8;
+
+            __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
+
+            aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    uint64_t aux64;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+        const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
+        const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                  iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                  iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                  iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
+            qs += 8;
+
+            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    const vector int v0 = vec_splats((int32_t)0);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
+    const vector unsigned char mask1 = vec_xl(16, k_mask1);
+    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t *  restrict q2 = x[i].qs;
+        const uint8_t *  restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const uint8_t *  restrict sc = x[i].scales;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
+            q2 += 8;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
+            vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
+            vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
+            vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
+            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
+            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
+            sc += 2;
+
+            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
+            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
+            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
+
+            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+
+    const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
+    const __m128i m1 = __lsx_vreplgr2vr_b(1);
+
+    const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
+    const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
+    uint64_t aux64;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t  * restrict q8 = y[i].qs;
+
+        __m128i tmp1;
+        memcpy(&aux64, x[i].scales, 8);
+        tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0);
+        tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1);
+        const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1);
+        const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                   iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+                                                   iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                   iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                   iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+                                                   iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                   iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            qs += 8;
+
+            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+            aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+
+            const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0)));
+            const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1)));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
+#else
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const int8_t  * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = qs + QK_K/8;
+
+        int bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
+            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
+            int sumi1 = 0, sumi2 = 0;
+            for (int l = 0; l < 2; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            for (int l = 2; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            bsum += ls1 * sumi1 + ls2 * sumi2;
+            qs += 4;
+            signs += 4;
+        }
+
+        sumf += d * bsum;
+    }
+
+    *s = 0.125f * sumf;
+
+#endif
+
+}
+
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_xxs * restrict x = vx;
+    const block_q8_K    * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    ggml_int8x16x4_t q3s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t   * restrict q8 = y[i].qs;
+        float sumf1 = 0, sumf2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
+            q3 += 16;
+            q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >>  7) & 127))));
+            q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
+            q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
+            q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
+            q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
+            q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
+            q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
+            q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
+            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
+            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
+        }
+        sumf += d*(sumf1 + sumf2);
+    }
+    *s = 0.5f * sumf;
+
+#elif defined(__AVX2__)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+            q3 += 8;
+            const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
+            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    const vector int v0 = vec_splats((int32_t)0);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
+        const int8_t  * restrict q8 = y[i].qs;
+
+#pragma GCC unroll 1
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
+            vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
+            vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
+            vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            q3 += 16;
+
+            vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >>  0) & 127]), (uint64_t)(signs64[(signs[0] >>  7) & 127])};
+            vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
+            vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >>  0) & 127]), (uint64_t)(signs64[(signs[1] >>  7) & 127])};
+            vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
+
+            vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
+            vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
+            vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
+            vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
+            const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
+            signs += 2;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.25f * vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+
+            const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
+            const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
+            const __m256i dot1  = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#else
+
+    uint32_t aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
+            const uint32_t ls = 2*(aux32 >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
+                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
+                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            q3 += 8;
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.25f * sumf;
+#endif
+}
+
+void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    typedef union {
+        uint16x8_t vec_index;
+        uint16_t   index[8];
+    } vec_index_t;
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
+
+    const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1);
+    const uint8x16_t        mask2 = vld1q_u8(k_mask2);
+
+    const int16x8_t  hshift = vld1q_s16(k_shift);
+    const uint16x8_t m256   = vdupq_n_u16(256);
+    const uint8x16_t m1     = vdupq_n_u8(1);
+
+    uint8x16x2_t vs;
+    ggml_int8x16x4_t q3s;
+    ggml_int8x16x4_t q8b;
+    vec_index_t idx;
+
+    uint32_t scales32[2];
+    const uint8_t * scales8 = (const uint8_t *)scales32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t   * restrict q8 = y[i].qs;
+
+        memcpy(scales32, x[i].scales, 4);
+        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
+        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
+            idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                                        iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                                        iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
+            idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                                        iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                                        iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
+
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
+            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
+
+            q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0));
+            q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1));
+
+            vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16)));
+            vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2);
+            vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2);
+            vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1);
+            vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1);
+
+            signs += 4;
+
+            q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2));
+            q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3));
+
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
+
+            sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0];
+            sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4];
+        }
+        sumf += d*(sumi1 + sumi2);
+    }
+    *s = sumf;
+
+#elif defined(__AVX2__)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    const __m256i idx_mask  = _mm256_set1_epi32(256);
+
+    typedef union {
+        __m256i  vec[2];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
+            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
+            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
+            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
+
+            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
+
+            aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
+    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
+    const __m128i idx_mask  = _mm_set1_epi32(256);
+
+    typedef union {
+        __m128i  vec[4];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
+            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
+            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
+            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = idx.vec[0];
+            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
+            idx.vec[3] = idx.vec[2];
+
+            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
+            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
+            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
+            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
+
+            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
+            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
+            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
+            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
+
+            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
+            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
+
+            __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
+#elif defined(__POWER9_VECTOR__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    const vector int v0 = vec_splats((int32_t)0);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
+    const vector unsigned char mask1 = vec_xl(16, k_mask1);
+    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        const uint8_t *  restrict q3 = x[i].qs;
+        const uint8_t *  restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
+        const uint8_t *  restrict sc = x[i].scales;
+        const int8_t  *  restrict q8 = y[i].qs;
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
+                                             iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
+            vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
+                                             iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
+            vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
+                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
+            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
+                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
+            q3 += 16;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
+            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
+            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
+            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
+            sc ++;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+   static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                       0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+   };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
+    const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
+
+    __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8);
+    const __m256i idx_mask  = __lasx_xvreplgr2vr_w(256);
+
+    typedef union {
+        __m256i  vec[2];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = (__m256)__lasx_xvldi(0);
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16;
+            idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]);
+            idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
+            idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
+            idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
+
+            // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = lasx_set_w(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = lasx_set_w(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+            aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2  = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = hsum_float_8(accumf);
+
+#else
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint8_t * restrict signs = x[i].signs;
+        const int8_t  * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
+            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            qs += 8;
+            signs += 4;
+            bsum += sumi * ls1;
+            sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            qs += 8;
+            signs += 4;
+            bsum += sumi * ls2;
+        }
+        sumf += d * bsum;
+    }
+    *s = sumf;
+#endif
+}
+
+#if defined(__AVX2__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+#elif defined(__loongarch_asx)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = __lasx_xvsigncov_b(x, x);
+    const __m256i sy = __lasx_xvsigncov_b(x, y);
+    __m256i tmp1, tmp2, tmp3;
+    tmp1 = __lasx_xvmulwev_h_bu_b(ax, sy);
+    tmp2 = __lasx_xvmulwod_h_bu_b(ax, sy);
+    tmp3 = __lasx_xvadd_h(tmp1, tmp2);
+    return __lasx_xvsat_h(tmp3, 15);
+}
+#endif
+
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_s * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+
+    ggml_int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        int sumi1 = 0, sumi2 = 0, sumi3 = 0;
+
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700)))));
+            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700)))));
+            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700)))));
+            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700)))));
+            qs += 8;
+
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);
+
+            const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            sumi1 += vaddvq_s32(p1) * ls1;
+            sumi2 += vaddvq_s32(p2) * ls2;
+            sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);
+
+        }
+
+        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    __m256 accum = _mm256_setzero_ps();
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m256i sumi = _mm256_setzero_si256();
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
+                                                    iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
+                                                    iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+            qs += 8;
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
+        accum1 += d * sumi1;
+
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#elif defined __AVX__
+    __m256 accum = _mm256_setzero_ps();
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+            const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
+            const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+            const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
+            qs += 8;
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
+        accum1 += d * sumi1;
+
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#elif defined(__POWER9_VECTOR__)
+    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
+    const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi8 = vec_splats((int32_t)0);
+
+        const uint8_t  * restrict q1 = x[i].qs;
+        const uint16_t * restrict qh = x[i].qh;
+        const int8_t   * restrict q8 = y[i].qs;
+        const int16_t  * restrict qs = y[i].bsums;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q1, 0, 1);
+            __builtin_prefetch(qh, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
+            q1 += 8;
+
+            vector signed char q1x0 = (vector signed char)aux64x2_0;
+            vector signed char q1x1 = (vector signed char)aux64x2_1;
+            vector signed char q1x2 = (vector signed char)aux64x2_2;
+            vector signed char q1x3 = (vector signed char)aux64x2_3;
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
+            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+            vector signed short vscales = vec_sld(vscales23, vscales01, 8);
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+
+            vector signed short q8ysums = vec_xl_len(qs, 8);
+            qs += 4;
+            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);
+
+            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
+            qh += 2;
+            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);
+
+            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
+
+            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    __m256 accum = (__m256)__lasx_xvldi(0);
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m256i sumi = __lasx_xvldi(0);
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0);
+            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1);
+            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2);
+            q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3);
+
+            __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0);
+            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1);
+            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2);
+            q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3);
+
+            qs += 8;
+            const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+            const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+
+            __m256i tmp1, tmp5, tmp6;
+            tmp1 = __lasx_xvreplgr2vr_h(ls1);
+            tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1);
+            const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6);
+
+            tmp1 = __lasx_xvreplgr2vr_h(ls2);
+            tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1);
+            const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6);
+
+            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
+        accum1 += d * sumi1;
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#else
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        int sumi = 0, sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
+            const int delta = qh[ib] & 0x8000 ? -1 : 1;
+            int lsum = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
+                for (int j = 0; j < 8; ++j) {
+                    lsum += q8[j] * grid[j];
+                }
+                q8 += 8;
+            }
+            sumi  += ls * lsum;
+            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
+            qs += 4;
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+    }
+
+    *s = sumf;
+
+#endif
+}
+
+void ggml_vec_dot_iq1_m_q8_K  (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq1_m * restrict x = vx;
+    const block_q8_K  * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+    iq1m_scale_t scale;
+
+#if defined __ARM_NEON
+    const int32x4_t mask  = vdupq_n_s32(0x7);
+    const int32x4_t mone  = vdupq_n_s32(1);
+    const int32x4_t mzero = vdupq_n_s32(0);
+
+    ggml_int8x16x4_t deltas;
+    deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1));
+    deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1));
+    deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1));
+    deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1));
+
+    ggml_int8x16x4_t q1b;
+    ggml_int8x16x4_t q8b;
+
+    uint32_t aux32;
+    const uint8_t * aux8 = (const uint8_t *)&aux32;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        int32x4_t sumi1 = mzero;
+        int32x4_t sumi2 = mzero;
+
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+
+            q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700)))));
+            q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700)))));
+            q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700)))));
+            q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))),
+                                     vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700)))));
+
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1]));
+            const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
+            const int32x4_t p12 = vpaddq_s32(p1, p2);
+
+            const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
+            aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
+
+            const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1]));
+            const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3]));
+            const int32x4_t p34 = vpaddq_s32(p3, p4);
+
+            int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9);
+
+            scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone);
+
+            sumi1 = vmlaq_s32(sumi1, scales_4, p12);
+            sumi2 = vmlaq_s32(sumi2, scales_4, p34);
+
+            qs += 8; qh += 4;
+
+        }
+
+        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m256i mask = _mm256_set1_epi16(0x7);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m256i q1b_1 = _mm256_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+            );
+            const __m256i q1b_2 = _mm256_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+            );
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+
+            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
+            const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+
+            __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
+            __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+
+            scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
+            scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+            qs += 8; qh += 4;
+        }
+
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+
+        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#elif defined __AVX__
+    const __m128i mask = _mm_set1_epi16(0x7);
+    const __m128i mone = _mm_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q1b_1_0 = _mm_set_epi64x(
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
+            const __m128i q1b_1_1 = _mm_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
+            const __m128i q1b_2_0 = _mm_set_epi64x(
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
+            const __m128i q1b_2_1 = _mm_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+
+            const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
+            const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
+            const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
+            const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
+
+            __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
+            __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
+            __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
+            __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
+
+            scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
+            scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
+            scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
+            scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
+            const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
+            const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
+            const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
+            const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
+
+            qs += 8; qh += 4;
+        }
+
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+
+        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
+        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#else
+
+    int sum1[2], sum2[2], delta[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; i++) {
+
+        const int8_t   * q8 = y[i].qs;
+        const uint8_t  * qs = x[i].qs;
+        const uint8_t  * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            delta[0] = qh[0] & 0x08 ? -1 : 1;
+            delta[1] = qh[0] & 0x80 ? -1 : 1;
+            delta[2] = qh[1] & 0x08 ? -1 : 1;
+            delta[3] = qh[1] & 0x80 ? -1 : 1;
+            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
+                int lsum1 = 0, lsum2 = 0;
+                for (int j = 0; j < 8; ++j) {
+                    lsum1 += q8[j] * grid[j];
+                    lsum2 += q8[j];
+                }
+                q8 += 8;
+                sum1[l/2] += lsum1;
+                sum2[l/2] += lsum2*delta[l];
+            }
+
+            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
+            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
+
+            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
+            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
+            qs += 4;
+            qh += 2;
+        }
+
+        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+    }
+
+    *s = sumf;
+
+#endif
+}
+
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK4_NL == 0);
+    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
+
+    const block_iq4_nl * restrict x = vx;
+    const block_q8_0   * restrict y = vy;
+
+    const int nb = n / QK4_NL;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t q4b;
+    int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    for (; ib + 1 < nb; ib += 2) {
+
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0]    = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1]    = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2]    = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3]    = vld1q_s8(y[ib + 1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+    }
+
+#elif defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
+        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
+        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+                _mm256_cvtepi32_ps(p_1), accum1);
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+                _mm256_cvtepi32_ps(p_2), accum2);
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+#pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
+        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi1 = vec_sum4s(qv1, vsumi1);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    sumf = vec_extract(vsumf0, 0);
+
+#elif defined (__loongarch_asx)
+
+    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+    const __m128i m4b  = __lsx_vreplgr2vr_b(0x0f);
+    const __m256i mone = __lasx_xvreplgr2vr_h(1);
+
+    __m256 accum1 = (__m256)__lasx_xvldi(0);
+    __m256 accum2 = (__m256)__lasx_xvldi(0);
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0);
+        const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0);
+        const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0);
+        const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0);
+        const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)),
+                                              lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b)));
+        const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)),
+                                              lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = lasx_madd_h(p16_1, mone);
+        const __m256i p_2 = lasx_madd_h(p16_2, mone);
+        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+                __lasx_xvffint_s_w(p_1), accum1);
+        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+                __lasx_xvffint_s_w(p_2), accum2);
+    }
+
+    sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        int sumi1 = 0, sumi2 = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            sumi1 += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * restrict x = vx;
+    const block_q8_K   * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    ggml_uint8x16x2_t q4bits;
+    ggml_int8x16x4_t q4b;
+    ggml_int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        const int8_t  * q8 = y[ibl].qs;
+        const uint8_t * q4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/64; ++ib) {
+
+            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
+            q8b    = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;
+            h >>= 4;
+            sumi1 += vaddvq_s32(prod_1) * ls1;
+            sumi2 += vaddvq_s32(prod_2) * ls2;
+
+        }
+
+        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+
+#elif defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
+            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs);  qs += 16;
+            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1));
+            const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
+            sumi1 = _mm256_add_epi32(p_1, sumi1);
+            sumi2 = _mm256_add_epi32(p_2, sumi2);
+        }
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+            const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+            const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+            const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+            const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+            const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+            const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+            const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
+            const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
+            const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
+            const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
+            sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
+            sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
+            sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
+            sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
+        }
+        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
+        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+        vector float vyd = vec_splats(y[ibl].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;
+
+        uint16_t h = x[ibl].scales_h;
+
+        const uint8_t * restrict q4 = x[ibl].qs;
+        const uint8_t * restrict sc = x[ibl].scales_l;
+        const int8_t  * restrict q8 = y[ibl].qs;
+
+        for (int ib = 0; ib < QK_K/64; ib ++ ) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            q4 += 32;
+
+            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
+            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
+            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
+            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
+
+            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
+            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
+            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
+            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
+            const uint16_t ls1 = (uint16_t)(((sc[0] >>  4) | ((h << 2) & 0x30)) - 32);
+            h >>= 4;
+            sc ++;
+
+            vector signed short vscales01 = vec_splats((int16_t)ls0);
+            vector signed short vscales23 = vec_splats((int16_t)ls1);
+
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
+#elif defined(__loongarch_asx)
+
+    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+    const __m128i m4b  = __lsx_vreplgr2vr_b(0x0f);
+
+    __m256 accum = (__m256)__lasx_xvldi(0);
+    __m256i tmp1;
+    __m128i tmp0, tmp2, tmp3, tmp4, mask_8f, mask;
+
+    mask_8f = __lsx_vreplgr2vr_b(0x8f);
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m256i sumi1 = __lasx_xvldi(0);
+        __m256i sumi2 = __lasx_xvldi(0);
+        __m128i zero = __lsx_vldi(0);
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0);  qs += 16;
+            const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0);  qs += 16;
+            const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
+            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp3 = __lsx_vand_v(tmp0, mask);
+            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
+
+            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_1, m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp4 = __lsx_vand_v(tmp0, mask);
+            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
+
+            const __m256i q4b_1 = lasx_insertf128(tmp3, tmp4);
+
+            tmp2 = __lsx_vand_v(__lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp3 = __lsx_vand_v(tmp0, mask);
+            tmp3 = __lsx_vshuf_b(values128, zero, tmp3);
+
+            tmp2 = __lsx_vand_v(__lsx_vand_v(q4bits_2, m4b), mask_8f);
+            tmp0 = __lsx_vori_b(tmp2, 0x10);
+            mask = __lsx_vsle_b(zero, tmp2);
+            tmp4 = __lsx_vand_v(tmp0, mask);
+            tmp4 = __lsx_vshuf_b(values128, zero, tmp4);
+
+            const __m256i q4b_2 = lasx_insertf128(tmp3, tmp4);
+
+            const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+            const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            __m256i tmp5, tmp6;
+            tmp1 = __lasx_xvreplgr2vr_h(ls1);
+            tmp5 = __lasx_xvmulwev_w_h(p16_1, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(p16_1, tmp1);
+            const __m256i p_1 = __lasx_xvadd_w(tmp5, tmp6);
+            tmp1 = __lasx_xvreplgr2vr_h(ls2);
+            tmp5 = __lasx_xvmulwev_w_h(p16_2, tmp1);
+            tmp6 = __lasx_xvmulwod_w_h(p16_2, tmp1);
+            const __m256i p_2 = __lasx_xvadd_w(tmp5, tmp6);
+            sumi1 = __lasx_xvadd_w(p_1, sumi1);
+            sumi2 = __lasx_xvadd_w(p_2, sumi2);
+        }
+        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
+#else
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
+// ============================ 4-bit non-linear quants
+
+void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
+    assert(k % QK4_NL == 0);
+    quantize_row_iq4_nl_ref(x, y, k);
+}
+
+void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq4_xs(x, y, 1, k, NULL);
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/ggml-cpu-quants.h
new file mode 100644
index 000000000..e33d9d473
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML CPU internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+// Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
new file mode 100644
index 000000000..62a0712da
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
@@ -0,0 +1,36 @@
+#include "ggml-cpu-traits.h"
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+namespace ggml::cpu {
+tensor_traits::~tensor_traits() {}
+
+extra_buffer_type::~extra_buffer_type() {}
+}  // namespace ggml::cpu
+
+bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra->context) {
+            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
+            auto tensor_traits = buf_extra->get_tensor_traits(op);
+            if (tensor_traits && tensor_traits->compute_forward(params, op)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra->context) {
+            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
+            auto tensor_traits = buf_extra->get_tensor_traits(op);
+            if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/ggml-cpu-traits.h
new file mode 100644
index 000000000..99a6186b1
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h
@@ -0,0 +1,38 @@
+#pragma once
+#include "ggml-backend-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+#    include <vector>
+extern "C" {
+#endif
+
+// return true if op part of extra "accelerator"
+bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
+bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
+
+#ifdef __cplusplus
+}
+
+namespace ggml::cpu {
+// register in tensor->extra
+class tensor_traits {
+  public:
+    virtual ~tensor_traits();
+    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size)        = 0;
+    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
+};
+
+class extra_buffer_type {
+  public:
+    virtual ~extra_buffer_type();
+    virtual bool            supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
+    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op)                   = 0;
+};
+}  // namespace ggml::cpu
+
+// implemented in ggml-cpu.cpp.
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+
+#endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
new file mode 100644
index 000000000..fdb430a43
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -0,0 +1,14390 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
+#define _USE_MATH_DEFINES // For M_PI on MSVC
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include "ggml-quants.h"
+#include "ggml-cpu-quants.h"
+#include "ggml-threading.h"
+#include "amx/amx.h"
+#include "ggml.h"
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif
+
+#include <assert.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <float.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <signal.h>
+#if defined(__gnu_linux__)
+#include <syscall.h>
+#endif
+
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
+#undef GGML_USE_LLAMAFILE
+#endif
+
+#ifdef GGML_USE_LLAMAFILE
+#include "llamafile/sgemm.h"
+#endif
+
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+
+// disable POSIX deprecation warnings
+// these functions are never going away, anyway
+#pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after GGML_ABORT
+#pragma warning(disable: 4702)
+#endif
+
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE  64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else  // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
+#define UNUSED GGML_UNUSED
+#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+#endif
+
+// floating point type used to accumulate sums
+typedef double ggml_float;
+
+#define GGML_GELU_FP16
+#define GGML_GELU_QUICK_FP16
+
+#define GGML_SOFT_MAX_UNROLL 4
+#define GGML_VEC_DOT_UNROLL  2
+#define GGML_VEC_MAD_UNROLL  32
+
+//
+// global data
+//
+
+// precomputed gelu table for f16 (128 KB)
+static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
+
+// precomputed quick gelu table for f16 (128 KB)
+static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
+
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type {
+    int has_neon;
+    int has_dotprod;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
+#endif
+
+
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
+typedef volatile LONG atomic_int;
+typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
+
+static void atomic_store(atomic_int * ptr, LONG val) {
+    InterlockedExchange(ptr, val);
+}
+static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
+    // TODO: add support for explicit memory order
+    InterlockedExchange(ptr, val);
+}
+static LONG atomic_load(atomic_int * ptr) {
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
+    return InterlockedExchangeAdd(ptr, inc);
+}
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedExchangeAdd(ptr, inc);
+}
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
+static void atomic_thread_fence(memory_order mo) {
+    MemoryBarrier();
+}
+#else // clang
+#include <stdatomic.h>
+#endif
+
+typedef HANDLE pthread_t;
+
+typedef DWORD thread_ret_t;
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
+    (void) unused;
+    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
+    if (handle == NULL)
+    {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void * unused) {
+    (void) unused;
+    int ret = (int) WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+    return ret;
+}
+
+static int sched_yield (void) {
+    Sleep (0);
+    return 0;
+}
+#else
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <sched.h>
+#if defined(__FreeBSD__)
+#include <pthread_np.h>
+#endif
+
+typedef void * thread_ret_t;
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#endif
+
+typedef pthread_t ggml_thread_t;
+
+#if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
+#include <TargetConditionals.h>
+#endif
+
+//
+// cache line
+//
+
+#if defined(__cpp_lib_hardware_interference_size)
+#define CACHE_LINE_SIZE hardware_destructive_interference_size
+#else
+#if defined(__POWER9_VECTOR__)
+#define CACHE_LINE_SIZE 128
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+#endif
+
+static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
+
+
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+
+static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32] = {
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
+        .vec_dot_type             = GGML_TYPE_F32,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_F16] = {
+        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
+        .vec_dot_type             = GGML_TYPE_F16,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q4_0] = {
+        .from_float               = quantize_row_q4_0,
+        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
+        .nrows                    = 1,
+#endif
+    },
+    [GGML_TYPE_Q4_1] = {
+        .from_float               = quantize_row_q4_1,
+        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
+        .nrows                    = 1,
+#endif
+    },
+    [GGML_TYPE_Q5_0] = {
+        .from_float               = quantize_row_q5_0,
+        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q5_1] = {
+        .from_float               = quantize_row_q5_1,
+        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q8_0] = {
+        .from_float               = quantize_row_q8_0,
+        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
+        .nrows                    = 1,
+#endif
+    },
+    [GGML_TYPE_Q8_1] = {
+        .from_float               = quantize_row_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q2_K] = {
+        .from_float               = quantize_row_q2_K,
+        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .from_float               = quantize_row_q3_K,
+        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .from_float               = quantize_row_q4_K,
+        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .from_float               = quantize_row_q5_K,
+        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .from_float               = quantize_row_q6_K,
+        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ2_XXS] = {
+        .from_float               = NULL,
+        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .from_float               = NULL,
+        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ3_XXS] = {
+        // NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
+        //.from_float               = quantize_row_iq3_xxs,
+        .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ3_S] = {
+        //.from_float               = quantize_row_iq3_s,
+        .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ2_S] = {
+        //.from_float               = quantize_row_iq2_s,
+        .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ1_S] = {
+        .from_float               = NULL,
+        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ1_M] = {
+        .from_float               = NULL,
+        .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ4_NL] = {
+        .from_float               = quantize_row_iq4_nl,
+        .vec_dot                  = ggml_vec_dot_iq4_nl_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_IQ4_XS] = {
+        .from_float               = quantize_row_iq4_xs,
+        .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_Q8_K] = {
+        .from_float               = quantize_row_q8_K,
+    },
+    [GGML_TYPE_BF16] = {
+        .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
+        .vec_dot_type             = GGML_TYPE_BF16,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TQ1_0] = {
+        .from_float               = quantize_row_tq1_0,
+        .vec_dot                  = ggml_vec_dot_tq1_0_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TQ2_0] = {
+        .from_float               = quantize_row_tq2_0,
+        .vec_dot                  = ggml_vec_dot_tq2_0_q8_K,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+    },
+};
+
+const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
+    return &type_traits_cpu[type];
+}
+
+//
+// simd mappings
+//
+
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for new architectures requires to define the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+//   number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+//   number of elements to fit in a single register
+//
+
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 NEON
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              float32x4_t
+#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
+#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
+#define GGML_F32x4_LOAD         vld1q_f32
+#define GGML_F32x4_STORE        vst1q_f32
+#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32x4_ADD          vaddq_f32
+#define GGML_F32x4_MUL          vmulq_f32
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
+#define GGML_F32x4_REDUCE(res, x)                       \
+{                                                       \
+    int offset = GGML_F32_ARR >> 1;                     \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    offset >>= 1;                                       \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    offset >>= 1;                                       \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define GGML_F16_STEP 32
+    #define GGML_F16_EPR  8
+
+    #define GGML_F16x8              float16x8_t
+    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x))
+    #define GGML_F16x8_STORE        vst1q_f16
+    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define GGML_F16x8_ADD          vaddq_f16
+    #define GGML_F16x8_MUL          vmulq_f16
+    #define GGML_F16x8_REDUCE(res, x)                               \
+    do {                                                            \
+        int offset = GGML_F16_ARR >> 1;                             \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    } while (0)
+
+    #define GGML_F16_VEC                GGML_F16x8
+    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
+    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
+    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
+    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
+    #define GGML_F16_STEP 16
+    #define GGML_F16_EPR  4
+
+    #define GGML_F32Cx4              float32x4_t
+    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
+    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define GGML_F32Cx4_ADD          vaddq_f32
+    #define GGML_F32Cx4_MUL          vmulq_f32
+    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
+
+    #define GGML_F16_VEC                GGML_F32Cx4
+    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
+    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                                    \
+do {                                                                  \
+    int offset = GGML_F32_ARR >> 1;                                   \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)                               \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+#elif defined(__AVX__)
+
+#define GGML_SIMD
+
+// F32 AVX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    _mm256_setzero_ps()
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32x8_LOAD    _mm256_loadu_ps
+#define GGML_F32x8_STORE   _mm256_storeu_ps
+#if defined(__FMA__)
+    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
+#endif
+#define GGML_F32x8_ADD     _mm256_add_ps
+#define GGML_F32x8_MUL     _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
+                                 _mm256_extractf128_ps(x[0], 1)); \
+    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+
+#define GGML_F32Cx8             __m256
+#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
+
+#if defined(__F16C__)
+// the  _mm256_cvt intrinsics require F16C
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#else
+static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+    float arr[8];
+
+    _mm256_storeu_ps(arr, y);
+
+    for (int i = 0; i < 8; i++)
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+}
+#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
+#endif
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         _mm256_add_ps
+#define GGML_F32Cx8_MUL         _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_SIMD
+
+// F32 POWER9
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(res, x)              \
+{                                              \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    res = vec_extract(x[0], 0) +               \
+          vec_extract(x[0], 1) +               \
+          vec_extract(x[0], 2) +               \
+          vec_extract(x[0], 3);                \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 POWER9
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
+  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+  vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
+#define GGML_F16_VEC_STORE(p, r, i)                             \
+  if (i & 0x1)                                                  \
+    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
+                                   r[i - GGML_ENDIAN_BYTE(0)]), \
+            0, p - GGML_F16_EPR)
+
+#elif defined(__wasm_simd128__)
+
+#define GGML_SIMD
+
+// F32 WASM
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              v128_t
+#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
+#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
+#define GGML_F32x4_LOAD         wasm_v128_load
+#define GGML_F32x4_STORE        wasm_v128_store
+#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
+#define GGML_F32x4_ADD          wasm_f32x4_add
+#define GGML_F32x4_MUL          wasm_f32x4_mul
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    res = wasm_f32x4_extract_lane(x[0], 0) +       \
+          wasm_f32x4_extract_lane(x[0], 1) +       \
+          wasm_f32x4_extract_lane(x[0], 2) +       \
+          wasm_f32x4_extract_lane(x[0], 3);        \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 WASM
+
+#define GGML_F16_STEP 16
+#define GGML_F16_EPR  4
+
+inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(p[0]);
+    tmp[1] = GGML_FP16_TO_FP32(p[1]);
+    tmp[2] = GGML_FP16_TO_FP32(p[2]);
+    tmp[3] = GGML_FP16_TO_FP32(p[3]);
+
+    return wasm_v128_load(tmp);
+}
+
+inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
+    float tmp[4];
+
+    wasm_v128_store(tmp, x);
+
+    p[0] = GGML_FP32_TO_FP16(tmp[0]);
+    p[1] = GGML_FP32_TO_FP16(tmp[1]);
+    p[2] = GGML_FP32_TO_FP16(tmp[2]);
+    p[3] = GGML_FP32_TO_FP16(tmp[3]);
+}
+
+#define GGML_F16x4             v128_t
+#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
+#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
+#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
+#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
+#define GGML_F16x4_FMA         GGML_F32x4_FMA
+#define GGML_F16x4_ADD         wasm_f32x4_add
+#define GGML_F16x4_MUL         wasm_f32x4_mul
+#define GGML_F16x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F16_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    res = wasm_f32x4_extract_lane(x[0], 0) +       \
+          wasm_f32x4_extract_lane(x[0], 1) +       \
+          wasm_f32x4_extract_lane(x[0], 2) +       \
+          wasm_f32x4_extract_lane(x[0], 3);        \
+}
+
+#define GGML_F16_VEC                GGML_F16x4
+#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
+#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
+#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
+
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD     _mm_add_ps
+#define GGML_F32x4_MUL     _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                                 \
+{                                                                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
+#elif defined(__loongarch_asx)
+
+#define GGML_SIMD
+
+// F32 LASX
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
+#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
+#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
+#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
+#define GGML_F32x8_ADD     __lasx_xvfadd_s
+#define GGML_F32x8_MUL     __lasx_xvfmul_s
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    float *tmp_p = (float *)&x[0]; \
+    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 LASX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by LASX, so we use F32 instead
+
+#define GGML_F32Cx8          __m256
+#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
+}
+
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
+}
+#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
+#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__loongarch_sx)
+
+#define GGML_SIMD
+
+// F32 LSX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    __lsx_vldi(0)
+#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
+#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
+#define GGML_F32x4_ADD     __lsx_vfadd_s
+#define GGML_F32x4_MUL     __lsx_vfmul_s
+#define GGML_F32x4_REDUCE(res, x)                                                     \
+{                                                                                     \
+    int offset = GGML_F32_ARR >> 1;                                                   \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
+    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 LSX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return __lsx_vld(tmp, 0);
+}
+
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
+    float arr[4];
+
+    __lsx_vst(y, arr, 0);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         __lsx_vfadd_s
+#define GGML_F32Cx4_MUL         __lsx_vfmul_s
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
+#endif
+
+// GGML_F32_ARR / GGML_F16_ARR
+//   number of registers to use per step
+#ifdef GGML_SIMD
+#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
+#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
+#endif
+
+//
+// Threading defs
+//
+
+typedef pthread_t          ggml_thread_t;
+
+#if defined(_WIN32)
+
+typedef CONDITION_VARIABLE ggml_cond_t;
+typedef SRWLOCK            ggml_mutex_t;
+
+#define ggml_mutex_init(m)   InitializeSRWLock(m)
+#define ggml_mutex_destroy(m)
+#define ggml_mutex_lock(m)   AcquireSRWLockExclusive(m)
+#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
+#define ggml_mutex_lock_shared(m)   AcquireSRWLockShared(m)
+#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
+
+#define ggml_cond_init(c)    InitializeConditionVariable(c)
+#define ggml_cond_destroy(c)
+#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
+#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#else
+
+typedef pthread_cond_t     ggml_cond_t;
+typedef pthread_mutex_t    ggml_mutex_t;
+
+#define ggml_mutex_init(m)          pthread_mutex_init(m, NULL)
+#define ggml_mutex_destroy(m)       pthread_mutex_destroy(m)
+#define ggml_mutex_lock(m)          pthread_mutex_lock(m)
+#define ggml_mutex_unlock(m)        pthread_mutex_unlock(m)
+#define ggml_mutex_lock_shared(m)   pthread_mutex_lock(m)
+#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
+
+#define ggml_lock_init(x)    UNUSED(x)
+#define ggml_lock_destroy(x) UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x)    _mm_pause()
+#else
+#define ggml_lock_lock(x)    UNUSED(x)
+#endif
+#define ggml_lock_unlock(x)  UNUSED(x)
+
+#define GGML_LOCK_INITIALIZER 0
+#define ggml_cond_init(c)      pthread_cond_init(c, NULL)
+#define ggml_cond_destroy(c)   pthread_cond_destroy(c)
+#define ggml_cond_wait(c, m)   pthread_cond_wait(c, m)
+#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#endif
+
+// Threadpool def
+struct ggml_threadpool {
+    ggml_mutex_t mutex;       // mutex for cond.var
+    ggml_cond_t  cond;        // cond.var for waiting for new work
+
+    struct ggml_cgraph * cgraph;
+    struct ggml_cplan  * cplan;
+
+    // synchronization primitives
+    atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
+    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+
+    // these are atomic as an annotation for thread-sanitizer
+    atomic_bool stop;         // Used for stopping the threadpool altogether
+    atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_int abort;         // Used for aborting processing of a graph
+
+    struct ggml_compute_state * workers;   // per thread state
+    int          n_threads_max; // number of threads in the pool
+    atomic_int   n_threads_cur; // number of threads used in the current graph
+
+    int32_t      prio;        // Scheduling priority
+    uint32_t     poll;        // Polling level (0 - no polling)
+
+    enum ggml_status ec;
+};
+
+// Per-thread state
+struct ggml_compute_state {
+#ifndef GGML_USE_OPENMP
+    ggml_thread_t thrd;
+    bool cpumask[GGML_MAX_N_THREADS];
+    int  last_graph;
+    bool pending;
+#endif
+    struct ggml_threadpool * threadpool;
+    int ith;
+};
+
+//
+// fundamental operations
+//
+
+inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t   v) { for (int i = 0; i < n; ++i) x[i] = v;    }
+inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+
+inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
+inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
+inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
+inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
+inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
+inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
+
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+   assert(nrc == 1);
+   UNUSED(nrc);
+   UNUSED(bx);
+   UNUSED(by);
+   UNUSED(bs);
+
+#if defined(GGML_SIMD)
+    float sumf = 0.0f;
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    GGML_F32_VEC_REDUCE(sumf, sum);
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        sumf += x[i]*y[i];
+    }
+#else
+    // scalar
+    ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(x[i]*y[i]);
+    }
+#endif
+
+    *s = sumf;
+}
+
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    int i = 0;
+    ggml_float sumf = 0;
+
+#if defined(__AVX512BF16__)
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 64 <= n; i += 64) {
+        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
+                             m512bh(_mm512_loadu_si512((y + i))));
+        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
+                             m512bh(_mm512_loadu_si512((y + i + 32))));
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#elif defined(__AVX512F__)
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#undef LOAD
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
+    __m256 c1 = _mm256_setzero_ps();
+    __m256 c2 = _mm256_setzero_ps();
+    __m256 c3 = _mm256_setzero_ps();
+    __m256 c4 = _mm256_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+    }
+    __m128 g;
+    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
+                       _mm256_add_ps(c2, c4));
+    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
+                   _mm256_castps256_ps128(c1));
+    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
+    g = _mm_add_ss(g, _mm_movehdup_ps(g));
+    sumf += (ggml_float)_mm_cvtss_f32(g);
+
+#undef LOAD
+#endif
+
+    for (; i < n; ++i) {
+        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
+                             GGML_BF16_TO_FP32(y[i]));
+    }
+    *s = sumf;
+}
+
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    ggml_float sumf = 0.0;
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    GGML_F16_VEC_REDUCE(sumf, sum);
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
+    }
+#endif
+
+    *s = sumf;
+}
+
+// compute GGML_VEC_DOT_UNROLL dot products at once
+// xs - x row stride in bytes
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
+
+    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+            }
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+        }
+    }
+#endif
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        s[i] = sumf[i];
+    }
+}
+
+inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] += x[i]*v;
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] += x[i]*v;
+    }
+#endif
+}
+
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
+// xs and vs are byte strides of x and v
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+
+    const float * restrict x[GGML_VEC_MAD_UNROLL];
+    const float * restrict v[GGML_VEC_MAD_UNROLL];
+
+    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
+        x[i] = (const float *) ((const char *) xv + i*xs);
+        v[i] = (const float *) ((const char *) vv + i*vs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
+
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+    }
+
+    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+            }
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        for (int i = np; i < n; ++i) {
+            y[i] += x[k][i]*v[k][0];
+        }
+    }
+#else
+    // scalar
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        for (int i = 0; i < n; ++i) {
+            y[i] += x[k][i]*v[k][0];
+        }
+    }
+#endif
+}
+
+//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
+inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &v, y, 1, n);
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] *= v;
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] *= v;
+    }
+#endif
+}
+
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
+inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
+inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
+inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
+inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
+inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
+inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+// TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+
+inline static float ggml_gelu_f32(float x) {
+    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_table_gelu_f16[i16[i]];
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        }
+    }
+}
+#else
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]);
+    }
+}
+#endif
+
+inline static float ggml_gelu_quick_f32(float x) {
+    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+}
+
+//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+//    const uint16_t * i16 = (const uint16_t *) x;
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
+//    }
+//}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+    }
+}
+#else
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]);
+    }
+}
+#endif
+
+// Sigmoid Linear Unit (SiLU) function
+inline static float ggml_silu_f32(float x) {
+    return x/(1.0f + expf(-x));
+}
+
+#if __FINITE_MATH_ONLY__
+#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#endif
+
+#if defined(__ARM_NEON) && defined(__aarch64__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                    vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+  const __m512 r = _mm512_set1_ps(0x1.8p23f);
+  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+  const __m512 n = _mm512_sub_ps(z, r);
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+  const __mmask16 d =
+      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+  const __m256 r = _mm256_set1_ps(0x1.8p23f);
+  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+  const __m256 n = _mm256_sub_ps(z, r);
+  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+  const __m256 k = _mm256_castsi256_ps(
+      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+  const __m256i c = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(126), _CMP_GT_OQ));
+  const __m256 u = _mm256_mul_ps(b, b);
+  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+    return _mm256_fmadd_ps(j, k, k);
+  const __m256i g = _mm256_and_si256(
+      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+      _mm256_set1_epi32(0x82000000u));
+  const __m256 s1 =
+      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+  const __m256i d = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(192), _CMP_GT_OQ));
+  return _mm256_or_ps(
+      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+      _mm256_andnot_ps(
+          _mm256_castsi256_ps(d),
+          _mm256_or_ps(
+              _mm256_and_ps(_mm256_castsi256_ps(c),
+                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
+#else
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]);
+    }
+}
+
+static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
+
+static ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
+    // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
+
+    int i = 0;
+    ggml_float sum = 0;
+    for (; i < n; ++i) {
+        float val = x[i] - max;
+        y[i] = val;
+        sum += (ggml_float)expf(val);
+    }
+    return sum = (ggml_float)logf(sum);
+}
+
+inline static float ggml_silu_backward_f32(float x, float dy) {
+    const float s = 1.0f/(1.0f + expf(-x));
+    return dy*s*(1.0f + x*(1.0f - s));
+}
+
+inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
+    }
+}
+
+inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = sum;
+#else
+    vDSP_sve(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_FP16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_BF16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    float max = -INFINITY;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+    }
+    *s = max;
+#else
+    vDSP_maxv(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
+    ggml_vec_norm_f32(n, s, x);
+    *s = 1.f/(*s);
+}
+
+inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
+    float max = -INFINITY;
+    int idx = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+        if (max == x[i]) { idx = i; }
+    }
+    *s = idx;
+}
+
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void ggml_thread_cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void ggml_thread_cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void ggml_thread_cpu_relax(void) {;}
+#endif
+
+//
+// NUMA support
+//
+
+#define GGML_NUMA_MAX_NODES 8
+#define GGML_NUMA_MAX_CPUS 512
+
+struct ggml_numa_node {
+    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
+    uint32_t n_cpus;
+};
+
+struct ggml_numa_nodes {
+    enum ggml_numa_strategy numa_strategy;
+    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
+    uint32_t n_nodes;
+    uint32_t total_cpus; // hardware threads on system
+    uint32_t current_node; // node on which main process is execting
+#if defined(__gnu_linux__)
+    cpu_set_t cpuset; // cpuset from numactl
+#else
+    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
+#endif
+};
+
+//
+// ggml state
+//
+
+struct ggml_state {
+    struct ggml_numa_nodes numa;
+};
+
+static struct ggml_state g_state = {0};
+
+void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    if (n_threads == 1) {
+        return;
+    }
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp barrier
+#else
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
+
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
+
+    if (n_barrier == (n_threads - 1)) {
+        // last thread
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+
+        // exit barrier (fill seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
+    }
+
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
+    }
+
+    // exit barrier (full seq-cst fence)
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+#endif
+}
+
+#if defined(__gnu_linux__)
+static cpu_set_t ggml_get_numa_affinity(void) {
+    cpu_set_t cpuset;
+    pthread_t thread;
+    thread = pthread_self();
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
+    return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) {
+    return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
+#if defined(__gnu_linux__)
+    struct stat st;
+    char path[256];
+    int rv;
+
+    // set numa scheme
+    g_state.numa.numa_strategy = numa_flag;
+
+    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
+
+    g_state.numa.cpuset = ggml_get_numa_affinity();
+
+    // enumerate nodes
+    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.n_nodes;
+    }
+
+    // enumerate CPUs
+    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.total_cpus;
+    }
+
+    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
+    // figure out which node we're on
+    uint current_cpu;
+    int getcpu_ret = 0;
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
+    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+#else
+    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
+#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
+#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
+#   endif
+    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
+#endif
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
+        g_state.numa.n_nodes = 0;
+        return;
+    }
+
+    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
+
+    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
+        struct ggml_numa_node * node = &g_state.numa.nodes[n];
+        GGML_PRINT_DEBUG("CPUs on node %u:", n);
+        node->n_cpus = 0;
+        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
+            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
+            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+            if (stat(path, &st) == 0) {
+                node->cpus[node->n_cpus++] = c;
+                GGML_PRINT_DEBUG(" %u", c);
+            }
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+
+    if (ggml_is_numa()) {
+        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
+        if (fptr != NULL) {
+            char buf[42];
+            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
+                GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    UNUSED(numa_flag);
+    // TODO
+#endif
+}
+
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
+
+#if defined(__ARM_ARCH)
+
+#if defined(__linux__) && defined(__aarch64__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+static void ggml_init_arm_arch_features(void) {
+#if defined(__linux__) && defined(__aarch64__)
+    uint32_t hwcap = getauxval(AT_HWCAP);
+    uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+    int oldp = 0;
+    size_t size = sizeof(oldp);
+    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_neon = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_dotprod = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_i8mm = oldp;
+
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+    ggml_arm_arch_features.has_neon = 1;
+#else
+    ggml_arm_arch_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_arm_arch_features.has_i8mm = 1;
+#else
+    ggml_arm_arch_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.has_sve = 1;
+    ggml_arm_arch_features.sve_cnt = 16;
+#else
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
+struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
+    GGML_ASSERT(!ggml_get_no_alloc(ctx));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+
+    ggml_set_i32(result, value);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
+    GGML_ASSERT(!ggml_get_no_alloc(ctx));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+
+    ggml_set_f32(result, value);
+
+    return result;
+}
+
+struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
+    const int n     = ggml_nrows(tensor);
+    const int nc    = tensor->ne[0];
+    const size_t n1 = tensor->nb[1];
+
+    char * const data = tensor->data;
+
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                assert(tensor->nb[0] == sizeof(int8_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I16:
+            {
+                assert(tensor->nb[0] == sizeof(int16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I32:
+            {
+                assert(tensor->nb[0] == sizeof(int32_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                assert(tensor->nb[0] == sizeof(float));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
+                }
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    return tensor;
+}
+
+struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
+    const int n     = ggml_nrows(tensor);
+    const int nc    = tensor->ne[0];
+    const size_t n1 = tensor->nb[1];
+
+    char * const data = tensor->data;
+
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                assert(tensor->nb[0] == sizeof(int8_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I16:
+            {
+                assert(tensor->nb[0] == sizeof(int16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_I32:
+            {
+                assert(tensor->nb[0] == sizeof(int32_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                assert(tensor->nb[0] == sizeof(ggml_bf16_t));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                assert(tensor->nb[0] == sizeof(float));
+                for (int i = 0; i < n; i++) {
+                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
+                }
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    return tensor;
+}
+
+int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
+                return ((int8_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
+                return ((int16_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
+                return ((int32_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_F16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
+                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_F32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(float));
+                return ((float *)(tensor->data))[i];
+            }
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
+        return;
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
+                ((int8_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
+                ((int16_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
+                ((int32_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                GGML_ASSERT(tensor->nb[0] == sizeof(float));
+                ((float *)(tensor->data))[i] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            return ((int8_t *) data)[0];
+        case GGML_TYPE_I16:
+            return ((int16_t *) data)[0];
+        case GGML_TYPE_I32:
+            return ((int32_t *) data)[0];
+        case GGML_TYPE_F16:
+            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+        case GGML_TYPE_BF16:
+            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
+        case GGML_TYPE_F32:
+            return ((float *) data)[0];
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                ((int8_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                ((int16_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ((int32_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ((float *)(data))[0] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                return ((int8_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I16:
+            {
+                return ((int16_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_I32:
+            {
+                return ((int32_t *)(tensor->data))[i];
+            }
+        case GGML_TYPE_F16:
+            {
+                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_BF16:
+            {
+                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+            }
+        case GGML_TYPE_F32:
+            {
+                return ((float *)(tensor->data))[i];
+            }
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
+    if (!ggml_is_contiguous(tensor)) {
+        int64_t id[4] = { 0, 0, 0, 0 };
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+        ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
+        return;
+    }
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                ((int8_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                ((int16_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ((int32_t *)(tensor->data))[i] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ((float *)(tensor->data))[i] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            return ((int8_t *) data)[0];
+        case GGML_TYPE_I16:
+            return ((int16_t *) data)[0];
+        case GGML_TYPE_I32:
+            return ((int32_t *) data)[0];
+        case GGML_TYPE_F16:
+            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+        case GGML_TYPE_BF16:
+            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
+        case GGML_TYPE_F32:
+            return ((float *) data)[0];
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
+    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            {
+                ((int8_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I16:
+            {
+                ((int16_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ((int32_t *)(data))[0] = value;
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ((float *)(data))[0] = value;
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// ggml_compute_forward_dup
+
+static void ggml_compute_forward_dup_same_cont(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+    GGML_ASSERT(src0->type == dst->type);
+
+    const size_t nb0 = ggml_type_size(src0->type);
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by elements
+    const int ne = ggml_nelements(dst);
+    const int dr = (ne + nth - 1) / nth;
+    const int ie0 = dr * ith;
+    const int ie1 = MIN(ie0 + dr, ne);
+
+    if (ie0 < ie1) {
+        memcpy(
+            ((char *)  dst->data + ie0*nb0),
+            ((char *) src0->data + ie0*nb0),
+            (ie1 - ie0) * nb0);
+    }
+}
+
+static void ggml_compute_forward_dup_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    if (ggml_is_contiguous(dst)) {
+        if (nb00 == sizeof(ggml_fp16_t)) {
+            if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                            }
+
+                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        }
+        return;
+    }
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    if (ggml_is_contiguous(dst)) {
+        if (nb00 == sizeof(ggml_bf16_t)) {
+            if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+                            }
+
+                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        }
+        return;
+    }
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_BF16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        // TODO: simplify
+        if (nb00 == sizeof(float)) {
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                const size_t rs = ne00 * nb00;
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, rs);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+
+                size_t id = 0;
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                char * dst_ptr = (char *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += rs * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            id += rs;
+                        }
+                        id += rs * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_BF16) {
+                size_t id = 0;
+                ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
+            } else {
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
+        }
+
+        return;
+    }
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_BF16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                        *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr);
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
+static void ggml_compute_forward_dup_bytes(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    GGML_ASSERT(src0->type == dst->type);
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+        ggml_compute_forward_dup_same_cont(params, dst);
+        return;
+    }
+
+    const size_t type_size = ggml_type_size(src0->type);
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (src0->type == dst->type &&
+        ne00 == ne0 &&
+        nb00 == type_size && nb0 == type_size) {
+        // copy by rows
+        const size_t rs = ne00 * type_size;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        size_t id = 0;
+        char * dst_ptr = (char *) dst->data;
+        const size_t rs = ne00 * type_size;
+
+        if (nb00 == type_size) {
+            // src0 is contigous on first dimension, copy by rows
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    id += rs * ir0;
+                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                        memcpy(dst_ptr + id, src0_ptr, rs);
+                        id += rs;
+                    }
+                    id += rs * (ne01 - ir1);
+                }
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    id += rs * ir0;
+                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
+                            memcpy(dst_ptr + id, src0_ptr, type_size);
+
+                            id += type_size;
+                        }
+                    }
+                    id += rs * (ne01 - ir1);
+                }
+            }
+        }
+
+        return;
+    }
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            i10 += ne00 * ir0;
+            while (i10 >= ne0) {
+                i10 -= ne0;
+                if (++i11 == ne1) {
+                    i11 = 0;
+                    if (++i12 == ne2) {
+                        i12 = 0;
+                        if (++i13 == ne3) {
+                            i13 = 0;
+                        }
+                    }
+                }
+            }
+            for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
+
+                    memcpy(dst_ptr, src0_ptr, type_size);
+
+                    if (++i10 == ne0) {
+                        i10 = 0;
+                        if (++i11 == ne1) {
+                            i11 = 0;
+                            if (++i12 == ne2) {
+                                i12 = 0;
+                                if (++i13 == ne3) {
+                                    i13 = 0;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            i10 += ne00 * (ne01 - ir1);
+            while (i10 >= ne0) {
+                i10 -= ne0;
+                if (++i11 == ne1) {
+                    i11 = 0;
+                    if (++i12 == ne2) {
+                        i12 = 0;
+                        if (++i13 == ne3) {
+                            i13 = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_dup_q(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+
+    size_t qk = ggml_blck_size(type);
+    const int64_t nr = ggml_nelements(src1) / qk;
+
+    // destination must be contiguous in the first dimension
+    GGML_ASSERT(nb10 == ggml_type_size(dst->type));
+    // must either have first dimension large enough to hold a row, or fully contiguous
+    GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+
+        uint32_t i = ir * qk;
+
+        const int64_t i03 = i/(ne00 * ne01 * ne02);
+        const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+        const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+        const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+        const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+        const int64_t i13 = i/(ne10 * ne11 * ne12);
+        const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+        const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+        const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+        const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+        dequantize_row_q(
+                (const void *) ((char *) src0->data + x_offset),
+                     (float *) ((char *)  dst->data + dst_offset), qk);
+    }
+}
+
+static void ggml_compute_forward_dup(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (src0->type == dst->type) {
+        ggml_compute_forward_dup_bytes(params, dst);
+        return;
+    }
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_dup_f16(params, dst);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_dup_bf16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_dup_f32(params, dst);
+            } break;
+        default:
+            {
+                if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_dup_q(params, dst);
+                    break;
+                }
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_add
+
+static void ggml_compute_forward_add_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_f16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    }
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        if (dst->type == GGML_TYPE_F16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_bf16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (dst->type == GGML_TYPE_F32) {
+        GGML_ASSERT( nb0 == sizeof(float));
+    }
+    else {
+        GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+        GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    }
+
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        if (dst->type == GGML_TYPE_BF16) {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+                }
+            }
+        } else {
+            for (int ir = ir0; ir < ir1; ++ir) {
+                // src0, src1 and dst are same shape => same indices
+                const int i3 = ir/(ne2*ne1);
+                const int i2 = (ir - i3*ne2*ne1)/ne1;
+                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+                ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+                for (int i = 0; i < ne0; i++) {
+                    dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+                }
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_f16_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_bf16_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_bf16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+            ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+            ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i]));
+            }
+        }
+    }
+    else {
+        // src1 is not contiguous
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void ggml_compute_forward_add_q_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+    const enum ggml_type dtype = dst->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+    ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dtype)->from_float;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ggml_is_quantized(src0->type));
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i03 = ir/(ne02*ne01);
+        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 and dst are same shape as src0 => same indices
+        const int i13 = i03;
+        const int i12 = i02;
+        const int i11 = i01;
+
+        const int i3 = i03;
+        const int i2 = i02;
+        const int i1 = i01;
+
+        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3));
+
+        assert(ne00 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne00);
+        // add src1
+        ggml_vec_acc_f32(ne00, wdata, src1_row);
+        // quantize row to dst
+        if (quantize_row_q != NULL) {
+            quantize_row_q(wdata, dst_row, ne00);
+        } else {
+            memcpy(dst_row, wdata, ne0*nb0);
+        }
+    }
+}
+
+static void ggml_compute_forward_add(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add_f16_f16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                if (src1->type == GGML_TYPE_BF16) {
+                    ggml_compute_forward_add_bf16_bf16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_bf16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+            {
+                ggml_compute_forward_add_q_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_add1
+
+static void ggml_compute_forward_add1_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+#ifdef GGML_USE_ACCELERATE
+        UNUSED(ggml_vec_add1_f32);
+
+        vDSP_vadd(
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+                (float *) ((char *) src1->data), 0,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
+                ne0);
+#else
+        ggml_vec_add1_f32(ne0,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+               *(float *) src1->data);
+#endif
+    }
+}
+
+static void ggml_compute_forward_add1_f16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_f16_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_q_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+    ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(type)->from_float;
+
+    // we don't support permuted src0
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ggml_is_quantized(src0->type));
+    GGML_ASSERT(dst->type == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
+        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
+
+        assert(ne0 % 32 == 0);
+
+        // unquantize row from src0 to temp buffer
+        dequantize_row_q(src0_row, wdata, ne0);
+        // add src1
+        ggml_vec_acc1_f32(ne0, wdata, v);
+        // quantize row to dst
+        quantize_row_q(wdata, dst_row, ne0);
+    }
+}
+
+static void ggml_compute_forward_add1_bf16_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = *(float *) src1->data;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1_bf16_bf16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_scalar(src1));
+
+    // scalar to add
+    const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_BF16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are same shape => same indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        ggml_bf16_t * dst_ptr  = (ggml_bf16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+        for (int i = 0; i < ne0; i++) {
+            dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+        }
+    }
+}
+
+static void ggml_compute_forward_add1(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add1_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                if (src1->type == GGML_TYPE_F16) {
+                    ggml_compute_forward_add1_f16_f16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add1_f16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                if (src1->type == GGML_TYPE_BF16) {
+                    ggml_compute_forward_add1_bf16_bf16(params, dst);
+                }
+                else if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add1_bf16_f32(params, dst);
+                }
+                else {
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+            {
+                ggml_compute_forward_add1_q_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_acc
+
+static void ggml_compute_forward_acc_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset inbytes during acc
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        if (params->ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during acc
+    const size_t nb0 = ggml_element_size(src0);
+
+    const size_t nb00 = nb0;
+    const size_t nb01 = nb1;
+    const size_t nb02 = nb2;
+    const size_t nb03 = nb3;
+
+    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0  + (ne11 == 0 ? 0 : ne11-1)*nb1  + (ne12 == 0 ? 0 : ne12-1)*nb2  + (ne13 == 0 ? 0 : ne13-1)*nb3  < ggml_nbytes(dst));
+    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+#ifdef GGML_USE_ACCELERATE
+        vDSP_vadd(
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1  + offset), 1, nc);
+#else
+        ggml_vec_add_f32(nc,
+                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+#endif
+    }
+}
+
+static void ggml_compute_forward_acc(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_acc_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sub
+
+static void ggml_compute_forward_sub_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(float)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_sub(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sub_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_mul
+
+static void ggml_compute_forward_mul_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (nb10 == sizeof(float)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0 ; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                UNUSED(ggml_vec_mul_f32);
+
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mul(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_mul_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_div
+
+static void ggml_compute_forward_div_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (nb10 == sizeof(float)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                UNUSED(ggml_vec_div_f32);
+
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+#else
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+#endif
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
+
+                dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_div(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_div_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sqr
+
+static void ggml_compute_forward_sqr_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n     = ggml_nrows(src0);
+    const int nc    = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqr_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sqr(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sqr_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sqrt
+
+static void ggml_compute_forward_sqrt_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqrt_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sqrt(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sqrt_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_log
+
+static void ggml_compute_forward_log_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_log_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_log(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_log_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sin
+
+static void ggml_compute_forward_sin_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sin_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sin(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sin_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_cos
+
+static void ggml_compute_forward_cos_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_cos_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_cos(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cos_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sum
+
+static void ggml_compute_forward_sum_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_scalar(dst));
+    assert(src0->nb[0] == sizeof(float));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    ggml_float sum     = 0;
+    ggml_float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f32_ggf(ne00,
+                        &row_sum,
+                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((float *) dst->data)[0] = sum;
+}
+
+static void ggml_compute_forward_sum_f16(
+    const struct ggml_compute_params * params,
+          struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_scalar(dst));
+
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f16_ggf(ne00,
+                    &row_sum,
+                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
+}
+
+static void ggml_compute_forward_sum_bf16(
+    const struct ggml_compute_params * params,
+          struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_scalar(dst));
+
+    assert(src0->nb[0] == sizeof(ggml_bf16_t));
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
+
+    float sum = 0;
+    float row_sum = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_bf16_ggf(ne00,
+                    &row_sum,
+                    (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+                sum += row_sum;
+            }
+        }
+    }
+    ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
+}
+
+static void ggml_compute_forward_sum(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sum_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sum_f16(params, dst);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_sum_bf16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sum_rows
+
+static void ggml_compute_forward_sum_rows_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(ne0 == 1);
+    GGML_ASSERT(ne1 == ne01);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    for (int64_t i3 = 0; i3 < ne03; i3++) {
+        for (int64_t i2 = 0; i2 < ne02; i2++) {
+            for (int64_t i1 = 0; i1 < ne01; i1++) {
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
+                float row_sum = 0;
+                ggml_vec_sum_f32(ne00, &row_sum, src_row);
+                dst_row[0] = row_sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_sum_rows(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sum_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_mean
+
+static void ggml_compute_forward_mean_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    assert(ne0 == 1);
+    assert(ne1 == ne01);
+    assert(ne2 == ne02);
+    assert(ne3 == ne03);
+
+    UNUSED(ne0);
+    UNUSED(ne1);
+    UNUSED(ne2);
+    UNUSED(ne3);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                ggml_vec_sum_f32(ne00,
+                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+
+                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mean(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_mean_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_argmax
+
+static void ggml_compute_forward_argmax_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(src0->nb[0] == sizeof(float));
+    assert(dst->nb[0] == sizeof(float));
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+
+    const size_t nb01 = src0->nb[1];
+    const size_t nb0 = dst->nb[0];
+
+    for (int64_t i1 = 0; i1 < ne01; i1++) {
+        float * src = (float *) ((char *) src0->data + i1*nb01);
+        int32_t * dst_ = (int32_t *) ((char *)  dst->data + i1*nb0);
+        int v = 0;
+        ggml_vec_argmax_f32(ne00, &v, src);
+        dst_[0] = v;
+    }
+}
+
+static void ggml_compute_forward_argmax(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argmax_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_count_equal
+
+static void ggml_compute_forward_count_equal_i32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    GGML_ASSERT(src0->type == GGML_TYPE_I32);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(dst->type == GGML_TYPE_I64);
+
+    const int64_t nr = ggml_nrows(src0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    int64_t * sums = (int64_t *) params->wdata;
+    int64_t sum_thread = 0;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 =  ir                        / (ne02*ne01);
+        const int64_t i02 = (ir - i03*ne03)            /       ne01;
+        const int64_t i01 =  ir - i03*ne03 - i02*ne02;
+
+        const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
+        const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;
+
+        for (int64_t i00 = 0; i00 < ne00; ++i00) {
+            const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
+            const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));
+
+            sum_thread += val0 == val1;
+        }
+    }
+    if (ith != 0) {
+        sums[ith] = sum_thread;
+    }
+    ggml_barrier(params->threadpool);
+
+    if (ith != 0) {
+        return;
+    }
+
+    for (int ith_other = 1; ith_other < nth; ++ith_other) {
+        sum_thread += sums[ith_other];
+    }
+    *((int64_t *) dst->data) = sum_thread;
+}
+
+static void ggml_compute_forward_count_equal(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_count_equal_i32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_repeat
+
+static void ggml_compute_forward_repeat_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_can_repeat(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: maybe this is not optimal?
+    for                         (int i3 = 0; i3 < nr3;  i3++) {
+        for                     (int k3 = 0; k3 < ne03; k3++) {
+            for                 (int i2 = 0; i2 < nr2;  i2++) {
+                for             (int k2 = 0; k2 < ne02; k2++) {
+                    for         (int i1 = 0; i1 < nr1;  i1++) {
+                        for     (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0;  i0++) {
+                                ggml_vec_cpy_f32(ne00,
+                                        (float *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0),
+                                        (float *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_can_repeat(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // TODO: maybe this is not optimal?
+    for                         (int i3 = 0; i3 < nr3;  i3++) {
+        for                     (int k3 = 0; k3 < ne03; k3++) {
+            for                 (int i2 = 0; i2 < nr2;  i2++) {
+                for             (int k2 = 0; k2 < ne02; k2++) {
+                    for         (int i1 = 0; i1 < nr1;  i1++) {
+                        for     (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0;  i0++) {
+                                ggml_fp16_t * y = (ggml_fp16_t *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0);
+                                ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01);
+                                // ggml_vec_cpy_f16(ne00, y, x)
+                                for (int i = 0; i < ne00; ++i) {
+                                    y[i]  = x[i];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_I16:
+            {
+                ggml_compute_forward_repeat_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_repeat_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_repeat_back
+
+static void ggml_compute_forward_repeat_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_can_repeat(dst, src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int nr0 = (int)(ne00/ne0);
+    const int nr1 = (int)(ne01/ne1);
+    const int nr2 = (int)(ne02/ne2);
+    const int nr3 = (int)(ne03/ne3);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (ggml_is_contiguous(dst)) {
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+    } else {
+        for         (int k3 = 0; k3 < ne3; k3++) {
+            for     (int k2 = 0; k2 < ne2; k2++) {
+                for (int k1 = 0; k1 < ne1; k1++) {
+                    ggml_vec_set_f32(ne0,
+                        (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
+                        0);
+                }
+            }
+        }
+    }
+
+    // TODO: maybe this is not optimal?
+    for                         (int i3 = 0; i3 < nr3; i3++) {
+        for                     (int k3 = 0; k3 < ne3; k3++) {
+            for                 (int i2 = 0; i2 < nr2; i2++) {
+                for             (int k2 = 0; k2 < ne2; k2++) {
+                    for         (int i1 = 0; i1 < nr1; i1++) {
+                        for     (int k1 = 0; k1 < ne1; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                ggml_vec_acc_f32(ne0,
+                                        (float *) ((char *)  dst->data + (         k3)*nb3  + (         k2)*nb2  + (         k1)*nb1),
+                                        (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_repeat_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_repeat_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-theading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0       )*nb00 + (i1       )*nb01 + (i2       )*nb02 + (i3       )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_concat_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_abs
+
+static void ggml_compute_forward_abs_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_abs_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_abs(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_abs_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sgn
+
+static void ggml_compute_forward_sgn_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sgn_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sgn(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sgn_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_neg
+
+static void ggml_compute_forward_neg_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_neg_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_neg(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_neg_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_step
+
+static void ggml_compute_forward_step_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_step_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_step(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_step_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_tanh
+
+static void ggml_compute_forward_tanh_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_tanh_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_tanh(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_tanh_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_elu
+
+static void ggml_compute_forward_elu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_elu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_elu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_elu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_relu
+
+static void ggml_compute_forward_relu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_relu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_relu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_relu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_gelu
+
+static void ggml_compute_forward_gelu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_gelu_quick
+
+static void ggml_compute_forward_gelu_quick_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_quick_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_quick(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_quick_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_silu
+
+static void ggml_compute_forward_silu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_silu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_silu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+// ggml_compute_forward_leaky_relu
+
+static void ggml_compute_forward_leaky_relu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_leaky_relu_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
+    }
+}
+
+static void ggml_compute_forward_leaky_relu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_leaky_relu_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_silu_back
+
+static void ggml_compute_forward_silu_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * grad = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src1, dst));
+    assert(ggml_are_same_shape(src1, grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1->ne[0];
+    const int nr = ggml_nrows(src1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_backward_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src1->data + i1*(src1->nb[1])),
+                (float *) ((char *) grad->data + i1*(grad->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_silu_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_silu_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+static void ggml_compute_forward_hardswish_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardswish_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+static void ggml_compute_forward_hardswish(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardswish_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardsigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardsigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_exp_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_exp_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_exp(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_exp_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_norm
+
+static void ggml_compute_forward_norm_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    GGML_ASSERT(eps >= 0.0f);
+
+    // TODO: optimize
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                ggml_float sum = 0.0;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sum += (ggml_float)x[i00];
+                }
+
+                float mean = sum/ne00;
+
+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                ggml_float sum2 = 0.0;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    float v = x[i00] - mean;
+                    y[i00] = v;
+                    sum2 += (ggml_float)(v*v);
+                }
+
+                float variance = sum2/ne00;
+                const float scale = 1.0f/sqrtf(variance + eps);
+
+                ggml_vec_scale_f32(ne00, y, scale);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_norm(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_norm_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_group_rms_norm
+
+static void ggml_compute_forward_rms_norm_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    GGML_ASSERT(eps >= 0.0f);
+
+    // TODO: optimize
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+                ggml_float sum = 0.0;
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sum += (ggml_float)(x[i00] * x[i00]);
+                }
+
+                const float mean = sum/ne00;
+
+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                memcpy(y, x, ne00 * sizeof(float));
+                // for (int i00 = 0; i00 < ne00; i00++) {
+                //     y[i00] = x[i00];
+                // }
+
+                const float scale = 1.0f/sqrtf(mean + eps);
+
+                ggml_vec_scale_f32(ne00, y, scale);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rms_norm(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rms_norm_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_rms_norm_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output
+    const struct ggml_tensor * src1 = dst->src[1]; // src1 from forward pass
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    // TODO: optimize
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                // src1 is same shape as src0 => same indices
+                const int64_t i11 = i01;
+                const int64_t i12 = i02;
+                const int64_t i13 = i03;
+
+                const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+                const float * x  = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
+
+                ggml_float sum_xx  = 0.0;
+                ggml_float sum_xdz = 0.0;
+
+                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                    sum_xx  += (ggml_float)(x[i00] * x[i00]);
+                    sum_xdz += (ggml_float)(x[i00] * dz[i00]);
+                }
+
+                //const float mean     = (float)(sum_xx)/ne00;
+                const float mean_eps = (float)(sum_xx)/ne00 + eps;
+                const float sum_eps  = (float)(sum_xx) + eps*ne00;
+                //const float mean_xdz = (float)(sum_xdz)/ne00;
+                // we could cache rms from forward pass to improve performance.
+                // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
+                //const float rms      = sqrtf(mean_eps);
+                const float rrms     = 1.0f / sqrtf(mean_eps);
+                //const float scale    = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
+
+                {
+                    // z = rms_norm(x)
+                    //
+                    // rms_norm(src1) =
+                    //     scale(
+                    //         src1,
+                    //         div(
+                    //             1,
+                    //             sqrt(
+                    //                 add(
+                    //                     scale(
+                    //                         sum(
+                    //                             sqr(
+                    //                                 src1)),
+                    //                         (1.0/N)),
+                    //                     eps))));
+
+                    // postorder:
+                    // ## op    args         grad
+                    // 00 param src1         grad[#00]
+                    // 01 const 1
+                    // 02 sqr   (#00)        grad[#02]
+                    // 03 sum   (#02)        grad[#03]
+                    // 04 const 1/N
+                    // 05 scale (#03, #04)   grad[#05]
+                    // 06 const eps
+                    // 07 add   (#05, #06)   grad[#07]
+                    // 08 sqrt  (#07)        grad[#08]
+                    // 09 div   (#01,#08)    grad[#09]
+                    // 10 scale (#00,#09)    grad[#10]
+                    //
+                    // backward pass, given grad[#10]
+                    // #10: scale
+                    // grad[#00] += scale(grad[#10],#09)
+                    // grad[#09] += sum(mul(grad[#10],#00))
+                    // #09: div
+                    // grad[#08] += neg(mul(grad[#09], div(#09,#08)))
+                    // #08: sqrt
+                    // grad[#07] += mul(grad[#08], div(0.5, #08))
+                    // #07: add
+                    // grad[#05] += grad[#07]
+                    // #05: scale
+                    // grad[#03] += scale(grad[#05],#04)
+                    // #03: sum
+                    // grad[#02] += repeat(grad[#03], #02)
+                    // #02:
+                    // grad[#00] += scale(mul(#00, grad[#02]), 2.0)
+                    //
+                    // substitute and simplify:
+                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
+                    // grad[#02] = repeat(grad[#03], #02)
+                    // grad[#02] = repeat(scale(grad[#05],#04), #02)
+                    // grad[#02] = repeat(scale(grad[#07],#04), #02)
+                    // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
+                    // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
+                    // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
+                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
+                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
+                    // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
+                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
+                    // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
+                    // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
+                    // a = b*c + d*e
+                    // a = b*c*f/f + d*e*f/f
+                    // a = (b*c*f + d*e*f)*(1/f)
+                    // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
+                    // a = (b + d*e/c)*c
+                    // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
+                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
+                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
+                    // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
+                    // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
+                    // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
+                    // a = (dz + x*div(-mean_xdz,mean_eps))*rrms
+                    // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
+                    // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+                    // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+                }
+                // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
+                // post-order:
+                // dx := x
+                // dx := scale(dx,-mean_xdz/mean_eps)
+                // dx := add(dx, dz)
+                // dx := scale(dx, rrms)
+                float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                // dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
+                ggml_vec_cpy_f32  (ne00, dx, x);
+                // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
+                ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
+                ggml_vec_acc_f32  (ne00, dx, dz);
+                ggml_vec_scale_f32(ne00, dx, rrms);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rms_norm_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rms_norm_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_group_norm
+
+static void ggml_compute_forward_group_norm_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    // TODO: optimize
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+    int n_channels = src0->ne[2];
+    int n_groups = dst->op_params[0];
+    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i += nth) {
+        int start = i * n_channels_per_group;
+        int end = start + n_channels_per_group;
+        if (end > n_channels) {
+            end = n_channels;
+        }
+        int step = end - start;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            ggml_float sum = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    ggml_float sumr = 0.0;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sumr += (ggml_float)x[i00];
+                    }
+                    sum += sumr;
+                }
+            }
+            const float mean = sum / (ne00 * ne01 * step);
+
+            ggml_float sum2 = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+                    ggml_float sumr = 0.0;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        float v = x[i00] - mean;
+                        y[i00] = v;
+                        sumr += (ggml_float)(v * v);
+                    }
+                    sum2 += sumr;
+                }
+            }
+            const float variance = sum2 / (ne00 * ne01 * step);
+            const float scale = 1.0f / sqrtf(variance + eps);
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_group_norm(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_group_norm_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_mul_mat
+
+static void ggml_compute_forward_mul_mat_one_chunk(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const enum ggml_type type,
+    const int64_t num_rows_per_vec_dot,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t const vec_dot      = type_traits_cpu[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int64_t i13 = (ir1 / (ne12 * ne1));
+                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    enum ggml_type           const vec_dot_type         = type_traits_cpu[src0->type].vec_dot_type;
+    ggml_from_float_t        const from_float           = type_traits_cpu[vec_dot_type].from_float;
+    int64_t                  const vec_dot_num_rows     = type_traits_cpu[src0->type].nrows;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(src0->type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+    // TODO: extract to "extra_op"
+#if GGML_USE_LLAMAFILE
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    if (src1_cont) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
+    if (src1->type != vec_dot_type) {
+        char * wdata = params->wdata;
+
+        const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+        const size_t nbw2 = nbw1*ne11;
+        const size_t nbw3 = nbw2*ne12;
+
+        assert(params->wsize >= ne13*nbw3);
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                ne10);
+                }
+            }
+        }
+    }
+
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+        atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
+    }
+
+    ggml_barrier(params->threadpool);
+
+#if GGML_USE_LLAMAFILE
+    if (src1->type != vec_dot_type) {
+        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    const int64_t nr0 = ne0;
+
+    // This is the size of the rest of the dimensions of the result
+    const int64_t nr1 = ne1 * ne2 * ne3;
+
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;
+
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }
+
+    // distribute the work across the inner or outer loop based on which one is larger
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
+    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
+
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
+    //   Also, chunking by thread was measured to have perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+        // distribute the thread work across the inner or outer loop based on which one is larger
+        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+    }
+
+    // The number of elements in each chunk
+    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
+
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
+
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int64_t ith0 = current_chunk % nchunk0;
+        const int64_t ith1 = current_chunk / nchunk0;
+
+        const int64_t ir0_start = dr0 * ith0;
+        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
+
+        const int64_t ir1_start = dr1 * ith1;
+        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int64_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // these checks are needed to avoid crossing dim1 boundaries
+        // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+        if (nth >= nchunk0 * nchunk1) {
+            break;
+        }
+
+        current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
+    }
+}
+
+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * ids = dst->src[2];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t    const vec_dot         = type_traits_cpu[type].vec_dot;
+    enum ggml_type    const vec_dot_type    = type_traits_cpu[type].vec_dot_type;
+    ggml_from_float_t const from_float      = type_traits_cpu[vec_dot_type].from_float;
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // row groups
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert
+
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
+            (char *) params->wdata :
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };
+
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
+
+    if (src1->type != vec_dot_type) {
+        char * wdata = params->wdata;
+
+        const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+        const size_t nbw2 = nbw1*ne11;
+        const size_t nbw3 = nbw2*ne12;
+
+        assert(params->wsize >= ne13*nbw3);
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                               ne10);
+                }
+            }
+        }
+    }
+
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
+
+    if (ith == 0) {
+        // initialize matrix_row_counts
+        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
+
+        // group rows by src0 matrix
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);
+
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
+        }
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // compute each matrix multiplication in sequence
+    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
+        const int64_t cne1 = matrix_row_counts[cur_a];
+
+        if (cne1 == 0) {
+            continue;
+        }
+
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
+
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows
+
+        // distribute the thread work across the inner or outer loop based on which one is larger
+
+        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+        const int64_t ith0 = ith % nth0;
+        const int64_t ith1 = ith / nth0;
+
+        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+        const int64_t ir010 = dr0*ith0;
+        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+        const int64_t ir110 = dr1*ith1;
+        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+        // threads with no work simply yield (not sure if it helps)
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}
+
+        // block-tiling attempt
+        const int64_t blck_0 = 16;
+        const int64_t blck_1 = 16;
+
+        // attempt to reduce false-sharing (does not seem to make a difference)
+        float tmp[16];
+
+        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                    const int64_t _i12 = ir1; // logical row index for this expert
+
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index
+
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1
+
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row
+
+                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                    //       the original src1 data pointer, so we should index using the indices directly
+                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                    const char * src1_col = (const char *) wdata +
+                        (src1_cont || src1->type != vec_dot_type
+                        ? (i11      + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
+
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+
+                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                    //}
+
+                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
+                    }
+
+                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+                }
+            }
+        }
+    }
+
+#undef MMID_MATRIX_ROW
+}
+
+// ggml_compute_forward_out_prod
+
+static void ggml_compute_forward_out_prod_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    GGML_ASSERT(ne2 % ne02 == 0);
+    GGML_ASSERT(ne3 % ne03 == 0);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    // GGML_ASSERT(nb0 <= nb1);
+    // GGML_ASSERT(nb1 <= nb2);
+    // GGML_ASSERT(nb2 <= nb3);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+    if (ith == 0) {
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+    }
+    ggml_barrier(params->threadpool);
+
+    // dst[:,:,:,:] = 0
+    // for i2,i3:
+    //   for i1:
+    //     for i01:
+    //       for i0:
+    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
+
+    // parallelize by last three dimensions
+
+    // total rows in dst
+    const int64_t nr = ne1*ne2*ne3;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    // block-tiling attempt
+    const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
+    const int64_t blck_1 = 16;
+
+    // dps == dst per src0, used for group query attention
+    const int64_t dps2 = ne2 / ne02;
+    const int64_t dps3 = ne3 / ne03;
+
+    for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
+        const int64_t bir1 = MIN(bir + blck_1, ir1);
+        for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
+            const int64_t bne01 = MIN(bi01 + blck_0, ne01);
+            for (int64_t ir = bir; ir < bir1; ++ir) {
+                // dst indices
+                const int64_t i3 = ir/(ne2*ne1);
+                const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
+                const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+                const int64_t i02 = i2 / dps2;
+                const int64_t i03 = i3 / dps3;
+
+                //const int64_t i10 = i1;
+                const int64_t i12 = i2;
+                const int64_t i13 = i3;
+
+#if GGML_VEC_MAD_UNROLL > 2
+                const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL);
+                for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) {
+                    const int64_t i11 = i01;
+
+                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1   + i2*nb2   + i3*nb3));
+
+                    ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
+                }
+                for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) {
+                    const int64_t i11 = i01;
+
+                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1   + i2*nb2   + i3*nb3));
+
+                    ggml_vec_mad_f32(ne0, d, s0, *s1);
+                }
+#else
+                for (int64_t i01 = bi01; i01 < bne01; ++i01) {
+                    const int64_t i11 = i01;
+
+                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
+
+                    ggml_vec_mad_f32(ne0, d, s0, *s1);
+                }
+#endif
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_out_prod_q_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne03 == ne13);
+    GGML_ASSERT(ne2  == ne12);
+    GGML_ASSERT(ne3  == ne13);
+
+    // we don't support permuted src0 dim0
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+
+    // dst dim0 cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    // GGML_ASSERT(nb0 <= nb1);
+    // GGML_ASSERT(nb1 <= nb2);
+    // GGML_ASSERT(nb2 <= nb3);
+
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    // nb01 >= nb00 - src0 is not transposed
+    //   compute by src0 rows
+
+    if (ith == 0) {
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+    }
+    ggml_barrier(params->threadpool);
+
+    // parallelize by last three dimensions
+
+    // total rows in dst
+    const int64_t nr = ne1*ne2*ne3;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    // dst[:,:,:,:] = 0
+    // for i2,i3:
+    //   for i1:
+    //     for i01:
+    //       for i0:
+    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
+
+    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        // dst indices
+        const int64_t i3 = ir/(ne2*ne1);
+        const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
+        const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        const int64_t i02 = i2;
+        const int64_t i03 = i3;
+
+        //const int64_t i10 = i1;
+        const int64_t i12 = i2;
+        const int64_t i13 = i3;
+
+        for (int64_t i01 = 0; i01 < ne01; ++i01) {
+            const int64_t i11 = i01;
+
+            float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
+            float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
+            float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
+
+            dequantize_row_q(s0, wdata, ne0);
+            ggml_vec_mad_f32(ne0, d, wdata, *s1);
+        }
+    }
+}
+
+static void ggml_compute_forward_out_prod(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+            {
+                ggml_compute_forward_out_prod_q_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                GGML_ABORT("fatal error"); // todo
+                // ggml_compute_forward_out_prod_f16_f32(params, dst);
+            }
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_out_prod_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_scale
+
+static void ggml_compute_forward_scale_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    // scale factor
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb1 = dst->nb[1];
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        if (dst->data != src0->data) {
+            // src0 is same shape as dst => same indices
+            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+        }
+        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
+    }
+}
+
+static void ggml_compute_forward_scale(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_scale_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_set
+
+static void ggml_compute_forward_set_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset inbytes during set
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        if (params->ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during set
+    const size_t nb0 = ggml_element_size(src0);
+
+    const int im0 = (ne10 == 0 ? 0 : ne10-1);
+    const int im1 = (ne11 == 0 ? 0 : ne11-1);
+    const int im2 = (ne12 == 0 ? 0 : ne12-1);
+    const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+    GGML_ASSERT(offset + im0*nb0  + im1*nb1  + im2*nb2  + im3*nb3  <= ggml_nbytes(dst));
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+    }
+}
+
+static void ggml_compute_forward_set_i32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset inbytes during set
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        if (params->ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during set
+    const size_t nb0 = ggml_element_size(src0);
+
+    const int im0 = (ne10 == 0 ? 0 : ne10-1);
+    const int im1 = (ne11 == 0 ? 0 : ne11-1);
+    const int im2 = (ne12 == 0 ? 0 : ne12-1);
+    const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+    GGML_ASSERT(offset + im0*nb0  + im1*nb1  + im2*nb2  + im3*nb3  <= ggml_nbytes(dst));
+
+    GGML_ASSERT(nb10 == sizeof(int32_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+        ggml_vec_cpy_i32(nc,
+                (int32_t *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+    }
+}
+
+static void ggml_compute_forward_set(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_f32(params, dst);
+            } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_set_i32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_cpy
+
+static void ggml_compute_forward_cpy(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, dst);
+}
+
+// ggml_compute_forward_cont
+
+static void ggml_compute_forward_cont(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, dst);
+}
+
+// ggml_compute_forward_reshape
+
+static void ggml_compute_forward_reshape(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_view
+
+static void ggml_compute_forward_view(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_permute
+
+static void ggml_compute_forward_permute(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_transpose
+
+static void ggml_compute_forward_transpose(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * dst) {
+    // NOP
+    UNUSED(params);
+    UNUSED(dst);
+}
+
+// ggml_compute_forward_get_rows
+
+static void ggml_compute_forward_get_rows_q(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    const enum ggml_type type = src0->type;
+    ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == ggml_type_size(type));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        dequantize_row_q(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
+static void ggml_compute_forward_get_rows_f16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_fp16_t));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_fp16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
+static void ggml_compute_forward_get_rows_bf16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(ggml_bf16_t));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
+                (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                     (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+    }
+}
+
+static void ggml_compute_forward_get_rows_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1);
+
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(float));
+    assert(ggml_nrows(dst) == nr);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i = ir0; i < ir1; ++i) {
+        const int64_t i12 = i/(ne11*ne10);
+        const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+        ggml_vec_cpy_f32(nc,
+                (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+    }
+}
+
+static void ggml_compute_forward_get_rows(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+            {
+                ggml_compute_forward_get_rows_q(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_get_rows_f16(params, dst);
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_get_rows_bf16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_get_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    //static bool first = true;
+    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
+    //if (first) {
+    //    first = false;
+    //} else {
+    //    for (int k = 0; k < dst->ne[1]; ++k) {
+    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
+    //            for (int i = 0; i < 16; ++i) {
+    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
+    //            }
+    //            printf("\n");
+    //        }
+    //        printf("\n");
+    //    }
+    //    printf("\n");
+    //    exit(0);
+    //}
+}
+
+// ggml_compute_forward_get_rows_back
+
+static void ggml_compute_forward_get_rows_back_f32_f16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
+
+    memset(dst->data, 0, ggml_nbytes(dst));
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nelements(src1);
+
+    GGML_ASSERT( dst->ne[0] == nc);
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < nr; ++i) {
+        const int r = ((int32_t *) src1->data)[i];
+
+        for (int j = 0; j < nc; ++j) {
+            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
+            ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
+        }
+    }
+}
+
+static void ggml_compute_forward_get_rows_back_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
+
+    memset(dst->data, 0, ggml_nbytes(dst));
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nelements(src1);
+
+    GGML_ASSERT( dst->ne[0] == nc);
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < nr; ++i) {
+        const int r = ((int32_t *) src1->data)[i];
+
+        ggml_vec_add_f32(nc,
+                (float *) ((char *)  dst->data + r*dst->nb[1]),
+                (float *) ((char *)  dst->data + r*dst->nb[1]),
+                (float *) ((char *) src0->data + i*src0->nb[1]));
+    }
+}
+
+static void ggml_compute_forward_get_rows_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_get_rows_back_f32_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_get_rows_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    //static bool first = true;
+    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
+    //if (first) {
+    //    first = false;
+    //} else {
+    //    for (int k = 0; k < dst->ne[1]; ++k) {
+    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
+    //            for (int i = 0; i < 16; ++i) {
+    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
+    //            }
+    //            printf("\n");
+    //        }
+    //        printf("\n");
+    //    }
+    //    printf("\n");
+    //    exit(0);
+    //}
+}
+
+// ggml_compute_forward_diag
+
+static void ggml_compute_forward_diag_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    // TODO: handle transposed/permuted matrices
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(ne00 == ne0);
+    GGML_ASSERT(ne00 == ne1);
+    GGML_ASSERT(ne01 == 1);
+    GGML_ASSERT(ne02 == ne2);
+    GGML_ASSERT(ne03 == ne3);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = 0; i2 < ne2; i2++) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                float * d = (float *)((char *)  dst->data + i3*nb3  + i2*nb2 + i1*nb1);
+                float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
+                for (int i0 = 0; i0 < i1; i0++) {
+                    d[i0] = 0;
+                }
+                d[i1] = s[i1];
+                for (int i0 = i1+1; i0 < ne0; i0++) {
+                    d[i0] = 0;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_diag(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_diag_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_diag_mask_inf
+
+static void ggml_compute_forward_diag_mask_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const float value) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int  n_past  = ((int32_t *) dst->op_params)[0];
+    const bool inplace = src0->data == dst->data;
+
+    GGML_ASSERT(n_past >= 0);
+
+    if (!inplace) {
+        if (ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+            GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    // TODO: handle transposed/permuted matrices
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+    const int nr = src0->ne[1];
+    const int nz = n/nr;
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int k = 0; k < nz; k++) {
+        for (int j = ith; j < nr; j += nth) {
+            for (int i = n_past; i < nc; i++) {
+                if (i > n_past + j) {
+                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_diag_mask_inf(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_diag_mask_zero(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_diag_mask_f32(params, dst, 0);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_soft_max
+
+static void ggml_compute_forward_soft_max_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    // TODO: handle transposed/permuted matrices
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
+    // TODO: is this supposed to be ceil instead of floor?
+    //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
+    const uint32_t n_head      = ne02;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        // ALiBi
+        const uint32_t h = (i1/ne01)%ne02; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp_f32) {
+            if (use_f16) {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
+                }
+            } else {
+                for (int i = 0; i < nc; ++i) {
+                    wp[i] += slope*mp_f32[i];
+                }
+            }
+        }
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(wp[i]));
+        }
+#endif
+
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, wp);
+
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
+        assert(sum > 0.0);
+
+        sum = 1.0/sum;
+        ggml_vec_scale_f32(nc, dp, sum);
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            assert(!isnan(dp[i]));
+            assert(!isinf(dp[i]));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_soft_max(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_soft_max_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_soft_max_ext_back
+
+static void ggml_compute_forward_soft_max_ext_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_are_same_shape(src1, dst));
+
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (const float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
+
+    GGML_ASSERT(max_bias == 0.0f);
+
+    // TODO: handle transposed/permuted matrices
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float *dy = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float *y  = (float *)((char *) src1->data + i1*src1->nb[1]);
+        float *dx = (float *)((char *) dst->data  + i1*dst->nb[1]);
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(dy[i]));
+            assert(!isnan(y[i]));
+        }
+#endif
+        // Jii = yi - yi*yi
+        // Jij = -yi*yj
+        // J = diag(y)-y.T*y
+        // dx = J * dy
+        // dxk = sum_i(Jki * dyi)
+        // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+        // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
+        // dxk = sum_i(-yk*yi * dyi) + yk*dyk
+        // dxk = -yk * sum_i(yi * dyi) + yk*dyk
+        // dxk = -yk * dot(y, dy) + yk*dyk
+        // dxk = yk * (- dot(y, dy) + dyk)
+        // dxk = yk * (dyk - dot(y, dy))
+        //
+        // post-order:
+        // dot_y_dy := dot(y, dy)
+        // dx := dy
+        // dx := dx - dot_y_dy
+        // dx := dx * y
+
+        // linear runtime, no additional memory
+        float dot_y_dy = 0;
+        ggml_vec_dot_f32  (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
+        ggml_vec_cpy_f32  (nc, dx, dy);
+        ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
+        ggml_vec_mul_f32  (nc, dx, dx, y);
+        ggml_vec_scale_f32(nc, dx, scale);
+
+#ifndef NDEBUG
+        for (int i = 0; i < nc; ++i) {
+            assert(!isnan(dx[i]));
+            assert(!isinf(dx[i]));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_soft_max_ext_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_soft_max_ext_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_clamp
+
+static void ggml_compute_forward_clamp_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    float min;
+    float max;
+    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    for (int j = ith; j < n; j += nth) {
+        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
+        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
+
+        for (int i = 0; i < nc; i++) {
+            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
+        }
+    }
+}
+
+static void ggml_compute_forward_clamp(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_clamp_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_Q8_K:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_rope
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+    return 1 - MIN(1, MAX(0, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
+
+static void ggml_rope_cache_init(
+     float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta = theta_base;
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta *= theta_scale;
+    }
+}
+
+static void ggml_mrope_cache_init(
+     float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta_t = theta_base_t;
+    float theta_h = theta_base_h;
+    float theta_w = theta_base_w;
+    float theta_e = theta_base_e;  // extra position id for vision encoder
+    int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+    int sec_w = sections[1] + sections[0];
+    int sec_e = sections[2] + sec_w;
+    GGML_ASSERT(sect_dims <= ne0);
+
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            // compute theta independently for each dim sections
+            // (i.e. reset corresponding theta when `i0` go from one section to another)
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            }
+            else if (sector == sections[0]) {
+                theta_h = theta_base_h;;
+            }
+            else if (sector == sec_w) {
+                theta_w = theta_base_w;
+            }
+            else if (sector == sec_e) {
+                theta_e = theta_base_e;
+            }
+        }
+
+        float theta = theta_t;
+        if (sector >= sections[0] && sector < sec_w) {
+            theta = theta_h;
+        }
+        else if (sector >= sec_w && sector < sec_w + sections[2]) {
+            theta = theta_w;
+        }
+        else if (sector >= sec_w + sections[2]) {
+            theta = theta_e;
+        }
+
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta_t *= theta_scale;
+        theta_w *= theta_scale;
+        theta_h *= theta_scale;
+        theta_e *= theta_scale;
+    }
+}
+
+static void ggml_compute_forward_rope_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const bool forward) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
+
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];
+
+    //const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims     = ((int32_t *) dst->op_params)[1];
+    const int mode       = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+
+    GGML_ASSERT(n_dims <= ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }
+
+    const float * freq_factors = NULL;
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
+    }
+
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
+        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
+                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
+            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                if (is_neox || is_mrope) {
+                    if (is_vision){
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = src[0];
+                            const float x1 = src[n_dims];
+
+                            dst_data[0]      = x0*cos_theta - x1*sin_theta;
+                            dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
+                        }
+                    } else {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = src[0];
+                            const float x1 = src[n_dims/2];
+
+                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
+                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                        }
+                    }
+                } else {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[1];
+
+                        dst_data[0] = x0*cos_theta - x1*sin_theta;
+                        dst_data[1] = x0*sin_theta + x1*cos_theta;
+                    }
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const int64_t ic = i0/2;
+
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                        const float x0 = src[0];
+                        const float x1 = src[n_dims];
+
+                        dst_data[0]      = x0*cos_theta - x1*sin_theta;
+                        dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
+                    }
+                } else {
+                    // fill the remain channels with data from src tensor
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
+                }
+            }
+        }
+    }
+}
+
+// TODO: deduplicate f16/f32 code
+static void ggml_compute_forward_rope_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const bool forward) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
+
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];
+
+    //const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims     = ((int32_t *) dst->op_params)[1];
+    const int mode       = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
+
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+
+    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+
+    GGML_ASSERT(n_dims <= ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }
+
+    const float * freq_factors = NULL;
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
+    }
+
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+
+            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
+                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                if (is_neox || is_mrope) {
+                    if (is_vision) {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = GGML_FP16_TO_FP32(src[0]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
+
+                            dst_data[0]      = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        }
+                    } else {
+                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;
+
+                            const float cos_theta = cache[i0 + 0];
+                            const float sin_theta = cache[i0 + 1];
+
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                            ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                            const float x0 = GGML_FP16_TO_FP32(src[0]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+
+                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                        }
+                    }
+                } else {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[1]);
+
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const int64_t ic = i0/2;
+
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
+
+                        dst_data[0]      = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                } else {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_rope(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_rope_f16(params, dst, true);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rope_f32(params, dst, true);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_rope_back
+
+static void ggml_compute_forward_rope_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_rope_f16(params, dst, false);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rope_f32(params, dst, false);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_conv_transpose_1d
+
+static void ggml_compute_forward_conv_transpose_1d_f16_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (ith == 0) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (L x Cin) to (Cin x L)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            ggml_fp16_t * dst_data = wdata;
+
+            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                const float * const src = (float *)((char *) src1->data + i11*nb11);
+                for (int64_t i10 = 0; i10 < ne10; i10++) {
+                    dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
+                }
+            }
+        }
+
+        // need to zero dst since we are accumulating into it
+        memset(dst->data, 0, ggml_nbytes(dst));
+    }
+    ggml_barrier(params->threadpool);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    // total rows in dst
+    const int nr = ne1;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = wdata + nk;
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * dst_data = (float *)((char *) dst->data + i1*nb1);
+        ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
+        for (int i10 = 0; i10 < ne10; i10++) {
+            const int i1n = i10*ne11;
+            for (int i00 = 0; i00 < ne00; i00++) {
+                float v = 0;
+                ggml_vec_dot_f16(ne02, &v, 0,
+                        (ggml_fp16_t *)    wdata_src + i1n, 0,
+                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
+                dst_data[i10*s0 + i00] += v;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_transpose_1d_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02;
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (ith == 0) {
+        memset(params->wdata, 0, params->wsize);
+
+        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            float * const wdata = (float *) params->wdata + 0;
+
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    float * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
+
+        // prepare source data (src1)
+        {
+            float * const wdata = (float *) params->wdata + nk;
+            float * dst_data = wdata;
+
+            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                const float * const src = (float *)((char *) src1->data + i11*nb11);
+                for (int64_t i10 = 0; i10 < ne10; i10++) {
+                    dst_data[i10*ne11 + i11] = src[i10];
+                }
+            }
+        }
+
+        // need to zero dst since we are accumulating into it
+        memset(dst->data, 0, ggml_nbytes(dst));
+    }
+    ggml_barrier(params->threadpool);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+
+    // total rows in dst
+    const int nr = ne1;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float * const wdata     = (float *) params->wdata + 0;
+    float * const wdata_src = wdata + nk;
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        float * dst_data = (float *)((char *) dst->data + i1*nb1);
+        float * wdata_kernel = wdata + i1*ne02*ne00;
+        for (int i10 = 0; i10 < ne10; i10++) {
+            const int i1n = i10*ne11;
+            for (int i00 = 0; i00 < ne00; i00++) {
+                float v = 0;
+                ggml_vec_dot_f32(ne02, &v, 0,
+                        wdata_src + i1n, 0,
+                        wdata_kernel + i00*ne02, 0, 1);
+                dst_data[i10*s0 + i00] += v;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_transpose_1d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_transpose_1d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_im2col_f32
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst:  result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                        // micro kernel
+                        float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// ggml_compute_forward_im2col_f16
+// src0: kernel [OC, IC, KH, KW]
+// src1: image [N, IC, IH, IW]
+// dst:  result [N, OH, OW, IC*KH*KW]
+static void ggml_compute_forward_im2col_f16(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
+
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                        // micro kernel
+                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_im2col(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_im2col_back_f32
+
+static void ggml_compute_forward_im2col_back_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
+    const struct ggml_tensor * src1 = dst->src[1]; // convolution kernel
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne3 : ne2;
+    const int64_t IC = is_2D ? ne2 : ne1;
+    const int64_t IH = is_2D ? ne1 : 1;
+    const int64_t IW = ne0;
+
+    const int64_t KH = is_2D ? ne11 : 1;
+    const int64_t KW = ne10;
+
+    const int64_t OH = is_2D ? ne02 : 1;
+    const int64_t OW = ne01;
+
+    int ofs0 = is_2D ? nb3 : nb2;
+    int ofs1 = is_2D ? nb2 : nb1;
+
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iic = ith; iic < IC; iic += nth) {
+                for (int64_t iih = 0; iih < IH; iih++) {
+                    for (int64_t iiw = 0; iiw < IW; iiw++) {
+
+                        // micro kernel
+                        float grad = 0.0f;
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                // For s0 > 1 some values were skipped over in the forward pass.
+                                // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well.
+                                const int64_t tmpw = (iiw + p0 - ikw*d0);
+                                if (tmpw % s0 != 0) {
+                                    continue;
+                                }
+                                const int64_t iow = tmpw / s0;
+
+                                // Equivalent logic as above except for s1.
+                                int64_t ioh;
+                                if (is_2D) {
+                                    const int64_t tmph = iih + p1 - ikh*d1;
+
+                                    if (tmph % s1 != 0) {
+                                        continue;
+                                    }
+
+                                    ioh = tmph / s1;
+                                } else {
+                                    ioh = 0;
+                                }
+
+                                if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) {
+                                    continue;
+                                }
+
+                                const float * const grad_in = (const float *) src0->data
+                                    + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                                grad += grad_in[iic*(KH*KW) + ikh*KW + ikw];
+                            }
+                        }
+                        float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
+                        dst_data[iih*IW + iiw] = grad;
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02*ne03;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (ith == 0) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+                        }
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            for (int i12 = 0; i12 < ne12; i12++) {
+                for (int i11 = 0; i11 < ne11; i11++) {
+                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    for (int i10 = 0; i10 < ne10; i10++) {
+                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+                    }
+                }
+            }
+        }
+
+        memset(dst->data, 0, ggml_nbytes(dst));
+    }
+    ggml_barrier(params->threadpool);
+
+    const int32_t stride = ggml_get_op_params_i32(dst, 0);
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = wdata + nk;
+
+    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        for (int i11 = 0; i11 < ne11; i11++) {
+            for (int i10 = 0; i10 < ne10; i10++) {
+                const int i1n = i11*ne10*ne12 + i10*ne12;
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        float v = 0;
+                        ggml_vec_dot_f16(ne03, &v, 0,
+                                wdata_src + i1n, 0,
+                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_pool_1d_sk_p0
+
+static void ggml_compute_forward_pool_1d_sk_p0(
+        const struct ggml_compute_params * params,
+        const enum ggml_op_pool op,
+        const int k,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src = dst->src[0];
+
+    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    const char * cdata = (const char *)src->data;
+    const char * const data_end = cdata + ggml_nbytes(src);
+    float * drow = (float *)dst->data;
+
+    const int64_t rs = dst->ne[0];
+
+    while (cdata < data_end) {
+        const void * srow = (const void *)cdata;
+        int j = 0;
+        for (int64_t i = 0; i < rs; ++i) {
+            switch (op) {
+                case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
+                case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
+                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+            }
+            for (int ki = 0; ki < k; ++ki) {
+                const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
+                switch (op) {
+                    case GGML_OP_POOL_AVG:                         drow[i] += srow_j; break;
+                    case GGML_OP_POOL_MAX:   if (srow_j > drow[i]) drow[i]  = srow_j; break;
+                    case GGML_OP_POOL_COUNT:                       GGML_ABORT("fatal error");
+                }
+                ++j;
+            }
+            switch (op) {
+                case GGML_OP_POOL_AVG:         drow[i] /= k; break;
+                case GGML_OP_POOL_MAX:                       break;
+                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+            }
+        }
+
+        cdata += src->nb[1];
+        drow  += rs;
+    }
+}
+
+// ggml_compute_forward_pool_1d
+
+static void ggml_compute_forward_pool_1d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int s0 = opts[2];
+    const int p0 = opts[3];
+    GGML_ASSERT(p0 == 0); // padding not supported
+    GGML_ASSERT(k0 == s0); // only s = k supported
+
+    ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
+}
+
+// ggml_compute_forward_pool_2d
+
+static void ggml_compute_forward_pool_2d(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src = dst->src[0];
+
+    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+    const char * cdata = (const char*)src->data;
+    const char * const data_end = cdata + ggml_nbytes(src);
+
+    const int64_t px = dst->ne[0];
+    const int64_t py = dst->ne[1];
+    const int64_t pa = px * py;
+
+    float * dplane = (float *)dst->data;
+
+    const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
+
+    while (cdata < data_end) {
+        for (int oy = 0; oy < py; ++oy) {
+            float * const drow = dplane + oy * px;
+            for (int ox = 0; ox < px; ++ox) {
+                float * const out =  drow + ox;
+                switch (op) {
+                    case GGML_OP_POOL_AVG:     *out = 0;        break;
+                    case GGML_OP_POOL_MAX:     *out = -FLT_MAX; break;
+                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+                }
+
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
+
+                for (int ky = 0; ky < k1; ++ky) {
+                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
+                    const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
+                    for (int kx = 0; kx < k0; ++kx) {
+                        int j = ix + kx;
+                        if (j < 0 || j >= src->ne[0]) continue;
+                        const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
+                        switch (op) {
+                            case GGML_OP_POOL_AVG:                     *out += srow_j; break;
+                            case GGML_OP_POOL_MAX: if (srow_j > *out)  *out  = srow_j; break;
+                            case GGML_OP_POOL_COUNT:               GGML_ABORT("fatal error");
+                        }
+                    }
+                }
+                switch (op) {
+                    case GGML_OP_POOL_AVG:           *out /= ka; break;
+                    case GGML_OP_POOL_MAX:                       break;
+                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
+                }
+            }
+        }
+
+        cdata  += src->nb[2];
+        dplane += pa;
+    }
+}
+
+// ggml_compute_forward_pool_2d_back
+
+static void ggml_compute_forward_pool_2d_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src  = dst->src[0];
+    const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst
+
+    assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    char       * cdata  = (char       *) dst->data;
+    const char * cdataf = (const char *) dstf->data;
+    const char * const data_end = cdata + ggml_nbytes(dst);
+
+    GGML_ASSERT(params->ith == 0);
+    memset(cdata, 0, ggml_nbytes(dst));
+
+    const int64_t px = src->ne[0];
+    const int64_t py = src->ne[1];
+    const int64_t pa = px * py;
+
+    const float * splane = (const float *) src->data;
+
+    const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
+
+    while (cdata < data_end) {
+        for (int oy = 0; oy < py; ++oy) {
+            const float * const srow = splane + oy * px;
+            for (int ox = 0; ox < px; ++ox) {
+                const float grad0 = srow[ox];
+
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
+
+                if (op == GGML_OP_POOL_MAX) {
+                    float maxval = -FLT_MAX;
+                    int kxmax = -1;
+                    int kymax = -1;
+
+                    for (int ky = 0; ky < k1; ++ky) {
+                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
+                            continue;
+                        }
+                        const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky));
+                        for (int kx = 0; kx < k0; ++kx) {
+                            int j = ix + kx;
+                            if (j < 0 || j >= dst->ne[0]) {
+                                continue;
+                            }
+
+                            const float val = dst->type == GGML_TYPE_F32 ?
+                                ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
+                            if (val <= maxval) {
+                                continue;
+                            }
+
+                            maxval = val;
+                            kxmax = kx;
+                            kymax = ky;
+                        }
+                    }
+
+                    if (kxmax == -1 || kymax == -1) {
+                        continue;
+                    }
+
+                    void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax));
+                    const int j = ix + kxmax;
+                    if (dst->type == GGML_TYPE_F32) {
+                        ((float *) drow)[j] += grad0;
+                    } else {
+                        ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
+                    }
+                } else if (op == GGML_OP_POOL_AVG) {
+                    const float grad = grad0 / ka;
+
+                    for (int ky = 0; ky < k1; ++ky) {
+                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
+                            continue;
+                        }
+                        void * drow = (void *)(cdata + dst->nb[1] * (iy + ky));
+                        for (int kx = 0; kx < k0; ++kx) {
+                            int j = ix + kx;
+                            if (j < 0 || j >= dst->ne[0]) {
+                                continue;
+                            }
+
+                            if (dst->type == GGML_TYPE_F32) {
+                                ((float *) drow)[j] += grad;
+                            } else {
+                                ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
+                            }
+                        }
+                    }
+                } else {
+                    GGML_ASSERT(false);
+                }
+            }
+        }
+
+        cdata  += dst->nb[2];
+        cdataf += dst->nb[2];
+        splane += pa;
+    }
+}
+
+// ggml_compute_forward_upscale
+
+static void ggml_compute_forward_upscale_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];
+
+    // TODO: optimize
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        const int64_t i03 = i3 / sf3;
+        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+            const int64_t i02 = i2 / sf2;
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                const int64_t i01 = i1 / sf1;
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    const int64_t i00 = i0 / sf0;
+
+                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_upscale(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_upscale_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
+// ggml_compute_forward_pad
+
+static void ggml_compute_forward_pad_f32(
+    const struct ggml_compute_params * params,
+          struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    } else {
+                        dst_ptr[dst_idx] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_pad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_pad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_pad_reflect_1d
+
+static void ggml_compute_forward_pad_reflect_1d(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    const int p0 = opts[0];
+    const int p1 = opts[1];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+                float * left  = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 +         p0*nb0);
+                float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
+
+                ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
+
+                for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0];   }
+                for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_arange
+
+static void ggml_compute_forward_arange_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float start = ggml_get_op_params_f32(dst, 0);
+    const float stop  = ggml_get_op_params_f32(dst, 1);
+    const float step  = ggml_get_op_params_f32(dst, 2);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    for (int64_t i = ith; i < steps; i+= nth) {
+        float value = start + step * i;
+        ((float *)dst->data)[i] = value;
+    }
+}
+
+static void ggml_compute_forward_arange(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_arange_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_timestep_embedding_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int dim = ggml_get_op_params_i32(dst, 0);
+    const int max_period = ggml_get_op_params_i32(dst, 1);
+
+    int half = dim / 2;
+
+    for (int64_t i = 0; i < ne00; i++) {
+        float * embed_data = (float *)((char *)  dst->data +  i*nb1);
+        for (int64_t j = ith; j < half; j += nth) {
+            float timestep = ((float *)src0->data)[i];
+            float freq = (float)expf(-logf(max_period) * j / half);
+            float arg = timestep * freq;
+            embed_data[j] = cosf(arg);
+            embed_data[j + half] = sinf(arg);
+        }
+        if (dim % 2 != 0 && ith == 0) {
+            embed_data[dim] = 0.f;
+        }
+    }
+}
+
+static void ggml_compute_forward_timestep_embedding(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_timestep_embedding_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_argsort
+
+static void ggml_compute_forward_argsort_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_argsort(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argsort_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;
+
+    memcpy(&scale,         (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0) {
+        scale /= logit_softcap;
+    }
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    enum ggml_type    const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
+    ggml_from_float_t const q_to_vec_dot   = type_traits_cpu[k_vec_dot_type].from_float;
+    ggml_vec_dot_t    const kq_vec_dot     = type_traits_cpu[k->type].vec_dot;
+    ggml_to_float_t   const v_to_float     = ggml_get_type_traits(v->type)->to_float;
+
+    GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+    GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        const uint32_t h = iq2; // head index
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
+        float S = 0.0f;      // sum
+        float M = -INFINITY; // maximum KQ value
+
+        float       * VKQ32 = (float       *) params->wdata + ith*(3*D + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator
+        float       * V32   =                 (VKQ32 + 1*D); // (temporary) FP32 V buffer
+        ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*D); // (temporary) FP16 VKQ accumulator
+        ggml_fp16_t * Q_q   = (ggml_fp16_t *) (VKQ32 + 2*D); // (temporary) buffer for Q converted to quantized/FP16
+
+        if (v->type == GGML_TYPE_F16) {
+            memset(VKQ16, 0, D*sizeof(ggml_fp16_t));
+        } else {
+            memset(VKQ32, 0, D*sizeof(float));
+        }
+
+        const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
+        q_to_vec_dot(pq, Q_q, D);
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s; // KQ value
+
+            const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
+            kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);
+
+            s = s*scale; // scale KQ value
+
+            if (logit_softcap != 0.0f) {
+                s = logit_softcap*tanhf(s);
+            }
+
+            s += mv; // apply mask
+
+            const float Mold = M;
+
+            float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
+            float vs = 1.0f; // post-softmax KQ value, expf(s - M)
+
+            const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            if (v->type == GGML_TYPE_F16) {
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
+
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f16(D, VKQ16, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
+
+                // V += v*expf(s - M)
+                ggml_vec_mad_f16(D, VKQ16, (const ggml_fp16_t *) v_data, vs);
+            } else {
+                if (s > M) {
+                    // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
+                    M = s;
+                    ms = expf(Mold - M);
+
+                    // V = V*expf(Mold - M)
+                    ggml_vec_scale_f32(D, VKQ32, ms);
+                } else {
+                    // no new maximum, ms == 1.0f, vs != 1.0f
+                    vs = expf(s - M);
+                }
+
+                v_to_float(v_data, V32, D);
+
+                // V += v*expf(s - M)
+                ggml_vec_mad_f32(D, VKQ32, V32, vs);
+            }
+
+            S = S*ms + vs; // scale and increment sum with partial sum
+        }
+
+        if (v->type == GGML_TYPE_F16) {
+            for (int64_t d = 0; d < D; ++d) {
+                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
+            }
+        }
+
+        // V /= S
+        const float S_inv = 1.0f/S;
+        ggml_vec_scale_f32(D, VKQ32, S_inv);
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (dst->op_params[3]) {
+        case GGML_PREC_DEFAULT:
+        case GGML_PREC_F32:
+            {
+                // uses F32 accumulators
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_flash_attn_back
+
+static void ggml_compute_forward_flash_attn_back_f32(
+        const struct ggml_compute_params * params,
+        const bool masked,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * q = dst->src[0];
+    const struct ggml_tensor * k = dst->src[1];
+    const struct ggml_tensor * v = dst->src[2];
+    const struct ggml_tensor * d = dst->src[3];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ned, d,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbd, d,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+    const int64_t P = nek1 - N;
+    const int64_t M = P + N;
+
+    const int Mup  = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+    const int mxDM = MAX(D, Mup);
+
+    // GGML_ASSERT(ne0 == D);
+    // GGML_ASSERT(ne1 == N);
+    GGML_ASSERT(P >= 0);
+
+    GGML_ASSERT(nbq0 == sizeof(float));
+    GGML_ASSERT(nbk0 == sizeof(float));
+    GGML_ASSERT(nbv0 == sizeof(float));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev1 == D);
+    GGML_ASSERT(ned0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nek1 == N + P);
+    GGML_ASSERT(nev1 == D);
+    GGML_ASSERT(ned1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    if (ith == 0) {
+        memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
+    }
+    ggml_barrier(params->threadpool);
+
+    const int64_t elem_q = ggml_nelements(q);
+    const int64_t elem_k = ggml_nelements(k);
+
+    enum ggml_type result_type = dst->type;
+    GGML_ASSERT(ggml_blck_size(result_type) == 1);
+    const size_t tsize = ggml_type_size(result_type);
+
+    const size_t offs_q = 0;
+    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
+    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
+
+    void * grad_q = (char *) dst->data;
+    void * grad_k = (char *) dst->data + offs_k;
+    void * grad_v = (char *) dst->data + offs_v;
+
+    const size_t nbgq1 = nb0*neq0;
+    const size_t nbgq2 = nb0*neq0*neq1;
+    const size_t nbgq3 = nb0*neq0*neq1*neq2;
+
+    const size_t nbgk1 = nb0*nek0;
+    const size_t nbgk2 = nb0*nek0*nek1;
+    const size_t nbgk3 = nb0*nek0*nek1*neq2;
+
+    const size_t nbgv1 = nb0*nev0;
+    const size_t nbgv2 = nb0*nev0*nev1;
+    const size_t nbgv3 = nb0*nev0*nev1*neq2;
+
+    // parallelize by k rows using ggml_vec_dot_f32
+
+    // total rows in k
+    const int nr = nek2*nek3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const float scale = 1.0f/sqrtf(D);
+
+    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
+
+    // how often k2 (and v2) is repeated in q2
+    int nrep = neq2/nek2;
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int ik3 = ir/(nek2);
+        const int ik2 = ir - ik3*nek2;
+
+        const int iq3 = ik3;
+        const int id3 = ik3;
+        const int iv3 = ik3;
+        const int iv2 = ik2;
+
+        for (int irep = 0; irep < nrep; ++irep) {
+            const int iq2 = ik2 + irep*nek2;
+            const int id2 = iq2;
+
+            // (ik2 + irep*nek2) % nek2 == ik2
+            for (int iq1 = 0; iq1 < neq1; ++iq1) {
+                const int id1 = iq1;
+
+                // not sure about CACHE_LINE_SIZE_F32..
+                // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset?
+                float * S  = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32);
+                float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32);
+
+                for (int i = M; i < Mup; ++i) {
+                    S[i] = -INFINITY;
+                }
+
+                const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
+                for (int64_t ic = 0; ic < masked_begin; ++ic) {
+                    // k indices
+                    const int ik1 = ic;
+
+                    // S indices
+                    const int i1 = ik1;
+
+                    ggml_vec_dot_f32(neq0,
+                            S + i1, 0,
+                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
+                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
+                }
+
+                // scale
+                ggml_vec_scale_f32(masked_begin, S, scale);
+
+                for (int64_t i = masked_begin; i < M; i++) {
+                    S[i] = -INFINITY;
+                }
+
+                // softmax
+                // exclude known -INF S[..] values from max and loop
+                // dont forget to set their SM values to zero
+                {
+                    float max = -INFINITY;
+                    ggml_vec_max_f32(masked_begin, &max, S);
+
+                    ggml_float sum = 0.0;
+                    {
+#ifdef GGML_SOFT_MAX_ACCELERATE
+                        max = -max;
+                        vDSP_vsadd(SM, 1, &max, SM, 1, Mup);
+                        vvexpf(SM, SM, &Mup);
+                        ggml_vec_sum_f32(Mup, &sum, SM);
+#else
+                        sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
+#endif
+                    }
+
+                    assert(sum > 0.0);
+
+                    sum = 1.0/sum;
+                    ggml_vec_scale_f32(masked_begin, SM, sum);
+
+                }
+
+                // step-by-step explanation
+                {
+                    // forward-process                    shape      grads from backward process
+                    // parallel_for ik2,ik3:
+                    //  for irep:
+                    //   iq2 = ik2 + irep*nek2
+                    //   k[:D,:M,:,:]                     [D,M,:,:]  grad[k][:D,:M,ik2,ik3]  += grad[kcur]
+                    //   q[:D,:N,:,:]                     [D,N,:,:]  grad[q][:D,iq1,iq2,iq3] += grad[qcur]
+                    //   v[:M,:D,:,:]                     [M,D,:,:]  grad[v][:M,:D,iv2,iv3]  += grad[vcur]
+                    //   for iq1:
+                    //    kcur   = k[:D,:M,ik2,ik3]       [D,M,1,1]  grad[kcur] = grad[S1].T @ qcur
+                    //    qcur   = q[:D,iq1,iq2,iq3]      [D,1,1,1]  grad[qcur] = grad[S1]   @ kcur
+                    //    vcur   = v[:M,:D,iv2,iv3]       [M,D,1,1]  grad[vcur] = grad[S5].T @ S4
+                    //    S0     = -Inf                   [D,1,1,1]
+                    //   ~S1[i]  = dot(kcur[:D,i], qcur)
+                    //    S1     = qcur @ kcur.T          [M,1,1,1]  grad[S1]   = grad[S2] * scale
+                    //    S2     = S1 * scale             [M,1,1,1]  grad[S2]   = diag_mask_zero(grad[S3], P)
+                    //    S3     = diag_mask_inf(S2, P)   [M,1,1,1]  grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
+                    //    S4     = softmax(S3)            [M,1,1,1]  grad[S4]   = grad[S5] @ vcur
+                    //   ~S5[i]  = dot(vcur[:,i], S4)
+                    //    S5     = S4 @ vcur.T            [D,1,1,1]  grad[S5]   = d[:D,id1,id2,id3]
+                    //   ~dst[i,iq1,iq2,iq3]  = S5[i]              ^
+                    //    dst[:D,iq1,iq2,iq3] = S5                 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3]
+                    // dst                               backward-/ grad[dst]                 = d
+                    //
+                    // output gradients with their dependencies:
+                    //
+                    // grad[kcur] = grad[S1].T @ qcur
+                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
+                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
+                    // grad[S4]   = grad[S5] @ vcur
+                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
+                    // grad[qcur] = grad[S1]   @ kcur
+                    // grad[vcur] = grad[S5].T @ S4
+                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
+                    //
+                    // in post-order:
+                    //
+                    // S1         = qcur @ kcur.T
+                    // S2         = S1 * scale
+                    // S3         = diag_mask_inf(S2, P)
+                    // S4         = softmax(S3)
+                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
+                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
+                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
+                    // grad[qcur] = grad[S1]   @ kcur
+                    // grad[kcur] = grad[S1].T @ qcur
+                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
+                    //
+                    // using less variables (SM=S4):
+                    //
+                    // S             = diag_mask_inf(qcur @ kcur.T * scale, P)
+                    // SM            = softmax(S)
+                    // S             = d[:D,iq1,iq2,iq3] @ vcur
+                    // dot_SM_gradSM = dot(SM, S)
+                    // S             = SM * (S - dot(SM, S))
+                    // S             = diag_mask_zero(S, P) * scale
+                    //
+                    // grad[q][:D,iq1,iq2,iq3] += S   @ kcur
+                    // grad[k][:D,:M,ik2,ik3]  += S.T @ qcur
+                    // grad[v][:M,:D,iv2,iv3]  += d[:D,id1,id2,id3].T @ SM
+                }
+
+                // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
+                // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
+                // for ic:
+                //   S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3]
+                // exclude known future zero S[..] values from operation
+                ggml_vec_set_f32(masked_begin, S, 0);
+                for (int64_t ic = 0; ic < D; ++ic) {
+                    ggml_vec_mad_f32(masked_begin,
+                            S,
+                             (float *) ((char *) v->data + (          ic*nbv1  + iv2*nbv2 + iv3*nbv3)),
+                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
+                }
+
+                // S = SM * (S - dot(SM, S))
+                float dot_SM_gradSM = 0;
+                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
+                ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
+                ggml_vec_mul_f32 (masked_begin, S, S, SM);
+
+                // S = diag_mask_zero(S, P) * scale
+                // already done by above ggml_vec_set_f32
+
+                // exclude known zero S[..] values from operation
+                ggml_vec_scale_f32(masked_begin, S, scale);
+
+                // S    shape [M,1]
+                // SM   shape [M,1]
+                // kcur shape [D,M]
+                // qcur shape [D,1]
+                // vcur shape [M,D]
+
+                // grad[q][:D,iq1,iq2,iq3] += S @ kcur
+                // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M]
+                // for ic:
+                //  grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3]
+                // exclude known zero S[..] values from loop
+                for (int64_t ic = 0; ic < masked_begin; ++ic) {
+                    ggml_vec_mad_f32(D,
+                            (float *) ((char *) grad_q  + (iq1*nbgq1 + iq2*nbgq2  + iq3*nbgq3)),
+                            (float *) ((char *) k->data + (ic*nbk1   + ik2*nbk2   + ik3*nbk3)),
+                            S[ic]);
+                }
+
+                // grad[k][:D,:M,iq2,iq3] += S.T @ qcur
+                // for ic:
+                //  grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0]
+                //  grad[k][:D,ic,iq2,iq3] += S[ic]     * qcur[:D,0]
+                // exclude known zero S[..] values from loop
+                for (int64_t ic = 0; ic < masked_begin; ++ic) {
+                    ggml_vec_mad_f32(D,
+                            (float *) ((char *) grad_k  + (ic*nbgk1  + ik2*nbgk2  + ik3*nbgk3)),
+                            (float *) ((char *) q->data + (iq1*nbq1  + iq2*nbq2   + iq3*nbq3)),
+                            S[ic]);
+                }
+
+                // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T       @ SM
+                // for ic:
+                //  grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M]
+                //  grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3]         * SM[:M]
+                // exclude known zero SM[..] values from mad
+                for (int64_t ic = 0; ic < D; ++ic) {
+                    ggml_vec_mad_f32(masked_begin,
+                            (float *) ((char *) grad_v   + (          ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)),
+                            SM,
+                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2  + id3*nbd3)));
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_flash_attn_back(
+        const struct ggml_compute_params * params,
+        const bool masked,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * q = dst->src[0];
+
+    switch (q->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_ssm_conv
+
+static void ggml_compute_forward_ssm_conv_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // conv_x
+    const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc  = src1->ne[0]; // d_conv
+    const int ncs = src0->ne[0]; // d_conv - 1 + n_t
+    const int nr  = src0->ne[1]; // d_inner
+    const int n_t =  dst->ne[1]; // tokens per sequence
+    const int n_s =  dst->ne[2]; // number of sequences in the batch
+
+    GGML_ASSERT( dst->ne[0] == nr);
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+    const int ir  = ir1 - ir0;
+
+    for (int i3 = 0; i3 < n_s; ++i3) {
+        for (int i2 = 0; i2 < n_t; ++i2) {
+            // {d_conv - 1 + n_t, d_inner, n_seqs}
+            // sliding window
+            const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s}
+            const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner}
+            float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s}
+
+            // TODO: transpose the output for smaller strides for big batches?
+            // d_inner
+            for (int i1 = 0; i1 < ir; ++i1) {
+                // rowwise dot product
+                // NOTE: not using ggml_vec_dot_f32, because its sum is in double precision
+                float sumf = 0.0f;
+
+                // d_conv
+                for (int i0 = 0; i0 < nc; ++i0) {
+                    sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
+                }
+                x[i1] = sumf;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_ssm_conv(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    switch (dst->src[0]->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_ssm_conv_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_ssm_scan
+
+static void ggml_compute_forward_ssm_scan_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // s
+    const struct ggml_tensor * src1 = dst->src[1]; // x
+    const struct ggml_tensor * src2 = dst->src[2]; // dt
+    const struct ggml_tensor * src3 = dst->src[3]; // A
+    const struct ggml_tensor * src4 = dst->src[4]; // B
+    const struct ggml_tensor * src5 = dst->src[5]; // C
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nc  = src0->ne[0]; // d_state
+    const int64_t nr  = src0->ne[1]; // d_inner
+    const int64_t n_t = src1->ne[1]; // number of tokens per sequence
+    const int64_t n_s = src0->ne[2]; // number of sequences in the batch
+
+    GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src2->nb[0] == sizeof(float));
+    GGML_ASSERT(src3->nb[0] == sizeof(float));
+    GGML_ASSERT(src4->nb[0] == sizeof(float));
+    GGML_ASSERT(src5->nb[0] == sizeof(float));
+    // required for the dot product between s and C
+    GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
+    // required for per-sequence offsets for states
+    GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
+    // required to get correct offset for state destination (i.e. src1->nb[3])
+    GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+    const int ir  = ir1 - ir0;
+
+    for (int i3 = 0; i3 < n_s; ++i3) {
+        for (int i2 = 0; i2 < n_t; ++i2) {
+            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+            const float * B  = (const float *) ((const char *) src4->data +  i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+            const float * C  = (const float *) ((const char *) src5->data +  i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+                  float * y  = (      float *) ((      char *) dst->data  + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+                  float * s  = (      float *) ((      char *) dst->data  + ir0*(src0->nb[1]) + i3*(src0->nb[2]) +     src1->nb[3]);  // {d_state, d_inner, n_s}
+
+            // use the output as the source for the next token-wise iterations
+            if (i2 > 0) { s0 = s; }
+
+            // d_inner
+            for (int i1 = 0; i1 < ir; ++i1) {
+                // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
+                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+                float x_dt = x[i1] * dt_soft_plus;
+                float sumf = 0.0f;
+                // d_state
+                for (int i0 = 0; i0 < nc; ++i0) {
+                    int i = i0 + i1*nc;
+                    // state = prev_state * dA + dB * x
+                    float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+                    // y = rowwise_dotprod(state, C)
+                    sumf += state * C[i0];
+                    s[i] = state;
+                }
+                y[i1] = sumf;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_ssm_scan(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    switch (dst->src[0]->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_ssm_scan_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_win_part
+
+static void ggml_compute_forward_win_part_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    UNUSED(params);
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
+
+    const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t w    = ((const int32_t *)(dst->op_params))[2];
+
+    assert(ne00 == ne0);
+    assert(ne3  == nep0*nep1);
+
+    // TODO: optimize / multi-thread
+    for (int py = 0; py < nep1; ++py) {
+        for (int px = 0; px < nep0; ++px) {
+            const int64_t i3 = py*nep0 + px;
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                        const int64_t i02 = py*w + i2;
+                        const int64_t i01 = px*w + i1;
+                        const int64_t i00 = i0;
+
+                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0    + i1*ne0   + i0;
+                        const int64_t j =                  i02*ne01*ne00 + i01*ne00 + i00;
+
+                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                            ((float *) dst->data)[i] = 0.0f;
+                        } else {
+                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_part(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_part_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_win_unpart
+
+static void ggml_compute_forward_win_unpart_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    UNUSED(params);
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
+
+    const int32_t w = ((const int32_t *)(dst->op_params))[0];
+
+    // padding
+    const int px = (w - ne1%w)%w;
+    //const int py = (w - ne2%w)%w;
+
+    const int npx = (px + ne1)/w;
+    //const int npy = (py + ne2)/w;
+
+    assert(ne0 == ne00);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int ip2 = i2/w;
+                const int ip1 = i1/w;
+
+                const int64_t i02 = i2%w;
+                const int64_t i01 = i1%w;
+                const int64_t i00 = i0;
+
+                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
+                const int64_t j =                                  i2*ne1*ne0    + i1*ne0   + i0;
+
+                ((float *) dst->data)[j] = ((float *) src0->data)[i];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_unpart(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_unpart_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+//gmml_compute_forward_unary
+
+static void ggml_compute_forward_unary(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const enum ggml_unary_op op = ggml_get_unary_op(dst);
+
+    switch (op) {
+        case GGML_UNARY_OP_ABS:
+            {
+                ggml_compute_forward_abs(params, dst);
+            } break;
+        case GGML_UNARY_OP_SGN:
+            {
+                ggml_compute_forward_sgn(params, dst);
+            } break;
+        case GGML_UNARY_OP_NEG:
+            {
+                ggml_compute_forward_neg(params, dst);
+            } break;
+        case GGML_UNARY_OP_STEP:
+            {
+                ggml_compute_forward_step(params, dst);
+            } break;
+        case GGML_UNARY_OP_TANH:
+            {
+                ggml_compute_forward_tanh(params, dst);
+            } break;
+        case GGML_UNARY_OP_ELU:
+            {
+                ggml_compute_forward_elu(params, dst);
+            } break;
+        case GGML_UNARY_OP_RELU:
+            {
+                ggml_compute_forward_relu(params, dst);
+            } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
+        case GGML_UNARY_OP_GELU:
+            {
+                ggml_compute_forward_gelu(params, dst);
+            } break;
+        case GGML_UNARY_OP_GELU_QUICK:
+            {
+                ggml_compute_forward_gelu_quick(params, dst);
+            } break;
+        case GGML_UNARY_OP_SILU:
+            {
+                ggml_compute_forward_silu(params, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSWISH:
+            {
+                ggml_compute_forward_hardswish(params, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSIGMOID:
+            {
+                ggml_compute_forward_hardsigmoid(params, dst);
+            } break;
+        case GGML_UNARY_OP_EXP:
+            {
+                ggml_compute_forward_exp(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_get_rel_pos
+
+static void ggml_compute_forward_get_rel_pos_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    UNUSED(params);
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int64_t w = ne1;
+
+    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
+    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            const int64_t pos = (w - i1 - 1) + i2;
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_get_rel_pos(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+            {
+                ggml_compute_forward_get_rel_pos_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_add_rel_pos
+
+static void ggml_compute_forward_add_rel_pos_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
+
+    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
+    if (!inplace) {
+        if (params->ith == 0) {
+            memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
+
+    float * src1_data = (float *) src1->data;
+    float * src2_data = (float *) src2->data;
+    float * dst_data  = (float *) dst->data;
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // total patches in dst
+    const int np = ne13;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
+        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
+                for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                    const int64_t jp0  = jp1 + i10;
+                    const float src1_e = src1_data[jp0];
+                    const float src2_e = src2_data[jp0];
+
+                    const int64_t jdh = jp0 * ne10;
+                    const int64_t jdw = jdh - (ne10 - 1) * i10;
+
+                    for (int64_t j = 0; j < ne10; ++j) {
+                        dst_data[jdh + j     ] += src2_e;
+                        dst_data[jdw + j*ne10] += src1_e;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_rel_pos(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add_rel_pos_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_rwkv_wkv6
+
+static void ggml_compute_forward_rwkv_wkv6_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const int64_t T = dst->src[1]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t HEADS = dst->src[1]->ne[1];
+    const int64_t n_seqs = dst->src[5]->ne[1];
+    const int64_t head_size = C / HEADS;
+
+    float * dst_data = (float *) dst->data;
+    float * state = ((float *) dst->data) + C * T;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    if (ith >= HEADS) {
+        return;
+    }
+
+    const int h_start = (HEADS * ith) / nth;
+    const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ?
+                (HEADS * (ith + 1)) / nth : HEADS;
+
+    float * k =          (float *) dst->src[0]->data;
+    float * v =          (float *) dst->src[1]->data;
+    float * r =          (float *) dst->src[2]->data;
+    float * time_faaaa = (float *) dst->src[3]->data;
+    float * time_decay = (float *) dst->src[4]->data;
+
+    size_t t_stride = HEADS * head_size; // Same to C
+
+    size_t h_stride = C / HEADS;
+    GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS
+    size_t h_stride_2d = head_size * head_size;
+
+    if (ith == 0) {
+        memset(dst_data, 0, T * C * sizeof(float));
+    }
+    ggml_barrier(params->threadpool);
+
+
+    #if defined(__AVX__) && !defined(__AVX512F__)
+        #define GGML_F32X GGML_F32x8
+        #define GGML_F32X_SET1 GGML_F32x8_SET1
+        #define GGML_F32X_LOAD GGML_F32x8_LOAD
+        #define GGML_F32X_STORE GGML_F32x8_STORE
+        #define GGML_F32X_MUL GGML_F32x8_MUL
+        #define GGML_F32X_FMA GGML_F32x8_FMA
+        #define WKV_VECTOR_SIZE 8
+    #elif defined(__AVX512F__)
+        #define GGML_F32X GGML_F32x16
+        #define GGML_F32X_SET1 GGML_F32x16_SET1
+        #define GGML_F32X_LOAD GGML_F32x16_LOAD
+        #define GGML_F32X_STORE GGML_F32x16_STORE
+        #define GGML_F32X_MUL GGML_F32x16_MUL
+        #define GGML_F32X_FMA GGML_F32x16_FMA
+        #define WKV_VECTOR_SIZE 16
+    #elif defined(__ARM_NEON) && defined(__aarch64__)
+        #define GGML_F32X GGML_F32x4
+        #define GGML_F32X_SET1 GGML_F32x4_SET1
+        #define GGML_F32X_LOAD GGML_F32x4_LOAD
+        #define GGML_F32X_STORE GGML_F32x4_STORE
+        #define GGML_F32X_MUL GGML_F32x4_MUL
+        #define GGML_F32X_FMA GGML_F32x4_FMA
+        #define WKV_VECTOR_SIZE 4
+    #endif
+
+    #ifdef WKV_VECTOR_SIZE
+        const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
+
+        for (int64_t t = 0; t < T; t++) {
+            size_t t_offset = t * t_stride;
+            size_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                size_t h_offset = h * h_stride;
+                size_t t_h_offset = t_offset + h_offset;
+                size_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    size_t t_h_i_offset = t_h_offset + i;
+                    size_t h_i_offset = h_offset + i;
+                    size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float k_val = k[t_h_i_offset];
+                    float r_val = r[t_h_i_offset];
+                    float time_faaaa_val = time_faaaa[h_i_offset];
+                    float time_decay_val = time_decay[t_h_i_offset];
+
+                    // Broadcast scalar values to vectors
+                    GGML_F32X k_vec = GGML_F32X_SET1(k_val);
+                    GGML_F32X r_vec = GGML_F32X_SET1(r_val);
+                    GGML_F32X time_faaaa_vec = GGML_F32X_SET1(time_faaaa_val);
+                    GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);
+
+                    for (int64_t j = 0; j < vec_count; j++) {
+                        size_t base_j = j * WKV_VECTOR_SIZE;
+                        size_t t_h_j_offset = t_h_offset + base_j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
+
+                        // Load x elements at once
+                        GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]);
+                        GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]);
+                        GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]);
+
+                        // Compute kv = v * k
+                        GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec);
+
+                        // Compute temp = kv * time_faaaa + prev_state
+                        GGML_F32X temp_vec = GGML_F32X_FMA(prev_state_vec, kv_vec, time_faaaa_vec);
+
+                        // Update dst: dst += temp * r
+                        dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, r_vec);
+                        GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec);
+
+                        // Update state: state = prev_state * time_decay + kv
+                        GGML_F32X new_state_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, time_decay_vec);
+                        GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], new_state_vec);
+                    }
+
+                    // Handle remaining elements, this will not be used.
+                    for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
+                        size_t t_h_j_offset = t_h_offset + j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + j;
+                        float v_val = v[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        float temp_val = kv_val * time_faaaa_val + prev_state_val;
+                        dst_data[t_h_j_offset] += temp_val * r_val;
+                        state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
+                    }
+                }
+            }
+        }
+
+    #else
+        // basically fused operations:
+        // dst = r @ (time_faaaa * (k @ v) + state),
+        // state = time_decay * state + (k @ v),
+        // recursive through each token
+        for (int64_t t = 0; t < T; t++) {
+            size_t t_offset = t * t_stride;
+            size_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                size_t h_offset = h * h_stride;
+                size_t t_h_offset = t_offset + h_offset;
+                size_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    size_t t_h_i_offset = t_h_offset + i;
+                    size_t h_i_offset = h_offset + i;
+                    size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float k_val = k[t_h_i_offset];
+                    float r_val = r[t_h_i_offset];
+                    float time_faaaa_val = time_faaaa[h_i_offset];
+                    // RWKV v6: different time_decay for each token.
+                    float time_decay_val = time_decay[t_h_i_offset];
+
+                    for (int64_t j = 0; j < head_size; j++) {
+                        size_t t_h_j_offset = t_h_offset + j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float v_val = v[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        float temp_val = kv_val * time_faaaa_val + prev_state_val;
+                        dst_data[t_h_j_offset] += temp_val * r_val;
+                        state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
+                    }
+                }
+            }
+        }
+    #endif
+}
+
+
+static void ggml_compute_forward_rwkv_wkv6(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_rwkv_wkv6_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_gla
+
+static void ggml_compute_forward_gla_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+    const int64_t T = dst->src[1]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t HEADS = dst->src[1]->ne[1];
+    const int64_t n_seqs = dst->src[4]->ne[1];
+    const int64_t head_size = C / HEADS;
+    const float scale = ggml_get_op_params_f32(dst, 0);
+
+    float * dst_data = (float *) dst->data;
+    float * state = ((float *) dst->data) + C * T;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    if (ith >= HEADS) {
+        return;
+    }
+
+    const int h_start = (HEADS * ith) / nth;
+    const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ?
+                (HEADS * (ith + 1)) / nth : HEADS;
+
+    float * k = (float *) dst->src[0]->data;
+    float * v = (float *) dst->src[1]->data;
+    float * q = (float *) dst->src[2]->data;
+    float * g = (float *) dst->src[3]->data;
+
+    size_t t_stride = HEADS * head_size; // Same to C
+
+    size_t h_stride = C / HEADS;
+    GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS
+    size_t h_stride_2d = head_size * head_size;
+
+    if (ith == 0) {
+        memset(dst_data, 0, T * C * sizeof(float));
+    }
+    ggml_barrier(params->threadpool);
+
+
+    #if defined(__AVX__) && !defined(__AVX512F__)
+        #define GGML_F32X GGML_F32x8
+        #define GGML_F32X_SET1 GGML_F32x8_SET1
+        #define GGML_F32X_LOAD GGML_F32x8_LOAD
+        #define GGML_F32X_STORE GGML_F32x8_STORE
+        #define GGML_F32X_MUL GGML_F32x8_MUL
+        #define GGML_F32X_FMA GGML_F32x8_FMA
+        #define GLA_VECTOR_SIZE 8
+    #elif defined(__AVX512F__)
+        #define GGML_F32X GGML_F32x16
+        #define GGML_F32X_SET1 GGML_F32x16_SET1
+        #define GGML_F32X_LOAD GGML_F32x16_LOAD
+        #define GGML_F32X_STORE GGML_F32x16_STORE
+        #define GGML_F32X_MUL GGML_F32x16_MUL
+        #define GGML_F32X_FMA GGML_F32x16_FMA
+        #define GLA_VECTOR_SIZE 16
+    #elif defined(__ARM_NEON) && defined(__aarch64__)
+        #define GGML_F32X GGML_F32x4
+        #define GGML_F32X_SET1 GGML_F32x4_SET1
+        #define GGML_F32X_LOAD GGML_F32x4_LOAD
+        #define GGML_F32X_STORE GGML_F32x4_STORE
+        #define GGML_F32X_MUL GGML_F32x4_MUL
+        #define GGML_F32X_FMA GGML_F32x4_FMA
+        #define GLA_VECTOR_SIZE 4
+    #endif
+
+    #ifdef GLA_VECTOR_SIZE
+        const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
+
+        for (int64_t t = 0; t < T; t++) {
+            size_t t_offset = t * t_stride;
+            size_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                size_t h_offset = h * h_stride;
+                size_t t_h_offset = t_offset + h_offset;
+                size_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    size_t t_h_i_offset = t_h_offset + i;
+                    size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float k_val = k[t_h_i_offset];
+                    float q_val = q[t_h_i_offset] * scale;
+                    float g_val = g[t_h_i_offset];
+
+                    // Broadcast scalar values to vectors
+                    GGML_F32X k_vec = GGML_F32X_SET1(k_val);
+                    GGML_F32X q_vec = GGML_F32X_SET1(q_val);
+                    GGML_F32X g_vec = GGML_F32X_SET1(g_val);
+
+                    for (int64_t j = 0; j < vec_count; j++) {
+                        size_t base_j = j * GLA_VECTOR_SIZE;
+                        size_t t_h_j_offset = t_h_offset + base_j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
+
+                        // Load x elements at once
+                        GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]);
+                        GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]);
+                        GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]);
+
+                        // Compute kv = v * k
+                        GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec);
+
+                        // Compute temp = prev_state * g + kv
+                        GGML_F32X temp_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, g_vec);
+
+                        // Update dst: dst += temp * q
+                        dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, q_vec);
+                        GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec);
+
+                        // Update state
+                        GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], temp_vec);
+                    }
+
+                    // Handle remaining elements, this will not be used.
+                    for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
+                        size_t t_h_j_offset = t_h_offset + j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + j;
+                        float v_val = v[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        float temp_val = kv_val + prev_state_val * g_val;
+                        dst_data[t_h_j_offset] += temp_val * q_val;
+                        state_cur[h_2d_i_j_offset] = temp_val;
+                    }
+                }
+            }
+        }
+
+    #else
+        for (int64_t t = 0; t < T; t++) {
+            size_t t_offset = t * t_stride;
+            size_t state_offset = head_size * C * (t / (T / n_seqs));
+            float * state_cur = state + state_offset;
+            float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset;
+
+            for (int64_t h = h_start; h < h_end; h++) {
+                size_t h_offset = h * h_stride;
+                size_t t_h_offset = t_offset + h_offset;
+                size_t h_2d_offset = h * h_stride_2d;
+
+                for (int64_t i = 0; i < head_size; i++) {
+                    size_t t_h_i_offset = t_h_offset + i;
+                    size_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+                    float k_val = k[t_h_i_offset];
+                    float q_val = q[t_h_i_offset] * scale;
+                    float g_val = g[t_h_i_offset];
+
+                    for (int64_t j = 0; j < head_size; j++) {
+                        size_t t_h_j_offset = t_h_offset + j;
+                        size_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+                        float v_val = v[t_h_j_offset];
+                        float kv_val = v_val * k_val;
+                        float prev_state_val = state_prev[h_2d_i_j_offset];
+                        float temp_val = prev_state_val * g_val + kv_val;
+                        dst_data[t_h_j_offset] += temp_val * q_val;
+                        state_cur[h_2d_i_j_offset] = temp_val;
+                    }
+                }
+            }
+        }
+    #endif
+}
+
+
+static void ggml_compute_forward_gla(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gla_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_map_binary(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_binary_f32(params, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+
+    const struct ggml_tensor * a = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+    const struct ggml_tensor * c = dst->src[1];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * a = dst->src[0];
+
+    struct ggml_map_custom1_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
+
+    p.fun(dst, a, params->ith, params->nth, p.userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+
+    struct ggml_map_custom2_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
+
+    p.fun(dst, a, b, params->ith, params->nth, p.userdata);
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * a = dst->src[0];
+    const struct ggml_tensor * b = dst->src[1];
+    const struct ggml_tensor * c = dst->src[2];
+
+    struct ggml_map_custom3_op_params p;
+    memcpy(&p, dst->op_params, sizeof(p));
+
+    p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
+}
+
+// ggml_compute_forward_cross_entropy_loss
+
+static void ggml_compute_forward_cross_entropy_loss_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_scalar(dst));
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    // TODO: handle transposed/permuted matrices
+    const int64_t nc = src0->ne[0];
+    const int64_t nr = ggml_nrows(src0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    float * sums =  (float *) params->wdata;
+    float * st   = ((float *) params->wdata) + nth + ith*nc;
+    float sum_thread = 0.0f;
+
+    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    for (int64_t i1 = ir0; i1 < ir1; ++i1) {
+        const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]);
+        const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]);
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(s0[i]));
+            assert(!isnan(s1[i]));
+        }
+#endif
+
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max);
+        assert(sum_softmax >= 0.0);
+
+        ggml_vec_add1_f32(nc, st, st, -sum_softmax);
+        ggml_vec_mul_f32(nc, st, st, s1);
+
+        float sum_st = 0.0f;
+        ggml_vec_sum_f32(nc, &sum_st, st);
+        sum_thread += sum_st;
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            assert(!isnan(st[i]));
+            assert(!isinf(st[i]));
+        }
+#endif
+    }
+    sums[ith] = sum_thread;
+    ggml_barrier(params->threadpool);
+
+    if (ith == 0) {
+        float * dp = (float *) dst->data;
+        ggml_vec_sum_f32(nth, dp, sums);
+        dp[0] *= -1.0f / (float) nr;
+    }
+}
+
+static void ggml_compute_forward_cross_entropy_loss(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cross_entropy_loss_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_cross_entropy_loss_back
+
+static void ggml_compute_forward_cross_entropy_loss_back_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * grad  = dst->src[0]; // gradient of forward pass output
+    const struct ggml_tensor * src0f = dst->src[1]; // src0 of forward pass
+    const struct ggml_tensor * src1f = dst->src[2]; // src1 of forward pass
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0f));
+    GGML_ASSERT(ggml_is_contiguous(src1f));
+    GGML_ASSERT(ggml_is_contiguous(grad));
+    GGML_ASSERT(ggml_are_same_shape(src0f, src1f) && ggml_are_same_shape(src0f, dst));
+
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    // TODO: handle transposed/permuted matrices
+    const int64_t nc = src0f->ne[0];
+    const int64_t nr = ggml_nrows(src0f);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    const float d_by_nr = ((const float *) grad->data)[0] / (float) nr;
+
+    for (int64_t i1 = ir0; i1 < ir1; i1++) {
+        float       * ds0 = (float       *)((char       *) dst->data   + i1*dst->nb[1]);
+        const float * s0  = (const float *)((const char *) src0f->data + i1*src0f->nb[1]);
+        const float * s1  = (const float *)((const char *) src1f->data + i1*src1f->nb[1]);
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            //printf("p[%d] = %f\n", i, p[i]);
+            assert(!isnan(s0[i]));
+            assert(!isnan(s1[i]));
+        }
+#endif
+
+        // soft_max
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        const ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+        assert(sum > 0.0);
+        ggml_vec_scale_f32(nc, ds0, 1.0/sum);
+
+        // grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr
+        ggml_vec_sub_f32(nc, ds0, ds0, s1);
+        ggml_vec_scale_f32(nc, ds0, d_by_nr);
+
+#ifndef NDEBUG
+        for (int64_t i = 0; i < nc; ++i) {
+            assert(!isnan(ds0[i]));
+            assert(!isinf(ds0[i]));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_cross_entropy_loss_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static void ggml_compute_forward_opt_step_adamw_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0         = dst->src[0];
+    const struct ggml_tensor * src0_grad    = dst->src[1];
+    const struct ggml_tensor * src0_grad_m  = dst->src[2];
+    const struct ggml_tensor * src0_grad_v  = dst->src[3];
+    const struct ggml_tensor * adamw_params = dst->src[4];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
+    const float alpha  = adamw_params_ptr[0];
+    const float beta1  = adamw_params_ptr[1];
+    const float beta2  = adamw_params_ptr[2];
+    const float eps    = adamw_params_ptr[3];
+    const float wd     = adamw_params_ptr[4];
+    const float beta1h = adamw_params_ptr[5];
+    const float beta2h = adamw_params_ptr[6];
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const size_t offset = i03*nb03 + i02*nb02 + i01*nb01;
+
+        float       * w = (float       *) ((char       *) src0->data        + offset); // weight
+        const float * g = (const float *) ((const char *) src0_grad->data   + offset); // grad
+        float       * m = (float       *) ((char       *) src0_grad_m->data + offset);
+        float       * v = (float       *) ((char       *) src0_grad_v->data + offset);
+
+        for (int i00 = 0; i00 < ne00; ++i00) {
+            m[i00] = m[i00]*beta1 +        g[i00]*(1.0f - beta1);
+            v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2);
+
+            const float mh =       m[i00]*beta1h;
+            const float vh = sqrtf(v[i00]*beta2h) + eps;
+
+            // The weight decay is applied independently of the Adam momenta m and v.
+            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
+            // See: https://arxiv.org/pdf/1711.05101v3.pdf
+            w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
+        }
+    }
+}
+
+static void ggml_compute_forward_opt_step_adamw(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_opt_step_adamw_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+/////////////////////////////////
+
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    GGML_ASSERT(params);
+
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
+        return;
+    }
+
+    // extra_buffer op?
+    if (ggml_cpu_extra_compute_forward(params, tensor)) return;
+
+    switch (tensor->op) {
+        case GGML_OP_DUP:
+            {
+                ggml_compute_forward_dup(params, tensor);
+            } break;
+        case GGML_OP_ADD:
+            {
+                ggml_compute_forward_add(params, tensor);
+            } break;
+        case GGML_OP_ADD1:
+            {
+                ggml_compute_forward_add1(params, tensor);
+            } break;
+        case GGML_OP_ACC:
+            {
+                ggml_compute_forward_acc(params, tensor);
+            } break;
+        case GGML_OP_SUB:
+            {
+                ggml_compute_forward_sub(params, tensor);
+            } break;
+        case GGML_OP_MUL:
+            {
+                ggml_compute_forward_mul(params, tensor);
+            } break;
+        case GGML_OP_DIV:
+            {
+                ggml_compute_forward_div(params, tensor);
+            } break;
+        case GGML_OP_SQR:
+            {
+                ggml_compute_forward_sqr(params, tensor);
+            } break;
+        case GGML_OP_SQRT:
+            {
+                ggml_compute_forward_sqrt(params, tensor);
+            } break;
+        case GGML_OP_LOG:
+            {
+                ggml_compute_forward_log(params, tensor);
+            } break;
+        case GGML_OP_SIN:
+            {
+                ggml_compute_forward_sin(params, tensor);
+            } break;
+        case GGML_OP_COS:
+            {
+                ggml_compute_forward_cos(params, tensor);
+            } break;
+        case GGML_OP_SUM:
+            {
+                ggml_compute_forward_sum(params, tensor);
+            } break;
+        case GGML_OP_SUM_ROWS:
+            {
+                ggml_compute_forward_sum_rows(params, tensor);
+            } break;
+        case GGML_OP_MEAN:
+            {
+                ggml_compute_forward_mean(params, tensor);
+            } break;
+        case GGML_OP_ARGMAX:
+            {
+                ggml_compute_forward_argmax(params, tensor);
+            } break;
+        case GGML_OP_COUNT_EQUAL:
+            {
+                ggml_compute_forward_count_equal(params, tensor);
+            } break;
+        case GGML_OP_REPEAT:
+            {
+                ggml_compute_forward_repeat(params, tensor);
+            } break;
+        case GGML_OP_REPEAT_BACK:
+            {
+                ggml_compute_forward_repeat_back(params, tensor);
+            } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_compute_forward_concat(params, tensor);
+            } break;
+        case GGML_OP_SILU_BACK:
+            {
+                ggml_compute_forward_silu_back(params, tensor);
+            } break;
+        case GGML_OP_NORM:
+            {
+                ggml_compute_forward_norm(params, tensor);
+            } break;
+        case GGML_OP_RMS_NORM:
+            {
+                ggml_compute_forward_rms_norm(params, tensor);
+            } break;
+        case GGML_OP_RMS_NORM_BACK:
+            {
+                ggml_compute_forward_rms_norm_back(params, tensor);
+            } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                ggml_compute_forward_group_norm(params, tensor);
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                ggml_compute_forward_mul_mat(params, tensor);
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_compute_forward_mul_mat_id(params, tensor);
+            } break;
+        case GGML_OP_OUT_PROD:
+            {
+                ggml_compute_forward_out_prod(params, tensor);
+            } break;
+        case GGML_OP_SCALE:
+            {
+                ggml_compute_forward_scale(params, tensor);
+            } break;
+        case GGML_OP_SET:
+            {
+                ggml_compute_forward_set(params, tensor);
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_compute_forward_cpy(params, tensor);
+            } break;
+        case GGML_OP_CONT:
+            {
+                ggml_compute_forward_cont(params, tensor);
+            } break;
+        case GGML_OP_RESHAPE:
+            {
+                ggml_compute_forward_reshape(params, tensor);
+            } break;
+        case GGML_OP_VIEW:
+            {
+                ggml_compute_forward_view(params, tensor);
+            } break;
+        case GGML_OP_PERMUTE:
+            {
+                ggml_compute_forward_permute(params, tensor);
+            } break;
+        case GGML_OP_TRANSPOSE:
+            {
+                ggml_compute_forward_transpose(params, tensor);
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                ggml_compute_forward_get_rows(params, tensor);
+            } break;
+        case GGML_OP_GET_ROWS_BACK:
+            {
+                ggml_compute_forward_get_rows_back(params, tensor);
+            } break;
+        case GGML_OP_DIAG:
+            {
+                ggml_compute_forward_diag(params, tensor);
+            } break;
+        case GGML_OP_DIAG_MASK_INF:
+            {
+                ggml_compute_forward_diag_mask_inf(params, tensor);
+            } break;
+        case GGML_OP_DIAG_MASK_ZERO:
+            {
+                ggml_compute_forward_diag_mask_zero(params, tensor);
+            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                ggml_compute_forward_soft_max(params, tensor);
+            } break;
+        case GGML_OP_SOFT_MAX_BACK:
+            {
+                ggml_compute_forward_soft_max_ext_back(params, tensor);
+            } break;
+        case GGML_OP_ROPE:
+            {
+                ggml_compute_forward_rope(params, tensor);
+            } break;
+        case GGML_OP_ROPE_BACK:
+            {
+                ggml_compute_forward_rope_back(params, tensor);
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                ggml_compute_forward_clamp(params, tensor);
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                ggml_compute_forward_conv_transpose_1d(params, tensor);
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                ggml_compute_forward_im2col(params, tensor);
+            } break;
+        case GGML_OP_IM2COL_BACK:
+            {
+                ggml_compute_forward_im2col_back_f32(params, tensor);
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                ggml_compute_forward_conv_transpose_2d(params, tensor);
+            } break;
+        case GGML_OP_POOL_1D:
+            {
+                ggml_compute_forward_pool_1d(params, tensor);
+            } break;
+        case GGML_OP_POOL_2D:
+            {
+                ggml_compute_forward_pool_2d(params, tensor);
+            } break;
+        case GGML_OP_POOL_2D_BACK:
+            {
+                ggml_compute_forward_pool_2d_back(params, tensor);
+            } break;
+        case GGML_OP_UPSCALE:
+            {
+                ggml_compute_forward_upscale(params, tensor);
+            } break;
+        case GGML_OP_PAD:
+            {
+                ggml_compute_forward_pad(params, tensor);
+            } break;
+        case GGML_OP_PAD_REFLECT_1D:
+            {
+                ggml_compute_forward_pad_reflect_1d(params, tensor);
+            } break;
+        case GGML_OP_ARANGE:
+            {
+                ggml_compute_forward_arange(params, tensor);
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                ggml_compute_forward_timestep_embedding(params, tensor);
+            } break;
+        case GGML_OP_ARGSORT:
+            {
+                ggml_compute_forward_argsort(params, tensor);
+            } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                ggml_compute_forward_leaky_relu(params, tensor);
+            } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
+        case GGML_OP_FLASH_ATTN_BACK:
+            {
+                int32_t t = ggml_get_op_params_i32(tensor, 0);
+                GGML_ASSERT(t == 0 || t == 1);
+                bool masked = t != 0;
+                ggml_compute_forward_flash_attn_back(params, masked, tensor);
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                ggml_compute_forward_ssm_conv(params, tensor);
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                ggml_compute_forward_ssm_scan(params, tensor);
+            } break;
+        case GGML_OP_WIN_PART:
+            {
+                ggml_compute_forward_win_part(params, tensor);
+            } break;
+        case GGML_OP_WIN_UNPART:
+            {
+                ggml_compute_forward_win_unpart(params, tensor);
+            } break;
+        case GGML_OP_UNARY:
+            {
+                ggml_compute_forward_unary(params, tensor);
+            } break;
+        case GGML_OP_GET_REL_POS:
+            {
+                ggml_compute_forward_get_rel_pos(params, tensor);
+            } break;
+        case GGML_OP_ADD_REL_POS:
+            {
+                ggml_compute_forward_add_rel_pos(params, tensor);
+            } break;
+        case GGML_OP_RWKV_WKV6:
+            {
+                ggml_compute_forward_rwkv_wkv6(params, tensor);
+            } break;
+        case GGML_OP_GATED_LINEAR_ATTN:
+            {
+                ggml_compute_forward_gla(params, tensor);
+            } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                ggml_unary_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_unary(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                ggml_binary_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_binary(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1_F32:
+            {
+                ggml_custom1_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom1_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2_F32:
+            {
+                ggml_custom2_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom2_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                ggml_custom3_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom3_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor);
+            }
+            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            {
+                ggml_compute_forward_cross_entropy_loss(params, tensor);
+            }
+            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+            {
+                ggml_compute_forward_cross_entropy_loss_back(params, tensor);
+            }
+            break;
+        case GGML_OP_OPT_STEP_ADAMW:
+            {
+                ggml_compute_forward_opt_step_adamw(params, tensor);
+            }
+            break;
+        case GGML_OP_NONE:
+            {
+                // nop
+            } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// Android's libc implementation "bionic" does not support setting affinity
+#if defined(__gnu_linux__)
+static void set_numa_thread_affinity(int thread_n) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    int node_num;
+    int rv;
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    switch(g_state.numa.numa_strategy) {
+        case GGML_NUMA_STRATEGY_DISTRIBUTE:
+            // run thread on node_num thread_n / (threads per node)
+            node_num = thread_n % g_state.numa.n_nodes;
+            break;
+        case GGML_NUMA_STRATEGY_ISOLATE:
+            // run thread on current_node
+            node_num = g_state.numa.current_node;
+            break;
+        case GGML_NUMA_STRATEGY_NUMACTL:
+            // use the cpuset that numactl gave us
+            rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+            if (rv) {
+                fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
+            }
+            return;
+        default:
+            return;
+    }
+
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (size_t i = 0; i < node->n_cpus; ++i) {
+        CPU_SET_S(node->cpus[i], setsize, cpus);
+    }
+
+    rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+
+static void clear_numa_thread_affinity(void) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
+        CPU_SET_S(i, setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+#else
+// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test)
+static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n);  }
+static void clear_numa_thread_affinity(void) {}
+#endif
+
+static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+    int n_tasks = 0;
+
+    if (ggml_is_empty(node)) {
+        // no need to multi-thread a no-op
+        n_tasks = 1;
+        return n_tasks;
+    }
+
+    switch (node->op) {
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_ACC:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_SUB:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_SUM:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
+        case GGML_OP_ARGMAX:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_COUNT_EQUAL:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_REPEAT:
+        case GGML_OP_REPEAT_BACK:
+        case GGML_OP_LEAKY_RELU:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(node)) {
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_SGN:
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_EXP:
+                    {
+                        n_tasks = 1;
+                    } break;
+
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_SILU:
+                    {
+                        n_tasks = n_threads;
+                    } break;
+                default:
+                    GGML_ABORT("fatal error");
+            }
+            break;
+        case GGML_OP_SILU_BACK:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_RMS_NORM_BACK:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_CONCAT:
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_OUT_PROD:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                // FIXME: get_rows can use additional threads, but the cost of launching additional threads
+                // decreases performance with GPU offloading
+                //n_tasks = n_threads;
+                n_tasks = 1;
+            } break;
+        case GGML_OP_SCALE:
+        case GGML_OP_SET:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_GET_ROWS_BACK:
+        case GGML_OP_DIAG:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX_BACK:
+        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+        case GGML_OP_ADD_REL_POS:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                n_tasks = 1; //TODO
+            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
+            } break;
+        case GGML_OP_IM2COL:
+        case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_TRANSPOSE_1D:
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_POOL_1D:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_POOL_2D_BACK:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_FLASH_ATTN_EXT:
+        case GGML_OP_FLASH_ATTN_BACK:
+        case GGML_OP_SSM_CONV:
+        case GGML_OP_SSM_SCAN:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_RWKV_WKV6:
+        case GGML_OP_GATED_LINEAR_ATTN:
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                struct ggml_map_custom1_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p.n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                struct ggml_map_custom2_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p.n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                struct ggml_map_custom3_op_params p;
+                memcpy(&p, node->op_params, sizeof(p));
+                if (p.n_tasks == GGML_N_TASKS_MAX) {
+                    n_tasks = n_threads;
+                } else {
+                    n_tasks = MIN(p.n_tasks, n_threads);
+                }
+            } break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+        case GGML_OP_OPT_STEP_ADAMW:
+            {
+                n_tasks = n_threads;
+            } break;
+        case GGML_OP_NONE:
+            {
+                n_tasks = 1;
+            } break;
+        case GGML_OP_COUNT:
+            {
+                GGML_ABORT("fatal error");
+            }
+        default:
+            {
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
+                GGML_ABORT("fatal error");
+            }
+    }
+
+    assert(n_tasks > 0);
+
+    return n_tasks;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
+
+#if defined(_WIN32)
+#include "windows.h"
+
+// TODO: support > 64 CPUs
+static bool ggml_thread_apply_affinity(bool * mask) {
+    HANDLE    h = GetCurrentThread();
+    uint64_t  bitmask = 0ULL;
+
+    assert(GGML_MAX_N_THREADS >= 64);
+
+    for (int32_t i = 0; i < 8; i++) {
+        int32_t idx = i * 8;
+        uint8_t val = 0;
+        val |= mask[idx + 0] << 0;
+        val |= mask[idx + 1] << 1;
+        val |= mask[idx + 2] << 2;
+        val |= mask[idx + 3] << 3;
+        val |= mask[idx + 4] << 4;
+        val |= mask[idx + 5] << 5;
+        val |= mask[idx + 6] << 6;
+        val |= mask[idx + 7] << 7;
+        bitmask |= (uint64_t)val << idx;
+    }
+
+    for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
+            break;
+        }
+    }
+
+    DWORD_PTR m = (DWORD_PTR)bitmask;
+
+    m = SetThreadAffinityMask(h, m);
+
+    return m != 0;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
+    // This is up to the applications.
+    DWORD p = THREAD_PRIORITY_NORMAL;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
+        case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
+        case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+    }
+
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        // Keep inherited policy/priority
+        return true;
+    }
+
+    if (!SetThreadPriority(GetCurrentThread(), p)) {
+        fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
+        return false;
+    }
+
+    return true;
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+    // Not supported on Apple platforms
+    UNUSED(mask);
+    return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        // Keep inherited policy/priority
+        return true;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+        return false;
+    }
+
+    return true;
+}
+
+#elif defined(__gnu_linux__)
+// TODO: this may not work on BSD, to be verified
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+    cpu_set_t cpuset;
+    int err;
+
+    CPU_ZERO(&cpuset);
+
+    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            CPU_SET(i, &cpuset);
+        }
+    }
+
+#ifdef __ANDROID__
+    err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+    if (err < 0) {
+        err = errno;
+    }
+#else
+    err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+    if (err != 0) {
+        fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        // Keep inherited policy/priority
+        return true;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
+        return false;
+    }
+
+    return true;
+}
+
+#else // unsupported platforms
+
+static bool ggml_thread_apply_affinity(const bool * mask) {
+    UNUSED(mask);
+    return true;
+}
+
+static bool ggml_thread_apply_priority(int32_t prio) {
+    UNUSED(prio);
+    return true;
+}
+
+#endif
+
+static bool ggml_thread_cpumask_is_valid(const bool * mask) {
+    for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) { return true; }
+    }
+    return false;
+}
+
+static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
+    if (!strict) {
+        memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
+        return;
+    } else {
+        memset(local_mask, 0, GGML_MAX_N_THREADS);
+        int32_t base_idx = *iter;
+        for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+            int32_t idx = base_idx + i;
+            if (idx >= GGML_MAX_N_THREADS) {
+                // Just a cheaper modulo
+                idx -= GGML_MAX_N_THREADS;
+            }
+            if (global_mask[idx]) {
+                local_mask[idx] = 1;
+                *iter = idx + 1;
+                return;
+            }
+        }
+    }
+}
+
+void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
+    if (!threadpool) return;
+
+    const int n_threads = threadpool->n_threads_max;
+
+#ifndef GGML_USE_OPENMP
+    struct ggml_compute_state* workers = threadpool->workers;
+
+    ggml_mutex_lock(&threadpool->mutex);
+
+    threadpool->stop = true;
+    threadpool->pause = false;
+
+    ggml_cond_broadcast(&threadpool->cond);
+    ggml_mutex_unlock(&threadpool->mutex);
+
+    for (int j = 1; j < n_threads; j++) {
+        int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
+        UNUSED(rc);
+    }
+
+    ggml_mutex_destroy(&threadpool->mutex);
+    ggml_cond_destroy(&threadpool->cond);
+#endif // GGML_USE_OPENMP
+
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
+}
+
+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+    threadpool->pause = false;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
+void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    ggml_mutex_lock(&threadpool->mutex);
+    if (!threadpool->pause) {
+       ggml_threadpool_pause_locked(threadpool);
+    }
+    ggml_mutex_unlock(&threadpool->mutex);
+#else
+    UNUSED(threadpool);
+#endif
+}
+
+void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    ggml_mutex_lock(&threadpool->mutex);
+    if (threadpool->pause) {
+       ggml_threadpool_resume_locked(threadpool);
+    }
+    ggml_mutex_unlock(&threadpool->mutex);
+#else
+    UNUSED(threadpool);
+#endif
+}
+
+struct ggml_cplan ggml_graph_plan(
+          const struct ggml_cgraph * cgraph,
+                               int   n_threads,
+            struct ggml_threadpool * threadpool) {
+
+    if (threadpool == NULL) {
+        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+    }
+    if (n_threads <= 0) {
+        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+    }
+
+    size_t work_size = 0;
+
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    int max_tasks = 1;
+
+    // thread scheduling for the different operations + work buffer size estimation
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
+        max_tasks = MAX(max_tasks, n_tasks);
+
+        size_t cur = 0;
+
+        if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
+
+            switch (node->op) {
+                case GGML_OP_CPY:
+                case GGML_OP_DUP:
+                    {
+                        if (ggml_is_quantized(node->type) ||
+                            // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+                            (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+                            (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_ADD:
+                case GGML_OP_ADD1:
+                    {
+                        if (ggml_is_quantized(node->src[0]->type)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_ACC:
+                    {
+                        if (ggml_is_quantized(node->src[0]->type)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_COUNT_EQUAL:
+                    {
+                        cur = ggml_type_size(node->type)*n_tasks;
+                    } break;
+                case GGML_OP_MUL_MAT:
+                    {
+                        const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
+
+                        if (node->src[1]->type != vec_dot_type) {
+                            cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        }
+                    } break;
+                case GGML_OP_MUL_MAT_ID:
+                    {
+                        cur = 0;
+                        const struct ggml_tensor * src0 = node->src[0];
+                        const struct ggml_tensor * src1 = node->src[1];
+                        const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+                        if (src1->type != vec_dot_type) {
+                            cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        }
+                        const int n_as = src0->ne[2];
+                        cur += GGML_PAD(cur, sizeof(int64_t));       // align
+                        cur += n_as * sizeof(int64_t);               // matrix_row_counts
+                        cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    } break;
+                case GGML_OP_OUT_PROD:
+                    {
+                        if (ggml_is_quantized(node->src[0]->type)) {
+                            cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+                        }
+                    } break;
+                case GGML_OP_SOFT_MAX:
+                case GGML_OP_ROPE:
+                case GGML_OP_ROPE_BACK:
+                    {
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                    } break;
+                case GGML_OP_CONV_TRANSPOSE_1D:
+                    {
+                        GGML_ASSERT(node->src[0]->ne[3] == 1);
+                        GGML_ASSERT(node->src[1]->ne[2] == 1);
+                        GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+                        const int64_t ne00 = node->src[0]->ne[0];  // K
+                        const int64_t ne01 = node->src[0]->ne[1];  // Cout
+                        const int64_t ne02 = node->src[0]->ne[2];  // Cin
+                        const int64_t ne10 = node->src[1]->ne[0];  // L
+                        const int64_t ne11 = node->src[1]->ne[1];  // Cin
+
+                        if ((node->src[0]->type == GGML_TYPE_F16 ||
+                             node->src[0]->type == GGML_TYPE_BF16) &&
+                            node->src[1]->type == GGML_TYPE_F32) {
+                            cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
+                            cur += sizeof(ggml_fp16_t)*ne10*ne11;
+                        } else if (node->src[0]->type == GGML_TYPE_F32 &&
+                                   node->src[1]->type == GGML_TYPE_F32) {
+                            cur += sizeof(float)*ne00*ne01*ne02;
+                            cur += sizeof(float)*ne10*ne11;
+                        } else {
+                            GGML_ABORT("fatal error");
+                        }
+                    } break;
+                case GGML_OP_CONV_TRANSPOSE_2D:
+                    {
+                        const int64_t ne00 = node->src[0]->ne[0]; // W
+                        const int64_t ne01 = node->src[0]->ne[1]; // H
+                        const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                        const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+                        const int64_t ne10 = node->src[1]->ne[0]; // W
+                        const int64_t ne11 = node->src[1]->ne[1]; // H
+                        const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                        cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+                    } break;
+                case GGML_OP_FLASH_ATTN_EXT:
+                    {
+                        const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                        cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
+                    } break;
+                case GGML_OP_FLASH_ATTN_BACK:
+                    {
+                        const int64_t    D = node->src[0]->ne[0];
+                        const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
+                        const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
+                        if (node->src[1]->type == GGML_TYPE_F32) {
+                            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                        } else if (node->src[1]->type == GGML_TYPE_F16) {
+                            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                        } else if (node->src[1]->type == GGML_TYPE_BF16) {
+                            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+                        }
+                    } break;
+
+                case GGML_OP_CROSS_ENTROPY_LOSS:
+                    {
+                        cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
+                    } break;
+                case GGML_OP_COUNT:
+                    {
+                        GGML_ABORT("fatal error");
+                    }
+                default:
+                    break;
+            }
+        }
+
+        work_size = MAX(work_size, cur);
+    }
+
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads);
+    }
+
+    cplan.threadpool = threadpool;
+    cplan.n_threads  = MIN(max_tasks, n_threads);
+    cplan.work_size  = work_size;
+    cplan.work_data  = NULL;
+
+    return cplan;
+}
+
+static thread_ret_t ggml_graph_compute_thread(void * data) {
+    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_threadpool    * tp    = state->threadpool;
+
+    const struct ggml_cgraph * cgraph = tp->cgraph;
+    const struct ggml_cplan  * cplan  = tp->cplan;
+
+    set_numa_thread_affinity(state->ith);
+
+    struct ggml_compute_params params = {
+        /*.ith       =*/ state->ith,
+        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.wsize     =*/ cplan->work_size,
+        /*.wdata     =*/ cplan->work_data,
+        /*.threadpool=*/ tp,
+    };
+
+    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
+        struct ggml_tensor * node = cgraph->nodes[node_n];
+
+        ggml_compute_forward(&params, node);
+
+        if (state->ith == 0 && cplan->abort_callback &&
+                cplan->abort_callback(cplan->abort_callback_data)) {
+            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
+            tp->ec    = GGML_STATUS_ABORTED;
+        }
+
+        if (node_n + 1 < cgraph->n_nodes) {
+            ggml_barrier(state->threadpool);
+        }
+    }
+
+    ggml_barrier(state->threadpool);
+
+    return 0;
+}
+
+#ifndef GGML_USE_OPENMP
+
+// check if thread is active
+static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+    return (state->ith < n_threads);
+}
+
+// check if thread is ready to proceed (exit from polling or sleeping)
+static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    if (state->pending || threadpool->stop || threadpool->pause) { return true; }
+
+    // check for new graph/work
+    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    if (new_graph != state->last_graph) {
+        state->pending    = ggml_graph_compute_thread_active(state);
+        state->last_graph = new_graph;
+    }
+
+    return state->pending;
+}
+
+// sync thread state after polling
+static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+    UNUSED(state);
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    // Skip polling for unused threads
+    if (!ggml_graph_compute_thread_active(state)) {
+        return state->pending;
+    }
+
+    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
+    // Perhaps, we can adjust it dynamically based on load and things.
+    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+    for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
+        // No new work. Keep polling.
+        ggml_thread_cpu_relax();
+    }
+
+    return state->pending;
+}
+
+static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    if (ggml_graph_compute_poll_for_work(state)) {
+        ggml_graph_compute_thread_sync(state);
+        return state->pending;
+    }
+
+    ggml_mutex_lock_shared(&threadpool->mutex);
+    while (!ggml_graph_compute_thread_ready(state)) {
+        // No new work. Wait for the signal.
+        GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
+        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+    }
+    ggml_mutex_unlock_shared(&threadpool->mutex);
+
+    return state->pending;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
+    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_threadpool * threadpool = state->threadpool;
+
+    ggml_thread_apply_priority(threadpool->prio);
+    if (ggml_thread_cpumask_is_valid(state->cpumask)) {
+        ggml_thread_apply_affinity(state->cpumask);
+    }
+
+    while (true) {
+        // Check if we need to sleep
+        while (threadpool->pause) {
+            GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
+            ggml_mutex_lock_shared(&threadpool->mutex);
+            if (threadpool->pause) {
+                ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+            }
+            GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
+            ggml_mutex_unlock_shared(&threadpool->mutex);
+        }
+
+        // This needs to be checked for after the cond_wait
+        if (threadpool->stop) break;
+
+        // Check if there is new work
+        // The main thread is the only one that can dispatch new work
+
+        ggml_graph_compute_check_for_work(state);
+        if (state->pending) {
+            state->pending = false;
+
+            ggml_graph_compute_thread(state);
+        }
+    }
+
+    return (thread_ret_t) 0;
+}
+
+// Start processing new graph
+static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
+{
+    // Always take the mutex here because the worker threads are doing hybrid poll/wait
+
+    ggml_mutex_lock(&threadpool->mutex);
+
+    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+
+    // Update the number of active threads
+    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+
+    // Indicate the graph is ready to be processed
+    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
+    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+
+    if (threadpool->pause) {
+       // Update main thread prio and affinity to match the threadpool settings
+       ggml_thread_apply_priority(threadpool->prio);
+       if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+           ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+       }
+
+       // resume does cond broadcast
+       ggml_threadpool_resume_locked(threadpool);
+    } else {
+       ggml_cond_broadcast(&threadpool->cond);
+    }
+
+    ggml_mutex_unlock(&threadpool->mutex);
+}
+
+#endif // GGML_USE_OPENMP
+
+static struct ggml_threadpool * ggml_threadpool_new_impl(
+    struct ggml_threadpool_params * tpp,
+               struct ggml_cgraph * cgraph,
+                struct ggml_cplan * cplan) {
+
+    struct ggml_threadpool * threadpool =
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
+    {
+        threadpool->cgraph           = cgraph;
+        threadpool->cplan            = cplan;
+        threadpool->n_graph          = 0;
+        threadpool->n_barrier        = 0;
+        threadpool->n_barrier_passed = 0;
+        threadpool->current_chunk    = 0;
+        threadpool->stop             = false;
+        threadpool->pause            = tpp->paused;
+        threadpool->abort            = -1;
+        threadpool->workers          = NULL;
+        threadpool->n_threads_max    = tpp->n_threads;
+        threadpool->n_threads_cur    = tpp->n_threads;
+        threadpool->poll             = tpp->poll;
+        threadpool->prio             = tpp->prio;
+        threadpool->ec               = GGML_STATUS_SUCCESS;
+    }
+
+    // Allocate and init workers state
+    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
+
+    memset(workers, 0, workers_size);
+    for (int j = 0; j < tpp->n_threads; j++) {
+        workers[j].threadpool = threadpool;
+        workers[j].ith        = j;
+    }
+
+    threadpool->workers = workers;
+
+#ifndef GGML_USE_OPENMP
+    ggml_mutex_init(&threadpool->mutex);
+    ggml_cond_init(&threadpool->cond);
+
+    // Spin the threads for all workers, and update CPU placements.
+    // Place the main thread last (towards the higher numbered CPU cores).
+
+    int32_t cpumask_iter = 0;
+
+    for (int j = 1; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+        int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
+        GGML_ASSERT(rc == 0);
+    }
+
+    ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
+
+    if (!threadpool->pause) {
+        // Update main thread prio and affinity at the start, otherwise we'll do it in resume
+        ggml_thread_apply_priority(threadpool->prio);
+        if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
+            ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+        }
+    }
+#endif // GGML_USE_OPENMP
+
+    return threadpool;
+}
+
+struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
+    return ggml_threadpool_new_impl(tpp, NULL, NULL);
+}
+
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+    ggml_cpu_init();
+
+    GGML_ASSERT(cplan);
+    GGML_ASSERT(cplan->n_threads > 0);
+    GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
+
+    int n_threads                               = cplan->n_threads;
+    struct ggml_threadpool * threadpool = cplan->threadpool;
+
+    bool disposable_threadpool = false;
+
+    if (threadpool == NULL) {
+        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
+        disposable_threadpool = true;
+
+        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
+        threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
+    } else {
+        // Reset some of the parameters that need resetting
+        // No worker threads should be accessing the parameters below at this stage
+        threadpool->cgraph           = cgraph;
+        threadpool->cplan            = cplan;
+        threadpool->current_chunk    = 0;
+        threadpool->abort            = -1;
+        threadpool->ec               = GGML_STATUS_SUCCESS;
+    }
+
+#ifdef GGML_USE_OPENMP
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+            }
+
+            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+        }
+    } else {
+        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        ggml_graph_compute_thread(&threadpool->workers[0]);
+    }
+#else
+    if (n_threads > threadpool->n_threads_max) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
+        n_threads = threadpool->n_threads_max;
+    }
+
+    // Kick all threads to start the new graph
+    ggml_graph_compute_kickoff(threadpool, n_threads);
+
+    // This is a work thread too
+    ggml_graph_compute_thread(&threadpool->workers[0]);
+#endif
+
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    enum ggml_status ret = threadpool->ec;
+
+    if (disposable_threadpool) {
+        ggml_threadpool_free(threadpool);
+    }
+
+    return ret;
+}
+
+enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
+
+    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+
+    return ggml_graph_compute(cgraph, &cplan);
+}
+
+
+int ggml_cpu_has_avx(void) {
+#if defined(__AVX__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx_vnni(void) {
+#if defined(__AVXVNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx2(void) {
+#if defined(__AVX2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512(void) {
+#if defined(__AVX512F__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vbmi(void) {
+#if defined(__AVX512VBMI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vnni(void) {
+#if defined(__AVX512VNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_bf16(void) {
+#if defined(__AVX512BF16__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_amx_int8(void) {
+#if defined(__AMX_INT8__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_fma(void) {
+#if defined(__FMA__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_arm_fma(void) {
+#if defined(__ARM_FEATURE_FMA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_riscv_v(void) {
+#if defined(__riscv_v_intrinsic)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_f16c(void) {
+#if defined(__F16C__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_fp16_va(void) {
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_wasm_simd(void) {
+#if defined(__wasm_simd128__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_llamafile(void) {
+#if defined(GGML_USE_LLAMAFILE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_ssse3(void) {
+#if defined(__SSSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_vsx(void) {
+#if defined(__POWER9_VECTOR__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_neon(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_NEON)
+    return ggml_arm_arch_features.has_neon;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_dotprod(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+    return ggml_arm_arch_features.has_dotprod;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_sve(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
+    return ggml_arm_arch_features.has_sve;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_matmul_int8(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
+    return ggml_arm_arch_features.has_i8mm;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
+    return ggml_arm_arch_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
+
+void ggml_cpu_init(void) {
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    ggml_critical_section_start();
+
+    static bool is_first_call = true;
+
+    if (is_first_call) {
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
+            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+            for (int i = 0; i < (1 << 16); ++i) {
+                union {
+                    uint16_t u16;
+                    ggml_fp16_t fp16;
+                } u = {i};
+                float f = GGML_FP16_TO_FP32(u.fp16);
+                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+            }
+
+            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+        }
+
+#if defined(__ARM_ARCH)
+        ggml_init_arm_arch_features();
+#endif
+
+        is_first_call = false;
+    }
+
+    ggml_critical_section_end();
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
new file mode 100644
index 000000000..2ccb4b472
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -0,0 +1,637 @@
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-aarch64.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-impl.h"
+#include "amx/amx.h"
+
+#include <cctype>
+#include <string>
+#include <vector>
+
+#ifdef GGML_USE_CPU_HBM
+#include "ggml-cpu-hbm.h"
+#endif
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+// ggml-backend interface
+
+std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
+        std::vector<ggml_backend_buffer_type_t> bufts;
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+        if (ggml_backend_amx_buffer_type()) {
+            bufts.push_back(ggml_backend_amx_buffer_type());
+        }
+#endif
+
+#ifdef GGML_USE_CPU_AARCH64
+        if (ggml_backend_cpu_aarch64_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        }
+#endif
+
+        bufts.push_back(NULL);
+
+        return bufts;
+    }();
+
+    return bufts;
+}
+
+static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+    return ggml_backend_cpu_get_extra_buffers_type().data();
+
+    GGML_UNUSED(device);
+}
+
+static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) return true;
+    }
+    return false;
+}
+
+// CPU backend - backend (stream)
+
+struct ggml_backend_cpu_context {
+    int                 n_threads;
+    ggml_threadpool_t   threadpool;
+
+    uint8_t *           work_data;
+    size_t              work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
+};
+
+static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
+    return "CPU";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    delete[] cpu_ctx->work_data;
+    delete cpu_ctx;
+    delete backend;
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
+        if (cpu_plan->cplan.work_data == NULL) {
+            delete cpu_plan;
+            return NULL;
+        }
+    }
+
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    delete[] cpu_plan->cplan.work_data;
+    delete cpu_plan;
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    GGML_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        delete[] cpu_ctx->work_data;
+        cpu_ctx->work_data = new uint8_t[cplan.work_size];
+        if (cpu_ctx->work_data == NULL) {
+            cpu_ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        cpu_ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return ggml_graph_compute(cgraph, &cplan);
+}
+
+static const struct ggml_backend_i ggml_backend_cpu_i = {
+    /* .get_name                = */ ggml_backend_cpu_get_name,
+    /* .free                    = */ ggml_backend_cpu_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    // initialize CPU backend now to avoid slowing the first graph computation
+    ggml_cpu_init();
+
+    struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
+    if (ctx == NULL) {
+        return NULL;
+    }
+
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
+
+    ggml_backend_t cpu_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_cpu_guid(),
+        /* .interface = */ ggml_backend_cpu_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context   = */ ctx,
+    };
+
+    if (cpu_backend == NULL) {
+        delete ctx;
+        return NULL;
+    }
+
+    return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
+// CPU backend - device
+
+struct ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                TEXT("ProcessorNameString"),
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    TEXT("ProcessorNameString"),
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
+static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
+    return "CPU";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
+    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
+
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_cpu_device_get_name(dev);
+    props->description = ggml_backend_cpu_device_get_description(dev);
+    props->type        = ggml_backend_cpu_device_get_type(dev);
+    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_cpu_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
+        return true;
+    }
+
+    // extra_buffer_op?
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra) {
+            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            if (buf_extra && buf_extra->supports_op(dev, op)) {
+                return true;
+            }
+        }
+    }
+
+    // the other case need host buffer.
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return
+                op->type != GGML_TYPE_IQ3_XXS &&
+                op->type != GGML_TYPE_IQ3_S   &&
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ2_S   &&
+                op->type != GGML_TYPE_IQ1_S   &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+        case GGML_OP_SOFT_MAX_BACK: {
+            if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
+                return false;
+            }
+            float max_bias = 0.0f;
+
+            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
+
+            return max_bias == 0.0f;
+        }
+        case GGML_OP_IM2COL_BACK:
+            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
+        case GGML_OP_OUT_PROD:
+            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
+                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        default:
+            return true;
+    }
+}
+
+static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
+    /* .get_name             = */ ggml_backend_cpu_device_get_name,
+    /* .get_description      = */ ggml_backend_cpu_device_get_description,
+    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
+    /* .get_type             = */ ggml_backend_cpu_device_get_type,
+    /* .get_props            = */ ggml_backend_cpu_device_get_props,
+    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// CPU backend - backend (reg)
+
+static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
+    return "CPU";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_cpu_device_context ctx;
+    static ggml_backend_device ggml_backend_cpu_device = {
+        /* .iface   = */ ggml_backend_cpu_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ &ctx,
+    };
+
+    return &ggml_backend_cpu_device;
+}
+
+// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API
+static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        ggml_cpu_init();
+
+        std::vector<ggml_backend_feature> features;
+        if (ggml_cpu_has_sse3()) {
+            features.push_back({ "SSE3", "1" });
+        }
+        if (ggml_cpu_has_ssse3()) {
+            features.push_back({ "SSSE3", "1" });
+        }
+        if (ggml_cpu_has_avx()) {
+            features.push_back({ "AVX", "1" });
+        }
+        if (ggml_cpu_has_avx_vnni()) {
+            features.push_back({ "AVX_VNNI", "1" });
+        }
+        if (ggml_cpu_has_avx2()) {
+            features.push_back({ "AVX2", "1" });
+        }
+        if (ggml_cpu_has_f16c()) {
+            features.push_back({ "F16C", "1" });
+        }
+        if (ggml_cpu_has_fma()) {
+            features.push_back({ "FMA", "1" });
+        }
+        if (ggml_cpu_has_avx512()) {
+            features.push_back({ "AVX512", "1" });
+        }
+        if (ggml_cpu_has_avx512_vbmi()) {
+            features.push_back({ "AVX512_VBMI", "1" });
+        }
+        if (ggml_cpu_has_avx512_vnni()) {
+            features.push_back({ "AVX512_VNNI", "1" });
+        }
+        if (ggml_cpu_has_avx512_bf16()) {
+            features.push_back({ "AVX512_BF16", "1" });
+        }
+        if (ggml_cpu_has_amx_int8()) {
+            features.push_back({ "AMX_INT8", "1" });
+        }
+        if (ggml_cpu_has_neon()) {
+            features.push_back({ "NEON", "1" });
+        }
+        if (ggml_cpu_has_arm_fma()) {
+            features.push_back({ "ARM_FMA", "1" });
+        }
+        if (ggml_cpu_has_fp16_va()) {
+            features.push_back({ "FP16_VA", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
+        if (ggml_cpu_has_sve()) {
+            features.push_back({ "SVE", "1" });
+        }
+        if (ggml_cpu_has_dotprod()) {
+            features.push_back({ "DOTPROD", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
+        if (ggml_cpu_get_sve_cnt() > 0) {
+            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
+            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
+        }
+        if (ggml_cpu_has_riscv_v()) {
+            features.push_back({ "RISCV_V", "1" });
+        }
+        if (ggml_cpu_has_vsx()) {
+            features.push_back({ "VSX", "1" });
+        }
+        if (ggml_cpu_has_wasm_simd()) {
+            features.push_back({ "WASM_SIMD", "1" });
+        }
+        if (ggml_cpu_has_llamafile()) {
+            features.push_back({ "LLAMAFILE", "1" });
+        }
+    #ifdef GGML_USE_ACCELERATE
+        features.push_back({ "ACCELERATE", "1" });
+    #endif
+    #ifdef GGML_USE_CPU_HBM
+        features.push_back({ "CPU_HBM", "1" });
+    #endif
+    #ifdef GGML_USE_OPENMP
+        features.push_back({ "OPENMP", "1" });
+    #endif
+    #ifdef GGML_USE_CPU_AARCH64
+        features.push_back({ "AARCH64_REPACK", "1" });
+    #endif
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
+        return (void *)fct;
+    }
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
+        return (void *)fct;
+    }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cpu_get_features;
+    }
+    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
+        return (void *)ggml_backend_cpu_set_abort_callback;
+    }
+    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
+        return (void *)ggml_numa_init;
+    }
+    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
+        return (void *)ggml_is_numa;
+    }
+
+    // threadpool - TODO:  move to ggml-base
+    if (strcmp(name, "ggml_threadpool_new") == 0) {
+        return (void *)ggml_threadpool_new;
+    }
+    if (strcmp(name, "ggml_threadpool_free") == 0) {
+        return (void *)ggml_threadpool_free;
+    }
+    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
+        return (void *)ggml_backend_cpu_set_threadpool;
+    }
+
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
+    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    // init CPU feature detection
+    ggml_cpu_init();
+
+    static struct ggml_backend_reg ggml_backend_cpu_reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_cpu_reg_i,
+        /* .context     = */ NULL,
+    };
+
+    return &ggml_backend_cpu_reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
new file mode 100644
index 000000000..c22a66287
--- /dev/null
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -0,0 +1,2597 @@
+// Copyright 2024 Mozilla Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+//
+//                   _   _          ___ _      _   ___
+//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
+//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
+//                   \__|_|_||_\_, |___/____/_/ \_\___/
+//                             |__/
+//
+//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
+//
+//
+// This file implements multithreaded CPU matrix multiplication for the
+// common contiguous use case C = Aᵀ * B. These kernels are designed to
+// have excellent performance[1] for matrices that fit in the CPU cache
+// without imposing any overhead such as cache filling or malloc calls.
+//
+// This implementation does not guarantee any upper bound with rounding
+// errors, which grow along with k. Our goal's to maximally exploit the
+// hardware for performance, and then use whatever resources remain for
+// improving numerical accuracy.
+//
+// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
+//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wpedantic"
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
+#include "sgemm.h"
+#include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-quants.h"
+
+#include <atomic>
+#include <array>
+
+#ifdef _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE __attribute__((__noinline__))
+#endif
+
+#if defined(__ARM_NEON) || defined(__AVX512F__)
+#define VECTOR_REGISTERS 32
+#else
+#define VECTOR_REGISTERS 16
+#endif
+
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+namespace {
+
+inline float unhalf(ggml_fp16_t d) {
+    return GGML_FP16_TO_FP32(d);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// VECTORIZED ARITHMETIC OPERATIONS
+
+#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
+inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
+inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
+#endif  // __SSE__
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }
+inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }
+inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); }
+#endif // __AVX__
+
+#if defined(__AVX512F__)
+inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); }
+inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); }
+inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); }
+#endif // __AVX512F__
+
+#if defined(__ARM_NEON)
+inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); }
+inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); }
+inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); }
+#endif // __ARM_NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); }
+inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
+inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__MMA__)
+typedef vector unsigned char vec_t;
+typedef __vector_quad acc_t;
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// VECTORIZED FUSED MULTIPLY ADD
+
+/**
+ * Computes a * b + c.
+ */
+template <typename T, typename U>
+inline U madd(T a, T b, U c) {
+    return add(mul(a, b), c);
+}
+
+#if defined(__FMA__)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+template <>
+inline __m256 madd(__m256 a, __m256 b, __m256 c) {
+    return _mm256_fmadd_ps(a, b, c);
+}
+#endif
+#if defined(__AVX512F__)
+template <>
+inline __m512 madd(__m512 a, __m512 b, __m512 c) {
+    return _mm512_fmadd_ps(a, b, c);
+}
+#endif
+#if defined(__AVX512BF16__)
+template <>
+inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
+    return _mm512_dpbf16_ps(c, a, b);
+}
+template <>
+inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
+    return _mm256_dpbf16_ps(c, a, b);
+}
+#endif
+#endif
+
+#if defined(__ARM_FEATURE_FMA)
+template <>
+inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
+    return vfmaq_f32(c, b, a);
+}
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
+template <>
+inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
+    return vfmaq_f16(c, b, a);
+}
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// VECTORIZED HORIZONTAL SUM
+
+#if defined(__ARM_NEON)
+inline float hsum(float32x4_t x) {
+    return vaddvq_f32(x);
+}
+#endif // __ARM_NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
+inline float hsum(float16x8_t x) {
+    return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)),
+                                vcvt_f32_f16(vget_high_f16(x))));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+inline float hsum(__m128 x) {
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
+    x = _mm_add_ss(x, _mm_movehdup_ps(x));
+#else
+    __m128 t;
+    t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
+    x = _mm_add_ps(x, t);
+    t = _mm_movehl_ps(t, x);
+    x = _mm_add_ss(x, t);
+#endif
+    return _mm_cvtss_f32(x);
+}
+#endif
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+inline float hsum(__m256 x) {
+    return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1),
+                           _mm256_castps256_ps128(x)));
+}
+#endif // __AVX__
+
+#if defined(__AVX512F__)
+inline float hsum(__m512 x) {
+    return _mm512_reduce_add_ps(x);
+}
+#endif // __AVX512F__
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// VECTORIZED MEMORY LOADING
+
+template <typename T, typename U> T load(const U *);
+
+#if defined(__ARM_NEON)
+template <> inline float32x4_t load(const float *p) {
+    return vld1q_f32(p);
+}
+#if !defined(_MSC_VER)
+// FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <> inline float16x8_t load(const ggml_fp16_t *p) {
+    return vld1q_f16((const float16_t *)p);
+}
+template <> inline float32x4_t load(const ggml_fp16_t *p) {
+    return vcvt_f32_f16(vld1_f16((const float16_t *)p));
+}
+#endif // _MSC_VER
+#endif // __ARM_NEON
+
+#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+template <> inline __m128 load(const float *p) {
+    return _mm_loadu_ps(p);
+}
+#endif  // __SSE__
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+template <> inline __m256 load(const float *p) {
+    return _mm256_loadu_ps(p);
+}
+#endif // __AVX__
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+template <> inline __m256 load(const ggml_bf16_t *p) {
+    return _mm256_castsi256_ps(
+        _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
+}
+#endif // __AVX2__
+
+#if defined(__F16C__)
+template <> inline __m256 load(const ggml_fp16_t *p) {
+    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
+}
+#endif // __F16C__
+
+#if defined(__AVX512F__)
+template <> inline __m512 load(const float *p) {
+    return _mm512_loadu_ps(p);
+}
+template <> inline __m512 load(const ggml_fp16_t *p) {
+    return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
+}
+template <> inline __m512 load(const ggml_bf16_t *p) {
+    return _mm512_castsi512_ps(
+        _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
+}
+#endif // __AVX512F__
+
+#if defined(__AVX512BF16__)
+template <> inline __m512bh load(const ggml_bf16_t *p) {
+    return (__m512bh)_mm512_loadu_ps((const float *)p);
+}
+template <> inline __m256bh load(const ggml_bf16_t *p) {
+    return (__m256bh)_mm256_loadu_ps((const float *)p);
+}
+template <> inline __m512bh load(const float *p) {
+    return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
+}
+template <> inline __m256bh load(const float *p) {
+    return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CONSTANTS
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// FLOATING POINT MATRIX MULTIPLICATION
+
+template <int M>
+static inline int64_t BLOCK_SIZE(size_t m) {
+    const int64_t NB_BLOC_M = (m + M - 1) / M;
+    return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
+}
+
+static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
+    return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
+}
+
+template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
+class tinyBLAS {
+  public:
+    tinyBLAS(const ggml_compute_params * params, int64_t k,
+             const TA *A, int64_t lda,
+             const TB *B, int64_t ldb,
+             TC *C, int64_t ldc)
+        : params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
+    }
+
+    bool matmul(int64_t m, int64_t n) {
+        if (k % KN != 0)
+            return false;
+        // compute RM for only need tile with size RM&RM-1
+#if VECTOR_REGISTERS == 32
+        if (m % 16 == 0 && (m/16 >= params->nth)) {
+            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
+            mnpack<4, 6, 4>(m, n, SIZE_N, 12);
+            return true;
+        }
+        if (m % 8 == 0 ) {
+            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
+            mnpack<4, 6, 2>(m, n, SIZE_N, 12);
+            return true;
+        }
+        if (m % 4 == 0) {
+            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
+            mnpack<4, 6, 1>(m, n, SIZE_N, 12);
+            return true;
+        }
+#else  // VECTOR_REGISTERS == 16
+        if (m % 16 == 0 && (m/16 >= params->nth)) {
+            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
+            mnpack<4, 3, 4>(m, n, SIZE_N, 24);
+            return true;
+        }
+        if (m % 8 == 0 ) {
+            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
+            mnpack<4, 3, 2>(m, n, SIZE_N, 24);
+            return true;
+        }
+        if (m % 4 == 0) {
+            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
+            mnpack<4, 3, 1>(m, n, SIZE_N, 24);
+            return true;
+        }
+#endif
+        return false;
+    }
+
+  private:
+    template <int RM, int RN, int BM>
+    inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
+        if (SIZE_N == RN) {
+            return gemm<RM, RN, BM>(m, n, BN);
+        }
+        if constexpr (RN > 1) {
+            return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
+        } else {
+            GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
+            GGML_ASSERT(false); // we have miss something.
+        }
+    }
+
+    template <int RM, int RN>
+    inline void gemm_bloc(int64_t ii, int64_t jj) {
+        D Cv[RN][RM] = {};
+        for (int64_t l = 0; l < k; l += KN) {
+            // help compiler for op order.
+            if constexpr (RM <= RN) {
+                V Av[RM];
+                for (int64_t i = 0; i < RM; ++i) {
+                    Av[i] = load<V>(A + lda * (ii + i) + l);
+                }
+                for (int64_t j = 0; j < RN; ++j) {
+                    V Bv = load<V>(B + ldb * (jj + j) + l);
+                    for (int64_t i = 0; i < RM; ++i) {
+                        Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
+                    }
+                }
+            } else {
+                V Bv[RN];
+                for (int64_t j = 0; j < RN; ++j) {
+                    Bv[j] = load<V>(B + ldb * (jj + j) + l);
+                }
+                for (int64_t i = 0; i < RM; ++i) {
+                    V Av = load<V>(A + lda * (ii + i) + l);
+                    for (int64_t j = 0; j < RN; ++j) {
+                        Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
+                    }
+                }
+            }
+        }
+        for (int64_t j = 0; j < RN; ++j)
+            for (int64_t i = 0; i < RM; ++i)
+                C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+    }
+
+    template <int RM, int RN, int BM>
+    NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
+        static std::atomic<int64_t> current_chunk;
+
+        GGML_ASSERT(m % (RM * BM) == 0);
+        const int64_t ytiles = m / (RM * BM);
+        const int64_t xtiles = (n + RN -1) / RN;
+        const int64_t jj_RN = (xtiles - (xtiles * RN - n));
+
+        // "round" bloc_size to "nearest" BN
+        const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
+        const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
+        const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
+        const int64_t nb_job = ytiles * NB_BN;
+
+        if (params->ith == 0) {
+            GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
+            // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        int64_t job = params->ith;
+        while (job < nb_job) {
+            const int64_t ii = (job % ytiles) * RM * BM;
+            const int64_t jb =  job / ytiles;
+            const int64_t jr0 = BLOC_POS(jb  , jj_BN, SIZE_BN);
+            const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
+
+            const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
+            const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
+            const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
+
+            for (int64_t bi = 0; bi < BM * RM; bi += RM) {
+                int64_t jj = jj0;
+                for (; jj < jj1; jj += RN) {
+                    gemm_bloc<RM, RN>(ii + bi, jj);
+                }
+                if constexpr (RN > 1) {
+                    for (; jj < jj2; jj += RN - 1) {
+                        gemm_bloc<RM, RN-1>(ii + bi, jj);
+                    }
+                }
+                GGML_ASSERT(jj == jj2);
+            }
+
+            // next step.
+            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+        }
+
+        ggml_barrier(params->threadpool);
+        return;
+    }
+
+    const ggml_compute_params * params;
+    const TA *const A;
+    const TB *const B;
+    TC *const C;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// QUANT ZERO MATRIX MULTIPLICATION
+
+#if defined(__ARM_FEATURE_DOTPROD)
+template <typename TA>
+class tinyBLAS_Q0_ARM {
+  public:
+    tinyBLAS_Q0_ARM(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
+                    int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
+    }
+
+  private:
+    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
+        case 0x33:
+            mc = 3;
+            nc = 3;
+            gemm<3, 3>(m0, m, n0, n);
+            break;
+        case 0x32:
+            mc = 3;
+            nc = 2;
+            gemm<3, 2>(m0, m, n0, n);
+            break;
+        case 0x23:
+            mc = 2;
+            nc = 3;
+            gemm<2, 3>(m0, m, n0, n);
+            break;
+        case 0x22:
+            mc = 2;
+            nc = 2;
+            gemm<2, 2>(m0, m, n0, n);
+            break;
+        case 0x31:
+            mc = 3;
+            nc = 1;
+            gemm<3, 1>(m0, m, n0, n);
+            break;
+        case 0x13:
+            mc = 1;
+            nc = 3;
+            gemm<1, 3>(m0, m, n0, n);
+            break;
+        case 0x21:
+            mc = 2;
+            nc = 1;
+            gemm<2, 1>(m0, m, n0, n);
+            break;
+        case 0x12:
+            mc = 1;
+            nc = 2;
+            gemm<1, 2>(m0, m, n0, n);
+            break;
+        case 0x11:
+            mc = 1;
+            nc = 1;
+            gemm<1, 1>(m0, m, n0, n);
+            break;
+        default:
+            return;
+        }
+        mp = m0 + (m - m0) / mc * mc;
+        np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            float32x4_t Cv[RN][RM] = {};
+            for (int64_t l = 0; l < k; ++l)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
+                        Cv[j][i] = vmlaq_n_f32(Cv[j][i],
+                                               vcvtq_f32_s32(vdotq_s32(
+                                                   vdotq_s32(vdupq_n_s32(0),
+                                                             load_lo(A + lda * (ii + i) + l),
+                                                             load_lo(B + ldb * (jj + j) + l)),
+                                                   load_hi(A + lda * (ii + i) + l),
+                                                   load_hi(B + ldb * (jj + j) + l))),
+                                               unhalf(A[lda * (ii + i) + l].d) *
+                                               unhalf(B[ldb * (jj + j) + l].d));
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
+                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+        }
+    }
+
+    inline int8x16_t load_lo(const block_q8_0 *b) {
+        return vld1q_s8(b->qs);
+    }
+
+    inline int8x16_t load_hi(const block_q8_0 *b) {
+        return vld1q_s8(b->qs + 16);
+    }
+
+    inline int8x16_t load_lo(const block_q4_0 *b) {
+        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs),
+                                                     vdupq_n_u8(0x0f))),
+                        vdupq_n_s8(0x8));
+    }
+
+    inline int8x16_t load_hi(const block_q4_0 *b) {
+        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)),
+                        vdupq_n_s8(0x8));
+    }
+
+    const TA *const A;
+    const block_q8_0 *const B;
+    float *const C;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+#endif // __ARM_FEATURE_DOTPROD
+
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+template <typename TA, typename TB, typename TC>
+class tinyBLAS_Q0_AVX {
+  public:
+    tinyBLAS_Q0_AVX(int64_t k,
+                    const TA *A, int64_t lda,
+                    const TB *B, int64_t ldb,
+                    TC *C, int64_t ldc,
+                    int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
+    }
+
+  private:
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
+#if VECTOR_REGISTERS == 32
+        case 0x44:
+            mc = 4;
+            nc = 4;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemm4xN<4>(m0, m, n0, n);
+#else
+            gemm<4, 4>(m0, m, n0, n);
+#endif
+            break;
+        case 0x43:
+            mc = 4;
+            nc = 3;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemm4xN<3>(m0, m, n0, n);
+#else
+            gemm<4, 3>(m0, m, n0, n);
+#endif
+            break;
+        case 0x34:
+            mc = 3;
+            nc = 4;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemmMx4<3>(m0, m, n0, n);
+#else
+            gemm<3, 4>(m0, m, n0, n);
+#endif
+            break;
+        case 0x33:
+            mc = 3;
+            nc = 3;
+            gemm<3, 3>(m0, m, n0, n);
+            break;
+        case 0x42:
+            mc = 4;
+            nc = 2;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemm4xN<2>(m0, m, n0, n);
+#else
+            gemm<4, 2>(m0, m, n0, n);
+#endif
+            break;
+        case 0x24:
+            mc = 2;
+            nc = 4;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemmMx4<2>(m0, m, n0, n);
+#else
+            gemm<2, 4>(m0, m, n0, n);
+#endif
+            break;
+#else
+        case 0x44:
+        case 0x43:
+        case 0x42:
+            mc = 4;
+            nc = 2;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemm4xN<2>(m0, m, n0, n);
+#else
+            gemm<4, 2>(m0, m, n0, n);
+#endif
+            break;
+        case 0x34:
+        case 0x24:
+            mc = 2;
+            nc = 4;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemmMx4<2>(m0, m, n0, n);
+#else
+            gemm<2, 4>(m0, m, n0, n);
+#endif
+            break;
+        case 0x33:
+#endif
+        case 0x32:
+            mc = 3;
+            nc = 2;
+            gemm<3, 2>(m0, m, n0, n);
+            break;
+        case 0x23:
+            mc = 2;
+            nc = 3;
+            gemm<2, 3>(m0, m, n0, n);
+            break;
+        case 0x41:
+            mc = 4;
+            nc = 1;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemm4xN<1>(m0, m, n0, n);
+#else
+            gemm<4, 1>(m0, m, n0, n);
+#endif
+            break;
+        case 0x22:
+            mc = 2;
+            nc = 2;
+            gemm<2, 2>(m0, m, n0, n);
+            break;
+        case 0x14:
+            mc = 1;
+            nc = 4;
+#if defined(__AVX2__) && defined(__F16C__)
+            gemmMx4<1>(m0, m, n0, n);
+#else
+            gemm<1, 4>(m0, m, n0, n);
+#endif
+            break;
+        case 0x31:
+            mc = 3;
+            nc = 1;
+            gemm<3, 1>(m0, m, n0, n);
+            break;
+        case 0x13:
+            mc = 1;
+            nc = 3;
+            gemm<1, 3>(m0, m, n0, n);
+            break;
+        case 0x21:
+            mc = 2;
+            nc = 1;
+            gemm<2, 1>(m0, m, n0, n);
+            break;
+        case 0x12:
+            mc = 1;
+            nc = 2;
+            gemm<1, 2>(m0, m, n0, n);
+            break;
+        case 0x11:
+            mc = 1;
+            nc = 1;
+            gemm<1, 1>(m0, m, n0, n);
+            break;
+        default:
+            return;
+        }
+        mp = m0 + (m - m0) / mc * mc;
+        np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+#if defined(__AVX2__) && defined(__F16C__)
+// Templated functions for gemm of dimensions 4xN
+    template <int RN>
+    NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / 4;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * 4;
+            int64_t jj = n0 + job % xtiles * RN;
+            __m256 Cv[RN][4] = {};
+            for (int64_t l = 0; l < k; ++l) {
+                uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
+                // Convert delta values for four blocks to float values
+                __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
+                __m256i avec0 = load(A + lda * (ii + 0) + l);
+                __m256i avec1 = load(A + lda * (ii + 1) + l);
+                __m256i avec2 = load(A + lda * (ii + 2) + l);
+                __m256i avec3 = load(A + lda * (ii + 3) + l);
+                for (int64_t j = 0; j < RN; ++j) {
+                        __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
+                        // Computation of product of delta values for four blocks and replicate it across 256 bit lane
+                        __m256 dvec =  _mm256_castps128_ps256(_mm_mul_ps(da, db));
+                        dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
+                        // Computation of dot product and multiplication with appropriate delta value products
+                        Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
+                                    updot(_mm256_sign_epi8(avec0, avec0),
+                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
+                                    Cv[j][0]);
+                        Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
+                                    updot(_mm256_sign_epi8(avec1, avec1),
+                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
+                                    Cv[j][1]);
+                        Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
+                                    updot(_mm256_sign_epi8(avec2, avec2),
+                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
+                                    Cv[j][2]);
+                        Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
+                                    updot(_mm256_sign_epi8(avec3, avec3),
+                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
+                                    Cv[j][3]);
+                }
+            }
+
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < 4; ++i)
+                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+        }
+    }
+
+    // Templated functions for gemm of dimensions Mx4
+    template <int RM>
+    NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / 4;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * 4;
+            __m256 Cv[4][RM] = {};
+            for (int64_t l = 0; l < k; ++l) {
+                uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
+                // Convert delta values for four blocks to float values
+                __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
+                __m256i bvec0 = load(B + ldb * (jj + 0) + l);
+                __m256i bvec1 = load(B + ldb * (jj + 1) + l);
+                __m256i bvec2 = load(B + ldb * (jj + 2) + l);
+                __m256i bvec3 = load(B + ldb * (jj + 3) + l);
+                for (int64_t i = 0; i < RM; ++i) {
+                    __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
+                    // Computation of product of delta values for four blocks and replicate it across 256 bit lane
+                    __m256 dvec =  _mm256_castps128_ps256(_mm_mul_ps(da, db));
+                    dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
+                    // Computation of dot product and multiplication with appropriate delta value products
+                    Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
+                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                            _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
+                                    Cv[0][i]);
+                    Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
+                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                            _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
+                                    Cv[1][i]);
+                    Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
+                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                            _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
+                                    Cv[2][i]);
+                    Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
+                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                            load(A + lda * (ii + i) + l)),
+                                            _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
+                                    Cv[3][i]);
+                }
+            }
+            for (int64_t j = 0; j < 4; ++j)
+                for (int64_t i = 0; i < RM; ++i)
+                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+        }
+    }
+#endif
+
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            __m256 Cv[RN][RM] = {};
+            for (int64_t l = 0; l < k; ++l)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i) {
+#if defined(__AVX2__)
+                        __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                              load(A + lda * (ii + i) + l)),
+                                             _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
+                                                              load(A + lda * (ii + i) + l)));
+#else
+                        __m128i ali0 = load0(A + lda * (ii + i) + l);
+                        __m128i ali1 = load1(A + lda * (ii + i) + l);
+                        __m128i blj0 = load0(B + ldb * (jj + j) + l);
+                        __m128i blj1 = load1(B + ldb * (jj + j) + l);
+
+                        __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
+                        __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
+                        __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
+                        __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
+
+                        // updot
+                        const __m128i oneFill = _mm_set1_epi16(1);
+                        __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
+                        __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
+                        __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
+#endif
+                        Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
+                                                       unhalf(B[ldb * (jj + j) + l].d)),
+                                                       udTmp,
+                                                       Cv[j][i]);
+                    }
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
+                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
+        }
+    }
+
+    inline __m256i load(const block_q8_0 *b) {
+        return _mm256_loadu_si256((const __m256i *)b->qs);
+    }
+
+    inline __m128i load0(const block_q8_0 *b) {
+        return _mm_loadu_si128((const __m128i *)b->qs);
+    }
+
+    inline __m128i load1(const block_q8_0 *b) {
+        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
+    }
+
+    inline __m256i load(const block_q4_0 *b) {
+        return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
+    }
+
+    inline __m128i load0(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
+    }
+
+    inline __m128i load1(const block_q4_0 *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
+    }
+
+    inline __m256i load(const block_q5_0 *b) {
+        return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh));
+    }
+
+    inline __m128i load0(const block_q5_0* b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        uint32_t x32;
+        memcpy(&x32, b->qh, sizeof(uint32_t));
+        __m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x);
+        __m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
+                                        _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
+                                                     _mm_shuffle_epi8(_mm_set1_epi32(x32),
+                                                                      _mm_set_epi64x(0x0101010101010101, 0x0000000000000000))));
+        bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0));
+        return _mm_or_si128(qxl, bytesl);
+    }
+
+    inline __m128i load1(const block_q5_0* b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        uint32_t x32;
+        memcpy(&x32, b->qh, sizeof(uint32_t));
+        __m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4));
+        __m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
+                                        _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
+                                                     _mm_shuffle_epi8(_mm_set1_epi32(x32),
+                                                                      _mm_set_epi64x(0x0303030303030303, 0x0202020202020202))));
+        bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0));
+        return _mm_or_si128(qxh, bytesh);
+    }
+
+    inline __m256i load(const block_iq4_nl *b) {
+        return MM256_SET_M128I(load1(b), load0(b));
+    }
+
+    inline __m128i load0(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+    }
+
+    inline __m128i load1(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+    }
+
+    inline __m256 updot(__m256i u, __m256i s) {
+        __m256i res;
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
+        res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
+#elif defined(__AVXVNNI__)
+        res = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), u, s);
+#else
+        res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
+#endif
+        return _mm256_cvtepi32_ps(res);
+    }
+
+    static inline __m256i denibble(const uint8_t *p) {
+        __m128i x = _mm_loadu_si128((const __m128i *)p);
+        return _mm256_and_si256(_mm256_set1_epi8(15),
+                                _mm256_insertf128_si256(_mm256_castsi128_si256(x),
+                                                        _mm_srli_epi16(x, 4), 1));
+    }
+
+    static inline __m256i bittobyte(const uint8_t *p) {
+        uint32_t x32;
+        memcpy(&x32, p, sizeof(uint32_t));
+        __m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
+                                          _mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe),
+                                                          _mm256_shuffle_epi8(_mm256_set1_epi32(x32),
+                                                                              _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
+                                                                                                0x0101010101010101, 0x0000000000000000))));
+        return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0));
+    }
+
+    const TA *const A;
+    const TB *const B;
+    TC *const C;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+#endif // __AVX__
+
+//PPC Implementation
+#if defined(__MMA__)
+
+#define SAVE_ACC(ACC, ii, jj) \
+   __builtin_mma_disassemble_acc(vec_C, ACC); \
+   for (int I = 0; I < 4; I++) { \
+      for (int J = 0; J < 4; J++) { \
+         *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
+      } \
+   } \
+
+template <typename TA, typename TB, typename TC>
+class tinyBLAS_Q0_PPC {
+  public:
+    tinyBLAS_Q0_PPC(int64_t k,
+                const TA *A, int64_t lda,
+                const TB *B, int64_t ldb,
+                TC *C, int64_t ldc,
+                int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
+    }
+
+  private:
+
+    template<int RM, int RN>
+    inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
+       for (int I = 0; I < RM; I++) {
+          for (int J = 0; J < RN; J++) {
+             *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
+          }
+       }
+    }
+
+    template<int size>
+    inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
+       vector signed int vec_C[4];
+       vector float CA[4] = {0};
+       vector float res[4] = {0};
+       __builtin_mma_disassemble_acc(vec_C, ACC);
+       for (int i = 0; i < 4; i++) {
+          CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
+          res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+          fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
+       }
+    }
+
+    template<typename VA, typename VB>
+    void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+        int64_t i, j;
+        TA *aoffset = NULL;
+        VA *vecOffset = NULL;
+        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
+        VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
+        VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
+        VB t1, t2, t3, t4, t5, t6, t7, t8;
+        vector unsigned char xor_vector;
+        uint8_t flip_vec = 0x80;
+        xor_vector = vec_splats(flip_vec);
+        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset5 = aoffset4 + lda;
+            aoffset6 = aoffset5 + lda;
+            aoffset7 = aoffset6 + lda;
+            aoffset8 = aoffset7 + lda;
+            aoffset += 8 * lda;
+
+            i = (cols >> 3);
+            if (i > 0) {
+               do {
+                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
+                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
+                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
+                    C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
+                    C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
+                    C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
+                    C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
+                    C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
+
+                    __builtin_vsx_disassemble_pair(c1, &C1);
+                    __builtin_vsx_disassemble_pair(c2, &C2);
+                    __builtin_vsx_disassemble_pair(c3, &C3);
+                    __builtin_vsx_disassemble_pair(c4, &C4);
+                    __builtin_vsx_disassemble_pair(c5, &C5);
+                    __builtin_vsx_disassemble_pair(c6, &C6);
+                    __builtin_vsx_disassemble_pair(c7, &C7);
+                    __builtin_vsx_disassemble_pair(c8, &C8);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+
+                    t1 = vec_perm(c5[0], c6[0], swiz1);
+                    t2 = vec_perm(c5[0], c6[0], swiz2);
+                    t3 = vec_perm(c7[0], c8[0], swiz1);
+                    t4 = vec_perm(c7[0], c8[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset+128);
+                    vec_xst(t6, 0, vecOffset+144);
+                    vec_xst(t7, 0, vecOffset+160);
+                    vec_xst(t8, 0, vecOffset+176);
+
+                    t1 = vec_perm(c5[1], c6[1], swiz1);
+                    t2 = vec_perm(c5[1], c6[1], swiz2);
+                    t3 = vec_perm(c7[1], c8[1], swiz1);
+                    t4 = vec_perm(c7[1], c8[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset+192);
+                    vec_xst(t6, 0, vecOffset+208);
+                    vec_xst(t7, 0, vecOffset+224);
+                    vec_xst(t8, 0, vecOffset+240);
+
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    aoffset4 += lda;
+                    aoffset5 += lda;
+                    aoffset6 += lda;
+                    aoffset7 += lda;
+                    aoffset8 += lda;
+                    vecOffset += 256;
+                    i--;
+               } while(i > 0);
+            }
+            j--;
+        } while(j > 0);
+    }
+
+    if (rows & 4) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+
+        i = (cols >> 3);
+            if (i > 0) {
+               do {
+                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
+                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
+                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
+                    C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
+
+                    __builtin_vsx_disassemble_pair(c1, &C1);
+                    __builtin_vsx_disassemble_pair(c2, &C2);
+                    __builtin_vsx_disassemble_pair(c3, &C3);
+                    __builtin_vsx_disassemble_pair(c4, &C4);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    aoffset4 += lda;
+                    vecOffset += 128;
+                    i--;
+               } while(i > 0);
+            }
+        }
+        if (rows & 3) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            i = (cols >> 3);
+        if (i > 0) {
+                do {
+                    switch(rows) {
+                        case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
+                                __builtin_vsx_disassemble_pair(c3, &C3);
+                        case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
+                                __builtin_vsx_disassemble_pair(c2, &C2);
+                        case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
+                                __builtin_vsx_disassemble_pair(c1, &C1);
+                                break;
+                    }
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    if (flip == true) {
+                       t5 = vec_xor(t5, xor_vector);
+                       t6 = vec_xor(t6, xor_vector);
+                       t7 = vec_xor(t7, xor_vector);
+                       t8 = vec_xor(t8, xor_vector);
+                    }
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    vecOffset += 128;
+                    i--;
+               } while(i > 0);
+            }
+        }
+    }
+
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        int m_rem = MIN(m - m0, 8);
+        int n_rem = MIN(n - n0, 8);
+        // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
+        // issues. After resolving them, below code will be enabled.
+        /*if (m_rem >= 16 && n_rem >= 8) {
+            mc = 16;
+            nc = 8;
+            gemm<16,8>(m0, m, n0, n);
+        } else if(m_rem >= 8 && n_rem >= 16) {
+            mc = 8;
+            nc = 16;
+            gemm<8,16>(m0, m, n0, n);
+        }*/
+        if (m_rem >= 8 && n_rem >= 8) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if (m_rem >= 4 && n_rem >= 8) {
+            mc = 4;
+            nc = 8;
+            gemm<4,8>(m0, m, n0, n);
+        } else if (m_rem >= 8 && n_rem >= 4) {
+            mc = 8;
+            nc = 4;
+            gemm<8,4>(m0, m, n0, n);
+        } else if (m_rem >= 4 && n_rem >= 4) {
+            mc = 4;
+            nc = 4;
+            gemm_small<4, 4>(m0, m, n0, n);
+        } else if ((m_rem < 4) && (n_rem > 4)) {
+            nc = 4;
+            switch(m_rem) {
+                case 1:
+                    mc = 1;
+                    gemm_small<1, 4>(m0, m, n0, n);
+                    break;
+                case 2:
+                    mc = 2;
+                    gemm_small<2, 4>(m0, m, n0, n);
+                    break;
+                case 3:
+                    mc = 3;
+                    gemm_small<3, 4>(m0, m, n0, n);
+                    break;
+                default:
+                    return;
+            }
+        } else if ((m_rem > 4) && (n_rem < 4)) {
+            mc = 4;
+            switch(n_rem) {
+                case 1:
+                    nc = 1;
+                    gemm_small<4, 1>(m0, m, n0, n);
+                    break;
+                case 2:
+                    nc = 2;
+                    gemm_small<4, 2>(m0, m, n0, n);
+                    break;
+                case 3:
+                    nc = 3;
+                    gemm_small<4, 3>(m0, m, n0, n);
+                    break;
+                default:
+                    return;
+            }
+        } else {
+            switch((m_rem << 4) | n_rem) {
+                case 0x43:
+                    mc = 4;
+                    nc = 3;
+                    gemm_small<4, 3>(m0, m, n0, n);
+                    break;
+                case 0x42:
+                    mc = 4;
+                    nc = 2;
+                    gemm_small<4, 2>(m0, m, n0, n);
+                    break;
+                case 0x41:
+                    mc = 4;
+                    nc = 1;
+                    gemm_small<4, 1>(m0, m, n0, n);
+                    break;
+                case 0x34:
+                    mc = 3;
+                    nc = 4;
+                    gemm_small<3, 4>(m0, m, n0, n);
+                    break;
+                case 0x33:
+                    mc = 3;
+                    nc = 3;
+                    gemm_small<3, 3>(m0, m, n0, n);
+                    break;
+                case 0x32:
+                    mc = 3;
+                    nc = 2;
+                    gemm_small<3, 2>(m0, m, n0, n);
+                    break;
+                case 0x31:
+                    mc = 3;
+                    nc = 1;
+                    gemm_small<3, 1>(m0, m, n0, n);
+                    break;
+                case 0x24:
+                    mc = 2;
+                    nc = 4;
+                    gemm_small<2, 4>(m0, m, n0, n);
+                    break;
+                case 0x23:
+                    mc = 2;
+                    nc = 3;
+                    gemm_small<2, 3>(m0, m, n0, n);
+                    break;
+                case 0x22:
+                    mc = 2;
+                    nc = 2;
+                    gemm_small<2, 2>(m0, m, n0, n);
+                    break;
+                case 0x21:
+                    mc = 2;
+                    nc = 1;
+                    gemm_small<2, 1>(m0, m, n0, n);
+                    break;
+                case 0x14:
+                    mc = 1;
+                    nc = 4;
+                    gemm_small<1, 4>(m0, m, n0, n);
+                    break;
+                case 0x13:
+                    mc = 1;
+                    nc = 3;
+                    gemm_small<1, 3>(m0, m, n0, n);
+                    break;
+                case 0x12:
+                    mc = 1;
+                    nc = 2;
+                    gemm_small<1, 2>(m0, m, n0, n);
+                    break;
+                case 0x11:
+                    mc = 1;
+                    nc = 1;
+                    gemm_small<1, 1>(m0, m, n0, n);
+                    break;
+                default:
+                    return;
+            }
+        }
+        mp = m0 + (m - m0) / mc * mc;
+        np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+    void KERNEL_4x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[8], vec_B[16] = {0};
+        acc_t acc_0, acc_1;
+        std::array<int, 4> comparray;
+        vector float fin_res[8] = {0};
+        vector float vs[8] = {0};
+        for (int l = 0; l < k; l++) {
+            __builtin_mma_xxsetaccz(&acc_0);
+            __builtin_mma_xxsetaccz(&acc_1);
+            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
+            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
+            for(int x = 0; x < 8; x++) {
+                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
+                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
+            }
+            for (int I = 0; I<4; I++) {
+                for (int J = 0; J<4; J++) {
+                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
+                    *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
+                }
+            }
+            auto aoffset = A+(ii*lda)+l;
+            for (int i = 0; i < 4; i++) {
+                comparray[i] = 0;
+                int ca = 0;
+                const int8_t *at = aoffset->qs;
+                for (int j = 0; j < 32; j++)
+                    ca += (int)*at++;
+                comparray[i] = ca;
+                aoffset += lda;
+            }
+            compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
+            compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
+        }
+        save_res<4, 4>(ii, jj, 0, fin_res);
+        save_res<4, 4>(ii, jj+4, 4, fin_res);
+    }
+
+    void KERNEL_8x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[16], vec_B[8] = {0};
+        acc_t acc_0, acc_1;
+        std::array<int, 8> comparray;
+        vector float fin_res[8] = {0};
+        vector float vs[8] = {0};
+        for (int l = 0; l < k; l++) {
+            __builtin_mma_xxsetaccz(&acc_0);
+            __builtin_mma_xxsetaccz(&acc_1);
+            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
+            for(int x = 0; x < 8; x++) {
+                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
+                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
+            }
+            for (int I = 0; I<8; I++) {
+                for (int J = 0; J<4; J++) {
+                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
+                }
+            }
+            auto aoffset = A+(ii*lda)+l;
+            for (int i = 0; i < 8; i++) {
+                comparray[i] = 0;
+                int ca = 0;
+                const int8_t *at = aoffset->qs;
+                for (int j = 0; j < 32; j++)
+                    ca += (int)*at++;
+                comparray[i] = ca;
+                aoffset += lda;
+            }
+            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
+            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
+        }
+        save_res<4, 4>(ii, jj, 0, fin_res);
+        save_res<4, 4>(ii+4, jj, 4, fin_res);
+    }
+
+    void KERNEL_8x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[16], vec_B[16] = {0};
+        acc_t acc_0, acc_1, acc_2, acc_3;
+        std::array<int, 8> comparray;
+        vector float fin_res[16] = {0};
+        vector float vs[16] = {0};
+        for (int l = 0; l < k; l++) {
+            __builtin_mma_xxsetaccz(&acc_0);
+            __builtin_mma_xxsetaccz(&acc_1);
+            __builtin_mma_xxsetaccz(&acc_2);
+            __builtin_mma_xxsetaccz(&acc_3);
+            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
+            for(int x = 0; x < 8; x++) {
+                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
+                __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
+                __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
+                __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
+            }
+            for (int I = 0; I<8; I++) {
+                for (int J = 0; J<4; J++) {
+                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
+                    *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
+                }
+            }
+            auto aoffset = A+(ii*lda)+l;
+            for (int i = 0; i < 8; i++) {
+                comparray[i] = 0;
+                int ca = 0;
+                const int8_t *at = aoffset->qs;
+                for (int j = 0; j < 32; j++)
+                    ca += (int)*at++;
+                comparray[i] = ca;
+                aoffset += lda;
+            }
+            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
+            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
+            compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
+            compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
+        }
+        save_res<4, 4>(ii, jj, 0, fin_res);
+        save_res<4, 4>(ii+4, jj, 4, fin_res);
+        save_res<4, 4>(ii, jj+4, 8, fin_res);
+        save_res<4, 4>(ii+4, jj+4, 12, fin_res);
+    }
+
+    template<int RM, int RN>
+    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        vec_t vec_A[8], vec_B[8] = {0};
+        vector signed int vec_C[4];
+        acc_t acc_0;
+
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            std::array<int, RM> comparray;
+            vector float res[4] = {0};
+            vector float fin_res[4] = {0};
+            vector float vs[4] = {0};
+            vector float CA[4] = {0};
+            __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
+            __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
+            for (int l = 0; l < k; l++) {
+                __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
+                __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
+                __builtin_mma_xxsetaccz(&acc_0);
+                packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
+                packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
+                for(int x = 0; x < 8; x+=4) {
+                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
+                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
+                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
+                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
+                }
+                for (int I = 0; I<RM; I++) {
+                    for (int J = 0; J<RN; J++) {
+                        *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
+                    }
+                }
+                __builtin_mma_disassemble_acc(vec_C, &acc_0);
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < RM; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    const int8_t *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
+
+                for (int i = 0; i < RM; i++) {
+                    CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
+                    res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
+                    fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
+                }
+            }
+            save_res<RM, RN>(ii, jj, 0, fin_res);
+        }
+    }
+
+    template<int RM, int RN>
+    inline void kernel(int64_t ii, int64_t jj) {
+       if constexpr(RM == 4 && RN == 8) {
+          KERNEL_4x8(ii,jj);
+       } else if constexpr(RM == 8 && RN == 4) {
+          KERNEL_8x4(ii,jj);
+       } else if constexpr(RM == 8 && RN == 8) {
+          KERNEL_8x8(ii,jj);
+       } else {
+          static_assert(false, "RN/RM values not supported");
+       }
+    }
+
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            kernel<RM, RN>(ii, jj);
+        }
+    }
+
+    const TA *const A;
+    const TB *const B;
+    TC *C;
+    TA *At;
+    TB *Bt;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+
+template <typename TA, typename TB, typename TC>
+class tinyBLAS_PPC {
+  public:
+    tinyBLAS_PPC(int64_t k,
+                const TA *A, int64_t lda,
+                const TB *B, int64_t ldb,
+                TC *C, int64_t ldc,
+                int ith, int nth)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+    }
+
+    void matmul(int64_t m, int64_t n) {
+       mnpack(0, m, 0, n);
+    }
+
+  private:
+
+    void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
+
+    template<typename VA>
+    void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
+        int64_t i, j;
+        TA *aoffset = NULL, *boffset = NULL;
+        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
+        VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+        VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+        VA t1, t2, t3, t4, t5, t6, t7, t8;
+        aoffset = const_cast<TA*>(a);
+        boffset = vec;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+                i = (cols >> 3);
+                if (i > 0) {
+                    do {
+                        C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
+                        C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
+                        C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
+                        C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
+                        C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5);
+                        C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6);
+                        C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7);
+                        C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8);
+                        __builtin_vsx_disassemble_pair(c1, &C1);
+                        __builtin_vsx_disassemble_pair(c2, &C2);
+                        __builtin_vsx_disassemble_pair(c3, &C3);
+                        __builtin_vsx_disassemble_pair(c4, &C4);
+                        __builtin_vsx_disassemble_pair(c5, &C5);
+                        __builtin_vsx_disassemble_pair(c6, &C6);
+                        __builtin_vsx_disassemble_pair(c7, &C7);
+                        __builtin_vsx_disassemble_pair(c8, &C8);
+
+                        t1 = vec_mergeh(c1[0], c2[0]);
+                        t2 = vec_mergeh(c3[0], c4[0]);
+                        t3 = vec_mergeh(c5[0], c6[0]);
+                        t4 = vec_mergeh(c7[0], c8[0]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset);
+                        vec_xst(t6, 0, boffset+4);
+                        vec_xst(t7, 0, boffset+8);
+                        vec_xst(t8, 0, boffset+12);
+
+                        t1 = vec_mergel(c1[0], c2[0]);
+                        t2 = vec_mergel(c3[0], c4[0]);
+                        t3 = vec_mergel(c5[0], c6[0]);
+                        t4 = vec_mergel(c7[0], c8[0]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset+16);
+                        vec_xst(t6, 0, boffset+20);
+                        vec_xst(t7, 0, boffset+24);
+                        vec_xst(t8, 0, boffset+28);
+
+                        t1 = vec_mergeh(c1[1], c2[1]);
+                        t2 = vec_mergeh(c3[1], c4[1]);
+                        t3 = vec_mergeh(c5[1], c6[1]);
+                        t4 = vec_mergeh(c7[1], c8[1]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset+32);
+                        vec_xst(t6, 0, boffset+36);
+                        vec_xst(t7, 0, boffset+40);
+                        vec_xst(t8, 0, boffset+44);
+
+                        t1 = vec_mergel(c1[1], c2[1]);
+                        t2 = vec_mergel(c3[1], c4[1]);
+                        t3 = vec_mergel(c5[1], c6[1]);
+                        t4 = vec_mergel(c7[1], c8[1]);
+                        t5 = vec_xxpermdi(t1, t2, 0);
+                        t6 = vec_xxpermdi(t3, t4, 0);
+                        t7 = vec_xxpermdi(t1, t2, 3);
+                        t8 = vec_xxpermdi(t3, t4, 3);
+                        vec_xst(t5, 0, boffset+48);
+                        vec_xst(t6, 0, boffset+52);
+                        vec_xst(t7, 0, boffset+56);
+                        vec_xst(t8, 0, boffset+60);
+
+                        aoffset1 += 8*lda;
+                        aoffset2 += 8*lda;
+                        aoffset3 += 8*lda;
+                        aoffset4 += 8*lda;
+                        boffset += 64;
+                        i--;
+                    } while(i > 0);
+                }
+                if (cols & 4) {
+                    c1[0] = vec_xl(0, aoffset1);
+                    c2[0] = vec_xl(0, aoffset2);
+                    c3[0] = vec_xl(0, aoffset3);
+                    c4[0] = vec_xl(0, aoffset4);
+                    c5[0] = vec_xl(0, aoffset5);
+                    c6[0] = vec_xl(0, aoffset6);
+                    c7[0] = vec_xl(0, aoffset7);
+                    c8[0] = vec_xl(0, aoffset8);
+
+                    t1 = vec_mergeh(c1[0], c2[0]);
+                    t2 = vec_mergeh(c3[0], c4[0]);
+                    t3 = vec_mergeh(c5[0], c6[0]);
+                    t4 = vec_mergeh(c7[0], c8[0]);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t3, t4, 0);
+                    t7 = vec_xxpermdi(t1, t2, 3);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset);
+                    vec_xst(t6, 0, boffset+4);
+                    vec_xst(t7, 0, boffset+8);
+                    vec_xst(t8, 0, boffset+12);
+
+                    t1 = vec_mergel(c1[0], c2[0]);
+                    t2 = vec_mergel(c3[0], c4[0]);
+                    t3 = vec_mergel(c5[0], c6[0]);
+                    t4 = vec_mergel(c7[0], c8[0]);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t3, t4, 0);
+                    t7 = vec_xxpermdi(t1, t2, 3);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset+16);
+                    vec_xst(t6, 0, boffset+20);
+                    vec_xst(t7, 0, boffset+24);
+                    vec_xst(t8, 0, boffset+28);
+                }
+            j--;
+            } while(j > 0);
+        }
+
+        if (rows & 4) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+            i = (cols >> 3);
+            if (i > 0) {
+                do {
+                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
+                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
+                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
+                    C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
+                    __builtin_vsx_disassemble_pair(c1, &C1);
+                    __builtin_vsx_disassemble_pair(c2, &C2);
+                    __builtin_vsx_disassemble_pair(c3, &C3);
+                    __builtin_vsx_disassemble_pair(c4, &C4);
+
+                    t1 = vec_mergeh(c1[0], c2[0]);
+                    t2 = vec_mergeh(c3[0], c4[0]);
+                    t3 = vec_mergel(c1[0], c2[0]);
+                    t4 = vec_mergel(c3[0], c4[0]);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t1, t2, 3);
+                    t7 = vec_xxpermdi(t3, t4, 0);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset);
+                    vec_xst(t6, 0, boffset+4);
+                    vec_xst(t7, 0, boffset+8);
+                    vec_xst(t8, 0, boffset+12);
+
+                    t1 = vec_mergeh(c1[1], c2[1]);
+                    t2 = vec_mergeh(c3[1], c4[1]);
+                    t3 = vec_mergel(c1[1], c2[1]);
+                    t4 = vec_mergel(c3[1], c4[1]);
+                    t5 = vec_xxpermdi(t1, t2, 0);
+                    t6 = vec_xxpermdi(t1, t2, 3);
+                    t7 = vec_xxpermdi(t3, t4, 0);
+                    t8 = vec_xxpermdi(t3, t4, 3);
+                    vec_xst(t5, 0, boffset+16);
+                    vec_xst(t6, 0, boffset+20);
+                    vec_xst(t7, 0, boffset+24);
+                    vec_xst(t8, 0, boffset+28);
+
+                    aoffset1 += 8*lda;
+                    aoffset2 += 8*lda;
+                    aoffset3 += 8*lda;
+                    aoffset4 += 8*lda;
+                    boffset += 32;
+                    i--;
+                } while(i > 0);
+            }
+
+            if (cols & 4) {
+                c1[0] = vec_xl(0, aoffset1);
+                c2[0] = vec_xl(0, aoffset2);
+                c3[0] = vec_xl(0, aoffset3);
+                c4[0] = vec_xl(0, aoffset4);
+
+                t1 = vec_mergeh(c1[0], c2[0]);
+                t2 = vec_mergeh(c3[0], c4[0]);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset);
+                vec_xst(t4, 0, boffset+4);
+
+                t1 = vec_mergel(c1[0], c2[0]);
+                t2 = vec_mergel(c3[0], c4[0]);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset+8);
+                vec_xst(t4, 0, boffset+12);
+            }
+        }
+        if (rows & 3) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            if (cols & 4) {
+                c1[0] = vec_xl(0, aoffset1);
+                c2[0] = vec_xl(0, aoffset2);
+                c3[0] = vec_xl(0, aoffset3);
+
+                t1 = vec_mergeh(c1[0], c2[0]);
+                t2 = vec_mergeh(c3[0], c4[0]);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset);
+                vec_xst(t4, 0, boffset+4);
+
+                t1 = vec_mergel(c1[0], c2[0]);
+                t2 = vec_mergel(c3[0], c4[0]);
+                t3 = vec_xxpermdi(t1, t2, 0);
+                t4 = vec_xxpermdi(t1, t2, 3);
+                vec_xst(t3, 0, boffset+8);
+                vec_xst(t4, 0, boffset+12);
+            }
+        }
+    }
+    void KERNEL_4x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[4], vec_C[4];
+        acc_t acc_0;
+        __builtin_mma_xxsetaccz(&acc_0);
+        for (int l = 0; l < k; l+=4) {
+            packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+    }
+
+    void KERNEL_4x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[4], vec_B[8], vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int64_t l = 0; l < k; l+=4) {
+            packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[1], (vec_t)vec_B[3]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], (vec_t)vec_B[4]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[2], (vec_t)vec_B[5]);
+            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]);
+            __builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+    }
+
+    void KERNEL_8x4(int64_t ii, int64_t jj) {
+        vec_t vec_A[8], vec_B[4], vec_C[4];
+        acc_t acc_0, acc_1;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        for (int64_t l = 0; l < k; l+=4) {
+            packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
+            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[3], vec_B[1]);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[4], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[5], vec_B[2]);
+            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]);
+            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]);
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii+4, jj);
+    }
+
+    void KERNEL_8x8(int64_t ii, int64_t jj) {
+        vec_t vec_A[16], vec_B[16], vec_C[4];
+        acc_t acc_0, acc_1, acc_2, acc_3;
+        __builtin_mma_xxsetaccz(&acc_0);
+        __builtin_mma_xxsetaccz(&acc_1);
+        __builtin_mma_xxsetaccz(&acc_2);
+        __builtin_mma_xxsetaccz(&acc_3);
+        for (int l = 0; l < k; l+=8) {
+            packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
+            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
+            for(int x = 0; x < 16; x+=2) {
+                __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
+                __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
+                __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
+            }
+        }
+        SAVE_ACC(&acc_0, ii, jj);
+        SAVE_ACC(&acc_1, ii, jj+4);
+        SAVE_ACC(&acc_2, ii+4, jj);
+        SAVE_ACC(&acc_3, ii+4, jj+4);
+    }
+
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        int m_rem = MIN(m - m0, 16);
+        int n_rem = MIN(n - n0, 16);
+        if (m_rem >= 16 && n_rem >= 8) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if(m_rem >= 8 && n_rem >= 16) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if (m_rem >= 8 && n_rem >= 8) {
+            mc = 8;
+            nc = 8;
+            gemm<8,8>(m0, m, n0, n);
+        } else if (m_rem >= 4 && n_rem >= 8) {
+            mc = 4;
+            nc = 8;
+            gemm<4,8>(m0, m, n0, n);
+        } else if (m_rem >= 8 && n_rem >= 4) {
+            mc = 8;
+            nc = 4;
+            gemm<8,4>(m0, m, n0, n);
+        } else if (m_rem >= 4 && n_rem >= 4) {
+            mc = 4;
+            nc = 4;
+            gemm<4,4>(m0, m, n0, n);
+        } else if ((m_rem < 4) && (n_rem > 4)) {
+            nc = 4;
+            switch(m_rem) {
+                case 1:
+                    mc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 2:
+                    mc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 3:
+                    mc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                default:
+                    return;
+            }
+        } else if ((m_rem > 4) && (n_rem < 4)) {
+            mc = 4;
+            switch(n_rem) {
+                case 1:
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 2:
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 3:
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                default:
+                    return;
+            }
+        } else {
+            switch((m_rem << 4) | n_rem) {
+                case 0x43:
+                    mc = 4;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x42:
+                    mc = 4;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x41:
+                    mc = 4;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x34:
+                    mc = 3;
+                    nc = 4;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x33:
+                    mc = 3;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x32:
+                    mc = 3;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x31:
+                    mc = 3;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x24:
+                    mc = 2;
+                    nc = 4;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x23:
+                    mc = 2;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x22:
+                    mc = 2;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x21:
+                    mc = 2;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x14:
+                    mc = 1;
+                    nc = 4;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x13:
+                    mc = 1;
+                    nc = 3;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x12:
+                    mc = 1;
+                    nc = 2;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                case 0x11:
+                    mc = 1;
+                    nc = 1;
+                    gemm_small(m0, m, n0, n, mc, nc);
+                    break;
+                default:
+                    return;
+            }
+        }
+        mp = m0 + (m - m0) / mc * mc;
+        np = n0 + (n - n0) / nc * nc;
+        mnpack(mp, m, n0, np);
+        mnpack(m0, m, np, n);
+    }
+
+     void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            vec_t vec_C[4];
+            acc_t acc_0;
+            __builtin_mma_xxsetaccz(&acc_0);
+            vec_t vec_A[4], vec_B[4];
+            for (int l=0; l<k; l+=4) {
+                if (RN >= 4 && RM == 1) {
+                    TA* a = const_cast<TA*>(A+(ii)*lda+l);
+                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
+                    vec_A[0] = (vec_t)vec_xl(0,a);
+                    vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
+                    vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
+                    vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
+                } else {
+                    packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
+                }
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
+                __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
+            }
+            __builtin_mma_disassemble_acc(vec_C, &acc_0);
+            for (int I = 0; I < RM; I++) {
+                for (int J = 0; J < RN; J++) {
+                    *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
+                }
+            }
+       }
+    }
+
+    template <int RM, int RN>
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
+        if (RM == 4 && RN == 4) {
+            kernel = &tinyBLAS_PPC::KERNEL_4x4;
+        } else if (RM == 4 && RN == 8) {
+            kernel = &tinyBLAS_PPC::KERNEL_4x8;
+        } else if (RM == 8 && RN == 4) {
+            kernel = &tinyBLAS_PPC::KERNEL_8x4;
+        } else if (RM == 8 && RN == 8) {
+            kernel = &tinyBLAS_PPC::KERNEL_8x8;
+        }
+        if (end > tiles)
+            end = tiles;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
+            (this->*kernel)(ii, jj);
+        }
+    }
+
+    const TA *const A;
+    const TB *const B;
+    TC *C;
+    TA *At;
+    TB *Bt;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
+#endif
+} // namespace
+
+/**
+ * Performs optimized matrix multiplication on CPU.
+ *
+ * This subroutine may compute C = Aᵀ * B with column major ordering.
+ * Despite its name, this isn't a generalized implementation. Work is
+ * only performed when a handwritten kernel is written and available.
+ * Otherwise the caller should fall back to a general matmul routine.
+ *
+ * For example, for single-threaded single-precision GEMM you can say
+ *
+ *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
+ *                     0, 1,
+ *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
+ *
+ * @param m is rows in `A` and `C`
+ * @param n is cols in `B` and `C`
+ * @param k is cols in `A` and rows in `B`
+ * @param A is first input matrix (always transposed)
+ * @param lda is row stride of `A`
+ * @param B is second input matrix (never transposed)
+ * @param ldb is row stride of `B`
+ * @param C is input/output array of output matrices
+ * @param ldc is row stride of `C`
+ * @param ith is thread id (must be less than `nth`)
+ * @param nth is number of threads (must be greater than zero)
+ * @param Atype is GGML data type of `A`
+ * @param Btype is GGML data type of `B`
+ * @param Ctype is GGML data type of `C`
+ * @return true if this function was able to service the matmul request
+ */
+bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                     const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
+                     int64_t ldc, int Atype, int Btype, int Ctype) {
+
+    assert(m >= 0);
+    assert(n >= 0);
+    assert(k >= 0);
+    assert(lda >= k);
+    assert(ldb >= k);
+    assert(ldc >= m);
+    assert(params->nth > 0);
+    assert(params->ith < params->nth);
+
+    // only enable sgemm for prompt processing
+    if (n < 2)
+        return false;
+
+    if (Ctype != GGML_TYPE_F32)
+        return false;
+
+    switch (Atype) {
+
+    case GGML_TYPE_F32: {
+        if (Btype != GGML_TYPE_F32)
+            return false;
+#if defined(__AVX512F__)
+        tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
+#elif defined(__AVX__) || defined(__AVX2__)
+        tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
+#elif defined(__ARM_NEON)
+        if (n < 4)
+            return false;
+        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
+#elif defined(__MMA__)
+        if (k % 8)
+            return false;
+        tinyBLAS_PPC<float, float, float> tb{
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
+    case GGML_TYPE_BF16: {
+#if defined(__AVX512BF16__)
+        if (Btype == GGML_TYPE_BF16) {
+            tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif defined(__AVX512F__)
+        if (Btype == GGML_TYPE_BF16) {
+            tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif defined(__AVX2__)
+        if (Btype == GGML_TYPE_BF16) {
+            tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
+                (const ggml_bf16_t *)A, lda,
+                (const ggml_bf16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#endif
+        return false;
+    }
+    case GGML_TYPE_F16: {
+#if defined(__AVX512F__)
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
+                (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
+                (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
+        if (n < 8)
+            return false;
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+                k, (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
+        if (Btype == GGML_TYPE_F32) {
+            tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
+                k, (const ggml_fp16_t *)A, lda,
+                (const float *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
+#endif
+        return false;
+    }
+
+    case GGML_TYPE_Q8_0: {
+        if (Btype != GGML_TYPE_Q8_0)
+           return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
+            k, (const block_q8_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+#elif defined(__ARM_FEATURE_DOTPROD)
+        tinyBLAS_Q0_ARM<block_q8_0> tb{
+            k, (const block_q8_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+
+#elif defined(__MMA__)
+        if (n < 8 && n != 4)
+           return false;
+        if (m < 8 && m != 4)
+           return false;
+        tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
+            k, (const block_q8_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+
+#else
+        return false;
+#endif
+    }
+
+    case GGML_TYPE_Q4_0: {
+        if (Btype != GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
+            k, (const block_q4_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+#elif defined(__ARM_FEATURE_DOTPROD)
+        tinyBLAS_Q0_ARM<block_q4_0> tb{
+            k, (const block_q4_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
+    case GGML_TYPE_Q5_0: {
+        if (Btype != GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
+            k, (const block_q5_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
+    case GGML_TYPE_IQ4_NL: {
+        if (Btype != GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+            k, (const block_iq4_nl *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
+    default:
+        return false;
+    }
+
+    (void)params;
+    (void)m;
+    (void)n;
+    (void)k;
+    (void)A;
+    (void)lda;
+    (void)B;
+    (void)ldb;
+    (void)C;
+    (void)ldc;
+    (void)Atype;
+    (void)Btype;
+    (void)Ctype;
+}
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
new file mode 100644
index 000000000..3d2909515
--- /dev/null
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <stdint.h>
+#include <stdbool.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
+                     const void *, int64_t, const void *, int64_t, void *, int64_t,
+                     int, int, int);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
new file mode 100644
index 000000000..119fd39b8
--- /dev/null
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -0,0 +1,152 @@
+cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
+
+find_package(CUDAToolkit)
+
+if (CUDAToolkit_FOUND)
+    message(STATUS "CUDA Toolkit found")
+
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # native == GPUs available at build time
+        # 52     == Maxwell, lowest CUDA 12 standard
+        # 60     == P100, FP16 CUDA intrinsics
+        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
+        # 70     == V100, FP16 tensor cores
+        # 75     == Turing, int8 tensor cores
+        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
+            set(CMAKE_CUDA_ARCHITECTURES "native")
+        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
+    enable_language(CUDA)
+
+    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
+    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
+
+    file(GLOB   GGML_SOURCES_CUDA "*.cu")
+    file(GLOB   SRCS "template-instances/fattn-mma*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB   SRCS "template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
+    if (GGML_CUDA_FA_ALL_QUANTS)
+        file(GLOB   SRCS "template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    endif()
+
+    ggml_add_backend_library(ggml-cuda
+                             ${GGML_HEADERS_CUDA}
+                             ${GGML_SOURCES_CUDA}
+                            )
+
+    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+
+    if (GGML_CUDA_GRAPHS)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    if (GGML_CUDA_FORCE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
+
+    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
+    endif()
+
+    if (GGML_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
+    if (GGML_STATIC)
+        if (WIN32)
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+        else ()
+            target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        endif()
+    else()
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
+    endif()
+
+    set(CUDA_CXX_FLAGS "")
+
+    set(CUDA_FLAGS -use_fast_math)
+
+    if (GGML_FATAL_WARNINGS)
+        list(APPEND CUDA_FLAGS -Werror all-warnings)
+    endif()
+
+    if (GGML_ALL_WARNINGS AND NOT MSVC)
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+        endif()
+
+        execute_process(
+            COMMAND ${NVCC_CMD} -Xcompiler --version
+            OUTPUT_VARIABLE CUDA_CCFULLVER
+            ERROR_QUIET
+        )
+
+        if (NOT CUDA_CCFULLVER MATCHES clang)
+            set(CUDA_CCID "GNU")
+            execute_process(
+                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+                OUTPUT_VARIABLE CUDA_CCVER
+                ERROR_QUIET
+            )
+        else()
+            if (CUDA_CCFULLVER MATCHES Apple)
+                set(CUDA_CCID "AppleClang")
+            else()
+                set(CUDA_CCID "Clang")
+            endif()
+            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
+        endif()
+
+        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+
+        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
+        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
+    endif()
+
+    if (NOT MSVC)
+        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+    endif()
+
+    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
+
+    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+    endif()
+
+    target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+else()
+    message(FATAL_ERROR "CUDA Toolkit not found")
+endif()
diff --git a/ggml/src/ggml-cuda/acc.cu b/ggml/src/ggml-cuda/acc.cu
new file mode 100644
index 000000000..96bfe1c9d
--- /dev/null
+++ b/ggml/src/ggml-cuda/acc.cu
@@ -0,0 +1,47 @@
+#include "acc.cuh"
+
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+}
diff --git a/ggml/src/ggml-cuda/acc.cuh b/ggml/src/ggml-cuda/acc.cuh
new file mode 100644
index 000000000..1168ea1b2
--- /dev/null
+++ b/ggml/src/ggml-cuda/acc.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_ACC_BLOCK_SIZE 256
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/arange.cu b/ggml/src/ggml-cuda/arange.cu
new file mode 100644
index 000000000..b5e495a24
--- /dev/null
+++ b/ggml/src/ggml-cuda/arange.cu
@@ -0,0 +1,34 @@
+#include "arange.cuh"
+
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
+}
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+}
diff --git a/ggml/src/ggml-cuda/arange.cuh b/ggml/src/ggml-cuda/arange.cuh
new file mode 100644
index 000000000..41e74fdfc
--- /dev/null
+++ b/ggml/src/ggml-cuda/arange.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_ARANGE_BLOCK_SIZE 256
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu
new file mode 100644
index 000000000..5340eedc0
--- /dev/null
+++ b/ggml/src/ggml-cuda/argmax.cu
@@ -0,0 +1,91 @@
+#include <algorithm>
+#include <cstdint>
+
+#include "argmax.cuh"
+#include "common.cuh"
+#include "sum.cuh"
+
+static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
+    const int64_t row = blockIdx.x;
+
+    float maxval = -FLT_MAX;
+    int   argmax = -1;
+    const float * rowx = x + row * ncols;
+
+    for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
+        const float val = rowx[col];
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
+    }
+
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
+    }
+
+    const int n_warps = blockDim.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    if (n_warps > 1) {
+        constexpr int    max_warps = 1024 / WARP_SIZE;
+        __shared__ float shared_maxval[max_warps];
+        __shared__ int   shared_argmax[max_warps];
+        if (lane_id == 0) {
+            shared_maxval[warp_id] = maxval;
+            shared_argmax[warp_id] = argmax;
+        }
+
+        __syncthreads();
+
+        if (warp_id == 0) {
+            if (lane_id < n_warps) {
+                maxval = shared_maxval[lane_id];
+                argmax = shared_argmax[lane_id];
+            }
+#pragma unroll
+            for (int offset = 16; offset > 0; offset >>= 1) {
+                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+                if (val > maxval) {
+                    maxval = val;
+                    argmax = col;
+                }
+            }
+        }
+    }
+
+    if (warp_id == 0 && lane_id == 0) {
+        dst[row] = argmax;
+    }
+}
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    int32_t     * dst_d  = (int32_t     *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t num_blocks = nrows;
+    const int64_t num_threads = std::min<int64_t>(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE);
+    const dim3 blocks_dim(num_threads, 1, 1);
+    const dim3 blocks_num(num_blocks, 1, 1);
+
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00);
+}
diff --git a/ggml/src/ggml-cuda/argmax.cuh b/ggml/src/ggml-cuda/argmax.cuh
new file mode 100644
index 000000000..5b7223adc
--- /dev/null
+++ b/ggml/src/ggml-cuda/argmax.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
new file mode 100644
index 000000000..607ded855
--- /dev/null
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -0,0 +1,104 @@
+#include "argsort.cuh"
+
+template<typename T>
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    extern __shared__ int dst_row[];
+
+    // initialize indices
+    dst_row[col] = col;
+
+    __syncthreads();
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const dim3 block_dims(ncols_pad, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+}
diff --git a/ggml/src/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh
new file mode 100644
index 000000000..68a001547
--- /dev/null
+++ b/ggml/src/ggml-cuda/argsort.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
new file mode 100644
index 000000000..ce4b9cfb5
--- /dev/null
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -0,0 +1,361 @@
+#include "binbcast.cuh"
+#include <cstdint>
+
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __device__ __forceinline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13) {
+
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+template <typename T>
+static __global__ void k_repeat_back(
+    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
+
+    const int64_t tid0  = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
+    const int64_t tid1  = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
+    const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
+    const int64_t tid2  = tid23 % ne2;
+    const int64_t tid3  = tid23 / ne2;
+
+    if (tid0 >= ne0) {
+        return;
+    }
+
+    T sum = 0;
+    for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
+        for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
+            for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
+                for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
+                    sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
+                }
+            }
+        }
+    }
+    dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
+}
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne[] = {ne0, ne1, ne2, ne3};
+        int64_t cne0[] = {ne00, ne01, ne02, ne03};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+
+        size_t cnb[] = {nb0, nb1, nb2, nb3};
+        size_t cnb0[] = {nb00, nb01, nb02, nb03};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+            for (int i = 0; i < 4; i++) {
+                if (nr[i] != 1) {
+                    break;
+                }
+                if (i > 0) {
+                    collapse_nb(cnb, cne);
+                    collapse_nb(cnb0, cne0);
+                    collapse_nb(cnb1, cne1);
+                    collapse(cne);
+                    collapse(cne0);
+                    collapse(cne1);
+                }
+            }
+        }
+
+        {
+            int64_t ne0 = cne[0];
+            int64_t ne1 = cne[1];
+            int64_t ne2 = cne[2];
+            int64_t ne3 = cne[3];
+
+            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
+            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
+            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
+            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb[0];
+            size_t nb1 = cnb[1];
+            size_t nb2 = cnb[2];
+            size_t nb3 = cnb[3];
+
+            size_t nb00 = cnb0[0];
+            size_t nb01 = cnb0[1];
+            size_t nb02 = cnb0[2];
+            size_t nb03 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            size_t s00 = nb00 / sizeof(src0_t);
+            size_t s01 = nb01 / sizeof(src0_t);
+            size_t s02 = nb02 / sizeof(src0_t);
+            size_t s03 = nb03 / sizeof(src0_t);
+
+            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s00 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
+
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
+
+template <typename T>
+static void repeat_back_cuda(
+    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
+    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
+        (src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
+}
+
+template<class op>
+static void ggml_cuda_op_bin_bcast(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_can_repeat(dst, src0));
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    GGML_ASSERT(ne2*ne3 <= (1 << 15));
+
+    const size_t ts = ggml_type_size(src0->type);
+    const size_t s00 = nb00 / ts;
+    const size_t s01 = nb01 / ts;
+    const size_t s02 = nb02 / ts;
+    const size_t s03 = nb03 / ts;
+
+    switch (dst->type) {
+        case GGML_TYPE_F32: {
+            const float * src0_d = (const float *) src0->data;
+            float       * dst_d  = (float       *) dst->data;
+            repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
+        } break;
+        default: {
+            GGML_ASSERT(false);
+        } break;
+    }
+}
diff --git a/ggml/src/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh
new file mode 100644
index 000000000..3ac1c9b03
--- /dev/null
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@@ -0,0 +1,9 @@
+#include "common.cuh"
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/clamp.cu b/ggml/src/ggml-cuda/clamp.cu
new file mode 100644
index 000000000..8009a3e3d
--- /dev/null
+++ b/ggml/src/ggml-cuda/clamp.cu
@@ -0,0 +1,34 @@
+#include "clamp.cuh"
+
+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
+
+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
+}
diff --git a/ggml/src/ggml-cuda/clamp.cuh b/ggml/src/ggml-cuda/clamp.cuh
new file mode 100644
index 000000000..7f9559dd1
--- /dev/null
+++ b/ggml/src/ggml-cuda/clamp.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CLAMP_BLOCK_SIZE 256
+
+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
new file mode 100644
index 000000000..174916bc9
--- /dev/null
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -0,0 +1,714 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cuda.h"
+
+#include <cstdint>
+#include <memory>
+
+#if defined(GGML_USE_HIP)
+#define GGML_COMMON_DECL_HIP
+#define GGML_COMMON_IMPL_HIP
+#else
+#define GGML_COMMON_DECL_CUDA
+#define GGML_COMMON_IMPL_CUDA
+#if defined(GGML_USE_MUSA)
+#define GGML_COMMON_DECL_MUSA
+#define GGML_COMMON_IMPL_MUSA
+#endif
+#endif
+#include "ggml-common.h"
+
+#include <cstdio>
+#include <array>
+#include <cassert>
+#include <cfloat>
+#include <string>
+#include <vector>
+
+#if defined(GGML_USE_HIP)
+#include "vendors/hip.h"
+#elif defined(GGML_USE_MUSA)
+#include "vendors/musa.h"
+#else
+#include "vendors/cuda.h"
+#endif // defined(GGML_USE_HIP)
+
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
+#define WARP_SIZE 32
+#define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
+#define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
+
+#define GGML_CUDA_CC_PASCAL     600
+#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA      700
+#define GGML_CUDA_CC_TURING     750
+#define GGML_CUDA_CC_AMPERE     800
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
+
+// GCN/CNDA, wave size is 64
+#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300
+
+// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+
+#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
+#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
+#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+
+#define GGML_CUDA_CC_QY1        210
+#define GGML_CUDA_CC_QY2        220
+
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define GGML_CUDA_MAX_STREAMS 8
+
+[[noreturn]]
+void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+
+#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
+     do {                                                                           \
+        auto err_ = (err);                                                          \
+        if (err_ != (success)) {                                                    \
+            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
+        }                                                                           \
+    } while (0)
+
+#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
+
+#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
+    static const char * cublas_get_error_str(const cublasStatus_t err) {
+        return cublasGetStatusString(err);
+    }
+#else
+    static const char * cublas_get_error_str(const cublasStatus_t err) {
+        switch (err) {
+            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+            default: return "unknown error";
+        }
+    }
+#endif // CUDART_VERSION >= 12000
+
+#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
+
+#if !defined(GGML_USE_HIP)
+static const char * cu_get_error_str(CUresult err) {
+    const char * err_str;
+    cuGetErrorString(err, &err_str);
+    return err_str;
+}
+#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
+#endif
+
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif // GGML_CUDA_F16
+
+#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+#define GGML_USE_VMM
+#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#define FP16_AVAILABLE
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+
+#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+#define FAST_FP16_AVAILABLE
+#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#define FP16_MMA_AVAILABLE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#define NEW_MMA_AVAILABLE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#define FLASH_ATTN_AVAILABLE
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+
+static constexpr bool fast_fp16_available(const int cc) {
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+}
+
+// Any FP16 tensor cores are available.
+static constexpr bool fp16_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
+}
+
+// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
+static constexpr bool new_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
+}
+
+static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    return __AMDGCN_WAVEFRONT_SIZE;
+#else
+    return 32;
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+}
+
+[[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    GGML_UNUSED(arch_list);
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    __trap();
+
+    GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ int warp_reduce_sum(int x) {
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+    return __reduce_add_sync(0xffffffff, x);
+#else
+#pragma unroll
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+    }
+    return x;
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+}
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
+    }
+    return x;
+}
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
+    }
+    return a;
+}
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#ifdef FP16_AVAILABLE
+#pragma unroll
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
+    }
+    return a;
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+    }
+    return x;
+}
+
+static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
+#ifdef FP16_AVAILABLE
+
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    return __float2half(fmaxf(__half2float(a), __half2float(b)));
+#else
+    return __hmax(a, b);
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+
+#else
+   NO_DEVICE_CODE;
+   GGML_UNUSED(b);
+   return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
+    return __hmax2(a, b);
+#elif !defined(GGML_USE_HIP)
+    half2 ret;
+    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
+    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
+    return ret;
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif
+}
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#pragma unroll
+   for (int offset = width/2; offset > 0; offset >>= 1) {
+       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
+   }
+   return x;
+#else
+   GGML_UNUSED(x);
+   NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+}
+
+#if CUDART_VERSION < CUDART_HMASK
+static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
+    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
+    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
+    return mask_low | mask_high;
+}
+#endif // CUDART_VERSION < CUDART_HMASK
+
+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+}
+
+// TODO: move to ggml-common.h
+static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+
+static __device__ __forceinline__ float get_alibi_slope(
+    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
+) {
+    if (max_bias <= 0.0f) {
+        return 1.0f;
+    }
+    const float base = h < n_head_log2 ? m0 : m1;
+    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+    return powf(base, exph);
+}
+
+template <ggml_type type>
+struct ggml_cuda_type_traits;
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_F16> {
+    static constexpr int qk = 1;
+    static constexpr int qr = 1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
+    static constexpr int qk = QK4_0;
+    static constexpr int qr = QR4_0;
+    static constexpr int qi = QI4_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
+    static constexpr int qk = QK4_1;
+    static constexpr int qr = QR4_1;
+    static constexpr int qi = QI4_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
+    static constexpr int qk = QK5_0;
+    static constexpr int qr = QR5_0;
+    static constexpr int qi = QI5_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
+    static constexpr int qk = QK5_1;
+    static constexpr int qr = QR5_1;
+    static constexpr int qi = QI5_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
+    static constexpr int qk = QK8_0;
+    static constexpr int qr = QR8_0;
+    static constexpr int qi = QI8_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_K;
+    static constexpr int qi = QI2_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_K;
+    static constexpr int qi = QI3_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_K;
+    static constexpr int qi = QI4_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR5_K;
+    static constexpr int qi = QI5_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR6_K;
+    static constexpr int qi = QI6_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_XXS;
+    static constexpr int qi = QI2_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_XS;
+    static constexpr int qi = QI2_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_S;
+    static constexpr int qi = QI2_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_XXS;
+    static constexpr int qi = QI3_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR1_S;
+    static constexpr int qi = QI1_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR1_M;
+    static constexpr int qi = QI1_M;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
+    static constexpr int qk = QK4_NL;
+    static constexpr int qr = QR4_NL;
+    static constexpr int qi = QI4_NL;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_XS;
+    static constexpr int qi = QI4_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_S;
+    static constexpr int qi = QI3_S;
+};
+
+//////////////////////
+
+struct ggml_cuda_device_info {
+    int device_count;
+
+    struct cuda_device_info {
+        int     cc;                 // compute capability
+        int     nsm;                // number of streaming multiprocessors
+        size_t  smpb;               // max. shared memory per block
+        size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    vmm;                // virtual memory support
+        size_t  vmm_granularity;    // granularity of virtual memory
+        size_t  total_vram;
+        int     warp_size;          // Number of threads in a dispatch
+    };
+
+    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
+
+    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_cuda_device_info & ggml_cuda_info();
+
+void ggml_cuda_set_device(int device);
+int ggml_cuda_get_device();
+
+struct ggml_cuda_pool {
+    virtual ~ggml_cuda_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
+};
+
+template<typename T>
+struct ggml_cuda_pool_alloc {
+    ggml_cuda_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    ggml_cuda_pool_alloc() = default;
+
+    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
+    }
+
+    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_cuda_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_cuda_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
+    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
+    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
+    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
+};
+
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+
+#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
+#define USE_CUDA_GRAPH
+#endif
+
+struct ggml_graph_node_properties {
+    void * node_address;
+    ggml_op node_op;
+    int64_t ne[GGML_MAX_DIMS];
+    size_t nb[GGML_MAX_DIMS];
+    void * src_address[GGML_MAX_SRC];
+    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+};
+
+struct ggml_cuda_graph {
+#ifdef USE_CUDA_GRAPH
+    ~ggml_cuda_graph() {
+        if (instance != nullptr) {
+            CUDA_CHECK(cudaGraphExecDestroy(instance));
+        }
+        if (graph != nullptr) {
+            CUDA_CHECK(cudaGraphDestroy(graph));
+        }
+    }
+    cudaGraph_t graph = nullptr;
+    cudaGraphExec_t instance = nullptr;
+    size_t num_nodes = 0;
+    std::vector<cudaGraphNode_t> nodes;
+    std::vector<cudaKernelNodeParams> params;
+    bool disable_due_to_gpu_arch = false;
+    bool disable_due_to_too_many_updates = false;
+    bool disable_due_to_failed_graph_capture = false;
+    int number_consecutive_updates = 0;
+    std::vector<ggml_graph_node_properties> ggml_graph_properties;
+    std::vector<char **> updated_kernel_arg;
+#endif
+};
+
+struct ggml_backend_cuda_context {
+    int device;
+    std::string name;
+    cudaEvent_t copy_event = nullptr;
+
+    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
+    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+
+    explicit ggml_backend_cuda_context(int device) :
+        device(device),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_cuda_context() {
+        if (copy_event != nullptr) {
+            CUDA_CHECK(cudaEventDestroy(copy_event));
+        }
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+                if (streams[i][j] != nullptr) {
+                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+                }
+            }
+            if (cublas_handles[i] != nullptr) {
+                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+            }
+        }
+    }
+
+    cudaStream_t stream(int device, int stream) {
+        if (streams[device][stream] == nullptr) {
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
+        }
+        return streams[device][stream];
+    }
+
+    cudaStream_t stream() {
+        return stream(device, 0);
+    }
+
+    cublasHandle_t cublas_handle(int device) {
+        if (cublas_handles[device] == nullptr) {
+            ggml_cuda_set_device(device);
+            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
+            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
+        }
+        return cublas_handles[device];
+    }
+
+    cublasHandle_t cublas_handle() {
+        return cublas_handle(device);
+    }
+
+    // pool
+    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+
+    ggml_cuda_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(device);
+        }
+        return *pools[device];
+    }
+
+    ggml_cuda_pool & pool() {
+        return pool(device);
+    }
+};
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
new file mode 100644
index 000000000..aafbaf803
--- /dev/null
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -0,0 +1,221 @@
+#include "concat.cuh"
+
+// contiguous kernels
+static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+
+    if (nidx < ne00) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            (nidx - ne00) +
+            blockIdx.y * (ne0 - ne00) +
+            blockIdx.z * (ne0 - ne00) * gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+
+    if (blockIdx.y < ne01) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            (blockIdx.y - ne01) * ne0 +
+            blockIdx.z * ne0 * (gridDim.y - ne01);
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+
+    if (blockIdx.z < ne02) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            (blockIdx.z - ne02) * ne0 *  gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    if (dim == 0) {
+        concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
+        return;
+    }
+    if (dim == 1) {
+        concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
+        return;
+    }
+    concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+// non-contiguous kernel (slow)
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
+    concat_f32_non_cont(
+        const char * src0,
+        const char * src1,
+              char * dst,
+           int64_t   ne00,
+           int64_t   ne01,
+           int64_t   ne02,
+           int64_t   ne03,
+          uint64_t   nb00,
+          uint64_t   nb01,
+          uint64_t   nb02,
+          uint64_t   nb03,
+           int64_t /*ne10*/,
+           int64_t /*ne11*/,
+           int64_t /*ne12*/,
+           int64_t /*ne13*/,
+          uint64_t   nb10,
+          uint64_t   nb11,
+          uint64_t   nb12,
+          uint64_t   nb13,
+           int64_t   ne0,
+           int64_t /*ne1*/,
+           int64_t /*ne2*/,
+           int64_t /*ne3*/,
+          uint64_t   nb0,
+          uint64_t   nb1,
+          uint64_t   nb2,
+          uint64_t   nb3){
+    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
+
+    const int64_t i3 = blockIdx.z;
+    const int64_t i2 = blockIdx.y;
+    const int64_t i1 = blockIdx.x;
+
+    const float * x;
+
+    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+        } else {
+            if constexpr (dim == 0) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+            } else if constexpr (dim == 1) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+            } else if constexpr (dim == 2) {
+                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+            } else if constexpr (dim == 3) {
+                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+            }
+        }
+
+        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
+    }
+}
+
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    cudaStream_t stream = ctx.stream();
+
+    const int32_t dim = ((int32_t *) dst->op_params)[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        const float * src0_d = (const float *)src0->data;
+        const float * src1_d = (const float *)src1->data;
+
+        float * dst_d = (float *)dst->data;
+
+        if (dim != 3) {
+            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_f32_cuda(
+                        src0_d + i3 * (src0->nb[3] / 4),
+                        src1_d + i3 * (src1->nb[3] / 4),
+                        dst_d + i3 * ( dst->nb[3] / 4),
+                        src0->ne[0], src0->ne[1], src0->ne[2],
+                        dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
+            }
+        } else {
+            const size_t size0 = ggml_nbytes(src0);
+            const size_t size1 = ggml_nbytes(src1);
+
+            CUDA_CHECK(cudaMemcpyAsync(dst_d,           src0_d, size0, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+        }
+    } else {
+        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
+        auto launch_kernel = [&](auto dim) {
+            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
+                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+        };
+        switch (dim) {
+            case 0:
+                launch_kernel(std::integral_constant<int, 0>{});
+                break;
+            case 1:
+                launch_kernel(std::integral_constant<int, 1>{});
+                break;
+            case 2:
+                launch_kernel(std::integral_constant<int, 2>{});
+                break;
+            case 3:
+                launch_kernel(std::integral_constant<int, 3>{});
+                break;
+            default:
+                GGML_ABORT("Invalid dim: %d", dim);
+                break;
+        }
+    }
+}
diff --git a/ggml/src/ggml-cuda/concat.cuh b/ggml/src/ggml-cuda/concat.cuh
new file mode 100644
index 000000000..aa506a05f
--- /dev/null
+++ b/ggml/src/ggml-cuda/concat.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CONCAT_BLOCK_SIZE 256
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cu b/ggml/src/ggml-cuda/conv-transpose-1d.cu
new file mode 100644
index 000000000..b1e94d6f7
--- /dev/null
+++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu
@@ -0,0 +1,87 @@
+#include "conv-transpose-1d.cuh"
+
+static  __global__ void conv_transpose_1d_kernel(
+        const int s0, const int p0, const int d0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
+        const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
+        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
+        const float * src0, const float * src1,  float * dst) {
+    int global_index = threadIdx.x + blockIdx.x * blockDim.x;
+    if (global_index >= output_size) {
+        return;
+    }
+
+    int out_index = global_index / dst_ne0;
+
+    float accumulator = 0;
+
+    for (int c = 0; c < src0_ne2; c++) {
+        int idx = global_index % dst_ne0;
+
+        int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
+        int input_offset = src1_ne0 * c;
+
+        for (int i = 0; i < src1_ne0; i++) {
+            if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
+                continue;
+            }
+            int weight_idx = idx - i*s0;
+
+            float kernel_weight = src0[kernel_offset + weight_idx];
+            float input_value =  src1[input_offset+i];
+
+            accumulator += kernel_weight * input_value;
+        }
+    }
+    dst[global_index] = accumulator;
+}
+
+static void conv_transpose_1d_f32_f32_cuda(
+        const int s0, const int p0, const int d0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
+        const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
+        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
+        const float * src0, const float * src1,  float * dst,
+        cudaStream_t stream) {
+
+    const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
+    conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
+        s0,p0,d0,output_size,
+        src0_ne0, src0_ne1,  src0_ne2, src0_ne3,
+        src1_ne0, src1_ne1,  src1_ne2, src1_ne3,
+        dst_ne0,  dst_ne1,   dst_ne2,  dst_ne3,
+        src0,src1, dst);
+}
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src1_d = (const float *)src1->data;
+
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+
+    const int s0 = opts[0];
+    const int p0 = 0;//opts[3];
+    const int d0 = 1;//opts[4];
+
+    const int64_t kernel_size = ggml_nelements(src0);
+    const int64_t input_size = ggml_nelements(src1);
+    const int64_t output_size = ggml_nelements(dst);
+
+    conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+        src0_d, src1_d, dst_d, stream);
+}
diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cuh b/ggml/src/ggml-cuda/conv-transpose-1d.cuh
new file mode 100644
index 000000000..6c2cf666b
--- /dev/null
+++ b/ggml/src/ggml-cuda/conv-transpose-1d.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
new file mode 100644
index 000000000..5b0dfacef
--- /dev/null
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -0,0 +1,688 @@
+#include "convert.cuh"
+#include "dequantize.cuh"
+
+#define CUDA_Q8_0_NE_ALIGN 2048
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t ib = i/qk; // block index
+    const int64_t iqs = (i%qk)/qr; // quant index
+    const int64_t iybs = i - i%qk; // y block start index
+    const int64_t y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0]        = v.x;
+    y[iybs + iqs + y_offset] = v.y;
+}
+
+template <bool need_check>
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+    const int64_t   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+    const int * x0 = ((int *) vx) + blockIdx.x * nint;
+    half2 * y2 = (half2 *) (y + i0);
+
+    __shared__ int vals[nint];
+
+#pragma unroll
+    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+            break;
+        }
+
+        const int ix = ix0 + threadIdx.x;
+        vals[ix] = x0[ix];
+    }
+
+    __syncthreads();
+
+#pragma unroll
+    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+            return;
+        }
+
+        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+        const half    d = *b0;
+        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+    }
+#else
+    GGML_UNUSED(vx);
+    GGML_UNUSED(y);
+    GGML_UNUSED(k);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int64_t i = blockIdx.x;
+
+    // assume 32 threads
+    const int64_t tid = threadIdx.x;
+    const int64_t il  = tid/8;
+    const int64_t ir  = tid%8;
+    const int64_t ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = __half2float(x->d);
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >>  4) + dm;
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int64_t i = blockIdx.x;
+
+    // assume 32 threads
+    const int64_t tid = threadIdx.x;
+    const int64_t il  = tid/8;
+    const int64_t ir  = tid%8;
+    const int64_t ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const float2 d = __half22float2(x->dm);
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+        y[l+16] = d.x * (q[l] >>  4) + d.y;
+    }
+}
+
+//================================== k-quants
+
+template<typename dst_t>
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t n   = tid/32;
+    const int64_t l   = tid - 32*n;
+    const int64_t is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
+    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
+    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
+    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i = blockIdx.x;
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+    const int64_t r = threadIdx.x/4;
+    const int64_t tid = r/2;
+    const int64_t is0 = r%2;
+    const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
+    const int64_t n = tid / 4;
+    const int64_t j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int64_t is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    float d_all = x[i].d;
+    float dl = d_all * (us - 32);
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+}
+
+static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63; m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const int64_t i = blockIdx.x;
+
+    // assume 32 threads
+    const int64_t tid = threadIdx.x;
+    const int64_t il  = tid/8;
+    const int64_t ir  = tid%8;
+    const int64_t is  = 2*il;
+    const int64_t n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
+
+    const uint8_t * q = x[i].qs + 32*il + n*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = d1 * (q[l] & 0xF) - m1;
+        y[l +32] = d2 * (q[l] >>  4) - m2;
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const int64_t i = blockIdx.x;
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int64_t tid = threadIdx.x;
+    const int64_t il  = tid/16;   // il is in 0...3
+    const int64_t ir  = tid%16;   // ir is in 0...15
+    const int64_t is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const int64_t i = blockIdx.x;
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid/32;   // ip is 0 or 1
+    const int64_t il  = tid - 32*ip; // 0...32
+    const int64_t is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
+    const uint32_t aux32 = q2[2] | (q2[3] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq2_xs * x = (const block_iq2_xs *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t  * q3 = x[i].qs + 8*ib;
+    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
+    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq3_s * x = (const block_iq3_s *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
+    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
+    const uint8_t signs = x[i].signs[4*ib + il];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq1_s * x = (const block_iq1_s  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq1_m * x = (const block_iq1_m  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int64_t i   = blockIdx.x;
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
+    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+        const bool need_check = false;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    } else {
+        const bool need_check = true;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    }
+}
+
+template<typename dst_t>
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template <typename src_t, typename dst_t>
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const src_t * x = (src_t *) vx;
+
+    y[i] = x[i];
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
+to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
+                return dequantize_block_q8_0_f16_cuda;
+            }
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_IQ2_XXS:
+            return dequantize_row_iq2_xxs_cuda;
+        case GGML_TYPE_IQ2_XS:
+            return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_cuda;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_cuda;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_cuda;
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_IQ2_XXS:
+            return dequantize_row_iq2_xxs_cuda;
+        case GGML_TYPE_IQ2_XS:
+            return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_cuda;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_cuda;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_cuda;
+        case GGML_TYPE_F16:
+            return convert_unary_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16>;
+        default:
+            return nullptr;
+    }
+}
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
new file mode 100644
index 000000000..5394be9f1
--- /dev/null
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -0,0 +1,13 @@
+#include "common.cuh"
+
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+
+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
+to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
+
+to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu
new file mode 100644
index 000000000..08898115d
--- /dev/null
+++ b/ggml/src/ggml-cuda/count-equal.cu
@@ -0,0 +1,64 @@
+#include "common.cuh"
+#include "count-equal.cuh"
+
+#include <cstdint>
+
+template <typename T>
+static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) {
+    const int64_t i0 = (int64_t) blockIdx.x*dk;
+    const int64_t i1 = min(i0 + dk, k);
+
+    int nequal = 0;
+
+    for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) {
+        const T xi = x[i];
+        const T yi = y[i];
+        nequal += xi == yi;
+    }
+
+    nequal = warp_reduce_sum(nequal);
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+
+    atomicAdd((int *) dst, nequal);
+}
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT( dst->type == GGML_TYPE_I64);
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    int64_t * dst_d  = (int64_t *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
+    const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
+
+    CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1);
+
+    switch (src0->type) {
+        case GGML_TYPE_I32: {
+            const int * src0_d = (const int *) src0->data;
+            const int * src1_d = (const int *) src1->data;
+            count_equal<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_d, dne, ne);
+        } break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+}
diff --git a/ggml/src/ggml-cuda/count-equal.cuh b/ggml/src/ggml-cuda/count-equal.cuh
new file mode 100644
index 000000000..8467da79e
--- /dev/null
+++ b/ggml/src/ggml-cuda/count-equal.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
new file mode 100644
index 000000000..54c0f66d2
--- /dev/null
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -0,0 +1,543 @@
+#include "cpy.cuh"
+
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+
+static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = __float2half(*xi);
+}
+
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
+static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
+    const block_q8_0 * xi = (const block_q8_0 *) cxi;
+    float * dsti = (float *) cdsti;
+
+    const float d = (float)xi->d;
+
+    for (int j = 0; j < QK8_0; j++) {
+       dsti[j] = xi->qs[j] * d;
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q5_0 * dsti = (block_q5_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK5_0; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -16;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK5_0/2 + j]*id;
+
+        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
+        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
+
+        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+    }
+    memcpy(dsti->qh, &qh, sizeof(qh));
+}
+
+static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q5_1 * dsti = (block_q5_1 *) cdsti;
+
+    float min = xi[0];
+    float max = xi[0];
+
+    for (int j = 1; j < QK5_1; ++j) {
+        const float v = xi[j];
+        min = v < min ? v : min;
+        max = v > max ? v : max;
+    }
+
+    const float d  = (max - min) / 31;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x = d;
+    dsti->dm.y = min;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_1/2; ++j) {
+        const float x0 = (xi[0       + j] - min)*id;
+        const float x1 = (xi[QK5_1/2 + j] - min)*id;
+
+        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
+    }
+    memcpy(dsti->qh, &qh, sizeof(qh));
+}
+
+
+static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_NL; ++j) {
+        const float v = xi[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    float d = vmax / kvalues_iq4nl[0];
+    const float id = d ? 1.0f/d : 0.0f;
+
+    float sumqx = 0, sumq2 = 0;
+    for (int j = 0; j < QK4_NL/2; ++j) {
+        const float x0 = xi[0        + j]*id;
+        const float x1 = xi[QK4_NL/2 + j]*id;
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
+        dsti->qs[j] = xi0 | (xi1 << 4);
+        const float v0 = kvalues_iq4nl[xi0];
+        const float v1 = kvalues_iq4nl[xi1];
+        const float w0 = xi[0        + j]*xi[0        + j];
+        const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
+        sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
+        sumq2 += w0*v0*v0 + w1*v1*v1;
+    }
+
+    dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static void ggml_cpy_f16_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_q8_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_q8_0_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_q4_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_q4_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_q5_0_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK5_0 == 0);
+    const int num_blocks = ne / QK5_0;
+    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_q5_1_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK5_1 == 0);
+    const int num_blocks = ne / QK5_1;
+    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f32_iq4_nl_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    GGML_ASSERT(ne % QK4_NL == 0);
+    const int num_blocks = ne / QK4_NL;
+    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    //GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+    const int64_t nb03 = src0->nb[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+
+    //GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];
+
+    cudaStream_t main_stream = ctx.stream();
+
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;
+
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
+        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else {
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    ggml_cuda_cpy(ctx, src0, dst);
+}
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        return nullptr;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else {
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
new file mode 100644
index 000000000..28b06cdda
--- /dev/null
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -0,0 +1,9 @@
+#include "common.cuh"
+
+#define CUDA_CPY_BLOCK_SIZE 64
+
+void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
+
+void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu
new file mode 100644
index 000000000..27599a2b0
--- /dev/null
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu
@@ -0,0 +1,189 @@
+#include "common.cuh"
+#include "cross-entropy-loss.cuh"
+#include "sum.cuh"
+
+#include <cmath>
+#include <cstdint>
+
+template <bool use_shared>
+static __global__ void cross_entropy_loss_f32(
+        const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) {
+    extern __shared__ float tmp[];
+
+    logits += int64_t(blockIdx.x)*nclasses;
+    labels += int64_t(blockIdx.x)*nclasses;
+
+    // Find maximum for softmax:
+    float max_logit = -INFINITY;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float val = logits[i];
+        max_logit = fmaxf(max_logit, val);
+
+        if (use_shared) {
+            tmp[i] = val;
+        }
+    }
+    max_logit = warp_reduce_max(max_logit);
+
+    // Calculate log(softmax(logits)) which is just logits - max:
+    float sum = 0.0f;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float logit_i = use_shared ? tmp[i] : logits[i];
+        sum += expf(logit_i - max_logit);
+    }
+    sum = warp_reduce_sum(sum);
+    sum = logf(sum);
+
+    // log(exp(logits - max) / sum) = (logits - max) - log(sum)
+    float loss = 0.0f;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float logit_i = use_shared ? tmp[i] : logits[i];
+        loss += (logit_i - max_logit - sum) * labels[i];
+    }
+    loss = -warp_reduce_sum(loss) / (float)k;
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+
+    dst[blockIdx.x] = loss;
+}
+
+template <bool use_shared>
+static __global__ void cross_entropy_loss_back_f32(
+        const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels,
+        float * __restrict__ dst, const int nclasses) {
+    extern __shared__ float tmp[];
+
+    logits += int64_t(blockIdx.x)*nclasses;
+    labels += int64_t(blockIdx.x)*nclasses;
+    dst    += int64_t(blockIdx.x)*nclasses;
+
+    float maxval = -INFINITY;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float val = logits[i];
+        maxval = fmaxf(maxval, val);
+
+        if (use_shared) {
+            tmp[i] = val;
+        }
+    }
+    maxval = warp_reduce_max(maxval);
+
+    float sum = 0.0f;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float val = expf((use_shared ? tmp[i] : logits[i]) - maxval);
+        sum += val;
+
+        if (use_shared) {
+            tmp[i] = val;
+        } else {
+            dst[i] = val;
+        }
+    }
+    sum = warp_reduce_sum(sum);
+    const float sm_scale = 1.0f/sum;
+
+    const float d_by_nrows = *grad/gridDim.x;
+    for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
+        const float val = use_shared ? tmp[i] : dst[i];
+        dst[i] = (val*sm_scale - labels[i])*d_by_nrows;
+    }
+}
+
+void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(nrows, 1, 1);
+    const size_t nbytes_shared = ne00*sizeof(float);
+
+    const int    id    = ggml_cuda_get_device();
+    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
+
+    ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
+
+    if (nbytes_shared <= smpbo) {
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
+        if (!shared_memory_limit_raised[id]) {
+            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
+            shared_memory_limit_raised[id] = true;
+        }
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+        cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
+    } else {
+        cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
+    }
+    CUDA_CHECK(cudaGetLastError());
+
+    // Combine results from individual blocks:
+    sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
+}
+
+void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * grad  = dst->src[0];
+    const ggml_tensor * src0f = dst->src[1];
+    const ggml_tensor * src1f = dst->src[2];
+
+    GGML_ASSERT(src0f->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1f->type == GGML_TYPE_F32);
+    GGML_ASSERT( grad->type == GGML_TYPE_F32);
+    GGML_ASSERT(  dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_scalar(grad));
+    GGML_ASSERT(ggml_is_contiguous(src0f));
+    GGML_ASSERT(ggml_is_contiguous(src1f));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0f, src1f));
+    GGML_ASSERT(ggml_are_same_shape(src0f, dst));
+
+    const int64_t ne00  = src0f->ne[0];
+    const int64_t nrows = ggml_nrows(src0f);
+
+    const float * grad_d  = (const float *) grad->data;
+    const float * src0f_d = (const float *) src0f->data;
+    const float * src1f_d = (const float *) src1f->data;
+    float       * dst_d   = (float       *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(nrows, 1, 1);
+    const size_t nbytes_shared = ne00*sizeof(float);
+
+    const int    id    = ggml_cuda_get_device();
+    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
+
+    if (nbytes_shared <= smpbo) {
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
+        if (!shared_memory_limit_raised[id]) {
+            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
+            shared_memory_limit_raised[id] = true;
+        }
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+        cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
+    } else {
+        cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
+    }
+}
diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cuh b/ggml/src/ggml-cuda/cross-entropy-loss.cuh
new file mode 100644
index 000000000..9ec7152ff
--- /dev/null
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cuh
@@ -0,0 +1,7 @@
+#include "common.cuh"
+
+#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
+
+void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh
new file mode 100644
index 000000000..bd3c2d9db
--- /dev/null
+++ b/ggml/src/ggml-cuda/dequantize.cuh
@@ -0,0 +1,103 @@
+#include "common.cuh"
+
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 8.0f) * d;
+    v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 16.0f) * d;
+    v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x = x[ib].qs[iqs + 0];
+    v.y = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x *= d;
+    v.y *= d;
+#endif // GGML_CUDA_F16
+}
diff --git a/ggml/src/ggml-cuda/diagmask.cu b/ggml/src/ggml-cuda/diagmask.cu
new file mode 100644
index 000000000..4b713ba22
--- /dev/null
+++ b/ggml/src/ggml-cuda/diagmask.cu
@@ -0,0 +1,40 @@
+#include "diagmask.cuh"
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(nrows_x, block_num_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
+}
diff --git a/ggml/src/ggml-cuda/diagmask.cuh b/ggml/src/ggml-cuda/diagmask.cuh
new file mode 100644
index 000000000..6cdbef17e
--- /dev/null
+++ b/ggml/src/ggml-cuda/diagmask.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+
+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
new file mode 100644
index 000000000..d40ee2da4
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -0,0 +1,847 @@
+#pragma once
+
+#include "common.cuh"
+#include "convert.cuh"
+#include "vecdotq.cuh"
+
+#include <cstdint>
+
+#define FATTN_KQ_STRIDE       256
+#define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
+#define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
+
+typedef void (* fattn_kernel_t)(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3);
+
+typedef half (*vec_dot_KQ_f16_t)(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
+typedef float (*vec_dot_KQ_f32_t)(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
+
+template<typename T, int D>
+static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
+    GGML_UNUSED(Q_v);
+
+    T sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+        const int k_KQ = k_KQ_0 + threadIdx.x;
+
+        const int ib    = k_KQ /  QI8_1;
+        const int iqs4  = k_KQ %  QI4_0;
+        const int shift = k_KQ & (QI8_1/2);
+
+        const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
+        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
+
+#ifdef FP16_AVAILABLE
+        if (std::is_same<T, half>::value) {
+            const half2  * Q_ds = (const half2  *) Q_ds_v;
+
+            const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
+            sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
+        } else
+#endif // FP16_AVAILABLE
+        {
+            const float2 * Q_ds = (const float2 *) Q_ds_v;
+
+            sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (8/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
+        }
+    }
+
+    return sum;
+}
+
+template<typename T, int D>
+static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
+    GGML_UNUSED(Q_v);
+
+    T sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+        const int k_KQ = k_KQ_0 + threadIdx.x;
+
+        const int ib    = k_KQ /  QI8_1;
+        const int iqs4  = k_KQ %  QI4_1;
+        const int shift = k_KQ & (QI8_1/2);
+
+        const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
+        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
+
+#ifdef FP16_AVAILABLE
+        if (std::is_same<T, half>::value) {
+            const half2  * Q_ds = (const half2  *) Q_ds_v;
+
+            const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
+            const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
+            sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
+        } else
+#endif // FP16_AVAILABLE
+        {
+            const float2 * Q_ds = (const float2 *) Q_ds_v;
+
+            const float sumid4d8   =  __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
+            const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
+
+            sum += (T) (sumid4d8 + m4s8scaled);
+        }
+    }
+
+    return sum;
+}
+
+template<typename T, int D>
+static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
+    GGML_UNUSED(Q_v);
+
+    T sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+        const int k_KQ = k_KQ_0 + threadIdx.x;
+
+        const int ib    = k_KQ /  QI8_1;
+        const int iqs4  = k_KQ %  QI5_0;
+        const int iqs8  = k_KQ %  QI8_1;
+        const int shift = k_KQ & (QI8_1/2);
+
+        int v = (get_int_b2(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
+        const int vh = get_int_b2(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
+        v |= (vh <<  4) & 0x00000010; // 0 ->  4
+        v |= (vh << 11) & 0x00001000; // 1 -> 12
+        v |= (vh << 18) & 0x00100000; // 2 -> 20
+        v |= (vh << 25) & 0x10000000; // 3 -> 28
+
+        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
+
+#ifdef FP16_AVAILABLE
+        if (std::is_same<T, half>::value) {
+            const half2  * Q_ds = (const half2  *) Q_ds_v;
+
+            const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/WARP_SIZE];
+            sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
+        } else
+#endif // FP16_AVAILABLE
+        {
+            const float2 * Q_ds = (const float2 *) Q_ds_v;
+
+            sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/WARP_SIZE].x - (16/QI8_1)*Q_ds[k_KQ_0/WARP_SIZE].y));
+        }
+    }
+
+    return sum;
+}
+
+template<typename T, int D>
+static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
+    GGML_UNUSED(Q_v);
+
+    T sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+        const int k_KQ = k_KQ_0 + threadIdx.x;
+
+        const int ib    = k_KQ /  QI8_1;
+        const int iqs4  = k_KQ %  QI5_1;
+        const int iqs8  = k_KQ %  QI8_1;
+        const int shift = k_KQ & (QI8_1/2);
+
+        int v = (get_int_b2(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
+        const int vh = get_int_b2(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
+        v |= (vh <<  4) & 0x00000010; // 0 ->  4
+        v |= (vh << 11) & 0x00001000; // 1 -> 12
+        v |= (vh << 18) & 0x00100000; // 2 -> 20
+        v |= (vh << 25) & 0x10000000; // 3 -> 28
+
+        const int u = Q_q8[k_KQ_0/WARP_SIZE];
+
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
+
+#ifdef FP16_AVAILABLE
+        if (std::is_same<T, half>::value) {
+            const half2  * Q_ds = (const half2  *) Q_ds_v;
+
+            const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/WARP_SIZE];
+            const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
+            sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
+        } else
+#endif // FP16_AVAILABLE
+        {
+            const float2 * Q_ds = (const float2 *) Q_ds_v;
+
+            const float sumid5d8   =  __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].x * sumi;
+            const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/WARP_SIZE].y / QI8_1;
+
+            sum += (T) (sumid5d8 + m5s8scaled);
+        }
+    }
+
+    return sum;
+}
+
+template <typename T, int D>
+static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
+    GGML_UNUSED(Q_v);
+
+    T sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
+        const int k_KQ = k_KQ_0 + threadIdx.x;
+
+        const int ib  = k_KQ / QI8_0;
+        const int iqs = k_KQ % QI8_0;
+
+        const int v = get_int_b2(K_q8_0[ib].qs, iqs);
+
+        T Q_d;
+        if (std::is_same<T, half>::value) {
+            const half2  * Q_ds = (const half2  *) Q_ds_v;
+            Q_d = __low2half(Q_ds[k_KQ_0/WARP_SIZE]);
+        } else {
+            const float2 * Q_ds = (const float2 *) Q_ds_v;
+            Q_d = Q_ds[k_KQ_0/WARP_SIZE].x;
+        }
+
+        sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/WARP_SIZE], K_q8_0[ib].d, Q_d);
+    }
+
+    return sum;
+}
+
+template <typename T, int D>
+static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
+
+    const half2 * K_h2 = (const half2 *) K_c;
+    GGML_UNUSED(Q_q8);
+    GGML_UNUSED(Q_ds_v);
+
+#ifdef FP16_AVAILABLE
+    if (std::is_same<T, half>::value) {
+        const half2 * Q_h2 = (const half2 *) Q_v;
+
+        half2 sum2 = make_half2(0.0f, 0.0f);
+
+#pragma unroll
+        for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
+            const int k_KQ = k_KQ_0 + threadIdx.x;
+
+            const half2 K_ik = K_h2[k_KQ];
+            sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
+        }
+
+        return __low2half(sum2) + __high2half(sum2);
+    }
+#endif // FP16_AVAILABLE
+
+    const float2 * Q_f2 = (const float2 *) Q_v;
+
+    float sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
+        const int k_KQ = k_KQ_0 + threadIdx.x;
+
+        const half2 K_ik = K_h2[k_KQ];
+        sum +=  __low2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].x;
+        sum += __high2float(K_ik) * Q_f2[k_KQ_0/WARP_SIZE].y;
+    }
+
+    return sum;
+}
+
+template <typename Tds>
+static __device__ __forceinline__ void quantize_q8_1_to_shared(
+    const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
+
+    float vals[sizeof(int)] = {0.0f};
+#pragma unroll
+    for (int l = 0; l < sizeof(int); ++l) {
+        vals[l] = scale * x[4*threadIdx.x + l];
+    }
+
+    float amax = fabsf(vals[0]);
+    float sum  = vals[0];
+#pragma unroll
+    for (int l = 1; l < sizeof(int); ++l) {
+        amax = fmaxf(amax, fabsf(vals[l]));
+        sum += vals[l];
+    }
+#pragma unroll
+    for (int mask = QI8_1/2; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, 32));
+        sum +=             __shfl_xor_sync(0xFFFFFFFF, sum,  mask, 32);
+    }
+
+    const float d = amax / 127;
+    int q32 = 0;
+    int8_t * q8 = (int8_t *) &q32;
+
+    if (d != 0.0f) {
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            q8[l] = roundf(vals[l] / d);
+        }
+    }
+
+    yq32[threadIdx.x] = q32;
+    if (threadIdx.x % QI8_1 == 0) {
+        if (std::is_same<Tds, half2>::value) {
+            ((half2  *) yds)[threadIdx.x/QI8_1] =  make_half2(d, sum);
+        } else {
+            ((float2 *) yds)[threadIdx.x/QI8_1] = make_float2(d, sum);
+        }
+    }
+}
+
+typedef half  (*dequantize_1_f16_t)(const void *, const int64_t);
+typedef float (*dequantize_1_f32_t)(const void *, const int64_t);
+
+template <typename T>
+static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__ vx, const int64_t i) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const int64_t ib    =  i          /  QK4_0;
+    const int     iqs   =  i          % (QK4_0/2);
+    const int     shift = (i % QK4_0) / (QK4_0/2);
+
+    const T   d  = x[ib].d;
+    const int q0 = x[ib].qs[iqs];
+    const int q  = ((q0 >> (4*shift)) & 0x0F) - 8;
+
+#ifdef FP16_AVAILABLE
+    if (std::is_same<T, half>::value) {
+        return ((half) d)*((half) q);
+    }
+#endif // FP16_AVAILABLE
+
+    return ((float) d)*((float) q);
+}
+
+template <typename T>
+static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__ vx, const int64_t i) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const int64_t ib    =  i          /  QK4_1;
+    const int     iqs   =  i          % (QK4_1/2);
+    const int     shift = (i % QK4_1) / (QK4_1/2);
+
+    const half2 dm = x[ib].dm;
+    const int   q0 = x[ib].qs[iqs];
+    const int   q  = ((q0 >> (4*shift)) & 0x0F);
+
+#ifdef FP16_AVAILABLE
+    if (std::is_same<T, half>::value) {
+        return __low2half(dm)*((half) q) + __high2half(dm);
+    }
+#endif // FP16_AVAILABLE
+
+    return __low2float(dm)*((float) q) + __high2float(dm);
+}
+
+template <typename T>
+static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__ vx, const int64_t i) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const int64_t ib    =  i          /  QK5_0;
+    const int     idq   =  i          %  QK5_0;
+    const int     iqs   =  i          % (QK5_0/2);
+    const int     shift = (i % QK5_0) / (QK5_0/2);
+
+    const T   d   = x[ib].d;
+    const int ql0 = x[ib].qs[iqs];
+    const int qh0 = get_int_b2(x[ib].qh, 0);
+    const int ql  = ((ql0 >> (4*shift)) & 0x0F);
+    const int qh  = ((qh0 >> idq) << 4) & 0x10;
+    const int q   = (ql | qh) - 16;
+
+#ifdef FP16_AVAILABLE
+    if (std::is_same<T, half>::value) {
+        return ((half) d)*((half) q);
+    }
+#endif // FP16_AVAILABLE
+
+    return ((float) d)*((float) q);
+}
+
+template <typename T>
+static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__ vx, const int64_t i) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const int64_t ib    =  i          /  QK5_1;
+    const int     idq   =  i          %  QK5_1;
+    const int     iqs   =  i          % (QK5_1/2);
+    const int     shift = (i % QK5_1) / (QK5_1/2);
+
+    const half2 dm  = x[ib].dm;
+    const int   ql0 = x[ib].qs[iqs];
+    const int   qh0 = get_int_b4(x[ib].qh, 0);
+    const int   ql  = ((ql0 >> (4*shift)) & 0x0F);
+    const int   qh  = ((qh0 >> idq) << 4) & 0x10;
+    const int   q   = (ql | qh);
+
+#ifdef FP16_AVAILABLE
+    if (std::is_same<T, half>::value) {
+        return __low2half(dm)*((half) q) + __high2half(dm);
+    }
+#endif // FP16_AVAILABLE
+
+    return __low2float(dm)*((float) q) + __high2float(dm);
+}
+
+template <typename T>
+static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__ vx, const int64_t i) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const int64_t ib  = i / QK8_0;
+    const int     iqs = i % QK8_0;
+
+    const T   d = x[ib].d;
+    const int q = x[ib].qs[iqs];
+
+#ifdef FP16_AVAILABLE
+    if (std::is_same<T, half>::value) {
+        return ((half) d)*((half) q);
+    }
+#endif // FP16_AVAILABLE
+
+    return ((float) d)*((float) q);
+}
+
+template <typename T>
+static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ vx, const int64_t i) {
+    const half * x = (const half *) vx;
+
+    return x[i];
+}
+
+template <int D>
+constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
+    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D> :
+        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D> :
+        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D> :
+        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D> :
+        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D> :
+        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D> :
+        nullptr;
+}
+
+template <int D>
+constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
+    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D> :
+        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D> :
+        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D> :
+        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D> :
+        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D> :
+        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D> :
+        nullptr;
+}
+
+constexpr __device__ dequantize_1_f16_t get_dequantize_1_f16(ggml_type type_V) {
+    return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<half> :
+        type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<half> :
+        type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<half> :
+        type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<half> :
+        type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<half> :
+        type_V == GGML_TYPE_F16 ? dequantize_1_f16<half> :
+        nullptr;
+}
+
+constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
+    return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<float> :
+        type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<float> :
+        type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<float> :
+        type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<float> :
+        type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<float> :
+        type_V == GGML_TYPE_F16 ? dequantize_1_f16<float> :
+        nullptr;
+}
+
+// The HIP compiler for some reason complains that it can't unroll a loop because of the jt*ncols + j >= ne01 conditional.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif // __clang__
+
+template<int D, int ncols, int KQ_stride> // D == head size
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_stream_k_fixup(
+        float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) {
+    const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols);
+
+    const int iter_k = ne11 / KQ_stride;
+    const int iter_j = (ne01 + (ncols - 1)) / ncols;
+
+    const int bidx0 = blockIdx.x;
+
+    const int kbc0      = (bidx0 + 0)*iter_k*iter_j*ne02 / gridDim.x;
+    const int kbc0_stop = (bidx0 + 1)*iter_k*iter_j*ne02 / gridDim.x;
+
+    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
+    const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
+    const bool did_not_write_last      = kbc0/iter_k == kbc0_stop/iter_k && kbc0_stop % iter_k != 0;
+    if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
+        return;
+    }
+
+    const int channel = kbc0 / (iter_k*iter_j);
+    const int jt      = (kbc0 - channel*iter_k*iter_j) / iter_k;
+
+    dst += jt*ncols*ne02*D + channel*D;
+
+    // Load the partial result that needs a fixup:
+    float dst_val[ncols] = {0.0f};
+    float max_val[ncols] = {0.0f};
+    float rowsum[ncols]  = {0.0f};
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (jt*ncols + j >= ne01) {
+            break;
+        }
+        dst_val[j] = dst[j*ne02*D + threadIdx.x];
+
+        const float2 tmp = dst_fixup[bidx0*ncols + j];
+        max_val[j] = tmp.x;
+        rowsum[j]  = tmp.y;
+    }
+
+    // Iterate over previous blocks and compute the combined results.
+    // All CUDA blocks that get here must have a previous block that needs a fixup.
+    int bidx = bidx0 - 1;
+    int kbc_stop = kbc0;
+    while(true) {
+        const int kbc = bidx*iter_k*iter_j*ne02 / gridDim.x;
+        if (kbc == kbc_stop) { // Did not have any data.
+            bidx--;
+            kbc_stop = kbc;
+            continue;
+        }
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            if (jt*ncols + j >= ne01) {
+                break;
+            }
+            const float dst_add = dst_fixup_data[bidx*ncols*D + j*D + threadIdx.x];
+
+            const float2 tmp = dst_fixup[(gridDim.x + bidx)*ncols + j];
+
+            // Scale the current and new value accumulators depending on the max. values.
+            const float max_val_new = fmaxf(max_val[j], tmp.x);
+
+            const float diff_val = max_val[j] - max_val_new;
+            const float diff_add = tmp.x      - max_val_new;
+
+            const float scale_val = diff_val >= SOFTMAX_FTZ_THRESHOLD ? expf(diff_val) : 0.0f;
+            const float scale_add = diff_add >= SOFTMAX_FTZ_THRESHOLD ? expf(diff_add) : 0.0f;
+
+            dst_val[j] = scale_val*dst_val[j] + scale_add*dst_add;
+            rowsum[j]  = scale_val*rowsum[j]  + scale_add*tmp.y;
+
+            max_val[j] = max_val_new;
+        }
+
+        // If this block started in a previous tile we are done and don't need to combine additional partial results.
+        if (kbc % iter_k == 0 || kbc/iter_k < kbc0/iter_k) {
+            break;
+        }
+        bidx--;
+        kbc_stop = kbc;
+    }
+
+    // Write back final result:
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (jt*ncols + j >= ne01) {
+            return;
+        }
+        dst[j*ne02*D + threadIdx.x] = dst_val[j] / rowsum[j];
+    }
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif // __clang__
+
+template<int D, int parallel_blocks> // D == head size
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_combine_results(
+        const float  * __restrict__ VKQ_parts,
+        const float2 * __restrict__ VKQ_meta,
+        float * __restrict__ dst) {
+    VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
+    VKQ_meta  += parallel_blocks   * gridDim.y*blockIdx.x;
+    dst       +=                 D * gridDim.y*blockIdx.x;
+
+    const int tid = threadIdx.x;
+    __builtin_assume(tid < D);
+
+    __shared__ float2 meta[parallel_blocks];
+    if (tid < 2*parallel_blocks) {
+        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
+    }
+
+    __syncthreads();
+
+    float kqmax = meta[0].x;
+#pragma unroll
+    for (int l = 1; l < parallel_blocks; ++l) {
+        kqmax = max(kqmax, meta[l].x);
+    }
+
+    float VKQ_numerator   = 0.0f;
+    float VKQ_denominator = 0.0f;
+#pragma unroll
+    for (int l = 0; l < parallel_blocks; ++l) {
+        const float diff = meta[l].x - kqmax;
+        const float KQ_max_scale = expf(diff);
+        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
+        *((uint32_t *) &KQ_max_scale) &= ftz_mask;
+
+        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
+        VKQ_denominator += KQ_max_scale * meta[l].y;
+    }
+
+    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
+}
+
+static void on_no_fattn_vec_case(const int D) {
+    if (D == 64) {
+        fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
+        fprintf(stderr, "By default only f16 KV cache is supported.\n");
+        fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
+        GGML_ABORT("fatal error");
+    } else if (D == 128) {
+        fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
+        fprintf(stderr, "Supported combinations:\n");
+        fprintf(stderr, "  - K == q4_0, V == q4_0,  4.50 BPV\n");
+        fprintf(stderr, "  - K == q8_0, V == q8_0,  8.50 BPV\n");
+        fprintf(stderr, "  - K == f16,  V == f16,  16.00 BPV\n");
+        fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
+        GGML_ABORT("fatal error");
+    } else {
+        fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
+        fprintf(stderr, "Only f16 is supported.\n");
+        GGML_ABORT("fatal error");
+    }
+}
+
+// parallel_blocks == 0 is stream-k decomposition
+template <int D, int cols_per_block, int parallel_blocks, int KQ_stride>
+void launch_fattn(
+    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
+    const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V
+) {
+    const ggml_tensor * Q = dst->src[0];
+    const ggml_tensor * K = dst->src[1];
+    const ggml_tensor * V = dst->src[2];
+
+    const ggml_tensor * mask = dst->src[3];
+
+    ggml_tensor * KQV = dst;
+
+    GGML_ASSERT(Q->type == GGML_TYPE_F32);
+    GGML_ASSERT(KQV->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
+    GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
+                                "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
+
+    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
+
+    GGML_ASSERT(Q->ne[3] == 1);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t main_stream = ctx.stream();
+    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+
+    ggml_cuda_pool_alloc<half>   K_f16(pool);
+    ggml_cuda_pool_alloc<half>   V_f16(pool);
+    ggml_cuda_pool_alloc<float>  dst_tmp(pool);
+    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
+
+    const char * K_data = (const char *) K->data;
+    size_t nb11 = K->nb[1];
+    size_t nb12 = K->nb[2];
+    size_t nb13 = K->nb[3];
+
+    const char * V_data = (const char *) V->data;
+    size_t nb21 = V->nb[1];
+    size_t nb22 = V->nb[2];
+    size_t nb23 = V->nb[3];
+
+    if (need_f16_K && K->type != GGML_TYPE_F16) {
+        K_f16.alloc(ggml_nelements(K));
+        to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
+        to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
+        K_data = (char *) K_f16.ptr;
+
+        const size_t bs = ggml_blck_size(K->type);
+        const size_t ts = ggml_type_size(K->type);
+
+        nb11 = nb11*bs*sizeof(half)/ts;
+        nb12 = nb12*bs*sizeof(half)/ts;
+        nb13 = nb13*bs*sizeof(half)/ts;
+    }
+
+    if (need_f16_V && V->type != GGML_TYPE_F16) {
+        V_f16.alloc(ggml_nelements(V));
+        to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
+        to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
+        V_data = (char *) V_f16.ptr;
+
+        const size_t bs = ggml_blck_size(V->type);
+        const size_t ts = ggml_type_size(V->type);
+
+        nb21 = nb21*bs*sizeof(half)/ts;
+        nb22 = nb22*bs*sizeof(half)/ts;
+        nb23 = nb23*bs*sizeof(half)/ts;
+    }
+
+    const int ntiles_x = ((Q->ne[1] + cols_per_block - 1) / cols_per_block);
+    const int ntiles_total = ntiles_x*Q->ne[2]*Q->ne[3];
+
+    const dim3 block_dim(WARP_SIZE, nwarps, 1);
+    dim3 blocks_num;
+    if (parallel_blocks == 0) {
+        // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
+        const int tiles_nwaves  = (ntiles_total - nsm - 1) / nsm;
+        const bool tiles_inefficient = 3*nsm < 2*tiles_nwaves*ntiles_total;
+        const bool short_context = K->ne[1] < 4096;
+
+        const int nblocks_stream_k = 2*nsm;
+
+        blocks_num.x = short_context && !tiles_inefficient ? ntiles_total : nblocks_stream_k;
+        blocks_num.y = 1;
+        blocks_num.z = 1;
+
+        dst_tmp_meta.alloc(blocks_num.x*cols_per_block * (2*2 + D) * sizeof(float));
+    } else {
+        blocks_num.x = parallel_blocks*ntiles_x;
+        blocks_num.y = Q->ne[2];
+        blocks_num.z = Q->ne[3];
+
+        if (parallel_blocks > 1) {
+            dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
+            dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
+        }
+    }
+
+
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;
+
+    memcpy(&scale,         (const float *) KQV->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (const float *) KQV->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0.0f) {
+        scale /= logit_softcap;
+    }
+
+    const uint32_t n_head      = Q->ne[2];
+    const uint32_t n_head_log2 = 1u << uint32_t(floorf(log2f(float(n_head))));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
+        (const char *) Q->data,
+        K_data,
+        V_data,
+        mask ? ((const char *) mask->data) : nullptr,
+        (parallel_blocks) > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
+        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
+        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
+        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
+        mask ? mask->ne[1] : 0, mask ?  mask->nb[1] : 0,
+        Q->nb[1], Q->nb[2], Q->nb[3],
+        nb11, nb12, nb13,
+        nb21, nb22, nb23,
+        KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    if constexpr (parallel_blocks == 0) {
+        if (blocks_num.x % ntiles_total != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+            const dim3 block_dim_combine(D, 1, 1);
+            const dim3 blocks_num_combine = blocks_num;
+
+            flash_attn_stream_k_fixup<D, cols_per_block, KQ_stride>
+                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
+                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
+        }
+    } else if constexpr (parallel_blocks > 1) {
+        const dim3 block_dim_combine(D, 1, 1);
+        const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
+
+        flash_attn_combine_results<D, parallel_blocks>
+            <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
+            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
+    }
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
new file mode 100644
index 000000000..05bc91a3b
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -0,0 +1,637 @@
+#include "common.cuh"
+#include "mma.cuh"
+#include "fattn-common.cuh"
+
+template<int D, int ncols, int nwarps, int KQ_stride, bool use_logit_softcap, bool needs_fixup, bool is_fixup>
+static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
+        const float2 * const __restrict__ Q_f2,
+        const half2  * const __restrict__ K_h2,
+        const half2  * const __restrict__ V_h2,
+        const half   * const __restrict__ maskh,
+        float2       * const __restrict__ dstk,
+        float2       * const __restrict__ dstk_fixup,
+        const float scale,
+        const float slope,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3,
+        const int jt,
+        const int kb0_start,
+        const int kb0_stop) {
+#ifdef NEW_MMA_AVAILABLE
+    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+
+    typedef mma_A_I16K8<half2> mma_A;
+    typedef mma_B_J8K8<half2>  mma_B;
+    typedef mma_C_I16J8<float> mma_C_KQ;
+    typedef mma_C_I16J8<half2> mma_C_VKQ;
+
+    static_assert(nwarps*mma_B::J % ncols == 0, "bad nwarps");
+    constexpr int np = nwarps*mma_B::J / ncols; // Number of parallel CUDA warps per Q column.
+
+    static_assert(D         % nwarps == 0, "bad D");
+    static_assert(KQ_stride % nwarps == 0, "bad KQ_stride");
+
+    constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts.
+    extern __shared__ half2 tile_KV[]; // Temporary shared buffer for loading K/V data with KQ_stride*D logical elements.
+
+    const int stride_Q    = nb01 / sizeof(float2);
+    const int stride_KV   = nb11 / sizeof(half2);
+    const int stride_mask = nb31 / sizeof(half);
+
+    mma_B Q_B[D/(2*mma_B::K)];
+    mma_C_VKQ VKQ_C[D/mma_C_VKQ::I];
+
+    float2    KQ_rowsum = {0.0f, 0.0f};
+    float2       KQ_max = {-FLT_MAX/2.0f, -FLT_MAX/2.0f};
+    float2 KQ_max_scale = {0.0f, 0.0f};
+
+    // Temporarily load Q data into tile_KV, will be loaded into registers afterwards.
+    // The loading is done with decreasing granularity for D for better memory bandwidth.
+    const half2 scale_h2 = make_half2(scale, scale);
+#pragma unroll
+    for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
+        const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k);
+        const int k0_stop  =                             D/2 - (D/2) % (1*stride_k);
+        const int stride_j = WARP_SIZE / stride_k;
+
+        if (nwarps*stride_j > ncols && threadIdx.y*stride_j >= ncols) {
+            break;
+        }
+
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps*stride_j) {
+            const int j = j0 + threadIdx.y*stride_j + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
+
+            if (jt*ncols + j < ne01) {
+#pragma unroll
+                for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
+                    const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
+
+                    const float2 tmp = Q_f2[(jt*ncols + j)*stride_Q + k];
+                    tile_KV[j*D2_padded + k] = scale_h2 * make_half2(tmp.x, tmp.y);
+                }
+            } else {
+#pragma unroll
+                for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
+                    const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
+
+                    tile_KV[j*D2_padded + k] = make_half2(0.0f, 0.0f);
+                }
+            }
+        }
+    }
+
+    __syncthreads();
+
+    {
+        const int j0 = (threadIdx.y / np) * mma_B::J;
+
+#pragma unroll
+        for (int k0 = 0; k0 < D/2; k0 += mma_B::K) {
+            Q_B[k0/mma_B::K].load_ldmatrix(tile_KV + j0*D2_padded + k0, D2_padded);
+        }
+    }
+
+    __syncthreads();
+
+    // Iterate over ne11 == previous tokens:
+    for (int kb0 = kb0_start; kb0 < kb0_stop; ++kb0) {
+        const int k_VKQ_0 = kb0*KQ_stride;
+        mma_C_KQ KQ_C[KQ_stride/(np*mma_C_KQ::I)];
+
+        // Load K data into tile with decreasing granularity for D for better memory bandwidth:
+        static_assert(KQ_stride % (4*nwarps) == 0, "out of bounds");
+#pragma unroll
+        for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
+            const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k);
+            const int k0_stop  =                             D/2 - (D/2) % (1*stride_k);
+            const int stride_i = WARP_SIZE / stride_k;
+
+#pragma unroll
+            for (int i_KQ_0 = 0; i_KQ_0 < KQ_stride; i_KQ_0 += nwarps*stride_i) {
+                const int i_KQ = i_KQ_0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
+
+#pragma unroll
+                for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += stride_k) {
+                    const int k_KQ = k_KQ_0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
+
+                    tile_KV[i_KQ*D2_padded + k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV + k_KQ];
+                }
+            }
+        }
+
+        __syncthreads();
+
+        // Calculate tile of KQ:
+#pragma unroll
+        for (int i_KQ_00 = 0; i_KQ_00 < KQ_stride; i_KQ_00 += np*mma_A::I) {
+            const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*mma_A::I;
+#pragma unroll
+            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += mma_A::K) {
+                mma_A K_A;
+                K_A.load_ldmatrix(tile_KV + i_KQ_0*D2_padded + k_KQ_0, D2_padded);
+                KQ_C[i_KQ_00/(np*mma_A::I)].mma(K_A, Q_B[k_KQ_0/mma_A::K]);
+            }
+        }
+
+        __syncthreads();
+
+        if (use_logit_softcap) {
+            static_assert(KQ_stride % (np*mma_C_KQ::I) == 0, "bad loop size");
+#pragma unroll
+            for (int i = 0; i < KQ_stride/(np*mma_C_KQ::I); ++i) {
+#pragma unroll
+                for (int l = 0; l < mma_C_KQ::ne; ++l) {
+                    KQ_C[i].x[l] = logit_softcap*tanhf(KQ_C[i].x[l]);
+                }
+            }
+        }
+
+        if (maskh) {
+            static_assert(KQ_stride % (np       *mma_C_KQ::I) == 0, "bad loop size");
+            static_assert(ncols     % (nwarps/np*mma_C_KQ::J) == 0, "bad loop size");
+#pragma unroll
+            for (int i00 = 0; i00 < KQ_stride; i00 += np*mma_C_KQ::I) {
+                const int i0 = i00 + (threadIdx.y % np)*mma_C_KQ::I;
+#pragma unroll
+                for (int l = 0; l < mma_C_KQ::ne; ++l) {
+                    const int i = i0 + mma_C_KQ::get_i(l);
+                    const int j = (threadIdx.y / np)*mma_C_KQ::J + mma_C_KQ::get_j(l);
+
+                    KQ_C[i00/(np*mma_C_KQ::I)].x[l] += slope*__half2float(maskh[j*stride_mask + k_VKQ_0 + i]);
+                }
+            }
+        }
+
+        // Calculate softmax for each KQ column using the current max. value.
+        // The divisor is stored in KQ_rowsum and will be applied at the end.
+        float2 KQ_max_new = KQ_max;
+        static_assert(KQ_stride % (np*mma_C_KQ::I) == 0, "bad loop size");
+#pragma unroll
+        for (int k = 0; k < KQ_stride/(np*mma_C_KQ::I); ++k) {
+#pragma unroll
+            for (int l0 = 0; l0 < mma_C_KQ::ne; l0 += 2) {
+                KQ_max_new.x = fmaxf(KQ_max_new.x, KQ_C[k].x[l0 + 0]);
+                KQ_max_new.y = fmaxf(KQ_max_new.y, KQ_C[k].x[l0 + 1]);
+            }
+        }
+
+        // Values per KQ column are spread across 8 threads, does not need full warp reduce:
+#pragma unroll
+        for (int offset = 16; offset > 2; offset >>= 1) {
+            KQ_max_new.x = fmaxf(KQ_max_new.x, __shfl_xor_sync(0xFFFFFFFF, KQ_max_new.x, offset, WARP_SIZE));
+            KQ_max_new.y = fmaxf(KQ_max_new.y, __shfl_xor_sync(0xFFFFFFFF, KQ_max_new.y, offset, WARP_SIZE));
+        }
+
+        {
+            const float2 diff = make_float2(KQ_max.x - KQ_max_new.x, KQ_max.y - KQ_max_new.y);
+            KQ_max_scale = make_float2(expf(diff.x), expf(diff.y));
+            if (diff.x <= SOFTMAX_FTZ_THRESHOLD) {
+                KQ_max_scale.x = 0.0f;
+            }
+            if (diff.y <= SOFTMAX_FTZ_THRESHOLD) {
+                KQ_max_scale.y = 0.0f;
+            }
+            KQ_max = KQ_max_new;
+        }
+
+        float2 KQ_rowsum_add = make_float2(0.0f, 0.0f);
+        static_assert(KQ_stride % (np*mma_C_KQ::I) == 0, "bad loop size");
+#pragma unroll
+        for (int k = 0; k < KQ_stride/(np*mma_C_KQ::I); ++k) {
+#pragma unroll
+            for (int l = 0; l < mma_C_KQ::ne; ++l) {
+                const float KQ_max_l = l % 2 == 0 ? KQ_max.x : KQ_max.y;
+                const float diff = KQ_C[k].x[l] - KQ_max_l;
+                KQ_C[k].x[l] = expf(diff);
+                if (diff <= SOFTMAX_FTZ_THRESHOLD) {
+                    KQ_C[k].x[l] = 0.0f;
+                }
+
+                if (l % 2 == 0) {
+                    KQ_rowsum_add.x += KQ_C[k].x[l];
+                } else {
+                    KQ_rowsum_add.y += KQ_C[k].x[l];
+                }
+            }
+        }
+
+        // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
+        KQ_rowsum.x = KQ_max_scale.x*KQ_rowsum.x + KQ_rowsum_add.x;
+        KQ_rowsum.y = KQ_max_scale.y*KQ_rowsum.y + KQ_rowsum_add.y;
+
+        const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale.x, KQ_max_scale.y);
+#pragma unroll
+        for (int i = 0; i < D/mma_C_VKQ::I; ++i) {
+#pragma unroll
+            for (int l = 0; l < mma_C_VKQ::ne; ++l) {
+                VKQ_C[i].x[l] *= KQ_max_scale_h2;
+            }
+        }
+
+        // Convert KQ C tiles into B tiles for VKQ calculation:
+        mma_B B[KQ_stride/(np*2*mma_B::K)];
+        static_assert(KQ_stride % (np*2*mma_B::K) == 0, "bad loop size");
+#pragma unroll
+        for (int k = 0; k < KQ_stride/(np*2*mma_B::K); ++k) {
+            B[k] = KQ_C[k].to_mma_B();
+        }
+
+        // Load V data into tile with decreasing granularity for D for better memory bandwidth:
+        static_assert(KQ_stride % (4*nwarps) == 0, "out of bounds");
+#pragma unroll
+        for (int stride_i : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
+            const int i0_start = stride_i == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_i);
+            const int i0_stop  =                             D/2 - (D/2) % (1*stride_i);
+            const int stride_k = WARP_SIZE / stride_i;
+
+#pragma unroll
+            for (int k_V_0 = 0; k_V_0 < KQ_stride; k_V_0 += nwarps*stride_k) {
+                const int k_V = k_V_0 + threadIdx.y*stride_k + (stride_i == WARP_SIZE ? 0 : threadIdx.x / stride_i);
+
+#pragma unroll
+                for (int i_V_0 = i0_start; i_V_0 < i0_stop; i_V_0 += stride_i) {
+                    const int i_V = i_V_0 + (stride_i == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_i);
+
+                    tile_KV[k_V*D2_padded + i_V] = V_h2[(k_VKQ_0 + k_V)*stride_KV + i_V];
+                }
+            }
+        }
+
+        __syncthreads();
+
+        // Calculate VKQ tile:
+#pragma unroll
+        for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += mma_C_VKQ::I) {
+            static_assert((KQ_stride/2) % (np*mma_A::K) == 0, "bad loop size");
+#pragma unroll
+            for (int k00 = 0; k00 < KQ_stride/2; k00 += np*mma_A::K) {
+                const int k0 = k00 + (threadIdx.y % np)*mma_A::K;
+
+                mma_A A;
+                A.load_ldmatrix_trans(tile_KV + 2*k0*D2_padded + i_VKQ_0/2, D2_padded);
+                VKQ_C[i_VKQ_0/mma_C_VKQ::I].mma(A, B[k00/(np*mma_A::K)]);
+            }
+        }
+
+        __syncthreads();
+    }
+
+    // Finally, sum up partial KQ rowsums.
+    // The partial sums are spread across 8 threads each, does not need full reduce.
+#pragma unroll
+    for (int offset = 16; offset > 2; offset >>= 1) {
+        KQ_rowsum.x += __shfl_xor_sync(0xFFFFFFFF, KQ_rowsum.x, offset, WARP_SIZE);
+        KQ_rowsum.y += __shfl_xor_sync(0xFFFFFFFF, KQ_rowsum.y, offset, WARP_SIZE);
+    }
+
+    // Write VKQ accumulators to shared memory in column-major format.
+    // It's faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM.
+    // Also for np > 1 the combination is done via these values in shared memory.
+    const int j_cwd = threadIdx.y*mma_B::J + mma_B::get_j(-1); // j combine write data
+#pragma unroll
+    for (int k0 = 0; k0 < D/2; k0 += mma_B::K) {
+        const mma_B B = VKQ_C[k0/mma_B::K].to_mma_B(); // Conversion of C to B matrix puts it in column-major format.
+
+#pragma unroll
+        for (int l = 0; l < mma_B::ne; ++l) {
+            const int k = k0 + mma_B::get_k(l);
+
+            tile_KV[j_cwd*D2_padded + k] = B.x[l];
+        }
+    }
+
+    const int j_cwmo = (threadIdx.x % (2*mma_C_VKQ::J)) / mma_C_VKQ::J; // j combine write meta offset
+    const int j_cwm = threadIdx.y*(2*mma_C_VKQ::J) + 2*mma_C_VKQ::get_j(-1) + j_cwmo; // j combine write meta
+    const float2 KQ_cmr = make_float2(((const float *) &KQ_max)[j_cwmo], ((const float *) &KQ_rowsum)[j_cwmo]); // KQ combine max rowsum
+
+    if (((!needs_fixup && !is_fixup) || np > 1) && threadIdx.x < 2*mma_C_VKQ::J) {
+        // Use the 16 bytes of padding in each row to store the meta data: KQ max, KQ rowsum, KQ max scale.
+        ((float2 *) tile_KV)[j_cwm*(D2_padded/2) + D/4] = KQ_cmr;
+    }
+
+    __syncthreads();
+
+    static_assert(np == 1 || np == 2 || np == 4, "bad np");
+    if (np == 1) {
+        // No combination is needed, the meta data can be directly written from registers to VRAM.
+        if (needs_fixup && threadIdx.x < mma_B::J) {
+            float2 * dstk_fixup_meta = dstk_fixup + blockIdx.x*ncols;
+            dstk_fixup_meta[j_cwm] = KQ_cmr;
+        }
+        if (is_fixup && threadIdx.x < mma_B::J) {
+            float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
+            dstk_fixup_meta[j_cwm] = KQ_cmr;
+        }
+    } else if (threadIdx.y % np == 0) {
+        // Combine the meta data for parallel warps via shared memory.
+        // Warps with threadIdx.y % np != 0 must NOT return early.
+        // All threads must return simultaneously to avoid race conditions with work on the next tile.
+
+        float * meta_j = (float *) tile_KV + (threadIdx.y*mma_B::J + threadIdx.x)*D2_padded + D/2;
+
+        float KQ_cm = -FLT_MAX/2; // KQ combine max per parallel warp.
+        if (np*mma_B::J == WARP_SIZE || threadIdx.x < np*mma_B::J) {
+            KQ_cm = meta_j[0];
+        }
+
+        float KQ_cmn = KQ_cm; // KQ combine max new, max between all parallel warps.
+#pragma unroll
+        for (int offset = np*mma_B::J/2; offset >= mma_B::J; offset >>= 1) {
+            KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE));
+        }
+
+        const float KQ_cms = expf(KQ_cm - KQ_cmn); // KQ combine max scale per warp.
+        float KQ_crs = 0.0f; // KQ combine rowsum, scaled sum of all parallel warps.
+        if (np*mma_B::J == WARP_SIZE || threadIdx.x < np*mma_B::J) {
+            KQ_crs = KQ_cms*meta_j[1];
+        }
+#pragma unroll
+        for (int offset = np*mma_B::J/2; offset >= mma_B::J; offset >>= 1) {
+            KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
+        }
+
+        // Write back combined meta data:
+        if (np*mma_B::J == WARP_SIZE || threadIdx.x < np*mma_B::J) {
+            meta_j[0] = KQ_cmn; // Combined max. KQ values.
+            meta_j[1] = KQ_crs; // Combined KQ rowsums.
+            meta_j[2] = KQ_cms; // KQ max scales per parallel warp.
+        }
+        if (needs_fixup && threadIdx.x < mma_B::J) {
+            float2 * dstk_fixup_meta = dstk_fixup + blockIdx.x*ncols;
+            dstk_fixup_meta[(threadIdx.y/np)*mma_B::J + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
+        }
+        if (is_fixup && threadIdx.x < mma_B::J) {
+            float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
+            dstk_fixup_meta[(threadIdx.y/np)*mma_B::J + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
+        }
+    }
+
+    if (np > 1) {
+        __syncthreads();
+    }
+
+    if (np == 1 || threadIdx.y % np == 0) {
+        // The first 2*2*gridDim.x*ncols floats in dstk_fixup are for storing max. values and row sums.
+        // The values after that are for the partial results of the individual blocks.
+        float2 * dstk_fixup_data = dstk_fixup + gridDim.x*(2*ncols) + blockIdx.x*(ncols*(D/2));
+
+#pragma unroll
+        for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) {
+            const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k);
+            const int k0_stop  =                             D/2 - (D/2) % (1*stride_k);
+            const int stride_j = WARP_SIZE / stride_k;
+
+            if (nwarps*stride_j > ncols && threadIdx.y*stride_j >= ncols) {
+                break;
+            }
+
+#pragma unroll
+            for (int j0_dst = 0; j0_dst < ncols; j0_dst += (nwarps/np)*stride_j) {
+                const int j_dst = j0_dst + (threadIdx.y/np)*stride_j + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k);
+                const int j_tile_KV = (j_dst/mma_B::J)*(np*mma_B::J) + j_dst % mma_B::J;
+
+                if (!is_fixup && jt*ncols + j_dst >= ne01) {
+                    continue;
+                }
+                const float * meta_j = (const float *) tile_KV + j_tile_KV*D2_padded + D/2;
+#pragma unroll
+                for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) {
+                    const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k);
+
+                    float2 dstk_val = make_float2(0.0f, 0.0f);
+#pragma unroll
+                    for (int ip = 0; ip < np; ++ip) {
+                        const float KQ_crs = np == 1 ? 1.0f : meta_j[ip*mma_B::J*D2_padded + 2];
+                        const float2 dstk_val_add = __half22float2(tile_KV[(j_tile_KV + ip*mma_B::J)*D2_padded + k]);
+                        dstk_val.x += dstk_val_add.x*KQ_crs;
+                        dstk_val.y += dstk_val_add.y*KQ_crs;
+                    }
+
+                    if (!needs_fixup && !is_fixup) {
+                        const float KQ_rowsum_j = meta_j[1];
+                        dstk_val.x /= KQ_rowsum_j;
+                        dstk_val.y /= KQ_rowsum_j;
+                    }
+
+                    if (is_fixup) {
+                        dstk_fixup_data[j_dst*(D/2) + k] = dstk_val;
+                    } else {
+                        dstk[(jt*ncols + j_dst)*ne02*(D/2) + k] = dstk_val;
+                    }
+                }
+            }
+        }
+    }
+
+    if (np > 1) {
+        __syncthreads();
+    }
+#else
+   NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+}
+
+template<int D, int ncols, int nwarps, int KQ_stride, bool use_logit_softcap>
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(nwarps*WARP_SIZE, 2)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_ext_f16(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3) {
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    static_assert(FATTN_KQ_STRIDE % KQ_stride == 0, "bad KQ_stride");
+
+    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
+
+    const int iter_k = ne11 / KQ_stride;
+    const int iter_j = (ne01 + (ncols - 1)) / ncols;
+
+    // kbc == k block continuous, current index in continuous ijk space.
+    int       kbc      = (blockIdx.x + 0)*iter_k*iter_j*ne02 / gridDim.x;
+    const int kbc_stop = (blockIdx.x + 1)*iter_k*iter_j*ne02 / gridDim.x;
+
+    // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
+    // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
+    // In the most general case >2 seams can fall into the same tile.
+
+    // kb0 == k start index when in the output tile.
+    int kb0_start = kbc % iter_k;
+    int kb0_stop  = min(iter_k, kb0_start + kbc_stop - kbc);
+    while (kbc < kbc_stop && kb0_stop == iter_k) {
+        const int channel = kbc / (iter_k*iter_j);
+        const int jt      = (kbc - channel*iter_k*iter_j) / iter_k; // j index of current tile.
+
+        const float2 * Q_f2  = (const float2 *) (Q + nb02* channel);
+        const half2  * K_h2  = (const half2  *) (K + nb12*(channel / gqa_ratio));
+        const half2  * V_h2  = (const half2  *) (V + nb12*(channel / gqa_ratio)); // K and V have same shape
+        const half   * maskh = mask ? (const half  *) mask + (nb31/sizeof(half))*jt*ncols : nullptr;
+        float2       * dstk  = ((float2 *) dst) + channel*(D/2);
+
+        const float slope = get_alibi_slope(max_bias, channel, n_head_log2, m0, m1);
+
+        constexpr bool is_fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
+        if (kb0_start == 0) {
+            constexpr bool needs_fixup = false; // CUDA block is working on an entire tile.
+            flash_attn_ext_f16_process_tile<D, ncols, nwarps, KQ_stride, use_logit_softcap, needs_fixup, is_fixup>
+                (Q_f2, K_h2, V_h2, maskh, dstk, dst_meta, scale, slope, logit_softcap,
+                ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne31, nb31, nb01, nb02, nb03, nb11, nb12, nb13, nb21, nb22, nb23, ne0, ne1, ne2, ne3,
+                jt, kb0_start, kb0_stop);
+        } else {
+            constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile.
+            flash_attn_ext_f16_process_tile<D, ncols, nwarps, KQ_stride, use_logit_softcap, needs_fixup, is_fixup>
+                (Q_f2, K_h2, V_h2, maskh, dstk, dst_meta, scale, slope, logit_softcap,
+                ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne31, nb31, nb01, nb02, nb03, nb11, nb12, nb13, nb21, nb22, nb23, ne0, ne1, ne2, ne3,
+                jt, kb0_start, kb0_stop);
+        }
+
+        kbc += iter_k;
+        kbc -= kbc % iter_k;
+
+        kb0_start = 0;
+        kb0_stop  = min(iter_k, kbc_stop - kbc);
+    }
+
+    if (kbc >= kbc_stop) {
+        return;
+    }
+
+    const int channel = kbc / (iter_k*iter_j);
+    const int jt      = (kbc - channel*iter_k*iter_j) / iter_k; // j index of current tile.
+
+    const float2 * Q_f2  = (const float2 *) (Q + nb02* channel);
+    const half2  * K_h2  = (const half2  *) (K + nb12*(channel / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V + nb12*(channel / gqa_ratio)); // K and V have same shape
+    const half   * maskh = mask ? (const half  *) mask + (nb31/sizeof(half))*jt*ncols : nullptr;
+    float2       * dstk  = ((float2 *) dst) + channel*(D/2);
+
+    const float slope = get_alibi_slope(max_bias, channel, n_head_log2, m0, m1);
+
+    constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
+    constexpr bool needs_fixup = false;
+    flash_attn_ext_f16_process_tile<D, ncols, nwarps, KQ_stride, use_logit_softcap, needs_fixup, is_fixup>
+        (Q_f2, K_h2, V_h2, maskh, dstk, dst_meta, scale, slope, logit_softcap,
+        ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne31, nb31, nb01, nb02, nb03, nb11, nb12, nb13, nb21, nb22, nb23, ne0, ne1, ne2, ne3,
+        jt, kb0_start, kb0_stop);
+}
+
+template <int D, int cols_per_block>
+void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    typedef mma_A_I16K8<half2> mma_A;
+    typedef mma_B_J8K8<half2>  mma_B;
+
+    static_assert(D              % mma_B::K == 0, "bad D");
+    static_assert(cols_per_block % mma_B::J == 0, "bad cols_per_block");
+
+    const ggml_tensor * KQV = dst;
+
+    constexpr int    KQ_stride     = D <= 128 ? 64 : 32;
+    constexpr int    nwarps        = (KQ_stride == 32 && cols_per_block <= 16) ?
+                                     cols_per_block/mma_B::J * KQ_stride/mma_A::I : (cols_per_block <= 8 ? 4 : 8);
+    constexpr size_t nbytes_shared = std::max(KQ_stride, nwarps*mma_B::J) * (D + 8) * sizeof(half);
+
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    fattn_kernel_t fattn_kernel;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, KQ_stride, use_logit_softcap>;
+    } else {
+        constexpr bool use_logit_softcap = true;
+        fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, KQ_stride, use_logit_softcap>;
+    }
+    launch_fattn<D, cols_per_block, 0, KQ_stride>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+}
+
+#define DECL_FATTN_MMA_F16_CASE(D, cols_per_block)                          \
+    template void ggml_cuda_flash_attn_ext_mma_f16_case                     \
+    <D, cols_per_block>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
+
+extern DECL_FATTN_MMA_F16_CASE( 64,  8);
+extern DECL_FATTN_MMA_F16_CASE( 80,  8);
+extern DECL_FATTN_MMA_F16_CASE( 96,  8);
+extern DECL_FATTN_MMA_F16_CASE(112,  8);
+extern DECL_FATTN_MMA_F16_CASE(128,  8);
+extern DECL_FATTN_MMA_F16_CASE(256,  8);
+
+extern DECL_FATTN_MMA_F16_CASE( 64, 16);
+extern DECL_FATTN_MMA_F16_CASE( 80, 16);
+extern DECL_FATTN_MMA_F16_CASE( 96, 16);
+extern DECL_FATTN_MMA_F16_CASE(112, 16);
+extern DECL_FATTN_MMA_F16_CASE(128, 16);
+extern DECL_FATTN_MMA_F16_CASE(256, 16);
+
+extern DECL_FATTN_MMA_F16_CASE( 64, 32);
+extern DECL_FATTN_MMA_F16_CASE( 80, 32);
+extern DECL_FATTN_MMA_F16_CASE( 96, 32);
+extern DECL_FATTN_MMA_F16_CASE(112, 32);
+extern DECL_FATTN_MMA_F16_CASE(128, 32);
+extern DECL_FATTN_MMA_F16_CASE(256, 32);
+
+extern DECL_FATTN_MMA_F16_CASE( 64, 64);
+extern DECL_FATTN_MMA_F16_CASE( 80, 64);
+extern DECL_FATTN_MMA_F16_CASE( 96, 64);
+extern DECL_FATTN_MMA_F16_CASE(112, 64);
+extern DECL_FATTN_MMA_F16_CASE(128, 64);
+extern DECL_FATTN_MMA_F16_CASE(256, 64);
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
new file mode 100644
index 000000000..d4edbad07
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -0,0 +1,365 @@
+#include "common.cuh"
+#include "fattn-common.cuh"
+#include "fattn-tile-f16.cuh"
+
+#define FATTN_KQ_STRIDE_TILE_F16 64
+
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(nwarps*WARP_SIZE, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_tile_ext_f16(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3) {
+#ifdef FP16_AVAILABLE
+
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
+
+    // Skip unused kernel variants for faster compilation:
+#ifdef FP16_MMA_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FP16_MMA_AVAILABLE
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
+    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+
+    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const half   * maskh = (const half   *)  mask + ne11*ic0;
+
+    const int stride_KV2 = nb11 / sizeof(half2);
+
+    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const half  slopeh = __float2half(slopef);
+
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+
+    __shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
+    half2 * KQ2 = (half2 *) KQ;
+
+    __shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.
+
+    half kqmax[ncols/nwarps];
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        kqmax[j0/nwarps] = -HALF_MAX_HALF;
+    }
+    half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};
+
+    half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
+
+    // Convert Q to half2 and store in registers:
+    __shared__ half2 Q_h2[ncols][D/2];
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+
+#pragma unroll
+        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+
+            const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
+            Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
+        }
+    }
+
+    __syncthreads();
+
+    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
+    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
+        // Calculate KQ tile and keep track of new maximum KQ values:
+
+        half kqmax_new[ncols/nwarps];
+#pragma unroll
+        for (int j = 0; j < ncols/nwarps; ++j) {
+            kqmax_new[j] = kqmax[j];
+        }
+
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
+            const int i_KQ = i_KQ_0 + threadIdx.y;
+
+#pragma unroll
+            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
+                const int k_KQ = k_KQ_0 + threadIdx.x;
+
+                KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
+            }
+        }
+
+        __syncthreads();
+
+        half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};
+
+#pragma unroll
+        for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
+            half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
+            half2 Q_k[ncols/nwarps];
+
+#pragma unroll
+            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
+                const int i_KQ = i_KQ_0 + threadIdx.x;
+
+                K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
+            }
+#pragma unroll
+            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                const int j_KQ = j_KQ_0 + threadIdx.y;
+
+                Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
+            }
+
+#pragma unroll
+            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
+#pragma unroll
+                for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                    sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
+                }
+            }
+        }
+
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
+            const int i_KQ = i_KQ_0 + threadIdx.x;
+
+#pragma unroll
+            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                const int j_KQ = j_KQ_0 + threadIdx.y;
+
+                half sum;
+                if (use_logit_softcap) {
+                    const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                    sum = logit_softcap * tanhf(tmp.x + tmp.y);
+                } else {
+                    sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                }
+                sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+
+                kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
+
+                KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+            kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
+            const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
+            kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
+
+#pragma unroll
+            for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
+                const half2 val = h2exp(diff);
+                kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
+                KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
+            }
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
+            const int k = k0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i];
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
+            half2  V_k[(D/2)/WARP_SIZE][2];
+            half2 KQ_k[ncols/nwarps];
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
+                V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
+            }
+#pragma unroll
+            for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+                const int j = j0 + threadIdx.y;
+
+                KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
+            }
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+#pragma unroll
+                for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+                    VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
+                    VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
+                }
+            }
+        }
+
+        __syncthreads();
+    }
+
+#pragma unroll
+    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
+        const int j_VKQ = j_VKQ_0 + threadIdx.y;
+
+        if (ic0 + j_VKQ >= ne01) {
+            return;
+        }
+
+        half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
+        kqsum_j = warp_reduce_sum((float)kqsum_j);
+
+#pragma unroll
+        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
+            const int i0 = i00 + 2*threadIdx.x;
+
+            half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
+            if (parallel_blocks == 1) {
+                dst_val /= __half2half2(kqsum_j);
+            }
+            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] =  __low2float(dst_val);
+            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
+        }
+
+        if (parallel_blocks != 1 && threadIdx.x == 0) {
+            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+        }
+    }
+#else
+   NO_DEVICE_CODE;
+#endif // FP16_AVAILABLE
+}
+
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
+void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * Q = dst->src[0];
+    switch (Q->ne[0]) {
+        case  64: {
+            constexpr int    D             = 64;
+            constexpr int    nwarps        = 8;
+            constexpr size_t nbytes_shared = 0;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+        } break;
+        case 128: {
+            constexpr int    D             = 128;
+            constexpr int    nwarps        = 8;
+            constexpr size_t nbytes_shared = 0;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+        } break;
+        default: {
+            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
+        } break;
+    }
+}
+
+void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+
+    const int32_t precision = KQV->op_params[3];
+    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
+
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    if (Q->ne[1] <= 16) {
+        constexpr int cols_per_block = 16;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 32) {
+        constexpr int cols_per_block = 32;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    constexpr int cols_per_block = 32;
+    constexpr int parallel_blocks = 1;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    }
+}
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cuh b/ggml/src/ggml-cuda/fattn-tile-f16.cuh
new file mode 100644
index 000000000..ffc587842
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
new file mode 100644
index 000000000..0d274f332
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -0,0 +1,356 @@
+#include "common.cuh"
+#include "fattn-common.cuh"
+#include "fattn-tile-f32.cuh"
+
+#define FATTN_KQ_STRIDE_TILE_F32 32
+
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(nwarps*WARP_SIZE, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_tile_ext_f32(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3) {
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
+
+    // Skip unused kernel variants for faster compilation:
+#ifdef FP16_MMA_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FP16_MMA_AVAILABLE
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
+    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+
+    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const half   * maskh = (const half   *)  mask + ne11*ic0;
+
+    const int stride_KV2 = nb11 / sizeof(half2);
+
+    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+
+    __shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];
+
+    __shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
+    float2 * KV_tmp2 = (float2 *) KV_tmp;
+
+    float kqmax[ncols/nwarps];
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        kqmax[j0/nwarps] = -FLT_MAX/2.0f;
+    }
+    float kqsum[ncols/nwarps] = {0.0f};
+
+    float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
+
+    // Convert Q to half2 and store in registers:
+    __shared__ float Q_f[ncols][D];
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+
+#pragma unroll
+        for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
+            float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
+            Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
+            Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
+        }
+    }
+
+    __syncthreads();
+
+    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
+    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
+        // Calculate KQ tile and keep track of new maximum KQ values:
+
+        float kqmax_new[ncols/nwarps];
+#pragma unroll
+        for (int j = 0; j < ncols/nwarps; ++j) {
+            kqmax_new[j] = kqmax[j];
+        }
+
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
+            const int i_KQ = i_KQ_0 + threadIdx.y;
+
+#pragma unroll
+            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
+                const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
+                KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] =  __low2float(tmp);
+                KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
+            }
+        }
+
+        __syncthreads();
+
+        float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};
+
+#pragma unroll
+        for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
+            float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
+            float Q_k[ncols/nwarps];
+
+#pragma unroll
+            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
+                const int i_KQ = i_KQ_0 + threadIdx.x;
+
+                K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
+            }
+#pragma unroll
+            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                const int j_KQ = j_KQ_0 + threadIdx.y;
+
+                Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
+            }
+
+#pragma unroll
+            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
+#pragma unroll
+                for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
+                }
+            }
+        }
+
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
+            const int i_KQ = i_KQ_0 + threadIdx.x;
+
+#pragma unroll
+            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
+                const int j_KQ = j_KQ_0 + threadIdx.y;
+
+                if (use_logit_softcap) {
+                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                }
+
+                sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
+
+                kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+
+                KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+            kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
+            const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
+            kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
+
+            float kqsum_add = 0.0f;
+#pragma unroll
+            for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
+                const float val = expf(diff);
+                kqsum_add += val;
+                KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
+            }
+            kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
+                VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
+            const int k = k0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                KV_tmp2[k*(D/2) + i].x =  __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
+                KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
+            float2 V_k[(D/2)/WARP_SIZE];
+            float  KQ_k[ncols/nwarps];
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
+            }
+#pragma unroll
+            for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+                const int j = j0 + threadIdx.y;
+
+                KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
+            }
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+#pragma unroll
+                for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+                    VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
+                    VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
+                }
+            }
+        }
+
+        __syncthreads();
+    }
+
+#pragma unroll
+    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
+        const int j_VKQ = j_VKQ_0 + threadIdx.y;
+
+        if (ic0 + j_VKQ >= ne01) {
+            return;
+        }
+
+        float kqsum_j = kqsum[j_VKQ_0/nwarps];
+        kqsum_j = warp_reduce_sum(kqsum_j);
+
+#pragma unroll
+        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
+            const int i0 = i00 + 2*threadIdx.x;
+
+            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
+            if (parallel_blocks == 1) {
+                dst_val.x /= kqsum_j;
+                dst_val.y /= kqsum_j;
+            }
+            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
+            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
+        }
+
+        if (parallel_blocks != 1 && threadIdx.x == 0) {
+            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+        }
+    }
+}
+
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
+void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * Q = dst->src[0];
+    switch (Q->ne[0]) {
+        case  64: {
+            constexpr int    D             = 64;
+            constexpr int    nwarps        = 8;
+            constexpr size_t nbytes_shared = 0;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+        } break;
+        case 128: {
+            constexpr int    D             = 128;
+            constexpr int    nwarps        = 8;
+            constexpr size_t nbytes_shared = 0;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+        } break;
+        default: {
+            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
+        } break;
+    }
+}
+
+void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q = dst->src[0];
+
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    if (Q->ne[1] <= 16) {
+        constexpr int cols_per_block = 16;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 32) {
+        constexpr int cols_per_block = 32;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    constexpr int cols_per_block = 32;
+    constexpr int parallel_blocks = 1;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    }
+}
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cuh b/ggml/src/ggml-cuda/fattn-tile-f32.cuh
new file mode 100644
index 000000000..b1c546c80
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
new file mode 100644
index 000000000..d9ac44246
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -0,0 +1,448 @@
+#include "common.cuh"
+#include "fattn-common.cuh"
+
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_vec_ext_f16(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3) {
+#ifdef FP16_AVAILABLE
+
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
+
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+
+    constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
+    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
+    constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);
+
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
+    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+
+    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    Q += nb02* blockIdx.y              + nb01*ic0;
+    K += nb12*(blockIdx.y / gqa_ratio);
+    V += nb22*(blockIdx.y / gqa_ratio);
+
+    const half * maskh = (const half   *)  mask + ne11*ic0;
+
+    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const half  slopeh = __float2half(slopef);
+
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+    constexpr int nwarps = D / WARP_SIZE;
+    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+    __builtin_assume(tid < D);
+
+    __shared__ half KQ[ncols*D];
+    half2 * KQ2 = (half2 *) KQ;
+
+    half kqmax[ncols];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqmax[j] = -HALF_MAX_HALF;
+    }
+    half kqsum[ncols] = {0.0f};
+
+    __shared__ half kqmax_shared[ncols][WARP_SIZE];
+    __shared__ half kqsum_shared[ncols][WARP_SIZE];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (threadIdx.y == 0) {
+            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
+            kqsum_shared[j][threadIdx.x] = 0.0f;
+        }
+    }
+    __syncthreads();
+
+    // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
+    half2  Q_h2[ncols][D/(2*WARP_SIZE)];
+    int   Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
+    half2  Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
+    if (Q_q8_1) {
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+            if (j0 + nwarps > ncols && j >= ncols) {
+                break;
+            }
+
+            // Reuse KQ as temporary storage for converting Q to q8_1:
+            int   * tmp_q_i32 = (int   *) &KQ[j*D];
+            half2 * tmp_q_ds  = (half2 *) (tmp_q_i32 + D/sizeof(int));
+
+            // Set memory to zero if out of bounds:
+            if (ncols > 2 && ic0 + j >= ne01) {
+#pragma unroll
+                for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    tmp_q_i32[i] = 0;
+                }
+                if (threadIdx.x < D/QK8_1) {
+                    tmp_q_ds[threadIdx.x] = make_half2(0.0f, 0.0f);
+                }
+                continue;
+            }
+
+            const float * Q_f = (const float *) (Q + j*nb01);
+#pragma unroll
+            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+                quantize_q8_1_to_shared<half2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            int   * tmp_q_i32 = (int   *) &KQ[j*D];
+            half2 * tmp_q_ds  = (half2 *) (tmp_q_i32 + D/sizeof(int));
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
+                Q_ds[j][i0/WARP_SIZE]  = tmp_q_ds[i/QI8_1];
+            }
+        }
+
+        __syncthreads();
+    } else {
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
+                Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
+            }
+        }
+    }
+
+
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        KQ[j*D + tid] = -HALF_MAX_HALF;
+    }
+
+    half2 VKQ[ncols] = {{0.0f, 0.0f}};
+
+    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
+    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
+        // Calculate KQ tile and keep track of new maximum KQ values:
+
+        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
+        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
+        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
+        half kqmax_new = kqmax[0];
+        half kqmax_new_arr[ncols];
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            kqmax_new_arr[j] = kqmax[j];
+        }
+
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
+            const int i_KQ = i_KQ_0 + threadIdx.y;
+
+            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
+                break;
+            }
+
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
+                sum = warp_reduce_sum((float)sum);
+
+                if (use_logit_softcap) {
+                    sum = logit_softcap*tanhf(sum);
+                }
+
+                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+
+                if (ncols == 1) {
+                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
+                } else {
+                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
+                }
+
+                if (threadIdx.x == 0) {
+                    KQ[j*D + i_KQ] = sum;
+                }
+            }
+        }
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
+
+            if (threadIdx.x == 0) {
+                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+
+            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
+            kqmax[j] = kqmax_new_j;
+
+            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
+            kqsum[j] = kqsum[j]*KQ_max_scale + val;
+            KQ[j*D + tid] = val;
+
+            VKQ[j] *= __half2half2(KQ_max_scale);
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k0 = 0; k0 < D; k0 += 2) {
+            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
+                break;
+            }
+
+            half2 V_k;
+            reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k_VKQ_0 + k0 + 0)*nb21, tid);
+            reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k_VKQ_0 + k0 + 1)*nb21, tid);
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
+            }
+        }
+
+        __syncthreads();
+    }
+
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqsum[j] = warp_reduce_sum((float)kqsum[j]);
+        if (threadIdx.x == 0) {
+            kqsum_shared[j][threadIdx.y] = kqsum[j];
+        }
+    }
+
+    __syncthreads();
+
+#pragma unroll
+    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
+        if (ncols > 2 && ic0 + j_VKQ >= ne01) {
+            break;
+        }
+
+        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
+        kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
+
+        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
+        if (parallel_blocks == 1) {
+            dst_val /= kqsum[j_VKQ];
+        }
+        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+    }
+
+    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
+        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
+    }
+#else
+   NO_DEVICE_CODE;
+#endif // FP16_AVAILABLE
+}
+
+template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
+void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    constexpr int nwarps = D/WARP_SIZE;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
+    constexpr bool need_f16_K = D != 128;
+    constexpr bool need_f16_V = D != 128 && D != 64;
+    constexpr size_t nbytes_shared = 0;
+    launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, need_f16_K, need_f16_V);
+}
+
+template <int D, ggml_type type_K, ggml_type type_V>
+void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * K   = dst->src[1];
+    const ggml_tensor * V   = dst->src[2];
+
+    const int32_t precision = KQV->op_params[3];
+    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
+
+    GGML_ASSERT(K->type == type_K);
+    GGML_ASSERT(V->type == type_V);
+
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    if (Q->ne[1] == 1) {
+        constexpr int cols_per_block  = 1;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] == 2) {
+        constexpr int cols_per_block  = 2;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 4) {
+        constexpr int cols_per_block  = 4;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 8) {
+        constexpr int cols_per_block  = 8;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    constexpr int cols_per_block  = 8;
+    constexpr int parallel_blocks = 1;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    }
+}
+
+#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V)                          \
+    template void ggml_cuda_flash_attn_ext_vec_f16_case                     \
+    <D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
+
+extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
+
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0);
+
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1);
+
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0);
+
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1);
+
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0);
+
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16);
+
+extern DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
new file mode 100644
index 000000000..6ef8f9dcc
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -0,0 +1,425 @@
+#include "common.cuh"
+#include "fattn-common.cuh"
+
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_vec_ext_f32(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3) {
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
+
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+
+    constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
+    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
+    constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);
+
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
+    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+
+    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    Q += nb02* blockIdx.y              + nb01*ic0;
+    K += nb12*(blockIdx.y / gqa_ratio);
+    V += nb22*(blockIdx.y / gqa_ratio); // K and V have same shape
+    const half * maskh = (const half   *)  mask + ne11*ic0;
+
+    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+    constexpr int nwarps = D / WARP_SIZE;
+    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+    __builtin_assume(tid < D);
+
+    __shared__ float KQ[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        KQ[j*D + tid] = -FLT_MAX/2.0f;
+    }
+
+    float kqmax[ncols];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqmax[j] = -FLT_MAX/2.0f;
+    }
+    float kqsum[ncols] = {0.0f};
+
+    __shared__ float kqmax_shared[ncols][WARP_SIZE];
+    __shared__ float kqsum_shared[ncols][WARP_SIZE];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (threadIdx.y == 0) {
+            kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
+            kqsum_shared[j][threadIdx.x] = 0.0f;
+        }
+    }
+    __syncthreads();
+
+    // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
+    float2  Q_f2[ncols][D/(2*WARP_SIZE)];
+    int    Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D >= D/(sizeof(int)*QK8_1)];
+    float2  Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
+    if (Q_q8_1) {
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+            if (j0 + nwarps > ncols && j >= ncols) {
+                break;
+            }
+
+            // Reuse KQ as temporary storage for converting Q to q8_1:
+            int    * tmp_q_i32 = (int    *) &KQ[j*D];
+            float2 * tmp_q_ds  = (float2 *) (tmp_q_i32 + D/sizeof(int));
+
+            // Set memory to zero if out of bounds:
+            if (ncols > 2 && ic0 + j >= ne01) {
+#pragma unroll
+                for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    tmp_q_i32[i] = 0;
+                }
+                if (threadIdx.x < D/QK8_1) {
+                    tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
+                }
+                continue;
+            }
+
+            const float * Q_f = (const float *) (Q + j*nb01);
+#pragma unroll
+            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+                quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            int    * tmp_q_i32 = (int    *) &KQ[j*D];
+            float2 * tmp_q_ds  = (float2 *) (tmp_q_i32 + D/sizeof(int));
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
+                Q_ds[j][i0/WARP_SIZE]  = tmp_q_ds[i/QI8_1];
+            }
+        }
+
+        __syncthreads();
+    } else {
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                Q_f2[j][i0/WARP_SIZE]    = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
+                Q_f2[j][i0/WARP_SIZE].x *= scale;
+                Q_f2[j][i0/WARP_SIZE].y *= scale;
+            }
+        }
+    }
+
+    float VKQ[ncols] = {0.0f};
+
+    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
+    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
+        // Calculate KQ tile and keep track of new maximum KQ values:
+
+        float kqmax_new_arr[ncols];
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            kqmax_new_arr[j] = kqmax[j];
+        }
+
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
+            const int i_KQ = i_KQ_0 + threadIdx.y;
+
+            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
+                break;
+            }
+
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
+                sum = warp_reduce_sum(sum);
+
+                if (use_logit_softcap) {
+                    sum = logit_softcap*tanhf(sum);
+                }
+
+                sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
+
+                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
+
+                if (threadIdx.x == 0) {
+                    KQ[j*D + i_KQ] = sum;
+                }
+            }
+        }
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            float kqmax_new_j = kqmax_new_arr[j];
+
+            if (threadIdx.x == 0) {
+                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+
+            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
+            kqmax[j] = kqmax_new_j;
+
+            const float val = expf(KQ[j*D + tid] - kqmax[j]);
+            kqsum[j] = kqsum[j]*KQ_max_scale + val;
+            KQ[j*D + tid] = val;
+
+            VKQ[j] *= KQ_max_scale;
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int k = 0; k < D; ++k) {
+            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
+                break;
+            }
+
+            const float V_ki = dequantize_1_v(V + (k_VKQ_0 + k)*nb21, tid);
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                VKQ[j] += V_ki*KQ[j*D + k];
+            }
+        }
+
+        __syncthreads();
+    }
+
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        if (threadIdx.x == 0) {
+            kqsum_shared[j][threadIdx.y] = kqsum[j];
+        }
+    }
+
+    __syncthreads();
+
+#pragma unroll
+    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
+        if (ncols > 2 && ic0 + j_VKQ >= ne01) {
+            break;
+        }
+
+        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
+        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+
+        float dst_val = VKQ[j_VKQ];
+        if (parallel_blocks == 1) {
+            dst_val /= kqsum[j_VKQ];
+        }
+        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+    }
+
+    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
+        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
+    }
+}
+
+template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
+void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    constexpr int nwarps = D/WARP_SIZE;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
+    constexpr bool need_f16_K = D != 128;
+    constexpr bool need_f16_V = D != 128 && D != 64;
+    constexpr size_t nbytes_shared = 0;
+    launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, need_f16_K, need_f16_V);
+}
+
+template <int D, ggml_type type_K, ggml_type type_V>
+void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * K   = dst->src[1];
+    const ggml_tensor * V   = dst->src[2];
+
+    GGML_ASSERT(K->type == type_K);
+    GGML_ASSERT(V->type == type_V);
+
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    if (Q->ne[1] == 1) {
+        constexpr int cols_per_block  = 1;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] == 2) {
+        constexpr int cols_per_block  = 2;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 4) {
+        constexpr int cols_per_block  = 4;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 8) {
+        constexpr int cols_per_block  = 8;
+        constexpr int parallel_blocks = 4;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
+        return;
+    }
+
+    constexpr int cols_per_block  = 8;
+    constexpr int parallel_blocks = 1;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    }
+}
+
+#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V)                          \
+    template void ggml_cuda_flash_attn_ext_vec_f32_case                     \
+    <D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
+
+extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
+
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0);
+
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1);
+
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0);
+
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1);
+
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0);
+
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
+extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16);
+
+extern DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
new file mode 100644
index 000000000..45702ad65
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -0,0 +1,648 @@
+// Old and deprecated WMMA FlashAttention implementation.
+// It is still needed for Volta since the memory layout of NVIDIA tensor cores changed with Turing.
+// Long-term the WMMA code should be replaced with a dedicated Volta implementation.
+
+#include "common.cuh"
+#include "fattn-common.cuh"
+#include "fattn-wmma-f16.cuh"
+
+#ifdef FP16_MMA_AVAILABLE
+#include <mma.h>
+#endif // FP16_MMA_AVAILABLE
+
+// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
+template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(nwarps*WARP_SIZE, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void flash_attn_ext_f16(
+        const char * __restrict__ Q,
+        const char * __restrict__ K,
+        const char * __restrict__ V,
+        const char * __restrict__ mask,
+        float      * __restrict__ dst,
+        float2     * __restrict__ dst_meta,
+        const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
+        const float logit_softcap,
+        const int ne00,
+        const int ne01,
+        const int ne02,
+        const int ne03,
+        const int ne10,
+        const int ne11,
+        const int ne12,
+        const int ne13,
+        const int ne31,
+        const int nb31,
+        const int nb01,
+        const int nb02,
+        const int nb03,
+        const int nb11,
+        const int nb12,
+        const int nb13,
+        const int nb21,
+        const int nb22,
+        const int nb23,
+        const int ne0,
+        const int ne1,
+        const int ne2,
+        const int ne3) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+
+    const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
+    const int ip  =        blockIdx.x % parallel_blocks;  // Index in group of blocks running for the same column in parallel.
+
+    static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
+    static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
+    constexpr int frag_m = ncols == 8 ? 32 : 16;
+    constexpr int frag_n = ncols == 8 ?  8 : 16;
+    static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0.");
+    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,    frag_m, frag_n, 16, half, nvcuda::wmma::row_major> frag_a_K;
+    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,    frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_a_V;
+    typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b,    frag_m, frag_n, 16, half, nvcuda::wmma::col_major> frag_b;
+    typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, KQ_acc_t>                      frag_c_KQ;
+    typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, frag_m, frag_n, 16, half>                          frag_c_VKQ;
+
+    constexpr int KQ_stride_tc  = nwarps*frag_m; // Number of KQ rows calculated in parallel.
+    constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy.
+    static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps.");
+
+    // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts:
+    constexpr int D_padded = D + 8;
+    constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
+    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
+
+    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    const float * Q_f   = (const float *) (Q + nb02* blockIdx.y              + nb01*ic0);
+    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.y / gqa_ratio));
+    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const half  * maskh = (const half  *)  mask + (nb31/sizeof(half))* ic0;
+    const half2 * mask2 = (const half2 *)  mask + (nb31/sizeof(half))*(ic0/2);
+
+    const int stride_Q  = nb01 / sizeof(float);
+    const int stride_KV = nb11 / sizeof(half);
+
+    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const half  slopeh = __float2half(slopef);
+    const half2 slope2 = make_half2(slopef, slopef);
+
+    const half2 logit_softcap_2 = make_half2(logit_softcap, logit_softcap);
+
+    frag_b Q_b[D/16][ncols/frag_n];
+
+    // A single buffer for temporarily holding tiles of KQ and VKQ parts:
+    constexpr int mem_KQ = ncols*kqs_padded*kqar;
+    constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded;
+    __shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts];
+    float * KQ_f = (float *) KQ;
+    half2 * KQ2 = (half2 *) KQ;
+
+    float    KQ_rowsum_f[ncols/nwarps] = {0.0f};
+    float       KQ_max_f[ncols/nwarps];
+    float KQ_max_scale_f[ncols/nwarps] = {0.0f};
+
+#pragma unroll
+    for (int j = 0; j < ncols/nwarps; ++j) {
+        KQ_max_f[j] = -FLT_MAX/2.0f;
+    }
+
+    half2    KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}};
+    half2       KQ_max_h2[ncols/nwarps];
+    half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}};
+
+#pragma unroll
+    for (int j = 0; j < ncols/nwarps; ++j) {
+        KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF);
+    }
+
+    __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice.
+    half2 * VKQ2 = (half2 *) VKQ;
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+#pragma unroll
+        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+            if (i0 + WARP_SIZE > D/2 && i >= D/2) {
+                break;
+            }
+            VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f);
+        }
+    }
+
+    // Convert Q to half and apply scale, temporarily store in KQ:
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+#pragma unroll
+        for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+            if (i0 + WARP_SIZE > D && i >= D) {
+                break;
+            }
+            KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
+        }
+    }
+
+    __syncthreads();
+
+    // Load Q into tensor core fragments/registers since it will be used frequently:
+#pragma unroll
+    for (int i0 = 0; i0 < D; i0 += 16) {
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
+            nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded);
+        }
+    }
+
+    __syncthreads();
+
+    // Iterate over ne11 == previous tokens:
+    for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) {
+        // Calculate tile of KQ:
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
+            frag_c_KQ KQ_c[ncols/frag_n];
+#pragma unroll
+            for (int j = 0; j < ncols/frag_n; ++j) {
+                nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f);
+            }
+#pragma unroll
+            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
+                frag_a_K K_a;
+                nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
+#pragma unroll
+                for (int j = 0; j < ncols/frag_n; ++j) {
+                    nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
+                }
+            }
+#pragma unroll
+            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
+                nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major);
+            }
+        }
+
+        __syncthreads();
+
+        // Calculate softmax for each KQ column using the current max. value.
+        // The divisor is stored in KQ_rowsum and will be applied at the end.
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+            if (std::is_same<KQ_acc_t, float>::value) {
+                float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE];
+#pragma unroll
+                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
+                    const int k = k0 + threadIdx.x;
+
+                    KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
+
+                    if (use_logit_softcap) {
+                        KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
+                    }
+                }
+
+                float KQ_max_new = KQ_max_f[j0/nwarps];
+#pragma unroll
+                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
+                    const int k = k0 + threadIdx.x;
+
+                    KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
+                    KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
+                }
+                KQ_max_new = warp_reduce_max(KQ_max_new);
+
+                const float diff = KQ_max_f[j0/nwarps] - KQ_max_new;
+                KQ_max_scale_f[j0/nwarps] = expf(diff);
+                if (diff <= SOFTMAX_FTZ_THRESHOLD) {
+                    KQ_max_scale_f[j0/nwarps] = 0.0f;
+                }
+                KQ_max_f[j0/nwarps] = KQ_max_new;
+
+                float KQ_rowsum_add = 0.0f;
+#pragma unroll
+                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
+                    const int k = k0 + threadIdx.x;
+
+                    const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps];
+                    KQ_f_tmp[k0/WARP_SIZE] = expf(diff);
+                    if (diff <= SOFTMAX_FTZ_THRESHOLD) {
+                        KQ_f_tmp[k0/WARP_SIZE] = 0.0f;
+                    }
+                    KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE];
+                    KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE];
+                }
+                KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
+
+                // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
+                KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add;
+            } else {
+                half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)];
+#pragma unroll
+                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
+                    const int k = k0 + threadIdx.x;
+
+                    KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
+
+                    if (use_logit_softcap) {
+                        // There is no dedicated tangens hyperbolicus function for half2.
+                        KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
+                        KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
+                                               /(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
+
+                        KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
+                    }
+                }
+
+                half2 KQ_max_new = KQ_max_h2[j0/nwarps];
+#pragma unroll
+                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
+                    const int k = k0 + threadIdx.x;
+
+                    KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
+                    KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
+                }
+                KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
+                const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
+                KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
+                const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
+                *((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask;
+                KQ_max_h2[j0/nwarps] = KQ_max_new;
+
+                half2 KQ_rowsum_add = make_half2(0.0f, 0.0f);
+#pragma unroll
+                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
+                    const int k = k0 + threadIdx.x;
+
+                    const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps];
+                    KQ2_tmp[k0/WARP_SIZE] = h2exp(diff);
+                    const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
+                    *((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask;
+                    KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE];
+                    KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE];
+                }
+                KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add);
+
+                // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
+                KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add;
+            }
+        }
+
+        __syncthreads();
+
+        frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n];
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += frag_n) {
+#pragma unroll
+            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
+                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
+                nvcuda::wmma::load_matrix_sync(
+                    KQ_b[k0/(VKQ_ratio*16)][j0/frag_n],
+                    KQ + j0*(kqar*kqs_padded) + k,
+                    kqar*kqs_padded);
+            }
+        }
+
+        frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n];
+#pragma unroll
+        for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) {
+#pragma unroll
+            for (int j = 0; j < ncols/frag_n; ++j) {
+                nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f);
+            }
+
+#pragma unroll
+            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) {
+                const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
+
+                frag_a_V v_a;
+                nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
+#pragma unroll
+                for (int j = 0; j < ncols/frag_n; ++j) {
+                    nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
+                }
+            }
+        }
+
+        __syncthreads();
+
+        const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded);
+#pragma unroll
+        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) {
+#pragma unroll
+            for (int j0 = 0; j0 < ncols; j0 += frag_n) {
+                nvcuda::wmma::store_matrix_sync(
+                    KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio),
+                    VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n],
+                    D_padded, nvcuda::wmma::mem_col_major);
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+            half2 VKQ_scale;
+            if (std::is_same<KQ_acc_t, float>::value) {
+                VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]);
+            } else {
+                VKQ_scale = KQ_max_scale_h2[j0/nwarps];
+            }
+
+#pragma unroll
+            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+                if (i0 + WARP_SIZE > D/2 && i >= D/2) {
+                    break;
+                }
+
+                half2 VKQ_add = make_half2(0.0f, 0.0f);
+#pragma unroll
+                for (int l = 0; l < VKQ_ratio; ++l) {
+                    VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i];
+                }
+                VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add;
+            }
+        }
+
+        __syncthreads();
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
+        const int j_VKQ = j0 + threadIdx.y;
+        if (ic0 + j_VKQ >= ne01) {
+            return;
+        }
+        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+
+        float KQ_rowsum_j;
+        if (std::is_same<KQ_acc_t, float>::value) {
+            KQ_rowsum_j = KQ_rowsum_f[j0/nwarps];
+        } else {
+            KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
+        }
+
+#pragma unroll
+        for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+            if (i0 + WARP_SIZE > D && i >= D) {
+                break;
+            }
+            float dst_val = VKQ[j_VKQ*D_padded + i];
+            if (parallel_blocks == 1) {
+                dst_val /= KQ_rowsum_j;
+            }
+            dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val;
+        }
+
+        if (parallel_blocks == 1 || threadIdx.x != 0) {
+            continue;
+        }
+
+        float2 dst_meta_val;
+        if (std::is_same<KQ_acc_t, float>::value) {
+            dst_meta_val.x = KQ_max_f[j0/nwarps];
+        } else {
+            dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
+        }
+        dst_meta_val.y = KQ_rowsum_j;
+        dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val;
+    }
+#else
+   NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+}
+
+constexpr int get_max_power_of_2(int x) {
+    return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
+}
+
+static_assert(get_max_power_of_2(1) == 1, "Test failed.");
+static_assert(get_max_power_of_2(2) == 2, "Test failed.");
+static_assert(get_max_power_of_2(4) == 4, "Test failed.");
+static_assert(get_max_power_of_2(6) == 2, "Test failed.");
+
+// Number of VKQ rows calculated in parallel:
+constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) {
+    return (get_max_power_of_2(D/frag_m) < nwarps ? get_max_power_of_2(D/frag_m) : nwarps)*frag_m;
+}
+
+static_assert(get_VKQ_stride(128, 1, 32) ==  32, "Test failed.");
+static_assert(get_VKQ_stride(128, 2, 32) ==  64, "Test failed.");
+static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed.");
+static_assert(get_VKQ_stride( 64, 1, 32) ==  32, "Test failed.");
+static_assert(get_VKQ_stride( 64, 2, 32) ==  64, "Test failed.");
+static_assert(get_VKQ_stride( 64, 4, 32) ==  64, "Test failed.");
+static_assert(get_VKQ_stride( 80, 1, 16) ==  16, "Test failed.");
+static_assert(get_VKQ_stride( 80, 2, 16) ==  16, "Test failed.");
+static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
+
+template <int D, int cols_per_block, typename KQ_acc_t>
+void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+
+    constexpr int nwarps = 4;
+
+    constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
+    const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
+    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
+    if (4*blocks_num_pb1 < 2*nsm) {
+        constexpr int parallel_blocks = 4;
+        fattn_kernel_t fattn_kernel;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        } else {
+            constexpr bool use_logit_softcap = true;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        }
+        launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+        return;
+    }
+    if (2*blocks_num_pb1 < 2*nsm) {
+        constexpr int parallel_blocks = 2;
+        fattn_kernel_t fattn_kernel;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        } else {
+            constexpr bool use_logit_softcap = true;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        }
+        launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+        return;
+    }
+    constexpr int parallel_blocks = 1;
+    fattn_kernel_t fattn_kernel;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        fattn_kernel = flash_attn_ext_f16<
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+    } else {
+        constexpr bool use_logit_softcap = true;
+        fattn_kernel = flash_attn_ext_f16<
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+    }
+    launch_fattn<D, cols_per_block, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true);
+}
+
+void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+
+    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
+
+    if (prec != GGML_PREC_DEFAULT) {
+        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
+            constexpr int cols_per_block = 16;
+            switch (Q->ne[0]) {
+                case 64:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst);
+                    break;
+                case 80:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst);
+                    break;
+                case 96:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst);
+                    break;
+                case 112:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst);
+                    break;
+                case 128:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
+                    break;
+                case 256:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
+                    break;
+                default:
+                    GGML_ABORT("fatal error");
+                    break;
+            }
+        } else {
+            constexpr int cols_per_block = 32;
+            switch (Q->ne[0]) {
+                case 64:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst);
+                    break;
+                case 80:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst);
+                    break;
+                case 96:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst);
+                    break;
+                case 112:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst);
+                    break;
+                case 128:
+                    ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
+                    break;
+                // case 256:
+                //     ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
+                //     break;
+                default:
+                    GGML_ABORT("fatal error");
+                    break;
+            }
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) {
+        constexpr int cols_per_block = 8;
+        switch (Q->ne[0]) {
+            case 64:
+                ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
+                break;
+            case 96:
+                ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
+                break;
+            case 128:
+                ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
+                break;
+            case 256:
+                ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
+                break;
+            default:
+                GGML_ABORT("fatal error");
+                break;
+        }
+        return;
+    }
+
+    if (Q->ne[1] <= 32) {
+        constexpr int cols_per_block = 16;
+        switch (Q->ne[0]) {
+            case 64:
+                ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
+                break;
+            case 80:
+                ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst);
+                break;
+            case 96:
+                ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
+                break;
+            case 112:
+                ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst);
+                break;
+            case 128:
+                ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
+                break;
+            case 256:
+                ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
+                break;
+            default:
+                GGML_ABORT("fatal error");
+                break;
+        }
+        return;
+    }
+
+    constexpr int cols_per_block = 32;
+    switch (Q->ne[0]) {
+        case 64:
+            ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
+            break;
+        case 80:
+            ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst);
+            break;
+        case 96:
+            ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
+            break;
+        case 112:
+            ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst);
+            break;
+        case 128:
+            ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
+            break;
+        case 256:
+            ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
new file mode 100644
index 000000000..beeea95eb
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
new file mode 100644
index 000000000..b0cf152f5
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -0,0 +1,272 @@
+#include "common.cuh"
+#include "fattn-common.cuh"
+#include "fattn-mma-f16.cuh"
+#include "fattn-tile-f16.cuh"
+#include "fattn-tile-f32.cuh"
+#include "fattn-vec-f16.cuh"
+#include "fattn-vec-f32.cuh"
+#include "fattn-wmma-f16.cuh"
+#include "fattn.cuh"
+
+template <int cols_per_block>
+static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * Q = dst->src[0];
+
+    switch (Q->ne[0]) {
+        case 64:
+            ggml_cuda_flash_attn_ext_mma_f16_case< 64, cols_per_block>(ctx, dst);
+            break;
+        case 80:
+            ggml_cuda_flash_attn_ext_mma_f16_case< 80, cols_per_block>(ctx, dst);
+            break;
+        case 96:
+            ggml_cuda_flash_attn_ext_mma_f16_case< 96, cols_per_block>(ctx, dst);
+            break;
+        case 112:
+            ggml_cuda_flash_attn_ext_mma_f16_case<112, cols_per_block>(ctx, dst);
+            break;
+        case 128:
+            ggml_cuda_flash_attn_ext_mma_f16_case<128, cols_per_block>(ctx, dst);
+            break;
+        case 256:
+            ggml_cuda_flash_attn_ext_mma_f16_case<256, cols_per_block>(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * Q = dst->src[0];
+
+    if (Q->ne[1] <= 8) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst);
+        return;
+    }
+
+    if (Q->ne[1] <= 16) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_hs<16>(ctx, dst);
+        return;
+    }
+
+    if (Q->ne[1] <= 32) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_hs<32>(ctx, dst);
+        return;
+    }
+
+    ggml_cuda_flash_attn_ext_mma_f16_switch_hs<64>(ctx, dst);
+}
+
+#define FATTN_VEC_F16_CASE(D, type_K, type_V)                               \
+    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
+        ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
+        return;                                                             \
+    }                                                                       \
+
+static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * Q = dst->src[0];
+    ggml_tensor * K = dst->src[1];
+    ggml_tensor * V = dst->src[2];
+
+#ifdef GGML_CUDA_FA_ALL_QUANTS
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16 )
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0)
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1)
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0)
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1)
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0)
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16)
+
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+#else
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
+
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+
+    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+#endif // GGML_CUDA_FA_ALL_QUANTS
+
+    on_no_fattn_vec_case(Q->ne[0]);
+}
+
+#define FATTN_VEC_F32_CASE(D, type_K, type_V)                               \
+    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
+        ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>(ctx, dst); \
+        return;                                                             \
+    }                                                                       \
+
+static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * Q = dst->src[0];
+    ggml_tensor * K = dst->src[1];
+    ggml_tensor * V = dst->src[2];
+
+#ifdef GGML_CUDA_FA_ALL_QUANTS
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16)
+
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+#else
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
+
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+
+    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+#endif // GGML_CUDA_FA_ALL_QUANTS
+
+    on_no_fattn_vec_case(Q->ne[0]);
+}
+
+void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+
+    ggml_cuda_set_device(ctx.device);
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
+
+    // On AMD the tile kernels perform poorly, use the vec kernel instead:
+    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
+        if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
+            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
+        } else {
+            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
+        }
+        return;
+    }
+
+    if (!fast_fp16_available(cc)) {
+        if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
+            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
+        } else {
+            ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
+        }
+        return;
+    }
+
+    if (!fp16_mma_available(cc)) {
+        if (prec == GGML_PREC_DEFAULT) {
+            if (Q->ne[1] <= 8) {
+                ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
+            } else {
+                ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
+            }
+        } else {
+            if (Q->ne[1] <= 8) {
+                ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
+            } else {
+                ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
+            }
+        }
+        return;
+    }
+
+    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
+        if (prec == GGML_PREC_DEFAULT) {
+            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
+            return;
+        } else if(Q->ne[0] <= 128) {
+            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
+            return;
+        }
+    }
+
+    // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
+    if (cc == GGML_CUDA_CC_VOLTA) {
+        ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
+        return;
+    }
+
+    ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
+}
diff --git a/ggml/src/ggml-cuda/fattn.cuh b/ggml/src/ggml-cuda/fattn.cuh
new file mode 100644
index 000000000..ad3ca7a8d
--- /dev/null
+++ b/ggml/src/ggml-cuda/fattn.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
new file mode 100644
index 000000000..4cef53a98
--- /dev/null
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -0,0 +1,234 @@
+#include "getrows.cuh"
+#include "dequantize.cuh"
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(
+        const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
+        const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
+        /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
+        /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
+        /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 =  blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib   =  i00/qk;      // block index
+    const int iqs  = (i00%qk)/qr;  // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0]        = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+        const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
+        const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
+        /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
+        /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
+        /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
+
+    const int i00 =  blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 =  blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template<typename grad_t, typename dst_t>
+static __global__ void k_get_rows_back_float(
+        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
+    const int col = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    float sum = 0.0f;
+
+    for (int64_t i = 0; i < nrows_grad; ++i) {
+        if (rows[i] != dst_row) {
+            continue;
+        }
+        sum += grad[i*ncols + col];
+    }
+
+    dst[dst_row*ncols + col] = sum;
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(
+        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+        src0_dd, src1_dd, dst_dd,
+        ne00, /*ne01, ne02, ne03,*/
+        /*ne10, ne11,*/ ne12, /*ne13,*/
+        /* s0,*/ s1, s2, s3,
+        /* nb00,*/ nb01, nb02, nb03,
+        s10, s11, s12/*, s13*/);
+
+    GGML_UNUSED(dst);
+}
+
+template<typename src0_t>
+static void get_rows_cuda_float(
+        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(ne13 == 1);
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+        src0_dd, src1_dd, dst_dd,
+        ne00, /*ne01, ne02, ne03,*/
+        /*ne10, ne11,*/ ne12, /*ne13,*/
+        /* s0,*/ s1, s2, s3,
+        /* nb00,*/ nb01, nb02, nb03,
+        s10, s11, s12/*, s13*/);
+
+    GGML_UNUSED(dst);
+}
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    const void    * src0_d = (const void    *) src0->data;
+    const int32_t * src1_d = (const int32_t *) src1->data;
+    float         * dst_d  = (float         *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0]  == ggml_type_size(dst->type));
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            break;
+    }
+}
+
+void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
+    const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const float   * src0_d = (const float   *) src0->data;
+    const int32_t * src1_d = (const int32_t *) src1->data;
+    float         * dst_d  = (float         *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_ASSERT(ne02*ne03 == 1);
+    GGML_ASSERT(ne12*ne13 == 1);
+    GGML_ASSERT(ne2*ne3 == 1);
+
+    const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne1, 1);
+
+    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
+}
diff --git a/ggml/src/ggml-cuda/getrows.cuh b/ggml/src/ggml-cuda/getrows.cuh
new file mode 100644
index 000000000..a1ca643f1
--- /dev/null
+++ b/ggml/src/ggml-cuda/getrows.cuh
@@ -0,0 +1,8 @@
+#include "common.cuh"
+
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
new file mode 100644
index 000000000..4dbaefdba
--- /dev/null
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -0,0 +1,3460 @@
+#include "ggml-cuda.h"
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-cuda/common.cuh"
+#include "ggml-cuda/acc.cuh"
+#include "ggml-cuda/arange.cuh"
+#include "ggml-cuda/argmax.cuh"
+#include "ggml-cuda/argsort.cuh"
+#include "ggml-cuda/binbcast.cuh"
+#include "ggml-cuda/clamp.cuh"
+#include "ggml-cuda/concat.cuh"
+#include "ggml-cuda/conv-transpose-1d.cuh"
+#include "ggml-cuda/convert.cuh"
+#include "ggml-cuda/count-equal.cuh"
+#include "ggml-cuda/cpy.cuh"
+#include "ggml-cuda/cross-entropy-loss.cuh"
+#include "ggml-cuda/diagmask.cuh"
+#include "ggml-cuda/fattn.cuh"
+#include "ggml-cuda/getrows.cuh"
+#include "ggml-cuda/im2col.cuh"
+#include "ggml-cuda/mmq.cuh"
+#include "ggml-cuda/mmv.cuh"
+#include "ggml-cuda/mmvq.cuh"
+#include "ggml-cuda/norm.cuh"
+#include "ggml-cuda/opt-step-adamw.cuh"
+#include "ggml-cuda/out-prod.cuh"
+#include "ggml-cuda/pad.cuh"
+#include "ggml-cuda/pool2d.cuh"
+#include "ggml-cuda/quantize.cuh"
+#include "ggml-cuda/rope.cuh"
+#include "ggml-cuda/scale.cuh"
+#include "ggml-cuda/softmax.cuh"
+#include "ggml-cuda/sum.cuh"
+#include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/tsembd.cuh"
+#include "ggml-cuda/unary.cuh"
+#include "ggml-cuda/upscale.cuh"
+#include "ggml-cuda/wkv6.cuh"
+#include "ggml-cuda/gla.cuh"
+#include "ggml.h"
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <charconv>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <float.h>
+#include <limits>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+[[noreturn]]
+void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
+    int id = -1; // in case cudaGetDevice fails
+    (void)cudaGetDevice(&id);
+
+    GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
+    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_LOG_ERROR("  %s\n", stmt);
+    // abort with GGML_ABORT to get a stack trace
+    GGML_ABORT(GGML_CUDA_NAME " error");
+}
+
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+void ggml_cuda_set_device(int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return;
+    }
+
+    CUDA_CHECK(cudaSetDevice(device));
+}
+
+int ggml_cuda_get_device() {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    return id;
+}
+
+static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+    ggml_cuda_set_device(device);
+#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
+    auto res = hipMallocManaged(ptr, size);
+    if (res == hipSuccess) {
+        // if error we "need" to know why...
+        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+    }
+    return res;
+#else
+
+#if !defined(GGML_USE_HIP)
+    cudaError_t err;
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
+    {
+        err = cudaMallocManaged(ptr, size);
+    }
+    else
+    {
+        err = cudaMalloc(ptr, size);
+    }
+    return err;
+#else
+    return cudaMalloc(ptr, size);
+#endif // !defined(GGML_USE_HIP)
+
+#endif
+}
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+static int ggml_cuda_parse_id(char devName[]) {
+    // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
+    // these values are not stable so this is susceptible to breakage
+    // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
+    int archMajor = 0x0;
+    int archMinor = 0x0;
+    int archNum = GGML_CUDA_CC_OFFSET_AMD;
+    int archLen = strlen(devName);
+    char archName[archLen + 1];
+
+    // strip leading 'gfx' while copying into our buffer
+    if (archLen > 3) {
+        strcpy(archName, &devName[3]);
+        archLen -= 3;
+    }
+
+    // trim trailing :xnack- or :sramecc- statuses
+    archLen = strcspn(archName, ":");
+    archName[archLen] = '\0';
+
+    // tease out the version information
+    if (archLen > 8) {
+        // versions labeled generic use '-' as delimiter
+        // strip the trailing "-generic" then iterate through what remains
+        if ((strstr(archName, "-generic"))) {
+            archName[archLen - 8] = '\0';
+            char * pch;
+            if ((pch = strtok(archName, "-"))) {
+                archMajor = (int)strtoul(pch, 0, 16);
+                if ((pch = strtok(NULL, "-"))) {
+                    archMinor = 0x10 * (int)strtoul(pch, 0, 16);
+                }
+            }
+        }
+    } else if (archLen >= 3) {
+        // last two digits should be the minor * 0x10 + stepping
+        archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
+        archName[archLen - 2] = '\0';
+
+        // only the major version remains
+        archMajor = (int)strtoul(archName, 0, 16);
+    }
+    archNum += archMajor * 0x100;
+    archNum += archMinor;
+    return archNum;
+}
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+
+static ggml_cuda_device_info ggml_cuda_init() {
+#ifdef __HIP_PLATFORM_AMD__
+    // Workaround for a rocBLAS bug when using multiple graphics cards:
+    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+    {
+        int major_version = 0;
+        size_t version_length = 0;
+        if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
+            std::string version(version_length, '\0');
+            if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
+                version.resize(::strlen(version.c_str()));
+                int parsed_value = 0;
+                if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
+                    major_version = parsed_value;
+                }
+            }
+        }
+        if (major_version < 4) {
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
+            rocblas_initialize();
+            CUDA_CHECK(cudaDeviceSynchronize());
+        }
+    }
+#endif
+
+    ggml_cuda_device_info info = {};
+
+    cudaError_t err = cudaGetDeviceCount(&info.device_count);
+    if (err != cudaSuccess) {
+        GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        return info;
+    }
+
+    GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
+
+    int64_t total_vram = 0;
+#ifdef GGML_CUDA_FORCE_MMQ
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    no\n", __func__);
+#endif // GGML_CUDA_FORCE_MMQ
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+#endif // GGML_CUDA_FORCE_CUBLAS
+    GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    for (int id = 0; id < info.device_count; ++id) {
+        int device_vmm = 0;
+
+#if defined(GGML_USE_VMM)
+        CUdevice device;
+        CU_CHECK(cuDeviceGet(&device, id));
+        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+
+        if (device_vmm) {
+            CUmemAllocationProp alloc_prop = {};
+            alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            alloc_prop.location.id = id;
+            CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+        }
+#endif // defined(GGML_USE_VMM)
+        info.devices[id].vmm = !!device_vmm;
+
+        cudaDeviceProp prop;
+        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
+
+        info.default_tensor_split[id] = total_vram;
+        total_vram += prop.totalGlobalMem;
+
+        info.devices[id].nsm       = prop.multiProcessorCount;
+        info.devices[id].smpb      = prop.sharedMemPerBlock;
+        info.devices[id].warp_size = prop.warpSize;
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+        info.devices[id].smpbo = prop.sharedMemPerBlock;
+
+        info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
+        if ((info.devices[id].cc & 0xff00) == 0x0) {
+            GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s  cc %d.%d\n",
+                            id, prop.name, prop.gcnArchName, prop.major, prop.minor);
+
+            // Fallback to prop.major and prop.minor
+            if (prop.major > 0) {
+                info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
+                info.devices[id].cc += prop.minor * 0x10;
+            }
+        }
+        GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
+                      id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
+                      device_vmm ? "yes" : "no", prop.warpSize);
+#else
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
+        info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
+                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    }
+
+    for (int id = 0; id < info.device_count; ++id) {
+        info.default_tensor_split[id] /= total_vram;
+    }
+
+    // configure logging to stdout
+    // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+    return info;
+}
+
+const ggml_cuda_device_info & ggml_cuda_info() {
+    static ggml_cuda_device_info info = ggml_cuda_init();
+    return info;
+}
+
+// #define DEBUG_CUDA_MALLOC
+
+// buffer pool for cuda (legacy)
+struct ggml_cuda_pool_leg : public ggml_cuda_pool {
+    static const int MAX_BUFFERS = 256;
+
+    int device;
+    struct ggml_cuda_buffer {
+        void * ptr = nullptr;
+        size_t size = 0;
+    };
+
+    ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
+    size_t pool_size = 0;
+
+    explicit ggml_cuda_pool_leg(int device) :
+        device(device) {
+    }
+
+    ~ggml_cuda_pool_leg() {
+        ggml_cuda_set_device(device);
+        for (int i = 0; i < MAX_BUFFERS; ++i) {
+            ggml_cuda_buffer & b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+                CUDA_CHECK(cudaFree(b.ptr));
+                pool_size -= b.size;
+            }
+        }
+        GGML_ASSERT(pool_size == 0);
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+#ifdef DEBUG_CUDA_MALLOC
+        int nnz = 0;
+        size_t max_size = 0;
+#endif
+        size_t best_diff = 1ull << 36;
+        int ibest = -1;
+        for (int i = 0; i < MAX_BUFFERS; ++i) {
+            ggml_cuda_buffer& b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+                ++nnz;
+                if (b.size > max_size) max_size = b.size;
+#endif
+                if (b.size >= size) {
+                    size_t diff = b.size - size;
+                    if (diff < best_diff) {
+                        best_diff = diff;
+                        ibest = i;
+                        if (!best_diff) {
+                            void * ptr = b.ptr;
+                            *actual_size = b.size;
+                            b.ptr = nullptr;
+                            b.size = 0;
+                            return ptr;
+                        }
+                    }
+                }
+            }
+        }
+        if (ibest >= 0) {
+            ggml_cuda_buffer& b = buffer_pool[ibest];
+            void * ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
+        void * ptr;
+        size_t look_ahead_size = (size_t) (1.05 * size);
+        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+        ggml_cuda_set_device(device);
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
+        *actual_size = look_ahead_size;
+        pool_size += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+        GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
+#endif
+        return ptr;
+    }
+
+    void free(void * ptr, size_t size) override {
+        for (int i = 0; i < MAX_BUFFERS; ++i) {
+            ggml_cuda_buffer& b = buffer_pool[i];
+            if (b.ptr == nullptr) {
+                b.ptr = ptr;
+                b.size = size;
+                return;
+            }
+        }
+        GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        ggml_cuda_set_device(device);
+        CUDA_CHECK(cudaFree(ptr));
+        pool_size -= size;
+    }
+};
+
+// pool with virtual memory
+#if defined(GGML_USE_VMM)
+struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
+    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
+
+    int device;
+    CUdeviceptr pool_addr = 0;
+    size_t pool_used = 0;
+    size_t pool_size = 0;
+    size_t granularity;
+#if defined(GGML_USE_HIP)
+    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+#endif
+
+    explicit ggml_cuda_pool_vmm(int device) :
+        device(device),
+        granularity(ggml_cuda_info().devices[device].vmm_granularity) {
+    }
+
+    ~ggml_cuda_pool_vmm() {
+        if (pool_addr != 0) {
+#if defined(GGML_USE_HIP)
+            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
+            }
+#else
+            CU_CHECK(cuMemUnmap(pool_addr, pool_size));
+#endif
+            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
+        }
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+        // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+        const size_t alignment = 128;
+        size = alignment * ((size + alignment - 1) / alignment);
+
+        size_t avail = pool_size - pool_used;
+
+        if (size > avail) {
+            // round up to the next multiple of the granularity
+            size_t reserve_size = size - avail;
+            reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+            GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+
+            // allocate more physical memory
+            CUmemAllocationProp prop = {};
+            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.location.id = device;
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+            // reserve virtual address space (if not already reserved)
+            if (pool_addr == 0) {
+                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+            }
+
+            // map at the end of the pool
+            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+#if defined(GGML_USE_HIP)
+            mappings.push_back({start_ptr, reserve_size});
+#endif
+
+            // the memory allocation handle is no longer needed after mapping
+            CU_CHECK(cuMemRelease(handle));
+
+            // set access
+            CUmemAccessDesc access = {};
+            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            access.location.id = device;
+            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
+
+            // add to the pool
+            pool_size += reserve_size;
+
+            //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+            //       device, (unsigned long long) (pool_size/1024/1024),
+            //       (unsigned long long) (reserve_size/1024/1024));
+        }
+
+        GGML_ASSERT(pool_addr != 0);
+
+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
+        *actual_size = size;
+        pool_used += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+        printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
+#endif
+
+        return ptr;
+    }
+
+    void free(void * ptr, size_t size) override {
+#ifdef DEBUG_CUDA_MALLOC
+        printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
+#endif
+
+        pool_used -= size;
+
+        // all deallocations must be in reverse order of the allocations
+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
+    }
+};
+#endif // defined(GGML_USE_VMM)
+
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
+#if defined(GGML_USE_VMM)
+    if (ggml_cuda_info().devices[device].vmm) {
+        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
+    }
+#endif // defined(GGML_USE_VMM)
+    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
+}
+
+// cuda buffer
+
+struct ggml_backend_cuda_buffer_context {
+    int device;
+    void * dev_ptr = nullptr;
+    std::string name;
+
+    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
+        device(device), dev_ptr(dev_ptr),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_cuda_buffer_context() {
+        CUDA_CHECK(cudaFree(dev_ptr));
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+    return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+    return ctx->dev_ptr;
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    if (tensor->view_src != NULL) {
+        assert(tensor->view_src->buffer->buft == buffer->buft);
+        return;
+    }
+
+    if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        // initialize padding to 0 to avoid possible NaN values
+        size_t original_size = ggml_nbytes(tensor);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size) {
+            ggml_cuda_set_device(ctx->device);
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
+        }
+    }
+}
+
+static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+}
+
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+}
+
+static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_cuda(src->buffer)) {
+        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
+        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
+        if (src_ctx->device == dst_ctx->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
+        } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+            return false;
+#else
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+#endif
+        }
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .memset_tensor   = */ ggml_backend_cuda_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cuda_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// cuda buffer type
+struct ggml_backend_cuda_buffer_type_context {
+    int device;
+    std::string name;
+};
+
+static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+
+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    ggml_cuda_set_device(buft_ctx->device);
+
+    void * dev_ptr;
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
+    if (err != cudaSuccess) {
+        // clear the error
+        (void)cudaGetLastError();
+        GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
+        return nullptr;
+    }
+
+    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    size_t size = ggml_nbytes(tensor);
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    GGML_UNUSED(buft);
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_cuda_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (device >= ggml_backend_cuda_get_device_count()) {
+        return nullptr;
+    }
+
+    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
+        for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
+                /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
+                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
+            };
+        }
+        ggml_backend_cuda_buffer_type_initialized = true;
+    }
+
+    return &ggml_backend_cuda_buffer_types[device];
+}
+
+// cuda split buffer
+
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+    int64_t row_rounding = 0;
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+            continue;
+        }
+
+        const int cc = ggml_cuda_info().devices[id].cc;
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
+    }
+    return row_rounding;
+}
+
+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
+    const int64_t nrows = ggml_nrows(tensor);
+    const int64_t rounding = get_row_rounding(tensor_split);
+
+    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
+    *row_low -= *row_low % rounding;
+
+    if (id == ggml_backend_cuda_get_device_count() - 1) {
+        *row_high = nrows;
+    } else {
+        *row_high = nrows*tensor_split[id + 1];
+        *row_high -= *row_high % rounding;
+    }
+}
+
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
+struct ggml_backend_cuda_split_buffer_type_context {
+    int main_device;
+    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+    std::string name;
+};
+
+struct ggml_backend_cuda_split_buffer_context {
+    ~ggml_backend_cuda_split_buffer_context() {
+        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+            for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
+                for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
+                    if (extra->events[id][is] != nullptr) {
+                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+                    }
+                }
+                if (extra->data_device[id] != nullptr) {
+                    CUDA_CHECK(cudaFree(extra->data_device[id]));
+                }
+            }
+            delete extra;
+        }
+    }
+
+    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
+};
+
+
+static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+
+    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+    ctx->tensor_extras.push_back(extra);
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        // FIXME: do not crash if cudaMalloc fails
+        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
+        ggml_cuda_set_device(id);
+        char * buf;
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+        }
+
+        extra->data_device[id] = buf;
+
+        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+        }
+    }
+    tensor->extra = extra;
+}
+
+static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        const char * buf_host = (const char *)data + offset_split;
+        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+    }
+}
+
+static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf_host = (char *)data + offset_split;
+        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+    }
+}
+
+static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+}
+
+static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// cuda split buffer type
+
+static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // instead, we allocate them for each tensor separately in init_tensor
+    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        total_size += ggml_nbytes_split(tensor, nrows_split);
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
+    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
+
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
+    if (all_zero) {
+        tensor_split_arr = ggml_cuda_info().default_tensor_split;
+    } else {
+        float split_sum = 0.0f;
+        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+            tensor_split_arr[i] = split_sum;
+            split_sum += tensor_split[i];
+        }
+        for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+            tensor_split_arr[i] /= split_sum;
+        }
+    }
+
+    auto it = buft_map.find({main_device, tensor_split_arr});
+    if (it != buft_map.end()) {
+        return &it->second;
+    }
+    auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
+        main_device,
+        tensor_split_arr,
+        GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
+    };
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
+        /* .context = */ ctx,
+    };
+
+    auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
+    return &result.first->second;
+}
+
+// host buffer type
+
+static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_CUDA_NAME "_Host";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    CUDA_CHECK(cudaFreeHost(buffer->context));
+}
+
+static void * ggml_cuda_host_malloc(size_t size) {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        // clear the error
+        (void)cudaGetLastError();
+        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
+    return ptr;
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_cuda_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cuda_buffer_type_host;
+}
+
+//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
+//    return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+//}
+
+/// kernels
+
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
+
+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+static cudaError_t ggml_cuda_cpy_tensor_2d(
+    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
+
+    GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
+    const char * src_ptr = (const char *) src->data;
+    char       * dst_ptr = (char       *) dst;
+
+    const int64_t ne0 = src->ne[0];
+    const int64_t nb0 = src->nb[0];
+    const int64_t nb1 = src->nb[1];
+    const int64_t nb2 = src->nb[2];
+    const int64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const int64_t ts = ggml_type_size(type);
+    const int64_t bs = ggml_blck_size(type);
+    const int64_t i1_diff = i1_high - i1_low;
+
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
+    } else if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
+            if (r != cudaSuccess) {
+                return r;
+            }
+        }
+        return cudaSuccess;
+    }
+}
+
+static void ggml_cuda_op_mul_mat_cublas(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    GGML_ASSERT(src0_dd_i  != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_dd_i   != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id = ggml_cuda_get_device();
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // ldc == nrows of the matrix that cuBLAS writes into
+    int64_t ldc = id == ctx.device ? ne0 : row_diff;
+
+    const int compute_capability = ggml_cuda_info().devices[id].cc;
+
+    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
+
+    if (compute_capability >= GGML_CUDA_CC_VOLTA && use_fp16) {
+        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16.alloc(ne);
+            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
+        }
+        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
+
+        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16.alloc(ne);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
+        }
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
+
+        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+
+        if (GGML_CUDA_CC_IS_CDNA(compute_capability)) {
+            const float alpha = 1.0f;
+            const float beta = 0.0f;
+            CUBLAS_CHECK(
+                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha, src0_ptr,  CUDA_R_16F, ne00,
+                                src1_ptr,  CUDA_R_16F, ne10,
+                        &beta,   dst_dd_i, CUDA_R_32F, ldc,
+                        CUBLAS_COMPUTE_32F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+        } else {
+            ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
+
+            const half alpha_f16 = 1.0f;
+            const half beta_f16 = 0.0f;
+
+            CUBLAS_CHECK(
+                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f16, src0_ptr,      CUDA_R_16F, ne00,
+                                    src1_ptr,      CUDA_R_16F, ne10,
+                        &beta_f16,  dst_f16.get(), CUDA_R_16F, ldc,
+                        CUBLAS_COMPUTE_16F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+            to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        }
+    } else {
+        ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
+        ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32.alloc(row_diff*ne00);
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
+        }
+        if (src1->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src1_ddq_as_f32.alloc(src1_ncols*ne10);
+            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+        }
+
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+        CUBLAS_CHECK(
+            cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha, src0_ddf_i,  ne00,
+                            src1_ddf1_i, ne10,
+                    &beta,  dst_dd_i,    ldc));
+    }
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
+}
+
+static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        ggml_cuda_set_device(id);
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        ggml_cuda_set_device(id);
+
+        for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != main_device && id_other != main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
+                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
+                        CUDA_CHECK(err);
+                    } else {
+                        // reset the error
+                        (void)cudaGetLastError();
+                    }
+                } else {
+                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
+                    if (err != cudaErrorPeerAccessNotEnabled) {
+                        CUDA_CHECK(err);
+                    } else {
+                        // reset the error
+                        (void)cudaGetLastError();
+                    }
+                }
+            }
+        }
+    }
+
+    ggml_cuda_set_device(main_device);
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+
+    GGML_UNUSED(main_device);
+}
+
+static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+    cudaMemcpy3DPeerParms p = {};
+    p.dstDevice = dstDevice;
+    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+    p.srcDevice = srcDevice;
+    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+    p.extent = make_cudaExtent(width, height, 1);
+    return cudaMemcpy3DPeerAsync(&p, stream);
+#else
+    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+    GGML_UNUSED(dstDevice);
+    GGML_UNUSED(srcDevice);
+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+}
+
+static void ggml_cuda_op_mul_mat(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
+    quantize_cuda_t quantize_src1) {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nrows1 = ggml_nrows(src1);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int64_t nb2 = dst->nb[2];
+    const int64_t nb3 = dst->nb[3];
+
+    GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
+    GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
+    ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
+    ggml_backend_cuda_buffer_context * dst_ctx  = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    const int64_t i02_divisor = ne12 / ne02;
+    const int64_t i03_divisor = ne13 / ne03;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
+
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src1_is_contiguous = ggml_is_contiguous(src1);
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
+
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
+    GGML_ASSERT(!(split && ne02 > 1));
+    GGML_ASSERT(!(split && ne03 > 1));
+    GGML_ASSERT(!(split && ne02 < ne12));
+    GGML_ASSERT(!(split && ne03 < ne13));
+
+    ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
+
+
+    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+    if (split) {
+        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+        tensor_split = buft_ctx->tensor_split;
+    }
+
+    struct dev_data {
+        int cc;
+
+        ggml_cuda_pool_alloc<char>   src0_dd_alloc;
+        ggml_cuda_pool_alloc<float> src1_ddf_alloc;
+        ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
+        ggml_cuda_pool_alloc<float>   dst_dd_alloc;
+
+        char  *  src0_dd = nullptr;
+        float * src1_ddf = nullptr; // float
+        char  * src1_ddq = nullptr; // q8_1
+        float *   dst_dd = nullptr;
+
+        int64_t  row_low;
+        int64_t row_high;
+    };
+
+    dev_data dev[GGML_CUDA_MAX_DEVICES];
+
+    int used_devices = 0;
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        dev[id].cc = ggml_cuda_info().devices[id].cc;
+
+        // by default, use all rows
+        dev[id].row_low  = 0;
+        dev[id].row_high = ne01;
+
+        // for multi GPU, get the row boundaries from tensor split
+        // and round to mul_mat_q tile sizes
+        if (split) {
+            const int64_t rounding = get_row_rounding(tensor_split);
+
+            if (id != 0) {
+                dev[id].row_low  = ne01*tensor_split[id];
+                if (dev[id].row_low < ne01) {
+                    dev[id].row_low -= dev[id].row_low % rounding;
+                }
+            }
+
+            if (id != ggml_backend_cuda_get_device_count() - 1) {
+                dev[id].row_high  = ne01*tensor_split[id + 1];
+                if (dev[id].row_high < ne01) {
+                    dev[id].row_high -= dev[id].row_high % rounding;
+                }
+            }
+        }
+    }
+
+    for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
+            continue;
+        }
+
+        used_devices++;
+
+        const bool src1_on_device = id == src1_ctx->device;
+        const bool  dst_on_device = id == dst_ctx->device;
+
+        ggml_cuda_set_device(id);
+        cudaStream_t stream = ctx.stream(id, 0);
+
+        if (src0_is_contiguous) {
+            dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
+        } else {
+            // If src0 is not contiguous it will be copied to a temporary buffer.
+            // This buffer needs to be cleared entirely because multiple regions will function as padding.
+            const size_t nbytes_data    = ggml_nbytes(src0);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+        // TODO: remove this for MUSA once the Guilty Lockup issue is resolved
+#ifndef GGML_USE_MUSA
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
+#else // GGML_USE_MUSA
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+#endif // !GGML_USE_MUSA
+        }
+
+        // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
+        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+            const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
+            const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+        }
+
+        if (src1_on_device && src1_is_contiguous) {
+            dev[id].src1_ddf = (float *) src1->data;
+        } else {
+            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
+        }
+
+        if (quantize_src1) {
+            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+            }
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
+
+            if (src1_on_device && src1_is_contiguous) {
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+                CUDA_CHECK(cudaGetLastError());
+            }
+        }
+
+        if (dst_on_device) {
+            dev[id].dst_dd = (float *) dst->data;
+        } else {
+            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
+            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
+        }
+    }
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signals that the main device has finished calculating the input data
+    if (split && used_devices > 1) {
+        ggml_cuda_set_device(ctx.device);
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
+    }
+
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
+        const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
+        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
+
+        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+            if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
+                continue;
+            }
+
+            const bool src1_on_device = id == src1_ctx->device;
+            const bool  dst_on_device = id == dst_ctx->device;
+            const int64_t row_diff = dev[id].row_high - dev[id].row_low;
+
+            ggml_cuda_set_device(id);
+            cudaStream_t stream = ctx.stream(id, is);
+
+            // wait for main GPU data if necessary
+            if (split && (id != ctx.device || is != 0)) {
+                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
+            }
+
+            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
+                const int64_t i03 = i0 / ne12;
+                const int64_t i02 = i0 % ne12;
+
+                size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                    src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+                } else {
+                    src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                }
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
+                char  *  src0_dd_i =  dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
+                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
+                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
+                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (id == ctx.device) {
+                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (src1_is_contiguous) {
+                    if (id != ctx.device) {
+                        if (quantize_src1) {
+                            char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
+                            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                                const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+                                const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+                                const size_t height = src1_padded_col_size/(4*QK8_1);
+                                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+                            } else {
+                                CUDA_CHECK(cudaMemcpyPeerAsync(
+                                    src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                            }
+                        } else {
+                            float * src1_ddf_i_source = (float *) src1->data;
+                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
+                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
+                                                            src1_ncols*ne10*sizeof(float), stream));
+                        }
+                    }
+                } else if (src1_on_device && !src1_is_contiguous) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                } else {
+                    GGML_ABORT("fatal error");
+                }
+
+                if (quantize_src1 && !src1_is_contiguous) {
+                    quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
+                    CUDA_CHECK(cudaGetLastError());
+                }
+
+                if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                        src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
+                }
+
+                // do the computation
+                op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
+                CUDA_CHECK(cudaGetLastError());
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device = dst->data;
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
+                        CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+                            dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0;
+                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
+                    }
+                }
+
+                // add event for the main device to wait on until other device is done
+                if (split && (id != ctx.device || is != 0)) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
+                }
+            }
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && ggml_backend_cuda_get_device_count() > 1) {
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
+
+        ggml_cuda_set_device(ctx.device);
+        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+            if (dev[id].row_low == dev[id].row_high) {
+                continue;
+            }
+            for (int64_t is = 0; is < is_max; ++is) {
+                CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
+            }
+        }
+    }
+}
+
+static __global__ void k_compute_batched_ptrs(
+        const half * src0_as_f16, const half * src1_as_f16, char * dst,
+        const void ** ptrs_src, void ** ptrs_dst,
+        int64_t ne12, int64_t ne13,
+        int64_t ne23,
+        size_t  nb02, size_t  nb03,
+        size_t  nb12, size_t  nb13,
+        size_t  nbd2, size_t  nbd3,
+        int64_t r2,   int64_t r3) {
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
+}
+
+static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t ne_dst = ggml_nelements(dst);
+
+    cudaStream_t main_stream = ctx.stream();
+
+    CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
+
+    void * src0_ddq = src0->data;
+    half * src0_f16 = (half *) src0_ddq;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf  = (float *) dst->data;
+
+    // convert src1 to fp16
+    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
+
+    ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
+    char * dst_t;
+
+    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    cudaDataType_t      cu_data_type    = CUDA_R_16F;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const half  alpha_f16 = 1.0f;
+    const half  beta_f16  = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32  = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta  = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_t = (char *) dst_f16.alloc(ne_dst);
+
+        nbd2 /= sizeof(float) / sizeof(half);
+        nbd3 /= sizeof(float) / sizeof(half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type    = CUDA_R_32F;
+
+        alpha = &alpha_f32;
+        beta  = &beta_f32;
+    }
+
+    if (GGML_CUDA_CC_IS_CDNA(ggml_cuda_info().devices[ctx.device].cc)) {
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        alpha = &alpha_f32;
+        beta  = &beta_f32;
+    }
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
+                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
+                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
+                            cu_compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+#ifdef GGML_USE_MUSA
+    GGML_ASSERT(false);
+#else // !GGML_USE_MUSA
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(
+        cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
+                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
+                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
+                ne12*ne13,
+                cu_compute_type,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    } else {
+        // use cublasGemmBatchedEx
+        const int ne23 = ne12*ne13;
+
+        ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
+        ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
+
+        dim3 block_dims(ne13, ne12);
+        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
+                src0_f16, src1_f16, dst_t,
+                ptrs_src.get(), ptrs_dst.get(),
+                ne12, ne13,
+                ne23,
+                nb02, nb03,
+                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
+                nbd2, nbd3,
+                r2, r3);
+        CUDA_CHECK(cudaGetLastError());
+
+        CUBLAS_CHECK(
+        cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                ne01, ne11, ne10,
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
+                ne23,
+                cu_compute_type,
+                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    }
+#endif // GGML_USE_MUSA
+#endif
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+    }
+}
+
+static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
+
+    bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    bool any_gpus_with_slow_fp16   = false;
+    bool any_gpus_without_fp16_mma = false;
+
+    if (split) {
+        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+        auto & tensor_split = buft_ctx->tensor_split;
+        for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+            // skip devices that are not going to do any work:
+            if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+                continue;
+            }
+
+            const int cc              = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_available(cc);
+            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
+        }
+    } else {
+        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_available(cc);
+        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
+    }
+
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+    if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
+        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+               && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // general KQ + KQV multi-batch without FlashAttention
+        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
+    } else if (use_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
+    } else if (use_mul_mat_vec_q) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
+    } else if (use_mul_mat_q) {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
+    } else {
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
+    }
+}
+
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
+                                                 int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
+                                                 const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+                                                 int64_t ne11, int64_t ne10,
+                                                 size_t nb11, size_t nb12) {
+    int32_t iid1 = blockIdx.x;
+    int32_t id = blockIdx.y;
+
+    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+    if (row_id_i != i02) {
+        return;
+    }
+
+    const int64_t i11 = id % ne11;
+    const int64_t i12 = iid1;
+
+    __shared__ int src1_row;
+    if (threadIdx.x == 0) {
+        src1_row = atomicAdd(cur_src1_row, 1);
+        row_mapping[src1_row] = {id, iid1};
+    }
+    __syncthreads();
+
+    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+    for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
+        src1_row_contiguous[i] = src1_row_original[i];
+    }
+}
+
+static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
+                                                  const mmid_row_mapping * __restrict__ row_mapping,
+                                                  int64_t ne0,
+                                                  size_t nb1, size_t nb2) {
+    int32_t i = blockIdx.x;
+
+    const int32_t i1 = row_mapping[i].i1;
+    const int32_t i2 = row_mapping[i].i2;
+
+    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+    for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
+        dst_row_original[j] = dst_row_contiguous[j];
+    }
+}
+
+static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * ids  = dst->src[2];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t n_as = ne02;
+    const int64_t n_ids = ids->ne[0];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    const char * ids_dev = (const char *) ids->data;
+    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    ggml_tensor src0_row = *src0;
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row  = *dst;
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *)  dst->data;
+
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = nb02;
+
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+
+    if (ne12 == 1) {
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;
+
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                dst_row.data  =  dst_original + i1*nb1   + i2*nb2;
+
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            }
+        }
+    } else {
+        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+        ggml_cuda_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+
+        src1_row.data = src1_contiguous.get();
+        dst_row.data  =  dst_contiguous.get();
+
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
+            int64_t num_src1_rows = 0;
+
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+                    if (row_id_i != i02) {
+                        continue;
+                    }
+
+                    num_src1_rows++;
+                }
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+
+            {
+                dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                dim3 grid_dims(ids->ne[1], n_ids);
+                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                        src1_original, src1_contiguous.get(),
+                        dev_cur_src1_row.get(), dev_row_mapping.get(),
+                        ids_dev, i02, ids->nb[1], ids->nb[0],
+                        ne11, ne10,
+                        nb11, nb12);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            src0_row.data = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+            src1_row.ne[1] = num_src1_rows;
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.ne[1] = num_src1_rows;
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+            {
+                dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                dim3 grid_dims(num_src1_rows);
+                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                        dst_original, dst_contiguous.get(),
+                        dev_row_mapping.get(),
+                        ne0,
+                        nb1, nb2);
+                CUDA_CHECK(cudaGetLastError());
+            }
+        }
+    }
+}
+
+static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+    // why is this here instead of mul_mat?
+    if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
+        ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
+    }
+
+    switch (dst->op) {
+        case GGML_OP_ARGMAX:
+            ggml_cuda_argmax(ctx, dst);
+            break;
+        case GGML_OP_COUNT_EQUAL:
+            ggml_cuda_count_equal(ctx, dst);
+            break;
+        case GGML_OP_REPEAT:
+            ggml_cuda_op_repeat(ctx, dst);
+            break;
+        case GGML_OP_REPEAT_BACK:
+            ggml_cuda_op_repeat_back(ctx, dst);
+            break;
+        case GGML_OP_GET_ROWS:
+            ggml_cuda_op_get_rows(ctx, dst);
+            break;
+        case GGML_OP_GET_ROWS_BACK:
+            ggml_cuda_op_get_rows_back(ctx, dst);
+            break;
+        case GGML_OP_DUP:
+            ggml_cuda_dup(ctx, dst);
+            break;
+        case GGML_OP_CPY:
+            ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
+            break;
+        case GGML_OP_CONT:
+            ggml_cuda_dup(ctx, dst);
+            break;
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1: // TODO: more efficient implementation
+            ggml_cuda_op_add(ctx, dst);
+            break;
+        case GGML_OP_SUB:
+            ggml_cuda_op_sub(ctx, dst);
+            break;
+        case GGML_OP_ACC:
+            ggml_cuda_op_acc(ctx, dst);
+            break;
+        case GGML_OP_MUL:
+            ggml_cuda_op_mul(ctx, dst);
+            break;
+        case GGML_OP_DIV:
+            ggml_cuda_op_div(ctx, dst);
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_NEG:
+                    ggml_cuda_op_neg(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_STEP:
+                    ggml_cuda_op_step(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_GELU:
+                    ggml_cuda_op_gelu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    ggml_cuda_op_silu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    ggml_cuda_op_gelu_quick(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    ggml_cuda_op_tanh(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    ggml_cuda_op_relu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SIGMOID:
+                    ggml_cuda_op_sigmoid(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_HARDSIGMOID:
+                    ggml_cuda_op_hardsigmoid(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_HARDSWISH:
+                    ggml_cuda_op_hardswish(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_EXP:
+                    ggml_cuda_op_exp(ctx, dst);
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            ggml_cuda_op_norm(ctx, dst);
+            break;
+        case GGML_OP_GROUP_NORM:
+            ggml_cuda_op_group_norm(ctx, dst);
+            break;
+        case GGML_OP_CONCAT:
+            ggml_cuda_op_concat(ctx, dst);
+            break;
+        case GGML_OP_UPSCALE:
+            ggml_cuda_op_upscale(ctx, dst);
+            break;
+        case GGML_OP_PAD:
+            ggml_cuda_op_pad(ctx, dst);
+            break;
+        case GGML_OP_ARANGE:
+            ggml_cuda_op_arange(ctx, dst);
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            ggml_cuda_op_timestep_embedding(ctx, dst);
+            break;
+        case GGML_OP_LEAKY_RELU:
+            ggml_cuda_op_leaky_relu(ctx, dst);
+            break;
+        case GGML_OP_SILU_BACK:
+            ggml_cuda_op_silu_back(ctx, dst);
+            break;
+        case GGML_OP_RMS_NORM:
+            ggml_cuda_op_rms_norm(ctx, dst);
+            break;
+        case GGML_OP_RMS_NORM_BACK:
+            ggml_cuda_op_rms_norm_back(ctx, dst);
+            break;
+        case GGML_OP_MUL_MAT:
+            ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            ggml_cuda_mul_mat_id(ctx, dst);
+            break;
+        case GGML_OP_OUT_PROD:
+            ggml_cuda_out_prod(ctx, dst);
+            break;
+        case GGML_OP_SCALE:
+            ggml_cuda_op_scale(ctx, dst);
+            break;
+        case GGML_OP_SQR:
+            ggml_cuda_op_sqr(ctx, dst);
+            break;
+        case GGML_OP_SQRT:
+            ggml_cuda_op_sqrt(ctx, dst);
+            break;
+        case GGML_OP_SIN:
+            ggml_cuda_op_sin(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_cuda_op_cos(ctx, dst);
+            break;
+        case GGML_OP_CLAMP:
+            ggml_cuda_op_clamp(ctx, dst);
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+                break;
+        case GGML_OP_DIAG_MASK_INF:
+            ggml_cuda_op_diag_mask_inf(ctx, dst);
+            break;
+        case GGML_OP_SOFT_MAX:
+            ggml_cuda_op_soft_max(ctx, dst);
+            break;
+        case GGML_OP_SOFT_MAX_BACK:
+            ggml_cuda_op_soft_max_back(ctx, dst);
+            break;
+        case GGML_OP_ROPE:
+            ggml_cuda_op_rope(ctx, dst);
+            break;
+        case GGML_OP_ROPE_BACK:
+            ggml_cuda_op_rope_back(ctx, dst);
+            break;
+        case GGML_OP_IM2COL:
+            ggml_cuda_op_im2col(ctx, dst);
+            break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_cuda_op_conv_transpose_1d(ctx,dst);
+            break;
+        case GGML_OP_POOL_2D:
+            ggml_cuda_op_pool2d(ctx, dst);
+            break;
+        case GGML_OP_SUM:
+            ggml_cuda_op_sum(ctx, dst);
+            break;
+        case GGML_OP_SUM_ROWS:
+            ggml_cuda_op_sum_rows(ctx, dst);
+            break;
+        case GGML_OP_ARGSORT:
+            ggml_cuda_op_argsort(ctx, dst);
+            break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            ggml_cuda_flash_attn_ext(ctx, dst);
+            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            ggml_cuda_cross_entropy_loss(ctx, dst);
+            break;
+        case GGML_OP_RWKV_WKV6:
+            ggml_cuda_op_rwkv_wkv6(ctx, dst);
+            break;
+        case GGML_OP_GATED_LINEAR_ATTN:
+            ggml_cuda_op_gated_linear_attn(ctx, dst);
+            break;
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+            ggml_cuda_cross_entropy_loss_back(ctx, dst);
+            break;
+        case GGML_OP_OPT_STEP_ADAMW:
+            ggml_cuda_opt_step_adamw(ctx, dst);
+            break;
+        default:
+            return false;
+    }
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
+        CUDA_CHECK(err);
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend
+
+static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    return cuda_ctx->name.c_str();
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
+}
+
+static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
+        return false;
+    }
+
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+
+    // device -> device copy
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
+#endif
+        return false;
+    }
+
+    if (backend_src != backend_dst) {
+        // copy on src stream
+        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
+        } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+            return false;
+#else
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
+#endif
+        }
+
+        // record event on src stream after the copy
+        if (!cuda_ctx_src->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_src->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+        }
+
+        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
+
+        // wait on dst stream for the copy to complete
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
+    } else {
+        // src and dst are on the same backend
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
+    }
+    return true;
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
+
+    GGML_UNUSED(backend);
+}
+
+#ifdef USE_CUDA_GRAPH
+static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+
+    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+
+        if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
+            use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
+#endif
+        }
+
+        if (node->op == GGML_OP_MUL_MAT_ID) {
+            use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+#endif
+        }
+
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+            // disable CUDA graphs for batch size > 1 for now.
+            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+            use_cuda_graph = false;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            // store the copy op parameter which changes with each token.
+            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+            // store a pointer to each copy op CUDA kernel to identify it later
+            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+            if (!ptr) {
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+#endif
+            } else {
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                }
+            }
+        }
+
+        if (!use_cuda_graph) {
+            break;
+        }
+    }
+
+    return use_cuda_graph;
+}
+
+static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node->data;
+    graph_node_properties->node_op = node->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+    }
+    memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
+}
+
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node->data != graph_node_properties->node_address &&
+          node->op != GGML_OP_CPY &&
+          node->op != GGML_OP_VIEW) {
+        return false;
+    }
+
+    if (node->op != graph_node_properties->node_op) {
+        return false;
+    }
+
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (node->ne[i] != graph_node_properties->ne[i]) {
+            return false;
+        }
+        if (node->nb[i] != graph_node_properties->nb[i]) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (node->src[i] &&
+            node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->op != GGML_OP_CPY &&
+            node->op != GGML_OP_VIEW
+        ) {
+            return false;
+        }
+    }
+
+    if (node->op == GGML_OP_SCALE &&
+        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+
+    if (cuda_graph_update_required) {
+        // Extract nodes from graph
+        // First call with null argument gets number of nodes in graph
+        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+        // Subsequent call with non-null argument gets nodes
+        cuda_ctx->cuda_graph->nodes.clear();
+        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+        cuda_ctx->cuda_graph->params.clear();
+        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+        if (cuda_ctx->cuda_graph->num_nodes > 0) {
+            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+            // Loop over nodes, and extract kernel parameters from each node
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                cudaGraphNodeType node_type;
+                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                if (node_type == cudaGraphNodeTypeKernel) {
+                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                    if (stat == cudaErrorInvalidDeviceFunction) {
+                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                        // We don't need to update blas nodes, so clear error and move on.
+                        (void)cudaGetLastError();
+                    } else {
+                        GGML_ASSERT(stat == cudaSuccess);
+                    }
+                }
+            }
+        }
+    } else {
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        // on update steps, the live parameters will already be captured
+        int k = 0;
+        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+            }
+        }
+    }
+}
+
+static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
+
+    bool cuda_graph_update_required = false;
+
+    if (cuda_ctx->cuda_graph->instance == nullptr) {
+        cuda_graph_update_required = true;
+    }
+
+    // Check if the graph size has changed
+    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+        cuda_graph_update_required = true;
+        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+    }
+
+    // Loop over nodes in GGML graph to determine if CUDA graph update is required
+    // and store properties to allow this comparison for the next token
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        bool has_matching_properties = true;
+        if (!cuda_graph_update_required) {
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        }
+        if (!has_matching_properties) {
+            cuda_graph_update_required = true;
+        }
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+    }
+
+    return cuda_graph_update_required;
+}
+
+static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
+
+    cudaGraphExecUpdateResultInfo result_info;
+#ifdef __HIP_PLATFORM_AMD__
+    hipGraphNode_t errorNode;
+    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+#else
+    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+#endif
+    if (stat == cudaErrorGraphExecUpdateFailure) {
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
+#endif
+
+        // The pre-existing graph exec cannot be updated due to violated constraints
+        // so instead clear error and re-instantiate
+        (void)cudaGetLastError();
+        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+        cuda_ctx->cuda_graph->instance = nullptr;
+        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+    } else {
+        GGML_ASSERT(stat == cudaSuccess);
+    }
+}
+#endif
+
+static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+   [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs,  bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+    bool & cuda_graph_update_required) {
+
+    while (!graph_evaluated_or_captured) {
+        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+        // With the use of CUDA graphs, the execution will be performed by the graph launch.
+        if (!use_cuda_graph || cuda_graph_update_required) {
+            for (int i = 0; i < cgraph->n_nodes; i++) {
+                ggml_tensor * node = cgraph->nodes[i];
+
+                if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                    continue;
+                }
+
+#ifndef NDEBUG
+                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    if (node->src[j] != nullptr) {
+                        assert(node->src[j]->buffer);
+                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
+                               ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                    }
+                }
+#endif
+
+                bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+                if (!ok) {
+                    GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                }
+                GGML_ASSERT(ok);
+            }
+        }
+
+#ifdef USE_CUDA_GRAPH
+        if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
+            if (cuda_ctx->cuda_graph->graph != nullptr) {
+                CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
+                cuda_ctx->cuda_graph->graph = nullptr;
+            }
+
+            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+            graph_evaluated_or_captured = true; // CUDA graph has been captured
+        } else {
+            graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
+        }
+    }
+
+    if (use_cuda_graph) {
+        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+
+        // Update graph executable
+        update_cuda_graph_executable(cuda_ctx);
+
+        // Launch graph
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+#else
+        graph_evaluated_or_captured = true;
+#endif  // USE_CUDA_GRAPH
+    }
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    // vector of pointers to CUDA cpy kernels, which are required to identify
+    // kernel parameters which need updated in the graph for each token
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
+#ifdef USE_CUDA_GRAPH
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
+
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+#endif
+        }
+    }
+
+    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+    // or previous graph capture failure.
+    // Also disable for multi-gpu for now. TO DO investigate
+    if (disable_cuda_graphs_due_to_env
+        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+        use_cuda_graph = false;
+    }
+
+    if (use_cuda_graph) {
+        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
+
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+                             ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (use_cuda_graph && cuda_graph_update_required) {
+            cuda_ctx->cuda_graph->number_consecutive_updates++;
+        } else {
+            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+        }
+
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+#endif
+        }
+    }
+
+    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+    }
+
+#else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+#endif // USE_CUDA_GRAPH
+
+    bool graph_evaluated_or_captured = false;
+
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+
+    return GGML_STATUS_SUCCESS;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    if (ggml_backend_is_cuda(backend)) {
+        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
+    } else {
+#if 0
+        // untested
+        auto wait_fn = [](void * user_data) {
+            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+            ggml_backend_event_synchronize(event);
+        };
+
+        CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
+#endif
+        GGML_ABORT("fatal error");
+    }
+}
+
+static const ggml_backend_i ggml_backend_cuda_interface = {
+    /* .get_name                = */ ggml_backend_cuda_get_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .event_record            = */ ggml_backend_cuda_event_record,
+    /* .event_wait              = */ ggml_backend_cuda_event_wait,
+};
+
+static ggml_guid_t ggml_backend_cuda_guid() {
+    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
+    return &guid;
+}
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
+}
+
+int ggml_backend_cuda_get_device_count() {
+    return ggml_cuda_info().device_count;
+}
+
+void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
+
+void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+    ggml_cuda_set_device(device);
+
+    CUDA_CHECK(cudaMemGetInfo(free, total));
+}
+
+bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
+    if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
+        return false;
+    }
+
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
+    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
+    if (err != cudaSuccess) {
+        // clear the error
+        (void)cudaGetLastError();
+
+        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
+        return false;
+    }
+    return true;
+#else
+    return false;
+#endif
+}
+
+void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
+    if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
+        return;
+    }
+
+    cudaError_t err = cudaHostUnregister(buffer);
+    if (err != cudaSuccess) {
+        // clear the error
+        (void)cudaGetLastError();
+    }
+}
+
+
+// backend device
+
+struct ggml_backend_cuda_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(cudaMemGetInfo(free, total));
+}
+
+static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_cuda_device_get_name(dev);
+    props->description = ggml_backend_cuda_device_get_description(dev);
+    props->type        = ggml_backend_cuda_device_get_type(dev);
+    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
+#ifdef GGML_CUDA_NO_PEER_COPY
+    bool events = false;
+#else
+    bool events = true;
+#endif
+
+    props->caps = {
+        /* .async                 = */ true,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ events,
+    };
+}
+
+static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    return ggml_backend_cuda_init(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    return ggml_backend_cuda_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_cuda_host_buffer_type();
+}
+
+// TODO: move these functions here
+static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+
+    // split buffers can only be used with GGML_OP_MUL_MAT
+    if (op->op != GGML_OP_MUL_MAT) {
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
+                return false;
+            }
+        }
+    }
+
+    // check if all the sources are allocated on this device
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
+            ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
+            if (buft_ctx->device != dev_ctx->device) {
+                return false;
+            }
+        }
+    }
+
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_EXP:
+                    return ggml_is_contiguous(op->src[0]);
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a = op->src[0];
+                struct ggml_tensor * b = op->src[1];
+                // for small weight matrices the active device can end up without any rows, don't use row split in those cases
+                // this avoids some edge cases (and the performance would not be good anyways)
+                if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
+                    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
+                    int64_t row_low;
+                    int64_t row_high;
+                    get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
+                    if (row_low == row_high) {
+                        return false;
+                    }
+                }
+                if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                    return false;
+                }
+#ifdef GGML_USE_MUSA
+                if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
+                    !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                    return false;
+                }
+#endif // GGML_USE_MUSA
+                switch (a->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                    case GGML_TYPE_Q8_K:
+                    case GGML_TYPE_IQ1_M:
+                    case GGML_TYPE_IQ1_S:
+                    case GGML_TYPE_IQ2_S:
+                    case GGML_TYPE_IQ2_XS:
+                    case GGML_TYPE_IQ2_XXS:
+                    case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ3_XXS:
+                    case GGML_TYPE_IQ4_NL:
+                    case GGML_TYPE_IQ4_XS:
+                    case GGML_TYPE_BF16:
+#ifdef GGML_USE_MUSA
+                        if (a->type == GGML_TYPE_Q3_K) {
+                            return false;
+                        }
+#endif // GGML_USE_MUSA
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_OUT_PROD:
+            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_GET_ROWS_BACK:
+            {
+                return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_DUP:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
+        case GGML_OP_ARGMAX:
+        case GGML_OP_COUNT_EQUAL:
+            {
+                return true;
+            } break;
+        case GGML_OP_REPEAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
+        case GGML_OP_REPEAT_BACK:
+                return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
+        case GGML_OP_CONCAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_SILU_BACK:
+            return ggml_is_contiguous(op->src[0]);
+            break;
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+            return true;
+        case GGML_OP_RMS_NORM_BACK:
+            return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+            return true;
+        case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+            return true;
+        case GGML_OP_SOFT_MAX_BACK: {
+            float max_bias = 0.0f;
+            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
+            return max_bias == 0.0f;
+        }
+        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK: {
+            const size_t ts = ggml_type_size(op->src[0]->type);
+            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
+            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_SUM:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+            return true;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_LEAKY_RELU:
+        case GGML_OP_RWKV_WKV6:
+        case GGML_OP_GATED_LINEAR_ATTN:
+            return true;
+        case GGML_OP_FLASH_ATTN_EXT: {
+#ifndef FLASH_ATTN_AVAILABLE
+            return false;
+#endif
+            if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
+                return false;
+            }
+            if (op->src[0]->ne[0] ==  64 && op->src[1]->type == GGML_TYPE_F16) {
+                return true;
+            }
+            if (op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
+                return true;
+            }
+            const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+            return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+        }
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
+        case GGML_OP_OPT_STEP_ADAMW:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+}
+
+static int64_t get_op_batch_size(const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_GET_ROWS:
+            return 0;
+        case GGML_OP_MUL_MAT:
+            return op->ne[1];
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
+            return op->ne[2];
+        default:
+            return ggml_nrows(op);
+    }
+}
+
+static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+
+    return get_op_batch_size(op) >= min_batch_size;
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
+#ifdef GGML_CUDA_NO_PEER_COPY
+    return nullptr;
+#else
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
+
+    ggml_cuda_set_device(dev_ctx->device);
+
+    cudaEvent_t event;
+    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+    return new ggml_backend_event {
+        /* .device  = */ dev,
+        /* .context = */ event,
+    };
+#endif
+}
+
+static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    GGML_UNUSED(dev);
+
+    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+    delete event;
+}
+
+static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
+    GGML_UNUSED(dev);
+    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
+static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+    /* .get_name                = */ ggml_backend_cuda_device_get_name,
+    /* .get_description         = */ ggml_backend_cuda_device_get_description,
+    /* .get_memory              = */ ggml_backend_cuda_device_get_memory,
+    /* .get_type                = */ ggml_backend_cuda_device_get_type,
+    /* .get_props               = */ ggml_backend_cuda_device_get_props,
+    /* .init_backend            = */ ggml_backend_cuda_device_init_backend,
+    /* .get_buffer_type         = */ ggml_backend_cuda_device_get_buffer_type,
+    /* .get_host_buffer_type    = */ ggml_backend_cuda_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr    = */ NULL,
+    /* .supports_op             = */ ggml_backend_cuda_device_supports_op,
+    /* .supports_buft           = */ ggml_backend_cuda_device_supports_buft,
+    /* .offload_op              = */ ggml_backend_cuda_device_offload_op,
+    /* .event_new               = */ ggml_backend_cuda_device_event_new,
+    /* .event_free              = */ ggml_backend_cuda_device_event_free,
+    /* .event_synchronize       = */ ggml_backend_cuda_device_event_synchronize,
+};
+
+// backend reg
+
+struct ggml_backend_cuda_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return GGML_CUDA_NAME;
+}
+
+static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
+    ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
+    return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+}
+
+static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        std::vector<ggml_backend_feature> features;
+    #define _STRINGIFY(...) #__VA_ARGS__
+    #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
+
+    #ifdef __CUDA_ARCH_LIST__
+        features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_MMQ
+        features.push_back({ "FORCE_MMQ", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_CUBLAS
+        features.push_back({ "FORCE_CUBLAS", "1" });
+    #endif
+
+    #ifndef GGML_USE_VMM
+        features.push_back({ "NO_VMM", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_PEER_COPY
+        features.push_back({ "NO_PEER_COPY", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_F16
+        features.push_back({ "F16", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_USE_GRAPHS
+        features.push_back({ "USE_GRAPHS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+        features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
+    #endif
+
+    #ifdef GGML_CUDA_FA_ALL_QUANTS
+        features.push_back({ "FA_ALL_QUANTS", "1" });
+    #endif
+
+    #undef _STRINGIFY
+    #undef STRINGIFY
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_cuda_split_buffer_type;
+    }
+    if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
+        return (void *)ggml_backend_cuda_register_host_buffer;
+    }
+    if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
+        return (void *)ggml_backend_cuda_unregister_host_buffer;
+    }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cuda_get_features;
+    }
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
+    /* .get_name          = */ ggml_backend_cuda_reg_get_name,
+    /* .get_device_count  = */ ggml_backend_cuda_reg_get_device_count,
+    /* .get_device        = */ ggml_backend_cuda_reg_get_device,
+    /* .get_proc_address  = */ ggml_backend_cuda_reg_get_proc_address,
+};
+
+// backend registry
+ggml_backend_reg_t ggml_backend_cuda_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+
+            for (int i = 0; i < ggml_cuda_info().device_count; i++) {
+                ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
+
+                ggml_cuda_set_device(i);
+                cudaDeviceProp prop;
+                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+                dev_ctx->description = prop.name;
+
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .iface   = */ ggml_backend_cuda_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_cuda_reg_interface,
+                /* .context     = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}
+
+ggml_backend_t ggml_backend_cuda_init(int device) {
+    if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
+        GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    if (ctx == nullptr) {
+        GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
+        return nullptr;
+    }
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_cuda_guid(),
+        /* .interface = */ ggml_backend_cuda_interface,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
+        /* .context   = */ ctx,
+    };
+
+    return cuda_backend;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
diff --git a/ggml/src/ggml-cuda/gla.cu b/ggml/src/ggml-cuda/gla.cu
new file mode 100644
index 000000000..f7d615a82
--- /dev/null
+++ b/ggml/src/ggml-cuda/gla.cu
@@ -0,0 +1,93 @@
+#include "common.cuh"
+#include "gla.cuh"
+
+template<int HEAD_SIZE>
+static __global__ void gated_linear_attn_f32(const int B, const int T, const int C, const int H, const float scale,
+     const float * k, const float * v, const float * r, const float * td, const float * s, float * dst) {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    const int head_size = HEAD_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    float state[head_size];
+    __shared__ float _k[head_size], _r[head_size], _td[head_size];
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
+        __syncthreads();
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+        __syncthreads();
+
+        const float _v = v[t];
+        float y = 0;
+        for (int j = 0; j < head_size; j += 4) {
+            const float4 & k = (float4 &)(_k[j]);
+            const float4 & r = (float4 &)(_r[j]);
+            const float4 & td = (float4 &)(_td[j]);
+            float4 & s = (float4 &)(state[j]);
+            float4 kv;
+
+            kv.x = k.x * _v;
+            kv.y = k.y * _v;
+            kv.z = k.z * _v;
+            kv.w = k.w * _v;
+
+            s.x = s.x * td.x + kv.x;
+            s.y = s.y * td.y + kv.y;
+            s.z = s.z * td.z + kv.z;
+            s.w = s.w * td.w + kv.w;
+
+            y += r.x * s.x;
+            y += r.y * s.y;
+            y += r.z * s.z;
+            y += r.w * s.w;
+        }
+        dst[t] = y * scale;
+    }
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const float * k_d  = (const float *)dst->src[0]->data;
+    const float * v_d  = (const float *)dst->src[1]->data;
+    const float * r_d  = (const float *)dst->src[2]->data;
+    const float * td_d = (const float *)dst->src[3]->data;
+    const float * s_d  = (const float *)dst->src[4]->data;
+
+    const int64_t B = dst->src[4]->ne[1];
+    const int64_t T = dst->src[0]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[1];
+
+    float scale;
+    memcpy(&scale, (float*)dst->op_params, sizeof(float));
+
+    float * dst_d = (float *)dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == 64 || C / H == 128);
+
+
+    if (C / H == 64) {
+        gated_linear_attn_f32<64><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
+    } else {
+        gated_linear_attn_f32<128><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
+    }
+}
diff --git a/ggml/src/ggml-cuda/gla.cuh b/ggml/src/ggml-cuda/gla.cuh
new file mode 100644
index 000000000..2c82ad7dd
--- /dev/null
+++ b/ggml/src/ggml-cuda/gla.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu
new file mode 100644
index 000000000..86a54e42b
--- /dev/null
+++ b/ggml/src/ggml-cuda/im2col.cu
@@ -0,0 +1,103 @@
+#include "im2col.cuh"
+
+template <typename T>
+static  __global__ void im2col_kernel(
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t  kx = i / ksize;
+    const int64_t  kd = kx * ksize;
+    const int64_t  ky = (i - kd) / OW;
+    const int64_t  ix = i % OW;
+
+    const int64_t  oh = blockIdx.y;
+    const int64_t  batch = blockIdx.z / IC;
+    const int64_t  ic = blockIdx.z % IC;
+
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = oh * s1 + ky * d1 - p1;
+
+    const int64_t offset_dst =
+        ((batch * OH + oh) * OW + ix) * CHW +
+        (ic * (KW * KH) + ky * KW + kx);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+        dst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
+template <typename T>
+static void im2col_cuda(const float * x, T* dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, batch * IC);
+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
+static void im2col_cuda_f16(const float * x, half * dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+}
+
+static void im2col_cuda_f32(const float * x, float * dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+}
+
+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t  delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch        = src1->ne[is_2D ? 3 : 2];
+    const size_t  batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+
+    if(dst->type == GGML_TYPE_F16) {
+        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+    } else {
+        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+    }
+}
diff --git a/ggml/src/ggml-cuda/im2col.cuh b/ggml/src/ggml-cuda/im2col.cuh
new file mode 100644
index 000000000..1ce8fae4d
--- /dev/null
+++ b/ggml/src/ggml-cuda/im2col.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
new file mode 100644
index 000000000..bbc0a35ae
--- /dev/null
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -0,0 +1,458 @@
+// This file contains primitives that expose the tensor core PTX instructions for CUDA code.
+// The primitives can be used in a similar way as the nvcuda::wmma interface but with a well-defined memory layout.
+// The documentation for the PTX instructions can be found under:
+//   https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-multiply-accumulate-operation-using-mma-instruction
+//
+// Like with nvcuda::wmma there are three types of matrix tiles: A, B, and C with A @ B = C.
+// A is a row-major matrix with shape I x K.
+// B is a column-major matrix with shape K x J.
+// C is a column-major matrix with shape I x J.
+// Note that along their lowest dimension I, J, and K are measured in physical 32 bit elements instead of logical elements.
+// The functions get_i, get_j, and get_k can be used to get the physical 32 bit index of the lth element of a thread within a tile.
+// All matrix tiles have ne physical 32 bit elements per warp.
+//
+// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+
+#include "common.cuh"
+
+
+#if CUDART_VERSION >= 11080
+
+static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
+    int ret = 0;
+
+#ifdef NEW_MMA_AVAILABLE
+    asm("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;"
+        : "+r"(ret) : "r"(x));
+#else
+    NO_DEVICE_CODE;
+#endif // defined(NEW_MMA_AVAILABLE)
+    return ret;
+}
+
+#else
+
+static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
+    // Imagine transposing row-major matrix to column-major matrix.
+    const int src_i_low  = 2 * (threadIdx.x % 4);
+    const int src_i_high = src_i_low + 1;
+    const int src_j      = threadIdx.x / 4;
+
+    const int src_laneid_low  = src_i_low  * 4 + src_j / 2;
+    const int src_laneid_high = src_i_high * 4 + src_j / 2;
+
+    const int shift_low  = ((src_j + 0) % 2) * 16;
+    const int shift_high = ((src_j + 1) % 2) * 16;
+
+    const int ret_low  = (__shfl_sync(0xFFFFFFFF, x, src_laneid_low,  WARP_SIZE) >> shift_low)  & 0x0000FFFF;
+    const int ret_high = (__shfl_sync(0xFFFFFFFF, x, src_laneid_high, WARP_SIZE) << shift_high) & 0xFFFF0000;
+
+    return ret_low | ret_high;
+}
+
+#endif // CUDART_VERSION >= 11080
+
+
+template <typename T>
+struct mma_A_I16K4 {
+    static_assert(sizeof(T) == 4, "bad type size");
+
+    static constexpr int I  = 16;
+    static constexpr int K  = 4;
+    static constexpr int ne = 2;
+
+    T x[ne];
+
+    static __device__ __forceinline__ int get_i(const int l) {
+        const int ret = (l%2) * (I/2) + threadIdx.x / K;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  I);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_k(const int /* l */) {
+        const int ret = threadIdx.x % K;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  K);
+        return ret;
+    }
+
+    __device__ __forceinline__ void load_generic(const T * __restrict__ xs0, const int & stride) {
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+    }
+
+    __device__ __forceinline__ void load_ldmatrix(const T * __restrict__ xs0, const int & stride) {
+#ifdef NEW_MMA_AVAILABLE
+        int * xi = (int *) x;
+        const int * xs = (const int *) xs0 + (threadIdx.x%I)*stride;
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(xi[0]), "+r"(xi[1])
+            : "l"(xs));
+#else
+        load_generic(xs0, stride);
+#endif // NEW_MMA_AVAILABLE
+    }
+};
+
+template <typename T>
+struct mma_A_I16K8 {
+    static_assert(sizeof(T) == 4, "bad type size");
+
+    static constexpr int I  = 16;
+    static constexpr int K  = 8;
+    static constexpr int ne = 4;
+
+    T x[ne];
+
+    static __device__ __forceinline__ int get_i(const int l) {
+        const int ret = (l%2) * (I/2) + threadIdx.x / (K/2);
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  I);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_k(const int l) {
+        const int ret = (l/2) * (K/2) + threadIdx.x % (K/2);
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  K);
+        return ret;
+    }
+
+    __device__ __forceinline__ void load_generic(const T * __restrict__ xs0, const int & stride) {
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+    }
+
+    __device__ __forceinline__ void load_ldmatrix(const T * __restrict__ xs0, const int & stride) {
+#ifdef NEW_MMA_AVAILABLE
+        int * xi = (int * ) x;
+        const int * xs = (const int *) xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
+            : "+r"(xi[0]), "+r"(xi[1]), "+r"(xi[2]), "+r"(xi[3])
+            : "l"(xs));
+#else
+        GGML_UNUSED(xs0);
+        GGML_UNUSED(stride);
+        NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    __device__ __forceinline__ void load_ldmatrix_trans(const T * __restrict__ xs0, const int & stride) {
+#ifdef NEW_MMA_AVAILABLE
+        int * xi = (int * ) x;
+        const int * xs = (const int *) xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];"
+            : "+r"(xi[0]), "+r"(xi[2]), "+r"(xi[1]), "+r"(xi[3])
+            : "l"(xs));
+#else
+        GGML_UNUSED(xs0);
+        GGML_UNUSED(stride);
+        NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    __device__ __forceinline__ void transpose() {
+        int * xi  = (int *) x;
+        xi[0] = ggml_cuda_movmatrix(xi[0]);
+
+        const int tmp = ggml_cuda_movmatrix(xi[1]);
+        xi[1] = ggml_cuda_movmatrix(xi[2]);
+        xi[2] = tmp;
+
+        xi[3] = ggml_cuda_movmatrix(xi[3]);
+    }
+};
+
+template <typename T>
+struct mma_B_J8K4 {
+    static_assert(sizeof(T) == 4, "bad type size");
+
+    static constexpr int J  = 8;
+    static constexpr int K  = 4;
+    static constexpr int ne = 1;
+
+    T x[ne];
+
+    static __device__ __forceinline__ int get_j(const int /* l */) {
+        const int ret = threadIdx.x / K;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  J);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_k(const int /* l */) {
+        const int ret = threadIdx.x % K;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  K);
+        return ret;
+    }
+
+    __device__ __forceinline__ void load_generic(const T * __restrict__ xs0, const int & stride) {
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+    }
+
+    __device__ __forceinline__ void load_ldmatrix(const T * __restrict__ xs0, const int & stride) {
+#ifdef NEW_MMA_AVAILABLE
+        int * xi = (int *) x;
+        const int * xs = (const int *) xs0 + (threadIdx.x%J)*stride;
+        asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
+            : "+r"(xi[0]) : "l"(xs));
+#else
+        load_generic(xs0, stride);
+#endif // NEW_MMA_AVAILABLE
+    }
+};
+
+template <typename T>
+struct mma_B_J8K8 {
+    static_assert(sizeof(T) == 4, "bad type size");
+
+    static constexpr int J  = 8;
+    static constexpr int K  = 8;
+    static constexpr int ne = 2;
+
+    T x[ne];
+
+    static __device__ __forceinline__ int get_j(const int /* l */) {
+        const int ret = threadIdx.x / (K/2);
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  J);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_k(const int l) {
+        const int ret = l * (K/2) + threadIdx.x % (K/2);
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  K);
+        return ret;
+    }
+
+    __device__ __forceinline__ void load_generic(const T * __restrict__ xs0, const int & stride) {
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+    }
+
+    __device__ __forceinline__ void load_ldmatrix(const T * __restrict__ xs0, const int & stride) {
+#ifdef NEW_MMA_AVAILABLE
+        int * xi = (int *) x;
+        const int * xs = (const int *) xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(xi[0]), "+r"(xi[1])
+            : "l"(xs));
+#else
+        load_generic(xs0, stride);
+#endif // NEW_MMA_AVAILABLE
+    }
+};
+
+template <typename T>
+struct mma_C_I16J8 {};
+
+template <>
+struct mma_C_I16J8<int> {
+    static constexpr int I  = 16;
+    static constexpr int J  = 8;
+    static constexpr int ne = 4;
+
+    int x[ne] = {0};
+
+    static __device__ __forceinline__ int get_i(const int l) {
+        const int ret = (l/2) * (I/2) + threadIdx.x / (J/2);
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  I);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_j(const int l) {
+        const int ret = 2 * (threadIdx.x % (J/2)) + l%2;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  J);
+        return ret;
+    }
+
+    __device__ __forceinline__ void mma(const mma_A_I16K4<int> & mma_A, const mma_B_J8K4<int> & mma_B) {
+#ifdef NEW_MMA_AVAILABLE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+        asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
+            : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
+            : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
+#else
+        // On Turing m16n8k16 mma is not available, use 2x m8n8k16 mma instead:
+        asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
+            : "+r"(x[0]), "+r"(x[1])
+            : "r"(mma_A.x[0]), "r"(mma_B.x[0]));
+        asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
+            : "+r"(x[2]), "+r"(x[3])
+            : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#else
+        GGML_UNUSED(mma_A);
+        GGML_UNUSED(mma_B);
+        NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    __device__ __forceinline__ void mma(const mma_A_I16K8<int> & mma_A, const mma_B_J8K8<int> & mma_B) {
+#ifdef NEW_MMA_AVAILABLE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+        asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
+            : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
+            : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
+#else
+        // On Turing m16n8k32 mma is not available, use 4x m8n8k16 mma instead:
+        asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
+            : "+r"(x[0]), "+r"(x[1])
+            : "r"(mma_A.x[0]), "r"(mma_B.x[0]));
+        asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
+            : "+r"(x[2]), "+r"(x[3])
+            : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
+        asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
+            : "+r"(x[0]), "+r"(x[1])
+            : "r"(mma_A.x[2]), "r"(mma_B.x[1]));
+        asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
+            : "+r"(x[2]), "+r"(x[3])
+            : "r"(mma_A.x[3]), "r"(mma_B.x[1]));
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#else
+        GGML_UNUSED(mma_A);
+        GGML_UNUSED(mma_B);
+        NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+    }
+};
+
+template <>
+struct mma_C_I16J8<half2> {
+    static constexpr int I  = 16;
+    static constexpr int J  = 4;
+    static constexpr int ne = 2;
+
+    half2 x[ne] = {{0.0f, 0.0f}, {0.0f, 0.0f}};
+
+    static __device__ __forceinline__ int get_i(const int l) {
+        const int ret = l * (I/2) + threadIdx.x / J;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  I);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_j(const int /* l */) {
+        const int ret = threadIdx.x % J;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  J);
+        return ret;
+    }
+
+    __device__ __forceinline__ void mma(const mma_A_I16K8<half2> & mma_A, const mma_B_J8K8<half2> & mma_B) {
+#ifdef NEW_MMA_AVAILABLE
+        int * Axi = (int *) mma_A.x;
+        int * Bxi = (int *) mma_B.x;
+        int * xi  = (int *) x;
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+        asm("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%0, %1};"
+            : "+r"(xi[0]), "+r"(xi[1])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
+#else
+        // On Turing m16n8k16 mma is not available, use 2x m8n8k8 mma instead:
+        asm("mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0, %1}, {%2, %3}, {%4}, {%0, %1};"
+            : "+r"(xi[0]), "+r"(xi[1])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]));
+        asm("mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0, %1}, {%2, %3}, {%4}, {%0, %1};"
+            : "+r"(xi[0]), "+r"(xi[1])
+            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[1]));
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#else
+        GGML_UNUSED(mma_A);
+        GGML_UNUSED(mma_B);
+        NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    __device__ __forceinline__ mma_B_J8K8<half2> to_mma_B() {
+        mma_B_J8K8<half2> mma_B;
+
+        int * xi   = (int *) x;
+        int * Bxi  = (int *) mma_B.x;
+        Bxi[0] = ggml_cuda_movmatrix(xi[0]);
+        Bxi[1] = ggml_cuda_movmatrix(xi[1]);
+
+        return mma_B;
+    }
+};
+
+template <>
+struct mma_C_I16J8<float> {
+    static constexpr int I  = 16;
+    static constexpr int J  = 8;
+    static constexpr int ne = 4;
+
+    float x[ne] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+    static __device__ __forceinline__ int get_i(const int l) {
+        const int ret = (l/2) * (I/2) + threadIdx.x / (J/2);
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  I);
+        return ret;
+    }
+
+    static __device__ __forceinline__ int get_j(const int l) {
+        const int ret = 2 * (threadIdx.x % (J/2)) + l%2;
+        GGML_CUDA_ASSUME(ret >= 0);
+        GGML_CUDA_ASSUME(ret <  J);
+        return ret;
+    }
+
+    __device__ __forceinline__ void mma(const mma_A_I16K8<half2> & mma_A, const mma_B_J8K8<half2> & mma_B) {
+#ifdef NEW_MMA_AVAILABLE
+        int * Axi = (int *) mma_A.x;
+        int * Bxi = (int *) mma_B.x;
+        int * xi  = (int *) x;
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+        asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
+            : "+r"(xi[0]), "+r"(xi[1]), "+r"(xi[2]), "+r"(xi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]));
+#else
+        // On Turing m16n8k16 mma is not available, use 2x m8n8k8 mma instead:
+        asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
+            : "+r"(xi[0]), "+r"(xi[1]), "+r"(xi[2]), "+r"(xi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]));
+        asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
+            : "+r"(xi[0]), "+r"(xi[1]), "+r"(xi[2]), "+r"(xi[3])
+            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[1]));
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#else
+        GGML_UNUSED(mma_A);
+        GGML_UNUSED(mma_B);
+        NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    __device__ __forceinline__ mma_B_J8K8<half2> to_mma_B() {
+        mma_B_J8K8<half2> mma_B;
+        mma_B.x[0] = make_half2(x[0], x[1]);
+        mma_B.x[1] = make_half2(x[2], x[3]);
+
+        int * Bxi  = (int *) mma_B.x;
+        Bxi[0] = ggml_cuda_movmatrix(Bxi[0]);
+        Bxi[1] = ggml_cuda_movmatrix(Bxi[1]);
+
+        return mma_B;
+    }
+
+    __device__ __forceinline__ void load_generic(const float * __restrict__ xs0, const int & stride) {
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_i(l)];
+        }
+    }
+};
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
new file mode 100644
index 000000000..45212f66c
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -0,0 +1,152 @@
+#include "mmq.cuh"
+
+void ggml_cuda_op_mul_mat_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+    const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
+
+    int id = ggml_cuda_get_device();
+    const int compute_capability = ggml_cuda_info().devices[id].cc;
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the kernel writes into
+    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
+    // Also its fixup needs to allocate a temporary buffer in the memory pool.
+    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
+    const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+}
+
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    return false;
+#endif // GGML_CUDA_FORCE_CUBLAS
+
+    bool mmq_supported;
+
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            mmq_supported = true;
+            break;
+        default:
+            mmq_supported = false;
+            break;
+    }
+
+    if (!mmq_supported) {
+        return false;
+    }
+
+    if (new_mma_available(cc)) {
+        return true;
+    }
+
+    if (cc < GGML_CUDA_CC_DP4A) {
+        return false;
+    }
+
+#ifdef GGML_CUDA_FORCE_MMQ
+    return true;
+#endif //GGML_CUDA_FORCE_MMQ
+
+    if (cc < GGML_CUDA_CC_OFFSET_AMD) {
+        return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    }
+
+    return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+}
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
new file mode 100644
index 000000000..7a2c4d85b
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -0,0 +1,2939 @@
+#pragma once
+
+#include "common.cuh"
+#include "vecdotq.cuh"
+#include "mma.cuh"
+
+#include <climits>
+#include <cstdint>
+
+#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
+#define MMQ_ITER_K 256
+#define MMQ_NWARPS 8
+
+typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride);
+typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00);
+typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);
+
+enum mmq_q8_1_ds_layout {
+    MMQ_Q8_1_DS_LAYOUT_D4,
+    MMQ_Q8_1_DS_LAYOUT_DS4,
+    MMQ_Q8_1_DS_LAYOUT_D2S6,
+};
+
+struct block_q8_1_mmq {
+    // The y float data is converted to a data layout that can simply be copied to shared memory as a contiguous block.
+    // The y float data is first grouped as blocks of 128 values.
+    // These blocks are then treated as individual data values and transposed.
+    //
+    // To avoid shared memory bank conflicts each block is padded with 16 bytes.
+    // This padding is also used to store block scales/partial sums.
+    // The scales multiplied with the quantized data are equal to the unquantized values.
+    // The partial sums are obtained by summing up a subgroup of the contained values (prior to quantization)
+    //     and are only needed for performance reasons.
+    //
+    // The exact data stored depends on the x data type.
+    union {
+        float d4[4];    // 1 32 bit scale per 32 values, stored as d0,d1,d2,d3
+        half2 ds4[4];   // 1 16 bit scale + 1 16 bit partial sum per 32 values, stored as d0,s0,d1,s1,d2,s2,d3,s3
+        half  d2s6[8];  // 1 16 bit scale per 64 values + 1 16 bit partial sum per 16 values for the first 96 values,
+                        //     stored as d0,d1,s1,s2,s3,s4,s5
+    };
+    int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
+};
+static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
+static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1),      "Unexpected block_q8_1_mmq size");
+
+static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
+    switch (type_x) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return MMQ_Q8_1_DS_LAYOUT_DS4;
+        case GGML_TYPE_Q5_0:
+            return MMQ_Q8_1_DS_LAYOUT_D4;
+        case GGML_TYPE_Q5_1:
+            return MMQ_Q8_1_DS_LAYOUT_DS4;
+        case GGML_TYPE_Q8_0:
+            return MMQ_Q8_1_DS_LAYOUT_D4;
+        case GGML_TYPE_Q2_K:
+            return MMQ_Q8_1_DS_LAYOUT_D2S6;
+        case GGML_TYPE_Q3_K:
+            return MMQ_Q8_1_DS_LAYOUT_D4;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return MMQ_Q8_1_DS_LAYOUT_DS4;
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+            return MMQ_Q8_1_DS_LAYOUT_D4;
+        case GGML_TYPE_IQ1_S:
+            return MMQ_Q8_1_DS_LAYOUT_DS4;
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            return MMQ_Q8_1_DS_LAYOUT_D4;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+struct tile_x_sizes {
+    int qs;
+    int dm;
+    int sc;
+};
+
+static constexpr int get_mmq_x_max_host(const int cc) {
+    return new_mma_available(cc) ? 128 :
+#ifdef GGML_CUDA_FORCE_MMQ
+        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128                     : 64;
+#else
+        cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
+#endif // GGML_CUDA_FORCE_MMQ
+}
+
+static constexpr __device__ int get_mmq_x_max_device() {
+#ifdef NEW_MMA_AVAILABLE
+    return 128;
+#else // NEW_MMA_AVAILABLE
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    return 128;
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#ifdef GGML_CUDA_FORCE_MMQ
+    return MMQ_DP4A_MAX_BATCH_SIZE;
+#else // GGML_CUDA_FORCE_MMQ
+    return 128;
+#endif // GGML_CUDA_FORCE_MMQ
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+
+    return 64;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // NEW_MMA_AVAILABLE
+}
+
+static constexpr int get_mmq_y_host(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc)  ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
+}
+
+static constexpr __device__ int get_mmq_y_device() {
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA1)
+    return 64;
+#else
+    return 128;
+#endif // defined RDNA1
+#else
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+    return 128;
+#else
+    return 64;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+}
+
+#define MMQ_DP4A_TXS_Q4_0    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_0   + mmq_y/QI4_0,     0}
+#define MMQ_DP4A_TXS_Q4_1    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_1   + mmq_y/QI4_1,     0}
+#define MMQ_DP4A_TXS_Q8_0    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_0 + mmq_y/(QI8_0/2), 0}
+#define MMQ_DP4A_TXS_Q8_0_16 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*4/QI8_0 + mmq_y/(QI8_0/4), 0}
+#define MMQ_DP4A_TXS_Q8_1    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_1 + mmq_y/(QI8_1/2), 0}
+#define MMQ_DP4A_TXS_Q2_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE         + mmq_y,           0}
+#define MMQ_DP4A_TXS_Q3_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y,                                     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q4_K    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_K,                     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q5_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K   + mmq_y/QI5_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q6_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K   + mmq_y/QI6_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
+
+static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
+    return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 :
+        type == GGML_TYPE_Q4_1    ? MMQ_DP4A_TXS_Q4_1 :
+        type == GGML_TYPE_Q5_0    ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_Q5_1    ? MMQ_DP4A_TXS_Q8_1 :
+        type == GGML_TYPE_Q8_0    ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_Q2_K    ? MMQ_DP4A_TXS_Q2_K :
+        type == GGML_TYPE_Q3_K    ? MMQ_DP4A_TXS_Q3_K :
+        type == GGML_TYPE_Q4_K    ? MMQ_DP4A_TXS_Q4_K :
+        type == GGML_TYPE_Q5_K    ? MMQ_DP4A_TXS_Q5_K :
+        type == GGML_TYPE_Q6_K    ? MMQ_DP4A_TXS_Q6_K :
+        type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ2_XS  ? MMQ_DP4A_TXS_Q8_0_16 :
+        type == GGML_TYPE_IQ2_S   ? MMQ_DP4A_TXS_Q8_0_16 :
+        type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ3_S   ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ1_S   ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ4_XS  ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ4_NL  ? MMQ_DP4A_TXS_Q8_0 :
+        tile_x_sizes{0, 0, 0};
+}
+
+#define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0                 + 4)
+#define MMQ_MMA_TILE_X_K_Q8_1 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0                 + 4)
+#define MMQ_MMA_TILE_X_K_Q2_K (2*WARP_SIZE + WARP_SIZE                         + 4)
+#define MMQ_MMA_TILE_X_K_Q3_K (2*WARP_SIZE + WARP_SIZE/2                       + 4)
+#define MMQ_MMA_TILE_X_K_Q6_K (2*WARP_SIZE + WARP_SIZE/QI6_K     + WARP_SIZE/8 + 7)
+
+static_assert(MMQ_MMA_TILE_X_K_Q8_0 % 8 == 4, "Wrong padding.");
+static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
+static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
+static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
+static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
+
+static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
+    return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_Q4_1    ? MMQ_MMA_TILE_X_K_Q8_1 :
+        type == GGML_TYPE_Q5_0    ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_Q5_1    ? MMQ_MMA_TILE_X_K_Q8_1 :
+        type == GGML_TYPE_Q8_0    ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_Q2_K    ? MMQ_MMA_TILE_X_K_Q2_K :
+        type == GGML_TYPE_Q3_K    ? MMQ_MMA_TILE_X_K_Q3_K :
+        type == GGML_TYPE_Q4_K    ? MMQ_MMA_TILE_X_K_Q8_1 :
+        type == GGML_TYPE_Q5_K    ? MMQ_MMA_TILE_X_K_Q8_1 :
+        type == GGML_TYPE_Q6_K    ? MMQ_MMA_TILE_X_K_Q6_K :
+        type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ2_XS  ? MMQ_MMA_TILE_X_K_Q3_K :
+        type == GGML_TYPE_IQ2_S   ? MMQ_MMA_TILE_X_K_Q3_K :
+        type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ3_S   ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ1_S   ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ4_XS  ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ4_NL  ? MMQ_MMA_TILE_X_K_Q8_0 :
+        0;
+}
+
+#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
+
+static int mmq_get_granularity_host(const int mmq_x, const int cc) {
+    return new_mma_available(cc) && mmq_x >= 48 ? 16 : 8;
+}
+
+#ifdef NEW_MMA_AVAILABLE
+static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) {
+    return mmq_x >= 48 ? 16 : 8;
+}
+#else
+static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */) {
+    return 8;
+}
+#endif // NEW_MMA_AVAILABLE
+
+// ------------------------------------------------------------
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + 2*WARP_SIZE);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = threadIdx.x / QI4_0;
+    const int kqsx = threadIdx.x % QI4_0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx;
+        const int qs0 = get_int_b2(bxi->qs, kqsx);
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + 0]     = __vsubss4((qs0 >> 0) & 0x0F0F0F0F, 0x08080808);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + QI4_0] = __vsubss4((qs0 >> 4) & 0x0F0F0F0F, 0x08080808);
+#else
+        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + threadIdx.y * QI4_0 + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0       + kbxd] = bxi->d;
+#else
+        x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + kbxd] = bxi->d;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_0*VDR_Q4_0_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2);
+
+                int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+                for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+                    u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + kyqs +  l];
+                    u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + kyqs + (l + QI4_0)];
+                }
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+                    (&x_qs[i*(WARP_SIZE + 1) + k0/QR4_0], u,
+                     x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/(QR4_0*QI4_0)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = threadIdx.x / QI4_1;
+    const int kqsx = threadIdx.x % QI4_1;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx;
+        const int qs0 = get_int_b4(bxi->qs, kqsx);
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + 0]     = (qs0 >> 0) & 0x0F0F0F0F;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + QI4_1] = (qs0 >> 4) & 0x0F0F0F0F;
+#else
+        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+        int i = i0 + threadIdx.y * QI4_1 + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_dm[i*MMQ_MMA_TILE_X_K_Q8_1       + kbxd] = bxi->dm;
+#else
+        x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + kbxd] = bxi->dm;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_1*VDR_Q4_1_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const int kyqs = QI8_1 * ((k01/2) / (QI8_1/2)) + (k01/2) % (QI8_1/2);
+
+                int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+                for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+                    u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + kyqs +  l];
+                    u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + kyqs + (l + QI4_1)];
+                }
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+                    (&x_qs[i*(WARP_SIZE + 1) + k0/QR4_1], u,
+                     x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/(QR4_1*QI4_1)], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_0, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = threadIdx.x / QI5_0;
+    const int kqsx = threadIdx.x % QI5_0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbx;
+
+        const int ql = get_int_b2(bxi->qs, kqsx);
+        const int qh = get_int_b2(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
+        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
+        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx + 0]     = qs0;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1;
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + kbx*(2*QI5_0) + kqsx + 0]     = qs0;
+        x_qs[i*(2*WARP_SIZE + 1)     + kbx*(2*QI5_0) + kqsx + QI5_0] = qs1;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+        int i = i0 + threadIdx.y * QI5_0 + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0       + kbxd] = bxi->d;
+#else
+        x_df[i*(WARP_SIZE/QI5_0) + i/QI5_0 + kbxd] = bxi->d;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = threadIdx.x / QI5_1;
+    const int kqsx = threadIdx.x % QI5_1;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbx;
+
+        const int ql = get_int_b4(bxi->qs, kqsx);
+        const int qh = get_int_b4(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_1));
+
+        int qs0 = (ql >>  0) & 0x0F0F0F0F;
+        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
+
+        int qs1 = (ql >>  4) & 0x0F0F0F0F;
+        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx + 0]     = qs0;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1;
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + kbx*(2*QI5_1) + kqsx + 0]     = qs0;
+        x_qs[i*(2*WARP_SIZE + 1)     + kbx*(2*QI5_1) + kqsx + QI5_1] = qs1;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+        int i = i0 + threadIdx.y * QI5_1 + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_dm[i*MMQ_MMA_TILE_X_K_Q8_1       + kbxd] = bxi->dm;
+#else
+        x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + kbxd] = bxi->dm;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_tile + 2*WARP_SIZE);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = threadIdx.x / QI8_0;
+    const int kqsx = threadIdx.x % QI8_0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbx;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 0         + threadIdx.x] = get_int_b2(bxi[0].qs,               kqsx);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + WARP_SIZE + threadIdx.x] = get_int_b2(bxi[WARP_SIZE/QI8_0].qs, kqsx);
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + 0         + threadIdx.x] = get_int_b2(bxi[0].qs,               kqsx);
+        x_qs[i*(2*WARP_SIZE + 1)     + WARP_SIZE + threadIdx.x] = get_int_b2(bxi[WARP_SIZE/QI8_0].qs, kqsx);
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = 2*WARP_SIZE / QI8_0;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0/2) {
+        int i = i0 + threadIdx.y * (QI8_0/2) + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0             + kbxd] = bxi->d;
+#else
+        x_df[i*(2*WARP_SIZE/QI8_0) + i/(QI8_0/2) + kbxd] = bxi->d;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += VDR_Q8_0_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
+                    (&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0 % WARP_SIZE],
+                     x_df[i*(2*WARP_SIZE/QI8_0) + i/(QI8_0/2) + k0/QI8_0], y_df[j*MMQ_TILE_Y_K + (k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps, mmq_q8_1_ds_layout ds_layout>
+static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    typedef mma_A_I16K8<int> mma_A;
+    typedef mma_B_J8K8<int>  mma_B;
+    typedef mma_C_I16J8<int> mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + 2*WARP_SIZE;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+    const half2 * y_ds = (const half2 *) y;
+
+    mma_A A[ntx][WARP_SIZE/QI8_0];
+    float dA[ntx][mma_C::ne/2][WARP_SIZE/QI8_0];
+
+    const int i0 = (threadIdx.y/ntx)*rows_per_warp;
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
+            const int k0 = k00 + k01;
+
+            A[n][k01/QI8_0].load_ldmatrix(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
+        }
+
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int i = i0 + n*mma_A::I + mma_C::get_i(2*l);
+
+#pragma unroll
+            for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
+                const int k0 = k00 + k01;
+
+                dA[n][l][k01/QI8_0] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0];
+            }
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
+            mma_B  B;
+            float dB[mma_C::ne/2];
+
+            B.load_generic(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); // faster than load_ldmatrix
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int j = j0 + mma_C::get_j(l);
+
+                if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
+                    dB[l] =             y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+                } else {
+                    dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                }
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma_C C;
+                C.mma(A[n][k01/QI8_0], B);
+
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += C.x[l]*dA[n][l/2][k01/QI8_0]*dB[l%2];
+                }
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += VDR_Q8_0_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+                    (&x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI8_1], y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    typedef mma_A_I16K8<int> mma_A;
+    typedef mma_B_J8K8<int>  mma_B;
+    typedef mma_C_I16J8<int> mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + 2*WARP_SIZE;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_dm = (const half2 *) y;
+
+    mma_A    A[ntx][WARP_SIZE/QI8_1];
+    float2 dmA[ntx][mma_C::ne/2][WARP_SIZE/QI8_1];
+
+    const int i0 = (threadIdx.y/ntx)*rows_per_warp;
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
+            const int k0 = k00 + k01;
+
+            A[n][k01/QI8_1].load_ldmatrix(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
+        }
+
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int i = i0 + n*mma_A::I + mma_C::get_i(2*l);
+
+#pragma unroll
+            for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
+                const int k0 = k00 + k01;
+
+                dmA[n][l][k01/QI8_1] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]);
+            }
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
+            mma_B    B;
+            float2 dsB[mma_C::ne/2];
+
+            B.load_generic(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); // faster than load_ldmatrix
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int j = j0 + mma_C::get_j(l);
+
+                dsB[l] = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma_C C;
+                C.mma(A[n][k01/QI8_1], B);
+
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dmA[n][l/2][k01/QI8_1].x*dsB[l%2].x*C.x[l];
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dmA[n][l/2][k01/QI8_1].y*dsB[l%2].y;
+                }
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_16_q8_1_impl<QI8_0>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0],
+                    &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + k0/(QI8_0/2)],
+                    y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+#ifdef NEW_MMA_AVAILABLE
+
+    typedef mma_A_I16K4<int> mma_A;
+    typedef mma_A_I16K8<int> mma_A_K8;
+    typedef mma_B_J8K4<int>  mma_B;
+    typedef mma_C_I16J8<int> mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + WARP_SIZE*2;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
+
+    mma_A   A[ntx][8];
+    float  dA[ntx][mma_C::ne/2][8];
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
+            const int k0 = k00 + k01;
+
+            ((mma_A_K8 *) A[n])[k01/8].load_ldmatrix(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
+        }
+
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
+
+#pragma unroll
+            for (int k01 = 0; k01 < WARP_SIZE; k01 += 4) {
+                const int k0 = k00 + k01;
+
+                dA[n][l][k01/4] = x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4];
+            }
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
+            mma_B B[2];
+            float dB[mma_C::ne/2];
+
+            // Here load_generic is faster than load_ldmatrix.
+            B[0].load_generic(y_qs + j0*MMQ_TILE_Y_K + (k01 + 0),        MMQ_TILE_Y_K);
+            B[1].load_generic(y_qs + j0*MMQ_TILE_Y_K + (k01 + mma_B::K), MMQ_TILE_Y_K);
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int j = j0 + mma_C::get_j(l);
+
+                dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma_C C[2];
+                C[0].mma(A[n][k01/4 + 0], B[0]);
+                C[1].mma(A[n][k01/4 + 1], B[1]);
+
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dB[l%2]*(C[0].x[l]*dA[n][l/2][k01/4 + 0] + C[1].x[l]*dA[n][l/2][k01/4 + 1]);
+                }
+            }
+        }
+    }
+#else
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % QI2_K;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI2_K) {
+        int i = i0 + threadIdx.y*(WARP_SIZE/QI2_K) + threadIdx.x/QI2_K;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride;
+
+        const int x_ql_0 = get_int_b2(bxi->qs, kqsx);
+
+#pragma unroll
+        for (int l = 0; l < QR2_K; ++l) {
+            const int k = (kqsx/8)*32 + l*8 + kqsx % 8;
+
+            const int x_qs_k = (x_ql_0 >> (2*l)) & 0x03030303;
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q2_K + k] = x_qs_k;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + k] = x_qs_k;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const int sc_m = bxi->scales[kqsx];
+#ifdef FAST_FP16_AVAILABLE
+        const half2 x_dm_ik = __hmul2(bxi->dm, make_half2(sc_m & 0x0F, sc_m >> 4));
+#else
+        const float2 bxi_dmf = __half22float2(bxi->dm);
+        const half2 x_dm_ik = make_half2(bxi_dmf.x*(sc_m & 0x0F), bxi_dmf.y*(sc_m >> 4));
+#endif // FAST_FP16_AVAILABLE
+
+#ifdef NEW_MMA_AVAILABLE
+        x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + kqsx] = x_dm_ik;
+#else
+        x_dm[i*(WARP_SIZE + 1)       + kqsx] = x_dm_ik;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+    float2 y_df[mmq_x/nwarps];
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+
+        y_df[j0/nwarps] = __half22float2(y_ds[j*MMQ_TILE_Y_K]);
+    }
+
+#pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                if (k01 < WARP_SIZE/2) {
+                    constexpr int ns = 2;
+                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+                } else {
+                    constexpr int ns = 1;
+                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+                }
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+#ifdef NEW_MMA_AVAILABLE
+
+    typedef mma_A_I16K4<int> mma_A;
+    typedef mma_A_I16K8<int> mma_A_K8;
+    typedef mma_B_J8K4<int>  mma_B;
+    typedef mma_C_I16J8<int> mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + WARP_SIZE*2;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
+
+    mma_A   A[ntx][8];
+    float  dA[ntx][mma_C::ne/2][8];
+    float  mA[ntx][mma_C::ne/2][8];
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
+            const int k0 = k00 + k01;
+
+            ((mma_A_K8 *) A[n])[k01/QI8_1].load_ldmatrix(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
+        }
+    }
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
+
+#pragma unroll
+            for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1/2) {
+                const int k0 = k00 + k01;
+
+                const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/(QI8_1/2)]);
+
+                dA[n][l][k01/(QI8_1/2)] = dm.x;
+                mA[n][l][k01/(QI8_1/2)] = dm.y;
+            }
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+        float2 dB[mma_C::ne/2];
+
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int j = j0 + mma_C::get_j(l);
+
+            dB[l] = __half22float2(y_ds[j*MMQ_TILE_Y_K]);
+        }
+
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
+            mma_B B[2];
+
+            // Here load_generic is faster than load_ldmatrix.
+            B[0].load_generic(y_qs + j0*MMQ_TILE_Y_K + (k01 + 0),        MMQ_TILE_Y_K);
+            B[1].load_generic(y_qs + j0*MMQ_TILE_Y_K + (k01 + mma_B::K), MMQ_TILE_Y_K);
+
+            mma_C Cm[2];
+            if (k01 >= WARP_SIZE * 3/4) {
+                mma_A A1;
+                A1.x[0] = 0x01010101;
+                A1.x[1] = 0x01010101;
+                Cm[0].mma(A1, B[0]);
+                Cm[1].mma(A1, B[1]);
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma_C Cd[2];
+
+                Cd[0].mma(A[n][k01/4 + 0], B[0]);
+                Cd[1].mma(A[n][k01/4 + 1], B[1]);
+
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    float tmp = Cd[0].x[l]*dA[n][l/2][k01/4 + 0] + Cd[1].x[l]*dA[n][l/2][k01/4 + 1];
+                    if (k01 >= WARP_SIZE * 3/4) {
+                        tmp -= Cm[0].x[l]*mA[n][l/2][k01/4 + 0] + Cm[1].x[l]*mA[n][l/2][k01/4 + 1];
+                    }
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += tmp*(k01 < WARP_SIZE/2 ? dB[l%2].x : dB[l%2].y);
+                }
+            }
+        }
+
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE * 3/4; k01 += QI8_1) {
+            float2 sB[mma_C::ne/2];
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int j = j0 + mma_C::get_j(l);
+
+                sB[l] = __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] -= mA[n][l/2][k01/4 + 0]*sB[l%2].x;
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] -= mA[n][l/2][k01/4 + 1]*sB[l%2].y;
+                }
+            }
+        }
+    }
+#else
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+    int   * x_sc = (int   *) (x_df + txs.dm);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % QI3_K;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI3_K) {
+        int i = i0 + threadIdx.y * (WARP_SIZE/QI3_K) + threadIdx.x / QI3_K;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
+
+        const int x_ql_0 = get_int_b2(bxi->qs,    kqsx);
+        const int x_qh_0 = get_int_b2(bxi->hmask, kqsx % (QI3_K/2)) >> (4 * (kqsx / (QI3_K/2)));
+
+#pragma unroll
+        for (int l = 0; l < QR3_K; ++l) {
+            const int k = (kqsx/8)*32 + l*8 + kqsx % 8;
+
+            const int x_ql_k =  (x_ql_0 >> (2*l))       & 0x03030303;
+            const int x_qh_k = ((x_qh_0 >>    l)  << 2) & 0x04040404;
+
+            const int x_qs_k = __vsubss4(x_ql_k | x_qh_k, 0x04040404);
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + k] = x_qs_k;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + k] = x_qs_k;
+#endif // NEW_MMA_AVAILABLE
+        }
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*8) {
+        int i = i0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
+
+        const int ksc = threadIdx.x % (WARP_SIZE/8);
+
+        const int ksc_low = ksc % (QI3_K/8);
+        const int shift_low = 4 * (ksc / (QI3_K/8));
+        const int sc_low = (get_int_b2(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+        const int ksc_high = QI3_K/8;
+        const int shift_high = 2 * ksc;
+        const int sc_high = ((get_int_b2(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
+
+#ifdef NEW_MMA_AVAILABLE
+        const int8_t * sc8 = (const int8_t *) &sc;
+        const float d = bxi->d;
+
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
+        }
+#else
+        x_sc[i*(WARP_SIZE/8) + i/8 + threadIdx.x % (WARP_SIZE/8)] = sc;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+#ifndef NEW_MMA_AVAILABLE
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*WARP_SIZE) {
+        int i = (i0 + threadIdx.y*WARP_SIZE + threadIdx.x) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
+
+        x_df[i] = bxi->d;
+    }
+#endif // NEW_MMA_AVAILABLE
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + txs.qs;
+    const int   * x_sc = (const int   *) x_df + txs.dm;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const int8_t * scales = ((const int8_t *) (x_sc + i*(WARP_SIZE/8) + i/8)) + k0/4;
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], scales,
+                    x_df[i], y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, const int ksc) {
+    // scale arrangement after the following two lines:
+    //   - ksc == 0: sc0, sc1, sc2, sc3
+    //   - ksc == 1: sc4, sc5, sc6, sc7
+    //   - ksc == 2:  m0,  m1,  m2,  m3
+    //   - ksc == 3:  m4,  m5,  m6,  m7
+    return ((scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F) | // lower 4 bits
+           ((scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030);  // upper 2 bits
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + txs.qs);
+    int   * x_sc = (int   *) (x_dm + txs.dm);
+#endif // NEW_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+        const int qs0 = get_int_b4(bxi->qs, threadIdx.x);
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F;
+#else
+        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+#ifdef NEW_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*16) {
+        int i = (i0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+
+        const int * scales = (const int *) bxi->scales;
+        const int ksc = threadIdx.x % (WARP_SIZE/16);
+
+        const int sc32 = unpack_scales_q45_K(scales, ksc + 0);
+        const int  m32 = unpack_scales_q45_K(scales, ksc + 2);
+
+        const uint8_t * sc8 = (const uint8_t *) &sc32;
+        const uint8_t *  m8 = (const uint8_t *)  &m32;
+
+        const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
+
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
+        }
+    }
+
+#else
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*QI4_K) {
+        int i = (i0 + threadIdx.y*QI4_K + threadIdx.x) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+
+        x_dm[i] = bxi->dm;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI4_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = threadIdx.x % (WARP_SIZE/8);
+        const int scales8 = unpack_scales_q45_K(scales, ksc);
+
+        x_sc[i*(WARP_SIZE/8) + i/8 + ksc] = scales8;
+    }
+#endif // NEW_MMA_AVAILABLE
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
+    const int   * x_sc = (const int   *) x_dm + txs.dm;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_K*VDR_Q4_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const uint8_t * sc = (const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/32] + 2*(k01/16);
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
+                    &x_qs[i*(WARP_SIZE + 1) + k0/2], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8,
+                    x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_dm = (half2 *) (x_qs + txs.qs);
+    int   * x_sc = (int   *) (x_dm + txs.dm);
+#endif // NEW_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+        const int ky = QR5_K*threadIdx.x;
+
+        const int ql = get_int_b4(bxi->qs, threadIdx.x);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_b4(bxi->qh, threadIdx.x % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+        const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0;
+        const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + QI5_K/4;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq0] = ql0 | qh0;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq1] = ql1 | qh1;
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + kq0] = ql0 | qh0;
+        x_qs[i*(2*WARP_SIZE + 1)     + kq1] = ql1 | qh1;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+#ifdef NEW_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*16) {
+        int i = (i0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+
+        const int * scales = (const int *) bxi->scales;
+        const int ksc = threadIdx.x % (WARP_SIZE/16);
+
+        const int sc32 = unpack_scales_q45_K(scales, ksc + 0);
+        const int  m32 = unpack_scales_q45_K(scales, ksc + 2);
+
+        const uint8_t * sc8 = (const uint8_t *) &sc32;
+        const uint8_t *  m8 = (const uint8_t *)  &m32;
+
+        const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
+
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
+        }
+    }
+
+#else
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*QI5_K) {
+        int i = (i0 + threadIdx.y*QI5_K + threadIdx.x) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+
+        x_dm[i] = bxi->dm;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*8) {
+        int i = (i0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = threadIdx.x % (WARP_SIZE/8);
+        const int scales8 = unpack_scales_q45_K(scales, ksc);
+
+        x_sc[i*(WARP_SIZE/8) + i/8 + ksc] = scales8;
+    }
+#endif // NEW_MMA_AVAILABLE
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + txs.qs;
+    const int   * x_sc = (const int   *) x_dm + txs.dm;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR5_K*VDR_Q5_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k00/32]) + 2*(k01/16);
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
+                    &x_qs[i*(QR5_K*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8,
+                    x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+    int   * x_sc = (int   *) (x_df + WARP_SIZE/QI6_K);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+    int   * x_sc = (int   *) (x_df + txs.dm);
+#endif // NEW_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride;
+
+        const int ql = get_int_b2(bxi->ql, threadIdx.x);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_b2(bxi->qh, (QI6_K/4) * (threadIdx.x / (QI6_K/2)) + threadIdx.x % (QI6_K/4));
+        const int qh0 = ((qh >> ((threadIdx.x & 0x08) >> 2)) << 4) & 0x30303030;
+        const int qh1 =  (qh >> ((threadIdx.x & 0x08) >> 2))       & 0x30303030;
+
+        const int kq0 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + 0;
+        const int kq1 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + QI6_K/2;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
+        x_qs[i*(2*WARP_SIZE + 1)     + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K;  // == 1 if QK_K == 256
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + threadIdx.y * QI6_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q6_K       + kbxd] = bxi->d;
+#else
+        x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K + kbxd] = bxi->d;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / 4;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + threadIdx.x % (WARP_SIZE/8)] = get_int_b2(bxi->scales, threadIdx.x % (QI6_K/8));
+#else
+        x_sc[i*(WARP_SIZE/8) + i/8   + threadIdx.x % (WARP_SIZE/8)] = get_int_b2(bxi->scales, threadIdx.x % (QI6_K/8));
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + txs.qs;
+    const int   * x_sc = (const int   *) x_df + txs.dm;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR6_K*VDR_Q6_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]);
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
+                    &x_qs[i*(QR6_K*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc,
+                    x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+#ifdef NEW_MMA_AVAILABLE
+
+    typedef mma_A_I16K4<int> mma_A;
+    typedef mma_B_J8K4<int>  mma_B;
+    typedef mma_C_I16J8<int> mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + WARP_SIZE*2;
+    const int   * x_sc = (const int   *) x_df + WARP_SIZE/QI6_K;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
+
+    mma_A   A[ntx][8];
+    int   scA[ntx][mma_C::ne/2][8];
+    float  dA[ntx][mma_C::ne/2];
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
+            const int k0 = k00 + k01;
+
+            A[n][k01/4 + 0].load_ldmatrix(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + 0),        MMQ_MMA_TILE_X_K_Q6_K);
+            A[n][k01/4 + 1].load_ldmatrix(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q6_K + (k0 + mma_A::K), MMQ_MMA_TILE_X_K_Q6_K);
+        }
+
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += 16) {
+            const int k0 = k00 + k01;
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
+
+                const int      sc_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q6_K + k0/16];
+                const int8_t * sc        = (const int8_t *) &sc_packed;
+
+#pragma unroll
+                for (int ksc = 0; ksc < sizeof(int); ++ksc) {
+                    scA[n][l][k01/4 + ksc] = sc[ksc];
+                }
+            }
+        }
+
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
+
+            dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q6_K];
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+        float tmp[ntx][mma_C::ne] = {{0.0f}};
+
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
+            mma_B B[2];
+            float dB[mma_C::ne/2];
+
+            // Here load_generic is faster than load_ldmatrix.
+            B[0].load_generic(y_qs + j0*MMQ_TILE_Y_K + 0        + k01, MMQ_TILE_Y_K);
+            B[1].load_generic(y_qs + j0*MMQ_TILE_Y_K + mma_B::K + k01, MMQ_TILE_Y_K);
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int j = j0 + mma_C::get_j(l);
+
+                dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma_C C[2];
+                C[0].mma(A[n][k01/4 + 0], B[0]);
+                C[1].mma(A[n][k01/4 + 1], B[1]);
+
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    tmp[n][l] += (C[0].x[l]*scA[n][l/2][k01/4 + 0] + C[1].x[l]*scA[n][l/2][k01/4 + 1])*dB[l%2];
+                }
+            }
+        }
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+            for (int l = 0; l < mma_C::ne; ++l) {
+                sum[(j0/mma_C::J + n)*mma_C::ne + l] += tmp[n][l]*dA[n][l/2];
+            }
+        }
+    }
+#else
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    NO_DEVICE_CODE;
+#endif // NEW_MMA_AVAILABLE
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_nl(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_NL, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = threadIdx.x / QI4_NL;
+    const int kqsx = threadIdx.x % QI4_NL;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq4_nl * bxi = (const block_iq4_nl *) x + kbx0 + i*stride + kbx;
+
+        const int aux_q4 = get_int_b2(bxi->qs, kqsx);
+        const int2 v = get_int_from_table_16(aux_q4);
+        const int k0 = 8 * (threadIdx.x / 4) + threadIdx.x % 4;
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 4] = v.y;
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + k0 + 0] = v.x;
+        x_qs[i*(2*WARP_SIZE + 1)     + k0 + 4] = v.y;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_NL;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_NL) {
+        int i = i0 + threadIdx.y * QI4_NL + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq4_nl * bxi = (const block_iq4_nl *) x + kbx0 + i*stride + kbxd;
+
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kbxd] = __half2float(bxi->d);
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_XXS, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI2_XXS/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XXS/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XXS) + threadIdx.x/(QI2_XXS/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq2_xxs * bxi = (const block_iq2_xxs *) x + kbx0 + i*stride;
+
+        const int q2 = get_int_b2(bxi->qs, 2*kqsx+0);
+        const uint8_t * aux8 = (const uint8_t *) &q2;
+        const uint32_t aux32 = get_int_b2(bxi->qs, 2*kqsx+1);
+
+#pragma unroll
+        for (int l = 0; l < QR2_XXS; ++l) {
+            const int * grid_pos = (const int *) (iq2xxs_grid + aux8[l]);
+            const int signs_packed = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];
+
+            const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
+            const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+
+            const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
+            const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid1;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid0;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid1;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const int ls = aux32 >> 28;
+        const float d = bxi->d;
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kqsx] = (ls*d + d/2)/4;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI2_XS/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XS/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XS) + threadIdx.x/(QI2_XS/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq2_xs * bxi = (const block_iq2_xs *) x + kbx0 + i*stride;
+
+        const int2 q2_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
+        const uint16_t * q2 = (const uint16_t *) &q2_packed;
+
+    #pragma unroll
+        for (int l = 0; l < QR2_XS; ++l) {
+            const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l] & 0x000001FF));
+            const uint32_t * signs    = (const uint32_t *)(ksigns64   + (q2[l] >> 9));
+
+            const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
+            const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid_h;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const int ls = bxi->scales[kqsx];
+        const float d = bxi->d;
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#else
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_s(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_S, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI2_S/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_S/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_S) + threadIdx.x/(QI2_S/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq2_s * bxi = (const block_iq2_s *) x + kbx0 + i*stride;
+
+        const int       qs_packed = get_int_b2(bxi->qs, kqsx);
+        const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+        const int qh = bxi->qh[kqsx];
+
+        const int       signs_packed_32 = get_int_b2(bxi->qs, QK_K/32 + kqsx);
+        const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;
+
+#pragma unroll
+        for (int l = 0; l < QR2_S; ++l) {
+            const int * grid_pos = (const int *)(iq2s_grid + (qs[l] | ((qh << (8-2*l)) & 0x300)));
+
+            const int signs0 = __vcmpne4(((signs_packed_8[l] & 0x03) << 7) | ((signs_packed_8[l] & 0x0C) << 21), 0x00000000);
+            const int signs1 = __vcmpne4(((signs_packed_8[l] & 0x30) << 3) | ((signs_packed_8[l] & 0xC0) << 17), 0x00000000);
+
+            const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
+            const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid_h;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const int ls = bxi->scales[kqsx];
+        const float d = bxi->d;
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#else
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_xxs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_XXS, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI3_XXS/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_XXS/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI3_XXS) + threadIdx.x/(QI3_XXS/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq3_xxs * bxi = (const block_iq3_xxs *) x + kbx0 + i*stride;
+
+        const int2 q3_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
+        const uint8_t * q3 = (const uint8_t *) &q3_packed;
+        const uint32_t aux32 = get_int_b2(bxi->qs, QK_K/16 + kqsx);
+
+#pragma unroll
+        for (int l = 0; l < QR3_XXS; ++l) {
+            const int2 grid_pos = make_int2(iq3xxs_grid[q3[2*l+0]], iq3xxs_grid[q3[2*l+1]]);
+
+            const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));
+
+            const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
+            const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid_h;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const int ls = aux32 >> 28;
+        const float d = bxi->d;
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/2;
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kqsx] = (ls*d + d/2)/2;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_s(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI3_S/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_S/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI3_S) + threadIdx.x/(QI3_S/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq3_s * bxi = (const block_iq3_s *) x + kbx0 + i*stride;
+
+        const int2      qs_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
+        const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+        const int qh = bxi->qh[kqsx];
+
+        const int       signs_packed_32 = get_int_b2(bxi->signs, kqsx);
+        const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;
+
+#pragma unroll
+        for (int l = 0; l < QR3_S; ++l) {
+            const int2 grid_pos = make_int2(
+                iq3s_grid[qs[2*l+0] | ((qh << (8 - 2*l)) & 0x100)],
+                iq3s_grid[qs[2*l+1] | ((qh << (7 - 2*l)) & 0x100)]);
+
+            const int signs0 = __vcmpne4(((signs_packed_8[l] & 0x03) << 7) | ((signs_packed_8[l] & 0x0C) << 21), 0x00000000);
+            const int signs1 = __vcmpne4(((signs_packed_8[l] & 0x30) << 3) | ((signs_packed_8[l] & 0xC0) << 17), 0x00000000);
+
+            const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
+            const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+1)] = grid_h;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const int ls = 1 + 2*((bxi->scales[kqsx/2] >> (((2*kqsx) << 1) & 0x04)) & 0x0F);
+        const float d = bxi->d;
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = ls*d;
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kqsx] = ls*d;
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq1_s(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_ds = (half2 *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_ds = (half2 *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % QI1_S;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI1_S) {
+        int i = i0 + threadIdx.y*(WARP_SIZE/QI1_S) + threadIdx.x/QI1_S;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq1_s * bxi = (const block_iq1_s *) x + kbx0 + i*stride;
+
+        const int       qs_packed = get_int_b2(bxi->qs, kqsx);
+        const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+        const int qh = bxi->qh[kqsx];
+
+    #pragma unroll
+        for (int l = 0; l < QR1_S/2; ++l) {
+            const int grid = iq1s_grid_gpu[qs[l] | (((qh >> (3*l)) & 0x07) << 8)];
+
+            const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+            const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+#ifdef NEW_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+0)] = grid0;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+1)] = grid1;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+0)] = grid0;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+1)] = grid1;
+#endif // NEW_MMA_AVAILABLE
+        }
+
+        const float  d1q   = __half2float(bxi->d) * (((qh >> 11) & 0x0E) + 1);
+        const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+
+#ifdef NEW_MMA_AVAILABLE
+        x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta);
+#else
+        x_ds[i*(WARP_SIZE/4) + i/4   + kqsx] = make_half2(d1q, d1q*delta);
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_xs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef NEW_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // NEW_MMA_AVAILABLE
+
+    const int kbx  = 0;           // threadIdx.x / QI4_XS
+    const int kqsx = threadIdx.x; // threadIdx.x % QI4_XS
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq4_xs * bxi = (const block_iq4_xs *) x + kbx0 + i*stride + kbx;
+
+        const int aux_q4 = get_int_b4(bxi->qs, kqsx);
+        const int2 v = get_int_from_table_16(aux_q4);
+        const int k0 = 8 * (threadIdx.x / 4) + threadIdx.x % 4;
+#ifdef NEW_MMA_AVAILABLE
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 4] = v.y;
+#else
+        x_qs[i*(2*WARP_SIZE + 1)     + k0 + 0] = v.x;
+        x_qs[i*(2*WARP_SIZE + 1)     + k0 + 4] = v.y;
+#endif // NEW_MMA_AVAILABLE
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq4_xs * bxi = (const block_iq4_xs *) x + kbx0 + i*stride;
+
+        const float d = __half2float(bxi->d);
+
+        const int ls = ((bxi->scales_l[(threadIdx.x % 8)/2] >> (4*(threadIdx.x % 2))) & 0x0F)
+            | (((bxi->scales_h >> (2*(threadIdx.x % 8))) & 0x03) << 4);
+
+#ifdef NEW_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * (ls - 32);
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + threadIdx.x % 8] = d * (ls - 32);
+#endif // NEW_MMA_AVAILABLE
+    }
+}
+
+template<int mmq_x, int mmq_y, int nwarps, bool need_check>
+static __device__ __forceinline__ void mmq_write_back_dp4a(
+    const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+
+        if (j > j_max) {
+            return;
+        }
+
+#pragma unroll
+        for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+
+            if (need_check && i > i_max) {
+                continue;
+            }
+
+            dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
+        }
+    }
+}
+
+template<int mmq_x, int mmq_y, int nwarps, bool need_check>
+static __device__ __forceinline__ void mmq_write_back_mma(
+    const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
+
+    typedef mma_C_I16J8<int> mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    const int i0 = (threadIdx.y / ntx) * (ntx*mma_C::I);
+#ifdef NEW_MMA_AVAILABLE
+    static_assert(nwarps*mma_C::I == mmq_y, "nwarps*mma_C::I != mmq_y");
+#endif // NEW_MMA_AVAILABLE
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+            for (int l = 0; l < mma_C::ne; ++l) {
+                const int j = j0 + (threadIdx.y % ntx) * mma_C::J + mma_C::get_j(l);
+
+                if (j > j_max) {
+                    continue;
+                }
+
+                const int i = i0 + n*mma_C::I + mma_C::get_i(l);
+
+                if (need_check && i > i_max) {
+                    continue;
+                }
+
+                dst[j*stride + i] = sum[(j0/mma_C::J + n)*mma_C::ne + l];
+            }
+        }
+    }
+}
+
+// -------------------------------------------------------------------------------------------------------------------------------------
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check, ggml_type type>
+struct mmq_type_traits;
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
+    static constexpr int              vdr          = VDR_Q4_0_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_0<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_DS4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
+    static constexpr int              vdr          = VDR_Q4_1_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_1<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
+    static constexpr int              vdr          = VDR_Q5_0_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_0<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
+    static constexpr int              vdr          = VDR_Q5_1_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_1<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
+    static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q8_0<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
+    static constexpr int              vdr          = VDR_Q2_K_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q2_K<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q2_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q2_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
+    static constexpr int              vdr          = VDR_Q3_K_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q3_K<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q3_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
+    static constexpr int              vdr          = VDR_Q4_K_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_K<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
+    static constexpr int              vdr          = VDR_Q5_K_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_K<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
+    static constexpr int              vdr          = VDR_Q6_K_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q6_K<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q6_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_XXS> {
+    static constexpr int              vdr          = VDR_IQ2_XXS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xxs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_XS> {
+    static constexpr int              vdr          = VDR_IQ2_XS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_S> {
+    static constexpr int              vdr          = VDR_IQ2_S_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_s<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ3_XXS> {
+    static constexpr int              vdr          = VDR_IQ3_XXS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_xxs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ3_S> {
+    static constexpr int              vdr          = VDR_IQ3_S_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_s<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ1_S> {
+    static constexpr int              vdr          = VDR_IQ1_S_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq1_s<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_NL> {
+    static constexpr int              vdr          = VDR_IQ4_NL_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_nl<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_XS> {
+    static constexpr int              vdr          = VDR_IQ4_XS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_xs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
+static __device__ void mul_mat_q_process_tile(
+    const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
+    const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0,
+    const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) {
+
+    constexpr int              qk         = ggml_cuda_type_traits<type>::qk;
+    constexpr int              mmq_y      = get_mmq_y_device();
+    constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
+
+    extern __shared__ char data_mul_mat_q[];
+    int * tile_y = (int *) data_mul_mat_q;
+    int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE);
+
+#ifdef NEW_MMA_AVAILABLE
+    constexpr vec_dot_mmq_t    vec_dot    = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot_mma;
+    constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
+#else
+    constexpr vec_dot_mmq_t    vec_dot    = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot_dp4a;
+    constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
+#endif // NEW_MMA_AVAILABLE
+
+    constexpr int blocks_per_iter = MMQ_ITER_K / qk;
+
+    float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
+
+    const int tile_x_max_i = ne01 - it*mmq_y - 1;
+    const int tile_y_max_j = ne11 - jt*mmq_x - 1;
+
+    const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
+
+    for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
+        load_tiles(x, tile_x, stride01*it*mmq_y + kb0, tile_x_max_i, stride01);
+
+        {
+            const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
+#pragma unroll
+            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
+                int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
+
+                tile_y[l] = by0[l];
+            }
+        }
+
+        __syncthreads();
+
+        vec_dot(tile_x, tile_y, sum, 0);
+
+        __syncthreads();
+
+        {
+            const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
+#pragma unroll
+            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
+                int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
+
+                tile_y[l] = by0[l];
+            }
+        }
+
+        __syncthreads();
+
+        vec_dot(tile_x, tile_y, sum, WARP_SIZE);
+
+        __syncthreads();
+    }
+
+    if (fixup) {
+        write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
+    } else {
+        write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
+    }
+}
+
+
+// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
+
+template <ggml_type type, int mmq_x, int nwarps, bool need_check>
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+    __launch_bounds__(WARP_SIZE*nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
+#else
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+    __launch_bounds__(WARP_SIZE*nwarps, 1)
+#else
+    __launch_bounds__(WARP_SIZE*nwarps, 2)
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+static __global__ void mul_mat_q(
+    const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
+    const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
+
+    // Skip unused template specializations for faster compilation:
+    if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
+    constexpr int qk    = ggml_cuda_type_traits<type>::qk;
+    constexpr int mmq_y = get_mmq_y_device();
+
+    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+    {
+        constexpr bool fixup = false;
+        mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
+            (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
+                blockIdx.x, blockIdx.y, 0, ne00/qk);
+        return;
+    }
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+
+    const     int64_t blocks_per_ne00 = ne00 / qk;
+    constexpr int     blocks_per_iter = MMQ_ITER_K / qk;
+
+    const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x
+    const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y
+
+    // kbc == k block continuous, current index in continuous ijk space.
+    int64_t kbc      = (int64_t) blockIdx.x     *blocks_per_ne00*ntx*nty / gridDim.x;
+    int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x;
+
+    kbc      -= (kbc      % blocks_per_ne00) % blocks_per_iter;
+    kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
+
+    // kb0 == k index when doing the matrix multiplication for an output tile.
+    int kb0_start = kbc % blocks_per_ne00;
+    int kb0_stop  = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
+    while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
+        const int jt =  kbc /    (blocks_per_ne00*nty);                    // j index of current tile.
+        const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile.
+
+        constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
+        mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
+            (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
+             it, jt, kb0_start, kb0_stop);
+
+        kbc += blocks_per_ne00;
+        kbc -= kbc % blocks_per_ne00;
+
+        kb0_start = 0;
+        kb0_stop  = min(blocks_per_ne00, kbc_stop - kbc);
+    }
+
+    if (kbc >= kbc_stop) {
+        return;
+    }
+
+    const int jt =  kbc /    (blocks_per_ne00*nty);
+    const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
+
+    constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
+    mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
+        (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
+            it, jt, kb0_start, kb0_stop);
+}
+
+
+template <ggml_type type, int mmq_x, int nwarps, bool need_check>
+static __global__ void mul_mat_q_stream_k_fixup(
+    float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) {
+
+    constexpr int     mmq_y           = get_mmq_y_device();
+    constexpr int     qk              = ggml_cuda_type_traits<type>::qk;
+    constexpr int     blocks_per_iter = MMQ_ITER_K / qk;
+    const     int64_t blocks_per_ne00 = ne00 / qk;
+
+    float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
+
+    const int ntx = (ne11 + mmq_x - 1) / mmq_x;
+    const int nty = (ne01 + mmq_y - 1) / mmq_y;
+
+    bool any_fixup = false;
+
+    const int bidx_start = ((blockIdx.y*nty + blockIdx.x)     * block_num_mmq)                           / (gridDim.y*gridDim.x);
+    const int bidx_stop  = ((blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq + gridDim.y*gridDim.x - 1) / (gridDim.y*gridDim.x);
+
+    int64_t kbc_0;
+    int64_t kbc_stop_0 = (int64_t) bidx_start*blocks_per_ne00*ntx*nty / block_num_mmq;
+
+    for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) {
+        kbc_0 = kbc_stop_0;
+        kbc_stop_0 = (int64_t) (bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq;
+
+        const int64_t kbc      = kbc_0      - (kbc_0      % blocks_per_ne00) % blocks_per_iter;
+        const int64_t kbc_stop = kbc_stop_0 - (kbc_stop_0 % blocks_per_ne00) % blocks_per_iter;
+
+        // Skip fixup tile if the MMQ CUDA block never wrote anything to it:
+        if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) {
+            continue;
+        }
+
+        const int jt =  kbc_stop /    (blocks_per_ne00*nty);
+        const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
+
+        // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
+        if (it != blockIdx.x || jt != blockIdx.y) {
+            continue;
+        }
+
+        any_fixup = true;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
+            }
+        }
+    }
+
+    if (!any_fixup) {
+        return;
+    }
+
+    dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y;
+
+    const int i_max = ne01 - blockIdx.x*mmq_y - 1;
+    const int j_max = ne11 - blockIdx.y*mmq_x - 1;
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+        const int j = j0 + threadIdx.y;
+
+        if (j > j_max) {
+            return;
+        }
+
+#pragma unroll
+        for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+
+            if (need_check && i > i_max) {
+                continue;
+            }
+
+            dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
+        }
+    }
+}
+
+struct mmq_args {
+    const char * x; const char * y; float * dst;
+    int64_t ne00; int64_t ne01; int64_t stride01;
+    int64_t ne10; int64_t ne11; int64_t stride11;
+    int64_t ne0;
+    bool use_stream_k;
+};
+
+template<ggml_type type>
+static int mmq_get_shmem(const int mmq_x, const int mmq_y, const int cc) {
+    const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
+    const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
+    const int shmem_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
+    const int shmem_y = mmq_x*sizeof(block_q8_1_mmq);
+    return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int));
+}
+
+template <ggml_type type, int mmq_x>
+static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
+    const int id = ggml_cuda_get_device();
+    const int cc = ggml_cuda_info().devices[id].cc;
+    const int nsm = ggml_cuda_info().devices[id].nsm;
+    const int mmq_y = get_mmq_y_host(cc);
+
+    const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1);
+
+    const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
+
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+    static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
+    if (!shmem_limit_raised[id]) {
+        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+        CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>,  cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+        shmem_limit_raised[id] = true;
+    }
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+
+    const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
+    const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
+    const dim3 block_nums_xy_tiling(nty, ntx, 1);
+
+    if (!args.use_stream_k) {
+        if (args.ne01 % mmq_y == 0) {
+            constexpr bool need_check = false;
+            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
+                (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+        } else {
+            constexpr bool need_check = true;
+            mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
+                (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+        }
+        return;
+    }
+
+    const dim3 block_nums_mmq(nsm, 1, 1);
+
+    ggml_cuda_pool & pool = ctx.pool(id);
+    ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
+
+    if (args.ne01 % mmq_y == 0) {
+        constexpr bool need_check = false;
+
+        mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
+            (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+
+        mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
+            (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
+    } else {
+        constexpr bool need_check = true;
+
+        mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
+            (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
+
+        mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
+            (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
+    }
+}
+
+template <ggml_type type>
+void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
+    const int id    = ggml_cuda_get_device();
+    const int nsm   = ggml_cuda_info().devices[id].nsm;
+    const int cc    = ggml_cuda_info().devices[id].cc;
+    const int smpbo = ggml_cuda_info().devices[id].smpbo;
+
+    const int mmq_x_max = get_mmq_x_max_host(cc);
+    const int mmq_y = get_mmq_y_host(cc);
+    const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
+    const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
+
+    int mmq_x_best  = 0;
+    int nparts_best = INT_MAX;
+
+    for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) {
+        const int granularity = mmq_get_granularity_host(mmq_x, cc);
+
+        if (mmq_x % granularity != 0 || mmq_get_shmem<type>(mmq_x, mmq_y, cc) > smpbo) {
+            continue;
+        }
+
+        const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x;
+        const int nwaves_xy_tiling = ntiles_x*block_num_y;
+        const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling;
+
+        if (nparts < nparts_best) {
+            mmq_x_best  = mmq_x;
+            nparts_best = nparts;
+        }
+    }
+
+    switch (mmq_x_best) {
+        case   8:
+            launch_mul_mat_q<type,   8>(ctx, args, stream);
+            break;
+        case  16:
+            launch_mul_mat_q<type,  16>(ctx, args, stream);
+            break;
+        case  24:
+            launch_mul_mat_q<type,  24>(ctx, args, stream);
+            break;
+        case  32:
+            launch_mul_mat_q<type,  32>(ctx, args, stream);
+            break;
+        case  40:
+            launch_mul_mat_q<type,  40>(ctx, args, stream);
+            break;
+        case  48:
+            launch_mul_mat_q<type,  48>(ctx, args, stream);
+            break;
+        case  56:
+            launch_mul_mat_q<type,  56>(ctx, args, stream);
+            break;
+        case  64:
+            launch_mul_mat_q<type,  64>(ctx, args, stream);
+            break;
+        case  72:
+            launch_mul_mat_q<type,  72>(ctx, args, stream);
+            break;
+        case  80:
+            launch_mul_mat_q<type,  80>(ctx, args, stream);
+            break;
+        case  88:
+            launch_mul_mat_q<type,  88>(ctx, args, stream);
+            break;
+        case  96:
+            launch_mul_mat_q<type,  96>(ctx, args, stream);
+            break;
+        case 104:
+            launch_mul_mat_q<type, 104>(ctx, args, stream);
+            break;
+        case 112:
+            launch_mul_mat_q<type, 112>(ctx, args, stream);
+            break;
+        case 120:
+            launch_mul_mat_q<type, 120>(ctx, args, stream);
+            break;
+        case 128:
+            launch_mul_mat_q<type, 128>(ctx, args, stream);
+            break;
+        default:
+            fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+#define DECL_MMQ_CASE(type)                                                        \
+    template void mul_mat_q_case<type>(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) \
+
+extern DECL_MMQ_CASE(GGML_TYPE_Q4_0);
+extern DECL_MMQ_CASE(GGML_TYPE_Q4_1);
+extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
+extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
+extern DECL_MMQ_CASE(GGML_TYPE_Q8_0);
+extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
+extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
+extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
+extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
+extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
+
+// -------------------------------------------------------------------------------------------------------------------------
+
+void ggml_cuda_op_mul_mat_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
new file mode 100644
index 000000000..f89ed03b5
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmv.cu
@@ -0,0 +1,297 @@
+#include "ggml.h"
+#include "common.cuh"
+#include "mmv.cuh"
+
+template <typename T, typename type_acc, int block_size>
+static __global__ void mul_mat_vec(
+        const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
+        const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
+    const int64_t row       = blockIdx.x;
+    const int64_t channel   = blockIdx.y;
+    const int64_t sample    = blockIdx.z;
+    const int     tid       = threadIdx.x;
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    x   +=  (sample/sample_ratio)*stride_sample_x   + (channel/channel_ratio)*stride_channel_x + row*stride_row;
+    y   +=   sample              *stride_sample_y   +  channel               *stride_channel_y;
+    dst +=   sample              *stride_sample_dst +  channel               *stride_channel_dst;
+
+    const float2 * y2 = (const float2 *) y;
+
+    extern __shared__ char data_mmv[];
+    float * buf_iw = (float *) data_mmv;
+
+    if (block_size > warp_size) {
+        if (tid < warp_size) {
+            buf_iw[tid] = 0.0f;
+        }
+        __syncthreads();
+    }
+
+    float sumf;
+
+    if constexpr (std::is_same<T, half>::value) {
+        const half2 * x2 = (const half2 *) x;
+
+        if (std::is_same<type_acc, float>::value) {
+            sumf = 0.0f;
+
+            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+                const float2 tmpx = __half22float2(x2[col2]);
+                const float2 tmpy = y2[col2];
+                sumf += tmpx.x * tmpy.x;
+                sumf += tmpx.y * tmpy.y;
+            }
+        } else {
+#ifdef FP16_AVAILABLE
+            half2 sumh2 = make_half2(0.0f, 0.0f);
+
+            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+                const float2 tmp = y2[col2];
+                sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
+            }
+
+            sumf = __low2float(sumh2) + __high2float(sumh2);
+#else
+            NO_DEVICE_CODE;
+#endif // FP16_AVAILABLE
+        }
+    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+        const int * x2 = (const int *) x;
+        sumf = 0.0f;
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const int    tmpx = x2[col2];
+            const float2 tmpy = y2[col2];
+            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+        }
+    } else {
+        static_assert(std::is_same<T, void>::value, "unsupported type");
+    }
+
+    sumf = warp_reduce_sum<warp_size>(sumf);
+
+    if (block_size > warp_size) {
+        buf_iw[tid/warp_size] = sumf;
+        __syncthreads();
+        if (tid >= warp_size) {
+            return;
+        }
+        sumf = buf_iw[tid];
+        sumf = warp_reduce_sum<warp_size>(sumf);
+    }
+
+    if (tid != 0) {
+        return;
+    }
+
+    dst[row] = sumf;
+}
+
+template <typename T, typename type_acc>
+static void launch_mul_mat_vec_cuda(
+        const T * x, const float * y, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
+        const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
+        cudaStream_t stream) {
+    GGML_ASSERT(ncols      % 2 == 0);
+    GGML_ASSERT(stride_row % 2 == 0);
+    GGML_ASSERT(nchannels_y % nchannels_x == 0);
+    GGML_ASSERT(nsamples_y  % nsamples_x  == 0);
+    const int64_t channel_ratio = nchannels_y / nchannels_x;
+    const int64_t sample_ratio  = nsamples_y  / nsamples_x;
+    int device;
+    int warp_size;
+
+    CUDA_CHECK(cudaGetDevice(&device));
+    warp_size = ggml_cuda_info().devices[device].warp_size;
+
+    int64_t block_size_best = warp_size;
+    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
+    int64_t max_block_size  = 256;
+    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+        max_block_size = 128;
+    }
+    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
+        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
+        if (niter < niter_best) {
+            niter_best      = niter;
+            block_size_best = block_size;
+        }
+    }
+
+    const int smem = warp_size*sizeof(float);
+    const dim3 block_nums(nrows, nchannels_y, nsamples_y);
+    const dim3 block_dims(block_size_best, 1, 1);
+    switch (block_size_best) {
+        case   32: {
+            mul_mat_vec<T, type_acc,  32><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case   64: {
+            mul_mat_vec<T, type_acc,  64><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case   96: {
+            mul_mat_vec<T, type_acc,  96><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  128: {
+            mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  160: {
+            mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  192: {
+            mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  224: {
+            mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        case  256: {
+            mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
+        } break;
+        default: {
+            GGML_ABORT("fatal error");
+        } break;
+    }
+}
+
+template<typename T>
+static void mul_mat_vec_cuda(
+        const T * x, const float * y, float * dst,
+        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
+        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
+        const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
+        enum ggml_prec prec, cudaStream_t stream) {
+    switch (prec) {
+        case GGML_PREC_DEFAULT: {
+            launch_mul_mat_vec_cuda<T, half>
+                (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
+        } break;
+        case GGML_PREC_F32: {
+            launch_mul_mat_vec_cuda<T, float>
+                (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
+        } break;
+    }
+}
+
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const size_t ts_src0 = ggml_type_size(src0->type);
+    const size_t ts_src1 = ggml_type_size(src1->type);
+    const size_t ts_dst  = ggml_type_size(dst->type);
+
+    GGML_ASSERT(ne11 == 1);
+    GGML_ASSERT(ne12 == ne2);
+    GGML_ASSERT(ne13 == ne3);
+
+    GGML_ASSERT(nb00 == ts_src0);
+    GGML_ASSERT(nb10 == ts_src1);
+    GGML_ASSERT(nb0  == ts_dst);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+    const float * src1_d = (const float *) src1->data;
+    float       *  dst_d = (float       *)  dst->data;
+
+    const int64_t s01 = src0->nb[1] / ts_src0;
+    const int64_t s02 = src0->nb[2] / ts_src0;
+    const int64_t s12 = src1->nb[2] / ts_src1;
+    const int64_t s2  =  dst->nb[2] / ts_dst;
+    const int64_t s03 = src0->nb[3] / ts_src0;
+    const int64_t s13 = src1->nb[3] / ts_src1;
+    const int64_t s3  =  dst->nb[3] / ts_dst;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16: {
+            const half * src0_d = (const half *) src0->data;
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
+        } break;
+        case GGML_TYPE_BF16: {
+            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
+        } break;
+        default:
+            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+    }
+}
+
+void ggml_cuda_op_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    GGML_ASSERT(src1_ncols == 1);
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
+
+
+    // ggml_cuda_op provides single, contiguous matrices
+    const int64_t stride_row         = ne00;
+    const int64_t nchannels_x        = 1;
+    const int64_t nchannels_y        = 1;
+    const int64_t stride_channel_x   = 0;
+    const int64_t stride_channel_y   = 0;
+    const int64_t stride_channel_dst = 0;
+    const int64_t nsamples_x         = 1;
+    const int64_t nsamples_y         = 1;
+    const int64_t stride_sample_x    = 0;
+    const int64_t stride_sample_y    = 0;
+    const int64_t stride_sample_dst  = 0;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16: {
+            const half * src0_d = (const half *) src0_dd_i;
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+                nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
+        } break;
+        case GGML_TYPE_BF16: {
+            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+                nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
+                nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
+        } break;
+        default:
+            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+    }
+
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
diff --git a/ggml/src/ggml-cuda/mmv.cuh b/ggml/src/ggml-cuda/mmv.cuh
new file mode 100644
index 000000000..78a1cd4a6
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmv.cuh
@@ -0,0 +1,12 @@
+#include "common.cuh"
+
+// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
+#define MMV_MAX_ROWS 512
+
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+void ggml_cuda_op_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
new file mode 100644
index 000000000..4fb466ca0
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -0,0 +1,426 @@
+#include "mmvq.cuh"
+#include "vecdotq.cuh"
+
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
+
+static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
+    return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
+        type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
+        type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
+        type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
+        type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
+        type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
+        type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
+        type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
+        type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
+        type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
+        type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
+        type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
+        type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
+        type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
+        type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
+        type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
+        type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
+        type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
+        type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
+        nullptr;
+}
+
+static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
+    return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q4_1    ? VDR_Q4_1_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_0    ? VDR_Q5_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_1    ? VDR_Q5_1_Q8_1_MMVQ :
+        type == GGML_TYPE_Q8_0    ? VDR_Q8_0_Q8_1_MMVQ :
+        type == GGML_TYPE_Q2_K    ? VDR_Q2_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q3_K    ? VDR_Q3_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q4_K    ? VDR_Q4_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q5_K    ? VDR_Q5_K_Q8_1_MMVQ :
+        type == GGML_TYPE_Q6_K    ? VDR_Q6_K_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_XS  ? VDR_IQ2_XS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ2_S   ? VDR_IQ2_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ3_S   ? VDR_IQ3_S_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_NL  ? VDR_IQ4_NL_Q8_1_MMVQ :
+        type == GGML_TYPE_IQ4_XS  ? VDR_IQ4_XS_Q8_1_MMVQ :
+        1;
+}
+
+template <ggml_type type, int ncols_y>
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+// tell the compiler to use as many registers as it wants, see nwarps definition below
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+static __global__ void mul_mat_vec_q(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
+
+    constexpr int qk  = ggml_cuda_type_traits<type>::qk;
+    constexpr int qi  = ggml_cuda_type_traits<type>::qi;
+    constexpr int vdr = get_vdr_mmvq(type);
+
+    constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+    constexpr int nwarps              = 1;
+    constexpr int rows_per_cuda_block = 1;
+#else
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+
+    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+    const     int row0 = rows_per_cuda_block*blockIdx.x;
+    const     int blocks_per_row_x = ncols_x / qk;
+    const     int blocks_per_col_y = nrows_y / QK8_1;
+    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
+
+// partial sum for each thread
+    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
+
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
+
+        // x block quant index when casting the quants to int
+        const int kqs = vdr * (tid % (qi/vdr));
+
+#pragma unroll
+        for (int j = 0; j < ncols_y; ++j) {
+#pragma unroll
+            for (int i = 0; i < rows_per_cuda_block; ++i) {
+                tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+            }
+        }
+    }
+
+    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
+    if (threadIdx.y > 0) {
+#pragma unroll
+        for (int j = 0; j < ncols_y; ++j) {
+#pragma unroll
+            for (int i = 0; i < rows_per_cuda_block; ++i) {
+                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+            }
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y > 0) {
+        return;
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int j = 0; j < ncols_y; ++j) {
+#pragma unroll
+        for (int i = 0; i < rows_per_cuda_block; ++i) {
+#pragma unroll
+            for (int l = 0; l < nwarps-1; ++l) {
+                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+            }
+            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+        }
+
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
+            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+        }
+    }
+}
+
+template <ggml_type type>
+static void mul_mat_vec_q_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
+    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
+
+    int id = ggml_cuda_get_device();
+
+    int64_t nwarps = 1;
+    int64_t rows_per_cuda_block = 1;
+
+    if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+        switch(ncols_y) {
+            case 1:
+                nwarps = 4;
+                rows_per_cuda_block = 1;
+                break;
+            case 2:
+            case 3:
+            case 4:
+                nwarps = 4;
+                rows_per_cuda_block = 2;
+                break;
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                nwarps = 2;
+                rows_per_cuda_block = 2;
+                break;
+            default:
+                GGML_ABORT("fatal error");
+                break;
+        }
+    }
+
+    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
+    const dim3 block_nums(nblocks, 1, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    switch (ncols_y) {
+        case 1:
+            mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 2:
+            mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 3:
+            mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 4:
+            mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 5:
+            mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 6:
+            mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 7:
+            mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 8:
+            mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+static void mul_mat_vec_q4_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q2_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq2_xxs_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq2_xs_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq2_s_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq3_xxs_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq1_s_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq1_m_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq4_nl_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq4_xs_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+static void mul_mat_vec_iq3_s_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
+void ggml_cuda_op_mul_mat_vec_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    int id = ggml_cuda_get_device();
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the kernel writes into
+    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ1_M:
+            mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh
new file mode 100644
index 000000000..d9e42fdd6
--- /dev/null
+++ b/ggml/src/ggml-cuda/mmvq.cuh
@@ -0,0 +1,9 @@
+#include "common.cuh"
+
+#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
+
+void ggml_cuda_op_mul_mat_vec_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
new file mode 100644
index 000000000..f127616ed
--- /dev/null
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -0,0 +1,342 @@
+#include "norm.cuh"
+#include <cstdint>
+
+template <int block_size>
+static __global__ void norm_f32(
+        const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
+        const int64_t stride_sample, const float eps) {
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+
+    const int row       = blockIdx.x;
+    const int channel   = blockIdx.y;
+    const int sample    = blockIdx.z;
+    const int tid       = threadIdx.x;
+
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+
+    float2 mean_var = make_float2(0.0f, 0.0f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[col];
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float2 s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
+    }
+
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[col] = (x[col] - mean) * inv_std;
+    }
+}
+
+template <int block_size>
+static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    // blockIdx.x: num_groups idx
+    // threadIdx.x: block_size idx
+    const int start =     blockIdx.x*group_size + threadIdx.x;
+    const int end   = min(blockIdx.x*group_size + group_size,  ne_elements);
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    const float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        const float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    const float variance = tmp / group_size;
+    const float scale = rsqrtf(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+template <int block_size>
+static __global__ void rms_norm_f32(
+        const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
+        const int64_t stride_sample, const float eps) {
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+
+    const int row       = blockIdx.x;
+    const int channel   = blockIdx.y;
+    const int sample    = blockIdx.z;
+    const int tid       = threadIdx.x;
+
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = rsqrtf(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[col] = scale * x[col];
+    }
+}
+
+template <int block_size>
+static __global__ void rms_norm_back_f32(
+        const float * grad, const float * xf, float * dst, const int ncols, const float eps) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    grad += int64_t(row)*ncols;
+    xf   += int64_t(row)*ncols;
+    dst  += int64_t(row)*ncols;
+
+    float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
+    float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xfi = xf[col];
+        sum_xx += xfi * xfi;
+        sum_xg += xfi * grad[col];
+    }
+
+    // sum up partial sums
+    sum_xx = warp_reduce_sum(sum_xx);
+    sum_xg = warp_reduce_sum(sum_xg);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum_xx[32];
+        __shared__ float s_sum_xg[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum_xx[warp_id] = sum_xx;
+            s_sum_xg[warp_id] = sum_xg;
+        }
+        __syncthreads();
+
+        sum_xx = s_sum_xx[lane_id];
+        sum_xx = warp_reduce_sum(sum_xx);
+
+        sum_xg = s_sum_xg[lane_id];
+        sum_xg = warp_reduce_sum(sum_xg);
+    }
+
+    const float mean_eps = sum_xx / ncols + eps;
+    const float sum_eps  = sum_xx + ncols*eps;
+
+    const float scale_grad = rsqrtf(mean_eps);
+    const float scale_x    = -scale_grad * sum_xg/sum_eps;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[col] = scale_grad*grad[col] + scale_x*xf[col];
+    }
+}
+
+static void norm_f32_cuda(
+        const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
+        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+    }
+}
+
+static void group_norm_f32_cuda(
+        const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    }
+}
+
+static void rms_norm_f32_cuda(
+        const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
+        const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+    }
+}
+
+static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_back_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_back_f32<1024><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
+    }
+}
+
+void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    const size_t ts0 = ggml_type_size(src0->type);
+    GGML_ASSERT(nb00 == ts0);
+    const int64_t s01 = nb01 / ts0;
+    const int64_t s02 = nb02 / ts0;
+    const int64_t s03 = nb03 / ts0;
+
+    norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream);
+}
+
+void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    const size_t ts0 = ggml_type_size(src0->type);
+    GGML_ASSERT(nb00 == ts0);
+    const int64_t s01 = nb01 / ts0;
+    const int64_t s02 = nb02 / ts0;
+    const int64_t s03 = nb03 / ts0;
+
+    rms_norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream);
+}
+
+void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * grad  = dst->src[0]; // gradients
+    const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
+
+    const float * grad_d  = (const float *) grad->data;
+    const float * src0f_d = (const float *) src0f->data;
+    float       * dst_d   = (float       *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(grad));
+
+    GGML_ASSERT( grad->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0f->type == GGML_TYPE_F32);
+    GGML_ASSERT(  dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0f->ne[0];
+    const int64_t nrows = ggml_nrows(src0f);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    rms_norm_back_f32_cuda(grad_d, src0f_d, dst_d, ne00, nrows, eps, stream);
+}
diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh
new file mode 100644
index 000000000..d63d34380
--- /dev/null
+++ b/ggml/src/ggml-cuda/norm.cuh
@@ -0,0 +1,9 @@
+#include "common.cuh"
+
+void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu
new file mode 100644
index 000000000..35154f299
--- /dev/null
+++ b/ggml/src/ggml-cuda/opt-step-adamw.cu
@@ -0,0 +1,78 @@
+#include "ggml-impl.h"
+#include "opt-step-adamw.cuh"
+
+#include <cstdint>
+
+static __global__ void opt_step_adamw_f32(
+    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v,
+    const float * __restrict__ pars, const int64_t k) {
+
+    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const float alpha  = pars[0];
+    const float beta1  = pars[1];
+    const float beta2  = pars[2];
+    const float eps    = pars[3];
+    const float wd     = pars[4];
+    const float beta1h = pars[5];
+    const float beta2h = pars[6];
+
+    const float gi = g[i];
+    const float gmi = g_m[i]*beta1 +    gi*(1.0f - beta1);
+    const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
+
+    g_m[i] = gmi;
+    g_v[i] = gvi;
+
+    const float mh =       gmi*beta1h;
+    const float vh = sqrtf(gvi*beta2h) + eps;
+
+    x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh;
+}
+
+static void opt_step_adamw_f32_cuda(
+    float * x, const float * g, float * g_m, float * g_v, const float * pars, const int64_t k, cudaStream_t stream) {
+
+    const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
+    const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
+    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, pars, k);
+}
+
+void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0         = dst->src[0];
+    const ggml_tensor * src0_grad    = dst->src[1];
+    const ggml_tensor * src0_grad_m  = dst->src[2];
+    const ggml_tensor * src0_grad_v  = dst->src[3];
+    const ggml_tensor * adamw_params = dst->src[4];
+
+    GGML_ASSERT(src0->type         == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad->type    == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_m->type  == GGML_TYPE_F32);
+    GGML_ASSERT(src0_grad_v->type  == GGML_TYPE_F32);
+    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src0_grad));
+    GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
+    GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
+    GGML_ASSERT(ggml_is_contiguous(adamw_params));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
+
+    float       * src0_d         = (float       *) src0->data;
+    const float * src0_grad_d    = (const float *) src0_grad->data;
+    float       * src0_grad_m_d  = (float       *) src0_grad_m->data;
+    float       * src0_grad_v_d  = (float       *) src0_grad_v->data;
+    const float * adamw_params_d = (const float *) adamw_params->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t ne = ggml_nelements(src0);
+
+    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, adamw_params_d, ne, stream);
+}
diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cuh b/ggml/src/ggml-cuda/opt-step-adamw.cuh
new file mode 100644
index 000000000..58d6f6e5d
--- /dev/null
+++ b/ggml/src/ggml-cuda/opt-step-adamw.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256
+
+void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu
new file mode 100644
index 000000000..c9b2b699c
--- /dev/null
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -0,0 +1,68 @@
+#include "out-prod.cuh"
+
+#include <cstdint>
+
+void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    GGML_ASSERT(ne01 == ne11);
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+
+    GGML_ASSERT(ne2 % src0->ne[2] == 0);
+    GGML_ASSERT(ne3 % src0->ne[3] == 0);
+
+    GGML_ASSERT(ne2 == src1->ne[2]);
+    GGML_ASSERT(ne3 == src1->ne[3]);
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       *  dst_d = (float       *)  dst->data;
+
+    cudaStream_t   stream = ctx.stream();
+    cublasHandle_t handle = ctx.cublas_handle();
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+
+    CUBLAS_CHECK(cublasSetStream(handle, stream));
+
+    const int64_t lda = nb01 / sizeof(float);
+    const int64_t ldc = nb1  / sizeof(float);
+
+    const bool src1_T = ggml_is_transposed(src1);
+    const cublasOperation_t src1_cublas_op =  src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
+    const int64_t           ldb            = (src1_T ?        nb10 :        nb11) /  sizeof(float);
+    GGML_ASSERT(                             (src1_T ?        nb11 :        nb10) == sizeof(float));
+
+    // data strides in dimensions 2/3
+    const size_t s02 = nb02 / sizeof(float);
+    const size_t s03 = nb03 / sizeof(float);
+    const size_t s12 = nb12 / sizeof(float);
+    const size_t s13 = nb13 / sizeof(float);
+    const size_t s2  = nb2  / sizeof(float);
+    const size_t s3  = nb3  / sizeof(float);
+
+    // dps == dst per src0, used for group query attention
+    const int64_t dps2 = ne2 / ne02;
+    const int64_t dps3 = ne3 / ne03;
+
+    // TODO batched matrix multiplication
+    for (int64_t i3 = 0; i3 < ne3; ++i3) {
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            CUBLAS_CHECK(
+                cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
+                        ne0, ne1, ne01,
+                        &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
+                                src1_d +  i3      *s13 +  i2      *s12, ldb,
+                        &beta,  dst_d  +  i3      *s3  +  i2      *s2,  ldc));
+        }
+    }
+}
diff --git a/ggml/src/ggml-cuda/out-prod.cuh b/ggml/src/ggml-cuda/out-prod.cuh
new file mode 100644
index 000000000..a0046f5f8
--- /dev/null
+++ b/ggml/src/ggml-cuda/out-prod.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
new file mode 100644
index 000000000..aba539e8d
--- /dev/null
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -0,0 +1,49 @@
+#include "pad.cuh"
+
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_d, dst_d,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
new file mode 100644
index 000000000..8fd386b00
--- /dev/null
+++ b/ggml/src/ggml-cuda/pad.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_PAD_BLOCK_SIZE 256
+
+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/pool2d.cu b/ggml/src/ggml-cuda/pool2d.cu
new file mode 100644
index 000000000..c6d51e4d6
--- /dev/null
+++ b/ggml/src/ggml-cuda/pool2d.cu
@@ -0,0 +1,94 @@
+#include "pool2d.cuh"
+
+template <typename Ti, typename To>
+static  __global__ void pool2d_nchw_kernel(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const Ti* src, To* dst, const enum ggml_op_pool op) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= parallel_elements) {
+        return;
+    }
+
+    const int I_HW = ih * iw;
+    const int O_HW = oh * ow;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / ow;
+    const int cur_ow = idx % O_HW % ow;
+    const Ti* i_ptr = src + nc * I_HW;
+    To* o_ptr = dst + nc * O_HW;
+    const int start_h = cur_oh * sh - ph;
+    const int bh = max(0, start_h);
+    const int eh = min(ih, start_h + kh);
+    const int start_w = cur_ow * sw - pw;
+    const int bw = max(0, start_w);
+    const int ew = min(iw, start_w + kw);
+    const To scale = 1. / (kh * kw);
+    To res = 0;
+
+    switch (op) {
+        case GGML_OP_POOL_AVG: res = 0; break;
+        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+        default: assert(false);
+    }
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+#if __CUDA_ARCH__ >= 350
+            Ti cur = __ldg(i_ptr + i * iw + j);
+#else
+            Ti cur = i_ptr[i * iw + j];
+#endif
+            switch (op) {
+                case GGML_OP_POOL_AVG: res += cur * scale; break;
+                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
+                default: assert(false);
+            }
+        }
+    }
+    o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
+static void pool2d_nchw_kernel_f32_f32_cuda(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const float * src, float * dst, const enum ggml_op_pool op,
+        cudaStream_t stream) {
+
+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
+    dim3 block_nums(num_blocks);
+    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
+}
+
+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+
+    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
+}
diff --git a/ggml/src/ggml-cuda/pool2d.cuh b/ggml/src/ggml-cuda/pool2d.cuh
new file mode 100644
index 000000000..7841292bc
--- /dev/null
+++ b/ggml/src/ggml-cuda/pool2d.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_POOL2D_BLOCK_SIZE 256
+
+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
new file mode 100644
index 000000000..1702e4ce2
--- /dev/null
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -0,0 +1,169 @@
+#include "quantize.cuh"
+#include <cstdint>
+
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
+    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (ix0 >= kx0_padded) {
+        return;
+    }
+
+    const int64_t ix1 = blockIdx.y;
+
+    const int64_t i_padded = ix1*kx0_padded + ix0;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
+    float amax = fabsf(xi);
+    float sum = xi;
+
+    amax = warp_reduce_max(amax);
+    sum = warp_reduce_sum(sum);
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
+}
+
+template <mmq_q8_1_ds_layout ds_layout>
+static __global__ void quantize_mmq_q8_1(
+    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
+
+    constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
+    constexpr int vals_per_sum   = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
+
+    const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
+
+    if (ix0 >= kx0_padded) {
+        return;
+    }
+
+    const float4 * x4 = (const float4 *) x;
+
+    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
+
+    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
+
+    const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
+    const int64_t ib  = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y;                   // block index in channel
+    const int64_t iqs = ix0 % (4*QK8_1);                                            // quant index in block
+
+    // Load 4 floats per thread and calculate max. abs. value between them:
+    const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    float amax = fabsf(xi.x);
+    amax = fmaxf(amax, fabsf(xi.y));
+    amax = fmaxf(amax, fabsf(xi.z));
+    amax = fmaxf(amax, fabsf(xi.w));
+
+    // Exchange max. abs. value between vals_per_scale/4 threads.
+#pragma unroll
+    for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
+    }
+
+    float sum;
+    if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
+        sum = xi.x + xi.y + xi.z + xi.w;
+
+        // Exchange calculate sum across vals_per_sum/4 threads.
+#pragma unroll
+        for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
+            sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
+        }
+    }
+
+    const float d_inv = 127.0f / amax;
+    char4 q;
+    q.x = roundf(xi.x*d_inv);
+    q.y = roundf(xi.y*d_inv);
+    q.z = roundf(xi.z*d_inv);
+    q.w = roundf(xi.w*d_inv);
+
+    // Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
+    char4 * yqs4 = (char4 *) y[ib].qs;
+    yqs4[iqs/4] = q;
+
+    if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) {
+        if (iqs % 16 != 0 || iqs >= 96) {
+            return;
+        }
+
+        y[ib].d2s6[2 + iqs/16] = sum;
+
+        if (iqs % 64 != 0) {
+            return;
+        }
+
+        const float d = 1.0f / d_inv;
+
+        y[ib].d2s6[iqs/64] = d;
+
+        return;
+    }
+
+    if (iqs % 32 != 0) {
+        return;
+    }
+
+    const float d = 1.0f / d_inv;
+
+    if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
+        y[ib].ds4[iqs/32] = make_half2(d, sum);
+    } else {
+        y[ib].d4[iqs/32]  = d;
+    }
+}
+
+void quantize_row_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
+    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
+
+    GGML_ASSERT(kx0_padded % QK8_1 == 0);
+
+    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, kx1*channels, 1);
+    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
+
+    GGML_UNUSED(type_x);
+}
+
+void quantize_mmq_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
+    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
+
+    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
+
+    const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
+    const dim3 num_blocks(block_num_x, kx1, channels);
+    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
+    switch (mmq_get_q8_1_ds_layout(type_x)) {
+        case MMQ_Q8_1_DS_LAYOUT_D4:
+            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
+                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+            break;
+        case MMQ_Q8_1_DS_LAYOUT_DS4:
+            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
+                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+            break;
+        case MMQ_Q8_1_DS_LAYOUT_D2S6:
+            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
+                <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
new file mode 100644
index 000000000..03bf322b9
--- /dev/null
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "common.cuh"
+#include "mmq.cuh"
+
+#include <cstdint>
+
+#define CUDA_QUANTIZE_BLOCK_SIZE     256
+#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
+
+static_assert(MATRIX_ROW_PADDING %    CUDA_QUANTIZE_BLOCK_SIZE      == 0, "Risk of out-of-bounds access.");
+static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
+
+typedef void (*quantize_cuda_t)(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
+
+void quantize_row_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
+
+void quantize_mmq_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
new file mode 100644
index 000000000..18f691b2d
--- /dev/null
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -0,0 +1,456 @@
+#include "rope.cuh"
+
+struct rope_corr_dims {
+    float v[2];
+};
+
+
+struct mrope_sections {
+    int v[4];
+};
+
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+template<bool forward>
+static __device__ void rope_yarn(
+        const float theta_extrap, const float freq_scale, const rope_corr_dims corr_dims, const int64_t i0, const float ext_factor,
+        float mscale, float & cos_theta, float & sin_theta) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    cos_theta = cosf(theta) * mscale;
+    sin_theta = sinf(theta) * mscale;
+    if (!forward) {
+        sin_theta *= -1.0f;
+    }
+}
+
+template<bool forward, bool has_ff, typename T>
+static __global__ void rope_norm(
+        const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
+        const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int row_x     = row_dst % ne1;
+    const int channel_x = row_dst / ne1;
+
+    const int idst = row_dst*ne0 + i0;
+    const int ix   = channel_x*s2 + row_x*s1 + i0;
+
+    const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
+
+    const float x0 = x[ix + 0];
+    const float x1 = x[ix + 1];
+
+    dst[idst + 0] = x0*cos_theta - x1*sin_theta;
+    dst[idst + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<bool forward, bool has_ff, typename T>
+static __global__ void rope_neox(
+        const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
+        const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int row_x     = row_dst % ne1;
+    const int channel_x = row_dst / ne1;
+
+    const int idst = row_dst*ne0 + i0/2;
+    const int ix   = channel_x*s2 + row_x*s1 + i0/2;
+
+    const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
+
+    const float x0 = x[ix + 0];
+    const float x1 = x[ix + n_dims/2];
+
+    dst[idst + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<bool forward, bool has_ff, typename T>
+static __global__ void rope_multi(
+        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
+        const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row_dst*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int row_x     = row_dst % ne1;
+    const int channel_x = row_dst / ne1;
+
+    const int idst = row_dst*ne0 + i0/2;
+    const int ix   = channel_x*s2 + row_x*s1 + i0/2;
+
+    const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    const int sec_w = sections.v[1] + sections.v[0];
+    const int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+        theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) {
+        theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
+
+    const float x0 = x[ix + 0];
+    const float x1 = x[ix + n_dims/2];
+
+    dst[idst + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<bool forward, bool has_ff, typename T>
+static __global__ void rope_vision(
+        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
+        const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
+        const float theta_scale, const float * freq_factors, const mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int row_x     = row_dst % ne1;
+    const int channel_x = row_dst / ne1;
+
+    const int idst = row_dst*ne0 + i0/2;
+    const int ix   = channel_x*s2 + row_x*s1 + i0/2;
+
+    const int sect_dims = sections.v[0] + sections.v[1];
+    const int sec_w = sections.v[1] + sections.v[0];
+    const int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        const int p = sector;
+        theta_base = pos[channel_x]*powf(theta_scale, p);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        const int p = sector - sections.v[0];
+        theta_base = pos[channel_x + ne2]*powf(theta_scale, p);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
+
+    const float x0 = x[ix + 0];
+    const float x1 = x[ix + n_dims];
+
+    dst[idst + 0]      = x0*cos_theta - x1*sin_theta;
+    dst[idst + n_dims] = x0*sin_theta + x1*cos_theta;
+}
+
+template<bool forward, typename T>
+static void rope_norm_cuda(
+        const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
+        const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_norm<forward, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors);
+    } else {
+        rope_norm<forward, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors);
+    }
+}
+
+template<bool forward, typename T>
+static void rope_neox_cuda(
+        const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
+        const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_neox<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors);
+    } else {
+        rope_neox<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors);
+    }
+}
+
+template<bool forward, typename T>
+static void rope_multi_cuda(
+        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
+        const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    } else {
+        rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    }
+}
+
+template<bool forward, typename T>
+static void rope_vision_cuda(
+        const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
+        const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
+        const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+    // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
+    // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_vision<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    } else {
+        rope_vision<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
+    }
+}
+
+template <bool forward>
+void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0]; // head dims
+    const int64_t ne01 = src0->ne[1]; // num heads
+    const int64_t ne02 = src0->ne[2]; // num heads
+    const int64_t nr = ggml_nrows(src0);
+
+    const size_t s01 = src0->nb[1] / ggml_type_size(src0->type);
+    const size_t s02 = src0->nb[2] / ggml_type_size(src0->type);
+
+    //const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims     = ((int32_t *) dst->op_params)[1];
+    const int mode       = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    mrope_sections sections;
+
+    // RoPE alteration for extended context
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections.v,  (int32_t *) dst->op_params + 11, sizeof(int)*4);
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }
+
+    const int32_t * pos = (const int32_t *) src1_d;
+
+    const float * freq_factors = nullptr;
+    if (src2 != nullptr) {
+        freq_factors = (const float *) src2->data;
+    }
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda<forward>(
+                (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda<forward>(
+                (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_mrope && !is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_multi_cuda<forward>(
+                (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_multi_cuda<forward>(
+                (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_vision_cuda<forward>(
+                (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_vision_cuda<forward>(
+                (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_norm_cuda<forward>(
+                (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_norm_cuda<forward>(
+                (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    }
+}
+
+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_rope_impl<true>(ctx, dst);
+}
+
+void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_rope_impl<false>(ctx, dst);
+}
diff --git a/ggml/src/ggml-cuda/rope.cuh b/ggml/src/ggml-cuda/rope.cuh
new file mode 100644
index 000000000..9139f3b22
--- /dev/null
+++ b/ggml/src/ggml-cuda/rope.cuh
@@ -0,0 +1,7 @@
+#include "common.cuh"
+
+#define CUDA_ROPE_BLOCK_SIZE 256
+
+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
new file mode 100644
index 000000000..1405e066e
--- /dev/null
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -0,0 +1,31 @@
+#include "scale.cuh"
+
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
+}
+
+void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
+}
diff --git a/ggml/src/ggml-cuda/scale.cuh b/ggml/src/ggml-cuda/scale.cuh
new file mode 100644
index 000000000..8ff75c829
--- /dev/null
+++ b/ggml/src/ggml-cuda/scale.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_SCALE_BLOCK_SIZE 256
+
+void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
new file mode 100644
index 000000000..aac6e0999
--- /dev/null
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -0,0 +1,283 @@
+#include "common.cuh"
+#include "ggml.h"
+#include "softmax.cuh"
+#include <cstdint>
+
+template <typename T>
+static __device__ __forceinline__ float t2f32(T val) {
+    return (float) val;
+}
+
+template <>
+__device__ float __forceinline__ t2f32<half>(half val) {
+    return __half2float(val);
+}
+
+// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
+// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif // __clang__
+template <bool use_shared, int ncols_template, int block_size_template, typename T>
+static __global__ void soft_max_f32(
+        const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
+        const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
+    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
+
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
+
+    x    += int64_t(rowx)*ncols;
+    mask += int64_t(rowy)*ncols * (mask != nullptr);
+    dst  += int64_t(rowx)*ncols;
+
+    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
+
+    extern __shared__ float data_soft_max_f32[];
+    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
+    // shared memory buffer to cache values between iterations:
+    float * vals = use_shared ? buf_iw + WARP_SIZE : dst;
+
+    float max_val = -INFINITY;
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
+
+        vals[col] = val;
+        max_val = max(max_val, val);
+    }
+
+    // find the max value in the block
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf_iw[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf_iw[lane_id];
+        max_val = warp_reduce_max(max_val);
+    }
+
+    float tmp = 0.0f; // partial sum
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const float val = expf(vals[col] - max_val);
+        tmp += val;
+        vals[col] = val;
+    }
+
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __syncthreads();
+        if (warp_id == 0) {
+            buf_iw[lane_id] = 0.0f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf_iw[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    const float inv_sum = 1.0f / tmp;
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            return;
+        }
+
+        dst[col] = vals[col] * inv_sum;
+    }
+}
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif // __clang__
+
+static __global__ void soft_max_back_f32(
+        const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+
+    grad += int64_t(rowx)*ncols;
+    dstf += int64_t(rowx)*ncols;
+    dst  += int64_t(rowx)*ncols;
+
+    float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dgf_dot += dstf[col]*grad[col];
+    }
+
+    dgf_dot = warp_reduce_sum(dgf_dot);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[col] = scale * (grad[col] - dgf_dot) * dstf[col];
+    }
+}
+
+template<typename T>
+static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth,     1, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
+    const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
+    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+    const uint32_t n_head      = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
+    if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
+        switch (ncols_x) {
+            case 32:
+                soft_max_f32<true,   32,   32><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 64:
+                soft_max_f32<true,   64,   64><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 128:
+                soft_max_f32<true,  128,  128><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 256:
+                soft_max_f32<true,  256,  256><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 512:
+                soft_max_f32<true,  512,  512><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 1024:
+                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 2048:
+                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 4096:
+                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            default:
+                soft_max_f32<true,    0,    0><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+        }
+    } else {
+        const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+    }
+}
+
+static void soft_max_back_f32_cuda(
+        const float * grad, const float * dstf, float * dst,
+        const int ncols, const int nrows, const float scale, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows,     1, 1);
+
+    soft_max_back_f32<<<block_nums, block_dims, 0, stream>>>(grad, dstf, dst, ncols, scale);
+}
+
+void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    const float * src0_d = (const float *) src0->data;
+    const void  * src1_d = src1 ? (const void *) src1->data : nullptr;
+    float       *  dst_d = (float *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
+    const int64_t ne00    = src0->ne[0];
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src0->ne[1];
+
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (const float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+    if (use_f16) {
+        soft_max_f32_cuda(src0_d, (const half  *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
+    } else {
+        soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
+    }
+}
+
+void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0]; // grad
+    const ggml_tensor * src1 = dst->src[1]; // forward pass output
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (const float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
+
+    GGML_ASSERT(max_bias == 0.0f);
+
+    soft_max_back_f32_cuda(src0_d, src1_d, dst_d, ncols, nrows, scale, stream);
+}
diff --git a/ggml/src/ggml-cuda/softmax.cuh b/ggml/src/ggml-cuda/softmax.cuh
new file mode 100644
index 000000000..93dfee835
--- /dev/null
+++ b/ggml/src/ggml-cuda/softmax.cuh
@@ -0,0 +1,7 @@
+#include "common.cuh"
+
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+
+void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu
new file mode 100644
index 000000000..e0dafc1d2
--- /dev/null
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -0,0 +1,45 @@
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+#define USE_CUB
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+
+#ifdef USE_CUB
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // USE_CUB
+
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#ifdef USE_CUB
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr,       tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // USE_CUB
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}
diff --git a/ggml/src/ggml-cuda/sum.cuh b/ggml/src/ggml-cuda/sum.cuh
new file mode 100644
index 000000000..8cadc3736
--- /dev/null
+++ b/ggml/src/ggml-cuda/sum.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu
new file mode 100644
index 000000000..38dbf1b5e
--- /dev/null
+++ b/ggml/src/ggml-cuda/sumrows.cu
@@ -0,0 +1,39 @@
+#include "sumrows.cuh"
+
+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
+}
diff --git a/ggml/src/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh
new file mode 100644
index 000000000..191db1c13
--- /dev/null
+++ b/ggml/src/ggml-cuda/sumrows.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb16.cu
new file mode 100644
index 000000000..f09bdeff7
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb16.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(64, 16);
+DECL_FATTN_MMA_F16_CASE(80, 16);
+DECL_FATTN_MMA_F16_CASE(96, 16);
+DECL_FATTN_MMA_F16_CASE(112, 16);
+DECL_FATTN_MMA_F16_CASE(128, 16);
+DECL_FATTN_MMA_F16_CASE(256, 16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb32.cu
new file mode 100644
index 000000000..221108873
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb32.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(64, 32);
+DECL_FATTN_MMA_F16_CASE(80, 32);
+DECL_FATTN_MMA_F16_CASE(96, 32);
+DECL_FATTN_MMA_F16_CASE(112, 32);
+DECL_FATTN_MMA_F16_CASE(128, 32);
+DECL_FATTN_MMA_F16_CASE(256, 32);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb64.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb64.cu
new file mode 100644
index 000000000..d24b08575
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb64.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(64, 64);
+DECL_FATTN_MMA_F16_CASE(80, 64);
+DECL_FATTN_MMA_F16_CASE(96, 64);
+DECL_FATTN_MMA_F16_CASE(112, 64);
+DECL_FATTN_MMA_F16_CASE(128, 64);
+DECL_FATTN_MMA_F16_CASE(256, 64);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb8.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb8.cu
new file mode 100644
index 000000000..bdf86c0ea
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-cpb8.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(64, 8);
+DECL_FATTN_MMA_F16_CASE(80, 8);
+DECL_FATTN_MMA_F16_CASE(96, 8);
+DECL_FATTN_MMA_F16_CASE(112, 8);
+DECL_FATTN_MMA_F16_CASE(128, 8);
+DECL_FATTN_MMA_F16_CASE(256, 8);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
new file mode 100644
index 000000000..6696a2384
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
new file mode 100644
index 000000000..dd070db28
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
new file mode 100644
index 000000000..54dcde6f5
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
new file mode 100644
index 000000000..4ec22f791
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
new file mode 100644
index 000000000..3c15bf7f0
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
new file mode 100644
index 000000000..7e61b5fdc
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
new file mode 100644
index 000000000..fdb15b580
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
new file mode 100644
index 000000000..0f7c417d2
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
new file mode 100644
index 000000000..851f33c43
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
new file mode 100644
index 000000000..763809cbe
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
new file mode 100644
index 000000000..f2a276e50
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
new file mode 100644
index 000000000..cb227f6f5
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
new file mode 100644
index 000000000..97ac0520c
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
new file mode 100644
index 000000000..c772b4263
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
new file mode 100644
index 000000000..5cb743081
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
new file mode 100644
index 000000000..98a709d17
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
new file mode 100644
index 000000000..4f2f947ae
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
new file mode 100644
index 000000000..11f96b6f6
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
new file mode 100644
index 000000000..b39bdc061
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
new file mode 100644
index 000000000..bbd6a2c7f
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
new file mode 100644
index 000000000..9d84ff2b1
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
new file mode 100644
index 000000000..bc8a5bff6
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
new file mode 100644
index 000000000..a679100c8
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
new file mode 100644
index 000000000..8f21bccf7
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
new file mode 100644
index 000000000..858b00fd7
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
new file mode 100644
index 000000000..0fc8011fa
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
new file mode 100644
index 000000000..261fdf623
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
new file mode 100644
index 000000000..0fb824738
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
new file mode 100644
index 000000000..a9d9d089b
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
new file mode 100644
index 000000000..7d7b27920
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
new file mode 100644
index 000000000..a092ee2d5
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
new file mode 100644
index 000000000..db55927a1
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
new file mode 100644
index 000000000..c3c21cefa
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
new file mode 100644
index 000000000..35dd9f520
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
new file mode 100644
index 000000000..050c22ac7
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
new file mode 100644
index 000000000..de4866c5e
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
new file mode 100644
index 000000000..57a10bc4b
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
new file mode 100644
index 000000000..e0f08b46a
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
new file mode 100644
index 000000000..1c8e8a467
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
new file mode 100644
index 000000000..cefed83fb
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
new file mode 100644
index 000000000..aede6e358
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
new file mode 100644
index 000000000..1a1a92c78
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
new file mode 100644
index 000000000..ad667473d
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
new file mode 100644
index 000000000..c499f455d
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
new file mode 100644
index 000000000..8286ebf37
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
new file mode 100644
index 000000000..458786882
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
new file mode 100644
index 000000000..d89103ce0
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
new file mode 100644
index 000000000..bb75fd42f
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
new file mode 100644
index 000000000..b1629817e
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
new file mode 100644
index 000000000..d8657604d
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
new file mode 100644
index 000000000..2e5bd2f1a
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
new file mode 100644
index 000000000..be5f302d9
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
new file mode 100644
index 000000000..8dd91cd72
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
new file mode 100644
index 000000000..4cb791502
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
new file mode 100644
index 000000000..09dea4267
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
new file mode 100644
index 000000000..0fbb60769
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
new file mode 100644
index 000000000..2aeab83b2
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
new file mode 100644
index 000000000..599415b49
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
new file mode 100644
index 000000000..e4f8e3083
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
new file mode 100644
index 000000000..34d166527
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
new file mode 100644
index 000000000..4bebef45a
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
new file mode 100644
index 000000000..326468da2
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
new file mode 100644
index 000000000..511b58f4e
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
new file mode 100644
index 000000000..d9906d142
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
new file mode 100644
index 000000000..f61c183ab
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
new file mode 100644
index 000000000..c10450fd2
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
new file mode 100644
index 000000000..2d5cb195c
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
new file mode 100644
index 000000000..b384f34d7
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
new file mode 100644
index 000000000..446e293b1
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
new file mode 100644
index 000000000..6f4302988
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
new file mode 100644
index 000000000..1cd8ba88f
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
new file mode 100644
index 000000000..1ee2eab65
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
new file mode 100644
index 000000000..2bc77816a
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
new file mode 100644
index 000000000..d55ced08b
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
new file mode 100644
index 000000000..8361e99c4
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
new file mode 100644
index 000000000..7507a67c4
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
new file mode 100644
index 000000000..61f050b23
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
new file mode 100644
index 000000000..d4a49d9c9
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
new file mode 100644
index 000000000..d14627897
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
new file mode 100644
index 000000000..e73f917a1
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
new file mode 100644
index 000000000..d40825dfc
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
new file mode 100644
index 000000000..b5c6869f4
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
new file mode 100644
index 000000000..4e21b0cca
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
new file mode 100644
index 000000000..2eac321b3
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
new file mode 100644
index 000000000..f7d2c3b4e
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
new file mode 100644
index 000000000..a013f400b
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
new file mode 100755
index 000000000..a2628f16e
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+from glob import glob
+import os
+
+TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
+
+SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f{vkq_size}.cuh"
+
+DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
+"""
+
+SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+"""
+
+SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size}, {cols_per_block});\n"
+
+TYPES_MMQ = [
+    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
+    "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+]
+
+SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE({type});
+"""
+
+
+def get_short_name(long_quant_name):
+    return long_quant_name.replace("GGML_TYPE_", "").lower()
+
+
+def get_head_sizes(type_k, type_v):
+    if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
+        return [64, 128, 256]
+    if type_k == "GGML_TYPE_F16":
+        return [64, 128]
+    return [128]
+
+
+for filename in glob("*.cu"):
+    os.remove(filename)
+
+for vkq_size in [16, 32]:
+    for type_k in TYPES_KV:
+        for type_v in TYPES_KV:
+            for head_size in get_head_sizes(type_k, type_v):
+                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
+                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
+
+for cols_per_block in [8, 16, 32, 64]:
+    with open(f"fattn-mma-f16-instance-cpb{cols_per_block}.cu", "w") as f:
+        f.write(SOURCE_FATTN_MMA_START)
+
+        for head_size in [64, 80, 96, 112, 128, 256]:
+            f.write(SOURCE_FATTN_MMA_CASE.format(cols_per_block=cols_per_block, head_size=head_size))
+
+for type in TYPES_MMQ:
+    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
+        f.write(SOURCE_MMQ.format(type=type))
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
new file mode 100644
index 000000000..84ec85029
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
new file mode 100644
index 000000000..583c4e5a5
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
new file mode 100644
index 000000000..edaf1560d
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
new file mode 100644
index 000000000..233d9342c
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
new file mode 100644
index 000000000..6092dc713
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
new file mode 100644
index 000000000..1d5bd201f
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
new file mode 100644
index 000000000..eb02fab00
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
new file mode 100644
index 000000000..1eb3b7430
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu
new file mode 100644
index 000000000..6415369dc
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q2_K);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu
new file mode 100644
index 000000000..ffb6213af
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q3_K);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu
new file mode 100644
index 000000000..0c0b0c8a8
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q4_0);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu
new file mode 100644
index 000000000..ee67f6942
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q4_1);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu
new file mode 100644
index 000000000..9eeb3cd7f
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q4_K);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu
new file mode 100644
index 000000000..cc57fb975
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q5_0);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu
new file mode 100644
index 000000000..721ac790c
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q5_1);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu
new file mode 100644
index 000000000..a2e90ffd5
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q5_K);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu
new file mode 100644
index 000000000..470938fef
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q6_K);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu
new file mode 100644
index 000000000..974477bbb
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q8_0);
diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu
new file mode 100644
index 000000000..153ddbcda
--- /dev/null
+++ b/ggml/src/ggml-cuda/tsembd.cu
@@ -0,0 +1,47 @@
+#include "tsembd.cuh"
+
+static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
+    // blockIDx.y: idx of timesteps->ne[0]
+    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = blockIdx.y;
+    int j = threadIdx.x + blockIdx.x * blockDim.x;
+    float * embed_data = (float *)((char *)dst +  i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)expf(-logf(max_period) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = cosf(arg);
+    embed_data[j + half] = sinf(arg);
+}
+
+static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
+                                        const int dim, const int max_period, cudaStream_t stream) {
+    int half_ceil = (dim + 1) / 2;
+    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne00, 1);
+    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
+}
+
+void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+}
diff --git a/ggml/src/ggml-cuda/tsembd.cuh b/ggml/src/ggml-cuda/tsembd.cuh
new file mode 100644
index 000000000..84340e3d7
--- /dev/null
+++ b/ggml/src/ggml-cuda/tsembd.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
+
+void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
new file mode 100644
index 000000000..6b21f407d
--- /dev/null
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -0,0 +1,492 @@
+#include "unary.cuh"
+
+static __global__ void neg_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = -x[i];
+}
+
+static __global__ void step_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] > 0.0f;
+}
+
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
+static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+}
+
+static __global__ void silu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] / (1.0f + expf(-x[i]));
+}
+
+static __global__ void silu_back_f32(
+        const float * grad, const float * xf, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const float xfi = xf[i];
+    const float s = 1.0f / (1.0f + expf(-xfi));
+    dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
+}
+
+static __global__ void tanh_f32(const float * x, float * dst, int k) {
+    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = tanhf(x[i]);
+}
+
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = 1.0f / (1.0f + expf(-x[i]));
+}
+
+static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+static __global__ void exp_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = expf(x[i]);
+}
+
+static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
+    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
+static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sqrtf(x[i]);
+}
+
+static __global__ void sin_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sinf(x[i]);
+}
+
+static __global__ void cos_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = cosf(x[i]);
+}
+
+static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
+    neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void step_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
+    step_f32<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
+}
+
+static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
+    sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
+    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
+    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
+    exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
+    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
+    sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
+    cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    step_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0]; // input from forward pass
+    const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardsigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), negative_slope, stream);
+}
+
+void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
new file mode 100644
index 000000000..e7f62643a
--- /dev/null
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -0,0 +1,51 @@
+#include "common.cuh"
+
+#define CUDA_NEG_BLOCK_SIZE 256
+#define CUDA_STEP_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_SILU_BACK_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SIGMOID_BLOCK_SIZE 256
+#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
+#define CUDA_EXP_BLOCK_SIZE 256
+#define CUDA_HARDSWISH_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
+#define CUDA_SQRT_BLOCK_SIZE 256
+#define CUDA_SIN_BLOCK_SIZE 256
+#define CUDA_COS_BLOCK_SIZE 256
+
+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu
new file mode 100644
index 000000000..cf513c3ad
--- /dev/null
+++ b/ggml/src/ggml-cuda/upscale.cu
@@ -0,0 +1,51 @@
+#include "upscale.cuh"
+
+static __global__ void upscale_f32(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne10, const int ne11, const int ne12, const int ne13,
+        const float sf0, const float sf1, const float sf2, const float sf3) {
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    if (index >= ne10 * ne11 * ne12 * ne13) {
+        return;
+    }
+
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+}
+
+static void upscale_f32_cuda(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne10, const int ne11, const int ne12, const int ne13,
+        const float sf0, const float sf1, const float sf2, const float sf3,
+        cudaStream_t stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+
+    upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
+}
+
+void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
+
+    upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
+}
diff --git a/ggml/src/ggml-cuda/upscale.cuh b/ggml/src/ggml-cuda/upscale.cuh
new file mode 100644
index 000000000..d4d765230
--- /dev/null
+++ b/ggml/src/ggml-cuda/upscale.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+
+void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
new file mode 100644
index 000000000..40091a0ef
--- /dev/null
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -0,0 +1,1133 @@
+#include "common.cuh"
+#include <cstdint>
+
+static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
+    const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment
+
+    int x32  = x16[2*i32 + 0] <<  0;
+    x32     |= x16[2*i32 + 1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
+    return ((const int *) x)[i32]; // assume at least 4 byte alignment
+}
+
+// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
+
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
+    const int * v, const int * u, const float & d4, const half2 & ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
+#else
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
+#else
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
+    const int * v, const int * u, const T & d8_0, const T & d8_1) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * ((T) sumi);
+}
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
+#else
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
+}
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ  4
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float * __restrict__ d8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+        sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+}
+
+// contiguous v/x + u/y values
+template <int ns8>
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8, const half2 * s8) {
+
+    float sumf    = 0.0f;
+    float sumf_d8 = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR2_K*VDR_Q2_K_Q8_1_MMQ; i0 += QI8_1) {
+        const float2 dm2f0 = __half22float2(dm2[i0/(QI8_1/2) + 0]);
+        int sumi_d0 = 0;
+
+        const float2 dm2f1 = __half22float2(dm2[i0/(QI8_1/2) + 1]);
+        int sumi_d1 = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d0 = ggml_cuda_dp4a(v[i], u[i], sumi_d0);
+        }
+        sumf_d8 += dm2f0.x * sumi_d0;
+
+#pragma unroll
+        for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
+            sumi_d1 = ggml_cuda_dp4a(v[i], u[i], sumi_d1);
+        }
+        sumf_d8 += dm2f1.x * sumi_d1;
+
+        if (i0/QI8_1 < ns8) {
+            const float2 s8f = __half22float2(s8[i0/QI8_1]);
+            sumf -= dm2f0.y*s8f.x;
+            sumf -= dm2f1.y*s8f.y;
+        } else {
+            int sumi_m0 = 0;
+#pragma unroll
+            for (int i = i0; i < i0 + QI8_1/2; ++i) {
+                sumi_m0 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m0);
+            }
+            sumf_d8 -= dm2f0.y * sumi_m0;
+
+            int sumi_m1 = 0;
+#pragma unroll
+            for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
+                sumi_m1 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m1);
+            }
+            sumf_d8 -= dm2f1.y * sumi_m1;
+        }
+    }
+
+    return sumf + d8*sumf_d8;
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+}
+
+// contiguous v/x + u/y values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d3, const float & d8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = ggml_cuda_dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+}
+
+// contiguous v/x + u/y values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const float2 dm5f = __half22float2(dm5);
+
+    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+}
+
+// contiguous v/x + u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d, const float * __restrict__ d8) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+}
+
+// contiguous v/x + u/y values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
+    const float & d6, const float * __restrict__ d8) {
+
+    float sumf_d = 0.0f;
+
+    const int      sc_packed = get_int_b4(sc, 0);
+    const int8_t * sc_reg    = (const int8_t *) &sc_packed;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+            sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+        }
+
+        sumf_d += d8[i0/4] * (sc_reg[i0/2+0]*sumi_d.x + sc_reg[i0/2+1]*sumi_d.y);
+    }
+
+    return d6 * sumf_d;
+}
+
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq + kbx;
+
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_b2(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+}
+
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq + kbx;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_b4(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq + kbx;
+
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_b2(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_b2(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq + kbx;
+
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_b4(bq5_1->qs, iqs + i);
+        vh[i]    = get_int_b4(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq + kbx;
+
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_b2(bq8_0->qs, iqs + i);
+        u[i] = get_int_b4(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq + kbx;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const uint8_t * scales = bq2_K->scales + scale_offset;
+
+    const int v = get_int_b4(bq2_K->qs, iqs);
+    int    u[QR2_K];
+    float d8[QR2_K];
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++ i) {
+        u[i]  = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
+    }
+
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq + kbx;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const float d = bq3_K->d;
+
+    const int vl = get_int_b2(bq3_K->qs, iqs);
+
+    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_b2(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+
+    int    u[QR3_K];
+    float d8[QR3_K];
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        u[i]  = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
+    }
+
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq + kbx;
+
+    int    v[2];
+    int    u[2*QR4_K];
+    float d8[QR4_K];
+
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    v[0] = q4[0];
+    v[1] = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = __low2float(bq8i->ds);
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq + kbx;
+
+    int   vl[2];
+    int   vh[2];
+    int    u[2*QR5_K];
+    float d8[QR5_K];
+
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
+
+    vl[0] = ql[0];
+    vl[1] = ql[4];
+
+    vh[0] = qh[0] >> bq8_offset;
+    vh[1] = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = __low2float(bq8i->ds);
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq + kbx;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    const int vl = get_int_b2(bq6_K->ql, iqs);
+    const int vh = get_int_b2(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+
+    const int8_t * scales = bq6_K->scales + scale_offset;
+
+    int    u[QR6_K];
+    float d8[QR6_K];
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        u[i]  = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
+    }
+
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
+}
+
+#define VDR_IQ2_XXS_Q8_1_MMVQ 2
+#define VDR_IQ2_XXS_Q8_1_MMQ  2
+
+static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;
+
+    const int q2 = get_int_b2(bq2->qs, iqs);
+    const uint8_t * aux8 = (const uint8_t *) &q2;
+    const uint32_t aux32 = get_int_b2(bq2->qs, iqs + 1);
+
+    int sumi = 0;
+#pragma unroll
+    for (int k0 = 0; k0 < 8; k0 += 2) {
+        const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
+        const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
+
+        const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
+        const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
+        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+
+        const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
+        const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
+        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
+    }
+
+    const int ls = aux32 >> 28;
+    sumi = (ls*sumi + sumi/2)/4;
+    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
+}
+
+#define VDR_IQ2_XS_Q8_1_MMVQ 2
+#define VDR_IQ2_XS_Q8_1_MMQ  2
+
+static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;
+
+    const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));
+    const uint16_t * q2 = (const uint16_t *) &q2_packed;
+    const int ls0 = bq2->scales[iqs/2] & 0x0F;
+    const int ls1 = bq2->scales[iqs/2] >> 4;
+
+    int sumi0 = 0;
+    int sumi1 = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
+        const uint32_t * signs    = (const uint32_t *)(ksigns64   + (q2[l0/2] >> 9));
+
+        const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        if (l0 < 4) {
+            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
+        } else {
+            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
+        }
+    }
+    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
+    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
+}
+
+#define VDR_IQ2_S_Q8_1_MMVQ 2
+#define VDR_IQ2_S_Q8_1_MMQ  2
+
+static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;
+
+    const int       qs_packed = get_int_b2(bq2->qs, iqs/2);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    const int qh = bq2->qh[iqs/2];
+
+    const int       signs_packed_32 = get_int_b2(bq2->qs, QK_K/32 + iqs/2);
+    const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;
+
+    const int ls0 = bq2->scales[iqs/2] & 0x0F;
+    const int ls1 = bq2->scales[iqs/2] >> 4;
+
+    int sumi0 = 0;
+    int sumi1 = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int * grid_pos = (const int *)(iq2s_grid + (qs[l0/2] | ((qh << (8-l0)) & 0x300)));
+
+        const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
+        const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
+
+        const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        if (l0 < 4) {
+            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
+            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
+        } else {
+            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
+            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
+        }
+    }
+    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
+
+    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
+}
+
+#define VDR_IQ3_XXS_Q8_1_MMVQ 2
+#define VDR_IQ3_XXS_Q8_1_MMQ  2
+
+static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;
+
+    const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));
+    const uint8_t * q3 = (const uint8_t *) &q3_packed;
+    const uint32_t aux32 = get_int_b2(bq3->qs, QK_K/16 + iqs/2);
+
+    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
+
+        const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
+
+        const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
+    }
+
+    const int ls = aux32 >> 28;
+    sumi = (ls*sumi + sumi/2)/2;
+    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
+}
+
+#define VDR_IQ3_S_Q8_1_MMVQ 2
+#define VDR_IQ3_S_Q8_1_MMQ  2
+
+// TODO: don't use lookup table for signs
+static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;
+
+    const int2      qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    const int qh = bq3->qh[iqs/2];
+
+    const int       signs_packed_32 = get_int_b2(bq3->signs, iqs/2);
+    const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int2 grid_pos = make_int2(
+            iq3s_grid[qs[l0 + 0] | ((qh << (8 - l0)) & 0x100)],
+            iq3s_grid[qs[l0 + 1] | ((qh << (7 - l0)) & 0x100)]);
+
+        const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
+        const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);
+
+        const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
+        const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
+
+        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
+
+        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
+    }
+
+    sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);
+
+    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
+    return d * sumi;
+}
+
+#define VDR_IQ1_S_Q8_1_MMVQ 1
+#define VDR_IQ1_S_Q8_1_MMQ  1
+
+static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+    const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
+
+    const int       qs_packed = get_int_b2(bq1->qs, iqs);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    const int qh = bq1->qh[iqs];
+
+    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
+        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
+    }
+
+    const float  d1q   = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
+    const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+    const float2 ds    = __half22float2(bq8_1[iqs].ds);
+    return d1q * (ds.x*sumi + ds.y*delta);
+}
+
+#define VDR_IQ1_M_Q8_1_MMVQ 1
+#define VDR_IQ1_M_Q8_1_MMQ  1
+
+static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;
+
+    const int       qs_packed = get_int_b4(bq1->qs, iqs);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    int   sumi[2] = {0};
+    float sumf[2] = {0.0f};
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
+
+        const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]);
+        sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]);
+
+        const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
+        int sumy = 0;
+        sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy);
+        sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy);
+        sumf[l0/4] += delta*sumy;
+    }
+
+    const uint16_t * sc = (const uint16_t *) bq1->scales;
+
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
+    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
+
+    const int tmp = sc[iqs/2] >> (6*(iqs%2));
+    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
+    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
+    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
+}
+
+static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {
+    const int      q0_32  = (q4 >> 0) & 0x0F0F0F0F;
+    const int8_t * q0_8   = (const int8_t *) &q0_32;
+    const char4    val0_8 = make_char4(
+        kvalues_iq4nl[q0_8[0]], kvalues_iq4nl[q0_8[1]], kvalues_iq4nl[q0_8[2]], kvalues_iq4nl[q0_8[3]]);
+
+    const int      q1_32  = (q4 >> 4) & 0x0F0F0F0F;
+    const int8_t * q1_8   = (const int8_t *) &q1_32;
+    const char4    val1_8 = make_char4(
+        kvalues_iq4nl[q1_8[0]], kvalues_iq4nl[q1_8[1]], kvalues_iq4nl[q1_8[2]], kvalues_iq4nl[q1_8[3]]);
+
+    return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
+}
+
+#define VDR_IQ4_NL_Q8_1_MMVQ 2
+#define VDR_IQ4_NL_Q8_1_MMQ  4
+
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
+        const int2 v = get_int_from_table_16(aux_q4);
+
+        sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
+        sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
+    }
+
+    const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
+    return d * sumi;
+}
+
+#define VDR_IQ4_XS_Q8_1_MMVQ 4
+#define VDR_IQ4_XS_Q8_1_MMQ  4
+
+static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;
+
+    int sumi = 0;
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        const int aux_q4 = get_int_b4(bq4->qs, iqs + j);
+        const int2 v = get_int_from_table_16(aux_q4);
+
+        const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0);
+        const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4);
+
+        sumi = ggml_cuda_dp4a(v.x, u0, sumi);
+        sumi = ggml_cuda_dp4a(v.y, u1, sumi);
+    }
+
+    const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
+    sumi *= ls - 32;
+
+    const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
+    return d * sumi;
+}
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
new file mode 100644
index 000000000..1746b0732
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
new file mode 100644
index 000000000..81964611c
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -0,0 +1,235 @@
+#pragma once
+
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
+#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cublasOperation_t hipblasOperation_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaHostRegister hipHostRegister
+#define cudaHostRegisterPortable hipHostRegisterPortable
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+#define cudaHostUnregister hipHostUnregister
+#define cudaLaunchHostFunc hipLaunchHostFunc
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define __trap() do { abort(); __builtin_unreachable(); } while(0)
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
+#define CDNA
+#endif
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#if defined(__gfx1010__) || defined(__gfx1012__)
+#define RDNA1
+#endif
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef hip_bfloat16 nv_bfloat16;
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int &>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int &>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+    }
+    return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
+    return c;
+}
+
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+// __shfl_xor() for half2 was added in ROCm 5.6
+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
+    typedef union half2_b32 {
+        half2 val;
+        int   b32;
+    } half2_b32_t;
+    half2_b32_t tmp;
+    tmp.val = var;
+    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
+    return tmp.val;
+}
+#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
new file mode 100644
index 000000000..6cc1b69ee
--- /dev/null
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <musa_runtime.h>
+#include <musa.h>
+#include <mublas.h>
+#include <musa_bf16.h>
+#include <musa_fp16.h>
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N MUBLAS_OP_N
+#define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUDA_R_16F  MUSA_R_16F
+#define CUDA_R_32F  MUSA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#define cublasCreate mublasCreate
+#define cublasDestroy mublasDestroy
+#define cublasGemmEx mublasGemmEx
+#define cublasGemmBatchedEx mublasGemmBatchedEx
+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
+#define cublasHandle_t mublasHandle_t
+#define cublasSetMathMode mublasSetMathMode
+#define cublasSetStream mublasSetStream
+#define cublasSgemm mublasSgemm
+#define cublasStatus_t mublasStatus_t
+#define cublasOperation_t mublasOperation_t
+#define cublasGetStatusString mublasStatus_to_string
+#define cudaDataType_t musaDataType_t
+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
+#define cudaDeviceProp musaDeviceProp
+#define cudaDeviceSynchronize musaDeviceSynchronize
+#define cudaError_t musaError_t
+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags musaEventCreateWithFlags
+#define cudaEventDisableTiming musaEventDisableTiming
+#define cudaEventRecord musaEventRecord
+#define cudaEventSynchronize musaEventSynchronize
+#define cudaEvent_t musaEvent_t
+#define cudaEventDestroy musaEventDestroy
+#define cudaFree musaFree
+#define cudaFreeHost musaFreeHost
+#define cudaGetDevice musaGetDevice
+#define cudaGetDeviceCount musaGetDeviceCount
+#define cudaGetDeviceProperties musaGetDeviceProperties
+#define cudaGetErrorString musaGetErrorString
+#define cudaGetLastError musaGetLastError
+#define cudaHostRegister musaHostRegister
+#define cudaHostRegisterPortable musaHostRegisterPortable
+#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
+#define cudaHostUnregister musaHostUnregister
+#define cudaLaunchHostFunc musaLaunchHostFunc
+#define cudaMalloc musaMalloc
+#define cudaMallocHost musaMallocHost
+#define cudaMallocManaged musaMallocManaged
+#define cudaMemcpy musaMemcpy
+#define cudaMemcpyAsync musaMemcpyAsync
+#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
+#define cudaMemcpy2DAsync musaMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
+#define cudaMemcpyKind musaMemcpyKind
+#define cudaMemset musaMemset
+#define cudaMemsetAsync musaMemsetAsync
+#define cudaMemGetInfo musaMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
+#define cudaSetDevice musaSetDevice
+#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
+#define cudaStreamDestroy musaStreamDestroy
+#define cudaStreamFireAndForget musaStreamFireAndForget
+#define cudaStreamNonBlocking musaStreamNonBlocking
+#define cudaStreamPerThread musaStreamPerThread
+#define cudaStreamSynchronize musaStreamSynchronize
+#define cudaStreamWaitEvent musaStreamWaitEvent
+#define cudaStream_t musaStream_t
+#define cudaSuccess musaSuccess
+
+// Additional mappings for MUSA virtual memory pool
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
+#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
+#define CUdevice MUdevice
+#define CUdeviceptr MUdeviceptr
+#define CUmemAccessDesc MUmemAccessDesc
+#define CUmemAllocationProp MUmemAllocationProp
+#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
+#define cuDeviceGet muDeviceGet
+#define cuDeviceGetAttribute muDeviceGetAttribute
+#define cuMemAddressFree muMemAddressFree
+#define cuMemAddressReserve muMemAddressReserve
+#define cuMemCreate muMemCreate
+#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
+#define cuMemMap muMemMap
+#define cuMemRelease muMemRelease
+#define cuMemSetAccess muMemSetAccess
+#define cuMemUnmap muMemUnmap
+#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
+#define cudaFuncSetAttribute musaFuncSetAttribute
+#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
+#define make_cudaExtent make_musaExtent
+#define make_cudaPitchedPtr make_musaPitchedPtr
+
+// Additional mappings for MUSA graphs
+#define CUDA_SUCCESS MUSA_SUCCESS
+#define CUresult MUresult
+#define cuGetErrorString muGetErrorString
+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
+#define cudaGraphDestroy musaGraphDestroy
+#define cudaGraphExecDestroy musaGraphExecDestroy
+#define cudaGraphExec_t musaGraphExec_t
+#define cudaGraphExecUpdate musaGraphExecUpdate
+#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
+#define cudaGraphGetNodes musaGraphGetNodes
+#define cudaGraphInstantiate musaGraphInstantiate
+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
+#define cudaGraphLaunch musaGraphLaunch
+#define cudaGraphNodeGetType musaGraphNodeGetType
+#define cudaGraphNode_t musaGraphNode_t
+#define cudaGraphNodeType musaGraphNodeType
+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
+#define cudaGraph_t musaGraph_t
+#define cudaKernelNodeParams musaKernelNodeParams
+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
+#define cudaStreamEndCapture musaStreamEndCapture
+
+typedef mt_bfloat16 nv_bfloat16;
diff --git a/ggml/src/ggml-cuda/wkv6.cu b/ggml/src/ggml-cuda/wkv6.cu
new file mode 100644
index 000000000..bbdafbee5
--- /dev/null
+++ b/ggml/src/ggml-cuda/wkv6.cu
@@ -0,0 +1,89 @@
+#include "common.cuh"
+#include "wkv6.cuh"
+
+static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    const int head_size = CUDA_WKV_BLOCK_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    float state[head_size];
+    __shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size];
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    __syncthreads();
+    _tf[tid] = tf[head_i * head_size + tid];
+    __syncthreads();
+
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
+        __syncthreads();
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+        __syncthreads();
+
+        const float _v = v[t];
+        float y = 0;
+        for (int j = 0; j < head_size; j += 4) {
+            const float4& k = (float4&)(_k[j]);
+            const float4& r = (float4&)(_r[j]);
+            const float4& tf = (float4&)(_tf[j]);
+            const float4& td = (float4&)(_td[j]);
+            float4& s = (float4&)(state[j]);
+            float4 kv;
+
+            kv.x = k.x * _v;
+            kv.y = k.y * _v;
+            kv.z = k.z * _v;
+            kv.w = k.w * _v;
+
+            y += r.x * (tf.x * kv.x + s.x);
+            y += r.y * (tf.y * kv.y + s.y);
+            y += r.z * (tf.z * kv.z + s.z);
+            y += r.w * (tf.w * kv.w + s.w);
+
+            s.x = s.x * td.x + kv.x;
+            s.y = s.y * td.y + kv.y;
+            s.z = s.z * td.z + kv.z;
+            s.w = s.w * td.w + kv.w;
+        }
+        dst[t] = y;
+    }
+
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const float * k_d  = (const float *)dst->src[0]->data;
+    const float * v_d  = (const float *)dst->src[1]->data;
+    const float * r_d  = (const float *)dst->src[2]->data;
+    const float * tf_d = (const float *)dst->src[3]->data;
+    const float * td_d = (const float *)dst->src[4]->data;
+    const float * s_d  = (const float *)dst->src[5]->data;
+
+    const int64_t B = dst->src[5]->ne[1];
+    const int64_t T = dst->src[0]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[1];
+
+    float * dst_d = (float *)dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE); // The current cuda kernel is designed for RWKV6, HEAD_SIZE == 64
+
+    rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
+}
diff --git a/ggml/src/ggml-cuda/wkv6.cuh b/ggml/src/ggml-cuda/wkv6.cuh
new file mode 100644
index 000000000..a7124ee51
--- /dev/null
+++ b/ggml/src/ggml-cuda/wkv6.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_WKV_BLOCK_SIZE 64
+
+void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
new file mode 100644
index 000000000..f4a468363
--- /dev/null
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -0,0 +1,121 @@
+if (NOT EXISTS $ENV{ROCM_PATH})
+    if (NOT EXISTS /opt/rocm)
+        set(ROCM_PATH /usr)
+    else()
+        set(ROCM_PATH /opt/rocm)
+    endif()
+else()
+    set(ROCM_PATH $ENV{ROCM_PATH})
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH  ${ROCM_PATH})
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
+
+# CMake on Windows doesn't support the HIP language yet
+if (WIN32)
+    set(CXX_IS_HIPCC TRUE)
+else()
+    string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
+endif()
+
+if (CXX_IS_HIPCC)
+    if (LINUX)
+        if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+            message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+        endif()
+
+        message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                " Prefer setting the HIP compiler directly. See README for details.")
+    endif()
+else()
+    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+    endif()
+    cmake_minimum_required(VERSION 3.21)
+    enable_language(HIP)
+endif()
+
+find_package(hip     REQUIRED)
+find_package(hipblas REQUIRED)
+find_package(rocblas REQUIRED)
+
+if (${hip_VERSION} VERSION_LESS 5.5)
+    message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
+endif()
+
+message(STATUS "HIP and hipBLAS found")
+
+# Workaround old compilers
+set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")
+
+file(GLOB   GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
+list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
+
+file(GLOB   GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
+file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
+file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+if (GGML_CUDA_FA_ALL_QUANTS)
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+else()
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+endif()
+
+ggml_add_backend_library(ggml-hip
+                         ${GGML_HEADERS_ROCM}
+                         ${GGML_SOURCES_ROCM}
+                        )
+
+# TODO: do not use CUDA definitions for HIP
+if (NOT GGML_BACKEND_DL)
+    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+endif()
+
+add_compile_definitions(GGML_USE_HIP)
+
+if (GGML_HIP_UMA)
+    add_compile_definitions(GGML_HIP_UMA)
+endif()
+
+if (GGML_CUDA_FORCE_MMQ)
+    add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+endif()
+
+if (GGML_CUDA_FORCE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+endif()
+
+if (GGML_CUDA_NO_PEER_COPY)
+    add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+endif()
+
+if (GGML_HIP_GRAPHS)
+    add_compile_definitions(GGML_HIP_GRAPHS)
+endif()
+
+if (GGML_HIP_NO_VMM)
+    add_compile_definitions(GGML_HIP_NO_VMM)
+endif()
+
+if (CXX_IS_HIPCC)
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+    target_link_libraries(ggml-hip PRIVATE hip::device)
+else()
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+endif()
+
+if (GGML_STATIC)
+    message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+endif()
+
+target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
new file mode 100644
index 000000000..eab017889
--- /dev/null
+++ b/ggml/src/ggml-impl.h
@@ -0,0 +1,567 @@
+#pragma once
+
+// GGML internal header
+
+#include "ggml.h"
+#include "gguf.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+
+#if defined(__F16C__)
+#include <immintrin.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MIN
+#    define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef MAX
+#    define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
+// if C99 - static_assert is noop
+// ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef __cplusplus
+    #ifndef static_assert
+        #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+            #define static_assert(cond, msg) _Static_assert(cond, msg)
+        #else
+            #define static_assert(cond, msg) struct global_scope_noop_trick
+        #endif
+    #endif
+#endif
+
+static inline int ggml_up32(int n) {
+    return (n + 31) & ~31;
+}
+
+//static inline int ggml_up64(int n) {
+//    return (n + 63) & ~63;
+//}
+
+static inline int ggml_up(int n, int m) {
+    // assert m is a power of 2
+    GGML_ASSERT((m & (m - 1)) == 0);
+    return (n + m - 1) & ~(m - 1);
+}
+
+//
+// logging
+//
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+GGML_API void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
+GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
+
+#define GGML_LOG(...)       ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define GGML_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define GGML_LOG_WARN(...)  ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define GGML_LOG_CONT(...)  ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
+#define GGML_DEBUG 0
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+// tensor params
+
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    return ((const float *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
+}
+
+static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
+    ((float *)(tensor->op_params))[i] = value;
+}
+
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t  fun;
+    int                n_tasks;
+    void             * userdata;
+};
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t   fun;
+    int                 n_tasks;
+    void              * userdata;
+};
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+// bitset
+
+typedef uint32_t ggml_bitset_t;
+
+static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
+#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
+#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
+
+static size_t ggml_bitset_size(size_t n) {
+    return (n + BITSET_MASK) >> BITSET_SHR;
+}
+
+static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
+    return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
+}
+
+static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
+    bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
+}
+
+static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
+    bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
+}
+
+// hash set
+
+#define GGML_HASHSET_FULL ((size_t)-1)
+#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
+
+struct ggml_hash_set {
+    size_t size;
+    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
+struct ggml_hash_set ggml_hash_set_new(size_t size);
+void                 ggml_hash_set_free(struct ggml_hash_set * hash_set);
+
+// returns the minimum size for a hash set that can hold min_sz elements
+size_t ggml_hash_size(size_t min_sz);
+
+// remove all elements from the hash set
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
+
+// returns true if key is in the hash set
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key);
+
+// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// return index, asserts if table is full
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// hash function for ggml_tensor
+static inline size_t ggml_hash(const struct ggml_tensor * p) {
+    // the last 4 bits are always zero due to alignment
+    return (size_t)(uintptr_t)p >> 4;
+}
+
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size;
+
+    // linear probing
+    size_t i = h;
+    while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
+        i = (i + 1) % hash_set->size;
+        if (i == h) {
+            // visited all hash table entries -> not found
+            return GGML_HASHSET_FULL;
+        }
+    }
+    return i;
+}
+
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
+    return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
+}
+
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size;
+
+    // linear probing
+    size_t i = h;
+    do {
+        if (!ggml_bitset_get(hash_set->used, i)) {
+            ggml_bitset_set(hash_set->used, i);
+            hash_set->keys[i] = key;
+            return i;
+        }
+        if (hash_set->keys[i] == key) {
+            return GGML_HASHSET_ALREADY_EXISTS;
+        }
+        i = (i + 1) % hash_set->size;
+    } while (i != h);
+
+    // visited all hash table entries -> not found
+    GGML_ABORT("fatal error");
+}
+
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size;
+
+    // linear probing
+    size_t i = h;
+    do {
+        if (!ggml_bitset_get(hash_set->used, i)) {
+            ggml_bitset_set(hash_set->used, i);
+            hash_set->keys[i] = key;
+            return i;
+        }
+        if (hash_set->keys[i] == key) {
+            return i;
+        }
+        i = (i + 1) % hash_set->size;
+    } while (i != h);
+
+    // visited all hash table entries -> not found
+    GGML_ABORT("fatal error");
+}
+
+// computation graph
+
+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
+struct ggml_cgraph {
+    int size;    // maximum number of nodes/leafs/grads/grad_accs
+    int n_nodes; // number of nodes currently in use
+    int n_leafs; // number of leafs currently in use
+
+    struct ggml_tensor ** nodes;     // tensors with data that can change if the graph is evaluated
+    struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
+    struct ggml_tensor ** grad_accs; // accumulators for node gradients
+    struct ggml_tensor ** leafs;     // tensors with constant data
+
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+};
+
+// returns a slice of cgraph with nodes [i0, i1)
+// the slice does not have leafs or gradients
+// if you need the gradients, get them from the original graph
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
+// Memory allocation
+
+GGML_API void * ggml_aligned_malloc(size_t size);
+GGML_API void ggml_aligned_free(void * ptr, size_t size);
+
+// FP16 to FP32 conversion
+
+#if defined(__ARM_NEON)
+    #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
+        typedef uint16_t ggml_fp16_internal_t;
+    #else
+        typedef __fp16 ggml_fp16_internal_t;
+    #endif
+#endif
+
+#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+    #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        ggml_fp16_internal_t tmp;
+        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+        return (float)tmp;
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        ggml_fp16_t res;
+        ggml_fp16_internal_t tmp = f;
+        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+        return res;
+    }
+
+#elif defined(__F16C__)
+
+    #ifdef _MSC_VER
+        #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+        #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+    #else
+        #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+        #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+    #endif
+
+#elif defined(__POWER9_VECTOR__)
+
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+    /* the inline asm below is about 12% faster than the lookup method */
+    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        register float f;
+        register double d;
+        __asm__(
+            "mtfprd %0,%2\n"
+            "xscvhpdp %0,%0\n"
+            "frsp %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=f"(f):
+            /* in */   "r"(h));
+        return f;
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        register double d;
+        register ggml_fp16_t r;
+        __asm__( /* xscvdphp can work on double or single precision */
+            "xscvdphp %0,%2\n"
+            "mffprd %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=r"(r):
+            /* in */   "f"(f));
+        return r;
+    }
+
+#else
+
+    // FP16 <-> FP32
+    // ref: https://github.com/Maratyszcza/FP16
+
+    static inline float fp32_from_bits(uint32_t w) {
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } fp32;
+        fp32.as_bits = w;
+        return fp32.as_value;
+    }
+
+    static inline uint32_t fp32_to_bits(float f) {
+        union {
+            float as_value;
+            uint32_t as_bits;
+        } fp32;
+        fp32.as_value = f;
+        return fp32.as_bits;
+    }
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        const uint32_t w = (uint32_t) h << 16;
+        const uint32_t sign = w & UINT32_C(0x80000000);
+        const uint32_t two_w = w + w;
+
+        const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+        const float exp_scale = 0x1.0p-112f;
+    #else
+        const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+    #endif
+        const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+        const uint32_t magic_mask = UINT32_C(126) << 23;
+        const float magic_bias = 0.5f;
+        const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+        const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+        const uint32_t result = sign |
+            (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+        return fp32_from_bits(result);
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+        const float scale_to_inf = 0x1.0p+112f;
+        const float scale_to_zero = 0x1.0p-110f;
+    #else
+        const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+        const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+    #endif
+        float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+        const uint32_t w = fp32_to_bits(f);
+        const uint32_t shl1_w = w + w;
+        const uint32_t sign = w & UINT32_C(0x80000000);
+        uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+        if (bias < UINT32_C(0x71000000)) {
+            bias = UINT32_C(0x71000000);
+        }
+
+        base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+        const uint32_t bits = fp32_to_bits(base);
+        const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+        const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+        const uint32_t nonsign = exp_bits + mantissa_bits;
+        return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+    }
+
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+GGML_API float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+#include <vector>
+
+// expose GGUF internals for test code
+GGML_API size_t gguf_type_size(enum gguf_type type);
+GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+#endif // __cplusplus
diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt
new file mode 100644
index 000000000..c9109d5e8
--- /dev/null
+++ b/ggml/src/ggml-kompute/CMakeLists.txt
@@ -0,0 +1,166 @@
+
+find_package(Vulkan COMPONENTS glslc REQUIRED)
+find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
+
+if (NOT glslc_executable)
+    message(FATAL_ERROR "glslc not found")
+endif()
+
+ggml_add_backend_library(ggml-kompute
+                         ggml-kompute.cpp
+                         ../../include/ggml-kompute.h
+                        )
+
+target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
+target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+
+add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
+
+function(compile_shader)
+    set(options)
+    set(oneValueArgs)
+    set(multiValueArgs SOURCES)
+    cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    foreach(source ${compile_shader_SOURCES})
+        get_filename_component(filename ${source} NAME)
+        set(spv_file ${filename}.spv)
+        add_custom_command(
+            OUTPUT ${spv_file}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
+            ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
+            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMENT "Compiling ${source} to ${spv_file}"
+            )
+
+        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
+        set(FILE_NAME "shader${RAW_FILE_NAME}")
+        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
+        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
+        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
+            add_custom_command(
+                OUTPUT ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                DEPENDS ${spv_file} xxd
+                COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
+                )
+        else()
+            add_custom_command(
+                OUTPUT ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+                COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+                DEPENDS ${spv_file} xxd
+                COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
+                )
+        endif()
+    endforeach()
+endfunction()
+
+if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+    message(STATUS "Kompute found")
+    set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
+    add_subdirectory(kompute)
+
+    # Compile our shaders
+    compile_shader(SOURCES
+        kompute-shaders/op_scale.comp
+        kompute-shaders/op_scale_8.comp
+        kompute-shaders/op_add.comp
+        kompute-shaders/op_addrow.comp
+        kompute-shaders/op_mul.comp
+        kompute-shaders/op_silu.comp
+        kompute-shaders/op_relu.comp
+        kompute-shaders/op_gelu.comp
+        kompute-shaders/op_softmax.comp
+        kompute-shaders/op_norm.comp
+        kompute-shaders/op_rmsnorm.comp
+        kompute-shaders/op_diagmask.comp
+        kompute-shaders/op_mul_mat_mat_f32.comp
+        kompute-shaders/op_mul_mat_f16.comp
+        kompute-shaders/op_mul_mat_q8_0.comp
+        kompute-shaders/op_mul_mat_q4_0.comp
+        kompute-shaders/op_mul_mat_q4_1.comp
+        kompute-shaders/op_mul_mat_q4_k.comp
+        kompute-shaders/op_mul_mat_q6_k.comp
+        kompute-shaders/op_getrows_f32.comp
+        kompute-shaders/op_getrows_f16.comp
+        kompute-shaders/op_getrows_q4_0.comp
+        kompute-shaders/op_getrows_q4_1.comp
+        kompute-shaders/op_getrows_q6_k.comp
+        kompute-shaders/op_rope_norm_f16.comp
+        kompute-shaders/op_rope_norm_f32.comp
+        kompute-shaders/op_rope_neox_f16.comp
+        kompute-shaders/op_rope_neox_f32.comp
+        kompute-shaders/op_cpy_f16_f16.comp
+        kompute-shaders/op_cpy_f16_f32.comp
+        kompute-shaders/op_cpy_f32_f16.comp
+        kompute-shaders/op_cpy_f32_f32.comp
+    )
+
+    # Create a custom target for our generated shaders
+    add_custom_target(generated_shaders DEPENDS
+        shaderop_scale.h
+        shaderop_scale_8.h
+        shaderop_add.h
+        shaderop_addrow.h
+        shaderop_mul.h
+        shaderop_silu.h
+        shaderop_relu.h
+        shaderop_gelu.h
+        shaderop_softmax.h
+        shaderop_norm.h
+        shaderop_rmsnorm.h
+        shaderop_diagmask.h
+        shaderop_mul_mat_mat_f32.h
+        shaderop_mul_mat_f16.h
+        shaderop_mul_mat_q8_0.h
+        shaderop_mul_mat_q4_0.h
+        shaderop_mul_mat_q4_1.h
+        shaderop_mul_mat_q4_k.h
+        shaderop_mul_mat_q6_k.h
+        shaderop_getrows_f32.h
+        shaderop_getrows_f16.h
+        shaderop_getrows_q4_0.h
+        shaderop_getrows_q4_1.h
+        shaderop_getrows_q6_k.h
+        shaderop_rope_norm_f16.h
+        shaderop_rope_norm_f32.h
+        shaderop_rope_neox_f16.h
+        shaderop_rope_neox_f32.h
+        shaderop_cpy_f16_f16.h
+        shaderop_cpy_f16_f32.h
+        shaderop_cpy_f32_f16.h
+        shaderop_cpy_f32_f32.h
+    )
+
+    # Create a custom command that depends on the generated_shaders
+    add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+        COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+        DEPENDS generated_shaders
+        COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
+    )
+
+    # Add the stamp to the main sources to ensure dependency tracking
+    target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+else()
+    message(WARNING "Kompute not found")
+endif()
diff --git a/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
similarity index 80%
rename from ggml-kompute.cpp
rename to ggml/src/ggml-kompute/ggml-kompute.cpp
index e740a76d1..505792271 100644
--- a/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-kompute.h"
@@ -20,14 +20,18 @@
 #include "shaderop_mul_mat_q8_0.h"
 #include "shaderop_mul_mat_q4_0.h"
 #include "shaderop_mul_mat_q4_1.h"
+#include "shaderop_mul_mat_q4_k.h"
 #include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_mul_mat_mat_f32.h"
+#include "shaderop_getrows_f32.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
 #include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope_f16.h"
-#include "shaderop_rope_f32.h"
+#include "shaderop_rope_norm_f16.h"
+#include "shaderop_rope_norm_f32.h"
+#include "shaderop_rope_neox_f16.h"
+#include "shaderop_rope_neox_f32.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
 #include "shaderop_cpy_f32_f16.h"
@@ -41,6 +45,7 @@
 #include <cstring>
 #include <iostream>
 #include <memory>
+#include <mutex>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
@@ -272,18 +277,9 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
     return results;
 }
 
-// public API returns a C-style array
-ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) {
-    auto devices = ggml_vk_available_devices_internal(memoryRequired);
-    *count = devices.size();
-    if (devices.empty()) {
-        return nullptr;
-    }
-
-    size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
-    auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
-    memcpy(arr, devices.data(), nbytes);
-    return arr;
+static std::vector<ggml_vk_device>& ggml_vk_available_devices() {
+    static std::vector<ggml_vk_device> devices = ggml_vk_available_devices_internal(0);
+    return devices;
 }
 
 static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
@@ -340,7 +336,7 @@ ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
 
-    auto devices = ggml_vk_available_devices_internal(0);
+    auto devices = ggml_vk_available_devices();
     ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
     GGML_ASSERT(!devices.empty());
     return devices.front();
@@ -351,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
         vk::DescriptorPoolSize(
           vk::DescriptorType::eStorageBuffer,
-          3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+          4 * size // Descriptor count is number of possible tensors to pass into an algorithm
           )
     };
 
@@ -565,7 +561,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
     }
     if ((a % b) != 0) {
         fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
-        GGML_ASSERT(!"safe_divide result would've had remainder");
+        GGML_ABORT("safe_divide result would've had remainder");
     }
     return a / b;
 }
@@ -794,7 +790,8 @@ static void ggml_vk_soft_max(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
-    float scale
+    float scale, float max_bias, float m0, float m1,
+    uint32_t n_head_log2
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
         kp::shader_data::op_softmax_comp_spv_len);
@@ -802,12 +799,14 @@ static void ggml_vk_soft_max(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        float scale;
+        float scale, max_bias, m0, m1;
+        uint32_t n_head_log2;
         int32_t mask;
     } pushConsts {
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        scale,
+        scale, max_bias, m0, m1,
+        n_head_log2,
         bool(inB)
     };
 
@@ -917,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02,
-    uint32_t nb00, uint32_t nb01, uint32_t nb02,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
-    uint32_t nb10, uint32_t nb11, uint32_t nb12,
+    uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
     int32_t ne0, int32_t ne1,
     uint32_t r2, uint32_t r3
 ) {
@@ -929,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        uint32_t nb00, nb01, nb02;
+        uint32_t nb00, nb01, nb02, nb03;
         int32_t ne10, ne11, ne12;
-        uint32_t nb10, nb11, nb12;
+        uint32_t nb10, nb11, nb12, nb13;
         int32_t ne0, ne1;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        nb00, nb01, nb02,
+        nb00, nb01, nb02, nb03,
         ne10, ne11, ne12,
-        nb10, nb11, nb12,
+        nb10, nb11, nb12, nb13,
         ne0, ne1,
         r2, r3
     };
@@ -1019,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
     int32_t ne00, int32_t ne01, int32_t ne02,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
     int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
     uint32_t r2, uint32_t r3
 ) {
     struct PushConstants {
@@ -1026,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
         int32_t ne00, ne01, ne02;
         int32_t ne10, ne12;
         int32_t ne0, ne1;
+        uint32_t nb01, nb02, nb03;
+        uint32_t nb11, nb12, nb13;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
         ne10, ne12,
         ne0, ne1,
+        nb01, nb02, nb03,
+        nb11, nb12, nb13,
         r2, r3
     };
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
@@ -1074,34 +1079,84 @@ static void ggml_vk_mul_mat_q8_0(Args&&... args) {
     ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
+static void ggml_vk_mul_mat_q4_k(
+    kp::Sequence& seq,
+    const std::shared_ptr<kp::Tensor>& inA,
+    const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& out,
+    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
+) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
+        kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
+
+    struct PushConstants {
+        uint32_t inAOff, inBOff, outOff;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
+    } pushConsts {
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
+    };
+
+    std::shared_ptr<kp::Algorithm> s_algo = nullptr;
+    if (!komputeManager()->hasAlgorithm(__func__)) {
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts});
+    } else {
+        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo->setTensors({inA, inB, out});
+        s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)});
+        s_algo->setPushConstants<PushConstants>({pushConsts});
+        s_algo->updateDescriptors(s_kompute_context->pool.get());
+    }
+    seq.record<kp::OpAlgoDispatch>(s_algo);
+}
+
 static void ggml_vk_mul_mat_q6_k(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
         kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
 
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
         inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        const uint32_t local_x = 2;
+        const uint32_t local_y = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
@@ -1146,6 +1201,14 @@ static void ggml_vk_get_rows(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
+template <typename... Args>
+static void ggml_vk_get_rows_f32(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
+        kp::shader_data::op_getrows_f32_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+}
+
 template <typename... Args>
 static void ggml_vk_get_rows_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
@@ -1181,10 +1244,11 @@ static void ggml_vk_rope(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& inC,
     const std::shared_ptr<kp::Tensor>& out,
-    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx,
-    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
+    uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
+    ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
+    float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
     int32_t ne01, int32_t ne02, int32_t ne03,
     uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne0,
@@ -1192,11 +1256,17 @@ static void ggml_vk_rope(
 ) {
     GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
 
-    static const auto spirv_f16 = getSpirvShader(
-        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
+    static const auto spirv_norm_f16 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
     );
-    static const auto spirv_f32 = getSpirvShader(
-        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
+    static const auto spirv_norm_f32 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
+    );
+    static const auto spirv_neox_f16 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
+    );
+    static const auto spirv_neox_f32 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
     );
 
     int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@@ -1211,32 +1281,40 @@ static void ggml_vk_rope(
     GGML_ASSERT(nb0  % type_size == 0);
 
     struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
-        int32_t n_dims, mode, n_orig_ctx;
-        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+        uint32_t inAOff, inBOff, inCOff, outOff;
+        int32_t n_dims, mode, n_ctx_orig;
+        float freq_base, freq_scale;
+        bool has_freq_factors;
+        float ext_factor, attn_factor, beta_fast, beta_slow;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
-        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
-        n_dims, mode, n_orig_ctx,
-        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
+        n_dims, mode, n_ctx_orig,
+        freq_base, freq_scale,
+        has_freq_factors,
+        ext_factor, attn_factor, beta_fast, beta_slow,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
     };
 
-    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
+    auto & inC_ = inC ? inC : inA;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_f16 = src0t == GGML_TYPE_F16;
+
+    auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
+        auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
         s_algo = komputeManager()->algorithm<float, PushConstants>(
-            name, s_kompute_context->pool.get(), {inA, inB, out},
-            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
+            name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
             {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
         );
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
-        s_algo->setTensors({inA, inB, out});
+        s_algo->setTensors({inA, inB, inC_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1314,24 +1392,18 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
     ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
 }
 
-static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
-    switch (op->type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            break;
-        default:
-            return false;
-    }
-
+static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    int64_t n = ggml_nelements(op);
     switch (op->op) {
         case GGML_OP_UNARY:
+            if (n % 4 != 0) return false;
             switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU:
+                    if (n % 8 != 0) return false;
+                    // fall through
+                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_SILU:
-                    return true;
+                    return ggml_is_contiguous(op->src[0]);
                 default:
                     ;
             }
@@ -1347,8 +1419,18 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_DUP:
         case GGML_OP_CPY:
         case GGML_OP_CONT:
@@ -1371,6 +1453,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             return op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
             switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
@@ -1386,12 +1469,13 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
 
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q6_K:
                     return op->ne[3] == 1;
+                case GGML_TYPE_Q6_K:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q4_K:
                     return true;
                 default:
                     ;
@@ -1400,6 +1484,8 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             ;
     }
     return false;
+
+    GGML_UNUSED(dev);
 }
 
 static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
@@ -1427,9 +1513,14 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
         for (int i = node_start; i < node_end; ++i) {
             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
             struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
             struct ggml_tensor * dst = gf->nodes[i];
             GGML_ASSERT(dst->data != nullptr);
 
+            if (ggml_is_empty(dst)) {
+                continue;
+            }
+
             switch (dst->op) {
                 case GGML_OP_NONE:
                 case GGML_OP_RESHAPE:
@@ -1443,11 +1534,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 
             any_commands_recorded = true;
 
-            if (!ggml_vk_supports_op(dst)) {
-                 fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                 GGML_ASSERT(!"unsupported op");
-            }
-
             const int32_t ne00 = src0 ? src0->ne[0] : 0;
             const int32_t ne01 = src0 ? src0->ne[1] : 0;
             const int32_t ne02 = src0 ? src0->ne[2] : 0;
@@ -1485,9 +1571,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
             uint32_t off_src0 = 0;
             uint32_t off_src1 = 0;
+            uint32_t off_src2 = 0;
             uint32_t off_dst  = 0;
             const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
             const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
             const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst,  &off_dst)  : nullTensor;
 
             switch (dst->op) {
@@ -1547,15 +1635,32 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                             default:
                                 {
                                     fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                    GGML_ASSERT(false);
+                                    GGML_ABORT("fatal error");
                                 }
                         }
                     } break;
                 case GGML_OP_SOFT_MAX:
                     {
                         float scale;
-                        memcpy(&scale, dst->op_params, sizeof(float));
-                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
+                        float max_bias;
+
+                        memcpy(&scale,    (float *)dst->op_params + 0, sizeof(float));
+                        memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
+#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5021")
+                        GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
+
+                        const int64_t nrows_x = ggml_nrows(src0);
+                        const int64_t nrows_y = src0->ne[1];
+
+                        const uint32_t n_head      = nrows_x/nrows_y;
+                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
@@ -1580,7 +1685,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     {
                         GGML_ASSERT(ne00 == ne10);
 
-                        // TODO: assert that dim2 and dim3 are contiguous
                         GGML_ASSERT(ne12 % ne02 == 0);
                         GGML_ASSERT(ne13 % ne03 == 0);
 
@@ -1608,32 +1712,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                             case GGML_TYPE_F16:
                                 ggml_vk_mul_mat_f16(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
+                                    ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                    ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
                                     ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q8_0:
                                 ggml_vk_mul_mat_q8_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_0:
                                 ggml_vk_mul_mat_q4_0(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_1:
                                 ggml_vk_mul_mat_q4_1(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
+                                );
+                                break;
+                            case GGML_TYPE_Q4_K:
+                                ggml_vk_mul_mat_q4_k(
+                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q6_K:
                                 ggml_vk_mul_mat_q6_k(
                                     seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                    ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
+                                    ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                    nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                                 );
                                 break;
                             default: {
@@ -1645,7 +1761,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     } break;
                 case GGML_OP_GET_ROWS:
                     {
-                        if (src0t == GGML_TYPE_F16) {
+                        if (src0t == GGML_TYPE_F32) {
+                            ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                        } else if (src0t == GGML_TYPE_F16) {
                             ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_Q4_0) {
                             ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
@@ -1666,7 +1784,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         const int n_dims     = ((int32_t *) dst->op_params)[1];
                         const int mode       = ((int32_t *) dst->op_params)[2];
                         // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
-                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+                        const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+                        const bool has_freq_factors = dst->src[2] != nullptr;
 
                         float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                         memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
@@ -1676,8 +1796,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                         memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
                         ggml_vk_rope(
-                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx,
-                            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+                            seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
+                            freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
                             ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                         );
                     } break;
@@ -1710,7 +1830,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             continue;
             not_implemented: {}
             fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-            //GGML_ASSERT(false);
+            //GGML_ABORT("fatal error");
         }
 
         // Evaluate sequence
@@ -1785,11 +1905,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
     }
 }
 
-static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
-    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
-    return ctx->name.c_str();
-}
-
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     auto * memory = (ggml_vk_memory *)buffer->context;
     if (ggml_vk_has_device()) {
@@ -1833,10 +1948,10 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
-    /* .get_name        = */ ggml_backend_kompute_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_kompute_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_kompute_buffer_get_base,
     /* .init_tensor     = */ NULL,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_kompute_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_kompute_buffer_get_tensor,
     /* .cpy_tensor      = */ NULL,
@@ -1867,40 +1982,41 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }
 
-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
 
 ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
-    static std::vector<ggml_backend_buffer_type> bufts = []() {
-        std::vector<ggml_backend_buffer_type> vec;
-        auto devices = ggml_vk_available_devices_internal(0);
-        vec.reserve(devices.size());
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
 
-        for (const auto & dev : devices) {
-            vec.push_back({
-                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
-                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
-            });
+    auto devices = ggml_vk_available_devices();
+    int32_t device_count = (int32_t) devices.size();
+    GGML_ASSERT(device < device_count);
+    GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES);
+
+    static ggml_backend_buffer_type
+        ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES];
+
+    static bool ggml_backend_kompute_buffer_type_initialized = false;
+
+    if (!ggml_backend_kompute_buffer_type_initialized) {
+        for (int32_t i = 0; i < device_count; i++) {
+            ggml_backend_kompute_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_kompute_buffer_type_interface,
+                /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i),
+                /* .context  = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc },
+            };
         }
-        return vec;
-    }();
+        ggml_backend_kompute_buffer_type_initialized = true;
+    }
 
-    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
-        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
-    });
-    return it < bufts.end() ? &*it : nullptr;
+    return &ggml_backend_kompute_buffer_types[device];
 }
 
 // backend
@@ -1922,35 +2038,26 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-    return ggml_backend_kompute_buffer_type(ctx->device);
-}
-
-static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
-    return true;
-}
-
-static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    GGML_UNUSED(backend);
-    return ggml_vk_supports_op(op);
+    return GGML_STATUS_SUCCESS;
 }
 
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name                = */ ggml_backend_kompute_name,
     /* .free                    = */ ggml_backend_kompute_free,
-    /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
     /* .set_tensor_async        = */ NULL,
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_async        = */ NULL,
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
-    /* .supports_op             = */ ggml_backend_kompute_supports_op,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_kompute_guid() {
@@ -1965,6 +2072,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_kompute_guid(),
         /* .interface = */ kompute_backend_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device),
         /* .context   = */ s_kompute_context,
     };
 
@@ -1975,22 +2083,169 @@ bool ggml_backend_is_kompute(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
 }
 
-static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
-    GGML_UNUSED(params);
-    return ggml_backend_kompute_init(intptr_t(user_data));
-}
-
-extern "C" int ggml_backend_kompute_reg_devices();
-
-int ggml_backend_kompute_reg_devices() {
-    auto devices = ggml_vk_available_devices_internal(0);
-    for (const auto & device : devices) {
-        ggml_backend_register(
-            ggml_kompute_format_name(device.index).c_str(),
-            ggml_backend_reg_kompute_init,
-            ggml_backend_kompute_buffer_type(device.index),
-            reinterpret_cast<void *>(intptr_t(device.index))
-        );
-    }
+static size_t ggml_backend_kompute_get_device_count() {
+    auto devices = ggml_vk_available_devices();
     return devices.size();
 }
+
+static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) {
+    auto devices = ggml_vk_available_devices();
+    GGML_ASSERT((size_t) device < devices.size());
+    snprintf(description, description_size, "%s", devices[device].name);
+}
+
+static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
+    auto devices = ggml_vk_available_devices();
+    GGML_ASSERT((size_t) device < devices.size());
+    *total = devices[device].heapSize;
+    *free = devices[device].heapSize;
+}
+
+//////////////////////////
+
+struct ggml_backend_kompute_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    ggml_backend_kompute_get_device_memory(ctx->device, free, total);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ggml_backend_kompute_buffer_type(ctx->device);
+}
+
+static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) {
+        return false;
+    }
+
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context;
+
+    return buft_ctx->device == ctx->device;
+}
+
+static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_kompute_device_get_name(dev);
+    props->description = ggml_backend_kompute_device_get_description(dev);
+    props->type        = ggml_backend_kompute_device_get_type(dev);
+    ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* async                  = */ false,
+        /* host_buffer            = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* events                 = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
+    return ggml_backend_kompute_init(ctx->device);
+}
+
+static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+
+    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_kompute_device_i = {
+    /* .get_name             = */ ggml_backend_kompute_device_get_name,
+    /* .get_description      = */ ggml_backend_kompute_device_get_description,
+    /* .get_memory           = */ ggml_backend_kompute_device_get_memory,
+    /* .get_type             = */ ggml_backend_kompute_device_get_type,
+    /* .get_props            = */ ggml_backend_kompute_device_get_props,
+    /* .init_backend         = */ ggml_backend_kompute_device_init,
+    /* .get_buffer_type      = */ ggml_backend_kompute_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op          = */ ggml_backend_kompute_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_kompute_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_kompute_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return "Kompute";
+}
+
+static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return ggml_backend_kompute_get_device_count();
+}
+
+static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) {
+                ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context;
+                char desc[256];
+                ggml_backend_kompute_get_device_description(i, desc, sizeof(desc));
+                ctx->device = i;
+                ctx->name = "Kompute" + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface   = */ ggml_backend_kompute_device_i,
+                    /* .reg     = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
+        }
+    }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
+    /* .get_name         = */ ggml_backend_kompute_reg_get_name,
+    /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_kompute_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_kompute_reg() {
+    static ggml_backend_reg reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_kompute_reg_i,
+        /* .context     = */ nullptr,
+    };
+
+    return &reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
diff --git a/kompute b/ggml/src/ggml-kompute/kompute
similarity index 100%
rename from kompute
rename to ggml/src/ggml-kompute/kompute
diff --git a/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp
similarity index 93%
rename from kompute-shaders/common.comp
rename to ggml/src/ggml-kompute/kompute-shaders/common.comp
index 62d62b025..dbe4cf804 100644
--- a/kompute-shaders/common.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/common.comp
@@ -3,6 +3,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 #extension GL_EXT_control_flow_attributes: enable
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
@@ -15,6 +16,7 @@
 #define TWOPI_F 6.283185307179586f
 
 #define QK_K 256
+#define K_SCALE_SIZE 12
 
 #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
 #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
@@ -64,6 +66,14 @@ mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
     return reg;
 }
 
+#define sizeof_block_q4_k 144
+struct block_q4_k {
+    float16_t d;
+    float16_t dmin;
+    uint8_t scales[K_SCALE_SIZE];
+    uint8_t qs[QK_K/2];
+};
+
 #define sizeof_block_q6_k 210
 struct block_q6_k {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits
diff --git a/kompute-shaders/op_add.comp b/ggml/src/ggml-kompute/kompute-shaders/op_add.comp
similarity index 100%
rename from kompute-shaders/op_add.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_add.comp
diff --git a/kompute-shaders/op_addrow.comp b/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp
similarity index 100%
rename from kompute-shaders/op_addrow.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp
diff --git a/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f16_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp
diff --git a/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f16_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp
diff --git a/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f32_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp
diff --git a/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp
similarity index 100%
rename from kompute-shaders/op_cpy_f32_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp
diff --git a/kompute-shaders/op_diagmask.comp b/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp
similarity index 100%
rename from kompute-shaders/op_diagmask.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp
diff --git a/kompute-shaders/op_gelu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp
similarity index 100%
rename from kompute-shaders/op_gelu.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp
diff --git a/kompute-shaders/op_getrows.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp
similarity index 100%
rename from kompute-shaders/op_getrows.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp
diff --git a/kompute-shaders/op_getrows_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp
similarity index 100%
rename from kompute-shaders/op_getrows_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp
new file mode 100644
index 000000000..9d7acdaf8
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp
@@ -0,0 +1,31 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { float inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    for (int j = 0; j < k; j++) {
+        out_[y + j] = inA[x + j];
+    }
+}
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
+}
diff --git a/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp
similarity index 100%
rename from kompute-shaders/op_getrows_q4_0.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp
diff --git a/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp
similarity index 100%
rename from kompute-shaders/op_getrows_q4_1.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp
diff --git a/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp
similarity index 100%
rename from kompute-shaders/op_getrows_q6_k.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp
diff --git a/kompute-shaders/op_mul.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp
similarity index 100%
rename from kompute-shaders/op_mul.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul.comp
diff --git a/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
similarity index 91%
rename from kompute-shaders/op_mul_mat_f16.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
index 8f0a9031f..0ab1b2fc2 100644
--- a/kompute-shaders/op_mul_mat_f16.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
@@ -20,12 +20,14 @@ layout (push_constant) uniform parameter {
     uint nb00;
     uint nb01;
     uint nb02;
+    uint nb03;
     int ne10;
     int ne11;
     int ne12;
     uint nb10;
     uint nb11;
     uint nb12;
+    uint nb13;
     int ne0;
     int ne1;
     uint r2;
@@ -42,7 +44,7 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
+    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
 
     const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
 
@@ -52,7 +54,7 @@ void main() {
             break;
         }
 
-        const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
+        const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
         float sumf = 0;
         for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
diff --git a/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_mat_f32.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp
diff --git a/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q4_0.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp
diff --git a/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q4_1.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
new file mode 100644
index 000000000..a5752a3a0
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
@@ -0,0 +1,140 @@
+#version 450
+
+#include "common.comp"
+
+#define N_DST 4
+#define SIZE_OF_BLOCK sizeof_block_q4_k
+
+layout(local_size_x = 4) in;
+layout(local_size_y = 8) in;
+layout(local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
+layout (binding = 1) readonly buffer tensorInB { float inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int ne10;
+    int ne0;
+    int ne1;
+    int ne01;
+    int ne02;
+    int ne12;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
+} pcs;
+
+void main() {
+    const uint16_t kmask1 = uint16_t(0x3f3f);
+    const uint16_t kmask2 = uint16_t(0x0f0f);
+    const uint16_t kmask3 = uint16_t(0xc0c0);
+
+    const uint ix = gl_SubgroupInvocationID/8;  // 0...3
+    const uint it = gl_SubgroupInvocationID%8;  // 0...7
+    const uint iq = it/4;     // 0 or 1
+    const uint ir = it%4;     // 0...3
+
+    const uint nb = pcs.ne00/QK_K;
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+    const uint im = gl_WorkGroupID.z;
+
+    const uint first_row = r0 * N_DST;
+    const uint ib_row = first_row * nb;
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint offset1 =        r1*pcs.nb11 + (i12       )*pcs.nb12 + (i13       )*pcs.nb13;
+
+    const uint xblk = offset0 + pcs.inAOff;
+    const uint y = (offset1 / 4) + pcs.inBOff;
+
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
+    float all_sum = 0.f;
+
+    uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
+
+    for (uint ib = ix; ib < nb; ib += 4) {
+        const uint blk_idx = ib + xblk;
+
+        float sumy[4] = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = inB[y4+i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
+
+            uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
+            uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
+            uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
+            uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
+            uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
+
+            uint16_t sc16[4];
+            sc16[0] = sc_0 & kmask1;
+            sc16[1] = sc_2 & kmask1;
+            sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
+            sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
+
+            float acc1[4] = {0.f, 0.f, 0.f, 0.f};
+            float acc2[4] = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
+                uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
+                acc1[0] += yl[i+0] * (q1 & 0x000F);
+                acc1[1] += yl[i+1] * (q1 & 0x0F00);
+                acc1[2] += yl[i+8] * (q1 & 0x00F0);
+                acc1[3] += yl[i+9] * (q1 & 0xF000);
+                acc2[0] += yh[i+0] * (q2 & 0x000F);
+                acc2[1] += yh[i+1] * (q2 & 0x0F00);
+                acc2[2] += yh[i+8] * (q2 & 0x00F0);
+                acc2[3] += yh[i+9] * (q2 & 0xF000);
+            }
+
+            uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
+            uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
+            uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
+            uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
+            uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
+            uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
+            uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
+            uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
+
+            float dall = float(inA[blk_idx + row_idx].d);
+            float dmin = float(inA[blk_idx + row_idx].dmin);
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
+                               (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
+                               (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
+                               (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
+                dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = subgroupAdd(sumf[row]);
+        if (subgroupElect()) {
+            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
+        }
+    }
+}
diff --git a/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
similarity index 86%
rename from kompute-shaders/op_mul_mat_q6_k.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
index c9baebdf4..d331d1a70 100644
--- a/kompute-shaders/op_mul_mat_q6_k.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
@@ -21,7 +21,16 @@ layout (push_constant) uniform parameter {
     int ne0;
     int ne1;
     int ne01;
-    int gqa;
+    int ne02;
+    int ne12;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;
 
 void main() {
@@ -34,12 +43,15 @@ void main() {
 
     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
-    const uint r2 = gl_WorkGroupID.z;
+    const uint im = gl_WorkGroupID.z;
 
     const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
-    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
-    const uint x = row * nb + offset0; // Based from inA without base offset
-    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
     float sumf = 0;
 
@@ -89,6 +101,6 @@ void main() {
 
     const float tot = subgroupAdd(sumf);
     if (subgroupElect()) {
-        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+        out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
     }
 }
diff --git a/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp
similarity index 100%
rename from kompute-shaders/op_mul_mat_q8_0.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp
diff --git a/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
similarity index 76%
rename from kompute-shaders/op_mul_mv_q_n.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
index 440b5ab2c..a6517cc1f 100644
--- a/kompute-shaders/op_mul_mv_q_n.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
@@ -14,10 +14,15 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;
 
-    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    // pointers to src0 rows
+    uint ax[N_ROWS];
+    for (int row = 0; row < N_ROWS; ++row) {
+        const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
 
-    const uint x = offset0; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+        ax[row] = offset0 + pcs.inAOff;
+    }
+
+    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
     float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
 
@@ -32,8 +37,7 @@ void main() {
 
     for (uint ib = ix; ib < nb; ib += 16) {
         for (int row = 0; row < N_ROWS; row++) {
-            const uint block_index = x + ib + row * nb;
-            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+            sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
         }
 
         yb += BLOCKS_IN_QUANT * 16;
diff --git a/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
similarity index 80%
rename from kompute-shaders/op_mul_mv_q_n_pre.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
index 7912b09ac..a9a2f2218 100644
--- a/kompute-shaders/op_mul_mv_q_n_pre.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
@@ -1,5 +1,5 @@
 layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
+layout(local_size_y = 8) in;
 layout(local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -17,6 +17,12 @@ layout (push_constant) uniform parameter {
     int  ne12;
     int  ne0;
     int  ne1;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
     uint r2;
     uint r3;
 } pcs;
diff --git a/kompute-shaders/op_norm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp
similarity index 100%
rename from kompute-shaders/op_norm.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_norm.comp
diff --git a/kompute-shaders/op_relu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp
similarity index 100%
rename from kompute-shaders/op_relu.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_relu.comp
diff --git a/kompute-shaders/op_rmsnorm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp
similarity index 100%
rename from kompute-shaders/op_rmsnorm.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
new file mode 100644
index 000000000..63659cbfe
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+pcs.n_dims/2]);
+
+            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
new file mode 100644
index 000000000..4df56204d
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+pcs.n_dims/2];
+
+            out_[dst_data]              = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
new file mode 100644
index 000000000..a3c0eda8b
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+1]);
+
+            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp
new file mode 100644
index 000000000..b7963ae72
--- /dev/null
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+1];
+
+            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
diff --git a/kompute-shaders/op_scale.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp
similarity index 100%
rename from kompute-shaders/op_scale.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_scale.comp
diff --git a/kompute-shaders/op_scale_8.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp
similarity index 100%
rename from kompute-shaders/op_scale_8.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp
diff --git a/kompute-shaders/op_silu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp
similarity index 100%
rename from kompute-shaders/op_silu.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_silu.comp
diff --git a/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
similarity index 78%
rename from kompute-shaders/op_softmax.comp
rename to ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
index 7bc9176ca..4165295bf 100644
--- a/kompute-shaders/op_softmax.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
@@ -18,6 +18,10 @@ layout(push_constant) uniform PushConstants {
     int ne01;
     int ne02;
     float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint n_head_log2;
     int mask;
 } pcs;
 
@@ -34,17 +38,29 @@ void main() {
     const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
     const uint pdst = extra_off + pcs.outOff; // Based from out_
 
+    float slope = 1.0f;
+
+    // ALiBi
+    if (pcs.max_bias > 0.0f) {
+        int64_t h = i02;
+
+        float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
+        int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
+
+        slope = pow(base, float(exp));
+    }
+
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
+        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
     }
     float max_ = subgroupMax(localMax);
 
     // parallel sum
     float localSum = 0.0f;
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
+        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }
diff --git a/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
similarity index 84%
rename from kompute-shaders/rope_common.comp
rename to ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
index 57ba6597a..0fca640dc 100644
--- a/kompute-shaders/rope_common.comp
+++ b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
@@ -1,17 +1,21 @@
 #include "common.comp"
 
+#define GGML_ROPE_TYPE_NEOX 2
+
 // TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 
 layout (push_constant) uniform parameter {
     uint inAOff;
     uint inBOff;
+    uint inCOff;
     uint outOff;
     int n_dims;
     int mode;
-    int n_orig_ctx;
+    int n_ctx_orig;
     float freq_base;
     float freq_scale;
+    bool has_freq_factors;
     float ext_factor;
     float attn_factor;
     float beta_fast;
@@ -54,14 +58,14 @@ void rope_yarn(
 
 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
 // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base));
+float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
 }
 
 void rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
 ) {
     // start and end correction dims
-    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
-    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
+    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
 }
diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt
new file mode 100644
index 000000000..89fcde2fa
--- /dev/null
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@@ -0,0 +1,121 @@
+find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+
+message(STATUS "Metal framework found")
+
+ggml_add_backend_library(ggml-metal
+                         ggml-metal.m
+                        )
+
+target_link_libraries(ggml-metal PRIVATE
+                      ${FOUNDATION_LIBRARY}
+                      ${METAL_FRAMEWORK}
+                      ${METALKIT_FRAMEWORK}
+                      )
+
+if (GGML_METAL_NDEBUG)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+endif()
+
+if (GGML_METAL_USE_BF16)
+    add_compile_definitions(GGML_METAL_USE_BF16)
+endif()
+
+# copy metal files to bin directory
+configure_file(../ggml-common.h  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h     COPYONLY)
+configure_file(ggml-metal.metal  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal  COPYONLY)
+configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
+
+if (GGML_METAL_EMBED_LIBRARY)
+    enable_language(ASM)
+
+    add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
+
+    set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
+    set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+    set(METALLIB_IMPL   "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
+
+    file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+
+    # merge ggml-common.h and ggml-metal.metal into a single file
+    set(METALLIB_EMBED_ASM        "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+    set(METALLIB_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
+    set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
+
+    add_custom_command(
+        OUTPUT ${METALLIB_EMBED_ASM}
+        COMMAND echo "Embedding Metal library"
+        COMMAND sed -e '/__embed_ggml-common.h__/r         ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d'         < ${METALLIB_SOURCE}           > ${METALLIB_SOURCE_EMBED_TMP}
+        COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}'   -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
+        COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
+        DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
+        COMMENT "Generate assembly for embedded Metal library"
+    )
+
+    target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
+else()
+    if (GGML_METAL_SHADER_DEBUG)
+        # custom command to do the following:
+        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+        #
+        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+        #       disabling fast math is needed in order to pass tests/test-backend-ops
+        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+        set(XC_FLAGS -fno-fast-math -fno-inline -g)
+    else()
+        set(XC_FLAGS -O3)
+    endif()
+
+    # Append macOS metal versioning flags
+    if (GGML_METAL_MACOSX_VERSION_MIN)
+        message(STATUS "Adding  -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+        list   (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
+    endif()
+
+    if (GGML_METAL_STD)
+        message(STATUS "Adding  -std=${GGML_METAL_STD} flag to metal compilation")
+        list   (APPEND XC_FLAGS -std=${GGML_METAL_STD})
+    endif()
+
+    add_custom_command(
+        OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+        DEPENDS ggml-metal.metal ggml-common.h
+        COMMENT "Compiling Metal kernels"
+        )
+
+    # FIXME: only add to the ggml-metal target?
+    add_custom_target(
+        ggml-metal-lib ALL
+        DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        )
+endif() # GGML_METAL_EMBED_LIBRARY
+
+if (NOT GGML_METAL_EMBED_LIBRARY)
+    install(
+        FILES src/ggml-metal/ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+        install(
+            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        )
+endif()
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
new file mode 100644
index 000000000..e3dc25f16
--- /dev/null
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -0,0 +1,288 @@
+#ifndef GGML_METAL_IMPL
+#define GGML_METAL_IMPL
+
+// kernel argument structs
+//
+// - element counters (e.g. ne00) typically use int32_t to reduce register usage
+//   however, be careful from int overflows when using those in the kernel implementation
+//
+// - strides (e.g. nb00) use uint64_t
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  dim;
+} ggml_metal_kargs_concat;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    uint64_t offs;
+} ggml_metal_kargs_bin;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+} ggml_metal_kargs_repeat;
+
+typedef struct {
+    int64_t  ne00;
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int64_t  ne0;
+    int64_t  ne1;
+    int64_t  ne2;
+    int64_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+} ggml_metal_kargs_cpy;
+
+typedef struct {
+    int64_t  ne10;
+    int64_t  ne11;
+    int64_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    uint64_t offs;
+    bool     inplace;
+} ggml_metal_kargs_set;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  n_past;
+    int32_t  n_dims;
+    int32_t  n_ctx_orig;
+    float    freq_base;
+    float    freq_scale;
+    float    ext_factor;
+    float    attn_factor;
+    float    beta_fast;
+    float    beta_slow;
+} ggml_metal_kargs_rope;
+
+typedef struct {
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne11;
+    int32_t  ne_12_2; // assume K and V are same shape
+    int32_t  ne_12_3;
+    uint64_t nb_12_1;
+    uint64_t nb_12_2;
+    uint64_t nb_12_3;
+    uint64_t nb31;
+    int32_t  ne1;
+    int32_t  ne2;
+    float    scale;
+    float    max_bias;
+    float    m0;
+    float    m1;
+    uint16_t n_head_log2;
+    float    logit_softcap;
+} ggml_metal_kargs_flash_attn_ext;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne02;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int16_t  r2;
+    int16_t  r3;
+} ggml_metal_kargs_mul_mm;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int16_t  r2;
+    int16_t  r3;
+} ggml_metal_kargs_mul_mv;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    uint64_t nb13;
+    int32_t  ne0;
+    int32_t  ne1;
+    int16_t  r2;
+    int16_t  r3;
+    int16_t  nsg;
+    int16_t  nxpsg;
+    int16_t  r1ptg;
+} ggml_metal_kargs_mul_mv_ext;
+
+typedef struct {
+    int32_t  nei0;
+    int32_t  nei1;
+    uint64_t nbi1;
+    int32_t  ne00;
+    int32_t  ne02;
+    uint64_t nb01;
+    uint64_t nb02;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t  ne0;
+    int32_t  ne1;
+} ggml_metal_kargs_mul_mm_id;
+
+typedef struct {
+    int32_t  nei0;
+    int32_t  nei1;
+    uint64_t nbi1;
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    int32_t  ne10;
+    int32_t  ne11;
+    int32_t  ne12;
+    int32_t  ne13;
+    uint64_t nb10;
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t  ne0;
+    int32_t  ne1;
+    uint64_t nb1;
+} ggml_metal_kargs_mul_mv_id;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne00_4;
+    uint64_t nb01;
+    float    eps;
+} ggml_metal_kargs_norm;
+
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne00_4;
+    uint64_t nb01;
+    float    eps;
+} ggml_metal_kargs_rms_norm;
+
+#endif // GGML_METAL_IMPL
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
new file mode 100644
index 000000000..944d90af3
--- /dev/null
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -0,0 +1,4998 @@
+#import "ggml-metal.h"
+
+#import "ggml-impl.h"
+#import "ggml-backend-impl.h"
+#import "ggml-metal-impl.h"
+
+#import <Foundation/Foundation.h>
+
+#import <Metal/Metal.h>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 64
+
+// max number of MTLCommandBuffer used to submit a graph for processing
+#define GGML_METAL_MAX_COMMAND_BUFFERS 8
+
+#ifndef TARGET_OS_VISION
+#define TARGET_OS_VISION 0
+#endif
+
+// create residency sets only on macOS >= 15.0
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
+    TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
+
+// globals
+
+// overload of MTLGPUFamilyMetal3 (not available in some environments)
+static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+
+// initialized in ggml_backend_metal_reg
+static struct ggml_backend_reg    g_ggml_backend_metal_reg;
+static struct ggml_backend_device g_ggml_backend_metal_device;
+
+// information about a Metal device
+// note: assumes single GPU device - the default one
+// TODO: support multiple GPU devices
+static struct ggml_backend_metal_device_context {
+    id<MTLDevice> mtl_device;
+    int           mtl_device_ref_count;
+
+    bool has_simdgroup_reduction;
+    bool has_simdgroup_mm;
+    bool has_residency_sets;
+    bool has_bfloat;
+    bool use_bfloat;
+
+    char name[128];
+} g_ggml_ctx_dev_main = {
+    /*.mtl_device              =*/ nil,
+    /*.mtl_device_ref_count    =*/ 0,
+    /*.has_simdgroup_reduction =*/ false,
+    /*.has_simdgroup_mm        =*/ false,
+    /*.has_residency_sets      =*/ false,
+    /*.has_bfloat              =*/ false,
+    /*.use_bfloat              =*/ false,
+    /*.name                    =*/ "",
+};
+
+// acquire
+static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_device_context * ctx) {
+    assert(ctx != NULL);
+
+    if (ctx->mtl_device == nil) {
+        ctx->mtl_device = MTLCreateSystemDefaultDevice();
+    }
+
+    if (ctx->mtl_device) {
+        ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+        ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+
+        ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
+        ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
+        ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
+
+#if defined(GGML_METAL_USE_BF16)
+        ctx->use_bfloat = ctx->has_bfloat;
+#else
+        ctx->use_bfloat = false;
+#endif
+
+        strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1);
+    }
+
+    ctx->mtl_device_ref_count++;
+
+    return ctx->mtl_device;
+}
+
+// release
+static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_context * ctx) {
+    assert(ctx != NULL);
+    assert(ctx->mtl_device_ref_count > 0);
+
+    ctx->mtl_device_ref_count--;
+
+    if (ctx->mtl_device_ref_count == 0) {
+        if (ctx->mtl_device) {
+            [ctx->mtl_device release];
+            ctx->mtl_device = nil;
+        }
+    }
+}
+
+// kernels
+
+struct ggml_metal_kernel {
+    id<MTLComputePipelineState> pipeline;
+};
+
+enum ggml_metal_kernel_type {
+    GGML_METAL_KERNEL_TYPE_ADD,
+    GGML_METAL_KERNEL_TYPE_ADD_ROW,
+    GGML_METAL_KERNEL_TYPE_SUB,
+    GGML_METAL_KERNEL_TYPE_SUB_ROW,
+    GGML_METAL_KERNEL_TYPE_MUL,
+    GGML_METAL_KERNEL_TYPE_MUL_ROW,
+    GGML_METAL_KERNEL_TYPE_DIV,
+    GGML_METAL_KERNEL_TYPE_DIV_ROW,
+    GGML_METAL_KERNEL_TYPE_REPEAT_F32,
+    GGML_METAL_KERNEL_TYPE_REPEAT_F16,
+    GGML_METAL_KERNEL_TYPE_REPEAT_I32,
+    GGML_METAL_KERNEL_TYPE_REPEAT_I16,
+    GGML_METAL_KERNEL_TYPE_SCALE,
+    GGML_METAL_KERNEL_TYPE_SCALE_4,
+    GGML_METAL_KERNEL_TYPE_CLAMP,
+    GGML_METAL_KERNEL_TYPE_TANH,
+    GGML_METAL_KERNEL_TYPE_RELU,
+    GGML_METAL_KERNEL_TYPE_SIGMOID,
+    GGML_METAL_KERNEL_TYPE_GELU,
+    GGML_METAL_KERNEL_TYPE_GELU_4,
+    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
+    GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
+    GGML_METAL_KERNEL_TYPE_SILU,
+    GGML_METAL_KERNEL_TYPE_SILU_4,
+    GGML_METAL_KERNEL_TYPE_ELU,
+    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
+    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
+    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
+    GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,
+    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
+    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
+    GGML_METAL_KERNEL_TYPE_RMS_NORM,
+    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
+    GGML_METAL_KERNEL_TYPE_NORM,
+    GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
+    GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,
+  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
+    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
+    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
+    GGML_METAL_KERNEL_TYPE_IM2COL_F32,
+    GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
+    GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,
+    GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32,
+    GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
+    GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
+    GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
+    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
+    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
+    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
+    GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
+    GGML_METAL_KERNEL_TYPE_SET_I32,
+    GGML_METAL_KERNEL_TYPE_SET_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,
+    GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_BF16_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
+    GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
+    GGML_METAL_KERNEL_TYPE_CONCAT,
+    GGML_METAL_KERNEL_TYPE_SQR,
+    GGML_METAL_KERNEL_TYPE_SQRT,
+    GGML_METAL_KERNEL_TYPE_SIN,
+    GGML_METAL_KERNEL_TYPE_COS,
+    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
+    GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
+    GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
+    GGML_METAL_KERNEL_TYPE_ARGMAX,
+
+    GGML_METAL_KERNEL_TYPE_COUNT
+};
+
+struct ggml_backend_metal_context {
+    id<MTLCommandQueue> queue;
+
+    dispatch_queue_t d_queue;
+
+    struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
+
+    // capture state
+    bool capture_next_compute;
+    bool capture_started;
+
+    id<MTLCaptureScope> capture_scope;
+
+    // command buffer state
+    int n_cb;           // number of extra threads used to submit the command buffers
+    int n_nodes_0;      // number of nodes submitted by the main thread
+    int n_nodes_1;      // remaining number of nodes submitted by the n_cb threads
+    int n_nodes_per_cb;
+
+    struct ggml_cgraph * gf;
+
+    // the callback given to the thread pool
+    void (^encode_async)(size_t ith);
+
+    // n_cb command buffers + 1 used by the main thread
+    id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
+
+    // abort ggml_metal_graph_compute if callback returns true
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
+};
+
+// MSL code
+// TODO: move the contents here when ready
+//       for now it is easier to work in a separate file
+// static NSString * const msl_library_source = @"see metal.metal";
+
+// Here to assist with NSBundle Path Hack
+@interface GGMLMetalClass : NSObject
+@end
+@implementation GGMLMetalClass
+@end
+
+static void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+
+#if TARGET_OS_OSX
+    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+    if (err != KERN_SUCCESS) {
+        GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        return NULL;
+    }
+#else
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+    if (result != 0) {
+        GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+#endif
+
+    return data;
+}
+
+static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
+    GGML_LOG_INFO("%s: allocating\n", __func__);
+
+#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    for (id<MTLDevice> device in devices) {
+        GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
+    }
+    [devices release]; // since it was created by a *Copy* C method
+#endif
+
+    // init context
+    struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
+    struct ggml_backend_metal_device_context * ctx_dev = dev->context;
+
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
+
+    ctx->queue  = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
+    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
+
+    id<MTLLibrary> metal_library;
+
+    // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
+    {
+        NSBundle * bundle = nil;
+#ifdef SWIFT_PACKAGE
+        bundle = SWIFTPM_MODULE_BUNDLE;
+#else
+        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+#endif
+
+        NSError * error = nil;
+
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (path_lib == nil) {
+            // Try to find the resource in the directory where the current binary located.
+            NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
+            NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
+            NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
+            if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
+                GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
+                NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
+                if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
+                    // Optionally, if this is a symlink, try to resolve it.
+                    default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
+                    if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
+                        // It is a relative path, adding the binary directory as directory prefix.
+                        default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
+                    }
+                    if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
+                        // Link to the resource could not be resolved.
+                        default_metallib_path = nil;
+                    } else {
+                        GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
+                    }
+                }
+            } else {
+                // The resource couldn't be found in the binary's directory.
+                default_metallib_path = nil;
+            }
+            path_lib = default_metallib_path;
+        }
+
+        if (try_metallib && path_lib != nil) {
+            // pre-compiled library found
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+
+            metal_library = [device newLibraryWithURL:libURL error:&error];
+            if (error) {
+                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
+        } else {
+#if GGML_METAL_EMBED_LIBRARY
+            GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+            extern const char ggml_metallib_start[];
+            extern const char ggml_metallib_end[];
+
+            NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+#else
+            GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+
+            GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
+
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
+            } else {
+                path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+            }
+
+            if (path_source == nil) {
+                GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
+                path_source = @"ggml-metal.metal";
+            }
+
+            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
+            if (error) {
+                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
+#endif // GGML_METAL_EMBED_LIBRARY
+
+            @autoreleasepool {
+                // dictionary of preprocessor macros
+                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+
+                if (ctx_dev->use_bfloat) {
+                    [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
+                }
+
+#if GGML_METAL_EMBED_LIBRARY
+                [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
+#endif
+
+                MTLCompileOptions * options = [MTLCompileOptions new];
+                options.preprocessorMacros = prep;
+
+                //[options setFastMathEnabled:false];
+
+                metal_library = [device newLibraryWithSource:src options:options error:&error];
+                if (error) {
+                    GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    return NULL;
+                }
+
+#if !__has_feature(objc_arc)
+                [options release];
+#endif
+            }
+#if GGML_METAL_EMBED_LIBRARY
+            [src release];
+#endif // GGML_METAL_EMBED_LIBRARY
+        }
+    }
+
+    // print MTL GPU family:
+    GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, [[device name] UTF8String]);
+
+    // determine max supported GPU family
+    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+    {
+        for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
+            if ([device supportsFamily:i]) {
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
+                break;
+            }
+        }
+
+        for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
+            if ([device supportsFamily:i]) {
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
+                break;
+            }
+        }
+
+        for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) {
+            if ([device supportsFamily:i]) {
+                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i);
+                break;
+            }
+        }
+    }
+
+    GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction     ? "true" : "false");
+    GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm            ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets    = %s\n", __func__, ctx_dev->has_residency_sets          ? "true" : "false");
+    GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat                  ? "true" : "false");
+    GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat                  ? "true" : "false");
+    GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
+
+    ctx->capture_next_compute = false;
+    ctx->capture_started = false;
+    ctx->capture_scope = nil;
+
+    ctx->gf = nil;
+    ctx->encode_async = nil;
+    for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
+        ctx->command_buffers[i] = nil;
+    }
+
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6);
+    }
+#endif
+
+    // load kernels
+    {
+        NSError * error = nil;
+
+        for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
+            ctx->kernels[i].pipeline = nil;
+        }
+
+#define GGML_METAL_ADD_KERNEL(e, name, supported) \
+        if (supported) { \
+            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
+            id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
+            kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
+            GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
+                    (int) kernel->pipeline.threadExecutionWidth); \
+            [metal_function release]; \
+            if (error) { \
+                GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                [metal_library release]; \
+                return NULL; \
+            } \
+        } else { \
+            GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
+        }
+
+        const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
+        const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
+        const bool use_bfloat              = ctx_dev->use_bfloat;
+
+        // simd_sum and simd_max requires MTLGPUFamilyApple7
+
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                           add,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                       add_row,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB,                           sub,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW,                       sub_row,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                           mul,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                       mul_row,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                           div,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                       div_row,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32,                    repeat_f32,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16,                    repeat_f16,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32,                    repeat_i32,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I16,                    repeat_i16,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                         scale,                          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                       scale_4,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP,                         clamp,                          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                          tanh,                           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                          relu,                           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID,                       sigmoid,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                          gelu,                           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4,                        gelu_4,                         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                    gelu_quick,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,                  gelu_quick_4,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                          silu,                           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4,                        silu_4,                         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU,                           elu,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,                  soft_max_f16,                   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,                soft_max_f16_4,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,                  soft_max_f32,                   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4,                soft_max_f32_4,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,                 diag_mask_inf,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,               diag_mask_inf_8,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,                  get_rows_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,                  get_rows_f16,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16,                 get_rows_bf16,                  use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,                 get_rows_q4_0,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,                 get_rows_q4_1,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,                 get_rows_q5_0,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,                 get_rows_q5_1,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,                 get_rows_q8_0,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,                 get_rows_q2_K,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,                 get_rows_q3_K,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,                 get_rows_q4_K,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,                 get_rows_q5_K,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,                 get_rows_q6_K,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,              get_rows_iq2_xxs,               true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,               get_rows_iq2_xs,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,              get_rows_iq3_xxs,               true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,                get_rows_iq3_s,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S,                get_rows_iq2_s,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,                get_rows_iq1_s,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M,                get_rows_iq1_m,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,               get_rows_iq4_nl,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,               get_rows_iq4_xs,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,                  get_rows_i32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                      rms_norm,                       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                    group_norm,                     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                          norm,                           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                  ssm_conv_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                  ssm_scan_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                mul_mv_f32_f32,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,               mul_mv_bf16_f32,                has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,          mul_mv_bf16_f32_1row,           has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,            mul_mv_bf16_f32_l4,             has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,              mul_mv_bf16_bf16,               has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,                mul_mv_f16_f32,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,           mul_mv_f16_f32_1row,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,             mul_mv_f16_f32_l4,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,                mul_mv_f16_f16,                 has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,               mul_mv_q4_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,               mul_mv_q4_1_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,               mul_mv_q5_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,               mul_mv_q5_1_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,               mul_mv_q8_0_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,       mul_mv_ext_f16_f32_r1_2,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,       mul_mv_ext_f16_f32_r1_3,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,       mul_mv_ext_f16_f32_r1_4,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5,       mul_mv_ext_f16_f32_r1_5,        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2,      mul_mv_ext_q4_0_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3,      mul_mv_ext_q4_0_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4,      mul_mv_ext_q4_0_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5,      mul_mv_ext_q4_0_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2,      mul_mv_ext_q4_1_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3,      mul_mv_ext_q4_1_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4,      mul_mv_ext_q4_1_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5,      mul_mv_ext_q4_1_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2,      mul_mv_ext_q5_0_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3,      mul_mv_ext_q5_0_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4,      mul_mv_ext_q5_0_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5,      mul_mv_ext_q5_0_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2,      mul_mv_ext_q5_1_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3,      mul_mv_ext_q5_1_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4,      mul_mv_ext_q5_1_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5,      mul_mv_ext_q5_1_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2,      mul_mv_ext_q8_0_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,      mul_mv_ext_q8_0_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,      mul_mv_ext_q8_0_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,      mul_mv_ext_q8_0_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,      mul_mv_ext_q4_K_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,      mul_mv_ext_q4_K_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,      mul_mv_ext_q4_K_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5,      mul_mv_ext_q4_K_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2,      mul_mv_ext_q5_K_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3,      mul_mv_ext_q5_K_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4,      mul_mv_ext_q5_K_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5,      mul_mv_ext_q5_K_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2,      mul_mv_ext_q6_K_f32_r1_2,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3,      mul_mv_ext_q6_K_f32_r1_3,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4,      mul_mv_ext_q6_K_f32_r1_4,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5,      mul_mv_ext_q6_K_f32_r1_5,       has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2,    mul_mv_ext_iq4_nl_f32_r1_2,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3,    mul_mv_ext_iq4_nl_f32_r1_3,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4,    mul_mv_ext_iq4_nl_f32_r1_4,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5,    mul_mv_ext_iq4_nl_f32_r1_5,     has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,               mul_mv_q2_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,               mul_mv_q3_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,               mul_mv_q4_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,               mul_mv_q5_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,               mul_mv_q6_K_f32,                has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,            mul_mv_iq2_xxs_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,             mul_mv_iq2_xs_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,            mul_mv_iq3_xxs_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,              mul_mv_iq3_s_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32,              mul_mv_iq2_s_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,              mul_mv_iq1_s_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32,              mul_mv_iq1_m_f32,               has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,             mul_mv_iq4_nl_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32,             mul_mv_iq4_xs_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,             mul_mv_id_f32_f32,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,             mul_mv_id_f16_f32,              has_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,        mul_mv_id_f16_f32_1row,         has_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,          mul_mv_id_f16_f32_l4,           has_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,             mul_mv_id_f16_f16,              has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32,            mul_mv_id_bf16_f32,             has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,            mul_mv_id_q4_0_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,            mul_mv_id_q4_1_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,            mul_mv_id_q5_0_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,            mul_mv_id_q5_1_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,            mul_mv_id_q8_0_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,            mul_mv_id_q2_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,            mul_mv_id_q3_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,            mul_mv_id_q4_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,            mul_mv_id_q5_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,            mul_mv_id_q6_K_f32,             has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,         mul_mv_id_iq2_xxs_f32,          has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,          mul_mv_id_iq2_xs_f32,           has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,         mul_mv_id_iq3_xxs_f32,          has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,           mul_mv_id_iq3_s_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32,           mul_mv_id_iq2_s_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,           mul_mv_id_iq1_s_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,           mul_mv_id_iq1_m_f32,            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,          mul_mv_id_iq4_nl_f32,           has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,          mul_mv_id_iq4_xs_f32,           has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,                mul_mm_f32_f32,                 has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,                mul_mm_f16_f32,                 has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,               mul_mm_bf16_f32,                has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,               mul_mm_q4_0_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,               mul_mm_q4_1_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,               mul_mm_q5_0_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,               mul_mm_q5_1_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,               mul_mm_q8_0_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,               mul_mm_q2_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,               mul_mm_q3_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,               mul_mm_q4_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,               mul_mm_q5_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,               mul_mm_q6_K_f32,                has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,            mul_mm_iq2_xxs_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,             mul_mm_iq2_xs_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,            mul_mm_iq3_xxs_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,              mul_mm_iq3_s_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32,              mul_mm_iq2_s_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,              mul_mm_iq1_s_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,              mul_mm_iq1_m_f32,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,             mul_mm_iq4_nl_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,             mul_mm_iq4_xs_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,             mul_mm_id_f32_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,             mul_mm_id_f16_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,            mul_mm_id_bf16_f32,             has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,            mul_mm_id_q4_0_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,            mul_mm_id_q4_1_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,            mul_mm_id_q5_0_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,            mul_mm_id_q5_1_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,            mul_mm_id_q8_0_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,            mul_mm_id_q2_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,            mul_mm_id_q3_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,            mul_mm_id_q4_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,            mul_mm_id_q5_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,            mul_mm_id_q6_K_f32,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,         mul_mm_id_iq2_xxs_f32,          has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,          mul_mm_id_iq2_xs_f32,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,         mul_mm_id_iq3_xxs_f32,          has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,           mul_mm_id_iq3_s_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,           mul_mm_id_iq2_s_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,           mul_mm_id_iq1_s_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,           mul_mm_id_iq1_m_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,          mul_mm_id_iq4_nl_f32,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,          mul_mm_id_iq4_xs_f32,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                 rope_norm_f32,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                 rope_neox_f16,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                    im2col_f16,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,                im2col_ext_f32,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32,     conv_transpose_1d_f32_f32,      true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,     conv_transpose_1d_f16_f32,      true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,            pad_reflect_1d_f32,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,        timestep_embedding_f32,         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                    arange_f32,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,          argsort_f32_i32_desc,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,                leaky_relu_f32,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,         has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,         has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,         has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64,       flash_attn_ext_bf16_h64,        has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80,       flash_attn_ext_bf16_h80,        has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96,       flash_attn_ext_bf16_h96,        has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112,      flash_attn_ext_bf16_h112,       has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128,      flash_attn_ext_bf16_h128,       has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256,      flash_attn_ext_bf16_h256,       has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64,       flash_attn_ext_q4_0_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80,       flash_attn_ext_q4_0_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96,       flash_attn_ext_q4_0_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112,      flash_attn_ext_q4_0_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128,      flash_attn_ext_q4_0_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256,      flash_attn_ext_q4_0_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64,       flash_attn_ext_q4_1_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80,       flash_attn_ext_q4_1_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96,       flash_attn_ext_q4_1_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112,      flash_attn_ext_q4_1_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128,      flash_attn_ext_q4_1_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256,      flash_attn_ext_q4_1_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64,       flash_attn_ext_q5_0_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80,       flash_attn_ext_q5_0_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96,       flash_attn_ext_q5_0_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112,      flash_attn_ext_q5_0_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128,      flash_attn_ext_q5_0_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256,      flash_attn_ext_q5_0_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64,       flash_attn_ext_q5_1_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80,       flash_attn_ext_q5_1_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96,       flash_attn_ext_q5_1_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112,      flash_attn_ext_q5_1_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128,      flash_attn_ext_q5_1_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256,      flash_attn_ext_q5_1_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64,       flash_attn_ext_q8_0_h64,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80,       flash_attn_ext_q8_0_h80,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96,       flash_attn_ext_q8_0_h96,        has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112,      flash_attn_ext_q8_0_h112,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128,      flash_attn_ext_q8_0_h128,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,      flash_attn_ext_q8_0_h256,       has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,  flash_attn_ext_vec_bf16_h128,   has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,  flash_attn_ext_vec_q4_0_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128,  flash_attn_ext_vec_q4_1_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128,  flash_attn_ext_vec_q5_0_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128,  flash_attn_ext_vec_q5_1_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128,  flash_attn_ext_vec_q8_0_h128,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256,  flash_attn_ext_vec_bf16_h256,   has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256,  flash_attn_ext_vec_q4_0_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256,  flash_attn_ext_vec_q4_1_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,  flash_attn_ext_vec_q5_0_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,  flash_attn_ext_vec_q5_1_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,  flash_attn_ext_vec_q8_0_h256,   has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32,                       set_f32,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32,                       set_i32,                        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,                   cpy_f32_f32,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,                  cpy_f32_bf16,                   use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,                   cpy_f16_f32,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,                   cpy_f16_f16,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_F32,                  cpy_bf16_f32,                   use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16,                 cpy_bf16_bf16,                  use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,                  cpy_f32_q8_0,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,                  cpy_f32_q4_0,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,                  cpy_f32_q4_1,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,                  cpy_f32_q5_0,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,                  cpy_f32_q5_1,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,                cpy_f32_iq4_nl,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                        concat,                         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                           sqr,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,                          sqrt,                           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                           sin,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                           cos,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                      sum_rows,                       true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                        argmax,                         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,               pool_2d_avg_f32,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,               pool_2d_max_f32,                true);
+    }
+
+    [metal_library release];
+
+    return ctx;
+}
+
+static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
+    GGML_LOG_INFO("%s: deallocating\n", __func__);
+
+    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
+        [ctx->kernels[i].pipeline release];
+    }
+
+    Block_release(ctx->encode_async);
+
+    [ctx->queue release];
+
+    dispatch_release(ctx->d_queue);
+
+    free(ctx);
+}
+
+// temporarily defined here for compatibility between ggml-backend and the old API
+
+struct ggml_backend_metal_buffer {
+    void   * data;
+    size_t   size;
+
+    id<MTLBuffer> metal;
+};
+
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
+};
+
+// rset init
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
+// finds the Metal buffer that contains the tensor data on the GPU device
+// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
+// Metal buffer based on the host memory pointer
+//
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
+    //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
+    const int64_t tsize = ggml_nbytes(t);
+
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
+    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+    // find the view that contains the tensor fully
+    for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
+
+        //GGML_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+            *offs = (size_t) ioffs;
+
+            //GGML_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+            return buf_ctx->buffers[i].metal;
+        }
+    }
+
+    GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
+
+    return nil;
+}
+
+static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) {
+    const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
+    const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
+    const bool use_bfloat              = ctx_dev->use_bfloat;
+
+    if (!use_bfloat) {
+        for (size_t i = 0, n = 3; i < n; ++i) {
+            if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+                return false;
+            }
+        }
+    }
+
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_ELU:
+                    return ggml_is_contiguous(op->src[0]);
+                default:
+                    return false;
+            }
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_CONCAT:
+        case GGML_OP_ADD:
+        case GGML_OP_SUB:
+        case GGML_OP_ACC:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_REPEAT:
+        case GGML_OP_SCALE:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            return true;
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+            return ggml_is_contiguous(op->src[0]);
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_GROUP_NORM:
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
+        case GGML_OP_RMS_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
+        case GGML_OP_ARGMAX:
+            return true;
+        case GGML_OP_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
+        case GGML_OP_IM2COL:
+            return op->src[0]->type == GGML_TYPE_F16;
+        case GGML_OP_POOL_1D:
+            return false;
+        case GGML_OP_POOL_2D:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+            if (op->src[1]->type != op->src[2]->type) {
+                return false;
+            }
+            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
+        case GGML_OP_SSM_CONV:
+        case GGML_OP_SSM_SCAN:
+            return true;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            return has_simdgroup_reduction &&
+                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                        switch (op->type) {
+                           case GGML_TYPE_F32:
+                           case GGML_TYPE_F16:
+                           case GGML_TYPE_BF16:
+                           case GGML_TYPE_Q8_0:
+                           case GGML_TYPE_Q4_0:
+                           case GGML_TYPE_Q4_1:
+                           case GGML_TYPE_Q5_0:
+                           case GGML_TYPE_Q5_1:
+                           case GGML_TYPE_IQ4_NL:
+                                return true;
+                           default:
+                                return false;
+                        }
+                    case GGML_TYPE_F16:
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_F16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case GGML_TYPE_BF16:
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_BF16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    default:
+                        return false;
+                };
+            }
+        case GGML_OP_SET:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_I32:
+                        return true;
+                    default:
+                        return false;
+                };
+            }
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_GET_ROWS:
+            {
+                return op->ne[3] == 1;
+            }
+        default:
+            return false;
+    }
+}
+
+static void ggml_metal_encode_node(
+                        ggml_backend_t   backend,
+                                   int   idx,
+          id<MTLComputeCommandEncoder>   encoder) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    struct ggml_cgraph * gf = ctx->gf;
+
+    struct ggml_tensor * node = ggml_graph_node(gf, idx);
+
+    //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
+
+    struct ggml_tensor * src0 = node->src[0];
+    struct ggml_tensor * src1 = node->src[1];
+    struct ggml_tensor * src2 = node->src[2];
+    struct ggml_tensor * dst  = node;
+
+    if (ggml_is_empty(dst)) {
+        return;
+    }
+
+    switch (dst->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_PERMUTE:
+            {
+                // noop -> next node
+            } return;
+        default:
+            {
+            } break;
+    }
+
+    if (!ggml_metal_supports_op(ctx_dev, dst)) {
+        GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+        GGML_ABORT("unsupported op");
+    }
+
+    const int64_t  ne00 = src0 ? src0->ne[0] : 0;
+    const int64_t  ne01 = src0 ? src0->ne[1] : 0;
+    const int64_t  ne02 = src0 ? src0->ne[2] : 0;
+    const int64_t  ne03 = src0 ? src0->ne[3] : 0;
+
+    const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+    const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+    const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+    const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+    const int64_t  ne10 = src1 ? src1->ne[0] : 0;
+    const int64_t  ne11 = src1 ? src1->ne[1] : 0;
+    const int64_t  ne12 = src1 ? src1->ne[2] : 0;
+    const int64_t  ne13 = src1 ? src1->ne[3] : 0;
+
+    const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+    const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+    const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+    const uint64_t nb13 = src1 ? src1->nb[3] : 0;
+
+    const int64_t  ne20 = src2 ? src2->ne[0] : 0;
+    const int64_t  ne21 = src2 ? src2->ne[1] : 0;
+    const int64_t  ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
+    const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
+
+    const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+    const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+    const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+    const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
+
+    const int64_t  ne0  =  dst ?  dst->ne[0] : 0;
+    const int64_t  ne1  =  dst ?  dst->ne[1] : 0;
+    const int64_t  ne2  =  dst ?  dst->ne[2] : 0;
+    const int64_t  ne3  =  dst ?  dst->ne[3] : 0;
+
+    const uint64_t nb0  =  dst ?  dst->nb[0] : 0;
+    const uint64_t nb1  =  dst ?  dst->nb[1] : 0;
+    const uint64_t nb2  =  dst ?  dst->nb[2] : 0;
+    const uint64_t nb3  =  dst ?  dst->nb[3] : 0;
+
+    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+    const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+
+    size_t offs_src0 = 0;
+    size_t offs_src1 = 0;
+    size_t offs_src2 = 0;
+    size_t offs_dst  = 0;
+
+    id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
+    id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+    id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
+    id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
+
+#if 0
+    GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+    if (src0) {
+        GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
+                ggml_is_contiguous(src0), src0->name);
+    }
+    if (src1) {
+        GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
+                ggml_is_contiguous(src1), src1->name);
+    }
+    if (dst) {
+        GGML_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
+                dst->name);
+    }
+#endif
+
+    id<MTLDevice> device = ctx_dev->mtl_device;
+
+    switch (dst->op) {
+        case GGML_OP_CONCAT:
+            {
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
+
+                const int32_t dim = ((const int32_t *) dst->op_params)[0];
+
+                ggml_metal_kargs_concat args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne10 =*/ ne10,
+                    /*.ne11 =*/ ne11,
+                    /*.ne12 =*/ ne12,
+                    /*.ne13 =*/ ne13,
+                    /*.nb10 =*/ nb10,
+                    /*.nb11 =*/ nb11,
+                    /*.nb12 =*/ nb12,
+                    /*.nb13 =*/ nb13,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                    /*.dim  =*/ dim,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ADD:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+            {
+                GGML_ASSERT(src0t == GGML_TYPE_F32);
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                const size_t offs = 0;
+
+                bool bcast_row = false;
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+                    GGML_ASSERT(ggml_is_contiguous(src0));
+
+                    // src1 is a row
+                    GGML_ASSERT(ne11 == 1);
+
+                    switch (dst->op) {
+                        case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
+                        case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
+                        case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    }
+
+                    bcast_row = true;
+                } else {
+                    switch (dst->op) {
+                        case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
+                        case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
+                        case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    }
+                }
+
+                ggml_metal_kargs_bin args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne10 =*/ ne10,
+                    /*.ne11 =*/ ne11,
+                    /*.ne12 =*/ ne12,
+                    /*.ne13 =*/ ne13,
+                    /*.nb10 =*/ nb10,
+                    /*.nb11 =*/ nb11,
+                    /*.nb12 =*/ nb12,
+                    /*.nb13 =*/ nb13,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                    /*.offs =*/ offs,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+
+                if (bcast_row) {
+                    const int64_t n = ggml_nelements(dst)/4;
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } else {
+                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                }
+            } break;
+        case GGML_OP_REPEAT:
+            {
+                id<MTLComputePipelineState> pipeline;
+
+                switch (src0t) {
+                    case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
+                    case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
+                    case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
+                    case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
+                    default: GGML_ABORT("fatal error");
+                }
+
+                ggml_metal_kargs_repeat args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ACC:
+            {
+                GGML_ASSERT(src0t == GGML_TYPE_F32);
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                GGML_ASSERT(dstt  == GGML_TYPE_F32);
+
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+
+                const size_t pnb1 = ((const int32_t *) dst->op_params)[0];
+                const size_t pnb2 = ((const int32_t *) dst->op_params)[1];
+                const size_t pnb3 = ((const int32_t *) dst->op_params)[2];
+                const size_t offs = ((const int32_t *) dst->op_params)[3];
+
+                const bool inplace = (bool) ((const int32_t *) dst->op_params)[4];
+
+                if (!inplace) {
+                    // run a separete kernel to cpy src->dst
+                    // not sure how to avoid this
+                    // TODO: make a simpler cpy_bytes kernel
+
+                    const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
+
+                    ggml_metal_kargs_cpy args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.ne03 =*/ ne03,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.ne2  =*/ ne2,
+                        /*.ne3  =*/ ne3,
+                        /*.nb0  =*/ nb0,
+                        /*.nb1  =*/ nb1,
+                        /*.nb2  =*/ nb2,
+                        /*.nb3  =*/ nb3,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                }
+
+                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+
+                ggml_metal_kargs_bin args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ pnb1,
+                    /*.nb02 =*/ pnb2,
+                    /*.nb03 =*/ pnb3,
+                    /*.ne10 =*/ ne10,
+                    /*.ne11 =*/ ne11,
+                    /*.ne12 =*/ ne12,
+                    /*.ne13 =*/ ne13,
+                    /*.nb10 =*/ nb10,
+                    /*.nb11 =*/ nb11,
+                    /*.nb12 =*/ nb12,
+                    /*.nb13 =*/ nb13,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ pnb1,
+                    /*.nb2  =*/ pnb2,
+                    /*.nb3  =*/ pnb3,
+                    /*.offs =*/ offs,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_SCALE:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                float scale;
+                memcpy(&scale, dst->op_params, sizeof(scale));
+
+                int64_t n = ggml_nelements(dst);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (n % 4 == 0) {
+                    n /= 4;
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
+                } else {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
+                }
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
+                [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_CLAMP:
+            {
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
+
+                float min;
+                float max;
+                memcpy(&min, ((const int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&max, ((const int32_t *) dst->op_params) + 1, sizeof(float));
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&min   length:sizeof(min) atIndex:2];
+                [encoder setBytes:&max   length:sizeof(max) atIndex:3];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(node)) {
+                // we are not taking into account the strides, so for now require contiguous tensors
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                case GGML_UNARY_OP_TANH:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_RELU:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_SIGMOID:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_GELU:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_SILU:
+                {
+                    int64_t n = ggml_nelements(dst);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    if (n % 4 == 0) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
+                        n /= 4;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+                    }
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                case GGML_UNARY_OP_ELU:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ELU].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+                default:
+                {
+                    GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                    GGML_ABORT("fatal error");
+                }
+            } break;
+        case GGML_OP_SQR:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SQRT:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SIN:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_COS:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SUM_ROWS:
+            {
+                GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
+                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
+
+                int nth = 32; // SIMD width
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+                if (ne00%4 == 0) {
+                    while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
+                        nth *= 2;
+                    }
+                    if (use_f16) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
+                    }
+                } else {
+                    while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+                        nth *= 2;
+                    }
+                    if (use_f16) {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline;
+                    } else {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline;
+                    }
+                }
+
+                float scale;
+                float max_bias;
+
+                memcpy(&scale,    ((const int32_t *) dst->op_params) + 0, sizeof(scale));
+                memcpy(&max_bias, ((const int32_t *) dst->op_params) + 1, sizeof(max_bias));
+
+                const int64_t nrows_x = ggml_nrows(src0);
+                const int64_t nrows_y = src0->ne[1];
+
+                const uint32_t n_head      = nrows_x/nrows_y;
+                const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                // TODO: add ggml_metal_kargs struct
+                // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                if (id_src1) {
+                    [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                } else {
+                    [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
+                }
+                [encoder setBuffer:id_dst      offset:offs_dst            atIndex:2];
+                [encoder setBytes:&ne00        length:sizeof(ne00)        atIndex:3];
+                [encoder setBytes:&ne01        length:sizeof(ne01)        atIndex:4];
+                [encoder setBytes:&ne02        length:sizeof(ne02)        atIndex:5];
+                [encoder setBytes:&scale       length:sizeof(scale)       atIndex:6];
+                [encoder setBytes:&max_bias    length:sizeof(max_bias)    atIndex:7];
+                [encoder setBytes:&m0          length:sizeof(m0)          atIndex:8];
+                [encoder setBytes:&m1          length:sizeof(m1)          atIndex:9];
+                [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
+
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_DIAG_MASK_INF:
+            {
+                const int n_past = ((const int32_t *)(dst->op_params))[0];
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (ne00%8 == 0) {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
+                } else {
+                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
+                }
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
+
+                if (ne00%8 == 0) {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+                else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                GGML_ASSERT(src0t == GGML_TYPE_F32);
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
+                [encoder setBytes:&ne01    length:sizeof(ne01) atIndex:4];
+                [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:5];
+                [encoder setBytes:&nb00    length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&ne10    length:sizeof(ne10) atIndex:9];
+                [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:10];
+                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:11];
+                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:12];
+                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
+                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
+                [encoder setBytes:&ne2     length:sizeof(ne2)  atIndex:15];
+                [encoder setBytes:&nb0     length:sizeof(nb0)  atIndex:16];
+                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:17];
+                [encoder setBytes:&nb2     length:sizeof(nb2)  atIndex:18];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                struct ggml_tensor * src3 = node->src[3];
+                struct ggml_tensor * src4 = node->src[4];
+                struct ggml_tensor * src5 = node->src[5];
+
+                GGML_ASSERT(src3);
+                GGML_ASSERT(src4);
+                GGML_ASSERT(src5);
+
+                size_t offs_src3 = 0;
+                size_t offs_src4 = 0;
+                size_t offs_src5 = 0;
+
+                id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
+                id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
+                id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
+
+                const int64_t  ne30 = src3->ne[0]; GGML_UNUSED(ne30);
+                const int64_t  ne31 = src3->ne[1]; GGML_UNUSED(ne31);
+
+                const uint64_t nb30 = src3->nb[0];
+                const uint64_t nb31 = src3->nb[1];
+
+                const int64_t  ne40 = src4->ne[0]; GGML_UNUSED(ne40);
+                const int64_t  ne41 = src4->ne[1]; GGML_UNUSED(ne41);
+                const int64_t  ne42 = src4->ne[2]; GGML_UNUSED(ne42);
+
+                const uint64_t nb40 = src4->nb[0];
+                const uint64_t nb41 = src4->nb[1];
+                const uint64_t nb42 = src4->nb[2];
+
+                const int64_t  ne50 = src5->ne[0]; GGML_UNUSED(ne50);
+                const int64_t  ne51 = src5->ne[1]; GGML_UNUSED(ne51);
+                const int64_t  ne52 = src5->ne[2]; GGML_UNUSED(ne52);
+
+                const uint64_t nb50 = src5->nb[0];
+                const uint64_t nb51 = src5->nb[1];
+                const uint64_t nb52 = src5->nb[2];
+
+                const int64_t d_state      = ne00;
+                const int64_t d_inner      = ne01;
+                const int64_t n_seq_tokens = ne11;
+                const int64_t n_seqs       = ne02;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
+                [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
+                [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:6];
+
+                [encoder setBytes:&d_state      length:sizeof(d_state)      atIndex:7];
+                [encoder setBytes:&d_inner      length:sizeof(d_inner)      atIndex:8];
+                [encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
+                [encoder setBytes:&n_seqs       length:sizeof(n_seqs)       atIndex:10];
+
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
+                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
+                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
+                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
+                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
+                [encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
+                [encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
+                [encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
+                [encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
+                [encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
+                [encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
+                [encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
+                [encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                GGML_ASSERT(ne00 == ne10);
+
+                GGML_ASSERT(ne12 % ne02 == 0);
+                GGML_ASSERT(ne13 % ne03 == 0);
+
+                const uint32_t r2 = ne12/ne02;
+                const uint32_t r3 = ne13/ne03;
+
+                // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                // to the matrix-vector kernel
+                const int ne11_mm_min = 4;
+
+                // first try to use small-batch mat-mv kernels
+                // these should be efficient for BS [2, ~8]
+                if (src1t == GGML_TYPE_F32 && (ne00%256 == 0) &&
+                    (
+                     (
+                      (
+                       src0t == GGML_TYPE_F16  || // TODO: helper function
+                       src0t == GGML_TYPE_Q4_0 ||
+                       src0t == GGML_TYPE_Q4_1 ||
+                       src0t == GGML_TYPE_Q5_0 ||
+                       src0t == GGML_TYPE_Q5_1 ||
+                       src0t == GGML_TYPE_Q8_0 ||
+                       src0t == GGML_TYPE_IQ4_NL ||
+                       false) && (ne11 >= 2 && ne11 <= 8)
+                     ) ||
+                     (
+                      (
+                       src0t == GGML_TYPE_Q4_K ||
+                       src0t == GGML_TYPE_Q5_K ||
+                       src0t == GGML_TYPE_Q6_K ||
+                       false) && (ne11 >= 4 && ne11 <= 8)
+                     )
+                    )
+                   ) {
+                    // TODO: determine the optimal parameters based on grid utilization
+                    //       I still don't know why we should not always use the maximum available threads:
+                    //
+                    //       nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
+                    //
+                    //       my current hypothesis is that the work grid is not evenly divisible for different nsg
+                    //       values and there can be some tail effects when nsg is high. need to confirm this
+                    //
+                    const int nsg    = 2;                 // num simdgroups per threadgroup
+                    const int nxpsg  = ne11 < 3 ? 16 : 8; // num threads along row per simdgroup
+                    const int nypsg  = 32/nxpsg;          // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
+                    const int r0ptg  = nypsg*nsg;         // num src0 rows per threadgroup
+                          int r1ptg  = 4;                 // num src1 rows per threadgroup
+
+                    // note: not sure how optimal are those across all different hardware. there might be someting cleverer
+                    switch (ne11) {
+                        case 2:
+                            r1ptg = 2; break;
+                        case 3:
+                        case 6:
+                            r1ptg = 3; break;
+                        case 4:
+                        case 7:
+                        case 8:
+                            r1ptg = 4; break;
+                        case 5:
+                            r1ptg = 5; break;
+                    };
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    switch (src0->type) {
+                        case GGML_TYPE_F16:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            switch (r1ptg) {
+                                case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2].pipeline; break;
+                                case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3].pipeline; break;
+                                case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4].pipeline; break;
+                                case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            } break;
+                        default: GGML_ABORT("not implemented");
+                    }
+
+                    ggml_metal_kargs_mul_mv_ext args = {
+                        /*.ne00  =*/ ne00,
+                        /*.ne01  =*/ ne01,
+                        /*.ne02  =*/ ne02,
+                        /*.nb00  =*/ nb00,
+                        /*.nb01  =*/ nb01,
+                        /*.nb02  =*/ nb02,
+                        /*.nb03  =*/ nb03,
+                        /*.ne10  =*/ ne10,
+                        /*.ne11  =*/ ne11,
+                        /*.ne12  =*/ ne12,
+                        /*.nb10  =*/ nb10,
+                        /*.nb11  =*/ nb11,
+                        /*.nb12  =*/ nb12,
+                        /*.nb13  =*/ nb13,
+                        /*.ne0   =*/ ne0,
+                        /*.ne1   =*/ ne1,
+                        /*.r2    =*/ r2,
+                        /*.r3    =*/ r3,
+                        /*.nsg   =*/ nsg,
+                        /*.nxpsg =*/ nxpsg,
+                        /*.r1ptg =*/ r1ptg,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+
+                    //printf("ne01 = %lld nr0ptg = %d\n", ne01, nr0ptg);
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + r0ptg - 1)/r0ptg, (ne11 + r1ptg - 1)/r1ptg, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                } else
+                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                if ([device supportsFamily:MTLGPUFamilyApple7] &&
+                        !ggml_is_transposed(src0) &&
+                        !ggml_is_transposed(src1) &&
+                        src1t == GGML_TYPE_F32 &&
+                        ne00 % 32 == 0 && ne00 >= 64 &&
+                        (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
+                    //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                    // some Metal matrix data types require aligned pointers
+                    // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+                        case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+                        case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
+                        default: break;
+                    }
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
+                        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
+                        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
+                        default: GGML_ABORT("MUL MAT-MAT not implemented");
+                    }
+
+                    ggml_metal_kargs_mul_mm args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne02 =*/ ne02,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne12 =*/ ne12,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.nb13 =*/ nb13,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.r2   =*/ r2,
+                        /*.r3   =*/ r3,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+
+                    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                    [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                } else {
+                    int nth0 = 32;
+                    int nth1 = 1;
+                    int nrows = 1;
+                    //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    // use custom matrix x vector kernel
+                    switch (src0t) {
+                        case GGML_TYPE_F32:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                                nrows = 4;
+                            } break;
+                        case GGML_TYPE_F16:
+                            {
+                                nth0 = 32;
+                                nth1 = 1;
+                                if (src1t == GGML_TYPE_F32) {
+                                    if (ne11 * ne12 < 4) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
+                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
+                                        nrows = ne11;
+                                    } else {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
+                                        nrows = 4;
+                                    }
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
+                                    nrows = 4;
+                                }
+                            } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                nth0 = 32;
+                                nth1 = 1;
+                                if (src1t == GGML_TYPE_F32) {
+                                    if (ne11 * ne12 < 4) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
+                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
+                                        nrows = ne11;
+                                    } else {
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
+                                        nrows = 4;
+                                    }
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
+                                    nrows = 4;
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q2_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q3_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            {
+                                nth0 = 4; //1;
+                                nth1 = 8; //32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_M:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
+                            } break;
+                        default:
+                            {
+                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                GGML_ABORT("not implemented");
+                            }
+                    };
+
+                    ggml_metal_kargs_mul_mv args = {
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.nb03 =*/ nb03,
+                        /*.ne10 =*/ ne10,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.nb13 =*/ nb13,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.r2   =*/ r2,
+                        /*.r3   =*/ r3,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+
+                    if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
+                        src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                        src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                        const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                        const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
+                        const int mem_size = 32*sizeof(float);
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q4_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q3_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q5_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q6_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    } else {
+                        const int64_t ny = (ne11 + nrows - 1)/nrows;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                }
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                const int n_as = src0->ne[2];
+
+                // src2 = ids
+                const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
+
+                GGML_ASSERT(src2t == GGML_TYPE_I32);
+
+                GGML_ASSERT(!ggml_is_transposed(src0));
+                GGML_ASSERT(!ggml_is_transposed(src1));
+
+                GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+                GGML_ASSERT(ne03 == 1);
+                GGML_ASSERT(ne13 == 1);
+
+                // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                // to the matrix-vector kernel
+                // ne20 = n_used_experts
+                // ne21 = n_rows
+                const int dst_rows = ne20*ne21;
+                const int dst_rows_min = n_as;
+                const int dst_rows_max = (device.maxThreadgroupMemoryLength - 32 - 8192)/4;
+
+                // max size of the rowids array in the kernel shared buffer
+                GGML_ASSERT(dst_rows <= dst_rows_max);
+
+                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                // !!!
+                // TODO: for now, always use mat-vec kernels until we figure out how to improve the
+                //       indirect matrix multiplication
+                // !!!
+                if ([device supportsFamily:MTLGPUFamilyApple7] &&
+                        ne00 % 32 == 0 && ne00 >= 64 &&
+                        dst_rows > dst_rows_min) {
+                    // some Metal matrix data types require aligned pointers
+                    // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+                        case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+                        case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
+                        default: break;
+                    }
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    switch (src0->type) {
+                        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
+                        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
+                        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
+                        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
+                        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
+                        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
+                        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
+                        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32  ].pipeline; break;
+                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
+                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
+                        default: GGML_ABORT("MUL_MAT_ID not implemented");
+                    }
+
+                    ggml_metal_kargs_mul_mm_id args = {
+                        /*.nei0 =*/ ne20,
+                        /*.nei1 =*/ ne21,
+                        /*.nbi1 =*/ nb21,
+                        /*.ne00 =*/ ne00,
+                        /*.ne02 =*/ ne02,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.ne13 =*/ ne13,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:4];
+
+                    [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
+
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                } else {
+                    int nth0 = 32;
+                    int nth1 = 1;
+                    int nrows = 1;
+                    //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+                    id<MTLComputePipelineState> pipeline = nil;
+
+                    // use custom matrix x vector kernel
+                    switch (src0t) {
+                        case GGML_TYPE_F32:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_F16:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                nth0 = 32;
+                                nth1 = 1;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                nth0 = 32;
+                                nth1 = 1;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_BF16_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                nth0 = 8;
+                                nth1 = 8;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q2_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q3_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q4_K:
+                            {
+                                nth0 = 4; //1;
+                                nth1 = 8; //32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q5_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_Q6_K:
+                            {
+                                nth0 = 2;
+                                nth1 = 32;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_XXS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ3_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ2_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_S:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ1_M:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_NL:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
+                            } break;
+                        case GGML_TYPE_IQ4_XS:
+                            {
+                                nth0 = 4;
+                                nth1 = 16;
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
+                            } break;
+                        default:
+                            {
+                                GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                                GGML_ABORT("not implemented");
+                            }
+                    };
+
+                    if (ggml_is_quantized(src0t)) {
+                        GGML_ASSERT(ne00 >= nth0*nth1);
+                    }
+
+                    ggml_metal_kargs_mul_mv_id args = {
+                        /*.nei0 =*/ ne20,
+                        /*.nei1 =*/ ne21,
+                        /*.nbi1 =*/ nb21,
+                        /*.ne00 =*/ ne00,
+                        /*.ne01 =*/ ne01,
+                        /*.ne02 =*/ ne02,
+                        /*.nb00 =*/ nb00,
+                        /*.nb01 =*/ nb01,
+                        /*.nb02 =*/ nb02,
+                        /*.ne10 =*/ ne10,
+                        /*.ne11 =*/ ne11,
+                        /*.ne12 =*/ ne12,
+                        /*.ne13 =*/ ne13,
+                        /*.nb10 =*/ nb10,
+                        /*.nb11 =*/ nb11,
+                        /*.nb12 =*/ nb12,
+                        /*.ne0  =*/ ne0,
+                        /*.ne1  =*/ ne1,
+                        /*.nb1  =*/ nb1,
+                    };
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];
+
+                    const int64_t _ne1 = 1;
+                    const int tgz = dst_rows;
+
+                    if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
+                            src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                            src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                        const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                        const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
+                        const int mem_size = 32*sizeof(float);
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q4_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q3_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q5_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                    else if (src0t == GGML_TYPE_Q6_K) {
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    } else {
+                        const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    }
+                }
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (src0->type) {
+                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
+                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
+                    case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_BF16   ].pipeline; break;
+                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
+                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
+                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
+                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
+                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
+                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
+                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
+                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
+                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
+                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
+                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
+                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
+                    case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
+                    case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S  ].pipeline; break;
+                    case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S  ].pipeline; break;
+                    case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
+                    case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M  ].pipeline; break;
+                    case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
+                    case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
+                    case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
+                    default: GGML_ABORT("not implemented");
+                }
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
+                [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
+                [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
+                [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
+                [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
+                [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
+                [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+            } break;
+        case GGML_OP_RMS_NORM:
+            {
+                GGML_ASSERT(ne00 % 4 == 0);
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+                float eps;
+                memcpy(&eps, dst->op_params, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+
+                int nth = 32; // SIMD width
+
+                while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                    nth *= 2;
+                }
+
+                nth = MIN(nth, ne00/4);
+
+                ggml_metal_kargs_rms_norm args = {
+                    /*.ne00   =*/ ne00,
+                    /*.ne00_4 =*/ ne00/4,
+                    /*.nb01   =*/ nb01,
+                    /*.eps    =*/ eps,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+                const int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+
+                int nth = 32; // SIMD width
+
+                //while (nth < ne00/4 && nth < 1024) {
+                //    nth *= 2;
+                //}
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
+                [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
+                [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
+                [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
+                [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
+                [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
+                [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_NORM:
+            {
+                GGML_ASSERT(ne00 % 4 == 0);
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+                float eps;
+                memcpy(&eps, dst->op_params, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
+
+                int nth = 32; // SIMD width
+
+                while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                    nth *= 2;
+                }
+
+                nth = MIN(nth, ne00/4);
+
+                ggml_metal_kargs_norm args = {
+                    /*.ne00   =*/ ne00,
+                    /*.ne00_4 =*/ ne00/4,
+                    /*.nb01   =*/ nb01,
+                    /*.eps    =*/ eps,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ROPE:
+            {
+                // make sure we have one or more position id(ne10) per token(ne02)
+                GGML_ASSERT(ne10 % ne02 == 0);
+                GGML_ASSERT(ne10 >= ne02);
+
+                const int nth = MIN(1024, ne00);
+
+                const int n_past     = ((const int32_t *) dst->op_params)[0];
+                const int n_dims     = ((const int32_t *) dst->op_params)[1];
+                const int mode       = ((const int32_t *) dst->op_params)[2];
+                // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                const int n_ctx_orig = ((const int32_t *) dst->op_params)[4];
+
+                float freq_base;
+                float freq_scale;
+                float ext_factor;
+                float attn_factor;
+                float beta_fast;
+                float beta_slow;
+
+                memcpy(&freq_base,   (const int32_t *) dst->op_params +  5, sizeof(float));
+                memcpy(&freq_scale,  (const int32_t *) dst->op_params +  6, sizeof(float));
+                memcpy(&ext_factor,  (const int32_t *) dst->op_params +  7, sizeof(float));
+                memcpy(&attn_factor, (const int32_t *) dst->op_params +  8, sizeof(float));
+                memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
+                memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));
+
+                const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                if (!is_neox) {
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                } else {
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                }
+
+                ggml_metal_kargs_rope args = {
+                    /*.ne00        =*/ ne00,
+                    /*.ne01        =*/ ne01,
+                    /*.ne02        =*/ ne02,
+                    /*.ne03        =*/ ne03,
+                    /*.nb00        =*/ nb00,
+                    /*.nb01        =*/ nb01,
+                    /*.nb02        =*/ nb02,
+                    /*.nb03        =*/ nb03,
+                    /*.ne0         =*/ ne0,
+                    /*.ne1         =*/ ne1,
+                    /*.ne2         =*/ ne2,
+                    /*.ne3         =*/ ne3,
+                    /*.nb0         =*/ nb0,
+                    /*.nb1         =*/ nb1,
+                    /*.nb2         =*/ nb2,
+                    /*.nb3         =*/ nb3,
+                    /*.n_past      =*/ n_past,
+                    /*.n_dims      =*/ n_dims,
+                    /*.n_ctx_orig  =*/ n_ctx_orig,
+                    /*.freq_base   =*/ freq_base,
+                    /*.freq_scale  =*/ freq_scale,
+                    /*.ext_factor  =*/ ext_factor,
+                    /*.attn_factor =*/ attn_factor,
+                    /*.beta_fast   =*/ beta_fast,
+                    /*.beta_slow   =*/ beta_slow,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args)     atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0     atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1     atIndex:2];
+                if (id_src2 != nil) {
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
+                } else {
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
+                }
+                [encoder setBuffer:id_dst  offset:offs_dst      atIndex:4];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+                GGML_ASSERT(src0->type == GGML_TYPE_F16);
+                GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+                const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+                const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+                const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+                const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+                const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+                const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+
+                const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+                const int32_t N  = src1->ne[is_2D ? 3 : 2];
+                const int32_t IC = src1->ne[is_2D ? 2 : 1];
+                const int32_t IH = is_2D ? src1->ne[1] : 1;
+                const int32_t IW =         src1->ne[0];
+
+                const int32_t KH = is_2D ? src0->ne[1] : 1;
+                const int32_t KW =         src0->ne[0];
+
+                const int32_t OH = is_2D ? dst->ne[2] : 1;
+                const int32_t OW =         dst->ne[1];
+
+                const int32_t CHW = IC * KH * KW;
+
+                const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
+                const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline;
+
+                const bool is_gt_mttpt = ((size_t)(N * KH * KW)) > pipeline.maxTotalThreadsPerThreadgroup;
+
+                switch (dst->type) {
+                    case GGML_TYPE_F32: {
+                        pipeline = (is_gt_mttpt ?
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32].pipeline
+                                    :
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline);
+                    } break;
+                    case GGML_TYPE_F16: {
+                        pipeline = (is_gt_mttpt ?
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16].pipeline
+                                    :
+                                    ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline);
+                    } break;
+                    default: GGML_ABORT("fatal error");
+                };
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src1 offset:offs_src1       atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
+                [encoder setBytes:&ofs0    length:sizeof(int32_t) atIndex:2];
+                [encoder setBytes:&ofs1    length:sizeof(int32_t) atIndex:3];
+                [encoder setBytes:&IW      length:sizeof(int32_t) atIndex:4];
+                [encoder setBytes:&IH      length:sizeof(int32_t) atIndex:5];
+                [encoder setBytes:&CHW     length:sizeof(int32_t) atIndex:6];
+                [encoder setBytes:&s0      length:sizeof(int32_t) atIndex:7];
+                [encoder setBytes:&s1      length:sizeof(int32_t) atIndex:8];
+                [encoder setBytes:&p0      length:sizeof(int32_t) atIndex:9];
+                [encoder setBytes:&p1      length:sizeof(int32_t) atIndex:10];
+                [encoder setBytes:&d0      length:sizeof(int32_t) atIndex:11];
+                [encoder setBytes:&d1      length:sizeof(int32_t) atIndex:12];
+
+                if (is_gt_mttpt) {
+                    [encoder setBytes:&N   length:sizeof(int32_t) atIndex:13];
+                    [encoder setBytes:&KH  length:sizeof(int32_t) atIndex:14];
+                    [encoder setBytes:&KW  length:sizeof(int32_t) atIndex:15];
+
+                    const uint64_t n_threads = MIN(pipeline.maxTotalThreadsPerThreadgroup, (uint64_t)N);
+
+                    const int64_t  quotient  = N / n_threads + (N % n_threads > 0 ? 1 : 0);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(quotient * CHW, OH, OW) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
+                } else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
+                }
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(ggml_is_contiguous(src1));
+                GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
+                GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+                const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+
+                const int32_t IC = src1->ne[1];
+                const int32_t IL = src1->ne[0];
+
+                const int32_t K  = src0->ne[0];
+
+                const int32_t OL = dst->ne[0];
+                const int32_t OC = dst->ne[1];
+
+                id<MTLComputePipelineState> pipeline;
+
+                switch (src0->type) {
+                    case GGML_TYPE_F32: {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32].pipeline;
+                    } break;
+                    case GGML_TYPE_F16: {
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32].pipeline;
+                    } break;
+                    default: GGML_ABORT("fatal error");
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0         atIndex:0];
+                [encoder setBuffer:id_src1 offset:offs_src1         atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst          atIndex:2];
+                [encoder setBytes:&IC      length:sizeof( int32_t)  atIndex:3];
+                [encoder setBytes:&IL      length:sizeof( int32_t)  atIndex:4];
+                [encoder setBytes:&K       length:sizeof( int32_t)  atIndex:5];
+                [encoder setBytes:&s0      length:sizeof( int32_t)  atIndex:6];
+                [encoder setBytes:&nb0     length:sizeof(uint64_t)  atIndex:7];
+                [encoder setBytes:&nb1     length:sizeof(uint64_t)  atIndex:8];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(OL, OC, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_UPSCALE:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const float sf0 = (float)ne0/src0->ne[0];
+                const float sf1 = (float)ne1/src0->ne[1];
+                const float sf2 = (float)ne2/src0->ne[2];
+                const float sf3 = (float)ne3/src0->ne[3];
+
+                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                [encoder setBytes:&sf0  length:sizeof(sf0)  atIndex:18];
+                [encoder setBytes:&sf1  length:sizeof(sf1)  atIndex:19];
+                [encoder setBytes:&sf2  length:sizeof(sf2)  atIndex:20];
+                [encoder setBytes:&sf3  length:sizeof(sf3)  atIndex:21];
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_PAD:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_PAD_REFLECT_1D:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const int32_t p0 = ((const int32_t *)(dst->op_params))[0];
+                const int32_t p1 = ((const int32_t *)(dst->op_params))[1];
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:6];
+                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:11];
+                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:12];
+                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:13];
+                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:14];
+                [encoder setBytes:&p0   length:sizeof(p0)   atIndex:15];
+                [encoder setBytes:&p1   length:sizeof(p1)   atIndex:16];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ARANGE:
+            {
+                GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+                float start;
+                float step;
+
+                memcpy(&start, ((const int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&step,  ((const int32_t *) dst->op_params) + 2, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_dst  offset:offs_dst    atIndex:0];
+                [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
+                [encoder setBytes:&start length:sizeof(start) atIndex:2];
+                [encoder setBytes:&step  length:sizeof(step)  atIndex:3];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const int dim        = dst->op_params[0];
+                const int max_period = dst->op_params[1];
+
+                const int half = dim / 2;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBytes:&nb1   length:sizeof(nb1) atIndex:2];
+                [encoder setBytes:&dim   length:sizeof(dim) atIndex:3];
+                [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
+
+                const int nth = MIN(1024, half);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_ARGSORT:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+                const int nrows = ggml_nrows(src0);
+
+                enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+                // bitonic sort requires the number of elements to be power of 2
+                int64_t ne00_padded = 1;
+                while (ne00_padded < ne00) {
+                    ne00_padded *= 2;
+                }
+
+                // Metal kernels require the buffer size to be multiple of 16 bytes
+                // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+                const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (order) {
+                    case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                    case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+                    default: GGML_ABORT("fatal error");
+                };
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst      offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
+                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
+            } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                float slope;
+                memcpy(&slope, dst->op_params, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
+                [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
+
+                const int64_t n = ggml_nelements(dst);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+            } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                GGML_ASSERT(ne00 % 4  == 0);
+                GGML_ASSERT(ne11 % 32 == 0);
+
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                GGML_ASSERT(src1->type == src2->type);
+
+                GGML_ASSERT(ggml_are_same_shape (src1, src2));
+
+                struct ggml_tensor * src3 = node->src[3];
+
+                size_t offs_src3 = 0;
+
+                id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
+
+                GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16);
+                GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
+                        "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
+
+                const int64_t  ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
+                //const int64_t  ne31 = src3 ? src3->ne[1] : 0;
+                const int64_t  ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
+                const int64_t  ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
+
+                const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30);
+                const uint64_t nb31 = src3 ? src3->nb[1] : 0;
+                const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32);
+                const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33);
+
+                const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
+
+                float scale;
+                float max_bias;
+                float logit_softcap;
+                memcpy(&scale,         ((const int32_t *) dst->op_params) + 0, sizeof(scale));
+                memcpy(&max_bias,      ((const int32_t *) dst->op_params) + 1, sizeof(max_bias));
+                memcpy(&logit_softcap, ((const int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
+
+                if (logit_softcap != 0.0f) {
+                    scale /= logit_softcap;
+                }
+
+                const uint32_t n_head      = src0->ne[2];
+                const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                bool use_vec_kernel = false;
+
+                // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
+                //       for now avoiding mainly to keep the number of templates/kernels a bit lower
+                if (ne01 >= 4 || (ne00%128 != 0)) {
+                    switch (src1->type) {
+                        case GGML_TYPE_F16:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_BF16:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_BF16_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_0:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_0_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q4_1:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q4_1_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q5_0:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_0_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q5_1:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q5_1_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        case GGML_TYPE_Q8_0:
+                            {
+                                switch (ne00) {
+                                    case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H64 ].pipeline; break;
+                                    case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H80 ].pipeline; break;
+                                    case 96:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H96 ].pipeline; break;
+                                    case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H112].pipeline; break;
+                                    case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H128].pipeline; break;
+                                    case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256].pipeline; break;
+                                    default:
+                                              {
+                                                  GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                                  GGML_LOG_ERROR("add template specialization for this size\n");
+                                                  GGML_ABORT("add template specialization for this size");
+                                              }
+                                }
+                            } break;
+                        default:
+                            {
+                                GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                GGML_LOG_ERROR("add template specialization for this type\n");
+                                GGML_ABORT("add template specialization for this type");
+                            }
+                    }
+                } else {
+                    use_vec_kernel = true;
+
+                    switch (ne00) {
+                        case 128:
+                            {
+                                switch (src1->type) {
+                                    case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
+                                    case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128].pipeline; break;
+                                    case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128].pipeline; break;
+                                    case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H128].pipeline; break;
+                                    case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H128].pipeline; break;
+                                    case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H128].pipeline; break;
+                                    case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H128].pipeline; break;
+                                    default:
+                                        {
+                                            GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                            GGML_LOG_ERROR("add template specialization for this type\n");
+                                            GGML_ABORT("add template specialization for this type");
+                                        }
+                                }
+                            } break;
+                        case 256:
+                            {
+                                switch (src1->type) {
+                                    case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
+                                    case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H256].pipeline; break;
+                                    case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H256].pipeline; break;
+                                    case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H256].pipeline; break;
+                                    case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256].pipeline; break;
+                                    case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256].pipeline; break;
+                                    case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256].pipeline; break;
+                                    default:
+                                        {
+                                            GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                            GGML_LOG_ERROR("add template specialization for this type\n");
+                                            GGML_ABORT("add template specialization for this type");
+                                        }
+                                }
+                            } break;
+                        default:
+                                  {
+                                      GGML_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                      GGML_LOG_ERROR("add template specialization for this size\n");
+                                      GGML_ABORT("add template specialization for this size");
+                                  }
+                    }
+                }
+
+                ggml_metal_kargs_flash_attn_ext args = {
+                    /*.ne01          =*/ ne01,
+                    /*.ne02          =*/ ne02,
+                    /*.ne03          =*/ ne03,
+                    /*.nb01          =*/ nb01,
+                    /*.nb02          =*/ nb02,
+                    /*.nb03          =*/ nb03,
+                    /*.ne11          =*/ ne11,
+                    /*.ne_12_2       =*/ ne12,
+                    /*.ne_12_3       =*/ ne13,
+                    /*.nb_12_1       =*/ nb11,
+                    /*.nb_12_2       =*/ nb12,
+                    /*.nb_12_3       =*/ nb13,
+                    /*.nb31          =*/ nb31,
+                    /*.ne1           =*/ ne1,
+                    /*.ne2           =*/ ne2,
+                    /*.scale         =*/ scale,
+                    /*.max_bias      =*/ max_bias,
+                    /*.m0            =*/ m0,
+                    /*.m1            =*/ m1,
+                    /*.n_head_log2   =*/ n_head_log2,
+                    /*.logit_softcap =*/ logit_softcap,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args)     atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0     atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1     atIndex:2];
+                [encoder setBuffer:id_src2 offset:offs_src2     atIndex:3];
+                if (id_src3) {
+                    [encoder setBuffer:id_src3 offset:offs_src3 atIndex:4];
+                } else {
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:4];
+                }
+                [encoder setBuffer:id_dst offset:offs_dst       atIndex:5];
+
+                if (!use_vec_kernel) {
+                    // half8x8 kernel
+                    const int64_t nqptg = 8;  // queries per threadgroup    !! sync with kernel template arguments !!
+                    const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
+
+                    GGML_ASSERT(nqptg <= 32);
+                    GGML_ASSERT(nqptg  % 8  == 0);
+                    GGML_ASSERT(ncpsg  % 32 == 0);
+
+                    // 2*(2*ncpsg + nqptg)*(nsg)
+                    // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
+                    //
+                    // 16*32*(nsg)
+                    // the shared memory needed for the simdgroups to load the KV cache
+                    // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
+                    //
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+
+                    int64_t nsgmax = 2;
+
+                    while (true) {
+                        const size_t smem = FATTN_SMEM(nsgmax);
+                        if (smem > device.maxThreadgroupMemoryLength) {
+                            break;
+                        }
+                        nsgmax *= 2;
+                    }
+                    nsgmax /= 2;
+
+                    // simdgroups per threadgroup (a.k.a. warps)
+                    const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
+
+                    const size_t smem = FATTN_SMEM(nsg);
+
+                    //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg);
+                    GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength);
+                    [encoder setThreadgroupMemoryLength:smem atIndex:0];
+#undef FATTN_SMEM
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                } else {
+                    // half4x4 kernel
+                    const int64_t nqptg = 1;  // queries per threadgroup    !! sync with kernel template arguments !!
+                    const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
+
+                    GGML_ASSERT(nqptg <= 32);
+                    GGML_ASSERT(nqptg  % 1  == 0);
+                    GGML_ASSERT(ncpsg  % 32 == 0);
+
+                    // ne00 + 2*ncpsg*(nsg)
+                    // for each query, we load it as f16 in shared memory (ne00)
+                    // and store the soft_max values and the mask
+                    //
+                    // ne00*(nsg)
+                    // each simdgroup has a full f16 head vector in shared mem to accumulate results
+                    //
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*ncpsg*(nsg)) + ne00*(nsg))*(sizeof(float)/2), 16))
+
+                    int64_t nsgmax = 2;
+
+                    while (true) {
+                        const size_t smem = FATTN_SMEM(nsgmax);
+                        if (smem > device.maxThreadgroupMemoryLength) {
+                            break;
+                        }
+                        nsgmax *= 2;
+                    }
+                    nsgmax /= 2;
+
+                    // simdgroups per threadgroup (a.k.a. warps)
+                    const int64_t nsgt = MAX(2, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
+
+                    int64_t nsg = 1;
+                    while (nsg <= nsgt) {
+                        nsg *= 2;
+                    }
+                    nsg /= 2;
+
+                    const size_t smem = FATTN_SMEM(nsg);
+
+                    //printf("smem: %zu, max: %zu, nsg = %d\n", smem, device.maxThreadgroupMemoryLength, (int) nsg);
+                    GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength);
+                    [encoder setThreadgroupMemoryLength:smem atIndex:0];
+#undef FATTN_SMEM
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                }
+            } break;
+        case GGML_OP_DUP:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+            {
+                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+                int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (src0t) {
+                    case GGML_TYPE_F32:
+                        {
+                            GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
+
+                            switch (dstt) {
+                                case GGML_TYPE_F32:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break;
+                                case GGML_TYPE_F16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break;
+                                case GGML_TYPE_BF16:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_BF16].pipeline; break;
+                                case GGML_TYPE_Q8_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
+                                case GGML_TYPE_Q4_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
+                                case GGML_TYPE_Q4_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
+                                case GGML_TYPE_Q5_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
+                                case GGML_TYPE_Q5_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
+                                case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_F16:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
+                                case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_BF16:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_F32].pipeline; break;
+                                case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16].pipeline; break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            };
+                        } break;
+                    default: GGML_ABORT("not implemented");
+                }
+
+                ggml_metal_kargs_cpy args = {
+                    /*.ne00 =*/ ne00,
+                    /*.ne01 =*/ ne01,
+                    /*.ne02 =*/ ne02,
+                    /*.ne03 =*/ ne03,
+                    /*.nb00 =*/ nb00,
+                    /*.nb01 =*/ nb01,
+                    /*.nb02 =*/ nb02,
+                    /*.nb03 =*/ nb03,
+                    /*.ne0  =*/ ne0,
+                    /*.ne1  =*/ ne1,
+                    /*.ne2  =*/ ne2,
+                    /*.ne3  =*/ ne3,
+                    /*.nb0  =*/ nb0,
+                    /*.nb1  =*/ nb1,
+                    /*.nb2  =*/ nb2,
+                    /*.nb3  =*/ nb3,
+                };
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_SET:
+            {
+                GGML_ASSERT(ggml_are_same_shape(src0, dst));
+                GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+                // src0 and dst as viewed during set
+                const size_t dst_nb0 = ggml_element_size(src0);
+
+                const size_t dst_nb1 = ((int32_t *) dst->op_params)[0];
+                const size_t dst_nb2 = ((int32_t *) dst->op_params)[1];
+                const size_t dst_nb3 = ((int32_t *) dst->op_params)[2];
+                const size_t offset  = ((int32_t *) dst->op_params)[3];
+                const bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+                if (!inplace) {
+                    memcpy(((char *)  dst->data), ((char *) src0->data), ggml_nbytes(dst));
+                }
+
+                const int im0 = (ne10 == 0 ? 0 : ne10-1);
+                const int im1 = (ne11 == 0 ? 0 : ne11-1);
+                const int im2 = (ne12 == 0 ? 0 : ne12-1);
+                const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+                GGML_ASSERT(offset + im0*dst_nb0  + im1*dst_nb1  + im2*dst_nb2  + im3*dst_nb3  <= ggml_nbytes(dst));
+
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (src0t) {
+                    case GGML_TYPE_F32:
+                        GGML_ASSERT(nb10 == sizeof(float));
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break;
+                    case GGML_TYPE_I32:
+                        GGML_ASSERT(nb10 == sizeof(int32_t));
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break;
+                    default: GGML_ABORT("fatal error");
+                }
+
+                ggml_metal_kargs_set args = {
+                    /*.ne10    =*/ ne10,
+                    /*.ne11    =*/ ne11,
+                    /*.ne12    =*/ ne12,
+                    /*.nb10    =*/ nb10,
+                    /*.nb11    =*/ nb11,
+                    /*.nb12    =*/ nb12,
+                    /*.nb13    =*/ nb13,
+                    /*.nb1     =*/ dst_nb1,
+                    /*.nb2     =*/ dst_nb2,
+                    /*.nb3     =*/ dst_nb3,
+                    /*.offs    =*/ offset,
+                    /*.inplace =*/ inplace,
+                };
+
+                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10);
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_POOL_2D:
+            {
+                GGML_ASSERT(ggml_is_contiguous(src0));
+                GGML_ASSERT(src0t == GGML_TYPE_F32 && src0t == dstt);
+
+                const int32_t * opts = dst->op_params;
+                enum ggml_op_pool op = opts[0];
+
+                id<MTLComputePipelineState> pipeline = nil;
+                switch (src0t) {
+                    case GGML_TYPE_F32: {
+                        switch(op) {
+                            case GGML_OP_POOL_AVG:
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32].pipeline; break;
+                            case GGML_OP_POOL_MAX:
+                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32].pipeline; break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        }
+                    } break;
+                    default: GGML_ASSERT(false && "not implemented");
+                }
+
+                const int32_t k0 = opts[1];
+                const int32_t k1 = opts[2];
+                const int32_t s0 = opts[3];
+                const int32_t s1 = opts[4];
+                const int32_t p0 = opts[5];
+                const int32_t p1 = opts[6];
+
+                const int64_t IH = src0->ne[1];
+                const int64_t IW = src0->ne[0];
+
+                const int64_t N  = dst->ne[3];
+                const int64_t OC = dst->ne[2];
+                const int64_t OH = dst->ne[1];
+                const int64_t OW = dst->ne[0];
+
+                const int64_t parallel_elements = N * OC * OH * OW;
+                const int64_t n_threads = MIN((int64_t)[pipeline maxTotalThreadsPerThreadgroup], parallel_elements);
+                const int64_t n_tg = (parallel_elements + n_threads - 1) / n_threads;
+
+                // TODO: add ggml_metal_kargs struct
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0       atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
+                [encoder setBytes:&k0      length:sizeof(int32_t) atIndex:2];
+                [encoder setBytes:&k1      length:sizeof(int32_t) atIndex:3];
+                [encoder setBytes:&s0      length:sizeof(int32_t) atIndex:4];
+                [encoder setBytes:&s1      length:sizeof(int32_t) atIndex:5];
+                [encoder setBytes:&p0      length:sizeof(int32_t) atIndex:6];
+                [encoder setBytes:&p1      length:sizeof(int32_t) atIndex:7];
+                [encoder setBytes:&IH      length:sizeof(int64_t) atIndex:8];
+                [encoder setBytes:&IW      length:sizeof(int64_t) atIndex:9];
+                [encoder setBytes:&OH      length:sizeof(int64_t) atIndex:10];
+                [encoder setBytes:&OW      length:sizeof(int64_t) atIndex:11];
+                [encoder setBytes:&parallel_elements length:sizeof(int64_t) atIndex:12];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
+            } break;
+            case GGML_OP_ARGMAX:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                GGML_ASSERT(ggml_is_contiguous_1(src0));
+                GGML_ASSERT(nb00 == ggml_type_size(src0->type));
+
+                const int64_t nrows = ggml_nrows(src0);
+
+                int nth = 32; // SIMD width
+                while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+                    nth *= 2;
+                }
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGMAX].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float)   atIndex:0];
+                [encoder setThreadgroupMemoryLength:32*sizeof(int32_t) atIndex:1];
+
+                [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+       default:
+            {
+                GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+static enum ggml_status ggml_metal_graph_compute(
+            ggml_backend_t   backend,
+        struct ggml_cgraph * gf) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    // number of nodes encoded by the main thread (empirically determined)
+    const int n_main = 128;
+
+    // number of threads in addition to the main thread
+    const int n_cb = ctx->n_cb;
+
+    // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
+    // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
+    // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
+    // each thread creates it's own command buffer and enqueues the ops in parallel
+    //
+    // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2
+
+    @autoreleasepool {
+        ctx->gf = gf;
+
+        ctx->n_nodes_0 = MIN(n_main, gf->n_nodes);
+        ctx->n_nodes_1 = gf->n_nodes - ctx->n_nodes_0;
+
+        ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
+
+        const bool should_capture = ctx->capture_next_compute;
+        if (should_capture) {
+            ctx->capture_next_compute = false;
+
+            if (!ctx->capture_started) {
+                // create capture scope
+                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx_dev->mtl_device];
+
+                MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
+                descriptor.captureObject = ctx->capture_scope;
+                descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
+                descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
+
+                NSError * error = nil;
+                if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
+                    GGML_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
+                } else {
+                    [ctx->capture_scope beginScope];
+                    ctx->capture_started = true;
+                }
+            }
+        }
+
+        // the main thread commits the first few commands immediately
+        // command_buffer[n_cb]
+        {
+            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+            ctx->command_buffers[n_cb] = command_buffer;
+
+            [command_buffer enqueue];
+            ctx->encode_async(n_cb);
+        }
+
+        // prepare the rest of the command buffers asynchronously
+        // command_buffer[0.. n_cb)
+        for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+            ctx->command_buffers[cb_idx] = command_buffer;
+
+            // always enqueue the first two command buffers
+            // enqueue all of the command buffers if we don't need to abort
+            if (cb_idx < 2 || ctx->abort_callback == NULL) {
+                [command_buffer enqueue];
+            }
+        }
+
+        dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async);
+
+        // wait for completion and check status of each command buffer
+        // needed to detect if the device ran out-of-memory for example (#1881)
+        {
+            id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
+            [command_buffer waitUntilCompleted];
+
+            MTLCommandBufferStatus status = [command_buffer status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                }
+
+                return GGML_STATUS_FAILED;
+            }
+        }
+
+        for (int i = 0; i < n_cb; ++i) {
+            id<MTLCommandBuffer> command_buffer = ctx->command_buffers[i];
+            [command_buffer waitUntilCompleted];
+
+            MTLCommandBufferStatus status = [command_buffer status];
+            if (status != MTLCommandBufferStatusCompleted) {
+                GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+                if (status == MTLCommandBufferStatusError) {
+                    GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
+                }
+
+                return GGML_STATUS_FAILED;
+            }
+
+            id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil);
+            if (!next_buffer) {
+                continue;
+            }
+
+            const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+            if (next_queued) {
+                continue;
+            }
+
+            if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+                GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+                return GGML_STATUS_ABORTED;
+            }
+
+            [next_buffer commit];
+        }
+
+        if (!should_capture && ctx->capture_started) {
+            [ctx->capture_scope endScope];
+            [[MTLCaptureManager sharedCaptureManager] stopCapture];
+        }
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
+
+    ggml_backend_metal_buffer_rset_free(ctx);
+    ggml_backend_metal_device_rel(buffer->buft->device->context);
+
+    if (ctx->owned) {
+#if TARGET_OS_OSX
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
+#else
+        free(ctx->all_data);
+#endif
+    }
+
+    free(ctx);
+}
+
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    return ctx->all_data;
+}
+
+static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
+    /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_metal_buffer_get_base,
+    /* .init_tensor     = */ NULL,
+    /* .memset_tensor   = */ ggml_backend_metal_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_metal_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_metal_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// default buffer type
+
+static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "Metal";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
+#ifndef GGML_METAL_NDEBUG
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
+                __func__,
+                size_aligned / 1024.0 / 1024.0,
+                device.currentAllocatedSize / 1024.0 / 1024.0,
+                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+            GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+        }
+    } else {
+        GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
+                __func__,
+                size_aligned / 1024.0 / 1024.0,
+                device.currentAllocatedSize / 1024.0 / 1024.0);
+    }
+#endif
+#endif
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
+}
+
+static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    if (ctx->all_data != NULL) {
+        ctx->buffers[0].data  = ctx->all_data;
+        ctx->buffers[0].size  = size;
+        ctx->buffers[0].metal = nil;
+
+        if (size_aligned > 0) {
+            ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+                                            length:size_aligned
+                                            options:MTLResourceStorageModeShared
+                                            deallocator:nil];
+        }
+    }
+
+    if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
+        GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    //ggml_backend_metal_log_allocated_size(device, size_aligned);
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
+}
+
+static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 32;
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    const size_t max_size = device.maxBufferLength;
+    ggml_backend_metal_device_rel(buft->device->context);
+
+    return max_size;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+            /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
+        },
+        /* .device  = */ &g_ggml_backend_metal_device,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_type_metal;
+}
+
+static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "Metal_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
+        /* .iface = */ {
+            /* .get_name         = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+            /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
+        },
+        /* .device  = */ &g_ggml_backend_metal_device,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_buffer_from_ptr_type_metal;
+}
+
+// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    // page-align the data ptr
+    {
+        const uintptr_t offs = (uintptr_t) data % size_page;
+        data  = (void *) ((char *) data - offs);
+        size += offs;
+    }
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data  = data;
+        ctx->buffers[ctx->n_buffers].size  = size;
+        ctx->buffers[ctx->n_buffers].metal = nil;
+
+        if (size_aligned > 0) {
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+        }
+
+        ggml_backend_metal_log_allocated_size(device, size_aligned);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data  = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size  = size_step_aligned;
+            ctx->buffers[ctx->n_buffers].metal = nil;
+
+            if (size_step_aligned > 0) {
+                ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
+            }
+
+            ggml_backend_metal_log_allocated_size(device, size_step_aligned);
+
+            if (i + size_step < size) {
+                GGML_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+    return "Metal";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_metal_free(ggml_backend_t backend) {
+    struct ggml_backend_metal_context        * ctx     = backend->context;
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    ggml_backend_metal_device_rel(ctx_dev);
+    ggml_metal_free(ctx);
+
+    free(backend);
+}
+
+static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return ggml_metal_graph_compute(backend, cgraph);
+}
+
+static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    if (ctx->n_cb != n_cb) {
+        ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
+
+        if (ctx->n_cb > 2) {
+            GGML_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade the performance in some cases\n", __func__, n_cb);
+        }
+    }
+
+    if (ctx->encode_async) {
+        Block_release(ctx->encode_async);
+    }
+
+    ctx->encode_async = Block_copy(^(size_t iter) {
+        const int cb_idx = iter;
+        const int n_cb_l = ctx->n_cb;
+
+        const int n_nodes_0 = ctx->n_nodes_0;
+        const int n_nodes_1 = ctx->n_nodes_1;
+
+        const int n_nodes_per_cb = ctx->n_nodes_per_cb;
+
+        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+
+        int node_start = 0;
+        int node_end   = n_nodes_0;
+
+        if (cb_idx < n_cb_l) {
+            node_start = n_nodes_0 + (                                         (cb_idx + 0) * n_nodes_per_cb);
+            node_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
+        }
+
+        const bool should_capture = ctx->capture_next_compute;
+
+        for (int idx = node_start; idx < node_end; ++idx) {
+            if (should_capture) {
+                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
+            }
+
+            ggml_metal_encode_node(backend, idx, encoder);
+
+            if (should_capture) {
+                [encoder popDebugGroup];
+            }
+        }
+
+        [encoder endEncoding];
+
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
+    });
+}
+
+static struct ggml_backend_i ggml_backend_metal_i = {
+    /* .get_name                = */ ggml_backend_metal_name,
+    /* .free                    = */ ggml_backend_metal_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_metal_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_metal_guid(void) {
+    static ggml_guid guid = { 0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6 };
+    return &guid;
+}
+
+// TODO: remove in the future
+ggml_backend_t ggml_backend_metal_init(void) {
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), 0);
+
+    struct ggml_backend_metal_context * ctx = ggml_metal_init(dev);
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
+
+    *backend = (struct ggml_backend) {
+        /* .guid      = */ ggml_backend_metal_guid(),
+        /* .interface = */ ggml_backend_metal_i,
+        /* .device    = */ dev,
+        /* .context   = */ ctx,
+    };
+
+    ggml_backend_metal_set_n_cb(backend, 1);
+
+    return backend;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_metal_guid());
+}
+
+void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = user_data;
+}
+
+bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+}
+
+void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
+    ctx->capture_next_compute = true;
+}
+
+// backend device
+
+static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
+    return "Metal";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
+    // acq/rel just to populate ctx->name in case it hasn't been done yet
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
+    ggml_backend_metal_device_acq(ctx_dev);
+    ggml_backend_metal_device_rel(ctx_dev);
+
+    return ctx_dev->name;
+}
+
+static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
+        id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+        *total = device.recommendedMaxWorkingSetSize;
+        *free  = *total - device.currentAllocatedSize;
+
+        ggml_backend_metal_device_rel(ctx_dev);
+    } else {
+        *free = 1;
+        *total = 1;
+    }
+}
+
+static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_metal_device_get_name(dev);
+    props->description = ggml_backend_metal_device_get_description(dev);
+    props->type        = ggml_backend_metal_device_get_type(dev);
+    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = (struct ggml_backend_dev_caps) {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
+    struct ggml_backend_metal_context * ctx = ggml_metal_init(dev);
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
+
+    *backend = (struct ggml_backend) {
+        /* .guid      = */ ggml_backend_metal_guid(),
+        /* .interface = */ ggml_backend_metal_i,
+        /* .device    = */ dev,
+        /* .context   = */ ctx,
+    };
+
+    ggml_backend_metal_set_n_cb(backend, 1);
+
+    return backend;
+
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_metal_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = ptr;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    // page-align the data ptr
+    {
+        const uintptr_t offs = (uintptr_t) ptr % size_page;
+        ptr  = (void *) ((char *) ptr - offs);
+        size += offs;
+    }
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data  = ptr;
+        ctx->buffers[ctx->n_buffers].size  = size;
+        ctx->buffers[ctx->n_buffers].metal = nil;
+
+        if (size_aligned > 0) {
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+        }
+
+        ggml_backend_metal_log_allocated_size(device, size_aligned);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data  = (void *) ((uint8_t *) ptr + i);
+            ctx->buffers[ctx->n_buffers].size  = size_step_aligned;
+            ctx->buffers[ctx->n_buffers].metal = nil;
+
+            if (size_step_aligned > 0) {
+                ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
+            }
+
+            ggml_backend_metal_log_allocated_size(device, size_step_aligned);
+
+            if (i + size_step < size) {
+                GGML_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    struct ggml_backend_metal_device_context * ctx_dev = dev->context;
+
+    return ggml_metal_supports_op(ctx_dev, op);
+}
+
+static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
+            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    return false;
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
+}
+
+static struct ggml_backend_device_i ggml_backend_metal_device_i = {
+    /* .get_name             = */ ggml_backend_metal_device_get_name,
+    /* .get_description      = */ ggml_backend_metal_device_get_description,
+    /* .get_memory           = */ ggml_backend_metal_device_get_memory,
+    /* .get_type             = */ ggml_backend_metal_device_get_type,
+    /* .get_props            = */ ggml_backend_metal_device_get_props,
+    /* .init_backend         = */ ggml_backend_metal_device_init,
+    /* .get_buffer_type      = */ ggml_backend_metal_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_metal_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_metal_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_metal_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend registry
+
+static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
+    return "Metal";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    return &g_ggml_backend_metal_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
+#if defined(GGML_METAL_EMBED_LIBRARY)
+    { "EMBED_LIBRARY", "1" },
+#endif
+#if defined(GGML_METAL_USE_BF16)
+    { "BF16", "1" },
+#endif
+    { nil, nil },
+};
+
+static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
+    return g_ggml_backend_metal_features;
+
+    GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_metal_get_features;
+    }
+
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
+static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
+    /* .get_name         = */ ggml_backend_metal_reg_get_name,
+    /* .device_count     = */ ggml_backend_metal_reg_device_count,
+    /* .device_get       = */ ggml_backend_metal_reg_device_get,
+    /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_metal_reg(void) {
+    // TODO: make this thread-safe somehow?
+    {
+        g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
+            /* .api_version = */ GGML_BACKEND_API_VERSION,
+            /* .iface       = */ ggml_backend_metal_reg_i,
+            /* .context     = */ NULL,
+        };
+
+        g_ggml_backend_metal_device = (struct ggml_backend_device) {
+            /* .iface   = */ ggml_backend_metal_device_i,
+            /* .reg     = */ &g_ggml_backend_metal_reg,
+            /* .context = */ &g_ggml_ctx_dev_main,
+        };
+    }
+
+    return &g_ggml_backend_metal_reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
new file mode 100644
index 000000000..44f04c909
--- /dev/null
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -0,0 +1,6735 @@
+#define GGML_COMMON_DECL_METAL
+#define GGML_COMMON_IMPL_METAL
+#if defined(GGML_METAL_EMBED_LIBRARY)
+__embed_ggml-common.h__
+#else
+// TODO: this should not be a relative path, but can't figure out how to set Metal include paths in Package.swift
+#include "../ggml-common.h"
+#endif
+#include "ggml-metal-impl.h"
+
+#include <metal_stdlib>
+
+using namespace metal;
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
+
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
+// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+//
+// cmd:
+//   .../usr/bin/metal -dM -E -c                             ggml/src/ggml-metal/ggml-metal.metal
+//   .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal/ggml-metal.metal
+//
+#if __METAL_VERSION__ < 310 && defined(GGML_METAL_USE_BF16)
+#undef GGML_METAL_USE_BF16
+#endif
+
+#if defined(GGML_METAL_USE_BF16)
+typedef matrix<bfloat, 4, 4> bfloat4x4;
+#endif
+
+constexpr constant static float kvalues_iq4nl_f[16] = {
+    -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
+};
+
+// NOTE: this is not dequantizing - we are simply fitting the template
+template <typename type4x4>
+void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+    reg = (type4x4)(*src);
+}
+
+template <typename type4x4>
+void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
+    reg = (type4x4)(*src);
+}
+
+template <typename type4>
+void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) {
+    reg = (type4)(*(src + il));
+}
+
+#if defined(GGML_METAL_USE_BF16)
+template <typename type4x4>
+void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
+    reg = (type4x4)(*src);
+}
+#endif
+
+template <typename type4x4>
+void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.h * xb->d;
+    const ushort mask0 = il ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    float4x4 reg_f;
+
+    for (int i = 0; i < 8; i++) {
+        reg_f[i/2][2*(i%2) + 0] = d1 * (qs[i] & mask0) + md;
+        reg_f[i/2][2*(i%2) + 1] = d2 * (qs[i] & mask1) + md;
+    }
+
+    reg = (type4x4) reg_f;
+}
+
+template <typename type4>
+void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
+    const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.h * xb->d;
+    const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i = 0; i < 2; i++) {
+        reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + md;
+        reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb->m;
+    const ushort mask0 = il ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    float4x4 reg_f;
+
+    for (int i = 0; i < 8; i++) {
+        reg_f[i/2][2*(i%2) + 0] = ((qs[i] & mask0) * d1) + m;
+        reg_f[i/2][2*(i%2) + 1] = ((qs[i] & mask1) * d2) + m;
+    }
+
+    reg = (type4x4) reg_f;
+}
+
+template <typename type4>
+void dequantize_q4_1_t4(device const block_q4_1 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
+    const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb->m;
+    const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i = 0; i < 2; i++) {
+        reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + m;
+        reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + m;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
+    const float d = xb->d;
+    const float md = -16.h * xb->d;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    float4x4 reg_f;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg_f[i/2][2*(i%2) + 0] = d * x0 + md;
+        reg_f[i/2][2*(i%2) + 1] = d * x1 + md;
+    }
+
+    reg = (type4x4) reg_f;
+}
+
+template <typename type4>
+void dequantize_q5_0_t4(device const block_q5_0 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
+    const float d = xb->d;
+    const float md = -16.h * xb->d;
+    const ushort mask = (il/4) ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = (il/4) ? 4 : 0;
+
+    const int gh_mv = (il/4) ? 12 : 0;
+    const int gh_bk = (il/4) ?  0 : 4;
+
+    for (int ii = 0; ii < 2; ii++) {
+        int i = 2*(il%4) + ii;
+
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[2*ii + 0] = d * x0 + md;
+        reg[2*ii + 1] = d * x1 + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
+    const float d = xb->d;
+    const float m = xb->m;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    float4x4 reg_f;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg_f[i/2][2*(i%2) + 0] = d * x0 + m;
+        reg_f[i/2][2*(i%2) + 1] = d * x1 + m;
+    }
+
+    reg = (type4x4) reg_f;
+}
+
+template <typename type4>
+void dequantize_q5_1_t4(device const block_q5_1 * xb, short il, thread type4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
+    const float d = xb->d;
+    const float m = xb->m;
+    const ushort mask = (il/4) ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = (il/4) ? 4 : 0;
+
+    const int gh_mv = (il/4) ? 12 : 0;
+    const int gh_bk = (il/4) ?  0 : 4;
+
+    for (int ii = 0; ii < 2; ii++) {
+        int i = 2*(il%4) + ii;
+
+        // extract the 5-th bits for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4-bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[2*ii + 0] = d * x0 + m;
+        reg[2*ii + 1] = d * x1 + m;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
+    device const int8_t * qs = ((device const int8_t *)xb->qs);
+    const float d = xb->d;
+
+    float4x4 reg_f;
+
+    for (int i = 0; i < 16; i++) {
+        reg_f[i/4][i%4] = (qs[i + 16*il] * d);
+    }
+
+    reg = (type4x4) reg_f;
+}
+
+template <typename type4>
+void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & reg) {
+    device const int8_t * qs = ((device const int8_t *)xb->qs);
+    const float d = xb->d;
+
+    for (int i = 0; i < 4; i++) {
+        reg[i] = (qs[4*(il%4) + i + 16*(il/4)] * d);
+    }
+}
+
+template <typename type4x4>
+void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
+    const float d = xb->d;
+    const float min = xb->dmin;
+    device const uint8_t * q = (device const uint8_t *)xb->qs;
+    float dl, ml;
+    uint8_t sc = xb->scales[il];
+
+    q = q + 32*(il/8) + 16*(il&1);
+    il = (il/2)%4;
+
+    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
+    const half d_all = xb->d;
+    device const uint8_t * q = (device const uint8_t *)xb->qs;
+    device const uint8_t * h = (device const uint8_t *)xb->hmask;
+    device const int8_t * scales = (device const int8_t *)xb->scales;
+
+    q = q + 32 * (il/8) + 16 * (il&1);
+    h = h + 16 * (il&1);
+    uint8_t m = 1 << (il/2);
+    uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
+                                 ((il/4)>0 ? 12  : 3);
+    uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
+    uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
+    int16_t  dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
+                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
+    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    const float ml = 4.f * dl;
+
+    il = (il/2) & 3;
+    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl *= coef;
+
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
+    }
+}
+
+static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
+    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
+                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
+}
+
+template <typename type4x4>
+void dequantize_q4_K(device const block_q4_K * xb, short il, thread type4x4 & reg) {
+    device const uchar * q = xb->qs;
+
+    short is = (il/4) * 2;
+    q = q + (il/4) * 32 + 16 * (il&1);
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const float d   = il < 2 ? xb->d : xb->d / 16.h;
+    const float min = xb->dmin;
+    const float dl = d * sc[0];
+    const float ml = min * sc[1];
+
+    const ushort mask = il < 2 ? 0x0F : 0xF0;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
+    device const uint8_t * q  = xb->qs;
+    device const uint8_t * qh = xb->qh;
+
+    short is = (il/4) * 2;
+    q  = q + 32 * (il/4) + 16 * (il&1);
+    qh = qh + 16 * (il&1);
+    uint8_t ul = 1 << (il/2);
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const float d = il < 2 ? xb->d : xb->d / 16.f;
+    const float min = xb->dmin;
+    const float dl = d * sc[0];
+    const float ml = min * sc[1];
+
+    const ushort mask  = il<2 ? 0x0F : 0xF0;
+    const float qh_val = il<2 ? 16.f : 256.f;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
+    const half d_all = xb->d;
+    device const uint8_t * ql = (device const uint8_t *)xb->ql;
+    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const int8_t * scales = (device const int8_t *)xb->scales;
+
+    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
+    qh = qh + 32*(il/8) + 16*(il&1);
+    float sc = scales[(il%2) + 2 * ((il/2))];
+    il = (il/2) & 3;
+
+    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
+    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
+    const float       coef = il>1 ? 1.f/16.f          : 1.f;
+    const float ml = d_all * sc * 32.f;
+    const float dl = d_all * sc * coef;
+    for (int i = 0; i < 16; ++i) {
+        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
+                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
+        reg[i/4][i%4] = dl * q - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
+    device const uint16_t * q2 = xb->qs + 4*ib32;
+    const uint32_t aux32_g = q2[0] | (q2[1] << 16);
+    const uint32_t aux32_s = q2[2] | (q2[3] << 16);
+    thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
+    const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
+    constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
+    uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+    grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
+    signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
+    for (int i = 0; i < 8; ++i) {
+        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint16_t * q2 = xb->qs + 4*ib32;
+    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
+    constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
+    uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+    grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
+    signs = ksigns_iq2xs[q2[2*il+1] >> 9];
+    for (int i = 0; i < 8; ++i) {
+        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * q3 = xb->qs + 8*ib32;
+    device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
+    uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+    grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
+    grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
+    signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * qs = xb->qs + 8*ib32;
+    device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
+    const uint8_t qh = xb->qh[ib32] >> 4*il;
+    const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
+        reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
+    }
+    grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
+    grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
+        reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint8_t * signs = qs + QK_K/8;
+    const uint8_t qh = xb->qh[ib32] >> 4*il;
+    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
+        reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    const float d = xb->d;
+    device const uint8_t  * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint16_t * qh = xb->qh;
+    const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
+    const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
+    const uint16_t h = qh[ib32] >> 6*il;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) + ml;
+        reg[1][i] = dl * (grid1[i] >>  4) + ml;
+        reg[2][i] = dl * (grid2[i] & 0xf) + ml;
+        reg[3][i] = dl * (grid2[i] >>  4) + ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    device const uint16_t * sc = (device const uint16_t *)xb->scales;
+
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const float d = scale.f16;
+
+    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint8_t * qh = xb->qh + 2*ib32 + il;
+
+    const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
+    const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+    const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
+        reg[1][i] = dl * (grid1[i] >>  4) + ml1;
+        reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
+        reg[3][i] = dl * (grid2[i] >>  4) + ml2;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
+    const float d = xb->d;
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    for (int i = 0; i < 4; ++i) {
+        aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
+        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
+        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
+        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
+        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
+    }
+}
+
+template <typename type4>
+void dequantize_iq4_nl_t4(device const block_iq4_nl * xb, short il, thread type4 & reg) {
+    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
+    const float d = xb->d;
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    aux32 = ((q4[2*(il%4)] | (q4[2*(il%4)+1] << 16)) >> 4*(il/4)) & 0x0f0f0f0f;
+    reg[0] = d * kvalues_iq4nl_f[q8[0]];
+    reg[1] = d * kvalues_iq4nl_f[q8[1]];
+    reg[2] = d * kvalues_iq4nl_f[q8[2]];
+    reg[3] = d * kvalues_iq4nl_f[q8[3]];
+}
+
+template <typename type4x4>
+void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
+    const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)xb->d * (ls - 32);
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    for (int i = 0; i < 4; ++i) {
+        aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
+        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
+        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
+        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
+        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
+    }
+}
+
+enum ggml_sort_order {
+    GGML_SORT_ORDER_ASC,
+    GGML_SORT_ORDER_DESC,
+};
+
+// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across all dims
+// cons: not very efficient
+kernel void kernel_add(
+        constant ggml_metal_kargs_bin & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
+
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
+
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;
+
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) + *((device float *)(src1_ptr + i10*args.nb10));
+    }
+}
+
+kernel void kernel_sub(
+        constant ggml_metal_kargs_bin & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
+
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
+
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1  + args.offs;
+
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) - *((device float *)(src1_ptr + i10*args.nb10));
+    }
+}
+
+kernel void kernel_mul(
+        constant ggml_metal_kargs_bin & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
+
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
+
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1;
+
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10));
+    }
+}
+
+kernel void kernel_div(
+        constant ggml_metal_kargs_bin & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig.z;
+    const int i02 = tgpig.y;
+    const int i01 = tgpig.x;
+
+    const int i13 = i03%args.ne13;
+    const int i12 = i02%args.ne12;
+    const int i11 = i01%args.ne11;
+
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+    device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11;
+    device       char * dst_ptr  = dst  + i03*args.nb3  + i02*args.nb2  + i01*args.nb1;
+
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i10 = i0%args.ne10;
+        *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10));
+    }
+}
+
+template<typename T>
+kernel void kernel_repeat(
+        constant ggml_metal_kargs_repeat & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i3 = tgpig.z;
+    const int i2 = tgpig.y;
+    const int i1 = tgpig.x;
+
+    const int i03 = i3%args.ne03;
+    const int i02 = i2%args.ne02;
+    const int i01 = i1%args.ne01;
+
+    device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
+    device       char * dst_ptr  = dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1;
+
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        const int i00 = i0%args.ne00;
+        *((device T *)(dst_ptr + i0*args.nb0)) = *((device T *)(src0_ptr + i00*args.nb00));
+    }
+}
+
+typedef decltype(kernel_repeat<float>) kernel_repeat_t;
+
+template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
+template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
+template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
+template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        constant ggml_metal_kargs_bin & args,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
+}
+
+kernel void kernel_sub_row(
+        constant ggml_metal_kargs_bin & args,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
+    dst[tpig] = src0[tpig] - src1[tpig % nb];
+}
+
+kernel void kernel_mul_row(
+        constant ggml_metal_kargs_bin & args,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
+}
+
+kernel void kernel_div_row(
+        constant ggml_metal_kargs_bin & args,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    const uint nb = args.ne00/4;
+    dst[tpig] = src0[tpig] / src1[tpig % nb];
+}
+
+kernel void kernel_scale(
+        device const float * src0,
+        device       float * dst,
+        constant     float & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_scale_4(
+        device const float4 * src0,
+        device       float4 * dst,
+        constant     float  & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_clamp(
+        device const float * src0,
+        device       float * dst,
+        constant     float & min,
+        constant     float & max,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] < min ? min : (src0[tpig] > max ? max : src0[tpig]);
+}
+
+kernel void kernel_relu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = max(0.0f, src0[tpig]);
+}
+
+kernel void kernel_sigmoid(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
+}
+
+kernel void kernel_tanh(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+    dst[tpig] = precise::tanh(x);
+}
+
+constant float GELU_COEF_A     = 0.044715f;
+constant float GELU_QUICK_COEF = -1.702f;
+constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+
+kernel void kernel_gelu(
+    device const float * src0,
+    device       float * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_4(
+    device const float4 * src0,
+    device       float4 * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+
+    // BEWARE !!!
+    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
+    // This was observed with Falcon 7B and 40B models
+    //
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_quick(
+    device const float * src0,
+    device       float * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+
+    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
+}
+
+kernel void kernel_gelu_quick_4(
+    device const float4 * src0,
+    device       float4 * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+
+    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
+}
+
+kernel void kernel_silu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+    dst[tpig] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_silu_4(
+        device const float4 * src0,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+    dst[tpig] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_elu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+    dst[tpig] = (x > 0.0f) ? x : (exp(x) - 1.0f);
+}
+
+kernel void kernel_sqr(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src0[tpig];
+}
+
+kernel void kernel_sqrt(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = sqrt(src0[tpig]);
+}
+
+kernel void kernel_sin(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = sin(src0[tpig]);
+}
+
+kernel void kernel_cos(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = cos(src0[tpig]);
+}
+
+kernel void kernel_sum_rows(
+        device const float * src0,
+        device       float * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant uint64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        uint3 tpig[[thread_position_in_grid]]) {
+    int64_t i3 = tpig.z;
+    int64_t i2 = tpig.y;
+    int64_t i1 = tpig.x;
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float row_sum = 0;
+
+    for (int64_t i0 = 0; i0 < ne00; i0++) {
+        row_sum += src_row[i0];
+    }
+
+    dst_row[0] = row_sum;
+}
+
+template<typename T>
+kernel void kernel_soft_max(
+        device const  char * src0,
+        device const  char * src1,
+        device        char * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
+
+    device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const     T * pmask = src1 != src0 ? (device const    T *) src1         + i01*ne00 : nullptr;
+    device       float * pdst  = (device       float *) dst  + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+    float slope = 1.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float lmax = -INFINITY;
+
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+    }
+
+    // find the max value in the block
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
+
+    // parallel sum
+    float lsum = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
+        lsum += exp_psrc0;
+        pdst[i00] = exp_psrc0;
+    }
+
+    // This barrier fixes a failing test
+    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    threadgroup_barrier(mem_flags::mem_none);
+
+    float sum = simd_sum(lsum);
+
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
+
+    const float inv_sum = 1.0f/sum;
+
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        pdst[i00] *= inv_sum;
+    }
+}
+
+template<typename T>
+kernel void kernel_soft_max_4(
+        device const  char * src0,
+        device const  char * src1,
+        device        char * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
+
+    device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
+    device const      T * pmask = src1 != src0 ? (device const     T *) src1         + i01*ne00/4 : nullptr;
+    device       float4 * pdst4 = (device       float4 *) dst  + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
+
+    float slope = 1.0f;
+
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float4 lmax4 = -INFINITY;
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
+    }
+
+    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
+
+    // parallel sum
+    float4 lsum4 = 0.0f;
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
+        lsum4 += exp_psrc4;
+        pdst4[i00] = exp_psrc4;
+    }
+
+    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+
+    // This barrier fixes a failing test
+    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    threadgroup_barrier(mem_flags::mem_none);
+
+    float sum = simd_sum(lsum);
+
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
+
+    const float inv_sum = 1.0f/sum;
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        pdst4[i00] *= inv_sum;
+    }
+}
+
+typedef decltype(kernel_soft_max<float>)    kernel_soft_max_t;
+typedef decltype(kernel_soft_max_4<float4>) kernel_soft_max_4_t;
+
+template [[host_name("kernel_soft_max_f16")]]   kernel kernel_soft_max_t   kernel_soft_max<half>;
+template [[host_name("kernel_soft_max_f32")]]   kernel kernel_soft_max_t   kernel_soft_max<float>;
+template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<half4>;
+template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<float4>;
+
+kernel void kernel_diag_mask_inf(
+        device const float * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant       int & n_past,
+        uint3 tpig[[thread_position_in_grid]]) {
+    const int64_t i02 = tpig[2];
+    const int64_t i01 = tpig[1];
+    const int64_t i00 = tpig[0];
+
+    if (i00 > n_past + i01) {
+        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
+    } else {
+        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
+    }
+}
+
+kernel void kernel_diag_mask_inf_8(
+        device const float4 * src0,
+        device       float4 * dst,
+        constant    int64_t & ne00,
+        constant    int64_t & ne01,
+        constant        int & n_past,
+        uint3 tpig[[thread_position_in_grid]]) {
+
+    const int64_t i = 2*tpig[0];
+
+    dst[i+0] = src0[i+0];
+    dst[i+1] = src0[i+1];
+    int64_t i4 = 4*i;
+    const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
+    const int64_t i01 = i4/(ne00);      i4 -= i01*ne00;
+    const int64_t i00 = i4;
+    for (int k = 3; k >= 0; --k) {
+        if (i00 + 4 + k <= n_past + i01) {
+            break;
+        }
+        dst[i+1][k] = -INFINITY;
+        if (i00 + k > n_past + i01) {
+            dst[i][k] = -INFINITY;
+        }
+    }
+}
+
+// ref: ggml.c:ggml_compute_forward_ssm_conv_f32
+// TODO: optimize
+kernel void kernel_ssm_conv_f32(
+        device const  void * src0,
+        device const  void * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t ir = tgpig.x;
+    const int64_t i2 = tgpig.y;
+    const int64_t i3 = tgpig.z;
+
+    const int64_t nc  = ne10;
+  //const int64_t ncs = ne00;
+  //const int64_t nr  = ne01;
+  //const int64_t n_t = ne1;
+  //const int64_t n_s = ne2;
+
+    device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02);
+    device const float * c = (device const float *) ((device const char *) src1 + ir*nb11);
+    device       float * x = (device       float *) ((device       char *) dst  + ir*nb0  + i2*nb1  + i3*nb2);
+
+    float sumf = 0.0f;
+
+    for (int64_t i0 = 0; i0 < nc; ++i0) {
+        sumf += s[i0] * c[i0];
+    }
+
+    x[0] = sumf;
+}
+
+// ref: ggml.c:ggml_compute_forward_ssm_scan_f32
+// TODO: optimize
+kernel void kernel_ssm_scan_f32(
+        device const void * src0,
+        device const void * src1,
+        device const void * src2,
+        device const void * src3,
+        device const void * src4,
+        device const void * src5,
+        device      float * dst,
+        constant  int64_t & d_state,
+        constant  int64_t & d_inner,
+        constant  int64_t & n_seq_tokens,
+        constant  int64_t & n_seqs,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant uint64_t & nb13,
+        constant uint64_t & nb20,
+        constant uint64_t & nb21,
+        constant uint64_t & nb22,
+        constant uint64_t & nb30,
+        constant uint64_t & nb31,
+        constant uint64_t & nb40,
+        constant uint64_t & nb41,
+        constant uint64_t & nb42,
+        constant uint64_t & nb50,
+        constant uint64_t & nb51,
+        constant uint64_t & nb52,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t ir = tgpig.x;
+    const int64_t i3 = tgpig.y;
+
+    const int64_t nc  = d_state;
+  //const int64_t nr  = d_inner;
+    const int64_t n_t = n_seq_tokens;
+  //const int64_t n_s = n_seqs;
+
+    for (int64_t i2 = 0; i2 < n_t; ++i2) {
+        device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02);
+        device const float * x  = (device const float *) ((device const char *) src1 + ir*nb10 + i2*nb11 + i3*nb12);
+        device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*nb21 + i3*nb22);
+        device const float * A  = (device const float *) ((device const char *) src3 + ir*nb31);
+        device const float * B  = (device const float *) ((device const char *) src4 + i2*nb41 + i3*nb42);
+        device const float * C  = (device const float *) ((device const char *) src5 + i2*nb51 + i3*nb52);
+        device       float * y  = (device       float *) ((device       char *) dst  + ir*nb10 + i2*nb11 + i3*nb12); // TODO: do not use src1 strides
+        device       float * s  = (device       float *) ((device       char *) dst  + ir*nb01 + i3*nb02 +    nb13);
+
+        if (i2 > 0) {
+            s0 = s;
+        }
+
+        // i1 == 0
+        float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
+        float x_dt = x[0] * dt_soft_plus;
+        float sumf = 0.0f;
+
+        for (int64_t i0 = 0; i0 < nc; ++i0) {
+            int64_t i = i0;
+            float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+            sumf += state * C[i0];
+            s[i] = state;
+        }
+
+        y[0] = sumf;
+    }
+}
+
+kernel void kernel_argmax(
+        device   const void * x,
+        device      int32_t * dst,
+        constant    int64_t & ncols,
+        constant   uint64_t & nb01,
+        threadgroup   float * shared_maxval [[threadgroup(0)]],
+        threadgroup int32_t * shared_argmax [[threadgroup(1)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01);
+
+    float   lmax = -INFINITY;
+    int32_t larg = -1;
+
+    for (int i00 = tpitg; i00 < ncols; i00 += ntg) {
+        if (x_row[i00] > lmax) {
+            lmax = x_row[i00];
+            larg = i00;
+        }
+    }
+
+    // find the argmax value in the block
+    float max_val = simd_max(lmax);
+    int32_t arg_val = simd_max(select(-1, larg, lmax == max_val));
+
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            shared_maxval[tiisg] = -INFINITY;
+            shared_argmax[tiisg] = -1;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            shared_maxval[sgitg] = max_val;
+            shared_argmax[sgitg] = arg_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = shared_maxval[tiisg];
+        arg_val = shared_argmax[tiisg];
+
+        float max_val_reduced   = simd_max(max_val);
+        int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced));
+
+        dst[tgpig] = arg_val_reduced;
+
+        return;
+    }
+
+    dst[tgpig] = arg_val;
+}
+
+kernel void kernel_norm(
+        constant ggml_metal_kargs_norm & args,
+        device const char * src0,
+        device       char * dst,
+        threadgroup float * shmem_f32 [[threadgroup(0)]],
+        uint   tgpig[[threadgroup_position_in_grid]],
+        ushort tpitg[[thread_position_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort   ntg[[threads_per_threadgroup]]) {
+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
+    }
+
+    device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
+
+    float4 sumf4(0.0f);
+
+    float sumf = 0.0f;
+
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        sumf4 += x[i00];
+    }
+    sumf = sumf4[0] + sumf4[1] + sumf4[2] + sumf4[3];
+    sumf = simd_sum(sumf);
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    const float mean = sumf/args.ne00;
+
+    device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
+
+    sumf = 0.0f;
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        y[i00] = x[i00] - mean;
+        sumf += dot(y[i00], y[i00]);
+    }
+    sumf = simd_sum(sumf);
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    const float variance = sumf/args.ne00;
+
+    const float scale = 1.0f/sqrt(variance + args.eps);
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        y[i00] = y[i00] * scale;
+    }
+}
+
+kernel void kernel_rms_norm(
+        constant ggml_metal_kargs_rms_norm & args,
+        device const char * src0,
+        device       char * dst,
+        threadgroup float * shmem_f32 [[threadgroup(0)]],
+        uint   tgpig[[threadgroup_position_in_grid]],
+        ushort tpitg[[thread_position_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort   ntg[[threads_per_threadgroup]]) {
+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
+    }
+
+    device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
+
+    float sumf = 0.0f;
+
+    // parallel sum
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        sumf += dot(x[i00], x[i00]);
+    }
+    sumf = simd_sum(sumf);
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    const float mean  = sumf/args.ne00;
+    const float scale = 1.0f/sqrt(mean + args.eps);
+
+    device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
+    for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
+        y[i00] = x[i00] * scale;
+    }
+}
+
+kernel void kernel_group_norm(
+        device const float * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int32_t & n_groups,
+        constant     float & eps,
+        threadgroup float  * buf [[threadgroup(0)]],
+        uint tgpig[[threadgroup_position_in_grid]],
+        uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint   ntg[[threads_per_threadgroup]]) {
+    const int64_t ne = ne00*ne01*ne02;
+    const int64_t gs = ne00*ne01*((ne02 + n_groups - 1) / n_groups);
+
+    int start = tgpig * gs;
+    int end   = start + gs;
+
+    start += tpitg;
+
+    if (end >= ne) {
+        end = ne;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += ntg) {
+        tmp += src0[j];
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    tmp = simd_sum(tmp);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = tmp;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        tmp = buf[tiisg];
+        tmp = simd_sum(tmp);
+    }
+
+    const float mean = tmp / gs;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += ntg) {
+        float xi = src0[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = simd_sum(tmp);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = tmp;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        tmp = buf[tiisg];
+        tmp = simd_sum(tmp);
+    }
+
+    const float variance = tmp / gs;
+    const float scale = 1.0f/sqrt(variance + eps);
+    for (int j = start; j < end; j += ntg) {
+        dst[j] *= scale;
+    }
+}
+
+// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    device const uint16_t * qs = ((device const uint16_t *) qb_curr + 1 + il/2);
+
+    for (int i = 0; i < 8; i += 2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0);
+        acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+
+    return d * (sumy * -8.f + acc[0] + acc[1] + acc[2] + acc[3]);
+}
+
+// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    device const uint16_t * qs = ((device const uint16_t *) qb_curr + 2 + il/2);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0);
+        acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+
+    return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m;
+}
+
+// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q5 quants begin (0 or QK5_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 3 + il/2);
+           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010));
+        acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100));
+        acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+    }
+
+    return d * (sumy * -16.f + acc[0] + acc[1] + acc[2] + acc[3]);
+}
+
+// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q5 quants begin (0 or QK5_1/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+
+    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
+           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010));
+        acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100));
+        acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+    }
+
+    return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m;
+}
+
+// putting them in the kernel cause a significant performance penalty
+#define N_DST 4        // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
+//Note: This is a template, but strictly speaking it only applies to
+//      quantizations where the block size is 32. It also does not
+//      guard against the number of rows not being divisible by
+//      N_DST, so this is another explicit assumption of the implementation.
+template<typename block_q_type, int nr, int nsg, int nw, typename args_t>
+void mul_vec_q_n_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const int nb = args.ne00/QK4_0;
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * nsg + sgitg) * nr;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+  //const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+  //device const block_q_type * x = (device const block_q_type *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
+
+    // pointers to src0 rows
+    device const block_q_type * ax[nr];
+    for (int row = 0; row < nr; ++row) {
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+        ax[row] = (device const block_q_type *) ((device char *) src0 + offset0);
+    }
+
+    float yl[16]; // src1 vector cache
+    float sumf[nr] = {0.f};
+
+    const short ix = (tiisg/2);
+    const short il = (tiisg%2)*8;
+
+    device const float * yb = y + ix*QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
+        float sumy[2] = { 0.f, 0.f };
+
+#pragma unroll
+        for (int i = 0; i < 8; i += 2) {
+            sumy[0]  += yb[i +  0] + yb[i +  1];
+            yl[i + 0] = yb[i +  0];
+            yl[i + 1] = yb[i +  1]/256.f;
+
+            sumy[1]  += yb[i + 16] + yb[i + 17];
+            yl[i + 8] = yb[i + 16]/16.f;
+            yl[i + 9] = yb[i + 17]/4096.f;
+        }
+
+#pragma unroll
+        for (int row = 0; row < nr; row++) {
+            sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy[0] + sumy[1], yl, il);
+        }
+
+        yb += QK4_0 * 16;
+    }
+
+    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;
+
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+
+        if (tiisg == 0 && first_row + row < args.ne01) {
+            dst_f32[first_row + row] = tot;
+        }
+    }
+}
+
+kernel void kernel_mul_mv_q4_0_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+kernel void kernel_mul_mv_q4_1_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+kernel void kernel_mul_mv_q5_0_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+kernel void kernel_mul_mv_q5_1_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+#define NB_Q8_0 8
+
+template<typename args_t>
+void kernel_mul_mv_q8_0_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+    const int nr  = N_DST;
+    const int nsg = N_SIMDGROUP;
+    const int nw  = N_SIMDWIDTH;
+
+    const int nb = args.ne00/QK8_0;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0*nsg + sgitg)*nr;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+  //const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+  //device const block_q8_0 * x = (device const block_q8_0 *) (src0 + offset0);
+    device const float      * y = (device const float      *) (src1 + offset1);
+
+    // pointers to src0 rows
+    device const block_q8_0 * ax[nr];
+    for (int row = 0; row < nr; ++row) {
+        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+        ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0);
+    }
+
+    float yl[NB_Q8_0];
+    float sumf[nr] = { 0.f };
+
+    const short ix = tiisg/4;
+    const short il = tiisg%4;
+
+    device const float * yb = y + ix*QK8_0 + il*NB_Q8_0;
+
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (short i = 0; i < NB_Q8_0; ++i) {
+            yl[i] = yb[i];
+        }
+
+        for (int row = 0; row < nr; row++) {
+            device const int8_t * qs = ax[row][ib].qs + il*NB_Q8_0;
+            float sumq = 0.f;
+            for (short iq = 0; iq < NB_Q8_0; ++iq) {
+                sumq += qs[iq] * yl[iq];
+            }
+            sumf[row] += sumq*ax[row][ib].d;
+        }
+
+        yb += nw*NB_Q8_0;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+
+        if (tiisg == 0 && first_row + row < args.ne01) {
+            dst_f32[first_row + row] = tot;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q8_0_f32")]]
+kernel void kernel_mul_mv_q8_0_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    kernel_mul_mv_q8_0_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+// mat-vec kernel processing in chunks of float4
+// chpb - chunks per quantization block
+template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4)(device const q_t *, short, thread float4 &) >
+void kernel_mul_mv_ext_q4_f32_impl(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short chpt = 4; // chunks per thread
+
+  //const short nxpsg = (32);
+    const short nypsg = (32/nxpsg);
+
+    const short tx = tiisg%nxpsg;
+    const short ty = tiisg/nxpsg;
+
+    const int i01 = tgpig.x*(nypsg*args.nsg) + nypsg*sgitg + ty;
+    const int i11 = tgpig.y*r1ptg;
+    const int i1m = tgpig.z;
+
+    const int i12 = i1m%args.ne12;
+    const int i13 = i1m/args.ne12;
+
+    const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 = i11*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
+
+    device const float4 * y4[r1ptg];
+
+    for (int ir1 = 0; ir1 < r1ptg; ++ir1) {
+        y4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4 *) src1;
+    }
+
+    float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
+
+    short cch = tx%chpb; // current chunk index
+
+    for (int ich = tx; 4*ich < args.ne00; ich += chpt*nxpsg) {
+        float4 lx[chpt];
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+            deq_t4(xq, cch, lx[ch]);
+
+            cch += nxpsg;
+            if (cch >= chpb) {
+                xq  += cch/chpb;
+                cch %= chpb;
+            }
+        }
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+#pragma unroll(r1ptg)
+            for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+                sumf[ir1] += dot(lx[ch], y4[ir1][ch*nxpsg]);
+
+            }
+        }
+
+#pragma unroll(r1ptg)
+        for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+            y4[ir1] += chpt*nxpsg;
+        }
+    }
+
+    // reduce only the threads in each row
+    for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+        if (nxpsg >= 32) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
+        }
+        if (nxpsg >= 16) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  8);
+        }
+        if (nxpsg >= 8) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  4);
+        }
+        if (nxpsg >= 4) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  2);
+        }
+        if (nxpsg >= 2) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  1);
+        }
+
+        //sumf[ir1] = simd_sum(sumf[ir1]);
+    }
+
+    if (tx == 0) {
+        for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) {
+            device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0;
+
+            if (i01 < args.ne01) {
+                dst_f32[i01] = sumf[ir1];
+            }
+        }
+    }
+}
+
+// mat-vec kernel processing in chunks of float4x4
+template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &) >
+void kernel_mul_mv_ext_q4x4_f32_impl(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short chpt = 1;
+
+  //const short nxpsg = (32);
+    const short nypsg = (32/nxpsg);
+
+    const short tx = tiisg%nxpsg;
+    const short ty = tiisg/nxpsg;
+
+    const int i01 = tgpig.x*(nypsg*args.nsg) + nypsg*sgitg + ty;
+    const int i11 = tgpig.y*r1ptg;
+    const int i1m = tgpig.z;
+
+    const int i12 = i1m%args.ne12;
+    const int i13 = i1m/args.ne12;
+
+    const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 = i11*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
+
+    device const float4x4 * y4x4[r1ptg];
+
+    for (int ir1 = 0; ir1 < r1ptg; ++ir1) {
+        y4x4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4x4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4x4 *) src1;
+    }
+
+    float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
+
+    short cch = tx%chpb;
+
+    for (int ich = tx; 16*ich < args.ne00; ich += chpt*nxpsg) {
+        float4x4 lx[chpt];
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+            deq_t4x4(xq, cch, lx[ch]);
+
+            cch += nxpsg;
+            if (cch >= chpb) {
+                xq  += cch/chpb;
+                cch %= chpb;
+            }
+        }
+
+#pragma unroll(chpt)
+        for (short ch = 0; ch < chpt; ++ch) {
+#pragma unroll(r1ptg)
+            for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+                sumf[ir1] +=
+                    dot(lx[ch][0], y4x4[ir1][ch*nxpsg][0]) +
+                    dot(lx[ch][1], y4x4[ir1][ch*nxpsg][1]) +
+                    dot(lx[ch][2], y4x4[ir1][ch*nxpsg][2]) +
+                    dot(lx[ch][3], y4x4[ir1][ch*nxpsg][3]);
+
+            }
+        }
+
+#pragma unroll(r1ptg)
+        for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+            y4x4[ir1] += chpt*nxpsg;
+        }
+    }
+
+    for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
+        if (nxpsg >= 32) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
+        }
+        if (nxpsg >= 16) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  8);
+        }
+        if (nxpsg >= 8) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  4);
+        }
+        if (nxpsg >= 4) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  2);
+        }
+        if (nxpsg >= 2) {
+            sumf[ir1] += simd_shuffle_down(sumf[ir1],  1);
+        }
+
+        //sumf[ir1] = simd_sum(sumf[ir1]);
+    }
+
+    if (tx == 0) {
+        for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) {
+            device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0;
+
+            if (i01 < args.ne01) {
+                dst_f32[i01] = sumf[ir1];
+            }
+        }
+    }
+}
+
+// dispatchers needed for compile-time nxpsg
+// epb - elements per quantization block
+template<short r1ptg, typename q_t, short epb, void (*deq_t4)(device const q_t *, short, thread float4 &)>
+kernel void kernel_mul_mv_ext_q4_f32_disp(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    switch (args.nxpsg) {
+        case 4:  kernel_mul_mv_ext_q4_f32_impl<4,  r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 8:  kernel_mul_mv_ext_q4_f32_impl<8,  r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 16: kernel_mul_mv_ext_q4_f32_impl<16, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 32: kernel_mul_mv_ext_q4_f32_impl<32, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+    }
+}
+
+template<short r1ptg, typename q_t, short epb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &)>
+kernel void kernel_mul_mv_ext_q4x4_f32_disp(
+        constant ggml_metal_kargs_mul_mv_ext & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    switch (args.nxpsg) {
+        case 4:  kernel_mul_mv_ext_q4x4_f32_impl<4,  r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 8:  kernel_mul_mv_ext_q4x4_f32_impl<8,  r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 16: kernel_mul_mv_ext_q4x4_f32_impl<16, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+        case 32: kernel_mul_mv_ext_q4x4_f32_impl<32, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
+    }
+}
+
+typedef decltype(kernel_mul_mv_ext_q4_f32_disp  <2, block_q8_0, 32,  dequantize_q8_0_t4>) mul_mv_ext_q4_f32_t;
+typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>)    mul_mv_ext_q4x4_f32_t;
+
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_2")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, half4,        4,  dequantize_f16_t4>;
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, half4,        4,  dequantize_f16_t4>;
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4,        4,  dequantize_f16_t4>;
+template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]]    kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4,        4,  dequantize_f16_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0,   32, dequantize_q4_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0,   32, dequantize_q4_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0,   32, dequantize_q4_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_0,   32, dequantize_q4_0_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_1,   32, dequantize_q4_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_1,   32, dequantize_q4_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_1,   32, dequantize_q4_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_1,   32, dequantize_q4_1_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_0,   32, dequantize_q5_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_0,   32, dequantize_q5_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_0,   32, dequantize_q5_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_0,   32, dequantize_q5_0_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_1,   32, dequantize_q5_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_1,   32, dequantize_q5_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_1,   32, dequantize_q5_1_t4>;
+template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_1,   32, dequantize_q5_1_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_2")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q8_0,   32, dequantize_q8_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_3")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q8_0,   32, dequantize_q8_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_4")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q8_0,   32, dequantize_q8_0_t4>;
+template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_5")]]   kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q8_0,   32, dequantize_q8_0_t4>;
+
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
+
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>;
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q4_K, 256, dequantize_q4_K>;
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q4_K, 256, dequantize_q4_K>;
+template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q4_K, 256, dequantize_q4_K>;
+
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_K, 256, dequantize_q5_K>;
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q5_K, 256, dequantize_q5_K>;
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_K, 256, dequantize_q5_K>;
+template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_K, 256, dequantize_q5_K>;
+
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_K, 256, dequantize_q6_K>;
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_K, 256, dequantize_q6_K>;
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
+template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
+
+#define N_MV_T_T 4
+
+template<typename T0, typename T04, typename T1, typename T14, typename args_t>
+void kernel_mul_mv_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig,
+        ushort tiisg) {
+    const int r0 = tgpig.x;
+    const int rb = tgpig.y*N_MV_T_T;
+    const int im = tgpig.z;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+    device const T0 * x = (device const T0 *) (src0 + offset0);
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1;
+
+    if (args.ne00 < 128) {
+        for (int row = 0; row < N_MV_T_T; ++row) {
+            int r1 = rb + row;
+            if (r1 >= args.ne11) {
+                break;
+            }
+
+            const uint64_t offset1 = r1*args.nb11 + (i12   )*args.nb12 + (i13   )*args.nb13;
+
+            device const T1 * y = (device const T1 *) (src1 + offset1);
+
+            float sumf = 0;
+            for (int i = tiisg; i < args.ne00; i += 32) {
+                sumf += (T0) x[i] * (T1) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst_f32[(uint64_t)r1*args.ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const T04 * x4 = (device const T04 *) x;
+        for (int row = 0; row < N_MV_T_T; ++row) {
+            int r1 = rb + row;
+            if (r1 >= args.ne11) {
+                break;
+            }
+
+            const uint64_t offset1 = r1*args.nb11 + (i12   )*args.nb12 + (i13   )*args.nb13;
+
+            device const T1  * y  = (device const T1  *) (src1 + offset1);
+            device const T14 * y4 = (device const T14 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < args.ne00/4; i += 32) {
+                sumf += dot((float4) x4[i], (float4) y4[i]);
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(args.ne00/4); i < args.ne00; ++i) all_sum += (float) (x[i] * y[i]);
+                dst_f32[(uint64_t)r1*args.ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+template<typename T0, typename T04, typename T1, typename T14>
+kernel void kernel_mul_mv(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_impl<T0, T04, T1, T14, constant ggml_metal_kargs_mul_mv &>(
+        args,
+        src0,
+        src1,
+        dst,
+        tgpig,
+        tiisg);
+}
+
+typedef decltype(kernel_mul_mv<half, half4, half, half4>) mul_mv_t;
+
+template [[host_name("kernel_mul_mv_f32_f32")]]   kernel mul_mv_t kernel_mul_mv<float,  float4,  float,  float4>;
+template [[host_name("kernel_mul_mv_f16_f32")]]   kernel mul_mv_t kernel_mul_mv<half,   half4,   float,  float4>;
+template [[host_name("kernel_mul_mv_f16_f16")]]   kernel mul_mv_t kernel_mul_mv<half,   half4,   half,   half4>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mv_bf16_f32")]]  kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, float,  float4>;
+template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, bfloat, bfloat4>;
+#endif
+
+template<typename T, typename T4>
+kernel void kernel_mul_mv_1row(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const T     * x = (device const T     *) (src0 + offset0);
+    device const float * y = (device const float *) (src1 + offset1);
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    float sumf = 0;
+    if (args.ne00 < 128) {
+        for (int i = tiisg; i < args.ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst_f32[r0] = all_sum;
+        }
+    } else {
+        device const T4     * x4 = (device const T4     *) x;
+        device const float4 * y4 = (device const float4 *) y;
+
+        for (int i = tiisg; i < args.ne00/4; i += 32) {
+            sumf += dot((float4) x4[i], y4[i]);
+        }
+
+        float all_sum = simd_sum(sumf);
+
+        if (tiisg == 0) {
+            for (int i = 4*(args.ne00/4); i < args.ne00; ++i) all_sum += (float) (x[i] * y[i]);
+            dst_f32[r0] = all_sum;
+        }
+    }
+}
+
+typedef decltype(kernel_mul_mv_1row<half, half4>) mul_mv_1row_t;
+
+template [[host_name("kernel_mul_mv_f16_f32_1row")]]  kernel mul_mv_1row_t kernel_mul_mv_1row<half,   half4>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mv_bf16_f32_1row")]] kernel mul_mv_1row_t kernel_mul_mv_1row<bfloat, bfloat4>;
+#endif
+
+// Assumes row size (ne00) is a multiple of 4
+template<typename T, typename T4>
+kernel void kernel_mul_mv_l4(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
+
+    const int nrows = args.ne11;
+    const int r0 = tgpig.x;
+    const int im = tgpig.z;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+    device const T4 * x4 = (device const T4 *) (src0 + offset0);
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1;
+
+    for (int r1 = 0; r1 < nrows; ++r1) {
+        const uint64_t offset1 = r1*args.nb11 + (i12   )*args.nb12 + (i13   )*args.nb13;
+
+        device const float4 * y4 = (device const float4 *) (src1 + offset1);
+
+        float sumf = 0;
+        for (int i = tiisg; i < args.ne00/4; i += 32) {
+            sumf += dot((float4) x4[i], y4[i]);
+        }
+
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst_f32[(uint64_t)r1*args.ne0 + r0] = all_sum;
+        }
+    }
+}
+
+typedef decltype(kernel_mul_mv_l4<half, half4>) mul_mv_l4_t;
+
+template [[host_name("kernel_mul_mv_f16_f32_l4")]]  kernel mul_mv_l4_t kernel_mul_mv_l4<half, half4>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mv_bf16_f32_l4")]] kernel mul_mv_l4_t kernel_mul_mv_l4<bfloat, bfloat4>;
+#endif
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int i0, float ext_factor, float mscale,
+    thread float * cos_theta, thread float * sin_theta) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    *cos_theta = cos(theta) * mscale;
+    *sin_theta = sin(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+static void rope_yarn_corr_dims(
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
+}
+
+template<typename T>
+kernel void kernel_rope_norm(
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    const float theta_base = (float) pos[i2];
+    const float inv_ndims = -1.f/args.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < args.n_dims) {
+            const int ic = i0/2;
+
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
+
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[1];
+
+            dst_data[0] = x0*cos_theta - x1*sin_theta;
+            dst_data[1] = x0*sin_theta + x1*cos_theta;
+        } else {
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+template<typename T>
+kernel void kernel_rope_neox(
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    const float theta_base = (float) pos[i2];
+    const float inv_ndims = -1.f/args.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < args.n_dims) {
+            const int ic = i0/2;
+
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
+
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[args.n_dims/2];
+
+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
+typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
+
+template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
+template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
+
+template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
+template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
+
+typedef void (im2col_t)(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]);
+
+template <typename T>
+kernel void kernel_im2col(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+//    const int64_t IC = tgpg[0];
+    const int64_t OH = tgpg[1];
+    const int64_t OW = tgpg[2];
+
+//    const int64_t N  = ntg[0];
+    const int64_t KH = ntg[1];
+    const int64_t KW = ntg[2];
+
+    const int64_t in  = tpitg[0];
+    const int64_t ikh = tpitg[1];
+    const int64_t ikw = tpitg[2];
+
+    const int64_t iic = tgpig[0];
+    const int64_t ioh = tgpig[1];
+    const int64_t iow = tgpig[2];
+
+    const int64_t iiw = iow*s0 + ikw*d0 - p0;
+    const int64_t iih = ioh*s1 + ikh*d1 - p1;
+
+    const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*CHW + (iic*(KH*KW) + ikh*KW + ikw);
+
+    device T * pdst = (device T *) (dst);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        pdst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = in*ofs0 + iic*ofs1 + iih*IW + iiw;
+        pdst[offset_dst] = x[offset_src];
+    }
+}
+
+template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
+template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
+
+typedef void (im2col_ext_t)(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        constant   int32_t & N,
+        constant   int32_t & KH,
+        constant   int32_t & KW,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]);
+
+template <typename T>
+kernel void kernel_im2col_ext(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        constant   int32_t & N,
+        constant   int32_t & KH,
+        constant   int32_t & KW,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
+    const int64_t KHW = KH * KW;             // KHW == ntg[1] * ntg[2], KW == ntg[2]
+
+    const int64_t d = tgpig[0] / CHW;
+    const int64_t chw = tgpig[0] % CHW;
+    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
+    const int64_t HW = tgpig[0] % KHW;
+
+    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
+    if (tpitg_0 >= N) {
+        return;
+    }
+
+    const int64_t tpitg_1 = HW / KW;
+    const int64_t tpitg_2 = HW % KW;
+
+    const int64_t iiw = tgpig[2] * s0 + tpitg_2 * d0 - p0;
+    const int64_t iih = tgpig[1] * s1 + tpitg_1 * d1 - p1;
+
+    const int64_t offset_dst =
+        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
+        (tgpig_0 * KHW + tpitg_1 * KW + tpitg_2);
+
+    device T * pdst = (device T *) (dst);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        pdst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = tpitg_0 * ofs0 + tgpig_0 * ofs1;
+        pdst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
+template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
+template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
+
+typedef void (conv_transpose_1d_t)(
+        device const float * src0,
+        device const float * src1,
+        device        char * dst,
+        constant   int32_t & IC,
+        constant   int32_t & IL,
+        constant   int32_t & K,
+        constant   int32_t & s0,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        uint3    tgpg[[threadgroups_per_grid]]);
+
+template <typename T>
+kernel void kernel_conv_transpose_1d(
+        device const     T * src0,
+        device const float * src1,
+        device        char * dst,
+        constant   int32_t & IC,
+        constant   int32_t & IL,
+        constant   int32_t & K,
+        constant   int32_t & s0,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        uint3   tgpg[[threadgroups_per_grid]]) {
+
+    float v = 0.0f;
+
+    for (int64_t c = 0; c < IC; c++) {
+        const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1];
+        const int32_t input_offset = c * IL;
+
+        for (int64_t i = 0; i < IL; i++) {
+            if (tgpig[0] >= i * s0 && tgpig[0] < i * s0 + K) {
+                v += src0[kernel_offset + tgpig[0] - i * s0] * src1[input_offset + i];
+            }
+        }
+    }
+
+    device float * dst_ptr = (device float *) (dst + tgpig[0] * nb0 + tgpig[1] * nb1);
+
+    dst_ptr[0] = v;
+}
+
+template [[host_name("kernel_conv_transpose_1d_f32_f32")]]
+kernel void kernel_conv_transpose_1d<float>(
+    device const float * src0,
+    device const float * src1,
+    device        char * dst,
+    constant   int32_t & IC,
+    constant   int32_t & IL,
+    constant   int32_t & K,
+    constant   int32_t & s0,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    uint3    tgpg[[threadgroups_per_grid]]);
+
+template [[host_name("kernel_conv_transpose_1d_f16_f32")]]
+kernel void kernel_conv_transpose_1d<half>(
+    device const half  * src0,
+    device const float * src1,
+    device        char * dst,
+    constant   int32_t & IC,
+    constant   int32_t & IL,
+    constant   int32_t & K,
+    constant   int32_t & s0,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    uint3    tgpg[[threadgroups_per_grid]]);
+
+kernel void kernel_upscale_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    constant     float & sf0,
+    constant     float & sf1,
+    constant     float & sf2,
+    constant     float & sf3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3/sf3;
+    const int64_t i02 = i2/sf2;
+    const int64_t i01 = i1/sf1;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        const int64_t i00 = i0/sf0;
+
+        device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0);
+
+        dst_ptr[0] = src0_ptr[0];
+    }
+}
+
+kernel void kernel_pad_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < ne00) {
+                dst_ptr[i0] = src0_ptr[i0];
+            } else {
+                dst_ptr[i0] = 0.0f;
+            }
+        }
+
+        return;
+    }
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        dst_ptr[i0] = 0.0f;
+    }
+}
+
+kernel void kernel_pad_reflect_1d_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant   int64_t & ne0,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    constant   int32_t & p0,
+    constant   int32_t & p1,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3  tgpg[[threadgroups_per_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < p0) {
+                dst_ptr[i0] = src0_ptr[p0 - i0];
+            } else if (i0 < ne0 - p1) {
+                dst_ptr[i0] = src0_ptr[i0 - p0];
+            } else {
+                dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
+            }
+        }
+    }
+}
+
+kernel void kernel_arange_f32(
+    device        char * dst,
+    constant   int64_t & ne0,
+    constant   float   & start,
+    constant   float   & step,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    device float * dst_ptr = (device float *) dst;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        dst_ptr[i0] = start + step * i0;
+    }
+}
+
+kernel void kernel_timestep_embedding_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant  uint64_t & nb1,
+    constant  int      & dim,
+    constant  int      & max_period,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    int i = tgpig.x;
+    device float * embed_data = (device float *)(dst +  i*nb1);
+
+    int half_ = dim / 2;
+    for (int j = tpitg.x; j < half_; j += ntg.x) {
+        float timestep = ((device float *)src0)[i];
+        float freq = (float)exp(-log((float)max_period) * j / half_);
+        float arg = timestep * freq;
+        embed_data[j        ] = cos(arg);
+        embed_data[j + half_] = sin(arg);
+    }
+
+    if (dim % 2 != 0 && tpitg.x == 0) {
+        embed_data[dim] = 0.f;
+    }
+}
+
+// bitonic sort implementation following the CUDA kernels as reference
+typedef void (argsort_t)(
+        device const float  * x,
+        device     int32_t  * dst,
+        constant   int64_t  & ncols,
+        constant   int64_t  & ncols_pad,
+        threadgroup int32_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_f32_i32(
+        device const float   * x,
+        device       int32_t * dst,
+        constant     int64_t & ncols,
+        constant     int64_t & ncols_pad,
+        threadgroup int32_t  * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]]) {
+    // bitonic sort
+    int col = tpitg[0];
+    int row = tgpig[1];
+
+    if (col >= ncols_pad) return;
+
+    device const float   * x_row   = x + row * ncols;
+    threadgroup int32_t  * dst_row = shared_values;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+template [[host_name("kernel_argsort_f32_i32_asc")]]  kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
+
+kernel void kernel_leaky_relu_f32(
+        device const float * src0,
+        device       float * dst,
+        constant     float & slope,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
+}
+
+// ref: https://arxiv.org/pdf/2307.08691.pdf
+template<
+    typename q_t,     // query types in shared memory
+    typename q4_t,
+    typename q8x8_t,
+    typename k_t,     // key types in shared memory
+    typename k4x4_t,
+    typename k8x8_t,
+    typename v_t,     // value types in shared memory
+    typename v4x4_t,
+    typename v8x8_t,
+    typename qk_t,    // Q*K types
+    typename qk8x8_t,
+    typename s_t,     // soft-max types
+    typename s8x8_t,
+    typename o_t,     // attention accumulation types
+    typename o4_t,
+    typename o8x8_t,
+    typename kd4x4_t, // key type in device memory
+    short nl_k,
+    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
+    typename vd4x4_t, // key type in device memory
+    short nl_v,
+    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
+    short D,         // head size
+    short Q  = 8,    // queries per threadgroup
+    short KV = 8,    // key/value processed per each simdgroup
+    short C  = 32>   // cache items per threadgroup
+kernel void kernel_flash_attn_ext(
+        constant ggml_metal_kargs_flash_attn_ext & args,
+        device const char * q,
+        device const char * k,
+        device const char * v,
+        device const char * mask,
+        device       char * dst,
+        threadgroup  half * shmem_f16 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3   ntg[[threads_per_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short nsg = ntg.y; // number of simdgroups
+
+    const int iq3 = tgpig[2];
+    const int iq2 = tgpig[1];
+    const int iq1 = tgpig[0]*Q;
+
+    const short D4  = D/4;
+    const short D8  = D/8;
+    const short D16 = D/16;
+    const short NW  = N_SIMDWIDTH;
+    const short SH  = (2*C + Q); // shared memory per simdgroup (s_t == float)
+
+    const short TS = nsg*SH;   // shared memory size per query in (s_t == float)
+    const short T  = D + 2*TS; // shared memory size per query in (half)
+
+    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 +              0*D); // holds the query data
+    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 +              0*D); // same as above but in q4_t
+    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 +              0*D); // reuse query data for accumulation
+    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 +              0*D); // same as above but in o4_t
+    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + Q*D); // scratch buffer for attention, mask and diagonal matrix
+
+    threadgroup k_t    * sk    = (threadgroup k_t    *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
+    threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t
+
+    threadgroup v_t    * sv    = (threadgroup v_t    *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load V in shared memory
+    threadgroup v4x4_t * sv4x4 = (threadgroup v4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in v4x4_t
+
+    // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
+    o8x8_t lo[D8];
+
+    // load heads from Q to shared memory
+    for (short j = sgitg; j < Q; j += nsg) {
+        device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*args.nb01 + iq2*args.nb02 + iq3*args.nb03));
+
+        for (short i = tiisg; i < D4; i += NW) {
+            if (iq1 + j < args.ne01) {
+                sq4[j*D4 + i] = (q4_t) q4[i];
+            } else {
+                sq4[j*D4 + i] = (q4_t) 0.0f;
+            }
+        }
+    }
+
+    // zero out lo
+    for (short i = 0; i < D8; ++i) {
+        lo[i] = make_filled_simdgroup_matrix<o_t, 8>((o_t) 0.0f);
+    }
+
+    // zero out shared memory SH
+    for (short j = 0; j < Q; ++j) {
+        for (short i = tiisg; i < SH; i += NW) {
+            ss[j*TS + i] = 0.0f;
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    {
+        half S[Q] = { [0 ... Q-1] = 0.0f };
+        half M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+
+        // thread indices inside the simdgroup
+        // TODO: see if we can utilize quad-group functions for better performance
+        //       https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (6.9.3)
+        const short tx = tiisg%4;
+        const short ty = tiisg/4;
+
+        // broadcast kv
+        //const short rk2 = args.ne02/args.ne12;
+        //const short rk3 = args.ne03/args.ne13;
+
+        const short ikv2 = iq2/(args.ne02/args.ne_12_2);
+        const short ikv3 = iq3/(args.ne03/args.ne_12_3);
+
+        // load the queries from shared memory into local memory
+        q8x8_t mq[D8];
+
+        for (short i = 0; i < D8; ++i) {
+            simdgroup_load(mq[i], sq + i*8, D);
+        }
+
+        const bool has_mask = mask != q;
+
+        half slope = 1.0f;
+
+        // ALiBi
+        if (args.max_bias > 0.0f) {
+            const short h = iq2;
+
+            const half  base = h < args.n_head_log2 ? args.m0 : args.m1;
+            const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
+
+            slope = pow(base, exph);
+        }
+
+        // loop over the KV cache
+        // each simdgroup handles blocks of Q rows and C columns
+        for (int ic0 = 0; ic0 < args.ne11; ic0 += C*nsg) {
+            const int ic = ic0 + C*sgitg;
+            if (ic >= args.ne11) {
+                break;
+            }
+
+            if (has_mask) {
+                // used to detect blocks full of -INF
+                half smax = -INFINITY;
+
+                // load the mask in shared memory
+                #pragma unroll(Q)
+                for (short j = 0; j < Q; ++j) {
+                    device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31);
+
+                    const half m = pm[ic + tiisg];
+
+                    ss[j*TS + C + tiisg] = m;
+                    smax = max(smax, m);
+                }
+
+                smax = simd_max(smax);
+
+                if (smax == -INFINITY) {
+                    continue;
+                }
+            }
+
+            // Q*K^T
+            {
+                for (short cc = 0; cc < C/8; ++cc) {
+                    qk8x8_t mqk = make_filled_simdgroup_matrix<qk_t, 8>((qk_t) 0.0f);
+
+                    // this is compile-time check, so it does not have runtime overhead
+                    if (is_same<kd4x4_t, k4x4_t>::value) {
+                        // we can read directly from global memory
+                        device const k_t * pk = (device const k_t *) ((device const char *) k + ((ic + 8*cc)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
+
+                        #pragma unroll(D8)
+                        for (short i = 0; i < D8; ++i) {
+                            k8x8_t mk;
+                            simdgroup_load(mk, pk + i*8, args.nb_12_1/sizeof(k_t), 0, true); // transpose // TODO: use ne10
+
+                            simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
+                        }
+                    } else {
+                        for (short ii = 0; ii < D16; ii += 4) {
+                            device const kd4x4_t * pk4x4 = (device const kd4x4_t *) ((device const char *) k + ((ic + 8*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
+
+                            if (D16%4 == 0) {
+                                // the head is evenly divisible by 4*16 = 64, so no need for bound checks
+                                {
+                                    k4x4_t tmp;
+                                    deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp);
+                                    sk4x4[4*ty + tx] = tmp;
+                                }
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+                                #pragma unroll(4)
+                                for (short k = 0; k < 4; ++k) {
+                                    k8x8_t mk;
+
+                                    simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
+
+                                    simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
+                                }
+                            } else {
+                                if (ii + tx < D16) {
+                                    k4x4_t tmp;
+                                    deq_k(pk4x4 + (ii + tx)/nl_k, (ii + tx)%nl_k, tmp);
+                                    sk4x4[4*ty + tx] = tmp;
+                                }
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+                                for (short k = 0; k < 4 && ii + k < D16; ++k) {
+                                    k8x8_t mk;
+
+                                    simdgroup_load(mk, sk + 16*k + 0*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 0], mk, mqk);
+
+                                    simdgroup_load(mk, sk + 16*k + 1*8, 4*16, 0, true); // transpose
+                                    simdgroup_multiply_accumulate(mqk, mq[2*(ii + k) + 1], mk, mqk);
+                                }
+                            }
+                        }
+                    }
+
+                    // cast qk_t -> s_t
+                    //s8x8_t mqks(1.0f);
+                    //simdgroup_multiply(mqks, mqk, mqks);
+                    //simdgroup_store(mqks, ss + 8*cc, TS, 0, false);
+
+                    simdgroup_store(mqk, ss + 8*cc, TS, 0, false);
+                }
+            }
+
+            // online softmax
+            {
+                for (ushort j = 0; j < Q; ++j) {
+                    const half m = M[j];
+
+                    // scale and apply the logitcap / mask
+                    half s = ss[j*TS + tiisg]*args.scale;
+
+                    if (args.logit_softcap != 0.0f) {
+                        s = args.logit_softcap*precise::tanh(s);
+                    }
+
+                    // mqk = mqk + mask*slope
+                    s += slope*ss[j*TS + C + tiisg];
+
+                    M[j] = simd_max(max(M[j], s));
+
+                    const half ms = exp(m - M[j]);
+                    const half vs = exp(s - M[j]);
+
+                    S[j] = S[j]*ms + simd_sum(vs);
+
+                    // the P matrix from the paper (Q rows, C columns)
+                    ss[j*TS + tiisg] = vs;
+
+                    // create a QxQ diagonal matrix for rescaling the output
+                    if (tiisg == j) {
+                        ss[j*TS + 2*C + j] = ms;
+                    }
+                }
+            }
+
+            // O = diag(ms)*O
+            {
+                s8x8_t mm;
+                simdgroup_load(mm, ss + 2*C, TS, 0, false);
+
+                #pragma unroll(D8)
+                for (short i = 0; i < D8; ++i) {
+                    simdgroup_multiply(lo[i], mm, lo[i]);
+                }
+            }
+
+            // O = O + (Q*K^T)*V
+            {
+                for (short cc = 0; cc < C/8; ++cc) {
+                    s8x8_t ms;
+                    simdgroup_load(ms, ss + 8*cc, TS, 0, false);
+
+                    if (is_same<vd4x4_t, v4x4_t>::value) {
+                        // we can read directly from global memory
+                        device const v_t * pv = (device const v_t *) ((device const char *) v + ((ic + 8*cc)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
+
+                        #pragma unroll(D8)
+                        for (short i = 0; i < D8; ++i) {
+                            v8x8_t mv;
+                            simdgroup_load(mv, pv + i*8, args.nb_12_1/sizeof(v_t), 0, false); // TODO: use ne20
+
+                            simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
+                        }
+                    } else {
+                        for (short ii = 0; ii < D16; ii += 4) {
+                            device const vd4x4_t * pv4x4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 8*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
+
+                            if (D16%4 == 0) {
+                                // no need for bound checks
+                                {
+                                    v4x4_t tmp;
+                                    deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp);
+                                    sv4x4[4*ty + tx] = tmp;
+                                }
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+                                #pragma unroll(4)
+                                for (short k = 0; k < 4; ++k) {
+                                    v8x8_t mv;
+
+                                    simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+
+                                    simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                                }
+                            } else {
+                                if (ii + tx < D16) {
+                                    v4x4_t tmp;
+                                    deq_v(pv4x4 + (ii + tx)/nl_v, (ii + tx)%nl_v, tmp);
+                                    sv4x4[4*ty + tx] = tmp;
+                                }
+
+                                simdgroup_barrier(mem_flags::mem_threadgroup);
+
+                                for (short k = 0; k < 4 && ii + k < D16; ++k) {
+                                    v8x8_t mv;
+
+                                    simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+
+                                    simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
+                                    simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
+        for (short j = 0; j < Q; ++j) {
+            if (tiisg == 0) {
+                ss[j*TS + 0] = S[j];
+                ss[j*TS + 1] = M[j];
+            }
+        }
+    }
+
+    // reduce the warps sequentially
+    for (ushort sg = 1; sg < nsg; ++sg) {
+        half S = { 0.0f };
+        half M = { -__FLT16_MAX__/2 };
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // each simdgroup stores its output to shared memory, reusing sq
+        if (sgitg == sg) {
+            for (short i = 0; i < D8; ++i) {
+                simdgroup_store(lo[i], so + i*8, D, 0, false);
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // the first simdgroup accumulates the results from the other simdgroups
+        if (sgitg == 0) {
+            for (short j = 0; j < Q; ++j) {
+                const half S0 = ss[j*TS +         0];
+                const half S1 = ss[j*TS + sg*SH + 0];
+
+                const half M0 = ss[j*TS +         1];
+                const half M1 = ss[j*TS + sg*SH + 1];
+
+                M = max(M0, M1);
+
+                const half ms0 = exp(M0 - M);
+                const half ms1 = exp(M1 - M);
+
+                S = S0*ms0 + S1*ms1;
+
+                if (tiisg == 0) {
+                    ss[j*TS + 0] = S;
+                    ss[j*TS + 1] = M;
+
+                    ss[j*TS + 2*C + j        ] = ms0;
+                    ss[j*TS + 2*C + j + sg*SH] = ms1;
+                }
+            }
+
+            // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
+            {
+                s8x8_t ms0;
+                s8x8_t ms1;
+
+                simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
+                simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
+
+                #pragma unroll(D8)
+                for (short i = 0; i < D8; ++i) {
+                    o8x8_t t;
+
+                    simdgroup_load    (t, so + i*8, D, 0, false);
+                    simdgroup_multiply(t, ms1, t);
+
+                    simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
+                }
+            }
+        }
+    }
+
+    // store result to shared memory (reuse sq)
+    if (sgitg == 0) {
+        for (short i = 0; i < D8; ++i) {
+            simdgroup_store(lo[i], so + i*8, D, 0, false);
+        }
+    }
+
+    device float4 * dst4 = (device float4 *) dst;
+
+    // final rescale with 1/S and store to global memory
+    if (sgitg == 0) {
+        for (short j = 0; j < Q && iq1 + j < args.ne01; ++j) {
+            const float S = ss[j*TS + 0];
+
+            for (short i = tiisg; i < D4; i += NW) {
+                dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*D4 + i] = (float4) so4[j*D4 + i]/S;
+            }
+        }
+    }
+}
+
+// TODO: this is quite ugly. in the future these types will be hardcoded in the kernel, but for now keep them as
+//       template to be able to explore different combinations
+//
+#define FA_TYPES \
+    half,  half4,   simdgroup_half8x8,  \
+    half,  half4x4, simdgroup_half8x8,  \
+    half,  half4x4, simdgroup_half8x8,  \
+    float,          simdgroup_float8x8, \
+    float,          simdgroup_float8x8, \
+    half,  half4,   simdgroup_half8x8
+
+typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64>) flash_attn_ext_t;
+
+template [[host_name("kernel_flash_attn_ext_f16_h64" )]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  64>;
+template [[host_name("kernel_flash_attn_ext_f16_h80" )]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  80>;
+template [[host_name("kernel_flash_attn_ext_f16_h96" )]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  96>;
+template [[host_name("kernel_flash_attn_ext_f16_h112")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  112>;
+template [[host_name("kernel_flash_attn_ext_f16_h128")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  128>;
+template [[host_name("kernel_flash_attn_ext_f16_h256")]]  kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,    1, dequantize_f16,  256>;
+
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 64>;
+template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 80>;
+template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 96>;
+template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 112>;
+template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 128>;
+template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,  1, dequantize_bf16, 256>;
+#endif
+
+template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q4_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q4_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 96>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 112>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 128>;
+template [[host_name("kernel_flash_attn_ext_q4_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q5_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q5_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q5_1_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 96>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 112>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 128>;
+template [[host_name("kernel_flash_attn_ext_q5_1_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256>;
+
+template [[host_name("kernel_flash_attn_ext_q8_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 96>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 112>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 128>;
+template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256>;
+
+#undef FA_TYPES
+
+template<
+    typename q4_t,    // query types in shared memory
+    typename q4x4_t,
+    typename k4x4_t,  // key types in shared memory
+    typename v4x4_t,  // value types in shared memory
+    typename qk_t,    // Q*K types
+    typename s_t,     // soft-max types
+    typename s4_t,
+    typename s4x4_t,
+    typename o4x4_t,  // attention accumulation types
+    typename kd4x4_t, // key type in device memory
+    short nl_k,
+    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
+    typename vd4x4_t, // key type in device memory
+    short nl_v,
+    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
+    short D,         // head size
+    short Q  = 1,    // queries per threadgroup
+    short C  = 32>   // cache items per threadgroup
+kernel void kernel_flash_attn_ext_vec(
+        constant ggml_metal_kargs_flash_attn_ext & args,
+        device const char * q,
+        device const char * k,
+        device const char * v,
+        device const char * mask,
+        device       char * dst,
+        threadgroup  half * shmem_f16 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3   ntg[[threads_per_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short nsg = ntg.y; // number of simdgroups
+
+    const int iq3 = tgpig[2];
+    const int iq2 = tgpig[1];
+    const int iq1 = tgpig[0];
+
+    const short D4  = D/4;
+    const short D16 = D/16;
+    const short NW  = N_SIMDWIDTH;
+    const short NL  = NW/4; // note: this can be adjusted to support D%64 == 0 and D%32 == 0
+    const short SH  = 2*C;  // shared memory per simdgroup
+
+    const short T = D + nsg*SH; // shared memory size per query in (half)
+
+  //threadgroup q_t    * sq    = (threadgroup q_t    *) (shmem_f16 +                0*D); // holds the query data
+    threadgroup q4_t   * sq4   = (threadgroup q4_t   *) (shmem_f16 +                0*D); // same as above but in q4_t
+    threadgroup q4x4_t * sq4x4 = (threadgroup q4x4_t *) (shmem_f16 +                0*D); // same as above but in q4x4_t
+    threadgroup s_t    * ss    = (threadgroup s_t    *) (shmem_f16 + sgitg*SH     + Q*D); // scratch buffer for attention
+    threadgroup s4_t   * ss4   = (threadgroup s4_t   *) (shmem_f16 + sgitg*SH     + Q*D); // same as above but in s4_t
+    threadgroup half   * sm    = (threadgroup half   *) (shmem_f16 + sgitg*SH + C + Q*D); // scratch buffer for mask
+    threadgroup o4x4_t * sr4x4 = (threadgroup o4x4_t *) (shmem_f16 + sgitg*D      + Q*T); // scratch buffer for the results
+
+    // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
+    o4x4_t lo[D16/NL];
+
+    // load heads from Q to shared memory
+    device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*args.nb01 + iq2*args.nb02 + iq3*args.nb03));
+
+    for (short i = tiisg; i < D4; i += NW) {
+        if (iq1 < args.ne01) {
+            sq4[i] = (q4_t) q4[i];
+        } else {
+            sq4[i] = (q4_t) 0.0f;
+        }
+    }
+
+    // zero out lo
+    for (short i = 0; i < D16/NL; ++i) {
+        lo[i] = (o4x4_t) 0.0f;
+    }
+
+    // zero out shared memory SH
+    for (short i = tiisg; i < SH/4; i += NW) {
+        ss4[i] = (s4_t) 0.0f;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    {
+        half S = 0.0f;
+        half M = -__FLT16_MAX__/2;
+
+        // thread indices inside the simdgroup
+        const short tx = tiisg%NL;
+        const short ty = tiisg/NL;
+
+        // broadcast kv
+        //const short rk2 = args.ne02/args.ne12;
+        //const short rk3 = args.ne03/args.ne13;
+
+        const short ikv2 = iq2/(args.ne02/args.ne_12_2);
+        const short ikv3 = iq3/(args.ne03/args.ne_12_3);
+
+        // load the queries from shared memory into local memory
+        q4x4_t mq[D16/NL];
+
+        #pragma unroll(D16/NL)
+        for (short ii = 0; ii < D16; ii += NL) {
+            mq[ii/NL] = sq4x4[ii + tx];
+        }
+
+        const bool has_mask = mask != q;
+
+        // pointer to the mask
+        device const half * pm = (device const half *) (mask + iq1*args.nb31);
+
+        half slope = 1.0f;
+
+        // ALiBi
+        if (args.max_bias > 0.0f) {
+            const short h = iq2;
+
+            const half  base = h < args.n_head_log2 ? args.m0 : args.m1;
+            const short exph = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
+
+            slope = pow(base, exph);
+        }
+
+        // loop over the KV cache
+        // each simdgroup handles blocks of Q rows and C columns
+        for (int ic0 = 0; ic0 < args.ne11; ic0 += C*nsg) {
+            const int ic = ic0 + C*sgitg;
+            if (ic >= args.ne11) {
+                break;
+            }
+
+            if (has_mask) {
+                sm[tiisg] = pm[ic + tiisg];
+            }
+
+            // Q*K^T
+            {
+                // each simdgroup processes 1 query and 4 (NW/NL) keys
+                for (short cc = 0; cc < C/4; ++cc) {
+                    qk_t mqka[4] = { 0.0, 0.0, 0.0, 0.0 };
+
+                    device const kd4x4_t * pk = (device const kd4x4_t *) ((device const char *) k + ((ic + 4*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
+
+                    #pragma unroll(D16/NL)
+                    for (short ii = 0; ii < D16; ii += NL) {
+                        const short i = ii + tx;
+
+                        k4x4_t mk;
+                        deq_k(pk + i/nl_k, i%nl_k, mk);
+
+                        // note: this is less precise than the version below
+                        //mqka[0] += dot(mq[ii/NL][0], mk[0]);
+                        //mqka[1] += dot(mq[ii/NL][1], mk[1]);
+                        //mqka[2] += dot(mq[ii/NL][2], mk[2]);
+                        //mqka[3] += dot(mq[ii/NL][3], mk[3]);
+
+                        mqka[0] += dot((float4) mq[ii/NL][0], (float4) mk[0]);
+                        mqka[1] += dot((float4) mq[ii/NL][1], (float4) mk[1]);
+                        mqka[2] += dot((float4) mq[ii/NL][2], (float4) mk[2]);
+                        mqka[3] += dot((float4) mq[ii/NL][3], (float4) mk[3]);
+                    }
+
+                    qk_t mqk = mqka[0] + mqka[1] + mqka[2] + mqka[3];
+
+                    // simdgroup reduce
+                    // [ 0 ..  7] -> [ 0]
+                    // [ 8 .. 15] -> [ 8]
+                    // [16 .. 23] -> [16]
+                    // [24 .. 31] -> [24]
+                  //mqk += simd_shuffle_down(mqk, 16);
+                  //mqk += simd_shuffle_down(mqk,  8);
+                    mqk += simd_shuffle_down(mqk,  4);
+                    mqk += simd_shuffle_down(mqk,  2);
+                    mqk += simd_shuffle_down(mqk,  1);
+
+                    // mqk = mqk*scale + mask*slope
+                    if (tx == 0) {
+                        mqk *= args.scale;
+
+                        if (args.logit_softcap != 0.0f) {
+                            mqk = args.logit_softcap*precise::tanh(mqk);
+                        }
+
+                        mqk += sm[4*cc + ty]*slope;
+
+                        ss[4*cc + ty] = mqk;
+                    }
+                }
+            }
+
+            simdgroup_barrier(mem_flags::mem_threadgroup);
+
+            // online softmax
+            {
+                const half m = M;
+                const half s = ss[tiisg];
+
+                M = simd_max(max(M, s));
+
+                const half ms = exp(m - M);
+                const half vs = exp(s - M);
+
+                S = S*ms + simd_sum(vs);
+
+                // the P matrix from the paper (Q rows, C columns)
+                ss[tiisg] = vs;
+
+                // O = diag(ms)*O
+                #pragma unroll(D16/NL)
+                for (short ii = 0; ii < D16; ii += NL) {
+                    lo[ii/NL] *= ms;
+                }
+            }
+
+            simdgroup_barrier(mem_flags::mem_threadgroup);
+
+            // O = O + (Q*K^T)*V
+            {
+                for (short cc = 0; cc < C/4; ++cc) {
+                    device const vd4x4_t * pv4 = (device const vd4x4_t *) ((device const char *) v + ((ic + 4*cc + ty)*args.nb_12_1 + ikv2*args.nb_12_2 + ikv3*args.nb_12_3));
+
+                    const s4x4_t ms(ss[4*cc + ty]);
+
+                    #pragma unroll(D16/NL)
+                    for (short ii = 0; ii < D16; ii += NL) {
+                        const short i = ii + tx;
+
+                        v4x4_t mv;
+                        deq_v(pv4 + i/nl_v, i%nl_v, mv);
+
+                        lo[ii/NL] += mv*ms;
+                    }
+                }
+            }
+        }
+
+        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
+        if (tiisg == 0) {
+            ss[0] = (s_t) S;
+            ss[1] = (s_t) M;
+        }
+    }
+
+    // simdgroup reduce
+    // [ 0,  8, 16, 24] -> [ 0]
+    // [ 1,  9, 17, 25] -> [ 1]
+    // [ 2, 10, 18, 26] -> [ 2]
+    // [ 3, 11, 19, 27] -> [ 3]
+    // [ 4, 12, 20, 28] -> [ 4]
+    // [ 5, 13, 21, 29] -> [ 5]
+    // [ 6, 14, 22, 30] -> [ 6]
+    // [ 7, 15, 23, 31] -> [ 7]
+    for (short ii = 0; ii < D16; ii += NL) {
+        lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0], 16);
+        lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  8);
+      //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  4);
+      //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  2);
+      //lo[ii/NL][0] += simd_shuffle_down(lo[ii/NL][0],  1);
+
+        lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1], 16);
+        lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  8);
+      //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  4);
+      //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  2);
+      //lo[ii/NL][1] += simd_shuffle_down(lo[ii/NL][1],  1);
+
+        lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2], 16);
+        lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  8);
+      //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  4);
+      //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  2);
+      //lo[ii/NL][2] += simd_shuffle_down(lo[ii/NL][2],  1);
+
+        lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3], 16);
+        lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  8);
+      //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  4);
+      //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  2);
+      //lo[ii/NL][3] += simd_shuffle_down(lo[ii/NL][3],  1);
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // store results to shared memory
+    for (short i = tiisg; i < D16; i += NL) {
+        sr4x4[i] = lo[i/NL];
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // parallel reduce
+    for (short r = nsg/2; r > 0; r >>= 1) {
+        if (sgitg < r) {
+            const half S0 = ss[       0];
+            const half S1 = ss[r*SH + 0];
+
+            const half M0 = ss[       1];
+            const half M1 = ss[r*SH + 1];
+
+            const half M = max(M0, M1);
+
+            const half ms0 = exp(M0 - M);
+            const half ms1 = exp(M1 - M);
+
+            const half S = S0*ms0 + S1*ms1;
+
+            if (tiisg == 0) {
+                ss[0] = S;
+                ss[1] = M;
+            }
+
+            // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
+            for (short i = tiisg; i < D16; i += NW) {
+                sr4x4[i] = sr4x4[i]*ms0 + sr4x4[i + r*D16]*ms1;
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    device float4x4 * dst44 = (device float4x4 *) dst;
+
+    // final rescale with 1/S and store to global memory
+    if (sgitg == 0) {
+        const float S = ss[0];
+
+        for (short i = tiisg; i < D16; i += NW) {
+            dst44[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)iq1*args.ne1)*D16 + i] = (float4x4) sr4x4[i]/S;
+        }
+    }
+}
+
+// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem
+//       in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
+//
+#define FA_TYPES \
+           half4,  half4x4, \
+                   half4x4, \
+                   half4x4, \
+    float,                  \
+    half,  half4,  half4x4, \
+                   half4x4
+
+typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 128>) flash_attn_ext_vec_t;
+
+template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  128>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,   1, dequantize_bf16, 128>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0,  2, dequantize_q4_0, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1,  2, dequantize_q4_1, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0,  2, dequantize_q5_0, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1,  2, dequantize_q5_1, 128>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0,  2, dequantize_q8_0, 128>;
+
+template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4x4,    1, dequantize_f16,  half4x4,     1, dequantize_f16,  256>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4x4,  1, dequantize_bf16, bfloat4x4,   1, dequantize_bf16, 256>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0,  2, dequantize_q4_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1,  2, dequantize_q4_1, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0,  2, dequantize_q5_0, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1,  2, dequantize_q5_1, 256>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0,  2, dequantize_q8_0, 256>;
+
+#undef FA_TYPES
+
+template<typename T>
+kernel void kernel_set(
+    constant ggml_metal_kargs_set & args,
+    device  const char * src0,
+    device  const char * src1,
+    device        char * dst,
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    ushort3 tpitg[[thread_position_in_threadgroup]],
+    ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i13 = tgpig[2];
+    const int i12 = tgpig[1];
+    const int i11 = tgpig[0];
+
+    const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10;
+
+    const int64_t i3 = n / (args.ne12*args.ne11*args.ne10);
+    const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10);
+    const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10;
+
+    device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs);
+
+    for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) {
+        device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10);
+        dst_data[i10] = (T) src[0];
+    }
+}
+
+typedef decltype(kernel_set<float>) kernel_set_t;
+
+template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set<float>;
+template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set<int32_t>;
+
+template<typename T0, typename T1>
+kernel void kernel_cpy(
+        constant ggml_metal_kargs_cpy & args,
+        device  const char * src0,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+
+    device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
+        device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+        dst_data[i00] = (T1) src[0];
+    }
+}
+
+typedef decltype(kernel_cpy<float, float>) kernel_cpy_t;
+
+template [[host_name("kernel_cpy_f32_f32")]]   kernel kernel_cpy_t kernel_cpy<float,  float>;
+template [[host_name("kernel_cpy_f32_f16")]]   kernel kernel_cpy_t kernel_cpy<float,  half>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_cpy_f32_bf16")]]  kernel kernel_cpy_t kernel_cpy<float,  bfloat>;
+#endif
+template [[host_name("kernel_cpy_f16_f32")]]   kernel kernel_cpy_t kernel_cpy<half,   float>;
+template [[host_name("kernel_cpy_f16_f16")]]   kernel kernel_cpy_t kernel_cpy<half,   half>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_cpy_bf16_f32")]]  kernel kernel_cpy_t kernel_cpy<bfloat, float>;
+template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy<bfloat, bfloat>;
+#endif
+
+kernel void kernel_cpy_f32_q8_0(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK8_0;
+
+    device block_q8_0 * dst_data = (device block_q8_0 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x*QK8_0; i00 < args.ne00; i00 += ntg.x*QK8_0) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK8_0; j++) {
+            const float v = src[j];
+            amax = MAX(amax, fabs(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK8_0].d = d;
+
+        for (int j = 0; j < QK8_0; ++j) {
+            const float x0 = src[j]*id;
+
+            dst_data[i00/QK8_0].qs[j] = round(x0);
+        }
+    }
+}
+
+kernel void kernel_cpy_f32_q4_0(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK4_0;
+
+    device block_q4_0 * dst_data = (device block_q4_0 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x*QK4_0; i00 < args.ne00; i00 += ntg.x*QK4_0) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < QK4_0; j++) {
+            const float v = src[j];
+            if (amax < fabs(v)) {
+                amax = fabs(v);
+                max  = v;
+            }
+        }
+
+        const float d = max / -8;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK4_0].d = d;
+
+        for (int j = 0; j < QK4_0/2; ++j) {
+            const float x0 = src[0       + j]*id;
+            const float x1 = src[QK4_0/2 + j]*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+            dst_data[i00/QK4_0].qs[j]  = xi0;
+            dst_data[i00/QK4_0].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
+kernel void kernel_cpy_f32_q4_1(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK4_1;
+
+    device block_q4_1 * dst_data = (device block_q4_1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x*QK4_1; i00 < args.ne00; i00 += ntg.x*QK4_1) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+
+        for (int j = 0; j < QK4_1; j++) {
+            const float v = src[j];
+            if (min > v) min = v;
+            if (max < v) max = v;
+        }
+
+        const float d = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK4_1].d = d;
+        dst_data[i00/QK4_1].m = min;
+
+        for (int j = 0; j < QK4_1/2; ++j) {
+            const float x0 = (src[0       + j] - min)*id;
+            const float x1 = (src[QK4_1/2 + j] - min)*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
+
+            dst_data[i00/QK4_1].qs[j]  = xi0;
+            dst_data[i00/QK4_1].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
+kernel void kernel_cpy_f32_q5_0(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK5_0;
+
+    device block_q5_0 * dst_data = (device block_q5_0 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x*QK5_0; i00 < args.ne00; i00 += ntg.x*QK5_0) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < QK5_0; j++) {
+            const float v = src[j];
+            if (amax < fabs(v)) {
+                amax = fabs(v);
+                max  = v;
+            }
+        }
+
+        const float d = max / -16;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK5_0].d = d;
+
+        uint32_t qh = 0;
+        for (int j = 0; j < QK5_0/2; ++j) {
+            const float x0 = src[0       + j]*id;
+            const float x1 = src[QK5_0/2 + j]*id;
+
+            const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
+            const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
+
+            dst_data[i00/QK5_0].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+        thread const uint8_t * qh8 = (thread const uint8_t *)&qh;
+        for (int j = 0; j < 4; ++j) {
+            dst_data[i00/QK5_0].qh[j] = qh8[j];
+        }
+    }
+}
+
+kernel void kernel_cpy_f32_q5_1(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK5_1;
+
+    device block_q5_1 * dst_data = (device block_q5_1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x*QK5_1; i00 < args.ne00; i00 += ntg.x*QK5_1) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+
+        float max = src[0];
+        float min = src[0];
+
+        for (int j = 1; j < QK5_1; j++) {
+            const float v = src[j];
+            min = v < min ? v : min;
+            max = v > max ? v : max;
+        }
+
+        const float d = (max - min) / 31;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK5_1].d = d;
+        dst_data[i00/QK5_1].m = min;
+
+        uint32_t qh = 0;
+        for (int j = 0; j < QK5_1/2; ++j) {
+            const float x0 = (src[0       + j] - min)*id;
+            const float x1 = (src[QK5_1/2 + j] - min)*id;
+
+            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+            dst_data[i00/QK5_1].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
+        }
+        thread const uint8_t * qh8 = (thread const uint8_t *)&qh;
+        for (int j = 0; j < 4; ++j) {
+            dst_data[i00/QK5_1].qh[j] = qh8[j];
+        }
+    }
+}
+
+static inline int best_index_int8(int n, constant float * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+kernel void kernel_cpy_f32_iq4_nl(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK4_NL;
+
+    device block_iq4_nl * dst_data = (device block_iq4_nl *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x*QK4_NL; i00 < args.ne00; i00 += ntg.x*QK4_NL) {
+        device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
+
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < QK4_0; j++) {
+            const float v = src[j];
+            if (amax < fabs(v)) {
+                amax = fabs(v);
+                max  = v;
+            }
+        }
+
+        const float d = max / kvalues_iq4nl_f[0];
+        const float id = d ? 1.0f/d : 0.0f;
+
+        float sumqx = 0, sumq2 = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            const float x0 = src[0        + j]*id;
+            const float x1 = src[QK4_NL/2 + j]*id;
+
+            const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0);
+            const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1);
+
+            dst_data[i00/QK4_NL].qs[j] = xi0 | (xi1 << 4);
+
+            const float v0 = kvalues_iq4nl_f[xi0];
+            const float v1 = kvalues_iq4nl_f[xi1];
+            const float w0 = src[0        + j]*src[0        + j];
+            const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j];
+            sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j];
+            sumq2 += w0*v0*v0 + w1*v1*v1;
+
+        }
+
+        dst_data[i00/QK4_NL].d = sumq2 > 0 ? sumqx/sumq2 : d;
+    }
+}
+
+kernel void kernel_concat(
+    constant ggml_metal_kargs_concat & args,
+    device  const char * src0,
+    device  const char * src1,
+    device        char * dst,
+    uint3   tgpig[[threadgroup_position_in_grid]],
+    ushort3 tpitg[[thread_position_in_threadgroup]],
+    ushort3   ntg[[threads_per_threadgroup]]) {
+
+    const int i3 = tgpig.z;
+    const int i2 = tgpig.y;
+    const int i1 = tgpig.x;
+
+    int o[4] = {0, 0, 0, 0};
+    o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
+
+    device const float * x;
+
+    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
+            x = (device const float *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
+        } else {
+            x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
+        }
+
+        device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+        *y = *x;
+    }
+}
+
+template<typename args_t>
+void kernel_mul_mv_q2_K_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_q2_K * x = (device const block_q2_K *) (src0 + offset0);
+    device const float      * y = (device const float      *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int iq = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3
+    const int is = (8*ir)/16;// 0 or 1
+
+    device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir;
+
+    for (int ib = ix; ib < nb; ib += 4) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
+            yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8];
+            yl[i+16] = y4[i+64]; sumy[2] += yl[i+16];
+            yl[i+24] = y4[i+96]; sumy[3] += yl[i+24];
+        }
+
+        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales + 8*iq + is;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
+                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
+                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
+                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
+                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
+                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
+                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
+                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
+            }
+            float dall = dh[0];
+            float dmin = dh[1] * 1.f/16.f;
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
+                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f +
+                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f +
+                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
+                         dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
+
+            qs += args.nb01/2;
+            sc += args.nb01;
+            dh += args.nb01/2;
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q2_K_f32")]]
+kernel void kernel_mul_mv_q2_K_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q2_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+template<typename args_t>
+void kernel_mul_mv_q3_K_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_q3_K * x = (device const block_q3_K *) (src0 + offset0);
+    device const float     * yy = (device const float      *) (src1 + offset1);
+
+    float yl[32];
+
+    //const uint16_t kmask1 = 0x3030;
+    //const uint16_t kmask2 = 0x0f0f;
+
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int ip  = tid/4;          // 0 or 1
+    const int il  = 2*((tid%4)/2);  // 0 or 2
+    const int ir  = tid%2;
+    const int n   = 8;
+    const int l0  = n*ir;
+
+    // One would think that the Metal compiler would figure out that ip and il can only have
+    // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
+    // with these two tales.
+    //
+    // Possible masks for the high bit
+    const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200},  // ip = 0, il = 0
+                           {0x0004, 0x0400, 0x0008, 0x0800},  // ip = 0, il = 2
+                           {0x0010, 0x1000, 0x0020, 0x2000},  // ip = 1, il = 0
+                           {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
+
+    // Possible masks for the low 2 bits
+    const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
+
+    const ushort4 hm = mm[2*ip + il/2];
+
+    const short shift = 2*il;
+
+    const float v1 = il == 0 ? 4.f : 64.f;
+    const float v2 = 4.f * v1;
+
+    const uint16_t s_shift1 = 4*ip;
+    const uint16_t s_shift2 = s_shift1 + il;
+
+    const int q_offset = 32*ip + l0;
+    const int y_offset = 128*ip + 32*il + l0;
+
+    device const float * y1 = yy + ix*QK_K + y_offset;
+
+    uint32_t scales32, aux32;
+    thread uint16_t * scales16 = (thread uint16_t *)&scales32;
+    thread const int8_t * scales = (thread const int8_t *)&scales32;
+
+    float sumf1[2] = {0.f};
+    float sumf2[2] = {0.f};
+    for (int i = ix; i < nb; i += 4) {
+        for (int l = 0; l < 8; ++l) {
+            yl[l+ 0] = y1[l+ 0];
+            yl[l+ 8] = y1[l+16];
+            yl[l+16] = y1[l+32];
+            yl[l+24] = y1[l+48];
+        }
+
+        device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
+        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0);
+        device const uint16_t * a = (device const uint16_t *)(x[i].scales);
+        device const half * dh = &x[i].d;
+
+        for (int row = 0; row < 2; ++row) {
+            const float d_all = (float)dh[0];
+
+            scales16[0] = a[4];
+            scales16[1] = a[5];
+            aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
+            scales16[0] = a[il+0];
+            scales16[1] = a[il+1];
+            scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
+
+            float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
+            for (int l = 0; l < n; l += 2) {
+                const int32_t qs = q[l/2];
+                s1 += yl[l+0] * (qs & qm[il/2][0]);
+                s2 += yl[l+1] * (qs & qm[il/2][1]);
+                s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
+                s4 += yl[l+16] * (qs & qm[il/2][2]);
+                s5 += yl[l+17] * (qs & qm[il/2][3]);
+                s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
+            }
+            float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
+            float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
+            sumf1[row] += d1 * (scales[0] - 32);
+            sumf2[row] += d2 * (scales[2] - 32);
+
+            s1 = s2 = s3 = s4 = s5 = s6 = 0;
+            for (int l = 0; l < n; l += 2) {
+                const int32_t qs = q[l/2+8];
+                s1 += yl[l+8] * (qs & qm[il/2][0]);
+                s2 += yl[l+9] * (qs & qm[il/2][1]);
+                s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
+                s4 += yl[l+24] * (qs & qm[il/2][2]);
+                s5 += yl[l+25] * (qs & qm[il/2][3]);
+                s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
+            }
+            d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
+            d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
+            sumf1[row] += d1 * (scales[1] - 32);
+            sumf2[row] += d2 * (scales[3] - 32);
+
+            q  += args.nb01/2;
+            h  += args.nb01/2;
+            a  += args.nb01/2;
+            dh += args.nb01/2;
+        }
+
+        y1 += 4 * QK_K;
+    }
+
+    for (int row = 0; row < 2; ++row) {
+        const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
+        sumf1[row] = simd_sum(sumf);
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    if (tiisg == 0) {
+        for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
+            dst_f32[first_row + row] = sumf1[row];
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q3_K_f32")]]
+kernel void kernel_mul_mv_q3_K_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q3_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+template<typename args_t>
+void kernel_mul_mv_q4_K_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int iq = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_q4_K * x = (device const block_q4_K *) (src0 + offset0);
+    device const float      * y = (device const float      *) (src1 + offset1);
+
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
+
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    for (int ib = ix; ib < nb; ib += 4) {
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq;
+        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+            sc16[0] = sc[0] & kmask1;
+            sc16[1] = sc[2] & kmask1;
+            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
+            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
+
+            device const uint16_t * q2 = q1 + 32;
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
+                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
+                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
+                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
+                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
+                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
+                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
+                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
+                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += args.nb01/2;
+            sc += args.nb01/2;
+            dh += args.nb01/2;
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q4_K_f32")]]
+kernel void kernel_mul_mv_q4_K_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q4_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+template<typename args_t>
+void kernel_mul_mv_q5_K_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_q5_K * x = (device const block_q5_K *) (src0 + offset0);
+    device const float     * yy = (device const float      *) (src1 + offset1);
+
+    float sumf[2]={0.f};
+
+    float yl[16], yh[16];
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int iq  = tid/4;
+    const int ir  = tid%4;
+    const int n   = 8;
+
+    const int l0 = n*ir;
+    const int q_offset = 32*iq + l0;
+    const int y_offset = 64*iq + l0;
+
+    const uint8_t hm1 = 1u << (2*iq);
+    const uint8_t hm2 = hm1 << 1;
+    const uint8_t hm3 = hm1 << 4;
+    const uint8_t hm4 = hm2 << 4;
+
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    device const float * y1 = yy + ix*QK_K + y_offset;
+
+    for (int i = ix; i < nb; i += 4) {
+        device const uint8_t * q1 = x[i].qs + q_offset;
+        device const uint8_t * qh = x[i].qh + l0;
+        device const half * dh = &x[i].d;
+        device const uint16_t * a = (device const uint16_t *)x[i].scales + iq;
+
+        device const float * y2 = y1 + 128;
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 8; ++l) {
+            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
+            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
+            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
+            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
+        }
+
+        for (int row = 0; row < 2; ++row) {
+            device const uint8_t * q2 = q1 + 64;
+
+            sc16[0] = a[0] & kmask1;
+            sc16[1] = a[2] & kmask1;
+            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
+            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
+
+            float4 acc1 = {0.f};
+            float4 acc2 = {0.f};
+            for (int l = 0; l < n; ++l) {
+                uint8_t h = qh[l];
+                acc1[0] += yl[l+0] * (q1[l] & 0x0F);
+                acc1[1] += yl[l+8] * (q1[l] & 0xF0);
+                acc1[2] += yh[l+0] * (q2[l] & 0x0F);
+                acc1[3] += yh[l+8] * (q2[l] & 0xF0);
+                acc2[0] += h & hm1 ? yl[l+0] : 0.f;
+                acc2[1] += h & hm2 ? yl[l+8] : 0.f;
+                acc2[2] += h & hm3 ? yh[l+0] : 0.f;
+                acc2[3] += h & hm4 ? yh[l+8] : 0.f;
+            }
+            const float dall = dh[0];
+            const float dmin = dh[1];
+            sumf[row] += dall * (sc8[0] * (acc1[0] +  16.f*acc2[0]) +
+                                 sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
+                                 sc8[4] * (acc1[2] +  16.f*acc2[2]) +
+                                 sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += args.nb01;
+            qh += args.nb01;
+            dh += args.nb01/2;
+            a  += args.nb01/2;
+        }
+
+        y1 += 4 * QK_K;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = tot;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q5_K_f32")]]
+kernel void kernel_mul_mv_q5_K_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q5_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+template <typename args_t>
+void kernel_mul_mv_q6_K_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const uint8_t kmask1 = 0x03;
+    const uint8_t kmask2 = 0x0C;
+    const uint8_t kmask3 = 0x30;
+    const uint8_t kmask4 = 0xC0;
+
+    const int nb = args.ne00/QK_K;
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int row = 2*r0 + sgitg;
+
+    if (row >= args.ne0) {
+        return;
+    }
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =  r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_q6_K * x = (device const block_q6_K *) (src0 + offset0);
+    device const float     * yy = (device const float      *) (src1 + offset1);
+
+    float sumf = 0;
+
+    const int tid  = tiisg/2;
+    const int ix   = tiisg%2;
+    const int ip   = tid/8;         // 0 or 1
+    const int il   = tid%8;
+    const int n    = 4;
+    const int l0   = n*il;
+    const int is   = 8*ip + l0/16;
+
+    const int y_offset = 128*ip + l0;
+    const int q_offset_l = 64*ip + l0;
+    const int q_offset_h = 32*ip + l0;
+
+    for (int i = ix; i < nb; i += 2) {
+        device const uint8_t * q1 = x[i].ql + q_offset_l;
+        device const uint8_t * q2 = q1 + 32;
+        device const uint8_t * qh = x[i].qh + q_offset_h;
+        device const int8_t  * sc = x[i].scales + is;
+
+        device const float * y = yy + i * QK_K + y_offset;
+
+        const float dall = x[i].d;
+
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < n; ++l) {
+            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+64] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
+            sums[3] += y[l+96] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+        }
+
+        sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
+
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    const float tot = simd_sum(sumf);
+    if (tiisg == 0) {
+        dst_f32[row] = tot;
+    }
+}
+
+[[host_name("kernel_mul_mv_q6_K_f32")]]
+kernel void kernel_mul_mv_q6_K_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q6_K_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+// ======================= "True" 2-bit
+
+template<typename args_t>
+void kernel_mul_mv_iq2_xxs_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq2_xxs * x = (device const block_iq2_xxs *) (src0 + offset0);
+    device const float         * y = (device const float         *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem);
+    threadgroup uint8_t  * ssigns  = (threadgroup uint8_t  *)(svalues + 256);
+    {
+        int nval = 4;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xxs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq2_xxs * xr = x + ibl;
+        device const uint16_t * q2 = xr->qs + 4 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            device const uint8_t * aux8 = (device const uint8_t *)q2;
+            const uint32_t aux32 = q2[2] | (q2[3] << 16);
+            const float d = db * (0.5f + (aux32 >> 28));
+
+            float sum = 0;
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + aux8[l]);
+                const uint8_t signs = ssigns[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 8; ++j) {
+                    sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d * sum;
+
+            dh += args.nb01/2;
+            q2 += args.nb01/2;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum * 0.25f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq2_xxs_f32")]]
+kernel void kernel_mul_mv_iq2_xxs_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    kernel_mul_mv_iq2_xxs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+template<typename args_t>
+void kernel_mul_mv_iq2_xs_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq2_xs * x = (device const block_iq2_xs *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint64_t * svalues = (threadgroup uint64_t *)(shmem);
+    threadgroup uint8_t  * ssigns  = (threadgroup uint8_t  *)(svalues + 512);
+    {
+        int nval = 8;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2xs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq2_xs * xr = x + ibl;
+        device const uint16_t * q2 = xr->qs + 4 * ib;
+        device const uint8_t  * sc = xr->scales + ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const uint8_t ls1 = sc[0] & 0xf;
+            const uint8_t ls2 = sc[0] >>  4;
+            const float d1 = db * (0.5f + ls1);
+            const float d2 = db * (0.5f + ls2);
+
+            float sum1 = 0, sum2 = 0;
+            for (int l = 0; l < 2; ++l) {
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511));
+                const uint8_t signs = ssigns[(q2[l] >> 9)];
+                for (int j = 0; j < 8; ++j) {
+                    sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+            }
+            for (int l = 2; l < 4; ++l) {
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(svalues + (q2[l] & 511));
+                const uint8_t signs = ssigns[(q2[l] >> 9)];
+                for (int j = 0; j < 8; ++j) {
+                    sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d1 * sum1 + d2 * sum2;
+
+            dh += args.nb01/2;
+            q2 += args.nb01/2;
+            sc += args.nb01;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum * 0.25f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq2_xs_f32")]]
+kernel void kernel_mul_mv_iq2_xs_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq2_xs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+template <typename args_t>
+void kernel_mul_mv_iq3_xxs_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq3_xxs * x = (device const block_iq3_xxs *) (src0 + offset0);
+    device const float         * y = (device const float         *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint32_t * svalues = (threadgroup uint32_t *)(shmem);
+    threadgroup uint8_t  * ssigns  = (threadgroup uint8_t  *)(svalues + 256);
+    {
+        int nval = 4;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3xxs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) ssigns[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq3_xxs * xr = x + ibl;
+        device const uint8_t  * q3 = xr->qs + 8 * ib;
+        device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+            const float db = dh[0];
+            const uint32_t aux32 = gas[0] | (gas[1] << 16);
+            const float d = db * (0.5f + (aux32 >> 28));
+
+            float2 sum = {0};
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + q3[2*l+0]);
+                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + q3[2*l+1]);
+                const uint8_t signs = ssigns[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d * (sum[0] + sum[1]);
+
+            dh  += args.nb01/2;
+            q3  += args.nb01;
+            gas += args.nb01/2;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum * 0.5f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq3_xxs_f32")]]
+kernel void kernel_mul_mv_iq3_xxs_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq3_xxs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+template<typename args_t>
+void kernel_mul_mv_iq3_s_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq3_s * x = (device const block_iq3_s *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint32_t * svalues = (threadgroup uint32_t *) shmem;
+    {
+        int nval = 8;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) svalues[pos + i] = iq3s_grid[pos + i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq3_s * xr = x + ibl;
+        device const uint8_t * qs = xr->qs + 8 * ib;
+        device const uint8_t * qh = xr->qh + ib;
+        device const uint8_t * sc = xr->scales + (ib/2);
+        device const uint8_t * signs = xr->signs + 4 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf));
+
+            float2 sum = {0};
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? svalues + 256 : svalues;
+                const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? svalues + 256 : svalues;
+                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
+                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
+                for (int j = 0; j < 4; ++j) {
+                    sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
+                    sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
+                }
+            }
+            sumf[row] += d * (sum[0] + sum[1]);
+
+            dh    += args.nb01/2;
+            qs    += args.nb01;
+            qh    += args.nb01;
+            sc    += args.nb01;
+            signs += args.nb01;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq3_s_f32")]]
+kernel void kernel_mul_mv_iq3_s_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq3_s_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+template <typename args_t>
+void kernel_mul_mv_iq2_s_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq2_s * x = (device const block_iq2_s *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    //threadgroup uint64_t * svalues = (threadgroup uint64_t *) shmem;
+    //{
+    //    int nval = 32;
+    //    int pos  = (32*sgitg + tiisg)*nval;
+    //    for (int i = 0; i < nval; ++i) svalues[pos + i] = iq2s_grid[pos + i];
+    //    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //}
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq2_s * xr = x + ibl;
+        device const uint8_t * qs = xr->qs + 4 * ib;
+        device const uint8_t * qh = xr->qh + ib;
+        device const uint8_t * sc = xr->scales + ib;
+        device const uint8_t * signs = qs + QK_K/8;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const float d1 = db * (0.5f + (sc[0] & 0xf));
+            const float d2 = db * (0.5f + (sc[0] >>  4));
+
+            float2 sum = {0};
+            for (int l = 0; l < 2; ++l) {
+                //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(svalues + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
+                //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(svalues + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
+                constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
+                constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sum[0] += yl[8*l + j +  0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
+                    sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
+                }
+            }
+            sumf[row] += d1 * sum[0] + d2 * sum[1];
+
+            dh    += args.nb01/2;
+            qs    += args.nb01;
+            qh    += args.nb01;
+            sc    += args.nb01;
+            signs += args.nb01;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum * 0.25f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq2_s_f32")]]
+kernel void kernel_mul_mv_iq2_s_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq2_s_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+template<typename args_t>
+void kernel_mul_mv_iq1_s_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq1_s * x = (device const block_iq1_s *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        float sumy = 0;
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+            sumy += yl[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq1_s * xr = x + ibl;
+        device const uint8_t  * qs = xr->qs + 4 * ib;
+        device const uint16_t * qh = xr->qh + ib;
+        device const half     * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+            constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700)));
+            constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700)));
+            constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700)));
+
+            float sum = 0;
+            for (int j = 0; j < 4; ++j) {
+                sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4)
+                     + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4)
+                     + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+                     + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
+            }
+            sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1);
+
+            dh += args.nb01/2;
+            qs += args.nb01;
+            qh += args.nb01/2;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+template <typename args_t>
+void kernel_mul_mv_iq1_m_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq1_m * x = (device const block_iq1_m *) (src0 + offset0);
+    device const float       * y = (device const float       *) (src1 + offset1);
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    iq1m_scale_t scale;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        float4 sumy = {0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
+            yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8];
+            yl[i+16] = y4[i+16]; sumy[2] += yl[i+16];
+            yl[i+24] = y4[i+24]; sumy[3] += yl[i+24];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq1_m * xr = x + ibl;
+        device const uint8_t  * qs = xr->qs + 4 * ib;
+        device const uint8_t  * qh = xr->qh + 2 * ib;
+        device const uint16_t * sc = (device const uint16_t *)xr->scales;
+
+        for (int row = 0; row < N_DST; row++) {
+            scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+            constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+            constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+            constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700)));
+            constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700)));
+
+            float2 sum = {0.f};
+            for (int j = 0; j < 4; ++j) {
+                sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4)
+                        + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4);
+                sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+                        + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
+            }
+            const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+            const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+
+            sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
+                                             (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
+
+            sc += args.nb01/2;
+            qs += args.nb01;
+            qh += args.nb01;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+template<typename args_t>
+void kernel_mul_mv_iq4_nl_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
+    const int nb = args.ne00/QK4_NL;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    const int first_row = (r0 * 2 + sgitg) * 2;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
+
+    const int ix = tiisg/2;  // 0...15
+    const int it = tiisg%2;  // 0 or 1
+
+    shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float4 yl[4];
+    float sumf[2]={0.f}, all_sum;
+
+    device const float * yb = y + ix * QK4_NL + it * 8;
+
+    uint32_t aux32[2];
+    thread const uint8_t * q8 = (thread const uint8_t *)aux32;
+
+    float4 qf1, qf2;
+
+    for (int ib = ix; ib < nb; ib += 16) {
+
+        device const float4 * y4 = (device const float4 *)yb;
+        yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
+
+        for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
+
+            device const block_iq4_nl & xb = x[row*nb + ib];
+            device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
+
+            float4 acc1 = {0.f}, acc2 = {0.f};
+
+            aux32[0] = q4[0] | (q4[1] << 16);
+            aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
+            aux32[0] &= 0x0f0f0f0f;
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
+            acc1 += yl[0] * qf1;
+            acc2 += yl[1] * qf2;
+
+            aux32[0] = q4[2] | (q4[3] << 16);
+            aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
+            aux32[0] &= 0x0f0f0f0f;
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
+            acc1 += yl[2] * qf1;
+            acc2 += yl[3] * qf2;
+
+            acc1 += acc2;
+
+            sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
+
+        }
+
+        yb += 16 * QK4_NL;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+template<typename args_t>
+void kernel_mul_mv_iq4_xs_f32_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg) {
+
+    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
+    const int nb = args.ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    const int first_row = (r0 * 2 + sgitg) * 2;
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const uint64_t offset1 =        r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;
+
+    device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0);
+    device const float        * y = (device const float        *) (src1 + offset1);
+
+    const int ix = tiisg/16;  // 0 or 1
+    const int it = tiisg%16;  // 0...15
+    const int ib = it/2;
+    const int il = it%2;
+
+    shmem_f32[tiisg] = kvalues_iq4nl_f[tiisg%16];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float4 yl[4];
+    float sumf[2]={0.f}, all_sum;
+
+    device const float * yb = y + ix * QK_K + ib * 32 + il * 8;
+
+    uint32_t aux32[2];
+    thread const uint8_t * q8 = (thread const uint8_t *)aux32;
+
+    float4 qf1, qf2;
+
+    for (int ibl = ix; ibl < nb; ibl += 2) {
+        device const float4 * y4 = (device const float4 *)yb;
+        yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
+
+        for (int row = 0; row < 2; ++row) {
+            device const block_iq4_xs & xb = x[row*nb + ibl];
+            device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);
+
+            float4 acc1 = {0.f}, acc2 = {0.f};
+
+            aux32[0] = (q4[0]     ) & 0x0f0f0f0f;
+            aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f;
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
+            acc1 += yl[0] * qf1;
+            acc2 += yl[1] * qf2;
+
+            aux32[0] = (q4[1]     ) & 0x0f0f0f0f;
+            aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f;
+            qf1 = {shmem_f32[q8[0]], shmem_f32[q8[1]], shmem_f32[q8[2]], shmem_f32[q8[3]]};
+            qf2 = {shmem_f32[q8[4]], shmem_f32[q8[5]], shmem_f32[q8[6]], shmem_f32[q8[7]]};
+            acc1 += yl[2] * qf1;
+            acc2 += yl[3] * qf2;
+
+            acc1 += acc2;
+
+            const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32;
+            sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
+
+        }
+
+        yb += 2 * QK_K;
+    }
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
+
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst_f32[first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq1_s_f32")]]
+kernel void kernel_mul_mv_iq1_s_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq1_s_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+[[host_name("kernel_mul_mv_iq1_m_f32")]]
+kernel void kernel_mul_mv_iq1_m_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq1_m_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
+}
+
+[[host_name("kernel_mul_mv_iq4_nl_f32")]]
+kernel void kernel_mul_mv_iq4_nl_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq4_nl_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+[[host_name("kernel_mul_mv_iq4_xs_f32")]]
+kernel void kernel_mul_mv_iq4_xs_f32(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq4_xs_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
+kernel void kernel_get_rows_q(
+        device const  void * src0,
+        device const  void * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((const device int32_t *) ((const device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int64_t ind = tiitg; ind < ne00/16; ind += tptg.x) {
+        float4x4 temp;
+        dequantize_func(((device const block_q *) ((const device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp);
+        *(((device float4x4 *) ((device char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
+    }
+}
+
+template<typename T>
+kernel void kernel_get_rows_f(
+        device const  void * src0,
+        device const  void * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((const device int32_t *) ((const device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
+        ((      device float *) ((      device char *)  dst + i11*nb2  + i10*nb1))[ind] =
+        ((const device T     *) ((const device char *) src0 + i02*nb02 +  r*nb01))[ind];
+    }
+}
+
+kernel void kernel_get_rows_i32(
+        device const  void * src0,
+        device const  void * src1,
+        device     int32_t * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((const device int32_t *) ((const device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
+        ((      device int32_t *) ((      device char *) dst  + i11*nb2 + i10*nb1))[ind] =
+        ((const device int32_t *) ((const device char *) src0 + i02*nb02 + r*nb01))[ind];
+    }
+}
+
+
+#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
+#define BLOCK_SIZE_K 32
+#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
+#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
+#define THREAD_PER_BLOCK 128
+#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers
+#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers
+#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
+#define SG_MAT_ROW 8
+
+// each block_q contains 16*nl weights
+template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
+kernel void kernel_mul_mm(
+        constant ggml_metal_kargs_mul_mm & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    threadgroup T     * sa = (threadgroup T     *)(shmem);
+    threadgroup float * sb = (threadgroup float *)(shmem + 4096);
+
+    const int r0 = tgpig.y;
+    const int r1 = tgpig.x;
+    const int im = tgpig.z;
+
+    // if this block is of 64x32 shape or smaller
+    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
+    // a thread shouldn't load data outside of the matrix
+    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+
+    simdgroup_T8x8     ma[4];
+    simdgroup_float8x8 mb[2];
+    simdgroup_float8x8 mc[8];
+
+    for (short i = 0; i < 8; i++){
+        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+
+    short il = (tiitg % THREAD_PER_ROW);
+
+    const int i12 = im%args.ne12;
+    const int i13 = im/args.ne12;
+
+    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const short    offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0
+        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
+
+    device const float   * y = (device const float   *)(src1
+        + args.nb13*i13
+        + args.nb12*i12
+        + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
+        + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+
+    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
+        // load data and store to threadgroup memory
+        T4x4 temp_a;
+        dequantize_func(x, il, temp_a);
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        #pragma unroll(16)
+        for (short i = 0; i < 16; i++) {
+            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
+            +                     (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
+            +                     (tiitg/THREAD_PER_ROW)%8  + (i&7)*8) = temp_a[i/4][i%4];
+        }
+
+        *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
+
+        il = (il + 2 < nl) ? il + 2 : il % 2;
+        x  = (il < 2) ? x + (2 + nl - 1)/nl : x;
+        y += BLOCK_SIZE_K;
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // load matrices from threadgroup memory and conduct outer products
+        threadgroup const T     * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
+        threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
+
+        #pragma unroll(4)
+        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
+            #pragma unroll(4)
+            for (short i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
+            }
+
+            simdgroup_barrier(mem_flags::mem_none);
+
+            #pragma unroll(2)
+            for (short i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
+            }
+
+            #pragma unroll(8)
+            for (short i = 0; i < 8; i++){
+                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
+            }
+
+            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
+        }
+    }
+
+    if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
+        device float * C = (device float *) dst +
+            (BLOCK_SIZE_M * r0 + 32*(sgitg &  1)) + \
+            (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
+
+        for (short i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
+        }
+    } else {
+        // block is smaller than 64x32, we should avoid writing data outside of the matrix
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup float * temp_str = ((threadgroup float *) shmem) \
+                                     + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
+        for (short i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (sgitg == 0) {
+            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
+                device float  * D  = (device float  *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.ne0 + im*args.ne1*args.ne0;
+                device float4 * D4 = (device float4 *) D;
+
+                threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
+                threadgroup float4 * C4 = (threadgroup float4 *) C;
+
+                int i = 0;
+                for (; i < n_rows/4; i++) {
+                    *(D4 + i) = *(C4 + i);
+                }
+
+                i *= 4;
+                for (; i < n_rows; i++) {
+                    *(D + i) = *(C + i);
+                }
+            }
+        }
+    }
+}
+
+// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids
+// TODO: this kernel needs to be reimplemented from scratch for better performance
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+void kernel_mul_mm_id_impl(
+        int32_t  ne00,
+        int32_t  ne02,
+        uint64_t nb01,
+        uint64_t nb02,
+        int32_t  ne11,
+        int32_t  ne12,
+        uint64_t nb10,
+        uint64_t nb11,
+        uint64_t nb12,
+        int32_t  ne0,
+        int32_t  ne1,
+        int64_t  ne0ne1,
+        device   const char * src0,
+        device   const char * src1,
+        threadgroup ushort2 * rowids,
+        device         char * dst,
+        threadgroup    char * shmem,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    threadgroup half  * sa = (threadgroup half  *)(shmem);
+    threadgroup float * sb = (threadgroup float *)(shmem + 4096);
+
+    const int r0 = tgpig.y;
+    const int r1 = tgpig.x;
+
+    if (r1*BLOCK_SIZE_N >= ne1) return;
+
+    // if this block is of 64x32 shape or smaller
+    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
+    // a thread shouldn't load data outside of the matrix
+    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+
+    simdgroup_half8x8  ma[4];
+    simdgroup_float8x8 mb[2];
+    simdgroup_float8x8 mc[8];
+    for (int i = 0; i < 8; i++){
+        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+    short il = (tiitg % THREAD_PER_ROW);
+
+    ushort offset1 = il/nl;
+
+    threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col];
+
+    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1;
+    device const float   * y = (device const float   *)(src1
+        + nb12 * id[1]
+        + nb11 * (id[0] % ne11)
+        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+
+    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
+        // load data and store to threadgroup memory
+        half4x4 temp_a;
+        dequantize_func(x, il, temp_a);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        for (int i = 0; i < 16; i++) {
+            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
+            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
+        }
+
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
+        il = (il + 2 < nl) ? il + 2 : il % 2;
+        x  = (il < 2) ? x + (2+nl-1)/nl : x;
+        y += BLOCK_SIZE_K;
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // load matrices from threadgroup memory and conduct outer products
+        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
+        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
+        #pragma unroll(BLOCK_SIZE_K/8)
+        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+            #pragma unroll(4)
+            for (int i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
+            }
+            simdgroup_barrier(mem_flags::mem_none);
+            #pragma unroll(2)
+            for (int i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
+            }
+
+            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
+            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
+            #pragma unroll(8)
+            for (int i = 0; i < 8; i++){
+                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
+            }
+        }
+    }
+
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup float * temp_str = ((threadgroup float *) shmem) \
+                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
+        for (int i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (sgitg == 0) {
+            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
+                threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j];
+                int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1;
+
+                device float  * D  = (device float  *) dst + (r0*BLOCK_SIZE_M) + joff;
+                device float4 * D4 = (device float4 *) D;
+
+                threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
+                threadgroup float4 * C4 = (threadgroup float4 *) C;
+
+                int i = 0;
+                for (; i < n_rows/4; i++) {
+                    *(D4 + i) = *(C4 + i);
+                }
+
+                i *= 4;
+                for (; i < n_rows; i++) {
+                    *(D + i) = *(C + i);
+                }
+            }
+        }
+    }
+}
+
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+kernel void kernel_mul_mm_id(
+        constant ggml_metal_kargs_mul_mm_id & args,
+        device const char * src0s,
+        device const char * src1,
+        device       char * dst,
+        device const char * ids,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int32_t i02 = tgpig.z;
+
+    tgpig.z = 0;
+
+    device const char * src0 = src0s + i02*args.nb02;
+
+    // row indices
+    threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192);
+
+    // TODO: parallelize this loop
+    int32_t _ne1 = 0;
+    for (ushort ii1 = 0; ii1 < args.nei1; ii1++) {
+        for (ushort ii0 = 0; ii0 < args.nei0; ii0++) {
+            int32_t id = ((device int32_t *) (ids + ii1*args.nbi1))[ii0];
+            if (id == i02) {
+                if (tiitg == 0) {
+                    rowids[_ne1] = ushort2(ii0, ii1);
+                }
+                _ne1++;
+            }
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
+        args.ne00,
+        args.ne02,
+        args.nb01,
+        args.nb02,
+        args.ne11,
+        args.ne12,
+        args.nb10,
+        args.nb11,
+        args.nb12,
+        args.ne0,
+        _ne1,
+        (int64_t)args.ne0*args.ne1,
+        src0,
+        src1,
+        rowids,
+        dst,
+        shmem,
+        tgpig,
+        tiitg,
+        sgitg);
+}
+
+#define QK_NL 16
+
+//
+// get rows
+//
+
+typedef decltype(kernel_get_rows_f<float>) get_rows_f_t;
+
+template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_f_t kernel_get_rows_f<float>;
+template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_f_t kernel_get_rows_f<half>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f<bfloat>;
+#endif
+
+typedef decltype(kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>) get_rows_q_t;
+
+template [[host_name("kernel_get_rows_q4_0")]]    kernel get_rows_q_t kernel_get_rows_q<block_q4_0,    2, dequantize_q4_0>;
+template [[host_name("kernel_get_rows_q4_1")]]    kernel get_rows_q_t kernel_get_rows_q<block_q4_1,    2, dequantize_q4_1>;
+template [[host_name("kernel_get_rows_q5_0")]]    kernel get_rows_q_t kernel_get_rows_q<block_q5_0,    2, dequantize_q5_0>;
+template [[host_name("kernel_get_rows_q5_1")]]    kernel get_rows_q_t kernel_get_rows_q<block_q5_1,    2, dequantize_q5_1>;
+template [[host_name("kernel_get_rows_q8_0")]]    kernel get_rows_q_t kernel_get_rows_q<block_q8_0,    2, dequantize_q8_0>;
+template [[host_name("kernel_get_rows_q2_K")]]    kernel get_rows_q_t kernel_get_rows_q<block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_get_rows_q3_K")]]    kernel get_rows_q_t kernel_get_rows_q<block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_get_rows_q4_K")]]    kernel get_rows_q_t kernel_get_rows_q<block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_get_rows_q5_K")]]    kernel get_rows_q_t kernel_get_rows_q<block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_get_rows_q6_K")]]    kernel get_rows_q_t kernel_get_rows_q<block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_get_rows_iq2_xs")]]  kernel get_rows_q_t kernel_get_rows_q<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_get_rows_iq3_s")]]   kernel get_rows_q_t kernel_get_rows_q<block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_q_t kernel_get_rows_q<block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_q_t kernel_get_rows_q<block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_q_t kernel_get_rows_q<block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_q_t kernel_get_rows_q<block_iq4_nl,  2,     dequantize_iq4_nl>;
+template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_q_t kernel_get_rows_q<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+
+//
+// matrix-matrix multiplication
+//
+
+typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mat_mm_t;
+
+template [[host_name("kernel_mul_mm_f32_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32>;
+template [[host_name("kernel_mul_mm_f16_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mm_bf16_f32")]]    kernel mat_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16>;
+#endif
+template [[host_name("kernel_mul_mm_q4_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_q4_1_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q5_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_q5_1_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_1,    2,     dequantize_q5_1>;
+template [[host_name("kernel_mul_mm_q8_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q8_0,    2,     dequantize_q8_0>;
+template [[host_name("kernel_mul_mm_q2_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_mul_mm_q3_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_q4_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_q5_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_q6_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_iq3_s_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
+template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+
+//
+// indirect matrix-matrix multiplication
+//
+
+typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
+
+template [[host_name("kernel_mul_mm_id_f32_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<float4x4,      1,     dequantize_f32>;
+template [[host_name("kernel_mul_mm_id_f16_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<half4x4,       1,     dequantize_f16>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mm_id_bf16_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<bfloat4x4,     1,     dequantize_bf16>;
+#endif
+template [[host_name("kernel_mul_mm_id_q4_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0,    2,     dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_id_q4_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1,    2,     dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_id_q5_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0,    2,     dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_id_q5_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1,    2,     dequantize_q5_1>;
+template [[host_name("kernel_mul_mm_id_q8_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0,    2,     dequantize_q8_0>;
+template [[host_name("kernel_mul_mm_id_q2_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_mul_mm_id_q3_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_id_q4_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_id_q5_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_id_q6_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_id_iq3_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
+template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+
+//
+// matrix-vector multiplication
+//
+
+typedef void (kernel_mul_mv_impl_t)(
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig,
+        ushort tiisg);
+
+typedef void (kernel_mul_mv2_impl_t)(
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiisg,
+        ushort sgitg);
+
+template<kernel_mul_mv_impl_t impl_fn>
+void mmv_fn(
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiitg,
+        ushort tiisg,
+        ushort sgitg) {
+    impl_fn(args, src0, src1, dst, tgpig, tiisg);
+}
+
+template<kernel_mul_mv2_impl_t impl_fn>
+void mmv_fn(
+        ggml_metal_kargs_mul_mv args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        threadgroup  char * shmem,
+        uint3  tgpig,
+        ushort tiitg,
+        ushort tiisg,
+        ushort sgitg) {
+    impl_fn(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
+}
+
+typedef decltype(mmv_fn<kernel_mul_mv_impl<half, half4, half, half4, ggml_metal_kargs_mul_mv>>) mul_mv_impl_fn_t;
+
+template<mul_mv_impl_fn_t impl_fn>
+kernel void kernel_mul_mv_id(
+        constant ggml_metal_kargs_mul_mv_id & args,
+        device const char * src0s,
+        device const char * src1,
+        device       char * dst,
+        device const char * ids,
+        threadgroup  char * shmem [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    const int iid1 = tgpig.z/args.nei0;
+    const int idx  = tgpig.z%args.nei0;
+
+    tgpig.z = 0;
+
+    const int32_t i02 = ((device const int32_t *) (ids + iid1*args.nbi1))[idx];
+
+    const int64_t i11 = idx % args.ne11;
+    const int64_t i12 = iid1;
+
+    const int64_t i1 = idx;
+    const int64_t i2 = i12;
+
+    device const char * src0_cur = src0s + i02*args.nb02;
+    device const char * src1_cur = src1  + i11*args.nb11 + i12*args.nb12;
+
+    device char * dst_cur = dst + (i1*args.ne0 + i2*args.ne1*args.ne0)*sizeof(float);
+
+    ggml_metal_kargs_mul_mv args0 = {
+        /*.ne00 =*/ args.ne00,
+        /*.ne01 =*/ args.ne01,
+        /*.ne02 =*/ 1, // args.ne02,
+        /*.nb00 =*/ args.nb00,
+        /*.nb01 =*/ args.nb01,
+        /*.nb02 =*/ args.nb02,
+        /*.nb03 =*/ args.nb02, // args.ne02 == 1
+        /*.ne10 =*/ args.ne10,
+        /*.ne11 =*/ 1, // args.ne11,
+        /*.ne12 =*/ 1, // args.ne12,
+        /*.nb10 =*/ args.nb10,
+        /*.nb11 =*/ args.nb11,
+        /*.nb12 =*/ args.nb12,
+        /*.nb13 =*/ args.nb12, // ne12 == 1
+        /*.ne0  =*/ args.ne0,
+        /*.ne1  =*/ 1, // args.ne1,
+        /*.r2   =*/ 1,
+        /*.r3   =*/ 1,
+    };
+
+    impl_fn(
+        args0,
+        /* src0 */ src0_cur,
+        /* src1 */ src1_cur,
+        /* dst  */ dst_cur,
+        shmem,
+        tgpig,
+        tiitg,
+        tiisg,
+        sgitg);
+}
+
+typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
+
+template [[host_name("kernel_mul_mv_id_f32_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
+template [[host_name("kernel_mul_mv_id_f16_f32")]]     kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<half, half4, float, float4>>>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mv_id_bf16_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<bfloat, bfloat4, float, float4>>>;
+#endif
+template [[host_name("kernel_mul_mv_id_q8_0_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q4_0_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q4_1_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q5_0_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q5_1_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q2_K_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q2_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q3_K_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q3_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q4_K_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q4_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q5_K_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q5_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q6_K_f32")]]    kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q6_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq1_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq1_s_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq1_m_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq1_m_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_xxs_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq2_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_xs_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_xxs_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
+
+kernel void kernel_pool_2d_max_f32(
+        device  const float * src0,
+        device        float * dst,
+        constant    int32_t & k0,
+        constant    int32_t & k1,
+        constant    int32_t & s0,
+        constant    int32_t & s1,
+        constant    int32_t & p0,
+        constant    int32_t & p1,
+        constant    int64_t & IH,
+        constant    int64_t & IW,
+        constant    int64_t & OH,
+        constant    int64_t & OW,
+        constant    int64_t & parallel_elements,
+        uint        gid[[thread_position_in_grid]]) {
+
+    if (gid >= parallel_elements) {
+        return;
+    }
+
+    const int idx = gid;
+    const int I_HW = IH * IW;
+    const int O_HW = OH * OW;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / OW;
+    const int cur_ow = idx % O_HW % OW;
+
+    device const float * i_ptr = src0 + nc * I_HW;
+    device       float * o_ptr = dst  + nc * O_HW;
+
+    const int start_h = cur_oh * s1 - p1;
+    const int bh = MAX(0,  start_h);
+    const int eh = MIN(IH, start_h + k1);
+    const int start_w = cur_ow * s0 - p0;
+    const int bw = MAX(0,  start_w);
+    const int ew = MIN(IW, start_w + k0);
+
+    float res = -INFINITY;
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+            res = MAX(res, i_ptr[i * IW + j]);
+        }
+    }
+
+    o_ptr[cur_oh * OW + cur_ow] = res;
+}
+
+kernel void kernel_pool_2d_avg_f32(
+        device  const float * src0,
+        device        float * dst,
+        constant    int32_t & k0,
+        constant    int32_t & k1,
+        constant    int32_t & s0,
+        constant    int32_t & s1,
+        constant    int32_t & p0,
+        constant    int32_t & p1,
+        constant    int64_t & IH,
+        constant    int64_t & IW,
+        constant    int64_t & OH,
+        constant    int64_t & OW,
+        constant    int64_t & parallel_elements,
+        uint        gid[[thread_position_in_grid]]) {
+
+    if (gid >= parallel_elements) {
+        return;
+    }
+
+    const int idx = gid;
+    const int I_HW = IH * IW;
+    const int O_HW = OH * OW;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / OW;
+    const int cur_ow = idx % O_HW % OW;
+
+    device const float * i_ptr = src0 + nc * I_HW;
+    device       float * o_ptr = dst  + nc * O_HW;
+
+    const int start_h = cur_oh * s1 - p1;
+    const int bh = MAX(0,  start_h);
+    const int eh = MIN(IH, start_h + k1);
+    const int start_w = cur_ow * s0 - p0;
+    const int bw = MAX(0,  start_w);
+    const int ew = MIN(IW, start_w + k0);
+    // const float scale = 1. / ((eh - bh) * (ew - bw));
+    const float scale = 1. / (k0 * k1);
+
+    float res = 0;
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+            float cur = i_ptr[i * IW + j];
+            res += cur * scale;
+        }
+    }
+
+    o_ptr[cur_oh * OW + cur_ow] = res;
+}
diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt
new file mode 100644
index 000000000..2f555416e
--- /dev/null
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@@ -0,0 +1,107 @@
+if (NOT EXISTS $ENV{MUSA_PATH})
+    if (NOT EXISTS /opt/musa)
+        set(MUSA_PATH /usr/local/musa)
+    else()
+        set(MUSA_PATH /opt/musa)
+    endif()
+else()
+    set(MUSA_PATH $ENV{MUSA_PATH})
+endif()
+
+set(CMAKE_C_COMPILER "${MUSA_PATH}/bin/clang")
+set(CMAKE_C_EXTENSIONS OFF)
+set(CMAKE_CXX_COMPILER "${MUSA_PATH}/bin/clang++")
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+list(APPEND CMAKE_MODULE_PATH "${MUSA_PATH}/cmake")
+
+find_package(MUSAToolkit)
+
+if (MUSAToolkit_FOUND)
+    message(STATUS "MUSA Toolkit found")
+
+    if (NOT DEFINED MUSA_ARCHITECTURES)
+        set(MUSA_ARCHITECTURES "21;22")
+    endif()
+    message(STATUS "Using MUSA architectures: ${MUSA_ARCHITECTURES}")
+
+    file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
+    list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+
+    file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
+    file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+
+    if (GGML_CUDA_FA_ALL_QUANTS)
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    endif()
+
+    set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
+    foreach(SOURCE ${GGML_SOURCES_MUSA})
+        set(COMPILE_FLAGS "-x musa -mtgpu")
+        foreach(ARCH ${MUSA_ARCHITECTURES})
+            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
+        endforeach()
+        set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
+    endforeach()
+
+    ggml_add_backend_library(ggml-musa
+                             ${GGML_HEADERS_MUSA}
+                             ${GGML_SOURCES_MUSA}
+                            )
+
+    # TODO: do not use CUDA definitions for MUSA
+    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+
+    add_compile_definitions(GGML_USE_MUSA)
+    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+
+    if (GGML_CUDA_GRAPHS)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    if (GGML_CUDA_FORCE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
+
+    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
+    endif()
+
+    if (GGML_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
+    if (GGML_STATIC)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
+    else()
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the musa driver lib (libmusa.so)
+    else()
+        target_link_libraries(ggml-musa PRIVATE MUSA::musa_driver)
+    endif()
+else()
+    message(FATAL_ERROR "MUSA Toolkit not found")
+endif()
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
new file mode 100644
index 000000000..45328a657
--- /dev/null
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -0,0 +1,147 @@
+find_package(OpenCL REQUIRED)
+find_package(Python3 REQUIRED)
+
+set(TARGET_NAME ggml-opencl)
+
+ggml_add_backend_library(${TARGET_NAME}
+                         ggml-opencl.cpp
+                         ../../include/ggml-opencl.h)
+target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
+target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})
+
+if (GGML_OPENCL_PROFILING)
+    message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
+    add_compile_definitions(GGML_OPENCL_PROFILING)
+endif ()
+
+add_compile_definitions(GGML_OPENCL_SOA_Q)
+
+if (GGML_OPENCL_USE_ADRENO_KERNELS)
+    message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
+    add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
+endif ()
+
+if (GGML_OPENCL_EMBED_KERNELS)
+    add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
+
+    set(OPENCL_CL_SOURCE_EMBED         "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
+    set(OPENCL_MM_CL_SOURCE_EMBED      "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
+    set(OPENCL_CVT_CL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
+
+    set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED             "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
+    set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
+    set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED          "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
+    set(OPENCL_TRANSPOSE_16_SOURCE_EMBED               "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
+    set(OPENCL_TRANSPOSE_32_SOURCE_EMBED               "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
+    set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED            "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
+
+    set(EMBED_KERNEL_SCRIPT             "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
+    file(MAKE_DIRECTORY                 "${CMAKE_BINARY_DIR}/autogenerated")
+
+    include_directories("${CMAKE_BINARY_DIR}/autogenerated")
+
+    # Python must be accessible from command line
+    add_custom_command(
+        OUTPUT ${OPENCL_CL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
+            ${OPENCL_CL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
+            ${OPENCL_MM_CL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_mm.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
+            ${OPENCL_CVT_CL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_cvt.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
+            ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
+            ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
+            ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
+            ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_transpose_16.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
+            ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_transpose_32.cl.h"
+    )
+
+    add_custom_command(
+        OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
+        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
+            ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
+        DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
+        COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
+    )
+
+    target_sources(${TARGET_NAME} PRIVATE
+                   ${OPENCL_CL_SOURCE_EMBED}
+                   ${OPENCL_MM_CL_SOURCE_EMBED}
+                   ${OPENCL_CVT_CL_SOURCE_EMBED}
+                   ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
+                   ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
+                   ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
+                   ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
+                   ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
+                   ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
+else ()
+    # copy ggml-opencl.cl to bin directory
+    configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
+
+    configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
+    configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
+endif ()
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
new file mode 100644
index 000000000..ed90e471a
--- /dev/null
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -0,0 +1,4004 @@
+#define CL_TARGET_OPENCL_VERSION 220
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+
+// suppress warnings in CL headers for GCC and Clang
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#ifdef __clang__
+#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
+#endif
+
+#include "ggml-opencl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml.h"
+
+#include <CL/cl.h>
+
+#include <string.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <atomic>
+#include <fstream>
+#include <limits>
+#include <vector>
+#include <string>
+#include <cmath>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define UNUSED(x) (void)(x)
+
+#define CL_CHECK(err)                                               \
+    do {                                                            \
+        cl_int err_ = (err);                                        \
+        if (err_ != CL_SUCCESS) {                                   \
+            GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            GGML_ASSERT(0);                                         \
+        }                                                           \
+    } while (0)
+
+//------------------------------------------------------------------------------
+// OpenCL
+//------------------------------------------------------------------------------
+
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
+
+enum GPU_FAMILY {
+    ADRENO,
+    INTEL,
+    UNKNOWN,
+};
+
+enum ADRENO_GPU_GEN {
+    ADRENO_UNKNOWN,
+    A7X,
+    A8X,
+    X1E,
+};
+
+static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
+    if (strstr(device_name, "730") ||
+        strstr(device_name, "740") ||
+        strstr(device_name, "750")) {
+        return ADRENO_GPU_GEN::A7X;
+    }
+
+    if (strstr(device_name, "830")) {
+        return ADRENO_GPU_GEN::A8X;
+    }
+
+    if (strstr(device_name, "X1")) {
+        return ADRENO_GPU_GEN::X1E;
+    }
+
+    return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
+}
+
+static int get_adreno_cl_compiler_version(const char *driver_version) {
+    std::string driver_ver_str(driver_version);
+    size_t compiler_ver_pos = driver_ver_str.find("E031");
+    size_t compiler_ver_len = 13;
+    size_t compiler_ver_offset = 5;
+
+    if (compiler_ver_pos == std::string::npos) {
+        compiler_ver_pos = driver_ver_str.find("DX");
+        if (compiler_ver_pos == std::string::npos) {
+            return -1;
+        }
+        compiler_ver_len = 11;
+        compiler_ver_offset = 3;
+    }
+
+    std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
+    std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2);
+    return std::atoi(major_ver_str.c_str());
+}
+
+// backend device context
+struct ggml_backend_opencl_device_context {
+    cl_platform_id platform;
+    std::string platform_name;
+
+    cl_device_id device;
+    std::string device_name;
+};
+
+// backend context
+struct ggml_backend_opencl_context {
+    cl_device_id device;
+    std::string device_name;
+
+    std::string driver_version;
+
+    GPU_FAMILY gpu_family;
+    ADRENO_GPU_GEN adreno_gen;
+
+    cl_int alignment;
+    size_t max_alloc_size;
+    bool fp16_support;
+
+    int adreno_wave_size;
+
+    cl_context context;
+    cl_command_queue queue;
+
+    cl_program program;
+    cl_program program_1;
+    cl_program program_2;
+
+    cl_kernel kernel_add, kernel_add_row;
+    cl_kernel kernel_mul, kernel_mul_row;
+    cl_kernel kernel_scale;
+    cl_kernel kernel_silu, kernel_silu_4;
+    cl_kernel kernel_gelu, kernel_gelu_4;
+    cl_kernel kernel_relu;
+    cl_kernel kernel_clamp;
+    cl_kernel kernel_norm;
+    cl_kernel kernel_rms_norm;
+    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
+    cl_kernel kernel_soft_max, kernel_soft_max_4;
+    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
+    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
+    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
+    cl_kernel kernel_mul_mat_f32_f32;
+    cl_kernel kernel_mul_mat_f16_f16;
+    cl_kernel kernel_mul_mat_f16_f32_1row;
+    cl_kernel kernel_mul_mat_f16_f32;
+    cl_kernel kernel_mul_mat_f16_f32_l4;
+    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
+    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat;
+    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
+    cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0,
+              kernel_mul_mat_q4_0_f32_flat_img_v0;
+    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
+    cl_kernel kernel_mul_mv_q6_K_f32;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    // Transpose kernels
+    cl_program program_transpose_32;
+    cl_program program_transpose_32_16;
+    cl_program program_transpose_16;
+    cl_kernel kernel_transpose_32;
+    cl_kernel kernel_transpose_32_16;
+    cl_kernel kernel_transpose_16;
+
+    cl_mem A_s_d_max;            // max scale buffer size for transpose
+    cl_mem A_q_d_max;            // max weight buffer size for transpose
+    cl_mem B_d_max;              // max activation buffer size for transpose
+
+    // Gemm and Gemv related programs, kernels, etc
+    cl_program program_CL_gemm;
+    cl_program program_CL_gemv_general;
+    cl_program program_CL_gemv_4096_1_11008;
+    cl_program program_CL_gemv_4096_1_4096;
+    cl_program program_CL_gemv_11008_1_4096;
+    cl_program program_CL_gemv_32000_1_4096;
+    cl_kernel CL_mul_mat_Ab_Bi_8x4;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
+    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+};
+
+static ggml_backend_device                 g_ggml_backend_opencl_device;
+static ggml_backend_opencl_device_context  g_ggml_ctx_dev_main {
+    /*.platform         =*/ nullptr,
+    /*.platform_nane    =*/ "",
+    /*.device           =*/ nullptr,
+    /*.device_name      =*/ "",
+};
+
+static int ggml_backend_opencl_n_devices = 0;
+
+// Profiling
+#ifdef GGML_OPENCL_PROFILING
+struct ProfilingInfo {
+    std::string op_name;
+    std::string kernel_name;
+    // Kernel execution time in nanoseconds.
+    cl_ulong duration_ns;
+    // Global and local work sizes.
+    size_t global_size[3];
+    size_t local_size[3];
+    // Op output size.
+    size_t output_size[4];
+};
+
+std::vector<ProfilingInfo> g_profiling_info;
+#endif
+
+inline std::string read_file(const std::string &path) {
+  std::ifstream ifs(path);
+  if (!ifs) {
+    return "";
+  }
+  std::string text;
+  ifs.seekg(0, std::ios::end);
+  text.resize(ifs.tellg());
+  ifs.seekg(0, std::ios::beg);
+  ifs.read(&text[0], text.size());
+  return text;
+}
+
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
+    cl_program p;
+    char *program_log;
+    size_t program_size;
+    size_t log_size;
+    int err;
+
+    program_size = strlen(program_buffer);
+
+    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+    if(err < 0) {
+        GGML_LOG_ERROR("OpenCL error creating program");
+        exit(1);
+    }
+
+    err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
+    if(err < 0) {
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+        program_log = (char*) malloc(log_size + 1);
+        program_log[log_size] = '\0';
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+        GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log);
+        free(program_log);
+        exit(1);
+    }
+
+    return p;
+}
+
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+    static bool initialized = false;
+    static ggml_backend_opencl_context *backend_ctx = nullptr;
+
+    if (initialized) {
+        return backend_ctx;
+    }
+
+    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
+    GGML_ASSERT(dev_ctx);
+    GGML_ASSERT(dev_ctx->platform == nullptr);
+    GGML_ASSERT(dev_ctx->device == nullptr);
+    GGML_ASSERT(backend_ctx == nullptr);
+
+    initialized = true;
+    backend_ctx = new ggml_backend_opencl_context();
+    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+
+    cl_int err;
+
+#ifdef GGML_PROFILE_OPENCL
+    GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
+#endif
+
+    struct cl_device;
+    struct cl_platform {
+        cl_platform_id id;
+        unsigned number;
+        char name[128];
+        char vendor[128];
+        struct cl_device * devices;
+        unsigned n_devices;
+        struct cl_device * default_device;
+    };
+
+    struct cl_device {
+        struct cl_platform * platform;
+        cl_device_id id;
+        unsigned number;
+        cl_device_type type;
+        char name[128];
+    };
+
+    enum { NPLAT = 16, NDEV = 16 };
+
+    struct cl_platform platforms[NPLAT];
+    unsigned n_platforms = 0;
+    struct cl_device devices[NDEV];
+    unsigned n_devices = 0;
+    struct cl_device * default_device = NULL;
+
+    cl_platform_id platform_ids[NPLAT];
+    if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
+        GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
+        return backend_ctx;
+    }
+
+    for (unsigned i = 0; i < n_platforms; i++) {
+        struct cl_platform * p = &platforms[i];
+        p->number = i;
+        p->id = platform_ids[i];
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+
+        cl_device_id device_ids[NDEV];
+        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+            p->n_devices = 0;
+        } else {
+            CL_CHECK(clGetDeviceIDsError);
+        }
+        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+        p->default_device = NULL;
+
+        for (unsigned j = 0; j < p->n_devices; j++) {
+            struct cl_device * d = &devices[n_devices];
+            d->number = n_devices++;
+            d->id = device_ids[j];
+            d->platform = p;
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+
+            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+                p->default_device = d;
+            }
+        }
+
+        if (default_device == NULL && p->default_device != NULL) {
+            default_device = p->default_device;
+        }
+    }
+
+    if (n_devices == 0) {
+        GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
+        return backend_ctx;
+    }
+
+    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number = -1;
+    int user_device_number = -1;
+
+    unsigned n;
+    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+        user_platform_number = (int)n;
+    }
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+        user_device_number = (int)n;
+    }
+    if (user_platform_number != -1 && user_device_number != -1) {
+        cl_platform* platform = &platforms[user_platform_number];
+        if ((unsigned)user_device_number >= platform->n_devices) {
+            GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
+            exit(1);
+        }
+        default_device = &platform->devices[user_device_number];
+    } else {
+
+        struct cl_device * selected_devices = devices;
+        unsigned n_selected_devices = n_devices;
+
+        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+            for (unsigned i = 0; i < n_platforms; i++) {
+                struct cl_platform * p = &platforms[i];
+                if (strstr(p->name, user_platform_string) != NULL ||
+                    strstr(p->vendor, user_platform_string) != NULL) {
+                    user_platform_number = (int)i;
+                    break;
+                }
+            }
+            if (user_platform_number == -1) {
+                GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+                exit(1);
+            }
+        }
+        if (user_platform_number != -1) {
+            struct cl_platform * p = &platforms[user_platform_number];
+            selected_devices = p->devices;
+            n_selected_devices = p->n_devices;
+            default_device = p->default_device;
+            if (n_selected_devices == 0) {
+                GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+                exit(1);
+            }
+        }
+
+        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+            for (unsigned i = 0; i < n_selected_devices; i++) {
+                struct cl_device * d = &selected_devices[i];
+                if (strstr(d->name, user_device_string) != NULL) {
+                    user_device_number = d->number;
+                    break;
+                }
+            }
+            if (user_device_number == -1) {
+                GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+                exit(1);
+            }
+        }
+        if (user_device_number != -1) {
+            selected_devices = &devices[user_device_number];
+            n_selected_devices = 1;
+            default_device = &selected_devices[0];
+        }
+
+        GGML_ASSERT(n_selected_devices > 0);
+
+        if (default_device == NULL) {
+            default_device = &selected_devices[0];
+        }
+    }
+
+    GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+    GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
+    if (default_device->type != CL_DEVICE_TYPE_GPU) {
+        GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    }
+
+    dev_ctx->platform = default_device->platform->id;
+    dev_ctx->device = default_device->id;
+    backend_ctx->device = default_device->id;
+
+    if (strstr(default_device->name, "Adreno")) {
+        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
+        backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+
+        // Default wave size is 128, A8x uses 64.
+        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
+            backend_ctx->adreno_wave_size = 64;
+        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
+                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
+            backend_ctx->adreno_wave_size = 128;
+        } else {
+            backend_ctx->adreno_wave_size = 128;
+            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
+                "using wave size %d, "
+                "may not work as expected\n",
+                backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
+        }
+    } else if (strstr(default_device->name, "Intel")) {
+        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
+    } else {
+        GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+        backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+        return backend_ctx;
+    }
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
+        GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
+            "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
+        return backend_ctx;
+    }
+#endif
+
+    // Populate backend device name
+    dev_ctx->platform_name = default_device->platform->name;
+    dev_ctx->device_name = default_device->name;
+    backend_ctx->device_name = default_device->name;
+
+    // A local ref of cl_device_id for convenience
+    cl_device_id device = backend_ctx->device;
+
+    // Check device OpenCL version, OpenCL 2.0 or above is required
+    size_t device_ver_str_size;
+    clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
+    char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
+    clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
+    device_ver_buffer[device_ver_str_size] = '\0';
+    GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
+
+    if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
+        strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+        GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
+        return backend_ctx;
+    }
+
+    // Check driver version
+    size_t driver_version_str_size;
+    clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size);
+    char *driver_version = (char *)alloca(driver_version_str_size + 1);
+    clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
+    driver_version[driver_version_str_size] = '\0';
+    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
+    backend_ctx->driver_version = driver_version;
+
+    int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
+    bool has_vector_subgroup_broadcast =
+        adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17;
+    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
+        has_vector_subgroup_broadcast ? "true" : "false");
+
+    size_t ext_str_size;
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Check if ext_buffer contains cl_khr_fp16
+    backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
+
+    // fp16 is required
+    if (!backend_ctx->fp16_support) {
+        GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
+        return backend_ctx;
+    }
+
+    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
+    // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
+    if (strstr(device_ver_buffer, "OpenCL 3") &&
+        strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
+        GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
+            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
+        return backend_ctx;
+    }
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
+    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
+
+    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
+    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
+
+    // Check SVM.
+    cl_device_svm_capabilities svm_caps;
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
+    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
+        svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
+        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
+        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
+        svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+
+    // Print out configurations
+#ifdef GGML_OPENCL_SOA_Q
+    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
+#endif // GGML_OPENCL_SOA_Q
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    cl_context_properties properties[] = {
+        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
+    };
+
+    CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+
+    // A local ref of cl_context for convenience
+    cl_context context = backend_ctx->context;
+
+    //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+    //    (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
+    //    (queue = clCreateCommandQueue(context, device, 0, &err), err)
+    //)));
+    cl_command_queue_properties command_queue_props = 0;
+#ifdef GGML_OPENCL_PROFILING
+    command_queue_props |= CL_QUEUE_PROFILING_ENABLE;
+#endif
+    CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src {
+        #include "ggml-opencl.cl.h"
+    };
+#else
+    const std::string kernel_src = read_file("ggml-opencl.cl");
+#endif
+
+    std::string compile_opts =
+        "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
+        "-cl-finite-math-only -cl-fast-relaxed-math ";
+    backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
+
+    // Non matmul kernels.
+    CL_CHECK((backend_ctx->kernel_get_rows_f32       = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_get_rows_f16       = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_get_rows_q4_0      = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
+    CL_CHECK((backend_ctx->kernel_add                = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
+    CL_CHECK((backend_ctx->kernel_add_row            = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul                = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_row            = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
+    CL_CHECK((backend_ctx->kernel_scale              = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
+    CL_CHECK((backend_ctx->kernel_silu               = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
+    CL_CHECK((backend_ctx->kernel_silu_4             = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_gelu               = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
+    CL_CHECK((backend_ctx->kernel_gelu_4             = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_relu               = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
+    CL_CHECK((backend_ctx->kernel_clamp              = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
+    CL_CHECK((backend_ctx->kernel_norm               = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
+    CL_CHECK((backend_ctx->kernel_rms_norm           = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
+    CL_CHECK((backend_ctx->kernel_diag_mask_inf      = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
+    CL_CHECK((backend_ctx->kernel_diag_mask_inf_8    = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max           = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_4         = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_norm_f32      = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_norm_f16      = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_neox_f32      = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_neox_f16      = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f16_f16        = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f16_f32        = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f32_f16        = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_cpy_f32_f32        = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
+
+    // Matmul kernels.
+    CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32        = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16        = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row   = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32        = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4     = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32       = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v     = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
+
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat  = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
+    CL_CHECK((backend_ctx->kernel_convert_block_q4_0     = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
+    CL_CHECK((backend_ctx->kernel_restore_block_q4_0     = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
+
+    // Load additional mulmat kernels.
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_1 {
+        #include "ggml-opencl_mm.cl.h"
+    };
+#else
+    const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
+#endif
+    backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
+
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat      = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat     = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32                  = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0         = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
+    CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0     = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
+
+    // Load additional data conversion kernels.
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_2 {
+        #include "ggml-opencl_cvt.cl.h"
+    };
+#else
+    const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
+#endif
+    backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
+
+    CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle     = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
+
+    // Kernels for Adreno
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string transpose_32_src {
+        #include "ggml-opencl_transpose_32.cl.h"
+    };
+#else
+    const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl");
+#endif
+    backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string transpose_32_16_src {
+        #include "ggml-opencl_transpose_32_16.cl.h"
+    };
+#else
+    const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
+#endif
+    backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string transpose_16_src {
+        #include "ggml-opencl_transpose_16.cl.h"
+    };
+#else
+    const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
+#endif
+    backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
+
+    // Gemv general
+    std::string CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_CL_gemv_general {
+        #include "ggml-opencl_gemv_noshuffle_general.cl.h"
+    };
+#else
+    const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
+#endif
+
+    backend_ctx->program_CL_gemv_general = build_program_from_source(
+        context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 2048, 16384
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=2048 "
+        " -DBLOCK_STRIDE_A=16384 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_CL_gemv {
+        #include "ggml-opencl_gemv_noshuffle.cl.h"
+    };
+#else
+    const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
+#endif
+
+    backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
+        context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 2048, 16384
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=2048 "
+        " -DBLOCK_STRIDE_A=16384 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+
+    backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
+        context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 5504, 44032
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=5504 "
+        " -DBLOCK_STRIDE_A=44032 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+
+    backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
+        context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemv 16000, 128000
+    CL_gemv_compile_opts =
+        " -cl-std=CL2.0 "
+        " -cl-mad-enable "
+        " -DLINE_STRIDE_A=16000 "
+        " -DBLOCK_STRIDE_A=128000 "
+        " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+    if (has_vector_subgroup_broadcast) {
+        CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+    }
+
+    backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
+
+    // Gemm
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_CL_gemm {
+        #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
+    };
+#else
+    const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
+#endif
+    backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
+    CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
+
+    // Allocate intermediate buffers and images
+    size_t max_A_q_d_bytes = 311164928;
+    size_t max_A_s_d_bytes = 38895616;
+    size_t max_B_d_bytes = 45088768;
+
+    CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
+    CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
+    CL_CHECK((backend_ctx->B_d_max   = clCreateBuffer(context, 0, max_B_d_bytes,   NULL, &err), err));
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    // For now we support a single devices
+    ggml_backend_opencl_n_devices = 1;
+
+    return backend_ctx;
+}
+
+static void ggml_cl2_free(void) {
+#ifdef GGML_OPENCL_PROFILING
+    FILE * fperf = fopen("cl_profiling.csv", "w");
+    if (!fperf) {
+        GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+        return;
+    }
+
+    float total_kernel_time = 0;
+    fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
+    for (const ProfilingInfo & info : g_profiling_info) {
+        total_kernel_time += info.duration_ns/1.e6f;
+        fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+            info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
+            info.global_size[0], info.global_size[1], info.global_size[2],
+            info.local_size[0], info.local_size[2], info.local_size[2],
+            info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+    }
+    fclose(fperf);
+
+    GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Tensor extra management
+//------------------------------------------------------------------------------
+struct ggml_tensor_extra_cl {
+    // The buffer object that holds the data.
+    cl_mem data_device;
+    // The offset into the buffer object. This is primarily for scratch buffer
+    // and view operation.
+    // NB: this offset no longer includes view offset (view_offs). Whenever this
+    // offset is used, view_offs should be considered.
+    cl_ulong offset;
+    // The actual size of the cl_mem object. This is needed when returning the
+    // block to the pool.
+    size_t actual_size;
+
+    void reset() {
+        data_device = nullptr;
+        offset = 0;
+        actual_size = 0;
+    }
+};
+
+// Additional tensor extra structs for quantized tensors.
+// These tensors are loaded from files and should not be allocated in scratch --
+// they should always be allocated from the pool. Hence, they do not have an
+// `offset`, which indicate their locations in the scratch buffer.
+struct ggml_tensor_extra_cl_q4_0 {
+    // Quantized values.
+    cl_mem q = nullptr;
+    // Quantized values in image1d_buffer_t.
+    cl_mem q_img = nullptr;
+    // Scales.
+    cl_mem d = nullptr;
+    // Scales in image1d_buffer_t.
+    cl_mem d_img = nullptr;
+    // Size of quantized values.
+    size_t size_q = 0;
+    // Size of scales.
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q4_0() {
+        reset();
+    }
+
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
+        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
+        // So, there is no need to release them here.
+        // TODO: initialize them for non SMALL_PATH path, or remove them.
+        q_img = nullptr;
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
+//------------------------------------------------------------------------------
+// Backend API
+//------------------------------------------------------------------------------
+
+//
+// backend
+//
+static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
+    return "OpenCL";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_opencl_free(ggml_backend_t backend) {
+    ggml_cl2_free();
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_UNUSED(backend);
+    GGML_UNUSED(tensor);
+    GGML_UNUSED(data);
+    GGML_UNUSED(offset);
+    GGML_UNUSED(size);
+}
+
+static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_UNUSED(backend);
+    GGML_UNUSED(tensor);
+    GGML_UNUSED(data);
+    GGML_UNUSED(offset);
+    GGML_UNUSED(size);
+}
+
+static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_UNUSED(backend);
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    return false;
+}
+
+static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+}
+
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+
+        bool ok = ggml_cl_compute_forward(backend, node);
+        if (!ok) {
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    GGML_UNUSED(dev);
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+            return true;
+        case GGML_OP_GET_ROWS:
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                case GGML_TYPE_Q4_0:
+#ifdef GGML_OPENCL_SOA_Q
+                    // We do not support flattened Q4_0 (and possibly other Q's)
+                    return false;
+#else // GGML_OPENCL_SOA_Q
+                    return true;
+#endif // GGML_OPENCL_SOA_Q
+                default:
+                    return false;
+            }
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    switch (op->type) {
+                        case GGML_TYPE_F16:
+                        case GGML_TYPE_F32:
+                            return true;
+                        default:
+                            return false;
+                    }
+                case GGML_TYPE_F16:
+                    switch (op->type) {
+                        case GGML_TYPE_F16:
+                        case GGML_TYPE_F32:
+                            return true;
+                        default:
+                            return false;
+                    }
+                default:
+                    return false;
+            }
+        case GGML_OP_ADD:
+        case GGML_OP_SCALE:
+        case GGML_OP_MUL:
+            return true;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                   return ggml_is_contiguous(op->src[0]);
+                default:
+                    return false;
+            }
+        case GGML_OP_CLAMP:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+            return true;
+        case GGML_OP_MUL_MAT:
+            if (op->src[0]->type == GGML_TYPE_F16) {
+                return true;
+            } else if (op->src[0]->type == GGML_TYPE_F32) {
+                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+            } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
+                       op->src[0]->type == GGML_TYPE_Q6_K) {
+                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+            }
+            return false;
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+        case GGML_OP_DIAG_MASK_INF:
+            return op->ne[3] == 1;
+        case GGML_OP_ROPE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Forward declaration - implementation appears later in the file.
+static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type);
+
+static ggml_guid_t ggml_backend_opencl_guid() {
+    static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe };
+    return &guid;
+}
+
+static ggml_backend_i ggml_backend_opencl_i = {
+    /* .get_name                = */ ggml_backend_opencl_name,
+    /* .free                    = */ ggml_backend_opencl_free,
+    /* .set_tensor_async        = */ NULL,  /* ggml_backend_opencl_set_tensor_async */
+    /* .get_tensor_async        = */ NULL,  /* ggml_backend_opencl_get_tensor_async */
+    /* .cpy_tensor_async        = */ NULL,  /* ggml_backend_opencl_cpy_tensor_async */
+    /* .synchronize             = */ NULL,  /* ggml_backend_opencl_synchronize */
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+ggml_backend_t ggml_backend_opencl_init(void) {
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
+
+    ggml_backend_t backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_opencl_guid(),
+        /* .interface = */ ggml_backend_opencl_i,
+        /* .device    = */ dev,
+        /* .context   = */ backend_ctx
+    };
+
+    return backend;
+}
+
+bool ggml_backend_is_opencl(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_opencl_name;
+}
+
+//
+// buffer
+//
+struct ggml_backend_opencl_buffer_context {
+    // A buffer context can hold multiple cl_mem objects. This is for flattening
+    // quantized weights and should be used with GGML_OPENCL_SMALL_ALLOC where
+    // each tensor is allocated a separate buffer. When flattening is enabled
+    // with small allocation, each tensor is backed by two cl_mem objects (for
+    // quants and scales) packed into a backend_opencl_buffer.
+    ggml_backend_opencl_buffer_context(cl_mem buf)
+        : name("OpenCL") {
+        buffer.push_back(buf);
+    }
+
+    ~ggml_backend_opencl_buffer_context() {
+        for (cl_mem buf : buffer) {
+            CL_CHECK(clReleaseMemObject(buf));
+        }
+        for (cl_mem im : img) {
+            CL_CHECK(clReleaseMemObject(im));
+        }
+
+        // Delete all extras to trigger their destructors
+        for (ggml_tensor_extra_cl * e : temp_tensor_extras) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
+            delete e;
+        }
+    }
+
+    ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
+        ggml_tensor_extra_cl * extra;
+        if (temp_tensor_extras.empty()) {
+            extra = new ggml_tensor_extra_cl();
+        } else {
+            extra = temp_tensor_extras.back();
+            temp_tensor_extras.pop_back();
+        }
+
+        temp_tensor_extras_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
+    ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
+        ggml_tensor_extra_cl_q4_0 * extra;
+        if (temp_tensor_extras_q4_0.empty()) {
+            extra = new ggml_tensor_extra_cl_q4_0();
+        } else {
+            extra = temp_tensor_extras_q4_0.back();
+            temp_tensor_extras_q4_0.pop_back();
+        }
+
+        temp_tensor_extras_q4_0_in_use.push_back(extra);
+
+        extra->reset();
+        return extra;
+    }
+
+    void reset() {
+        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
+            temp_tensor_extras.push_back(e);
+        }
+        temp_tensor_extras_in_use.clear();
+
+        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
+            temp_tensor_extras_q4_0.push_back(e);
+        }
+        temp_tensor_extras_q4_0_in_use.clear();
+    }
+
+    // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
+    // being used are in `temp_tensor_extras_in_use`. At the first run, new
+    // extras get created and put in `in_use`. When the buffer is reset via
+    // the `reset` callback, all extras in `in_use` get moved to available extras
+    // for reuse.
+    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
+    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
+    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
+    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
+
+    // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
+    // before any tensor is initialized (at the beginning of alloc_tensor_range).
+    // Hence, there is alway a buffer object in this vector. When each tensor is
+    // being initialized, this original buffer object will be released if both
+    // flattening and small allocation are enabled, and additional buffer
+    // objects will be created in init_tensor to represent flattened quantized
+    // weights.
+    std::vector<cl_mem> buffer;
+    // These are image1d_buffer_t objects that wrap around the quants and scales.
+    // For Q4_0 quantization, there should be two of them - one for quants and
+    // one for scales. They should be populated only when flattening and small
+    // allocation are enabled.
+    std::vector<cl_mem> img;
+    std::string name;
+};
+
+static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+
+static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    delete ctx;
+}
+
+static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return cl_ptr_base;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+
+    ggml_cl2_init(buffer->buft->device);
+
+    if (tensor->view_src != nullptr) {
+        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+
+        ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra;
+        GGML_ASSERT(view_extra && "view_extra is nullptr?");
+
+        // Reuse extra of the parent tensor. The offset of this view tensor
+        // becomes `extra->offset + view_offs` and needs to be calculated when
+        // it is used. This changes is needed because of the change to
+        // ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
+        // `buffer` passed in here will always be `tensor->buffer`. It is OK
+        // to allocate extras from the same buffer context for ordinary
+        // intermediate tensors. But for views into kv cache tensors, doing so
+        // would mess up the extras used by kv cache.
+        // Before #7640, `buffer` is for intermediate tensors, which is always
+        // different from that of kv cache tensors.
+        //
+        // NB: now extra->offset no longer accounts for view_offs.
+        // NB: this should not apply to weight tensors (for end-to-end runs, but
+        //     may apply for test-backend-ops).
+        // FIXME: if any unexpected results are seen, double check the offset -
+        // there could be other places that need fix.
+        tensor->extra = view_extra;
+    } else {
+        {
+            size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
+
+            ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
+            extra->offset = offset;
+            extra->data_device = ctx->buffer[0];
+            extra->actual_size = ggml_nbytes(tensor);
+
+            tensor->extra = extra;
+        }
+    }
+}
+
+// The optimized gemm and gemv kernels are used for large matrices without batch.
+// tensor is the quantized weights matrix.
+inline bool use_adreno_kernels(const ggml_tensor *tensor) {
+    return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 &&
+            tensor->ne[2] == 1 && tensor->ne[3] == 1;
+}
+
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+
+    cl_context context = backend_ctx->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+#ifdef GGML_OPENCL_SOA_Q
+    // We separate the quantized bits and scale from block_q4_0 by using an
+    // additional kernel, where each thread handles a block. We first read the
+    // original weights into a temporary buffer, then create two separate
+    // buffers for quantized bits and scales, which are then populated by the
+    // conversion kernel.
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        // Tensors should have been preallocated, therefore they should
+        // already have ggml_tensor_extra_cl as extra.
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0();
+
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // We consider the specified offset arg as always, although For weights
+        // the offset arg should be 0 (we do not assert this).
+        //GGML_ASSERT(offset == 0);
+
+        // We create subbuffers from the original tensor buffer for scales and
+        // quants - i.e., scales and quants are aliases into the buffer obejct
+        // that backs the original tensor. This is a cleaner way to adapt to the
+        // new memory management.
+        // In the old code, we allocate new buffers for scales and quants
+        // respectively, which could still be done but would result in double
+        // allocation; properly deallocating the preallocated buffer that backs
+        // the tensors is tricky and would leak the backend specific information
+        // into the general backend code.
+        // Does this create misaligned subbuffers (alignment is 1024) in certain
+        // cases ?
+        cl_buffer_region region;
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        // Create subbuffer for scales.
+        region.origin = extra_orig->offset + tensor->view_offs + offset;
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        // Create subbuffer for quants.
+        region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        //cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+
+        // The optimized kernels need weights in natural order, so unshuffle.
+        if (use_adreno_kernels(tensor)) {
+            kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
+        }
+    #else
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
+    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        tensor->extra = extra;
+
+        // transpose the weights and scales
+    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        // Only do transpose for large, non batched matrix
+        // TODO: use preallocated images instead of sub-buffer then image
+        if (use_adreno_kernels(tensor)) {
+        // <----------------------------------------------------------------------------------> //
+        // start transpose
+        // <----------------------------------------------------------------------------------> //
+        int M = tensor->ne[1];   // ne01
+        int K = tensor->ne[0];   // ne00
+
+        // transpose is out of place, so we need to allocate transposed buffers
+        // <----------------------------------------------------------------------------------> //
+        // use sub_buffer of max buffer size instead
+
+        size_t q_size_bytes = K * M / 8 * sizeof(float);
+        cl_buffer_region region;
+        region.origin = 0;
+        region.size = q_size_bytes;
+        cl_mem qT_d = clCreateSubBuffer(
+            backend_ctx->A_q_d_max,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &err);
+        // cl_mem qT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, q_size_bytes, NULL, &err);
+        CL_CHECK(err);
+
+        // size_t d_size_bytes = M * (K / 32) / 2 * sizeof(float);
+        size_t d_size_bytes = M * (K / 32) * 2;
+        region.origin = 0;
+        region.size = d_size_bytes;
+        cl_mem dT_d = clCreateSubBuffer(
+            backend_ctx->A_s_d_max,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &err);
+        // cl_mem dT_d = clCreateBuffer(context, CL_MEM_READ_WRITE, d_size_bytes, NULL, &err);
+        CL_CHECK(err);
+
+        // <----------------------------------------------------------------------------------> //
+
+
+        // create images from the buffers
+        // <----------------------------------------------------------------------------------> //
+        cl_mem q_d_image1D;
+        cl_mem d_d_image1D;
+        cl_mem qT_d_image1D;
+        cl_mem dT_d_image1D;
+
+        cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        cl_image_desc img_desc_1d;
+
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 8 / 4;
+        img_desc_1d.buffer = extra->q;
+        q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+
+        img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 8 / 4;
+        img_desc_1d.buffer = qT_d;
+        qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+
+        img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 32 / 4 / 2;
+        img_desc_1d.buffer = extra->d;
+        d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+
+        img_fmt_1d = { CL_RGBA, CL_FLOAT };
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 32 / 4 / 2;
+        img_desc_1d.buffer = dT_d;
+        dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+        CL_CHECK(err);
+        // <----------------------------------------------------------------------------------> //
+
+        // set up and call the transpose kernels
+        // <----------------------------------------------------------------------------------> //
+        // weights
+        int height_q = M / 8;
+        int width_q = K / 8 / 4;
+        kernel = backend_ctx->kernel_transpose_16;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_q));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_q));
+
+        size_t local_size_q[3] = {4, 16, 1};
+        size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+
+        // scales
+        int height_s = M / 8;
+        int width_s = K / 32 / 8;
+
+        kernel = backend_ctx->kernel_transpose_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
+
+        size_t local_size_s[3] = {4, 16, 1};
+        size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        // <----------------------------------------------------------------------------------> //
+
+        // copy transposed buffer contents to original buffers
+        // <----------------------------------------------------------------------------------> //
+        // weights
+        CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+
+        // scales
+        CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        // <----------------------------------------------------------------------------------> //
+
+        // deallocate transpose buffers
+        // <----------------------------------------------------------------------------------> //
+        CL_CHECK(clReleaseMemObject(qT_d));
+        CL_CHECK(clReleaseMemObject(dT_d));
+
+        // deallocate temporary images
+        CL_CHECK(clReleaseMemObject(q_d_image1D));
+        CL_CHECK(clReleaseMemObject(d_d_image1D));
+        CL_CHECK(clReleaseMemObject(qT_d_image1D));
+        CL_CHECK(clReleaseMemObject(dT_d_image1D));
+        // <----------------------------------------------------------------------------------> //
+        // end transpose
+        // <----------------------------------------------------------------------------------> //
+        }
+    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+        return;
+    }
+#endif // GGML_OPENCL_SOA_Q
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+    GGML_ASSERT(extra);
+
+    CL_CHECK(clEnqueueWriteBuffer(
+        queue, extra->data_device, CL_TRUE, extra->offset + offset,
+        size, data, 0, NULL, NULL));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor->extra);
+
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
+
+    cl_context context = backend_ctx->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    // Make sure all previously submitted commands are finished.
+    CL_CHECK(clFinish(queue));
+
+#ifdef GGML_OPENCL_SOA_Q
+    // In end-to-end runs, get_tensor is usually used to get back the logits,
+    // where we can simply do clEnqueueReadBuffer since they are f32.
+    // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
+    // which requires reading back quantized weight tensors.
+    // To properly support this, we need to restore block_q4_0 struct arrays
+    // from the flattened buffers.
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
+
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+            global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        CL_CHECK(clEnqueueReadBuffer(
+            queue, data_device, CL_TRUE, offset,
+            size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
+#endif // GGML_OPENCL_SOA_Q
+
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+
+    CL_CHECK(clEnqueueReadBuffer(
+        queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
+        size, data, 0, NULL, NULL));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_dev_t dev = buffer->buft->device;
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    for (cl_mem buf : ctx->buffer) {
+        CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
+    }
+    CL_CHECK(clFinish(queue));
+}
+
+static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    ctx->reset();
+}
+
+static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_opencl_buffer_clear,
+    /* .reset           = */ ggml_backend_opencl_buffer_reset,
+};
+
+//
+// buffer type
+//
+
+static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) {
+    return "OpenCL";
+
+    GGML_UNUSED(buffer_type);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
+    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
+
+    // clCreateBuffer returns -61 for size 0
+    size = std::max(size, (size_t)1);
+
+    cl_int err;
+    cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
+    if (err != CL_SUCCESS) {
+        GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
+        return nullptr;
+    }
+
+    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem);
+
+    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+    // FIXME: not thread safe, device may not be initialized yet
+    static cl_uint alignment = -1;
+    if (alignment == (cl_uint)-1) {
+        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+        alignment = backend_ctx->alignment;
+    }
+    return alignment;
+}
+
+static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+    static size_t max_size = -1;
+    if (max_size == (size_t)-1) {
+        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+        max_size = backend_ctx->max_alloc_size;
+    }
+    return max_size;
+}
+
+static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_opencl(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_opencl_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ NULL,
+    /* .is_host          = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
+    static ggml_backend_buffer_type buffer_type = {
+        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
+        /* .device  = */ &g_ggml_backend_opencl_device,
+        /* .context = */ nullptr,
+    };
+
+    return &buffer_type;
+}
+
+//
+// backend device
+//
+
+static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) {
+    return "GPUOpenCL";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    return dev_ctx->device_name.c_str();
+}
+
+static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    *free = 1;
+    *total = 1;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_opencl_device_get_name(dev);
+    props->description = ggml_backend_opencl_device_get_description(dev);
+    props->type        = ggml_backend_opencl_device_get_type(dev);
+    ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = ggml_backend_dev_caps {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
+
+    ggml_backend_t backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_opencl_guid(),
+        /* .interface = */ ggml_backend_opencl_i,
+        /* .device    = */ dev,
+        /* .context   = */ backend_ctx,
+    };
+
+    return backend;
+
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_opencl_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(ptr);
+    GGML_UNUSED(size);
+    GGML_UNUSED(max_tensor_size);
+    return nullptr;
+}
+
+static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    return ggml_opencl_supports_op(dev, op);
+}
+
+static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
+
+    GGML_UNUSED(dev);
+}
+
+static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+    /* .get_name             = */ ggml_backend_opencl_device_get_name,
+    /* .get_description      = */ ggml_backend_opencl_device_get_description,
+    /* .get_memory           = */ ggml_backend_opencl_device_get_memory,
+    /* .get_type             = */ ggml_backend_opencl_device_get_type,
+    /* .get_props            = */ ggml_backend_opencl_device_get_props,
+    /* .init_backend         = */ ggml_backend_opencl_device_init,
+    /* .get_buffer_type      = */ ggml_backend_opencl_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_opencl_device_buffer_from_ptr,
+    /* .supports_op          = */ ggml_backend_opencl_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_opencl_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// Backend registry
+
+static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
+    return "OpenCL";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
+    return ggml_backend_opencl_n_devices;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    return &g_ggml_backend_opencl_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
+    /* .get_name         = */ ggml_backend_opencl_reg_get_name,
+    /* .device_count     = */ ggml_backend_opencl_reg_device_count,
+    /* .device_get       = */ ggml_backend_opencl_reg_device_get,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_opencl_reg(void) {
+    // TODO: make this thread-safe somehow?
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    if (!initialized) {
+        reg = ggml_backend_reg {
+            /* .api_version = */ GGML_BACKEND_API_VERSION,
+            /* .iface   = */ ggml_backend_opencl_reg_i,
+            /* .context = */ NULL,
+        };
+
+        g_ggml_backend_opencl_device = ggml_backend_device {
+            /* .iface   = */ ggml_backend_opencl_device_i,
+            /* .reg     = */ &reg,
+            /* .context = */ &g_ggml_ctx_dev_main,
+        };
+
+        ggml_cl2_init(&g_ggml_backend_opencl_device);
+
+        initialized = true;
+    }
+
+    return &reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg)
+
+//------------------------------------------------------------------------------
+// Debugging utils
+//------------------------------------------------------------------------------
+#if 0
+#define QK4_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
+    "wrong q4_0 block size/padding");
+
+#include <math.h>
+#ifdef __cplusplus
+#include "half.hpp"
+#endif
+
+static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
+    void * buf = malloc(ggml_nbytes(tensor));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+#ifdef GGML_OPENCL_SOA_Q
+    void * buf_q;
+    void * buf_d;
+#endif
+
+#ifdef GGML_USE_OPENCL
+    // Make sure everything is done.
+    CL_CHECK(clFinish(queue));
+
+#ifdef GGML_OPENCL_SOA_Q
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra;
+        GGML_ASSERT(extra);
+
+        size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2;
+        size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t);
+        GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor));
+        buf_q = malloc(size_q);
+        buf_d = malloc(size_d);
+
+        CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
+        CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
+        CL_CHECK(clFinish(queue));
+    } else {
+        // Read out the tensor from GPU memory.
+        ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+        GGML_ASSERT(extra);
+
+        CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
+        extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
+        CL_CHECK(clFinish(queue));
+    }
+#else
+    // Read out the tensor from GPU memory.
+    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
+    GGML_ASSERT(extra);
+
+    CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
+        extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+#endif // GGML_OPENCL_SOA_Q
+#endif // GGML_USE_OPENCL
+
+    // Open file and dump.
+    char fname[512];
+    sprintf(fname, "./tensor-dumps/%s.txt", tensor->name);
+    FILE * f = fopen(fname, "w");
+    if (!f) {
+        printf("Failed to open %s\n", fname);
+        return;
+    }
+
+    if (tensor->type == GGML_TYPE_F32) {
+        float * data = (float *) buf;
+        for (int i = 0; i < ggml_nelements(tensor); ++i) {
+            if (isnan(data[i])) {
+                printf("NaN found: %s\n", tensor->name);
+                break;
+            }
+            fprintf(f, "%f\n", data[i]);
+        }
+    } else if (tensor->type == GGML_TYPE_I32) {
+        int * data = (int *) buf;
+        for (int i = 0; i < ggml_nelements(tensor); ++i) {
+            if (isnan(data[i])) {
+                printf("NaN found: %s\n", tensor->name);
+                break;
+            }
+            fprintf(f, "%d\n", data[i]);
+        }
+    } else if (tensor->type == GGML_TYPE_F16) {
+#ifdef __cplusplus
+        half_float::half * data = (half_float::half *) buf;
+        for (int i = 0; i < ggml_nelements(tensor); ++i) {
+            if (std::isnan(data[i])) {
+                printf("NaN found: %s\n", tensor->name);
+                break;
+            }
+            fprintf(f, "%f\n", float(data[i]));
+        }
+#endif
+    } else if (tensor->type == GGML_TYPE_Q4_0) {
+#ifdef GGML_OPENCL_SOA_Q
+        ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d;
+        unsigned char * data_q = (unsigned char *)buf_q;
+
+        for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
+            fprintf(f, "%04x, ", data_d[i]);
+            for (int k = 0; k < QK4_0/2; ++k) {
+                fprintf(f, "%02x, ", data_q[k]);
+            }
+            fprintf(f, "\n");
+            data_q += QK4_0/2;
+        }
+        free(buf_d);
+        free(buf_q);
+#else
+        block_q4_0 * data = (block_q4_0 *) buf;
+        for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
+            fprintf(f, "%04x, ", data[i].d);
+            for (int k = 0; k < QK4_0/2; ++k) {
+                fprintf(f, "%02x, ", data[i].qs[k]);
+            }
+            fprintf(f, "\n");
+        }
+#endif // GGML_OPENCL_SOA_Q
+    }
+    free(buf);
+    fflush(f);
+    fclose(f);
+}
+#else
+#define dump_tensor(tensor)
+#endif
+
+//------------------------------------------------------------------------------
+// Profiling utility
+//------------------------------------------------------------------------------
+#ifdef GGML_OPENCL_PROFILING
+void populateProfilingInfo(
+        ProfilingInfo& info, cl_event evt, cl_kernel kernel,
+        size_t global_size[3], size_t local_size[3],
+        const ggml_tensor * tensor) {
+    cl_ulong start;
+    cl_ulong end;
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clGetEventProfilingInfo(
+        evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
+    CL_CHECK(clGetEventProfilingInfo(
+        evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
+
+    char kernel_name[512];
+    CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
+        sizeof(kernel_name), kernel_name, NULL));
+
+    info.duration_ns = end - start;
+    info.op_name = tensor->name;
+    info.kernel_name = kernel_name;
+    info.local_size[0]  = local_size[0];
+    info.local_size[1]  = local_size[1];
+    info.local_size[2]  = local_size[2];
+    info.global_size[0] = global_size[0];
+    info.global_size[1] = global_size[1];
+    info.global_size[2] = global_size[2];
+    info.output_size[0] = tensor->ne[0];
+    info.output_size[1] = tensor->ne[1];
+    info.output_size[2] = tensor->ne[2];
+    info.output_size[3] = tensor->ne[3];
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Ops
+//------------------------------------------------------------------------------
+
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
+}
+
+static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    UNUSED(backend);
+    UNUSED(src0);
+    UNUSED(src1);
+    UNUSED(dst);
+}
+
+static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int      ne00 = src0 ? src0->ne[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const int      ne10 = src1 ? src1->ne[0] : 0;
+    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const int      ne11 = src1 ? src1->ne[1] : 0;
+    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const cl_ulong nb1  = dst  ?  dst->nb[1] : 0;
+    const cl_ulong nb2  = dst  ?  dst->nb[2] : 0;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            kernel = backend_ctx->kernel_get_rows_f32;
+            break;
+        case GGML_TYPE_F16:
+            kernel = backend_ctx->kernel_get_rows_f16;
+            break;
+        case GGML_TYPE_Q4_0:
+            kernel = backend_ctx->kernel_get_rows_q4_0;
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
+
+    size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
+    size_t local_work_size[] = {1, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int  ne00 = src0 ? src0->ne[0] : 0;
+    const int  ne01 = src0 ? src0->ne[1] : 0;
+    const int  ne02 = src0 ? src0->ne[2] : 0;
+    const int  ne03 = src0 ? src0->ne[3] : 0;
+
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+    const int  ne10 = src1 ? src1->ne[0] : 0;
+    const int  ne11 = src1 ? src1->ne[1] : 0;
+    const int  ne12 = src1 ? src1->ne[2] : 0;
+    const int  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+    const int  ne0  = dst ? dst->ne[0] : 0;
+    const int  ne1  = dst ? dst->ne[1] : 0;
+    const int  ne2  = dst ? dst->ne[2] : 0;
+    const int  ne3  = dst ? dst->ne[3] : 0;
+
+    const cl_ulong nb0  = dst ? dst->nb[0] : 0;
+    const cl_ulong nb1  = dst ? dst->nb[1] : 0;
+    const cl_ulong nb2  = dst ? dst->nb[2] : 0;
+    const cl_ulong nb3  = dst ? dst->nb[3] : 0;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_add_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
+    } else {
+        kernel = backend_ctx->kernel_add;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &ne2));
+        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &ne3));
+        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0 ? src0->ne[0] : 0;
+    const int ne01 = src0 ? src0->ne[1] : 0;
+    const int ne02 = src0 ? src0->ne[2] : 0;
+    const int ne03 = src0 ? src0->ne[3] : 0;
+
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+    const int ne10 = src1 ? src1->ne[0] : 0;
+    const int ne11 = src1 ? src1->ne[1] : 0;
+    const int ne12 = src1 ? src1->ne[2] : 0;
+    const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+    const int ne0  = dst ? dst->ne[0] : 0;
+    const int ne1  = dst ? dst->ne[1] : 0;
+    const int ne2  = dst ? dst->ne[2] : 0;
+    const int ne3  = dst ? dst->ne[3] : 0;
+
+    const cl_ulong nb0  = dst ? dst->nb[0] : 0;
+    const cl_ulong nb1  = dst ? dst->nb[1] : 0;
+    const cl_ulong nb2  = dst ? dst->nb[2] : 0;
+    const cl_ulong nb3  = dst ? dst->nb[3] : 0;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_mul_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
+    } else {
+        kernel = backend_ctx->kernel_mul;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &ne2));
+        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &ne3));
+        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_gelu_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_gelu;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+#endif
+}
+
+static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_silu_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_silu;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel = backend_ctx->kernel_relu;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float min;
+    float max;
+    memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
+    memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
+
+    cl_kernel kernel = backend_ctx->kernel_clamp;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &min));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float),    &max));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    const int ne00 = src0 ? src0->ne[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+    const int nth = MIN(64, ne00);
+
+    cl_kernel kernel = backend_ctx->kernel_norm;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),     &eps));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
+
+    const int64_t nrows = ggml_nrows(src0);
+
+    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_backend_opencl_device_context * dev_ctx =
+        (ggml_backend_opencl_device_context *)backend->device->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    const int ne00 = src0 ? src0->ne[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+
+    GGML_ASSERT(ne00 % 4 == 0);
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+    const int nth = MIN(64, ne00);
+
+    const int64_t nrows = ggml_nrows(src0);
+
+    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    cl_kernel kernel = backend_ctx->kernel_rms_norm;
+
+    // Note, this kernel declares local memory in kernel args and the size
+    // depends on subgroup size.
+    // Retrieve subgroup size.
+    // Note, this requires OpenCL 2.1 and above
+    size_t sgs;
+    CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
+        CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+        sizeof(local_work_size), local_work_size,
+        sizeof(size_t), &sgs, NULL));
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),     &eps));
+    // This is local memory - the size depends on subgroup size.
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs,  NULL));
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+#ifdef GGML_OPENCL_SOA_Q
+    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+#endif
+
+    const int  ne00 = src0 ? src0->ne[0] : 0;
+    const int  ne01 = src0 ? src0->ne[1] : 0;
+    const int  ne02 = src0 ? src0->ne[2] : 0;
+    const int  ne03 = src0 ? src0->ne[3] : 0;
+
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+    const int  ne10 = src1 ? src1->ne[0] : 0;
+    const int  ne11 = src1 ? src1->ne[1] : 0;
+    const int  ne12 = src1 ? src1->ne[2] : 0;
+    const int  ne13 = src1 ? src1->ne[3] : 0;
+
+    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+
+    const int  ne0 = dst ? dst->ne[0] : 0;
+    const int  ne1 = dst ? dst->ne[1] : 0;
+
+    int r2 = ne12/ne02;
+    int r3 = ne13/ne03;
+
+    GGML_ASSERT(ne00 == ne10);
+
+    int nth0 = 32;
+    int nth1 = 1;
+    int nrows = 1;
+    // The number of values produced by each subgroup
+    int ndst = 4;
+
+    cl_kernel kernel;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    cl_context context = backend_ctx->context;
+
+    if (ne01 && ne1 && use_adreno_kernels(src0)) {
+
+    // init CL objects
+    // <--------------------------------------------> //
+    cl_int              status;
+    cl_image_format     img_fmt_1d;
+    cl_image_desc       img_desc_1d;
+    cl_buffer_region    region;
+    cl_mem              A_image1d = nullptr;
+    cl_mem              B_image1d = nullptr;
+    cl_mem              B_sub_buffer = nullptr;
+    cl_mem              C_d = nullptr;
+    // for B transpose
+    cl_mem B_d = nullptr;
+    cl_mem B_d_input_image = nullptr;
+    // <--------------------------------------------> //
+
+    // define matrix dimensions
+    // <--------------------------------------------> //
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+    int padding;
+    // <--------------------------------------------> //
+
+    // q4_0 x fp32
+    if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
+        // TODO: remove duplicate definitions of image description + format -- move to top
+
+        // create an image for A
+        // <--------------------------------------------> //
+        if (N == 1) {
+            img_fmt_1d = { CL_R, CL_UNSIGNED_INT32};
+        } else {
+            img_fmt_1d = { CL_R, CL_FLOAT};
+        }
+        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc_1d.image_width = M * K / 2 / 4;    // Divide by 4 for char -> float
+        img_desc_1d.buffer = extra0_q4_0->q;
+        A_image1d = clCreateImage(
+            context,
+            CL_MEM_READ_ONLY,
+            &img_fmt_1d,
+            &img_desc_1d,
+            NULL,
+            &status);
+        CL_CHECK(status);
+        // <--------------------------------------------> //
+
+
+        // create a sub_buffer for B
+        // <--------------------------------------------> //
+        region.origin = (extra1->offset);
+        region.size = K * N * sizeof(float);
+        B_sub_buffer = clCreateSubBuffer(
+            extra1->data_device,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &status);
+        CL_CHECK(status);
+        // <--------------------------------------------> //
+
+        // transpose activation for Skyler's gemm
+        if (N != 1) {
+            //how many extra elements beyond multiple of 8
+            int extra_elements = N % 8;
+
+            //how much padding to add
+            padding = 0;
+            if (extra_elements > 0){
+                padding = 8 - extra_elements;
+            }
+
+            // Specify the starting offset (in bytes)
+            region.origin = 0;
+            // Specify the size of the sub-buffer (divide by 2 for FP16)
+            region.size = K * (N + padding) * sizeof(float)/2;
+            B_d = clCreateSubBuffer(
+                backend_ctx->B_d_max,
+                0,
+                CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+                &status);
+            CL_CHECK(status);
+
+            cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT };
+            cl_image_desc image_desc_B_d_input = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(K * N / 4),
+                0, 0, 0, 0, 0, 0, 0, { B_sub_buffer }
+            };
+            B_d_input_image = clCreateImage(
+                context,
+                0,
+                &image_format_B_d_input,
+                &image_desc_B_d_input,
+                NULL,
+                &status);
+            CL_CHECK(status);
+
+            cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
+            cl_image_desc image_desc_B_d_output = {
+                CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                static_cast<size_t>(K * (N + padding)/4),
+                0, 0, 0, 0, 0, 0, 0, { B_d }
+            };
+            B_image1d = clCreateImage(
+                context,
+                0,
+                &image_format_B_d_output,
+                &image_desc_B_d_output,
+                NULL,
+                &status);
+            CL_CHECK(status);
+
+            int height_B = N/4;
+            int width_B = K/4;
+            int padded_height_B = (N + padding)/4;
+
+            kernel = backend_ctx->kernel_transpose_32_16;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
+
+            size_t local_size_t[2] = { 1, 16 };
+            //WGS tuning
+            if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
+                local_size_t[0]=4;
+                local_size_t[1]=8;
+            } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
+                local_size_t[0]=2;
+                local_size_t[1]=8;
+            } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
+                local_size_t[0]=1;
+                local_size_t[1]=8;
+            } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
+                local_size_t[0]=2;
+                local_size_t[1]=8;
+            }
+
+            size_t global_size_t[2] = {
+                static_cast<size_t>(width_B),
+                static_cast<size_t>(padded_height_B)
+            };
+
+            #ifdef GGML_OPENCL_PROFILING
+                cl_event evt;
+                CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
+
+                g_profiling_info.emplace_back();
+                populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
+            #else
+                CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
+            #endif
+        } else {
+            // no need to transpose B in other cases
+            // create an image for B from sub_buffer
+            // <--------------------------------------------> //
+            img_fmt_1d = {CL_RGBA, CL_FLOAT};
+
+            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+            img_desc_1d.image_width = K * N / 4;
+            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+            img_desc_1d.buffer = B_sub_buffer;
+            B_image1d = clCreateImage(
+                context,
+                CL_MEM_READ_ONLY,
+                &img_fmt_1d,
+                &img_desc_1d,
+                NULL,
+                &status);
+            CL_CHECK(status);
+            // <--------------------------------------------> //
+        }
+
+        // choose gemm or gemv kernel
+        // <--------------------------------------------> //
+        if (N == 1) {
+            kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
+            if (M == 4096 && K == 4096) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
+            } else if (M == 4096 && K == 11008) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
+            } else if (M == 11008 && K == 4096) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
+            } else if (M == 32000 && K == 4096) {
+                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+            }
+        } else {
+            kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4;
+        }
+        // <--------------------------------------------> //
+
+        // set kernel args
+        // <--------------------------------------------> //
+        cl_uint k_arg = 0;
+
+        if (N == 1) {
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &A_image1d));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &B_image1d));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extra1->offset));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extrad->offset));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r3));
+        } else {
+            region.origin = extrad->offset; // Specify the starting offset (in bytes)
+            region.size = M * N * sizeof(float); // Specify the size of the sub-buffer
+            C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+            CL_CHECK(status);
+
+            int padded_N = ne1 + padding;
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); //A_q_dextra0_q4_0->q
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); //A_s_d
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); //B_d
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); //C_d
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &ne01)); //M
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),    &padded_N)); //N with padding
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),    &ne00)); //K
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),    &ne1)); //N without padding
+        }
+        // <--------------------------------------------> //
+
+        // choose workgroup size
+        // <--------------------------------------------> //
+        size_t global_work_size[3] = {
+            64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)};
+        size_t local_work_size[3] = {64, 2, 4};
+
+        global_work_size[0] = (size_t)(ceil((float)ne1/8));
+        global_work_size[1] = (size_t)(ne01/4);
+        global_work_size[2] = (size_t)(1);
+
+        local_work_size[0]  = (size_t)(1); //4x32 for FP32
+        local_work_size[1]  = (size_t)(128);
+        local_work_size[2]  = (size_t)(1);
+
+        //WGS tuning
+        if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
+            local_work_size[0] = 1;
+            local_work_size[1] = 128;
+        } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
+            local_work_size[0] = 2;
+            local_work_size[1] = 64;
+        } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
+            local_work_size[0] = 2;
+            local_work_size[1] = 64;
+        } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
+            local_work_size[0] = 2;
+            local_work_size[1] = 64;
+        }
+
+        if (N == 1) {
+            local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+            local_work_size[1] = 4; // reduce factor
+            local_work_size[2] = 1;
+
+            global_work_size[0] = M / 2;
+            global_work_size[1] = 4; // reduce factor
+            global_work_size[2] = 1;
+        }
+        // <--------------------------------------------> //
+
+        // enqueue kernel with profiling
+        // <--------------------------------------------> //
+    #ifdef GGML_OPENCL_PROFILING
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+        // enqueue kernel without profiling
+    #else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    #endif
+        // <--------------------------------------------> //
+
+        // deallocate sub buffers and images
+        // <--------------------------------------------> //
+        CL_CHECK(clReleaseMemObject(A_image1d));
+        CL_CHECK(clReleaseMemObject(B_sub_buffer));
+        CL_CHECK(clReleaseMemObject(B_image1d));
+
+        if (N != 1) {
+            CL_CHECK(clReleaseMemObject(B_d));
+            CL_CHECK(clReleaseMemObject(B_d_input_image));
+            CL_CHECK(clReleaseMemObject(C_d));
+        }
+        // <--------------------------------------------> //
+
+        return;
+    }
+    } // if (ne01 && ne1)
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+    if (!ggml_is_transposed(src0) &&
+        !ggml_is_transposed(src1) &&
+        src1t == GGML_TYPE_F32 &&
+        ne00%32 == 0 &&
+        ne11 > 2) {
+#ifdef GGML_OPENCL_SOA_Q
+        // Set up kernel.
+        switch(src0t) {
+            case GGML_TYPE_Q4_0:
+                // This should have been satisfied.
+                GGML_ASSERT(ne11 == ne1);
+                GGML_ASSERT(ne01 == ne0);
+
+                if (backend_ctx->gpu_family == INTEL) {
+                    nth0 = 16;
+                    nth1 = 1;
+
+                    kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
+                } else if (backend_ctx->gpu_family == ADRENO) {
+                    nth0 = 64;
+                    nth1 = 1;
+
+                    kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat;
+                } else {
+                    GGML_ASSERT(false && "TODO: Unknown GPU");
+                }
+
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
+                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+                break;
+            default:
+                break;
+        }
+
+        // Launch kernel.
+        if (src0t == GGML_TYPE_Q4_0) {
+            size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+            if (backend_ctx->gpu_family == INTEL) {
+                // Set global size for Intel. It uses 16x output values.
+                global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
+                global_work_size[1] = (size_t)ne11*nth1;
+                global_work_size[2] = (size_t)ne12*ne13;
+            }
+
+#ifdef GGML_OPENCL_PROFILING
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+            g_profiling_info.emplace_back();
+            populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+            return;
+        }
+#else // GGML_OPENCL_SOA_Q
+        // TODO: add block_q4_0 variant.
+#endif // GGML_OPENCL_SOA_Q
+    }
+
+    // use custom matrix x vector kernel
+    switch (src0t) {
+        case GGML_TYPE_F32:
+            //GGML_ASSERT(ne02 == ne12);
+            GGML_ASSERT(src1t == GGML_TYPE_F32);
+            kernel = backend_ctx->kernel_mul_mat_f32_f32;
+            nrows = 4;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 32;
+                nth1 = 1;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
+            break;
+        case GGML_TYPE_F16:
+            //GGML_ASSERT(ne02 == ne12);
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 32;
+                nth1 = 1;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            if (src1t == GGML_TYPE_F32) {
+                if (ne11 * ne12 < 4) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
+                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
+                    nrows = ne11;
+                } else {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32;
+                    nrows = 4;
+                }
+            } else {
+                kernel = backend_ctx->kernel_mul_mat_f16_f16;
+                nrows = 4;
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
+            break;
+        case GGML_TYPE_Q4_0:
+            // This should have been satisfied.
+            GGML_ASSERT(ne11 == ne1);
+            GGML_ASSERT(ne01 == ne0);
+
+#ifdef GGML_OPENCL_SOA_Q
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+                ndst = 8;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+                ndst =8;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+#else // GGML_OPENCL_SOA_Q
+            if (backend_ctx->gpu_family == INTEL) {
+                // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
+                // group produces N_DST (4 for Q4_0 kernel) values in the result.
+                // The number of workgroups on dim 0 (the leading dimension) is
+                // the nearest multiple of 4 that covers ne0 (equals ne01).
+                nth0 = 16;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
+                ndst = 4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 1;
+
+                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v;
+                ndst = 4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+#endif // GGML_OPENCL_SOA_Q
+            break;
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 2;
+                nth1 = 16;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 2;
+                nth1 = 64;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    if (src0t == GGML_TYPE_Q4_0 ||
+        src0t == GGML_TYPE_Q4_1 ||
+        src0t == GGML_TYPE_Q8_0 ||
+        src0t == GGML_TYPE_Q2_K) {
+        // Each SIMD group produces N_DST values in the result. Assuming each
+        // workgroup has N_SIMDGROUP SIMD groups, then each workgroup will
+        // produce N_DST*N_SIMDGROUP values in the result. Hence, the grid size
+        // (number of workgroups) will be a nearest multiple of
+        // N_DST*N_SIMDGROUP to cover the size of the dimension. Below, 4 is
+        // N_DST*N_SIMDGROUP (see the kernel for Q4_0 matmul).
+        size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else if (src0t == GGML_TYPE_Q4_K) {
+        GGML_ASSERT(false && "not implemented");
+    } else if (src0t == GGML_TYPE_Q3_K) {
+        GGML_ASSERT(false && "not implemented");
+    } else if (src0t == GGML_TYPE_Q5_K) {
+        GGML_ASSERT(false && "not implemented");
+    } else if (src0t == GGML_TYPE_Q6_K) {
+        size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        int64_t ny = (ne11 + nrows - 1)/nrows;
+
+        size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(scale));
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel = backend_ctx->kernel_scale;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &scale));
+
+    int n = ggml_nelements(dst)/4;
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+
+    // GGML_OP_CPY happens between src0 and src1.
+    // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
+    UNUSED(dst);
+
+    const int ne00 = src0 ? src0->ne[0] : 0;
+    const int ne01 = src0 ? src0->ne[1] : 0;
+    const int ne02 = src0 ? src0->ne[2] : 0;
+    const int ne03 = src0 ? src0->ne[3] : 0;
+
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+
+    const int ne10 = src1 ? src1->ne[0] : 0;
+    const int ne11 = src1 ? src1->ne[1] : 0;
+    const int ne12 = src1 ? src1->ne[2] : 0;
+    const int ne13 = src1 ? src1->ne[3] : 0;
+
+    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
+    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
+    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
+    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
+
+    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+
+    cl_kernel kernel;
+
+    switch (src0t) {
+        case GGML_TYPE_F32:
+            switch (src1t) {
+                case GGML_TYPE_F16:
+                    kernel = backend_ctx->kernel_cpy_f32_f16;
+                    break;
+                case GGML_TYPE_F32:
+                    kernel = backend_ctx->kernel_cpy_f32_f32;
+                    break;
+                default:
+                    GGML_ASSERT(false && "not implemented");
+            }
+            break;
+        case GGML_TYPE_F16:
+            switch (src1t) {
+                case GGML_TYPE_F16:
+                    kernel = backend_ctx->kernel_cpy_f16_f16;
+                    break;
+                case GGML_TYPE_F32:
+                    kernel = backend_ctx->kernel_cpy_f16_f32;
+                    break;
+                default:
+                    GGML_ASSERT(false && "not implemented");
+            }
+            break;
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
+
+    const int nth = MIN(64, ne00);
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cl_cpy(backend, src0, dst, nullptr);
+    UNUSED(src1);
+}
+
+static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    int n_past = ((int32_t *)(dst->op_params))[0];
+
+    const int  ne00 = src0 ? src0->ne[0] : 0;
+    const int  ne01 = src0 ? src0->ne[1] : 0;
+    const int  ne02 = src0 ? src0->ne[2] : 0;
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    if (ne00%8 == 0) {
+        kernel = backend_ctx->kernel_diag_mask_inf_8;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &n_past));
+
+        size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    } else {
+        kernel = backend_ctx->kernel_diag_mask_inf;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &n_past));
+
+        size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
+        size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        g_profiling_info.emplace_back();
+        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+}
+
+static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    // Softmax can now fuse KQ mask and KQ scale, which used to be two additional
+    // ops before softmax. It now also fuses alibi if `max_bias > 0`. For llama,
+    // alibi is not used; however, for some other models, it is used.
+    // KQ_mask
+    if (src1) {
+        GGML_ASSERT(src1);
+        GGML_ASSERT(src1->extra);
+    }
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
+
+    const int  ne00 = src0 ? src0->ne[0] : 0;
+    const int  ne01 = src0 ? src0->ne[1] : 0;
+    const int  ne02 = src0 ? src0->ne[2] : 0;
+    const int  ne03 = src0 ? src0->ne[3] : 0;
+
+    float scale, max_bias;
+    memcpy(&scale,    dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, dst->op_params + 1, sizeof(float));
+
+    const int nrows_x = ggml_nrows(src0);
+    const int nrows_y = src0->ne[1];
+
+    const int n_head      = nrows_x/nrows_y;
+    const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    // Local size must be wave size. Each workgroup is a wave, working on a row,
+    // where a row corresponds to leading dimension.
+    int nth = MIN(32, ne00);
+
+    if (backend_ctx->gpu_family == INTEL) {
+        // This is the same as the initial value.
+        nth = MIN(32, ne00);
+    }
+    else if (backend_ctx->gpu_family == ADRENO) {
+        nth = 64;
+    } else {
+        GGML_ASSERT(false && "TODO: Unknown GPU");
+    }
+
+    cl_kernel kernel;
+
+    if (ne00%4 == 0) {
+        kernel = backend_ctx->kernel_soft_max_4;
+    } else {
+        kernel = backend_ctx->kernel_soft_max;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   extra1 ? &extra1->data_device : &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(float),    &scale));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float),    &max_bias));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),    &m0));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float),    &m1));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &n_head_log2));
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    ggml_tensor * src2 = dst->src[2];
+    ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
+
+    cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
+
+    const int  ne00 = src0 ? src0->ne[0] : 0;
+    const int  ne01 = src0 ? src0->ne[1] : 0;
+    const int  ne02 = src0 ? src0->ne[2] : 0;
+    const int  ne03 = src0 ? src0->ne[3] : 0;
+
+    const int  nb00 = src0 ? src0->nb[0] : 0;
+    const int  nb01 = src0 ? src0->nb[1] : 0;
+    const int  nb02 = src0 ? src0->nb[2] : 0;
+    const int  nb03 = src0 ? src0->nb[3] : 0;
+
+    const int ne10 = src1 ? src1->ne[0] : 0;
+    const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
+    const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12);
+    const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+    const int  ne0 = dst ? dst->ne[0] : 0;
+    const int  ne1 = dst ? dst->ne[1] : 0;
+    const int  ne2 = dst ? dst->ne[2] : 0;
+    const int  ne3 = dst ? dst->ne[3] : 0;
+
+    const int  nb0 = dst ? dst->nb[0] : 0;
+    const int  nb1 = dst ? dst->nb[1] : 0;
+    const int  nb2 = dst ? dst->nb[2] : 0;
+    const int  nb3 = dst ? dst->nb[3] : 0;
+
+    GGML_ASSERT(ne10 == ne02);
+
+    int nth = MIN(64, ne00);
+
+    const int n_past     = ((int *) dst->op_params)[0];
+    const int n_dims     = ((int *) dst->op_params)[1];
+    const int mode       = ((int *) dst->op_params)[2];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+
+    memcpy(&freq_base,   (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params + 9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const bool is_neox = mode & 2;
+
+    cl_kernel kernel;
+
+    if (!is_neox) {
+        switch (src0->type) {
+            case GGML_TYPE_F32:
+                kernel = backend_ctx->kernel_rope_norm_f32;
+                break;
+            case GGML_TYPE_F16:
+                kernel = backend_ctx->kernel_rope_norm_f16;
+                break;
+            default:
+                GGML_ASSERT(false);
+        };
+    } else {
+        switch (src0->type) {
+            case GGML_TYPE_F32:
+                kernel = backend_ctx->kernel_rope_neox_f32;
+                break;
+            case GGML_TYPE_F16:
+                kernel = backend_ctx->kernel_rope_neox_f16;
+                break;
+            default:
+                GGML_ASSERT(false);
+        };
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   extra2 ? &extra2->data_device : &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne3));
+    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
+    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &n_past));
+    CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &n_dims));
+    CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),      &n_ctx_orig));
+    CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float),    &freq_base));
+    CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float),    &freq_scale));
+    CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float),    &ext_factor));
+    CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float),    &attn_factor));
+    CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float),    &beta_fast));
+    CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float),    &beta_slow));
+
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Op offloading
+//------------------------------------------------------------------------------
+
+typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
+    ggml_cl_func_t func = nullptr;
+
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+
+    const bool any_on_device = tensor->extra
+        || (src0 != nullptr && src0->extra)
+        || (src1 != nullptr && src1->extra);
+
+    switch (tensor->op) {
+        case GGML_OP_GET_ROWS:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_get_rows;
+            break;
+        case GGML_OP_CPY:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_cpy;
+            break;
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_dup;
+            break;
+        case GGML_OP_ADD:
+            if (!any_on_device) {
+                return false;
+            }
+            GGML_ASSERT(ggml_is_contiguous(src0));
+            GGML_ASSERT(ggml_is_contiguous(src1));
+            func = ggml_cl_add;
+            break;
+        case GGML_OP_MUL:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_mul;
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_silu;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_relu;
+                    break;
+                default:
+                    return false;
+            } break;
+        case GGML_OP_CLAMP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_clamp;
+            break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_norm;
+            break;
+        case GGML_OP_RMS_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_rms_norm;
+            break;
+        case GGML_OP_MUL_MAT:
+            if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cl_mul_mat;
+            break;
+        case GGML_OP_SCALE:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_scale;
+            break;
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_rope;
+            break;
+        default:
+            return false;
+    }
+
+    func(backend, tensor->src[0], tensor->src[1], tensor);
+    return true;
+}
diff --git a/ggml/src/ggml-opencl/kernels/embed_kernel.py b/ggml/src/ggml-opencl/kernels/embed_kernel.py
new file mode 100644
index 000000000..b5d1d7242
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/embed_kernel.py
@@ -0,0 +1,26 @@
+#
+
+import sys
+import logging
+logger = logging.getLogger("opencl-embed-kernel")
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    if len(sys.argv) != 3:
+        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
+        sys.exit(1)
+
+    ifile = open(sys.argv[1], "r")
+    ofile = open(sys.argv[2], "w")
+
+    for i in ifile:
+        ofile.write('R"({})"\n'.format(i))
+
+    ifile.close()
+    ofile.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
new file mode 100644
index 000000000..d1cdf709b
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl.cl
@@ -0,0 +1,2683 @@
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use subgroup size of 32 on Intel.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use subgroups size of 64 on Adreno.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0                   32
+#define QR4_0                   2
+#define QK4_1                   32
+#define QR4_1                   2
+#define QK5_0                   32
+#define QR5_0                   2
+#define QK5_1                   32
+#define QR5_1                   2
+#define QK8_0                   32
+#define QR8_0                   1
+#define QK_K                    256
+#define K_QUANTS_PER_ITERATION  2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q4_1
+//------------------------------------------------------------------------------
+struct block_q4_1
+{
+    half d;
+    half m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_0
+//------------------------------------------------------------------------------
+struct block_q5_0
+{
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_1
+//------------------------------------------------------------------------------
+struct block_q5_1
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q8_0
+//------------------------------------------------------------------------------
+struct block_q8_0
+{
+    half d;
+    int8_t qs[QK8_0];
+};
+
+//------------------------------------------------------------------------------
+// block_q2_K
+//------------------------------------------------------------------------------
+struct block_q2_K
+{
+    uint8_t scales[16];
+    uint8_t qs[64];
+    half d;
+    half dmin;
+};
+
+//------------------------------------------------------------------------------
+// block_q3_K
+//------------------------------------------------------------------------------
+struct block_q3_K
+{
+    uint8_t hmask[32];
+    uint8_t qs[64];
+    uint8_t scales[12];
+    half d;
+};
+
+//------------------------------------------------------------------------------
+// block_q4_K
+//------------------------------------------------------------------------------
+struct block_q4_K
+{
+    half d;
+    half dmin;
+    uint8_t scales[12];
+    uint8_t qs[128];
+};
+
+//------------------------------------------------------------------------------
+// block_q5_K
+//------------------------------------------------------------------------------
+struct block_q5_K
+{
+    half d;
+    half dmin;
+    uint8_t scales[12];
+    uint8_t qh[32];
+    uint8_t qs[128];
+};
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+struct block_q6_K
+{
+    uint8_t ql[128];
+    uint8_t qh[64];
+    int8_t scales[16];
+    half d;
+};
+
+//------------------------------------------------------------------------------
+// dequantize_q4_0_f32, dequantize_q4_0_f16
+//------------------------------------------------------------------------------
+void dequantize_q4_0_f32(global struct block_q4_0 * xb, short il, float16 * reg) {
+    global ushort * qs = ((global ushort *)xb + 1);
+    float d1 = il ? (xb->d / 16.h) : xb->d;
+    float d2 = d1 / 256.f;
+    float md = -8.h * xb->d;
+    ushort mask0 = il ? 0x00F0 : 0x000F;
+    ushort mask1 = mask0 << 8;
+
+    reg->s0 = d1 * (qs[0] & mask0) + md;
+    reg->s1 = d2 * (qs[0] & mask1) + md;
+
+    reg->s2 = d1 * (qs[1] & mask0) + md;
+    reg->s3 = d2 * (qs[1] & mask1) + md;
+
+    reg->s4 = d1 * (qs[2] & mask0) + md;
+    reg->s5 = d2 * (qs[2] & mask1) + md;
+
+    reg->s6 = d1 * (qs[3] & mask0) + md;
+    reg->s7 = d2 * (qs[3] & mask1) + md;
+
+    reg->s8 = d1 * (qs[4] & mask0) + md;
+    reg->s9 = d2 * (qs[4] & mask1) + md;
+
+    reg->sa = d1 * (qs[5] & mask0) + md;
+    reg->sb = d2 * (qs[5] & mask1) + md;
+
+    reg->sc = d1 * (qs[6] & mask0) + md;
+    reg->sd = d2 * (qs[6] & mask1) + md;
+
+    reg->se = d1 * (qs[7] & mask0) + md;
+    reg->sf = d2 * (qs[7] & mask1) + md;
+}
+
+void dequantize_q4_0_f16(global struct block_q4_0 * xb, short il, half16 * reg) {
+    global ushort * qs = ((global ushort *)xb + 1);
+    half d1 = il ? (xb->d / 16.h) : xb->d;
+    half d2 = d1 / 256.h;
+    half md = -8.h * xb->d;
+    ushort mask0 = il ? 0x00F0 : 0x000F;
+    ushort mask1 = mask0 << 8;
+
+    reg->s0 = d1 * (qs[0] & mask0) + md;
+    reg->s1 = d2 * (qs[0] & mask1) + md;
+
+    reg->s2 = d1 * (qs[1] & mask0) + md;
+    reg->s3 = d2 * (qs[1] & mask1) + md;
+
+    reg->s4 = d1 * (qs[2] & mask0) + md;
+    reg->s5 = d2 * (qs[2] & mask1) + md;
+
+    reg->s6 = d1 * (qs[3] & mask0) + md;
+    reg->s7 = d2 * (qs[3] & mask1) + md;
+
+    reg->s8 = d1 * (qs[4] & mask0) + md;
+    reg->s9 = d2 * (qs[4] & mask1) + md;
+
+    reg->sa = d1 * (qs[5] & mask0) + md;
+    reg->sb = d2 * (qs[5] & mask1) + md;
+
+    reg->sc = d1 * (qs[6] & mask0) + md;
+    reg->sd = d2 * (qs[6] & mask1) + md;
+
+    reg->se = d1 * (qs[7] & mask0) + md;
+    reg->sf = d2 * (qs[7] & mask1) + md;
+}
+
+//------------------------------------------------------------------------------
+// add
+//------------------------------------------------------------------------------
+
+// general-purpose kernel for addition of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+// cons: not very efficient
+kernel void kernel_add(
+        global char * src0,
+        ulong  offset0,
+        global char * src1,
+        ulong  offset1,
+        global char * dst,
+        ulong  offsetd,
+        int   ne00,
+        int   ne01,
+        int   ne02,
+        int   ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int   ne10,
+        int   ne11,
+        int   ne12,
+        int   ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int   ne0,
+        int   ne1,
+        int   ne2,
+        int   ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) + *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        global float4 * src0,
+        ulong  offset0,
+        global float4 * src1,
+        ulong  offset1,
+        global float4 * dst,
+        ulong  offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] + src1[idx1];
+}
+
+//------------------------------------------------------------------------------
+// mul
+//------------------------------------------------------------------------------
+kernel void kernel_mul(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) * *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_mul_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] * src1[idx1];
+}
+
+//------------------------------------------------------------------------------
+// scale
+//------------------------------------------------------------------------------
+kernel void kernel_scale(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * dst,
+        ulong offsetd,
+        float scale
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst = (global float4*)((global char*)dst + offsetd);
+    dst[get_global_id(0)] = src0[get_global_id(0)] * scale;
+}
+
+//------------------------------------------------------------------------------
+// gelu
+//------------------------------------------------------------------------------
+#define GELU_COEF_A     0.044715f
+#define SQRT_2_OVER_PI  0.79788456080286535587989211986876f
+
+kernel void kernel_gelu(
+    global float * src0,
+    ulong offset0,
+    global float * dst,
+    ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    float x = src0[get_global_id(0)];
+
+    dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_4(
+    global float4 * src0,
+    ulong offset0,
+    global float4 * dst,
+    ulong offsetd
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    float4 x = src0[get_global_id(0)];
+
+    dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+//------------------------------------------------------------------------------
+// silu
+//------------------------------------------------------------------------------
+kernel void kernel_silu(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    float x = src0[get_global_id(0)];
+    dst[get_global_id(0)] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_silu_4(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * dst,
+        ulong offsetd
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    float4 x = src0[get_global_id(0)];
+    dst[get_global_id(0)] = x / (1.0f + exp(-x));
+}
+
+//------------------------------------------------------------------------------
+// relu
+//------------------------------------------------------------------------------
+kernel void kernel_relu(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = fmax(0.0f, src0[get_global_id(0)]);
+}
+
+//------------------------------------------------------------------------------
+// clamp
+//------------------------------------------------------------------------------
+kernel void kernel_clamp(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        float min,
+        float max
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = src0[get_global_id(0)] < min ?
+        min :
+        (src0[get_global_id(0)] > max ? max : src0[get_global_id(0)]);
+}
+
+//------------------------------------------------------------------------------
+// norm
+//------------------------------------------------------------------------------
+kernel void kernel_norm(
+        global void * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        float eps,
+        local float * sum
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    dst = (global void*)((global char*)dst + offsetd);
+
+    global float * x = (global float *) ((global char *) src0 + get_group_id(0)*nb01);
+
+    // MEAN
+    // parallel sum
+    sum[get_local_id(0)] = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        sum[get_local_id(0)] += x[i00];
+    }
+    // reduce
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+        if (get_local_id(0) < i) {
+            sum[get_local_id(0)] += sum[get_local_id(0) + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    float mean  = sum[0] / ne00;
+
+    // recenter and VARIANCE
+    barrier(CLK_LOCAL_MEM_FENCE);
+    global float * y = dst + get_group_id(0)*ne00;
+    sum[get_local_id(0)] = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        y[i00] = x[i00] - mean;
+        sum[get_local_id(0)] += y[i00] * y[i00];
+    }
+
+    // reduce
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
+        if (get_local_id(0) < i) {
+            sum[get_local_id(0)] += sum[get_local_id(0) + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    float variance = sum[0] / ne00;
+
+    float scale = 1.0f/sqrt(variance + eps);
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        y[i00] = y[i00] * scale;
+    }
+}
+
+//------------------------------------------------------------------------------
+// rms_norm
+//------------------------------------------------------------------------------
+// This kernel depends on subgroup size.
+kernel void kernel_rms_norm(
+        global void * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        float eps,
+        local float * sum // Note, the size depends on number of subgroups
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    global float4 * x = (global float4 *) ((global char *) src0 + get_group_id(0)*nb01);
+    global float * x_scalar = (global float *) x;
+    float4 sumf = 0;
+    float all_sum = 0;
+
+    // parallel sum
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        sumf += x[i00] * x[i00];
+    }
+    all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
+    all_sum = sub_group_reduce_add(all_sum);
+    if (get_sub_group_local_id() == 0) {
+        sum[get_sub_group_id()] = all_sum;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // broadcast
+    for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
+       if (get_local_id(0) < i) {
+           sum[get_local_id(0)] += sum[get_local_id(0) + i];
+       }
+    }
+    if (get_local_id(0) == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
+        sum[0] /= ne00;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const float mean  = sum[0];
+    const float scale = 1.0f/sqrt(mean + eps);
+
+    global float4 * y = (global float4 *) (dst + get_group_id(0)*ne00);
+    global float * y_scalar = (global float *) y;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        y[i00] = x[i00] * scale;
+    }
+    if (get_local_id(0) == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// diag_mask_inf kernels
+//------------------------------------------------------------------------------
+kernel void kernel_diag_mask_inf(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int n_past
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i02 = get_global_id(2);
+    int i01 = get_global_id(1);
+    int i00 = get_global_id(0);
+
+    if (i00 > n_past + i01) {
+        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
+    } else {
+        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
+    }
+}
+
+kernel void kernel_diag_mask_inf_8(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int n_past
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    dst = (global float4*)((global char*)dst + offsetd);
+
+    int i = 2*get_global_id(0);
+
+    dst[i+0] = src0[i+0];
+    dst[i+1] = src0[i+1];
+    int i4 = 4*i;
+    int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
+    int i01 = i4/(ne00);      i4 -= i01*ne00;
+    int i00 = i4;
+    for (int k = 3; k >= 0; --k) {
+        if (i00 + 4 + k <= n_past + i01) {
+            break;
+        }
+        (&dst[i+1])[k] = -INFINITY;
+        if (i00 + k > n_past + i01) {
+            (&dst[i])[k] = -INFINITY;
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// softmax
+//------------------------------------------------------------------------------
+kernel void kernel_soft_max(
+        global float * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        float scale,
+        float max_bias,
+        float m0,
+        float m1,
+        int n_head_log2
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0;
+    global float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    float slope = 1.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        int h = i02;
+
+        float base = h < n_head_log2 ? m0 : m1;
+        int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float lmax = -INFINITY;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+    }
+    float max = sub_group_reduce_max(lmax);
+
+    // parallel sum
+    float lsum = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
+        lsum += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // wish to compute it twice.
+        pdst[i00] = exp_psrc0;
+    }
+
+    const float sum = sub_group_reduce_add(lsum);
+
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        pdst[i00] /= sum;
+    }
+}
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_soft_max_4(
+        global float * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        float scale,
+        float max_bias,
+        float m0,
+        float m1,
+        int n_head_log2
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0;
+    global float4 * pdst4 = (global float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+    float slope = 1.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        int h = i02;
+
+        float base = h < n_head_log2 ? m0 : m1;
+        int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float4 lmax4 = -INFINITY;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+    }
+    float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
+
+    const float max = sub_group_reduce_max(lmax);
+
+    // parallel sum
+    float4 lsum4 = 0.0f;
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
+        lsum4 += exp_psrc4;
+        pdst4[i00] = exp_psrc4;
+    }
+    float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
+
+    const float sum = sub_group_reduce_add(lsum);
+
+    for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
+        pdst4[i00] /= sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+// kernel_rope
+//------------------------------------------------------------------------------
+float rope_yarn_ramp(float low, float high, int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+float2 rope_yarn(
+    float theta_extrap, float freq_scale, float2 corr_dims, int i0, float ext_factor, float mscale
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.s0, corr_dims.s1, i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    return (float2)(cos(theta) * mscale, sin(theta) * mscale);
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+float2 rope_yarn_corr_dims(
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow
+) {
+    // start and end correction dims
+    return (float2)(
+        max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))),
+        min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)))
+    );
+}
+
+kernel void kernel_rope_norm_f32(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past,
+        int n_dims,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i3 = get_group_id(2);
+    int i2 = get_group_id(1);
+    int i1 = get_group_id(0);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    global int * pos = src1;
+
+    float theta_base = (float) pos[i2];
+    float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+        if (i0 < n_dims) {
+            int ic = i0/2;
+
+            float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+            float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+            float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+            global float * src       = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global float * dst_data  = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            float x0 = src[0];
+            float x1 = src[1];
+
+            dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+            dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+        } else {
+            global float * src      = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global float * dst_data = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+kernel void kernel_rope_norm_f16(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past,
+        int n_dims,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i3 = get_group_id(2);
+    int i2 = get_group_id(1);
+    int i1 = get_group_id(0);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    global int * pos = src1;
+
+    float theta_base = (float) pos[i2];
+    float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+        if (i0 < n_dims) {
+            int ic = i0/2;
+
+            float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+            float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+            float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+            global half * src       = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global half * dst_data  = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            float x0 = src[0];
+            float x1 = src[1];
+
+            dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+            dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+        } else {
+            global half * src      = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global half * dst_data = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+kernel void kernel_rope_neox_f32(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past,
+        int n_dims,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i3 = get_group_id(2);
+    int i2 = get_group_id(1);
+    int i1 = get_group_id(0);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    global int * pos = src1;
+
+    float theta_base = (float) pos[i2];
+    float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+        if (i0 < n_dims) {
+            int ic = i0/2;
+
+            const float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+            const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+            float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+            global float * src      = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+            global float * dst_data = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[n_dims/2];
+
+            dst_data[0]        = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+            dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+        } else {
+            global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global float * dst_data  = (global float *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+kernel void kernel_rope_neox_f16(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3,
+        int n_past,
+        int n_dims,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    src2 = (global float*)((global char*)src2 + offset2);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i3 = get_group_id(2);
+    int i2 = get_group_id(1);
+    int i1 = get_group_id(0);
+
+    float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
+
+    global int * pos = src1;
+
+    float theta_base = (float) pos[i2];
+    float inv_ndims = -1.f/n_dims;
+
+    for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
+        if (i0 < n_dims) {
+            int ic = i0/2;
+
+            const float theta = theta_base * pow(freq_base, inv_ndims*i0);
+
+            const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
+
+            float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
+
+            global half * src       = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+            global half * dst_data  = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[n_dims/2];
+
+            dst_data[0]        = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
+            dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
+        } else {
+            global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            global half * dst_data  = (global half *)((global char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// cpy
+//------------------------------------------------------------------------------
+
+kernel void kernel_cpy_f16_f16(
+        global half * src0,
+        ulong offset0,
+        global half * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst = (global half*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    int i3 = n / (ne2*ne1*ne0);
+    int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        global const half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
+kernel void kernel_cpy_f16_f32(
+        global half * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    int i3 = n / (ne2*ne1*ne0);
+    int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        global half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
+kernel void kernel_cpy_f32_f16(
+        global float * src0,
+        ulong offset0,
+        global half * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global half*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    int i3 = n / (ne2*ne1*ne0);
+    int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        dst_data[i00] = src[0];
+    }
+}
+
+kernel void kernel_cpy_f32_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    int i3 = n / (ne2*ne1*ne0);
+    int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
+        global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        dst_data[i00] = src[0];
+    }
+}
+
+//------------------------------------------------------------------------------
+// get_rows
+//------------------------------------------------------------------------------
+kernel void kernel_get_rows_f32(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        ulong nb02,
+        int ne10,
+        ulong nb10,
+        ulong nb11,
+        ulong nb1,
+        ulong nb2
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i10 = get_group_id(0);
+    int i11 = get_group_id(1);
+
+    int r = ((global int *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    int i02 = i11;
+
+    for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
+        ((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
+            ((global float *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
+    }
+}
+
+kernel void kernel_get_rows_f16(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        ulong nb02,
+        int ne10,
+        ulong nb10,
+        ulong nb11,
+        ulong nb1,
+        ulong nb2
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int i10 = get_group_id(0);
+    int i11 = get_group_id(1);
+
+    int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    int i02 = i11;
+
+    for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
+        ((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
+            ((global half *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
+    }
+}
+
+kernel void kernel_get_rows_q4_0(
+        global void * src0,
+        ulong offset0,
+        global int * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        ulong nb01,
+        ulong nb02,
+        int ne10,
+        ulong nb10,
+        ulong nb11,
+        ulong nb1,
+        ulong nb2
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global int*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int NL = 2;
+
+    int i10 = get_group_id(0);
+    int i11 = get_group_id(1);
+
+    int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    int i02 = i11;
+
+    for (int ind = get_local_id(0); ind < ne00/16; ind += get_local_size(0)) {
+        float16 temp;
+        dequantize_q4_0_f32(
+            ((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02)) + ind/NL, ind%NL, &temp);
+        *(((global float16 *) ((global char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f32_f32
+//------------------------------------------------------------------------------
+#define N_F32_F32 4
+
+kernel void kernel_mul_mat_f32_f32(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0);
+    int rb = get_group_id(1)*N_F32_F32;
+    int im = get_group_id(2);
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global float * x = (global float *) (src0 + offset_src0);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float * y = (global float *) (src1 + offset_src1);
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = sub_group_reduce_add(sumf);
+            if (get_sub_group_local_id() == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        global float4 * x4 = (global float4 *)x;
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float  * y  = (global float  *) (src1 + offset_src1);
+            global float4 * y4 = (global float4 *) y;
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+                sumf += (float) x4[i].s0 * y4[i].s0;
+                sumf += (float) x4[i].s1 * y4[i].s1;
+                sumf += (float) x4[i].s2 * y4[i].s2;
+                sumf += (float) x4[i].s3 * y4[i].s3;
+            }
+
+            float all_sum = sub_group_reduce_add(sumf);
+            if (get_sub_group_local_id() == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) {
+                    all_sum += (float) x[i] * y[i];
+                }
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f16
+//------------------------------------------------------------------------------
+#define N_F16_F16 4
+
+kernel void kernel_mul_mat_f16_f16(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3)
+{
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0);
+    int rb = get_group_id(1)*N_F16_F16;
+    int im = get_group_id(2);
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global half * x = (global half *) (src0 + offset_src0);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global half * y = (global half *) (src1 + offset_src1);
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+                sumf += (half) x[i] * (half) y[i];
+            }
+
+            float all_sum = sub_group_reduce_add(sumf);
+            if (get_sub_group_local_id() == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        global half4 * x4 = (global half4 *)x;
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global half  * y  = (global half  *) (src1 + offset_src1);
+            global half4 * y4 = (global half4 *) y;
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+                sumf += (half) x4[i].s0 * y4[i].s0;
+                sumf += (half) x4[i].s1 * y4[i].s1;
+                sumf += (half) x4[i].s2 * y4[i].s2;
+                sumf += (half) x4[i].s3 * y4[i].s3;
+            }
+
+            float all_sum = sub_group_reduce_add(sumf);
+            if (get_sub_group_local_id() == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) {
+                    all_sum += (half) x[i] * y[i];
+                }
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32_1row
+//------------------------------------------------------------------------------
+kernel void kernel_mul_mat_f16_f32_1row(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+    ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+    global half  * x = (global half  *) (src0 + offset_src0);
+    global float * y = (global float *) (src1 + offset_src1);
+
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = sub_group_reduce_add(sumf);
+        if (get_sub_group_local_id() == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        global half4  * x4 = (global half4  *) x;
+        global float4 * y4 = (global float4 *) y;
+        for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+            sumf += (float) x4[i].s0 * y4[i].s0;
+            sumf += (float) x4[i].s1 * y4[i].s1;
+            sumf += (float) x4[i].s2 * y4[i].s2;
+            sumf += (float) x4[i].s3 * y4[i].s3;
+        }
+        float all_sum = sub_group_reduce_add(sumf);
+        if (get_sub_group_local_id() == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) {
+                all_sum += (float) x[i] * y[i];
+            }
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32
+//------------------------------------------------------------------------------
+#define N_F16_F32 4
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_f16_f32(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int r0 = get_group_id(0);
+    int rb = get_group_id(1)*N_F16_F32;
+    int im = get_group_id(2);
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global half * x = (global half *) (src0 + offset_src0);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float * y = (global float *) (src1 + offset_src1);
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
+                sumf += convert_float(x[i]) * y[i];
+            }
+
+            float all_sum = sub_group_reduce_add(sumf);
+            if (get_sub_group_local_id() == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        global half4 * x4 = (global half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+            global float  * y  = (global float  *) (src1 + offset_src1);
+            global float4 * y4 = (global float4 *) y;
+
+            float sumf = 0;
+            for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+                sumf += convert_float(x4[i].s0) * y4[i].s0;
+                sumf += convert_float(x4[i].s1) * y4[i].s1;
+                sumf += convert_float(x4[i].s2) * y4[i].s2;
+                sumf += convert_float(x4[i].s3) * y4[i].s3;
+            }
+
+            float all_sum = sub_group_reduce_add(sumf);
+            if (get_sub_group_local_id() == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) {
+                    all_sum += (float) x[i] * y[i];
+                }
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_mat_f16_f32_l4
+//------------------------------------------------------------------------------
+// Assumes row size (ne00) is a multiple of 4
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_f16_f32_l4(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int nrows = ne11;
+    int r0 = get_group_id(0);
+    int im = get_group_id(2);
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global half4 * x4 = (global half4 *) (src0 + offset_src0);
+
+    for (int r1 = 0; r1 < nrows; ++r1) {
+        ulong offset_src1 = r1*nb11 + (i12   )*nb12 + (i13   )*nb13;
+
+        global float4 * y4 = (global float4 *) (src1 + offset_src1);
+
+        float sumf = 0;
+        for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
+            sumf += convert_float(x4[i].s0) * y4[i].s0;
+            sumf += convert_float(x4[i].s1) * y4[i].s1;
+            sumf += convert_float(x4[i].s2) * y4[i].s2;
+            sumf += convert_float(x4[i].s3) * y4[i].s3;
+        }
+
+        float all_sum = sub_group_reduce_add(sumf);
+        if (get_sub_group_local_id() == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32
+//------------------------------------------------------------------------------
+// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_4_0_dot_y(
+        global struct block_q4_0 * qb_curr,
+        float sumy,
+        private float * yl,
+        int il
+) {
+    float d = qb_curr->d;
+    float2 acc = 0.f;
+    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
+    for (int i = 0; i < 8; i+=2) {
+        acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+    return d * (sumy * -8.f + acc.s0 + acc.s1);
+}
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32(
+        global void * src0,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essenatially the linear global
+    // id of a SIMD group in the grid.
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
+    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[16];       // src1 vector cache
+    float sumf[N_DST]={0.f};
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix * QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0;
+        for (int i = 0; i < 8; i += 2) {
+            sumy += yb[i] + yb[i+1];
+            yl[i+0] = yb[i+ 0];
+            yl[i+1] = yb[i+ 1]/256.f;
+            sumy += yb[i+16] + yb[i+17];
+            yl[i+8] = yb[i+16]/16.f;
+            yl[i+9] = yb[i+17]/4096.f;
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
+        }
+
+        // One thread in a SIMD group (i.e., subgroup) handles a half block,
+        // hence then entire SIMD group handles SIMDWIDTH/2 blocks.
+        // y points to the activation matrix (of type float). Therefore for
+        // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
+        // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
+        // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    // The above does not work for Adreno - it produces incorrect results for
+    // row = 1, 2, 3 and only row = 0 gives the correct result.
+    // If N_DST is changed, the below array must be initialized accordingly.
+    // This also seems to perform better on Intel.
+    float tot[N_DST] = {
+        sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
+        sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
+    for (int row = 0; row < N_DST; ++row) {
+        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32(
+        global void * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//
+// This variant unrolls the loops and uses vector types instead of pointers.
+// It improves performance on Adreno but not so much on Intel.
+//
+inline float block_q_4_0_dot_y_v(
+        global struct block_q4_0 * qb_curr,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float d = qb_curr->d;
+    float acc = 0.f;
+    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
+
+    acc += yl.s0 * (qs[0] & 0x000F);
+    acc += yl.s1 * (qs[0] & 0x0F00);
+    acc += yl.s8 * (qs[0] & 0x00F0);
+    acc += yl.s9 * (qs[0] & 0xF000);
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_v(
+        global void * src0,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essenatially the linear global
+    // id of a SIMD group in the grid.
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
+    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;       // src1 vector cache
+    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix * QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
+        sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);
+
+        // One thread in a SIMD group (i.e., subgroup) handles a half block,
+        // hence then entire SIMD group handles SIMDWIDTH/2 blocks.
+        // y points to the activation matrix (of type float). Therefore for
+        // one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
+        // SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
+        // floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    // The above does not work for Adreno - it produces incorrect results for
+    // row = 1, 2, 3 and only row = 0 gives the correct result.
+    // If N_DST is changed, the below array must be initialized accordingly.
+    // This also seems to perform better on Intel.
+    float4 tot = (float4)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_v(
+        global void * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//------------------------------------------------------------------------------
+// kernel_convert_block_q4_0
+// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q4_0(
+    global struct block_q4_0 * src0,
+    global uchar * dst_q,
+    global half  * dst_d
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
+    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
+    global half  * d = (global half *) dst_d + get_global_id(0);
+
+    *d = b->d;
+
+    for (int i = 0; i < QK4_0/2; ++i) {
+        q[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q4_0(
+    global uchar * src_q,
+    global half  * src_d,
+    global struct block_q4_0 * dst
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
+    global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
+    global half  * d = (global half *) src_d + get_global_id(0);
+
+    b->d = *d;
+    for (int i = 0; i < QK4_0/2; ++i) {
+        b->qs[i] = q[i];
+    }
+}
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32_flat
+//
+// This variation uses flat arrays (struct of arrays, SOA) representation for
+// quant tensors.
+//------------------------------------------------------------------------------
+
+// This function requires the original shuffled weights.
+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
+// packed together in a byte, so are (q[1], q[17]) and so on.
+inline float block_q_4_0_dot_y_flat(
+        global uchar * x,
+        global half  * dh,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float           d   = *dh;
+    global ushort * qs  = ((global ushort *)x + il/2);
+    float           acc = 0.f;
+
+    acc += yl.s0 * (qs[0] & 0x000F);
+    acc += yl.s1 * (qs[0] & 0x0F00);
+    acc += yl.s8 * (qs[0] & 0x00F0);
+    acc += yl.s9 * (qs[0] & 0xF000);
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 32
+#elif defined (ADRENO_GPU)
+#define N_DST 4
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float4 tot = (float4)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//
+// This variant outputs 8 values.
+//
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 32
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_8x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float8 sumf = 0.f;
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float8 tot = (float8)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_8x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_vec_q_n_f32_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
new file mode 100644
index 000000000..e2024332f
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
@@ -0,0 +1,106 @@
+//------------------------------------------------------------------------------
+// This file is contains additional kernels for data conversion.
+// These kernels are used when loading the model, so its performance is less
+// important.
+//------------------------------------------------------------------------------
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use subgroup size of 32 on Intel.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use subgroups size of 64 on Adreno.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0                   32
+#define QR4_0                   2
+#define QK4_1                   32
+#define QR4_1                   2
+#define QK5_0                   32
+#define QR5_0                   2
+#define QK5_1                   32
+#define QR5_1                   2
+#define QK8_0                   32
+#define QR8_0                   1
+#define QK_K                    256
+#define K_QUANTS_PER_ITERATION  2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// mul_vec_q_n_f32_flat_noshuffle
+//
+// This variation uses flat arrays (struct of arrays, SOA) representation for
+// quant tensors. It also uses non shuffled bit order for weights.
+//
+// The shuffled version is kept in the original file because moving it here
+// seems to result in worse performance for adreno.
+//------------------------------------------------------------------------------
+
+kernel void kernel_convert_block_q4_0_noshuffle(
+    global struct block_q4_0 * src0,
+    global uchar * dst_q,
+    global half  * dst_d
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
+    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
+    global half  * d = (global half *) dst_d + get_global_id(0);
+
+    *d = b->d;
+    for (int i = 0; i < QK4_0/4; ++i) {
+        uchar x0 = b->qs[2*i + 0];
+        uchar x1 = b->qs[2*i + 1];
+
+        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
+        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
+
+#ifdef ADRENO_GPU
+        // Workaround for adreno - must have the following printf statement for
+        // the kernel to work properly. Otherwise it produces incorrect result.
+        // convert_uchar above also seems necessary.
+        // Compare against a large number so that it does not print anything.
+        // get_sub_group_local_id() also works.
+        if (get_global_id(0) == 65536*4096) {
+            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
+        }
+#endif
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
new file mode 100644
index 000000000..5e195411d
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
@@ -0,0 +1,265 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+// assume
+#define QK4_0 32
+#define N_SIMDGROUP 4
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
+    float shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 0); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 0); \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 1); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 1); \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y.s0, 2); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 2); \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 3); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 3); \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
+    float8 shared_y; \
+    shared_y = sub_group_broadcast(y, 0); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 1); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y, 2); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 3); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+__kernel void kernel_gemv_noshuffle(
+        __read_only  image1d_buffer_t src0_q,  // quantized A
+        global half2  * src0_d,  // A scales
+        __read_only  image1d_buffer_t src1,    // B
+        ulong offset1,            // offset to B (0)
+        global float * dst,     // C
+        ulong offsetd,            // offset to C (0)
+        uint K,               // K
+        int ne01,               // M
+        int ne02,               // 1
+        int ne10,               // K
+        int ne12,               // 1
+        int ne0,                // M
+        int ne1,                // N
+        int r2,                 // 1
+        int r3)
+{
+    uint groupId = get_local_id(1);
+    uint gid     = get_global_id(0);
+    ushort slid    = get_sub_group_local_id();
+
+    __private uint4     regA;
+    __private half2     regS;
+    __private float8    regB;
+
+    __private float2 totalSum = (float2)(0.0f);
+
+    // loop along K in block granularity, skip 4 blocks every iter
+    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
+        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
+        // first 4 fibers in each wave load 8 B values to its private scope
+        if (slid < 4) {
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        // load half weights for two blocks in consecutive rows
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+    }
+
+    // reduction in local memory, assumes #wave=4
+    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
+    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 2 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd);
+        vstore2(totalSum, 0, &(dst[gid * 2]));
+    }
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
new file mode 100644
index 000000000..5bdd4d067
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
@@ -0,0 +1,271 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
+#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
+#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+// assume
+#define QK4_0 32
+#define N_SIMDGROUP 4
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
+    float shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 0); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 0); \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 1); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 1); \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y.s0, 2); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 2); \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s0, 3); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s4, 3); \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
+    float8 shared_y; \
+    shared_y = sub_group_broadcast(y, 0); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 1); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
+    shared_y = sub_group_broadcast(y, 2); \
+    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+    shared_y = sub_group_broadcast(y, 3); \
+    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
+    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
+    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
+    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
+    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
+    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
+    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
+    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
+    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
+    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
+    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
+    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
+    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
+    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
+    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
+    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
+
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+__kernel void kernel_gemv_noshuffle(
+        __read_only  image1d_buffer_t src0_q,  // quantized A
+        global half2  * src0_d,  // A scales
+        __read_only  image1d_buffer_t src1,    // B
+        ulong offset1,            // offset to B (0)
+        global float * dst,     // C
+        ulong offsetd,            // offset to C (0)
+        int ne00,               // K
+        int ne01,               // M
+        int ne02,               // 1
+        int ne10,               // K
+        int ne12,               // 1
+        int ne0,                // M
+        int ne1,                // N
+        int r2,                 // 1
+        int r3)
+{
+    uint groupId = get_local_id(1);
+    uint gid     = get_global_id(0);
+    ushort slid    = get_sub_group_local_id();
+
+    uint K = ne00;
+    uint M = ne01;
+
+    uint LINE_STRIDE_A = M / 2;
+    uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
+
+    __private uint4     regA;
+    __private half2     regS;
+    __private float8    regB;
+
+    __private float2 totalSum = (float2)(0.0f);
+
+    // loop along K in block granularity, skip 4 blocks every iter
+    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
+        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
+        // first 4 fibers in each wave load 8 B values to its private scope
+        if (slid < 4) {
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        // load half weights for two blocks in consecutive rows
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+#ifdef VECTOR_SUB_GROUP_BROADCAT
+        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
+#else
+        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
+#endif // VECTOR_SUB_GROUP_BROADCAT
+    }
+
+    // reduction in local memory, assumes #wave=4
+    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
+    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 2 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd);
+        vstore2(totalSum, 0, &(dst[gid * 2]));
+    }
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
new file mode 100644
index 000000000..e19e9a2f4
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
@@ -0,0 +1,1225 @@
+//------------------------------------------------------------------------------
+// This file is contains additional mulmat kernels
+// (and potentially other kernels).
+//------------------------------------------------------------------------------
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#elif defined(cl_amd_fp16)
+#pragma OPENCL EXTENSION cl_amd_fp16 : enable
+#else
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
+#endif
+
+#ifdef cl_khr_subgroups
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#elif defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#error "Subgroup not supported on your device."
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+// Always use subgroup size of 32 on Intel.
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+// Always use subgroups size of 64 on Adreno.
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#else
+// TODO: do not know how to choose subgroup size on other GPUs.
+#error "Selecting subgroup size is not supported on your device."
+#endif
+
+#define QK4_0                   32
+#define QR4_0                   2
+#define QK4_1                   32
+#define QR4_1                   2
+#define QK5_0                   32
+#define QR5_0                   2
+#define QK5_1                   32
+#define QR5_1                   2
+#define QK8_0                   32
+#define QR8_0                   1
+#define QK_K                    256
+#define K_QUANTS_PER_ITERATION  2
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+//------------------------------------------------------------------------------
+// block_q6_K
+//------------------------------------------------------------------------------
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    half d;             // super-block scale
+} block_q6_K;
+
+//------------------------------------------------------------------------------
+// These are the variant for matmatmul, based on the matvecmul kernel with
+// flattened block_q4_0.
+//------------------------------------------------------------------------------
+
+// Common dot prod.
+inline float mm_block_q_4_0_dot_y_flat(
+        global uchar * x,
+        global half  * dh,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float           d   = *dh;
+    global ushort * qs  = ((global ushort *)x + il/2);
+    float           acc = 0.f;
+
+    acc += yl.s0 * (qs[0] & 0x000F);
+    acc += yl.s1 * (qs[0] & 0x0F00);
+    acc += yl.s8 * (qs[0] & 0x00F0);
+    acc += yl.s9 * (qs[0] & 0xF000);
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+//
+// This variant performs 1d blocking with 8x output.
+// Eeach simdgroup outputs 8 values on `n0` dim (row in the output matrix).
+//
+inline void mul_mat_q_n_f32_1d_8x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float8 tot = (float8)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 16 // each SIMD group works on 8 rows (in weights matrix)
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
+#elif defined (ADRENO_GPU)
+#define N_DST 16
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+//
+// This variant performs 1d blocking with 16x output.
+// Eeach simdgroup outputs 16 values on `n0` dim (row in the output matrix).
+//
+inline void mul_mat_q_n_f32_1d_16x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    // (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
+    // a SIMD group in the grid. Each SIMD group produces N_DST values in the
+    // result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
+    // Currently with llama2 7B, im is always 0.
+    // TODO: how to handle im/gqa*(nb*ne0)?
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+                             0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  0*nb*QK4_0/2, d + ib +  0*nb, sumy, yl, il);
+        sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  1*nb*QK4_0/2, d + ib +  1*nb, sumy, yl, il);
+        sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  2*nb*QK4_0/2, d + ib +  2*nb, sumy, yl, il);
+        sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  3*nb*QK4_0/2, d + ib +  3*nb, sumy, yl, il);
+
+        sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  4*nb*QK4_0/2, d + ib +  4*nb, sumy, yl, il);
+        sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  5*nb*QK4_0/2, d + ib +  5*nb, sumy, yl, il);
+        sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  6*nb*QK4_0/2, d + ib +  6*nb, sumy, yl, il);
+        sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  7*nb*QK4_0/2, d + ib +  7*nb, sumy, yl, il);
+
+        sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  8*nb*QK4_0/2, d + ib +  8*nb, sumy, yl, il);
+        sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 +  9*nb*QK4_0/2, d + ib +  9*nb, sumy, yl, il);
+        sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
+        sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
+
+        sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
+        sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
+        sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
+        sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float16 tot = (float16)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
+
+        sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
+        sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
+        sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
+        sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+
+        if (first_row + 8 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
+        }
+        if (first_row + 9 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
+        }
+        if (first_row + 10 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
+        }
+        if (first_row + 11 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
+        }
+
+        if (first_row + 12 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
+        }
+        if (first_row + 13 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
+        }
+        if (first_row + 14 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
+        }
+        if (first_row + 15 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
+
+//------------------------------------------------------------------------------
+// kernel_mul_mat_q4_0_f32_flat_v0
+//------------------------------------------------------------------------------
+inline float block_q_4_0_dot_y_flat_v2(
+    half   x,
+    half   d,
+    float  sumy,
+    float4 yl
+) {
+    uchar2 q = as_uchar2(x);
+    float acc = 0.0f;
+
+    acc += (q.s0 & 0x0F) * yl.s0;
+    acc += (q.s1 & 0x0F) * yl.s1;
+
+    acc += (q.s0 & 0xF0) * yl.s2;
+    acc += (q.s1 & 0xF0) * yl.s3;
+
+    return d * (sumy * -8.f + acc);;
+}
+
+inline float block_q_4_0_dot_y_flat_v4(
+    float  x,
+    half   d,
+    float  sumy,
+    float8 yl
+) {
+    uchar4 q = as_uchar4(x);
+    float acc = 0.0f;
+
+    acc += (q.s0 & 0x0F) * yl.s0;
+    acc += (q.s1 & 0x0F) * yl.s1;
+    acc += (q.s2 & 0x0F) * yl.s2;
+    acc += (q.s3 & 0x0F) * yl.s3;
+
+    acc += (q.s0 & 0xF0) * yl.s4;
+    acc += (q.s1 & 0xF0) * yl.s5;
+    acc += (q.s2 & 0xF0) * yl.s6;
+    acc += (q.s3 & 0xF0) * yl.s7;
+
+    return d * (sumy * -8.f + acc);;
+}
+
+inline float block_q_4_0_dot_y_flat_v8(
+    float2  x,
+    half    d,
+    float   sumy,
+    float16 yl
+) {
+    uchar8 q = as_uchar8(x);
+    float acc = 0.0f;
+
+    acc += (q.s0 & 0x0F) * yl.s0;
+    acc += (q.s1 & 0x0F) * yl.s1;
+    acc += (q.s2 & 0x0F) * yl.s2;
+    acc += (q.s3 & 0x0F) * yl.s3;
+    acc += (q.s4 & 0x0F) * yl.s4;
+    acc += (q.s5 & 0x0F) * yl.s5;
+    acc += (q.s6 & 0x0F) * yl.s6;
+    acc += (q.s7 & 0x0F) * yl.s7;
+
+    acc += (q.s0 & 0xF0) * yl.s8;
+    acc += (q.s1 & 0xF0) * yl.s9;
+    acc += (q.s2 & 0xF0) * yl.sa;
+    acc += (q.s3 & 0xF0) * yl.sb;
+    acc += (q.s4 & 0xF0) * yl.sc;
+    acc += (q.s5 & 0xF0) * yl.sd;
+    acc += (q.s6 & 0xF0) * yl.se;
+    acc += (q.s7 & 0xF0) * yl.sf;
+
+    return d * (sumy * -8.f + acc);;
+}
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define THREADS_PER_BLK 4   // Number of threads per block, or each thread process 1/THREADS_PER_BLK of a block
+#define N_DST           4
+#define N_SIMDGROUP     1
+#define N_SIMDWIDTH     16
+#elif defined (ADRENO_GPU)
+#define THREADS_PER_BLK 4
+#define N_DST           4
+#define N_SIMDGROUP     1
+#define N_SIMDWIDTH     64
+#endif
+
+#if THREADS_PER_BLK == 2                // Each thread processes 1/2 block
+#   define ACT_TY                       float16
+#   define Q_BLK_LD_TY                  float2
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v8
+#elif THREADS_PER_BLK == 4              // Each thread processes 1/4 block
+#   define ACT_TY                       float8
+#   define Q_BLK_LD_TY                  float
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v4
+#elif THREADS_PER_BLK == 8              // Each thread processes 1/8 block
+#   define ACT_TY                       float4
+#   define Q_BLK_LD_TY                  half
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v2
+#endif
+
+#define BTYES_PER_THREAD_IN_BLK         (QK4_0/2/THREADS_PER_BLK)
+
+#if N_DST == 2
+#   define  SUM_TY                      float2
+#elif N_DST == 4
+#   define  SUM_TY                      float4
+#elif N_DST == 8
+#   define  SUM_TY                      float8
+#elif N_DST == 16
+#   define  SUM_TY                      float16
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat_v0(
+        global uchar * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    int ix = get_sub_group_local_id()/THREADS_PER_BLK;
+    int il = get_sub_group_local_id()%THREADS_PER_BLK;
+
+    global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
+
+    // Registers for caching activation
+    ACT_TY yl = 0.f;
+
+    // Registers for caching quants
+    Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
+#endif
+#if N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
+#endif
+
+    // Partial sum
+    SUM_TY sumf = 0.f;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
+        float sumy = 0.f;
+
+        q_blk_0 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 0*nb*QK4_0/2);
+        q_blk_1 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 1*nb*QK4_0/2);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        q_blk_2 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 2*nb*QK4_0/2);
+        q_blk_3 = *(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 3*nb*QK4_0/2);
+#endif
+#if N_DST == 8 || N_DST == 16
+        q_blk_4 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 4*nb*QK4_0/2));
+        q_blk_5 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 5*nb*QK4_0/2));
+        q_blk_6 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 6*nb*QK4_0/2));
+        q_blk_7 = (*(global Q_BLK_LD_TY*)(x + ib*QK4_0/2 + BTYES_PER_THREAD_IN_BLK*il + 7*nb*QK4_0/2));
+#endif
+
+        // Load activation
+#if THREADS_PER_BLK == 2    // Each thread processes 1/2 block
+        yl.s01234567 = *(global float8 *)(yb);
+        yl.s89abcdef = *(global float8 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4;
+        sumy += yl.s5;
+        sumy += yl.s6;
+        sumy += yl.s7;
+        sumy += yl.s8; yl.s8 /= 16.f;
+        sumy += yl.s9; yl.s9 /= 16.f;
+        sumy += yl.sa; yl.sa /= 16.f;
+        sumy += yl.sb; yl.sb /= 16.f;
+        sumy += yl.sc; yl.sc /= 16.f;
+        sumy += yl.sd; yl.sd /= 16.f;
+        sumy += yl.se; yl.se /= 16.f;
+        sumy += yl.sf; yl.sf /= 16.f;
+#elif THREADS_PER_BLK == 4  // Each thread processes 1/4 block
+        yl.s0123 = *(global float4 *)(yb);
+        yl.s4567 = *(global float4 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4; yl.s4 /= 16.f;
+        sumy += yl.s5; yl.s5 /= 16.f;
+        sumy += yl.s6; yl.s6 /= 16.f;
+        sumy += yl.s7; yl.s7 /= 16.f;
+#elif THREADS_PER_BLK == 8  // Each thread processes 1/8 block
+        yl.s01 = *(global float2 *)(yb);
+        yl.s23 = *(global float2 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2; yl.s2 /= 16.f;
+        sumy += yl.s3; yl.s3 /= 16.f;
+#endif
+
+        sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, *(d + ib + 0*nb), sumy, yl);
+        sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, *(d + ib + 1*nb), sumy, yl);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, *(d + ib + 2*nb), sumy, yl);
+        sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, *(d + ib + 3*nb), sumy, yl);
+#endif
+#if N_DST == 8 || N_DST == 16
+        sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, *(d + ib + 4*nb), sumy, yl);
+        sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, *(d + ib + 5*nb), sumy, yl);
+        sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, *(d + ib + 6*nb), sumy, yl);
+        sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, *(d + ib + 7*nb), sumy, yl);
+#endif
+
+        yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
+    }
+
+    SUM_TY tot = (SUM_TY)(
+          sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+#endif
+#if N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
+        , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+#endif
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+#endif
+#if N_DST == 8 || N_DST == 16
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+#endif
+    }
+}
+
+//------------------------------------------------------------------------------
+// Using image1d_buffer_t
+
+#if defined(cl_qcom_subgroup_shuffle)
+#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
+float qcom_sub_group_reduce_add(float sum) {
+    sum += qcom_sub_group_shuffle_down(sum, 32, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum, 16, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  8, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  4, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  2, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    sum += qcom_sub_group_shuffle_down(sum,  1, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.f);
+    return sum;
+}
+#define sub_group_reduce_add qcom_sub_group_reduce_add
+#else
+#define sub_group_reduce_add sub_group_reduce_add
+#endif
+
+#undef THREADS_PER_BLK
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define THREADS_PER_BLK 4   // Number of threads per block, or each thread process 1/THREADS_PER_BLK of a block
+#define N_DST           4
+#define N_SIMDGROUP     1
+#define N_SIMDWIDTH     16
+#elif defined (ADRENO_GPU)
+#define THREADS_PER_BLK 4
+#define N_DST           4
+#define N_SIMDGROUP     1
+#define N_SIMDWIDTH     64
+#endif
+
+#if THREADS_PER_BLK == 2                // Each thread processes 1/2 block
+#   define ACT_TY                       float16
+#   define Q_BLK_LD_TY                  float2
+#   define EXTRACT_BLK_DATA(tmp, part)  *((float2*)&tmp + part)
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v8
+#elif THREADS_PER_BLK == 4              // Each thread processes 1/4 block
+#   define ACT_TY                       float8
+#   define Q_BLK_LD_TY                  float
+#   define EXTRACT_BLK_DATA(tmp, part)  *((float*)&tmp + part)
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v4
+#elif THREADS_PER_BLK == 8              // Each thread processes 1/8 block
+#   define ACT_TY                       float4
+#   define Q_BLK_LD_TY                  half
+#   define EXTRACT_BLK_DATA(tmp, part)  *((half*)&tmp + part)
+#   define block_q_4_0_dot_y_flat       block_q_4_0_dot_y_flat_v2
+#endif
+
+#define BTYES_PER_THREAD_IN_BLK         (QK4_0/2/THREADS_PER_BLK)
+
+#if N_DST == 2
+#   define  SUM_TY                      float2
+#elif N_DST == 4
+#   define  SUM_TY                      float4
+#elif N_DST == 8
+#   define  SUM_TY                      float8
+#elif N_DST == 16
+#   define  SUM_TY                      float16
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mat_q4_0_f32_flat_img_v0(
+        read_only image1d_buffer_t src0_q,
+        read_only image1d_buffer_t src0_d,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    const int nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;
+
+    int ix = get_sub_group_local_id()/THREADS_PER_BLK;
+    int il = get_sub_group_local_id()%THREADS_PER_BLK;
+
+    global float * yb = y + ix*QK4_0 + BTYES_PER_THREAD_IN_BLK*il;
+
+    // Registers for caching activation
+    ACT_TY yl = 0.f;
+
+    // Registers for caching quants
+    Q_BLK_LD_TY q_blk_0 = 0, q_blk_1 = 0;
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_2 = 0, q_blk_3 = 0;
+#endif
+#if N_DST == 8 || N_DST == 16
+    Q_BLK_LD_TY q_blk_4 = 0, q_blk_5 = 0, q_blk_6 = 0, q_blk_7 = 0;
+#endif
+
+    // Partial sum
+    SUM_TY sumf = 0.f;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/THREADS_PER_BLK) {
+        float sumy = 0.f;;
+
+        float4 tmp;
+        tmp = read_imagef(src0_q, offset0_q + ib + 0*nb);
+        q_blk_0 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 1*nb);
+        q_blk_1 = EXTRACT_BLK_DATA(tmp, il);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        tmp = read_imagef(src0_q, offset0_q + ib + 2*nb);
+        q_blk_2 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 3*nb);
+        q_blk_3 = EXTRACT_BLK_DATA(tmp, il);
+#endif
+#if N_DST == 8 || N_DST == 16
+        tmp = read_imagef(src0_q, offset0_q + ib + 4*nb);
+        q_blk_4 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 5*nb);
+        q_blk_5 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 6*nb);
+        q_blk_6 = EXTRACT_BLK_DATA(tmp, il);
+        tmp = read_imagef(src0_q, offset0_q + ib + 7*nb);
+        q_blk_7 = EXTRACT_BLK_DATA(tmp, il);
+#endif
+
+        // Load activation
+#if THREADS_PER_BLK == 2    // Each thread processes 1/2 block
+        yl.s01234567 = *(global float8 *)(yb);
+        yl.s89abcdef = *(global float8 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4;
+        sumy += yl.s5;
+        sumy += yl.s6;
+        sumy += yl.s7;
+        sumy += yl.s8; yl.s8 /= 16.f;
+        sumy += yl.s9; yl.s9 /= 16.f;
+        sumy += yl.sa; yl.sa /= 16.f;
+        sumy += yl.sb; yl.sb /= 16.f;
+        sumy += yl.sc; yl.sc /= 16.f;
+        sumy += yl.sd; yl.sd /= 16.f;
+        sumy += yl.se; yl.se /= 16.f;
+        sumy += yl.sf; yl.sf /= 16.f;
+#elif THREADS_PER_BLK == 4  // Each thread processes 1/4 block
+        yl.s0123 = *(global float4 *)(yb);
+        yl.s4567 = *(global float4 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2;
+        sumy += yl.s3;
+        sumy += yl.s4; yl.s4 /= 16.f;
+        sumy += yl.s5; yl.s5 /= 16.f;
+        sumy += yl.s6; yl.s6 /= 16.f;
+        sumy += yl.s7; yl.s7 /= 16.f;
+#elif THREADS_PER_BLK == 8  // Each thread processes 1/8 block
+        yl.s01 = *(global float2 *)(yb);
+        yl.s23 = *(global float2 *)(yb + 16);
+
+        sumy += yl.s0;
+        sumy += yl.s1;
+        sumy += yl.s2; yl.s2 /= 16.f;
+        sumy += yl.s3; yl.s3 /= 16.f;
+#endif
+
+        sumf.s0 += block_q_4_0_dot_y_flat(q_blk_0, read_imageh(src0_d, offset0_d + ib + 0*nb).s0, sumy, yl);
+        sumf.s1 += block_q_4_0_dot_y_flat(q_blk_1, read_imageh(src0_d, offset0_d + ib + 1*nb).s0, sumy, yl);
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        sumf.s2 += block_q_4_0_dot_y_flat(q_blk_2, read_imageh(src0_d, offset0_d + ib + 2*nb).s0, sumy, yl);
+        sumf.s3 += block_q_4_0_dot_y_flat(q_blk_3, read_imageh(src0_d, offset0_d + ib + 3*nb).s0, sumy, yl);
+#endif
+#if N_DST == 8 || N_DST == 16
+        sumf.s4 += block_q_4_0_dot_y_flat(q_blk_4, read_imageh(src0_d, offset0_d + ib + 4*nb).s0, sumy, yl);
+        sumf.s5 += block_q_4_0_dot_y_flat(q_blk_5, read_imageh(src0_d, offset0_d + ib + 5*nb).s0, sumy, yl);
+        sumf.s6 += block_q_4_0_dot_y_flat(q_blk_6, read_imageh(src0_d, offset0_d + ib + 6*nb).s0, sumy, yl);
+        sumf.s7 += block_q_4_0_dot_y_flat(q_blk_7, read_imageh(src0_d, offset0_d + ib + 7*nb).s0, sumy, yl);
+#endif
+
+        yb += QK4_0 * (N_SIMDWIDTH/THREADS_PER_BLK);
+    }
+
+    SUM_TY tot = (SUM_TY)(
+          sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1)
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
+#endif
+#if N_DST == 8 || N_DST == 16
+        , sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5)
+        , sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+#endif
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+#if N_DST == 4 || N_DST == 8 || N_DST == 16
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+#endif
+#if N_DST == 8 || N_DST == 16
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+#endif
+    }
+}
+
+//------------------------------------------------------------------------------
+// kernel_mul_mv_q6_K_f32
+//------------------------------------------------------------------------------
+
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 1 // number of rows each SIMD group works on
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // SIMD group size
+#elif defined (ADRENO_GPU)
+#define N_DST 1
+#define N_SIMDGROUP 2
+#define N_SIMDWIDTH 64
+#endif
+
+#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q6_K_f32(
+        global void * src0,
+        ulong offset0,
+        global float * src1,
+        ulong offset1,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src0 = (global void*)((global char*)src0 + offset0);
+    src1 = (global float*)((global char*)src1 + offset1);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    uchar kmask1 = 0x03;
+    uchar kmask2 = 0x0C;
+    uchar kmask3 = 0x30;
+    uchar kmask4 = 0xC0;
+
+    int nb = ne00/QK_K;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int row = N_SIMDGROUP * r0 + get_sub_group_id();
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
+    global float      * yy = (global float     *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float sumf = 0;
+
+    // For Q6_K quantization, 16 values forms a subblock, 16 subblock forms a
+    // block. Values in a subblock shares a scale that is quantized with 8 bits;
+    // the entire block shares a single floating point scale.
+    // For work distribution, each thread processes a subblock (16 weights), hence
+    // 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
+    // (super) blocks -- this is the block stride.
+    // The 16 threads that process a (super) block are split into 2 portions, each has
+    // 8 threads; each portion works on 8 subblocks.
+    // For subgroup of 16 threads, the entire subgroup works on a single (super) block
+    // before moving to the next (super) block. Thread0 - thread7 work on the
+    // first 8 subblocks; thread8 - thread15 works on the last 8 subblocks.
+    // Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
+    // subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
+    // works on a total of 16 weight values.
+    int tid  = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
+    int ix   = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+    int ip   = tid/8;   // first or second half of (super) block (0 or 1)
+    int il   = tid%8;   // each half has 8 parts, one per scale
+    int n    = 4;       // 4 scales at a time (and 4 sums)
+    int l0   = n*il;    // offset into half-block, 0..28
+    int is   = 8*ip + l0/16; // 0, 1, 8, 9
+
+    int y_offset = 128*ip + l0;
+    int q_offset_l = 64*ip + l0;
+    int q_offset_h = 32*ip + l0;
+
+    for (int i = ix; i < nb; i += BLOCK_STRIDE) {
+
+        global uint8_t * q1 = x[i].ql + q_offset_l;
+        global uint8_t * q2 = q1 + QK_K/8;
+        global uint8_t * qh = x[i].qh + q_offset_h;
+        global int8_t  * sc = x[i].scales + is;
+
+        global float * y = yy + i * QK_K + y_offset;
+
+        float dall = x[i].d;
+
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+
+        sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[0+64] * ((float)((q1[0]  >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[0+96] * ((float)((q2[0]  >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
+
+        sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[1+64] * ((float)((q1[1]  >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[1+96] * ((float)((q2[1]  >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
+
+        sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[2+64] * ((float)((q1[2]  >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[2+96] * ((float)((q2[2]  >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
+
+        sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
+        sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
+        sums.s2 += y[3+64] * ((float)((q1[3]  >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
+        sums.s3 += y[3+96] * ((float)((q2[3]  >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
+
+        sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
+    }
+
+    float tot = sub_group_reduce_add(sumf);
+    if (get_sub_group_local_id() == 0) {
+        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
new file mode 100644
index 000000000..57768c803
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
@@ -0,0 +1,130 @@
+// src0_q, src0_d, src1 are transposed as a preprocessing step
+// 4-bit weights are transposed in groups of 4 (unsigned short int)
+// consider weights originally "next to each other", now "on top of each other"
+// each fiber computes a 8x4 tile of output elements
+// using unshuffled weights
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+__attribute__((qcom_reqd_sub_group_size("full")))
+kernel void kernel_mul_mat_Ab_Bi_8x4(
+        global const ushort * src0_q,       // quantized A
+        global const half  * src0_d,        // A scales
+        __read_only image1d_buffer_t src1,  // B (1d image)
+        global float * dst,                 // C
+        int m,                              // M
+        int n,                              // N with padding
+        int k,                              // K
+        int n_no_padding                    // N without padding
+) {
+
+    int m_4 = m >> 2;
+    int n_4 = n >> 2;
+
+    int gy = get_global_id(0);
+    int gx = get_global_id(1);
+    int gx_2 = gx << 2;
+
+    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
+    half8 B; // registers for activations
+    half4 dequantized_weights; // registers for dequantized weights
+    __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
+    __global const half* scale_ptr = src0_d + gx_2; // pointer for scales
+
+    for(int i=0; i<k; i+=4){ //loop through K dimension
+
+        B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
+
+        // keep (i/4) and (i/32) in parenthesis, rounds down
+        // load 4 consecutive groups of 4 weights
+        ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
+
+        // load 4 consecutive scales
+        half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
+
+        // j=0
+        dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
+        dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
+        dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=1
+        B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
+        dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
+        dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
+        dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=2
+        B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
+        dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
+        dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
+        dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+
+        // j=3
+        B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
+        B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
+        dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
+        dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
+        dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
+        dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
+        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
+        c1 += B * dequantized_weights.s1;
+        c2 += B * dequantized_weights.s2;
+        c3 += B * dequantized_weights.s3;
+    }
+
+    int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
+
+    // conditional check if store is to a valid location. Required when N is not a multiple of 8
+    // if statements allow registers to be reused for each store
+    // provides a performance boost due to reduced register footprint, which increases number of concurrent waves
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
new file mode 100644
index 000000000..d59a0c05d
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
@@ -0,0 +1,32 @@
+// 16-bit transpose, loading/storing an 8x8 tile of elements
+
+kernel void kernel_transpose_16(
+    __read_only image1d_buffer_t input,
+    __write_only image1d_buffer_t output,
+    const uint rows,
+    const uint cols
+) {
+
+    const int i = get_global_id(0);
+    const int j = get_global_id(1);
+    const int i_3 = i<<3;
+    const int j_3 = j<<3;
+
+    ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
+    ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
+    ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
+    ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
+    ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
+    ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
+    ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
+    ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
+
+    write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
+    write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
+    write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
+    write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
+    write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
+    write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
+    write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
+    write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
new file mode 100644
index 000000000..914ec0193
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
@@ -0,0 +1,25 @@
+// 32-bit transpose, loading/storing a 4x4 tile of elements
+
+kernel void kernel_transpose_32(
+    __read_only image1d_buffer_t input,
+    __write_only image1d_buffer_t output,
+    const uint rows,
+    const uint cols
+) {
+
+    const int i = get_global_id(0);
+    const int j = get_global_id(1);
+    const int i_2 = i<<2;
+    const int j_2 = j<<2;
+
+    float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
+    float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
+    float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
+    float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
+
+    write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
+    write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
+    write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
+    write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
+
+}
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
new file mode 100644
index 000000000..d3bd1fabb
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
@@ -0,0 +1,35 @@
+// 32-bit transpose, loading/storing a 4x4 tile of elements
+// Only used for activations
+// converts to FP16
+// also adds zero padding for non multiple of 8 prompt lengths
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
+
+    const int i = get_global_id(0);
+    const int j = get_global_id(1);
+    const int i_2 = i<<2;
+    const int j_2 = j<<2;
+    half4 temp0 = {0,0,0,0}; // initialize outputs to 0
+    half4 temp1 = {0,0,0,0};
+    half4 temp2 = {0,0,0,0};
+    half4 temp3 = {0,0,0,0};
+
+    if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
+        temp0 = read_imageh(input, (j_2+0)*cols+i);
+    }
+    if((j_2+1)*cols+i*4+3 < rows*cols*16){
+        temp1 = read_imageh(input, (j_2+1)*cols+i);
+    }
+    if((j_2+2)*cols+i*4+3 < rows*cols*16){
+        temp2 = read_imageh(input, (j_2+2)*cols+i);
+    }
+    if((j_2+3)*cols+i*4+3 < rows*cols*16){
+        temp3 = read_imageh(input, (j_2+3)*cols+i);
+    }
+
+    write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
+    write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
+    write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
+    write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
+}
diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
new file mode 100644
index 000000000..7c3e24103
--- /dev/null
+++ b/ggml/src/ggml-opt.cpp
@@ -0,0 +1,854 @@
+#include "ggml-opt.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cinttypes>
+#include <map>
+#include <random>
+#include <vector>
+
+struct ggml_opt_dataset {
+    struct ggml_context   * ctx    = nullptr;
+    ggml_backend_buffer_t   buf    = nullptr;
+    struct ggml_tensor    * data   = nullptr;
+    struct ggml_tensor    * labels = nullptr;
+
+    int64_t ndata       = -1;
+    int64_t ndata_shard = -1;
+    size_t  nbs_data    = -1;
+    size_t  nbs_labels  = -1;
+
+    std::vector<int64_t> permutation;
+};
+
+struct ggml_opt_context {
+    ggml_backend_sched_t    backend_sched        = nullptr;
+    ggml_cgraph           * allocated_graph      = nullptr;
+    ggml_cgraph           * allocated_graph_copy = nullptr;
+    struct ggml_context   * ctx_static           = nullptr;
+    struct ggml_context   * ctx_static_cpu       = nullptr;
+    struct ggml_context   * ctx_compute          = nullptr;
+    struct ggml_context   * ctx_copy             = nullptr;
+    ggml_backend_buffer_t   buf_static           = nullptr;
+    ggml_backend_buffer_t   buf_static_cpu       = nullptr;
+    std::mt19937            rng;
+
+    struct ggml_tensor * inputs  = nullptr;
+    struct ggml_tensor * outputs = nullptr;
+    struct ggml_tensor * labels  = nullptr;
+
+    struct ggml_tensor * loss     = nullptr;
+    struct ggml_tensor * pred     = nullptr;
+    struct ggml_tensor * ncorrect = nullptr;
+
+    struct ggml_cgraph * gf      = nullptr;
+    struct ggml_cgraph * gb_grad = nullptr;
+    struct ggml_cgraph * gb_opt  = nullptr;
+
+    int64_t iter               = 1;
+    int32_t opt_period         = 1;
+    int32_t opt_i              = 0;
+    bool    loss_per_datapoint = false;
+
+    ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+    void * get_opt_pars_ud                     = nullptr;
+    struct ggml_tensor * adamw_params          = nullptr;
+};
+
+struct ggml_opt_result {
+    int64_t              ndata    = 0;
+    std::vector<float>   loss;
+    std::vector<int32_t> pred;
+    int64_t              ncorrect = 0;
+
+    int64_t opt_period         = -1;
+    bool    loss_per_datapoint = false;
+};
+
+// ====== Dataset ======
+
+ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
+    GGML_ASSERT(ne_datapoint >  0);
+    GGML_ASSERT(ne_label     >= 0);
+    GGML_ASSERT(ndata        >  0);
+    GGML_ASSERT(ndata_shard  >  0);
+
+    ggml_opt_dataset_t result = new ggml_opt_dataset;
+    result->ndata       = ndata;
+    result->ndata_shard = ndata_shard;
+
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ 2*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx = ggml_init(params);
+    }
+
+    result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
+    result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
+
+    if (ne_label > 0) {
+        result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
+        result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
+    } else {
+        result->labels = nullptr;
+        result->nbs_labels = 0;
+    }
+
+    result->buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_cpu_buffer_type());
+
+    const int64_t nshards = ndata/ndata_shard;
+    result->permutation.resize(nshards);
+    for (int64_t i = 0; i < nshards; ++i) {
+        result->permutation[i] = i;
+    }
+    return result;
+}
+
+void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
+    ggml_backend_buffer_free(dataset->buf);
+    ggml_free(dataset->ctx);
+    delete dataset;
+}
+
+struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
+    return dataset->data;
+}
+
+struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset) {
+    return dataset->labels;
+}
+
+void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata) {
+    GGML_ASSERT(idata <= dataset->ndata);
+
+    if (idata < 0) {
+        std::shuffle(dataset->permutation.begin(), dataset->permutation.end(), opt_ctx->rng);
+        return;
+    }
+
+    GGML_ASSERT(idata % dataset->ndata_shard == 0);
+    const int64_t ishard_max = idata / dataset->ndata_shard;
+    std::shuffle(dataset->permutation.begin(), dataset->permutation.begin() + ishard_max, opt_ctx->rng);
+}
+
+void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * data_batch, struct ggml_tensor * labels_batch, int64_t ibatch) {
+    GGML_ASSERT(   data_batch && ggml_is_contiguous(data_batch));
+    GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
+    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+
+    const size_t nb_data_batch = ggml_nbytes(data_batch);
+    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
+    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
+
+    if (labels_batch) {
+        const size_t nb_labels_batch = ggml_nbytes(labels_batch);
+        GGML_ASSERT(nb_labels_batch == shards_per_batch*dataset->nbs_labels);
+    }
+
+    GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
+
+    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
+        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
+
+        const char * ptr_data = (const char *) dataset->data->data + ishard*dataset->nbs_data;
+        ggml_backend_tensor_set(data_batch, ptr_data, ishard_batch*dataset->nbs_data, dataset->nbs_data);
+
+        if (!labels_batch) {
+            continue;
+        }
+
+        const char * ptr_labels = (const char *) dataset->labels->data + ishard*dataset->nbs_labels;
+        ggml_backend_tensor_set(labels_batch, ptr_labels, ishard_batch*dataset->nbs_labels, dataset->nbs_labels);
+    }
+}
+
+// ====== Model / Context ======
+
+struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
+    GGML_UNUSED(userdata);
+
+    ggml_opt_optimizer_params result;
+
+    result.adamw.alpha = 0.001f;
+    result.adamw.beta1 = 0.9f;
+    result.adamw.beta2 = 0.999f;
+    result.adamw.eps   = 1e-8f;
+    result.adamw.wd    = 0.0f;
+
+    return result;
+}
+
+struct ggml_opt_params ggml_opt_default_params(
+        ggml_backend_sched_t      backend_sched,
+        struct ggml_context     * ctx_compute,
+        struct ggml_tensor      * inputs,
+        struct ggml_tensor      * outputs,
+        enum ggml_opt_loss_type   loss_type) {
+    return {
+        /*backend_sched   =*/ backend_sched,
+        /*ctx_compute     =*/ ctx_compute,
+        /*inputs          =*/ inputs,
+        /*logits          =*/ outputs,
+        /*loss_type       =*/ loss_type,
+        /*build_type      =*/ GGML_OPT_BUILD_TYPE_OPT,
+        /*opt_period      =*/ 1,
+        /*get_opt_pars    =*/ ggml_opt_get_default_optimizer_params,
+        /*get_opt_pars_ud =*/ nullptr,
+    };
+}
+
+static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
+    if (!tensor) {
+        return nullptr;
+    }
+
+    if (tensor_map.find(tensor) != tensor_map.end()) {
+        return tensor_map[tensor];
+    }
+
+    ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+    tensor_map[tensor] = new_tensor;
+
+    new_tensor->op = tensor->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        new_tensor->nb[i] = tensor->nb[i];
+    }
+    new_tensor->flags = tensor->flags;
+    memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
+    strcpy(new_tensor->name, tensor->name);
+    new_tensor->data = tensor->data;
+    new_tensor->buffer = tensor->buffer;
+    new_tensor->extra = tensor->extra;
+    new_tensor->view_offs = tensor->view_offs;
+    new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
+    }
+
+    return new_tensor;
+}
+
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
+    std::map<ggml_tensor *, ggml_tensor *> tensor_map;
+
+    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);
+
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
+    }
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
+    }
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+    for (int i = 0; i < src->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+        dst->grads[igrad_dst]     = src->grads[igrad_src];
+        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
+    }
+
+    return dst;
+}
+
+static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
+    GGML_ASSERT(graph);
+    if (opt_ctx->allocated_graph == graph) {
+        return;
+    }
+
+    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+
+    {
+        ggml_init_params params = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_free(opt_ctx->ctx_copy);
+        opt_ctx->ctx_copy = ggml_init(params);
+    }
+
+    opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+
+    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->allocated_graph = graph;
+}
+
+ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
+    ggml_opt_context_t result = new struct ggml_opt_context;
+    result->backend_sched   = params.backend_sched;
+    result->ctx_compute     = params.ctx_compute;
+    result->inputs          = params.inputs;
+    result->outputs         = params.outputs;
+    result->opt_period      = params.opt_period;
+    result->get_opt_pars    = params.get_opt_pars;
+    result->get_opt_pars_ud = params.get_opt_pars_ud;
+
+    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
+    GGML_ASSERT(result->opt_period >= 1);
+
+    const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
+        (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
+
+    ggml_set_input(result->inputs);
+    ggml_set_output(result->outputs);
+
+    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
+    ggml_build_forward_expand(result->gf, result->outputs);
+
+    int n_param = 0;
+    for (int i = 0; i < result->gf->n_nodes; ++i) {
+        if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
+            n_param++;
+        }
+    }
+
+    {
+        // The static context is used for:
+        //   - gradients (1 tensor per param if using gradient accumulation)
+        //   - optimizer momenta (2 tensors per param)
+        //   - labels
+        //   - loss + its gradient (up to 5 tensors)
+        //   - pred
+        //   - ncorrect (2 tensors).
+        const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+        const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ size_meta,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx_static = ggml_init(params);
+    }
+    {
+        // The static cpu context is used for:
+        //   - optimizer parameters (1 for the entire context)
+        const size_t size_meta = 1 * ggml_tensor_overhead();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ size_meta,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx_static_cpu = ggml_init(params);
+    }
+
+
+    switch (params.loss_type) {
+        case GGML_OPT_LOSS_TYPE_MEAN: {
+            result->loss = ggml_sum(result->ctx_static, result->outputs);
+            ggml_set_name(result->loss, "loss_sum");
+            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
+            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
+            ggml_set_name(result->loss, "loss_mean");
+            result->loss_per_datapoint = true;
+            break;
+        }
+        case GGML_OPT_LOSS_TYPE_SUM: {
+            result->loss = ggml_sum(result->ctx_static, result->outputs);
+            ggml_set_name(result->loss, "loss_sum");
+            result->loss_per_datapoint = false;
+            break;
+        }
+        case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
+            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
+            ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
+            result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
+            ggml_set_name(result->loss, "loss_cross_entropy");
+            if (result->opt_period > 1) {
+                result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
+                ggml_set_name(result->loss, "loss_cross_entropy_scaled");
+            }
+            result->loss_per_datapoint = true;
+            break;
+        }
+        case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
+            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
+            ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
+            result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
+            ggml_set_name(result->loss, "loss_error");
+            result->loss = ggml_sqr(result->ctx_static, result->loss);
+            ggml_set_name(result->loss, "loss_squared_error");
+            result->loss = ggml_sum(result->ctx_static, result->loss);
+            ggml_set_name(result->loss, "loss_sum_squared_error");
+            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
+            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
+            ggml_set_name(result->loss, "loss_mean_squared_error");
+            result->loss_per_datapoint = true;
+            break;
+        }
+    }
+    ggml_set_output(result->loss);
+    ggml_set_loss(result->loss);
+    ggml_build_forward_expand(result->gf, result->loss);
+
+    result->pred = ggml_argmax(result->ctx_static, result->outputs);
+    ggml_set_name(result->pred, "pred");
+    ggml_set_output(result->pred);
+    ggml_build_forward_expand(result->gf, result->pred);
+
+    if (result->labels) {
+        result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
+        ggml_set_name(result->ncorrect, "ncorrect");
+        ggml_set_output(result->ncorrect);
+        ggml_build_forward_expand(result->gf, result->ncorrect);
+    } else {
+        result->ncorrect = nullptr;
+    }
+
+    if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
+        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+        return result;
+    }
+
+    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
+    result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
+    ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
+
+    if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
+        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+        ggml_graph_reset(result->gb_grad);
+        return result;
+    }
+
+    GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
+
+    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
+    result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
+
+    result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
+    ggml_set_input(result->adamw_params);
+    ggml_set_name(result->adamw_params, "adamw_params");
+
+    for (int i = result->gf->n_nodes-1; i >= 0; --i) {
+        struct ggml_tensor * node = result->gb_opt->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
+
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
+            struct ggml_tensor * m        = ggml_dup_tensor(result->ctx_static, node);
+            struct ggml_tensor * v        = ggml_dup_tensor(result->ctx_static, node);
+            struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
+            ggml_build_forward_expand(result->gb_opt, opt_step);
+        }
+    }
+
+    result->buf_static = ggml_backend_alloc_ctx_tensors(
+        result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+
+    result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
+
+    ggml_graph_reset(result->gb_opt);
+
+    return result;
+}
+
+void ggml_opt_free(ggml_opt_context_t opt_ctx) {
+    if (opt_ctx == nullptr) {
+        return;
+    }
+    ggml_backend_buffer_free(opt_ctx->buf_static);
+    ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
+    ggml_free(opt_ctx->ctx_static);
+    ggml_free(opt_ctx->ctx_static_cpu);
+    delete opt_ctx;
+}
+
+void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) {
+    if (optimizer) {
+        ggml_graph_reset(opt_ctx->gb_opt);
+        opt_ctx->iter = 1;
+    } else {
+        ggml_graph_reset(opt_ctx->gb_grad);
+    }
+}
+
+struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->inputs;
+}
+
+struct ggml_tensor * ggml_opt_outputs(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->outputs;
+}
+
+struct ggml_tensor * ggml_opt_labels(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->labels;
+}
+
+struct ggml_tensor * ggml_opt_loss(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->loss;
+}
+
+struct ggml_tensor * ggml_opt_pred(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->pred;
+}
+
+struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->ncorrect;
+}
+
+struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node) {
+    return ggml_graph_get_grad_acc(opt_ctx->gb_opt, node);
+}
+
+// ====== Optimization Result ======
+
+ggml_opt_result_t ggml_opt_result_init() {
+    return new ggml_opt_result;
+}
+
+void ggml_opt_result_free(ggml_opt_result_t result) {
+    delete result;
+}
+
+void ggml_opt_result_reset(ggml_opt_result_t result) {
+    result->ndata = 0;
+    result->loss.clear();
+    result->pred.clear();
+    result->ncorrect = 0;
+}
+
+void ggml_opt_result_ndata(ggml_opt_result_t result, int64_t * ndata) {
+    *ndata = result->ndata;
+}
+
+void ggml_opt_result_loss(ggml_opt_result_t result, double * loss, double * unc) {
+    const int64_t nbatches = result->loss.size(); // Number of physical batches.
+
+    if (nbatches == 0) {
+        *loss = 0.0;
+        *unc  = NAN;
+        return;
+    }
+
+    double sum         = 0.0;
+    double sum_squared = 0.0;
+
+    for (const float & loss : result->loss) {
+        // If the loss is per datapoint it was scaled by 1.0f/opt_period for each physical batch.
+        const float loss_scaled = result->loss_per_datapoint ? loss*result->opt_period : loss;
+        sum         += loss_scaled;
+        sum_squared += loss_scaled*loss_scaled;
+    }
+
+    const double mean = sum/nbatches;
+    *loss = result->loss_per_datapoint ? mean : sum;
+
+    if (!unc) {
+        return;
+    }
+
+    if (nbatches < 2) {
+        *unc = NAN;
+        return;
+    }
+
+    const double var_sum = sum_squared/nbatches - mean*mean; // variance without Bessel's correction, i.e. nbatches/(nbatches-1)
+    *unc = result->loss_per_datapoint ? sqrt(var_sum / (nbatches - 1)) : sqrt(var_sum * nbatches/(nbatches - 1));
+}
+
+void ggml_opt_result_pred(ggml_opt_result_t result, int32_t * pred) {
+    for (size_t i = 0; i < result->pred.size(); ++i) {
+        pred[i] = result->pred[i];
+    }
+}
+
+void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc) {
+    *accuracy = result->ncorrect >= 0 ? double(result->ncorrect) / double(result->ndata) : NAN;
+
+    if (!unc) {
+        return;
+    }
+
+    *unc = result->ncorrect >= 0 && result->ndata >= 2 ?
+        sqrt((*accuracy) * (1.0 - (*accuracy)) / double(result->ndata - 1)) : NAN;
+}
+
+// ====== Computation ======
+
+static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
+    if (graph != opt_ctx->gf) {
+        struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
+
+        GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
+        GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
+        GGML_ASSERT(opt_pars.adamw.eps   >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.wd    >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.wd    <= 1.0f);
+
+        // beta1, beta2 after applying warmup
+        const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
+        const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
+
+        float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
+        adamw_par_data[0] = opt_pars.adamw.alpha;
+        adamw_par_data[1] = opt_pars.adamw.beta1;
+        adamw_par_data[2] = opt_pars.adamw.beta2;
+        adamw_par_data[3] = opt_pars.adamw.eps;
+        adamw_par_data[4] = opt_pars.adamw.wd;
+        adamw_par_data[5] = beta1h;
+        adamw_par_data[6] = beta2h;
+    }
+
+    ggml_opt_alloc_graph(opt_ctx, graph);
+    ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
+
+    if (!result) {
+        return;
+    }
+
+    if (result->ndata == 0) {
+        result->loss_per_datapoint = opt_ctx->loss_per_datapoint;
+        result->opt_period         = opt_ctx->opt_period;
+    } else {
+        GGML_ASSERT(result->loss_per_datapoint == opt_ctx->loss_per_datapoint);
+        GGML_ASSERT(result->opt_period         == opt_ctx->opt_period);
+    }
+
+    const int64_t ndata = opt_ctx->outputs->ne[1];
+    GGML_ASSERT(result->ndata == ndata*int64_t(result->loss.size()) && "varying batch size not supported");
+    result->ndata += ndata;
+
+    GGML_ASSERT(ggml_is_scalar(opt_ctx->loss));
+    GGML_ASSERT(opt_ctx->loss->type == GGML_TYPE_F32);
+    float loss;
+    ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
+    result->loss.push_back(loss);
+
+    GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
+    std::vector<int32_t> pred(ndata);
+    ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
+    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+
+    if (!opt_ctx->labels || result->ncorrect < 0) {
+        result->ncorrect = -1;
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_scalar(opt_ctx->ncorrect));
+    GGML_ASSERT(opt_ctx->ncorrect->type == GGML_TYPE_I64);
+    int64_t ncorrect;
+    ggml_backend_tensor_get(opt_ctx->ncorrect, &ncorrect, 0, ggml_nbytes(opt_ctx->ncorrect));
+    result->ncorrect += ncorrect;
+}
+
+void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
+    ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
+}
+
+void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
+    if (opt_ctx->opt_period == 1) {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
+        return;
+    }
+
+    const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+    if (opt_i_next == 0) {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
+        ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
+    } else {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
+    }
+    opt_ctx->opt_i = opt_i_next;
+}
+
+// ====== High-Level Functions ======
+
+void ggml_opt_epoch(
+        ggml_opt_context_t      opt_ctx,
+        ggml_opt_dataset_t      dataset,
+        ggml_opt_result_t       result_train,
+        ggml_opt_result_t       result_eval,
+        int64_t                 idata_split,
+        ggml_opt_epoch_callback callback_train,
+        ggml_opt_epoch_callback callback_eval) {
+    struct ggml_tensor * inputs = ggml_opt_inputs(opt_ctx);
+    struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
+    struct ggml_tensor * data   = ggml_opt_dataset_data(dataset);
+    GGML_ASSERT(data->ne[0] == inputs->ne[0]);
+
+    const int64_t ndata       =   data->ne[1];
+    const int64_t ndata_batch = inputs->ne[1];
+
+    GGML_ASSERT(data->ne[1] % inputs->ne[1] == 0);
+    const int64_t nbatches = ndata/ndata_batch;
+
+    idata_split = idata_split < 0 ? ndata : idata_split;
+    GGML_ASSERT(idata_split % ndata_batch == 0);
+    const int64_t ibatch_split = idata_split / ndata_batch;
+
+    int64_t ibatch = 0;
+    int64_t t_loop_start = ggml_time_us();
+    for (; ibatch < ibatch_split; ++ibatch) {
+        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
+        ggml_opt_forward_backward(opt_ctx, result_train);
+        if (callback_train) {
+            callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
+        }
+    }
+    t_loop_start = ggml_time_us();
+    for (; ibatch < nbatches; ++ibatch) {
+        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
+        ggml_opt_forward(opt_ctx, result_eval);
+        if (callback_eval) {
+            callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
+        }
+    }
+}
+
+void ggml_opt_epoch_callback_progress_bar(
+        bool               train,
+        ggml_opt_context_t opt_ctx,
+        ggml_opt_dataset_t dataset,
+        ggml_opt_result_t  result,
+        int64_t            ibatch,
+        int64_t            ibatch_max,
+        int64_t            t_start_us) {
+    fprintf(stderr, "%s[", train ? "train: " : "val:   ");
+
+    constexpr int64_t bar_length = 25;
+    for (int64_t j = 0; j < bar_length; ++j) {
+        const int64_t ibatch_j = ibatch_max * j/bar_length;
+        if (ibatch_j < ibatch) {
+            fprintf(stderr, "=");
+        } else if (ibatch_max * (j - 1)/bar_length < ibatch) {
+            fprintf(stderr, ">");
+        } else {
+            fprintf(stderr, " ");
+        }
+    }
+
+    const int64_t batch_size = ggml_opt_inputs(opt_ctx)->ne[1];
+    const int64_t idata      = ibatch*batch_size;
+    const int64_t idata_max  = ibatch_max*batch_size;
+
+    double loss;
+    double loss_unc;
+    ggml_opt_result_loss(result, &loss, &loss_unc);
+
+    double accuracy;
+    double accuracy_unc;
+    ggml_opt_result_accuracy(result, &accuracy, &accuracy_unc);
+
+    const int64_t t_ibatch_us = ggml_time_us() - t_start_us;
+    int64_t t_ibatch_s = t_ibatch_us / 1000000;
+    const int64_t t_ibatch_h = t_ibatch_s / 3600;
+    t_ibatch_s -= t_ibatch_h * 3600;
+    const int64_t t_ibatch_m = t_ibatch_s / 60;
+    t_ibatch_s -= t_ibatch_m * 60;
+
+    const int64_t t_eta_us = t_ibatch_us * (ibatch_max - ibatch)/ibatch;
+    int64_t t_eta_s = t_eta_us / 1000000;
+    const int64_t t_eta_h = t_eta_s / 3600;
+    t_eta_s -= t_eta_h * 3600;
+    const int64_t t_eta_m = t_eta_s / 60;
+    t_eta_s -= t_eta_m * 60;
+
+    fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
+            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
+            idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
+            t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
+    if (ibatch == ibatch_max) {
+        fprintf(stderr, "\n");
+    }
+    fflush(stderr);
+
+    GGML_UNUSED(dataset);
+}
+
+void ggml_opt_fit(
+        ggml_backend_sched_t            backend_sched,
+        ggml_context                  * ctx_compute,
+        ggml_tensor                   * inputs,
+        ggml_tensor                   * outputs,
+        ggml_opt_dataset_t              dataset,
+        enum ggml_opt_loss_type         loss_type,
+        ggml_opt_get_optimizer_params   get_opt_pars,
+        int64_t                         nepoch,
+        int64_t                         nbatch_logical,
+        float                           val_split,
+        bool                            silent) {
+    ggml_time_init();
+    const int64_t t_start_us = ggml_time_us();
+
+    const int64_t ndata           = ggml_opt_dataset_data(dataset)->ne[1];
+    const int64_t nbatch_physical = inputs->ne[1];
+    GGML_ASSERT(ndata          % nbatch_logical  == 0);
+    GGML_ASSERT(nbatch_logical % nbatch_physical == 0);
+
+    const int64_t opt_period       = nbatch_logical / nbatch_physical;
+    const int64_t nbatches_logical = ndata / nbatch_logical;
+
+    GGML_ASSERT(val_split >= 0.0f);
+    GGML_ASSERT(val_split <  1.0f);
+    const int64_t ibatch_split = int64_t(((1.0f - val_split) * nbatches_logical)) * opt_period; // train <-> val split index (physical)
+    const int64_t idata_split  = ibatch_split * nbatch_physical;
+
+    int64_t epoch = 1;
+
+    ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    params.opt_period      = opt_period;
+    params.get_opt_pars    = get_opt_pars;
+    params.get_opt_pars_ud = &epoch;
+    ggml_opt_context_t opt_ctx = ggml_opt_init(params);
+
+    // Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
+    if (nbatch_logical < ndata) {
+        ggml_opt_dataset_shuffle(opt_ctx, dataset, -1); // Shuffle all data (train + validation).
+    }
+
+    ggml_opt_result_t result_train = ggml_opt_result_init();
+    ggml_opt_result_t result_val   = ggml_opt_result_init();
+
+    ggml_opt_epoch_callback epoch_callback = silent ? nullptr : ggml_opt_epoch_callback_progress_bar;
+
+    for (; epoch <= nepoch; ++epoch) {
+        if (nbatch_logical < idata_split) {
+            ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split);
+        }
+
+        ggml_opt_result_reset(result_train);
+        ggml_opt_result_reset(result_val);
+
+        if (!silent) {
+            fprintf(stderr, "%s: epoch %04" PRId64 "/%04" PRId64 ":\n", __func__, epoch, nepoch);
+        }
+        ggml_opt_epoch(opt_ctx, dataset, result_train, result_val, idata_split, epoch_callback, epoch_callback);
+        if (!silent) {
+            fprintf(stderr, "\n");
+        }
+    }
+
+    if (!silent) {
+        int64_t t_total_s = (ggml_time_us() - t_start_us) / 1000000;
+        const int64_t t_total_h = t_total_s / 3600;
+        t_total_s -= t_total_h * 3600;
+        const int64_t t_total_m = t_total_s / 60;
+        t_total_s -= t_total_m * 60;
+        fprintf(stderr, "%s: training took %02" PRId64 ":%02" PRId64 ":%02" PRId64 "\n", __func__, t_total_h, t_total_m, t_total_s);
+    }
+
+    ggml_opt_free(opt_ctx);
+    ggml_opt_result_free(result_train);
+    ggml_opt_result_free(result_val);
+}
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
new file mode 100644
index 000000000..7918388ae
--- /dev/null
+++ b/ggml/src/ggml-quants.c
@@ -0,0 +1,5238 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// reference implementation for deterministic creation of model files
+void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+    static const int qk = QK4_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max  = v;
+            }
+        }
+
+        const float d  = max / -8;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+            y[i].qs[j]  = xi0;
+            y[i].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
+void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
+    const int qk = QK4_1;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+
+            if (v < min) min = v;
+            if (v > max) max = v;
+        }
+
+        const float d  = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = (x[i*qk + 0    + j] - min)*id;
+            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
+
+            y[i].qs[j]  = xi0;
+            y[i].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
+void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
+    static const int qk = QK5_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max  = v;
+            }
+        }
+
+        const float d  = max / -16;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qh = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+
+            const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
+            const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
+
+            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
+        }
+
+        memcpy(&y[i].qh, &qh, sizeof(qh));
+    }
+}
+
+void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
+    const int qk = QK5_1;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+
+            if (v < min) min = v;
+            if (v > max) max = v;
+        }
+
+        const float d  = (max - min) / ((1 << 5) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
+
+        uint32_t qh = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = (x[i*qk + 0    + j] - min)*id;
+            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
+
+            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
+        }
+
+        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
+    }
+}
+
+// reference implementation for deterministic creation of model files
+void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK8_0; j++) {
+            const float v = x[i*QK8_0 + j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < QK8_0; ++j) {
+            const float x0 = x[i*QK8_0 + j]*id;
+
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+}
+
+// reference implementation for deterministic creation of model files
+void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
+    assert(QK8_1 == 32);
+    assert(k % QK8_1 == 0);
+    const int nb = k / QK8_1;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK8_1; j++) {
+            const float v = x[i*QK8_1 + j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        int sum = 0;
+
+        for (int j = 0; j < QK8_1/2; ++j) {
+            const float v0 = x[i*QK8_1           + j]*id;
+            const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
+
+            y[i].qs[          j] = roundf(v0);
+            y[i].qs[QK8_1/2 + j] = roundf(v1);
+
+            sum += y[i].qs[          j];
+            sum += y[i].qs[QK8_1/2 + j];
+        }
+
+        y[i].s = GGML_FP32_TO_FP16(sum*d);
+    }
+}
+
+void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK4_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int x0 = (x[i].qs[j] & 0x0F) - 8;
+            const int x1 = (x[i].qs[j] >>   4) - 8;
+
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
+        }
+    }
+}
+
+void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK4_1;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int x0 = (x[i].qs[j] & 0x0F);
+            const int x1 = (x[i].qs[j] >>   4);
+
+            y[i*qk + j + 0   ] = x0*d + m;
+            y[i*qk + j + qk/2] = x1*d + m;
+        }
+    }
+}
+
+void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK5_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        uint32_t qh;
+        memcpy(&qh, x[i].qh, sizeof(qh));
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
+            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
+
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
+        }
+    }
+}
+
+void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK5_1;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
+
+        uint32_t qh;
+        memcpy(&qh, x[i].qh, sizeof(qh));
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+            const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
+            const int x1 = (x[i].qs[j] >>   4) | xh_1;
+
+            y[i*qk + j + 0   ] = x0*d + m;
+            y[i*qk + j + qk/2] = x1*d + m;
+        }
+    }
+}
+
+void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK8_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk; ++j) {
+            y[i*qk + j] = x[i].qs[j]*d;
+        }
+    }
+}
+
+//
+// 2-6 bit quantization in super-blocks
+//
+
+//
+// ===================== Helper functions
+//
+static inline int nearest_int(float fval) {
+    assert(fabsf(fval) <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
+        const float * restrict qw) {
+    float max = 0;
+    float amax = 0;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.f;
+    }
+    float iscale = -nmax / max;
+    if (rmse_type == 0) {
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
+        }
+        return 1/iscale;
+    }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
+    float sumlx = 0;
+    float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 0; i < n; ++i) {
+#else
+    for (int i = 0; i < n; ++i) {
+#endif
+        int l = nearest_int(iscale * x[i]);
+        l = MAX(-nmax, MIN(nmax-1, l));
+        L[i] = l + nmax;
+        float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
+        sumlx += w*x[i]*l;
+        suml2 += w*l*l;
+    }
+    float scale = suml2 ? sumlx/suml2 : 0.0f;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
+    float best = scale * sumlx;
+    for (int is = -9; is <= 9; ++is) {
+        if (is == 0) {
+            continue;
+        }
+        iscale = -(nmax + 0.1f*is) / max;
+        sumlx = suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = MAX(-nmax, MIN(nmax-1, l));
+            float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i]));
+            sumlx += w*x[i]*l;
+            suml2 += w*l*l;
+        }
+        if (suml2 > 0 && sumlx*sumlx > best*suml2) {
+            for (int i = 0; i < n; ++i) {
+                int l = nearest_int(iscale * x[i]);
+                L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
+            }
+            scale = sumlx/suml2; best = scale*sumlx;
+        }
+    }
+    return scale;
+}
+
+static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
+    float max = 0;
+    float amax = 0;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) { L[i] = 0; }
+        return 0.f;
+    }
+    float iscale = -nmax / max;
+    if (do_rmse) {
+        float sumlx = 0;
+        float suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = MAX(-nmax, MIN(nmax-1, l));
+            L[i] = l;
+            float w = x[i]*x[i];
+            sumlx += w*x[i]*l;
+            suml2 += w*l*l;
+        }
+        for (int itry = 0; itry < 5; ++itry) {
+            int n_changed = 0;
+            for (int i = 0; i < n; ++i) {
+                float w = x[i]*x[i];
+                float slx = sumlx - w*x[i]*L[i];
+                if (slx > 0) {
+                    float sl2 = suml2 - w*L[i]*L[i];
+                    int new_l = nearest_int(x[i] * sl2 / slx);
+                    new_l = MAX(-nmax, MIN(nmax-1, new_l));
+                    if (new_l != L[i]) {
+                        slx += w*x[i]*new_l;
+                        sl2 += w*new_l*new_l;
+                        if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
+                            L[i] = new_l; sumlx = slx; suml2 = sl2;
+                            ++n_changed;
+                        }
+                    }
+                }
+            }
+            if (!n_changed) {
+                break;
+            }
+        }
+        for (int i = 0; i < n; ++i) {
+            L[i] += nmax;
+        }
+        return sumlx / suml2;
+    }
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale * x[i]);
+        l = MAX(-nmax, MIN(nmax-1, l));
+        L[i] = l + nmax;
+    }
+    return 1/iscale;
+}
+
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
+    float min = x[0];
+    float max = x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+    }
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = 0;
+        return 0.f;
+    }
+    if (min > 0) min = 0;
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    for (int itry = 0; itry < ntry; ++itry) {
+        float sumlx = 0; int suml2 = 0;
+        bool did_change = false;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            if (l != L[i]) {
+                L[i] = l;
+                did_change = true;
+            }
+            sumlx += (x[i] - min)*l;
+            suml2 += l*l;
+        }
+        scale = sumlx/suml2;
+        float sum = 0;
+        for (int i = 0; i < n; ++i) {
+            sum += x[i] - scale*L[i];
+        }
+        min = alpha*min + (1 - alpha)*sum/n;
+        if (min > 0) min = 0;
+        iscale = 1/scale;
+        if (!did_change) break;
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
+    for (int i = 1; i < n; ++i) {
+#endif
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
+    if (j < 4) {
+        *d = q[j] & 63; *m = q[j + 4] & 63;
+    } else {
+        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+
+//========================- 2-bit (de)-quantization
+
+void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float   weights[16];
+    float mins[QK_K/16];
+    float scales[QK_K/16];
+
+    const float q4scale = 15.f;
+
+    for (int i = 0; i < nb; i++) {
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
+
+        if (max_scale > 0) {
+            float iscale = q4scale/max_scale;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(iscale*scales[j]);
+                y[i].scales[j] = l;
+            }
+            y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
+        } else {
+            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
+            y[i].d = GGML_FP32_TO_FP16(0.f);
+        }
+        if (max_min > 0) {
+            float iscale = q4scale/max_min;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(iscale*mins[j]);
+                y[i].scales[j] |= (l << 4);
+            }
+            y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
+        } else {
+            y[i].dmin = GGML_FP32_TO_FP16(0.f);
+        }
+        for (int j = 0; j < QK_K/16; ++j) {
+            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int((x[16*j + ii] + dm)/d);
+                l = MAX(0, MIN(3, l));
+                L[16*j + ii] = l;
+            }
+        }
+
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+            }
+        }
+
+        x += QK_K;
+    }
+}
+
+void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float min = GGML_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * q = x[i].qs;
+
+        int is = 0;
+        float dl, ml;
+        for (int n = 0; n < QK_K; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+
+                uint8_t sc = x[i].scales[is++];
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+
+                sc = x[i].scales[is++];
+                dl = d * (sc & 0xF); ml = min * (sc >> 4);
+                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+
+                shift += 2;
+            }
+            q += 32;
+        }
+    }
+}
+
+static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights ? weights[0] : x[0]*x[0];
+    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
+    for (int i = 1; i < n; ++i) {
+#endif
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights ? weights[i] : x[i]*x[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) {
+        min = 0;
+    }
+    if (max <= min) {
+        memset(L, 0, n);
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff*diff;
+        float w = weights ? weights[i] : x[i]*x[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights ? weights[i] : x[i]*x[i];
+            sum_l  += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff*diff;
+                float w = weights ? weights[i] : x[i]*x[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
+static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
+    float max = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+    }
+    if (!max) { // all zero
+        for (int i = 0; i < n; ++i) { L[i] = 0; }
+        return 0.f;
+    }
+    float iscale = nmax / max;
+    for (int i = 0; i < n; ++i) {
+        L[i] = nearest_int(iscale * x[i]);
+    }
+    float scale = 1/iscale;
+    float best_mse = 0;
+    for (int i = 0; i < n; ++i) {
+        float diff = x[i] - scale*L[i];
+        float w = quant_weights[i];
+        best_mse += w*diff*diff;
+    }
+    for (int is = -4; is <= 4; ++is) {
+        if (is == 0) continue;
+        float iscale_is = (0.1f*is + nmax)/max;
+        float scale_is = 1/iscale_is;
+        float mse = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale_is*x[i]);
+            l = MIN(nmax, l);
+            float diff = x[i] - scale_is*l;
+            float w = quant_weights[i];
+            mse += w*diff*diff;
+        }
+        if (mse < best_mse) {
+            best_mse = mse;
+            iscale = iscale_is;
+        }
+    }
+    float sumlx = 0;
+    float suml2 = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale * x[i]);
+        l = MIN(nmax, l);
+        L[i] = l;
+        float w = quant_weights[i];
+        sumlx += w*x[i]*l;
+        suml2 += w*l*l;
+    }
+    for (int itry = 0; itry < 5; ++itry) {
+        int n_changed = 0;
+        for (int i = 0; i < n; ++i) {
+            float w = quant_weights[i];
+            float slx = sumlx - w*x[i]*L[i];
+            float sl2 = suml2 - w*L[i]*L[i];
+            if (slx > 0 && sl2 > 0) {
+                int new_l = nearest_int(x[i] * sl2 / slx);
+                new_l = MIN(nmax, new_l);
+                if (new_l != L[i]) {
+                    slx += w*x[i]*new_l;
+                    sl2 += w*new_l*new_l;
+                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
+                        L[i] = new_l; sumlx = slx; suml2 = sl2;
+                        ++n_changed;
+                    }
+                }
+            }
+        }
+        if (!n_changed) {
+            break;
+        }
+    }
+    return sumlx/suml2;
+}
+
+static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
+    GGML_ASSERT(quant_weights);
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+    const bool requantize = true;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float mins[QK_K/16];
+    float scales[QK_K/16];
+    float sw[QK_K/16];
+    float weight[16];
+    uint8_t Ls[QK_K/16], Lm[QK_K/16];
+
+    for (int i = 0; i < nb; i++) {
+        memset(sw, 0, QK_K/16*sizeof(float));
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
+        float sigma2 = sumx2/QK_K;
+        for (int j = 0; j < QK_K/16; ++j) {
+            const float * restrict qw = quant_weights + QK_K * i + 16*j;
+            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+            for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
+            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+        }
+
+        float dm, mm;
+        dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
+        mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
+
+        y[i].d    = GGML_FP32_TO_FP16(dm);
+        y[i].dmin = GGML_FP32_TO_FP16(mm);
+        dm        = GGML_FP16_TO_FP32(y[i].d);
+        mm        = GGML_FP16_TO_FP32(y[i].dmin);
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
+        }
+
+        if (requantize) {
+            for (int j = 0; j < QK_K/16; ++j) {
+                const float d = dm * (y[i].scales[j] & 0xF);
+                if (!d) continue;
+                const float m = mm * (y[i].scales[j] >> 4);
+                for (int ii = 0; ii < 16; ++ii) {
+                    int l = nearest_int((x[16*j + ii] + m)/d);
+                    l = MAX(0, MIN(3, l));
+                    L[16*j + ii] = l;
+                }
+            }
+        }
+
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+            }
+        }
+
+        x += QK_K;
+    }
+}
+
+size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int64_t row = 0; row < nrow; ++row) {
+            quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
+//========================= 3-bit (de)-quantization
+
+void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    int8_t L[QK_K];
+    float scales[QK_K / 16];
+
+    for (int i = 0; i < nb; i++) {
+
+        float max_scale = 0;
+        float amax = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
+            float scale = fabsf(scales[j]);
+            if (scale > amax) {
+                amax = scale; max_scale = scales[j];
+            }
+        }
+
+        memset(y[i].scales, 0, 12);
+        if (max_scale) {
+            float iscale = -32.f/max_scale;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int8_t l = nearest_int(iscale*scales[j]);
+                l = MAX(-32, MIN(31, l)) + 32;
+                if (j < 8) {
+                    y[i].scales[j] = l & 0xF;
+                } else {
+                    y[i].scales[j-8] |= ((l & 0xF) << 4);
+                }
+                l >>= 4;
+                y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
+            }
+            y[i].d = GGML_FP32_TO_FP16(1/iscale);
+        } else {
+            y[i].d = GGML_FP32_TO_FP16(0.f);
+        }
+
+        int8_t sc;
+        for (int j = 0; j < QK_K/16; ++j) {
+            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
+            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
+            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) {
+                continue;
+            }
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-4, MIN(3, l));
+                L[16*j + ii] = l + 4;
+            }
+        }
+
+        memset(y[i].hmask, 0, QK_K/8);
+        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
+        int m = 0;
+        uint8_t hm = 1;
+        for (int j = 0; j < QK_K; ++j) {
+            if (L[j] > 3) {
+                y[i].hmask[m] |= hm;
+                L[j] -= 4;
+            }
+            if (++m == QK_K/8) {
+                m = 0; hm <<= 1;
+            }
+        }
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+            }
+        }
+
+        x += QK_K;
+    }
+}
+
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    uint32_t aux[4];
+    const int8_t * scales = (const int8_t*)aux;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict q = x[i].qs;
+        const uint8_t * restrict hm = x[i].hmask;
+        uint8_t m = 1;
+
+        memcpy(aux, x[i].scales, 12);
+        uint32_t tmp = aux[2];
+        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+
+        int is = 0;
+        float dl;
+        for (int n = 0; n < QK_K; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
+                }
+
+                dl = d_all * (scales[is++] - 32);
+                for (int l = 0; l < 16; ++l) {
+                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
+                }
+
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+
+    }
+}
+
+static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
+    assert(n_per_row % QK_K == 0);
+    const int nb = n_per_row / QK_K;
+
+    int8_t L[QK_K];
+    float scales[QK_K / 16];
+    float weight[16];
+    float sw[QK_K / 16];
+    int8_t Ls[QK_K / 16];
+
+    for (int i = 0; i < nb; i++) {
+
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K * i + 16*j;
+                for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
+            } else {
+                for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
+            }
+            float sumw = 0;
+            for (int l = 0; l < 16; ++l) sumw += weight[l];
+            sw[j] = sumw;
+
+            scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
+
+        }
+
+        memset(y[i].scales, 0, 12);
+
+        float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
+        for (int j = 0; j < QK_K/16; ++j) {
+            int l = Ls[j];
+            if (j < 8) {
+                y[i].scales[j] = l & 0xF;
+            } else {
+                y[i].scales[j-8] |= ((l & 0xF) << 4);
+            }
+            l >>= 4;
+            y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
+        }
+        y[i].d = GGML_FP32_TO_FP16(d_block);
+
+        int8_t sc;
+        for (int j = 0; j < QK_K/16; ++j) {
+            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
+            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
+            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) {
+                continue;
+            }
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-4, MIN(3, l));
+                L[16*j + ii] = l + 4;
+            }
+        }
+
+        memset(y[i].hmask, 0, QK_K/8);
+        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
+        int m = 0;
+        uint8_t hm = 1;
+        for (int j = 0; j < QK_K; ++j) {
+            if (L[j] > 3) {
+                y[i].hmask[m] |= hm;
+                L[j] -= 4;
+            }
+            if (++m == QK_K/8) {
+                m = 0; hm <<= 1;
+            }
+        }
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+            }
+        }
+
+        x += QK_K;
+    }
+}
+
+size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int64_t row = 0; row < nrow; ++row) {
+            quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
+// ====================== 4-bit (de)-quantization
+
+void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float   weights[32];
+    float mins[QK_K/32];
+    float scales[QK_K/32];
+
+    for (int i = 0; i < nb; i++) {
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
+
+        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
+        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        for (int j = 0; j < QK_K/32; ++j) {
+            uint8_t ls = nearest_int(inv_scale*scales[j]);
+            uint8_t lm = nearest_int(inv_min*mins[j]);
+            ls = MIN(63, ls);
+            lm = MIN(63, lm);
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K/32; ++j) {
+            get_scale_min_k4(j, y[i].scales, &sc, &m);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            for (int ii = 0; ii < 32; ++ii) {
+                int l = nearest_int((x[32*j + ii] + dm)/d);
+                l = MAX(0, MIN(15, l));
+                L[32*j + ii] = l;
+            }
+        }
+
+        uint8_t * q = y[i].qs;
+        for (int j = 0; j < QK_K; j += 64) {
+            for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
+            q += 32;
+        }
+
+        x += QK_K;
+    }
+}
+
+void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+        const uint8_t * q = x[i].qs;
+
+        const float d   = GGML_FP16_TO_FP32(x[i].d);
+        const float min = GGML_FP16_TO_FP32(x[i].dmin);
+
+        int is = 0;
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K; j += 64) {
+            get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
+            for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l]  >> 4) - m2;
+            q += 32; is += 2;
+        }
+    }
+}
+
+static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
+    assert(n_per_row % QK_K == 0);
+    const int64_t nb = n_per_row / QK_K;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    uint8_t Ls[QK_K/32];
+    uint8_t Lm[QK_K/32];
+    float   weights[32];
+    float   sw[QK_K/32];
+    float   mins[QK_K/32];
+    float   scales[QK_K/32];
+
+    for (int i = 0; i < nb; i++) {
+
+        float sum_x2 = 0;
+        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
+        float sigma2 = 2*sum_x2/QK_K;
+        float av_x = sqrtf(sigma2);
+
+        for (int j = 0; j < QK_K/32; ++j) {
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*i + 32*j;
+                for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
+            } else {
+                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            }
+            float sumw = 0;
+            for (int l = 0; l < 32; ++l) sumw += weights[l];
+            sw[j] = sumw;
+            scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+        }
+
+        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
+        float m_block = make_qp_quants(QK_K/32, 63, mins,   Lm, sw);
+        for (int j = 0; j < QK_K/32; ++j) {
+            uint8_t ls = Ls[j];
+            uint8_t lm = Lm[j];
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+        y[i].d = GGML_FP32_TO_FP16(d_block);
+        y[i].dmin = GGML_FP32_TO_FP16(m_block);
+
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K/32; ++j) {
+            get_scale_min_k4(j, y[i].scales, &sc, &m);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            for (int ii = 0; ii < 32; ++ii) {
+                int l = nearest_int((x[32*j + ii] + dm)/d);
+                l = MAX(0, MIN(15, l));
+                L[32*j + ii] = l;
+            }
+        }
+        uint8_t * q = y[i].qs;
+        for (int j = 0; j < QK_K; j += 64) {
+            for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
+            q += 32;
+        }
+
+        x += QK_K;
+
+    }
+}
+
+size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int64_t row = 0; row < nrow; ++row) {
+            quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
+// ====================== 5-bit (de)-quantization
+
+void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    uint8_t L[QK_K];
+    float mins[QK_K/32];
+    float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
+
+    for (int i = 0; i < nb; i++) {
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
+
+        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
+        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        for (int j = 0; j < QK_K/32; ++j) {
+            uint8_t ls = nearest_int(inv_scale*scales[j]);
+            uint8_t lm = nearest_int(inv_min*mins[j]);
+            ls = MIN(63, ls);
+            lm = MIN(63, lm);
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K/32; ++j) {
+            get_scale_min_k4(j, y[i].scales, &sc, &m);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            for (int ii = 0; ii < 32; ++ii) {
+                int l = nearest_int((x[32*j + ii] + dm)/d);
+                l = MAX(0, MIN(31, l));
+                L[32*j + ii] = l;
+            }
+        }
+
+        uint8_t * restrict qh = y[i].qh;
+        uint8_t * restrict ql = y[i].qs;
+        memset(qh, 0, QK_K/8);
+
+        uint8_t m1 = 1, m2 = 2;
+        for (int n = 0; n < QK_K; n += 64) {
+            for (int j = 0; j < 32; ++j) {
+                int l1 = L[n + j];
+                if (l1 > 15) {
+                    l1 -= 16; qh[j] |= m1;
+                }
+                int l2 = L[n + j + 32];
+                if (l2 > 15) {
+                    l2 -= 16; qh[j] |= m2;
+                }
+                ql[j] = l1 | (l2 << 4);
+            }
+            m1 <<= 2; m2 <<= 2;
+            ql += 32;
+        }
+
+        x += QK_K;
+    }
+}
+
+void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+        const uint8_t * ql = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float min = GGML_FP16_TO_FP32(x[i].dmin);
+
+        int is = 0;
+        uint8_t sc, m;
+        uint8_t u1 = 1, u2 = 2;
+        for (int j = 0; j < QK_K; j += 64) {
+            get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
+            const float d1 = d * sc; const float m1 = min * m;
+            get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
+            const float d2 = d * sc; const float m2 = min * m;
+            for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
+            for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
+            ql += 32; is += 2;
+            u1 <<= 2; u2 <<= 2;
+        }
+    }
+}
+
+static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
+    assert(n_per_row % QK_K == 0);
+    const int64_t nb = n_per_row / QK_K;
+
+    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    uint8_t Ls[QK_K/32];
+    uint8_t Lm[QK_K/32];
+    float   mins[QK_K/32];
+    float   scales[QK_K/32];
+    float   sw[QK_K/32];
+    float   weights[32];
+
+    for (int i = 0; i < nb; i++) {
+
+        float sum_x2 = 0;
+        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
+        float sigma2 = 2*sum_x2/QK_K;
+        float av_x = sqrtf(sigma2);
+
+        for (int j = 0; j < QK_K/32; ++j) {
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*i + 32*j;
+                for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
+            } else {
+                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            }
+            float sumw = 0;
+            for (int l = 0; l < 32; ++l) sumw += weights[l];
+            sw[j] = sumw;
+
+            scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+        }
+
+        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
+        float m_block = make_qp_quants(QK_K/32, 63, mins,   Lm, sw);
+
+        for (int j = 0; j < QK_K/32; ++j) {
+            uint8_t ls = Ls[j];
+            uint8_t lm = Lm[j];
+            ls = MIN(63, ls);
+            lm = MIN(63, lm);
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+        y[i].d = GGML_FP32_TO_FP16(d_block);
+        y[i].dmin = GGML_FP32_TO_FP16(m_block);
+
+        uint8_t sc, m;
+        for (int j = 0; j < QK_K/32; ++j) {
+            get_scale_min_k4(j, y[i].scales, &sc, &m);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            for (int ii = 0; ii < 32; ++ii) {
+                int l = nearest_int((x[32*j + ii] + dm)/d);
+                l = MAX(0, MIN(31, l));
+                L[32*j + ii] = l;
+            }
+        }
+
+        uint8_t * restrict qh = y[i].qh;
+        uint8_t * restrict ql = y[i].qs;
+        memset(qh, 0, QK_K/8);
+
+        uint8_t m1 = 1, m2 = 2;
+        for (int n = 0; n < QK_K; n += 64) {
+            for (int j = 0; j < 32; ++j) {
+                int l1 = L[n + j];
+                if (l1 > 15) {
+                    l1 -= 16; qh[j] |= m1;
+                }
+                int l2 = L[n + j + 32];
+                if (l2 > 15) {
+                    l2 -= 16; qh[j] |= m2;
+                }
+                ql[j] = l1 | (l2 << 4);
+            }
+            m1 <<= 2; m2 <<= 2;
+            ql += 32;
+        }
+
+        x += QK_K;
+
+    }
+}
+
+size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int64_t row = 0; row < nrow; ++row) {
+            quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
+// ====================== 6-bit (de)-quantization
+
+void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    int8_t L[QK_K];
+    float   scales[QK_K/16];
+
+    for (int i = 0; i < nb; i++) {
+
+        float max_scale = 0;
+        float max_abs_scale = 0;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+
+            const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
+            scales[ib] = scale;
+
+            const float abs_scale = fabsf(scale);
+            if (abs_scale > max_abs_scale) {
+                max_abs_scale = abs_scale;
+                max_scale = scale;
+            }
+
+        }
+
+        if (max_abs_scale < GROUP_MAX_EPS) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = GGML_FP32_TO_FP16(0.f);
+            x += QK_K;
+            continue;
+        }
+
+        float iscale = -128.f/max_scale;
+        y[i].d = GGML_FP32_TO_FP16(1/iscale);
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
+        }
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
+            if (!d) {
+                continue;
+            }
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-32, MIN(31, l));
+                L[16*j + ii] = l + 32;
+            }
+        }
+
+        uint8_t * restrict ql = y[i].ql;
+        uint8_t * restrict qh = y[i].qh;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                const uint8_t q1 = L[j + l +  0] & 0xF;
+                const uint8_t q2 = L[j + l + 32] & 0xF;
+                const uint8_t q3 = L[j + l + 64] & 0xF;
+                const uint8_t q4 = L[j + l + 96] & 0xF;
+                ql[l+ 0] = q1 | (q3 << 4);
+                ql[l+32] = q2 | (q4 << 4);
+                qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
+            }
+            ql += 64;
+            qh += 32;
+        }
+
+        x += QK_K;
+    }
+}
+
+void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict ql = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t  * restrict sc = x[i].scales;
+
+        for (int n = 0; n < QK_K; n += 128) {
+            for (int l = 0; l < 32; ++l) {
+                int is = l/16;
+                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+                y[l +  0] = d * sc[is + 0] * q1;
+                y[l + 32] = d * sc[is + 2] * q2;
+                y[l + 64] = d * sc[is + 4] * q3;
+                y[l + 96] = d * sc[is + 6] * q4;
+            }
+            y  += 128;
+            ql += 64;
+            qh += 32;
+            sc += 8;
+        }
+    }
+}
+
+static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
+    assert(n_per_row % QK_K == 0);
+    const int64_t nb = n_per_row / QK_K;
+
+    int8_t L[QK_K];
+    float   scales[QK_K/16];
+    //float   weights[16];
+
+    for (int i = 0; i < nb; i++) {
+
+        //float sum_x2 = 0;
+        //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j];
+        //float sigma2 = sum_x2/QK_K;
+
+        float max_scale = 0;
+        float max_abs_scale = 0;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+
+            float scale;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*i + 16*ib;
+                //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]);
+                //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights);
+                scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw);
+            } else {
+                scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL);
+            }
+            scales[ib] = scale;
+
+            const float abs_scale = fabsf(scale);
+            if (abs_scale > max_abs_scale) {
+                max_abs_scale = abs_scale;
+                max_scale = scale;
+            }
+
+        }
+
+        if (max_abs_scale < GROUP_MAX_EPS) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = GGML_FP32_TO_FP16(0.f);
+            x += QK_K;
+            continue;
+        }
+
+        float iscale = -128.f/max_scale;
+        y[i].d = GGML_FP32_TO_FP16(1/iscale);
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
+        }
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
+            if (!d) {
+                continue;
+            }
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-32, MIN(31, l));
+                L[16*j + ii] = l + 32;
+            }
+        }
+
+        uint8_t * restrict ql = y[i].ql;
+        uint8_t * restrict qh = y[i].qh;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                const uint8_t q1 = L[j + l +  0] & 0xF;
+                const uint8_t q2 = L[j + l + 32] & 0xF;
+                const uint8_t q3 = L[j + l + 64] & 0xF;
+                const uint8_t q4 = L[j + l + 96] & 0xF;
+                ql[l+ 0] = q1 | (q3 << 4);
+                ql[l+32] = q2 | (q4 << 4);
+                qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
+            }
+            ql += 64;
+            qh += 32;
+        }
+
+        x += QK_K;
+
+    }
+}
+
+size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
+    if (!quant_weights) {
+        quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
+    }
+    else {
+        char * qrow = (char *)dst;
+        for (int64_t row = 0; row < nrow; ++row) {
+            quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights);
+            src += n_per_row;
+            qrow += row_size;
+        }
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q4_0_ref(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK4_0];
+    int8_t L[QK4_0];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int64_t nb = n_per_row/QK4_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_0 * ib;
+        const float * qw = quant_weights + QK4_0 * ib;
+        for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4);
+        }
+    }
+}
+
+size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK4_1 == 32, "QK4_1 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q4_1_ref(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK4_1];
+    uint8_t L[QK4_1], Laux[QK4_1];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int64_t nb = n_per_row/QK4_1;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_1 * ib;
+        const float * qw = quant_weights + QK4_1 * ib;
+        for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min;
+        float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4);
+        }
+    }
+}
+
+size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK5_0 == 32, "QK5_0 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q5_0_ref(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK5_0];
+    int8_t L[QK5_0];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int64_t nb = n_per_row/QK5_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_0 * ib;
+        const float * qw = quant_weights + QK5_0 * ib;
+        for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qh = 0;
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+
+        memcpy(&y[ib].qh, &qh, sizeof(qh));
+    }
+}
+
+size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK5_1 == 32, "QK5_1 must be 32");
+
+    if (!quant_weights) {
+        quantize_row_q5_1_ref(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK5_1];
+    uint8_t L[QK5_1], Laux[QK5_1];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row;
+
+    const int64_t nb = n_per_row/QK5_1;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_1 * ib;
+        const float * qw = quant_weights + QK5_1 * ib;
+        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min;
+        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min);
+
+        uint32_t qh = 0;
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+        memcpy(&y[ib].qh, &qh, sizeof(qh));
+    }
+}
+
+size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
+size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void)quant_weights; // not used
+    const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
+    quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * row_size;
+}
+
+// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
+
+void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int64_t i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK_K; j++) {
+            const float v = x[j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        // 5 elements per byte, along 32 bytes
+        for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) {
+            for (size_t m = 0; m < 32; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 5; ++n) {
+                    int xi = lroundf(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2
+                    q *= 3;
+                    q += xi;
+                }
+                // ceiling division (243 == pow(3, 5))
+                q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+                y[i].qs[j + m] = q;
+            }
+            x += 5*32;
+        }
+        // along 16 bytes
+        for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) {
+            for (size_t m = 0; m < 16; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 5; ++n) {
+                    int xi = lroundf(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2
+                    q *= 3;
+                    q += xi;
+                }
+                // ceiling division (243 == pow(3, 5))
+                q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+                y[i].qs[j + m] = q;
+            }
+            x += 5*16;
+        }
+        // 4 elements per byte
+        for (size_t j = 0; j < sizeof(y->qh); ++j) {
+            uint8_t q = 0;
+            for (size_t m = 0; m < 4; ++m) {
+                // -1, 0, 1 -> 0, 1, 2
+                int xi = lroundf(x[j + m*sizeof(y->qh)] * id) + 1;
+                q *= 3;
+                q += xi;
+            }
+            // shift the first value to the most significant trit
+            q *= 3;
+            // ceiling division (243 == pow(3, 5))
+            q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+            y[i].qh[j] = q;
+        }
+        x += 4*sizeof(y->qh);
+    }
+}
+
+void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int64_t i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK_K; j++) {
+            const float v = x[j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (size_t j = 0; j < sizeof(y->qs); j += 32) {
+            for (size_t m = 0; m < 32; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 4; ++n) {
+                    // -1, 0, 1 -> 0, 1, 2
+                    int xi = lroundf(x[m + n*32] * id) + 1;
+                    q += (xi & 3) << (2*n);
+                }
+                y[i].qs[j + m] = q;
+            }
+            x += 4*32;
+        }
+    }
+}
+
+size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void)quant_weights; // not used
+    const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
+    quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * row_size;
+}
+
+size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void)quant_weights; // not used
+    const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
+    quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * row_size;
+}
+
+void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
+
+    for (int64_t i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
+            for (size_t n = 0; n < 5; ++n) {
+                for (size_t m = 0; m < 32; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[n];
+                    int16_t xi = ((uint16_t) q * 3) >> 8;
+                    *y++ = (float) (xi - 1) * d;
+                }
+            }
+        }
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
+            for (size_t n = 0; n < 5; ++n) {
+                for (size_t m = 0; m < 16; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[n];
+                    int16_t xi = ((uint16_t) q * 3) >> 8;
+                    *y++ = (float) (xi - 1) * d;
+                }
+            }
+        }
+
+        for (size_t n = 0; n < 4; ++n) {
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[n];
+                int16_t xi = ((uint16_t) q * 3) >> 8;
+                *y++ = (float) (xi - 1) * d;
+            }
+        }
+    }
+}
+
+void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int64_t i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            for (size_t l = 0; l < 4; ++l) {
+                for (size_t m = 0; m < 32; ++m) {
+                    int8_t q = (x[i].qs[j + m] >> (l*2)) & 3;
+                    *y++ = (float) (q - 1) * d;
+                }
+            }
+        }
+    }
+}
+
+// ====================== "True" 2-bit (de)-quantization
+
+void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    uint32_t aux32[2];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
+            const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
+                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+        }
+    }
+}
+
+// ====================== 2.3125 bpw (de)-quantization
+
+void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    float db[2];
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
+            db[1] = d * (0.5f + (x[i].scales[ib32] >>  4)) * 0.25f;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
+                const uint8_t  signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+        }
+    }
+}
+
+// ====================== 2.5625 bpw (de)-quantization
+
+void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    float db[2];
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = qs + QK_K/8;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
+            db[1] = d * (0.5f + (x[i].scales[ib32] >>  4)) * 0.25f;
+            for (int l = 0; l < 4; ++l) {
+                const float dl = db[l/2];
+                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 4;
+            signs += 4;
+        }
+    }
+}
+
+// ====================== 3.0625 bpw (de)-quantization
+
+void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    uint32_t aux32;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * scales_and_signs = qs + QK_K/4;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t));
+            const float db = d * (0.5f + (aux32 >> 28)) * 0.5f;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
+                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]);
+                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]);
+                for (int j = 0; j < 4; ++j) {
+                    y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 8;
+        }
+    }
+}
+
+// ====================== 3.3125 bpw (de)-quantization
+
+void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint8_t * signs = x[i].signs;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf));
+            const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >>  4));
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 8;
+            signs += 4;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qh += 2;
+            qs += 8;
+            signs += 4;
+        }
+    }
+}
+
+// ====================== 1.5625 bpw (de)-quantization
+
+void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t  * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const float dl = d * (2*((qh[ib] >> 12) & 7) + 1);
+            const float delta = qh[ib] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
+            for (int l = 0; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl * (grid[j] + delta);
+                }
+                y += 8;
+            }
+            qs += 4;
+        }
+    }
+}
+
+void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    float delta[4];
+    uint16_t idx[4];
+
+    iq1m_scale_t scale;
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        const float d = GGML_FP16_TO_FP32(scale.f16);
+
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+            const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
+
+            idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
+            idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
+            idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
+            idx[3] = qs[3] | ((qh[1] << 4) & 0x700);
+            delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA;
+            delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA;
+            for (int l = 0; l < 2; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl1 * (grid[j] + delta[l]);
+                }
+                y += 8;
+            }
+            for (int l = 2; l < 4; ++l) {
+                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
+                for (int j = 0; j < 8; ++j) {
+                    y[j] = dl2 * (grid[j] + delta[l]);
+                }
+                y += 8;
+            }
+            qs += 4;
+            qh += 2;
+        }
+    }
+}
+
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK4_NL == 0);
+    const int64_t nb = k / QK4_NL;
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint8_t * qs = x[i].qs;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            y[j+       0] = d * kvalues_iq4nl[qs[j] & 0xf];
+            y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >>  4];
+        }
+        y  += QK4_NL;
+        qs += QK4_NL/2;
+    }
+}
+
+void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint8_t * qs = x[i].qs;
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const int ls = ((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4);
+            const float dl = d * (ls - 32);
+            for (int j = 0; j < 16; ++j) {
+                y[j+ 0] = dl * kvalues_iq4nl[qs[j] & 0xf];
+                y[j+16] = dl * kvalues_iq4nl[qs[j] >>  4];
+            }
+            y  += 32;
+            qs += 16;
+        }
+    }
+}
+
+//===================================== Q8_K ==============================================
+
+void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        float max = 0;
+        float amax = 0;
+        for (int j = 0; j < QK_K; ++j) {
+            float ax = fabsf(x[j]);
+            if (ax > amax) {
+                amax = ax; max = x[j];
+            }
+        }
+        if (!amax) {
+            y[i].d = 0;
+            memset(y[i].qs, 0, QK_K);
+            x += QK_K;
+            continue;
+        }
+        //const float iscale = -128.f/max;
+        // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
+        const float iscale = -127.f/max;
+        for (int j = 0; j < QK_K; ++j) {
+            int v = nearest_int(iscale*x[j]);
+            y[i].qs[j] = MIN(127, v);
+        }
+        for (int j = 0; j < QK_K/16; ++j) {
+            int sum = 0;
+            for (int ii = 0; ii < 16; ++ii) {
+                sum += y[i].qs[j*16 + ii];
+            }
+            y[i].bsums[j] = sum;
+        }
+        y[i].d = 1/iscale;
+        x += QK_K;
+    }
+}
+
+void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < QK_K; ++j) {
+            *y++ = x[i].d * x[i].qs[j];
+        }
+    }
+}
+
+// ================================ IQ2 quantization =============================================
+
+typedef struct {
+    uint64_t * grid;
+    int      * map;
+    uint16_t * neighbours;
+} iq2_entry_t;
+
+static iq2_entry_t iq2_data[4] = {
+    {NULL, NULL, NULL},
+    {NULL, NULL, NULL},
+    {NULL, NULL, NULL},
+    {NULL, NULL, NULL},
+};
+
+static inline int iq2_data_index(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
+    return type == GGML_TYPE_IQ2_XXS ? 0 :
+           type == GGML_TYPE_IQ2_XS  ? 1 :
+           type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3;
+}
+
+static inline int iq2_grid_size(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
+    return type == GGML_TYPE_IQ2_XXS ? 256 :
+           type == GGML_TYPE_IQ2_XS  ? 512 :
+           type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024;
+}
+
+static int iq2_compare_func(const void * left, const void * right) {
+    const int * l = (const int *)left;
+    const int * r = (const int *)right;
+    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
+}
+
+void iq2xs_init_impl(enum ggml_type type) {
+    const int gindex = iq2_data_index(type);
+    const int grid_size = iq2_grid_size(type);
+    if (iq2_data[gindex].grid) {
+        return;
+    }
+    static const uint16_t kgrid_2bit_256[256] = {
+            0,     2,     5,     8,    10,    17,    20,    32,    34,    40,    42,    65,    68,    80,    88,    97,
+          100,   128,   130,   138,   162,   257,   260,   272,   277,   320,   388,   408,   512,   514,   546,   642,
+         1025,  1028,  1040,  1057,  1060,  1088,  1090,  1096,  1120,  1153,  1156,  1168,  1188,  1280,  1282,  1288,
+         1312,  1350,  1385,  1408,  1425,  1545,  1552,  1600,  1668,  1700,  2048,  2053,  2056,  2068,  2088,  2113,
+         2116,  2128,  2130,  2184,  2308,  2368,  2562,  2580,  4097,  4100,  4112,  4129,  4160,  4192,  4228,  4240,
+         4245,  4352,  4360,  4384,  4432,  4442,  4480,  4644,  4677,  5120,  5128,  5152,  5157,  5193,  5248,  5400,
+         5474,  5632,  5654,  6145,  6148,  6160,  6208,  6273,  6400,  6405,  6560,  6737,  8192,  8194,  8202,  8260,
+         8289,  8320,  8322,  8489,  8520,  8704,  8706,  9217,  9220,  9232,  9280,  9302,  9472,  9537,  9572,  9872,
+        10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
+        16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
+        17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
+        20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
+        22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
+        25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
+        33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
+        37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
+    };
+    static const uint16_t kgrid_2bit_512[512] = {
+            0,     2,     5,     8,    10,    17,    20,    22,    25,    32,    34,    37,    40,    65,    68,    70,
+           73,    80,    82,    85,    88,    97,   100,   128,   130,   133,   136,   145,   148,   153,   160,   257,
+          260,   262,   265,   272,   274,   277,   280,   282,   289,   292,   320,   322,   325,   328,   337,   340,
+          352,   360,   385,   388,   400,   512,   514,   517,   520,   529,   532,   544,   577,   580,   592,   597,
+          640,   650,  1025,  1028,  1030,  1033,  1040,  1042,  1045,  1048,  1057,  1060,  1088,  1090,  1093,  1096,
+         1105,  1108,  1110,  1120,  1153,  1156,  1168,  1280,  1282,  1285,  1288,  1297,  1300,  1312,  1345,  1348,
+         1360,  1377,  1408,  1537,  1540,  1552,  1574,  1600,  1602,  1668,  2048,  2050,  2053,  2056,  2058,  2065,
+         2068,  2080,  2085,  2113,  2116,  2128,  2136,  2176,  2208,  2218,  2305,  2308,  2320,  2368,  2433,  2441,
+         2560,  2592,  2600,  2710,  2720,  4097,  4100,  4102,  4105,  4112,  4114,  4117,  4120,  4129,  4132,  4160,
+         4162,  4165,  4168,  4177,  4180,  4192,  4202,  4225,  4228,  4240,  4352,  4354,  4357,  4360,  4369,  4372,
+         4384,  4417,  4420,  4432,  4480,  4500,  4502,  4609,  4612,  4614,  4624,  4672,  4704,  5120,  5122,  5125,
+         5128,  5137,  5140,  5152,  5185,  5188,  5193,  5200,  5220,  5248,  5377,  5380,  5392,  5440,  5632,  5652,
+         5705,  6145,  6148,  6160,  6162,  6208,  6228,  6278,  6400,  6405,  6502,  6737,  6825,  8192,  8194,  8197,
+         8200,  8202,  8209,  8212,  8224,  8257,  8260,  8272,  8320,  8352,  8449,  8452,  8464,  8512,  8520,  8549,
+         8704,  8738,  8832,  8872,  9217,  9220,  9232,  9257,  9280,  9472,  9537,  9554,  9625,  9729,  9754,  9894,
+        10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
+        16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
+        16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
+        16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
+        17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
+        18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
+        20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
+        21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
+        22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
+        24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
+        32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
+        33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
+        33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
+        35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
+        37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
+        40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
+        42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
+    };
+    static const uint16_t kgrid_1bit_2048[NGRID_IQ1S] = {
+            0,     2,     5,     8,    10,    17,    21,    32,    34,    40,    42,    69,    81,    84,    86,   101,
+          128,   130,   136,   138,   149,   160,   162,   168,   170,   260,   261,   273,   276,   278,   281,   282,
+          293,   321,   326,   329,   338,   341,   346,   353,   356,   358,   360,   389,   401,   404,   406,   421,
+          512,   514,   520,   522,   533,   544,   546,   552,   554,   581,   593,   601,   612,   617,   640,   642,
+          648,   650,   657,   661,   665,   672,   674,   680,   682,  1041,  1044,  1046,  1061,  1089,  1097,  1109,
+         1114,  1124,  1125,  1169,  1177,  1189,  1281,  1284,  1285,  1286,  1301,  1304,  1306,  1321,  1344,  1349,
+         1354,  1360,  1361,  1364,  1365,  1366,  1369,  1376,  1378,  1381,  1384,  1386,  1409,  1425,  1429,  1432,
+         1434,  1441,  1444,  1445,  1446,  1449,  1556,  1561,  1601,  1604,  1616,  1618,  1621,  1624,  1632,  1633,
+         1638,  1641,  1669,  1681,  1684,  1689,  2048,  2050,  2056,  2058,  2069,  2080,  2082,  2088,  2090,  2117,
+         2129,  2134,  2149,  2176,  2178,  2184,  2186,  2197,  2208,  2210,  2216,  2218,  2309,  2321,  2324,  2329,
+         2340,  2341,  2369,  2384,  2385,  2389,  2401,  2404,  2409,  2449,  2452,  2454,  2457,  2469,  2560,  2562,
+         2568,  2570,  2581,  2592,  2594,  2600,  2602,  2629,  2641,  2649,  2657,  2661,  2688,  2690,  2693,  2696,
+         2698,  2709,  2720,  2722,  2728,  2730,  4112,  4113,  4116,  4121,  4132,  4133,  4161,  4164,  4176,  4181,
+         4184,  4193,  4196,  4197,  4201,  4241,  4244,  4246,  4257,  4261,  4353,  4356,  4358,  4361,  4368,  4370,
+         4373,  4376,  4385,  4388,  4393,  4421,  4426,  4432,  4433,  4434,  4436,  4437,  4438,  4441,  4448,  4453,
+         4484,  4498,  4501,  4513,  4516,  4625,  4628,  4630,  4645,  4672,  4678,  4681,  4690,  4693,  4696,  4698,
+         4708,  4710,  4741,  4753,  4756,  4758,  4773,  5121,  5126,  5129,  5140,  5141,  5144,  5145,  5153,  5158,
+         5185,  5189,  5190,  5192,  5194,  5201,  5204,  5205,  5206,  5209,  5218,  5221,  5224,  5252,  5257,  5264,
+         5268,  5269,  5272,  5273,  5274,  5281,  5284,  5285,  5289,  5378,  5381,  5386,  5393,  5396,  5397,  5398,
+         5401,  5408,  5410,  5413,  5416,  5418,  5441,  5444,  5445,  5446,  5457,  5458,  5460,  5461,  5462,  5465,
+         5466,  5473,  5476,  5477,  5478,  5481,  5504,  5506,  5508,  5509,  5512,  5514,  5520,  5521,  5524,  5525,
+         5526,  5529,  5530,  5536,  5538,  5541,  5633,  5636,  5637,  5638,  5653,  5654,  5656,  5658,  5665,  5670,
+         5696,  5698,  5700,  5701,  5704,  5706,  5713,  5717,  5718,  5720,  5721,  5729,  5732,  5733,  5736,  5737,
+         5738,  5766,  5770,  5778,  5781,  5796,  5801,  6161,  6166,  6181,  6209,  6212,  6214,  6217,  6224,  6229,
+         6232,  6234,  6240,  6241,  6244,  6246,  6249,  6277,  6289,  6292,  6309,  6416,  6418,  6421,  6426,  6433,
+         6437,  6466,  6468,  6469,  6472,  6481,  6484,  6485,  6486,  6489,  6490,  6496,  6501,  6506,  6537,  6545,
+         6546,  6549,  6552,  6561,  6566,  6569,  6665,  6678,  6692,  6694,  6724,  6726,  6729,  6736,  6738,  6741,
+         6744,  6753,  6758,  6761,  6789,  6801,  6806,  6810,  8192,  8194,  8200,  8202,  8213,  8224,  8226,  8229,
+         8232,  8234,  8261,  8273,  8281,  8289,  8293,  8320,  8322,  8328,  8330,  8341,  8352,  8354,  8357,  8360,
+         8362,  8453,  8465,  8468,  8473,  8485,  8514,  8516,  8521,  8533,  8536,  8538,  8545,  8548,  8549,  8550,
+         8581,  8592,  8598,  8601,  8613,  8705,  8712,  8714,  8721,  8725,  8736,  8738,  8744,  8746,  8773,  8785,
+         8790,  8793,  8805,  8833,  8840,  8842,  8849,  8853,  8864,  8866,  8872,  8874,  9221,  9236,  9238,  9241,
+         9253,  9284,  9285,  9286,  9289,  9298,  9301,  9304,  9306,  9318,  9349,  9361,  9364,  9369,  9377,  9381,
+         9481,  9493,  9505,  9513,  9536,  9541,  9544,  9553,  9556,  9557,  9561,  9570,  9573,  9576,  9609,  9616,
+         9620,  9621,  9624,  9626,  9633,  9636,  9638,  9641,  9733,  9744,  9746,  9753,  9765,  9793,  9801,  9813,
+         9824,  9825,  9833,  9860,  9862,  9872,  9882, 10240, 10242, 10248, 10250, 10261, 10272, 10274, 10280, 10282,
+        10309, 10321, 10324, 10341, 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410, 10505, 10513, 10516, 10521,
+        10533, 10566, 10569, 10578, 10581, 10593, 10596, 10598, 10601, 10629, 10640, 10646, 10649, 10660, 10661, 10752,
+        10754, 10760, 10762, 10784, 10786, 10792, 10794, 10821, 10833, 10838, 10841, 10853, 10880, 10882, 10888, 10890,
+        10901, 10912, 10914, 10920, 10922, 16389, 16401, 16406, 16421, 16457, 16466, 16469, 16472, 16474, 16481, 16484,
+        16486, 16532, 16537, 16545, 16550, 16640, 16641, 16644, 16646, 16649, 16658, 16661, 16662, 16664, 16666, 16673,
+        16678, 16681, 16709, 16712, 16714, 16721, 16724, 16725, 16726, 16729, 16730, 16741, 16744, 16746, 16769, 16772,
+        16774, 16784, 16786, 16789, 16800, 16801, 16802, 16901, 16913, 16916, 16918, 16933, 16961, 16978, 16981, 16986,
+        16996, 17001, 17033, 17044, 17061, 17409, 17429, 17433, 17449, 17477, 17480, 17482, 17489, 17492, 17493, 17494,
+        17505, 17506, 17509, 17512, 17514, 17537, 17542, 17545, 17552, 17554, 17557, 17568, 17569, 17577, 17665, 17666,
+        17669, 17674, 17681, 17684, 17685, 17686, 17689, 17696, 17701, 17706, 17729, 17732, 17733, 17734, 17737, 17744,
+        17745, 17748, 17749, 17750, 17752, 17753, 17761, 17764, 17765, 17766, 17769, 17794, 17796, 17797, 17800, 17809,
+        17812, 17813, 17814, 17817, 17818, 17829, 17832, 17834, 17921, 17925, 17929, 17940, 17941, 17944, 17946, 17953,
+        17956, 17961, 17984, 17986, 17989, 17992, 18000, 18001, 18002, 18005, 18006, 18009, 18018, 18021, 18024, 18049,
+        18053, 18058, 18068, 18069, 18081, 18084, 18086, 18437, 18449, 18453, 18458, 18469, 18498, 18505, 18512, 18517,
+        18520, 18529, 18532, 18534, 18537, 18565, 18577, 18580, 18582, 18585, 18597, 18689, 18693, 18694, 18698, 18704,
+        18708, 18709, 18712, 18721, 18724, 18726, 18752, 18757, 18762, 18769, 18770, 18772, 18773, 18774, 18777, 18784,
+        18786, 18789, 18790, 18794, 18822, 18825, 18834, 18837, 18838, 18840, 18849, 18852, 18854, 18857, 18966, 19012,
+        19014, 19017, 19029, 19032, 19034, 19044, 19049, 19092, 19109, 20481, 20484, 20485, 20486, 20489, 20498, 20501,
+        20506, 20513, 20516, 20521, 20544, 20549, 20552, 20561, 20564, 20565, 20566, 20569, 20581, 20584, 20614, 20617,
+        20629, 20632, 20640, 20641, 20646, 20649, 20741, 20744, 20745, 20746, 20753, 20756, 20757, 20758, 20760, 20761,
+        20768, 20773, 20774, 20776, 20778, 20801, 20804, 20805, 20806, 20809, 20816, 20817, 20818, 20820, 20821, 20822,
+        20824, 20825, 20826, 20833, 20836, 20837, 20838, 20841, 20866, 20869, 20881, 20884, 20885, 20886, 20889, 20896,
+        20901, 20906, 20993, 20998, 21010, 21013, 21018, 21025, 21028, 21058, 21061, 21066, 21073, 21076, 21077, 21078,
+        21081, 21090, 21093, 21125, 21136, 21138, 21141, 21145, 21146, 21156, 21508, 21509, 21521, 21524, 21525, 21526,
+        21528, 21529, 21537, 21541, 21544, 21546, 21569, 21572, 21573, 21574, 21577, 21578, 21584, 21585, 21588, 21589,
+        21590, 21592, 21593, 21594, 21601, 21602, 21604, 21605, 21606, 21609, 21632, 21640, 21642, 21649, 21652, 21653,
+        21654, 21657, 21665, 21668, 21669, 21674, 21761, 21762, 21764, 21765, 21766, 21769, 21776, 21777, 21778, 21780,
+        21781, 21782, 21785, 21786, 21793, 21796, 21797, 21798, 21801, 21824, 21825, 21826, 21828, 21829, 21830, 21832,
+        21833, 21840, 21841, 21842, 21844, 21845, 21846, 21848, 21849, 21850, 21856, 21857, 21860, 21861, 21862, 21864,
+        21865, 21866, 21889, 21892, 21893, 21897, 21898, 21904, 21905, 21908, 21909, 21910, 21912, 21913, 21921, 21924,
+        21925, 21926, 21929, 22016, 22017, 22018, 22020, 22022, 22024, 22025, 22033, 22036, 22037, 22040, 22041, 22048,
+        22049, 22050, 22052, 22053, 22054, 22056, 22057, 22081, 22085, 22086, 22088, 22089, 22090, 22096, 22097, 22098,
+        22100, 22101, 22102, 22104, 22105, 22106, 22113, 22116, 22117, 22121, 22146, 22149, 22150, 22152, 22153, 22154,
+        22161, 22165, 22170, 22178, 22181, 22182, 22184, 22185, 22532, 22533, 22534, 22537, 22544, 22549, 22552, 22561,
+        22570, 22597, 22600, 22602, 22609, 22612, 22613, 22614, 22616, 22617, 22624, 22626, 22628, 22629, 22658, 22665,
+        22672, 22674, 22677, 22680, 22689, 22697, 22785, 22786, 22789, 22794, 22801, 22804, 22805, 22806, 22809, 22821,
+        22849, 22852, 22853, 22854, 22857, 22864, 22865, 22866, 22868, 22869, 22870, 22872, 22873, 22874, 22881, 22884,
+        22885, 22886, 22889, 22913, 22917, 22921, 22929, 22932, 22933, 22934, 22936, 22937, 22949, 23044, 23048, 23061,
+        23066, 23072, 23077, 23078, 23081, 23109, 23112, 23113, 23121, 23125, 23126, 23128, 23129, 23138, 23141, 23144,
+        23146, 23169, 23178, 23186, 23189, 23190, 23192, 23194, 23201, 24581, 24596, 24598, 24601, 24613, 24644, 24656,
+        24661, 24662, 24664, 24666, 24673, 24676, 24678, 24681, 24705, 24726, 24741, 24833, 24836, 24838, 24841, 24850,
+        24853, 24865, 24866, 24870, 24873, 24901, 24905, 24913, 24917, 24918, 24921, 24933, 24934, 24938, 24964, 24970,
+        24978, 24981, 24993, 24998, 25001, 25105, 25110, 25113, 25152, 25153, 25158, 25173, 25174, 25176, 25184, 25221,
+        25233, 25238, 25253, 25617, 25618, 25621, 25622, 25626, 25633, 25638, 25641, 25664, 25666, 25669, 25672, 25674,
+        25681, 25684, 25685, 25686, 25689, 25690, 25696, 25698, 25701, 25732, 25733, 25737, 25744, 25746, 25748, 25749,
+        25750, 25752, 25754, 25761, 25764, 25769, 25861, 25864, 25866, 25873, 25877, 25878, 25881, 25924, 25925, 25926,
+        25929, 25936, 25937, 25940, 25941, 25942, 25945, 25953, 25956, 25957, 25958, 25961, 25990, 25993, 25994, 26001,
+        26005, 26006, 26009, 26010, 26018, 26021, 26022, 26024, 26114, 26121, 26133, 26144, 26150, 26152, 26153, 26176,
+        26181, 26184, 26186, 26193, 26196, 26197, 26198, 26200, 26202, 26208, 26213, 26216, 26240, 26242, 26245, 26250,
+        26260, 26262, 26264, 26265, 26272, 26276, 26278, 26282, 26646, 26649, 26661, 26689, 26706, 26709, 26714, 26721,
+        26729, 26757, 26769, 26776, 26790, 26881, 26884, 26896, 26901, 26913, 26916, 26918, 26921, 26944, 26945, 26949,
+        26950, 26952, 26961, 26964, 26965, 26966, 26969, 26976, 26981, 26986, 27010, 27012, 27018, 27029, 27041, 27044,
+        27045, 27049, 27153, 27158, 27160, 27201, 27204, 27209, 27216, 27221, 27224, 27226, 27236, 27237, 27241, 27270,
+        27284, 27288, 27290, 27302, 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810, 32837, 32848, 32849, 32852,
+        32854, 32857, 32869, 32896, 32898, 32904, 32906, 32917, 32928, 32930, 32936, 32938, 33029, 33041, 33044, 33046,
+        33049, 33061, 33089, 33092, 33097, 33104, 33106, 33109, 33110, 33112, 33113, 33124, 33126, 33129, 33157, 33161,
+        33172, 33174, 33177, 33189, 33280, 33282, 33288, 33290, 33301, 33312, 33314, 33320, 33322, 33361, 33364, 33369,
+        33381, 33408, 33410, 33416, 33418, 33429, 33440, 33442, 33448, 33450, 33812, 33817, 33857, 33860, 33873, 33877,
+        33882, 33889, 33892, 33897, 33940, 33945, 34049, 34057, 34066, 34069, 34074, 34086, 34089, 34112, 34113, 34117,
+        34120, 34129, 34132, 34133, 34134, 34137, 34138, 34149, 34150, 34152, 34154, 34177, 34180, 34182, 34185, 34192,
+        34194, 34197, 34200, 34214, 34321, 34326, 34329, 34341, 34369, 34372, 34377, 34378, 34384, 34389, 34393, 34394,
+        34401, 34406, 34410, 34437, 34449, 34458, 34468, 34816, 34818, 34824, 34826, 34837, 34848, 34850, 34856, 34858,
+        34881, 34885, 34897, 34900, 34905, 34917, 34921, 34944, 34946, 34952, 34954, 34965, 34976, 34978, 34984, 34986,
+        35077, 35078, 35089, 35092, 35094, 35109, 35137, 35140, 35142, 35145, 35152, 35154, 35157, 35162, 35169, 35172,
+        35205, 35222, 35225, 35237, 35328, 35330, 35336, 35338, 35349, 35360, 35362, 35368, 35370, 35397, 35409, 35412,
+        35414, 35456, 35458, 35464, 35466, 35477, 35488, 35490, 35496, 35498, 36869, 36881, 36886, 36888, 36889, 36901,
+        36929, 36934, 36937, 36949, 36952, 36954, 36969, 36970, 36997, 37009, 37012, 37014, 37017, 37029, 37121, 37124,
+        37126, 37129, 37136, 37141, 37144, 37146, 37153, 37156, 37158, 37161, 37184, 37189, 37200, 37201, 37204, 37205,
+        37206, 37209, 37218, 37221, 37252, 37254, 37266, 37269, 37272, 37281, 37284, 37286, 37289, 37381, 37393, 37396,
+        37401, 37413, 37444, 37446, 37449, 37456, 37458, 37461, 37464, 37478, 37481, 37509, 37524, 37526, 37545, 37889,
+        37892, 37894, 37904, 37909, 37912, 37926, 37952, 37962, 37969, 37972, 37973, 37974, 37976, 37977, 37984, 37985,
+        37986, 37989, 38020, 38022, 38034, 38036, 38037, 38040, 38049, 38057, 38144, 38149, 38152, 38154, 38160, 38161,
+        38164, 38165, 38166, 38169, 38177, 38181, 38185, 38186, 38209, 38212, 38213, 38214, 38217, 38224, 38225, 38226,
+        38228, 38229, 38230, 38232, 38233, 38234, 38241, 38244, 38245, 38246, 38249, 38273, 38277, 38280, 38289, 38290,
+        38292, 38293, 38294, 38297, 38298, 38304, 38306, 38309, 38312, 38314, 38401, 38404, 38416, 38421, 38425, 38432,
+        38438, 38441, 38469, 38472, 38473, 38481, 38482, 38485, 38486, 38489, 38501, 38504, 38530, 38532, 38537, 38538,
+        38546, 38548, 38549, 38564, 38566, 38569, 38917, 38934, 38937, 38949, 38977, 38982, 38992, 38994, 38997, 38998,
+        39002, 39012, 39013, 39045, 39057, 39062, 39065, 39077, 39172, 39174, 39177, 39184, 39186, 39189, 39192, 39194,
+        39200, 39201, 39204, 39206, 39232, 39234, 39237, 39240, 39242, 39249, 39252, 39253, 39254, 39257, 39266, 39269,
+        39270, 39274, 39297, 39300, 39312, 39314, 39317, 39322, 39329, 39334, 39429, 39445, 39461, 39492, 39494, 39497,
+        39504, 39509, 39512, 39521, 39557, 39569, 39572, 39573, 39574, 40960, 40962, 40968, 40970, 40981, 40992, 40994,
+        41000, 41002, 41029, 41041, 41044, 41046, 41049, 41088, 41090, 41096, 41098, 41109, 41120, 41122, 41128, 41130,
+        41221, 41225, 41233, 41236, 41238, 41241, 41242, 41286, 41289, 41297, 41301, 41304, 41306, 41313, 41316, 41349,
+        41360, 41362, 41366, 41369, 41474, 41480, 41482, 41488, 41497, 41506, 41512, 41514, 41541, 41553, 41558, 41561,
+        41573, 41600, 41602, 41608, 41610, 41621, 41632, 41634, 41640, 41642, 42009, 42021, 42049, 42052, 42064, 42068,
+        42069, 42072, 42074, 42081, 42085, 42086, 42088, 42089, 42117, 42246, 42249, 42256, 42258, 42261, 42264, 42278,
+        42281, 42306, 42309, 42321, 42324, 42325, 42326, 42329, 42341, 42346, 42369, 42372, 42373, 42374, 42377, 42386,
+        42389, 42392, 42501, 42513, 42518, 42522, 42529, 42533, 42564, 42566, 42570, 42578, 42581, 42582, 42584, 42592,
+        42594, 42630, 42640, 42645, 42646, 42649, 42657, 42660, 42662, 43008, 43010, 43016, 43018, 43040, 43042, 43048,
+        43050, 43089, 43092, 43094, 43097, 43136, 43138, 43144, 43146, 43157, 43168, 43170, 43176, 43178, 43269, 43284,
+        43289, 43297, 43301, 43329, 43344, 43349, 43354, 43361, 43366, 43369, 43408, 43414, 43520, 43522, 43528, 43530,
+        43552, 43554, 43560, 43562, 43601, 43604, 43606, 43648, 43650, 43656, 43658, 43669, 43680, 43682, 43688, 43690,
+    };
+    static const uint16_t kgrid_2bit_1024[1024] = {
+            0,     2,     5,     8,    10,    17,    20,    22,    25,    32,    34,    37,    40,    65,    68,    70,
+           73,    80,    82,    85,    88,    97,   100,   102,   105,   128,   130,   133,   136,   145,   148,   160,
+          165,   170,   257,   260,   262,   265,   272,   274,   277,   280,   289,   292,   320,   322,   325,   328,
+          337,   340,   342,   345,   352,   357,   360,   385,   388,   400,   402,   405,   417,   420,   512,   514,
+          517,   520,   529,   532,   544,   554,   577,   580,   582,   585,   592,   597,   640,   645,   650,   660,
+          674,  1025,  1028,  1030,  1033,  1040,  1042,  1045,  1048,  1057,  1060,  1062,  1065,  1088,  1090,  1093,
+         1096,  1098,  1105,  1108,  1110,  1113,  1120,  1122,  1125,  1153,  1156,  1158,  1161,  1168,  1173,  1176,
+         1185,  1188,  1280,  1282,  1285,  1288,  1290,  1297,  1300,  1302,  1305,  1312,  1317,  1320,  1345,  1348,
+         1350,  1353,  1360,  1362,  1365,  1368,  1377,  1380,  1408,  1410,  1413,  1416,  1425,  1428,  1440,  1537,
+         1540,  1542,  1545,  1552,  1557,  1600,  1605,  1608,  1617,  1620,  1632,  1665,  1668,  1680,  2048,  2050,
+         2053,  2056,  2065,  2068,  2070,  2073,  2080,  2085,  2090,  2113,  2116,  2118,  2121,  2128,  2130,  2133,
+         2136,  2145,  2148,  2176,  2181,  2196,  2218,  2305,  2308,  2320,  2322,  2325,  2328,  2337,  2368,  2373,
+         2376,  2385,  2388,  2400,  2433,  2448,  2560,  2577,  2580,  2594,  2600,  2602,  2640,  2713,  4097,  4100,
+         4102,  4105,  4112,  4114,  4117,  4120,  4129,  4132,  4134,  4160,  4162,  4165,  4168,  4177,  4180,  4182,
+         4185,  4192,  4194,  4197,  4200,  4225,  4228,  4230,  4240,  4245,  4248,  4257,  4260,  4352,  4354,  4357,
+         4360,  4362,  4369,  4372,  4374,  4377,  4384,  4386,  4389,  4392,  4417,  4420,  4422,  4425,  4432,  4434,
+         4437,  4440,  4449,  4452,  4480,  4482,  4485,  4488,  4497,  4500,  4609,  4612,  4617,  4624,  4629,  4641,
+         4644,  4672,  4677,  4689,  4692,  4737,  4740,  4752,  5120,  5122,  5125,  5128,  5137,  5140,  5142,  5145,
+         5152,  5157,  5160,  5185,  5188,  5190,  5193,  5200,  5202,  5205,  5208,  5217,  5220,  5248,  5250,  5253,
+         5256,  5265,  5268,  5280,  5377,  5380,  5382,  5385,  5392,  5394,  5397,  5400,  5409,  5412,  5440,  5442,
+         5445,  5448,  5457,  5460,  5472,  5505,  5508,  5520,  5632,  5637,  5640,  5649,  5652,  5664,  5697,  5700,
+         5712,  5760,  5802,  6145,  6148,  6150,  6153,  6160,  6165,  6168,  6177,  6208,  6210,  6213,  6216,  6225,
+         6228,  6240,  6273,  6276,  6400,  6402,  6405,  6408,  6417,  6420,  6432,  6465,  6468,  6480,  6505,  6562,
+         6660,  6672,  6720,  6742,  8192,  8194,  8197,  8200,  8209,  8212,  8214,  8217,  8224,  8229,  8234,  8257,
+         8260,  8272,  8274,  8277,  8292,  8320,  8330,  8340,  8362,  8449,  8452,  8464,  8466,  8469,  8481,  8512,
+         8514,  8517,  8529,  8532,  8544,  8577,  8580,  8592,  8704,  8714,  8738,  8744,  8746,  8772,  8784,  8840,
+         8842,  8872,  9217,  9220,  9222,  9225,  9232,  9237,  9240,  9249,  9252,  9280,  9282,  9285,  9288,  9297,
+         9300,  9312,  9345,  9348,  9360,  9472,  9477,  9480,  9489,  9492,  9504,  9537,  9540,  9552,  9574,  9600,
+         9729,  9732,  9744,  9792,  9817, 10240, 10245, 10257, 10260, 10305, 10308, 10320, 10378, 10410, 10497, 10500,
+        10512, 10645, 10762, 10786, 10852, 10888, 10890, 16385, 16388, 16390, 16393, 16400, 16402, 16405, 16408, 16410,
+        16417, 16420, 16422, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16470, 16473, 16480, 16482, 16485, 16513,
+        16516, 16528, 16533, 16536, 16545, 16548, 16640, 16642, 16645, 16648, 16657, 16660, 16662, 16665, 16672, 16674,
+        16677, 16705, 16708, 16710, 16713, 16720, 16722, 16725, 16728, 16737, 16740, 16768, 16770, 16773, 16776, 16785,
+        16788, 16800, 16897, 16900, 16912, 16914, 16917, 16920, 16932, 16960, 16965, 16968, 16977, 16980, 16992, 17025,
+        17028, 17408, 17410, 17413, 17416, 17418, 17425, 17428, 17430, 17433, 17440, 17442, 17445, 17448, 17473, 17476,
+        17478, 17481, 17488, 17490, 17493, 17496, 17505, 17508, 17536, 17538, 17541, 17544, 17553, 17556, 17568, 17665,
+        17668, 17670, 17673, 17680, 17682, 17685, 17688, 17697, 17700, 17728, 17730, 17733, 17736, 17745, 17748, 17760,
+        17770, 17793, 17796, 17808, 17920, 17922, 17925, 17928, 17937, 17940, 17952, 17985, 17988, 18000, 18048, 18085,
+        18433, 18436, 18441, 18448, 18450, 18453, 18456, 18465, 18468, 18496, 18498, 18501, 18504, 18513, 18516, 18528,
+        18564, 18576, 18688, 18690, 18693, 18696, 18705, 18708, 18720, 18753, 18756, 18768, 18816, 18838, 18945, 18948,
+        18960, 19008, 20480, 20482, 20485, 20488, 20497, 20500, 20502, 20505, 20512, 20514, 20517, 20520, 20545, 20548,
+        20550, 20553, 20560, 20562, 20565, 20568, 20577, 20580, 20608, 20610, 20613, 20616, 20625, 20628, 20737, 20740,
+        20742, 20745, 20752, 20754, 20757, 20760, 20769, 20772, 20800, 20802, 20805, 20808, 20817, 20820, 20832, 20865,
+        20868, 20880, 20992, 20997, 21000, 21009, 21012, 21024, 21057, 21060, 21072, 21097, 21120, 21505, 21508, 21510,
+        21513, 21520, 21522, 21525, 21528, 21537, 21540, 21568, 21570, 21573, 21576, 21585, 21588, 21600, 21633, 21636,
+        21648, 21760, 21762, 21765, 21768, 21777, 21780, 21792, 21825, 21828, 21840, 21888, 22017, 22020, 22032, 22054,
+        22080, 22528, 22530, 22533, 22536, 22545, 22548, 22560, 22593, 22596, 22608, 22618, 22656, 22785, 22788, 22800,
+        22848, 23040, 23065, 23173, 23208, 24577, 24580, 24582, 24592, 24594, 24597, 24600, 24609, 24612, 24640, 24645,
+        24648, 24657, 24660, 24672, 24708, 24720, 24832, 24834, 24837, 24840, 24849, 24852, 24864, 24897, 24900, 24912,
+        24960, 24985, 25092, 25104, 25152, 25174, 25249, 25600, 25605, 25608, 25617, 25620, 25632, 25665, 25668, 25680,
+        25728, 25857, 25860, 25872, 25920, 25930, 25960, 26002, 26112, 26260, 26625, 26628, 26640, 26725, 26776, 26880,
+        26922, 27202, 27297, 32768, 32770, 32773, 32776, 32785, 32788, 32793, 32800, 32805, 32833, 32836, 32848, 32850,
+        32853, 32856, 32865, 32896, 32901, 32913, 32916, 33025, 33028, 33033, 33040, 33042, 33045, 33048, 33057, 33060,
+        33088, 33090, 33093, 33096, 33105, 33108, 33153, 33156, 33168, 33193, 33280, 33285, 33290, 33297, 33300, 33345,
+        33348, 33360, 33793, 33796, 33798, 33801, 33808, 33810, 33813, 33816, 33825, 33856, 33858, 33861, 33864, 33873,
+        33876, 33888, 33921, 33924, 33936, 34048, 34050, 34053, 34056, 34065, 34068, 34080, 34113, 34116, 34128, 34176,
+        34186, 34305, 34308, 34320, 34345, 34368, 34816, 34821, 34833, 34836, 34881, 34884, 34896, 34978, 35073, 35076,
+        35136, 35173, 35362, 35416, 35418, 35458, 35490, 36865, 36868, 36873, 36880, 36882, 36885, 36888, 36900, 36928,
+        36930, 36933, 36936, 36945, 36948, 36960, 36993, 36996, 37008, 37120, 37125, 37137, 37140, 37185, 37188, 37200,
+        37210, 37377, 37380, 37392, 37440, 37542, 37888, 37890, 37893, 37896, 37905, 37908, 37920, 37953, 37956, 37968,
+        38016, 38038, 38145, 38148, 38160, 38208, 38296, 38305, 38400, 38470, 38500, 38913, 38916, 38928, 38950, 38976,
+        39081, 39168, 39241, 39250, 39568, 40960, 40965, 40970, 40980, 40994, 41002, 41025, 41028, 41040, 41122, 41130,
+        41280, 41317, 41474, 41482, 41506, 41512, 41514, 41602, 41608, 41610, 41640, 41985, 41988, 42000, 42048, 42121,
+        42148, 42240, 42265, 42577, 43018, 43048, 43170, 43348, 43398, 43528, 43530, 43552, 43554, 43560, 43656, 43690,
+    };
+
+    const int kmap_size = 43692;
+    //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
+    const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2;
+    const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
+                             type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 :
+                             type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024;
+    uint64_t * kgrid_q2xs;
+    int      * kmap_q2xs;
+    uint16_t * kneighbors_q2xs;
+
+    //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
+    uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
+    for (int k = 0; k < grid_size; ++k) {
+        int8_t * pos = (int8_t *)(the_grid + k);
+        for (int i = 0; i < 8; ++i) {
+            int l = (kgrid[k] >> 2*i) & 0x3;
+            pos[i] = 2*l + 1;
+        }
+    }
+    kgrid_q2xs = the_grid;
+    iq2_data[gindex].grid = the_grid;
+    kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
+    iq2_data[gindex].map = kmap_q2xs;
+    for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
+    uint64_t aux64;
+    uint8_t * aux8 = (uint8_t *)&aux64;
+    for (int i = 0; i < grid_size; ++i) {
+        aux64 = kgrid_q2xs[i];
+        uint16_t index = 0;
+        for (int k=0; k<8; ++k) {
+            uint16_t q = (aux8[k] - 1)/2;
+            index |= (q << 2*k);
+        }
+        kmap_q2xs[index] = i;
+    }
+    int8_t pos[8];
+    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    int num_neighbors = 0, num_not_in_map = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q2xs[i] >= 0) continue;
+        ++num_not_in_map;
+        for (int k = 0; k < 8; ++k) {
+            int l = (i >> 2*k) & 0x3;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+        int n = 0; int d2 = dist2[0];
+        int nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            ++n;
+        }
+        num_neighbors += n;
+    }
+    //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
+    kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
+    iq2_data[gindex].neighbours = kneighbors_q2xs;
+    int counter = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q2xs[i] >= 0) continue;
+        for (int k = 0; k < 8; ++k) {
+            int l = (i >> 2*k) & 0x3;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+        kmap_q2xs[i] = -(counter + 1);
+        int d2 = dist2[0];
+        uint16_t * start = &kneighbors_q2xs[counter++];
+        int n = 0, nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            kneighbors_q2xs[counter++] = dist2[2*j+1];
+            ++n;
+        }
+        *start = n;
+    }
+    free(dist2);
+}
+
+void iq2xs_free_impl(enum ggml_type type) {
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S);
+    const int gindex = iq2_data_index(type);
+    if (iq2_data[gindex].grid) {
+        free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
+        free(iq2_data[gindex].map);        iq2_data[gindex].map  = NULL;
+        free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
+    }
+}
+
+static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_d2 = FLT_MAX;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float d2 = 0;
+        for (int i = 0; i < 8; ++i) {
+            float q = pg[i];
+            float diff = scale*q - xval[i];
+            d2 += weight[i]*diff*diff;
+        }
+        if (d2 < best_d2) {
+            best_d2 = d2; grid_index = neighbours[j];
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 3;
+
+    const int64_t nbl = n/QK_K;
+
+    block_iq2_xxs * y = vy;
+
+    float scales[QK_K/32];
+    float weight[32];
+    float xval[32];
+    int8_t L[32];
+    int8_t Laux[32];
+    float  waux[32];
+    uint8_t block_signs[4];
+    uint32_t q2[2*(QK_K/32)];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(q2, 0, QK_K/4);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const float * xb = xbl + 32*ib;
+            const float * qw = quant_weights + QK_K*ibl + 32*ib;
+            for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 4; ++k) {
+                int nflip = 0;
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
+                    }
+                }
+                if (nflip%2) {
+                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
+                    for (int i = 1; i < 8; ++i) {
+                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
+                        if (ax < min) {
+                            min = ax; imin = i;
+                        }
+                    }
+                    xval[8*k+imin] = -xval[8*k+imin];
+                    s ^= (1 << imin);
+                }
+                block_signs[k] = s & 127;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
+            if (max < GROUP_MAX_EPS) {
+                scales[ib] = 0;
+                memset(L, 0, 32);
+                continue;
+            }
+            float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
+            float eff_max = scale*kMaxQ;
+            float best = 0;
+            for (int is = -6; is <= 6; ++is) {
+                float id = (2*kMaxQ-1+is*0.1f)/eff_max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 4; ++k) {
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
+                    int grid_index = kmap_q2xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    memcpy(L, Laux, 32);
+                }
+            }
+            if (scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 4; ++k) {
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 2*i);
+                    }
+                    int grid_index = kmap_q2xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
+                    }
+                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
+                    for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
+                // and correspondingly flip quant signs.
+                scale = -scale;
+                for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
+            }
+            for (int k = 0; k < 4; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
+                    printf("\n");
+                    GGML_ABORT("fatal error");
+                }
+                q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
+                q2[2*ib+1] |= (block_signs[k] << 7*k);
+            }
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, QK_K/4);
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d);
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            q2[2*ib+1] |= ((uint32_t)l << 28);
+        }
+        memcpy(y[ibl].qs, q2, QK_K/4);
+    }
+}
+
+static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 3;
+
+    const int64_t nbl = n/QK_K;
+
+    block_iq2_xs * y = vy;
+
+    float scales[QK_K/16];
+    float weight[16];
+    float xval[16];
+    int8_t L[16];
+    int8_t Laux[16];
+    float  waux[16];
+    bool   is_on_grid[2];
+    bool   is_on_grid_aux[2];
+    uint8_t block_signs[2];
+    uint16_t q2[2*(QK_K/16)];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(q2, 0, QK_K/4);
+        memset(y[ibl].scales, 0, QK_K/32);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            const float * xb = xbl + 16*ib;
+            const float * qw = quant_weights + QK_K*ibl + 16*ib;
+            for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 2; ++k) {
+                int nflip = 0;
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
+                    }
+                }
+                if (nflip%2) {
+                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
+                    for (int i = 1; i < 8; ++i) {
+                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
+                        if (ax < min) {
+                            min = ax; imin = i;
+                        }
+                    }
+                    xval[8*k+imin] = -xval[8*k+imin];
+                    s ^= (1 << imin);
+                }
+                block_signs[k] = s & 127;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
+            if (max < GROUP_MAX_EPS) {
+                scales[ib] = 0;
+                memset(L, 0, 16);
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            is_on_grid[0] = is_on_grid[1] = true;
+            for (int is = -9; is <= 9; ++is) {
+                float id = (2*kMaxQ-1+is*0.1f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 2; ++k) {
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
+                    int grid_index = kmap_q2xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 16; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < 16; ++i) L[i] = Laux[i];
+                    for (int k = 0; k <  2; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 2; ++k) {
+                    if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 2*i);
+                        L[8*k + i] = l;
+                    }
+                    int grid_index = kmap_q2xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 16; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                scale = -scale;
+                for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
+            }
+            for (int k = 0; k < 2; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
+                    printf("\n");
+                    GGML_ABORT("fatal error");
+                }
+                q2[2*ib+k] = grid_index | (block_signs[k] << 9);
+            }
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, QK_K/4);
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d);
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            if (ib%2 == 0) y[ibl].scales[ib/2] = l;
+            else y[ibl].scales[ib/2] |= (l << 4);
+        }
+        memcpy(y[ibl].qs, q2, QK_K/4);
+
+    }
+}
+
+size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq2_xxs);
+    }
+    return nrow * nblock * sizeof(block_iq2_xxs);
+}
+
+size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq2_xs);
+    }
+    return nrow * nblock * sizeof(block_iq2_xs);
+}
+
+//
+// ============================================= 3-bit using D4 lattice
+//
+
+typedef struct {
+    uint32_t * grid;
+    int      * map;
+    uint16_t * neighbours;
+} iq3_entry_t;
+
+static iq3_entry_t iq3_data[2] = {
+    {NULL, NULL, NULL},
+    {NULL, NULL, NULL},
+};
+
+static inline int iq3_data_index(int grid_size) {
+    (void)grid_size;
+    GGML_ASSERT(grid_size == 256 || grid_size == 512);
+    return grid_size == 256 ? 0 : 1;
+}
+
+static int iq3_compare_func(const void * left, const void * right) {
+    const int * l = (const int *)left;
+    const int * r = (const int *)right;
+    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
+}
+
+void iq3xs_init_impl(int grid_size) {
+    const int gindex = iq3_data_index(grid_size);
+    if (iq3_data[gindex].grid) {
+        return;
+    }
+    static const uint16_t kgrid_256[256] = {
+            0,     2,     4,     9,    11,    15,    16,    18,    25,    34,    59,    61,    65,    67,    72,    74,
+           81,    85,    88,    90,    97,   108,   120,   128,   130,   132,   137,   144,   146,   153,   155,   159,
+          169,   175,   189,   193,   199,   200,   202,   213,   248,   267,   287,   292,   303,   315,   317,   321,
+          327,   346,   362,   413,   436,   456,   460,   462,   483,   497,   513,   515,   520,   522,   529,   531,
+          536,   538,   540,   551,   552,   576,   578,   585,   592,   594,   641,   643,   648,   650,   657,   664,
+          698,   704,   706,   720,   729,   742,   758,   769,   773,   808,   848,   852,   870,   889,   901,   978,
+          992,  1024,  1026,  1033,  1035,  1040,  1042,  1046,  1049,  1058,  1089,  1091,  1093,  1096,  1098,  1105,
+         1112,  1139,  1143,  1144,  1152,  1154,  1161,  1167,  1168,  1170,  1183,  1184,  1197,  1217,  1224,  1228,
+         1272,  1276,  1309,  1323,  1347,  1367,  1377,  1404,  1473,  1475,  1486,  1509,  1537,  1544,  1546,  1553,
+         1555,  1576,  1589,  1594,  1600,  1602,  1616,  1625,  1636,  1638,  1665,  1667,  1672,  1685,  1706,  1722,
+         1737,  1755,  1816,  1831,  1850,  1856,  1862,  1874,  1901,  1932,  1950,  1971,  2011,  2032,  2052,  2063,
+         2077,  2079,  2091,  2095,  2172,  2192,  2207,  2208,  2224,  2230,  2247,  2277,  2308,  2345,  2356,  2389,
+         2403,  2424,  2501,  2504,  2506,  2520,  2570,  2593,  2616,  2624,  2630,  2646,  2669,  2700,  2714,  2746,
+         2754,  2795,  2824,  2835,  2839,  2874,  2882,  2905,  2984,  3028,  3042,  3092,  3108,  3110,  3124,  3153,
+         3185,  3215,  3252,  3288,  3294,  3364,  3397,  3434,  3483,  3523,  3537,  3587,  3589,  3591,  3592,  3610,
+         3626,  3670,  3680,  3722,  3749,  3754,  3776,  3789,  3803,  3824,  3857,  3873,  3904,  3906,  3924,  3992,
+    };
+    static const uint16_t kgrid_512[512] = {
+            0,     1,     2,     5,     7,     8,     9,    10,    12,    14,    16,    17,    21,    27,    32,    34,
+           37,    39,    41,    43,    48,    50,    57,    60,    63,    64,    65,    66,    68,    72,    73,    77,
+           80,    83,    87,    89,    93,   100,   113,   117,   122,   128,   129,   133,   135,   136,   139,   142,
+          145,   149,   152,   156,   162,   165,   167,   169,   171,   184,   187,   195,   201,   205,   208,   210,
+          217,   219,   222,   228,   232,   234,   247,   249,   253,   256,   267,   271,   273,   276,   282,   288,
+          291,   297,   312,   322,   324,   336,   338,   342,   347,   353,   357,   359,   374,   379,   390,   393,
+          395,   409,   426,   441,   448,   450,   452,   464,   466,   470,   475,   488,   492,   512,   513,   514,
+          516,   520,   521,   523,   525,   527,   528,   530,   537,   540,   542,   556,   558,   561,   570,   576,
+          577,   579,   582,   584,   588,   593,   600,   603,   609,   616,   618,   632,   638,   640,   650,   653,
+          655,   656,   660,   666,   672,   675,   685,   688,   698,   705,   708,   711,   712,   715,   721,   727,
+          728,   732,   737,   754,   760,   771,   773,   778,   780,   793,   795,   802,   806,   808,   812,   833,
+          840,   843,   849,   856,   858,   873,   912,   916,   919,   932,   934,   961,   963,   968,   970,   977,
+          989,   993,  1010,  1016,  1024,  1025,  1027,  1029,  1031,  1032,  1034,  1036,  1038,  1041,  1043,  1047,
+         1048,  1050,  1057,  1059,  1061,  1064,  1066,  1079,  1080,  1083,  1085,  1088,  1090,  1096,  1099,  1103,
+         1106,  1109,  1113,  1116,  1122,  1129,  1153,  1156,  1159,  1169,  1171,  1176,  1183,  1185,  1195,  1199,
+         1209,  1212,  1216,  1218,  1221,  1225,  1234,  1236,  1241,  1243,  1250,  1256,  1270,  1281,  1287,  1296,
+         1299,  1306,  1309,  1313,  1338,  1341,  1348,  1353,  1362,  1375,  1376,  1387,  1400,  1408,  1410,  1415,
+         1425,  1453,  1457,  1477,  1481,  1494,  1496,  1507,  1512,  1538,  1545,  1547,  1549,  1551,  1554,  1561,
+         1563,  1565,  1570,  1572,  1575,  1577,  1587,  1593,  1601,  1603,  1605,  1612,  1617,  1619,  1632,  1648,
+         1658,  1662,  1664,  1674,  1680,  1690,  1692,  1704,  1729,  1736,  1740,  1745,  1747,  1751,  1752,  1761,
+         1763,  1767,  1773,  1787,  1795,  1801,  1806,  1810,  1817,  1834,  1840,  1844,  1857,  1864,  1866,  1877,
+         1882,  1892,  1902,  1915,  1934,  1953,  1985,  1987,  2000,  2002,  2013,  2048,  2052,  2058,  2064,  2068,
+         2071,  2074,  2081,  2088,  2104,  2114,  2119,  2121,  2123,  2130,  2136,  2141,  2147,  2153,  2157,  2177,
+         2179,  2184,  2189,  2193,  2203,  2208,  2223,  2226,  2232,  2244,  2249,  2251,  2256,  2258,  2265,  2269,
+         2304,  2306,  2324,  2335,  2336,  2361,  2373,  2375,  2385,  2418,  2443,  2460,  2480,  2504,  2509,  2520,
+         2531,  2537,  2562,  2568,  2572,  2578,  2592,  2596,  2599,  2602,  2614,  2620,  2625,  2627,  2629,  2634,
+         2641,  2650,  2682,  2688,  2697,  2707,  2712,  2718,  2731,  2754,  2759,  2760,  2775,  2788,  2793,  2805,
+         2811,  2817,  2820,  2832,  2842,  2854,  2890,  2902,  2921,  2923,  2978,  3010,  3012,  3026,  3081,  3083,
+         3085,  3097,  3099,  3120,  3136,  3152,  3159,  3188,  3210,  3228,  3234,  3245,  3250,  3256,  3264,  3276,
+         3281,  3296,  3349,  3363,  3378,  3392,  3395,  3420,  3440,  3461,  3488,  3529,  3531,  3584,  3588,  3591,
+         3600,  3602,  3614,  3616,  3628,  3634,  3650,  3657,  3668,  3683,  3685,  3713,  3716,  3720,  3726,  3729,
+         3736,  3753,  3778,  3802,  3805,  3819,  3841,  3845,  3851,  3856,  3880,  3922,  3938,  3970,  3993,  4032,
+    };
+
+    const int kmap_size = 4096;
+    const int nwant = grid_size == 256 ? 2 : 3;
+    const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
+    uint32_t * kgrid_q3xs;
+    int      * kmap_q3xs;
+    uint16_t * kneighbors_q3xs;
+
+    //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
+    uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
+    for (int k = 0; k < grid_size; ++k) {
+        int8_t * pos = (int8_t *)(the_grid + k);
+        for (int i = 0; i < 4; ++i) {
+            int l = (kgrid[k] >> 3*i) & 0x7;
+            pos[i] = 2*l + 1;
+        }
+    }
+    kgrid_q3xs = the_grid;
+    iq3_data[gindex].grid = the_grid;
+    kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
+    iq3_data[gindex].map = kmap_q3xs;
+    for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
+    uint32_t aux32;
+    uint8_t * aux8 = (uint8_t *)&aux32;
+    for (int i = 0; i < grid_size; ++i) {
+        aux32 = kgrid_q3xs[i];
+        uint16_t index = 0;
+        for (int k=0; k<4; ++k) {
+            uint16_t q = (aux8[k] - 1)/2;
+            index |= (q << 3*k);
+        }
+        kmap_q3xs[index] = i;
+    }
+    int8_t pos[4];
+    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    int num_neighbors = 0, num_not_in_map = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q3xs[i] >= 0) continue;
+        ++num_not_in_map;
+        for (int k = 0; k < 4; ++k) {
+            int l = (i >> 3*k) & 0x7;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+        int n = 0; int d2 = dist2[0];
+        int nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            ++n;
+        }
+        num_neighbors += n;
+    }
+    //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
+    kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
+    iq3_data[gindex].neighbours = kneighbors_q3xs;
+    int counter = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q3xs[i] >= 0) continue;
+        for (int k = 0; k < 4; ++k) {
+            int l = (i >> 3*k) & 0x7;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+        kmap_q3xs[i] = -(counter + 1);
+        int d2 = dist2[0];
+        uint16_t * start = &kneighbors_q3xs[counter++];
+        int n = 0, nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            kneighbors_q3xs[counter++] = dist2[2*j+1];
+            ++n;
+        }
+        *start = n;
+    }
+    free(dist2);
+}
+
+void iq3xs_free_impl(int grid_size) {
+    GGML_ASSERT(grid_size == 256 || grid_size == 512);
+    const int gindex = iq3_data_index(grid_size);
+    if (iq3_data[gindex].grid) {
+        free(iq3_data[gindex].grid);       iq3_data[gindex].grid = NULL;
+        free(iq3_data[gindex].map);        iq3_data[gindex].map  = NULL;
+        free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
+    }
+}
+
+static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_d2 = FLT_MAX;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float d2 = 0;
+        for (int i = 0; i < 4; ++i) {
+            float q = pg[i];
+            float diff = scale*q - xval[i];
+            d2 += weight[i]*diff*diff;
+        }
+        if (d2 < best_d2) {
+            best_d2 = d2; grid_index = neighbours[j];
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
+        const float * restrict quant_weights) {
+
+    const int gindex = iq3_data_index(grid_size);
+
+    const uint32_t * kgrid_q3xs      = iq3_data[gindex].grid;
+    const int      * kmap_q3xs       = iq3_data[gindex].map;
+    const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
+
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q3xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q3xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 8;
+
+    const int64_t nbl = n/QK_K;
+
+    ggml_fp16_t * dh;
+    uint8_t * qs;
+    int block_size;
+    if (grid_size == 256) {
+        block_iq3_xxs * y = vy;
+        dh = &y->d;
+        qs = y->qs;
+        block_size = sizeof(block_iq3_xxs);
+    } else {
+        block_iq3_s * y = vy;
+        dh = &y->d;
+        qs = y->qs;
+        block_size = sizeof(block_iq3_s);
+    }
+    int quant_size = block_size - sizeof(ggml_fp16_t);
+
+    float scales[QK_K/32];
+    float weight[32];
+    float xval[32];
+    int8_t L[32];
+    int8_t Laux[32];
+    float  waux[32];
+    bool   is_on_grid[8];
+    bool   is_on_grid_aux[8];
+    uint8_t block_signs[8];
+    uint8_t q3[3*(QK_K/8)+QK_K/32];
+    uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
+    uint8_t  * qh = q3 + 3*(QK_K/8);
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        dh[0] = GGML_FP32_TO_FP16(0.f);
+        memset(q3, 0, 3*QK_K/8+QK_K/32);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const float * xb = xbl + 32*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 32*ib;
+                for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 4; ++k) {
+                int nflip = 0;
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
+                    }
+                }
+                if (nflip%2) {
+                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
+                    for (int i = 1; i < 8; ++i) {
+                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
+                        if (ax < min) {
+                            min = ax; imin = i;
+                        }
+                    }
+                    xval[8*k+imin] = -xval[8*k+imin];
+                    s ^= (1 << imin);
+                }
+                block_signs[k] = s & 127;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
+            if (max < GROUP_MAX_EPS_IQ3_XXS) {
+                scales[ib] = 0;
+                memset(L, 0, 32);
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            for (int is = -15; is <= 15; ++is) {
+                float id = (2*kMaxQ-1+is*0.2f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 8; ++k) {
+                    for (int i = 0; i < 4; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
+                        Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
+                    int grid_index = kmap_q3xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
+                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < 32; ++i) L[i] = Laux[i];
+                    for (int k = 0; k <  8; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 8; ++k) {
+                    if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 4; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 3*i);
+                    }
+                    int grid_index = kmap_q3xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
+                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
+                    }
+                    const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
+                    for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
+                // and correspondingly flip quant signs.
+                scale = -scale;
+                for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
+            }
+            for (int k = 0; k < 8; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
+                int grid_index = kmap_q3xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
+                    printf("\n");
+                    GGML_ABORT("fatal error");
+                }
+                if (grid_size == 256) {
+                    q3[8*ib+k] = grid_index;
+                } else {
+                    q3[8*ib+k] = grid_index & 255;
+                    qh[ib] |= ((grid_index >> 8) << k);
+                }
+
+            }
+            scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(qs, 0, quant_size);
+            dh += block_size/sizeof(ggml_fp16_t);
+            qs += block_size;
+            continue;
+        }
+
+        float d = max_scale/31;
+        dh[0] = GGML_FP32_TO_FP16(d * 1.0125f);  // small improvement via this fudge factor
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            scales_and_signs[ib] |= ((uint32_t)l << 28);
+        }
+        memcpy(qs, q3, quant_size);
+
+        dh += block_size/sizeof(ggml_fp16_t);
+        qs += block_size;
+
+    }
+}
+
+size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq3_xxs);
+    }
+    return nrow * nblock * sizeof(block_iq3_xxs);
+}
+
+void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
+}
+
+static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
+        const float * restrict quant_weights,
+        float   * scales,
+        float   * weight,
+        float   * xval,
+        int8_t  * L,
+        int8_t  * Laux,
+        float   * waux,
+        bool    * is_on_grid,
+        bool    * is_on_grid_aux,
+        uint8_t * block_signs) {
+
+    const int gindex = iq3_data_index(512);
+
+    const uint32_t * kgrid_q3xs      = iq3_data[gindex].grid;
+    const int      * kmap_q3xs       = iq3_data[gindex].map;
+    const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
+
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q3xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q3xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 8;
+
+    const int64_t nbl = n/QK_K;
+
+    block_iq3_s * y = vy;
+
+    const int bs4 = block_size/4;
+    const int bs8 = block_size/8;
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        memset(&y[ibl], 0, sizeof(block_iq3_s));
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+
+        uint8_t * qs = y[ibl].qs;
+        uint8_t * qh = y[ibl].qh;
+        uint8_t * signs = y[ibl].signs;
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < bs8; ++k) {
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
+                    }
+                }
+                block_signs[k] = s;
+            }
+            float max = xval[0];
+            for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]);
+            if (!max) {
+                scales[ib] = 0;
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            for (int k = 0; k < bs4; ++k) is_on_grid[k] = false;
+            for (int is = -9; is <= 9; ++is) {
+                float id = (2*kMaxQ-1+is*0.2f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < bs4; ++k) {
+                    for (int i = 0; i < 4; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
+                        Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
+                    int grid_index = kmap_q3xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
+                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < block_size; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < block_size; ++i) L[i] = Laux[i];
+                    for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < bs4; ++k) {
+                    //if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 4; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 3*i);
+                    }
+                    int grid_index = kmap_q3xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
+                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
+                    }
+                    const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
+                    for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < block_size; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
+                // and correspondingly flip quant signs.
+                scale = -scale;
+                for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k];
+            }
+            for (int k = 0; k < bs4; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
+                int grid_index = kmap_q3xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
+                    printf("\n");
+                    GGML_ABORT("fatal error");
+                }
+                qs[k] = grid_index & 255;
+                qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
+            }
+            qs += bs4;
+            for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k];
+            signs += bs8;
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f);
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/block_size; ib += 2) {
+            int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
+            l1 = MAX(0, MIN(15, l1));
+            int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
+            l2 = MAX(0, MIN(15, l2));
+            y[ibl].scales[ib/2] = l1 | (l2 << 4);
+        }
+
+    }
+}
+
+#define IQ3S_BLOCK_SIZE 32
+size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int64_t nblock = n_per_row/QK_K;
+    float scales[QK_K/IQ3S_BLOCK_SIZE];
+    float weight[IQ3S_BLOCK_SIZE];
+    float xval[IQ3S_BLOCK_SIZE];
+    int8_t L[IQ3S_BLOCK_SIZE];
+    int8_t Laux[IQ3S_BLOCK_SIZE];
+    float  waux[IQ3S_BLOCK_SIZE];
+    bool   is_on_grid[IQ3S_BLOCK_SIZE/4];
+    bool   is_on_grid_aux[IQ3S_BLOCK_SIZE/4];
+    uint8_t block_signs[IQ3S_BLOCK_SIZE/8];
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights,
+                scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq3_s);
+    }
+    return nrow * nblock * sizeof(block_iq3_s);
+}
+
+void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq3_s(x, y, 1, k, NULL);
+}
+
+
+// =================================== 1.5 bpw ===================================================
+
+static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_score = -FLT_MAX;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float sumqx = 0, sumq2 = 0;
+        for (int i = 0; i < 8; ++i) {
+            float q = (pg[i] - 3)/2;
+            float w = weight[i];
+            sumqx += w*q*xval[i];
+            sumq2 += w*q*q;
+        }
+        if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+            *scale = sumqx/sumq2; best_score = *scale * sumqx;
+            grid_index = neighbours[j];
+        }
+    }
+    if (grid_index < 0) {
+        for (int i = 0; i < ngrid; ++i) {
+            const int8_t * grid_i = (const int8_t *)(grid + i);
+            float sumqx = 0, sumq2 = 0;
+            for (int j = 0; j < 8; ++j) {
+                float w = weight[j];
+                float q = (grid_i[j] - 3)/2;
+                sumqx += w*q*xval[j];
+                sumq2 += w*q*q;
+            }
+            if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                *scale = sumqx/sumq2; best_score = *scale*sumqx;
+                grid_index = i;
+            }
+        }
+    }
+    if (grid_index < 0) {
+        printf("Oops, did not find grid point\n");
+        printf("Have %d neighbours\n", num_neighbors);
+        for (int j = 1; j <= num_neighbors; ++j) {
+            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+            float sumqx = 0, sumq2 = 0;
+            for (int i = 0; i < 8; ++i) {
+                float q = (pg[i] - 3)/2;
+                float w = weight[i];
+                sumqx += w*q*xval[i];
+                sumq2 += w*q*q;
+            }
+            printf("    neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    *scale *= 1.05f;  // This is a fudge factor. Don't ask me why it improves the result.
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_score = FLT_MAX;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float d2 = 0;
+        for (int i = 0; i < 8; ++i) {
+            float q = xg[(pg[i] - 1)/2];
+            float w = weight[i];
+            float diff = scale*q - xval[i];
+            d2 += w*diff*diff;
+        }
+        if (d2 < best_score) {
+            best_score = d2;
+            grid_index = neighbours[j];
+        }
+    }
+    if (grid_index < 0) {
+        for (int i = 0; i < ngrid; ++i) {
+            const int8_t * grid_i = (const int8_t *)(grid + i);
+            float d2 = 0;
+            for (int j = 0; j < 8; ++j) {
+                float w = weight[j];
+                float q = xg[(grid_i[j] - 1)/2];
+                float diff = scale*q - xval[i];
+                d2 += w*diff*diff;
+            }
+            if (d2 < best_score) {
+                best_score = d2;
+                grid_index = i;
+            }
+        }
+    }
+    if (grid_index < 0) {
+        printf("Oops, did not find grid point\n");
+        printf("Have %d neighbours\n", num_neighbors);
+        for (int j = 1; j <= num_neighbors; ++j) {
+            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+            float sumqx = 0, sumq2 = 0;
+            for (int i = 0; i < 8; ++i) {
+                float q = xg[(pg[i] - 1)/2];
+                float w = weight[i];
+                sumqx += w*q*xval[i];
+                sumq2 += w*q*q;
+            }
+            printf("    neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static int iq1_sort_helper(const void * left, const void * right) {
+    const float * l = left;
+    const float * r = right;
+    return *l < *r ? -1 : *l > *r ? 1 : 0;
+}
+
+#define IQ1S_BLOCK_SIZE 32
+#define IQ1M_BLOCK_SIZE 16
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
+        float    * scales,
+        float    * weight,
+        float    * sumx,
+        float    * sumw,
+        float    * pairs,
+        int8_t   * L,
+        uint16_t * index,
+        int8_t   * shifts) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    block_iq1_s * y = vy;
+
+    const int64_t nbl = n/QK_K;
+
+    const int block_size = IQ1S_BLOCK_SIZE;
+
+    const float x_p[3] = {-1 + IQ1S_DELTA,  IQ1S_DELTA, 1 + IQ1S_DELTA};
+    const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
+
+
+    int * idx = (int *)(pairs + 1);
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].qh, 0, QK_K/16);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            float max = fabsf(xb[0]);
+            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+            if (max < GROUP_MAX_EPS_IQ1_S) {
+                scales[ib] = 0;
+                memset(L, 1, block_size);
+                continue;
+            }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
+            // for each possible and score for each split.
+            for (int j = 0; j < block_size; ++j) {
+                pairs[2*j] = xb[j];
+                idx[2*j] = j;
+            }
+            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+            {
+                sumx[0] = sumw[0] = 0;
+                for (int j = 0; j < block_size; ++j) {
+                    int i = idx[2*j];
+                    sumx[j+1] = sumx[j] + weight[i]*xb[i];
+                    sumw[j+1] = sumw[j] + weight[i];
+                }
+            }
+            float best_score = -FLT_MIN, scale = max;
+            int besti1 = -1, besti2 = -1, best_shift = 0;
+            for (int i1 = 0; i1 <= block_size; ++i1) {
+                for (int i2 = i1; i2 <= block_size; ++i2) {
+                    float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2];
+                    float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2];
+                    if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                        scale = sumqx/sumq2; best_score = scale*sumqx;
+                        besti1 = i1; besti2 = i2; best_shift = 1;
+                    }
+                    sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2];
+                    sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2];
+                    if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
+                        scale = sumqx/sumq2; best_score = scale*sumqx;
+                        besti1 = i1; besti2 = i2; best_shift = -1;
+                    }
+                }
+            }
+            GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
+            for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
+            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+            for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+            if (scale < 0) {
+                for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+                scale = -scale; best_shift = -best_shift;
+            }
+            bool all_on_grid = true;
+            const float * xx = best_shift == 1 ? x_p : x_m;
+            for (int k = 0; k < block_size/8; ++k) {
+                uint16_t u = 0;
+                for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    all_on_grid = false;
+                    const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                    grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+                    GGML_ASSERT(grid_index >= 0);
+                }
+                index[k] = grid_index;
+            }
+            if (!all_on_grid) {
+                float sumqx = 0, sumq2 = 0;
+                for (int k = 0; k < block_size/8; ++k) {
+                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+                    for (int j = 0; j < 8; ++j) {
+                        float w = weight[8*k + j];
+                        float q = xx[(pg[j] - 1)/2];
+                        sumqx += w*q*xb[8*k+j];
+                        sumq2 += w*q*q;
+                    }
+                }
+                if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
+            }
+            uint16_t h = 0;
+            for (int k = 0; k < block_size/8; ++k) {
+                y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
+                h |= (index[k] >> 8) << 3*k;
+            }
+            y[ibl].qh[ib] = h;
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            shifts[ib] = best_shift;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            continue;
+        }
+
+        float d = max_scale/15;
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(7, l));
+            if (shifts[ib] == -1) l |= 8;
+            y[ibl].qh[ib] |= (l << 12);
+        }
+    }
+}
+
+size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    float  scales[QK_K/IQ1S_BLOCK_SIZE];
+    float  weight[IQ1S_BLOCK_SIZE];
+    int8_t L[IQ1S_BLOCK_SIZE];
+    float  sumx[IQ1S_BLOCK_SIZE+1];
+    float  sumw[IQ1S_BLOCK_SIZE+1];
+    float  pairs[2*IQ1S_BLOCK_SIZE];
+    uint16_t index[IQ1S_BLOCK_SIZE/8];
+    int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq1_s);
+    }
+    return nrow * nblock * sizeof(block_iq1_s);
+}
+
+static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
+        float    * scales,
+        float    * weight,
+        float    * pairs,
+        int8_t   * L,
+        uint16_t * index,
+        int8_t   * shifts) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ1_M);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    block_iq1_m * y = vy;
+
+    const int64_t nbl = n/QK_K;
+
+    const int block_size = IQ1M_BLOCK_SIZE;
+
+    const float x_p[3] = {-1 + IQ1M_DELTA,  IQ1M_DELTA, 1 + IQ1M_DELTA};
+    const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA};
+    const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88};
+
+    int * idx = (int *)(pairs + 1);
+
+    float sumqx[4], sumq2[4];
+
+    iq1m_scale_t s;
+    const float * xx;
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].qh, 0, QK_K/16);
+        memset(y[ibl].scales, 0, QK_K/32);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            float max = fabsf(xb[0]);
+            for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
+            if (max < GROUP_MAX_EPS_IQ1_M) {
+                scales[ib] = 0;
+                memset(L, 1, block_size);
+                continue;
+            }
+            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
+            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
+            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
+            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
+            // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
+            // for each possible and score for each split.
+            for (int j = 0; j < block_size; ++j) {
+                pairs[2*j] = xb[j];
+                idx[2*j] = j;
+            }
+            qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
+            float best_score = -FLT_MIN, scale = max;
+            int besti1 = -1, besti2 = -1, best_k = -1;
+            // 0: +, +
+            // 1: +, -
+            // 2: -, +
+            // 3: -, -
+            for (int i1 = 0; i1 <= block_size; ++i1) {
+                for (int i2 = i1; i2 <= block_size; ++i2) {
+                    memset(sumqx, 0, 4*sizeof(float));
+                    memset(sumq2, 0, 4*sizeof(float));
+                    for (int j = 0; j < i1; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[0]*xb[i];
+                            sumqx[1] += weight[i]*x_p[0]*xb[i];
+                            sumqx[2] += weight[i]*x_m[0]*xb[i];
+                            sumqx[3] += weight[i]*x_m[0]*xb[i];
+                            sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[1] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[2] += weight[i]*x_m[0]*x_m[0];
+                            sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[0]*xb[i];
+                            sumqx[2] += weight[i]*x_p[0]*xb[i];
+                            sumqx[1] += weight[i]*x_m[0]*xb[i];
+                            sumqx[3] += weight[i]*x_m[0]*xb[i];
+                            sumq2[0] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[2] += weight[i]*x_p[0]*x_p[0];
+                            sumq2[1] += weight[i]*x_m[0]*x_m[0];
+                            sumq2[3] += weight[i]*x_m[0]*x_m[0];
+                        }
+                    }
+                    for (int j = i1; j < i2; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[1]*xb[i];
+                            sumqx[1] += weight[i]*x_p[1]*xb[i];
+                            sumqx[2] += weight[i]*x_m[1]*xb[i];
+                            sumqx[3] += weight[i]*x_m[1]*xb[i];
+                            sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[1] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[2] += weight[i]*x_m[1]*x_m[1];
+                            sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[1]*xb[i];
+                            sumqx[2] += weight[i]*x_p[1]*xb[i];
+                            sumqx[1] += weight[i]*x_m[1]*xb[i];
+                            sumqx[3] += weight[i]*x_m[1]*xb[i];
+                            sumq2[0] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[2] += weight[i]*x_p[1]*x_p[1];
+                            sumq2[1] += weight[i]*x_m[1]*x_m[1];
+                            sumq2[3] += weight[i]*x_m[1]*x_m[1];
+                        }
+                    }
+                    for (int j = i2; j < block_size; ++j) {
+                        int i = idx[2*j];
+                        if (i < block_size/2) {
+                            sumqx[0] += weight[i]*x_p[2]*xb[i];
+                            sumqx[1] += weight[i]*x_p[2]*xb[i];
+                            sumqx[2] += weight[i]*x_m[2]*xb[i];
+                            sumqx[3] += weight[i]*x_m[2]*xb[i];
+                            sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[1] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[2] += weight[i]*x_m[2]*x_m[2];
+                            sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                        } else {
+                            sumqx[0] += weight[i]*x_p[2]*xb[i];
+                            sumqx[2] += weight[i]*x_p[2]*xb[i];
+                            sumqx[1] += weight[i]*x_m[2]*xb[i];
+                            sumqx[3] += weight[i]*x_m[2]*xb[i];
+                            sumq2[0] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[2] += weight[i]*x_p[2]*x_p[2];
+                            sumq2[1] += weight[i]*x_m[2]*x_m[2];
+                            sumq2[3] += weight[i]*x_m[2]*x_m[2];
+                        }
+                    }
+                    for (int k = 0; k < 4; ++k) {
+                        if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) {
+                            scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k];
+                            besti1 = i1; besti2 = i2; best_k = k;
+                        }
+                    }
+                }
+            }
+            GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0);
+            for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
+            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
+            for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2;
+            if (scale < 0) {
+                for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j];
+                scale = -scale;
+                best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0;
+            }
+            bool all_on_grid = true;
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                else xx = best_k%2 == 0 ? x_p : x_m;
+                uint16_t u = 0;
+                for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    all_on_grid = false;
+                    const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                    grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
+                    GGML_ASSERT(grid_index >= 0);
+                }
+                index[k] = grid_index;
+            }
+            if (!all_on_grid) {
+                float sumqx_f = 0, sumq2_f = 0;
+                for (int k = 0; k < block_size/8; ++k) {
+                    if (k == 0) xx = best_k < 2 ? x_p : x_m;
+                    else xx = best_k%2 == 0 ? x_p : x_m;
+                    const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
+                    for (int j = 0; j < 8; ++j) {
+                        float w = weight[8*k + j];
+                        float q = xx[(pg[j] - 1)/2];
+                        sumqx_f += w*q*xb[8*k+j];
+                        sumq2_f += w*q*q;
+                    }
+                }
+                if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f;
+            }
+            y[ibl].qs[2*ib + 0] = index[0] & 255;
+            y[ibl].qs[2*ib + 1] = index[1] & 255;
+            y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            shifts[ib] = best_k;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            continue;
+        }
+
+        uint16_t * sc = (uint16_t *)y[ibl].scales;
+        float d = max_scale/15;
+        float id = 1/d;
+        float sumqx_f = 0, sumq2_f = 0;
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+            l = MAX(0, MIN(7, l));
+            sc[ib/4] |= (l << 3*(ib%4));
+            y[ibl].qh[ib] |= masks[shifts[ib]];
+            const float * xb = xbl + block_size*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int k = 0; k < block_size/8; ++k) {
+                if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m;
+                else xx = shifts[ib]%2 == 0 ? x_p : x_m;
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700));
+                for (int j = 0; j < 8; ++j) {
+                    float w = weight[8*k + j];
+                    float q = xx[(pg[j] - 1)/2]*(2*l+1);
+                    sumqx_f += w*q*xb[8*k+j];
+                    sumq2_f += w*q*q;
+                }
+            }
+        }
+        if (sumq2_f > 0) d = sumqx_f/sumq2_f;
+        s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed.
+        sc[0] |= ((s.u16 & 0x000f) << 12);
+        sc[1] |= ((s.u16 & 0x00f0) <<  8);
+        sc[2] |= ((s.u16 & 0x0f00) <<  4);
+        sc[3] |= ((s.u16 & 0xf000) <<  0);
+    }
+}
+
+size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    float  scales[QK_K/IQ1M_BLOCK_SIZE];
+    float  weight[IQ1M_BLOCK_SIZE];
+    int8_t L[IQ1M_BLOCK_SIZE];
+    float  pairs[2*IQ1M_BLOCK_SIZE];
+    uint16_t index[IQ1M_BLOCK_SIZE/8];
+    int8_t shifts[QK_K/IQ1M_BLOCK_SIZE];
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq1_m);
+    }
+    return nrow * nblock * sizeof(block_iq1_m);
+}
+
+// ============================ 4-bit non-linear quants
+
+static inline int best_index_int8(int n, const int8_t * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
+        ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
+        float * scales, float * weight, uint8_t * L,
+        const int8_t * values,
+        const float * quant_weights,
+        const int ntry) {
+
+    float sigma2 = 0;
+    for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
+    sigma2 *= 2.f/super_block_size;
+
+    memset(q4, 0, super_block_size/2);
+    dh[0] = GGML_FP32_TO_FP16(0.f);
+
+    float max_scale = 0, amax_scale = 0;
+    for (int ib = 0; ib < super_block_size/block_size; ++ib) {
+        const float * xb = x + ib*block_size;
+        uint8_t * Lb = L + ib*block_size;
+        if (quant_weights) {
+            const float * qw = quant_weights + ib*block_size;
+            for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        } else {
+            for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
+        }
+        float amax = 0, max = 0;
+        for (int j = 0; j < block_size; ++j) {
+            float ax = fabsf(xb[j]);
+            if (ax > amax) {
+                amax = ax; max = xb[j];
+            }
+        }
+        if (amax < GROUP_MAX_EPS) {
+            scales[ib] = 0;
+            continue;
+        }
+        float d = ntry > 0 ? -max/values[0] : max/values[0];
+        float id = 1/d;
+        float sumqx = 0, sumq2 = 0;
+        for (int j = 0; j < block_size; ++j) {
+            float al = id*xb[j];
+            int l = best_index_int8(16, values, al);
+            Lb[j] = l;
+            float q = values[l];
+            float w = weight[j];
+            sumqx += w*q*xb[j];
+            sumq2 += w*q*q;
+        }
+        d = sumqx/sumq2;
+        float best = d*sumqx;
+        for (int itry = -ntry; itry <= ntry; ++itry) {
+            id = (itry + values[0])/max;
+            sumqx = sumq2 = 0;
+            for (int j = 0; j < block_size; ++j) {
+                float al = id*xb[j];
+                int l = best_index_int8(16, values, al);
+                float q = values[l];
+                float w = weight[j];
+                sumqx += w*q*xb[j];
+                sumq2 += w*q*q;
+            }
+            if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                d = sumqx/sumq2; best = d * sumqx;
+            }
+        }
+        scales[ib] = d;
+        float abs_d = fabsf(d);
+        if (abs_d > amax_scale) {
+            amax_scale = abs_d; max_scale = d;
+        }
+    }
+
+    if (super_block_size/block_size > 1) {
+        int nb = super_block_size/block_size;
+        memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
+        float d = -max_scale/32;
+        dh[0] = GGML_FP32_TO_FP16(d);
+        float id = d ? 1/d : 0.f;
+        for (int ib = 0; ib < super_block_size/block_size; ++ib) {
+            int l = nearest_int(id*scales[ib]);
+            l = MAX(-32, MIN(31, l));
+            float dl = d * l;
+            float idl = dl ? 1/dl : 0.f;
+            uint8_t * Lb = L + ib*block_size;
+            const float * xb = x + ib*block_size;
+            for (int j = 0; j < block_size; ++j) {
+                Lb[j] = best_index_int8(16, values, idl*xb[j]);
+            }
+            l += 32;
+            uint8_t l_l = l & 0xf;
+            uint8_t l_h = l >>  4;
+            if (ib%2 == 0) scales_l[ib/2] = l_l;
+            else scales_l[ib/2] |= (l_l << 4);
+            scales_h[ib/8] |= (l_h << 2*(ib%8));
+        }
+    } else {
+        dh[0] = GGML_FP32_TO_FP16(scales[0]);
+        if (ntry > 0) {
+            float id = scales[0] ? 1/scales[0] : 0;
+            for (int j = 0; j < super_block_size; ++j) {
+                L[j] = best_index_int8(16, values, id*x[j]);
+            }
+        }
+    }
+
+    for (int i = 0; i < super_block_size/32; ++i) {
+        for (int j = 0; j < 16; ++j) {
+            q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
+        }
+    }
+}
+
+size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK4_NL == 0);
+    int64_t nblock = n_per_row/QK4_NL;
+    char * qrow = (char *)dst;
+    uint8_t L[QK4_NL];
+    float weight[QK4_NL];
+    uint16_t unused_h;
+    uint8_t * unused_l = NULL;
+    float scale;
+    for (int64_t row = 0; row < nrow; ++row) {
+        block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
+        for (int ibl = 0; ibl < nblock; ++ibl) {
+            const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
+            quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+                    &scale, weight, L, kvalues_iq4nl, qw, 7);
+        }
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq4_nl);
+    }
+    return nrow * nblock * sizeof(block_iq4_nl);
+}
+
+//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
+void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
+    GGML_ASSERT(k%QK4_NL == 0);
+    int64_t nblock = k/QK4_NL;
+    uint8_t L[QK4_NL];
+    float weight[QK4_NL];
+    uint16_t unused_h;
+    uint8_t * unused_l = NULL;
+    float scale;
+    block_iq4_nl * iq4 = y;
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+        quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+                &scale, weight, L, kvalues_iq4nl, NULL, -1);
+    }
+}
+
+size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    uint8_t L[QK_K];
+    float weight[32];
+    float scales[QK_K/32];
+    for (int64_t row = 0; row < nrow; ++row) {
+        block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
+        for (int ibl = 0; ibl < nblock; ++ibl) {
+            const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
+            quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
+                    scales, weight, L, kvalues_iq4nl, qw, 7);
+        }
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq4_xs);
+    }
+    return nrow * nblock * sizeof(block_iq4_xs);
+}
+
+void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq4_xs(x, y, 1, k, NULL);
+}
+
+// =============================== 2.5625 bpw
+
+static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
+
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
+
+    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
+    const int      * kmap_q2xs       = iq2_data[gindex].map;
+    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
+
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 3;
+
+    const int64_t nbl = n/QK_K;
+
+    block_iq2_s * y = vy;
+
+    float scales[QK_K/16];
+    float weight[16];
+    float xval[16];
+    int8_t L[16];
+    int8_t Laux[16];
+    float  waux[16];
+    bool   is_on_grid[2];
+    bool   is_on_grid_aux[2];
+    uint8_t block_signs[2];
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        memset(&y[ibl], 0, sizeof(block_iq2_s));
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = 2*sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            const float * xb = xbl + 16*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 16*ib;
+                for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+            }
+            for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 2; ++k) {
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; s |= (1 << i);
+                    }
+                }
+                block_signs[k] = s;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
+            if (max < GROUP_MAX_EPS_IQ2_S) {
+                scales[ib] = 0;
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            is_on_grid[0] = is_on_grid[1] = true;
+            for (int is = -9; is <= 9; ++is) {
+                float id = (2*kMaxQ-1+is*0.1f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 2; ++k) {
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
+                    int grid_index = kmap_q2xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 16; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < 16; ++i) L[i] = Laux[i];
+                    for (int k = 0; k <  2; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 2; ++k) {
+                    if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 8; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 2*i);
+                        L[8*k + i] = l;
+                    }
+                    int grid_index = kmap_q2xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
+                        grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 16; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                scale = -scale;
+                for (int k = 0; k < 2; ++k) block_signs[k] = ~block_signs[k];
+            }
+            for (int k = 0; k < 2; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
+                int grid_index = kmap_q2xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
+                    printf("\n");
+                    GGML_ABORT("fatal error");
+                }
+                const int i8 = 2*ib + k;
+                y[ibl].qs[i8] = grid_index & 255;
+                y[ibl].qh[i8/4] |= ((grid_index >> 8) << 2*(i8%4));
+                y[ibl].qs[QK_K/8 + i8] = block_signs[k];
+            }
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d * 0.9875f);
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            if (ib%2 == 0) y[ibl].scales[ib/2] = l;
+            else y[ibl].scales[ib/2] |= (l << 4);
+        }
+    }
+}
+
+size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int64_t nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq2_s);
+    }
+    return nrow * nblock * sizeof(block_iq2_s);
+}
+
+void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq2_s(x, y, 1, k, NULL);
+}
+
+// =============================== data validation
+
+static bool validate_float(float f, size_t i) {
+    if (isinf(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+static bool isinf_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
+}
+
+static bool isnan_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
+}
+
+static bool validate_fp16(ggml_fp16_t f, size_t i) {
+    if (isinf_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i)) { \
+            return false; \
+        } \
+    }
+
+#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
+            return false; \
+        } \
+    }
+
+#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        for (size_t j = 0; j < (nr); ++j) { \
+            if (!validate_fp16(q[i].d[j], i)) { \
+                return false; \
+            } \
+        } \
+    }
+
+bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
+    if (type < 0 || type >= GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+        return false;
+    }
+
+    if (nbytes % ggml_type_size(type) != 0) {
+        fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
+        return false;
+    }
+
+    const size_t nb = nbytes/ggml_type_size(type);
+
+    switch (type) {
+        case GGML_TYPE_BF16:
+            {
+                int nans = 0;
+                int infs = 0;
+                const unsigned short * f = (const unsigned short *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    nans += (f[i] & 0x7fff) > 0x7f80;
+                    infs += (f[i] & 0x7fff) == 0x7f80;
+                }
+                if (nans) {
+                    fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
+                    return false;
+                }
+                if (infs) {
+                    fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
+                    return false;
+                }
+            } break;
+        case GGML_TYPE_F16:
+            {
+                const ggml_fp16_t * f = (const ggml_fp16_t *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 15 < nb; i += 16) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
+                    __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 16; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 7 < nb; i += 8) {
+                    uint16x8_t v = vld1q_u16(f + i);
+                    uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
+                    uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_fp16(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                const float * f = (const float *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 7 < nb; i += 8) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
+                    __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 3 < nb; i += 4) {
+                    uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
+                    uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
+                    uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 4; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F64:
+            {
+                const double * f = (const double *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q5_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
+            } break;
+        case GGML_TYPE_Q5_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q8_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
+            } break;
+        case GGML_TYPE_Q2_K:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
+            } break;
+        case GGML_TYPE_Q8_K:
+            {
+                const block_q8_K * q = (const block_q8_K *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(q[i].d, i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_TQ1_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_tq1_0, data, nb);
+            } break;
+        case GGML_TYPE_TQ2_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
+            } break;
+        case GGML_TYPE_IQ1_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ1_M:
+            {
+                const block_iq1_m * q = (const block_iq1_m *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    iq1m_scale_t scale;
+                    const uint16_t * sc = (const uint16_t *)q[i].scales;
+                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+                    if (!validate_fp16(scale.f16, i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ3_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
+            } break;
+
+        case GGML_TYPE_IQ3_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ4_XS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
+            } break;
+        case GGML_TYPE_IQ4_NL:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
+            } break;
+
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+            // nothing to validate
+            break;
+        default:
+            {
+                fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+                return false;
+            }
+    }
+
+    return true;
+}
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
new file mode 100644
index 000000000..d09173e11
--- /dev/null
+++ b/ggml/src/ggml-quants.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// NOTE: these functions are defined as GGML_API because they used by the CPU backend
+
+// Quantization
+GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
+
+GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
+
+GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
+
+GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq3_s_ref  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_iq2_s_ref  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
+
+// Dequantization
+GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+GGML_API void iq2xs_init_impl(enum ggml_type type);
+GGML_API void iq2xs_free_impl(enum ggml_type type);
+GGML_API void iq3xs_init_impl(int grid_size);
+GGML_API void iq3xs_free_impl(int grid_size);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt
new file mode 100644
index 000000000..f5acb8ec2
--- /dev/null
+++ b/ggml/src/ggml-rpc/CMakeLists.txt
@@ -0,0 +1,9 @@
+message(STATUS "Using RPC backend")
+
+ggml_add_backend_library(ggml-rpc
+                         ggml-rpc.cpp
+                        )
+
+if (WIN32)
+    target_link_libraries(ggml-rpc PRIVATE ws2_32)
+endif()
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
new file mode 100644
index 000000000..97873acc7
--- /dev/null
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -0,0 +1,1545 @@
+#include "ggml-rpc.h"
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <unordered_set>
+#ifdef _WIN32
+#  define WIN32_LEAN_AND_MEAN
+#  ifndef NOMINMAX
+#     define NOMINMAX
+#  endif
+#  include <windows.h>
+#  include <winsock2.h>
+#else
+#  include <arpa/inet.h>
+#  include <sys/socket.h>
+#  include <sys/types.h>
+#  include <netinet/in.h>
+#  include <netinet/tcp.h>
+#  include <netdb.h>
+#  include <unistd.h>
+#endif
+#include <cstring>
+
+#ifdef _WIN32
+typedef SOCKET sockfd_t;
+using ssize_t = __int64;
+#else
+typedef int sockfd_t;
+#endif
+
+// cross-platform socket
+struct socket_t {
+    sockfd_t fd;
+    socket_t(sockfd_t fd) : fd(fd) {}
+    ~socket_t() {
+        GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
+#ifdef _WIN32
+        closesocket(this->fd);
+#else
+        close(this->fd);
+#endif
+    }
+};
+
+// all RPC structures must be packed
+#pragma pack(push, 1)
+// ggml_tensor is serialized into rpc_tensor
+struct rpc_tensor {
+    uint64_t id;
+    uint32_t type;
+    uint64_t buffer;
+    uint32_t ne[GGML_MAX_DIMS];
+    uint32_t nb[GGML_MAX_DIMS];
+    uint32_t op;
+    int32_t  op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+    int32_t  flags;
+    uint64_t src[GGML_MAX_SRC];
+    uint64_t view_src;
+    uint64_t view_offs;
+    uint64_t data;
+    char name[GGML_MAX_NAME];
+
+    char padding[4];
+};
+
+static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
+
+// RPC commands
+enum rpc_cmd {
+    RPC_CMD_ALLOC_BUFFER = 0,
+    RPC_CMD_GET_ALIGNMENT,
+    RPC_CMD_GET_MAX_SIZE,
+    RPC_CMD_BUFFER_GET_BASE,
+    RPC_CMD_FREE_BUFFER,
+    RPC_CMD_BUFFER_CLEAR,
+    RPC_CMD_SET_TENSOR,
+    RPC_CMD_GET_TENSOR,
+    RPC_CMD_COPY_TENSOR,
+    RPC_CMD_GRAPH_COMPUTE,
+    RPC_CMD_GET_DEVICE_MEMORY,
+    RPC_CMD_INIT_TENSOR,
+    RPC_CMD_GET_ALLOC_SIZE,
+    RPC_CMD_COUNT,
+};
+
+struct rpc_msg_get_alloc_size_req {
+    rpc_tensor tensor;
+};
+
+struct rpc_msg_get_alloc_size_rsp {
+    uint64_t alloc_size;
+};
+
+struct rpc_msg_init_tensor_req {
+    rpc_tensor tensor;
+};
+
+struct rpc_msg_alloc_buffer_req {
+    uint64_t size;
+};
+
+struct rpc_msg_alloc_buffer_rsp {
+    uint64_t remote_ptr;
+    uint64_t remote_size;
+};
+
+struct rpc_msg_get_alignment_rsp {
+    uint64_t alignment;
+};
+
+struct rpc_msg_get_max_size_rsp {
+    uint64_t max_size;
+};
+
+struct rpc_msg_buffer_get_base_req {
+    uint64_t remote_ptr;
+};
+
+struct rpc_msg_buffer_get_base_rsp {
+    uint64_t base_ptr;
+};
+
+struct rpc_msg_free_buffer_req {
+    uint64_t remote_ptr;
+};
+
+struct rpc_msg_buffer_clear_req {
+    uint64_t remote_ptr;
+    uint8_t value;
+};
+
+struct rpc_msg_get_tensor_req {
+    rpc_tensor tensor;
+    uint64_t offset;
+    uint64_t size;
+};
+
+struct rpc_msg_copy_tensor_req {
+    rpc_tensor src;
+    rpc_tensor dst;
+};
+
+struct rpc_msg_copy_tensor_rsp {
+    uint8_t result;
+};
+
+struct rpc_msg_graph_compute_rsp {
+    uint8_t result;
+};
+
+struct rpc_msg_get_device_memory_rsp {
+    uint64_t free_mem;
+    uint64_t total_mem;
+};
+#pragma pack(pop)
+
+// RPC data structures
+
+static ggml_guid_t ggml_backend_rpc_guid() {
+    static ggml_guid guid = {0x99, 0x68, 0x5b, 0x6c, 0xd2, 0x83, 0x3d, 0x24, 0x25, 0x36, 0x72, 0xe1, 0x5b, 0x0e, 0x14, 0x03};
+    return &guid;
+}
+
+struct ggml_backend_rpc_buffer_type_context {
+    std::string endpoint;
+    std::string name;
+    size_t alignment;
+    size_t max_size;
+};
+
+struct ggml_backend_rpc_context {
+    std::string endpoint;
+    std::string name;
+};
+
+struct ggml_backend_rpc_buffer_context {
+    std::shared_ptr<socket_t> sock;
+    void * base_ptr;
+    uint64_t remote_ptr;
+};
+
+// RPC helper functions
+
+static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
+#ifdef _WIN32
+    if (fd == INVALID_SOCKET) {
+        return nullptr;
+    }
+#else
+    if (fd < 0) {
+        return nullptr;
+    }
+#endif
+    return std::make_shared<socket_t>(fd);
+}
+
+static bool set_no_delay(sockfd_t sockfd) {
+    int flag = 1;
+    // set TCP_NODELAY to disable Nagle's algorithm
+    int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
+    return ret == 0;
+}
+
+static bool set_reuse_addr(sockfd_t sockfd) {
+    int flag = 1;
+    int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
+    return ret == 0;
+}
+
+static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
+    struct sockaddr_in addr;
+    auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    auto sock_ptr = make_socket(sockfd);
+    if (sock_ptr == nullptr) {
+        return nullptr;
+    }
+    if (!set_no_delay(sockfd)) {
+        fprintf(stderr, "Failed to set TCP_NODELAY\n");
+        return nullptr;
+    }
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(port);
+    struct hostent * server = gethostbyname(host);
+    if (server == NULL) {
+        fprintf(stderr, "Cannot resolve host '%s'\n", host);
+        return nullptr;
+    }
+    memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
+    if (connect(sock_ptr->fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+        return nullptr;
+    }
+    return sock_ptr;
+}
+
+static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
+    auto client_socket_fd = accept(srv_sockfd, NULL, NULL);
+    auto client_socket = make_socket(client_socket_fd);
+    if (client_socket == nullptr) {
+        return nullptr;
+    }
+    if (!set_no_delay(client_socket_fd)) {
+        fprintf(stderr, "Failed to set TCP_NODELAY\n");
+        return nullptr;
+    }
+    return client_socket;
+}
+
+static std::shared_ptr<socket_t> create_server_socket(const char * host, int port) {
+    auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    auto sock = make_socket(sockfd);
+    if (sock == nullptr) {
+        return nullptr;
+    }
+    if (!set_reuse_addr(sockfd)) {
+        fprintf(stderr, "Failed to set SO_REUSEADDR\n");
+        return nullptr;
+    }
+    if (inet_addr(host) == INADDR_NONE) {
+        fprintf(stderr, "Invalid host address: %s\n", host);
+        return nullptr;
+    }
+    struct sockaddr_in serv_addr;
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = inet_addr(host);
+    serv_addr.sin_port = htons(port);
+
+    if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
+        return nullptr;
+    }
+    if (listen(sockfd, 1) < 0) {
+        return nullptr;
+    }
+    return sock;
+}
+
+static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
+    size_t bytes_sent = 0;
+    while (bytes_sent < size) {
+        ssize_t n = send(sockfd, (const char *)data + bytes_sent, size - bytes_sent, 0);
+        if (n < 0) {
+            return false;
+        }
+        bytes_sent += n;
+    }
+    return true;
+}
+
+static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
+    size_t bytes_recv = 0;
+    while (bytes_recv < size) {
+        ssize_t n = recv(sockfd, (char *)data + bytes_recv, size - bytes_recv, 0);
+        if (n <= 0) {
+            return false;
+        }
+        bytes_recv += n;
+    }
+    return true;
+}
+
+static bool send_msg(sockfd_t sockfd, const void * msg, size_t msg_size) {
+    if (!send_data(sockfd, &msg_size, sizeof(msg_size))) {
+        return false;
+    }
+    return send_data(sockfd, msg, msg_size);
+}
+
+static bool recv_msg(sockfd_t sockfd, void * msg, size_t msg_size) {
+    uint64_t size;
+    if (!recv_data(sockfd, &size, sizeof(size))) {
+        return false;
+    }
+    if (size != msg_size) {
+        return false;
+    }
+    return recv_data(sockfd, msg, msg_size);
+}
+
+static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
+    uint64_t size;
+    if (!recv_data(sockfd, &size, sizeof(size))) {
+        return false;
+    }
+    try {
+        input.resize(size);
+    } catch (const std::bad_alloc & e) {
+        fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size);
+        return false;
+    }
+    return recv_data(sockfd, input.data(), size);
+}
+
+static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
+    size_t pos = endpoint.find(':');
+    if (pos == std::string::npos) {
+        return false;
+    }
+    host = endpoint.substr(0, pos);
+    port = std::stoi(endpoint.substr(pos + 1));
+    return true;
+}
+
+// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
+// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
+static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
+    uint8_t cmd_byte = cmd;
+    if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
+        return false;
+    }
+    if (!send_data(sock->fd, &input_size, sizeof(input_size))) {
+        return false;
+    }
+    if (!send_data(sock->fd, input, input_size)) {
+        return false;
+    }
+    // TODO: currently the output_size is always known, do we need support for commands with variable output size?
+    // even if we do, we can skip sending output_size from the server for commands with known output size
+    uint64_t out_size;
+    if (!recv_data(sock->fd, &out_size, sizeof(out_size))) {
+        return false;
+    }
+    if (out_size != output_size) {
+        return false;
+    }
+    if (!recv_data(sock->fd, output, output_size)) {
+        return false;
+    }
+    return true;
+}
+
+// RPC client-side implementation
+
+static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+    static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
+    static bool initialized = false;
+
+    auto it = sockets.find(endpoint);
+    if (it != sockets.end()) {
+        if (auto sock = it->second.lock()) {
+            return sock;
+        }
+    }
+    std::string host;
+    int port;
+    if (!parse_endpoint(endpoint, host, port)) {
+        return nullptr;
+    }
+#ifdef _WIN32
+    if (!initialized) {
+        WSADATA wsaData;
+        int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
+        if (res != 0) {
+            return nullptr;
+        }
+        initialized = true;
+    }
+#else
+    GGML_UNUSED(initialized);
+#endif
+    auto sock = socket_connect(host.c_str(), port);
+    if (sock == nullptr) {
+        return nullptr;
+    }
+    GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
+    sockets[endpoint] = sock;
+    return sock;
+}
+
+static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_msg_free_buffer_req request = {ctx->remote_ptr};
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
+    GGML_ASSERT(status);
+    delete ctx;
+}
+
+static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    if (ctx->base_ptr != nullptr) {
+        return ctx->base_ptr;
+    }
+    rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
+    rpc_msg_buffer_get_base_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
+    return ctx->base_ptr;
+}
+
+static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
+    rpc_tensor result;
+    result.id = reinterpret_cast<uint64_t>(tensor);
+    result.type = tensor->type;
+    if (tensor->buffer) {
+        ggml_backend_buffer_t buffer = tensor->buffer;
+        ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+        result.buffer = ctx->remote_ptr;
+    } else {
+        result.buffer = 0;
+    }
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result.ne[i] = tensor->ne[i];
+        result.nb[i] = tensor->nb[i];
+    }
+    result.op = tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result.op_params[i] = tensor->op_params[i];
+    }
+    result.flags = tensor->flags;
+    for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+        result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+    }
+    result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+    result.view_offs = tensor->view_offs;
+    result.data = reinterpret_cast<uint64_t>(tensor->data);
+    snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+    return result;
+}
+
+static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+
+    // CUDA backend on the server pads everything to 512 due to CUDA limitations.
+    // Due to bandwidth constraints, we only call the server init tensor functions if necessary.
+    // In particular, only quantized tensors need padding
+    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+        rpc_msg_init_tensor_req request;
+
+        request.tensor = serialize_tensor(tensor);
+
+        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
+        GGML_ASSERT(status);
+    }
+}
+
+static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
+    size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
+    std::vector<uint8_t> input(input_size, 0);
+    rpc_tensor rpc_tensor = serialize_tensor(tensor);
+    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
+    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
+    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
+    GGML_ASSERT(status);
+}
+
+static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_msg_get_tensor_req request;
+    request.tensor = serialize_tensor(tensor);
+    request.offset = offset;
+    request.size = size;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
+    GGML_ASSERT(status);
+}
+
+static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+    // check if src and dst are on the same server
+    ggml_backend_buffer_t src_buffer = src->buffer;
+    ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
+    ggml_backend_buffer_t dst_buffer = dst->buffer;
+    ggml_backend_rpc_buffer_context * dst_ctx = (ggml_backend_rpc_buffer_context *)dst_buffer->context;
+    if (src_ctx->sock != dst_ctx->sock) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_msg_copy_tensor_req request;
+    request.src = serialize_tensor(src);
+    request.dst = serialize_tensor(dst);
+    rpc_msg_copy_tensor_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
+static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
+    GGML_ASSERT(status);
+}
+
+static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_rpc_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_rpc_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_rpc_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_rpc_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_rpc_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_rpc_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_rpc_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    return buft_ctx->name.c_str();
+}
+
+static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    rpc_msg_alloc_buffer_req request = {size};
+    rpc_msg_alloc_buffer_rsp response;
+    auto sock = get_socket(buft_ctx->endpoint);
+    bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
+    GGML_ASSERT(status);
+    if (response.remote_ptr != 0) {
+        ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
+            ggml_backend_rpc_buffer_interface,
+            new ggml_backend_rpc_buffer_context{sock, nullptr, response.remote_ptr},
+            response.remote_size);
+        return buffer;
+    } else {
+        return nullptr;
+    }
+}
+
+static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
+    rpc_msg_get_alignment_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.alignment;
+}
+
+static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    return buft_ctx->alignment;
+}
+
+static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
+    rpc_msg_get_max_size_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.max_size;
+}
+
+static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    return buft_ctx->max_size;
+}
+
+static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    // See comments in init_tensor.
+    if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+        ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+        auto sock = get_socket(buft_ctx->endpoint);
+
+        rpc_msg_get_alloc_size_req request;
+
+        request.tensor = serialize_tensor(tensor);
+
+        rpc_msg_get_alloc_size_rsp response;
+        bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
+        GGML_ASSERT(status);
+
+        return response.alloc_size;
+    } else {
+        return ggml_nbytes(tensor);
+    }
+}
+
+static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_rpc_buffer_type_name,
+    /* .alloc_buffer     = */ ggml_backend_rpc_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_rpc_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_rpc_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_rpc_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
+
+static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+
+    return rpc_ctx->name.c_str();
+}
+
+static void ggml_backend_rpc_free(ggml_backend_t backend) {
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    delete rpc_ctx;
+    delete backend;
+}
+
+static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    // this is no-op because we don't have any async operations
+}
+
+static void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited) {
+    if (tensor == nullptr) {
+        return;
+    }
+    if (visited.find(tensor) != visited.end()) {
+        return;
+    }
+    visited.insert(tensor);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        add_tensor(tensor->src[i], tensors, visited);
+    }
+    add_tensor(tensor->view_src, tensors, visited);
+    tensors.push_back(serialize_tensor(tensor));
+}
+
+static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+    uint32_t n_nodes = cgraph->n_nodes;
+    std::vector<rpc_tensor> tensors;
+    std::unordered_set<ggml_tensor*> visited;
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        add_tensor(cgraph->nodes[i], tensors, visited);
+    }
+    // serialization format:
+    // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    uint32_t n_tensors = tensors.size();
+    int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
+    output.resize(output_size, 0);
+    memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+    }
+    uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
+    *out_ntensors = n_tensors;
+    rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
+    memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
+
+static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    std::vector<uint8_t> input;
+    serialize_graph(cgraph, input);
+    rpc_msg_graph_compute_rsp response;
+    auto sock = get_socket(rpc_ctx->endpoint);
+    bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return (enum ggml_status)response.result;
+}
+
+static ggml_backend_i ggml_backend_rpc_interface = {
+    /* .get_name                = */ ggml_backend_rpc_name,
+    /* .free                    = */ ggml_backend_rpc_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ ggml_backend_rpc_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_rpc_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+    // NOTE: buffer types are allocated and never freed; this is by design
+    static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
+    auto it = buft_map.find(endpoint);
+    if (it != buft_map.end()) {
+        return it->second;
+    }
+    auto sock = get_socket(endpoint);
+    if (sock == nullptr) {
+        fprintf(stderr, "Failed to connect to %s\n", endpoint);
+        return nullptr;
+    }
+    size_t alignment = get_alignment(sock);
+    size_t max_size = get_max_size(sock);
+    ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
+        /* .endpoint  = */ endpoint,
+        /* .name      = */ "RPC[" + std::string(endpoint) + "]",
+        /* .alignment = */ alignment,
+        /* .max_size  = */ max_size
+    };
+
+    ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
+        /* .iface   = */ ggml_backend_rpc_buffer_type_interface,
+        /* .device  = */ ggml_backend_rpc_add_device(endpoint),
+        /* .context = */ buft_ctx
+    };
+    buft_map[endpoint] = buft;
+    return buft;
+}
+
+ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
+    ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
+        /* .endpoint  = */ endpoint,
+        /* .name      = */ "RPC[" + std::string(endpoint) + "]",
+    };
+
+    ggml_backend_t backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_rpc_guid(),
+        /* .interface = */ ggml_backend_rpc_interface,
+        /* .device    = */ ggml_backend_rpc_add_device(endpoint),
+        /* .context   = */ ctx
+    };
+    return backend;
+}
+
+bool ggml_backend_is_rpc(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
+}
+
+static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
+    rpc_msg_get_device_memory_rsp response;
+    bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
+    GGML_ASSERT(status);
+    *free = response.free_mem;
+    *total = response.total_mem;
+}
+
+void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
+    auto sock = get_socket(endpoint);
+    if (sock == nullptr) {
+        *free = 0;
+        *total = 0;
+        return;
+    }
+    get_device_memory(sock, free, total);
+}
+
+// RPC server-side implementation
+
+class rpc_server {
+public:
+    rpc_server(ggml_backend_t backend) : backend(backend) {}
+    ~rpc_server();
+
+    void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
+    void get_alignment(rpc_msg_get_alignment_rsp & response);
+    void get_max_size(rpc_msg_get_max_size_rsp & response);
+    bool buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response);
+    bool free_buffer(const rpc_msg_free_buffer_req & request);
+    bool buffer_clear(const rpc_msg_buffer_clear_req & request);
+    bool set_tensor(const std::vector<uint8_t> & input);
+    bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
+    bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
+    bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
+    bool init_tensor(const rpc_msg_init_tensor_req & request);
+    bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
+
+private:
+    ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
+    ggml_tensor * create_node(uint64_t id,
+                              struct ggml_context * ctx,
+                              const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                              std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
+
+
+    ggml_backend_t backend;
+    std::unordered_set<ggml_backend_buffer_t> buffers;
+};
+
+bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
+    ggml_backend_buffer_type_t buft;
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    if (tensor->buffer == nullptr) {
+        //No buffer allocated.
+        buft = ggml_backend_get_default_buffer_type(backend);
+    } else {
+        buft = tensor->buffer->buft;
+    }
+
+    response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
+
+    ggml_free(ctx);
+    return true;
+}
+
+void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
+    response.remote_ptr = 0;
+    response.remote_size = 0;
+    if (buffer != nullptr) {
+        response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
+        response.remote_size = buffer->size;
+        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
+        buffers.insert(buffer);
+    } else {
+        GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+    }
+}
+
+void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
+    response.alignment = alignment;
+}
+
+void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
+    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+    GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
+    response.max_size = max_size;
+}
+
+bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
+    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
+    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
+        return false;
+    }
+    void * base = ggml_backend_buffer_get_base(buffer);
+    response.base_ptr = reinterpret_cast<uint64_t>(base);
+    return true;
+}
+
+bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
+    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
+    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
+        return false;
+    }
+    ggml_backend_buffer_free(buffer);
+    buffers.erase(buffer);
+    return true;
+}
+
+bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
+    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
+    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
+    if (buffers.find(buffer) == buffers.end()) {
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
+        return false;
+    }
+    ggml_backend_buffer_clear(buffer, request.value);
+    return true;
+}
+
+ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+    ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = tensor->nb[i];
+    }
+    result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+    if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
+        result->buffer = nullptr;
+    }
+
+    if (result->buffer) {
+        // require that the tensor data does not go beyond the buffer end
+        uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+        uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+        GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+        GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    }
+
+    result->op = (ggml_op) tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result->op_params[i] = tensor->op_params[i];
+    }
+    result->flags = tensor->flags;
+    result->data = reinterpret_cast<void *>(tensor->data);
+    ggml_set_name(result, tensor->name);
+    return result;
+}
+
+
+bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
+    // serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
+    if (input.size() < sizeof(rpc_tensor) + sizeof(uint64_t)) {
+        return false;
+    }
+    const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
+    uint64_t offset;
+    memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
+    const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
+
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
+        ggml_free(ctx);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+            GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+        }
+    }
+
+    const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
+    ggml_backend_tensor_set(tensor, data, offset, size);
+    ggml_free(ctx);
+    return true;
+}
+
+bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    // Call the backend's buffer_init_tensor function
+    ggml_backend_buffer_t buffer = tensor->buffer;
+    if (buffer && buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    } else {
+        GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
+    }
+
+    if (tensor->extra != nullptr) {
+        // This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
+        // Currently unimplemented.
+        GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
+        ggml_free(ctx);
+        return false;
+    }
+
+    ggml_free(ctx);
+    return true;
+}
+
+bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+    if (tensor == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
+        ggml_free(ctx);
+        return false;
+    }
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
+
+    // sanitize tensor->data
+    {
+        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
+        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
+
+        if (request.tensor.data + request.offset < p0 ||
+            request.tensor.data + request.offset >= p1 ||
+            request.size > (p1 - request.tensor.data - request.offset)) {
+                GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
+        }
+    }
+
+    response.resize(request.size, 0);
+    ggml_backend_tensor_get(tensor, response.data(), request.offset, request.size);
+    ggml_free(ctx);
+    return true;
+}
+
+bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response) {
+    struct ggml_init_params params {
+        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    ggml_tensor * src = deserialize_tensor(ctx, &request.src);
+    ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
+    if (src == nullptr || dst == nullptr) {
+        GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
+        ggml_free(ctx);
+        return false;
+    }
+
+    uint64_t src_size   = (uint64_t) ggml_nbytes(src);
+    uint64_t dst_data   = (uint64_t) dst->data;
+    uint64_t dst_base   = (uint64_t) ggml_backend_buffer_get_base(dst->buffer);
+    uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
+
+    if (dst_data + src_size > dst_base + dst_buf_sz) {
+        GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
+                         "    write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
+                         "    buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
+                         __func__,
+                         dst_data,
+                         dst_data + src_size,
+                         dst_base,
+                         dst_base + dst_buf_sz);
+        ggml_free(ctx);
+        return false;
+    }
+
+    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
+                     __func__, (void*) src->buffer, (void*) dst->buffer);
+
+    response.result = ggml_backend_buffer_copy_tensor(src, dst);
+    ggml_free(ctx);
+    return true;
+}
+
+ggml_tensor * rpc_server::create_node(uint64_t id,
+                                      struct ggml_context * ctx,
+                                      const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                                      std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
+    if (id == 0) {
+        return nullptr;
+    }
+    if (tensor_map.find(id) != tensor_map.end()) {
+        return tensor_map[id];
+    }
+    const rpc_tensor * tensor = tensor_ptrs.at(id);
+    struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+    if (result == nullptr) {
+        return nullptr;
+    }
+    tensor_map[id] = result;
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+    }
+    result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+    result->view_offs = tensor->view_offs;
+    return result;
+}
+
+bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response) {
+    // serialization format:
+    // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    if (input.size() < sizeof(uint32_t)) {
+        return false;
+    }
+    uint32_t n_nodes;
+    memcpy(&n_nodes, input.data(), sizeof(n_nodes));
+    if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t)) {
+        return false;
+    }
+    const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes));
+    uint32_t n_tensors;
+    memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors));
+    if (input.size() < sizeof(uint32_t) + n_nodes*sizeof(uint64_t) + sizeof(uint32_t) + n_tensors*sizeof(rpc_tensor)) {
+        return false;
+    }
+    const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
+    GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
+
+    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+    graph->n_nodes = n_nodes;
+    std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+    for (uint32_t i = 0; i < n_tensors; i++) {
+        tensor_ptrs[tensors[i].id] = &tensors[i];
+    }
+    std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        int64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+    }
+    ggml_status status = ggml_backend_graph_compute(backend, graph);
+    response.result = status;
+    ggml_free(ctx);
+    return true;
+}
+
+rpc_server::~rpc_server() {
+    for (auto buffer : buffers) {
+        ggml_backend_buffer_free(buffer);
+    }
+}
+
+static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
+    rpc_server server(backend);
+    while (true) {
+        uint8_t cmd;
+        if (!recv_data(sockfd, &cmd, 1)) {
+            break;
+        }
+        if (cmd >= RPC_CMD_COUNT) {
+            // fail fast if the command is invalid
+            fprintf(stderr, "Unknown command: %d\n", cmd);
+            break;
+        }
+        switch (cmd) {
+            case RPC_CMD_ALLOC_BUFFER: {
+                rpc_msg_alloc_buffer_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_alloc_buffer_rsp response;
+                server.alloc_buffer(request, response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_GET_ALLOC_SIZE: {
+                rpc_msg_get_alloc_size_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_get_alloc_size_rsp response;
+                server.get_alloc_size(request, response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_GET_ALIGNMENT: {
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_get_alignment_rsp response;
+                server.get_alignment(response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_GET_MAX_SIZE: {
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_get_max_size_rsp response;
+                server.get_max_size(response);
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_BUFFER_GET_BASE: {
+                rpc_msg_buffer_get_base_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_buffer_get_base_rsp response;
+                if (!server.buffer_get_base(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_FREE_BUFFER: {
+                rpc_msg_free_buffer_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                if (!server.free_buffer(request)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_BUFFER_CLEAR: {
+                rpc_msg_buffer_clear_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                if (!server.buffer_clear(request)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_SET_TENSOR: {
+                std::vector<uint8_t> input;
+                if (!recv_msg(sockfd, input)) {
+                    return;
+                }
+                if (!server.set_tensor(input)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_INIT_TENSOR: {
+                rpc_msg_init_tensor_req request;
+                if (!recv_msg(sockfd, &request,sizeof(request))) {
+                    return;
+                }
+                if (!server.init_tensor(request)) {
+                    return;
+                }
+                if (!send_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_GET_TENSOR: {
+                rpc_msg_get_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                std::vector<uint8_t> response;
+                if (!server.get_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, response.data(), response.size())) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_COPY_TENSOR: {
+                rpc_msg_copy_tensor_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
+                    return;
+                }
+                rpc_msg_copy_tensor_rsp response;
+                if (!server.copy_tensor(request, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_GRAPH_COMPUTE: {
+                std::vector<uint8_t> input;
+                if (!recv_msg(sockfd, input)) {
+                    return;
+                }
+                rpc_msg_graph_compute_rsp response;
+                if (!server.graph_compute(input, response)) {
+                    return;
+                }
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            case RPC_CMD_GET_DEVICE_MEMORY: {
+                if (!recv_msg(sockfd, nullptr, 0)) {
+                    return;
+                }
+                rpc_msg_get_device_memory_rsp response;
+                response.free_mem = free_mem;
+                response.total_mem = total_mem;
+                if (!send_msg(sockfd, &response, sizeof(response))) {
+                    return;
+                }
+                break;
+            }
+            default: {
+                fprintf(stderr, "Unknown command: %d\n", cmd);
+                return;
+            }
+        }
+    }
+}
+
+void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
+    std::string host;
+    int port;
+    if (!parse_endpoint(endpoint, host, port)) {
+        return;
+    }
+#ifdef _WIN32
+    {
+        WSADATA wsaData;
+        int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
+        if (res != 0) {
+            fprintf(stderr, "WSAStartup failed: %d\n", res);
+            return;
+        }
+    }
+#endif
+    auto server_socket = create_server_socket(host.c_str(), port);
+    if (server_socket == nullptr) {
+        fprintf(stderr, "Failed to create server socket\n");
+        return;
+    }
+    while (true) {
+        auto client_socket = socket_accept(server_socket->fd);
+        if (client_socket == nullptr) {
+            fprintf(stderr, "Failed to accept client connection\n");
+            return;
+        }
+        printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
+        fflush(stdout);
+        rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
+        printf("Client connection closed\n");
+        fflush(stdout);
+    }
+#ifdef _WIN32
+    WSACleanup();
+#endif
+}
+
+// device interface
+
+struct ggml_backend_rpc_device_context {
+    std::string endpoint;
+    std::string name;
+};
+
+static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
+    // TODO: obtain value from the server
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_rpc_device_get_name(dev);
+    props->description = ggml_backend_rpc_device_get_description(dev);
+    props->type        = ggml_backend_rpc_device_get_type(dev);
+    ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ggml_backend_rpc_init(ctx->endpoint.c_str());
+
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
+    //TODO: call the remote backend and cache the results
+    return true;
+}
+
+static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
+    return buft_ctx->endpoint == dev_ctx->endpoint;
+}
+
+static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
+    /* .get_name             = */ ggml_backend_rpc_device_get_name,
+    /* .get_description      = */ ggml_backend_rpc_device_get_description,
+    /* .get_memory           = */ ggml_backend_rpc_device_get_memory,
+    /* .get_type             = */ ggml_backend_rpc_device_get_type,
+    /* .get_props            = */ ggml_backend_rpc_device_get_props,
+    /* .init_backend         = */ ggml_backend_rpc_device_init,
+    /* .get_buffer_type      = */ ggml_backend_rpc_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op          = */ ggml_backend_rpc_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_rpc_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
+    return "RPC";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 0;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
+        return (void *)ggml_backend_rpc_add_device;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
+    /* .get_name         = */ ggml_backend_rpc_reg_get_name,
+    /* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_rpc_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_rpc_reg(void) {
+    static struct ggml_backend_reg ggml_backend_rpc_reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_rpc_reg_i,
+        /* .context     = */ NULL,
+    };
+
+    return &ggml_backend_rpc_reg;
+}
+
+ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+    static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
+
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (dev_map.find(endpoint) != dev_map.end()) {
+        return dev_map[endpoint];
+    }
+
+    ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
+        /* .endpoint = */ endpoint,
+        /* .name     = */ "RPC[" + std::string(endpoint) + "]",
+    };
+
+    ggml_backend_dev_t dev = new ggml_backend_device {
+        /* .iface   = */ ggml_backend_rpc_device_i,
+        /* .reg     = */ ggml_backend_rpc_reg(),
+        /* .context = */ ctx,
+    };
+
+    dev_map[endpoint] = dev;
+
+    return dev;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
new file mode 100644
index 000000000..3579a311a
--- /dev/null
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -0,0 +1,84 @@
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
+    message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
+endif()
+
+check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
+
+if (DEFINED ENV{ONEAPI_ROOT})
+    message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
+elseif(SUPPORTS_SYCL)
+    message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
+        If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
+        source /opt/intel/oneapi/setvars.sh")
+else()
+    message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
+endif()
+message(STATUS "SYCL found")
+#todo: AOT
+
+ggml_add_backend_library(ggml-sycl
+                         ggml-sycl.cpp
+                         ../../include/ggml-sycl.h
+                        )
+
+if (GGML_SYCL_F16)
+    if (GGML_SYCL_TARGET STREQUAL "AMD")
+        message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
+    endif()
+    add_compile_definitions(GGML_SYCL_F16)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+
+if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+    # INFO: Allowed Sub_group_sizes are not consistent through all
+    # hip targets. For example, 64 is used for certain models, but the backend
+    # does not support it.
+    # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+else()
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
+endif()
+
+file(GLOB   GGML_HEADERS_SYCL "*.hpp")
+file(GLOB   GGML_SOURCES_SYCL "*.cpp")
+target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
+
+find_package(DNNL)
+message("-- DNNL found:" ${DNNL_FOUND})
+
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
+    add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
+else()
+    add_compile_definitions(GGML_SYCL_DNNL=0)
+endif()
+
+if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+    target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
+endif()
+
+if (WIN32)
+    find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)
+    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+else()
+    if (GGML_SYCL_TARGET STREQUAL "INTEL")
+        target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+    elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+        add_compile_definitions(GGML_SYCL_NVIDIA)
+        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
+    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+        if (NOT GGML_SYCL_DEVICE_ARCH)
+            message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
+        endif()
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa")
+        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
+    endif()
+
+    if (GGML_SYCL_DEVICE_ARCH)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}")
+  endif()
+endif()
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
new file mode 100644
index 000000000..b1df4e5db
--- /dev/null
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -0,0 +1,34 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_BACKEND_HPP
+#define GGML_SYCL_BACKEND_HPP
+
+#include "concat.hpp"
+#include "common.hpp"
+#include "conv.hpp"
+#include "convert.hpp"
+#include "dequantize.hpp"
+#include "dmmv.hpp"
+#include "mmq.hpp"
+#include "mmvq.hpp"
+#include "rope.hpp"
+#include "norm.hpp"
+#include "softmax.hpp"
+#include "tsembd.hpp"
+#include "im2col.hpp"
+#include "wkv6.hpp"
+#include "outprod.hpp"
+#include "element_wise.hpp"
+#include "gla.hpp"
+
+#endif // GGML_SYCL_BACKEND_HPP
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
new file mode 100644
index 000000000..022e7b763
--- /dev/null
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -0,0 +1,101 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "common.hpp"
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+
+int get_current_device_id() {
+  return dpct::dev_mgr::instance().current_device_id();
+}
+
+void* ggml_sycl_host_malloc(size_t size) try {
+  if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
+    return nullptr;
+  }
+
+  void* ptr = nullptr;
+  // allow to use dpct::get_in_order_queue() for host malloc
+  dpct::err0 err = CHECK_TRY_ERROR(
+      ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue()));
+
+  if (err != 0) {
+    // clear the error
+    GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0,    "syclGetErrorString is not supported");
+    return nullptr;
+  }
+
+  return ptr;
+} catch (sycl::exception const& exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_sycl_host_free(void* ptr) try {
+  // allow to use dpct::get_in_order_queue() for host malloc
+  SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+} catch (sycl::exception const& exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+bool gpu_has_xmx(sycl::device &dev) {
+    return dev.has(sycl::aspect::ext_intel_matrix);
+}
+
+int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
+  const int64_t max_range = std::numeric_limits<int>::max();
+  int64_t sycl_down_blk_size = block_size;
+  int64_t global_range = accumulate_block_num * sycl_down_blk_size;
+  while(global_range > max_range) {
+      sycl_down_blk_size /= 2;
+      global_range = accumulate_block_num * sycl_down_blk_size;
+  }
+  return sycl_down_blk_size;
+}
+
+void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_sycl_op_flatten_t op) try {
+
+    const bool use_src1 = src1 != nullptr;
+    if(use_src1)
+      GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+
+    // dd = data device
+    float * src0_ddf = (float *) src0->data;
+    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
+    float *  dst_ddf = (float *) dst->data;
+
+    ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
+    ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
+    ggml_sycl_pool_alloc<float>  dst_f(ctx.pool());
+
+    ggml_sycl_set_device(ctx.device);
+    queue_ptr main_stream = ctx.stream();
+    // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
+        // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
+
+    // do the computation
+    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    // print_ggml_tensor("tensor", dst);
+}
+catch (sycl::exception const &exc) {
+
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
new file mode 100644
index 000000000..abad847ca
--- /dev/null
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -0,0 +1,684 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_COMMON_HPP
+#define GGML_SYCL_COMMON_HPP
+
+#include <fstream>
+#include <iostream>
+
+#include "dpct/helper.hpp"
+#include "ggml-sycl.h"
+#include "presets.hpp"
+#if GGML_SYCL_DNNL
+#include "dnnl.hpp"
+#include "dnnl_sycl.hpp"
+#endif
+
+#define GGML_COMMON_DECL_SYCL
+#define GGML_COMMON_IMPL_SYCL
+/* suppress warning spam */
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnested-anon-types"
+#include "ggml-common.h"
+#pragma clang diagnostic pop
+
+void* ggml_sycl_host_malloc(size_t size);
+void ggml_sycl_host_free(void* ptr);
+
+static int g_ggml_sycl_debug = 0;
+#define GGML_SYCL_DEBUG(...)        \
+  do {                              \
+    if (g_ggml_sycl_debug)          \
+      fprintf(stderr, __VA_ARGS__); \
+  } while (0)
+
+#define CHECK_TRY_ERROR(expr)                                            \
+  [&]() {                                                                \
+    try {                                                                \
+      expr;                                                              \
+      return dpct::success;                                              \
+    } catch (std::exception const& e) {                                  \
+      std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
+                << ", line:" << __LINE__ << ", func:" << __func__        \
+                << std::endl;                                            \
+      return dpct::default_error;                                        \
+    }                                                                    \
+  }()
+
+
+#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
+#define VER_4VEC 610 // todo for hardward optimize.
+#define VER_GEN9 700 // todo for hardward optimize.
+#define VER_GEN12 1000000 // todo for hardward optimize.
+#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize.
+
+#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares
+
+// define for XMX in Intel GPU
+// TODO: currently, it's not used for XMX really.
+#if !defined(GGML_SYCL_FORCE_MMQ)
+    #define SYCL_USE_XMX
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244 4267) // possible loss of data
+#endif
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_SYCL_DMMV_X
+#define GGML_SYCL_DMMV_X 32
+#endif
+#ifndef GGML_SYCL_MMV_Y
+#define GGML_SYCL_MMV_Y 1
+#endif
+
+typedef sycl::queue *queue_ptr;
+
+enum ggml_sycl_backend_gpu_mode {
+  SYCL_UNSET_GPU_MODE = -1,
+  SYCL_SINGLE_GPU_MODE = 0,
+  SYCL_MUL_GPU_MODE
+};
+
+static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+static void crash() {
+  int* ptr = NULL;
+  *ptr = 0;
+}
+
+[[noreturn]] static void ggml_sycl_error(
+    const char* stmt,
+    const char* func,
+    const char* file,
+    const int line,
+    const char* msg) {
+  fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
+  fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
+  GGML_ABORT("SYCL error");
+}
+
+#define SYCL_CHECK(err)                     \
+  do {                                      \
+    auto err_ = (err);                      \
+    if (err_ != 0)                          \
+      ggml_sycl_error(                      \
+          #err,                             \
+          __func__,                         \
+          __FILE__,                         \
+          __LINE__,                         \
+          "Meet error in this line code!"); \
+  } while (0)
+
+#if DPCT_COMPAT_RT_VERSION >= 11100
+#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_SYCL_ASSUME(x)
+#endif // DPCT_COMPAT_RT_VERSION >= 11100
+
+#ifdef GGML_SYCL_F16
+typedef sycl::half dfloat; // dequantize float
+typedef sycl::half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef sycl::float2 dfloat2;
+#endif // GGML_SYCL_F16
+
+#define MMVQ_MAX_BATCH_SIZE  8
+
+static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+static int g_all_sycl_device_count = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
+
+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
+    SYCL_UNSET_GPU_MODE;
+
+static void* g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 0; // disabled by default
+static size_t g_scratch_offset = 0;
+
+[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
+  stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
+                "current GPU architecture.\n";
+  // __trap();
+  std::exit(1);
+
+  (void)bad_arch; // suppress unused function warning
+}
+
+int get_current_device_id();
+
+inline dpct::err0 ggml_sycl_set_device(const int device) try {
+
+  int current_device_id;
+  SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
+
+  // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
+  // current_device_id=%d\n", device, current_device);
+  if (device == current_device_id) {
+    return 0;
+  }
+
+  return CHECK_TRY_ERROR(dpct::select_device(device));
+} catch (sycl::exception const& exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  crash();
+  std::exit(1);
+}
+
+//////////////////////
+
+struct ggml_sycl_device_info {
+    int device_count;
+
+    struct sycl_device_info {
+        int     cc;                 // compute capability
+        // int     nsm;                // number of streaming multiprocessors
+        // size_t  smpb;               // max. shared memory per block
+        bool    vmm;                // virtual memory support
+        size_t  total_vram;
+    };
+
+    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+
+    int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
+};
+
+const ggml_sycl_device_info & ggml_sycl_info();
+
+struct ggml_sycl_pool {
+    virtual ~ggml_sycl_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
+};
+
+template<typename T>
+struct ggml_sycl_pool_alloc {
+    ggml_sycl_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
+    }
+
+    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_sycl_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_sycl_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_sycl_pool_alloc() = default;
+    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
+    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
+};
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+  void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                       // tensors
+  dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                        [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+struct ggml_backend_sycl_context {
+    int device;
+    std::string name;
+
+    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
+
+    explicit ggml_backend_sycl_context(int device) :
+        device(device),
+        name(GGML_SYCL_NAME + std::to_string(device)) {
+    }
+
+    queue_ptr stream(int device, int stream) {
+        if (qptrs[device][stream] == nullptr) {
+            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
+        }
+        return qptrs[device][stream];
+    }
+
+    queue_ptr stream() {
+        return stream(device, 0);
+    }
+
+#if GGML_SYCL_DNNL
+    dnnl::engine make_engine(sycl::queue* q) {
+        // Get the device associated with the queue
+        sycl::device dev = q->get_device();
+        // Get the context associated with the queue
+        sycl::context ctx = q->get_context();
+        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
+        return eng;
+    }
+
+    std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
+    std::unordered_map<sycl::queue*, dnnl::engine> engine_map;
+    dnnl::stream stream_dnnl(int device, int _stream) {
+        auto q = stream(device, _stream);
+        return stream_dnnl(q);
+    }
+    dnnl::engine engine_dnnl(sycl::queue* qptr) {
+        auto it = engine_map.find(qptr);
+        if (it == engine_map.end()) {
+            auto eng = make_engine(qptr);
+            engine_map[qptr] = eng;
+            return eng;
+        }
+        else
+        {
+            return it->second;
+        }
+    }
+    dnnl::stream stream_dnnl(sycl::queue* qptr) {
+        auto it = stream_map.find(qptr);
+        if (it == stream_map.end()) {
+            auto eng = engine_dnnl(qptr);
+            auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
+            stream_map[qptr] = stream;
+            return stream;
+        }
+        else
+        {
+            return it->second;
+        }
+    }
+    dnnl::stream stream_dnnl() {
+        return stream_dnnl(device, 0);
+    }
+#endif
+
+    // pool
+    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+
+    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
+
+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
+
+    ggml_sycl_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(stream(device,0), device);
+        }
+        return *pools[device];
+    }
+
+    ggml_sycl_pool & pool() {
+        return pool(device);
+    }
+
+    ggml_sycl_pool & host_pool(int device) {
+        if (host_pools[device] == nullptr) {
+            host_pools[device] = new_pool_for_host(stream(device, 0), device);
+        }
+        return *host_pools[device];
+    }
+
+    ggml_sycl_pool & host_pool() { return host_pool(device); }
+};
+
+// common device functions
+
+static __dpct_inline__ float warp_reduce_sum(float x,
+    const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
+    }
+    return x;
+}
+
+static __dpct_inline__ sycl::float2
+warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
+            mask);
+        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
+            mask);
+    }
+    return a;
+}
+
+static __dpct_inline__ float warp_reduce_max(float x,
+    const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
+            item_ct1.get_sub_group(), x, mask));
+    }
+    return x;
+}
+
+// Helper for vec loading aligned data
+template <typename Tp, int n>
+inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
+    return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
+}
+
+// Helper for accessing pointers with no warnings
+template <typename Tp, int dim>
+static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
+    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
+int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
+
+typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const queue_ptr &main_stream);
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1));
+    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) /
+                   ne3;
+    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) %
+                   ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0;
+         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_sycl {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(ggml_backend_sycl_context & ctx,
+                    const struct ggml_tensor *src0,
+                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
+                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
+                    queue_ptr stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] = std::min<unsigned int>(
+                ne1, block_size / (unsigned int)block_dims[2]);
+            block_dims[0] = std::min(
+                std::min<unsigned int>(
+                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
+                                   (unsigned int)block_dims[1]),
+                64U);
+
+            sycl::range<3> block_nums(
+                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                (ne1 + block_dims[1] - 1) / block_dims[1],
+                (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(),
+                                                 {sycl::aspect::fp16});
+
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(
+                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
+                                s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:16: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(),
+                                             {sycl::aspect::fp16});
+
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+                                            ne2, ne3, ne10, ne11, ne12, ne13,
+                                            s1, s2, s3, s11, s12, s13,
+                                            item_ct1);
+                    });
+            }
+        }
+        GGML_UNUSED(ctx);
+    }
+};
+
+template <class op>
+inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd,
+                                   const queue_ptr &main_stream) {
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
+             (sycl::half *)dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
+             main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+bool gpu_has_xmx(sycl::device &dev);
+
+void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_sycl_op_flatten_t op);
+
+#endif // GGML_SYCL_COMMON_HPP
diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp
new file mode 100644
index 000000000..d41cfd3a6
--- /dev/null
+++ b/ggml/src/ggml-sycl/concat.cpp
@@ -0,0 +1,197 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "concat.hpp"
+#include "common.hpp"
+
+static void concat_f32_dim0(const float *x, const float *y, float *dst,
+                            const int ne0, const int ne00,
+                            const sycl::nd_item<3> &item_ct1) {
+  int nidx = item_ct1.get_local_id(2) +
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) {
+    return;
+  }
+  // operation
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (nidx < ne00) { // src0
+    int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                     item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+  } else {
+    int offset_src =
+        nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) +
+        item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_dim1(const float *x, const float *y, float *dst,
+                            const int ne0, const int ne01,
+                            const sycl::nd_item<3> &item_ct1) {
+  int nidx = item_ct1.get_local_id(2) +
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) {
+    return;
+  }
+  // operation
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (item_ct1.get_group(1) < (size_t) ne01) { // src0
+    int offset_src =
+        nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
+    dst[offset_dst] = x[offset_src];
+  } else {
+    int offset_src =
+        nidx + (item_ct1.get_group(1) - ne01) * ne0 +
+        item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_dim2(const float *x, const float *y, float *dst,
+                            const int ne0, const int ne02,
+                            const sycl::nd_item<3> &item_ct1) {
+  int nidx = item_ct1.get_local_id(2) +
+             item_ct1.get_group(2) * item_ct1.get_local_range(2);
+  if (nidx >= ne0) {
+    return;
+  }
+  // operation
+  int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                   item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+  if (item_ct1.get_group(0) < (size_t) ne02) { // src0
+    int offset_src = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+  } else {
+    int offset_src =
+        nidx + item_ct1.get_group(1) * ne0 +
+        (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = y[offset_src];
+  }
+}
+
+static void concat_f32_sycl(const float *x, const float *y, float *dst,
+                            int ne00, int ne01, int ne02, int ne0, int ne1,
+                            int ne2, int dim, queue_ptr stream) {
+  int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
+  sycl::range<3> gridDim(ne2, ne1, num_blocks);
+  switch (dim) {
+  case 0:
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
+        });
+    break;
+  case 1:
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
+        });
+    break;
+  // dim >=2 will be dispatched to the default path
+  default:
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
+        });
+    break;
+  }
+}
+
+// non-contiguous kernel (slow)
+static void concat_f32_sycl_non_cont(
+    queue_ptr stream, const char *src0, const char *src1, char *dst,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
+    uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
+    int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10,
+    uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1,
+    int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
+    uint64_t nb3, int32_t dim) {
+  sycl::range<3> gridDim(ne3, ne2, ne1);
+  stream->parallel_for(
+      sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
+      [=](sycl::nd_item<3> item_ct1) {
+        int64_t i3 = item_ct1.get_group(0);
+        int64_t i2 = item_ct1.get_group(1);
+        int64_t i1 = item_ct1.get_group(2);
+
+        int64_t o[4] = {0, 0, 0, 0};
+        o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+        const float *x;
+
+        for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
+             i0 += item_ct1.get_local_range(2)) {
+          if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
+                                (i0)*nb00);
+          } else {
+            x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
+                                (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
+          }
+
+          float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
+
+          *y = *x;
+        }
+      });
+}
+
+void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+  const ggml_tensor *src0 = dst->src[0];
+  const ggml_tensor *src1 = dst->src[1];
+  queue_ptr stream = ctx.stream();
+
+  const int32_t dim = ((int32_t *)dst->op_params)[0];
+
+  if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+    const float *src0_d = (const float *)src0->data;
+    const float *src1_d = (const float *)src1->data;
+
+    float *dst_d = (float *)dst->data;
+
+    if (dim != 3) {
+      for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_sycl(
+            src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
+            dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
+            src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
+      }
+    } else {
+      const size_t size0 = ggml_nbytes(src0);
+      const size_t size1 = ggml_nbytes(src1);
+
+      SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
+      SYCL_CHECK(CHECK_TRY_ERROR(
+          stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
+    }
+  } else
+    concat_f32_sycl_non_cont(
+        stream, (const char *)src0->data, (const char *)src1->data,
+        (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
+        src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
+        src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
+        dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+}
diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp
new file mode 100644
index 000000000..e5cb7314c
--- /dev/null
+++ b/ggml/src/ggml-sycl/concat.hpp
@@ -0,0 +1,20 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONCAT_HPP
+#define GGML_SYCL_CONCAT_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
+
+#endif // GGML_SYCL_CONCAT_HPP
diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp
new file mode 100644
index 000000000..ddba601e1
--- /dev/null
+++ b/ggml/src/ggml-sycl/conv.cpp
@@ -0,0 +1,100 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "conv.hpp"
+
+static  void conv_transpose_1d_kernel(
+        const int s0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2,
+        const int src1_ne0, const int dst_ne0,
+        const float * src0, const float * src1,  float * dst,
+        const sycl::nd_item<3> &item_ct1) {
+    int global_index = item_ct1.get_local_id(2) +
+                       item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (global_index >= output_size) {
+        return;
+    }
+
+    int out_index = global_index / dst_ne0;
+
+    float accumulator = 0;
+
+    for (int c = 0; c < src0_ne2; c++) {
+        int idx = global_index % dst_ne0;
+
+        int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
+        int input_offset = src1_ne0 * c;
+
+        for (int i = 0; i < src1_ne0; i++) {
+            if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
+                continue;
+            }
+            int weight_idx = idx - i*s0;
+
+            float kernel_weight = src0[kernel_offset + weight_idx];
+            float input_value =  src1[input_offset+i];
+
+            accumulator += kernel_weight * input_value;
+        }
+    }
+    dst[global_index] = accumulator;
+}
+
+static void conv_transpose_1d_f32_f32_sycl(
+    const int s0, const int output_size,
+    const int src0_ne0, const int src0_ne1, const int src0_ne2,
+    const int src1_ne0, const int dst_ne0,
+    const float *src0, const float *src1, float *dst,
+    const queue_ptr& stream) {
+
+    const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, 1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(
+            block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            conv_transpose_1d_kernel(
+                s0, output_size,
+                src0_ne0, src0_ne1, src0_ne2,
+                src1_ne0, dst_ne0,
+                src0, src1, dst, item_ct1);
+        });
+}
+
+void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+
+    float * dst_d = (float *)dst->data;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+
+    const int s0 = opts[0];
+
+    const int64_t output_size = ggml_nelements(dst);
+
+    conv_transpose_1d_f32_f32_sycl(s0, output_size,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        src1->ne[0], dst->ne[0],
+        src0_d, src1_d, dst_d, stream);
+}
+
diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp
new file mode 100644
index 000000000..f9e60dc75
--- /dev/null
+++ b/ggml/src/ggml-sycl/conv.hpp
@@ -0,0 +1,20 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONV_HPP
+#define GGML_SYCL_CONV_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
+
+#endif // GGML_SYCL_CONV_HPP
diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp
new file mode 100644
index 000000000..05b01db2d
--- /dev/null
+++ b/ggml/src/ggml-sycl/convert.cpp
@@ -0,0 +1,547 @@
+#include "convert.hpp"
+#include "dequantize.hpp"
+#include "presets.hpp"
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
+                             const sycl::nd_item<3> &item_ct1) {
+    const int64_t i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                       item_ct1.get_local_id(2));
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t ib = i/qk; // block index
+    const int64_t iqs = (i%qk)/qr; // quant index
+    const int64_t iybs = i - i%qk; // y block start index
+    const int64_t y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0] = v.x();
+    y[iybs + iqs + y_offset] = v.y();
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_sycl(const void *__restrict__ vx,
+                                  dst_t *__restrict__ y, const int64_t k,
+                                  dpct::queue_ptr stream) {
+    const int64_t num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb32 = k / 32;
+    const int64_t nb = (k + 255) / 256;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_0(vx, y, nb32, item_ct1);
+                             });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb32 = k / 32;
+    const int64_t nb = (k + 255) / 256;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_1(vx, y, nb32, item_ct1);
+                             });
+    }
+}
+
+
+template <typename dst_t>
+static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
+                                     dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
+                                        dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_s(
+                                     vx, y, item_ct1, iq1s_grid_gpu
+                                     );
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
+                                        dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_m(
+                                     vx, y, item_ct1, iq1s_grid_gpu
+                                     );
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
+                                        dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_xxs(
+                                     vx, y, item_ct1, iq2xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k,
+                                       dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_xs(
+                                     vx, y, item_ct1, iq2xs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
+                                      dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_s(vx, y, item_ct1);
+                             });
+        });
+    }
+}
+
+
+template <typename dst_t>
+static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
+                                        dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_xxs(
+                                     vx, y, item_ct1, iq3xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
+                                        dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_s(
+                                     vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
+                             });
+        });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k,
+                                       dpct::queue_ptr stream) {
+    const int64_t nb = (k + QK_K - 1) / QK_K;
+#if QK_K == 64
+    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
+#else
+      {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                  cgh.parallel_for(
+                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                            sycl::range<3>(1, 1, 32),
+                                        sycl::range<3>(1, 1, 32)),
+                      [=](sycl::nd_item<3> item_ct1) {
+                            dequantize_block_iq4_xs(vx, y, item_ct1);
+                      });
+            });
+      }
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k,
+                                       dpct::queue_ptr stream) {
+    const int64_t nb = (k + QK_K - 1) / QK_K;
+      {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                  cgh.parallel_for(
+                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                            sycl::range<3>(1, 1, 32),
+                                        sycl::range<3>(1, 1, 32)),
+                      [=](sycl::nd_item<3> item_ct1) {
+                            dequantize_block_iq4_nl(vx, y, item_ct1);
+                      });
+            });
+      }
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int64_t work_group_size = item_ct1.get_local_range(2);
+    const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
+
+    // make each work-item deal with more elements since sycl global range can not exceed max int
+    const src_t * x = (const src_t *) vx;
+    for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
+        y[i] = x[i];
+    }
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_sycl(const void *__restrict__ vx,
+                               dst_t *__restrict__ y, const int64_t k,
+                               dpct::queue_ptr stream) {
+    const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
+
+    // decrease global range when it exceeds the max int
+    int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
+    sycl::range<3> block_nums(1, 1, num_blocks);
+    sycl::range<3> local_range(1, 1, local_size);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * local_range, local_range),
+            [=](sycl::nd_item<3> item_ct1) {
+                convert_unary<src_t>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_sycl;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_sycl;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_sycl;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_sycl;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_sycl;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_sycl;
+        case GGML_TYPE_IQ2_XXS:
+            return dequantize_row_iq2_xxs_sycl;
+        case GGML_TYPE_IQ2_XS:
+            return dequantize_row_iq2_xs_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_sycl;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_sycl;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_sycl;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_F32:
+            return convert_unary_sycl<float>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_sycl;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_sycl;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_sycl;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_sycl;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_sycl;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_sycl;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_sycl;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_sycl;
+        case GGML_TYPE_IQ2_XXS:
+            return dequantize_row_iq2_xxs_sycl;
+        case GGML_TYPE_IQ2_XS:
+            return dequantize_row_iq2_xs_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_sycl;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_sycl;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_sycl;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_sycl;
+        case GGML_TYPE_F16:
+            return convert_unary_sycl<sycl::half>;
+        default:
+            return nullptr;
+    }
+}
diff --git a/ggml/src/ggml-sycl/convert.hpp b/ggml/src/ggml-sycl/convert.hpp
new file mode 100644
index 000000000..0ce2874aa
--- /dev/null
+++ b/ggml/src/ggml-sycl/convert.hpp
@@ -0,0 +1,27 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_CONVERT_HPP
+#define GGML_SYCL_CONVERT_HPP
+
+#include "common.hpp"
+
+template <typename T>
+using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
+                             int64_t k, dpct::queue_ptr stream);
+typedef to_t_sycl_t<float> to_fp32_sycl_t;
+typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
+
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+
+#endif // GGML_SYCL_CONVERT_HPP
diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp
new file mode 100644
index 000000000..b8304c3a2
--- /dev/null
+++ b/ggml/src/ggml-sycl/dequantize.hpp
@@ -0,0 +1,698 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_DEQUANTIZE_HPP
+#define GGML_SYCL_DEQUANTIZE_HPP
+
+#include "common.hpp"
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+
+static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_SYCL_F16
+    // v = v - {8.0f, 8.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 8.0f) * d;
+    v.s1() = (v.s1() - 8.0f) * d;
+
+#else
+    v.x() = (v.x() - 8.0f) * d;
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    // v = v + {m, m};
+    v.s0() = sycl::fma(v.s0(), d, m);
+    v.s1() = sycl::fma(v.s1(), d, m);
+
+#else
+    v.x() = sycl::fma(v.x(), d, m);
+    v.y() = sycl::fma(v.y(), d, m);
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_SYCL_F16
+    // v = v - {16.0f, 16.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 16.0f) * d;
+    v.s1() = (v.s1() - 16.0f) * d;
+
+#else
+    v.x() = (v.x() - 16.0f) * d;
+    v.y() = (v.y() - 16.0f) * d;
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    // v = v + {m, m};
+    v.s0() = sycl::fma(v.s0(), d, m);
+    v.s1() = sycl::fma(v.s1(), d, m);
+#else
+    v.x() = sycl::fma(v.x(), d, m);
+    v.y() = sycl::fma(v.y(), d, m);
+#endif // GGML_SYCL_F16
+}
+
+static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x() = x[ib].qs[iqs + 0];
+    v.y() = x[ib].qs[iqs + 1];
+
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    v.s0() *= d;
+    v.s1() *= d;
+#else
+    v.x() *= d;
+    v.y() *= d;
+#endif // GGML_SYCL_F16
+}
+
+template<typename dst_t>
+static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+
+    // assume 32 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid/8;
+    const int64_t ir  = tid%8;
+    const int64_t ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = sycl::vec<sycl::half, 1>(x->d)
+                        .convert<float, sycl::rounding_mode::automatic>()[0];
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >>  4) + dm;
+    }
+}
+
+template<typename dst_t>
+static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+
+    // assume 32 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid/8;
+    const int64_t ir  = tid%8;
+    const int64_t ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const sycl::float2 d =
+        x->dm.convert<float, sycl::rounding_mode::automatic>();
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l + 0] = d.x() * (q[l] & 0xF) + d.y();
+        y[l + 16] = d.x() * (q[l] >> 4) + d.y();
+    }
+}
+
+
+//================================== k-quants
+
+template<typename dst_t>
+static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t n   = tid/32;
+    const int64_t l   = tid - 32*n;
+    const int64_t is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    float dall = x[i].dm[0];
+    float dmin = x[i].dm[1];
+    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
+    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
+    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int64_t is = tid/16;  // 0 or 1
+    const int64_t il = tid%16;  // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    float dall = x[i].dm[0];
+    float dmin = x[i].dm[1];
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+#if QK_K == 256
+    const int64_t r = item_ct1.get_local_id(2) / 4;
+    const int64_t tid = r/2;
+    const int64_t is0 = r%2;
+    const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
+    const int64_t n = tid / 4;
+    const int64_t j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int64_t is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    float d_all = x[i].d;
+    float dl = d_all * (us - 32);
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t is  = tid/16;  // 0 or 1
+    const int64_t il  = tid%16;  // 0...15
+    const int64_t im  = il/8;    // 0...1
+    const int64_t in  = il%8;    // 0...7
+
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    const uint8_t h = x[i].hmask[in] >> (2*is + im);
+    const float   d = (float)x[i].d;
+
+    if (is == 0) {
+        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    } else {
+        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    }
+#endif
+
+}
+
+#if QK_K == 256
+static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63;
+        m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+#endif
+
+template<typename dst_t>
+static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const int64_t i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 32 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid/8;
+    const int64_t ir  = tid%8;
+    const int64_t is  = 2*il;
+    const int64_t n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const sycl::half2 dm = x[i].dm;
+    const float dall = dm[0];
+    const float dmin = dm[1];
+
+    if (tid < 12)
+        scales_local[tid] = x[i].scales[tid];
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, scales_local, sc, m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, scales_local, sc, m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
+        y[l +32] = d2 * (q_vec[l] >>  4) - m2;
+    }
+#else
+    const int64_t tid = item_ct1.get_local_id(2);
+    const uint8_t * q = x[i].qs;
+    dst_t * y = yy + i*QK_K;
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
+    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const int64_t i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 64 threads - this is very slightly better than the one below
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il  = tid/16;   // il is in 0...3
+    const int64_t ir  = tid%16;   // ir is in 0...15
+    const int64_t is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    const int64_t tid = item_ct1.get_local_id(2);
+    const uint8_t q = x[i].qs[tid];
+    const int64_t im = tid/8;  // 0...3
+    const int64_t in = tid%8;  // 0...7
+    const int64_t is = tid/16; // 0 or 1
+    const uint8_t h = x[i].qh[in] >> im;
+    const float d = x[i].d;
+    dst_t * y = yy + i*QK_K + tid;
+    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const int64_t i = item_ct1.get_group(2);
+#if QK_K == 256
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t ip  = tid/32;   // ip is 0 or 1
+    const int64_t il  = tid - 32*ip; // 0...32
+    const int64_t is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+
+    // assume 32 threads
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t ip  = tid/16;         // 0 or 1
+    const int64_t il  = tid - 16*ip;    // 0...15
+
+    dst_t * y = yy + i*QK_K + 16*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t   ql = x[i].ql[16*ip + il];
+    const uint8_t   qh = x[i].qh[il] >> (2*ip);
+    const int8_t  * sc = x[i].scales;
+
+    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                     const sycl::nd_item<3> &item_ct1,
+                                     const uint64_t *iq2xxs_grid_ptr,
+                                     const uint8_t *ksigns_iq2xs_ptr,
+                                     const uint8_t *kmask_iq2xs_ptr) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]);
+    const uint32_t aux32 = q2[2] | (q2[3] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
+#else
+    assert(false);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                    const sycl::nd_item<3> &item_ct1,
+                                    const uint64_t *iq2xs_grid,
+                                    const uint8_t *ksigns_iq2xs,
+                                    const uint8_t *kmask_iq2xs) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq2_xs * x = (const block_iq2_xs *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * q2 = x[i].qs + 4*ib;
+    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+#pragma unroll
+    for (int j = 0; j < 8; ++j)
+        y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+#else
+    assert(false);
+
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                     const sycl::nd_item<3> &item_ct1,
+                                     const uint32_t *iq3xxs_grid,
+                                     const uint8_t *ksigns_iq2xs,
+                                     const uint8_t *kmask_iq2xs) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t  * q3 = x[i].qs + 8*ib;
+    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
+    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1,
+                       const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq3_s * x = (const block_iq3_s *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
+    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
+    const uint8_t signs = x[i].signs[4*ib + il];
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1,
+                       const uint32_t *iq1s_grid_gpu) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq1_s * x = (const block_iq1_s  *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+#pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                       const sycl::nd_item<3> &item_ct1,
+                       const uint32_t *iq1s_grid_gpu) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq1_m * x = (const block_iq1_m  *) vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+#pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                        const sycl::nd_item<3> &item_ct1) {
+
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
+
+
+template <typename dst_t>
+__dpct_inline__ static void
+dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
+                        const sycl::nd_item<3> &item_ct1) {
+    const int64_t i = item_ct1.get_group(2);
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const int64_t tid = item_ct1.get_local_id(2);
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+#pragma unroll
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+}
+
+
+#endif // GGML_SYCL_DEQUANTIZE_HPP
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
new file mode 100644
index 000000000..0d097357c
--- /dev/null
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -0,0 +1,1023 @@
+#include "convert.hpp"
+#include "dmmv.hpp"
+#include "dequantize.hpp"
+#include "presets.hpp"
+
+
+static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const sycl::half *x = (const sycl::half *)vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                   const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = item_ct1.get_local_id(2);
+
+    const int iter_stride = 2*GGML_SYCL_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+// partial sum for each thread
+#ifdef GGML_SYCL_F16
+    sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_SYCL_F16
+
+    for (int i = 0; i < ncols; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                        y[iybs + iqs + j / qr + y_offset]};
+
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+    const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
+    for (int mask = mask_start; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+#ifdef GGML_SYCL_F16
+        dst[row] = tmp.x() + tmp.y();
+#else
+        dst[row] = tmp;
+#endif // GGML_SYCL_F16
+    }
+}
+
+static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
+                                                          nrows, item_ct1);
+            });
+    }
+}
+
+/*
+DPCT1110:4: The total declared local variable size in device function
+dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+#else
+    const int tid = item_ct1.get_local_id(2) /
+                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
+    const int ix = item_ct1.get_local_id(2) %
+                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const sycl::float2 dall =
+            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x() * sum1 - dall.y() * sum2;
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:5: The total declared local variable size in device function
+dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0....15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset =  32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+#else
+
+    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
+    const int in = offset/8;                                 // 0 or 1
+    const int im = offset%8;                                 // 0...7
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint8_t * s = x[i].scales;
+
+        const float dall = (float)x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t hl = x[i].hmask[im+l] >> in;
+            const uint8_t ql = q[l];
+            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:6: The total declared local variable size in device function
+dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;                            // 0...3
+    const int ir  = tid - step*il;                       // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;
+
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
+        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 4; ++l) {
+            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
+            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
+                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
+               dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
+
+    }
+#else
+    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    float tmp = 0;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const float   * y = yy + i*QK_K + step;
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+        const float d = (float)x[i].dm[0];
+        const float m = (float)x[i].dm[1];
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
+                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
+        }
+        tmp += sum;
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:7: The total declared local variable size in device function
+dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2);
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
+    const int ix = item_ct1.get_local_id(2) % 2;
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1  = 1 << (2*im);
+    const uint8_t hm2  = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
+        for (int l = 0; l < n; ++l) {
+            sum.x() +=
+                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
+                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
+            sum.y() +=
+                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
+                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
+            sum.z() +=
+                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
+                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
+            sum.w() +=
+                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
+                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
+                       sum.w() * sc[5]) -
+               dmin * smin;
+    }
+
+#else
+    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
+    const int step = tid * K_QUANTS_PER_ITERATION;
+    const int im = step/8;
+    const int in = step%8;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const int8_t  * s = x[i].scales;
+        const float   * y = yy + i*QK_K + step;
+        const float     d = x[i].d;
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            const uint8_t h = x[i].qh[in+j] >> im;
+            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
+                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+#if QK_K == 256
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t  * s  = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+#else
+
+    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
+    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + step;
+        const uint8_t * ql = x[i].ql + step;
+        const uint8_t * qh = x[i].qh + step;
+        const int8_t  * s  = x[i].scales;
+
+        const float d = x[i+0].d;
+
+        float sum = 0;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
+                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
+        }
+        tmp += sum;
+
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+
+static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+void ggml_sycl_op_dequantize_mul_mat_vec(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_SYCL_F16
+    ggml_sycl_pool_alloc<sycl::half> src1_dfloat_a(ctx.pool());
+    sycl::half *src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = src1_dfloat_a.alloc(ne00);
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+        GGML_ASSERT(to_fp16_sycl != nullptr);
+        to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
+    }
+#else
+    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_SYCL_F16
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_F16:
+            convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
+            GGML_ABORT("fatal error");
+            break;
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
diff --git a/ggml/src/ggml-sycl/dmmv.hpp b/ggml/src/ggml-sycl/dmmv.hpp
new file mode 100644
index 000000000..bd8373564
--- /dev/null
+++ b/ggml/src/ggml-sycl/dmmv.hpp
@@ -0,0 +1,27 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_DMMV_HPP
+#define GGML_SYCL_DMMV_HPP
+
+#include "common.hpp"
+
+
+void ggml_sycl_op_dequantize_mul_mat_vec(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+
+#endif // GGML_SYCL_DMMV_HPP
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
new file mode 100644
index 000000000..c96395be6
--- /dev/null
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -0,0 +1,2968 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_DPCT_HELPER_HPP
+#define GGML_SYCL_DPCT_HELPER_HPP
+
+#include <sycl/sycl.hpp>
+#include <sycl/half_type.hpp>
+#include <syclcompat/math.hpp>
+#include <oneapi/mkl.hpp>
+#include <map>
+
+#include "ggml.h"
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#elif defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#error "Only support Windows and Linux."
+#endif
+
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define DPCT_COMPATIBILITY_TEMP (900)
+
+#if defined(_MSC_VER)
+#define __dpct_align__(n) __declspec(align(n))
+#define __dpct_inline__ __forceinline
+#else
+#define __dpct_align__(n) __attribute__((aligned(n)))
+#define __dpct_inline__ __inline__ __attribute__((always_inline))
+#endif
+
+#if defined(_MSC_VER)
+#define __dpct_noinline__ __declspec(noinline)
+#else
+#define __dpct_noinline__ __attribute__((noinline))
+#endif
+
+inline std::string get_device_type_name(const sycl::device &Device) {
+    auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+    switch (DeviceType) {
+    case sycl::info::device_type::cpu:
+        return "cpu";
+    case sycl::info::device_type::gpu:
+        return "gpu";
+    case sycl::info::device_type::host:
+        return "host";
+    case sycl::info::device_type::accelerator:
+        return "acc";
+    default:
+        return "unknown";
+    }
+}
+
+inline std::string get_device_backend_and_type(const sycl::device &device) {
+    std::stringstream device_type;
+    sycl::backend backend = device.get_backend();
+    device_type <<  backend << ":" << get_device_type_name(device);
+    return device_type.str();
+}
+
+template <typename Ts> struct matrix_info_t {
+    oneapi::mkl::transpose transpose_info[2];
+    Ts                     value_info[2];
+    std::int64_t           size_info[3];
+    std::int64_t           ld_info[3];
+    std::int64_t           groupsize_info;
+};
+
+namespace dpct
+{
+    typedef sycl::queue *queue_ptr;
+    typedef sycl::event *event_ptr;
+    typedef char *device_ptr;
+    typedef uint8_t byte_t;
+    typedef sycl::buffer<byte_t> buffer_t;
+
+    /// SYCL default exception handler
+    inline auto exception_handler = [](sycl::exception_list exceptions)
+    {
+        for (std::exception_ptr const &e : exceptions)
+        {
+            try
+            {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e)
+            {
+                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                          << e.what() << std::endl
+                          << "Exception caught at file:" << __FILE__
+                          << ", line:" << __LINE__ << std::endl;
+            }
+        }
+    };
+
+    enum error_code
+    {
+        success = 0,
+        default_error = 999
+    };
+
+    enum memcpy_direction
+    {
+        host_to_host,
+        host_to_device,
+        device_to_host,
+        device_to_device,
+        automatic
+    };
+
+    enum memory_region
+    {
+        global = 0, // device global memory
+        constant,   // device constant memory
+        local,      // device local memory
+        shared,     // memory which can be accessed by host and device
+    };
+
+    enum class library_data_t : unsigned char
+    {
+        real_float = 0,
+        complex_float,
+        real_double,
+        complex_double,
+        real_half,
+        complex_half,
+        real_bfloat16,
+        complex_bfloat16,
+        real_int4,
+        complex_int4,
+        real_uint4,
+        complex_uint4,
+        real_int8,
+        complex_int8,
+        real_uint8,
+        complex_uint8,
+        real_int16,
+        complex_int16,
+        real_uint16,
+        complex_uint16,
+        real_int32,
+        complex_int32,
+        real_uint32,
+        complex_uint32,
+        real_int64,
+        complex_int64,
+        real_uint64,
+        complex_uint64,
+        real_int8_4,
+        real_int8_32,
+        real_uint8_4,
+        library_data_t_size
+    };
+
+    template <typename T>
+    struct DataType
+    {
+        using T2 = T;
+    };
+    template <typename T>
+    struct DataType<sycl::vec<T, 2>>
+    {
+        using T2 = std::complex<T>;
+    };
+
+    static void destroy_event(event_ptr event)
+    {
+        delete event;
+    }
+
+    static inline unsigned int get_tid()
+    {
+#if defined(__linux__)
+        return syscall(SYS_gettid);
+#elif defined(_WIN64)
+        return GetCurrentThreadId();
+#else
+#error "Only support Windows and Linux."
+#endif
+    }
+
+    namespace detail
+    {
+        static void get_version(const sycl::device &dev, int &major, int &minor)
+        {
+            // Version string has the following format:
+            // a. OpenCL<space><major.minor><space><vendor-specific-information>
+            // b. <major.minor>
+            // c. <AmdGcnArchName> e.g gfx1030
+            std::string ver;
+            ver = dev.get_info<sycl::info::device::version>();
+            std::string::size_type i = 0;
+            while (i < ver.size()) {
+              if (isdigit(ver[i]))
+                break;
+              i++;
+            }
+            major = std::stoi(&(ver[i]));
+            while (i < ver.size()) {
+              if (ver[i] == '.')
+                break;
+              i++;
+            }
+            if (i < ver.size()) {
+              // a. and b.
+              i++;
+              minor = std::stoi(&(ver[i]));
+            } else {
+              // c.
+              minor = 0;
+            }
+        }
+
+        template <typename tag, typename T>
+        class generic_error_type
+        {
+        public:
+            generic_error_type() = default;
+            generic_error_type(T value) : value{value} {}
+            operator T() const { return value; }
+
+        private:
+            T value;
+        };
+
+    } // namespace detail
+
+    /// Pitched 2D/3D memory data.
+    class pitched_data
+    {
+    public:
+        pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+        pitched_data(void *data, size_t pitch, size_t x, size_t y)
+            : _data(data), _pitch(pitch), _x(x), _y(y) {}
+
+        void *get_data_ptr() { return _data; }
+        void set_data_ptr(void *data) { _data = data; }
+
+        size_t get_pitch() { return _pitch; }
+        void set_pitch(size_t pitch) { _pitch = pitch; }
+
+        size_t get_x() { return _x; }
+        void set_x(size_t x) { _x = x; }
+
+        size_t get_y() { return _y; }
+        void set_y(size_t y) { _y = y; }
+
+    private:
+        void *_data;
+        size_t _pitch, _x, _y;
+    };
+
+    class device_info
+    {
+    public:
+        // get interface
+        const char *get_name() const { return _name; }
+        char *get_name() { return _name; }
+        template <typename WorkItemSizesTy = sycl::range<3>,
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
+        auto get_max_work_item_sizes() const
+        {
+            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+                return sycl::range<3>(_max_work_item_sizes_i[0],
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
+            else
+            {
+                return _max_work_item_sizes_i;
+            }
+        }
+        template <typename WorkItemSizesTy = sycl::range<3>,
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
+        auto get_max_work_item_sizes()
+        {
+            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+                return sycl::range<3>(_max_work_item_sizes_i[0],
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
+            else
+            {
+                return _max_work_item_sizes_i;
+            }
+        }
+        bool get_host_unified_memory() const { return _host_unified_memory; }
+        int get_major_version() const { return _major; }
+        int get_minor_version() const { return _minor; }
+        int get_integrated() const { return _integrated; }
+        int get_max_clock_frequency() const { return _frequency; }
+        int get_max_compute_units() const { return _max_compute_units; }
+        int get_max_work_group_size() const { return _max_work_group_size; }
+        int get_max_sub_group_size() const { return _max_sub_group_size; }
+        int get_max_work_items_per_compute_unit() const
+        {
+            return _max_work_items_per_compute_unit;
+        }
+        int get_max_register_size_per_work_group() const
+        {
+            return _max_register_size_per_work_group;
+        }
+        template <typename NDRangeSizeTy = size_t *,
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
+        auto get_max_nd_range_size() const
+        {
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+                return _max_nd_range_size;
+            else
+                return _max_nd_range_size_i;
+        }
+        template <typename NDRangeSizeTy = size_t *,
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
+        auto get_max_nd_range_size()
+        {
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+                return _max_nd_range_size;
+            else
+                return _max_nd_range_size_i;
+        }
+        size_t get_global_mem_size() const { return _global_mem_size; }
+        size_t get_local_mem_size() const { return _local_mem_size; }
+        size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
+        /// Returns the maximum clock rate of device's global memory in kHz. If
+        /// compiler does not support this API then returns default value 3200000 kHz.
+        unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
+        /// Returns the maximum bus width between device and memory in bits. If
+        /// compiler does not support this API then returns default value 64 bits.
+        unsigned int get_memory_bus_width() const { return _memory_bus_width; }
+        uint32_t get_device_id() const { return _device_id; }
+        std::array<unsigned char, 16> get_uuid() const { return _uuid; }
+        /// Returns global memory cache size in bytes.
+        unsigned int get_global_mem_cache_size() const
+        {
+            return _global_mem_cache_size;
+        }
+
+        // set interface
+        void set_name(const char *name)
+        {
+            size_t length = strlen(name);
+            if (length < 256)
+            {
+                std::memcpy(_name, name, length + 1);
+            }
+            else
+            {
+                std::memcpy(_name, name, 255);
+                _name[255] = '\0';
+            }
+        }
+        void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
+        {
+            for (int i = 0; i < 3; ++i)
+                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+        }
+        [[deprecated]] void
+        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+        {
+            for (int i = 0; i < 3; ++i)
+            {
+                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+            }
+        }
+        void set_host_unified_memory(bool host_unified_memory)
+        {
+            _host_unified_memory = host_unified_memory;
+        }
+        void set_major_version(int major) { _major = major; }
+        void set_minor_version(int minor) { _minor = minor; }
+        void set_integrated(int integrated) { _integrated = integrated; }
+        void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+        void set_max_compute_units(int max_compute_units)
+        {
+            _max_compute_units = max_compute_units;
+        }
+        void set_global_mem_size(size_t global_mem_size)
+        {
+            _global_mem_size = global_mem_size;
+        }
+        void set_local_mem_size(size_t local_mem_size)
+        {
+            _local_mem_size = local_mem_size;
+        }
+        void set_max_mem_alloc_size(size_t max_mem_alloc_size)
+        {
+            _max_mem_alloc_size = max_mem_alloc_size;
+        }
+        void set_max_work_group_size(int max_work_group_size)
+        {
+            _max_work_group_size = max_work_group_size;
+        }
+        void set_max_sub_group_size(int max_sub_group_size)
+        {
+            _max_sub_group_size = max_sub_group_size;
+        }
+        void
+        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+        {
+            _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+        }
+        void set_max_nd_range_size(int max_nd_range_size[])
+        {
+            for (int i = 0; i < 3; i++)
+            {
+                _max_nd_range_size[i] = max_nd_range_size[i];
+                _max_nd_range_size_i[i] = max_nd_range_size[i];
+            }
+        }
+        void set_memory_clock_rate(unsigned int memory_clock_rate)
+        {
+            _memory_clock_rate = memory_clock_rate;
+        }
+        void set_memory_bus_width(unsigned int memory_bus_width)
+        {
+            _memory_bus_width = memory_bus_width;
+        }
+        void
+        set_max_register_size_per_work_group(int max_register_size_per_work_group)
+        {
+            _max_register_size_per_work_group = max_register_size_per_work_group;
+        }
+        void set_device_id(uint32_t device_id)
+        {
+            _device_id = device_id;
+        }
+        void set_uuid(std::array<unsigned char, 16> uuid)
+        {
+            _uuid = std::move(uuid);
+        }
+        void set_global_mem_cache_size(unsigned int global_mem_cache_size)
+        {
+            _global_mem_cache_size = global_mem_cache_size;
+        }
+
+    private:
+        char _name[256];
+        int _max_work_item_sizes_i[3];
+        bool _host_unified_memory = false;
+        int _major;
+        int _minor;
+        int _integrated = 0;
+        int _frequency;
+        // Set estimated value 3200000 kHz as default value.
+        unsigned int _memory_clock_rate = 3200000;
+        // Set estimated value 64 bits as default value.
+        unsigned int _memory_bus_width = 64;
+        unsigned int _global_mem_cache_size;
+        int _max_compute_units;
+        int _max_work_group_size;
+        int _max_sub_group_size;
+        int _max_work_items_per_compute_unit;
+        int _max_register_size_per_work_group;
+        size_t _global_mem_size;
+        size_t _local_mem_size;
+        size_t _max_mem_alloc_size;
+        size_t _max_nd_range_size[3];
+        int _max_nd_range_size_i[3];
+        uint32_t _device_id;
+        std::array<unsigned char, 16> _uuid;
+    };
+
+    static int get_major_version(const sycl::device &dev)
+    {
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        return major;
+    }
+
+    static int get_minor_version(const sycl::device &dev)
+    {
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        return minor;
+    }
+
+    static void get_device_info(device_info &out, const sycl::device &dev)
+    {
+        device_info prop;
+        prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
+
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        prop.set_major_version(major);
+        prop.set_minor_version(minor);
+
+        prop.set_max_work_item_sizes(
+#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
+            // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
+            // is an enum class element
+            dev.get_info<sycl::info::device::max_work_item_sizes>());
+#else
+            // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
+            // an int
+            dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
+#endif
+        prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
+
+        prop.set_max_clock_frequency(
+            dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
+
+        prop.set_max_compute_units(
+            dev.get_info<sycl::info::device::max_compute_units>());
+        prop.set_max_work_group_size(
+            dev.get_info<sycl::info::device::max_work_group_size>());
+        prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
+        prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
+        prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
+
+#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
+        if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
+        {
+            unsigned int tmp =
+                dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
+            if (tmp != 0)
+                prop.set_memory_clock_rate(1000 * tmp);
+        }
+        if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
+        {
+            prop.set_memory_bus_width(
+                dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
+        }
+        if (dev.has(sycl::aspect::ext_intel_device_id))
+        {
+            prop.set_device_id(
+                dev.get_info<sycl::ext::intel::info::device::device_id>());
+        }
+        if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
+        {
+            prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
+        }
+#elif defined(_MSC_VER) && !defined(__clang__)
+#pragma message("get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value.")
+#else
+#warning "get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value."
+#endif
+
+        size_t max_sub_group_size = 1;
+        std::vector<size_t> sub_group_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+
+        for (const auto &sub_group_size : sub_group_sizes)
+        {
+            if (max_sub_group_size < sub_group_size)
+                max_sub_group_size = sub_group_size;
+        }
+
+        prop.set_max_sub_group_size(max_sub_group_size);
+
+        prop.set_max_work_items_per_compute_unit(
+            dev.get_info<sycl::info::device::max_work_group_size>());
+        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+        prop.set_max_nd_range_size(max_nd_range_size);
+
+        // Estimates max register size per work group, feel free to update the value
+        // according to device properties.
+        prop.set_max_register_size_per_work_group(65536);
+
+        prop.set_global_mem_cache_size(
+            dev.get_info<sycl::info::device::global_mem_cache_size>());
+        out = prop;
+    }
+
+    /// dpct device extension
+    class device_ext : public sycl::device {
+      typedef std::mutex mutex_type;
+
+     public:
+      device_ext() : sycl::device() {}
+      ~device_ext() {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        clear_queues();
+      }
+      device_ext(const sycl::device &base) : sycl::device(base) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        init_queues();
+      }
+
+      int is_native_atomic_supported() { return 0; }
+      int get_major_version() const { return dpct::get_major_version(*this); }
+
+      int get_minor_version() const { return dpct::get_minor_version(*this); }
+
+      int get_max_compute_units() const {
+        return get_device_info().get_max_compute_units();
+      }
+
+      /// Return the maximum clock frequency of this device in KHz.
+      int get_max_clock_frequency() const {
+        return get_device_info().get_max_clock_frequency();
+      }
+
+      int get_integrated() const { return get_device_info().get_integrated(); }
+
+      int get_max_sub_group_size() const {
+        return get_device_info().get_max_sub_group_size();
+      }
+
+      int get_max_register_size_per_work_group() const {
+        return get_device_info().get_max_register_size_per_work_group();
+      }
+
+      int get_max_work_group_size() const {
+        return get_device_info().get_max_work_group_size();
+      }
+
+      int get_mem_base_addr_align() const {
+        return get_info<sycl::info::device::mem_base_addr_align>();
+      }
+
+      size_t get_global_mem_size() const {
+        return get_device_info().get_global_mem_size();
+      }
+
+      size_t get_max_mem_alloc_size() const {
+        return get_device_info().get_max_mem_alloc_size();
+      }
+
+      /// Get the number of bytes of free and total memory on the SYCL device.
+      /// \param [out] free_memory The number of bytes of free memory on the
+      /// SYCL device. \param [out] total_memory The number of bytes of total
+      /// memory on the SYCL device.
+      void get_memory_info(size_t &free_memory, size_t &total_memory) {
+        total_memory = get_device_info().get_global_mem_size();
+        const char *warning_info =
+            "get_memory_info: [warning] ext_intel_free_memory is not "
+            "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
+            "use total memory as free memory";
+#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
+        if (!has(sycl::aspect::ext_intel_free_memory)) {
+          std::cerr << warning_info << std::endl;
+          free_memory = total_memory;
+        } else {
+          free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
+        }
+#else
+        std::cerr << warning_info << std::endl;
+        free_memory = total_memory;
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma message("Querying the number of bytes of free memory is not supported")
+#else
+#warning "Querying the number of bytes of free memory is not supported"
+#endif
+#endif
+      }
+
+      void get_device_info(device_info &out) const {
+        dpct::get_device_info(out, *this);
+      }
+
+      device_info get_device_info() const {
+        device_info prop;
+        dpct::get_device_info(prop, *this);
+        return prop;
+      }
+
+      void reset() {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        clear_queues();
+        init_queues();
+      }
+
+      sycl::queue &in_order_queue() { return _q_in_order; }
+
+      sycl::queue &out_of_order_queue() { return _q_out_of_order; }
+
+      sycl::queue &default_queue() { return in_order_queue(); }
+
+      void queues_wait_and_throw() {
+        std::unique_lock<mutex_type> lock(m_mutex);
+        lock.unlock();
+        for (auto &q : _queues) {
+            q.wait_and_throw();
+        }
+        // Guard the destruct of current_queues to make sure the ref count is
+        // safe.
+        lock.lock();
+      }
+
+      sycl::queue create_queue(bool enable_exception_handler = false) {
+        return create_in_order_queue(enable_exception_handler);
+      }
+
+      sycl::queue create_queue(sycl::device device,
+                               bool enable_exception_handler = false) {
+        return create_in_order_queue(device, enable_exception_handler);
+      }
+
+      sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return create_queue_impl(enable_exception_handler,
+                                 sycl::property::queue::in_order());
+      }
+
+      sycl::queue create_in_order_queue(sycl::device device,
+                                        bool enable_exception_handler = false) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return create_queue_impl(device, enable_exception_handler,
+                                 sycl::property::queue::in_order());
+      }
+
+      sycl::queue create_out_of_order_queue(
+          bool enable_exception_handler = false) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return create_queue_impl(enable_exception_handler);
+      }
+
+      void destroy_queue(sycl::queue queue) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                    [=](const sycl::queue &q) -> bool
+                                    {
+                                        return q == queue;
+                                    }),
+                    _queues.end());
+      }
+      void set_saved_queue(sycl::queue q) {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        _saved_queue = q;
+      }
+      sycl::queue get_saved_queue() const {
+        std::lock_guard<mutex_type> lock(m_mutex);
+        return _saved_queue;
+      }
+
+     private:
+      void clear_queues() { _queues.clear(); }
+
+      void init_queues() {
+        _q_in_order =
+            create_queue_impl(true, sycl::property::queue::in_order());
+        _q_out_of_order = create_queue_impl(true);
+        _saved_queue = default_queue();
+      }
+
+      /// Caller should acquire resource \p m_mutex before calling this
+      /// function.
+      template <class... Properties>
+      sycl::queue create_queue_impl(bool enable_exception_handler,
+                                    Properties... properties) {
+        sycl::async_handler eh = {};
+        if (enable_exception_handler) {
+          eh = exception_handler;
+        }
+        _queues.push_back(sycl::queue(
+            *this, eh,
+            sycl::property_list(
+#ifdef DPCT_PROFILING_ENABLED
+                sycl::property::queue::enable_profiling(),
+#endif
+                properties...)));
+
+        return _queues.back();
+      }
+
+      template <class... Properties>
+      sycl::queue create_queue_impl(sycl::device device,
+                                    bool enable_exception_handler,
+                                    Properties... properties) {
+        sycl::async_handler eh = {};
+        if (enable_exception_handler) {
+          eh = exception_handler;
+        }
+        _queues.push_back(sycl::queue(
+            device, eh,
+                        sycl::property_list(
+#ifdef DPCT_PROFILING_ENABLED
+                            sycl::property::queue::enable_profiling(),
+#endif
+                            properties...)));
+
+        return _queues.back();
+      }
+
+      void get_version(int &major, int &minor) const {
+        detail::get_version(*this, major, minor);
+      }
+      sycl::queue _q_in_order, _q_out_of_order;
+      sycl::queue _saved_queue;
+      std::vector<sycl::queue> _queues;
+      mutable mutex_type m_mutex;
+    };
+
+
+    /// device manager
+    class dev_mgr
+    {
+    public:
+        device_ext &current_device()
+        {
+            unsigned int dev_id = current_device_id();
+            check_id(dev_id);
+            return *_devs[dev_id];
+        }
+        device_ext &cpu_device() const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            if (_cpu_device == -1)
+            {
+                throw std::runtime_error("no valid cpu device");
+            }
+            else
+            {
+                return *_devs[_cpu_device];
+            }
+        }
+        device_ext &get_device(unsigned int id) const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            check_id(id);
+            return *_devs[id];
+        }
+        unsigned int current_device_id() const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            auto it = _thread2dev_map.find(get_tid());
+            if (it != _thread2dev_map.end())
+                return it->second;
+            return DEFAULT_DEVICE_ID;
+        }
+
+        /// Select device with a device ID.
+        /// \param [in] id The id of the device which can
+        /// be obtained through get_device_id(const sycl::device).
+        void select_device(unsigned int id)
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            check_id(id);
+            _thread2dev_map[get_tid()] = id;
+        }
+        unsigned int device_count() { return _devs.size(); }
+
+        unsigned int get_device_id(const sycl::device &dev)
+        {
+            unsigned int id = 0;
+            for (auto &dev_item : _devs)
+            {
+                if (*dev_item == dev)
+                {
+                    return id;
+                }
+                id++;
+            }
+            return -1;
+        }
+
+        inline std::string get_preferred_gpu_platform_name() {
+            std::string result;
+
+            std::string filter = "";
+            char* env = getenv("ONEAPI_DEVICE_SELECTOR");
+            if (env) {
+                if (std::strstr(env, "level_zero")) {
+                    filter = "level-zero";
+                }
+                else if (std::strstr(env, "opencl")) {
+                    filter = "opencl";
+                }
+                else if (std::strstr(env, "cuda")) {
+                    filter = "cuda";
+                }
+                else if (std::strstr(env, "hip")) {
+                    filter = "hip";
+                }
+                else {
+                    throw std::runtime_error("invalid device filter: " + std::string(env));
+                }
+            } else {
+                auto default_device = sycl::device(sycl::default_selector_v);
+                auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
+
+                if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
+                    filter = "level-zero";
+                }
+                else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
+                    filter = "cuda";
+                }
+                else if (std::strstr(default_platform_name.c_str(), "HIP")) {
+                    filter = "hip";
+                }
+            }
+
+            auto platform_list = sycl::platform::get_platforms();
+
+            for (const auto& platform : platform_list) {
+                auto devices = platform.get_devices();
+                auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
+                    return d.is_gpu();
+                });
+
+                if (gpu_dev == devices.end()) {
+                    // cout << "platform [" << platform_name
+                    //      << "] does not contain GPU devices, skipping\n";
+                    continue;
+                }
+
+                auto platform_name = platform.get_info<sycl::info::platform::name>();
+                std::string platform_name_low_case;
+                platform_name_low_case.resize(platform_name.size());
+
+                std::transform(
+                    platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
+
+                if (platform_name_low_case.find(filter) == std::string::npos) {
+                    // cout << "platform [" << platform_name
+                    //      << "] does not match with requested "
+                    //      << filter << ", skipping\n";
+                    continue;
+                }
+
+                result = platform_name;
+            }
+
+            if (result.empty())
+                throw std::runtime_error("can not find preferred GPU platform");
+
+            return result;
+        }
+
+        template <class DeviceSelector>
+        std::enable_if_t<
+            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
+        {
+            sycl::device selected_device = sycl::device(selector);
+            unsigned int selected_device_id = get_device_id(selected_device);
+            select_device(selected_device_id);
+        }
+
+        /// Returns the instance of device manager singleton.
+        static dev_mgr &instance()
+        {
+            static dev_mgr d_m;
+            return d_m;
+        }
+        dev_mgr(const dev_mgr &) = delete;
+        dev_mgr &operator=(const dev_mgr &) = delete;
+        dev_mgr(dev_mgr &&) = delete;
+        dev_mgr &operator=(dev_mgr &&) = delete;
+
+    private:
+        mutable std::recursive_mutex m_mutex;
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        {
+            sycl::backend backend1 = device1.get_backend();
+            sycl::backend backend2 = device2.get_backend();
+            // levelzero backends always come first
+            if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true;
+            if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false;
+            dpct::device_info prop1;
+            dpct::get_device_info(prop1, device1);
+            dpct::device_info prop2;
+            dpct::get_device_info(prop2, device2);
+            return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+        }
+        static int convert_backend_index(std::string & backend) {
+            if (backend == "ext_oneapi_level_zero:gpu") return 0;
+            if (backend == "opencl:gpu") return 1;
+            if (backend == "ext_oneapi_cuda:gpu") return 2;
+            if (backend == "ext_oneapi_hip:gpu") return 3;
+            if (backend == "opencl:cpu") return 4;
+            if (backend == "opencl:acc") return 5;
+            printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+            GGML_ABORT("fatal error");
+        }
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
+            return convert_backend_index(backend1) < convert_backend_index(backend2);
+        }
+        dev_mgr()
+        {
+            sycl::device default_device =
+                sycl::device(sycl::default_selector_v);
+            _devs.push_back(std::make_shared<device_ext>(default_device));
+
+            std::vector<sycl::device> sycl_all_devs;
+            // Collect other devices except for the default device.
+            if (default_device.is_cpu())
+                _cpu_device = 0;
+
+            auto Platforms = sycl::platform::get_platforms();
+            // Keep track of the number of devices per backend
+            std::map<sycl::backend, size_t> DeviceNums;
+            std::map<std::string, std::vector<sycl::device>> backend_devices;
+            auto preferred_platform_name = get_preferred_gpu_platform_name();
+
+            while (!Platforms.empty()) {
+                auto Platform = Platforms.back();
+                Platforms.pop_back();
+                auto platform_name = Platform.get_info<sycl::info::platform::name>();
+                if (platform_name.compare(preferred_platform_name) != 0) {
+                    continue;
+                }
+                auto devices = Platform.get_devices();
+                std::string backend_type = get_device_backend_and_type(devices[0]);
+                for (const auto &device : devices) {
+                    backend_devices[backend_type].push_back(device);
+                }
+            }
+
+            std::vector<std::string> keys;
+            for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+                keys.push_back(it->first);
+            }
+            std::sort(keys.begin(), keys.end(), compare_backend);
+
+            for (auto &key : keys) {
+                std::vector<sycl::device> devs = backend_devices[key];
+                std::sort(devs.begin(), devs.end(), compare_dev);
+                for (const auto &dev : devs) {
+                    sycl_all_devs.push_back(dev);
+                }
+            }
+
+            for (auto &dev : sycl_all_devs)
+            {
+                if (dev == default_device)
+                {
+                    continue;
+                }
+                _devs.push_back(std::make_shared<device_ext>(dev));
+                if (_cpu_device == -1 && dev.is_cpu())
+                {
+                    _cpu_device = _devs.size() - 1;
+                }
+            }
+        }
+        void check_id(unsigned int id) const
+        {
+            if (id >= _devs.size())
+            {
+                throw std::runtime_error("invalid device id");
+            }
+        }
+        std::vector<std::shared_ptr<device_ext>> _devs;
+        /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+        /// thread id in _thread2dev_map, which means default device should be used
+        /// for the current thread.
+        const unsigned int DEFAULT_DEVICE_ID = 0;
+        /// thread-id to device-id map.
+        std::map<unsigned int, unsigned int> _thread2dev_map;
+        int _cpu_device = -1;
+    };
+
+    static inline sycl::queue &get_default_queue()
+    {
+        return dev_mgr::instance().current_device().default_queue();
+    }
+
+    namespace detail
+    {
+        enum class pointer_access_attribute
+        {
+            host_only = 0,
+            device_only,
+            host_device,
+            end
+        };
+
+        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+                                                              const void *ptr)
+        {
+            switch (sycl::get_pointer_type(ptr, q.get_context()))
+            {
+            case sycl::usm::alloc::unknown:
+                return pointer_access_attribute::host_only;
+            case sycl::usm::alloc::device:
+                return pointer_access_attribute::device_only;
+            case sycl::usm::alloc::shared:
+            case sycl::usm::alloc::host:
+                return pointer_access_attribute::host_device;
+            }
+        }
+
+        template <typename ArgT>
+        inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
+        {
+            static_assert((unsigned char)library_data_t::library_data_t_size <=
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
+            static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
+            return (std::uint64_t)Val;
+        }
+
+        template <typename FirstT, typename... RestT>
+        inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
+                                                               RestT... RestVal)
+        {
+            static_assert((std::uint8_t)library_data_t::library_data_t_size <=
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
+            static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
+            static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
+            return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
+        }
+
+        class mem_mgr
+        {
+            mem_mgr()
+            {
+                // Reserved address space, no real memory allocation happens here.
+#if defined(__linux__)
+                mapped_address_space =
+                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
+                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#elif defined(_WIN64)
+                mapped_address_space = (byte_t *)VirtualAlloc(
+                    NULL,               // NULL specified as the base address parameter
+                    mapped_region_size, // Size of allocation
+                    MEM_RESERVE,        // Allocate reserved pages
+                    PAGE_NOACCESS);     // Protection = no access
+#else
+#error "Only support Windows and Linux."
+#endif
+                next_free = mapped_address_space;
+            }
+
+        public:
+            using buffer_id_t = int;
+
+            struct allocation
+            {
+                buffer_t buffer;
+                byte_t *alloc_ptr;
+                size_t size;
+            };
+
+            ~mem_mgr()
+            {
+#if defined(__linux__)
+                munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+                VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+            }
+
+            mem_mgr(const mem_mgr &) = delete;
+            mem_mgr &operator=(const mem_mgr &) = delete;
+            mem_mgr(mem_mgr &&) = delete;
+            mem_mgr &operator=(mem_mgr &&) = delete;
+
+            /// Allocate
+            void *mem_alloc(size_t size)
+            {
+                if (!size)
+                    return nullptr;
+                std::lock_guard<std::mutex> lock(m_mutex);
+                if (next_free + size > mapped_address_space + mapped_region_size)
+                {
+                    throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
+                }
+                // Allocation
+                sycl::range<1> r(size);
+                buffer_t buf(r);
+                allocation A{buf, next_free, size};
+                // Map allocation to device pointer
+                void *result = next_free;
+                m_map.emplace(next_free + size, A);
+                // Update pointer to the next free space.
+                next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
+
+                return result;
+            }
+
+            /// Deallocate
+            void mem_free(const void *ptr)
+            {
+                if (!ptr)
+                    return;
+                std::lock_guard<std::mutex> lock(m_mutex);
+                auto it = get_map_iterator(ptr);
+                m_map.erase(it);
+            }
+
+            /// map: device pointer -> allocation(buffer, alloc_ptr, size)
+            allocation translate_ptr(const void *ptr)
+            {
+                std::lock_guard<std::mutex> lock(m_mutex);
+                auto it = get_map_iterator(ptr);
+                return it->second;
+            }
+
+            /// Check if the pointer represents device pointer or not.
+            bool is_device_ptr(const void *ptr) const
+            {
+                std::lock_guard<std::mutex> lock(m_mutex);
+                return (mapped_address_space <= ptr) &&
+                       (ptr < mapped_address_space + mapped_region_size);
+            }
+
+            /// Returns the instance of memory manager singleton.
+            static mem_mgr &instance()
+            {
+                static mem_mgr m;
+                return m;
+            }
+
+        private:
+            std::map<byte_t *, allocation> m_map;
+            mutable std::mutex m_mutex;
+            byte_t *mapped_address_space;
+            byte_t *next_free;
+            const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+            const size_t alignment = 256;
+            /// This padding may be defined to some positive value to debug
+            /// out of bound accesses.
+            const size_t extra_padding = 0;
+
+            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
+            {
+                auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
+                if (it == m_map.end())
+                {
+                    // Not a virtual pointer.
+                    throw std::runtime_error("can not get buffer from non-virtual pointer");
+                }
+                const allocation &alloc = it->second;
+                if (ptr < alloc.alloc_ptr)
+                {
+                    // Out of bound.
+                    // This may happen if there's a gap between allocations due to alignment
+                    // or extra padding and pointer points to this gap.
+                    throw std::runtime_error("invalid virtual pointer");
+                }
+                return it;
+            }
+        };
+
+        template <class T, memory_region Memory, size_t Dimension>
+        class accessor;
+        template <memory_region Memory, class T = byte_t>
+        class memory_traits
+        {
+        public:
+            static constexpr sycl::access::target target =
+                sycl::access::target::device;
+            static constexpr sycl::access_mode mode =
+                (Memory == constant) ? sycl::access_mode::read
+                                     : sycl::access_mode::read_write;
+            static constexpr size_t type_size = sizeof(T);
+            using element_t =
+                typename std::conditional<Memory == constant, const T, T>::type;
+            using value_t = typename std::remove_cv<T>::type;
+            template <size_t Dimension = 1>
+            using accessor_t = typename std::conditional<
+                Memory == local, sycl::local_accessor<value_t, Dimension>,
+                sycl::accessor<T, Dimension, mode, target>>::type;
+            using pointer_t = T *;
+        };
+
+        static inline void *dpct_malloc(size_t size, sycl::queue &q)
+        {
+            return sycl::malloc_device(size, q.get_device(), q.get_context());
+        }
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
+        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+                                        sycl::queue &q)
+        {
+            pitch = PITCH_DEFAULT_ALIGN(x);
+            return dpct_malloc(pitch * y * z, q);
+        }
+
+        /**
+         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] dev_ptr Pointer to the virtual device memory address.
+         * @param [in] value The value to be set.
+         * @param [in] size Number of elements to be set to the value.
+         * @return An event representing the memset operation.
+         */
+        template <typename valueT>
+        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
+                                              valueT value, size_t size)
+        {
+            return q.fill(dev_ptr, value, size);
+        }
+
+        /**
+         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] data Pointer to the pitched device memory region.
+         * @param [in] value The value to be set.
+         * @param [in] size 3D memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
+        template <typename valueT>
+        static inline std::vector<sycl::event>
+        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
+                    sycl::range<3> size)
+        {
+            std::vector<sycl::event> event_list;
+            size_t slice = data.get_pitch() * data.get_y();
+            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+            for (size_t z = 0; z < size.get(2); ++z)
+            {
+                unsigned char *data_ptr = data_surface;
+                for (size_t y = 0; y < size.get(1); ++y)
+                {
+                    event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
+                    data_ptr += data.get_pitch();
+                }
+                data_surface += slice;
+            }
+            return event_list;
+        }
+
+        /**
+         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] ptr Pointer to the virtual device memory.
+         * @param [in] pitch The pitch size by number of elements, including padding.
+         * @param [in] val The value to be set.
+         * @param [in] x The width of memory region by number of elements.
+         * @param [in] y The height of memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
+        template <typename valueT>
+        static inline std::vector<sycl::event>
+        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
+                    size_t y)
+        {
+            return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
+                               sycl::range<3>(x, y, 1));
+        }
+
+        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+                                                        const void *from_ptr,
+                                                        memcpy_direction dir)
+        {
+            switch (dir)
+            {
+            case memcpy_direction::host_to_host:
+            case memcpy_direction::host_to_device:
+            case memcpy_direction::device_to_host:
+            case memcpy_direction::device_to_device:
+                return dir;
+            case memcpy_direction::automatic:
+            {
+                // table[to_attribute][from_attribute]
+                static const memcpy_direction
+                    direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
+                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
+                                       {{memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_host,
+                                         memcpy_direction::host_to_host},
+                                        {memcpy_direction::host_to_device,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device},
+                                        {memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device}};
+                return direction_table[static_cast<unsigned>(get_pointer_attribute(
+                    q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
+            }
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+        }
+
+        static sycl::event
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                    memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
+        {
+            if (!size)
+                return sycl::event{};
+            return q.memcpy(to_ptr, from_ptr, size, dep_events);
+            GGML_UNUSED(direction);
+        }
+
+        // Get actual copy range and make sure it will not exceed range.
+        static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                            size_t pitch)
+        {
+            return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+        }
+
+        static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                        size_t pitch)
+        {
+            return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+        }
+
+        /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+        /// and \p from_range to another specified by \p to_ptr and \p to_range.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    sycl::range<3> to_range, sycl::range<3> from_range,
+                    sycl::id<3> to_id, sycl::id<3> from_id,
+                    sycl::range<3> size, memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
+        {
+            // RAII for host pointer
+            class host_buffer
+            {
+                void *_buf;
+                size_t _size;
+                sycl::queue &_q;
+                const std::vector<sycl::event> &_deps; // free operation depends
+
+            public:
+                host_buffer(size_t size, sycl::queue &q,
+                            const std::vector<sycl::event> &deps)
+                    : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+                void *get_ptr() const { return _buf; }
+                size_t get_size() const { return _size; }
+                ~host_buffer()
+                {
+                    if (_buf)
+                    {
+                        _q.submit([&](sycl::handler &cgh)
+                                  {
+        cgh.depends_on(_deps);
+        cgh.host_task([buf = _buf] { std::free(buf); }); });
+                    }
+                }
+            };
+            std::vector<sycl::event> event_list;
+
+            size_t to_slice = to_range.get(1) * to_range.get(0),
+                   from_slice = from_range.get(1) * from_range.get(0);
+            unsigned char *to_surface =
+                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+            const unsigned char *from_surface =
+                (const unsigned char *)from_ptr +
+                get_offset(from_id, from_slice, from_range.get(0));
+
+            if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+            {
+                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                    direction, dep_events)};
+            }
+            direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+            size_t size_slice = size.get(1) * size.get(0);
+            switch (direction)
+            {
+            case host_to_host:
+                for (size_t z = 0; z < size.get(2); ++z)
+                {
+                    unsigned char *to_ptr = to_surface;
+                    const unsigned char *from_ptr = from_surface;
+                    if (to_range.get(0) == from_range.get(0) &&
+                        to_range.get(0) == size.get(0))
+                    {
+                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                                         direction, dep_events));
+                    }
+                    else
+                    {
+                        for (size_t y = 0; y < size.get(1); ++y)
+                        {
+                            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                                             direction, dep_events));
+                            to_ptr += to_range.get(0);
+                            from_ptr += from_range.get(0);
+                        }
+                    }
+                    to_surface += to_slice;
+                    from_surface += from_slice;
+                }
+                break;
+            case host_to_device:
+            {
+                host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                                event_list);
+                std::vector<sycl::event> host_events;
+                if (to_slice == size_slice)
+                {
+                    // Copy host data to a temp host buffer with the shape of target.
+                    host_events =
+                        dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                    host_to_host, dep_events);
+                }
+                else
+                {
+                    // Copy host data to a temp host buffer with the shape of target.
+                    host_events = dpct_memcpy(
+                        q, buf.get_ptr(), from_surface, to_range, from_range,
+                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+                        // If has padding data, not sure whether it is useless. So fill temp
+                        // buffer with it.
+                        std::vector<sycl::event>{
+                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                        device_to_host, dep_events)});
+                }
+                // Copy from temp host buffer to device with only one submit.
+                event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                                 buf.get_size(), host_to_device,
+                                                 host_events));
+                break;
+            }
+            case device_to_host:
+            {
+                host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                                event_list);
+                // Copy from host temp buffer to host target with reshaping.
+                event_list = dpct_memcpy(
+                    q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+                    sycl::id<3>(0, 0, 0), size, host_to_host,
+                    // Copy from device to temp host buffer with only one submit.
+                    std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                         buf.get_size(),
+                                                         device_to_host, dep_events)});
+                break;
+            }
+            case device_to_device:
+                event_list.push_back(q.submit([&](sycl::handler &cgh){
+                cgh.depends_on(dep_events);
+                cgh.parallel_for<class dpct_memcpy_3d_detail>(
+                    size,
+                    [=](sycl::id<3> id) {
+                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                            from_surface[get_offset(id, from_slice, from_range.get(0))];
+                    }); }));
+                break;
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+            return event_list;
+        }
+
+        /// memcpy 2D/3D matrix specified by pitched_data.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                    memcpy_direction direction = automatic)
+        {
+            return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                               size, direction);
+        }
+
+        /// memcpy 2D matrix with pitch.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                    memcpy_direction direction = automatic)
+        {
+            return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                               sycl::range<3>(from_pitch, y, 1),
+                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                               sycl::range<3>(x, y, 1), direction);
+        }
+
+        namespace deprecated
+        {
+
+            template <typename T, sycl::usm::alloc AllocKind>
+            class usm_allocator
+            {
+            private:
+                using Alloc = sycl::usm_allocator<T, AllocKind>;
+                Alloc _impl;
+
+            public:
+                using value_type = typename std::allocator_traits<Alloc>::value_type;
+                using pointer = typename std::allocator_traits<Alloc>::pointer;
+                using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
+                using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
+                using const_void_pointer =
+                    typename std::allocator_traits<Alloc>::const_void_pointer;
+                using reference = typename std::allocator_traits<Alloc>::value_type &;
+                using const_reference =
+                    const typename std::allocator_traits<Alloc>::value_type &;
+                using difference_type =
+                    typename std::allocator_traits<Alloc>::difference_type;
+                using size_type = typename std::allocator_traits<Alloc>::size_type;
+                using propagate_on_container_copy_assignment = typename std::allocator_traits<
+                    Alloc>::propagate_on_container_copy_assignment;
+                using propagate_on_container_move_assignment = typename std::allocator_traits<
+                    Alloc>::propagate_on_container_move_assignment;
+                using propagate_on_container_swap =
+                    typename std::allocator_traits<Alloc>::propagate_on_container_swap;
+                using is_always_equal =
+                    typename std::allocator_traits<Alloc>::is_always_equal;
+
+                template <typename U>
+                struct rebind
+                {
+                    typedef usm_allocator<U, AllocKind> other;
+                };
+
+                usm_allocator() : _impl(dpct::get_default_queue()) {}
+                ~usm_allocator() {}
+                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
+                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
+                pointer address(reference r) { return &r; }
+                const_pointer address(const_reference r) { return &r; }
+                pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
+                {
+                    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
+                }
+                void deallocate(pointer p, size_type cnt)
+                {
+                    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
+                }
+                size_type max_size() const
+                {
+                    return std::allocator_traits<Alloc>::max_size(_impl);
+                }
+                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
+                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
+            };
+
+        } // namespace deprecated
+
+        inline void dpct_free(void *ptr,
+                              const sycl::queue &q)
+        {
+            if (ptr)
+            {
+                sycl::free(ptr, q.get_context());
+            }
+        }
+
+        template <typename T>
+        inline auto get_memory(const void *x)
+        {
+            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
+            return new_x;
+        }
+
+        template <typename T>
+        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
+        {
+            using Ty = typename DataType<T>::T2;
+            Ty s_h;
+            if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
+                detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
+                    .wait();
+            else
+                s_h = *reinterpret_cast<const Ty *>(s);
+            return s_h;
+        }
+
+    } // namespace detail
+
+    template <typename T>
+    inline auto get_value(const T *s, sycl::queue &q)
+    {
+        return detail::get_value(s, q);
+    }
+
+    namespace detail
+    {
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                              oneapi::mkl::transpose b_trans, int m, int n, int k,
+                              const void *alpha, const void *a, int lda, const void *b,
+                              int ldb, const void *beta, void *c, int ldc)
+        {
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            auto data_a = get_memory<const Ta>(a);
+            auto data_b = get_memory<const Tb>(b);
+            auto data_c = get_memory<Tc>(c);
+#ifdef GGML_SYCL_NVIDIA
+            oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
+                                                  a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
+                                                  beta_value, data_c, ldc);
+#else
+            oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
+                                                  beta_value, data_c, ldc);
+#endif
+        }
+
+        template <typename VecT, class BinaryOperation, class = void>
+        class vectorized_binary
+        {
+        public:
+            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+            {
+                VecT v4;
+                for (size_t i = 0; i < v4.size(); ++i)
+                {
+                    v4[i] = binary_op(a[i], b[i]);
+                }
+                return v4;
+            }
+        };
+
+        template <typename VecT, class BinaryOperation>
+        class vectorized_binary<
+            VecT, BinaryOperation,
+            std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
+        {
+        public:
+            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+            {
+                return binary_op(a, b).template as<VecT>();
+            }
+        };
+
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
+                                    int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
+                                    int ldb, const void * beta, void ** c, int ldc, int batch_size,
+                                    matrix_info_t<float> * matrix_info) {
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+
+            matrix_info->transpose_info[0] = a_trans;
+            matrix_info->transpose_info[1] = b_trans;
+            matrix_info->value_info[0] = alpha_value;
+            matrix_info->value_info[1] = beta_value;
+            matrix_info->size_info[0] = m;
+            matrix_info->size_info[1] = n;
+            matrix_info->size_info[2] = k;
+            matrix_info->ld_info[0] = lda;
+            matrix_info->ld_info[1] = ldb;
+            matrix_info->ld_info[2] = ldc;
+            matrix_info->groupsize_info = batch_size;
+
+#ifdef GGML_SYCL_NVIDIA
+            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
+                matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
+                matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
+                matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
+                reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+#else
+            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+                q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
+                matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
+                matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
+                reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+#endif
+        }
+
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void
+        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                        oneapi::mkl::transpose b_trans, int m, int n,
+                        int k, const void *alpha, const void *a, int lda,
+                        long long int stride_a, const void *b, int ldb,
+                        long long int stride_b, const void *beta, void *c,
+                        int ldc, long long int stride_c, int batch_size)
+        {
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            auto data_a = get_memory<const Ta>(a);
+            auto data_b = get_memory<const Tb>(b);
+            auto data_c = get_memory<Tc>(c);
+#ifdef GGML_SYCL_NVIDIA
+            oneapi::mkl::blas::column_major::gemm_batch(
+                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, a_trans, b_trans, m, n, k,
+                alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c,
+                batch_size);
+#else
+            oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+                                                        stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc,
+                                                        stride_c, batch_size);
+#endif
+        }
+
+    } // namespace detail
+
+    template <typename VecT, class BinaryOperation>
+    inline unsigned vectorized_binary(unsigned a, unsigned b,
+                                      const BinaryOperation binary_op)
+    {
+        sycl::vec<unsigned, 1> v0{a}, v1{b};
+        auto v2 = v0.as<VecT>();
+        auto v3 = v1.as<VecT>();
+        auto v4 =
+            detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
+        v0 = v4.template as<sycl::vec<unsigned, 1>>();
+        return v0;
+    }
+
+    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                                  memcpy_direction direction = automatic,
+                                  sycl::queue &q = dpct::get_default_queue())
+    {
+        detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
+    }
+
+    static inline unsigned int select_device(unsigned int id)
+    {
+        dev_mgr::instance().select_device(id);
+        return id;
+    }
+
+    template <typename T>
+    T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
+                               unsigned int logical_sub_group_size = 32)
+    {
+        unsigned int id = g.get_local_linear_id();
+        unsigned int start_index =
+            id / logical_sub_group_size * logical_sub_group_size;
+        unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
+        return sycl::select_from_group(g, x,
+                                       target_offset < logical_sub_group_size
+                                           ? start_index + target_offset
+                                           : id);
+    }
+
+    template <typename T1, typename T2, typename T3>
+    inline auto dp4a(T1 a, T2 b, T3 c)
+    {
+        return syclcompat::dp4a(a, b, c);
+    }
+
+    struct sub_sat
+    {
+        template <typename T>
+        auto operator()(const T x, const T y) const
+        {
+            return sycl::sub_sat(x, y);
+        }
+    };
+
+    template <typename S, typename T>
+    inline T vectorized_min(T a, T b)
+    {
+        sycl::vec<T, 1> v0{a}, v1{b};
+        auto v2 = v0.template as<S>();
+        auto v3 = v1.template as<S>();
+        auto v4 = sycl::min(v2, v3);
+        v0 = v4.template as<sycl::vec<T, 1>>();
+        return v0;
+    }
+
+    inline float pow(const float a, const int b) { return sycl::pown(a, b); }
+    inline double pow(const double a, const int b) { return sycl::pown(a, b); }
+    inline float pow(const float a, const float b) { return sycl::pow(a, b); }
+    inline double pow(const double a, const double b) { return sycl::pow(a, b); }
+    template <typename T, typename U>
+    inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
+    pow(const T a, const U b)
+    {
+        return sycl::pow(a, static_cast<T>(b));
+    }
+    template <typename T, typename U>
+    inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
+    pow(const T a, const U b)
+    {
+        return sycl::pow(static_cast<double>(a), static_cast<double>(b));
+    }
+
+    inline double min(const double a, const float b)
+    {
+        return sycl::fmin(a, static_cast<double>(b));
+    }
+    inline double min(const float a, const double b)
+    {
+        return sycl::fmin(static_cast<double>(a), b);
+    }
+    inline float min(const float a, const float b) { return sycl::fmin(a, b); }
+    inline double min(const double a, const double b) { return sycl::fmin(a, b); }
+    inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint32_t>(b));
+    }
+    inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
+    {
+        return sycl::min(static_cast<std::uint32_t>(a), b);
+    }
+    inline std::int32_t min(const std::int32_t a, const std::int32_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::int64_t min(const std::int64_t a, const std::int64_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    // max function overloads.
+    // For floating-point types, `float` or `double` arguments are acceptable.
+    // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
+    // `std::int64_t` type arguments are acceptable.
+    inline double max(const double a, const float b)
+    {
+        return sycl::fmax(a, static_cast<double>(b));
+    }
+    inline double max(const float a, const double b)
+    {
+        return sycl::fmax(static_cast<double>(a), b);
+    }
+    inline float max(const float a, const float b) { return sycl::fmax(a, b); }
+    inline double max(const double a, const double b) { return sycl::fmax(a, b); }
+    inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint32_t>(b));
+    }
+    inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
+    {
+        return sycl::max(static_cast<std::uint32_t>(a), b);
+    }
+    inline std::int32_t max(const std::int32_t a, const std::int32_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::int64_t max(const std::int64_t a, const std::int64_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+
+    inline void
+    has_capability_or_fail(const sycl::device &dev,
+                           const std::initializer_list<sycl::aspect> &props)
+    {
+        for (const auto &it : props)
+        {
+            if (dev.has(it))
+                continue;
+            switch (it)
+            {
+            case sycl::aspect::fp64:
+                throw std::runtime_error("'double' is not supported in '" +
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
+                break;
+            case sycl::aspect::fp16:
+                throw std::runtime_error("'half' is not supported in '" +
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
+                break;
+            default:
+#define __SYCL_ASPECT(ASPECT, ID) \
+    case sycl::aspect::ASPECT:    \
+        return #ASPECT;
+#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
+#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
+                auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
+                {
+                    switch (AspectNum)
+                    {
+#include <sycl/info/aspects.def>
+#include <sycl/info/aspects_deprecated.def>
+                    default:
+                        return "unknown aspect";
+                    }
+                };
+#undef __SYCL_ASPECT_DEPRECATED_ALIAS
+#undef __SYCL_ASPECT_DEPRECATED
+#undef __SYCL_ASPECT
+                throw std::runtime_error(
+                    "'" + getAspectNameStr(it) + "' is not supported in '" +
+                    dev.get_info<sycl::info::device::name>() + "' device");
+            }
+            break;
+        }
+    }
+
+    static inline unsigned int get_current_device_id()
+    {
+        return dev_mgr::instance().current_device_id();
+    }
+
+    static inline device_ext &get_current_device()
+    {
+        return dev_mgr::instance().current_device();
+    }
+
+    static inline device_ext &get_device(unsigned int id)
+    {
+        return dev_mgr::instance().get_device(id);
+    }
+
+    static inline sycl::queue &get_in_order_queue()
+    {
+        return dev_mgr::instance().current_device().in_order_queue();
+    }
+
+    static sycl::event
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+        if (!size)
+            return sycl::event{};
+        return q.memcpy(to_ptr, from_ptr, size, dep_events);
+        GGML_UNUSED(direction);
+    }
+
+    // Get actual copy range and make sure it will not exceed range.
+    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                        size_t pitch)
+    {
+        return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+    }
+
+    static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                    size_t pitch)
+    {
+        return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+    }
+
+    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+    /// and \p from_range to another specified by \p to_ptr and \p to_range.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+        // RAII for host pointer
+        class host_buffer
+        {
+            void *_buf;
+            size_t _size;
+            sycl::queue &_q;
+            const std::vector<sycl::event> &_deps; // free operation depends
+
+        public:
+            host_buffer(size_t size, sycl::queue &q,
+                        const std::vector<sycl::event> &deps)
+                : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+            void *get_ptr() const { return _buf; }
+            size_t get_size() const { return _size; }
+            ~host_buffer()
+            {
+                if (_buf)
+                {
+                    _q.submit([&](sycl::handler &cgh)
+                              {
+            cgh.depends_on(_deps);
+            cgh.host_task([buf = _buf] { std::free(buf); }); });
+                }
+            }
+        };
+        std::vector<sycl::event> event_list;
+
+        size_t to_slice = to_range.get(1) * to_range.get(0),
+               from_slice = from_range.get(1) * from_range.get(0);
+        unsigned char *to_surface =
+            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+        const unsigned char *from_surface =
+            (const unsigned char *)from_ptr +
+            get_offset(from_id, from_slice, from_range.get(0));
+
+        if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+        {
+            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                direction, dep_events)};
+        }
+        direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+        size_t size_slice = size.get(1) * size.get(0);
+        switch (direction)
+        {
+        case host_to_host:
+            for (size_t z = 0; z < size.get(2); ++z)
+            {
+                unsigned char *to_ptr = to_surface;
+                const unsigned char *from_ptr = from_surface;
+                if (to_range.get(0) == from_range.get(0) &&
+                    to_range.get(0) == size.get(0))
+                {
+                    event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                                     direction, dep_events));
+                }
+                else
+                {
+                    for (size_t y = 0; y < size.get(1); ++y)
+                    {
+                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                                         direction, dep_events));
+                        to_ptr += to_range.get(0);
+                        from_ptr += from_range.get(0);
+                    }
+                }
+                to_surface += to_slice;
+                from_surface += from_slice;
+            }
+            break;
+        case host_to_device:
+        {
+            host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                            event_list);
+            std::vector<sycl::event> host_events;
+            if (to_slice == size_slice)
+            {
+                // Copy host data to a temp host buffer with the shape of target.
+                host_events =
+                    dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                host_to_host, dep_events);
+            }
+            else
+            {
+                // Copy host data to a temp host buffer with the shape of target.
+                host_events = dpct_memcpy(
+                    q, buf.get_ptr(), from_surface, to_range, from_range,
+                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+                    // If has padding data, not sure whether it is useless. So fill temp
+                    // buffer with it.
+                    std::vector<sycl::event>{
+                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                    device_to_host, dep_events)});
+            }
+            // Copy from temp host buffer to device with only one submit.
+            event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                             buf.get_size(), host_to_device,
+                                             host_events));
+            break;
+        }
+        case device_to_host:
+        {
+            host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                            event_list);
+            // Copy from host temp buffer to host target with reshaping.
+            event_list = dpct_memcpy(
+                q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+                sycl::id<3>(0, 0, 0), size, host_to_host,
+                // Copy from device to temp host buffer with only one submit.
+                std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                     buf.get_size(),
+                                                     device_to_host, dep_events)});
+            break;
+        }
+        case device_to_device:
+            event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                          {
+        cgh.depends_on(dep_events);
+        cgh.parallel_for<class dpct_memcpy_3d_detail>(
+            size,
+            [=](sycl::id<3> id) {
+                to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                    from_surface[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+        break;
+        default:
+            throw std::runtime_error("dpct_memcpy: invalid direction value");
+        }
+        return event_list;
+    }
+
+    /// memcpy 2D/3D matrix specified by pitched_data.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
+    {
+        return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                           size, direction);
+    }
+
+    /// memcpy 2D matrix with pitch.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
+    {
+        return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                           sycl::range<3>(from_pitch, y, 1),
+                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                           sycl::range<3>(x, y, 1), direction);
+    }
+
+    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                     oneapi::mkl::transpose b_trans, int m, int n, int k,
+                     const void *alpha, const void *a, library_data_t a_type,
+                     int lda, const void *b, library_data_t b_type, int ldb,
+                     const void *beta, void *c, library_data_t c_type, int ldc,
+                     library_data_t scaling_type)
+    {
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_impl<std::complex<float>, std::complex<float>,
+                              std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_impl<std::complex<double>, std::complex<double>,
+                              std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                                          lda, b, ldb, beta, c, ldc);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                                          a, lda, b, ldb, &beta_half, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                              oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+            break;
+        }
+#endif // __INTEL_MKL__
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    } // gemm()
+
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
+    inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
+                           int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
+                           const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
+                           library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
+                           matrix_info_t<float> * matrix_info) {
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
+                                                                beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
+                                                                    beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
+                matrix_info);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    }
+
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] stride_a Stride between the different A matrices.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] stride_b Stride between the different B matrices.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] stride_c Stride between the different C matrices.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a, library_data_t a_type,
+                           int lda, long long int stride_a, const void *b,
+                           library_data_t b_type, int ldb, long long int stride_b,
+                           const void *beta, void *c, library_data_t c_type,
+                           int ldc, long long int stride_c, int batch_size,
+                           library_data_t scaling_type)
+    {
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, stride_a, b, ldb, stride_b,
+                                                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           stride_a, b, ldb, stride_b, beta, c, ldc,
+                                           stride_c, batch_size);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                                                  a, lda, stride_a, b, ldb, stride_b,
+                                                  beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+                &beta_half, c, ldc, stride_c, batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    }
+
+    static inline void
+    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                      size_t from_pitch, size_t x, size_t y,
+                      memcpy_direction direction = automatic,
+                      sycl::queue &q = get_default_queue())
+    {
+        detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
+                            direction);
+    }
+
+    using err0 = detail::generic_error_type<struct err0_tag, int>;
+    using err1 = detail::generic_error_type<struct err1_tag, int>;
+
+    static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) {
+        detail::dpct_free(ptr, q);
+    }
+
+    /// dpct accessor used as device function parameter.
+    template <class T, memory_region Memory, size_t Dimension> class accessor;
+    template <class T, memory_region Memory> class accessor<T, Memory, 3> {
+    public:
+        using memory_t = detail::memory_traits<Memory, T>;
+        using element_t = typename memory_t::element_t;
+        using pointer_t = typename memory_t::pointer_t;
+        using accessor_t = typename memory_t::template accessor_t<3>;
+        accessor(pointer_t data, const sycl::range<3> &in_range)
+            : _data(data), _range(in_range) {}
+        template <memory_region M = Memory>
+        accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
+            : accessor(acc, acc.get_range()) {}
+        accessor(const accessor_t &acc, const sycl::range<3> &in_range)
+            : accessor(acc.get_pointer(), in_range) {}
+        accessor<T, Memory, 2> operator[](size_t index) const {
+            sycl::range<2> sub(_range.get(1), _range.get(2));
+            return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
+        }
+
+        pointer_t get_ptr() const { return _data; }
+
+    private:
+        pointer_t _data;
+        sycl::range<3> _range;
+    };
+    template <class T, memory_region Memory> class accessor<T, Memory, 2> {
+    public:
+        using memory_t = detail::memory_traits<Memory, T>;
+        using element_t = typename memory_t::element_t;
+        using pointer_t = typename memory_t::pointer_t;
+        using accessor_t = typename memory_t::template accessor_t<2>;
+        accessor(pointer_t data, const sycl::range<2> &in_range)
+            : _data(data), _range(in_range) {}
+        template <memory_region M = Memory>
+        accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
+            : accessor(acc, acc.get_range()) {}
+        accessor(const accessor_t &acc, const sycl::range<2> &in_range)
+            : accessor(acc.get_pointer(), in_range) {}
+
+        pointer_t operator[](size_t index) const {
+            return _data + _range.get(1) * index;
+        }
+
+        pointer_t get_ptr() const { return _data; }
+
+    private:
+        pointer_t _data;
+        sycl::range<2> _range;
+    };
+
+    namespace detail {
+        /// Device variable with address space of shared, global or constant.
+        template <class T, memory_region Memory, size_t Dimension> class device_memory {
+        public:
+            using accessor_t =
+                typename detail::memory_traits<Memory,
+                                            T>::template accessor_t<Dimension>;
+            using value_t = typename detail::memory_traits<Memory, T>::value_t;
+            using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>;
+
+            device_memory() : device_memory(sycl::range<Dimension>(1)) {}
+
+            /// Constructor of 1-D array with initializer list
+            device_memory(const sycl::range<Dimension> &in_range,
+                        std::initializer_list<value_t> &&init_list)
+                : device_memory(in_range) {
+                assert(init_list.size() <= in_range.size());
+                _host_ptr = (value_t *)std::malloc(_size);
+                std::memset(_host_ptr, 0, _size);
+                std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
+            }
+
+            /// Constructor of 2-D array with initializer list
+            template <size_t D = Dimension>
+            device_memory(
+                const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
+                std::initializer_list<std::initializer_list<value_t>> &&init_list)
+                : device_memory(in_range) {
+                assert(init_list.size() <= in_range[0]);
+                _host_ptr = (value_t *)std::malloc(_size);
+                std::memset(_host_ptr, 0, _size);
+                auto tmp_data = _host_ptr;
+                for (auto sub_list : init_list) {
+                    assert(sub_list.size() <= in_range[1]);
+                    std::memcpy(tmp_data, sub_list.begin(),
+                                sub_list.size() * sizeof(T));
+                    tmp_data += in_range[1];
+                }
+            }
+
+            /// Constructor with range
+            device_memory(const sycl::range<Dimension> &range_in)
+                : _size(range_in.size() * sizeof(T)), _range(range_in),
+                _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) {
+                static_assert(
+                    (Memory == global) || (Memory == constant) || (Memory == shared),
+                    "device memory region should be global, constant or shared");
+                // Make sure that singleton class mem_mgr and dev_mgr will destruct
+                // later than this.
+                detail::mem_mgr::instance();
+                dev_mgr::instance();
+            }
+
+            /// Constructor with range
+            template <class... Args>
+            device_memory(Args... Arguments)
+                : device_memory(sycl::range<Dimension>(Arguments...)) {}
+
+            ~device_memory() {
+                if (_device_ptr && !_reference)
+                    dpct::dpct_free(_device_ptr);
+                if (_host_ptr)
+                    std::free(_host_ptr);
+            }
+
+            /// Allocate memory with default queue, and init memory if has initial
+            /// value.
+            void init() { init(dpct::get_default_queue()); }
+            /// Allocate memory with specified queue, and init memory if has initial
+            /// value.
+            void init(sycl::queue &q) {
+                if (_device_ptr)
+                    return;
+                if (!_size)
+                    return;
+                allocate_device(q);
+                if (_host_ptr)
+                    detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size,
+                                        host_to_device);
+            }
+
+            /// The variable is assigned to a device pointer.
+            void assign(value_t *src, size_t size) {
+                this->~device_memory();
+                new (this) device_memory(src, size);
+            }
+
+            /// Get memory pointer of the memory object, which is virtual pointer when
+            /// usm is not used, and device pointer when usm is used.
+            value_t *get_ptr() { return get_ptr(get_default_queue()); }
+            /// Get memory pointer of the memory object, which is virtual pointer when
+            /// usm is not used, and device pointer when usm is used.
+            value_t *get_ptr(sycl::queue &q) {
+                init(q);
+                return _device_ptr;
+            }
+
+            /// Get the device memory object size in bytes.
+            size_t get_size() { return _size; }
+
+            template <size_t D = Dimension>
+            typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
+                init();
+                return _device_ptr[index];
+            }
+
+            /// Get dpct::accessor with dimension info for the device memory object
+            /// when usm is used and dimension is greater than 1.
+            template <size_t D = Dimension>
+            typename std::enable_if<D != 1, dpct_accessor_t>::type
+            get_access([[maybe_unused]] sycl::handler &cgh) {
+                return dpct_accessor_t((T *)_device_ptr, _range);
+            }
+
+        private:
+            device_memory(value_t *memory_ptr, size_t size)
+                : _size(size), _range(size / sizeof(T)), _reference(true),
+                _device_ptr(memory_ptr) {}
+
+            void allocate_device(sycl::queue &q) {
+        #ifndef DPCT_USM_LEVEL_NONE
+                if (Memory == shared) {
+                    _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
+                                                                q.get_context());
+                    return;
+                }
+        #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
+                if (Memory == constant) {
+                    _device_ptr = (value_t *)sycl::malloc_device(
+                        _size, q.get_device(), q.get_context(),
+                        sycl::ext::oneapi::property::usm::device_read_only());
+                    return;
+                }
+        #endif
+        #endif
+                _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
+            }
+
+            size_t _size;
+            sycl::range<Dimension> _range;
+            bool _reference;
+            value_t *_host_ptr;
+            value_t *_device_ptr;
+        };
+        template <class T, memory_region Memory>
+        class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
+        public:
+            using base = device_memory<T, Memory, 1>;
+            using value_t = typename base::value_t;
+            using accessor_t =
+                typename detail::memory_traits<Memory, T>::template accessor_t<0>;
+
+            /// Constructor with initial value.
+            device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
+
+            /// Default constructor
+            device_memory() : base(1) {}
+        };
+        } // namespace detail
+
+    template <class T, size_t Dimension>
+    using global_memory = detail::device_memory<T, global, Dimension>;
+    template <class T, size_t Dimension>
+    using constant_memory = detail::device_memory<T, constant, Dimension>;
+    template <class T, size_t Dimension>
+    using shared_memory = detail::device_memory<T, shared, Dimension>;
+
+
+    template <typename T,
+            sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space,
+            sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+            sycl::memory_scope memoryScope = sycl::memory_scope::device>
+    inline T atomic_fetch_add(T *addr, T operand) {
+    auto atm =
+        sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+    return atm.fetch_add(operand);
+    }
+
+    template <sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space,
+            sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+            sycl::memory_scope memoryScope = sycl::memory_scope::device,
+            typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
+    auto atm =
+        sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+    return atm.fetch_add(operand);
+    }
+
+    template <typename T, sycl::access::address_space addressSpace =
+                            sycl::access::address_space::global_space>
+    inline T atomic_fetch_add(T *addr, T operand,
+                            sycl::memory_order memoryOrder) {
+    switch (memoryOrder) {
+        case sycl::memory_order::relaxed:
+            return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
+                                    sycl::memory_scope::device>(addr, operand);
+        case sycl::memory_order::acq_rel:
+            return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
+                                    sycl::memory_scope::device>(addr, operand);
+        case sycl::memory_order::seq_cst:
+            return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
+                                    sycl::memory_scope::device>(addr, operand);
+        default:
+            assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                            "atomics are: sycl::memory_order::relaxed, "
+                            "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+        }
+    }
+
+    template <sycl::access::address_space addressSpace =
+                sycl::access::address_space::global_space,
+            typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand,
+                            sycl::memory_order memoryOrder) {
+    atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
+    }
+
+} // COPY from DPCT head files
+
+#endif // GGML_SYCL_DPCT_HELPER_HPP
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
new file mode 100644
index 000000000..4bcd74376
--- /dev/null
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -0,0 +1,1030 @@
+#include "common.hpp"
+#include "element_wise.hpp"
+
+void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+void gelu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f * xi *
+             (1.0f +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+}
+
+void silu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+}
+
+void gelu_quick_f32(const float *x, float *dst, int k,
+                           const sycl::nd_item<3> &item_ct1) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+}
+
+void tanh_f32(const float *x, float *dst, int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::tanh((float)(x[i]));
+}
+
+void relu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0);
+}
+
+void sigmoid_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
+}
+
+void sqrt_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::sqrt(x[i]);
+}
+
+void sin_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::sin(x[i]);
+}
+
+void cos_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::cos(x[i]);
+}
+
+void hardsigmoid_f32(const float * x, float * dst, const int k,
+                            const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+void hardswish_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+}
+
+void exp_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::exp(x[i]);
+}
+
+void log_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    float xi = x[i];
+    if (xi <= 0) {
+        dst[i] = -INFINITY;
+    } else {
+        dst[i] = sycl::log(xi);
+    }
+}
+
+void neg_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = -x[i];
+}
+
+void step_f32(const float * x, float * dst, const int k,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] > 0.0f;
+}
+
+void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
+             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+}
+
+void sqr_f32(const float * x, float * dst, const int k,
+                    const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
+void upscale_f32(const float  *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+               item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
+        return;
+    }
+    // operation
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+}
+
+void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+                    const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
+        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                         item_ct1.get_group(0) * ne00 * ne01;
+            dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+
+
+void acc_f32_sycl(const float *x, const float *y, float *dst,
+                         const int n_elements, const int ne10, const int ne11,
+                         const int ne12, const int nb1, const int nb2,
+                         const int offset, queue_ptr stream) {
+    int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+                    item_ct1);
+        });
+}
+
+void gelu_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_f32(x, dst, k, item_ct1);
+        });
+}
+
+void silu_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            silu_f32(x, dst, k, item_ct1);
+        });
+}
+
+void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
+                                queue_ptr stream) {
+    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_quick_f32(x, dst, k, item_ct1);
+        });
+}
+
+void tanh_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            tanh_f32(x, dst, k, item_ct1);
+        });
+}
+
+void relu_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            relu_f32(x, dst, k, item_ct1);
+        });
+}
+
+void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
+                                 queue_ptr stream) {
+    const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            hardsigmoid_f32(x, dst, k, item_ct1);
+        });
+}
+
+void hardswish_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            hardswish_f32(x, dst, k, item_ct1);
+        });
+}
+
+void exp_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            exp_f32(x, dst, k, item_ct1);
+        });
+}
+
+void log_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            log_f32(x, dst, k, item_ct1);
+        });
+}
+
+void neg_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            neg_f32(x, dst, k, item_ct1);
+        });
+}
+
+void step_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            step_f32(x, dst, k, item_ct1);
+        });
+}
+
+void sigmoid_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sigmoid_f32(x, dst, k, item_ct1);
+        });
+}
+
+void sqrt_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqrt_f32(x, dst, k, item_ct1);
+        });
+}
+
+void sin_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sin_f32(x, dst, k, item_ct1);
+        });
+}
+
+void cos_f32_sycl(const float *x, float *dst, const int k,
+                               queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            cos_f32(x, dst, k, item_ct1);
+        });
+}
+
+void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
+                                const float negative_slope,
+                                queue_ptr stream) {
+    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+        });
+}
+
+void sqr_f32_sycl(const float *x, float *dst, const int k,
+                         queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqr_f32(x, dst, k, item_ct1);
+        });
+}
+
+void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
+        });
+}
+
+void pad_f32_sycl(const float *x, float *dst, const int ne00,
+                         const int ne01, const int ne02, const int ne0,
+                         const int ne1, const int ne2, queue_ptr stream) {
+    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+        });
+}
+
+inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1, ggml_tensor *dst,
+                                     const float *src0_dd, const float *src1_dd,
+                                     float *dst_dd,
+                                     const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
+
+    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                     main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_sycl(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+
+void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+
+void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_exp);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_div);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
new file mode 100644
index 000000000..464432645
--- /dev/null
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -0,0 +1,76 @@
+#ifndef GGML_SYCL_ELEMENTWISE_HPP
+#define GGML_SYCL_ELEMENTWISE_HPP
+
+#include "common.hpp"
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+
+void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ELEMENTWISE_HPP
diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp
new file mode 100644
index 000000000..3f0f34ad6
--- /dev/null
+++ b/ggml/src/ggml-sycl/gemm.hpp
@@ -0,0 +1,101 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_GEMM_HPP
+#define GGML_SYCL_GEMM_HPP
+
+#include <fstream>
+#include <iostream>
+
+#include "ggml-sycl.h"
+
+#if GGML_SYCL_DNNL
+
+#include "dnnl.hpp"
+#include "dnnl_sycl.hpp"
+
+class DnnlGemmWrapper {
+public:
+    using dt = dnnl::memory::data_type;
+    using tag = dnnl::memory::format_tag;
+
+    template<typename T>
+    static constexpr dt to_dt() {
+        if constexpr (std::is_same_v<T, float>) return dt::f32;
+        else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
+        else static_assert(0);
+    }
+
+    static inline void row_gemm(sycl::queue& q, bool a_trans,
+        bool b_trans, int m, int n, int k,
+        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
+    {
+        // Get the device associated with the queue
+        sycl::device dev = q.get_device();
+        // Get the context associated with the queue
+        sycl::context ctx = q.get_context();
+        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
+        const dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
+        dnnl::memory::dims a_dims = { m, k };
+        dnnl::memory::dims b_dims = { k, n };
+        dnnl::memory::dims c_dims = { m, n };
+        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
+        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
+        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
+        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
+        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
+
+        // Create the primitive.
+        auto matmul_prim = dnnl::matmul(matmul_pd);
+        // Primitive arguments.
+        std::unordered_map<int, dnnl::memory> matmul_args;
+        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
+        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
+        matmul_args.insert({ DNNL_ARG_DST, c_mem });
+
+        matmul_prim.execute(stream, matmul_args);
+    }
+
+
+    static inline void row_gemm(const dnnl::stream& stream, bool a_trans,
+        bool b_trans, int m, int n, int k,
+        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
+    {
+        auto const eng = stream.get_engine();
+        dnnl::memory::dims a_dims = { m, k };
+        dnnl::memory::dims b_dims = { k, n };
+        dnnl::memory::dims c_dims = { m, n };
+        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
+        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
+        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
+        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
+        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
+        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
+        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
+
+        // Create the primitive.
+        auto matmul_prim = dnnl::matmul(matmul_pd);
+        // Primitive arguments.
+        std::unordered_map<int, dnnl::memory> matmul_args;
+        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
+        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
+        matmul_args.insert({ DNNL_ARG_DST, c_mem });
+
+        matmul_prim.execute(stream, matmul_args);
+    }
+};
+
+#endif
+
+#endif // GGML_SYCL_GEMM_HPP
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
new file mode 100644
index 000000000..3d24d2165
--- /dev/null
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -0,0 +1,4800 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <float.h>
+#include <limits>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <regex>
+
+#include <sycl/sycl.hpp>
+#include <sycl/half_type.hpp>
+
+#include "ggml-sycl.h"
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-sycl/backend.hpp"
+#include "ggml-sycl/presets.hpp"
+#include "ggml-sycl/gemm.hpp"
+
+static bool g_sycl_loaded = false;
+
+static ggml_sycl_device_info ggml_sycl_init() {
+    ggml_sycl_device_info info = {};
+
+    info.device_count = dpct::dev_mgr::instance().device_count();
+    if (info.device_count == 0) {
+        GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
+        return info;
+    }
+
+    GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
+
+    int64_t total_vram = 0;
+/* This is a bit misleading;  reserved for later */
+// #if defined(SYCL_USE_XMX)
+//     GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
+// #else
+//     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
+// #endif
+    for (int i = 0; i < info.device_count; ++i) {
+        info.devices[i].vmm = 0;
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(i))));
+
+        info.default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
+
+        info.devices[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+
+        info.max_work_group_sizes[i] = prop.get_max_work_group_size();
+    }
+
+    for (int id = 0; id < info.device_count; ++id) {
+        info.default_tensor_split[id] /= total_vram;
+    }
+    return info;
+}
+
+const ggml_sycl_device_info & ggml_sycl_info() {
+    static ggml_sycl_device_info info = ggml_sycl_init();
+    return info;
+}
+
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
+    dpct::device_info prop;
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        dpct::get_device_info(prop, device)));
+
+    std::string version;
+    version += std::to_string(prop.get_major_version());
+    version += ".";
+    version += std::to_string(prop.get_minor_version());
+
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+    std::string name = std::string(prop.get_name());
+    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
+    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
+
+    auto global_mem_size = prop.get_global_mem_size()/1000000;
+    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+            name.c_str(), version.c_str(), prop.get_max_compute_units(),
+            prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
+            global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
+}
+
+void ggml_backend_sycl_print_sycl_devices() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
+    int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
+    GGML_LOG_INFO("Found %d SYCL devices:\n", device_count);
+
+    GGML_LOG_INFO(
+        "|  |                   |                                       |      "
+        " |Max    |        |Max  |Global |                     |\n");
+    GGML_LOG_INFO(
+        "|  |                   |                                       |      "
+        " |compute|Max work|sub  |mem    |                     |\n");
+    GGML_LOG_INFO(
+        "|ID|        Device Type|                                   "
+        "Name|Version|units  |group   |group|size   |       Driver version|\n");
+    GGML_LOG_INFO(
+        "|--|-------------------|---------------------------------------|------"
+        "-|-------|--------|-----|-------|---------------------|\n");
+
+    for (int id = 0; id < device_count; ++id) {
+      sycl::device device = dpct::dev_mgr::instance().get_device(id);
+      std::string backend_type = get_device_backend_and_type(device);
+      int type_id = DeviceNums[backend_type]++;
+      std::stringstream device_type;
+      device_type << "[" << backend_type << ":" << std::to_string(type_id)
+                  << "]";
+      print_device_detail(id, device, device_type.str());
+    }
+}
+
+static inline int get_sycl_env(const char *env_name, int default_val) {
+    char *user_device_string = getenv(env_name);
+    int user_number = default_val;
+
+    unsigned n;
+    if (user_device_string != NULL &&
+        sscanf(user_device_string, " %u", &n) == 1) {
+        user_number = (int)n;
+    } else {
+        user_number = default_val;
+    }
+    return user_number;
+}
+
+static void ggml_check_sycl() try {
+    static bool initialized = false;
+
+    if (!initialized) {
+        GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
+        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+        GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
+#if defined(GGML_SYCL_FORCE_MMQ)
+        GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ:   yes\n");
+#else
+        GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ:   no\n");
+#endif
+#if defined(GGML_SYCL_F16)
+        GGML_LOG_INFO("GGML_SYCL_F16: yes\n");
+#else
+        GGML_LOG_INFO("GGML_SYCL_F16: no\n");
+#endif
+
+/* NOT REMOVE, keep it for next optimize for XMX.
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+*/
+
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+                            dpct::dev_mgr::instance().device_count()) != 0) {
+            initialized = true;
+            g_sycl_loaded = false;
+            return;
+        }
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+
+        initialized = true;
+        g_sycl_loaded = true;
+        ggml_backend_sycl_print_sycl_devices();
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+/*
+device_index: device index from 0 to n (continue numbers).
+    It is used for device select/set in SYCL backend internal data structure.
+*/
+inline void check_allow_gpu_index(const int device_index) {
+  if (device_index >= ggml_sycl_info().device_count) {
+    char error_buf[256];
+    snprintf(
+        error_buf,
+        sizeof(error_buf),
+        "%s error: device_index:%d is out of range: [0-%d]",
+        __func__,
+        device_index,
+        ggml_sycl_info().device_count - 1);
+    GGML_LOG_ERROR("%s\n", error_buf);
+    assert(false);
+  }
+}
+
+GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_gpu_list\n");
+    for(int i=0;i<max_len;i++) id_list[i] = -1;
+
+    for (int i=0;i< ggml_sycl_info().device_count;i++){
+        if (i>=max_len) break;
+        id_list[i] = i;
+    }
+    return;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+// sycl buffer
+
+struct ggml_backend_sycl_buffer_context {
+    int device;
+    void * dev_ptr = nullptr;
+    queue_ptr stream;
+    std::string name;
+
+     ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
+        device(device), dev_ptr(dev_ptr), stream(stream) {
+            check_allow_gpu_index(device);
+            name = (GGML_SYCL_NAME + std::to_string(device));
+        }
+
+
+    ~ggml_backend_sycl_buffer_context() {
+        if (dev_ptr != nullptr) {
+            ggml_sycl_set_device(device);
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
+        }
+    }
+};
+
+static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
+
+static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
+    return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
+}
+
+static void
+ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+    ggml_sycl_set_device(ctx->device);
+
+    delete ctx;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+    return ctx->dev_ptr;
+}
+
+static void
+ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                     ggml_tensor *tensor) try {
+    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
+
+    if (tensor->view_src != NULL) {
+        assert(tensor->view_src->buffer->buft == buffer->buft);
+        return;
+    }
+
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        size_t original_size = ggml_nbytes(tensor);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset(
+                (char *)tensor->data + original_size, 0,
+                padded_size - original_size).wait()));
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                ggml_tensor *tensor,
+                                                const void *data, size_t offset,
+                                                size_t size) try {
+
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    ggml_sycl_set_device(ctx->device);
+    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+    char* host_buf = (char*)malloc(size);
+    memcpy(host_buf, data, size);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
+                             .wait()));
+    free(host_buf);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                const ggml_tensor *tensor,
+                                                void *data, size_t offset,
+                                                size_t size) try {
+
+    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    ggml_sycl_set_device(ctx->device);
+    auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue();
+
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream.memcpy(data, (const char *)tensor->data + offset, size)
+            .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
+                    const void *ptr_src, size_t size) {
+    char *host_buf = (char *)malloc(size);
+    q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
+    q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
+    free(host_buf);
+}
+
+static bool
+ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
+                                    const ggml_tensor *src,
+                                    ggml_tensor *dst) try {
+    if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
+        ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
+
+        ggml_sycl_set_device(src_ctx->device);
+        /*
+        DPCT1009:198: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw()));
+        ggml_sycl_set_device(dst_ctx->device);
+        /*
+        DPCT1009:199: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw()));
+        /*
+        DPCT1009:200: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+
+        queue_ptr stream_dst = dst_ctx->stream;
+        queue_ptr stream_src = src_ctx->stream;
+        size_t size = ggml_nbytes(src);
+
+        //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs.
+        dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size);
+
+//todo, it's known issue：error in device2device cross GPUs. reused when the issue is fixed. DON"T remove
+#if 0
+        SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(
+            (char *)dst->data, (const char *)src->data, size).wait()));
+
+        /*
+        DPCT1009:201: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw()));
+#endif
+        return true;
+    }
+    return false;
+    GGML_UNUSED(buffer);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
+                                           uint8_t value) try {
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+    ggml_sycl_set_device(ctx->device);
+    queue_ptr stream = ctx->stream;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
+                                    .memset(ctx->dev_ptr, value, buffer->size)
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_sycl_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_sycl_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// sycl buffer type
+struct ggml_backend_sycl_buffer_type_context {
+    int device;
+    std::string name;
+
+    // each buffer type has its own stream
+    queue_ptr stream = nullptr;
+};
+
+static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+
+static ggml_backend_buffer_t
+ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                           size_t size) try {
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_sycl_set_device(buft_ctx->device);
+    const queue_ptr stream = buft_ctx->stream;
+    size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
+
+    void * dev_ptr;
+    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
+                                    size, *stream)));
+    if (!dev_ptr) {
+      GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+      return nullptr;
+    }
+    ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
+    return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    return dpct::get_current_device().get_max_mem_alloc_size();
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    size_t size = ggml_nbytes(tensor);
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    GGML_UNUSED(buft);
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_sycl_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_sycl_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
+
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
+    auto dev_count = ggml_backend_sycl_get_device_count();
+
+    if (device>=dev_count or device<0) {
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device, dev_count-1);
+        GGML_ASSERT(device<dev_count);
+    }
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
+
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
+
+    if (!ggml_backend_sycl_buffer_type_initialized) {
+        for (int i = 0; i < dev_count; i++) {
+            auto & device_i = dpct::dev_mgr::instance().get_device(i);
+            queue_ptr stream = &(device_i.default_queue());
+            ggml_backend_sycl_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
+                /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), i),
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
+            };
+        }
+        ggml_backend_sycl_buffer_type_initialized = true;
+    }
+    return &ggml_backend_sycl_buffer_types[device];
+}
+
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
+    int device = ctx->device;
+    if (device>=ggml_sycl_info().device_count or device<0) {
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device, ggml_sycl_info().device_count-1);
+        GGML_ASSERT(device<ggml_sycl_info().device_count);
+    }
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
+
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
+
+    if (!ggml_backend_sycl_buffer_type_initialized) {
+        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+            ggml_backend_sycl_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
+                /* .device   = */ nullptr,
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
+            };
+        }
+        ggml_backend_sycl_buffer_type_initialized = true;
+    }
+    return &ggml_backend_sycl_buffer_types[device];
+}
+
+// sycl split buffer
+
+static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
+            if (min_compute_capability > ggml_sycl_info().devices[i].cc) {
+                min_compute_capability = ggml_sycl_info().devices[i].cc;
+            }
+            if (max_compute_capability < ggml_sycl_info().devices[i].cc) {
+                max_compute_capability = ggml_sycl_info().devices[i].cc;
+            }
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
+        case GGML_TYPE_IQ3_S:
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split, int id) {
+    const int64_t nrows = ggml_nrows(tensor);
+    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+
+    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
+    *row_low -= *row_low % rounding;
+    if (id == ggml_sycl_info().device_count - 1) {
+        *row_high = nrows;
+    } else {
+        *row_high = nrows*tensor_split[id + 1];
+        *row_high -= *row_high % rounding;
+    }
+}
+
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
+struct ggml_backend_sycl_split_buffer_type_context {
+    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split;
+};
+
+struct ggml_backend_sycl_split_buffer_context {
+    ~ggml_backend_sycl_split_buffer_context() try {
+        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+                for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+                    if (extra->events[i][is] != nullptr) {
+                        /*
+                        DPCT1009:206: SYCL uses exceptions to report errors and
+                        does not use the error codes. The original code was
+                        commented out and a warning string was inserted. You
+                        need to rewrite this code.
+                        */
+                        SYCL_CHECK(CHECK_TRY_ERROR(
+                            dpct::destroy_event(extra->events[i][is])));
+                    }
+                }
+                if (extra->data_device[i] != nullptr) {
+                    /*
+                    DPCT1009:207: SYCL uses exceptions to report errors and does
+                    not use the error codes. The original code was commented out
+                    and a warning string was inserted. You need to rewrite this
+                    code.
+                    */
+                    ggml_sycl_set_device(i);
+                    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
+                        extra->data_device[i], *(streams[i]))));
+                }
+            }
+            delete extra;
+        }
+    }
+    catch (sycl::exception const &exc) {
+      std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+                << ", line:" << __LINE__ << std::endl;
+      std::exit(1);
+    }
+
+    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
+    std::vector<queue_ptr> streams;
+};
+
+static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+static void
+ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                           ggml_tensor *tensor) try {
+    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+
+    ctx->tensor_extras.push_back(extra);
+        ctx->streams.push_back(&(dpct::get_current_device().default_queue()));
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        // FIXME: do not crash if SYCL Buffer alloc fails
+        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
+        ggml_sycl_set_device(i);
+        const queue_ptr stream = ctx->streams[i];
+        char * buf;
+        /*
+        DPCT1009:208: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
+                                        size, *stream)));
+        if (!buf) {
+            char err_buf[1024];
+            snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+            throw std::runtime_error(err_buf);
+        }
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            /*
+            DPCT1009:209: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                (*stream)
+                    .memset(buf + original_size, 0, size - original_size)
+                    .wait()));
+        }
+
+        extra->data_device[i] = buf;
+
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            /*
+            DPCT1009:210: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
+        }
+    }
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void
+ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                          ggml_tensor *tensor, const void *data,
+                                          size_t offset, size_t size) try {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        const char * buf_host = (const char *)data + offset_split;
+        /*
+        DPCT1009:211: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        ggml_sycl_set_device(i);
+        const queue_ptr stream = ctx->streams[i];
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            (*stream)
+                .memcpy(extra->data_device[i], buf_host, original_size)
+                .wait()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void
+ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                          const ggml_tensor *tensor, void *data,
+                                          size_t offset, size_t size) try {
+    // split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+
+    ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
+    ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf_host = (char *)data + offset_split;
+        /*
+        DPCT1009:212: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        ggml_sycl_set_device(i);
+        const queue_ptr stream = ctx->streams[i];
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            (*stream)
+                .memcpy(buf_host, extra->data_device[i], original_size)
+                .wait()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_sycl_split_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_sycl_split_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_sycl_split_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// sycl split buffer type
+
+static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return GGML_SYCL_NAME "_Split";
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+   return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // instead, we allocate them for each tensor separately in init_tensor
+    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        int64_t row_low, row_high;
+        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i);
+
+        int64_t nrows_split = row_high - row_low;
+        if (nrows_split == 0) {
+            continue;
+        }
+
+        total_size += ggml_nbytes_split(tensor, nrows_split);
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_sycl_split_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_sycl_split_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_sycl_split_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size   = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
+    /* .is_host          = */ ggml_backend_sycl_split_buffer_type_is_host,
+};
+
+ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+    ggml_check_sycl();
+    // FIXME: this is not thread safe
+    static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split_arr = {};
+
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; });
+    if (all_zero) {
+        tensor_split_arr = ggml_sycl_info().default_tensor_split;
+    } else {
+        float split_sum = 0.0f;
+        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            tensor_split_arr[i] = split_sum;
+            split_sum += tensor_split[i];
+        }
+        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            tensor_split_arr[i] /= split_sum;
+        }
+    }
+
+    auto it = buft_map.find(tensor_split_arr);
+    if (it != buft_map.end()) {
+        return &it->second;
+    }
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_sycl_split_buffer_type_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0),
+        /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
+    };
+
+    auto result = buft_map.emplace(tensor_split_arr, buft);
+    return &result.first->second;
+}
+
+// host buffer type
+
+static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_SYCL_NAME "_Host";
+
+    GGML_UNUSED(buft);
+}
+
+static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_sycl_host_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_sycl_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_sycl_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0),
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_sycl_buffer_type_host;
+}
+
+// buffer pool for sycl (legacy)
+struct ggml_sycl_pool_leg : public ggml_sycl_pool {
+    static const int MAX_SYCL_BUFFERS = 256;
+
+    int device;
+    queue_ptr qptr;
+    struct ggml_sycl_buffer {
+        void * ptr = nullptr;
+        size_t size = 0;
+    };
+
+    ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
+    size_t pool_size = 0;
+
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
+
+    ~ggml_sycl_pool_leg() {
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            ggml_sycl_buffer & b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
+                pool_size -= b.size;
+            }
+        }
+        GGML_ASSERT(pool_size == 0);
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+#ifdef DEBUG_sycl_MALLOC
+        int nnz = 0;
+        size_t max_size = 0;
+#endif
+        size_t best_diff = 1ull << 36;
+        int ibest = -1;
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            ggml_sycl_buffer& b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+#ifdef DEBUG_sycl_MALLOC
+                ++nnz;
+                if (b.size > max_size) max_size = b.size;
+#endif
+                if (b.size >= size) {
+                    size_t diff = b.size - size;
+                    if (diff < best_diff) {
+                        best_diff = diff;
+                        ibest = i;
+                        if (!best_diff) {
+                            void * ptr = b.ptr;
+                            *actual_size = b.size;
+                            b.ptr = nullptr;
+                            b.size = 0;
+                            return ptr;
+                        }
+                    }
+                }
+            }
+        }
+        if (ibest >= 0) {
+            ggml_sycl_buffer& b = buffer_pool[ibest];
+            void * ptr = b.ptr;
+            *actual_size = b.size;
+            b.ptr = nullptr;
+            b.size = 0;
+            return ptr;
+        }
+        void * ptr;
+        size_t look_ahead_size = (size_t) (1.05 * size);
+
+        SYCL_CHECK(
+            CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
+                                look_ahead_size, *qptr)));
+        if (!ptr) {
+            GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
+            return nullptr;
+        }
+
+        *actual_size = look_ahead_size;
+        pool_size += look_ahead_size;
+
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+                (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
+
+        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
+        return ptr;
+    }
+
+    void free(void * ptr, size_t size) override {
+        for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+            ggml_sycl_buffer& b = buffer_pool[i];
+            if (b.ptr == nullptr) {
+                b.ptr = ptr;
+                b.size = size;
+                return;
+            }
+        }
+        GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
+        pool_size -= size;
+    }
+};
+
+struct ggml_sycl_pool_host : public ggml_sycl_pool {
+    queue_ptr qptr;
+    int       device;
+
+    inline static int counter{ 0 };
+
+    struct ggml_sycl_buffer {
+        void * ptr  = nullptr;
+        size_t size = 0;
+    };
+
+    // Set arbitrarly to 64
+    static constexpr int          MAX_POOL_SIZE{ 64 };
+    std::vector<ggml_sycl_buffer> buffer_pool = std::vector<ggml_sycl_buffer>(MAX_POOL_SIZE);
+    size_t                        pool_size   = 0;
+
+    explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
+
+    ~ggml_sycl_pool_host() {
+        for (int i = 0; i < MAX_POOL_SIZE; ++i) {
+            ggml_sycl_buffer & b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
+                b.ptr = nullptr;
+                pool_size -= b.size;
+                b.size = 0;
+            }
+        }
+        counter = 0;
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+        if (counter == MAX_POOL_SIZE) {
+            ggml_sycl_buffer b               = buffer_pool[0];
+            void *           ptr             = b.ptr;
+            *actual_size                     = b.size;
+            counter                          = 1;
+            return ptr;
+        }
+        ggml_sycl_buffer & b = buffer_pool[counter];
+
+        if (b.ptr == nullptr) {
+            void * ptr;
+
+            SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr)));
+            if (!ptr) {
+                GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
+                return nullptr;
+            }
+            pool_size += size;
+            *actual_size = size;
+            counter      = counter + 1;
+            return ptr;
+        } else {
+            ++counter;
+            b.size = size;
+            return b.ptr;
+        }
+    }
+
+    void free(void * ptr, size_t size) override {
+        // if the pool is not completed add the pointer to it in place of the first nullptr found.
+        // Otherwise do nothing, pointers will be freed once the pool is deallocated.
+        for (int i = 0; i < MAX_POOL_SIZE; ++i) {
+            ggml_sycl_buffer & b = buffer_pool[i];
+            if (b.ptr == nullptr) {
+                b.ptr  = ptr;
+                b.size = size;
+                return;
+            }
+        }
+    }
+};
+
+std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) {
+    // return pool for the host to speed up memory management
+    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_host(qptr, device));
+}
+
+std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
+    // TBD: NO VMM support
+    // if (ggml_sycl_info().devices[device].vmm) {
+    //     return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
+    // }
+   return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
+}
+
+// TBD pool with virtual memory management
+// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
+
+/// kernels
+
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+typedef void (*ggml_sycl_op_mul_mat_t)(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const queue_ptr &stream);
+
+
+
+template<int QUANT_BLOCK_TILE>
+static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
+
+    if (ix >= kx_padded) {
+        return;
+    }
+
+    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                   item_ct1.get_local_id(1);
+
+    const int i_padded = iy*kx_padded + ix;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
+    typedef  sycl::vec<float, QUANT_BLOCK_TILE> TC;
+    typedef  sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
+    TC zeros;
+    TQ qzeros;
+#pragma unroll
+    for (int i = 0; i < QUANT_BLOCK_TILE; i++)
+    {
+        zeros[i] = 0.f;
+        qzeros[i] = 0;
+    }
+    const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
+    float sum = xi[0];
+    float amax = sycl::fabs(xi[0]);
+#pragma unroll
+    for (int i = 1; i < QUANT_BLOCK_TILE; i++)
+    {
+        sum += xi[i];
+        amax = sycl::fmax(sycl::fabs(xi[i]), amax);
+    }
+    sum = warp_reduce_sum(sum, item_ct1);
+    amax = warp_reduce_max(amax, item_ct1);
+
+    const float d = amax / 127;
+    TQ q = qzeros;
+    if (amax != 0.0f)
+    {
+#pragma unroll
+        for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
+            q[i] = sycl::round(xi[i] / d);
+        }
+    }
+
+    *(TQ *)&y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
+    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
+}
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                     item_ct1.get_local_id(2)) *
+                    2;
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0] = v.x();
+    dst_row[iybs + iqs + y_offset] = v.y();
+}
+
+template<typename src0_t, typename dst_t>
+static void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                    item_ct1.get_local_id(2);
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+static void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / (nchannels_y / nchannels_x);
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
+        const float xi =
+            sycl::vec<sycl::half, 1>(x[ix])
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const int row_y = col_x;
+
+
+        // y is not transposed but permuted
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // dst is not transposed and not permuted
+    const int idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / channel_x_divisor;
+
+    const int nrows_y   = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst   = row_x;
+
+    const int idst = channel*nrows_dst + row_dst;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        const int row_y = col_x;
+
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
+        const int iy = channel*nrows_y + row_y;
+
+        const float xi =
+            sycl::vec<sycl::half, 1>(x[ix])
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = sycl::vec<float, 1>(*xi)
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+}
+
+static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const sycl::half *xi = (const sycl::half *)cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+    const sycl::half *xi = (const sycl::half *)cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
+    const int16_t *xi = (const int16_t *)cxi;
+    int16_t *dsti = (int16_t *)cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t *xi = (const int32_t *)cxi;
+    int32_t *dsti = (int32_t *)cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                        const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = sycl::fmax(amax, sycl::fabs((float)v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = sycl::round((float)x0);
+    }
+}
+
+static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float)v)) {
+            amax = sycl::fabs((float)v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                      const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                      const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                      const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2)) *
+                  qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(1);
+    const int col = item_ct1.get_local_id(2);
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum, item_ct1);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+
+template<typename T>
+static inline void ggml_sycl_swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template <ggml_sort_order order>
+__dpct_inline__ static void
+k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad,
+                  const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) {
+    // bitonic sort
+    int col = item_ct1.get_local_id(2);
+    int row = item_ct1.get_group(1);
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    auto dst_row = (int *)dpct_local;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            /*
+            DPCT1118:1: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            item_ct1.barrier(sycl::access::fence_space::local_space);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+
+static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
+                              const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void scale_f32(const float * x, float * dst, const float scale, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+template <typename Ti, typename To>
+static  void pool2d_nchw_kernel(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const Ti* src, To* dst, const enum ggml_op_pool op,
+        const sycl::nd_item<3> &item_ct1) {
+        int idx = item_ct1.get_local_id(2) +
+                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
+        if (idx >= parallel_elements) {
+            return;
+        }
+
+        const int I_HW = ih * iw;
+        const int O_HW = oh * ow;
+        const int nc = idx / O_HW;
+        const int cur_oh = idx % O_HW / ow;
+        const int cur_ow = idx % O_HW % ow;
+        const Ti* i_ptr = src + nc * I_HW;
+        To* o_ptr = dst + nc * O_HW;
+        const int start_h = cur_oh * sh - ph;
+        const int bh = sycl::max(0, start_h);
+        const int eh = sycl::min(ih, start_h + kh);
+        const int start_w = cur_ow * sw - pw;
+        const int bw = sycl::max(0, start_w);
+        const int ew = sycl::min(iw, start_w + kw);
+
+        To res = 0;
+
+        switch (op) {
+            case GGML_OP_POOL_AVG: res = 0; break;
+            case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+            default:
+                res      = (To) sycl::nan(uint32_t(0));
+                break;
+        }
+
+        for (int i = bh; i < eh; i += 1) {
+            for (int j = bw; j < ew; j += 1) {
+#if DPCT_COMPATIBILITY_TEMP >= 350
+                /*
+                DPCT1098:106: The '*' expression is used instead of the __ldg
+                call. These two expressions do not provide the exact same
+                functionality. Check the generated code for potential precision
+                and/or performance issues.
+                */
+                Ti cur = *(i_ptr + i * iw + j);
+#else
+                Ti cur = i_ptr[i * iw + j];
+#endif
+                switch (op) {
+                    case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
+                    case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+                    default:
+                        res = (To) sycl::nan(uint32_t(0));
+                        break;
+                }
+            }
+        }
+        o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
+template <int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst, const void *src0_dd,
+                          const int32_t *src1_dd, float *dst_dd,
+                          queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             k_get_rows<qk, qr, dq>(
+                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
+}
+
+template <typename src0_t>
+static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const src0_t *src0_dd, const int32_t *src1_dd,
+                                float *dst_dd, queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+            });
+    }
+
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
+}
+
+static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
+                                   const int ky, const int kx_padded,
+                                   queue_ptr stream) {
+    const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
+    const sycl::range<3> num_blocks(1, ky, block_num_x);
+    int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
+    static_assert(QK8_1 % WARP_SIZE == 0);
+    const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(num_blocks * block_size, block_size),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
+            });
+    }
+}
+
+static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
+                                           float *dst, const int ncols_x,
+                                           const int nrows_x,
+                                           const int nchannels_x,
+                                           const int nchannels_y,
+                                           queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
+                                     nchannels_y, item_ct1);
+            });
+    }
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_sycl(
+    const void *vx, const float *y, float *dst, const int ncols_x,
+    const int nrows_x, const int row_stride_x, const int nchannels_x,
+    const int nchannels_y, const int channel_stride_x, queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
+                                       row_stride_x, channel_stride_x,
+                                       nchannels_y / nchannels_x, item_ct1);
+            });
+    }
+}
+
+static void
+ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00,
+                      const int ne01, const int ne02, const int nb00,
+                      const int nb01, const int nb02, const int nb03,
+                      const int ne10, const int ne11, const int ne12,
+                      const int nb10, const int nb11, const int nb12,
+                      const int nb13, queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00,
+                                           nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int ne02, const int nb00,
+                                  const int nb01, const int nb02,
+                                  const int nb03, const int ne10,
+                                  const int ne11, const int ne12,
+                                  const int nb10, const int nb11,
+                                  const int nb12, const int nb13,
+                                  queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int ne02, const int nb00,
+                                  const int nb01, const int nb02,
+                                  const int nb03, const int ne10,
+                                  const int ne11, const int ne12,
+                                  const int nb10, const int nb11,
+                                  const int nb12, const int nb13,
+                                  queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int ne02, const int nb00,
+                                   const int nb01, const int nb02,
+                                   const int nb03, const int ne10,
+                                   const int ne11, const int ne12,
+                                   const int nb10, const int nb11,
+                                   const int nb12, const int nb13,
+                                   queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
+                                 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                 item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int ne02, const int nb00,
+                                   const int nb01, const int nb02,
+                                   const int nb03, const int ne10,
+                                   const int ne11, const int ne12,
+                                   const int nb10, const int nb11,
+                                   const int nb12, const int nb13,
+                                   queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
+                                 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                 item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int ne02, const int nb00,
+                                   const int nb01, const int nb02,
+                                   const int nb03, const int ne10,
+                                   const int ne11, const int ne12,
+                                   const int nb10, const int nb11,
+                                   const int nb12, const int nb13,
+                                   queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
+                                 cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                 item_ct1);
+                         });
+}
+
+static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int ne02, const int nb00,
+                                  const int nb01, const int nb02,
+                                  const int nb03, const int ne10,
+                                  const int ne11, const int ne12,
+                                  const int nb10, const int nb11,
+                                  const int nb12, const int nb13,
+                                  queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int ne02, const int nb00,
+                                  const int nb01, const int nb02,
+                                  const int nb03, const int ne10,
+                                  const int ne11, const int ne12,
+                                  const int nb10, const int nb11,
+                                  const int nb12, const int nb13,
+                                  queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int ne02, const int nb00,
+                                  const int nb01, const int nb02,
+                                  const int nb03, const int ne10,
+                                  const int ne11, const int ne12,
+                                  const int nb10, const int nb11,
+                                  const int nb12, const int nb13,
+                                  queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                           nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                           item_ct1);
+            });
+    }
+}
+
+static void scale_f32_sycl(const float *x, float *dst, const float scale,
+                           const int k, queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            scale_f32(x, dst, scale, k, item_ct1);
+        });
+}
+
+static void clamp_f32_sycl(const float *x, float *dst, const float min,
+                           const float max, const int k,
+                           queue_ptr stream) {
+    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            clamp_f32(x, dst, min, max, k, item_ct1);
+        });
+}
+
+static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
+                              const int nrows, queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1)
+                             [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
+                             });
+}
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
+                                 const int nrows, ggml_sort_order order,
+                                 queue_ptr stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const sycl::range<3> block_dims(1, 1, ncols_pad);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+                sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
+                        x, dst, ncols, ncols_pad, item_ct1,
+                        dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
+                            .get());
+                });
+        });
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+                sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
+                        x, dst, ncols, ncols_pad, item_ct1,
+                        dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
+                            .get());
+                });
+        });
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
+                               const int nrows, queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    const size_t shared_mem = 256 * sizeof(float);
+
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::local_accessor<float, 1> shared_data(
+            sycl::range<1>(shared_mem/sizeof(float)), cgh);
+        sycl::local_accessor<int, 1> shared_indices(
+            sycl::range<1>(shared_mem/sizeof(float)), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                const int tid = item_ct1.get_local_id(2);
+                const int row = item_ct1.get_global_id(1);
+
+                float max_val = -INFINITY;
+                int max_idx = -1;
+
+                for (int col = tid; col < ncols; col += 256) {
+                    float val = x[row * ncols + col];
+                    if (val > max_val) {
+                        max_val = val;
+                        max_idx = col;
+                    }
+                }
+
+                shared_data[tid] = max_val;
+                shared_indices[tid] = max_idx;
+                item_ct1.barrier(sycl::access::fence_space::local_space);
+
+                for (int stride = 256/2; stride > 0; stride >>= 1) {
+                    if (tid < stride) {
+                        float val1 = shared_data[tid];
+                        float val2 = shared_data[tid + stride];
+                        if (val2 > val1) {
+                            shared_data[tid] = val2;
+                            shared_indices[tid] = shared_indices[tid + stride];
+                        }
+                    }
+                    item_ct1.barrier(sycl::access::fence_space::local_space);
+                }
+
+
+                if (tid == 0) {
+                    dst[row] = shared_indices[0];
+                }
+            });
+    });
+}
+static void diag_mask_inf_f32_sycl(const float *x, float *dst,
+                                   const int ncols_x, const int nrows_x,
+                                   const int rows_per_channel, const int n_past,
+                                   queue_ptr stream) {
+    const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             diag_mask_inf_f32(x, dst, ncols_x,
+                                               rows_per_channel, n_past,
+                                               item_ct1);
+                         });
+}
+
+static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
+                                          const struct ggml_tensor *src,
+                                          int64_t i3, int64_t i2,
+                                          int64_t i1_low, int64_t i1_high,
+                                          queue_ptr stream) try {
+
+    dpct::memcpy_direction kind;
+    char * src_ptr;
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        kind = dpct::host_to_device;
+        //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
+        src_ptr = (char *) src->data;
+        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
+    } else if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        // If buffer is a SYCL buffer
+        //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
+        kind    = dpct::device_to_device;
+        src_ptr = (char *) src->data;
+    } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
+        /*
+        If buffer is a SYCL split buffer
+        */
+        //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
+        GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
+        kind = dpct::device_to_device;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            id = get_current_device_id()));
+        // GGML_SYCL_DEBUG("current device index %d\n", id);
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n");
+        GGML_ABORT("fatal error");
+    }
+    char * dst_ptr = (char *) dst;
+
+    GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
+    GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
+    const enum ggml_type type = src->type;
+    const int64_t ts = ggml_type_size(type);
+    const int64_t bs = ggml_blck_size(type);
+    int64_t i1_diff = i1_high - i1_low;
+
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        // GGML_SYCL_DEBUG("stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
+        // return CHECK_TRY_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
+        return CHECK_TRY_ERROR(dpct::async_dpct_memcpy(dst_ptr, x, i1_diff * nb1,
+                                    kind, *stream));
+
+    } else if (nb0 == ts) {
+        return CHECK_TRY_ERROR(
+            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
+                                    ts * ne0 / bs, i1_diff, kind, *stream));
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            dpct::err0 r = CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
+                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
+            /*
+            DPCT1001:85: The statement could not be removed.
+            */
+            /*
+            DPCT1000:86: Error handling if-stmt was detected but could not be
+            rewritten.
+            */
+            if (r != 0) return r;
+        }
+        return 0;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_d, const float *src1_d,
+                                  float *dst_d, const queue_ptr &stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
+                                src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+
+static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_d, const float *src1_d,
+                                float *dst_d,
+                                const queue_ptr &main_stream) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_d);
+}
+
+
+inline void ggml_sycl_op_mul_mat_sycl(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const queue_ptr &stream) try {
+
+    GGML_ASSERT(src0_dd_i  != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_dd_i   != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne10 = src1->ne[0];
+
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+#if !GGML_SYCL_DNNL
+    const int64_t ne0 = dst->ne[0];
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // ldc == nrows of the matrix that cuBLAS writes into
+    int ldc = id == ctx.device ? ne0 : row_diff;
+#endif
+
+#ifdef GGML_SYCL_F16
+    bool use_fp16 = true;  // TODO(Yu) SYCL capability check
+#else
+    bool use_fp16 = false;
+#endif
+    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
+        dst->op_params[0] == GGML_PREC_DEFAULT) {
+
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
+        ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
+            GGML_ASSERT(to_fp16_sycl != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16.alloc(ne);
+            to_fp16_sycl(src0_dd_i, src0_as_f16.get(), ne, stream);
+        }
+        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src0_dd_i
+                                         : src0_as_f16.get();
+
+        ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+            GGML_ASSERT(to_fp16_sycl != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16.alloc(ne);
+            to_fp16_sycl(src1_ddf_i, src1_as_f16.get(), ne, stream);
+        }
+        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
+                ? (const sycl::half *)src1->data + src1_padded_row_size
+                                         : src1_as_f16.get();
+        ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
+
+#if !GGML_SYCL_DNNL
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16  = 0.0f;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
+            *stream, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
+            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
+            dst_f16.get(), dpct::library_data_t::real_half, ldc,
+            dpct::library_data_t::real_half)));
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+#else
+        auto dnnl_stream = ctx.stream_dnnl(stream);
+        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+            src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
+#endif
+    }
+    else {
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
+        ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
+        ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
+            GGML_ASSERT(to_fp32_sycl != nullptr);
+            src0_ddq_as_f32.alloc(row_diff*ne00);
+            to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
+        }
+        if (src1->type != GGML_TYPE_F32) {
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type);
+            GGML_ASSERT(to_fp32_sycl != nullptr);
+            src1_ddq_as_f32.alloc(src1_ncols*ne10);
+            to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
+
+#if !GGML_SYCL_DNNL
+        const float alpha = 1.0f;
+        const float beta  = 0.0f;
+#    ifdef GGML_SYCL_NVIDIA
+        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+            oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
+            ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+#    else
+        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+            *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
+            dst_dd_i, ldc)));
+#    endif
+#else
+        auto dnnl_stream = ctx.stream_dnnl(stream);
+         DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
+            src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
+#endif
+    }
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd, const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+    const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;
+    sycl::range<3> block_nums(1, 1, num_blocks);
+    main_stream->parallel_for(
+        sycl::nd_range<3>(block_nums *
+                              sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0,
+                               parallel_elements, src0_dd, dst_dd, op,
+                               item_ct1);
+        });
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const queue_ptr &main_stream) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne = ggml_nelements(src0);
+
+    sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    SYCL_CHECK(0);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    SYCL_CHECK(0);
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
+
+static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_SYCL_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
+    }
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
+
+        for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) {
+            if (i == id_other) {
+                continue;
+            }
+            if (i != main_device && id_other != main_device) {
+                continue;
+            }
+
+            // int can_access_peer;
+            // SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            // if (can_access_peer) {
+            //     if (enable_peer_access) {
+            //         SYCL_CHECK(syclDeviceEnablePeerAccess(id_other, 0));
+            //     } else {
+            //         SYCL_CHECK(syclDeviceDisablePeerAccess(id_other));
+            //     }
+            // }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
+static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 ggml_sycl_op_mul_mat_t op,
+                                 const bool convert_src1_to_q8_1) try {
+
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+    const int64_t nrows1 = ggml_nrows(src1);
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
+
+    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
+
+    const int64_t i02_divisor = ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src1_is_contiguous = ggml_is_contiguous(src1);
+
+    int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
+
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
+    GGML_ASSERT(!(split && ne02 > 1));
+    GGML_ASSERT(!(split && ne03 > 1));
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> tensor_split;
+    if (split) {
+        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
+        // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        tensor_split = buft_ctx->tensor_split;
+    }
+
+    struct dev_data {
+        ggml_sycl_pool_alloc<char> src0_dd_alloc;
+        ggml_sycl_pool_alloc<float> src1_ddf_alloc;
+        ggml_sycl_pool_alloc<char> src1_ddq_alloc;
+        ggml_sycl_pool_alloc<float> dst_dd_alloc;
+
+        char *src0_dd = nullptr;
+        float *src1_ddf = nullptr; // float
+        char *src1_ddq = nullptr;  // q8_1
+        float *dst_dd = nullptr;
+
+        int64_t row_low;
+        int64_t row_high;
+    };
+
+    dev_data dev[GGML_SYCL_MAX_DEVICES];
+
+    int used_devices = 0;
+    queue_ptr main_stream = ctx.stream();
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        // by default, use all rows
+        dev[i].row_low  = 0;
+        dev[i].row_high = ne01;
+
+        // for multi GPU, get the row boundaries from tensor split
+        // and round to mul_mat_q tile sizes
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+
+            if (i != 0) {
+                dev[i].row_low  = ne01*tensor_split[i];
+                if (dev[i].row_low < ne01) {
+                    dev[i].row_low -= dev[i].row_low % rounding;
+                }
+            }
+
+            if (i != ggml_sycl_info().device_count - 1) {
+                dev[i].row_high  = ne01*tensor_split[i + 1];
+                if (dev[i].row_high < ne01) {
+                    dev[i].row_high -= dev[i].row_high % rounding;
+                }
+            }
+        }
+    }
+
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
+            continue;
+        }
+
+        used_devices++;
+
+        const bool src1_on_device = i == ctx.device;
+        const bool  dst_on_device = i == ctx.device;
+
+        ggml_sycl_set_device(i);
+        queue_ptr stream = ctx.stream(i, 0);
+
+        if (src0_is_contiguous) {
+            dev[i].src0_dd = (char *) src0->data;
+        } else {
+            dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
+        }
+
+        if (src1_on_device && src1_is_contiguous) {
+            dev[i].src1_ddf = (float *) src1->data;
+        } else {
+            dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
+        }
+
+        if (convert_src1_to_q8_1) {
+            dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+
+            if (src1_on_device && src1_is_contiguous) {
+                quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                /*
+                DPCT1010:90: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                SYCL_CHECK(0);
+            }
+        }
+
+        if (dst_on_device) {
+            dev[i].dst_dd = (float *) dst->data;
+        } else {
+            const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
+            dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
+        }
+    }
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signals that the main device has finished calculating the input data
+    if (split && used_devices > 1) {
+        ggml_sycl_set_device(ctx.device);
+        /*
+        DPCT1024:91: The original code returned the error code that was further
+        consumed by the program logic. This original code was replaced with 0.
+        You may need to rewrite the program logic consuming the error code.
+        */
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            *src0_extra->events[ctx.device][0] =
+                ctx.stream()->ext_oneapi_submit_barrier()));
+    }
+
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
+        const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
+        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
+
+        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
+                continue;
+            }
+
+            const bool src1_on_device = i == ctx.device;
+            const bool  dst_on_device = i == ctx.device;
+            const int64_t row_diff = dev[i].row_high - dev[i].row_low;
+
+            ggml_sycl_set_device(i);
+            queue_ptr stream = ctx.stream(i, is);
+
+            // wait for main GPU data if necessary
+            if (split && (i != ctx.device || is != 0)) {
+                /*
+                DPCT1009:163: SYCL uses exceptions to report errors and does not
+                use the error codes. The original code was commented out and a
+                warning string was inserted. You need to rewrite this code.
+                */
+                SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
+                    {*src0_extra->events[ctx.device][0]})));
+            }
+
+            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
+                const int64_t i03 = i0 / ne12;
+                const int64_t i02 = i0 % ne12;
+
+                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char  *  src0_dd_i =  dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+                float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
+                char  * src1_ddq_i = dev[i].src1_ddq +  src1_ddq_i_offset;
+                float *   dst_dd_i =   dev[i].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (i == ctx.device) {
+                    dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (src1_is_contiguous) {
+                    if (i != ctx.device) {
+                        if (convert_src1_to_q8_1) {
+                            char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
+                          SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
+                                src1_ddq_i, src1_ddq_i_source,
+                                src1_ncols * src1_padded_col_size * q8_1_ts /
+                                    q8_1_bs).wait()));
+                        } else {
+
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device];
+                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
+
+                            SYCL_CHECK(CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream,
+                                src1_ddf_i, src1_ddf_i_source,
+                                src1_ncols * ne10 * sizeof(float))));
+                        }
+                    }
+                } else if (src1_on_device && !src1_is_contiguous) {
+                    SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
+                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                } else {
+                    GGML_ABORT("fatal error");
+                }
+
+                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
+                    quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    /*
+                    DPCT1010:92: SYCL uses exceptions to report errors and does
+                    not use the error codes. The call was replaced with 0. You
+                    need to rewrite this code.
+                    */
+                    SYCL_CHECK(0);
+                }
+
+                if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
+                    SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
+                }
+                if (src1->type == GGML_TYPE_F16) {
+                    src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
+                }
+                // do the computation
+                SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+                    dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
+                /*
+                DPCT1010:93: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                SYCL_CHECK(0);
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device = dst->data;
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;
+
+                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
+                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
+                            row_diff * sizeof(float), row_diff * sizeof(float),
+                            src1_ncols, dpct::device_to_device, *stream)));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0;
+                        SYCL_CHECK(CHECK_TRY_ERROR(
+                            stream->memcpy(dhf_dst_i, dst_dd_i,
+                                           src1_ncols * ne0 * sizeof(float)).wait()));
+                    }
+                }
+
+                // add event for the main device to wait on until other device is done
+                if (split && (i != ctx.device || is != 0)) {
+                    /*
+                    DPCT1024:94: The original code returned the error code that
+                    was further consumed by the program logic. This original
+                    code was replaced with 0. You may need to rewrite the
+                    program logic consuming the error code.
+                    */
+                    SYCL_CHECK(CHECK_TRY_ERROR(
+                        *src0_extra->events[i][is] =
+                            stream->ext_oneapi_submit_barrier()));
+                }
+            }
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && ggml_sycl_info().device_count > 1) {
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;
+
+        ggml_sycl_set_device(ctx.device);
+        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+            if (dev[i].row_low == dev[i].row_high) {
+                continue;
+            }
+            for (int64_t is = 0; is < is_max; ++is) {
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    ctx.stream()->ext_oneapi_submit_barrier(
+                        {*src0_extra->events[i][is]})));
+            }
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+
+static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_repeat);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_get_rows);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_norm);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rms_norm);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst) try {
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    queue_ptr main_stream = ctx.stream();
+
+    void  * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf  = (float *) dst->data;
+
+    ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1,
+                                     ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    queue_ptr main_stream = ctx.stream();
+
+    void  * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf  = (float *) dst->data;
+
+    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
+    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
+
+    ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
+                                   const sycl::half *src1_as_f16, char *dst,
+                                   const void **ptrs_src, void **ptrs_dst,
+                                   int64_t ne12, int64_t ne13, int64_t ne23,
+                                   size_t nb02, size_t nb03, size_t nb12,
+                                   size_t nb13, size_t nbd2, size_t nbd3,
+                                   int64_t r2, int64_t r3,
+                                   const sycl::nd_item<3> &item_ct1) {
+    int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                  item_ct1.get_local_id(2);
+    int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
+                  item_ct1.get_local_id(1);
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
+}
+
+static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
+                                             const ggml_tensor *src0,
+                                             const ggml_tensor *src1,
+                                             ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    queue_ptr main_stream = ctx.stream();;
+
+    void * src0_ddq = src0->data;
+    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
+
+    // convert src1 to fp16
+    ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_sycl != nullptr);
+        to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
+                                                       : src1_f16_alloc.get();
+
+    char * dst_t;
+
+    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
+    dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
+    const void * alpha = &alpha_f32;
+    const void * beta  = &beta_f32;
+
+    dst_t = (char *) dst_ddf;
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
+            *main_stream, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const char *)src0_as_f16, dpct::library_data_t::real_half,
+            nb01 / nb00, nb02 / nb00,
+            (const char *)src1_f16, dpct::library_data_t::real_half,
+            nb11 / nb10, nb12 / nb10, beta,
+            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
+            ne12 * ne13, cu_compute_type)));
+    } else {
+        const int ne23 = ne12*ne13;
+
+        ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
+        ggml_sycl_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
+        ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
+
+        sycl::range<3> block_dims(1, ne12, ne13);
+        /*
+        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(main_stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            main_stream->submit([&](sycl::handler &cgh) {
+                const void **ptrs_src_get = ptrs_src.get();
+                void **ptrs_dst_get = ptrs_dst.get();
+                size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
+                size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
+                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
+                                 [=](sycl::nd_item<3> item_ct1) {
+                                     k_compute_batched_ptrs(
+                                         src0_as_f16, src1_f16,
+                                         dst_t, ptrs_src_get,
+                                         ptrs_dst_get, ne12, ne13, ne23,
+                                         nb02, nb03, nb12_scaled, nb13_scaled,
+                                         nbd2, nbd3, r2, r3, item_ct1);
+                                 });
+            });
+        }
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
+            *main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
+            (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
+            (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
+    // TODO: accuracy issues in MMQ
+    GGML_UNUSED(type);
+    return false;
+}
+
+bool ggml_sycl_supports_dmmv(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_F16:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
+    int64_t min_compute_capability = INT_MAX;
+
+    if (split) {
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        auto & tensor_split = buft_ctx->tensor_split;
+        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
+            // skip devices that are not going to do any work:
+            if (tensor_split[id] >= (id + 1 < ggml_sycl_info().device_count ? tensor_split[id + 1] : 1.0f)) {
+                continue;
+            }
+
+            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
+                min_compute_capability = ggml_sycl_info().devices[id].cc;
+            }
+        }
+    } else {
+        min_compute_capability    = ggml_sycl_info().devices[ctx.device].cc;
+    }
+
+    // check data types and tensor shapes for custom matrix multiplication kernels:
+    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+
+    bool use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    bool use_mul_mat_q =  ggml_sycl_supports_mmq(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    // mmvq and mmq need the __dp4a instruction which is available for gen12+
+    // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
+    use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
+#ifdef SYCL_USE_XMX
+    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
+#endif // SYCL_USE_XMX
+
+    // mmvq path is faster in the CUDA backend.
+    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+
+    if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // TODO: Refactor and cleanup of mul mat dispatching.
+        if (src0->ne[3] == 1 && src1->ne[3] == 1) {
+            // KQ single-batch
+            // mmv p021 was specific for these dimensions
+            ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
+        } else {
+            // The kernel from the if path is faster for that specific case, but does not support all mul mats.
+            ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
+        }
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
+        ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch
+        ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
+    } else if (use_dequantize_mul_mat_vec) {
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
+    } else if (use_mul_mat_vec_q) {
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+    } else if (use_mul_mat_q) {
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+    } else {
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+    }
+}
+
+
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+__dpct_inline__ static void k_copy_src1_to_contiguous(
+    const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
+    int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
+    const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+    int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
+    const sycl::nd_item<3> &item_ct1, int &src1_row) {
+    int32_t iid1 = item_ct1.get_group(2);
+    int32_t id = item_ct1.get_group(1);
+
+    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+    if (row_id_i != i02) {
+        return;
+    }
+
+    const int64_t i11 = id % ne11;
+    const int64_t i12 = iid1;
+
+    if (item_ct1.get_local_id(2) == 0) {
+        src1_row =
+            dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
+                cur_src1_row, 1);
+        row_mapping[src1_row] = {id, iid1};
+    }
+    /*
+    DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+#pragma unroll
+    for (int i = item_ct1.get_local_id(2); i < ne10;
+         i += item_ct1.get_local_range(2)) {
+        src1_row_contiguous[i] = src1_row_original[i];
+    }
+}
+
+__dpct_inline__ static void k_copy_dst_from_contiguous(
+    char *__restrict__ dst_original, const char *__restrict__ dst_contiguous,
+    const mmid_row_mapping *__restrict__ row_mapping, int64_t ne0, size_t nb1,
+    size_t nb2, const sycl::nd_item<3> &item_ct1) {
+    int32_t i = item_ct1.get_group(2);
+
+    const int32_t i1 = row_mapping[i].i1;
+    const int32_t i2 = row_mapping[i].i2;
+
+    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+#pragma unroll
+    for (int j = item_ct1.get_local_id(2); j < ne0;
+         j += item_ct1.get_local_range(2)) {
+        dst_row_original[j] = dst_row_contiguous[j];
+    }
+}
+
+static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
+                                 ggml_tensor *dst) try {
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
+
+    const ggml_tensor *ids = dst->src[2];
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const queue_ptr stream = ctx.stream();
+
+    const int64_t n_as = ne02;
+    const int64_t n_ids = ids->ne[0];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    const char * ids_dev = (const char *) ids->data;
+
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
+
+    ggml_tensor src0_row = *src0;
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    char *src0_original = (char *)src0->data;
+    char *src1_original = (char *)src1->data;
+    char *dst_original = (char *)dst->data;
+
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = nb02;
+
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+    if (ne12 == 1) {
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;
+
+            src0_row.data = src0_original + i02*nb02;
+            src1_row.data = src1_original + + i11*nb11 + i12*nb12;
+            dst_row.data = dst_original + i1*nb1   + i2*nb2;
+
+            ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            }
+        }
+    } else {
+        ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+        ggml_sycl_pool_alloc<char>  dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
+
+        src1_row.data = src1_contiguous.get();
+        dst_row.data  =  dst_contiguous.get();
+
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
+            int64_t num_src1_rows = 0;
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+                    if (row_id_i != i02) {
+                        continue;
+                    }
+
+                    num_src1_rows++;
+                }
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+
+            ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
+
+            {
+                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
+                sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
+                stream->submit([&](sycl::handler &cgh) {
+                    sycl::local_accessor<int, 0> src1_row_acc(cgh);
+
+                    char *__restrict src1_contiguous_get =
+                        src1_contiguous.get();
+                    int *__restrict dev_cur_src1_row_get =
+                        dev_cur_src1_row.get();
+                    mmid_row_mapping *__restrict dev_row_mapping_get =
+                        dev_row_mapping.get();
+                    size_t ids_nb_ct6 = ids->nb[1];
+                    size_t ids_nb_ct7 = ids->nb[0];
+
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_copy_src1_to_contiguous(
+                                src1_original, src1_contiguous_get,
+                                dev_cur_src1_row_get,
+                                dev_row_mapping_get, ids_dev, i02,
+                                ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
+                                item_ct1, src1_row_acc);
+                        });
+                });
+            }
+
+            src0_row.data = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+            src1_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.ne[1] = num_src1_rows;
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+            {
+                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
+                sycl::range<3> grid_dims(1, 1, num_src1_rows);
+                stream->submit([&](sycl::handler &cgh) {
+                    const char *__restrict dst_contiguous_get =
+                        dst_contiguous.get();
+                    const mmid_row_mapping *__restrict dev_row_mapping_get =
+                        dev_row_mapping.get();
+
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_copy_dst_from_contiguous(dst_original,
+                                                       dst_contiguous_get,
+                                                       dev_row_mapping_get,
+                                                       ne0, nb1, nb2, item_ct1);
+                        });
+                });
+            }
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale);
+}
+
+static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp);
+}
+
+static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst) try {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    GGML_TENSOR_BINARY_OP_LOCALS01;
+
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    queue_ptr main_stream = ctx.stream();
+
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
+        ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else {
+        GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ABORT("fatal error");
+    }
+    GGML_UNUSED(dst);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
+    ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr);
+}
+
+static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
+}
+
+static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope);
+}
+
+static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pool2d);
+}
+
+static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_im2col);
+}
+
+static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum);
+}
+
+static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum_rows);
+}
+
+static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argsort);
+}
+
+static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argmax);
+}
+
+
+void ggml_sycl_set_main_device(const int main_device) try {
+    if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+        return;
+    }
+    check_allow_gpu_index(main_device);
+    dpct::select_device(main_device);
+
+    if (g_ggml_sycl_debug) {
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(main_device))));
+        GGML_LOG_INFO("Using device %d (%s) as main device\n",
+                main_device, prop.get_name());
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
+    if (!g_sycl_loaded) return false;
+
+    if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
+        ggml_sycl_set_peer_access(dst->src[1]->ne[1], ctx.device);
+    }
+
+    switch (dst->op) {
+        case GGML_OP_ARGMAX:
+            ggml_sycl_argmax(ctx, dst);
+            break;
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            ggml_sycl_op_conv_transpose_1d(ctx, dst);
+            break;
+        case GGML_OP_REPEAT:
+            ggml_sycl_repeat(ctx, dst);
+            break;
+        case GGML_OP_GET_ROWS:
+            ggml_sycl_get_rows(ctx, dst);
+            break;
+        case GGML_OP_DUP:
+            ggml_sycl_dup(ctx, dst);
+            break;
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1: // TODO: more efficient implementation
+            ggml_sycl_add(ctx, dst);
+            break;
+        case GGML_OP_SUB:
+            ggml_sycl_sub(ctx, dst);
+            break;
+        case GGML_OP_ACC:
+            ggml_sycl_acc(ctx, dst);
+            break;
+        case GGML_OP_MUL:
+            ggml_sycl_mul(ctx, dst);
+            break;
+        case GGML_OP_LOG:
+            ggml_sycl_log(ctx, dst);
+            break;
+        case GGML_OP_DIV:
+            ggml_sycl_div(ctx, dst);
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_NEG:
+                    ggml_sycl_neg(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_STEP:
+                    ggml_sycl_step(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_GELU:
+                    ggml_sycl_gelu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    ggml_sycl_silu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    ggml_sycl_gelu_quick(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    ggml_sycl_tanh(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    ggml_sycl_relu(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SIGMOID:
+                    ggml_sycl_sigmoid(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_HARDSIGMOID:
+                    ggml_sycl_hardsigmoid(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_HARDSWISH:
+                    ggml_sycl_hardswish(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_EXP:
+                    ggml_sycl_exp(ctx, dst);
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            ggml_sycl_norm(ctx, dst);
+            break;
+        case GGML_OP_GROUP_NORM:
+            ggml_sycl_group_norm(ctx, dst);
+            break;
+        case GGML_OP_CONCAT:
+            ggml_sycl_op_concat(ctx, dst);
+            break;
+        case GGML_OP_UPSCALE:
+            ggml_sycl_upscale(ctx, dst);
+            break;
+        case GGML_OP_PAD:
+            ggml_sycl_pad(ctx, dst);
+            break;
+        case GGML_OP_LEAKY_RELU:
+            ggml_sycl_leaky_relu(ctx, dst);
+            break;
+        case GGML_OP_RMS_NORM:
+            ggml_sycl_rms_norm(ctx, dst);
+            break;
+        case GGML_OP_MUL_MAT:
+            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
+                return false;
+            }
+            /* ggml_sycl_mul_mat_id is dependent on ggml_sycl_mul_mat */
+            ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst);
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
+                return false;
+            }
+            ggml_sycl_mul_mat_id(ctx, dst);
+            break;
+        case GGML_OP_OUT_PROD:
+            ggml_sycl_op_out_prod(ctx, dst);
+            break;
+        case GGML_OP_SCALE:
+            ggml_sycl_scale(ctx, dst);
+            break;
+        case GGML_OP_SQR:
+            ggml_sycl_sqr(ctx, dst);
+            break;
+        case GGML_OP_SQRT:
+            ggml_sycl_sqrt(ctx, dst);
+            break;
+        case GGML_OP_SIN:
+            ggml_sycl_sin(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_sycl_cos(ctx, dst);
+            break;
+        case GGML_OP_CLAMP:
+            ggml_sycl_clamp(ctx, dst);
+            break;
+        case GGML_OP_CPY:
+            ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst);
+            break;
+        case GGML_OP_CONT:
+            ggml_sycl_dup(ctx, dst);
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            ggml_sycl_diag_mask_inf(ctx, dst);
+            break;
+        case GGML_OP_SOFT_MAX:
+            ggml_sycl_op_soft_max(ctx, dst);
+            break;
+        case GGML_OP_ROPE:
+            ggml_sycl_rope(ctx, dst);
+            break;
+        case GGML_OP_IM2COL:
+            ggml_sycl_im2col(ctx, dst);
+            break;
+        case GGML_OP_POOL_2D:
+            ggml_sycl_pool2d(ctx, dst);
+            break;
+        case GGML_OP_SUM:
+            ggml_sycl_sum(ctx, dst);
+            break;
+        case GGML_OP_SUM_ROWS:
+            ggml_sycl_sum_rows(ctx, dst);
+            break;
+        case GGML_OP_ARGSORT:
+            ggml_sycl_argsort(ctx, dst);
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            ggml_sycl_op_timestep_embedding(ctx, dst);
+            break;
+        case GGML_OP_RWKV_WKV6:
+            ggml_sycl_op_rwkv_wkv6(ctx, dst);
+            break;
+        case GGML_OP_GATED_LINEAR_ATTN:
+            ggml_sycl_op_gated_linear_attn(ctx, dst);
+            break;
+        default:
+            return false;
+    }
+
+    return true;
+}
+
+GGML_API void ggml_backend_sycl_get_device_description(int device, char *description,
+                                      size_t description_size) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_description\n");
+    dpct::device_info prop;
+    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+        prop, dpct::dev_mgr::instance().get_device(device))));
+    snprintf(description, description_size, "%s", prop.get_name());
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_backend_sycl_get_device_memory(int device, size_t *free,
+                                                   size_t *total) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
+    ggml_sycl_set_device(device);
+
+    /*
+    DPCT1009:218: SYCL uses exceptions to report errors and does not use the
+    error codes. The original code was commented out and a warning string was
+    inserted. You need to rewrite this code.
+    */
+    /*
+    DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
+    device information which may not be supported by all compilers or runtimes.
+    You may need to adjust the code.
+    */
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend
+
+static const char * ggml_backend_sycl_get_name(ggml_backend_t backend) {
+
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+
+    return sycl_ctx->name.c_str();
+}
+
+static void ggml_backend_sycl_free(ggml_backend_t backend) {
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+
+    delete sycl_ctx;
+    delete backend;
+}
+
+static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
+                                               ggml_tensor *tensor,
+                                               const void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
+    const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        (stream)->memcpy((char *)tensor->data + offset, data, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
+                                               const ggml_tensor *tensor,
+                                               void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+    GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
+    const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
+        data, (const char *)tensor->data + offset, size).wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
+                                               const ggml_tensor *src,
+                                               ggml_tensor *dst) try {
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
+        /*
+        DPCT1009:215: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
+        SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
+            dst->data, src->data, ggml_nbytes(dst)).wait()));
+        return true;
+    }
+
+    return false;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
+
+    GGML_UNUSED(backend);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    ggml_sycl_set_main_device(sycl_ctx->device);
+
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+#ifndef NDEBUG
+        assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
+            }
+        }
+#endif
+        bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
+        if (!ok) {
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+static void ggml_backend_sycl_event_record(ggml_backend_t backend, ggml_backend_event_t event)
+try
+{
+    ggml_backend_sycl_context *sycl_ctx =
+        (ggml_backend_sycl_context *)backend->context;
+
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+
+    const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    // Record the current state of the queue
+    SYCL_CHECK(CHECK_TRY_ERROR(*sycl_event = stream->ext_oneapi_submit_barrier()));
+}
+catch (sycl::exception const &exc)
+{
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
+
+    sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
+
+    if (ggml_backend_is_sycl(backend)) {
+        SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
+    } else
+        GGML_ABORT("fatal error");
+} catch (sycl::exception const& exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static ggml_backend_i ggml_backend_sycl_interface = {
+    /* .get_name                = */ ggml_backend_sycl_get_name,
+    /* .free                    = */ ggml_backend_sycl_free,
+    /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
+    /* .cpy_tensor_async        = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
+                                           // // TODO: update for the new
+                                           // interface
+    /* .synchronize             = */ ggml_backend_sycl_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
+    /* .event_record            = */ ggml_backend_sycl_event_record,
+    /* .event_wait              = */ ggml_backend_sycl_event_wait,
+};
+
+static ggml_guid_t ggml_backend_sycl_guid() {
+    static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
+    return &guid;
+}
+
+bool ggml_backend_is_sycl(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
+}
+
+int ggml_backend_sycl_get_device_count() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
+    return ggml_sycl_info().device_count;
+}
+
+
+// backend device
+
+struct ggml_backend_sycl_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_sycl_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    ggml_sycl_set_device(ctx->device);
+    SYCL_CHECK(CHECK_TRY_ERROR(
+    dpct::dev_mgr::instance().get_device(ctx->device).get_memory_info(*free, *total)));
+}
+
+static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_sycl_device_get_name(dev);
+    props->description = ggml_backend_sycl_device_get_description(dev);
+    props->type        = ggml_backend_sycl_device_get_type(dev);
+    ggml_backend_sycl_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    bool host_buffer = getenv("GGML_SYCL_NO_PINNED") == nullptr;
+#ifdef GGML_SYCL_NO_PEER_COPY
+    bool events = false;
+#else
+    bool events = true;
+#endif
+
+    props->caps = {
+        /* .async                 = */ true,
+        /* .host_buffer           = */ host_buffer,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ events,
+    };
+}
+
+static ggml_backend_t ggml_backend_sycl_device_init(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(params);
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ggml_backend_sycl_init(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_sycl_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_sycl_device_context * ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return ggml_backend_sycl_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_sycl_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_sycl_host_buffer_type();
+}
+
+static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(ptr);
+    GGML_UNUSED(size);
+    GGML_UNUSED(max_tensor_size);
+    return nullptr;
+}
+
+static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_EXP:
+                    return ggml_is_contiguous(op->src[0]);
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                ggml_type a_type = a->type;
+                if (a_type == GGML_TYPE_IQ4_NL  || a_type == GGML_TYPE_IQ4_XS ||
+                    a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S  ||
+                    a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
+                    a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
+                    ) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                        return false;
+                    }
+                }
+                ggml_type src0_type = op->src[0]->type;
+                if (src0_type == GGML_TYPE_BF16) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_OUT_PROD:
+            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+            } break;
+        case GGML_OP_DUP:
+        case GGML_OP_ARGMAX:
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_REPEAT:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_LOG:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+            return true;
+        case GGML_OP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+            return true;
+        case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+            return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
+        case GGML_OP_IM2COL:
+            // TODO: add support for the new F32 operations
+            return op->src[0]->type == GGML_TYPE_F16;
+        case GGML_OP_POOL_2D:
+        case GGML_OP_SUM:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_RWKV_WKV6:
+        case GGML_OP_GATED_LINEAR_ATTN:
+            return true;
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
+        return false;
+    }
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return buft_ctx->device == sycl_ctx->device;
+}
+
+static int64_t get_op_batch_size(const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_GET_ROWS:
+            return 0;
+        case GGML_OP_MUL_MAT:
+            return op->ne[1];
+        case GGML_OP_MUL_MAT_ID:
+        case GGML_OP_ROPE:
+            return op->ne[2];
+        default:
+            return ggml_nrows(op);
+    }
+}
+
+static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+    return get_op_batch_size(op) >= min_batch_size;
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_event_t
+ggml_backend_sycl_device_event_new(ggml_backend_dev_t dev) {
+
+#ifdef GGML_SYCL_NO_PEER_COPY
+    return nullptr;
+#else
+  sycl::event *event_ptr = new sycl::event();
+
+  return new ggml_backend_event{
+      /* .device = */ dev,
+      /* .context = */ event_ptr,
+  };
+#endif
+}
+
+static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
+  GGML_UNUSED(dev);
+  if (event == nullptr) {
+    return;
+  }
+
+  if (event->context != nullptr) {
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+    delete sycl_event;
+    event->context = nullptr;
+  }
+
+  delete event;
+} catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+
+static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
+  GGML_UNUSED(dev);
+
+  sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+  SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
+} catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static const ggml_backend_device_i ggml_backend_sycl_device_interface = {
+    /* .get_name                = */ ggml_backend_sycl_device_get_name,
+    /* .get_description         = */ ggml_backend_sycl_device_get_description,
+    /* .get_memory              = */ ggml_backend_sycl_device_get_memory,
+    /* .get_type                = */ ggml_backend_sycl_device_get_type,
+    /* .get_props               = */ ggml_backend_sycl_device_get_props,
+    /* .init_backend            = */ ggml_backend_sycl_device_init,
+    /* .get_buffer_type         = */ ggml_backend_sycl_device_get_buffer_type,
+    /* .get_host_buffer_type    = */ ggml_backend_sycl_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr    = */ ggml_backend_sycl_device_buffer_from_host_ptr,
+    /* .supports_op             = */ ggml_backend_sycl_device_supports_op,
+    /* .supports_buft           = */ ggml_backend_sycl_device_supports_buft,
+    /* .offload_op              = */ ggml_backend_sycl_device_offload_op,
+    /* .event_new               = */ ggml_backend_sycl_device_event_new,
+    /* .event_free              = */ ggml_backend_sycl_device_event_free,
+    /* .event_synchronize       = */ ggml_backend_sycl_device_event_synchronize,
+};
+
+// backend reg
+
+struct ggml_backend_sycl_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_sycl_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return GGML_SYCL_NAME;
+}
+
+static size_t ggml_backend_sycl_reg_get_device_count(ggml_backend_reg_t reg) {
+    ggml_backend_sycl_reg_context * ctx = (ggml_backend_sycl_reg_context *)reg->context;
+    return ctx->devices.size();
+}
+
+static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    ggml_backend_sycl_reg_context * ctx = (ggml_backend_sycl_reg_context *)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
+}
+
+static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
+    GGML_UNUSED(reg);
+
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_sycl_split_buffer_type;
+    }
+
+    // SYCL doesn't support registering host memory, left here for reference
+    // "ggml_backend_register_host_buffer"
+    // "ggml_backend_unregister_host_buffer"
+    GGML_UNUSED(name);
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
+    /* .get_name          = */ ggml_backend_sycl_reg_get_name,
+    /* .get_device_count  = */ ggml_backend_sycl_reg_get_device_count,
+    /* .get_device        = */ ggml_backend_sycl_reg_get_device,
+    /* .get_proc_address  = */ ggml_backend_sycl_reg_get_proc_address,
+};
+
+
+// backend registry
+
+ggml_backend_reg_t ggml_backend_sycl_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+
+            for (int i = 0; i < ggml_sycl_info().device_count; i++) {
+                ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
+                dev_ctx->device = i;
+                dev_ctx->name = GGML_SYCL_NAME + std::to_string(i);
+
+                ggml_sycl_set_device(i);
+
+                dpct::device_info prop;
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                    prop, dpct::dev_mgr::instance().get_device(i))));
+
+                dev_ctx->description = prop.get_name();
+
+                ggml_backend_dev_t dev = new ggml_backend_device {
+                    /* .iface       = */ ggml_backend_sycl_device_interface,
+                    /* .reg         = */ &reg,
+                    /* .context     = */ dev_ctx
+                };
+                ctx->devices.push_back(dev);
+            }
+
+            reg = ggml_backend_reg {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_sycl_reg_interface,
+                /* .context     = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+
+    return &reg;
+}
+
+ggml_backend_t ggml_backend_sycl_init(int device) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+    ggml_check_sycl();
+
+    check_allow_gpu_index(device);
+
+    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
+    if (ctx == nullptr) {
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    };
+
+    ggml_backend_t sycl_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_sycl_guid(),
+        /* .interface = */ ggml_backend_sycl_interface,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
+        /* .context   = */ ctx
+    };
+
+    return sycl_backend;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp
new file mode 100644
index 000000000..eedb47486
--- /dev/null
+++ b/ggml/src/ggml-sycl/gla.cpp
@@ -0,0 +1,105 @@
+#include <sycl/sycl.hpp>
+
+#include "common.hpp"
+
+template <u_int HEAD_SIZE>
+static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, u_int T, u_int C, u_int H, float scale,
+                                         const float * k, const float * v, const float * r, const float * td,
+                                         const float * s, float * dst) {
+    const u_int head_size    = HEAD_SIZE;
+    const u_int state_size   = C * head_size;
+    const u_int n_seq_tokens = T / B;
+    sycl::range<1> block_dims((C / H));
+    sycl::range<1> grid_dims((B * H));
+    stream->submit([&](sycl::handler & cgh) {
+        /* local memory accessors*/
+        auto _k  = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
+        auto _r  = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
+        auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
+
+        cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
+            u_int tid = item.get_local_id(0);
+            u_int bid = item.get_group(0);
+
+            u_int batch_i = bid / H;
+            u_int head_i  = bid % H;
+
+            float state[head_size];
+
+#pragma unroll
+            for (u_int i = 0; i < head_size; i++) {
+                state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+            }
+
+            for (u_int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
+                 t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
+
+                item.barrier(sycl::access::fence_space::local_space);  //sync threads
+                _k[tid]  = k[t];
+                _r[tid]  = r[t];
+                _td[tid] = td[t];
+                item.barrier(sycl::access::fence_space::local_space);  //sync threads
+
+                const float _v = v[t];
+                float       y  = 0;
+
+                for (u_int j = 0; j < head_size; j += 4) {
+                    const sycl::float4 & k  = (sycl::float4 &) (_k[j]);
+                    const sycl::float4 & r  = (sycl::float4 &) (_r[j]);
+                    const sycl::float4 & td = (sycl::float4 &) (_td[j]);
+                    sycl::float4 &       s  = (sycl::float4 &) (state[j]);
+                    sycl::float4         kv;
+
+                    kv.x() = k.x() * _v;
+                    kv.y() = k.y() * _v;
+                    kv.z() = k.z() * _v;
+                    kv.w() = k.w() * _v;
+
+                    s.x() = s.x() * td.x() + kv.x();
+                    s.y() = s.y() * td.y() + kv.y();
+                    s.z() = s.z() * td.z() + kv.z();
+                    s.w() = s.w() * td.w() + kv.w();
+
+                    y += r.x() * s.x();
+                    y += r.y() * s.y();
+                    y += r.z() * s.z();
+                    y += r.w() * s.w();
+                }
+                dst[t] = y * scale;
+            }
+#pragma unroll
+            for (u_int i = 0; i < head_size; i++) {
+                dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+            }
+        });
+    });
+}
+
+void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const float * k_d  = static_cast<const float *>(dst->src[0]->data);
+    const float * v_d  = static_cast<const float *>(dst->src[1]->data);
+    const float * r_d  = static_cast<const float *>(dst->src[2]->data);
+    const float * td_d = static_cast<const float *>(dst->src[3]->data);
+    const float * s_d  = static_cast<const float *>(dst->src[4]->data);
+
+    const int64_t B = dst->src[4]->ne[1];
+    const int64_t T = dst->src[0]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[1];
+
+    dpct::queue_ptr stream = ctx.stream();
+    GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == 64 || C / H == 128);
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    float * dst_d = (float *) dst->data;
+
+    if (C / H == 64) {
+        gated_linear_attn_f32_kernel<64>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
+    } else {
+        gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
+    }
+}
diff --git a/ggml/src/ggml-sycl/gla.hpp b/ggml/src/ggml-sycl/gla.hpp
new file mode 100644
index 000000000..607cf3a7f
--- /dev/null
+++ b/ggml/src/ggml-sycl/gla.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_GLA_HPP
+#define GGML_SYCL_GLA_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif  // GGML_SYCL_GLA_HPP
diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp
new file mode 100644
index 000000000..6146a99ed
--- /dev/null
+++ b/ggml/src/ggml-sycl/im2col.cpp
@@ -0,0 +1,126 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "im2col.hpp"
+
+template <typename T>
+static void im2col_kernel(
+        const float *x, T *dst, int64_t batch_offset, int64_t offset_delta,
+        int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
+        int64_t pelements, int64_t CHW, int s0, int s1, int p0, int p1, int d0, int d1,
+        const sycl::nd_item<3> &item_ct1) {
+    const int64_t work_group_size = item_ct1.get_local_range(2);
+    const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
+
+    // make each work-item deal with more elements since sycl global range can not exceed max int
+    for (int64_t i = global_id; i < pelements; i += work_group_size * item_ct1.get_group_range(2)) {
+
+        const int64_t ksize = OW * (KH > 1 ? KW : 1);
+        const int64_t kx = i / ksize;
+        const int64_t kd = kx * ksize;
+        const int64_t ky = (i - kd) / OW;
+        const int64_t ix = i % OW;
+
+        const int64_t  oh = item_ct1.get_group(1);
+        const int64_t  batch = item_ct1.get_group(0) / IC;
+        const int64_t  ic = item_ct1.get_group(0) % IC;
+
+        const int64_t iiw = ix * s0 + kx * d0 - p0;
+        const int64_t iih = oh * s1 + ky * d1 - p1;
+
+        const int64_t offset_dst =
+            ((batch * OH + oh) * OW + ix) * CHW +
+            (ic * (KW * KH) + ky * KW + kx);
+
+        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+            dst[offset_dst] =
+                sycl::vec<float, 1>(0.0f)
+                    .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+        } else {
+            const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+            dst[offset_dst] =
+                sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
+                    .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+        }
+    }
+}
+
+template <typename T>
+static void im2col_sycl(
+        const float *x, T *dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
+        int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta,
+        int s0, int s1, int p0, int p1, int d0, int d1,
+        queue_ptr stream) {
+    const int64_t parallel_elements = OW * KW * KH;
+    const int64_t num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
+
+    // decrease global range when it exceeds the max int
+    int64_t local_size = downsample_sycl_global_range(batch * IC * OH * num_blocks, SYCL_IM2COL_BLOCK_SIZE);
+    sycl::range<3> block_nums(batch * IC, OH, num_blocks);
+    sycl::range<3> local_range(1, 1, local_size);
+
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * local_range, local_range),
+            [=](sycl::nd_item<3> item_ct1) {
+                im2col_kernel(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH,
+                               parallel_elements, (IC * KH * KW), s0, s1, p0,
+                               p1, d0, d1, item_ct1);
+            });
+    }
+}
+
+void ggml_sycl_op_im2col(
+        ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+        ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd,
+        const queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch = src1->ne[3];
+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+
+    if (dst->type == GGML_TYPE_F16) {
+        im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    } else {
+        im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    }
+
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src0_dd);
+    GGML_UNUSED(ctx);
+}
diff --git a/ggml/src/ggml-sycl/im2col.hpp b/ggml/src/ggml-sycl/im2col.hpp
new file mode 100644
index 000000000..7db144fbb
--- /dev/null
+++ b/ggml/src/ggml-sycl/im2col.hpp
@@ -0,0 +1,23 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_IM2COL_HPP
+#define GGML_SYCL_IM2COL_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_im2col(
+        ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+        ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd,
+        const queue_ptr &main_stream);
+
+#endif // GGML_SYCL_IM2COL_HPP
diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
new file mode 100644
index 000000000..8ea82c940
--- /dev/null
+++ b/ggml/src/ggml-sycl/mmq.cpp
@@ -0,0 +1,3031 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "mmq.hpp"
+#include "vecdotq.hpp"
+
+typedef void (*allocate_tiles_sycl_t)(
+    int** x_ql,
+    sycl::half2** x_dm,
+    int** x_qh,
+    int** x_sc);
+typedef void (*load_tiles_sycl_t)(
+    const void* __restrict__ vx,
+    int* __restrict__ x_ql,
+    sycl::half2* __restrict__ x_dm,
+    int* __restrict__ x_qh,
+    int* __restrict__ x_sc,
+    const int& i_offset,
+    const int& i_max,
+    const int& k,
+    const int& blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_sycl_t)(
+    const int* __restrict__ x_ql,
+    const sycl::half2* __restrict__ x_dm,
+    const int* __restrict__ x_qh,
+    const int* __restrict__ x_sc,
+    const int* __restrict__ y_qs,
+    const sycl::half2* __restrict__ y_ms,
+    const int& i,
+    const int& j,
+    const int& k);
+
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs_q4_0, float *tile_x_d_q4_0) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs_q4_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q4_0;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_0;
+    const int kqsx = k % QI4_0;
+
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
+
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const float * x_dmf = (const float *) x_dm;
+
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs_q4_1;
+    *x_dm = tile_x_dm_q4_1;
+}
+
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_1;
+    const int kqsx = k % QI4_1;
+
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+    int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q5_0, float *tile_x_d_q5_0) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql_q5_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q5_0;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_0;
+    const int kqsx = k % QI5_0;
+
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
+        qs0 = dpct::vectorized_binary<sycl::char4>(
+            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
+        qs1 = dpct::vectorized_binary<sycl::char4>(
+            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql_q5_1;
+    *x_dm = tile_x_dm_q5_1;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset < nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_1;
+    const int kqsx = k % QI5_1;
+
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+        int qs0 = (ql >>  0) & 0x0F0F0F0F;
+        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4) & 0x0F0F0F0F;
+        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
+
+    int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs_q8_0, float *tile_x_d_q8_0) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs_q8_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q8_0;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI8_0;
+    const int kqsx = k % QI8_0;
+    float * x_dmf = (float *) x_dm;
+
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K,
+                    int *tile_x_sc_q2_K) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql_q2_K;
+    *x_dm = tile_x_dm_q2_K;
+    *x_sc = tile_x_sc_q2_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI2_K;
+    const int kqsx = k % QI2_K;
+
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
+    }
+}
+
+#define VDR_Q2_K_Q8_1_MMQ  2
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const uint8_t *__restrict__ scales,
+                           const sycl::half2 &dm2, const float &d8) {
+
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m = dpct::dp4a(m, u[i],
+                                sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const sycl::float2 dm2f =
+        dm2.convert<float, sycl::rounding_mode::automatic>();
+
+    return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m);
+}
+
+static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const int kbx = k / QI2_K;
+    const int ky  = (k % QI2_K) * QR2_K;
+    const float * y_df = (const float *) y_ds;
+
+    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
+
+    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
+
+#pragma unroll
+    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
+    }
+
+    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K,
+                    int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) {
+
+    *x_ql = tile_x_ql_q3_K;
+    *x_dm = tile_x_dm_q3_K;
+    *x_qh = tile_x_qh_q3_K;
+    *x_sc = tile_x_sc_q3_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI3_K;
+    const int kqsx = k % QI3_K;
+
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
+
+        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
+
+        const int ksc = k % (QI3_K/4);
+
+        const int ksc_low = ksc % (QI3_K/8);
+        const int shift_low = 4 * (ksc / (QI3_K/8));
+        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+        const int ksc_high = QI3_K/8;
+        const int shift_high = 2 * ksc;
+        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+        const int sc = dpct::vectorized_binary<sycl::char4>(
+            sc_low | sc_high, 0x20202020, dpct::sub_sat());
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
+    }
+}
+
+#define VDR_Q3_K_Q8_1_MMQ  2
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ scales, const float &d3,
+                           const float &d8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+}
+
+static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+
+    const int kbx  = k / QI3_K;
+    const int ky  = (k % QI3_K) * QR3_K;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+
+    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+        const int shift = 2 * ((ky % 32) / 8);
+        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
+
+        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+        const int vlh = (vh << 2) & 0x04040404;
+
+        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
+    }
+
+    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
+    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K,
+                    int *tile_x_sc_q4_K) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql_q4_K;
+    *x_dm = tile_x_dm_q4_K;
+    *x_sc = tile_x_sc_q4_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI4_K; // == k if QK_K == 256
+
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+
+#define VDR_Q4_K_Q8_1_MMQ  8
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
+                                u[i * QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const sycl::float2 ds8f =
+            ds8[i].convert<float, sycl::rounding_mode::automatic>();
+
+        sumf_d += ds8f.x() * (sc[i] * sumi_d);
+        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
+}
+
+
+static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
+
+    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K,
+                    int *tile_x_sc_q5_K) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql_q5_K;
+    *x_dm = tile_x_dm_q5_K;
+    *x_sc = tile_x_sc_q5_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI5_K; // == k if QK_K == 256
+
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR5_K*kqsx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
+        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
+
+        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
+    }
+
+    constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+#define VDR_Q5_K_Q8_1_MMQ  8
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
+                                sumi_d); // SIMD dot product
+        }
+
+        const sycl::float2 ds8f =
+            ds8[i].convert<float, sycl::rounding_mode::automatic>();
+
+        sumf_d += ds8f.x() * (sc[i] * sumi_d);
+        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
+}
+
+static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
+
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
+    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR6_K*kqsx;
+
+        const int ql = get_int_from_uint8(bxi->ql, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
+
+        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
+        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
+
+        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
+            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
+                                                 dpct::sub_sat());
+        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
+            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
+                                                 dpct::sub_sat());
+    }
+
+    constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
+    }
+}
+
+#define VDR_Q6_K_Q8_1_MMQ  8
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ sc, const float &d6,
+                           const float *__restrict__ d8) {
+
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
+                                    sumi_d.x()); // SIMD dot product
+            sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
+                                    sumi_d.x()); // SIMD dot product
+
+            sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
+                                    sumi_d.y()); // SIMD dot product
+            sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
+                                    sumi_d.y()); // SIMD dot product
+        }
+
+        sumf_d += d8[i0 / 4] *
+                  (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y());
+    }
+
+    return d6 * sumf_d;
+}
+
+static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
+
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
+    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
+    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+}
+
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
+          int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
+          vec_dot_q_mul_mat_sycl_t vec_dot>
+/*
+DPCT1110:8: The total declared local variable size in device function mul_mat_q
+exceeds 128 bytes and may cause high register pressure. Consult with your
+hardware vendor to find the total register size available and adjust the code,
+or use smaller sub-group size to avoid high register pressure.
+*/
+static __dpct_inline__ void
+mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
+          float *__restrict__ dst, const int ncols_x, const int nrows_x,
+          const int ncols_y, const int nrows_y, const int nrows_dst,
+          int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
+          int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
+          sycl::half2 *tile_y_ds) {
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
+                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
+                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
+                   blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr; ++ir) {
+            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = dpct::min(
+                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
+                    ncols_y - 1); // to prevent out-of-bounds memory accesses
+
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
+                                    kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(
+                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids =
+                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
+                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
+                    mmq_x;
+                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
+                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const sycl::half2 *dsi_src =
+                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
+                       ir * (WARP_SIZE / QI8_1) + kby]
+                         .ds;
+                sycl::half2 *dsi_dst =
+                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src)[0];
+                }
+            }
+
+            /*
+            DPCT1118:9: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+#pragma unroll
+                for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
+                            item_ct1.get_local_id(1) + j, k);
+                    }
+                }
+            }
+
+            /*
+            DPCT1118:10: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
+
+        if (col_dst >= ncols_dst) {
+            return;
+        }
+
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+        }
+    }
+}
+
+#define  MMQ_X_Q4_0_RDNA2  64
+#define  MMQ_Y_Q4_0_RDNA2  128
+#define NWARPS_Q4_0_RDNA2  8
+#define  MMQ_X_Q4_0_RDNA1  64
+#define  MMQ_Y_Q4_0_RDNA1  64
+#define NWARPS_Q4_0_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q4_0_AMPERE 4
+#define  MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
+#define  MMQ_X_Q4_0_AMPERE 64
+#define  MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#endif
+#define  MMQ_X_Q4_0_PASCAL 64
+#define  MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static void
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q4_0, tile_x_d_q4_0);
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
+              load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ,
+              vec_dot_q4_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q4_1_RDNA2  64
+#define  MMQ_Y_Q4_1_RDNA2  128
+#define NWARPS_Q4_1_RDNA2  8
+#define  MMQ_X_Q4_1_RDNA1  64
+#define  MMQ_Y_Q4_1_RDNA1  64
+#define NWARPS_Q4_1_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q4_1_AMPERE 4
+#define  MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
+#define  MMQ_X_Q4_1_AMPERE 64
+#define  MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#endif
+#define  MMQ_X_Q4_1_PASCAL 64
+#define  MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static void
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,
+    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q4_1, tile_x_dm_q4_1);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
+              load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ,
+              vec_dot_q4_1_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q5_0_RDNA2  64
+#define  MMQ_Y_Q5_0_RDNA2  128
+#define NWARPS_Q5_0_RDNA2  8
+#define  MMQ_X_Q5_0_RDNA1  64
+#define  MMQ_Y_Q5_0_RDNA1  64
+#define NWARPS_Q5_0_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q5_0_AMPERE 4
+#define  MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
+#define  MMQ_X_Q5_0_AMPERE 128
+#define  MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#endif
+#define  MMQ_X_Q5_0_PASCAL 64
+#define  MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static void
+    mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_0, tile_x_d_q5_0);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
+              load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ,
+              vec_dot_q5_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q5_1_RDNA2  64
+#define  MMQ_Y_Q5_1_RDNA2  128
+#define NWARPS_Q5_1_RDNA2  8
+#define  MMQ_X_Q5_1_RDNA1  64
+#define  MMQ_Y_Q5_1_RDNA1  64
+#define NWARPS_Q5_1_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q5_1_AMPERE 4
+#define  MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
+#define  MMQ_X_Q5_1_AMPERE 128
+#define  MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#endif
+#define  MMQ_X_Q5_1_PASCAL 64
+#define  MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static void
+mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,
+    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_1, tile_x_dm_q5_1);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
+              load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ,
+              vec_dot_q5_1_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q8_0_RDNA2  64
+#define  MMQ_Y_Q8_0_RDNA2  128
+#define NWARPS_Q8_0_RDNA2  8
+#define  MMQ_X_Q8_0_RDNA1  64
+#define  MMQ_Y_Q8_0_RDNA1  64
+#define NWARPS_Q8_0_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q8_0_AMPERE 4
+#define  MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
+#define  MMQ_X_Q8_0_AMPERE 128
+#define  MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#endif
+#define  MMQ_X_Q8_0_PASCAL 64
+#define  MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static void
+    mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q8_0, tile_x_d_q8_0);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
+              load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ,
+              vec_dot_q8_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q2_K_RDNA2  64
+#define  MMQ_Y_Q2_K_RDNA2  128
+#define NWARPS_Q2_K_RDNA2  8
+#define  MMQ_X_Q2_K_RDNA1  128
+#define  MMQ_Y_Q2_K_RDNA1  32
+#define NWARPS_Q2_K_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q2_K_AMPERE 4
+#define  MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
+#define  MMQ_X_Q2_K_AMPERE 64
+#define  MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#endif
+#define  MMQ_X_Q2_K_PASCAL 64
+#define  MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static void
+mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,
+    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ,
+              vec_dot_q2_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q3_K_RDNA2  128
+#define  MMQ_Y_Q3_K_RDNA2  64
+#define NWARPS_Q3_K_RDNA2  8
+#define  MMQ_X_Q3_K_RDNA1  32
+#define  MMQ_Y_Q3_K_RDNA1  128
+#define NWARPS_Q3_K_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q3_K_AMPERE 4
+#define  MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
+#define  MMQ_X_Q3_K_AMPERE 128
+#define  MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#endif
+#define  MMQ_X_Q3_K_PASCAL 64
+#define  MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static void
+mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,
+    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K,
+                               tile_x_sc_q3_K);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ,
+              vec_dot_q3_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q4_K_RDNA2  64
+#define  MMQ_Y_Q4_K_RDNA2  128
+#define NWARPS_Q4_K_RDNA2  8
+#define  MMQ_X_Q4_K_RDNA1  32
+#define  MMQ_Y_Q4_K_RDNA1  64
+#define NWARPS_Q4_K_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q4_K_AMPERE 4
+#define  MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
+#define  MMQ_X_Q4_K_AMPERE 64
+#define  MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#endif
+#define  MMQ_X_Q4_K_PASCAL 64
+#define  MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static void
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,
+    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ,
+              vec_dot_q4_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q5_K_RDNA2  64
+#define  MMQ_Y_Q5_K_RDNA2  128
+#define NWARPS_Q5_K_RDNA2  8
+#define  MMQ_X_Q5_K_RDNA1  32
+#define  MMQ_Y_Q5_K_RDNA1  64
+#define NWARPS_Q5_K_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q5_K_AMPERE 4
+#define  MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
+#define  MMQ_X_Q5_K_AMPERE 64
+#define  MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#endif
+#define  MMQ_X_Q5_K_PASCAL 64
+#define  MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static void
+mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,
+    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ,
+              vec_dot_q5_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+#define  MMQ_X_Q6_K_RDNA2  64
+#define  MMQ_Y_Q6_K_RDNA2  128
+#define NWARPS_Q6_K_RDNA2  8
+#define  MMQ_X_Q6_K_RDNA1  32
+#define  MMQ_Y_Q6_K_RDNA1  64
+#define NWARPS_Q6_K_RDNA1  8
+#if defined(SYCL_USE_XMX)
+#define  MMQ_X_Q6_K_AMPERE 4
+#define  MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
+#define  MMQ_X_Q6_K_AMPERE 64
+#define  MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#endif
+#define  MMQ_X_Q6_K_PASCAL 64
+#define  MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static void
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    // int   * tile_x_ql = nullptr;
+    // sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    // int   * tile_x_sc = nullptr;
+
+//sycl_todo: change according to hardware
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql, tile_x_dm, tile_x_sc);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ,
+              vec_dot_q6_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+}
+
+static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_qs_q4_0_acc_ct1),
+                            get_pointer(tile_x_d_q4_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_qs_q4_0_acc_ct1),
+                            get_pointer(tile_x_d_q4_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q4_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q4_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_qs_q4_1_acc_ct1),
+                            get_pointer(tile_x_dm_q4_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_qs_q4_1_acc_ct1),
+                            get_pointer(tile_x_dm_q4_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q5_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q5_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q5_0_acc_ct1),
+                            get_pointer(tile_x_d_q5_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q5_0_acc_ct1),
+                            get_pointer(tile_x_d_q5_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q5_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q5_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q5_1_acc_ct1),
+                            get_pointer(tile_x_dm_q5_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q5_1_acc_ct1),
+                            get_pointer(tile_x_dm_q5_1_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q8_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q8_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q8_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_qs_q8_0_acc_ct1),
+                            get_pointer(tile_x_d_q8_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q8_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_qs_q8_0_acc_ct1),
+                            get_pointer(tile_x_d_q8_0_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q2_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q2_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q2_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q2_K_acc_ct1),
+                            get_pointer(tile_x_dm_q2_K_acc_ct1),
+                            get_pointer(tile_x_sc_q2_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q2_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q2_K_acc_ct1),
+                            get_pointer(tile_x_dm_q2_K_acc_ct1),
+                            get_pointer(tile_x_sc_q2_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+#if QK_K == 256
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q3_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q3_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q3_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q3_K_acc_ct1),
+                            get_pointer(tile_x_dm_q3_K_acc_ct1),
+                            get_pointer(tile_x_qh_q3_K_acc_ct1),
+                            get_pointer(tile_x_sc_q3_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q3_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q3_K_acc_ct1),
+                            get_pointer(tile_x_dm_q3_K_acc_ct1),
+                            get_pointer(tile_x_qh_q3_K_acc_ct1),
+                            get_pointer(tile_x_sc_q3_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+#endif
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q4_K_acc_ct1),
+                            get_pointer(tile_x_dm_q4_K_acc_ct1),
+                            get_pointer(tile_x_sc_q4_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q4_K_acc_ct1),
+                            get_pointer(tile_x_dm_q4_K_acc_ct1),
+                            get_pointer(tile_x_sc_q4_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q5_K_acc_ct1),
+                            get_pointer(tile_x_dm_q5_K_acc_ct1),
+                            get_pointer(tile_x_sc_q5_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_q5_K_acc_ct1),
+                            get_pointer(tile_x_dm_q5_K_acc_ct1),
+                            get_pointer(tile_x_sc_q5_K_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const int compute_capability = ggml_sycl_info().devices[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= VER_GEN13) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= VER_GEN12) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= VER_GEN9) {
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= VER_4VEC) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q6_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_acc_ct1),
+                            get_pointer(tile_x_dm_acc_ct1),
+                            get_pointer(tile_x_sc_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q6_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            get_pointer(tile_x_ql_acc_ct1),
+                            get_pointer(tile_x_dm_acc_ct1),
+                            get_pointer(tile_x_sc_acc_ct1),
+                            get_pointer(tile_y_qs_acc_ct1),
+                            get_pointer(tile_y_ds_acc_ct1));
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_sycl_op_mul_mat_q(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int device_id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(device_id = get_current_device_id()));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = device_id == ctx.device ? ne0 : row_diff;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
diff --git a/ggml/src/ggml-sycl/mmq.hpp b/ggml/src/ggml-sycl/mmq.hpp
new file mode 100644
index 000000000..3f5297aaa
--- /dev/null
+++ b/ggml/src/ggml-sycl/mmq.hpp
@@ -0,0 +1,33 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_MMQ_HPP
+#define GGML_SYCL_MMQ_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_mul_mat_q(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor* src0,
+    const ggml_tensor* src1,
+    ggml_tensor* dst,
+    const char* src0_dd_i,
+    const float* src1_ddf_i,
+    const char* src1_ddq_i,
+    float* dst_dd_i,
+    const int64_t row_low,
+    const int64_t row_high,
+    const int64_t src1_ncols,
+    const int64_t src1_padded_row_size,
+    const dpct::queue_ptr& stream);
+
+#endif // GGML_SYCL_MMQ_HPP
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
new file mode 100644
index 000000000..221f65c21
--- /dev/null
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -0,0 +1,1015 @@
+#include "mmvq.hpp"
+#include "vecdotq.hpp"
+#include <cassert>
+
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
+                                       const void *__restrict__ vy,
+                                       float *__restrict__ dst, const int ncols,
+                                       const int nrows,
+                                       const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
+                                      const void *__restrict__ vy,
+                                      float *__restrict__ dst, const int ncols,
+                                      const int nrows,
+                                      const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
+                                     const void *__restrict__ vy,
+                                     float *__restrict__ dst, const int ncols,
+                                     const int nrows,
+                                     const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
+                                       const void *__restrict__ vy,
+                                       float *__restrict__ dst, const int ncols,
+                                       const int nrows,
+                                       const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
+                                     const void *__restrict__ vy,
+                                     float *__restrict__ dst, const int ncols,
+                                     const int nrows,
+                                     const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
+                                     const void *__restrict__ vy,
+                                     float *__restrict__ dst, const int ncols,
+                                     const int nrows,
+                                     const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
+                                     const void *__restrict__ vy,
+                                     float *__restrict__ dst, const int ncols,
+                                     const int nrows,
+                                     const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
+                                      const void *__restrict__ vy,
+                                      float *__restrict__ dst, const int ncols,
+                                      const int nrows,
+                                      const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
+                                      const void *__restrict__ vy,
+                                      float *__restrict__ dst, const int ncols,
+                                      const int nrows,
+                                      const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    assert(blocks_per_warp>0);
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
+                                      VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_1 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
+                                      VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
+                                      VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_1 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
+                                      VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK8_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
+                                      VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
+                                      VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
+                                      VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
+                                      VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
+                                      VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
+                                      VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+
+static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_NL == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    {
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                        mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
+        });
+    }
+}
+
+void ggml_sycl_op_mul_mat_vec_q(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_col_size,
+    const dpct::queue_ptr &stream) {
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    SYCL_CHECK(
+        CHECK_TRY_ERROR(id = get_current_device_id()));
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the kernel writes into
+
+    for (int i = 0; i < src1_ncols; i++)
+    {
+        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
+        const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
+        float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
+        switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ1_M:
+            mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+        }
+    }
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+    GGML_UNUSED(ctx);
+}
diff --git a/ggml/src/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp
new file mode 100644
index 000000000..049b43d45
--- /dev/null
+++ b/ggml/src/ggml-sycl/mmvq.hpp
@@ -0,0 +1,27 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_MMVQ_HPP
+#define GGML_SYCL_MMVQ_HPP
+
+#include "common.hpp"
+
+
+void ggml_sycl_op_mul_mat_vec_q(
+    ggml_backend_sycl_context & ctx,
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+
+#endif // GGML_SYCL_MMVQ_HPP
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
new file mode 100644
index 000000000..9cf2be155
--- /dev/null
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -0,0 +1,378 @@
+#include "norm.hpp"
+
+static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
+    const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+        item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row * ncols + col];
+        mean_var.x() += xi;
+        mean_var.y() += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        /*
+        DPCT1118:0: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        mean_var = 0.f;
+        size_t nreduce = nwarps / WARP_SIZE;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            mean_var += s_sum[lane_id + i * WARP_SIZE];
+        }
+        mean_var = warp_reduce_sum(mean_var, item_ct1);
+    }
+
+    const float mean = mean_var.x() / ncols;
+    const float var = mean_var.y() / ncols - mean * mean;
+    const float inv_std = sycl::rsqrt(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
+    }
+}
+
+static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
+    const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+    int start = item_ct1.get_group(2) * group_size;
+    int end = start + group_size;
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    start += item_ct1.get_local_id(2);
+    size_t nreduce = nwarps / WARP_SIZE;
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:1: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = 0.f;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:2: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = 0.f;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float variance = tmp / group_size;
+    float scale = sycl::rsqrt(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
+    const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+        item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row * ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:3: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        size_t nreduce = nwarps / WARP_SIZE;
+        tmp = 0.f;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = sycl::rsqrt(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row * ncols + col] = scale * x[row * ncols + col];
+    }
+}
+
+static void norm_f32_sycl(const float* x, float* dst, const int ncols,
+    const int nrows, const float eps,
+    queue_ptr stream, int device) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    norm_f32(x, dst, ncols, eps, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+            });
+    }
+    else {
+        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(work_group_size / WARP_SIZE), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    norm_f32(x, dst, ncols, eps, item_ct1,
+                        get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
+    }
+}
+
+static void group_norm_f32_sycl(const float* x, float* dst,
+    const int num_groups, const float eps, const int group_size,
+    const int ne_elements, queue_ptr stream, int device) {
+    if (group_size < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            const float eps_ct4 = eps;
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    group_norm_f32(
+                        x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+            });
+    }
+    else {
+        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    group_norm_f32(x, dst, group_size, ne_elements,
+                        eps_ct4, item_ct1,
+                        get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
+    }
+}
+
+static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
+    const int nrows, const float eps,
+    queue_ptr stream, int device) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+            });
+    }
+    else {
+        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+        assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                cgh);
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                        get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
+    }
+}
+
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst, const float* src0_dd,
+    const float* src1_dd, float* dst_dd,
+    const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+    GGML_UNUSED(ctx);
+}
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp
new file mode 100644
index 000000000..a9ad9156f
--- /dev/null
+++ b/ggml/src/ggml-sycl/norm.hpp
@@ -0,0 +1,35 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_NORM_HPP
+#define GGML_SYCL_NORM_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst, const float* src0_dd,
+    const float* src1_dd, float* dst_dd,
+    const queue_ptr& main_stream);
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream);
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream);
+
+#endif // GGML_SYCL_NORM_HPP
diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp
new file mode 100644
index 000000000..8e8347ff4
--- /dev/null
+++ b/ggml/src/ggml-sycl/outprod.cpp
@@ -0,0 +1,56 @@
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include "outprod.hpp"
+
+
+void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // Get SYCL queue
+    dpct::queue_ptr stream = ctx.stream();
+
+    // Dimension checks
+    GGML_ASSERT(ne01 == ne11);  // Inner dimensions must match
+    GGML_ASSERT(ne0 == ne00);   // Output rows match src0 rows
+    GGML_ASSERT(ne1 == ne10);   // Output cols match src1 cols
+
+    // Get data pointers
+    const float* src0_d = (const float*)src0->data;
+    const float* src1_d = (const float*)src1->data;
+    float* dst_d = (float*)dst->data;
+
+    // GEMM parameters
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+
+    // Handle transposition of src1
+    const bool src1_T = ggml_is_transposed(src1);
+    const oneapi::mkl::transpose src1_op =
+        src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
+    const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
+
+    try {
+        // Perform matrix multiplication using oneMKL GEMM
+#ifdef GGML_SYCL_NVIDIA
+        oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream },
+                                              oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d,
+                                              ne00, src1_d, ldb, beta, dst_d, ne0);
+#else
+        oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha,
+                                              src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
+#endif
+    }
+    catch (sycl::exception const& exc) {
+        std::cerr << exc.what() << std::endl;
+        GGML_ASSERT(false);
+    }
+}
diff --git a/ggml/src/ggml-sycl/outprod.hpp b/ggml/src/ggml-sycl/outprod.hpp
new file mode 100644
index 000000000..f50413d3f
--- /dev/null
+++ b/ggml/src/ggml-sycl/outprod.hpp
@@ -0,0 +1,10 @@
+#ifndef GGML_SYCL_OUTPROD_HPP
+#define GGML_SYCL_OUTPROD_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
+
+
+#endif // GGML_SYCL_OUTPROD_HPP
+
diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp
new file mode 100644
index 000000000..af1890727
--- /dev/null
+++ b/ggml/src/ggml-sycl/presets.hpp
@@ -0,0 +1,74 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_PRESETS_HPP
+#define GGML_SYCL_PRESETS_HPP
+
+#define GGML_SYCL_MAX_STREAMS       8
+#define GGML_SYCL_MAX_BUFFERS       256
+
+#define WARP_SIZE GGML_SYCL_WARP_SIZE
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#define SYCL_GELU_BLOCK_SIZE 256
+#define SYCL_SILU_BLOCK_SIZE 256
+#define SYCL_TANH_BLOCK_SIZE 256
+#define SYCL_RELU_BLOCK_SIZE 256
+#define SYCL_HARDSIGMOID_BLOCK_SIZE 256
+#define SYCL_HARDSWISH_BLOCK_SIZE 256
+#define SYCL_EXP_BLOCK_SIZE 256
+#define SYCL_NEG_BLOCK_SIZE 256
+#define SYCL_SIGMOID_BLOCK_SIZE 256
+#define SYCL_SQRT_BLOCK_SIZE 256
+#define SYCL_SIN_BLOCK_SIZE 256
+#define SYCL_SQR_BLOCK_SIZE 256
+#define SYCL_CPY_BLOCK_SIZE 32
+#define SYCL_SCALE_BLOCK_SIZE 256
+#define SYCL_CLAMP_BLOCK_SIZE 256
+#define SYCL_ROPE_BLOCK_SIZE 256
+#define SYCL_ALIBI_BLOCK_SIZE 32
+#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
+#define SYCL_QUANTIZE_BLOCK_SIZE 256
+#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
+#define SYCL_GET_ROWS_BLOCK_SIZE 256
+#define SYCL_UPSCALE_BLOCK_SIZE 256
+#define SYCL_CONCAT_BLOCK_SIZE 256
+#define SYCL_PAD_BLOCK_SIZE 256
+#define SYCL_ACC_BLOCK_SIZE 256
+#define SYCL_IM2COL_BLOCK_SIZE 256
+#define SYCL_POOL2D_BLOCK_SIZE 256
+#define SYCL_ARGMAX_BLOCK_SIZE 256
+#define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+#define SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_SYCL_DMMV_X
+#define GGML_SYCL_DMMV_X 32
+#endif
+#ifndef GGML_SYCL_MMV_Y
+#define GGML_SYCL_MMV_Y 1
+#endif
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+#ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE
+#define GGML_SYCL_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define QK_WARP_SIZE 32
+#endif // GGML_SYCL_PRESETS_HPP
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
new file mode 100644
index 000000000..1244b231a
--- /dev/null
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -0,0 +1,276 @@
+#include "rope.hpp"
+
+struct rope_corr_dims {
+    float v[2];
+};
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
+    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
+    }
+    *cos_theta = sycl::cos(theta) * mscale;
+    *sin_theta = sycl::sin(theta) * mscale;
+}
+
+template<typename T, bool has_ff>
+static void rope_norm(
+    const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
+    const sycl::nd_item<3> &item_ct1) {
+    const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i = row*ne0 + i0;
+    const int i2 = row/p_delta_rows;
+
+    const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + 1];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff>
+static void rope_neox(
+    const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
+    const sycl::nd_item<3> &item_ct1) {
+    const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows;
+
+    const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template <typename T>
+static void rope_norm_sycl(
+    const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nr);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+    if (freq_factors == nullptr) {
+        /*
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
+                               ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
+                               item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
+                              ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
+                              item_ct1);
+            });
+    }
+}
+
+template <typename T>
+static void rope_neox_sycl(
+    const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nr);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                    {sycl::aspect::fp16});
+
+    if (freq_factors == nullptr) {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
+                                    p_delta_rows, ext_factor, attn_factor,
+                                    corr_dims, theta_scale, freq_factors,
+                                    item_ct1);
+            });
+    } else {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
+                                    p_delta_rows, ext_factor, attn_factor,
+                                    corr_dims, theta_scale, freq_factors,
+                                    item_ct1);
+            });
+    }
+}
+
+void ggml_sycl_op_rope(
+    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) {
+    const ggml_tensor * src2 = dst->src[2];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t nr = ggml_nrows(src0);
+
+    //const int n_past      = ((int32_t *) dst->op_params)[0];
+    const int n_dims      = ((int32_t *) dst->op_params)[1];
+    const int mode        = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx       = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig  = ((int32_t *) dst->op_params)[4];
+
+    // RoPE alteration for extended context
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+    const int32_t * pos = (const int32_t *) src1_dd;
+
+    const float * freq_factors = nullptr;
+    if (src2 != nullptr) {
+        freq_factors = (const float *) src2->data;
+    }
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_sycl(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_sycl(
+                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_norm_sycl(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_norm_sycl(
+                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
+}
diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp
new file mode 100644
index 000000000..00354c313
--- /dev/null
+++ b/ggml/src/ggml-sycl/rope.hpp
@@ -0,0 +1,22 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_ROPE_HPP
+#define GGML_SYCL_ROPE_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_rope(
+    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream);
+
+#endif // GGML_SYCL_ROPE_HPP
diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp
new file mode 100644
index 000000000..563e0655f
--- /dev/null
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -0,0 +1,261 @@
+#include "softmax.hpp"
+
+template <bool vals_smem, int ncols_template, int block_size_template, typename T>
+static void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par,
+                         const int nrows_y, const float scale, const float max_bias, const float m0,
+                         const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
+    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
+
+    const int tid = item_ct1.get_local_id(2);
+    const int rowx = item_ct1.get_group(2);
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
+
+    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+    const int nthreads = block_size;
+    const int nwarps = nthreads / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
+    float slope = 1.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const uint32_t h = rowx/nrows_y; // head index
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = sycl::pow(base, float(exp));
+    }
+
+    float *vals = vals_smem ? buf + sycl::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
+    float max_val = -INFINITY;
+
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+
+        const float val = x[ix]*scale + (mask ? slope*static_cast<float>(mask[iy]) : 0.0f);
+
+        vals[col] = val;
+        max_val = sycl::max(max_val, val);
+    }
+
+    // find the max value in the block
+    max_val = warp_reduce_max(max_val, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+            for (size_t i = 1; i < nreduce; i += 1) {
+                buf[lane_id + i * WARP_SIZE] = -INFINITY;
+            }
+        }
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        max_val = buf[lane_id];
+        for (size_t i = 1; i < nreduce; i += 1) {
+            max_val = sycl::max(max_val, buf[lane_id + i * WARP_SIZE]);
+        }
+        max_val = warp_reduce_max(max_val, item_ct1);
+    }
+
+    float tmp = 0.f;
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+                if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const float val = sycl::native::exp(vals[col] - max_val);
+        tmp += val;
+        vals[col] = val;
+    }
+
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+            for (size_t i = 1; i < nreduce; i += 1) {
+                buf[lane_id + i * WARP_SIZE] = 0.f;
+            }
+        }
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        tmp = buf[lane_id];
+        for (size_t i = 1; i < nreduce; i += 1) {
+            tmp += buf[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float inv_sum = 1.f / tmp;
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            return;
+        }
+
+        const int idst = rowx*ncols + col;
+        dst[idst] = vals[col] * inv_sum;
+    }
+}
+
+template <bool vals_smem, int ncols_template, int block_size_template, typename T>
+static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int ncols_par,
+                                   const int nrows_y, const float scale, const float max_bias, const float m0,
+                                   const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
+                                   const size_t n_local_scratch, queue_ptr stream) {
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
+                                                                             nrows_y, scale, max_bias, m0,
+                                                                             m1, n_head_log2, item_ct1,
+                                                                             get_pointer(local_buf_acc));
+            });
+    });
+}
+
+template<typename T>
+static void soft_max_f32_sycl(const float * x, const T * mask,
+                              float * dst, const int ncols_x, const int nrows_x,
+                              const int nrows_y, const float scale, const float max_bias,
+                              queue_ptr stream, int device) {
+    int nth = WARP_SIZE;
+    int max_block_size = ggml_sycl_info().max_work_group_sizes[device];
+    while (nth < ncols_x && nth < max_block_size) nth *= 2;
+    if (nth>max_block_size) nth = max_block_size;
+
+    const sycl::range<3> block_dims(1, 1, nth);
+    const sycl::range<3> block_nums(1, 1, nrows_x);
+    const size_t n_val_tmp = nth / WARP_SIZE;
+    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp);
+
+    const uint32_t n_head_kv   = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
+    if (n_local_scratch*sizeof(float) < local_mem_size) {
+        if (ncols_x > max_block_size) {
+            soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
+                                               max_bias, m0, m1, n_head_log2, block_nums,
+                                               block_dims, n_local_scratch, stream);
+            return;
+        }
+        switch (ncols_x) {
+            case 32:
+                soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                     max_bias, m0, m1, n_head_log2, block_nums,
+                                                     block_dims, n_local_scratch, stream);
+                break;
+            case 64:
+                soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                     max_bias, m0, m1, n_head_log2, block_nums,
+                                                     block_dims, n_local_scratch, stream);
+                break;
+            case 128:
+                soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                       max_bias, m0, m1, n_head_log2, block_nums,
+                                                       block_dims, n_local_scratch, stream);
+                break;
+            case 256:
+                soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                       max_bias, m0, m1, n_head_log2, block_nums,
+                                                       block_dims, n_local_scratch, stream);
+                break;
+            case 512:
+                soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                       max_bias, m0, m1, n_head_log2, block_nums,
+                                                       block_dims, n_local_scratch, stream);
+                break;
+            case 1024:
+                soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                         max_bias, m0, m1, n_head_log2, block_nums,
+                                                         block_dims, n_local_scratch, stream);
+                break;
+            case 2048:
+                soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                         max_bias, m0, m1, n_head_log2, block_nums,
+                                                         block_dims, n_local_scratch, stream);
+                break;
+            case 4096:
+                soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                         max_bias, m0, m1, n_head_log2, block_nums,
+                                                         block_dims, n_local_scratch, stream);
+                break;
+            default:
+                soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
+                                                   max_bias, m0, m1, n_head_log2, block_nums,
+                                                   block_dims, n_local_scratch, stream);
+                break;
+        }
+    } else {
+        soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
+                                            max_bias, m0, m1, n_head_log2, block_nums,
+                                            block_dims, WARP_SIZE, stream);
+    }
+}
+
+void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t nrows_x = ggml_nrows(dst->src[0]);
+    const int64_t nrows_y = dst->src[0]->ne[1];
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, dst->op_params + 1, sizeof(float));
+
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+
+    ggml_sycl_set_device(ctx.device);
+    dpct::queue_ptr main_stream = ctx.stream();
+
+    if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
+        const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
+        soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
+                          main_stream, ctx.device);
+    } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
+        const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
+        soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    } else {
+        /* mask unavailable */
+        soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    }
+}
diff --git a/ggml/src/ggml-sycl/softmax.hpp b/ggml/src/ggml-sycl/softmax.hpp
new file mode 100644
index 000000000..2cf8582ec
--- /dev/null
+++ b/ggml/src/ggml-sycl/softmax.hpp
@@ -0,0 +1,20 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_SOFTMAX_HPP
+#define GGML_SYCL_SOFTMAX_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst);
+
+#endif // GGML_SYCL_SOFTMAX_HPP
diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp
new file mode 100644
index 000000000..b877d18c1
--- /dev/null
+++ b/ggml/src/ggml-sycl/tsembd.cpp
@@ -0,0 +1,73 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#include "tsembd.hpp"
+
+static void timestep_embedding_f32(
+        const float * timesteps, float * dst, const int nb1,
+        const int dim, const int max_period, const sycl::nd_item<3> &item_ct1) {
+    // item_ct1.get_group(1)(blockIDx.y): idx of timesteps->ne[0]
+    // item_ct1.get_group(2) (blockIDx.x): idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = item_ct1.get_group(1);
+    int j = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    float * embed_data = (float *)((char *)dst +  i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)sycl::native::exp(-(sycl::log((float)max_period)) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = sycl::cos(arg);
+    embed_data[j + half] = sycl::sin(arg);
+}
+
+static void timestep_embedding_f32_sycl(
+        const float * x, float * dst, const int ne00, const int nb1,
+        const int dim, const int max_period, const queue_ptr& stream) {
+    // As the kernel returns when thread.idx is larger than dim/2, the half_ceil does not need to pad
+    int half_ceil = dim / 2;
+    int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
+    sycl::range<3> gridDim(1, ne00, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(
+            gridDim * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            timestep_embedding_f32(
+                x, dst, nb1, dim, max_period, item_ct1
+            );
+        });
+}
+
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+    GGML_UNUSED(src1);
+}
diff --git a/ggml/src/ggml-sycl/tsembd.hpp b/ggml/src/ggml-sycl/tsembd.hpp
new file mode 100644
index 000000000..4c18748bb
--- /dev/null
+++ b/ggml/src/ggml-sycl/tsembd.hpp
@@ -0,0 +1,20 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_TSEMBD_HPP
+#define GGML_SYCL_TSEMBD_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_TSEMBD_HPP
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
new file mode 100644
index 000000000..c5942008a
--- /dev/null
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -0,0 +1,1140 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_VECDOTQ_HPP
+#define GGML_SYCL_VECDOTQ_HPP
+
+#include "dpct/helper.hpp"
+
+typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+
+static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
+  const uint16_t* x16 =
+      (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte
+                                                 // alignment
+
+  int x32 = 0;
+  x32 |= x16[0] << 0;
+  x32 |= x16[1] << 16;
+
+  return x32;
+}
+
+static __dpct_inline__ int get_int_from_uint8(
+    const uint8_t* x8,
+    const int& i32) {
+  const uint16_t* x16 =
+      (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte
+                                                 // alignment
+
+  int x32 = 0;
+  x32 |= x16[0] << 0;
+  x32 |= x16[1] << 16;
+
+  return x32;
+}
+
+static __dpct_inline__ int get_int_from_int8_aligned(
+    const int8_t* x8,
+    const int& i32) {
+  return *(
+      (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __dpct_inline__ int get_int_from_uint8_aligned(
+    const uint8_t* x8,
+    const int& i32) {
+  return *(
+      (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
+                                                  const uint8_t *values,
+                                                  int &val1, int &val2) {
+
+    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+    aux32 = q4 & 0x0f0f0f0f;
+    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val1 = v1 | (v2 << 16);
+    aux32 = (q4 >> 4) & 0x0f0f0f0f;
+    v1 = values[q8[0]] | (values[q8[1]] << 8);
+    v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val2 = v1 | (v2 << 16);
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
+    const sycl::half2 &dm2, const float *__restrict__ d8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d +=
+            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+        sumf_m += d8[i] *
+                  dpct::dp4a(
+                      m, u[i],
+                      0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const sycl::float2 dm2f =
+        dm2.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
+}
+
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int &vl, const int &vh, const int *__restrict__ u,
+    const uint8_t *__restrict__ scales, const int &scale_offset,
+    const float &d3, const float *__restrict__ d8) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi =
+            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
+
+        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const float *__restrict__ d8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 =
+            dpct::dp4a(v1i, u[2 * i + 1],
+                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
+        const int dot2 =
+            dpct::dp4a(0x01010101, u[2 * i + 1],
+                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
+}
+
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int *__restrict__ vl, const int *__restrict__ vh,
+    const int *__restrict__ u, const uint8_t *__restrict__ sc,
+    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
+    const float *__restrict__ d8) {
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 =
+            dpct::dp4a(v0i, u[2 * i + 0],
+                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
+        const int dot2 =
+            dpct::dp4a(0x01010101, u[2 * i + 0],
+                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const sycl::float2 dm5f =
+        dm5.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
+}
+
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+
+// contiguous v/x values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
+                            const int *__restrict__ u,
+                            const int8_t *__restrict__ scales, const float &d,
+                            const float *__restrict__ d8) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = dpct::vectorized_binary<sycl::char4>(
+            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+}
+
+// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
+
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d4,
+                                                    const sycl::half2 &ds8) {
+    int sumi = 0;
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+    }
+
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm4,
+                                                    const sycl::half2 &ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+    }
+
+#ifdef GGML_SYCL_F16
+    const sycl::float2 tmp =
+        (dm4 * ds8).convert<float, sycl::rounding_mode::automatic>();
+    const float d4d8 = tmp.x();
+    const float m4s8 = tmp.y();
+#else
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d4d8 = dm4f.x() * ds8f.x();
+    const float m4s8 = dm4f.y() * ds8f.y();
+#endif // GGML_SYCL_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const float &d5, const sycl::half2 &ds8) {
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = dpct::dp4a(vi0, u[2 * i + 0],
+                          sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = dpct::dp4a(vi1, u[2 * i + 1],
+                          sumi); // SIMD dot product of quantized values
+    }
+
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = dpct::dp4a(vi0, u[2 * i + 0],
+                          sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = dpct::dp4a(vi1, u[2 * i + 1],
+                          sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_SYCL_F16
+     const sycl::float2 tmp =
+        (dm5 * ds8).convert<float, sycl::rounding_mode::automatic>();
+    const float d5d8 = tmp.x();
+    const float m5s8 = tmp.y();
+
+
+#else
+    const sycl::float2 dm5f =
+        dm5.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d5d8 = dm5f.x() * ds8f.x();
+    const float m5s8 = dm5f.y() * ds8f.y();
+#endif // GGML_SYCL_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d8_0,
+                                                    const float &d8_1) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * sumi;
+}
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm8,
+                                                    const sycl::half2 &ds8) {
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_SYCL_F16
+    const sycl::float2 tmp =
+        (dm8 * ds8).convert<float, sycl::rounding_mode::automatic>();
+    const float d8d8 = tmp.x();
+    const float m8s8 = tmp.y();
+#else
+    const sycl::float2 dm8f =
+        dm8.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d8d8 = dm8f.x() * ds8f.x();
+    const float m8s8 = dm8f.y() * ds8f.y();
+#endif // GGML_SYCL_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
+}
+
+static __dpct_inline__ float
+vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
+                                                      bq8_1->ds[0]);
+}
+
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const uint8_t * scales = bq2_K->scales + scale_offset;
+
+    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+    int    u[QR2_K];
+    float d8[QR2_K];
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++ i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[0];
+    }
+
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
+}
+
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const float d = bq3_K->d;
+
+    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+
+    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+
+    int    u[QR3_K];
+    float d8[QR3_K];
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[0];
+    }
+
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    int    v[2];
+    int    u[2*QR4_K];
+    float d8[QR4_K];
+
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    v[0] = q4[0];
+    v[1] = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[0];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
+
+#else
+
+#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
+
+    const float d8_1 = bq8_1[0].ds[0];
+    const float d8_2 = bq8_1[1].ds[1];
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
+    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#else
+    bad_arch();
+#endif // __SYCL_ARCH__ >= VER_4VEC
+
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    int   vl[2];
+    int   vh[2];
+    int    u[2*QR5_K];
+    float d8[QR5_K];
+
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
+
+    vl[0] = ql[0];
+    vl[1] = ql[4];
+
+    vh[0] = qh[0] >> bq8_offset;
+    vh[1] = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[0];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
+
+#else
+
+#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = bq8_1[0].ds[0];
+    const float d8_2 = bq8_1[1].ds[1];
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * ql = (const int *)bq5_K->qs + (iqs/2);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * (iqs/2); // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#else
+    bad_arch();
+#endif // __SYCL_ARCH__ >= VER_4VEC
+
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
+    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+
+    const int8_t * scales = bq6_K->scales + scale_offset;
+
+    int    u[QR6_K];
+    float d8[QR6_K];
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
+    }
+
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
+}
+
+
+static __dpct_inline__ float
+vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
+                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                     const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
+                     const uint8_t *kmask_iq2xs) {
+#if QK_K == 256
+    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
+
+    const int ib32 = iqs;
+    const uint16_t * q2 = bq2->qs + 4*ib32;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    uint32_t aux32 = q2[2] | (q2[3] << 16);
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
+        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
+        for (int j = 0; j < 8; ++j) {
+            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+        }
+        q8 += 8;
+        aux32 >>= 7;
+    }
+    const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
+    return d * sumi;
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
+                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                    const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
+
+    const int ib32 = iqs;
+    const uint16_t * q2 = bq2->qs + 4*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >>  4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs[0], signs[0], std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs[1], signs[1], std::minus<>());
+        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs[0], signs[0], std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs[1], signs[1], std::minus<>());
+        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
+                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+#if QK_K == 256
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+    const int ib32 = iqs;
+    const int8_t  * q8 = bq8_1[ib32].qs;
+    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >>  4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
+            std::equal_to<>());
+        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
+            std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs1, signs1, std::minus<>());
+        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
+            std::equal_to<>());
+        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
+            std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs1, signs1, std::minus<>());
+        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    assert(false);
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
+                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                     const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t  * q3 = bq2->qs + 8*ib32;
+    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    uint32_t aux32 = gas[0] | (gas[1] << 16);
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
+        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid1[0] ^ signs[0], signs[0], std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid2[0] ^ signs[1], signs[1], std::minus<>());
+        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
+        q8 += 8;
+        aux32 >>= 7;
+    }
+    const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.5f;
+    return d * sumi;
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
+                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                   const uint32_t *iq3s_grid) {
+#if QK_K == 256
+    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t  * qs = bq2->qs + 8*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
+        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
+        uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
+            0x08040201, std::equal_to<>());
+        uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
+            0x08040201, std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid1[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid2[0] ^ signs1, signs1, std::minus<>());
+        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
+        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
+        q8 += 8;
+    }
+    const float d =
+        (float)bq2->d *
+        (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
+        bq8_1[ib32].ds[0];
+    return d * sumi;
+#else
+    assert(false);
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
+                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                   const uint32_t *iq1s_grid_gpu) {
+#if QK_K == 256
+    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
+
+    const int ib32 = iqs;
+    int sumi = 0;
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+        int grid0 = grid[0] & 0x0f0f0f0f;
+        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+        sumi = dpct::dp4a(q8[2 * l + 1], grid1,
+                          dpct::dp4a(q8[2 * l + 0], grid0, sumi));
+    }
+
+    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
+    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
+    const float d = d1q * bq8_1[ib32].ds[0];
+    const float m = d1q * bq8_1[ib32].ds[1];
+    return d * sumi + m * delta;
+#else
+    assert(false);
+#endif
+}
+
+static __dpct_inline__ float
+vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
+                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+#if QK_K == 256
+    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
+
+    const int ib32 = iqs;
+    int   sumi[2] = {0, 0};
+    float sumf[2] = {0.f, 0.f};
+
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
+        int grid0 = grid[0] & 0x0f0f0f0f;
+        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+        sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
+                                 dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
+        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
+        const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
+                                    dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
+        sumf[l/2] += delta*sumy;
+    }
+
+    iq1m_scale_t scale;
+    const uint16_t * sc = (const uint16_t *)bq1->scales;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
+    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
+#else
+    assert(false);
+#endif
+}
+
+
+static __dpct_inline__ float
+vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
+                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
+
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+        get_int_from_table_16(aux, values, v1, v2);
+        sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
+        sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
+    }
+
+    const float d = (float)bq->d * bq8_1->ds[0];
+    return d * (sumi1 + sumi2);
+}
+
+
+static __dpct_inline__ float
+vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
+                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#if QK_K == 256
+    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    // iqs is 0...7
+    const int ib32 = iqs;
+    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
+    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 4; ++j) {
+        get_int_from_table_16(q4[j], values, v1, v2);
+        sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
+        sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
+    }
+    return d * (sumi1 + sumi2);
+#else
+    assert(false);
+#endif
+}
+
+#endif // GGML_SYCL_VECDOTQ_HPP
diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp
new file mode 100644
index 000000000..b54c20964
--- /dev/null
+++ b/ggml/src/ggml-sycl/wkv6.cpp
@@ -0,0 +1,143 @@
+#include <sycl/sycl.hpp>
+#include "wkv6.hpp"
+
+constexpr int WKV_BLOCK_SIZE = 64;  // Matching CUDA_WKV_BLOCK_SIZE
+
+// Helper function for the main kernel
+static void rwkv_wkv_f32_kernel(
+    const int B, const int T, const int C, const int H,
+    const float* k, const float* v, const float* r,
+    const float* tf, const float* td, const float* s,
+    float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) {
+
+    const int tid = item_ct1.get_local_id(2);
+    const int bid = item_ct1.get_group(2);
+
+    const int head_size = WKV_BLOCK_SIZE;
+    const int batch_i = bid / H;
+    const int head_i = bid % H;
+    const int state_size = C * head_size;
+    const int n_seq_tokens = T / B;
+
+    // Set up shared memory pointers
+    float* _k = shared_mem;
+    float* _r = _k + head_size;
+    float* _tf = _r + head_size;
+    float* _td = _tf + head_size;
+
+    // Local state array
+    float state[WKV_BLOCK_SIZE];
+
+    // Load initial state
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
+    }
+
+    // Sync threads before shared memory operations
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Load time-mixing parameters
+    _tf[tid] = tf[head_i * head_size + tid];
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    // Main sequence processing loop
+    for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
+         t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid;
+         t += C) {
+
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        // Load current timestep data to shared memory
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+
+        const float _v = v[t];
+        float y = 0;
+
+        // Process in chunks of 4 for better vectorization
+        sycl::float4 k4, r4, tf4, td4, s4;
+        #pragma unroll
+        for (int j = 0; j < head_size; j += 4) {
+            // Load data in vec4 chunks
+            k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
+            r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
+            tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
+            td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
+            s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]);
+
+            // Compute key-value product
+            sycl::float4 kv4 = k4 * _v;
+
+            // Accumulate weighted sum
+            y += sycl::dot(r4, tf4 * kv4 + s4);
+
+            // Update state
+            s4 = s4 * td4 + kv4;
+
+            // Store updated state
+            state[j] = s4.x();
+            state[j+1] = s4.y();
+            state[j+2] = s4.z();
+            state[j+3] = s4.w();
+        }
+
+        dst[t] = y;
+    }
+
+    // Save final state
+    #pragma unroll
+    for (int i = 0; i < head_size; i++) {
+        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
+    }
+}
+
+void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+
+    const float* k_d = (const float*)dst->src[0]->data;
+    const float* v_d = (const float*)dst->src[1]->data;
+    const float* r_d = (const float*)dst->src[2]->data;
+    const float* tf_d = (const float*)dst->src[3]->data;
+    const float* td_d = (const float*)dst->src[4]->data;
+    const float* s_d = (const float*)dst->src[5]->data;
+    float* dst_d = (float*)dst->data;
+
+    const int64_t B = dst->src[5]->ne[1];
+    const int64_t T = dst->src[0]->ne[2];
+    const int64_t C = dst->ne[0];
+    const int64_t H = dst->src[0]->ne[1];
+
+    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
+    GGML_ASSERT(C % H == 0);
+    GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64
+
+    dpct::queue_ptr stream = ctx.stream();
+
+    // Calculate execution configuration
+    const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td
+    sycl::range<3> block_dims(1, 1, C / H);
+    sycl::range<3> grid_dims(1, 1, B * H);
+
+    // Submit kernel
+    stream->submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rwkv_wkv_f32_kernel(
+                    B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
+                    item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
+                );
+            });
+    });
+
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+}
diff --git a/ggml/src/ggml-sycl/wkv6.hpp b/ggml/src/ggml-sycl/wkv6.hpp
new file mode 100644
index 000000000..8c596a997
--- /dev/null
+++ b/ggml/src/ggml-sycl/wkv6.hpp
@@ -0,0 +1,9 @@
+#ifndef GGML_SYCL_WKV6_HPP
+#define GGML_SYCL_WKV6_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+
+#endif // GGML_SYCL_WKV6_HPP
diff --git a/ggml/src/ggml-threading.cpp b/ggml/src/ggml-threading.cpp
new file mode 100644
index 000000000..25a19eedb
--- /dev/null
+++ b/ggml/src/ggml-threading.cpp
@@ -0,0 +1,12 @@
+#include "ggml-threading.h"
+#include <mutex>
+
+std::mutex ggml_critical_section_mutex;
+
+void ggml_critical_section_start() {
+    ggml_critical_section_mutex.lock();
+}
+
+void ggml_critical_section_end(void) {
+    ggml_critical_section_mutex.unlock();
+}
diff --git a/ggml/src/ggml-threading.h b/ggml/src/ggml-threading.h
new file mode 100644
index 000000000..dec2c8840
--- /dev/null
+++ b/ggml/src/ggml-threading.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GGML_API void ggml_critical_section_start(void);
+GGML_API void ggml_critical_section_end(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt
new file mode 100644
index 000000000..d970f7e20
--- /dev/null
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -0,0 +1,162 @@
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+find_package(Vulkan COMPONENTS glslc REQUIRED)
+
+function(detect_host_compiler)
+    if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+        find_program(HOST_C_COMPILER NAMES cl gcc clang NO_CMAKE_FIND_ROOT_PATH)
+        find_program(HOST_CXX_COMPILER NAMES cl g++ clang++ NO_CMAKE_FIND_ROOT_PATH)
+    else()
+        find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH)
+        find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH)
+    endif()
+    set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE)
+    set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE)
+endfunction()
+
+if (Vulkan_FOUND)
+    message(STATUS "Vulkan found")
+
+    ggml_add_backend_library(ggml-vulkan
+                             ggml-vulkan.cpp
+                             ../../include/ggml-vulkan.h
+                            )
+
+    # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
+    # If it's not, there will be an error to stderr.
+    # If it's supported, set a define to indicate that we should compile those shaders
+    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
+                    OUTPUT_VARIABLE glslc_output
+                    ERROR_VARIABLE glslc_error)
+
+    if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
+        message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
+    else()
+        message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
+        add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    endif()
+
+    # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
+    # If it's not, there will be an error to stderr.
+    # If it's supported, set a define to indicate that we should compile those shaders
+    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
+                    OUTPUT_VARIABLE glslc_output
+                    ERROR_VARIABLE glslc_error)
+
+    if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
+        message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
+    else()
+        message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
+        add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    endif()
+
+    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+
+    # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
+    # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+    if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+    endif()
+
+    if (GGML_VULKAN_CHECK_RESULTS)
+        add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
+    endif()
+
+    if (GGML_VULKAN_DEBUG)
+        add_compile_definitions(GGML_VULKAN_DEBUG)
+    endif()
+
+    if (GGML_VULKAN_MEMORY_DEBUG)
+        add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+    endif()
+
+    if (GGML_VULKAN_SHADER_DEBUG_INFO)
+        add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
+    endif()
+
+    if (GGML_VULKAN_PERF)
+        add_compile_definitions(GGML_VULKAN_PERF)
+    endif()
+
+    if (GGML_VULKAN_VALIDATE)
+        add_compile_definitions(GGML_VULKAN_VALIDATE)
+    endif()
+
+    if (GGML_VULKAN_RUN_TESTS)
+        add_compile_definitions(GGML_VULKAN_RUN_TESTS)
+    endif()
+
+    if (NOT CMAKE_CROSSCOMPILING)
+        add_subdirectory(vulkan-shaders)
+        if (MSVC)
+            foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
+                string(TOUPPER ${CONFIG} CONFIG)
+                set_target_properties(vulkan-shaders-gen PROPERTIES
+                    RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+            endforeach()
+        endif()
+    else()
+        if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
+            set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
+        else()
+            detect_host_compiler()
+            if (NOT HOST_C_COMPILER OR NOT HOST_CXX_COMPILER)
+                message(FATAL_ERROR "Host compiler not found")
+            else()
+                message(STATUS "Host compiler: ${HOST_C_COMPILER} ${HOST_CXX_COMPILER}")
+            endif()
+            configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/host-toolchain.cmake.in ${CMAKE_BINARY_DIR}/host-toolchain.cmake @ONLY)
+            set(HOST_CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/host-toolchain.cmake)
+        endif()
+        message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
+
+        include(ExternalProject)
+        # Native build through ExternalProject_Add
+        ExternalProject_Add(
+            vulkan-shaders-gen
+            SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
+            CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE}
+                    -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
+            BUILD_COMMAND ${CMAKE_COMMAND} --build .
+            INSTALL_COMMAND ${CMAKE_COMMAND} --install .
+            INSTALL_DIR ${CMAKE_BINARY_DIR}
+        )
+        ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
+    endif()
+    set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
+    set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix})
+    set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
+    set (_ggml_vk_source     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
+    set (_ggml_vk_input_dir  ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
+    set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
+
+    file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
+    set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen)
+
+    if (CMAKE_CROSSCOMPILING)
+        set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)
+    endif()
+
+    add_custom_command(
+        OUTPUT ${_ggml_vk_header}
+                ${_ggml_vk_source}
+
+        COMMAND ${_ggml_vk_genshaders_cmd}
+            --glslc      ${Vulkan_GLSLC_EXECUTABLE}
+            --input-dir  ${_ggml_vk_input_dir}
+            --output-dir ${_ggml_vk_output_dir}
+            --target-hpp ${_ggml_vk_header}
+            --target-cpp ${_ggml_vk_source}
+            --no-clean
+
+        DEPENDS ${_ggml_vk_shader_deps}
+        COMMENT "Generate vulkan shaders"
+    )
+
+    target_sources(ggml-vulkan PRIVATE ${_ggml_vk_source} ${_ggml_vk_header})
+
+else()
+    message(WARNING "Vulkan not found")
+endif()
diff --git a/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in b/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in
new file mode 100644
index 000000000..b6af747a5
--- /dev/null
+++ b/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in
@@ -0,0 +1,15 @@
+set(CMAKE_BUILD_TYPE Release)
+set(CMAKE_C_FLAGS -O2)
+set(CMAKE_CXX_FLAGS -O2)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+set(CMAKE_C_COMPILER @HOST_C_COMPILER@)
+set(CMAKE_CXX_COMPILER @HOST_CXX_COMPILER@)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY @CMAKE_RUNTIME_OUTPUT_DIRECTORY@)
+
+if("@CMAKE_C_COMPILER_ID@" STREQUAL "MSVC")
+    foreach(CONFIG IN ITEMS DEBUG RELEASE MINSIZEREL RELWITHDEBINFO)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+    endforeach()
+endif()
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
new file mode 100644
index 000000000..bffe95086
--- /dev/null
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -0,0 +1,9064 @@
+#include "ggml-vulkan.h"
+#include <vulkan/vulkan_core.h>
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
+#include <chrono>
+#include "ggml-cpu.h"
+#endif
+
+#include <vulkan/vulkan.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <tuple>
+#include <vector>
+#include <sstream>
+#include <utility>
+#include <memory>
+#include <limits>
+#include <map>
+#include <unordered_map>
+#include <memory>
+#include <mutex>
+#include <future>
+#include <thread>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-vulkan-shaders.hpp"
+
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+
+#define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
+#define VK_VENDOR_ID_INTEL 0x8086
+#define VK_VENDOR_ID_NVIDIA 0x10de
+
+#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
+
+#define GGML_VK_MAX_NODES 8192
+
+#define MAX_VK_BUFFERS 256
+
+#define VK_CHECK(err, msg)                                          \
+    do {                                                            \
+        vk::Result err_ = (err);                                    \
+        if (err_ != vk::Result::eSuccess) {                         \
+            fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n",  \
+                #err, to_string(err_).c_str(), __FILE__, __LINE__); \
+            exit(1);                                                \
+        }                                                           \
+    } while (0)
+
+#ifdef GGML_VULKAN_DEBUG
+#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+#else
+#define VK_LOG_DEBUG(msg) ((void) 0)
+#endif // GGML_VULKAN_DEBUG
+
+struct ggml_backend_vk_context;
+
+struct vk_queue {
+    uint32_t queue_family_index;
+    vk::Queue queue;
+    vk::CommandPool pool;
+    uint32_t cmd_buffer_idx;
+    std::vector<vk::CommandBuffer> cmd_buffers;
+
+    vk::PipelineStageFlags stage_flags;
+
+    bool transfer_only;
+};
+
+struct vk_pipeline_struct {
+    std::string name;
+    vk::ShaderModule shader_module;
+    vk::DescriptorSetLayout dsl;
+    std::vector<vk::DescriptorPool> descriptor_pools;
+    std::vector<vk::DescriptorSet> descriptor_sets;
+    uint32_t descriptor_set_idx;
+    vk::PipelineLayout layout;
+    vk::Pipeline pipeline;
+    uint32_t push_constant_size;
+    uint32_t parameter_count;
+    std::array<uint32_t, 3> wg_denoms;
+    uint32_t align;
+    // set to true to request the pipeline is compiled after the dryrun
+    bool needed {};
+    // set to true when the shader has been compiled
+    bool compiled {};
+};
+
+typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
+typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;
+
+static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
+
+struct vk_matmul_pipeline_struct {
+    vk_pipeline l, m, s;
+    vk_pipeline a_l, a_m, a_s;
+};
+
+typedef std::shared_ptr<vk_matmul_pipeline_struct> vk_matmul_pipeline;
+
+struct vk_matmul_pipeline2 {
+    vk_matmul_pipeline2() {
+        f16acc = std::make_shared<vk_matmul_pipeline_struct>();
+        f32acc = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    vk_matmul_pipeline f32acc;
+    vk_matmul_pipeline f16acc;
+};
+
+struct vk_device_struct;
+typedef std::shared_ptr<vk_device_struct> vk_device;
+typedef std::weak_ptr<vk_device_struct> vk_device_ref;
+
+struct vk_buffer_struct;
+typedef std::shared_ptr<vk_buffer_struct> vk_buffer;
+typedef std::weak_ptr<vk_buffer_struct> vk_buffer_ref;
+
+struct ggml_backend_vk_buffer_type_context {
+    std::string name;
+    vk_device device;
+};
+
+static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
+static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
+static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
+static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
+static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_vk_buffer_type_name,
+    /* .alloc_buffer     = */ ggml_backend_vk_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_vk_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_vk_buffer_type_get_alloc_size,
+    /* .is_host          = */ NULL,
+};
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+class vk_memory_logger;
+#endif
+#ifdef GGML_VULKAN_PERF
+class vk_perf_logger;
+#endif
+static void ggml_vk_destroy_buffer(vk_buffer& buf);
+
+static constexpr uint32_t mul_mat_vec_max_cols = 8;
+
+struct vk_device_struct {
+    std::mutex mutex;
+
+    vk::PhysicalDevice physical_device;
+    vk::PhysicalDeviceProperties properties;
+    std::string name;
+    uint64_t max_memory_allocation_size;
+    uint64_t suballocation_block_size;
+    bool fp16;
+    bool pipeline_robustness;
+    vk::Device device;
+    uint32_t vendor_id;
+    vk_queue compute_queue;
+    vk_queue transfer_queue;
+    bool single_queue;
+    uint32_t subgroup_size;
+    uint32_t shader_core_count;
+    bool uma;
+    bool prefer_host_memory;
+    bool float_controls_rte_fp16;
+
+    bool subgroup_size_control;
+    uint32_t subgroup_min_size;
+    uint32_t subgroup_max_size;
+    bool subgroup_require_full_support;
+
+    bool coopmat_support;
+    bool coopmat_acc_f32_support;
+    bool coopmat_acc_f16_support;
+    uint32_t coopmat_m;
+    uint32_t coopmat_n;
+    uint32_t coopmat_k;
+    bool coopmat2;
+
+    size_t idx;
+
+    bool mul_mat_l[GGML_TYPE_COUNT];
+    bool mul_mat_m[GGML_TYPE_COUNT];
+    bool mul_mat_s[GGML_TYPE_COUNT];
+    bool mul_mat_id_l[GGML_TYPE_COUNT];
+    bool mul_mat_id_m[GGML_TYPE_COUNT];
+    bool mul_mat_id_s[GGML_TYPE_COUNT];
+
+    // set to true to indicate that some shaders need to be compiled after the dryrun
+    bool need_compiles {};
+
+    vk_matmul_pipeline pipeline_matmul_f32 {};
+    vk_matmul_pipeline pipeline_matmul_f32_f16 {};
+    vk_matmul_pipeline2 pipeline_matmul_f16;
+    vk_matmul_pipeline2 pipeline_matmul_f16_f32;
+    vk_pipeline pipeline_matmul_split_k_reduce;
+
+    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
+    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
+
+    vk_matmul_pipeline pipeline_matmul_id_f32 {};
+    vk_matmul_pipeline2 pipeline_matmul_id_f16;
+    vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
+
+    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
+
+    vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
+    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
+
+    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
+    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
+    vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_acc_f32;
+    vk_pipeline pipeline_add_f32, pipeline_add_f32_norepeat;
+    vk_pipeline pipeline_add_f16_f32_f16, pipeline_add_f16_f32_f16_norepeat;
+    vk_pipeline pipeline_mul_f32, pipeline_mul_f32_norepeat;
+    vk_pipeline pipeline_div_f32, pipeline_div_f32_norepeat;
+    vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
+    vk_pipeline pipeline_upscale_f32;
+    vk_pipeline pipeline_scale_f32;
+    vk_pipeline pipeline_sqr_f32;
+    vk_pipeline pipeline_sin_f32;
+    vk_pipeline pipeline_cos_f32;
+    vk_pipeline pipeline_clamp_f32;
+    vk_pipeline pipeline_pad_f32;
+    vk_pipeline pipeline_repeat_f32;
+    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
+    vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_norm_f32;
+    vk_pipeline pipeline_group_norm_f32;
+    vk_pipeline pipeline_rms_norm_f32;
+    vk_pipeline pipeline_gelu_f32;
+    vk_pipeline pipeline_gelu_quick_f32;
+    vk_pipeline pipeline_silu_f32;
+    vk_pipeline pipeline_relu_f32;
+    vk_pipeline pipeline_leaky_relu_f32;
+    vk_pipeline pipeline_tanh_f32;
+    vk_pipeline pipeline_diag_mask_inf_f32;
+    vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
+    vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
+    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
+    vk_pipeline pipeline_argsort_f32;
+    vk_pipeline pipeline_sum_rows_f32;
+    vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
+    vk_pipeline pipeline_timestep_embedding_f32;
+    vk_pipeline pipeline_pool2d_f32;
+    vk_pipeline pipeline_rwkv_wkv6_f32;
+
+    // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
+    vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
+
+    std::unordered_map<std::string, vk_pipeline_ref> pipelines;
+    std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
+
+    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
+
+    vk::Fence fence;
+    vk_buffer sync_staging;
+
+    ggml_backend_buffer_type buffer_type;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    std::unique_ptr<vk_memory_logger> memory_logger;
+#endif
+#ifdef GGML_VULKAN_PERF
+    std::unique_ptr<vk_perf_logger> perf_logger;
+#endif
+
+    ~vk_device_struct() {
+        VK_LOG_DEBUG("destroy device " << name);
+
+        device.destroyFence(fence);
+
+        ggml_vk_destroy_buffer(sync_staging);
+
+        device.destroyCommandPool(compute_queue.pool);
+        if (!single_queue) {
+            device.destroyCommandPool(transfer_queue.pool);
+        }
+
+        for (auto& pipeline : pipelines) {
+            if (pipeline.second.expired()) {
+                continue;
+            }
+
+            vk_pipeline pl = pipeline.second.lock();
+            ggml_vk_destroy_pipeline(device, pl);
+        }
+        pipelines.clear();
+
+        device.destroy();
+    }
+};
+
+struct vk_buffer_struct {
+    vk::Buffer buffer = VK_NULL_HANDLE;
+    vk::DeviceMemory device_memory = VK_NULL_HANDLE;
+    vk::MemoryPropertyFlags memory_property_flags;
+    void * ptr;
+    size_t size = 0;
+
+    vk_device device;
+
+    ~vk_buffer_struct() {
+        if (size == 0) {
+            return;
+        }
+        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");
+
+        device->device.freeMemory(device_memory);
+        device->device.destroyBuffer(buffer);
+    }
+};
+
+struct vk_subbuffer {
+    vk_buffer buffer;
+    uint64_t offset;
+    uint64_t size;
+
+    operator vk::DescriptorBufferInfo() const {
+        return { buffer->buffer, offset, size };
+    }
+};
+
+struct vk_semaphore {
+    vk::Semaphore s;
+    uint64_t value;
+};
+
+struct vk_submission {
+    vk::CommandBuffer buffer;
+    std::vector<vk_semaphore> wait_semaphores;
+    std::vector<vk_semaphore> signal_semaphores;
+};
+
+typedef std::vector<vk_submission> vk_sequence;
+
+struct vk_mat_mat_push_constants {
+    uint32_t M; uint32_t N; uint32_t K;
+    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
+    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
+    uint32_t k_split;
+    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
+};
+struct vk_mat_vec_push_constants {
+    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
+    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
+    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
+};
+
+struct vk_mat_mat_id_push_constants {
+    uint32_t M; uint32_t N; uint32_t K;
+    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
+    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
+    uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
+};
+struct vk_mat_vec_id_push_constants {
+    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
+    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
+    uint32_t nei0; uint32_t ne11;
+};
+
+struct vk_flash_attn_push_constants {
+    uint32_t N;
+    uint32_t KV;
+
+    uint32_t ne1;
+    uint32_t ne2;
+    uint32_t ne3;
+
+    uint32_t neq2;
+    uint32_t neq3;
+    uint32_t nek2;
+    uint32_t nek3;
+    uint32_t nev2;
+    uint32_t nev3;
+    uint32_t nem1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t nb31;
+
+    float scale;
+    float max_bias;
+    float logit_softcap;
+
+    uint32_t mask;
+    uint32_t n_head_log2;
+    float m0;
+    float m1;
+};
+
+struct vk_op_push_constants {
+    uint32_t KX;
+    uint32_t KY;
+    float param1;
+    float param2;
+};
+
+struct vk_op_unary_push_constants {
+    uint32_t ne;
+    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
+    uint32_t misalign_offsets;
+    float param1; float param2;
+    uint32_t ne0_012mp; uint32_t ne0_012L;
+    uint32_t ne0_01mp;  uint32_t ne0_01L;
+    uint32_t ne0_0mp;   uint32_t ne0_0L;
+    uint32_t ne1_012mp; uint32_t ne1_012L;
+    uint32_t ne1_01mp;  uint32_t ne1_01L;
+    uint32_t ne1_0mp;   uint32_t ne1_0L;
+};
+static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
+
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
+{
+    // compute L = ceil(log2(d));
+    L = 0;
+    while (L < 32 && (uint32_t{1} << L) < d) {
+        L++;
+    }
+
+    mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
+}
+
+template <typename T> void init_pushconst_fastdiv(T &p) {
+    GGML_UNUSED(p);
+    static_assert(!std::is_const<T>::value, "unexpected type");
+}
+
+template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
+    // Compute magic values to divide by these six numbers.
+    init_fastdiv_values(p.ne02*p.ne01*p.ne00,  p.ne0_012mp,    p.ne0_012L);
+    init_fastdiv_values(p.ne01*p.ne00,         p.ne0_01mp,     p.ne0_01L);
+    init_fastdiv_values(p.ne00,                p.ne0_0mp,      p.ne0_0L);
+    init_fastdiv_values(p.ne12*p.ne11*p.ne10,  p.ne1_012mp,    p.ne1_012L);
+    init_fastdiv_values(p.ne11*p.ne10,         p.ne1_01mp,     p.ne1_01L);
+    init_fastdiv_values(p.ne10,                p.ne1_0mp,      p.ne1_0L);
+}
+
+struct vk_op_binary_push_constants {
+    uint32_t ne;
+    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
+    uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
+    uint32_t misalign_offsets;
+    float param1; float param2; int32_t param3;
+};
+
+struct vk_op_diag_mask_push_constants {
+    uint32_t ncols;
+    uint32_t rows_per_channel;
+    int32_t n_past;
+};
+
+struct vk_op_rope_push_constants {
+    uint32_t ncols;
+    uint32_t n_dims;
+    float freq_scale;
+    uint32_t p_delta_rows;
+    float freq_base;
+    float ext_factor;
+    float attn_factor;
+    float corr_dims[2];
+    float theta_scale;
+    uint32_t has_ff;
+};
+
+struct vk_op_soft_max_push_constants {
+    uint32_t KX;
+    uint32_t KY;
+    float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint32_t n_head_log2;
+    uint32_t nrows_x;
+};
+
+struct vk_op_argsort_push_constants {
+    uint32_t ncols;
+    uint32_t ncols_pad;
+    int32_t order;
+};
+
+struct vk_op_im2col_push_constants {
+    uint32_t batch_offset; uint32_t offset_delta;
+    uint32_t IC;
+    uint32_t IW; uint32_t IH;
+    uint32_t OW; uint32_t OH;
+    uint32_t KW; uint32_t KH;
+    uint32_t pelements;
+    uint32_t CHW;
+    int32_t s0; int32_t s1;
+    int32_t p0; int32_t p1;
+    int32_t d0; int32_t d1;
+};
+
+struct vk_op_timestep_embedding_push_constants {
+    uint32_t nb1;
+    uint32_t dim;
+    uint32_t max_period;
+};
+
+struct vk_op_pool2d_push_constants {
+    uint32_t IW; uint32_t IH;
+    uint32_t OW; uint32_t OH;
+    uint32_t OC;
+    uint32_t pelements;
+    uint32_t op;
+    int32_t k0; int32_t k1;
+    int32_t s0; int32_t s1;
+    int32_t p0; int32_t p1;
+};
+
+struct vk_op_rwkv_wkv6_push_constants {
+    uint32_t B;
+    uint32_t T;
+    uint32_t C;
+    uint32_t H;
+};
+
+// Allow pre-recording command buffers
+struct vk_staging_memcpy {
+    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
+
+    void * dst;
+    const void * src;
+    size_t n;
+};
+
+struct vk_op_upscale_push_constants {
+    uint32_t ne; uint32_t a_offset; uint32_t d_offset;
+    uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
+    float sf0; float sf1; float sf2; float sf3;
+};
+
+struct vk_context_struct {
+    vk_submission * s;
+    std::vector<vk_sequence> seqs;
+
+    int exit_tensor_idx;
+
+    std::vector<vk_staging_memcpy> in_memcpys;
+    std::vector<vk_staging_memcpy> out_memcpys;
+
+    vk_queue * q;
+};
+typedef std::shared_ptr<vk_context_struct> vk_context;
+typedef std::weak_ptr<vk_context_struct> vk_context_ref;
+
+struct ggml_vk_garbage_collector {
+    std::vector<vk_semaphore> tl_semaphores;
+    std::vector<vk_semaphore> semaphores;
+    std::vector<vk::Event> events;
+    std::vector<vk_buffer> temp_buffers;
+    std::vector<vk_context> contexts;
+};
+
+#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+static std::string format_size(size_t size) {
+    const size_t kib = 1024;
+    const size_t mib = kib * 1024;
+    const size_t gib = mib * 1024;
+
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(2);
+
+    if (size >= gib) {
+        oss << static_cast<double>(size) / gib << " GiB";
+    } else if (size >= mib) {
+        oss << static_cast<double>(size) / mib << " MiB";
+    } else if (size >= kib) {
+        oss << static_cast<double>(size) / kib << " KiB";
+    } else {
+        oss << size << " B";
+    }
+
+    return oss.str();
+}
+
+static std::mutex log_mutex;
+
+class vk_memory_logger {
+public:
+    vk_memory_logger(): total_device(0), total_host(0) {}
+    void log_allocation(vk_buffer_ref buf_ref, size_t size);
+    void log_deallocation(vk_buffer_ref buf_ref);
+
+private:
+    std::map<vk::Buffer, size_t> allocations; // Track allocations
+    size_t total_device;
+    size_t total_host;
+};
+#else
+#define VK_LOG_MEMORY(msg) ((void) 0)
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
+#if defined(GGML_VULKAN_PERF)
+
+class vk_perf_logger {
+public:
+    void print_timings() {
+        std::cerr << "----------------\nVulkan Timings:" << std::endl;
+        for (const auto& t : timings) {
+            uint64_t total = 0;
+            for (const auto& time : t.second) {
+                total += time;
+            }
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+        }
+
+        timings.clear();
+    }
+
+    void log_timing(const ggml_tensor * node, uint64_t time) {
+        if (node->op == GGML_OP_UNARY) {
+            timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
+            return;
+        }
+        if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
+            const uint64_t m = node->src[0]->ne[1];
+            const uint64_t n = node->src[1]->ne[1];
+            const uint64_t k = node->src[1]->ne[0];
+            std::string name = ggml_op_name(node->op);
+            if (n == 1) {
+                name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
+            } else {
+                name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
+            }
+            timings[name].push_back(time);
+            return;
+        }
+        timings[ggml_op_name(node->op)].push_back(time);
+    }
+private:
+    std::map<std::string, std::vector<uint64_t>> timings;
+};
+#endif // GGML_VULKAN_PERF
+
+struct ggml_backend_vk_context {
+    std::string name;
+
+    vk_device device;
+
+    size_t semaphore_idx, event_idx;
+    ggml_vk_garbage_collector gc;
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
+    vk::Fence fence;
+
+    vk_buffer buffer_pool[MAX_VK_BUFFERS];
+
+    vk_context_ref compute_ctx;
+    vk_context_ref transfer_ctx;
+
+    std::vector<vk_context_ref> tensor_ctxs;
+};
+
+static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
+
+static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
+    if (tensor->view_src) {
+        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
+    }
+    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+}
+
+struct ggml_backend_vk_buffer_context {
+    vk_device_ref device;
+    vk_buffer dev_buffer;
+    std::string name;
+
+    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
+        device(device),
+        dev_buffer(dev_buffer),
+        name(name) {
+    }
+
+    ~ggml_backend_vk_buffer_context() {
+        ggml_vk_destroy_buffer(dev_buffer);
+    }
+};
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    const std::string type = device ? "device" : "host";
+    allocations[buf->buffer] = size;
+    total_device += device ? size : 0;
+    total_host += device ? 0 : size;
+    VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+}
+
+void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    std::string type = device ? "device" : "host";
+    auto it = allocations.find(buf->buffer);
+    total_device -= device ? it->second : 0;
+    total_host -= device ? 0 : it->second;
+    if (it != allocations.end()) {
+        VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+        allocations.erase(it);
+    } else {
+        VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+    }
+}
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
+struct vk_instance_t {
+    vk::Instance instance;
+
+    std::vector<size_t> device_indices;
+    vk_device devices[GGML_VK_MAX_DEVICES];
+};
+
+static bool vk_instance_initialized = false;
+static vk_instance_t vk_instance;
+
+#ifdef GGML_VULKAN_CHECK_RESULTS
+static size_t vk_skip_checks;
+static size_t vk_output_tensor;
+
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
+static void ggml_vk_check_results_0(ggml_tensor * tensor);
+static void ggml_vk_check_results_1(ggml_tensor * tensor);
+#endif
+
+typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
+static void ggml_backend_vk_free(ggml_backend_t backend);
+
+// variables to track number of compiles in progress
+static uint32_t compile_count = 0;
+static std::mutex compile_count_mutex;
+static std::condition_variable compile_count_cond;
+
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
+                                         uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
+                                         bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
+    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << pipeline->name << ", " << entrypoint << ", " << parameter_count <<
+                 ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
+                 disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
+    GGML_ASSERT(parameter_count > 0);
+    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
+
+    vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
+    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
+
+    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
+    std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
+    for (uint32_t i = 0; i < parameter_count; i++) {
+        dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
+        dsl_binding_flags.push_back({});
+    }
+
+    vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
+
+    vk::PushConstantRange pcr(
+        vk::ShaderStageFlagBits::eCompute,
+        0,
+        pipeline->push_constant_size
+    );
+
+    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
+        {},
+        dsl_binding);
+    descriptor_set_layout_create_info.setPNext(&dslbfci);
+    pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+
+    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
+    vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
+    pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
+
+    pipeline->descriptor_set_idx = 0;
+
+    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
+    pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
+
+    std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
+
+    for (size_t i = 0; i < specialization_constants.size(); i++) {
+        specialization_entries[i].constantID = i;
+        specialization_entries[i].offset = i * sizeof(uint32_t);
+        specialization_entries[i].size = sizeof(uint32_t);
+    }
+
+    vk::SpecializationInfo specialization_info(
+        specialization_entries.size(),
+        specialization_entries.data(),
+        specialization_constants.size() * sizeof(uint32_t),
+        specialization_constants.data()
+    );
+
+    vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
+
+    if (device->subgroup_require_full_support && require_full_subgroups) {
+        pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
+    }
+
+    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
+            pipeline_shader_stage_create_flags,
+            vk::ShaderStageFlagBits::eCompute,
+            pipeline->shader_module,
+            entrypoint.c_str(),
+            &specialization_info);
+
+    vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
+    pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
+    if (device->subgroup_size_control && required_subgroup_size > 0) {
+        GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size);
+        pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info);
+    }
+
+    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
+        vk::PipelineCreateFlags{},
+        pipeline_shader_create_info,
+        pipeline->layout);
+
+    vk::PipelineRobustnessCreateInfoEXT rci;
+
+    if (device->pipeline_robustness && disable_robustness) {
+        rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
+        rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
+        compute_pipeline_create_info.setPNext(&rci);
+    }
+
+    try {
+        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
+    }
+    pipeline->compiled = true;
+
+    {
+        std::lock_guard<std::mutex> guard(device->mutex);
+        device->pipelines.insert({ pipeline->name, pipeline });
+    }
+
+    {
+        std::lock_guard<std::mutex> guard(compile_count_mutex);
+        assert(compile_count > 0);
+        compile_count--;
+    }
+    compile_count_cond.notify_all();
+}
+
+static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
+    VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
+    for (auto& pool : pipeline->descriptor_pools) {
+        device.destroyDescriptorPool(pool);
+    }
+    pipeline->descriptor_pools.clear();
+    pipeline->descriptor_sets.clear();
+    pipeline->descriptor_set_idx = 0;
+
+    device.destroyDescriptorSetLayout(pipeline->dsl);
+
+    device.destroyPipelineLayout(pipeline->layout);
+
+    device.destroyShaderModule(pipeline->shader_module);
+
+    device.destroyPipeline(pipeline->pipeline);
+}
+
+static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
+    VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
+    device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+    if (!pipeline->compiled) {
+        pipeline->needed = true;
+        device->need_compiles = true;
+    }
+}
+
+static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
+    std::lock_guard<std::mutex> guard(device->mutex);
+
+    for (auto& pair : device->pipeline_descriptor_set_requirements) {
+        vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
+        const uint64_t n = pair.second;
+
+        VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
+
+        if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
+            // Enough descriptors are available
+            continue;
+        }
+
+        uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
+        uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+        uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+
+        while (to_alloc > 0) {
+            const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
+            to_alloc -= alloc_count;
+            pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+
+            if (pool_idx >= pipeline->descriptor_pools.size()) {
+                vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
+                vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
+                pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
+            }
+
+            std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
+            for (uint32_t i = 0; i < alloc_count; i++) {
+                layouts[i] = pipeline->dsl;
+            }
+            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
+            std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+            pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
+
+            pool_idx++;
+        }
+    }
+}
+
+static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
+    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
+    pipeline->descriptor_set_idx = 0;
+}
+
+static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
+    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
+    std::lock_guard<std::mutex> guard(device->mutex);
+
+    if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
+        // Reuse command buffer
+        return q.cmd_buffers[q.cmd_buffer_idx++];
+    }
+
+    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
+        q.pool,
+        vk::CommandBufferLevel::ePrimary,
+        1);
+    const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
+    auto buf = cmd_buffers.front();
+
+    q.cmd_buffers.push_back(buf);
+    q.cmd_buffer_idx++;
+
+    return buf;
+}
+
+static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
+    VK_LOG_DEBUG("ggml_vk_create_submission()");
+    vk_submission s;
+    s.buffer = ggml_vk_create_cmd_buffer(device, q);
+    s.wait_semaphores = std::move(wait_semaphores);
+    s.signal_semaphores = std::move(signal_semaphores);
+    return s;
+}
+
+static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
+    if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
+        return;
+    }
+    VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
+
+    std::vector<std::vector<uint64_t>> tl_wait_vals;
+    std::vector<std::vector<uint64_t>> tl_signal_vals;
+    std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
+    std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
+    std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
+    std::vector<vk::SubmitInfo> submit_infos;
+    int idx = -1;
+    std::vector<std::vector<vk::PipelineStageFlags>> stage_flags;
+
+    size_t reserve = 0;
+
+    for (const auto& sequence : ctx->seqs) {
+        reserve += sequence.size();
+    }
+
+    // Pre-reserve vectors to prevent reallocation, which invalidates pointers
+    tl_wait_semaphores.reserve(reserve);
+    tl_wait_vals.reserve(reserve);
+    tl_signal_semaphores.reserve(reserve);
+    tl_signal_vals.reserve(reserve);
+    tl_submit_infos.reserve(reserve);
+    submit_infos.reserve(reserve);
+    stage_flags.reserve(reserve);
+
+    for (const auto& sequence : ctx->seqs) {
+        for (const auto& submission : sequence) {
+            stage_flags.push_back({});
+            idx++;
+            tl_wait_vals.push_back({});
+            tl_wait_semaphores.push_back({});
+            tl_signal_vals.push_back({});
+            tl_signal_semaphores.push_back({});
+            for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
+                stage_flags[idx].push_back(ctx->q->stage_flags);
+                tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
+                tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
+            }
+            for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
+                tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
+                tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
+            }
+            tl_submit_infos.push_back({
+                (uint32_t) submission.wait_semaphores.size(),
+                tl_wait_vals[idx].data(),
+                (uint32_t) submission.signal_semaphores.size(),
+                tl_signal_vals[idx].data(),
+            });
+            tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo;
+            tl_submit_infos[idx].pNext = nullptr;
+            vk::SubmitInfo si{
+                (uint32_t) submission.wait_semaphores.size(),
+                tl_wait_semaphores[idx].data(),
+                stage_flags[idx].data(),
+                1,
+                &submission.buffer,
+                (uint32_t) submission.signal_semaphores.size(),
+                tl_signal_semaphores[idx].data(),
+            };
+            si.setPNext(&tl_submit_infos[idx]);
+            submit_infos.push_back(si);
+        }
+    }
+
+    ctx->q->queue.submit(submit_infos, fence);
+
+    ctx->seqs.clear();
+}
+
+static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
+    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
+    const uint32_t qfsize = queue_family_props.size();
+
+    // Try with avoid preferences first
+    for (uint32_t i = 0; i < qfsize; i++) {
+        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) {
+            return i;
+        }
+    }
+
+    // Fall back to only required
+    for (size_t i = 0; i < qfsize; i++) {
+        if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) {
+            return i;
+        }
+    }
+
+    // Fall back to reusing compute queue
+    for (size_t i = 0; i < qfsize; i++) {
+        if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) {
+            return i;
+        }
+    }
+
+    // Fall back to ignoring min_num_queries
+    for (size_t i = 0; i < qfsize; i++) {
+        if (queue_family_props[i].queueFlags & required) {
+            return i;
+        }
+    }
+
+    // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+    // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+    if (compute_index >= 0) {
+        return compute_index;
+    }
+
+    std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
+
+    for(auto &q_family : queue_family_props) {
+        std::cerr << "Queue number: "  + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl;
+    }
+    abort();
+}
+
+static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) {
+    VK_LOG_DEBUG("ggml_vk_create_queue()");
+    std::lock_guard<std::mutex> guard(device->mutex);
+
+    q.queue_family_index = queue_family_index;
+    q.transfer_only = transfer_only;
+
+    vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
+    q.pool = device->device.createCommandPool(command_pool_create_info_compute);
+
+    q.cmd_buffer_idx = 0;
+
+    q.queue = device->device.getQueue(queue_family_index, queue_index);
+
+    q.stage_flags = stage_flags;
+}
+
+static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
+    vk_context result = std::make_shared<vk_context_struct>();
+    VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
+    ctx->gc.contexts.emplace_back(result);
+    result->q = &q;
+    return result;
+}
+
+static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
+    vk_context result = std::make_shared<vk_context_struct>();
+    VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
+    result->q = &q;
+    return result;
+}
+
+static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
+    vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
+    vk::SemaphoreCreateInfo ci{};
+    ci.setPNext(&tci);
+    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
+    ctx->gc.semaphores.push_back({ semaphore, 0 });
+    return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
+}
+
+static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
+    if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
+        vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
+        vk::SemaphoreCreateInfo ci{};
+        ci.setPNext(&tci);
+        vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
+        ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
+    }
+    return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
+}
+
+static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
+    if (ctx->event_idx >= ctx->gc.events.size()) {
+        ctx->gc.events.push_back(ctx->device->device.createEvent({}));
+    }
+    return ctx->gc.events[ctx->event_idx++];
+}
+
+static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
+    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
+    std::lock_guard<std::mutex> guard(device->mutex);
+
+    // Requires command buffers to be done
+    device->device.resetCommandPool(q.pool);
+    q.cmd_buffer_idx = 0;
+}
+
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+        vk::MemoryType memory_type = mem_props->memoryTypes[i];
+        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+            (flags & memory_type.propertyFlags) == flags &&
+            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+            return static_cast<int32_t>(i);
+        }
+    }
+    return UINT32_MAX;
+}
+
+static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
+    VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
+    if (size > device->max_memory_allocation_size) {
+        throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+    }
+
+    std::lock_guard<std::mutex> guard(device->mutex);
+
+    vk_buffer buf = std::make_shared<vk_buffer_struct>();
+
+    if (size == 0) {
+        buf->size = 0;
+        return buf;
+    }
+
+    vk::BufferCreateInfo buffer_create_info{
+        vk::BufferCreateFlags(),
+        size,
+        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
+        vk::SharingMode::eExclusive,
+        0,
+        nullptr,
+    };
+
+    buf->buffer = device->device.createBuffer(buffer_create_info);
+
+    vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);
+
+    vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
+
+    uint32_t memory_type_index = UINT32_MAX;
+
+    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+    buf->memory_property_flags = req_flags;
+
+    if (memory_type_index == UINT32_MAX && fallback_flags) {
+        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+        buf->memory_property_flags = fallback_flags;
+    }
+
+    if (memory_type_index == UINT32_MAX) {
+        device->device.destroyBuffer(buf->buffer);
+        throw vk::OutOfDeviceMemoryError("No suitable memory type found");
+    }
+
+    try {
+        buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
+    } catch (const vk::SystemError& e) {
+        if (buf->memory_property_flags != fallback_flags) {
+            // Try again with fallback flags
+            memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+            buf->memory_property_flags = fallback_flags;
+
+            try {
+                buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index });
+            }
+            catch (const vk::SystemError& e) {
+                device->device.destroyBuffer(buf->buffer);
+                throw e;
+            }
+        } else {
+            // Out of Host/Device memory, clean up buffer
+            device->device.destroyBuffer(buf->buffer);
+            throw e;
+        }
+    }
+    buf->ptr = nullptr;
+
+    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+        buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+    }
+
+    device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
+
+    buf->device = device;
+    buf->size = size;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    device->memory_logger->log_allocation(buf, size);
+#endif
+
+    return buf;
+}
+
+static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
+    try {
+        return ggml_vk_create_buffer(device, size, req_flags, fallback_flags);
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
+    }
+}
+
+static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
+    vk_buffer buf;
+    try {
+        if (device->prefer_host_memory) {
+            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
+        } else if (device->uma) {
+            // Fall back to host memory type
+            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+        } else {
+            // use rebar if available, otherwise fallback to device only visible memory
+            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
+        }
+    } catch (const vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        throw e;
+    }
+
+    return buf;
+}
+
+static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+    if (buf == nullptr) {
+        return;
+    }
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    if (buf->device != nullptr) {
+        buf->device->memory_logger->log_deallocation(buf);
+    }
+#endif
+
+    buf.reset();
+}
+
+static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
+    return { buf, 0, VK_WHOLE_SIZE };
+}
+
+static void ggml_vk_sync_buffers(vk_context& ctx) {
+    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
+
+    const bool transfer_queue = ctx->q->transfer_only;
+
+    ctx->s->buffer.pipelineBarrier(
+        ctx->q->stage_flags,
+        ctx->q->stage_flags,
+        {},
+        { {
+          { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
+          { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }
+        } },
+        {},
+        {}
+    );
+}
+
+static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
+    VK_LOG_DEBUG("ggml_vk_wait_events()");
+    if (events.empty()) {
+        return;
+    }
+
+    ctx->s->buffer.waitEvents(
+        events,
+        ctx->q->stage_flags,
+        ctx->q->stage_flags,
+        {},
+        {},
+        {}
+    );
+}
+
+// number of rows/cols for flash attention shader
+static constexpr uint32_t flash_attention_num_small_rows = 32;
+static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
+    GGML_UNUSED(clamp);
+
+    // small rows, large cols
+    if (small_rows) {
+        return {flash_attention_num_small_rows, 128};
+    }
+    // small cols to reduce register count
+    if (ggml_is_quantized(type) || D == 256) {
+        return {64, 32};
+    }
+    return {64, 64};
+};
+
+static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
+
+    uint32_t lut_size = 0;
+    switch (src0_type) {
+    case GGML_TYPE_IQ2_XXS:
+        lut_size = 8*256;
+        break;
+    case GGML_TYPE_IQ2_XS:
+        lut_size = 8*512;
+        break;
+    case GGML_TYPE_IQ2_S:
+        lut_size = 8*1024;
+        break;
+    case GGML_TYPE_IQ3_XXS:
+        lut_size = 4*256;
+        break;
+    case GGML_TYPE_IQ3_S:
+        lut_size = 4*512;
+        break;
+    case GGML_TYPE_IQ4_NL:
+    case GGML_TYPE_IQ4_XS:
+        lut_size = 4*16;
+        break;
+    default:
+        break;
+    }
+
+    // Needs to be kept up to date on shader changes
+    const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1;
+    const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
+    const uint32_t warps = warptile[0] / warptile[10];
+
+    const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
+    const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
+    const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;
+
+    const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
+    const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
+
+    VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), "
+                 "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", supported=" << supported);
+
+    return supported;
+}
+
+static void ggml_vk_load_shaders(vk_device& device) {
+    VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
+
+    // some shaders have a minimum subgroup size
+    const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
+    const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
+
+    // mulmat
+    std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
+                          l_warptile_mmq, m_warptile_mmq, s_warptile_mmq,
+                          l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k,
+                          l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid;
+    std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms,
+                            l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms,
+                            l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k,
+                            l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms;
+
+    uint32_t l_align, m_align, s_align;
+    if (device->coopmat2) {
+        // spec constants and tile sizes for non-quant matmul/matmul_id
+        l_warptile = { 256, 128, 256, 64 };
+        m_warptile = { 256, 128, 128, 64 };
+        s_warptile = { 128,  64,  64, 64 };
+        l_wg_denoms = {128, 256, 1 };
+        m_wg_denoms = {128, 128, 1 };
+        s_wg_denoms = { 64,  64, 1 };
+
+        // spec constants and tile sizes for quant matmul (non-Qi_K)
+        l_warptile_mmq = { 256, 128, 256, 64 };
+        m_warptile_mmq = { 256, 128, 128, 64 };
+        s_warptile_mmq = { 256, 128, 128, 64 };
+        l_mmq_wg_denoms = { 128, 256, 1 };
+        m_mmq_wg_denoms = { 128, 128, 1 };
+        s_mmq_wg_denoms = { 128, 128, 1 };
+
+        // spec constants and tile sizes for quant matmul (Qi_K)
+        l_warptile_mmq_k = { 256, 128, 512, 16 };
+        m_warptile_mmq_k = { 256, 128, 256, 16 };
+        s_warptile_mmq_k = { 256, 32, 128, 64 };
+        l_mmq_wg_denoms_k = { 128, 512, 1 };
+        m_mmq_wg_denoms_k = { 128, 256, 1 };
+        s_mmq_wg_denoms_k = { 32, 128, 1 };
+
+        // spec constants and tile sizes for quant matmul_id
+        l_warptile_mmqid = { 256, 128, 128, 16 };
+        m_warptile_mmqid = { 256, 128, 64, 16 };
+        s_warptile_mmqid = { 256, 64, 64, 16 };
+        l_mmqid_wg_denoms = { 128, 128, 1 };
+        m_mmqid_wg_denoms = { 128, 64, 1 };
+        s_mmqid_wg_denoms = { 64, 64, 1 };
+
+        l_align = 128;
+        m_align =  64;
+        s_align =  32;
+    } else {
+        // Matrix cores require different warp group sizes
+        const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4;
+        const uint32_t tm_m = device->coopmat_support ? device->coopmat_m : 4;
+        const uint32_t tm_s = device->coopmat_support ? device->coopmat_m : 2;
+        const uint32_t tn_l = device->coopmat_support ? device->coopmat_n : 4;
+        const uint32_t tn_m = device->coopmat_support ? device->coopmat_n : 2;
+        const uint32_t tn_s = device->coopmat_support ? device->coopmat_n : 2;
+        const uint32_t tk_l = device->coopmat_support ? device->coopmat_k : 1;
+        const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1;
+        const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1;
+
+        l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
+        m_warptile = { 128,  64,  64, 16, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
+        s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
+
+        l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
+        m_warptile_mmq = { 128,  64,  64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
+        s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
+
+        l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
+        m_mmq_wg_denoms = m_wg_denoms = { 64,  64, 1 };
+        s_mmq_wg_denoms = s_wg_denoms = { 32,  32, 1 };
+        l_align = 128;
+        m_align =  64;
+        s_align =  32;
+
+        for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
+            ggml_type t = (ggml_type)i;
+            // Disable medium and large matrix multiplication if not enough shared memory is available
+            // Check mmq warptiles as the largest configuration
+            // Throw an error if not enough for any matrix multiplication is available
+            if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false, t)) {
+                std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl;
+                throw std::runtime_error("Shared memory size too small for matrix multiplication.");
+            } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false, t)) {
+                device->mul_mat_m[i] = false;
+                device->mul_mat_l[i] = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false, t)) {
+                device->mul_mat_l[i] = false;
+            }
+
+            // Disable mul_mat_id if not enough shared memory is available
+            if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true, t)) {
+                device->mul_mat_id_s[i] = false;
+                device->mul_mat_id_m[i] = false;
+                device->mul_mat_id_l[i] = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true, t)) {
+                device->mul_mat_id_m[i] = false;
+                device->mul_mat_id_l[i] = false;
+            } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true, t)) {
+                device->mul_mat_id_l[i] = false;
+            }
+        }
+    }
+
+    if (!device->pipeline_matmul_f32) {
+        device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_f32_f16) {
+        device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_id_f32) {
+        device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+
+    std::vector<std::future<void>> compiles;
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
+                                              uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
+                                              uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
+
+        if (!pipeline) {
+            pipeline = std::make_shared<vk_pipeline_struct>();
+            pipeline->name = name;
+            pipeline->parameter_count = parameter_count;
+            pipeline->push_constant_size = push_constant_size;
+            pipeline->wg_denoms = wg_denoms;
+            pipeline->align = align;
+        }
+
+        if (!pipeline->needed || pipeline->compiled) {
+            return;
+        }
+        {
+            // wait until fewer than N compiles are in progress
+            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
+            std::unique_lock<std::mutex> guard(compile_count_mutex);
+            while (compile_count >= N) {
+                compile_count_cond.wait(guard);
+            }
+            compile_count++;
+        }
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
+                                      parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
+    };
+
+#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    if (device->coopmat2) {
+
+        auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
+            return {fa_rows_cols(D, clamp, type, small_rows)[0], 1, 1};
+        };
+
+        auto const &fa_spec_constants = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
+            // For large number of rows, 128 invocations seems to work best.
+            // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
+            // can't use 256 for D==80.
+            uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128;
+            auto rows_cols = fa_rows_cols(D, clamp, type, small_rows);
+            return {wg_size, rows_cols[0], rows_cols[1], (D), clamp};
+        };
+
+#define CREATE_FA2(TYPE, NAMELC, D) \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]);     \
+
+#define CREATE_FA(TYPE, NAMELC) \
+        CREATE_FA2(TYPE, NAMELC, 64) \
+        CREATE_FA2(TYPE, NAMELC, 80) \
+        CREATE_FA2(TYPE, NAMELC, 96) \
+        CREATE_FA2(TYPE, NAMELC, 112) \
+        CREATE_FA2(TYPE, NAMELC, 128) \
+        CREATE_FA2(TYPE, NAMELC, 256)
+
+        CREATE_FA(GGML_TYPE_F16, f16)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0)
+        CREATE_FA(GGML_TYPE_Q4_1, q4_1)
+        CREATE_FA(GGML_TYPE_Q5_0, q5_0)
+        CREATE_FA(GGML_TYPE_Q5_1, q5_1)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0)
+        // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
+        //CREATE_FA(GGML_TYPE_Q2_K, q2_k)
+        //CREATE_FA(GGML_TYPE_Q3_K, q3_k)
+        //CREATE_FA(GGML_TYPE_Q4_K, q4_k)
+        //CREATE_FA(GGML_TYPE_Q5_K, q5_k)
+        //CREATE_FA(GGML_TYPE_Q6_K, q6_k)
+        //CREATE_FA(GGML_TYPE_IQ2_XXS, iq2_xxs)
+        //CREATE_FA(GGML_TYPE_IQ2_XS, iq2_xs)
+        //CREATE_FA(GGML_TYPE_IQ2_S, iq2_s)
+        //CREATE_FA(GGML_TYPE_IQ3_XXS, iq3_xxs)
+        //CREATE_FA(GGML_TYPE_IQ3_S, iq3_s)
+        //CREATE_FA(GGML_TYPE_IQ4_XS, iq4_xs)
+        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
+#undef CREATE_FA
+
+        // Create 6 variants, {s,m,l}x{unaligned,aligned}
+#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align);   \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
+
+        // Create 2 variants, {f16,f32} accumulator
+#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
+        CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT)   \
+        CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT)   \
+
+        CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f16,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f16,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
+
+        CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc,  matmul_id_iq2_xs_f16,  , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc,   matmul_id_iq2_s_f16,   , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc,   matmul_id_iq3_s_f16,   , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc,  matmul_id_iq4_xs_f16,  , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+        CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc,  matmul_id_iq4_nl_f16,  , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
+#undef CREATE_MM
+#undef CREATE_MM2
+    } else
+#endif  // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (device->coopmat_support) {
+        // Create 6 variants, {s,m,l}x{unaligned,aligned}
+#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true);   \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true);   \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true);   \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true);   \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true);   \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \
+
+        // Create 2 variants, {f16,f32} accumulator
+#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->coopmat_acc_f16_support) { \
+            CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        } \
+        if (device->coopmat_acc_f32_support) { \
+            CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        } \
+
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+
+        if (device->coopmat_acc_f16_support) {
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        } else {
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+            CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        }
+
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+
+        if (device->coopmat_acc_f16_support) {
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc,  matmul_id_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc,   matmul_id_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc,   matmul_id_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc,  matmul_id_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc,  matmul_id_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        } else {
+            CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+
+            CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc,  matmul_id_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc,   matmul_id_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc,   matmul_id_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc,  matmul_id_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+            CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc,  matmul_id_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        }
+#undef CREATE_MM2
+#undef CREATE_MM
+    } else
+#endif  // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (device->fp16) {
+        // Create 6 variants, {s,m,l}x{unaligned,aligned}
+#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align);   \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align);   \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
+
+        // Create 2 variants, {f16,f32} accumulator
+#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc,  matmul_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc,   matmul_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc,   matmul_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc,  matmul_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc,  matmul_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XS,  pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc,  matmul_id_iq2_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_S,   pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc,   matmul_id_iq2_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_S,   pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc,   matmul_id_iq3_s_f32,   _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_XS,  pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc,  matmul_id_iq4_xs_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_NL,  pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc,  matmul_id_iq4_nl_f32,  _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+#undef CREATE_MM2
+#undef CREATE_MM
+    } else {
+        // Create 6 variants, {s,m,l}x{unaligned,aligned}
+#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1);   \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1);   \
+        if (device->mul_mat ## ID ## _l[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align);   \
+        if (device->mul_mat ## ID ## _m[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align);   \
+        if (device->mul_mat ## ID ## _s[TYPE]) \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align);   \
+
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
+
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc,  matmul_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc,   matmul_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc,   matmul_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc,  matmul_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc,  matmul_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
+
+        CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
+
+        CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+
+        CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc,  matmul_id_iq2_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc,   matmul_id_iq2_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc,   matmul_id_iq3_s_f32,   , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc,  matmul_id_iq4_xs_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+        CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc,  matmul_id_iq4_nl_f32,  , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
+#undef CREATE_MM
+    }
+
+    // mul mat vec
+
+    // the number of rows computed per shader depends on GPU model and quant
+    uint32_t rm_stdq = 1;
+    uint32_t rm_kq = 2;
+    if (device->vendor_id == VK_VENDOR_ID_AMD) {
+        if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
+            rm_stdq = 2;
+            rm_kq = 4;
+        }
+    } else if (device->vendor_id == VK_VENDOR_ID_INTEL)
+        rm_stdq = 2;
+
+    for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1),  mul_mat_vec_f32_f32_f32_len,  mul_mat_vec_f32_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32_"+std::to_string(i+1),  mul_mat_vec_f16_f32_f32_len,  mul_mat_vec_f16_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f32_f32_len, mul_mat_vec_iq2_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_XS][i],  "mul_mat_vec_iq2_xs_f32_f32_"+std::to_string(i+1),  mul_mat_vec_iq2_xs_f32_f32_len,  mul_mat_vec_iq2_xs_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ2_S][i],   "mul_mat_vec_iq2_s_f32_f32_"+std::to_string(i+1),   mul_mat_vec_iq2_s_f32_f32_len,   mul_mat_vec_iq2_s_f32_f32_data,   "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f32_f32_len, mul_mat_vec_iq3_xxs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ3_S][i],   "mul_mat_vec_iq3_s_f32_f32_"+std::to_string(i+1),   mul_mat_vec_iq3_s_f32_f32_len,   mul_mat_vec_iq3_s_f32_f32_data,   "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_XS][i],  "mul_mat_vec_iq4_xs_f32_f32_"+std::to_string(i+1),  mul_mat_vec_iq4_xs_f32_f32_len,  mul_mat_vec_iq4_xs_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i],  "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1),  mul_mat_vec_iq4_nl_f32_f32_len,  mul_mat_vec_iq4_nl_f32_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
+
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1),  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1),  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq2_xxs_f16_f32_len, mul_mat_vec_iq2_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_XS][i],  "mul_mat_vec_iq2_xs_f16_f32_"+std::to_string(i+1),  mul_mat_vec_iq2_xs_f16_f32_len,  mul_mat_vec_iq2_xs_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ2_S][i],   "mul_mat_vec_iq2_s_f16_f32_"+std::to_string(i+1),   mul_mat_vec_iq2_s_f16_f32_len,   mul_mat_vec_iq2_s_f16_f32_data,   "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq3_xxs_f16_f32_len, mul_mat_vec_iq3_xxs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ3_S][i],   "mul_mat_vec_iq3_s_f16_f32_"+std::to_string(i+1),   mul_mat_vec_iq3_s_f16_f32_len,   mul_mat_vec_iq3_s_f16_f32_data,   "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_XS][i],  "mul_mat_vec_iq4_xs_f16_f32_"+std::to_string(i+1),  mul_mat_vec_iq4_xs_f16_f32_len,  mul_mat_vec_iq4_xs_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i],  "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1),  mul_mat_vec_iq4_nl_f16_f32_len,  mul_mat_vec_iq4_nl_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
+    }
+
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", mul_mat_vec_id_iq2_xxs_f32_len, mul_mat_vec_id_iq2_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_XS],  "mul_mat_vec_id_iq2_xs_f32",  mul_mat_vec_id_iq2_xs_f32_len,  mul_mat_vec_id_iq2_xs_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ2_S],   "mul_mat_vec_id_iq2_s_f32",   mul_mat_vec_id_iq2_s_f32_len,   mul_mat_vec_id_iq2_s_f32_data,   "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", mul_mat_vec_id_iq3_xxs_f32_len, mul_mat_vec_id_iq3_xxs_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ3_S],   "mul_mat_vec_id_iq3_s_f32",   mul_mat_vec_id_iq3_s_f32_len,   mul_mat_vec_id_iq3_s_f32_data,   "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_XS],  "mul_mat_vec_id_iq4_xs_f32",  mul_mat_vec_id_iq4_xs_f32_len,  mul_mat_vec_id_iq4_xs_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL],  "mul_mat_vec_id_iq4_nl_f32",  mul_mat_vec_id_iq4_nl_f32_len,  mul_mat_vec_id_iq4_nl_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
+
+    // dequant shaders
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XXS], "dequant_iq2_xxs", dequant_iq2_xxs_len, dequant_iq2_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XS],  "dequant_iq2_xs",  dequant_iq2_xs_len,  dequant_iq2_xs_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S],   "dequant_iq2_s",   dequant_iq2_s_len,   dequant_iq2_s_data,   "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S],   "dequant_iq3_s",   dequant_iq3_s_len,   dequant_iq3_s_data,   "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_XS],  "dequant_iq4_xs",  dequant_iq4_xs_len,  dequant_iq4_xs_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL],  "dequant_iq4_nl",  dequant_iq4_nl_len,  dequant_iq4_nl_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+
+    // get_rows
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32",  get_rows_f32_len,  get_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16",  get_rows_f16_len,  get_rows_f16_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs", get_rows_iq2_xxs_len, get_rows_iq2_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XS],  "get_rows_iq2_xs",  get_rows_iq2_xs_len,  get_rows_iq2_xs_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S],   "get_rows_iq2_s",   get_rows_iq2_s_len,   get_rows_iq2_s_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S],   "get_rows_iq3_s",   get_rows_iq3_s_len,   get_rows_iq3_s_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS],  "get_rows_iq4_xs",  get_rows_iq4_xs_len,  get_rows_iq4_xs_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl",  get_rows_iq4_nl_len,  get_rows_iq4_nl_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs_f32", get_rows_iq2_xxs_f32_len, get_rows_iq2_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XS],  "get_rows_iq2_xs_f32",  get_rows_iq2_xs_f32_len,  get_rows_iq2_xs_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S],   "get_rows_iq2_s_f32",   get_rows_iq2_s_f32_len,   get_rows_iq2_s_f32_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S],   "get_rows_iq3_s_f32",   get_rows_iq3_s_f32_len,   get_rows_iq3_s_f32_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_XS],  "get_rows_iq4_xs_f32",  get_rows_iq4_xs_f32_len,  get_rows_iq4_xs_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl_f32",  get_rows_iq4_nl_f32_len,  get_rows_iq4_nl_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16_norepeat, "add_f16_f32_f16_norepeat", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_mul_f32_norepeat, "mul_f32_norepeat", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_div_f32_norepeat, "div_f32_norepeat", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
+
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+
+    if (device->float_controls_rte_fp16) {
+        ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    } else {
+        ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    }
+
+    ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
+    if (device->float_controls_rte_fp16) {
+        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
+    } else {
+        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
+    }
+
+    ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
+
+    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+
+    for (auto &c : compiles) {
+        c.wait();
+    }
+    device->need_compiles = false;
+}
+
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
+
+static vk_device ggml_vk_get_device(size_t idx) {
+    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
+
+    if (vk_instance.devices[idx] == nullptr) {
+        VK_LOG_DEBUG("Initializing new vk_device");
+        vk_device device = std::make_shared<vk_device_struct>();
+        vk_instance.devices[idx] = device;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+        device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
+#endif
+#ifdef GGML_VULKAN_PERF
+        device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
+#endif
+
+        size_t dev_num = vk_instance.device_indices[idx];
+
+        std::vector<vk::PhysicalDevice> physical_devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        if (dev_num >= physical_devices.size()) {
+            std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
+            throw std::runtime_error("Device not found");
+        }
+
+        device->physical_device = physical_devices[dev_num];
+        const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
+
+        const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
+        device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
+
+        bool fp16_storage = false;
+        bool fp16_compute = false;
+        bool maintenance4_support = false;
+        bool sm_builtins = false;
+        bool amd_shader_core_properties2 = false;
+        bool pipeline_robustness = false;
+        bool coopmat2_support = false;
+        device->coopmat_support = false;
+
+        // Check if maintenance4 is supported
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
+                maintenance4_support = true;
+            } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
+                fp16_storage = true;
+            } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
+                fp16_compute = true;
+            } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) {
+                sm_builtins = true;
+            } else if (strcmp("VK_AMD_shader_core_properties2", properties.extensionName) == 0) {
+                amd_shader_core_properties2 = true;
+            } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
+                pipeline_robustness = true;
+            } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
+                device->subgroup_size_control = true;
+            } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
+                       !getenv("GGML_VK_DISABLE_COOPMAT")) {
+                device->coopmat_support = true;
+                device->coopmat_m = 0;
+                device->coopmat_n = 0;
+                device->coopmat_k = 0;
+            } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
+                       !getenv("GGML_VK_DISABLE_COOPMAT2")) {
+                coopmat2_support = true;
+            }
+        }
+
+        vk::PhysicalDeviceProperties2 props2;
+        vk::PhysicalDeviceMaintenance3Properties props3;
+        vk::PhysicalDeviceMaintenance4Properties props4;
+        vk::PhysicalDeviceSubgroupProperties subgroup_props;
+        vk::PhysicalDeviceDriverProperties driver_props;
+        vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
+        vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
+        vk::PhysicalDeviceVulkan12Properties vk12_props;
+        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
+
+        props2.pNext = &props3;
+        props3.pNext = &subgroup_props;
+        subgroup_props.pNext = &driver_props;
+        driver_props.pNext = &vk12_props;
+
+        VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props;
+
+        if (maintenance4_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&props4;
+            last_struct = (VkBaseOutStructure *)&props4;
+        }
+        if (sm_builtins) {
+            last_struct->pNext = (VkBaseOutStructure *)&sm_props;
+            last_struct = (VkBaseOutStructure *)&sm_props;
+        }
+        if (amd_shader_core_properties2) {
+            last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
+            last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
+        }
+        if (device->subgroup_size_control) {
+            last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
+            last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
+        }
+
+#if defined(VK_NV_cooperative_matrix2)
+        vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
+        if (coopmat2_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props;
+            last_struct = (VkBaseOutStructure *)&coopmat2_props;
+        }
+#endif
+
+        device->physical_device.getProperties2(&props2);
+        device->properties = props2.properties;
+        device->vendor_id = device->properties.vendorID;
+
+        const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
+
+        if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
+            device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
+        } else if (maintenance4_support) {
+            device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
+        } else {
+            device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
+        }
+
+        const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
+
+        if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
+            device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+#if defined(_WIN32)
+        } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+            // Limit batching of allocations to 1GB by default to avoid fragmentation issues
+            device->suballocation_block_size = 1024*1024*1024;
+#endif
+        } else {
+            device->suballocation_block_size = device->max_memory_allocation_size;
+        }
+        device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
+
+        device->subgroup_size = subgroup_props.subgroupSize;
+        device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
+        if (sm_builtins) {
+            device->shader_core_count = sm_props.shaderSMCount;
+        } else if (amd_shader_core_properties2) {
+            device->shader_core_count = amd_shader_core_properties2_props.activeComputeUnitCount;
+        } else {
+            device->shader_core_count = 0;
+        }
+        device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
+
+        const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
+
+        device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
+
+        if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
+            device->coopmat_support = false;
+        }
+
+        std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();
+
+        // Try to find a non-graphics compute queue and transfer-focused queues
+        const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
+        const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
+
+        const float priorities[] = { 1.0f, 1.0f };
+        device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
+
+        std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
+        if (compute_queue_family_index != transfer_queue_family_index) {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
+        } else if(!device->single_queue) {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
+        } else {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
+        }
+        vk::DeviceCreateInfo device_create_info;
+        std::vector<const char *> device_extensions;
+        vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures();
+
+        VkPhysicalDeviceFeatures2 device_features2;
+        device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+        device_features2.pNext = nullptr;
+        device_features2.features = (VkPhysicalDeviceFeatures)device_features;
+
+        VkPhysicalDeviceVulkan11Features vk11_features;
+        vk11_features.pNext = nullptr;
+        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+        device_features2.pNext = &vk11_features;
+
+        VkPhysicalDeviceVulkan12Features vk12_features;
+        vk12_features.pNext = nullptr;
+        vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
+        vk11_features.pNext = &vk12_features;
+
+        last_struct = (VkBaseOutStructure *)&vk12_features;
+
+        VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features;
+        pl_robustness_features.pNext = nullptr;
+        pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT;
+        pl_robustness_features.pipelineRobustness = VK_FALSE;
+
+        if (pipeline_robustness) {
+            last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features;
+            last_struct = (VkBaseOutStructure *)&pl_robustness_features;
+            device_extensions.push_back("VK_EXT_pipeline_robustness");
+        }
+
+        VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
+        subgroup_size_control_features.pNext = nullptr;
+        subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
+        subgroup_size_control_features.computeFullSubgroups = false;
+        subgroup_size_control_features.subgroupSizeControl = false;
+
+        if (device->subgroup_size_control) {
+            last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
+            last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
+        }
+
+#if defined(VK_KHR_cooperative_matrix)
+        VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
+        coopmat_features.pNext = nullptr;
+        coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
+        coopmat_features.cooperativeMatrix = VK_FALSE;
+
+        if (device->coopmat_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
+            last_struct = (VkBaseOutStructure *)&coopmat_features;
+        }
+#endif
+
+#if defined(VK_NV_cooperative_matrix2)
+        VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {};
+        coopmat2_features.pNext = nullptr;
+        coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV;
+        if (coopmat2_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features;
+            last_struct = (VkBaseOutStructure *)&coopmat2_features;
+            device_extensions.push_back("VK_NV_cooperative_matrix2");
+        }
+#endif
+
+        VkPhysicalDeviceMaintenance4Features maint4_features {};
+        maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES;
+        if (maintenance4_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&maint4_features;
+            last_struct = (VkBaseOutStructure *)&maint4_features;
+            device_extensions.push_back("VK_KHR_maintenance4");
+        }
+
+        vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
+
+        device->fp16 = device->fp16 && vk12_features.shaderFloat16;
+
+        device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
+
+        if (device->subgroup_size_control) {
+            device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
+            device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
+            device_extensions.push_back("VK_EXT_subgroup_size_control");
+        }
+
+        device->subgroup_size_control = device->subgroup_size_control &&
+                (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) &&
+                subgroup_size_control_features.subgroupSizeControl;
+
+        if (device->subgroup_size_control) {
+            device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
+        }
+
+#if defined(VK_KHR_cooperative_matrix)
+        device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
+#endif
+
+        if (coopmat2_support) {
+#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+            if (coopmat2_features.cooperativeMatrixWorkgroupScope &&
+                coopmat2_features.cooperativeMatrixFlexibleDimensions &&
+                coopmat2_features.cooperativeMatrixReductions &&
+                coopmat2_features.cooperativeMatrixConversions &&
+                coopmat2_features.cooperativeMatrixPerElementOperations &&
+                coopmat2_features.cooperativeMatrixTensorAddressing &&
+                coopmat2_features.cooperativeMatrixBlockLoads &&
+                vk12_features.bufferDeviceAddress) {
+
+                std::vector<VkCooperativeMatrixFlexibleDimensionsPropertiesNV> flexible_dimensions;
+                uint32_t count = 0;
+
+                PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV
+                    _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV =
+                        (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV)
+                        vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV");
+
+                _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr);
+
+                VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {};
+                empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV;
+                flexible_dimensions.resize(count, empty_prop);
+
+                _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data());
+
+                bool found_fp16_128 = false,
+                     found_fp16_256 = false,
+                     found_fp32_128 = false,
+                     found_fp32_256 = false;
+                // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128
+                // with 32x16x16 and 256 with 32x32x16.
+                for (auto &prop : flexible_dimensions) {
+                    if (prop.saturatingAccumulation == VK_FALSE &&
+                        prop.scope == VK_SCOPE_WORKGROUP_KHR &&
+                        prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                        prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+
+                        if (prop.workgroupInvocations == 128 &&
+                            prop.MGranularity <= 32 &&
+                            prop.NGranularity <= 16 &&
+                            prop.KGranularity <= 16) {
+                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+                                found_fp16_128 = true;
+                            }
+                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+                                found_fp32_128 = true;
+                            }
+                        }
+                        if (prop.workgroupInvocations == 256 &&
+                            prop.MGranularity <= 32 &&
+                            prop.NGranularity <= 32 &&
+                            prop.KGranularity <= 16) {
+                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+                                found_fp16_256 = true;
+                            }
+                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+                                found_fp32_256 = true;
+                            }
+                        }
+                    }
+                }
+                if (found_fp16_128 && found_fp16_256 &&
+                    found_fp32_128 && found_fp32_256 &&
+                    coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
+                    device->coopmat2 = true;
+                }
+            }
+#endif
+        }
+
+        if (!vk11_features.storageBuffer16BitAccess) {
+            std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
+            throw std::runtime_error("Unsupported device");
+        }
+
+        device_extensions.push_back("VK_KHR_16bit_storage");
+
+#ifdef GGML_VULKAN_VALIDATE
+        device_extensions.push_back("VK_KHR_shader_non_semantic_info");
+#endif
+
+        if (device->fp16) {
+            device_extensions.push_back("VK_KHR_shader_float16_int8");
+        }
+
+#if defined(VK_KHR_cooperative_matrix)
+        if (device->coopmat_support) {
+            // Query supported shapes
+            std::vector<VkCooperativeMatrixPropertiesKHR> cm_props;
+
+            PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR =
+                (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vk_instance.instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR");
+
+            uint32_t cm_props_num;
+
+            pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, nullptr);
+
+            cm_props.resize(cm_props_num);
+
+            for (auto& prop : cm_props) {
+                prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+            }
+
+            pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, cm_props.data());
+
+            VK_LOG_DEBUG("ggml_vulkan: Cooperative Matrix Shapes: " << cm_props.size());
+
+            for (auto& prop : cm_props) {
+                VK_LOG_DEBUG("ggml_vulkan: M: " << prop.MSize << " N: " << prop.NSize << " K: " << prop.KSize << " A: " << vk::to_string((vk::ComponentTypeKHR)prop.AType) << " B: " << vk::to_string((vk::ComponentTypeKHR)prop.BType) << " C: " << vk::to_string((vk::ComponentTypeKHR)prop.CType) << " Result: " << vk::to_string((vk::ComponentTypeKHR)prop.ResultType) << " saturatingAccumulation: " << prop.saturatingAccumulation << " scope: " << vk::to_string((vk::ScopeKHR)prop.scope));
+
+                if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eFloat16 &&
+                    (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eFloat16 &&
+                    (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup
+                ) {
+                    if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat32 &&
+                        (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat32) {
+                        // coopmat sizes not set yet
+                        if (device->coopmat_m == 0) {
+                            device->coopmat_acc_f32_support = true;
+                            device->coopmat_m = prop.MSize;
+                            device->coopmat_n = prop.NSize;
+                            device->coopmat_k = prop.KSize;
+                        } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) {
+                            // Only enable if shape is identical
+                            device->coopmat_acc_f32_support = true;
+                        }
+                    } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 &&
+                               (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) {
+                        // coopmat sizes not set yet
+                        if (device->coopmat_m == 0) {
+                            device->coopmat_acc_f16_support = true;
+                            device->coopmat_m = prop.MSize;
+                            device->coopmat_n = prop.NSize;
+                            device->coopmat_k = prop.KSize;
+                        } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) {
+                            // Only enable if shape is identical
+                            device->coopmat_acc_f16_support = true;
+                        }
+                    }
+                }
+            }
+
+            if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) {
+                // No suitable matmul mode found
+                GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n");
+                device->coopmat_support = false;
+            }
+        }
+
+        if (device->coopmat_support) {
+            device_extensions.push_back("VK_KHR_cooperative_matrix");
+        }
+#endif
+        device->name = GGML_VK_NAME + std::to_string(idx);
+
+        device_create_info = {
+            vk::DeviceCreateFlags(),
+            device_queue_create_infos,
+            {},
+            device_extensions
+        };
+        device_create_info.setPNext(&device_features2);
+        device->device = device->physical_device.createDevice(device_create_info);
+
+        // Queues
+        ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false);
+
+        // Shaders
+        // Disable matmul tile sizes early if performance low or not supported
+        for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
+            switch (device->vendor_id) {
+#ifndef GGML_VULKAN_RUN_TESTS
+            case VK_VENDOR_ID_AMD:
+            case VK_VENDOR_ID_INTEL:
+                device->mul_mat_l[i] = false;
+                device->mul_mat_m[i] = true;
+                device->mul_mat_s[i] = true;
+                device->mul_mat_id_l[i] = false;
+                device->mul_mat_id_m[i] = true;
+                device->mul_mat_id_s[i] = true;
+                break;
+            case VK_VENDOR_ID_APPLE:
+                device->mul_mat_l[i] = false;
+                device->mul_mat_m[i] = true;
+                device->mul_mat_s[i] = false;
+                device->mul_mat_id_l[i] = false;
+                device->mul_mat_id_m[i] = true;
+                device->mul_mat_id_s[i] = false;
+                break;
+#endif
+            default:
+                device->mul_mat_l[i] = true;
+                device->mul_mat_m[i] = true;
+                device->mul_mat_s[i] = true;
+                device->mul_mat_id_l[i] = true;
+                device->mul_mat_id_m[i] = true;
+                device->mul_mat_id_s[i] = true;
+                break;
+            }
+        }
+
+        ggml_vk_load_shaders(device);
+
+        if (!device->single_queue) {
+            const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
+            ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
+        } else {
+            // TODO: Use pointer or reference to avoid copy
+            device->transfer_queue = device->compute_queue;
+        }
+
+        device->buffer_type = {
+            /* .iface    = */ ggml_backend_vk_buffer_type_interface,
+            /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
+            /* .context  = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
+        };
+
+        device->fence = device->device.createFence({});
+
+        device->idx = idx;
+
+        return device;
+    }
+
+    return vk_instance.devices[idx];
+}
+
+static void ggml_vk_print_gpu_info(size_t idx) {
+    GGML_ASSERT(idx < vk_instance.device_indices.size());
+    size_t dev_num = vk_instance.device_indices[idx];
+    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
+    GGML_ASSERT(vk_instance_initialized);
+
+    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+    if (dev_num >= devices.size()) {
+        std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl;
+        throw std::runtime_error("Device not found");
+    }
+
+    vk::PhysicalDevice physical_device = devices[dev_num];
+    std::vector<vk::ExtensionProperties> ext_props = physical_device.enumerateDeviceExtensionProperties();
+
+    vk::PhysicalDeviceProperties2 props2;
+    vk::PhysicalDeviceMaintenance3Properties props3;
+    vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
+    props2.pNext = &props3;
+    props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
+    physical_device.getProperties2(&props2);
+
+    const size_t subgroup_size = subgroup_props.subgroupSize;
+    const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
+
+    bool fp16_storage = false;
+    bool fp16_compute = false;
+    bool coopmat_support = false;
+    bool coopmat2_support = false;
+
+    for (auto properties : ext_props) {
+        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
+            fp16_storage = true;
+        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
+            fp16_compute = true;
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+       } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
+                   !getenv("GGML_VK_DISABLE_COOPMAT")) {
+            coopmat_support = true;
+#endif
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+        } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
+                   !getenv("GGML_VK_DISABLE_COOPMAT2")) {
+            coopmat2_support = true;
+#endif
+        }
+    }
+
+    if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
+        coopmat_support = false;
+    }
+
+    const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
+    bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;
+
+    bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
+
+    vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures();
+
+    VkPhysicalDeviceFeatures2 device_features2;
+    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+    device_features2.pNext = nullptr;
+    device_features2.features = (VkPhysicalDeviceFeatures)device_features;
+
+    VkPhysicalDeviceVulkan11Features vk11_features;
+    vk11_features.pNext = nullptr;
+    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+    device_features2.pNext = &vk11_features;
+
+    VkPhysicalDeviceVulkan12Features vk12_features;
+    vk12_features.pNext = nullptr;
+    vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
+    vk11_features.pNext = &vk12_features;
+
+    // Pointer to the last chain element
+    VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features;
+
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
+    coopmat_features.pNext = nullptr;
+    coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
+    coopmat_features.cooperativeMatrix = VK_FALSE;
+
+    if (coopmat_support) {
+        last_struct->pNext = (VkBaseOutStructure *)&coopmat_features;
+        last_struct = (VkBaseOutStructure *)&coopmat_features;
+    }
+
+    vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
+
+    fp16 = fp16 && vk12_features.shaderFloat16;
+
+    coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;
+#endif
+
+    std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
+
+    std::string device_name = props2.properties.deviceName.data();
+    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n",
+              idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
+              props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str());
+
+    if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
+        GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
+    }
+}
+
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+
+static void ggml_vk_instance_init() {
+    if (vk_instance_initialized) {
+        return;
+    }
+    VK_LOG_DEBUG("ggml_vk_instance_init()");
+
+    uint32_t api_version = vk::enumerateInstanceVersion();
+
+    if (api_version < VK_API_VERSION_1_2) {
+        std::cerr << "ggml_vulkan: Error: Vulkan 1.2 required." << std::endl;
+        GGML_ABORT("fatal error");
+    }
+
+    vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version };
+
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+#ifdef __APPLE__
+    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+#endif
+
+    std::vector<const char*> layers;
+
+    if (validation_ext) {
+        layers.push_back("VK_LAYER_KHRONOS_validation");
+    }
+    std::vector<const char*> extensions;
+    if (validation_ext) {
+        extensions.push_back("VK_EXT_validation_features");
+    }
+#ifdef __APPLE__
+    if (portability_enumeration_ext) {
+        extensions.push_back("VK_KHR_portability_enumeration");
+    }
+#endif
+    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+#ifdef __APPLE__
+    if (portability_enumeration_ext) {
+        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+    }
+#endif
+
+    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+    vk::ValidationFeaturesEXT validation_features;
+
+    if (validation_ext) {
+        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+        validation_features = {
+            features_enable,
+            {},
+        };
+        validation_features.setPNext(nullptr);
+        instance_create_info.setPNext(&validation_features);
+        GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
+    }
+    vk_instance.instance = vk::createInstance(instance_create_info);
+    vk_instance_initialized = true;
+
+    size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
+
+    // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
+    char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
+    if (devices_env != nullptr) {
+        std::string devices(devices_env);
+        std::replace(devices.begin(), devices.end(), ',', ' ');
+
+        std::stringstream ss(devices);
+        size_t tmp;
+        while (ss >> tmp) {
+            if(tmp >= num_available_devices) {
+                std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl;
+                throw std::runtime_error("Invalid Vulkan device index");
+            }
+            vk_instance.device_indices.push_back(tmp);
+        }
+    } else {
+        std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+        // Make sure at least one device exists
+        if (devices.empty()) {
+            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+            return;
+        }
+
+        // Default to using all dedicated GPUs
+        for (size_t i = 0; i < devices.size(); i++) {
+            vk::PhysicalDeviceProperties2 new_props;
+            vk::PhysicalDeviceDriverProperties new_driver;
+            vk::PhysicalDeviceIDProperties new_id;
+            new_props.pNext = &new_driver;
+            new_driver.pNext = &new_id;
+            devices[i].getProperties2(&new_props);
+
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+                // Check if there are two physical devices corresponding to the same GPU
+                auto old_device = std::find_if(
+                    vk_instance.device_indices.begin(),
+                    vk_instance.device_indices.end(),
+                    [&devices, &new_id](const size_t k){
+                        vk::PhysicalDeviceProperties2 old_props;
+                        vk::PhysicalDeviceIDProperties old_id;
+                        old_props.pNext = &old_id;
+                        devices[k].getProperties2(&old_props);
+                        return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                    }
+                );
+                if (old_device == vk_instance.device_indices.end()) {
+                    vk_instance.device_indices.push_back(i);
+                } else {
+                    // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                    // This can cause error when splitting layers aross the devices, need to keep only 1
+                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
+
+                    vk::PhysicalDeviceProperties2 old_props;
+                    vk::PhysicalDeviceDriverProperties old_driver;
+                    old_props.pNext = &old_driver;
+                    devices[*old_device].getProperties2(&old_props);
+
+                    std::map<vk::DriverId, int> driver_priorities {};
+                    int old_priority = std::numeric_limits<int>::max();
+                    int new_priority = std::numeric_limits<int>::max();
+
+                    // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                    // Smaller number -> higher priority
+                    switch (old_props.properties.vendorID) {
+                        case VK_VENDOR_ID_AMD:
+                            driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                            driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                            driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                            break;
+                        case VK_VENDOR_ID_INTEL:
+                            driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                            driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                            break;
+                        case VK_VENDOR_ID_NVIDIA:
+                            driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                            driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                            break;
+                    }
+
+                    if (driver_priorities.count(old_driver.driverID)) {
+                        old_priority = driver_priorities[old_driver.driverID];
+                    }
+                    if (driver_priorities.count(new_driver.driverID)) {
+                        new_priority = driver_priorities[new_driver.driverID];
+                    }
+
+                    if (new_priority < old_priority) {
+                        auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                        vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                        vk_instance.device_indices.push_back(i);
+
+                        VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
+                    }
+                    else {
+                        VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl);
+                    }
+                }
+            }
+        }
+
+        // If no dedicated GPUs found, fall back to GPU 0
+        if (vk_instance.device_indices.empty()) {
+            vk_instance.device_indices.push_back(0);
+        }
+    }
+    GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
+
+    for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+        ggml_vk_print_gpu_info(i);
+    }
+}
+
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+    VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
+    ggml_vk_instance_init();
+    GGML_ASSERT(idx < vk_instance.device_indices.size());
+
+    ctx->name = GGML_VK_NAME + std::to_string(idx);
+
+    ctx->device = ggml_vk_get_device(idx);
+
+    ctx->semaphore_idx = 0;
+    ctx->event_idx = 0;
+
+    ctx->prealloc_size_x = 0;
+    ctx->prealloc_size_y = 0;
+    ctx->prealloc_size_split_k = 0;
+
+    ctx->fence = ctx->device->device.createFence({});
+
+#ifdef GGML_VULKAN_CHECK_RESULTS
+    const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
+    vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
+    const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR");
+    vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor));
+#endif
+}
+
+static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
+    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            break;
+        default:
+            return nullptr;
+    }
+
+    return ctx->device->pipeline_dequant[type];
+}
+
+static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+        return ctx->device->pipeline_matmul_f32;
+    }
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f32_f16;
+    }
+    if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_matmul_f16_f32.f16acc;
+        }
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_matmul_f16.f16acc;
+        }
+    } else {
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_matmul_f16_f32.f32acc;
+        }
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_matmul_f16.f32acc;
+        }
+    }
+
+    if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) {
+        return nullptr;
+    }
+
+    switch (src0_type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            break;
+        default:
+            return nullptr;
+    }
+
+    if (ctx->device->coopmat2) {
+        assert(src1_type == GGML_TYPE_F16);
+        return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc;
+    }
+    return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
+}
+
+static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) {
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
+    GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
+    GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols);
+
+    switch (a_type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            break;
+        default:
+            return nullptr;
+    }
+
+    return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type][num_cols-1];
+}
+
+static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+        return ctx->device->pipeline_matmul_id_f32;
+    }
+    if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_matmul_id_f16_f32.f16acc;
+        }
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_matmul_id_f16.f16acc;
+        }
+    } else {
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_matmul_id_f16_f32.f32acc;
+        }
+        if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_matmul_id_f16.f32acc;
+        }
+    }
+
+    GGML_ASSERT(src1_type == GGML_TYPE_F32 || (ctx->device->coopmat2 && src1_type == GGML_TYPE_F16));
+
+    switch (src0_type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            break;
+        default:
+            return nullptr;
+    }
+
+    return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc;
+}
+
+static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
+    GGML_ASSERT(b_type == GGML_TYPE_F32);
+
+    switch (a_type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ4_NL:
+            break;
+        default:
+            return nullptr;
+    }
+
+    return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[a_type];
+}
+
+static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
+    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+    VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
+    for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
+        vk_buffer &b = ctx->buffer_pool[i];
+        if (b != nullptr && b->size >= size && b->size < best_size) {
+            best_i = i;
+            best_size = b->size;
+        }
+        if (b != nullptr && b->size > worst_size) {
+            worst_i = i;
+            worst_size = b->size;
+        }
+    }
+    if(best_i != -1) {
+        //found the smallest buffer that fits our needs
+        vk_buffer b = ctx->buffer_pool[best_i];
+        ctx->buffer_pool[best_i].reset();
+        return b;
+    }
+    if(worst_i != -1) {
+        //no buffer that fits our needs, resize largest one to save memory
+        vk_buffer& b = ctx->buffer_pool[worst_i];
+        ggml_vk_destroy_buffer(b);
+    }
+
+    return ggml_vk_create_buffer_device(ctx->device, size);
+}
+
+static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
+    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
+    for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
+        vk_buffer& b = ctx->buffer_pool[i];
+        if (b == nullptr) {
+            b = buffer;
+            return;
+        }
+    }
+    std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl;
+    ggml_vk_destroy_buffer(buffer);
+}
+
+// Returns an available temporary buffer that may only be used temporarily, it will be reused
+static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) {
+    // Try to find existing temp buffer with enough capacity
+    for (auto& buffer : ctx->gc.temp_buffers) {
+        if (buffer->size >= size) {
+            return buffer;
+        }
+    }
+
+    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
+    // Otherwise create new buffer
+    vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
+    ctx->gc.temp_buffers.push_back(buf);
+
+    return buf;
+}
+
+static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
+    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
+    vk_buffer buf = ggml_vk_create_buffer(device, size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
+            size/1024.0/1024.0);
+        device->device.freeMemory(buf->device_memory);
+        device->device.destroyBuffer(buf->buffer);
+        return nullptr;
+    }
+
+    device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
+
+    return buf->ptr;
+}
+
+static void ggml_vk_host_free(vk_device& device, void* ptr) {
+    if (ptr == nullptr) {
+        return;
+    }
+    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
+    vk_buffer buf;
+    size_t index;
+    for (size_t i = 0; i < device->pinned_memory.size(); i++) {
+        const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]);
+        const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]);
+        if (ptr >= addr && ptr < endr) {
+            buf = std::get<2>(device->pinned_memory[i]);
+            index = i;
+            break;
+        }
+    }
+    if (buf == nullptr) {
+        fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n");
+        return;
+    }
+
+    ggml_vk_destroy_buffer(buf);
+
+    device->pinned_memory.erase(device->pinned_memory.begin() + index);
+}
+
+static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
+    buf = nullptr;
+    buf_offset = 0;
+    for (size_t i = 0; i < device->pinned_memory.size(); i++) {
+        const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]);
+        const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]);
+        if (ptr >= addr && ptr < endr) {
+            buf = std::get<2>(device->pinned_memory[i]);
+            buf_offset = ((const uint8_t *)ptr) - addr;
+            break;
+        }
+    }
+}
+
+static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
+    vk_submission s;
+    s.buffer = ggml_vk_create_cmd_buffer(device, q);
+    if (one_time) {
+        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+    } else {
+        s.buffer.begin({ vk::CommandBufferUsageFlags{} });
+    }
+
+    return s;
+}
+
+
+
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
+    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
+    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
+    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
+    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+    for (auto& buffer : descriptor_buffer_infos) {
+        std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
+    }
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
+    GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
+    GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
+
+    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
+    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
+
+    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
+    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
+    subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                pipeline->layout,
+                                0,
+                                { descriptor_set },
+                                {});
+    subctx->s->buffer.dispatch(wg0, wg1, wg2);
+}
+
+static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
+    s.buffer.end();
+
+    s.wait_semaphores = std::move(wait_semaphores);
+    s.signal_semaphores = std::move(signal_semaphores);
+}
+
+static void ggml_vk_ctx_end(vk_context& ctx) {
+    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
+    if (ctx->s == nullptr) {
+        return;
+    }
+
+    ctx->s->buffer.end();
+    ctx->s = nullptr;
+}
+
+static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
+    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
+    if (subctx->s != nullptr) {
+        ggml_vk_ctx_end(subctx);
+    }
+
+    subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
+    subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
+}
+
+static size_t ggml_vk_align_size(size_t width, size_t align) {
+    VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
+    return CEIL_DIV(width, align) * align;
+}
+
+static void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys = nullptr) {
+    if (memcpys == nullptr) {
+        memcpy(dst, src, size);
+    } else {
+        memcpys->emplace_back(dst, src, size);
+    }
+}
+
+static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
+    if (device->sync_staging == nullptr || device->sync_staging->size < size) {
+        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
+        ggml_vk_destroy_buffer(device->sync_staging);
+        device->sync_staging = ggml_vk_create_buffer_check(device, size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+    }
+}
+
+static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
+    VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
+    GGML_ASSERT(!ggml_is_contiguous(tensor));
+    // Buffer is already mapped
+    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+        std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
+        GGML_ABORT("fatal error");
+    }
+    // Check if src is pinned memory
+    vk_buffer buf = nullptr;
+    size_t buf_offset = 0;
+    ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
+
+    const uint64_t ne0 = tensor->ne[0];
+    const uint64_t ne1 = tensor->ne[1];
+    const uint64_t ne2 = tensor->ne[2];
+    const uint64_t ne3 = tensor->ne[3];
+    const uint64_t nb0 = tensor->nb[0];
+    const uint64_t nb1 = tensor->nb[1];
+    const uint64_t nb2 = tensor->nb[2];
+    const uint64_t nb3 = tensor->nb[3];
+    const ggml_type type = tensor->type;
+    const uint64_t ts = ggml_type_size(type);
+    const uint64_t bs = ggml_blck_size(type);
+
+    const uint64_t dstnb0 = ts;
+    const uint64_t dstnb1 = dstnb0*(ne0/bs);
+    const uint64_t dstnb2 = dstnb1*ne1;
+    const uint64_t dstnb3 = dstnb2*ne2;
+
+    const uint64_t ne = ggml_nelements(tensor);
+
+    if (buf != nullptr) {
+        // Memory is pinned, use as staging buffer
+        std::vector<vk::BufferCopy> slices;
+
+        for (uint64_t i3 = 0; i3 < ne3; i3++) {
+            for (uint64_t i2 = 0; i2 < ne2; i2++) {
+                // Find longest contiguous slice
+                if (ne1*nb1 == dstnb2) {
+                    slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
+                } else {
+                    for (uint64_t i1 = 0; i1 < ne1; i1++) {
+                        if (ne0*nb0/bs == dstnb1) {
+                            slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
+                        } else {
+                            const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
+                            const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
+                            for (uint64_t i0 = 0; i0 < ne0; i0++) {
+                                slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 });
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        ggml_vk_sync_buffers(subctx);
+        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
+        return;
+    }
+
+    if (!sync_staging) {
+        GGML_ABORT("Asynchronous write to non-pinned memory not supported");
+    }
+
+    // Staging buffer required
+    vk_buffer& staging = ctx->device->sync_staging;
+    const uint64_t copy_size = ts*ne/bs;
+    ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
+    VkBufferCopy buf_copy{ 0, offset, copy_size };
+
+    ggml_vk_sync_buffers(subctx);
+    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+
+    for (uint64_t i3 = 0; i3 < ne3; i3++) {
+        for (uint64_t i2 = 0; i2 < ne2; i2++) {
+            // Find longest contiguous slice
+            if (ne1*nb1 == dstnb2) {
+                deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
+            } else {
+                for (uint64_t i1 = 0; i1 < ne1; i1++) {
+                    if (ne0*nb0/bs == dstnb1) {
+                        deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
+                    } else {
+                        const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
+                        const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
+                        for (uint64_t i0 = 0; i0 < ne0; i0++) {
+                            deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
+    // Buffer is already mapped
+    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+        std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
+        GGML_ABORT("fatal error");
+    }
+    // Check if src is pinned memory
+    vk_buffer buf = nullptr;
+    size_t buf_offset = 0;
+    ggml_vk_host_get(dst->device, src, buf, buf_offset);
+
+    if (buf != nullptr) {
+        // Memory is pinned, use as staging buffer
+        std::vector<vk::BufferCopy> slices(1);
+        if (width == spitch) {
+            // Only do single write if stride is equal
+            slices[0].srcOffset = buf_offset;
+            slices[0].dstOffset = offset;
+            slices[0].size = width * height;
+        } else {
+            slices.resize(height);
+            for (size_t i = 0; i < height; i++) {
+                slices[i].srcOffset = buf_offset + i * spitch;
+                slices[i].dstOffset = offset + i * width;
+                slices[i].size = width;
+            }
+        }
+
+        ggml_vk_sync_buffers(subctx);
+        subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
+        return;
+    }
+    VK_LOG_DEBUG("STAGING");
+
+    if (!sync_staging) {
+        GGML_ABORT("Asynchronous write to non-pinned memory not supported");
+    }
+
+    // Staging buffer required
+    const size_t copy_size = width*height;
+    ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size);
+
+    vk_buffer& staging_buffer = dst->device->sync_staging;
+
+    VkBufferCopy buf_copy = {
+        0,
+        offset,
+        copy_size};
+
+    ggml_vk_sync_buffers(subctx);
+    vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+
+    if (width == spitch) {
+        deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
+    } else {
+        for (size_t i = 0; i < height; i++) {
+            deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
+        }
+    }
+}
+
+static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
+    VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
+    return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging);
+}
+
+static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
+    VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
+    // Buffer is already mapped
+    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+        GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
+
+        for (size_t i = 0; i < height; i++) {
+            memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
+        }
+    } else {
+        vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+        ggml_vk_ctx_begin(dst->device, subctx);
+        ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
+        ggml_vk_ctx_end(subctx);
+
+        for (auto& cpy : subctx->in_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+
+        ggml_vk_submit(subctx, dst->device->fence);
+        VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
+        dst->device->device.resetFences({ dst->device->fence });
+    }
+}
+
+static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) {
+    VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
+    ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
+}
+
+static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
+    VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
+    GGML_ASSERT(width > 0);
+    GGML_ASSERT(height > 0);
+    GGML_ASSERT(src != nullptr);
+
+    // TODO: staging_offset is not used
+
+    // Check if dst is pinned memory
+    vk_buffer buf = nullptr;
+    size_t buf_offset = 0;
+    ggml_vk_host_get(src->device, dst, buf, buf_offset);
+
+    std::vector<vk::BufferCopy> slices(1);
+    if (width == spitch && width == dpitch) {
+        // Only do single write if stride is equal
+        slices[0].srcOffset = offset;
+        slices[0].dstOffset = buf_offset;
+        slices[0].size = width * height;
+    } else {
+        slices.resize(height);
+        for (size_t i = 0; i < height; i++) {
+            slices[i].srcOffset = offset + i * spitch;
+            slices[i].dstOffset = buf_offset + i * dpitch;
+            slices[i].size = width;
+        }
+    }
+
+    if (buf != nullptr) {
+        // Memory is pinned, use as staging buffer
+        ggml_vk_sync_buffers(subctx);
+        subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
+
+        return;
+    }
+    VK_LOG_DEBUG("STAGING");
+
+    if (!sync_staging) {
+        GGML_ABORT("Asynchronous read from non-pinned memory not supported");
+    }
+
+    // Fall back to staging buffer
+    const size_t copy_size = dpitch * height;
+    ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
+
+    vk_buffer& staging_buffer = src->device->sync_staging;
+
+    ggml_vk_sync_buffers(subctx);
+    subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices);
+
+    deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
+}
+
+static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
+    return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging);
+}
+
+static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
+    VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
+
+    // If the device is not an UMA device the memory is host-accessible through rebar. While writing
+    // through PCIe is sufficient fast reading back data from PCIe is slower than going through
+    // the HW device to host copy path.
+    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
+        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
+
+        memcpy(dst, (uint8_t *) src->ptr + offset, size);
+    } else {
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+        ggml_vk_ctx_begin(src->device, subctx);
+        ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
+        ggml_vk_ctx_end(subctx);
+
+        ggml_vk_submit(subctx, src->device->fence);
+        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
+        src->device->device.resetFences({ src->device->fence });
+
+        for (auto& cpy : subctx->out_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+    }
+}
+
+static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
+    VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
+    // Make sure both buffers are on same device
+    GGML_ASSERT(src->device == dst->device);
+
+    VkBufferCopy bc{ src_offset, dst_offset, size };
+
+    vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
+}
+
+static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
+    if (src->device == dst->device) {
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
+        // Copy within the device
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+        ggml_vk_ctx_begin(src->device, subctx);
+        ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
+        ggml_vk_ctx_end(subctx);
+        ggml_vk_submit(subctx, src->device->fence);
+        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
+        src->device->device.resetFences({ src->device->fence });
+    } else {
+        VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
+        // Copy device to device
+        ggml_vk_ensure_sync_staging_buffer(src->device, size);
+        ggml_vk_ensure_sync_staging_buffer(dst->device, size);
+
+        // Copy to src staging buffer
+        ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
+        // memcpy to dst staging buffer
+        memcpy(dst->device->sync_staging->ptr, src->device->sync_staging->ptr, size);
+        // Copy to dst buffer
+        ggml_vk_buffer_copy(dst, dst_offset, dst->device->sync_staging, 0, size);
+    }
+}
+
+static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
+    VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
+
+    vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+    ggml_vk_ctx_begin(dst->device, subctx);
+    subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
+    ggml_vk_ctx_end(subctx);
+
+    ggml_vk_submit(subctx, dst->device->fence);
+    VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
+    dst->device->device.resetFences({ dst->device->fence });
+}
+
+static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
+    VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
+
+    uint32_t split_k = 1;
+    if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
+        // If k is 'large' and the SMs will fill less than halfway, use split_k.
+        uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
+        uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
+        if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
+            split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
+            // Clamp to 2 or 4
+            split_k = std::min(split_k, 4u);
+            if (split_k == 3) {
+                split_k = 2;
+            }
+        }
+    }
+
+    return split_k;
+}
+
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
+
+    if (ctx->device->coopmat2) {
+        if ((ctx->device->mul_mat_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
+            return aligned ? mmp->a_l : mmp->l;
+        }
+        if ((ctx->device->mul_mat_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s[src0_type]) {
+            return aligned ? mmp->a_m : mmp->m;
+        }
+        return aligned ? mmp->a_s : mmp->s;
+    }
+
+    if ((ctx->device->mul_mat_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_l[src0_type])) {
+        return aligned ? mmp->a_s : mmp->s;
+    }
+    if ((ctx->device->mul_mat_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l[src0_type]) {
+        return aligned ? mmp->a_m : mmp->m;
+    }
+    return aligned ? mmp->a_l : mmp->l;
+}
+
+static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
+    return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type)->align;
+}
+
+static void ggml_vk_matmul(
+        ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
+        vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
+        uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
+        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
+        uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
+        VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
+    ggml_vk_sync_buffers(subctx);
+    if (split_k == 1) {
+        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
+        return;
+    }
+
+    GGML_ASSERT(batch_stride_d == m * n);
+
+    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 };
+    // Make sure enough workgroups get assigned for split k to work
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
+    ggml_vk_sync_buffers(subctx);
+    const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
+}
+
+static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
+
+    if (ctx->device->coopmat2) {
+        if ((ctx->device->mul_mat_id_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
+            return aligned ? mmp->a_l : mmp->l;
+        }
+        if ((ctx->device->mul_mat_id_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s[src0_type]) {
+            return aligned ? mmp->a_m : mmp->m;
+        }
+        return aligned ? mmp->a_s : mmp->s;
+    }
+
+    if ((ctx->device->mul_mat_id_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_l[src0_type])) {
+        return aligned ? mmp->a_s : mmp->s;
+    }
+    if ((ctx->device->mul_mat_id_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l[src0_type]) {
+        return aligned ? mmp->a_m : mmp->m;
+    }
+    return aligned ? mmp->a_l : mmp->l;
+}
+
+static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
+    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
+    return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true, src0_type)->align;
+}
+
+static void ggml_vk_matmul_id(
+        ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline,
+        vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
+        uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
+        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
+        uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
+    VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
+        "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
+        "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
+        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
+    ggml_vk_sync_buffers(subctx);
+    const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
+                                              nei0, nei1, nbi1, ne11 };
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
+}
+
+static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+
+    // Choose "contiguous copy" shader if src/dst are contiguous
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
+    }
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
+    }
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
+    }
+    if (src->type == GGML_TYPE_F32) {
+        switch (to) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
+            return ctx->device->pipeline_cpy_f32_quant[to];
+        default:
+            break;
+        }
+    }
+
+    if (to == GGML_TYPE_F32) {
+        switch (src->type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
+            return ctx->device->pipeline_cpy_quant_f32[src->type];
+        default:
+            break;
+        }
+    }
+
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
+    GGML_ABORT("fatal error");
+}
+
+static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
+    VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
+    std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")");
+    const int tensor_type_size = ggml_type_size(tensor->type);
+
+    const uint32_t ne = ggml_nelements(tensor);
+    std::array<uint32_t, 3> elements;
+
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }
+
+    vk_op_unary_push_constants pc = {
+        (uint32_t)ne,
+        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
+        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3],                       1                   , (uint32_t)tensor->ne[0]                   , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    init_pushconst_fastdiv(pc);
+    ggml_vk_sync_buffers(subctx);
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
+}
+
+static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
+
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    const uint64_t ne03 = src0->ne[3];
+
+    const uint64_t ne10 = src1->ne[0];
+    const uint64_t ne11 = src1->ne[1];
+    const uint64_t ne12 = src1->ne[2];
+    const uint64_t ne13 = src1->ne[3];
+
+    const uint64_t ne20 = dst->ne[0];
+    const uint64_t ne21 = dst->ne[1];
+
+    const uint64_t r2 = ne12 / ne02;
+    const uint64_t r3 = ne13 / ne03;
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+
+    vk_buffer d_Qx = nullptr;
+    size_t qx_buf_offset = 0;
+    vk_buffer d_Qy = nullptr;
+    size_t qy_buf_offset = 0;
+
+    bool src0_uma = false;
+    bool src1_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset);
+        ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset);
+        src0_uma = d_Qx != nullptr;
+        src1_uma = d_Qy != nullptr;
+    }
+
+    // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
+    const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
+                              !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
+                              !ggml_vk_dim01_contiguous(src1);
+
+    const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+    vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]);
+
+    const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
+    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
+
+    if (qx_needs_dequant) {
+        // Fall back to dequant + f16 mulmat
+        mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]);
+    }
+
+    // Not implemented
+    GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
+
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+
+    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
+    const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
+
+    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
+
+    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
+
+    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
+    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
+    const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
+    const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
+        if (
+                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
+                (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+        if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) {
+            ctx->prealloc_size_split_k = split_k_size;
+        }
+
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        if (split_k > 1) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
+        }
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
+    vk_buffer d_X;
+    uint64_t x_buf_offset = 0;
+    vk_buffer d_Y;
+    uint64_t y_buf_offset = 0;
+    if (!src0_uma) {
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+        GGML_ASSERT(d_Qx != nullptr);
+    }
+    if (!src1_uma) {
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Qy != nullptr);
+    }
+    if (qx_needs_dequant) {
+        d_X = ctx->prealloc_x;
+        GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
+    } else {
+        d_X = d_Qx;
+        x_buf_offset = qx_buf_offset;
+        GGML_ASSERT(qx_sz == x_sz);
+    }
+    if (qy_needs_dequant) {
+        d_Y = ctx->prealloc_y;
+        GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03);
+    } else {
+        d_Y = d_Qy;
+        y_buf_offset = qy_buf_offset;
+        GGML_ASSERT(qy_sz == y_sz);
+    }
+
+    if (x_non_contig) {
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+    }
+    if (y_non_contig) {
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+    }
+
+    uint32_t stride_batch_x = ne00*ne01;
+    uint32_t stride_batch_y = ne10*ne11;
+
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
+        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
+    }
+
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
+        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
+    }
+
+    // compute
+    ggml_vk_matmul(
+        ctx, subctx, pipeline,
+        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
+        { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
+        ne01, ne11, ne10,
+        ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
+        split_k, ne12*ne13, ne02, ne12, r2, r3
+    );  // NOLINT
+}
+
+static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << "),)");
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
+
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    const uint64_t ne03 = src0->ne[3];
+
+    const uint64_t ne10 = src1->ne[0];
+    const uint64_t ne11 = src1->ne[1];
+    const uint64_t ne12 = src1->ne[2];
+    const uint64_t ne13 = src1->ne[3];
+
+    const uint64_t ne20 = dst->ne[0];
+    const uint64_t ne21 = dst->ne[1];
+    const uint64_t ne22 = dst->ne[2];
+    const uint64_t ne23 = dst->ne[3];
+
+    const uint64_t r2 = ne12 / ne02;
+    const uint64_t r3 = ne13 / ne03;
+
+    // batch_n indicates that we need to compute a few vector results, and this assumes
+    // ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides.
+    GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1);
+    bool batch_n = ne11 > 1;
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+
+    vk_buffer d_Qx = nullptr;
+    size_t qx_buf_offset = 0;
+    vk_buffer d_Qy = nullptr;
+    size_t qy_buf_offset = 0;
+
+    bool src0_uma = false;
+    bool src1_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset);
+        ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset);
+        src0_uma = d_Qx != nullptr;
+        src1_uma = d_Qy != nullptr;
+    }
+
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
+
+    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
+
+    const bool qx_needs_dequant = x_non_contig;
+    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
+
+    // Not implemented
+    GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
+
+    const uint64_t x_ne = ne01 * ne00;
+    const uint64_t y_ne = ne11 * ne10;
+    const uint64_t d_ne = ne11 * ne01;
+
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
+    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
+    const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11);
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
+    GGML_ASSERT(dmmv != nullptr);
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (
+                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+
+        // Request descriptor sets
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    vk_buffer d_X;
+    uint64_t x_buf_offset = 0;
+    vk_buffer d_Y;
+    uint64_t y_buf_offset = 0;
+    if(!src0_uma) {
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+        GGML_ASSERT(d_Qx != nullptr);
+    }
+    if(!src1_uma) {
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Qy != nullptr);
+    }
+    if (qx_needs_dequant) {
+        d_X = ctx->prealloc_x;
+    } else {
+        d_X = d_Qx;
+        x_buf_offset = qx_buf_offset;
+        GGML_ASSERT(qx_sz == x_sz);
+    }
+    if (qy_needs_dequant) {
+        d_Y = ctx->prealloc_y;
+    } else {
+        d_Y = d_Qy;
+        y_buf_offset = qy_buf_offset;
+        GGML_ASSERT(qy_sz == y_sz);
+    }
+
+    if (x_non_contig) {
+        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+    }
+    if (y_non_contig) {
+        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+    }
+
+    // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
+    uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01;
+    uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11);
+    uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21);
+
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
+        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
+    }
+
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
+        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
+    }
+
+    const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
+
+    uint32_t groups_x = ne01;
+    uint32_t groups_z = 1;
+
+    if (ne01 > max_groups_x) {
+        groups_z = 64;
+        groups_x = CEIL_DIV(groups_x, groups_z);
+    }
+
+    // compute
+    const vk_mat_vec_push_constants pc = {
+        (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
+        stride_batch_x, stride_batch_y, stride_batch_d,
+        (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
+    };
+    ggml_vk_sync_buffers(subctx);
+    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
+                              { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
+                              sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
+}
+
+static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]);  // NOLINT
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]);  // NOLINT
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    // const uint64_t ne03 = src0->ne[3];
+
+    const uint64_t ne10 = src1->ne[0];
+    const uint64_t ne11 = src1->ne[1];
+    const uint64_t ne12 = src1->ne[2];
+    // const uint64_t ne13 = src1->ne[3];
+
+    GGML_ASSERT(ne11 == 1);
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+
+    vk_buffer d_Qy = nullptr;
+    size_t qy_buf_offset = 0;
+
+    bool src1_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset);
+        src1_uma = d_Qy != nullptr;
+    }
+
+    const uint64_t x_ne = ne00 * ne01 * ne02;
+    const uint64_t y_ne = ne10 * ne11 * ne12;
+    const uint64_t d_ne = ne01 * ne11 * ne12;
+
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
+    if (dryrun) {
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+    GGML_ASSERT(d_Qx != nullptr);
+    if (!src1_uma) {
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Qx != nullptr);
+    }
+
+    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
+
+    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
+
+    // compute
+    const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
+    ggml_vk_sync_buffers(subctx);
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+}
+
+static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    // const uint64_t ne03 = src0->ne[3];
+
+    const uint64_t nb01 = src0->nb[1];
+    const uint64_t nb02 = src0->nb[2];
+
+    // const uint64_t ne10 = src1->ne[0];
+    const uint64_t ne11 = src1->ne[1];
+    const uint64_t ne12 = src1->ne[2];
+    // const uint64_t ne13 = src1->ne[3];
+
+    GGML_ASSERT(ne11 == 1);
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+
+    vk_buffer d_Qy = nullptr;
+    size_t qy_buf_offset = 0;
+
+    bool src1_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset);
+        src1_uma = d_Qy != nullptr;
+    }
+
+    const uint64_t d_ne = ne01 * ne11 * ne12;
+
+    const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
+    const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
+
+    const uint64_t qx_sz = ggml_nbytes(src0);
+    const uint64_t qy_sz = ggml_nbytes(src1);
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
+    if (dryrun) {
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+    GGML_ASSERT(d_Qx != nullptr);
+    if (!src1_uma) {
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Qx != nullptr);
+    }
+
+    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
+
+    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
+
+    // compute
+    const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
+    ggml_vk_sync_buffers(subctx);
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+}
+
+static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")");
+    if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 &&
+        // detect 0213 permutation, and batch size of 1
+        src0->nb[0] <= src0->nb[2] &&
+        src0->nb[2] <= src0->nb[1] &&
+        src0->nb[1] <= src0->nb[3] &&
+        src1->nb[0] <= src1->nb[2] &&
+        src1->nb[2] <= src1->nb[1] &&
+        src1->nb[1] <= src1->nb[3] &&
+        src0->ne[3] == 1 &&
+        src1->ne[3] == 1) {
+        ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
+    } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
+               !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
+        ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
+    // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
+    // when ne12 and ne13 are one.
+    } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) &&
+               (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
+        ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
+    } else {
+        ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun);
+    }
+}
+
+static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)");
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    const uint64_t ne03 = src0->ne[3];
+
+    const uint64_t ne10 = src1->ne[0];
+    const uint64_t ne11 = src1->ne[1];
+    const uint64_t ne12 = src1->ne[2];
+    const uint64_t ne13 = src1->ne[3];
+
+    const uint64_t nei0 = ids->ne[0];
+    const uint64_t nei1 = ids->ne[1];
+    GGML_ASSERT(nei0 * nei1 <= 3072);
+
+    const uint32_t nbi1 = ids->nb[1];
+    const uint32_t nbi2 = ids->nb[2];
+
+    const uint64_t ne20 = dst->ne[0];
+    const uint64_t ne21 = dst->ne[1];
+    const uint64_t ne22 = dst->ne[2];
+    const uint64_t ne23 = dst->ne[3];
+
+    const uint64_t n_as = ne02;
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
+
+    vk_buffer d_Qx = nullptr;
+    size_t qx_buf_offset = 0;
+    vk_buffer d_Qy = nullptr;
+    size_t qy_buf_offset = 0;
+    vk_buffer d_ids = nullptr;
+    size_t ids_buf_offset = 0;
+
+    bool src0_uma = false;
+    bool src1_uma = false;
+    bool ids_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset);
+        ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset);
+        ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset);
+        src0_uma = d_Qx != nullptr;
+        src1_uma = d_Qy != nullptr;
+        ids_uma = d_ids != nullptr;
+    }
+
+    // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
+    const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
+                              !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
+                              !ggml_vk_dim01_contiguous(src1);
+
+    const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+    vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]);
+
+    const bool qx_needs_dequant = mmp == nullptr || x_non_contig;
+    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
+
+    if (qx_needs_dequant) {
+        // Fall back to dequant + f16 mulmat
+        mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]);
+    }
+
+    // Not implemented
+    GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
+
+    const uint64_t x_ne = ne01 * ne00;
+    const uint64_t y_ne = ne11 * ne10;
+    const uint64_t d_ne = ne21 * ne20;
+
+    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
+    const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;
+
+    vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
+
+    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
+    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
+    const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
+    const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
+    const uint64_t ids_sz = nbi2;
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (
+                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    vk_buffer d_X;
+    uint64_t x_buf_offset = 0;
+    vk_buffer d_Y;
+    uint64_t y_buf_offset = 0;
+    if (!src0_uma) {
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+        GGML_ASSERT(d_Qx != nullptr);
+    }
+    if (!src1_uma) {
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Qy != nullptr);
+    }
+    if (!ids_uma) {
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
+        GGML_ASSERT(d_ids != nullptr);
+    }
+    if (qx_needs_dequant) {
+        d_X = ctx->prealloc_x;
+        GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03);
+    } else {
+        d_X = d_Qx;
+        x_buf_offset = qx_buf_offset;
+        GGML_ASSERT(qx_sz == x_sz);
+    }
+    if (qy_needs_dequant) {
+        d_Y = ctx->prealloc_y;
+        GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03);
+    } else {
+        d_Y = d_Qy;
+        y_buf_offset = qy_buf_offset;
+        GGML_ASSERT(qy_sz == y_sz);
+    }
+
+    if (x_non_contig) {
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+    } else if (qx_needs_dequant) {
+        const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+            { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+    }
+    if (y_non_contig) {
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+    }
+
+    uint32_t stride_batch_x = ne00*ne01;
+    uint32_t stride_batch_y = ne10*ne11;
+
+    if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
+        stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
+    }
+
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
+        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
+    }
+
+    // compute
+    ggml_vk_matmul_id(
+        ctx, subctx, pipeline,
+        { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 },
+        { d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz },
+        ne01, ne21, ne10, ne10, ne10, ne01,
+        stride_batch_x, stride_batch_y, ne20*ne21,
+        n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11
+    );  // NOLINT
+}
+
+static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);  // NOLINT
+    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    const uint64_t ne03 = src0->ne[3];
+
+    const uint64_t ne10 = src1->ne[0];
+    const uint64_t ne11 = src1->ne[1];
+    const uint64_t ne12 = src1->ne[2];
+    const uint64_t ne13 = src1->ne[3];
+
+    const uint64_t nei0 = ids->ne[0];
+    const uint64_t nei1 = ids->ne[1];
+
+    const uint64_t nbi2 = ids->nb[2];
+
+    GGML_ASSERT(nei1 == 1);
+
+    const uint64_t ne20 = dst->ne[0];
+    const uint64_t ne21 = dst->ne[1];
+    const uint64_t ne22 = dst->ne[2];
+    const uint64_t ne23 = dst->ne[3];
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
+
+    vk_buffer d_Qx = nullptr;
+    size_t qx_buf_offset = 0;
+    vk_buffer d_Qy = nullptr;
+    size_t qy_buf_offset = 0;
+    vk_buffer d_ids = nullptr;
+    size_t ids_buf_offset = 0;
+
+    bool src0_uma = false;
+    bool src1_uma = false;
+    bool ids_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset);
+        ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset);
+        ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset);
+        src0_uma = d_Qx != nullptr;
+        src1_uma = d_Qy != nullptr;
+        ids_uma = d_ids != nullptr;
+    }
+
+    const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
+    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
+
+    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
+
+    const bool qx_needs_dequant = x_non_contig;
+    const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
+
+    // Not implemented
+    GGML_ASSERT(y_non_contig || !qy_needs_dequant);  // NOLINT
+
+    const uint64_t x_ne = ne01 * ne00;
+    const uint64_t y_ne = ne11 * ne10;
+    const uint64_t d_ne = ne21 * ne20;
+
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
+    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
+    const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
+    const uint64_t ids_sz = nbi2;
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
+    if (x_non_contig) {
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
+    }
+    if (y_non_contig) {
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
+    } else {
+        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
+    }
+    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type);
+    GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
+    GGML_ASSERT(dmmv != nullptr);
+
+    if (dryrun) {
+        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
+        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
+        if (
+                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+            GGML_ABORT("Requested preallocation size is too large");
+        }
+        if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
+            ctx->prealloc_size_x = x_sz_upd;
+        }
+        if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) {
+            ctx->prealloc_size_y = y_sz_upd;
+        }
+
+        // Request descriptor sets
+        if (qx_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+        }
+        if (qy_needs_dequant) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+        }
+        ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+        return;
+    }
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    GGML_ASSERT(d_D != nullptr);
+    vk_buffer d_X;
+    uint64_t x_buf_offset = 0;
+    vk_buffer d_Y;
+    uint64_t y_buf_offset = 0;
+    if(!src0_uma) {
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+        GGML_ASSERT(d_Qx != nullptr);
+    }
+    if(!src1_uma) {
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Qy != nullptr);
+    }
+    if(!ids_uma) {
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
+        GGML_ASSERT(d_ids != nullptr);
+    }
+    if (qx_needs_dequant) {
+        d_X = ctx->prealloc_x;
+    } else {
+        d_X = d_Qx;
+        x_buf_offset = qx_buf_offset;
+        GGML_ASSERT(qx_sz == x_sz);
+    }
+    if (qy_needs_dequant) {
+        d_Y = ctx->prealloc_y;
+    } else {
+        d_Y = d_Qy;
+        y_buf_offset = qy_buf_offset;
+        GGML_ASSERT(qy_sz == y_sz);
+    }
+
+    if (x_non_contig) {
+        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+    }
+    if (y_non_contig) {
+        GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+    }
+
+    uint32_t stride_batch_y = ne10*ne11;
+
+    if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
+        stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
+    }
+
+    const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
+
+    uint32_t groups_x = ne01;
+    uint32_t groups_z = 1;
+
+    if (ne01 > max_groups_x) {
+        groups_z = 64;
+        groups_x = CEIL_DIV(groups_x, groups_z);
+    }
+
+    // compute
+    const vk_mat_vec_id_push_constants pc = {
+        (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
+        (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21),
+        (uint32_t)nei0, (uint32_t)ne11,
+    };
+    ggml_vk_sync_buffers(subctx);
+    ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
+        vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
+        sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
+}
+
+static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")");
+    if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
+        ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+    } else {
+        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+    }
+}
+
+static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
+    std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
+    std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const uint32_t nem1 = mask ? mask->ne[1] : 0;
+    const uint32_t nbm1 = mask ? mask->nb[1] : 0;
+
+    const uint32_t D = neq0;
+    const uint32_t N = neq1;
+    const uint32_t KV = nek1;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(nev1 == nek1);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    assert(dst->type == GGML_TYPE_F32);
+    assert(q->type == GGML_TYPE_F32);
+    assert(k->type == v->type);
+
+    vk_pipeline *pipelines;
+    // XXX TODO other backends may be changing accumulator precision to default to f32 soon
+    bool f32acc = dst->op_params[3] == GGML_PREC_F32;
+    bool small_rows = N <= flash_attention_num_small_rows;
+    switch (D) {
+    case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
+    case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
+    case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break;
+    case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break;
+    case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break;
+    case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break;
+    default:
+        assert(!"unsupported D value");
+        return;
+    }
+    assert(pipelines);
+
+    const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
+    const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
+    const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));
+
+    bool aligned = (KV % pipelines[1]->align) == 0 &&
+                   // the "aligned" shader variant will forcibly align strides, for performance
+                   (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
+
+    vk_pipeline pipeline = pipelines[aligned];
+    assert(pipeline);
+
+    if (dryrun) {
+        // Request descriptor sets
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        return;
+    }
+
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;
+
+    memcpy(&scale,         (const float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (const float *) dst->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0) {
+        scale /= logit_softcap;
+    }
+
+    const uint32_t n_head_kv   = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    ggml_vk_sync_buffers(subctx);
+
+    vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
+    size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
+
+    bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
+        ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset);
+        ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset);
+        ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset);
+        Q_uma = d_Q != nullptr;
+        K_uma = d_K != nullptr;
+        V_uma = d_V != nullptr;
+        D_uma = d_D != nullptr;
+        if (mask) {
+            ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
+            M_uma = d_M != nullptr;
+        }
+    }
+
+
+    ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * q_buf_ctx = (ggml_backend_vk_buffer_context *)q->buffer->context;
+    ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
+    ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;
+
+    if (!Q_uma) {
+        d_Q = q_buf_ctx->dev_buffer;
+        q_buf_offset = vk_tensor_offset(q) + q->view_offs;
+    }
+    if (!K_uma) {
+        d_K = k_buf_ctx->dev_buffer;
+        k_buf_offset = vk_tensor_offset(k) + k->view_offs;
+    }
+    if (!V_uma) {
+        d_V = v_buf_ctx->dev_buffer;
+        v_buf_offset = vk_tensor_offset(v) + v->view_offs;
+    }
+    if (!D_uma) {
+        d_D = d_buf_ctx->dev_buffer;
+        d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    }
+
+    if (!M_uma) {
+        d_M = d_Q;
+        m_buf_offset = q_buf_offset;
+        if (mask) {
+            ggml_backend_vk_buffer_context * m_buf_ctx = (ggml_backend_vk_buffer_context*)mask->buffer->context;
+            d_M = m_buf_ctx->dev_buffer;
+            m_buf_offset = vk_tensor_offset(mask) + mask->view_offs;
+        }
+    }
+
+    const vk_flash_attn_push_constants pc = { N, KV,
+                                              (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
+                                              (uint32_t)neq2, (uint32_t)neq3,
+                                              (uint32_t)nek2, (uint32_t)nek3,
+                                              (uint32_t)nev2, (uint32_t)nev3,
+                                              nem1,
+                                              q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
+                                              k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
+                                              v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
+                                              nbm1,
+                                              scale, max_bias, logit_softcap,
+                                              mask != nullptr, n_head_log2, m0, m1 };
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
+                                {
+                                    vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
+                                    vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
+                                    vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
+                                    vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
+                                    vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                },
+                                sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 });
+}
+
+static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
+    switch (op) {
+    case GGML_OP_GET_ROWS:
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        if (dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_get_rows[src0->type];
+        }
+        if (dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_get_rows_f32[src0->type];
+        }
+        return nullptr;
+    case GGML_OP_ACC:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_acc_f32;
+        }
+        return nullptr;
+    case GGML_OP_ADD:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_f32_norepeat : ctx->device->pipeline_add_f32;
+        }
+        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_f16_f32_f16_norepeat : ctx->device->pipeline_add_f16_f32_f16;
+        }
+        return nullptr;
+    case GGML_OP_MUL:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_f32_norepeat : ctx->device->pipeline_mul_f32;
+        }
+        return nullptr;
+    case GGML_OP_DIV:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_f32_norepeat : ctx->device->pipeline_div_f32;
+        }
+        return nullptr;
+    case GGML_OP_CONCAT:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_concat_f32;
+        }
+        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_concat_f16;
+        }
+        if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+            return ctx->device->pipeline_concat_i32;
+        }
+        return nullptr;
+    case GGML_OP_UPSCALE:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_upscale_f32;
+        }
+        return nullptr;
+    case GGML_OP_SCALE:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_scale_f32;
+        }
+        return nullptr;
+    case GGML_OP_SQR:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_sqr_f32;
+        }
+        return nullptr;
+    case GGML_OP_SIN:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_sin_f32;
+        }
+        return nullptr;
+    case GGML_OP_COS:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_cos_f32;
+        }
+        return nullptr;
+    case GGML_OP_CLAMP:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_clamp_f32;
+        }
+        return nullptr;
+    case GGML_OP_PAD:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_pad_f32;
+        }
+        return nullptr;
+    case GGML_OP_REPEAT:
+        if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
+            return ctx->device->pipeline_repeat_f32;
+        }
+        return nullptr;
+    case GGML_OP_CPY:
+    case GGML_OP_CONT:
+    case GGML_OP_DUP:
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
+    case GGML_OP_NORM:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_norm_f32;
+        }
+        return nullptr;
+    case GGML_OP_GROUP_NORM:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_group_norm_f32;
+        }
+        return nullptr;
+    case GGML_OP_RMS_NORM:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_rms_norm_f32;
+        }
+        return nullptr;
+    case GGML_OP_UNARY:
+        switch (ggml_get_unary_op(dst)) {
+            case GGML_UNARY_OP_SILU:
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_silu_f32;
+                }
+                break;
+            case GGML_UNARY_OP_GELU:
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_gelu_f32;
+                }
+                break;
+            case GGML_UNARY_OP_GELU_QUICK:
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_gelu_quick_f32;
+                }
+                break;
+            case GGML_UNARY_OP_RELU:
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_relu_f32;
+                }
+                break;
+            case GGML_UNARY_OP_TANH:
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_tanh_f32;
+                }
+                break;
+            default:
+                break;
+        }
+        return nullptr;
+    case GGML_OP_DIAG_MASK_INF:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_diag_mask_inf_f32;
+        }
+        return nullptr;
+    case GGML_OP_SOFT_MAX:
+        GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
+
+        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
+            return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32;
+        }
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+            return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16;
+        }
+        return nullptr;
+    case GGML_OP_ROPE:
+        {
+            const int mode = ((const int32_t *) dst->op_params)[2];
+            const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+
+            if (is_neox) {
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_rope_neox_f32;
+                }
+                if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+                    return ctx->device->pipeline_rope_neox_f16;
+                }
+            } else {
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+                    return ctx->device->pipeline_rope_norm_f32;
+                }
+                if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+                    return ctx->device->pipeline_rope_norm_f16;
+                }
+            }
+            return nullptr;
+        }
+    case GGML_OP_ARGSORT:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) {
+            return ctx->device->pipeline_argsort_f32;
+        }
+        return nullptr;
+    case GGML_OP_SUM_ROWS:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_sum_rows_f32;
+        }
+        return nullptr;
+    case GGML_OP_IM2COL:
+        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_im2col_f32;
+        }
+        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_im2col_f32_f16;
+        }
+        return nullptr;
+    case GGML_OP_TIMESTEP_EMBEDDING:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_timestep_embedding_f32;
+        }
+        return nullptr;
+    case GGML_OP_POOL_2D:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_pool2d_f32;
+        }
+        return nullptr;
+    case GGML_OP_RWKV_WKV6:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_rwkv_wkv6_f32;
+        }
+        return nullptr;
+    case GGML_OP_LEAKY_RELU:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_leaky_relu_f32;
+        }
+        return nullptr;
+    default:
+        return nullptr;
+    }
+
+    GGML_UNUSED(src2);
+}
+
+static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
+    switch (op) {
+    case GGML_OP_CPY:
+    case GGML_OP_GET_ROWS:
+    case GGML_OP_ADD:
+    case GGML_OP_MUL:
+    case GGML_OP_DIV:
+    case GGML_OP_CONCAT:
+    case GGML_OP_UPSCALE:
+    case GGML_OP_SQR:
+    case GGML_OP_SIN:
+    case GGML_OP_COS:
+    case GGML_OP_CLAMP:
+    case GGML_OP_PAD:
+    case GGML_OP_REPEAT:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
+{
+    return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
+}
+
+template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    GGML_UNUSED(p);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+    GGML_UNUSED(dst);
+    static_assert(!std::is_const<T>::value, "unexpected type");
+    GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
+    GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
+    GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
+    GGML_ASSERT(!dst  || get_misalign_bytes(ctx, dst) == 0);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    p.misalign_offsets = (a_offset << 16) | d_offset;
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
+
+    p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
+
+    GGML_UNUSED(src2);
+}
+
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
+    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
+    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
+
+    p.a_offset = a_offset;
+    p.d_offset = d_offset;
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src2);
+}
+
+template<typename PC>
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
+    VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    if (src1 != nullptr) {
+        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    }
+    if (src2 != nullptr) {
+        std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
+    }
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
+    GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))));  // NOLINT
+    GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0));  // NOLINT
+    GGML_ASSERT(dst->buffer != nullptr);
+    const uint64_t ne00 = src0->ne[0];
+    const uint64_t ne01 = src0->ne[1];
+    const uint64_t ne02 = src0->ne[2];
+    const uint64_t ne03 = src0->ne[3];
+    const uint64_t ne0 = ne00 * ne01;
+
+    const bool use_src1 = src1 != nullptr;
+    const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
+    const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
+    const uint64_t ne12 = use_src1 ? src1->ne[2] : 0;
+    const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
+    const uint64_t ne1 = ne10 * ne11;
+    // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
+
+    const bool use_src2 = src2 != nullptr;
+    const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
+    const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
+    const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
+    const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
+    const uint64_t ne2 = ne20 * ne21;
+
+    const uint64_t ned0 = dst->ne[0];
+    const uint64_t ned1 = dst->ne[1];
+    const uint64_t ned2 = dst->ne[2];
+    const uint64_t ned3 = dst->ne[3];
+    const uint64_t ned = ned0 * ned1;
+
+    init_pushconst_fastdiv(pc);
+
+    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
+
+    if (pipeline == nullptr) {
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
+        if (src1 != nullptr) {
+            std::cerr << " and " << ggml_type_name(src1->type);
+        }
+        std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
+        GGML_ABORT("fatal error");
+    }
+
+    if (dryrun) {
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        return;
+    }
+
+    const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
+    ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;
+
+    vk_buffer d_X = nullptr;
+    size_t x_buf_offset = 0;
+    vk_buffer d_Y = nullptr;
+    size_t y_buf_offset = 0;
+    vk_buffer d_Z = nullptr;
+    size_t z_buf_offset = 0;
+
+    bool src0_uma = false;
+    bool src1_uma = false;
+    bool src2_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, src0->data, d_X, x_buf_offset);
+        src0_uma = d_X != nullptr;
+        if (use_src1) {
+            ggml_vk_host_get(ctx->device, src1->data, d_Y, y_buf_offset);
+            src1_uma = d_Y != nullptr;
+        }
+        if (use_src2) {
+            ggml_vk_host_get(ctx->device, src2->data, d_Z, z_buf_offset);
+            src2_uma = d_Z != nullptr;
+        }
+    }
+
+    uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0;
+    uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
+    uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
+    uint64_t d_sz = ggml_type_size(dst->type) * ned;
+
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+
+    // Workaround for tiny tensor inputs on ROPE
+    if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
+        y_sz = VK_WHOLE_SIZE;
+    }
+
+    GGML_ASSERT(d_D != nullptr);
+    uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
+    if(!src0_uma) {
+        d_X = src0_buf_ctx->dev_buffer;
+        x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
+        GGML_ASSERT(d_X != nullptr);
+    }
+    if (use_src1 && !src1_uma) {
+        d_Y = src1_buf_ctx->dev_buffer;
+        y_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
+        GGML_ASSERT(d_Y != nullptr);
+    }
+    if (use_src2 && !src2_uma) {
+        d_Z = src2_buf_ctx->dev_buffer;
+        z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
+        GGML_ASSERT(d_Z != nullptr);
+    }
+    // Compute misalignment offset for descriptors and store it in in push constants, then align the descriptor offsets.
+    init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
+    x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+    d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
+
+    if (op_supports_incontiguous) {
+        x_sz = ggml_nbytes(src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) : 0;
+        z_sz = use_src2 ? ggml_nbytes(src2) : 0;
+        d_sz = ggml_nbytes(dst);
+
+        if (x_buf_offset + x_sz >= d_X->size) {
+            x_sz = VK_WHOLE_SIZE;
+        }
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = VK_WHOLE_SIZE;
+        }
+        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+            z_sz = VK_WHOLE_SIZE;
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
+            d_sz = VK_WHOLE_SIZE;
+        }
+    }
+
+    std::array<uint32_t, 3> elements;
+
+    // Single call if dimension 2 is contiguous
+    GGML_ASSERT(op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1))));
+
+    switch (op) {
+    case GGML_OP_NORM:
+    case GGML_OP_RMS_NORM:
+    case GGML_OP_SOFT_MAX:
+    case GGML_OP_SUM_ROWS:
+        {
+            const uint32_t nr = ggml_nrows(src0);
+            if (nr > 262144) {
+                elements = { 512, 512, CEIL_DIV(nr, 262144) };
+            } else if (nr > 512) {
+                elements = { 512, CEIL_DIV(nr, 512), 1 };
+            } else {
+                elements = { nr, 1, 1 };
+            }
+        } break;
+    case GGML_OP_GROUP_NORM:
+        {
+            const uint32_t num_groups = dst->op_params[0];
+            elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 };
+        } break;
+    case GGML_OP_DIAG_MASK_INF:
+    case GGML_OP_ROPE:
+        elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
+        break;
+    case GGML_OP_GET_ROWS:
+        elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+        break;
+    case GGML_OP_ARGSORT:
+        elements = { (uint32_t)ne00, (uint32_t)ggml_nrows(src0), 1 };
+        break;
+    case GGML_OP_IM2COL:
+        {
+            const bool is_2D = dst->op_params[6] == 1;
+
+            const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+
+            const uint32_t KH = is_2D ? src0->ne[1] : 1;
+            const uint32_t KW =         src0->ne[0];
+
+            const uint32_t OH = is_2D ? dst->ne[2] : 1;
+            const uint32_t OW =         dst->ne[1];
+
+            const uint32_t batch = src1->ne[is_2D ? 3 : 2];
+
+            elements = { OW * KW * KH, OH, batch * IC };
+        } break;
+    case GGML_OP_TIMESTEP_EMBEDDING:
+        {
+            const uint32_t dim = dst->op_params[0];
+            uint32_t half_ceil = (dim + 1) / 2;
+            elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
+        } break;
+    case GGML_OP_POOL_2D:
+        {
+            const uint32_t N = dst->ne[3];
+            const uint32_t OC = dst->ne[2];
+            const uint32_t OH = dst->ne[1];
+            const uint32_t OW = dst->ne[0];
+            elements = { N * OC * OH * OW, 1, 1};
+        } break;
+    case GGML_OP_ADD:
+    case GGML_OP_DIV:
+    case GGML_OP_MUL:
+    case GGML_OP_SCALE:
+    case GGML_OP_SQR:
+    case GGML_OP_SIN:
+    case GGML_OP_COS:
+    case GGML_OP_CLAMP:
+    case GGML_OP_PAD:
+    case GGML_OP_REPEAT:
+    case GGML_OP_CPY:
+    case GGML_OP_CONCAT:
+    case GGML_OP_UPSCALE:
+    case GGML_OP_UNARY:
+        {
+            const uint32_t ne = ggml_nelements(dst);
+            if (ne > 262144) {
+                elements = { 512, 512, CEIL_DIV(ne, 262144) };
+            } else if (ne > 512) {
+                elements = { 512, CEIL_DIV(ne, 512), 1 };
+            } else {
+                elements = { ne, 1, 1 };
+            }
+        } break;
+    default:
+        elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
+        break;
+    }
+
+    if (!op_supports_incontiguous) {
+        if (x_sz != VK_WHOLE_SIZE) {
+            x_sz *= ne02 * ne03;
+        }
+        if (use_src1 && y_sz != VK_WHOLE_SIZE) {
+            y_sz *= ne12 * ne13;
+        }
+        if (use_src2 && z_sz != VK_WHOLE_SIZE) {
+            z_sz *= ne22 * ne23;
+        }
+        if (d_sz != VK_WHOLE_SIZE) {
+            d_sz *= ned2 * ned3;
+        }
+    }
+
+    if (op == GGML_OP_SOFT_MAX) {
+        // Empty src1 is possible in soft_max, but the shader needs a buffer
+        vk_subbuffer subbuf_y;
+        if (use_src1) {
+            subbuf_y = { d_Y, y_buf_offset, y_sz };
+        } else {
+            subbuf_y = { d_X, 0, x_sz };
+        }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (op == GGML_OP_ROPE) {
+        // Empty src2 is possible in rope, but the shader needs a buffer
+        vk_subbuffer subbuf_z;
+        if (use_src2) {
+            subbuf_z = { d_Z, z_buf_offset, z_sz };
+        } else {
+            subbuf_z = { d_X, 0, x_sz };
+        }
+
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (op == GGML_OP_IM2COL) {
+        // im2col uses only src1 and dst buffers
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (use_src2) {
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else if (use_src1) {
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    } else {
+        ggml_vk_sync_buffers(subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+    }
+}
+
+static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f, offset,
+    }, dryrun);
+}
+
+static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_DIV, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) {
+    const ggml_tensor * k = dst->src[0];
+    const ggml_tensor * v = dst->src[1];
+    const ggml_tensor * r = dst->src[2];
+    const ggml_tensor * tf = dst->src[3];
+    const ggml_tensor * td = dst->src[4];
+    const ggml_tensor * state = dst->src[5];
+
+    GGML_ASSERT(!ggml_is_quantized(k->type));
+    GGML_ASSERT(!ggml_is_quantized(v->type));
+    GGML_ASSERT(!ggml_is_quantized(r->type));
+    GGML_ASSERT(!ggml_is_quantized(tf->type));
+    GGML_ASSERT(!ggml_is_quantized(td->type));
+    GGML_ASSERT(!ggml_is_quantized(state->type));
+    GGML_ASSERT(dst->buffer != nullptr);
+
+    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6);
+    GGML_ASSERT(pipeline != nullptr);
+
+    if (dryrun) {
+        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        return;
+    }
+
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
+    ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;
+    ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context;
+    ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context;
+    ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context;
+    ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context;
+
+    ggml_vk_sync_buffers(subctx);
+
+    vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
+    size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
+    bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
+
+    if (ctx->device->uma) {
+        ggml_vk_host_get(ctx->device, k->data, d_K, k_offset);
+        ggml_vk_host_get(ctx->device, v->data, d_V, v_offset);
+        ggml_vk_host_get(ctx->device, r->data, d_R, r_offset);
+        ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset);
+        ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset);
+        ggml_vk_host_get(ctx->device, state->data, d_State, state_offset);
+        ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);
+
+        K_uma = d_K != nullptr;
+        V_uma = d_V != nullptr;
+        R_uma = d_R != nullptr;
+        TF_uma = d_TF != nullptr;
+        TD_uma = d_TD != nullptr;
+        STATE_uma = d_State != nullptr;
+        DST_uma = d_D != nullptr;
+    }
+
+    if (!K_uma) {
+        d_K = k_buf_ctx->dev_buffer;
+        k_offset = vk_tensor_offset(k) + k->view_offs;
+    }
+    if (!V_uma) {
+        d_V = v_buf_ctx->dev_buffer;
+        v_offset = vk_tensor_offset(v) + v->view_offs;
+    }
+    if (!R_uma) {
+        d_R = r_buf_ctx->dev_buffer;
+        r_offset = vk_tensor_offset(r) + r->view_offs;
+    }
+    if (!TF_uma) {
+        d_TF = tf_buf_ctx->dev_buffer;
+        tf_offset = vk_tensor_offset(tf) + tf->view_offs;
+    }
+    if (!TD_uma) {
+        d_TD = td_buf_ctx->dev_buffer;
+        td_offset = vk_tensor_offset(td) + td->view_offs;
+    }
+    if (!STATE_uma) {
+        d_State = state_buf_ctx->dev_buffer;
+        state_offset = vk_tensor_offset(state) + state->view_offs;
+    }
+    if (!DST_uma) {
+        d_D = dst_buf_ctx->dev_buffer;
+        dst_offset = vk_tensor_offset(dst) + dst->view_offs;
+    }
+
+    const uint64_t k_size = ggml_nbytes(k);
+    const uint64_t v_size = ggml_nbytes(v);
+    const uint64_t r_size = ggml_nbytes(r);
+    const uint64_t tf_size = ggml_nbytes(tf);
+    const uint64_t td_size = ggml_nbytes(td);
+    const uint64_t state_size = ggml_nbytes(state);
+    const uint64_t dst_size = ggml_nbytes(dst);
+
+    std::array<uint32_t, 3> elements = {
+        (uint32_t)(pc.B * pc.H),
+        1,
+        1
+    };
+
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
+        vk_subbuffer{ d_K, k_offset, k_size },
+        vk_subbuffer{ d_V, v_offset, v_size },
+        vk_subbuffer{ d_R, r_offset, r_size },
+        vk_subbuffer{ d_TF, tf_offset, tf_size },
+        vk_subbuffer{ d_TD, td_offset, td_size },
+        vk_subbuffer{ d_State, state_offset, state_size },
+        vk_subbuffer{ d_D, dst_offset, dst_size }
+    }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
+}
+
+static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
+    const size_t seq_length = dst->src[0]->ne[2];
+    const size_t n_embed = dst->ne[0];
+    const size_t n_heads = dst->src[0]->ne[1];
+    const size_t n_seqs = dst->src[5]->ne[1];
+
+    ggml_vk_op_f32_rwkv6(
+        ctx, subctx, dst,
+        {
+            (uint32_t)n_seqs,
+            (uint32_t)seq_length,
+            (uint32_t)n_embed,
+            (uint32_t)n_heads,
+        },
+        dryrun
+    );
+}
+
+static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    int * op_params = (int *)dst->op_params;
+
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f, op_params[0],
+    }, dryrun);
+}
+
+static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+
+    const float sf0 = (float)dst->ne[0] / src0->ne[0];
+    const float sf1 = (float)dst->ne[1] / src0->ne[1];
+    const float sf2 = (float)dst->ne[2] / src0->ne[2];
+    const float sf3 = (float)dst->ne[3] / src0->ne[3];
+
+    ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
+        (uint32_t)ggml_nelements(dst), 0, 0,
+        (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
+        sf0, sf1, sf2, sf3,
+    }, dryrun);
+}
+
+static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    float * op_params = (float *)dst->op_params;
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    float * op_params = (float *)dst->op_params;
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], op_params[1],
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        0.0f, 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
+}
+
+static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    float * op_params = (float *)dst->op_params;
+
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+}
+
+static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const int * int_op_params = (const int *)dst->op_params;
+    const float * float_op_params = (const float *)dst->op_params;
+
+    const uint32_t num_groups = int_op_params[0];
+    const float eps = float_op_params[1];
+    const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
+}
+
+static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    float * op_params = (float *)dst->op_params;
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+}
+
+static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+}
+
+static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    int32_t * op_params = (int32_t *)dst->op_params;
+    ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
+}
+
+static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    float * op_params = (float *)dst->op_params;
+
+    float scale = op_params[0];
+    float max_bias = op_params[1];
+
+    const uint32_t ncols =   (uint32_t)src0->ne[0];
+    const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
+    const uint32_t nrows_y = (uint32_t)src0->ne[1];
+
+    const uint32_t n_head_kv   = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
+        ncols,
+        src1 != nullptr ? nrows_y : (uint32_t)0,
+        scale, max_bias,
+        m0, m1,
+        n_head_log2,
+        nrows_x,
+    }, dryrun);
+}
+
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
+    const int n_dims        = ((int32_t *) dst->op_params)[1];
+    // const int mode          = ((int32_t *) dst->op_params)[2];
+    // const int n_ctx         = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig    = ((int32_t *) dst->op_params)[4];
+    const float freq_base   = ((float *)   dst->op_params)[5];
+    const float freq_scale  = ((float *)   dst->op_params)[6];
+    const float ext_factor  = ((float *)   dst->op_params)[7];
+    const float attn_factor = ((float *)   dst->op_params)[8];
+    const float beta_fast   = ((float *)   dst->op_params)[9];
+    const float beta_slow   = ((float *)   dst->op_params)[10];
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+        (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
+        freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
+        src2 != nullptr,
+    }, dryrun);
+}
+
+static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    int32_t * op_params = (int32_t *)dst->op_params;
+
+    uint32_t ncols = src0->ne[0];
+
+    uint32_t ncols_pad = 1;
+    while (ncols_pad < ncols) {
+        ncols_pad *= 2;
+    }
+
+    GGML_ASSERT(ncols_pad <= 1024);
+
+    ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
+        ncols,
+        ncols_pad,
+        op_params[0],
+    }, dryrun);
+}
+
+static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun);
+}
+
+static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    const int32_t s0 = dst->op_params[0];
+    const int32_t s1 = dst->op_params[1];
+    const int32_t p0 = dst->op_params[2];
+    const int32_t p1 = dst->op_params[3];
+    const int32_t d0 = dst->op_params[4];
+    const int32_t d1 = dst->op_params[5];
+
+    const bool is_2D = dst->op_params[6] == 1;
+
+    const uint32_t IC = src1->ne[is_2D ? 2 : 1];
+    const uint32_t IH = is_2D ? src1->ne[1] : 1;
+    const uint32_t IW =         src1->ne[0];
+
+    const uint32_t KH = is_2D ? src0->ne[1] : 1;
+    const uint32_t KW =         src0->ne[0];
+
+    const uint32_t OH = is_2D ? dst->ne[2] : 1;
+    const uint32_t OW =         dst->ne[1];
+
+    const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+
+    const uint32_t pelements = OW * KW * KH;
+
+    ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
+        batch_offset, offset_delta,
+        IC, IW, IH, OW, OH, KW, KH,
+        pelements,
+        IC * KH * KW,
+        s0, s1, p0, p1, d0, d1,
+    }, dryrun);
+}
+
+static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const uint32_t dim = dst->op_params[0];
+    const uint32_t max_period = dst->op_params[1];
+    const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
+        nb1, dim, max_period,
+    }, dryrun);
+}
+
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
+    const int32_t k1 = dst->op_params[1];
+    const int32_t k0 = dst->op_params[2];
+    const int32_t s1 = dst->op_params[3];
+    const int32_t s0 = dst->op_params[4];
+    const int32_t p1 = dst->op_params[5];
+    const int32_t p0 = dst->op_params[6];
+
+    const uint32_t IH = src0->ne[1];
+    const uint32_t IW = src0->ne[0];
+
+    const uint32_t N = dst->ne[3];
+
+    const uint32_t OC = dst->ne[2];
+    const uint32_t OH = dst->ne[1];
+    const uint32_t OW = dst->ne[0];
+
+    const uint32_t parallel_elements = N * OC * OH * OW;
+
+    ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
+        IW, IH, OW, OH, OC,
+        parallel_elements,
+        op,
+        k0, k1, s0, s1, p0, p1,
+    }, dryrun);
+}
+
+static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const float * op_params = (const float *)dst->op_params;
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
+}
+
+#ifdef GGML_VULKAN_RUN_TESTS
+static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
+    if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
+        return;
+    }
+    i0 = std::max(i0, 5);
+    i1 = std::max(i1, 5);
+    i2 = std::max(i2, 0);
+    fprintf(stderr, "         ");
+    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
+        fprintf(stderr, "%7d ", idx1);
+    }
+    fprintf(stderr, "\n");
+    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
+        fprintf(stderr, "%7d: ", idx0);
+        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
+            if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) {
+                float val;
+                if (type == GGML_TYPE_F32) {
+                    val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
+                } else if (type == GGML_TYPE_F16) {
+                    val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
+                } else {
+                    GGML_ABORT("fatal error");
+                }
+                fprintf(stderr, "% 7.2f ", val);
+            } else {
+                fprintf(stderr, "        ");
+            }
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+template <typename X_TYPE, typename Y_TYPE>
+static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) {
+    VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")");
+    const size_t x_ne = m * k * batch;
+    const size_t y_ne = k * n * batch;
+    const size_t d_ne = m * n * batch;
+
+    vk_pipeline p;
+    std::string shname;
+    if (shader_size == 0) {
+        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32->a_s;
+            shname = "F32_ALIGNED_S";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->a_s;
+            shname = "F32_F16_ALIGNED_S";
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_s;
+            shname = "F16_F32_ALIGNED_S";
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f16.f32acc->a_s;
+            shname = "F16_ALIGNED_S";
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (shader_size == 1) {
+        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32->a_m;
+            shname = "F32_ALIGNED_M";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->a_m;
+            shname = "F32_F16_ALIGNED_M";
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_m;
+            shname = "F16_F32_ALIGNED_M";
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f16.f32acc->a_m;
+            shname = "F16_ALIGNED_M";
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (shader_size == 2) {
+        if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32->a_l;
+            shname = "F32_ALIGNED_L";
+        } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f32_f16->a_l;
+            shname = "F32_F16_ALIGNED_L";
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_l;
+            shname = "F16_F32_ALIGNED_L";
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            p = ctx->device->pipeline_matmul_f16.f32acc->a_l;
+            shname = "F16_ALIGNED_L";
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else {
+        GGML_ASSERT(0);
+    }
+
+    const size_t kpad = ggml_vk_align_size(k, p->align);
+
+    if (k != kpad) {
+        if (shader_size == 0) {
+            if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f32->s;
+                shname = "F32_S";
+            } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f32_f16->s;
+                shname = "F32_F16_S";
+            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f16_f32.f32acc->s;
+                shname = "F16_F32_S";
+            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f16.f32acc->s;
+                shname = "F16_S";
+            }
+        } else if (shader_size == 1) {
+            if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f32->m;
+                shname = "F32_M";
+            } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f32_f16->m;
+                shname = "F32_F16_M";
+            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f16_f32.f32acc->m;
+                shname = "F16_F32_M";
+            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f16.f32acc->m;
+                shname = "F16_M";
+            }
+        } else if (shader_size == 2) {
+            if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f32->l;
+                shname = "F32_L";
+            } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f32_f16->l;
+                shname = "F32_F16_L";
+            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f16_f32.f32acc->l;
+                shname = "F16_F32_L";
+            } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+                p = ctx->device->pipeline_matmul_f16.f32acc->l;
+                shname = "F16_L";
+            }
+        }
+    }
+
+    ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
+    if (split_k > 1) {
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+
+        if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
+            // Resize buffer
+            if (ctx->prealloc_split_k != nullptr) {
+                ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+            }
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+        }
+    }
+
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
+    vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
+    Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
+    float* d = (float *) malloc(sizeof(float) * d_ne);
+
+    for (size_t i = 0; i < x_ne; i++) {
+        if (std::is_same<float, X_TYPE>()) {
+            x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+            // x[i] = 1.0f;
+            // x[i] = i + 1;
+            // x[i] = (i % k == i / k) ? 1.0f : 0.0f;
+        } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
+            x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
+            // x[i] = ggml_fp32_to_fp16(1.0f);
+            // x[i] = ggml_fp32_to_fp16(i + 1);
+            // x[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    }
+    for (size_t i = 0; i < y_ne; i++) {
+        if (std::is_same<float, Y_TYPE>()) {
+            y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+            // y[i] = (i % k == i / k) ? 1.0f : 0.0f;
+            // y[i] = i + 1;
+        } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
+            y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
+            // y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
+            // y[i] = ggml_fp32_to_fp16(i + 1);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    }
+
+    ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
+    ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
+
+    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    ggml_vk_ctx_begin(ctx->device, subctx);
+    for (size_t i = 0; i < num_it; i++) {
+        ggml_vk_matmul(
+            ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
+            m, n, k,
+            k, k, m, k*m, k*n, m*n,
+            split_k, batch, batch, batch, 1, 1
+        );
+    }
+    ggml_vk_ctx_end(subctx);
+
+    auto begin = std::chrono::high_resolution_clock::now();
+    ggml_vk_submit(subctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    auto end = std::chrono::high_resolution_clock::now();
+    double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
+
+    // copy dst to host
+    ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
+
+    float * d_chk = (float *) malloc(sizeof(float) * d_ne);
+
+    ggml_init_params iparams = {
+        /*.mem_size   =*/ 1024*1024*1024,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    ggml_context * ggml_ctx = ggml_init(iparams);
+
+    ggml_type src0_type;
+    ggml_type src1_type;
+
+    if (std::is_same<float, X_TYPE>()) {
+        src0_type = GGML_TYPE_F32;
+    } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
+        src0_type = GGML_TYPE_F16;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+    if (std::is_same<float, Y_TYPE>()) {
+        src1_type = GGML_TYPE_F32;
+    } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        src1_type = GGML_TYPE_F16;
+    } else {
+        GGML_ABORT("fatal error");
+    }
+
+    ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
+    ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch);
+    ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);
+
+    src0_ggml->data = x;
+    src1_ggml->data = y;
+    tensor_ggml->data = d_chk;
+
+    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
+    ggml_build_forward_expand(cgraph, tensor_ggml);
+
+    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
+
+    ggml_free(ggml_ctx);
+
+    double avg_err = 0.0;
+    int first_err_n = -1;
+    int first_err_m = -1;
+    int first_err_b = -1;
+
+    for (size_t i = 0; i < m*n*batch; i++) {
+        double err = std::fabs(d[i] - d_chk[i]);
+        avg_err += err;
+
+        if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) {
+            first_err_b = i / (m * n);
+            first_err_n = (i % (m * n)) / m;
+            first_err_m = (i % (m * n)) % m;
+        }
+    }
+
+    avg_err /= m * n;
+
+    double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
+
+    std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
+
+    if (avg_err > 0.1 || std::isnan(avg_err)) {
+        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
+        std::cerr << "Actual result: " << std::endl << std::endl;
+        ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+        std::cerr << "Expected result: " << std::endl << std::endl;
+        ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+        if (split_k > 1) {
+            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+
+            std::cerr << "d_buf0: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf1: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf2: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf3: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            free(split_k_buf);
+        }
+    }
+
+    free(d_chk);
+
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
+
+    ggml_vk_destroy_buffer(d_X);
+    ggml_vk_destroy_buffer(d_Y);
+    ggml_vk_destroy_buffer(d_D);
+
+    ggml_pipeline_cleanup(p);
+    ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
+
+    free(x);
+    free(y);
+    free(d);
+}
+
+static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
+    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
+        return;
+    }
+    i0 = std::max(i0, 5);
+    i1 = std::max(i1, 5);
+    i2 = std::max(i2, 0);
+    i3 = std::max(i3, 0);
+    fprintf(stderr, "         ");
+    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
+        fprintf(stderr, "%7d ", idx1);
+    }
+    fprintf(stderr, "\n");
+    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
+        fprintf(stderr, "%7d: ", idx0);
+        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
+            if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
+                float val;
+                if (tensor->type == GGML_TYPE_F32) {
+                    val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
+                } else if (tensor->type == GGML_TYPE_F16) {
+                    val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+                } else {
+                    GGML_ABORT("fatal error");
+                }
+                fprintf(stderr, "% 7.2f ", val);
+            } else {
+                fprintf(stderr, "        ");
+            }
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
+    ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
+}
+
+static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
+    if (quant == GGML_TYPE_F32) {
+        memcpy(to, from, sizeof(float) * ne);
+        return;
+    }
+
+    const auto * tt = ggml_get_type_traits(quant);
+
+    ggml_to_float_t dequant_fn = tt->to_float;
+
+    dequant_fn(from, to, ne);
+}
+
+static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
+    VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
+    const size_t x_sz = sizeof(float) * ne;
+    const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne;
+    const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
+    float * x = (float *) malloc(x_sz);
+    void * qx = malloc(qx_sz);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    float * x_ref = (float *) malloc(x_sz);
+    ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
+
+    for (size_t i = 0; i < ne; i++) {
+        x[i] = rand() / (float)RAND_MAX;
+    }
+
+    vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
+
+    ggml_vk_quantize_data(x, qx, ne, quant);
+    ggml_vk_dequantize_data(qx, x_ref, ne, quant);
+
+    ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
+
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
+    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
+
+    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    ggml_vk_ctx_begin(ctx->device, subctx);
+    const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
+    ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
+    ggml_vk_ctx_end(subctx);
+
+    auto begin = std::chrono::high_resolution_clock::now();
+
+    ggml_vk_submit(subctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    auto end = std::chrono::high_resolution_clock::now();
+
+    double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
+    ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
+
+    int first_err = -1;
+
+    double avg_err = 0.0;
+    for (size_t i = 0; i < ne; i++) {
+        double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
+        avg_err += error;
+
+        if (first_err < 0 && error > 0.05) {
+            first_err = i;
+        }
+    }
+
+    avg_err /= ne;
+
+    std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl;
+
+    if (avg_err > 0.1) {
+        std::cerr << "first_error = " << first_err << std::endl;
+        std::cerr << "Actual result: " << std::endl << std::endl;
+        for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
+            std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", ";
+        }
+        std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
+        for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
+            std::cerr << x_ref[i] << ", ";
+        }
+        std::cerr << std::endl;
+    }
+
+    ggml_vk_destroy_buffer(x_buf);
+    ggml_vk_destroy_buffer(qx_buf);
+
+    free(x);
+    free(qx);
+    free(x_ref);
+    free(x_chk);
+}
+
+static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) {
+    VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")");
+    const size_t x_ne = m * k * batch;
+    const size_t y_ne = k * n * batch;
+    const size_t d_ne = m * n * batch;
+
+    vk_pipeline p;
+    std::string shname;
+    if (shader_size == 0) {
+        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s;
+        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
+    } else if (shader_size == 1) {
+        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m;
+        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M";
+    } else if (shader_size == 2) {
+        p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l;
+        shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L";
+    } else {
+        GGML_ASSERT(0);
+    }
+
+    const size_t kpad = ggml_vk_align_size(k, p->align);
+
+    if (k != kpad) {
+        if (shader_size == 0) {
+            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s;
+            shname = std::string(ggml_type_name(quant)) + "_S";
+        } else if (shader_size == 1) {
+            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m;
+            shname = std::string(ggml_type_name(quant)) + "_M";
+        } else if (shader_size == 2) {
+            p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l;
+            shname = std::string(ggml_type_name(quant)) + "_L";
+        } else {
+            GGML_ASSERT(0);
+        }
+    }
+
+    const size_t x_sz = sizeof(float) * x_ne;
+    const size_t y_sz = sizeof(float) * y_ne;
+    const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant);
+    const size_t d_sz = sizeof(float) * d_ne;
+    float * x = (float *) malloc(x_sz);
+    float * y = (float *) malloc(y_sz);
+    void * qx = malloc(qx_sz);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    float * d = (float *) malloc(d_sz);
+    float * d_chk = (float *) malloc(d_sz);
+
+    for (size_t i = 0; i < x_ne; i++) {
+        x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+    }
+
+    ggml_vk_quantize_data(x, qx, x_ne, quant);
+
+    for (size_t i = 0; i < y_ne; i++) {
+        // y[i] = rand() / (float)RAND_MAX;
+        y[i] = (i % k == i / k) ? 1.0f : 0.0f;
+    }
+
+    ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
+    if (split_k > 1) {
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+
+        if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
+            // Resize buffer
+            if (ctx->prealloc_split_k != nullptr) {
+                ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+            }
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+        }
+    }
+
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
+    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(y_buf, 0, y, y_sz);
+
+    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    ggml_vk_ctx_begin(ctx->device, subctx);
+    for (size_t i = 0; i < num_it; i++) {
+        ggml_vk_matmul(
+            ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
+            m, n, k,
+            k, k, m, k*m, k*n, m*n,
+            split_k, batch, batch, batch, 1, 1
+        );
+    }
+    ggml_vk_ctx_end(subctx);
+
+    auto begin = std::chrono::high_resolution_clock::now();
+
+    ggml_vk_submit(subctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    auto end = std::chrono::high_resolution_clock::now();
+
+    double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
+    ggml_vk_buffer_read(d_buf, 0, d, d_sz);
+
+    ggml_init_params iparams = {
+        /*.mem_size   =*/ 1024*1024*1024,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    ggml_context * ggml_ctx = ggml_init(iparams);
+
+    ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch);
+    ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch);
+    ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml);
+
+    src0_ggml->data = qx;
+    src1_ggml->data = y;
+    tensor_ggml->data = d_chk;
+
+    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
+    ggml_build_forward_expand(cgraph, tensor_ggml);
+
+    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
+
+    ggml_free(ggml_ctx);
+
+    double avg_err = 0.0;
+    int first_err_n = -1;
+    int first_err_m = -1;
+    int first_err_b = -1;
+
+    for (size_t i = 0; i < m*n*batch; i++) {
+        double err = std::fabs(d[i] - d_chk[i]);
+        avg_err += err;
+
+        if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) {
+            first_err_b = i / (m * n);
+            first_err_n = (i % (m * n)) / m;
+            first_err_m = (i % (m * n)) % m;
+        }
+    }
+
+    avg_err /= m * n;
+
+    double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
+
+    std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
+
+    if (avg_err > 0.01 || std::isnan(avg_err)) {
+        std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
+        std::cerr << "Actual result: " << std::endl << std::endl;
+        ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+        std::cerr << std::endl;
+        std::cerr << "Expected result: " << std::endl << std::endl;
+        ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+        if (split_k > 1) {
+            float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+
+            std::cerr << "d_buf0: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf1: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf2: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            std::cerr << "d_buf3: " << std::endl << std::endl;
+            ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
+
+            free(split_k_buf);
+        }
+    }
+
+    ggml_vk_destroy_buffer(qx_buf);
+    ggml_vk_destroy_buffer(y_buf);
+    ggml_vk_destroy_buffer(d_buf);
+
+    free(x);
+    free(qx);
+    free(y);
+    free(d);
+    free(d_chk);
+}
+#endif
+
+static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
+#if defined(GGML_VULKAN_RUN_TESTS)
+    const std::vector<size_t> vals {
+        512, 512, 128,
+        128, 512, 512,
+        4096, 512, 4096,
+        11008, 512, 4096,
+        4096, 512, 11008,
+        32000, 512, 4096,
+        8, 8, 8,
+        100, 46, 576,
+        623, 111, 128,
+        100, 46, 558,
+        512, 1, 256,
+        128, 110, 622,
+        511, 511, 127,
+        511, 511, 7,
+        511, 511, 17,
+        49, 49, 128,
+        128, 49, 49,
+        4096, 49, 4096,
+    };
+    const size_t num_it = 100;
+
+    for (size_t i = 0; i < vals.size(); i += 3) {
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
+        std::cerr << '\n';
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2);
+        std::cerr << '\n';
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
+        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
+        std::cerr << '\n' << std::endl;
+
+        if (vals[i + 2] % 32 == 0) {
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0);
+            std::cerr << '\n' << std::endl;
+        }
+
+        if (vals[i + 2] % 256 == 0) {
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K);
+            std::cerr << '\n';
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K);
+            ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K);
+            std::cerr << '\n' << std::endl;
+        }
+    }
+
+    GGML_ABORT("fatal error");
+#endif
+
+    if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")");
+        // Resize buffer
+        if (ctx->prealloc_x != nullptr) {
+            ggml_vk_destroy_buffer(ctx->prealloc_x);
+        }
+        ctx->prealloc_x = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_x);
+    }
+    if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")");
+        // Resize buffer
+        if (ctx->prealloc_y != nullptr) {
+            ggml_vk_destroy_buffer(ctx->prealloc_y);
+        }
+        ctx->prealloc_y = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_y);
+    }
+    if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")");
+        // Resize buffer
+        if (ctx->prealloc_split_k != nullptr) {
+            ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+        }
+        ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k);
+    }
+}
+
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+    if (ggml_is_empty(node) || !node->buffer) {
+        return false;
+    }
+
+    VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
+    ctx->semaphore_idx = 0;
+
+    const ggml_tensor * src0 = node->src[0];
+    const ggml_tensor * src1 = node->src[1];
+    const ggml_tensor * src2 = node->src[2];
+    const ggml_tensor * src3 = node->src[3];
+
+    switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return false;
+    case GGML_OP_UNARY:
+        switch (ggml_get_unary_op(node)) {
+        case GGML_UNARY_OP_SILU:
+        case GGML_UNARY_OP_GELU:
+        case GGML_UNARY_OP_GELU_QUICK:
+        case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_TANH:
+            break;
+        default:
+            return false;
+        }
+        break;
+    case GGML_OP_REPEAT:
+    case GGML_OP_GET_ROWS:
+    case GGML_OP_ADD:
+    case GGML_OP_ACC:
+    case GGML_OP_MUL:
+    case GGML_OP_DIV:
+    case GGML_OP_CONCAT:
+    case GGML_OP_UPSCALE:
+    case GGML_OP_SCALE:
+    case GGML_OP_SQR:
+    case GGML_OP_SIN:
+    case GGML_OP_COS:
+    case GGML_OP_CLAMP:
+    case GGML_OP_PAD:
+    case GGML_OP_CPY:
+    case GGML_OP_CONT:
+    case GGML_OP_DUP:
+    case GGML_OP_NORM:
+    case GGML_OP_GROUP_NORM:
+    case GGML_OP_RMS_NORM:
+    case GGML_OP_DIAG_MASK_INF:
+    case GGML_OP_SOFT_MAX:
+    case GGML_OP_ROPE:
+    case GGML_OP_MUL_MAT:
+    case GGML_OP_MUL_MAT_ID:
+    case GGML_OP_ARGSORT:
+    case GGML_OP_SUM_ROWS:
+    case GGML_OP_IM2COL:
+    case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
+    case GGML_OP_RWKV_WKV6:
+    case GGML_OP_LEAKY_RELU:
+    case GGML_OP_FLASH_ATTN_EXT:
+        break;
+    default:
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+        GGML_ABORT("fatal error");
+        return false;
+    }
+
+    vk_context compute_ctx;
+
+    if (!dryrun) {
+        if (ctx->compute_ctx.expired()) {
+            compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+            ctx->compute_ctx = compute_ctx;
+            ggml_vk_ctx_begin(ctx->device, compute_ctx);
+        } else {
+            compute_ctx = ctx->compute_ctx.lock();
+        }
+    } else {
+        switch (node->op) {
+        case GGML_OP_REPEAT:
+        case GGML_OP_ACC:
+        case GGML_OP_GET_ROWS:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+        case GGML_OP_PAD:
+        case GGML_OP_CPY:
+        case GGML_OP_CONT:
+        case GGML_OP_DUP:
+        case GGML_OP_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_UNARY:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_IM2COL:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_LEAKY_RELU:
+            {
+                // These operations all go through ggml_vk_op_f32, so short-circuit and
+                // do the only thing needed for the dryrun.
+                vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
+                ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+                return false;
+            }
+        default:
+            break;
+        }
+    }
+
+    switch (node->op) {
+    case GGML_OP_REPEAT:
+        ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_ACC:
+        ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_GET_ROWS:
+        ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_ADD:
+        ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_MUL:
+        ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_DIV:
+        ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_CONCAT:
+        ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_UPSCALE:
+        ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_SCALE:
+        ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_SQR:
+        ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_SIN:
+        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_COS:
+        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_CLAMP:
+        ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_PAD:
+        ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_CPY:
+    case GGML_OP_CONT:
+    case GGML_OP_DUP:
+        ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_NORM:
+        ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_GROUP_NORM:
+        ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_RMS_NORM:
+        ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_UNARY:
+        switch (ggml_get_unary_op(node)) {
+        case GGML_UNARY_OP_SILU:
+        case GGML_UNARY_OP_GELU:
+        case GGML_UNARY_OP_GELU_QUICK:
+        case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_TANH:
+            ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
+            break;
+        default:
+            return false;
+        }
+        break;
+    case GGML_OP_DIAG_MASK_INF:
+        ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_SOFT_MAX:
+        ggml_vk_soft_max(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_ROPE:
+        ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+
+        break;
+    case GGML_OP_ARGSORT:
+        ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_SUM_ROWS:
+        ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_IM2COL:
+        ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_TIMESTEP_EMBEDDING:
+        ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_POOL_2D:
+        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_LEAKY_RELU:
+        ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
+
+        break;
+    case GGML_OP_MUL_MAT:
+        ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node, dryrun);
+
+        break;
+    case GGML_OP_MUL_MAT_ID:
+        ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun);
+
+        break;
+
+    case GGML_OP_FLASH_ATTN_EXT:
+        ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun);
+
+        break;
+
+    case GGML_OP_RWKV_WKV6:
+        ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun);
+
+        break;
+    default:
+        return false;
+    }
+
+    if (dryrun) {
+        return false;
+    }
+
+    ctx->tensor_ctxs[node_idx] = compute_ctx;
+
+#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
+    // Force context reset on each node so that each tensor ends up in its own context
+    // and can be run and compared to its CPU equivalent separately
+    last_node = true;
+#endif
+
+    if (submit || last_node) {
+        ggml_vk_ctx_end(compute_ctx);
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
+        ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
+    }
+    return true;
+}
+
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
+    ggml_backend_buffer * buf = nullptr;
+
+    switch (tensor->op) {
+    case GGML_OP_ADD:
+    case GGML_OP_ACC:
+    case GGML_OP_GET_ROWS:
+    case GGML_OP_MUL:
+    case GGML_OP_DIV:
+    case GGML_OP_CONCAT:
+    case GGML_OP_UPSCALE:
+    case GGML_OP_SCALE:
+    case GGML_OP_SQR:
+    case GGML_OP_SIN:
+    case GGML_OP_COS:
+    case GGML_OP_CLAMP:
+    case GGML_OP_PAD:
+    case GGML_OP_CPY:
+    case GGML_OP_CONT:
+    case GGML_OP_DUP:
+    case GGML_OP_NORM:
+    case GGML_OP_GROUP_NORM:
+    case GGML_OP_RMS_NORM:
+    case GGML_OP_DIAG_MASK_INF:
+    case GGML_OP_SOFT_MAX:
+    case GGML_OP_ROPE:
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+    case GGML_OP_ARGSORT:
+    case GGML_OP_SUM_ROWS:
+    case GGML_OP_IM2COL:
+    case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_POOL_2D:
+    case GGML_OP_RWKV_WKV6:
+    case GGML_OP_LEAKY_RELU:
+    case GGML_OP_REPEAT:
+        buf = tensor->buffer;
+
+        break;
+    case GGML_OP_UNARY:
+        switch (ggml_get_unary_op(tensor)) {
+        case GGML_UNARY_OP_SILU:
+        case GGML_UNARY_OP_GELU:
+        case GGML_UNARY_OP_GELU_QUICK:
+        case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_TANH:
+            buf = tensor->buffer;
+            break;
+        default:
+            return false;
+        }
+        break;
+    case GGML_OP_MUL_MAT:
+    case GGML_OP_MUL_MAT_ID:
+    case GGML_OP_FLASH_ATTN_EXT:
+        buf = tensor->buffer;
+
+        break;
+    default:
+        return false;
+    }
+
+    if (buf == nullptr) {
+        return false;
+    }
+
+    VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
+
+    vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
+
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
+
+    // Only run if ctx hasn't been submitted yet
+    if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
+        // Do staging buffer copies
+        for (auto& cpy : subctx->in_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
+    }
+
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        // Do staging buffer copies
+        for (auto& cpy : subctx->out_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+        subctx->in_memcpys.clear();
+        subctx->out_memcpys.clear();
+    }
+
+    return true;
+}
+
+// Clean up after graph processing is done
+static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
+    VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
+    for (auto& buffer : ctx->gc.temp_buffers) {
+        ggml_vk_pool_free(ctx, buffer);
+    }
+    ctx->gc.temp_buffers.clear();
+
+    for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
+        vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
+
+        if (plr.expired()) {
+            continue;
+        }
+
+        vk_pipeline pl = plr.lock();
+        ggml_pipeline_cleanup(pl);
+    }
+
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
+
+    for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
+        ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
+    }
+    ctx->gc.semaphores.clear();
+
+    for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
+        ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
+    }
+    ctx->gc.tl_semaphores.clear();
+    ctx->semaphore_idx = 0;
+
+    ctx->event_idx = 0;
+
+    for (auto& event : ctx->gc.events) {
+        ctx->device->device.resetEvent(event);
+    }
+
+    ctx->tensor_ctxs.clear();
+    ctx->gc.contexts.clear();
+    ctx->device->pipeline_descriptor_set_requirements.clear();
+}
+
+// Clean up on backend free
+static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
+    VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")");
+    ggml_vk_graph_cleanup(ctx);
+
+    ggml_vk_destroy_buffer(ctx->prealloc_x);
+    ggml_vk_destroy_buffer(ctx->prealloc_y);
+    ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+
+    for (auto& buffer : ctx->buffer_pool) {
+        ggml_vk_destroy_buffer(buffer);
+    }
+
+    ctx->prealloc_size_x = 0;
+    ctx->prealloc_size_y = 0;
+    ctx->prealloc_size_split_k = 0;
+
+    for (auto& event : ctx->gc.events) {
+        ctx->device->device.destroyEvent(event);
+    }
+    ctx->gc.events.clear();
+
+    ctx->device->device.destroyFence(ctx->fence);
+}
+
+static int ggml_vk_get_device_count() {
+    ggml_vk_instance_init();
+
+    return vk_instance.device_indices.size();
+}
+
+static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+    ggml_vk_instance_init();
+
+    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+    vk::PhysicalDeviceProperties props;
+    devices[device].getProperties(&props);
+
+    snprintf(description, description_size, "%s", props.deviceName.data());
+}
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+// device backend
+
+static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
+    return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
+}
+
+static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()");
+    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+    ggml_vk_destroy_buffer(ctx->dev_buffer);
+    delete ctx;
+}
+
+static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return vk_ptr_base;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
+    if (tensor->view_src != nullptr) {
+        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+    }
+}
+
+static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
+}
+
+static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
+}
+
+static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_vk(src->buffer)) {
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
+
+        ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
+
+        return true;
+    }
+    return false;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+
+    ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size);
+}
+
+static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_vk_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_vk_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_vk_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_vk_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_vk_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_vk_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_vk_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// vk buffer type
+static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+
+static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")");
+    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
+
+    vk_buffer dev_buffer = nullptr;
+    try {
+        dev_buffer = ggml_vk_create_buffer_device(ctx->device, size);
+    } catch (const vk::SystemError& e) {
+        return nullptr;
+    }
+
+    ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->device, std::move(dev_buffer), ctx->name);
+
+    return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size);
+}
+
+static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
+    return ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+}
+
+static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
+    return ctx->device->suballocation_block_size;
+}
+
+static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    return ggml_nbytes(tensor);
+
+    UNUSED(buft);
+}
+
+ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
+    ggml_vk_instance_init();
+
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")");
+
+    vk_device dev = ggml_vk_get_device(dev_num);
+
+    return &dev->buffer_type;
+}
+
+// host buffer type
+
+static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_VK_NAME "_Host";
+
+    UNUSED(buft);
+}
+
+static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return GGML_VK_NAME "_Host";
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
+    ggml_vk_host_free(vk_instance.devices[0], buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")");
+
+    size += 32;  // Behave like the CPU buffer type
+    void * ptr = nullptr;
+    try {
+        ptr = ggml_vk_host_malloc(vk_instance.devices[0], size);
+    } catch (vk::SystemError& e) {
+        std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
+        std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment;
+
+    UNUSED(buft);
+}
+
+// Should be changed to return device-specific host buffer type
+// but that probably requires changes in llama.cpp
+ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_vk_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_vk_host_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
+        /* .context  = */ nullptr,
+    };
+
+    // Make sure device 0 is initialized
+    ggml_vk_instance_init();
+    ggml_vk_get_device(0);
+
+    return &ggml_backend_vk_buffer_type_host;
+}
+
+
+// backend
+
+static const char * ggml_backend_vk_name(ggml_backend_t backend) {
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_vk_free(ggml_backend_t backend) {
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")");
+
+    ggml_vk_cleanup(ctx);
+
+    delete ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return &ctx->device->buffer_type;
+}
+
+static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
+
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+
+    vk_context transfer_ctx;
+
+    if (ctx->transfer_ctx.expired()) {
+        // Initialize new transfer context
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+        ctx->transfer_ctx = transfer_ctx;
+        ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+    } else {
+        transfer_ctx = ctx->transfer_ctx.lock();
+    }
+
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
+}
+
+static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
+
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+
+    vk_context transfer_ctx;
+
+    if (ctx->transfer_ctx.expired()) {
+        // Initialize new transfer context
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+        ctx->transfer_ctx = transfer_ctx;
+        ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+    } else {
+        transfer_ctx = ctx->transfer_ctx.lock();
+    }
+
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
+}
+
+static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+    VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+
+        vk_context transfer_ctx;
+
+        if (ctx->transfer_ctx.expired()) {
+            // Initialize new transfer context
+            transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
+            ctx->transfer_ctx = transfer_ctx;
+            ggml_vk_ctx_begin(ctx->device, transfer_ctx);
+        } else {
+            transfer_ctx = ctx->transfer_ctx.lock();
+        }
+
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
+
+        ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
+        return true;
+    }
+
+    return false;
+}
+
+static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
+    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+    if(ctx->transfer_ctx.expired()) {
+        return;
+    }
+
+    vk_context transfer_ctx = ctx->transfer_ctx.lock();
+
+    ggml_vk_ctx_end(transfer_ctx);
+
+    for (auto& cpy : transfer_ctx->in_memcpys) {
+        memcpy(cpy.dst, cpy.src, cpy.n);
+    }
+
+    ggml_vk_submit(transfer_ctx, ctx->fence);
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
+
+    for (auto& cpy : transfer_ctx->out_memcpys) {
+        memcpy(cpy.dst, cpy.src, cpy.n);
+    }
+
+    ctx->transfer_ctx.reset();
+}
+
+static bool ggml_vk_is_empty(ggml_tensor * node) {
+    return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+}
+
+static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+    }
+    if (ctx->device->need_compiles) {
+        ggml_vk_load_shaders(ctx->device);
+    }
+    ggml_vk_preallocate_buffers(ctx);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+
+    int last_node = cgraph->n_nodes - 1;
+
+    // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
+    while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
+        last_node -= 1;
+    }
+
+    // Reserve tensor context space for all nodes
+    ctx->tensor_ctxs.resize(cgraph->n_nodes);
+
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
+
+    // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
+    // Start with a smaller count to get work submitted right away, and increase it after each submit.
+    int nodes_per_submit = 20;
+    int submitted_nodes = 0;
+    int submit_count = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (first_node_in_batch) {
+            submit_node_idx = i;
+        }
+
+        bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
+            }
+#endif
+        }
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
+            switch (submit_count) {
+            case 0:
+                nodes_per_submit = 50;
+                break;
+            default:
+                nodes_per_submit = 100;
+                break;
+            }
+            submit_count++;
+        }
+    }
+
+#ifdef GGML_VULKAN_PERF
+    ctx->device->perf_logger->print_timings();
+#endif
+
+    ggml_vk_graph_cleanup(ctx);
+
+    return GGML_STATUS_SUCCESS;
+
+    UNUSED(backend);
+}
+
+// TODO: enable async and synchronize
+static ggml_backend_i ggml_backend_vk_interface = {
+    /* .get_name                = */ ggml_backend_vk_name,
+    /* .free                    = */ ggml_backend_vk_free,
+    /* .set_tensor_async        = */ NULL,  // ggml_backend_vk_set_tensor_async,
+    /* .get_tensor_async        = */ NULL,  // ggml_backend_vk_get_tensor_async,
+    /* .cpy_tensor_async        = */ NULL,  // ggml_backend_vk_cpy_tensor_async,
+    /* .synchronize             = */ NULL,  // ggml_backend_vk_synchronize,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_vk_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_vk_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+    VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
+
+    ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
+    ggml_vk_init(ctx, dev_num);
+
+    ggml_backend_t vk_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_vk_guid(),
+        /* .interface = */ ggml_backend_vk_interface,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
+        /* .context   = */ ctx,
+    };
+
+    return vk_backend;
+}
+
+bool ggml_backend_is_vk(ggml_backend_t backend) {
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
+}
+
+int ggml_backend_vk_get_device_count() {
+    return ggml_vk_get_device_count();
+}
+
+void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+    int dev_idx = vk_instance.device_indices[device];
+    ggml_vk_get_device_description(dev_idx, description, description_size);
+}
+
+void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
+    GGML_ASSERT(device < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
+
+    vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+
+    for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
+        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+            *total = heap.size;
+            *free = heap.size;
+            break;
+        }
+    }
+}
+
+//////////////////////////
+
+struct ggml_backend_vk_device_context {
+    size_t device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+    ggml_backend_vk_get_device_memory(ctx->device, free, total);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ggml_backend_vk_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    UNUSED(dev);
+    return ggml_backend_vk_host_buffer_type();
+}
+
+static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
+    UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+}
+
+static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_vk_device_get_name(dev);
+    props->description = ggml_backend_vk_device_get_description(dev);
+    props->type        = ggml_backend_vk_device_get_type(dev);
+    ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ true,
+        /* .buffer_from_host_ptr  = */ false,
+        /* .events                = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
+    UNUSED(params);
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ggml_backend_vk_init(ctx->device);
+}
+
+static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_TANH:
+                    return ggml_is_contiguous(op->src[0]);
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+                const vk_device& device = ggml_vk_get_device(ctx->device);
+                if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
+                    // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
+                    return false;
+                }
+                switch (src0_type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                    case GGML_TYPE_IQ2_XXS:
+                    case GGML_TYPE_IQ2_XS:
+                    case GGML_TYPE_IQ2_S:
+                    case GGML_TYPE_IQ3_XXS:
+                    case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ4_XS:
+                    case GGML_TYPE_IQ4_NL:
+                        break;
+                    default:
+                        return false;
+                }
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                if (!(ggml_vk_dim01_contiguous(op->src[0]) || op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) ||
+                    !(ggml_vk_dim01_contiguous(op->src[1]) || op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16)) {
+                    return false;
+                }
+
+                return true;
+            } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+                if (!ggml_vk_get_device(ctx->device)->coopmat2) {
+                    return false;
+                }
+                switch (op->src[0]->ne[0]) {
+                case 64:
+                case 80:
+                case 96:
+                case 112:
+                case 128:
+                case 256:
+                    break;
+                default:
+                    return false;
+                }
+                if (op->src[0]->type != GGML_TYPE_F32) {
+                    return false;
+                }
+                if (op->type != GGML_TYPE_F32) {
+                    return false;
+                }
+                if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
+                    return false;
+                }
+                // It's straightforward to support different K/V dequant, but would
+                // significantly increase the number of pipelines
+                if (op->src[1]->type != op->src[2]->type) {
+                    return false;
+                }
+                switch (op->src[1]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q5_0:
+                case GGML_TYPE_Q5_1:
+                case GGML_TYPE_Q8_0:
+                // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
+                //case GGML_TYPE_Q2_K:
+                //case GGML_TYPE_Q3_K:
+                //case GGML_TYPE_Q4_K:
+                //case GGML_TYPE_Q5_K:
+                //case GGML_TYPE_Q6_K:
+                //case GGML_TYPE_IQ2_XXS:
+                //case GGML_TYPE_IQ2_XS:
+                //case GGML_TYPE_IQ2_S:
+                //case GGML_TYPE_IQ3_XXS:
+                //case GGML_TYPE_IQ3_S:
+                //case GGML_TYPE_IQ4_XS:
+                case GGML_TYPE_IQ4_NL:
+                    break;
+                default:
+                    return false;
+                }
+                return true;
+            }
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_IQ2_XXS:
+                    case GGML_TYPE_IQ2_XS:
+                    case GGML_TYPE_IQ2_S:
+                    case GGML_TYPE_IQ3_XXS:
+                    case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ4_XS:
+                    case GGML_TYPE_IQ4_NL:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CONT:
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
+
+                if (src0_type == GGML_TYPE_F32) {
+                    switch (src1_type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_IQ4_NL:
+                        return true;
+                    default:
+                        break;
+                    }
+                }
+                if (src1_type == GGML_TYPE_F32) {
+                    switch (src0_type) {
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_IQ4_NL:
+                        return true;
+                    default:
+                        break;
+                    }
+                }
+
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_REPEAT:
+            return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+        case GGML_OP_NORM:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_RMS_NORM:
+            return ggml_is_contiguous(op->src[0]);
+        case GGML_OP_ADD:
+        case GGML_OP_ACC:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+        case GGML_OP_CLAMP:
+        case GGML_OP_PAD:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_IM2COL:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_POOL_2D:
+        case GGML_OP_RWKV_WKV6:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(dev);
+}
+
+static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+
+    return buft_ctx->device->idx == ctx->device;
+}
+
+static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+
+    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+
+    UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
+    /* .get_name             = */ ggml_backend_vk_device_get_name,
+    /* .get_description      = */ ggml_backend_vk_device_get_description,
+    /* .get_memory           = */ ggml_backend_vk_device_get_memory,
+    /* .get_type             = */ ggml_backend_vk_device_get_type,
+    /* .get_props            = */ ggml_backend_vk_device_get_props,
+    /* .init_backend         = */ ggml_backend_vk_device_init,
+    /* .get_buffer_type      = */ ggml_backend_vk_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op          = */ ggml_backend_vk_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_vk_device_supports_buft,
+    /* .offload_op           = */ ggml_backend_vk_device_offload_op,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return GGML_VK_NAME;
+}
+
+static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return ggml_backend_vk_get_device_count();
+}
+
+static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+                ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
+                char desc[256];
+                ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
+                ctx->device = i;
+                ctx->name = GGML_VK_NAME + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface   = */ ggml_backend_vk_device_i,
+                    /* .reg     = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
+        }
+    }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
+    /* .get_name         = */ ggml_backend_vk_reg_get_name,
+    /* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_vk_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_vk_reg() {
+    static ggml_backend_reg reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_vk_reg_i,
+        /* .context     = */ nullptr,
+    };
+    try {
+        ggml_vk_instance_init();
+        return &reg;
+    } catch (const vk::SystemError& e) {
+        VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: System error: " << e.what());
+        return nullptr;
+    }
+}
+
+// Extension availability
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef GGML_VULKAN_VALIDATE
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
+#ifdef __APPLE__
+    bool portability_enumeration_ext = false;
+    // Check for portability enumeration extension for MoltenVK support
+    for (const auto& properties : instance_extensions) {
+        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
+            return true;
+        }
+    }
+    if (!portability_enumeration_ext) {
+        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
+    }
+#endif
+    return false;
+
+    UNUSED(instance_extensions);
+}
+
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
+    switch (props.vendorID) {
+    case VK_VENDOR_ID_INTEL:
+        // Intel drivers don't support coopmat properly yet
+        return false;
+    case VK_VENDOR_ID_AMD:
+        if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
+            // Workaround for AMD proprietary driver reporting support on all GPUs
+            const std::string name = props.deviceName;
+            return name.rfind("AMD Radeon RX 7", 0) == 0   || name.rfind("AMD Radeon(TM) RX 7", 0) == 0   || // RDNA 3 consumer GPUs
+                   name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
+                   name.rfind("AMD Radeon 7", 0) == 0      || name.rfind("AMD Radeon(TM) 7", 0) == 0;        // RDNA 3 APUs
+        }
+        return true;
+    default:
+        return true;
+    }
+}
+
+// checks
+
+#ifdef GGML_VULKAN_CHECK_RESULTS
+static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
+    if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
+        return;
+    }
+    for (int j = 0; j < level; j++) {
+        std::cerr << " ";
+    }
+    std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
+
+    done.push_back(tensor);
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (tensor->src[i] != nullptr) {
+            ggml_vk_print_graph_origin(tensor->src[i], done, level + 1);
+        }
+    }
+}
+
+static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
+    if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16 && tensor->type != GGML_TYPE_I32) {
+        return;
+    }
+    i0 = std::max(i0, 5);
+    i1 = std::max(i1, 5);
+    i2 = std::max(i2, 0);
+    i3 = std::max(i3, 0);
+    fprintf(stderr, "         ");
+    for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
+        fprintf(stderr, "%7d ", idx1);
+    }
+    fprintf(stderr, "\n");
+    for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) {
+        fprintf(stderr, "%7d: ", idx0);
+        for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
+            if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
+                float val;
+                if (tensor->type == GGML_TYPE_F32) {
+                    val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
+                } else if (tensor->type == GGML_TYPE_F16) {
+                    val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
+                } else if (tensor->type == GGML_TYPE_I32) {
+                    val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
+                } else {
+                    GGML_ABORT("fatal error");
+                }
+                fprintf(stderr, "% 7.2f ", val);
+            } else {
+                fprintf(stderr, "        ");
+            }
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
+    void * tensor_data = tensor->data;
+
+    const bool is_gpu = tensor->buffer != nullptr && ggml_backend_buffer_is_vk(tensor->buffer);
+
+    if (is_gpu) {
+        const size_t tensor_size = ggml_nbytes(tensor);
+        tensor_data = malloc(tensor_size);
+
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+
+        vk_buffer buffer_gpu = buf_ctx->dev_buffer;
+        ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size);
+    }
+
+    std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
+    std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
+    if (tensor->src[0] != nullptr) {
+        std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
+    }
+    if (tensor->src[1] != nullptr) {
+        std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
+    }
+    std::cerr << std::endl << "Result:" << std::endl;
+    ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
+    std::cerr << std::endl;
+    std::vector<const ggml_tensor *> done;
+    ggml_vk_print_graph_origin(tensor, done);
+
+    if (is_gpu) {
+        free(tensor_data);
+    }
+}
+
+void * comp_result;
+size_t comp_size;
+size_t comp_nb[GGML_MAX_DIMS];
+size_t check_counter = 0;
+static void ggml_vk_check_results_0(ggml_tensor * tensor) {
+    if (tensor->op == GGML_OP_TRANSPOSE) {
+        return;
+    }
+
+    check_counter++;
+    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
+        return;
+    }
+
+    VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
+
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    ggml_tensor * src2 = tensor->src[2];
+    ggml_tensor * src3 = tensor->src[3];
+
+    struct ggml_init_params iparams = {
+        /*.mem_size   =*/ 2ul*1024ul*1024ul*1024ul,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ggml_ctx = ggml_init(iparams);
+
+    struct ggml_tensor * src0_clone = nullptr;
+    struct ggml_tensor * src1_clone = nullptr;
+    struct ggml_tensor * src2_clone = nullptr;
+    struct ggml_tensor * src3_clone = nullptr;
+    struct ggml_tensor * tensor_clone = nullptr;
+
+    size_t src0_size;
+    size_t src1_size;
+    size_t src2_size;
+    size_t src3_size;
+
+    void * src0_buffer = nullptr;
+    void * src1_buffer = nullptr;
+    void * src2_buffer = nullptr;
+    void * src3_buffer = nullptr;
+
+    if (src0 != nullptr) {
+        src0_clone = ggml_dup_tensor(ggml_ctx, src0);
+
+        src0_size = ggml_nbytes(src0);
+
+        src0_buffer = malloc(src0_size);
+        src0_clone->data = src0_buffer;
+        if (ggml_backend_buffer_is_host(src0->buffer)) {
+            memcpy(src0_clone->data, src0->data, src0_size);
+            memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src0) + src0->view_offs;
+            if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
+                for (int i3 = 0; i3 < src0->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < src0->ne[2]; i2++) {
+                        const int idx = i3*src0->ne[2] + i2;
+                        ggml_vk_buffer_read(buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
+                    }
+                }
+
+                src0_clone->nb[0] = src0->nb[0];
+                src0_clone->nb[1] = src0->nb[1];
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1];
+                }
+            } else {
+                if (offset + src0_size >= buffer_gpu->size) {
+                    src0_size = buffer_gpu->size - offset;
+                }
+                ggml_vk_buffer_read(buffer_gpu, offset, src0_clone->data, src0_size);
+                memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            }
+        } else {
+            GGML_ABORT("fatal error");
+        }
+
+        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+            ggml_vk_print_tensor(src0, "src0");
+        }
+    }
+    if (src1 != nullptr) {
+        src1_clone = ggml_dup_tensor(ggml_ctx, src1);
+
+        src1_size = ggml_nbytes(src1);
+
+        src1_buffer = malloc(src1_size);
+        src1_clone->data = src1_buffer;
+        if (ggml_backend_buffer_is_host(src1->buffer)) {
+            memcpy(src1_clone->data, src1->data, src1_size);
+            memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src1) + src1->view_offs;
+            if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
+                for (int i3 = 0; i3 < src1->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < src1->ne[2]; i2++) {
+                        const int idx = i3*src1->ne[2] + i2;
+                        ggml_vk_buffer_read(buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
+                    }
+                }
+
+                src1_clone->nb[0] = src1->nb[0];
+                src1_clone->nb[1] = src1->nb[1];
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1];
+                }
+            } else {
+                if (offset + src1_size >= buffer_gpu->size) {
+                    src1_size = buffer_gpu->size - offset;
+                }
+                ggml_vk_buffer_read(buffer_gpu, offset, src1_clone->data, src1_size);
+                memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            }
+        } else {
+            GGML_ABORT("fatal error");
+        }
+
+        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+            ggml_vk_print_tensor(src1, "src1");
+        }
+    }
+    if (src2 != nullptr) {
+        src2_clone = ggml_dup_tensor(ggml_ctx, src2);
+
+        src2_size = ggml_nbytes(src2);
+
+        src2_buffer = malloc(src2_size);
+        src2_clone->data = src2_buffer;
+        if (ggml_backend_buffer_is_host(src2->buffer)) {
+            memcpy(src2_clone->data, src2->data, src2_size);
+            memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src2) + src2->view_offs;
+            if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
+                for (int i3 = 0; i3 < src2->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < src2->ne[2]; i2++) {
+                        const int idx = i3*src2->ne[2] + i2;
+                        ggml_vk_buffer_read(buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
+                    }
+                }
+
+                src2_clone->nb[0] = src2->nb[0];
+                src2_clone->nb[1] = src2->nb[1];
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
+                }
+            } else {
+                if (offset + src2_size >= buffer_gpu->size) {
+                    src2_size = buffer_gpu->size - offset;
+                }
+                ggml_vk_buffer_read(buffer_gpu, offset, src2_clone->data, src2_size);
+                memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            }
+        } else {
+            GGML_ABORT("fatal error");
+        }
+
+        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+            ggml_vk_print_tensor(src2, "src2");
+        }
+    }
+    if (src3 != nullptr) {
+        src3_clone = ggml_dup_tensor(ggml_ctx, src3);
+
+        src3_size = ggml_nbytes(src3);
+
+        src3_buffer = malloc(src3_size);
+        src3_clone->data = src3_buffer;
+        if (ggml_backend_buffer_is_host(src3->buffer)) {
+            memcpy(src3_clone->data, src3->data, src3_size);
+            memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS);
+        } else if (ggml_backend_buffer_is_vk(src3->buffer)) {
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src3->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src3) + src3->view_offs;
+            if (!ggml_is_contiguous(src3) && ggml_vk_dim01_contiguous(src3)) {
+                for (int i3 = 0; i3 < src3->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < src3->ne[2]; i2++) {
+                        const int idx = i3*src3->ne[2] + i2;
+                        ggml_vk_buffer_read(buffer_gpu, offset + idx * src3->nb[2], ((char *)src3_clone->data + idx * src3_clone->nb[2]), src3->ne[1] * src3->nb[1]);
+                    }
+                }
+
+                src3_clone->nb[0] = src3->nb[0];
+                src3_clone->nb[1] = src3->nb[1];
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    src3_clone->nb[i] = src3_clone->nb[i - 1]*src3_clone->ne[i - 1];
+                }
+            } else {
+                if (offset + src3_size >= buffer_gpu->size) {
+                    src3_size = buffer_gpu->size - offset;
+                }
+                ggml_vk_buffer_read(buffer_gpu, offset, src3_clone->data, src3_size);
+                memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS);
+            }
+        } else {
+            GGML_ABORT("fatal error");
+        }
+
+        if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+            ggml_vk_print_tensor(src3, "src3");
+        }
+    }
+
+    if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
+        const float *params = (const float *)tensor->op_params;
+        tensor_clone = ggml_flash_attn_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, src3_clone, params[0], params[1], params[2]);
+    } else if (tensor->op == GGML_OP_MUL_MAT) {
+        tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_MUL_MAT_ID) {
+        tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone);
+    } else if (tensor->op == GGML_OP_MUL) {
+        tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_DIV) {
+        tensor_clone = ggml_div(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_CONCAT) {
+        tensor_clone = ggml_concat(ggml_ctx, src0_clone, src1_clone, *(int *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_UPSCALE) {
+        tensor_clone = ggml_upscale_ext(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    } else if (tensor->op == GGML_OP_SCALE) {
+        tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
+    } else if (tensor->op == GGML_OP_SQR) {
+        tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_SIN) {
+        tensor_clone = ggml_sin(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_COS) {
+        tensor_clone = ggml_cos(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_CLAMP) {
+        tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+    } else if (tensor->op == GGML_OP_PAD) {
+        tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
+    } else if (tensor->op == GGML_OP_REPEAT) {
+        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor);
+    } else if (tensor->op == GGML_OP_ADD) {
+        tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_ACC) {
+        tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
+    } else if (tensor->op == GGML_OP_NORM) {
+        tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_GROUP_NORM) {
+        tensor_clone = ggml_group_norm(ggml_ctx, src0_clone, *(int *)tensor->op_params, ((float *)tensor->op_params)[1]);
+    } else if (tensor->op == GGML_OP_RMS_NORM) {
+        tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_SOFT_MAX) {
+        if (src1 != nullptr) {
+            tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+        } else {
+            tensor_clone = ggml_soft_max(ggml_ctx, src0_clone);
+        }
+    } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
+        tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(int *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_ROPE) {
+        const int n_dims      = ((int32_t *) tensor->op_params)[1];
+        const int mode        = ((int32_t *) tensor->op_params)[2];
+        //const int n_ctx_ggml       = ((int32_t *) tensor->op_params)[3];
+        const int n_ctx_orig_ggml  = ((int32_t *) tensor->op_params)[4];
+        const float freq_base       = ((float *) tensor->op_params)[5];
+        const float freq_scale      = ((float *) tensor->op_params)[6];
+        const float ext_factor      = ((float *) tensor->op_params)[7];
+        const float attn_factor     = ((float *) tensor->op_params)[8];
+        const float beta_fast       = ((float *) tensor->op_params)[9];
+        const float beta_slow       = ((float *) tensor->op_params)[10];
+        tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    } else if (tensor->op == GGML_OP_UNARY) {
+        switch (ggml_get_unary_op(tensor)) {
+        case GGML_UNARY_OP_SILU:
+            tensor_clone = ggml_silu(ggml_ctx, src0_clone);
+            break;
+        case GGML_UNARY_OP_GELU:
+            tensor_clone = ggml_gelu(ggml_ctx, src0_clone);
+            break;
+        case GGML_UNARY_OP_GELU_QUICK:
+            tensor_clone = ggml_gelu_quick(ggml_ctx, src0_clone);
+            break;
+        case GGML_UNARY_OP_RELU:
+            tensor_clone = ggml_relu(ggml_ctx, src0_clone);
+            break;
+        case GGML_UNARY_OP_TANH:
+            tensor_clone = ggml_tanh(ggml_ctx, src0_clone);
+            break;
+        default:
+            std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
+            GGML_ABORT("fatal error");
+        }
+    } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
+        if (src1 == nullptr) {
+            tensor_clone = ggml_dup(ggml_ctx, src0_clone);
+            tensor_clone->type = tensor->type;
+        } else {
+            tensor_clone = ggml_cpy(ggml_ctx, src0_clone, src1_clone);
+        }
+    } else if (tensor->op == GGML_OP_CONT) {
+        tensor_clone = ggml_cont_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    } else if (tensor->op == GGML_OP_RESHAPE) {
+        tensor_clone = ggml_reshape_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+    } else if (tensor->op == GGML_OP_VIEW) {
+        tensor_clone = ggml_view_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
+    } else if (tensor->op == GGML_OP_PERMUTE) {
+        int32_t * params = (int32_t *)tensor->op_params;
+        tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
+    } else if (tensor->op == GGML_OP_TRANSPOSE) {
+        tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_GET_ROWS) {
+        tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_ARGSORT) {
+        tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params);
+    } else if (tensor->op == GGML_OP_SUM_ROWS) {
+        tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
+    } else if (tensor->op == GGML_OP_IM2COL) {
+        const int32_t s0 = tensor->op_params[0];
+        const int32_t s1 = tensor->op_params[1];
+        const int32_t p0 = tensor->op_params[2];
+        const int32_t p1 = tensor->op_params[3];
+        const int32_t d0 = tensor->op_params[4];
+        const int32_t d1 = tensor->op_params[5];
+
+        const bool is_2D = tensor->op_params[6] == 1;
+        tensor_clone = ggml_im2col(ggml_ctx, src0_clone, src1_clone, s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+    } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+        const int32_t dim = tensor->op_params[0];
+        const int32_t max_period = tensor->op_params[1];
+        tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+    } else if (tensor->op == GGML_OP_POOL_2D) {
+        enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
+        const int32_t k0 = tensor->op_params[1];
+        const int32_t k1 = tensor->op_params[2];
+        const int32_t s0 = tensor->op_params[3];
+        const int32_t s1 = tensor->op_params[4];
+        const int32_t p0 = tensor->op_params[5];
+        const int32_t p1 = tensor->op_params[6];
+
+        tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
+    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+        const float * op_params = (const float *)tensor->op_params;
+        tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
+    } else if (tensor->op == GGML_OP_RWKV_WKV6) {
+        tensor_clone = ggml_rwkv_wkv6(ggml_ctx, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3],
+        tensor->src[4], tensor->src[5]);
+    }
+    else {
+        std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
+        GGML_ABORT("fatal error");
+    }
+
+    ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
+    ggml_build_forward_expand(cgraph, tensor_clone);
+
+    ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
+
+    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+        ggml_vk_print_tensor(tensor_clone, "tensor_clone");
+    }
+
+    comp_size = ggml_nbytes(tensor_clone);
+
+    comp_result = malloc(comp_size);
+    memcpy(comp_result, tensor_clone->data, comp_size);
+    memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);
+
+    if (src0 != nullptr) {
+        free(src0_buffer);
+    }
+    if (src1 != nullptr) {
+        free(src1_buffer);
+    }
+
+    ggml_free(ggml_ctx);
+
+    VK_LOG_DEBUG("END ggml_vk_check_results_0(" << tensor->name << ")");
+}
+
+static void ggml_vk_check_results_1(ggml_tensor * tensor) {
+    if (tensor->op == GGML_OP_TRANSPOSE) {
+        return;
+    }
+    if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
+        return;
+    }
+
+    VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")");
+
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    ggml_tensor * src2 = tensor->src[2];
+    ggml_tensor * src3 = tensor->src[3];
+
+    void * tensor_data = tensor->data;
+
+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+        size_t tensor_size = ggml_nbytes(tensor);
+        tensor_data = malloc(tensor_size);
+
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
+
+        vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+        uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs;
+        if (offset + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - offset;
+        }
+
+        ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size);
+    }
+
+    float first_error_result = -1.0f;
+    float first_error_correct = -1.0f;
+    std::array<int, 4> first_error = { -1, -1, -1, -1 };
+    double avg_err = 0.0;
+    size_t counter = 0;
+
+    for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
+        for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                    const bool buffer_size_fit = i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0] < comp_size;
+                    float correct = 0.0f;
+                    float result = 0.0f;
+
+                    if (buffer_size_fit) {
+                        if (tensor->type == GGML_TYPE_F32) {
+                            correct = *(float *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
+                            result  = *(float *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
+                        } else if (tensor->type == GGML_TYPE_F16) {
+                            correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
+                            result  = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
+                        } else if (tensor->type == GGML_TYPE_I32) {
+                            correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
+                            result  = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
+                        } else {
+                            std::cerr << "Results check not implemented for type " << ggml_type_name(tensor->type) << std::endl;
+                        }
+                    } else {
+                        std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
+                        GGML_ABORT("fatal error");
+                    }
+
+                    if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
+                        std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
+                        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
+                        if (src0 != nullptr) {
+                            std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
+                        }
+                        if (src1 != nullptr) {
+                            std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
+                        }
+                        if (src2 != nullptr) {
+                            std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
+                        }
+                        if (src3 != nullptr) {
+                            std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+                        }
+                        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
+                        std::cerr << std::endl << "Result:" << std::endl;
+                        ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
+                        std::cerr << std::endl << "Correct:" << std::endl;
+                        ggml_vk_print_tensor_area(tensor, comp_result, i0, i1, i2, i3);
+                        std::cerr << std::endl;
+                        std::vector<const ggml_tensor *> done;
+                        ggml_vk_print_graph_origin(tensor, done);
+                        GGML_ABORT("fatal error");
+                    }
+                    if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
+                        first_error[0] = i0;
+                        first_error[1] = i1;
+                        first_error[2] = i2;
+                        first_error[3] = i3;
+                        first_error_result = result;
+                        first_error_correct = correct;
+                    }
+
+                    // Special case, value is infinite, avoid NaN result in avg_err
+                    // NaN also appears in results, if both are nan error is 0
+                    if (!std::isinf(correct) && !std::isinf(result) && !std::isnan(correct) && !std::isnan(result)) {
+                        avg_err += std::fabs(correct - result);
+                    }
+                    counter++;
+                }
+            }
+        }
+    }
+
+    avg_err /= counter;
+
+    if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+        std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
+        if (src0 != nullptr) {
+            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
+        }
+        if (src1 != nullptr) {
+            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
+        }
+        if (src2 != nullptr) {
+            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
+        }
+        if (src3 != nullptr) {
+            std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+        }
+        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
+        std::cerr << std::endl << "Result:" << std::endl;
+        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
+        std::cerr << std::endl << "Correct:" << std::endl;
+        ggml_vk_print_tensor_area(tensor, comp_result, 5, 5, 0, 0);
+        std::cerr << std::endl;
+        std::vector<const ggml_tensor *> done;
+        ggml_vk_print_graph_origin(tensor, done);
+    }
+
+    if (avg_err > 0.05 || std::isnan(avg_err)) {
+        std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
+        std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
+        if (src0 != nullptr) {
+            std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
+        }
+        if (src1 != nullptr) {
+            std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
+        }
+        if (src2 != nullptr) {
+            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
+        }
+        if (src3 != nullptr) {
+            std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+        }
+        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
+        std::cerr << std::endl << "Result:" << std::endl;
+        ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
+        std::cerr << std::endl << "Correct:" << std::endl;
+        ggml_vk_print_tensor_area(tensor, comp_result, first_error[0], first_error[1], first_error[2], first_error[3]);
+        std::cerr << std::endl;
+        std::vector<const ggml_tensor *> done;
+        ggml_vk_print_graph_origin(tensor, done);
+        GGML_ABORT("fatal error");
+    } else {
+        std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
+    }
+
+    free(comp_result);
+    comp_result = nullptr;
+    comp_size = 0;
+
+    if (ggml_backend_buffer_is_vk(tensor->buffer)) {
+        free(tensor_data);
+    }
+
+    VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
+}
+#endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
new file mode 100644
index 000000000..074031087
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -0,0 +1,11 @@
+find_package (Threads REQUIRED)
+find_program(GLSLC_EXECUTABLE glslc)
+if(NOT GLSLC_EXECUTABLE)
+    message(FATAL_ERROR "glslc not found.")
+endif()
+
+set(TARGET vulkan-shaders-gen)
+add_executable(${TARGET} vulkan-shaders-gen.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
new file mode 100644
index 000000000..d896f1ef0
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
@@ -0,0 +1,29 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.x;
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint offset = p.param3;
+    const uint src1_i = idx - offset;
+    const uint oz = src1_i / p.nb02;
+    const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
+    const uint ox = src1_i % p.nb01;
+
+    uint i00, i01, i02, i03;
+    get_indices(idx, i00, i01, i02, i03);
+
+    if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
+    } else {
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
+    }
+}
+
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
new file mode 100644
index 000000000..2b4085c4f
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
@@ -0,0 +1,29 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+const uint num_threads = 256;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 2;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        uint i00, i01, i02, i03;
+        get_indices(idx, i00, i01, i02, i03);
+
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
+
+        idx += num_threads;
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
new file mode 100644
index 000000000..d4fa45b1e
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp
@@ -0,0 +1,69 @@
+#version 450
+
+#include "types.comp"
+
+#define BLOCK_SIZE 1024
+#define ASC 0
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1)          buffer D {int data_d[];};
+
+layout (push_constant) uniform parameter {
+    uint ncols;
+    uint ncols_pad;
+    uint order;
+} p;
+
+shared int dst_row[BLOCK_SIZE];
+
+void swap(uint idx0, uint idx1) {
+    int tmp = dst_row[idx0];
+    dst_row[idx0] = dst_row[idx1];
+    dst_row[idx1] = tmp;
+}
+
+void main() {
+    // bitonic sort
+    const int col = int(gl_LocalInvocationID.x);
+    const uint row = gl_WorkGroupID.y;
+
+    const uint row_offset = row * p.ncols;
+
+    // initialize indices
+    if (col < p.ncols_pad) {
+        dst_row[col] = col;
+    }
+    barrier();
+
+    for (uint k = 2; k <= p.ncols_pad; k *= 2) {
+        for (uint j = k / 2; j > 0; j /= 2) {
+            const uint ixj = col ^ j;
+            if (col < p.ncols_pad && ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= p.ncols ||
+                        (dst_row[ixj] < p.ncols && (p.order == ASC ?
+                            data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]] :
+                            data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]]))
+                    ) {
+                        swap(col, ixj);
+                    }
+                } else {
+                    if (dst_row[ixj] >= p.ncols ||
+                        (dst_row[col] < p.ncols && (p.order == ASC ?
+                            data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]] :
+                            data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]]))
+                    ) {
+                        swap(col, ixj);
+                    }
+                }
+            }
+            barrier();
+        }
+    }
+
+    if (col < p.ncols) {
+        data_d[row_offset + col] = dst_row[col];
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
new file mode 100644
index 000000000..1e5cb8dae
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp
@@ -0,0 +1,17 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp b/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
new file mode 100644
index 000000000..9ee2f1fae
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp
@@ -0,0 +1,41 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+    const int dim = p.param3;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
+    const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
+    const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
+    const uint i2_offset = i2*p.ne21*p.ne20;
+    const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
+    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
+
+    uint o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
+
+    const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
+    const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
+    const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
+
+    const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
+#else
+    if (is_src0) {
+        data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
+    } else {
+        data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
+    }
+#endif
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
new file mode 100644
index 000000000..dd828c232
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp
@@ -0,0 +1,42 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+#extension GL_EXT_control_flow_attributes : require
+
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    // fast path for when all four iterations are in-bounds
+    if (idx + (num_iter-1)*num_threads < p.ne) {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
+#else
+            data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
+#endif
+            idx += num_threads;
+        }
+    } else {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+            if (idx >= p.ne) {
+                continue;
+            }
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
+#else
+            data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
+#endif
+            idx += num_threads;
+        }
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
new file mode 100644
index 000000000..29c906494
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp
@@ -0,0 +1,20 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+#else
+    data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
+#endif
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp
new file mode 100644
index 000000000..9c9fe9626
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp
@@ -0,0 +1,51 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+#include "dequant_funcs.comp"
+
+#if defined(DATA_A_IQ4_NL)
+// 16 invocations needed for init_iq4nl_shmem
+layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
+#else
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+#endif
+
+void main() {
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+    if (gl_LocalInvocationIndex.x != 0) {
+        return;
+    }
+#endif
+
+    const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    uint dst_idx = get_doffset() + dst_idx(idx);
+    uint src_idx = src0_idx_quant(idx, QUANT_K);
+
+    const uint a_offset = 0;
+    const uint ib = src_idx;
+    const vec2 dm = get_dm(ib, a_offset);
+
+    [[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
+        vec4 v = dequantize4(ib, j / QUANT_R, a_offset);
+        v = v * dm.x + vec4(dm.y);
+
+#if QUANT_R == 2
+        data_d[dst_idx + j/2 +             0] = v[0];
+        data_d[dst_idx + j/2 + QUANT_K/2 + 0] = v[1];
+        data_d[dst_idx + j/2 +             1] = v[2];
+        data_d[dst_idx + j/2 + QUANT_K/2 + 1] = v[3];
+#else
+        data_d[dst_idx + j + 0] = v[0];
+        data_d[dst_idx + j + 1] = v[1];
+        data_d[dst_idx + j + 2] = v[2];
+        data_d[dst_idx + j + 3] = v[3];
+#endif
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
new file mode 100644
index 000000000..660811086
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
@@ -0,0 +1,237 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+#if defined(DATA_A_IQ4_NL)
+// 16 invocations needed for init_iq4nl_shmem
+layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
+#else
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+#endif
+
+layout (binding = 0) readonly buffer S {float data_s[];};
+layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];};
+
+#if defined(DATA_A_Q4_0)
+void quantize(uint dst_idx, uint src_idx)
+{
+    float amax = 0.0;
+    float vmax = 0.0;
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q4_0; ++j) {
+        const float v = data_s[src_idx + j];
+        if (amax < abs(v)) {
+            amax = abs(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = (d != 0.0) ? 1.0/d : 0.0;
+
+    data_q[dst_idx].d = float16_t(d);
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q4_0/2; ++j) {
+        const float x0 = data_s[src_idx + 0              + j]*id;
+        const float x1 = data_s[src_idx + QUANT_K_Q4_0/2 + j]*id;
+
+        const uint xi0 = min(15, int(x0 + 8.5));
+        const uint xi1 = min(15, int(x1 + 8.5));
+
+        data_q[dst_idx].qs[j]  = uint8_t(xi0 | (xi1 << 4));
+    }
+}
+#endif
+
+#if defined(DATA_A_Q4_1)
+void quantize(uint dst_idx, uint src_idx)
+{
+    float vmin = 1.0/0.0;
+    float vmax = -vmin;
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q4_1; ++j) {
+        const float v = data_s[src_idx + j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = (d != 0.0) ? 1.0/d : 0.0;
+
+    data_q[dst_idx].d = float16_t(d);
+    data_q[dst_idx].m = float16_t(vmin);
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q4_1/2; ++j) {
+        const float x0 = (data_s[src_idx + 0              + j] - vmin)*id;
+        const float x1 = (data_s[src_idx + QUANT_K_Q4_1/2 + j] - vmin)*id;
+
+        const uint xi0 = min(15, int(x0 + 0.5));
+        const uint xi1 = min(15, int(x1 + 0.5));
+
+        data_q[dst_idx].qs[j]  = uint8_t(xi0 | (xi1 << 4));
+    }
+}
+#endif
+
+#if defined(DATA_A_Q5_0)
+void quantize(uint dst_idx, uint src_idx)
+{
+    float amax = 0.0;
+    float vmax = 0.0;
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q5_0; ++j) {
+        const float v = data_s[src_idx + j];
+        if (amax < abs(v)) {
+            amax = abs(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -16;
+    const float id = (d != 0.0) ? 1.0/d : 0.0;
+
+    data_q[dst_idx].d = float16_t(d);
+
+    uint32_t qh = 0;
+    [[unroll]] for (int j = 0; j < QUANT_K_Q5_0/2; ++j) {
+        const float x0 = data_s[src_idx + 0              + j]*id;
+        const float x1 = data_s[src_idx + QUANT_K_Q5_0/2 + j]*id;
+
+        const uint xi0 = min(31, int(x0 + 16.5));
+        const uint xi1 = min(31, int(x1 + 16.5));
+
+        data_q[dst_idx].qs[j]  = uint8_t((xi0 & 0xf) | ((xi1 & 0xf) << 4));
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QUANT_K_Q5_0/2);
+    }
+    data_q[dst_idx].qh[0] = uint16_t(qh & 0xFFFF);
+    data_q[dst_idx].qh[1] = uint16_t(qh >> 16);
+}
+#endif
+
+#if defined(DATA_A_Q5_1)
+void quantize(uint dst_idx, uint src_idx)
+{
+    float min = data_s[src_idx + 0];
+    float max = min;
+
+    [[unroll]] for (int j = 1; j < QUANT_K_Q5_1; ++j) {
+        const float v = data_s[src_idx + j];
+        min = v < min ? v : min;
+        max = v > max ? v : max;
+    }
+
+    const float d  = (max - min) / 31;
+    const float id = (d != 0) ? 1.0/d : 0.0;
+
+    data_q[dst_idx].d = float16_t(d);
+    data_q[dst_idx].m = float16_t(min);
+
+    uint32_t qh = 0;
+    [[unroll]] for (int j = 0; j < QUANT_K_Q5_1/2; ++j) {
+        const float x0 = (data_s[src_idx + 0              + j] - min)*id;
+        const float x1 = (data_s[src_idx + QUANT_K_Q5_1/2 + j] - min)*id;
+
+        const uint xi0 = uint(x0 + 0.5);
+        const uint xi1 = uint(x1 + 0.5);
+
+        data_q[dst_idx].qs[j]  = uint8_t((xi0 & 0xf) | ((xi1 & 0xf) << 4));
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QUANT_K_Q5_1/2);
+    }
+    data_q[dst_idx].qh = qh;
+}
+#endif
+
+#if defined(DATA_A_Q8_0)
+void quantize(uint dst_idx, uint src_idx)
+{
+    float amax = 0.0; // absolute max
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q8_0; j++) {
+        const float v = data_s[src_idx + j];
+        amax = max(amax, abs(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = (d != 0.0) ? 1.0/d : 0.0;
+
+    data_q[dst_idx].d = float16_t(d);
+
+    [[unroll]] for (int j = 0; j < QUANT_K_Q8_0; ++j) {
+        const float x0 = data_s[src_idx + j]*id;
+
+        data_q[dst_idx].qs[j] = int8_t(round(x0));
+    }
+}
+#endif
+
+#if defined(DATA_A_IQ4_NL)
+uint best_index(float x) {
+    if (x <= kvalues_iq4nl[0]) return 0;
+    if (x >= kvalues_iq4nl[15]) return 15;
+    int ml = 0, mu = 15;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < kvalues_iq4nl[mav]) mu = mav; else ml = mav;
+    }
+    return x - kvalues_iq4nl[mu-1] < kvalues_iq4nl[mu] - x ? mu-1 : mu;
+}
+
+void quantize(uint dst_idx, uint src_idx)
+{
+    float amax = 0.0;
+    float vmax = 0.0;
+
+    [[unroll]] for (int j = 0; j < QUANT_K_IQ4_NL; ++j) {
+        const float v = data_s[src_idx + j];
+        if (amax < abs(v)) {
+            amax = abs(v);
+            vmax = v;
+        }
+    }
+
+    float d = vmax / kvalues_iq4nl[0];
+    const float id = (d != 0.0) ? 1.0/d : 0.0;
+
+    float sumqx = 0, sumq2 = 0;
+    [[unroll]] for (int j = 0; j < QUANT_K_IQ4_NL/2; ++j) {
+        const float x0 = data_s[src_idx + 0                + j]*id;
+        const float x1 = data_s[src_idx + QUANT_K_IQ4_NL/2 + j]*id;
+        const uint xi0 = best_index(x0);
+        const uint xi1 = best_index(x1);
+        data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4));
+        const float v0 = kvalues_iq4nl[xi0];
+        const float v1 = kvalues_iq4nl[xi1];
+        const float w0 = data_s[src_idx + 0                + j]*data_s[src_idx + 0                + j];
+        const float w1 = data_s[src_idx + QUANT_K_IQ4_NL/2 + j]*data_s[src_idx + QUANT_K_IQ4_NL/2 + j];
+        sumqx += w0*v0*data_s[src_idx + j] + w1*v1*data_s[src_idx + QUANT_K_IQ4_NL/2 + j];
+        sumq2 += w0*v0*v0 + w1*v1*v1;
+    }
+
+    data_q[dst_idx].d = float16_t(sumq2 > 0 ? sumqx/sumq2 : d);
+
+}
+#endif
+
+void main() {
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+    if (gl_LocalInvocationIndex.x != 0) {
+        return;
+    }
+#endif
+
+    const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    uint dst_idx = dst_idx_quant(idx, QUANT_K);
+    uint src_idx = get_aoffset() + src0_idx(idx);
+
+    quantize(dst_idx, src_idx);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
new file mode 100644
index 000000000..0b8d02f58
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp
@@ -0,0 +1,17 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp
new file mode 100644
index 000000000..a4d3fca55
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp
@@ -0,0 +1,20 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {float data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.x * 16;
+
+    if (i >= p.nel) {
+        return;
+    }
+
+    [[unroll]] for (uint l = 0; l < 16; l++) {
+        data_b[i + l] = D_TYPE(data_a[i + l]);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
new file mode 100644
index 000000000..ecfdbfaa8
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
@@ -0,0 +1,370 @@
+#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#endif
+
+#include "types.comp"
+
+#if defined(A_TYPE_PACKED16)
+layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
+#endif
+#if defined(A_TYPE_PACKED32)
+layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
+#endif
+
+#if defined(DATA_A_F32)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
+}
+#endif
+
+#if defined(DATA_A_F16)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
+}
+#endif
+
+#if defined(DATA_A_Q4_0)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return (vec2(vui & 0xF, vui >> 4) - 8.0f);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
+}
+#endif
+
+#if defined(DATA_A_Q4_1)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return vec2(vui & 0xF, vui >> 4);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12);
+}
+#endif
+
+#if defined(DATA_A_Q5_0)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
+    const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
+    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f);
+}
+#endif
+
+#if defined(DATA_A_Q5_1)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint uint_qh = data_a[a_offset + ib].qh;
+    const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint uint_qh = data_a_packed16[a_offset + ib].qh;
+    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y);
+}
+#endif
+
+#if defined(DATA_A_Q8_0)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
+    uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
+    return vec4(int8_t(v0 & 0xFF), int8_t(v0 >> 8), int8_t(v1 & 0xFF), int8_t(v1 >> 8));
+}
+#endif
+
+#if defined(DATA_A_IQ2_XXS)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint ib8 = (iqs / 8) % 4;
+    const uint qs = data_a[a_offset + ib].qs[8 * ib32 + ib8];
+    // Scales are stored as packed 7+7+7+7+4 bits (4 sign tuples and 1 int4 scale)
+    const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[4 * ib32 + 2],
+        data_a_packed16[a_offset + ib].qs[4 * ib32 + 3]));
+    const float db = 0.25 * (0.5 + (signs >> 28));
+    const uint sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
+    // Add parity bit
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8);
+    const u8vec4 grid = unpack8(iq2xxs_grid[qs][(iqs % 8) / 4] >> (8 * (iqs % 4)));
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    return db * vec2(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0)
+    );
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint ib8 = (iqs / 8) % 4;
+    const uint qs = data_a[a_offset + ib].qs[8 * ib32 + ib8];
+    // Scales are stored as packed 7+7+7+7+4 bits (4 sign tuples and 1 int4 scale)
+    const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[4 * ib32 + 2],
+        data_a_packed16[a_offset + ib].qs[4 * ib32 + 3]));
+    const float db = 0.25 * (0.5 + (signs >> 28));
+    const uint sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
+    // Add parity bit
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8);
+    const u8vec4 grid = unpack8(iq2xxs_grid[qs][(iqs % 8) / 4] >> (8 * (iqs % 4)));
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    bool sign2 = (sign & 4) != 0;
+    bool sign3 = (sign & 8) != 0;
+    return db * vec4(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0),
+        grid.z * (sign2 ? -1.0 : 1.0),
+        grid.w * (sign3 ? -1.0 : 1.0)
+    );
+}
+#endif
+
+#if defined(DATA_A_IQ2_XS)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint scale = (data_a[a_offset + ib].scales[iqs / 32] >> (4 * ((iqs / 16) & 1))) & 0xf;
+    const uint qs = data_a[a_offset + ib].qs[iqs / 8];
+    const float db = 0.25 * (0.5 + scale);
+    const uint sign7 = qs >> 9;
+    // Add parity bit
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8);
+    const u8vec4 grid = unpack8(iq2xs_grid[qs & 511][(iqs % 8) / 4] >> (8 * (iqs % 4)));
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    return db * vec2(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0)
+    );
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint scale = (data_a[a_offset + ib].scales[iqs / 32] >> (4 * ((iqs / 16) & 1))) & 0xf;
+    const uint qs = data_a[a_offset + ib].qs[iqs / 8];
+    const float db = 0.25 * (0.5 + scale);
+    const uint sign7 = qs >> 9;
+    // Add parity bit
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8);
+    const u8vec4 grid = unpack8(iq2xs_grid[qs & 511][(iqs % 8) / 4] >> (8 * (iqs % 4)));
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    bool sign2 = (sign & 4) != 0;
+    bool sign3 = (sign & 8) != 0;
+    return db * vec4(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0),
+        grid.z * (sign2 ? -1.0 : 1.0),
+        grid.w * (sign3 ? -1.0 : 1.0)
+    );
+}
+#endif
+
+#if defined(DATA_A_IQ2_S)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint ib8 = iqs / 8;
+
+    const uint scale = (data_a[a_offset + ib].scales[ib32] >> (4 * ((iqs / 16) & 1))) & 0xf;
+    const uint qs = data_a[a_offset + ib].qs[ib8];
+    const uint qh = data_a[a_offset + ib].qh[ib32];
+    const uint qhshift = 2 * (ib8 % 4);
+    const uint sign = data_a[a_offset + ib].qs[QUANT_K / 8 + ib8] >> (iqs % 8);
+
+    const float db = 0.25 * (0.5 + scale);
+    const u8vec4 grid = unpack8(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(iqs % 8) / 4]);
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    return db * vec2(
+        grid[iqs % 4] * (sign0 ? -1.0 : 1.0),
+        grid[(iqs % 4) + 1] * (sign1 ? -1.0 : 1.0)
+    );
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint ib8 = iqs / 8;
+
+    const uint scale = (data_a[a_offset + ib].scales[ib32] >> (4 * ((iqs / 16) & 1))) & 0xf;
+    const uint qs = data_a[a_offset + ib].qs[ib8];
+    const uint qh = data_a[a_offset + ib].qh[ib32];
+    const uint qhshift = 2 * (ib8 % 4);
+    const uint sign = data_a[a_offset + ib].qs[QUANT_K / 8 + ib8] >> (iqs % 8);
+
+    const float db = 0.25 * (0.5 + scale);
+    const u8vec4 grid = unpack8(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(iqs % 8) / 4]);
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    bool sign2 = (sign & 4) != 0;
+    bool sign3 = (sign & 8) != 0;
+    return db * vec4(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0),
+        grid.z * (sign2 ? -1.0 : 1.0),
+        grid.w * (sign3 ? -1.0 : 1.0)
+    );
+}
+#endif
+
+#if defined(DATA_A_IQ3_XXS)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint ib4 = iqs / 4;
+    const uint ib32 = iqs / 32;
+    const uint is = QUANT_K / 4 + 4 * ib32;
+    const uint qs = data_a[a_offset + ib].qs[ib4];
+    // Scales are stored as packed 7+7+7+7+4 bits (4 sign tuples and 1 int4 scale)
+    const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[is / 2],
+        data_a_packed16[a_offset + ib].qs[is / 2 + 1]));
+    const float db = 0.5 * (0.5 + (signs >> 28));
+    const uint sign7 = bitfieldExtract(signs, 7 * (int(ib4 / 2) % 4), 7);
+    // Add parity bit
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8);
+    const u8vec4 grid = unpack8(iq3xxs_grid[qs] >> (8 * (iqs % 4)));
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    return db * vec2(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0)
+    );
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint ib4 = iqs / 4;
+    const uint ib32 = iqs / 32;
+    const uint is = QUANT_K / 4 + 4 * ib32;
+    const uint qs = data_a[a_offset + ib].qs[ib4];
+    const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[is / 2],
+        data_a_packed16[a_offset + ib].qs[is / 2 + 1]));
+    const float db = 0.5 * (0.5 + (signs >> 28));
+    const uint sign7 = bitfieldExtract(signs, 7 * (int(ib4 / 2) % 4), 7);
+    // Add parity bit
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8);
+    const u8vec4 grid = unpack8(iq3xxs_grid[qs]);
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    bool sign2 = (sign & 4) != 0;
+    bool sign3 = (sign & 8) != 0;
+    return db * vec4(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0),
+        grid.z * (sign2 ? -1.0 : 1.0),
+        grid.w * (sign3 ? -1.0 : 1.0)
+    );
+}
+#endif
+
+#if defined(DATA_A_IQ3_S)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint qs = data_a[a_offset + ib].qs[iqs / 4];
+    const uint qh = data_a[a_offset + ib].qh[iqs / 32];
+    const uint sign = data_a[a_offset + ib].signs[iqs / 8] >> (iqs % 8);
+    const uint scale = data_a[a_offset + ib].scales[iqs / 64];
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    const float db = 1 + 2 * ((scale >> (4 * ((iqs / 32) & 1))) & 0xf);
+    const uint32_t grid = iq3s_grid[qs | ((qh << (8 - ((iqs / 4) % 8))) & 256)] >> (8 * (iqs % 4));
+    return db * vec2(
+        int(grid & 0xFF) * (sign0 ? -1.0 : 1.0),
+        int((grid >> 8) & 0xFF) * (sign1 ? -1.0 : 1.0)
+    );
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint ib4 = iqs / 4;
+    const uint ib32 = iqs / 32;
+    const uint qs = data_a[a_offset + ib].qs[ib4];
+    const uint qh = data_a[a_offset + ib].qh[ib32];
+    const uint sign = data_a[a_offset + ib].signs[iqs / 8] >> (iqs % 8);
+    const uint scale = data_a[a_offset + ib].scales[ib32 / 2];
+    bool sign0 = (sign & 1) != 0;
+    bool sign1 = (sign & 2) != 0;
+    bool sign2 = (sign & 4) != 0;
+    bool sign3 = (sign & 8) != 0;
+    const float db = 1 + 2 * ((scale >> (4 * (ib32 & 1))) & 0xf);
+    const uint32_t grid = iq3s_grid[qs | ((qh << (8 - ib4 % 8)) & 256)] >> (8 * (iqs % 4));
+    return db * vec4(
+        int(grid & 0xFF) * (sign0 ? -1.0 : 1.0),
+        int((grid >> 8) & 0xFF) * (sign1 ? -1.0 : 1.0),
+        int((grid >> 16) & 0xFF) * (sign2 ? -1.0 : 1.0),
+        int((grid >> 24) & 0xFF) * (sign3 ? -1.0 : 1.0)
+    );
+}
+#endif
+
+#if defined(DATA_A_IQ4_XS)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint iq = 16 * ib32 + (iqs % 16);
+
+    const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+    const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
+    const uint qshift = (iqs & 16) >> 2;
+    u8vec2 qs = u8vec2(data_a[a_offset + ib].qs[iq], data_a[a_offset + ib].qs[iq + 1]);
+    qs = (qs >> qshift) & uint8_t(0xF);
+
+    const float dl = float(int(sl | (sh << 4)) - 32);
+    return dl * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint ib32 = iqs / 32;
+    const uint iq = 16 * ib32 + (iqs % 16);
+
+    const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+    const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
+    const uint qshift = (iqs & 16) >> 2;
+    u8vec4 qs = u8vec4(
+        data_a[a_offset + ib].qs[iq + 0],
+        data_a[a_offset + ib].qs[iq + 1],
+        data_a[a_offset + ib].qs[iq + 2],
+        data_a[a_offset + ib].qs[iq + 3]
+    );
+    qs = (qs >> qshift) & uint8_t(0xF);
+
+    const float dl = float(int(sl | (sh << 4)) - 32);
+    return dl * vec4(
+        kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y],
+        kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);
+}
+#endif
+
+#if defined(DATA_A_IQ4_NL)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
+    return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
+}
+#endif
+
+#if defined(DATA_A_F32) || defined(DATA_A_F16)
+vec2 get_dm(uint ib, uint a_offset) {
+    return vec2(0, 0);
+}
+#endif
+
+#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+vec2 get_dm(uint ib, uint a_offset) {
+    return vec2(float(data_a[a_offset + ib].d), 0);
+}
+#endif
+
+#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
+vec2 get_dm(uint ib, uint a_offset) {
+    return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
+}
+#endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
new file mode 100644
index 000000000..0eba37420
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
@@ -0,0 +1,529 @@
+
+#include "types.comp"
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
+   block_q4_0_packed16 block;
+};
+
+float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint shift = (idx & 0x10) >> 2;
+    uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
+    qs >>= shift;
+    qs &= 0x0F0F;
+    qs = unpack8(qs)[idx & 1];
+    float16_t ret = (float16_t(qs) - float16_t(8)) * d;
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
+   block_q4_1 block;
+};
+
+float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const float16_t m = bl.block.m;
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx & 0xF;
+    const uint shift = (idx & 0x10) >> 2;
+    uint32_t qs = bl.block.qs[iqs];
+    qs >>= shift;
+    qs &= 0xF;
+    float16_t ret = float16_t(qs) * d + m;
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 {
+   block_q5_0 block;
+};
+
+float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx & 0xF;
+
+    const uint uint_qh = uint(bl.block.qh[1]) << 16 | bl.block.qh[0];
+    const uint qh = ((uint_qh >> idx) << 4) & 0x10;
+
+    const uint shift = (idx & 0x10) >> 2;
+    uint32_t qs = bl.block.qs[iqs];
+    qs >>= shift;
+    qs &= 0xF;
+
+    float16_t ret = (float16_t(qs | qh) - float16_t(16)) * d;
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 {
+   block_q5_1 block;
+};
+
+float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const float16_t m = bl.block.m;
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx & 0xF;
+
+    const uint uint_qh = bl.block.qh;
+    const uint qh = ((uint_qh >> idx) << 4) & 0x10;
+
+    const uint shift = (idx & 0x10) >> 2;
+    uint32_t qs = bl.block.qs[iqs];
+    qs >>= shift;
+    qs &= 0xF;
+
+    float16_t ret = float16_t(qs | qh) * d + m;
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
+   block_q8_0_packed16 block;
+};
+
+float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx;
+
+    // Load 16b and select the byte for this element
+    int32_t qs = unpack8(int32_t(bl.block.qs[(iqs & 0x1E) >> 1]))[iqs & 1];
+    float16_t ret = float16_t(qs) * d;
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K {
+   block_q2_K block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2_K_packed16 {
+   block_q2_K_packed16 block;
+};
+
+float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
+    const f16vec2 d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint scalesi = (idx & 0xF0) >> 4;             // 0..15
+    const uint qsshift = (idx & 0x60) >> 4;             // 0,2,4,6
+
+    uint qs = uint32_t(bl16.block.qs[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> qsshift) & 0x0303;
+    qs = unpack8(qs)[idx & 1];
+
+    const uint scales = bl.block.scales[scalesi];
+    float16_t ret = d.x * float16_t(scales & 0xF) * float16_t(qs) - d.y * float16_t(scales >> 4);
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K {
+   block_q3_K block;
+};
+
+float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx;
+
+    const uint n = iqs / 128;                    // 0,1
+    const uint qsi = n * 32 + (iqs % 32);        // 0..63
+    const uint hmi =          (iqs % 32);        // 0..31
+    const uint j = (iqs % 128) / 8;              // 0..15
+    const uint is = iqs / 16;                    // 0..15
+    const uint halfsplit = ((iqs % 128) / 32);   // 0,1,2,3
+    const uint qsshift = halfsplit * 2;          // 0,2,4,6
+    const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
+
+    uint32_t scaleidx0 = (is < 8) ? is : (is-8);
+    uint32_t scaleidx0shift = (is < 8) ? 0 : 4;
+    uint32_t scaleidx1 = is + 8 - (is/4)*4;
+    uint32_t scaleidx1shift = (is/4)*2;
+
+    const int8_t us = int8_t(((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4));
+
+    const float16_t dl = bl.block.d * float16_t(us - 32);
+
+    float16_t ret = dl * float16_t(int8_t((bl.block.qs[qsi    ] >> qsshift) & 3) - (((bl.block.hmask[hmi    ] & m) != 0) ? 0 : 4));
+
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K {
+   block_q4_K block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
+   block_q4_K_packed16 block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 {
+   block_q4_K_packed128 block;
+};
+
+float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
+    decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint b = (idx & 0x20) >> 5;            // 0,1
+    const uint is = (idx & 0xE0) >> 5;         // 0..7
+
+    uvec4 v = bl128.block.q4k[0];
+
+    const f16vec2 loadd = unpackFloat2x16(v.x);
+
+    uint32_t sc;
+    uint32_t mbyte;
+
+    uint32_t scale0 = v.y;
+    uint32_t scale4 = v.z;
+    uint32_t scale8 = v.w;
+
+    uint32_t sc_lo = scale0;
+    uint32_t mb_lo = scale4;
+    uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
+    uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
+
+    sc = is < 4 ? sc_lo : sc_hi;
+    mbyte = is < 4 ? mb_lo : mb_hi;
+    sc = sc >> (8 * (is & 3));
+    mbyte = mbyte >> (8 * (is & 3));
+    sc &= 0x3F;
+    mbyte &= 0x3F;
+
+    const float16_t d = loadd.x * float16_t(sc);
+    const float16_t m = loadd.y * float16_t(mbyte);
+
+    uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> (b * 4 + 8 * (idx & 1))) & 0xF;
+
+    float16_t ret = d * float16_t(qs) - m;
+
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
+   block_q5_K block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
+   block_q5_K_packed16 block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed128 {
+   block_q5_K_packed128 block;
+};
+
+float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
+    decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint b = (idx & 0x20) >> 5;          // 0,1
+    const uint is = (idx & 0xE0) >> 5;         // 0..7
+
+    uvec4 v = bl128.block.q5k[0];
+
+    const f16vec2 loadd = unpackFloat2x16(v.x);
+
+    uint32_t sc;
+    uint32_t mbyte;
+
+    uint32_t scale0 = v.y;
+    uint32_t scale4 = v.z;
+    uint32_t scale8 = v.w;
+
+    uint32_t sc_lo = scale0;
+    uint32_t mb_lo = scale4;
+    uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
+    uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
+
+    sc = is < 4 ? sc_lo : sc_hi;
+    mbyte = is < 4 ? mb_lo : mb_hi;
+    sc = sc >> (8 * (is & 3));
+    mbyte = mbyte >> (8 * (is & 3));
+    sc &= 0x3F;
+    mbyte &= 0x3F;
+
+    const float16_t d = loadd.x * float16_t(sc);
+    const float16_t m = loadd.y * float16_t(mbyte);
+
+    uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
+    qh = ((qh >> is) & 0x101) << 4;
+
+    uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
+    qs = (qs >> (b * 4)) & 0x0F0F;
+    qs = unpack8(qs | qh)[idx & 1];
+
+    float16_t ret = d * (float16_t(qs)) - m;
+
+    return ret;
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K {
+   block_q6_K block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
+   block_q6_K_packed16 block;
+};
+
+float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
+    const uint idx = coordInBlock[1];
+
+    const uint b = (idx & 0x40) >> 6;           // 0,1
+    const uint qhshift = (idx & 0x60) >> 4;    // 0,2,4,6
+    const uint is = (idx & 0xF0) >> 4;          // 0..15
+
+    const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
+
+    uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
+    ql = (ql >> (b * 4)) & 0x0F0F;
+
+    uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
+    qh = ((qh >> qhshift) & 0x0303) << 4;
+
+    int q = unpack8(ql | qh)[idx & 1];
+
+    float16_t ret = dscale * float16_t(q - 32);
+
+    return ret;
+}
+
+#if defined(DATA_A_IQ2_XXS)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_XXS {
+   block_iq2_xxs block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_XXS_packed16 {
+   block_iq2_xxs_packed16 block;
+};
+
+float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ2_XXS_packed16 bl16 = decodeBufIQ2_XXS_packed16(bl);
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = (idx & 0xE0) >> 5; // 0..7
+    const uint ib8 = (idx & 0x18) >> 3;  // 0..3
+    const uint iqs = 8 * ib32 + ib8;
+
+    const uint8_t qs = bl.block.qs[iqs];
+    const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
+
+    const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
+    uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
+    sign |= bitCount(sign) << 7;
+
+    uint g2 = iq2xxs_grid[qs][(idx & 4) >> 2];
+    g2 >>= (idx & 2) * 8;
+    const vec2 g = vec2(unpack8(g2));
+
+    vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
+    return float16_t(ret[idx & 1]);
+}
+#endif
+
+#if defined(DATA_A_IQ2_XS)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_XS {
+   block_iq2_xs block;
+};
+
+float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint is = (idx & 0xE0) >> 5;     // 0..8
+    const uint sshift = (idx & 0x10) >> 2; // 0,4
+    const uint iqs = (idx & 0xF8) >> 3;    // 0..63
+
+    const uint16_t qs = bl.block.qs[iqs];
+    const float dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF));
+
+    uint sign = uint(qs >> 9);
+    sign |= bitCount(sign) << 7;
+    uint g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2];
+    g2 >>= (idx & 2) * 8;
+    const vec2 g = vec2(unpack8(g2));
+
+    vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
+    return float16_t(ret[idx & 1]);
+}
+#endif
+
+#if defined(DATA_A_IQ2_S)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2_S {
+   block_iq2_s block;
+};
+
+float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    uint idx = coordInBlock[1];
+
+    const uint ib32 = (idx & 0xE0) >> 5;        // 0..7
+    const uint ib8 = (idx & 0xF8) >> 3;         // 0..31
+    const uint qhshift = 2 * (ib8 % 4);
+
+    const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf;
+    const uint qs = bl.block.qs[ib8];
+    const uint qh = bl.block.qh[ib32];
+    const uint sign = bl.block.qs[QUANT_K / 8 + ib8] >> (idx & 0x6);
+
+    const float d = float(bl.block.d);
+    const float db = d * 0.25 * (0.5 + scale);
+    const ivec2 sign01 = 1 - (2 & ivec2(sign << 1, sign));
+    uint g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2];
+    g2 >>= (idx & 2) * 8;
+    const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
+    return float16_t(v[idx & 1]);
+}
+#endif
+
+#if defined(DATA_A_IQ3_XXS)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3_XXS {
+   block_iq3_xxs block;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3_XXS_packed16 {
+   block_iq3_xxs_packed16 block;
+};
+
+float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
+    uint idx = coordInBlock[1];
+
+    const uint iqs = (idx & 0xFC) >> 2;             // 0..63
+    const uint is = QUANT_K / 4 + ((idx & 0xE0) >> 3);// 8 values
+
+    const float d = float(bl.block.d);
+    const uint qs = bl.block.qs[iqs];
+    const uint signs = pack32(u16vec2(
+        bl16.block.qs[is/2+0],
+        bl16.block.qs[is/2+1]
+    ));
+    const float db = d * 0.5 * (0.5 + (signs >> 28));
+    const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
+    const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6);
+    const ivec2 sign01 = ivec2(1 - (2 & ivec2(sign << 1, sign)));
+    const uint grid = iq3xxs_grid[qs] >> (16 * ((idx & 2) >> 1));
+    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+    return float16_t(v[idx & 1]);
+}
+#endif
+
+#if defined(DATA_A_IQ3_S)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3_S {
+   block_iq3_s block;
+};
+
+float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    uint idx = coordInBlock[1];
+
+    const uint iqs = (idx & 0xFC) >> 2;           // 0..63
+    const uint iqh = (idx & 0xE0) >> 5;
+
+    const float d = float(bl.block.d);
+    const uint qs = bl.block.qs[iqs];
+    const uint qh = bl.block.qh[iqh];
+    const int8_t sign = int8_t(bl.block.signs[iqs / 2] >> (idx & 0x6));
+    const uint scale = bl.block.scales[iqs / 16];
+    const ivec2 sign01 = ivec2(1 - (2 & ivec2(sign << 1, sign)));
+    const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
+    const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> ((idx & 2) << 3);
+    const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+
+    return float16_t(v[idx & 1]);
+}
+#endif
+
+#if defined(DATA_A_IQ4_XS)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_XS {
+   block_iq4_xs block;
+};
+
+float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+
+    const uint ib32 = (idx & 0xE0) >> 5; // 0..7
+
+    const uint sl = (bl.block.scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+    const uint sh = ((bl.block.scales_h) >> (2 * ib32)) & 3;
+    const uint qshift = (idx & 16) >> 2;
+    const uint q = (bl.block.qs[16 * ib32 + (idx % 16)] >> qshift) & 0xF;
+
+    float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
+    return ret;
+}
+#endif
+
+#if defined(DATA_A_IQ4_NL)
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
+   block_iq4_nl block;
+};
+
+float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const float16_t d = bl.block.d;
+    const uint idx = coordInBlock[1];
+    const uint iqs = idx & 0xF;
+    const uint shift = (idx & 0x10) >> 2;
+    uint32_t qs = bl.block.qs[iqs];
+    qs >>= shift;
+    qs &= 0xF;
+    float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
+    return ret;
+}
+#endif
+
+#if defined(DATA_A_Q4_0)
+#define dequantFuncA dequantFuncQ4_0
+#elif defined(DATA_A_Q4_1)
+#define dequantFuncA dequantFuncQ4_1
+#elif defined(DATA_A_Q5_0)
+#define dequantFuncA dequantFuncQ5_0
+#elif defined(DATA_A_Q5_1)
+#define dequantFuncA dequantFuncQ5_1
+#elif defined(DATA_A_Q8_0)
+#define dequantFuncA dequantFuncQ8_0
+#elif defined(DATA_A_Q2_K)
+#define dequantFuncA dequantFuncQ2_K
+#elif defined(DATA_A_Q3_K)
+#define dequantFuncA dequantFuncQ3_K
+#elif defined(DATA_A_Q4_K)
+#define dequantFuncA dequantFuncQ4_K
+#elif defined(DATA_A_Q5_K)
+#define dequantFuncA dequantFuncQ5_K
+#elif defined(DATA_A_Q6_K)
+#define dequantFuncA dequantFuncQ6_K
+#elif defined(DATA_A_IQ2_XXS)
+#define dequantFuncA dequantFuncIQ2_XXS
+#elif defined(DATA_A_IQ2_XS)
+#define dequantFuncA dequantFuncIQ2_XS
+#elif defined(DATA_A_IQ2_S)
+#define dequantFuncA dequantFuncIQ2_S
+#elif defined(DATA_A_IQ3_XXS)
+#define dequantFuncA dequantFuncIQ3_XXS
+#elif defined(DATA_A_IQ3_S)
+#define dequantFuncA dequantFuncIQ3_S
+#elif defined(DATA_A_IQ4_XS)
+#define dequantFuncA dequantFuncIQ4_XS
+#elif defined(DATA_A_IQ4_NL)
+#define dequantFuncA dequantFuncIQ4_NL
+#endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp
new file mode 100644
index 000000000..8d806435b
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp
@@ -0,0 +1,13 @@
+#extension GL_EXT_control_flow_attributes : require
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint M;
+    uint K;
+    uint stride_a;
+    uint stride_b;
+    uint nel;
+} p;
+
+#include "types.comp"
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp
new file mode 100644
index 000000000..48f6b65bc
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp
@@ -0,0 +1,44 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq2_s data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 subblock (32 values with 2 scales)
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint ib32 = gl_LocalInvocationID.x % 8;
+    const uint b_idx = 256 * ib + 32 * ib32;
+
+    const float d = float(data_a[ib].d);
+    const vec2 scale = vec2(data_a[ib].scales[ib32] & 0xf, data_a[ib].scales[ib32] >> 4);
+    const vec2 db = d * (0.5 + scale) * 0.25;
+
+    uint qh = data_a[ib].qh[ib32];
+    [[unroll]] for (uint l = 0; l < 4; ++l) {
+        uint qs = data_a[ib].qs[4 * ib32 + l];
+        const uint8_t sign = data_a[ib].qs[QUANT_K / 8 + 4 * ib32 + l];
+        qs |= (qh << (8 - 2 * l)) & 0x300;
+        const uvec2 grid = iq2s_grid[qs & 511];
+        const u8vec4 grid0 = unpack8(grid.x);
+        const u8vec4 grid1 = unpack8(grid.y);
+        data_b[b_idx + 8 * l + 0] = D_TYPE(db[l/2] * grid0.x * ((sign & 1) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 1] = D_TYPE(db[l/2] * grid0.y * ((sign & 2) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 2] = D_TYPE(db[l/2] * grid0.z * ((sign & 4) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 3] = D_TYPE(db[l/2] * grid0.w * ((sign & 8) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 4] = D_TYPE(db[l/2] * grid1.x * ((sign & 16) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 5] = D_TYPE(db[l/2] * grid1.y * ((sign & 32) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 6] = D_TYPE(db[l/2] * grid1.z * ((sign & 64) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 7] = D_TYPE(db[l/2] * grid1.w * ((sign & 128) != 0 ? -1.0 : 1.0));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp
new file mode 100644
index 000000000..a08331c40
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp
@@ -0,0 +1,43 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq2_xs data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 subblock (32 values with 2 scales)
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint ib32 = gl_LocalInvocationID.x % 8;
+    const uint b_idx = 256 * ib + 32 * ib32;
+
+    const float d = float(data_a[ib].d);
+    const vec2 scale = vec2(data_a[ib].scales[ib32] & 0xf, data_a[ib].scales[ib32] >> 4);
+    const vec2 db = d * (0.5 + scale) * 0.25;
+
+    [[unroll]] for (uint l = 0; l < 4; ++l) {
+        uint16_t qs = data_a[ib].qs[4 * ib32 + l];
+        const uint sign7 = qs >> 9;
+        const uint sign8 = sign7 | (bitCount(sign7) << 7); // parity bit
+        const uvec2 grid = iq2xs_grid[qs & 511];
+        const u8vec4 grid0 = unpack8(grid.x);
+        const u8vec4 grid1 = unpack8(grid.y);
+        data_b[b_idx + 8 * l + 0] = D_TYPE(db[l/2] * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 1] = D_TYPE(db[l/2] * grid0.y * ((sign8 & 2) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 2] = D_TYPE(db[l/2] * grid0.z * ((sign8 & 4) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 3] = D_TYPE(db[l/2] * grid0.w * ((sign8 & 8) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 4] = D_TYPE(db[l/2] * grid1.x * ((sign8 & 16) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 5] = D_TYPE(db[l/2] * grid1.y * ((sign8 & 32) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 6] = D_TYPE(db[l/2] * grid1.z * ((sign8 & 64) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 7] = D_TYPE(db[l/2] * grid1.w * ((sign8 & 128) != 0 ? -1.0 : 1.0));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp
new file mode 100644
index 000000000..e370690bc
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp
@@ -0,0 +1,48 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq2_xxs data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 scale block (32 values)
+    // Each block is described by 4 lattice indices, 4x7 sign bits and 4 scale bits
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint is = gl_LocalInvocationID.x % 8;
+    const uint b_idx = 256 * ib + 32 * is;
+
+    const float d = float(data_a[ib].d);
+    uint signscale = pack32(u8vec4(
+        data_a[ib].qs[8*is + 4],
+        data_a[ib].qs[8*is + 5],
+        data_a[ib].qs[8*is + 6],
+        data_a[ib].qs[8*is + 7]
+    ));
+    const float db = d * (0.5 + (signscale >> 28)) * 0.25;
+
+    [[unroll]] for (uint l = 0; l < 4; ++l) {
+        const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7);
+        const uint sign8 = sign7 | (bitCount(sign7) << 7); // parity bit
+        const uvec2 grid = iq2xxs_grid[data_a[ib].qs[8 * is + l]];
+        const u8vec4 grid0 = unpack8(grid.x);
+        const u8vec4 grid1 = unpack8(grid.y);
+        data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 1] = D_TYPE(db * grid0.y * ((sign8 & 2) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 2] = D_TYPE(db * grid0.z * ((sign8 & 4) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 3] = D_TYPE(db * grid0.w * ((sign8 & 8) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 4] = D_TYPE(db * grid1.x * ((sign8 & 16) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 5] = D_TYPE(db * grid1.y * ((sign8 & 32) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 6] = D_TYPE(db * grid1.z * ((sign8 & 64) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 7] = D_TYPE(db * grid1.w * ((sign8 & 128) != 0 ? -1.0 : 1.0));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp
new file mode 100644
index 000000000..c3f4bca5d
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp
@@ -0,0 +1,39 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq3_s data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 scale nibble.
+    // Each block contains 4 scale bytes (8 scales) for 256 output values.
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint is = gl_LocalInvocationID.x % 8;
+    const uint b_idx = 256 * ib + 32 * is;
+
+    const float d = float(data_a[ib].d);
+    const float db = d * (1 + 2 * ((data_a[ib].scales[is] >> (4 * (is % 2))) & 0xf));
+
+    // We must produce 32 values using 4 sign bytes, 1 qh byte, 8 qs bytes.
+    uint qh = data_a[ib].qh[is];
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        uint qs = data_a[ib].qs[8 * is + l];
+        uint gidx = qs | ((qh << (8 - l)) & 256);
+        uint8_t signs = data_a[ib].signs[8 * is + l / 2] >> (4 * (l & 1));
+        u8vec4 grid = unpack8(iq3s_grid[gidx]);
+        data_b[b_idx + 4 * l + 0] = D_TYPE(db * grid.x * ((signs & 1) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 4 * l + 1] = D_TYPE(db * grid.y * ((signs & 2) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 4 * l + 2] = D_TYPE(db * grid.z * ((signs & 4) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 4 * l + 3] = D_TYPE(db * grid.w * ((signs & 8) != 0 ? -1.0 : 1.0));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp
new file mode 100644
index 000000000..a92b82961
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp
@@ -0,0 +1,49 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq3_xxs data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 scale block (32 values)
+    // 8 threads handle 1 superblock
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint is = gl_LocalInvocationID.x % 8;
+    const uint b_idx = 256 * ib + 32 * is;
+    const uint s_idx = QUANT_K / 4 + 4 * is;
+
+    const float d = float(data_a[ib].d);
+    uint signscale = pack32(u8vec4(
+        data_a[ib].qs[s_idx + 0],
+        data_a[ib].qs[s_idx + 1],
+        data_a[ib].qs[s_idx + 2],
+        data_a[ib].qs[s_idx + 3]
+    ));
+    const float db = d * (0.5 + (signscale >> 28)) * 0.5;
+
+    [[unroll]] for (uint l = 0; l < 4; ++l) {
+        const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7);
+        // Restore parity bit.
+        const uint sign8 = sign7 | (bitCount(sign7) << 7);
+        const u8vec4 grid0 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l]]);
+        const u8vec4 grid1 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l + 1]]);
+        data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 1] = D_TYPE(db * grid0.y * ((sign8 & 2) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 2] = D_TYPE(db * grid0.z * ((sign8 & 4) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 3] = D_TYPE(db * grid0.w * ((sign8 & 8) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 4] = D_TYPE(db * grid1.x * ((sign8 & 16) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 5] = D_TYPE(db * grid1.y * ((sign8 & 32) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 6] = D_TYPE(db * grid1.z * ((sign8 & 64) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 7] = D_TYPE(db * grid1.w * ((sign8 & 128) != 0 ? -1.0 : 1.0));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
new file mode 100644
index 000000000..46d9ad15e
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp
@@ -0,0 +1,32 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint q_idx = 8*il;
+    const uint b_idx = 1024*i + 32*ir + q_idx;
+
+    const float d = float(data_a[ib].d);
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
+        data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >>  4]);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp
new file mode 100644
index 000000000..f930852a4
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp
@@ -0,0 +1,34 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq4_xs data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    // Each thread handles 1 subblock (1 scale and 32 quantized values)
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
+
+    init_iq_shmem(gl_WorkGroupSize);
+
+    if (ib >= p.nel / 256) {
+        return;
+    }
+
+    const uint ib32 = gl_LocalInvocationID.x % 8;
+
+    const float d = float(data_a[ib].d);
+    // Scales are 6 bits
+    const uint scale = ((data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF)
+                     | (((data_a[ib].scales_h >> (2 * ib32)) & 3) << 4);
+    const float dl = d * (int(scale) - 32);
+
+    const uint b_idx = 256 * ib + 32 * ib32;
+    const uint q_idx = 16 * ib32;
+    [[unroll]] for (uint l = 0; l < 16; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(dl * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
+        data_b[b_idx + l + 16] = D_TYPE(dl * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >>  4]);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
new file mode 100644
index 000000000..157154af3
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
@@ -0,0 +1,34 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = gl_WorkGroupID.x * 256 + wgy;
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint tid = gl_LocalInvocationID.x;
+        const uint ip = tid / 32;
+        const uint il = tid - 32 * ip;
+        const uint is = 8 * ip + il / 16;
+
+        const uint y_idx = i * QUANT_K + 128 * ip + il;
+
+        const uint ql_idx = 32 * ip + il;
+        const uint8_t qs = data_a[i].qs[32 * ip + il];
+
+        FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
+        FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+        data_b[y_idx +  0] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
+        data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
+        data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
+        data_b[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+6] >> 4));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
new file mode 100644
index 000000000..c17dd0d99
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
@@ -0,0 +1,42 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint r = gl_LocalInvocationID.x / 4;
+        const uint tid = r / 2;
+        const uint is0 = r % 2;
+        const uint l0 = 16 * is0 + 4 * (gl_LocalInvocationID.x % 4);
+        const uint n = tid / 4;
+        const uint j = tid - 4*n;
+
+        const uint8_t m = uint8_t(1 << (4*n + j));
+        const uint is = 8*n + 2*j + is0;
+        const uint shift = 2*j;
+
+        const int8_t us = int8_t(is <  4 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+8] >> 0) & 3) << 4) :
+                                 is <  8 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+4] >> 2) & 3) << 4) :
+                                 is < 12 ? (data_a[i].scales[is-8] >>  4) | (((data_a[i].scales[is+0] >> 4) & 3) << 4) :
+                                           (data_a[i].scales[is-8] >>  4) | (((data_a[i].scales[is-4] >> 6) & 3) << 4));
+        const FLOAT_TYPE d_all = FLOAT_TYPE(data_a[i].d);
+        const FLOAT_TYPE dl    = d_all * FLOAT_TYPE(us - 32);
+
+        const uint y_idx = i * QUANT_K + 128 * n + 32 * j;
+        const uint qs_idx = 32*n;
+
+        for (uint l = l0; l < l0 + 4; ++l) {
+            data_b[y_idx + l] = D_TYPE(dl * FLOAT_TYPE(int8_t((data_a[i].qs[qs_idx + l] >> shift) & 3) - (((data_a[i].hmask[l] & m) != 0) ? 0 : 4)));
+        }
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp
new file mode 100644
index 000000000..408185327
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp
@@ -0,0 +1,30 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q4_0 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint q_idx = 8*il;
+    const uint b_idx = 1024*i + 32*ir + q_idx;
+
+    const float d = float(data_a[ib].d);
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
+        data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >>  4) - 8.0f));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp
new file mode 100644
index 000000000..2f27eee68
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp
@@ -0,0 +1,32 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q4_1 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const float m = float(data_a[ib].m);
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m);
+        data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + m);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
new file mode 100644
index 000000000..987f113a3
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
@@ -0,0 +1,68 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint ib = gl_WorkGroupID.x * 256 + wgy;
+        if (ib >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint tid = gl_LocalInvocationID.x;
+        const uint il = tid / 8;
+        const uint ir = tid % 8;
+        const uint is = 2 * il;
+        const uint n = 4;
+
+        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
+
+        const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
+        const uint qs_idx = 32*il + n * ir;
+
+        uint scidx0 = (is < 4) ? is : (is + 4);
+        uint scidx1 = (is < 4) ? is : (is - 4);
+        uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint scidxshift1 = (is < 4) ? 0 : 2;
+        uint mbidx0 = is + 4;
+        uint mbidx1 = (is < 4) ? is + 4 : is;
+        uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        uint mbidxshift0 = (is < 4) ? 0 : 4;
+        uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+        uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+        const FLOAT_TYPE d1 = dall * sc;
+        const FLOAT_TYPE m1 = dmin * mbyte;
+
+        scidx0 = (is < 4) ? is + 1 : (is + 5);
+        scidx1 = (is < 4) ? is + 1 : (is - 3);
+        scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        scidxshift1 = (is < 4) ? 0 : 2;
+        mbidx0 = is + 5;
+        mbidx1 = (is < 4) ? is + 5 : is + 1;
+        mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        mbidxshift0 = (is < 4) ? 0 : 4;
+        mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        mbidxshift1 = (is < 4) ? 0 : 2;
+
+        sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+        const FLOAT_TYPE d2 = dall * sc;
+        const FLOAT_TYPE m2 = dmin * mbyte;
+
+        [[unroll]] for (uint l = 0; l < n; ++l) {
+            data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] & 0xF) - m1);
+            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] >>  4) - m2);
+        }
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp
new file mode 100644
index 000000000..b20b80529
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp
@@ -0,0 +1,34 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q5_0 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        const uint iqs = q_idx + l;
+        const uint vui = uint(data_a[ib].qs[iqs]);
+        data_b[b_idx + l +  0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f));
+        data_b[b_idx + l + 16] = D_TYPE(d * (((vui >>  4) | ((qh >> (iqs + 12)) & 0x10)) - 16.0f));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp
new file mode 100644
index 000000000..dc59fe3b7
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp
@@ -0,0 +1,35 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q5_1 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 8*il;
+
+    const float d = float(data_a[ib].d);
+    const float m = float(data_a[ib].m);
+    const uint qh = data_a[ib].qh;
+
+    const uint q_idx = 8*il;
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        const uint iqs = q_idx + l;
+        const uint vui = uint(data_a[ib].qs[iqs]);
+        data_b[b_idx + l +  0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m);
+        data_b[b_idx + l + 16] = D_TYPE(d * (((vui >>  4) | ((qh >> (iqs + 12)) & 0x10))) + m);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
new file mode 100644
index 000000000..6db5403b6
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
@@ -0,0 +1,70 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint ib = gl_WorkGroupID.x * 256 + wgy;
+        if (ib >= p.M * p.K / QUANT_K) {
+            return;
+        }
+
+        const uint tid = gl_LocalInvocationID.x;
+        const uint il = tid / 16;
+        const uint ir = tid % 16;
+        const uint is = 2 * il;
+
+        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
+
+        const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
+        const uint qs_idx = 32*il + 2 * ir;
+        const uint qh_idx = 2 * ir;
+
+        uint scidx0 = (is < 4) ? is : (is + 4);
+        uint scidx1 = (is < 4) ? is : (is - 4);
+        uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint scidxshift1 = (is < 4) ? 0 : 2;
+        uint mbidx0 = is + 4;
+        uint mbidx1 = (is < 4) ? is + 4 : is;
+        uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        uint mbidxshift0 = (is < 4) ? 0 : 4;
+        uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+        uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+        const FLOAT_TYPE d1 = dall * sc;
+        const FLOAT_TYPE m1 = dmin * mbyte;
+
+        scidx0 = (is < 4) ? is + 1 : (is + 5);
+        scidx1 = (is < 4) ? is + 1 : (is - 3);
+        scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        scidxshift1 = (is < 4) ? 0 : 2;
+        mbidx0 = is + 5;
+        mbidx1 = (is < 4) ? is + 5 : is + 1;
+        mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+        mbidxshift0 = (is < 4) ? 0 : 4;
+        mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+        mbidxshift1 = (is < 4) ? 0 : 2;
+
+        sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+        mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+        const FLOAT_TYPE d2 = dall * sc;
+        const FLOAT_TYPE m2 = dmin * mbyte;
+
+        const uint8_t hm1 = uint8_t(1 << (2 * il    ));
+        const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
+        data_b[y_idx     ] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx    ] & 0xF) + (((data_a[ib].qh[qh_idx    ] & hm1) != 0) ? 16 : 0)) - m1);
+        data_b[y_idx +  1] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] & 0xF) + (((data_a[ib].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
+        data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx    ]  >> 4) + (((data_a[ib].qh[qh_idx    ] & hm2) != 0) ? 16 : 0)) - m2);
+        data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1]  >> 4) + (((data_a[ib].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
new file mode 100644
index 000000000..0b9131755
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
@@ -0,0 +1,33 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
+        const uint i = gl_WorkGroupID.x * 256 + wgy;
+        if (i >= p.M * p.K / QUANT_K) {
+            return;
+        }
+        const uint tid = gl_LocalInvocationID.x;
+        const uint ip = tid / 32;
+        const uint il = tid - 32 * ip;
+        const uint is = 8 * ip + il / 16;
+
+        const uint y_idx = i * QUANT_K + 128 * ip + il;
+
+        const uint ql_idx = 64 * ip + il;
+        const uint8_t qh = data_a[i].qh[32 * ip + il];
+
+        const FLOAT_TYPE d = FLOAT_TYPE(data_a[i].d);
+
+        data_b[y_idx +  0] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 0] * (int8_t((data_a[i].ql[ql_idx +  0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
+        data_b[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 2] * (int8_t((data_a[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
+        data_b[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 4] * (int8_t((data_a[i].ql[ql_idx +  0] >>  4) | (((qh >> 4) & 3) << 4)) - 32)));
+        data_b[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 6] * (int8_t((data_a[i].ql[ql_idx + 32] >>  4) | (((qh >> 6) & 3) << 4)) - 32)));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp
new file mode 100644
index 000000000..bd1344a88
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp
@@ -0,0 +1,31 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_q8_0 data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint b_idx = 1024*i + 32*ir + 16*il;
+
+    const float d = float(data_a[ib].d);
+
+    const uint q_idx = 16*il;
+
+    [[unroll]] for (uint l = 0; l < 16; l += 2) {
+        data_b[b_idx + l    ] = D_TYPE(d * data_a[ib].qs[q_idx + l    ]);
+        data_b[b_idx + l + 1] = D_TYPE(d * data_a[ib].qs[q_idx + l + 1]);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp
new file mode 100644
index 000000000..26d8bc22a
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp
@@ -0,0 +1,34 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : enable
+
+layout (push_constant) uniform parameter
+{
+    uint ncols;
+    uint rows_per_channel;
+    uint n_past;
+} p;
+
+#include "types.comp"
+
+layout(local_size_x = 1, local_size_y = 512, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint col = gl_GlobalInvocationID.y;
+    const uint row = gl_GlobalInvocationID.x;
+
+    if (col >= p.ncols) {
+        return;
+    }
+
+    const uint i = row*p.ncols + col;
+    if (col > p.n_past + row % p.rows_per_channel) {
+        data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000));
+    } else {
+        data_d[i] = D_TYPE(data_a[i]);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/div.comp b/ggml/src/ggml-vulkan/vulkan-shaders/div.comp
new file mode 100644
index 000000000..9fb69c6c1
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/div.comp
@@ -0,0 +1,27 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+const uint num_threads = 256;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 2;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        uint i00, i01, i02, i03;
+        get_indices(idx, i00, i01, i02, i03);
+
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
+
+        idx += num_threads;
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
new file mode 100644
index 000000000..ba88ce79a
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -0,0 +1,309 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_NV_cooperative_matrix2 : enable
+#extension GL_EXT_buffer_reference : enable
+#extension GL_KHR_shader_subgroup_ballot : enable
+#extension GL_KHR_shader_subgroup_vote : enable
+#extension GL_EXT_null_initializer : enable
+
+#include "types.comp"
+#include "dequant_funcs_cm2.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (constant_id = 1) const uint32_t Br = 32;
+layout (constant_id = 2) const uint32_t Bc = 32;
+layout (constant_id = 3) const uint32_t D = 32;
+layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV;
+
+layout (push_constant) uniform parameter {
+    uint32_t N;
+    uint32_t KV;
+
+    uint32_t ne1;
+    uint32_t ne2;
+    uint32_t ne3;
+
+    uint32_t neq2;
+    uint32_t neq3;
+    uint32_t nek2;
+    uint32_t nek3;
+    uint32_t nev2;
+    uint32_t nev3;
+    uint32_t nem1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t nb31;
+
+    float scale;
+    float max_bias;
+    float logit_softcap;
+
+    uint32_t mask;
+    uint32_t n_head_log2;
+    float m0;
+    float m1;
+} p;
+
+layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
+layout (binding = 1) readonly buffer K {uint8_t data_k[];};
+layout (binding = 2) readonly buffer V {uint8_t data_v[];};
+layout (binding = 3) readonly buffer M {uint8_t data_m[];};
+layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
+    return max(x, y);
+}
+
+ACC_TYPE smearReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
+    return x;
+}
+
+// Replace matrix elements >= numRows or numCols with 'replace'
+ACC_TYPE replacePadding(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem, const in ACC_TYPE replace, const in uint32_t numRows, const in uint32_t numCols) {
+    if (row >= numRows || col >= numCols) {
+        return replace;
+    }
+    return elem;
+}
+
+ACC_TYPE Exp(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem)
+{
+    return exp(elem);
+}
+
+ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem0, const in ACC_TYPE elem1)
+{
+    return max(elem0, elem1);
+}
+
+#if defined(BLOCK_SIZE)
+#define DECODEFUNC , DEQUANTFUNC
+#else
+#define DECODEFUNC
+#endif
+
+void main() {
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+    const uint32_t N = p.N;
+    const uint32_t KV = p.KV;
+
+    const uint32_t Tr = CEIL_DIV(N, Br);
+    const uint32_t Tc = CEIL_DIV(KV, Bc);
+
+    const uint32_t i = gl_WorkGroupID.x;
+
+    const uint32_t iq2 = gl_WorkGroupID.y;
+    const uint32_t iq3 = gl_WorkGroupID.z;
+
+    // broadcast factors
+    const uint32_t rk2 = p.neq2/p.nek2;
+    const uint32_t rk3 = p.neq3/p.nek3;
+
+    const uint32_t rv2 = p.neq2/p.nev2;
+    const uint32_t rv3 = p.neq3/p.nev3;
+
+    // k indices
+    const uint32_t ik3 = iq3 / rk3;
+    const uint32_t ik2 = iq2 / rk2;
+
+    // v indices
+    const uint32_t iv3 = iq3 / rv3;
+    const uint32_t iv2 = iq2 / rv2;
+
+    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+    tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp);
+    tensorLayoutNV<2, Clamp> tensorLayoutV = createTensorLayoutNV(2, Clamp);
+
+    tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
+
+#if defined(BLOCK_SIZE)
+    tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE);
+    tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE);
+#endif
+
+    tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D);
+    tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
+    tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);
+
+    // nb?1 are already divided by the type size and are in units of elements
+    uint32_t q_stride = p.nb01;
+    uint32_t k_stride = p.nb11;
+    uint32_t v_stride = p.nb21;
+    // hint to the compiler that strides are aligned for the aligned variant of the shader
+    if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
+    {
+        q_stride &= ~7;
+#if !defined(BLOCK_SIZE)
+        k_stride &= ~7;
+        v_stride &= ~7;
+#endif
+    }
+    tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
+    tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
+    tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
+
+    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Q;
+    coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;
+
+    uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
+    coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D));
+
+    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA>(Q);
+    Qf16 *= float16_t(p.scale);
+
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
+
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
+
+    L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
+    M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-1.0/0.0);
+
+    ACC_TYPE slope = ACC_TYPE(1.0);
+
+    // ALiBi
+    if (p.max_bias > 0.0f) {
+        const uint32_t h = iq2;
+
+        const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
+        const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
+
+        slope = pow(base, ACC_TYPE(exph));
+    }
+
+    [[dont_unroll]]
+    for (uint32_t j = 0; j < Tc; ++j) {
+
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
+
+        coopmat<float16_t, gl_ScopeWorkgroup, D, Bc, gl_MatrixUseB> K_T;
+
+        uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
+        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC);
+        S = coopMatMulAdd(Qf16, K_T, S);
+
+        if (p.logit_softcap != 0.0f) {
+            [[unroll]]
+            for (int k = 0; k < S.length(); ++k) {
+                S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]);
+            }
+        }
+
+        if (p.mask != 0) {
+            tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+            tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
+
+            coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+
+            coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+            S += slope*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+        }
+
+        // Clear padding elements to -inf, so they don't contribute to rowmax
+        if (Clamp != 0 &&
+            ((j + 1) * Bc > KV ||
+             (i + 1) * Br > N)) {
+
+            uint R = ((i + 1) * Br >  N) ?  (N % Br) : Br;
+            uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
+
+            coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C);
+        }
+
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> rowmax, P, rowsum, eM;
+
+        coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce);
+
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> Mold = M;
+
+        // M = max(rowmax, Mold)
+        // P = e^(S - M)
+        // eM = e^(Mold - M)
+        coopMatPerElementNV(M, rowmax, Max, Mold);
+        coopMatPerElementNV(P, S - M, Exp);
+        coopMatPerElementNV(eM, Mold - M, Exp);
+
+        // Clear padding elements to 0, so they don't contribute to rowsum
+        if (Clamp != 0 &&
+            ((j + 1) * Bc > KV ||
+             (i + 1) * Br > N)) {
+
+            uint R = ((i + 1) * Br >  N) ?  (N % Br) : Br;
+            uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
+
+            coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C);
+        }
+
+        coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
+
+        // compute rowsum by multiplying by matrix of all ones.
+        coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
+
+        rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
+        rowsum = coopMatMulAdd(P_A, One, rowsum);
+
+        coopmat<float16_t, gl_ScopeWorkgroup, Bc, D, gl_MatrixUseB> V;
+        uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
+        coopMatLoadTensorNV(V,  data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC);
+
+        L = eM*L + rowsum;
+
+        // This is the "diagonal" matrix in the paper, but since we do componentwise
+        // multiply rather than matrix multiply it has the diagonal element smeared
+        // across the row
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> eMdiag;
+
+        // resize eM by using smear/reduce
+        coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
+
+        O = eMdiag * O;
+
+        O = coopMatMulAdd(P_A, V, O);
+    }
+
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;
+
+    // resize L by using smear/reduce
+    coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
+
+    [[unroll]]
+    for (int k = 0; k < Ldiag.length(); ++k) {
+        Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k];
+    }
+
+    O = Ldiag*O;
+
+    tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
+    tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D);
+
+    // permute dimensions
+    tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
+    uint32_t o_offset = iq3*p.ne2*p.ne1;
+
+    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
+    coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
new file mode 100644
index 000000000..4cc7a68ca
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp
@@ -0,0 +1,25 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float xi = float(data_a[i]);
+    const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
+    data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
new file mode 100644
index 000000000..e6e6fcfd2
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp
@@ -0,0 +1,23 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const float GELU_QUICK_COEF = -1.702f;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float x = float(data_a[i]);
+    data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp
new file mode 100644
index 000000000..062e2a4cd
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp
@@ -0,0 +1,64 @@
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
+
+layout (push_constant) uniform parameter
+{
+    uint ne;
+    uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
+    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
+    uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
+    uint misalign_offsets;
+    float param1; float param2; int param3;
+} p;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+// true if src0/src1 are the same shape and the indices can be reused without additional modulus
+layout(constant_id = 0) const bool norepeat = false;
+
+uint get_idx() {
+    return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+}
+
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
+uint get_doffset() { return p.misalign_offsets & 0xFF; }
+
+// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
+uint fastmod(uint a, uint b) {
+    if ((b & (b-1)) == 0) {
+        return a & (b-1);
+    }
+    return a % b;
+}
+
+uint fastdiv(uint a, uint b) {
+    return (a < b) ? 0 : (a / b);
+}
+
+void get_indices(uint idx, out uint i00, out uint i01, out uint i02, out uint i03) {
+    i03 = fastdiv(idx, (p.ne02*p.ne01*p.ne00));
+    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
+    i02 = fastdiv((idx - i03_offset), (p.ne01*p.ne00));
+    const uint i02_offset = i02*p.ne01*p.ne00;
+    i01 = (idx - i03_offset - i02_offset) / p.ne00;
+    i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+}
+
+uint src0_idx(uint i00, uint i01, uint i02, uint i03) {
+    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
+}
+
+uint src1_idx(uint i00, uint i01, uint i02, uint i03) {
+    if (norepeat) {
+        return i03*p.nb13 + i02*p.nb12 + i01*p.nb11 + i00*p.nb10;
+    } else {
+        return fastmod(i03, p.ne13)*p.nb13 + fastmod(i02, p.ne12)*p.nb12 + fastmod(i01, p.ne11)*p.nb11 + fastmod(i00, p.ne10)*p.nb10;
+    }
+}
+
+uint dst_idx(uint i00, uint i01, uint i02, uint i03) {
+    return i03*p.nb23 + i02*p.nb22 + i01*p.nb21 + i00*p.nb20;
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp
new file mode 100644
index 000000000..66e46ae67
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp
@@ -0,0 +1,9 @@
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint KX;
+    uint KY;
+    float param1;
+    float param2;
+} p;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
new file mode 100644
index 000000000..8dc9d360d
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp
@@ -0,0 +1,76 @@
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
+
+layout (push_constant) uniform parameter
+{
+    uint ne;
+    uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
+    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
+    uint misalign_offsets;
+    float param1; float param2;
+
+    uint ne0_012mp; uint ne0_012L;
+    uint ne0_01mp;  uint ne0_01L;
+    uint ne0_0mp;   uint ne0_0L;
+    uint ne1_012mp; uint ne1_012L;
+    uint ne1_01mp;  uint ne1_01L;
+    uint ne1_0mp;   uint ne1_0L;
+} p;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+uint get_idx() {
+    return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+}
+
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
+
+// see init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    // msbs = mulhi(n, mp)
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L;
+}
+
+uint src0_idx(uint idx) {
+    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02_offset = i02*p.ne01*p.ne00;
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
+}
+
+uint dst_idx(uint idx) {
+    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
+}
+
+uint src0_idx_quant(uint idx, uint qk) {
+    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
+    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
+    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
+    const uint i02_offset = i02*p.ne01*p.ne00;
+    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
+    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00;
+}
+
+uint dst_idx_quant(uint idx, uint qk) {
+    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10;
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
new file mode 100644
index 000000000..e877ed779
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
@@ -0,0 +1,28 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint i00 = gl_GlobalInvocationID.x;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
+
+    if (i00 >= p.ne00) {
+        return;
+    }
+
+    const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
+
+    const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
+
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
+#else
+    data_d[d_offset + i00] = data_a[a_offset + i00];
+#endif
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
new file mode 100644
index 000000000..c16a2a9f6
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
@@ -0,0 +1,39 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+#include "dequant_funcs.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint i00 = (gl_GlobalInvocationID.x)*2;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
+
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+    if (i00 >= p.ne00) {
+        return;
+    }
+
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
+
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
+
+    const uint ib = a_offset + i00/QUANT_K; // block index
+    const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
+    const uint iybs = i00 - i00%QUANT_K; // dst block start index
+    const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
+
+    vec2 v = dequantize(ib, iqs, 0);
+    const vec2 dm = get_dm(ib, 0);
+    v = v * dm.x + dm.y;
+
+    data_d[d_offset + iybs + iqs           ] = D_TYPE(v.x);
+    data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
new file mode 100644
index 000000000..b6a0d5645
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp
@@ -0,0 +1,66 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 512
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+shared float tmp[BLOCK_SIZE];
+
+void main() {
+    const uint group_size = p.KX;
+    const float eps = p.param1;
+
+    const uint tid = gl_LocalInvocationID.x;
+    const uint start = gl_WorkGroupID.x * group_size + tid;
+    const uint end = (gl_WorkGroupID.x + 1) * group_size;
+
+    tmp[tid] = 0.0f;
+
+    // Calculate mean
+    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
+        tmp[tid] += float(data_a[col]);
+    }
+
+    // tmp up partial tmps and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier();
+    }
+
+    const float mean = tmp[0] / group_size;
+    barrier();
+    tmp[tid] = 0.0f;
+
+    // Calculate variance
+    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
+        const float xi = float(data_a[col]) - mean;
+        data_d[col] = D_TYPE(xi);
+        tmp[tid] += xi * xi;
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier();
+    }
+
+    const float variance = tmp[0] / group_size;
+    const float scale = inversesqrt(variance + eps);
+
+    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
+        data_d[col] *= D_TYPE(scale);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
new file mode 100644
index 000000000..122b1e93f
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
@@ -0,0 +1,87 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_spirv_intrinsics: enable
+#extension GL_EXT_control_flow_attributes : require
+
+#if RTE16
+spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
+#endif
+
+layout (push_constant) uniform parameter
+{
+    uint batch_offset; uint offset_delta;
+    uint IC;
+    uint IW; uint IH;
+    uint OW; uint OH;
+    uint KW; uint KH;
+    uint pelements;
+    uint CHW;
+    int s0; int s1;
+    int p0; int p1;
+    int d0; int d1;
+} p;
+
+#include "types.comp"
+
+layout(constant_id = 0) const uint BLOCK_SIZE = 32;
+
+const uint NUM_ITER = 512 / BLOCK_SIZE;
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint gidx = gl_GlobalInvocationID.x;
+
+    const uint oh = gl_GlobalInvocationID.y;
+    const uint batch = gl_GlobalInvocationID.z / p.IC;
+    const uint ic = gl_GlobalInvocationID.z % p.IC;
+
+    A_TYPE values[NUM_ITER];
+    uint offset_dst[NUM_ITER];
+    [[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
+        values[idx] = A_TYPE(0);
+    }
+
+    [[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
+
+        const uint i = gidx * NUM_ITER + idx;
+
+        const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
+        const uint kx = i / ksize;
+        const uint kd = kx * ksize;
+        const uint ky = (i - kd) / p.OW;
+        const uint ix = i % p.OW;
+
+        const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
+        const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
+
+        offset_dst[idx] =
+            ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
+            (ic * (p.KW * p.KH) + ky * p.KW + kx);
+
+        if (i >= p.pelements) {
+            continue;
+        }
+
+        if (iih < p.IH && iiw < p.IW) {
+            const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
+            values[idx] = data_a[offset_src + iih * p.IW + iiw];
+        }
+    }
+
+    [[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
+
+        const uint i = gidx * NUM_ITER + idx;
+
+        if (i >= p.pelements) {
+            continue;
+        }
+
+        data_d[offset_dst[idx]] = D_TYPE(values[idx]);
+    }
+
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp
new file mode 100644
index 000000000..d90a99aea
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp
@@ -0,0 +1,22 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float val = float(data_a[i]);
+    data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp
new file mode 100644
index 000000000..43de19df8
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp
@@ -0,0 +1,27 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+const uint num_threads = 256;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 2;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        uint i00, i01, i02, i03;
+        get_indices(idx, i00, i01, i02, i03);
+
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
+
+        idx += num_threads;
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
new file mode 100644
index 000000000..4c64fd47a
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp
@@ -0,0 +1,48 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {float data_a[];};
+layout (binding = 0) readonly buffer A4 {vec4 data_a4[];};
+layout (binding = 1) writeonly buffer D {float data_d[];};
+layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];};
+
+layout (push_constant) uniform parameter {
+    uint ne;
+    uint k_num;
+} p;
+
+void main() {
+    // Each invocation handles four consecutive components
+    const uint idx = gl_GlobalInvocationID.x * 4;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    // Check if all four components are in bounds and aligned,
+    // then use vector loads
+    if (idx + 3 < p.ne && (p.ne % 4) == 0) {
+        vec4 result = vec4(0.0f);
+
+        [[unroll]] for (uint i = 0; i < p.k_num; i++) {
+            result += data_a4[(i * p.ne + idx) / 4];
+        }
+
+        data_d4[idx / 4] = result;
+    } else {
+        [[unroll]] for (uint j = 0; j < 4; ++j) {
+            if (idx + j < p.ne) {
+                float result = 0.0f;
+
+                [[unroll]] for (uint i = 0; i < p.k_num; i++) {
+                    result += data_a[i * p.ne + idx + j];
+                }
+
+                data_d[idx + j] = result;
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
new file mode 100644
index 000000000..d7e99727d
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -0,0 +1,149 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
+#define K_PER_ITER 8
+#else
+#define K_PER_ITER 2
+#endif
+
+
+uint a_offset, b_offset, d_offset, y_offset;
+
+void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
+{
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
+        const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
+        const uint iybs = col - col%QUANT_K; // y block start index
+
+#if K_PER_ITER == 8
+#if QUANT_R == 2
+        const vec4 bv02 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
+        const vec4 bv13 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]);
+        const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
+        const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
+#else
+        const vec4 bv0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
+        const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
+#endif
+#else
+        // Check if the second of the pair of elements is OOB, and don't fetch B or
+        // accumulate it. We still fetch a pair of elements for A, which is fine for
+        // quantized formats since they'll be within the same block. We should
+        // probably skip fetching the second element for F16/F32, but as of now we
+        // still do.
+        const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
+
+        FLOAT_TYPE b0 = 0, b1 = 0;
+        b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
+        if (!OOB) {
+            b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
+        }
+#endif
+        uint ibi = first_row*p.ncols;
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            const uint ib = (ibi + col)/QUANT_K; // block index
+            ibi += p.ncols;
+
+#if K_PER_ITER == 8
+            vec4 v = dequantize4(ib, iqs, a_offset);
+            vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
+
+            const vec2 dm = get_dm(ib, a_offset);
+            if (dm.y != 0) { // quant has min component
+                v = v * dm.x + dm.y;
+                v2 = v2 * dm.x + dm.y;
+            }
+
+            // matrix multiplication
+            FLOAT_TYPE rowtmp = dot(bv0, v);
+            rowtmp += dot(bv1, v2);
+
+            if (dm.y == 0)
+                rowtmp *= dm.x;
+
+            temp[j][n] += rowtmp;
+#else
+            const vec2 v = dequantize(ib, iqs, a_offset);
+
+            // matrix multiplication
+            temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
+            if (!OOB) {
+                temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
+            }
+#endif
+        }
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    const uint tid = gl_LocalInvocationID.x;
+
+    get_offsets(a_offset, b_offset, d_offset);
+    a_offset /= QUANT_K;
+
+    y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
+
+    FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
+    if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
+        num_iters++;
+    }
+    int unroll_count = 4;
+    uint unrolled_iters = num_iters & ~(unroll_count - 1);
+
+    uint i = 0;
+    while (i < unrolled_iters) {
+        // Manually partially unroll the loop
+        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+            iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
+            i++;
+        }
+    }
+    unroll_count = 2;
+    unrolled_iters = num_iters & ~(unroll_count - 1);
+    while (i < unrolled_iters) {
+        // Manually partially unroll the loop
+        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
+            iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
+            i++;
+        }
+    }
+    while (i < num_iters) {
+        iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
+        i++;
+    }
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
new file mode 100644
index 000000000..903753c7e
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp
@@ -0,0 +1,118 @@
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+
+#ifdef MUL_MAT_ID
+#define EXPERT_COUNT 8
+#endif
+
+#include "types.comp"
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
+layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
+
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+#ifdef MUL_MAT_ID
+layout (binding = 3) readonly buffer IDS {int data_ids[];};
+#endif
+
+#include "dequant_funcs.comp"
+
+layout (push_constant) uniform parameter
+{
+    uint ncols;
+    uint stride_a;
+    uint stride_b;
+    uint stride_d;
+
+    uint batch_stride_a;
+    uint batch_stride_b;
+    uint batch_stride_d;
+
+#ifdef MUL_MAT_ID
+    uint nei0;
+    uint ne11;
+#else
+    uint ne02;
+    uint ne12;
+    uint broadcast2;
+    uint broadcast3;
+#endif
+} p;
+
+void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
+#ifdef MUL_MAT_ID
+    const uint expert_idx = gl_GlobalInvocationID.y;
+#else
+    const uint batch_idx = gl_GlobalInvocationID.y;
+#endif
+
+#ifndef MUL_MAT_ID
+    uint batch_idx_a = 0;
+    if (batch_idx != 0) {
+        const uint i13 = batch_idx / p.ne12;
+        const uint i12 = batch_idx % p.ne12;
+
+        const uint i03 = i13 / p.broadcast3;
+        const uint i02 = i12 / p.broadcast2;
+
+        batch_idx_a = i03 * p.ne02 + i02;
+    }
+#else
+    const uint expert_id = data_ids[expert_idx];
+#endif
+
+    a_offset =
+#ifdef MUL_MAT_ID
+            expert_id * p.batch_stride_a;
+#else
+            batch_idx_a * p.batch_stride_a;
+#endif
+    b_offset =
+#ifdef MUL_MAT_ID
+            (expert_idx % p.ne11) * p.stride_b;
+#else
+            batch_idx * p.batch_stride_b;
+#endif
+    d_offset =
+#ifdef MUL_MAT_ID
+            expert_idx * p.stride_d;
+#else
+            batch_idx * p.batch_stride_d;
+#endif
+}
+
+layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+layout (constant_id = 1) const uint NUM_ROWS = 1;
+layout (constant_id = 2) const uint NUM_COLS = 1;
+
+shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
+
+void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
+    // sum up partial sums and write back result
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            tmpsh[j][n][tid] = temp[j][n];
+        }
+    }
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+        if (tid < s) {
+            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+                [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                    tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
+                }
+            }
+        }
+        barrier();
+    }
+    if (tid == 0) {
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
new file mode 100644
index 000000000..1cc4996d3
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
@@ -0,0 +1,71 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#define BLOCK_SIZE 32
+#define FLOAT_TYPE float
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
+
+layout (push_constant) uniform parameter
+{
+    uint ncols_x;
+    uint nrows_x;
+    uint row_stride_x;
+    uint channel_stride_x;
+    uint channel_x_divisor;
+    uint b_offset;
+    uint d_offset;
+} p;
+
+shared FLOAT_TYPE tmp[BLOCK_SIZE];
+
+void main() {
+    const uint tid       = gl_LocalInvocationID.x;
+    const uint row_x     = gl_GlobalInvocationID.y;
+    const uint channel   = gl_GlobalInvocationID.z;
+    const uint channel_x = channel / p.channel_x_divisor;
+
+    const uint nrows_y   = p.ncols_x;
+    const uint nrows_dst = p.nrows_x;
+    const uint row_dst   = row_x;
+
+    const uint idst = channel*nrows_dst + row_dst;
+
+    tmp[tid] = 0.0f;
+
+    for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
+        const uint col_x = col_x0 + tid;
+
+        if (col_x >= p.ncols_x) {
+            break;
+        }
+
+        const uint row_y = col_x;
+
+        const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
+        const uint iy = channel*nrows_y + row_y;
+
+        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
+
+        tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier();
+    }
+
+    if (tid == 0) {
+        dst[idst] = tmp[0];
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp
new file mode 100644
index 000000000..9b443807d
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp
@@ -0,0 +1,73 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#define BLOCK_SIZE 32
+#define FLOAT_TYPE float
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
+
+layout (push_constant) uniform parameter
+{
+    uint ncols_x;
+    uint nrows_x;
+    uint nchannels_x;
+    uint nchannels_y;
+    uint b_offset;
+    uint d_offset;
+} p;
+
+shared FLOAT_TYPE tmp[BLOCK_SIZE];
+
+void main() {
+    const uint tid = gl_LocalInvocationID.x;
+    const uint row_x = gl_GlobalInvocationID.y;
+    const uint channel = gl_GlobalInvocationID.z;
+    const uint channel_x = channel / (p.nchannels_y / p.nchannels_x);
+
+    const uint nrows_y = p.ncols_x;
+    const uint nrows_dst = p.nrows_x;
+    const uint row_dst = row_x;
+
+    tmp[tid] = FLOAT_TYPE(0.0f);
+
+    for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
+        const uint col_x = col_x0 + tid;
+
+        if (col_x >= p.ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const uint ix = row_x*p.nchannels_x*p.ncols_x + channel_x*p.ncols_x + col_x;
+        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
+
+        const uint row_y = col_x;
+
+        // y is not transposed but permuted
+        const uint iy = channel*nrows_y + row_y;
+
+        tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
+    }
+
+    // dst is not transposed and not permuted
+    const uint idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier();
+    }
+
+    if (tid == 0) {
+        dst[idst] = tmp[0];
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
new file mode 100644
index 000000000..8cdc640e8
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -0,0 +1,129 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
+shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
+    const uint y_idx = i * QUANT_K + y_offset;
+
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+
+        barrier();
+        if (!all_threads) { // when we don't have enough blocks to use all threads
+            if (i < num_blocks_per_row) {
+                const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
+                sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
+                sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
+            }
+            barrier();
+
+            if (i >= num_blocks_per_row)
+                continue;
+        } else {
+            const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
+            sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
+            sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
+            barrier();
+        }
+
+        const uint32_t qs_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
+        const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303));
+        const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303));
+        const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
+        const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
+
+        vec2 d = vec2(data_a[ib0 + i].d);
+        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
+
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            vec2 b0 =   vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 +  0]);
+            vec2 b16 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 +  8]);
+            vec2 b32 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
+            vec2 b48 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]);
+            vec2 b64 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]);
+            vec2 b80 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]);
+            vec2 b96 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]);
+            vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]);
+
+            FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
+            FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
+            [[unroll]] for (int l = 0; l < 2; ++l) {
+                sum1 = fma(FLOAT_TYPE(b0[l]),   sccache1[ix][    8*v_im] * qs_u32_0[l  ],
+                       fma(FLOAT_TYPE(b16[l]),  sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
+                       fma(FLOAT_TYPE(b32[l]),  sccache1[ix][2 + 8*v_im] * qs_u32_2[l  ],
+                       fma(FLOAT_TYPE(b48[l]),  sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
+                       fma(FLOAT_TYPE(b64[l]),  sccache1[ix][4 + 8*v_im] * qs_u32_4[l  ],
+                       fma(FLOAT_TYPE(b80[l]),  sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
+                       fma(FLOAT_TYPE(b96[l]),  sccache1[ix][6 + 8*v_im] * qs_u32_6[l  ],
+                       fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
+                sum2 = fma(FLOAT_TYPE(b0[l]),   sccache2[ix][    8*v_im],
+                       fma(FLOAT_TYPE(b16[l]),  sccache2[ix][1 + 8*v_im],
+                       fma(FLOAT_TYPE(b32[l]),  sccache2[ix][2 + 8*v_im],
+                       fma(FLOAT_TYPE(b48[l]),  sccache2[ix][3 + 8*v_im],
+                       fma(FLOAT_TYPE(b64[l]),  sccache2[ix][4 + 8*v_im],
+                       fma(FLOAT_TYPE(b80[l]),  sccache2[ix][5 + 8*v_im],
+                       fma(FLOAT_TYPE(b96[l]),  sccache2[ix][6 + 8*v_im],
+                       fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
+            }
+            temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
+        }
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint it_size = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid%16;  // 0...15
+    const uint ix = tid/16;
+
+    const uint v_im = itid/8;                                // 0 or 1. 0 computes 0..., 1 computes 128...
+    const uint v_in = itid - 8*v_im;                         // 0...7
+
+    const uint l0 = 2*v_in;                                  // 0...15
+    const uint q_offset = 32*v_im + l0;
+    const uint y_offset = 128*v_im + l0;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    const uint nbr_par_th = num_blocks_per_row%it_size;
+    const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
+    uint i0 = 0;
+    [[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
+        calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
+    calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
new file mode 100644
index 000000000..3116fad16
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -0,0 +1,132 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
+    const uint y_idx = i * QUANT_K + y_offset;
+
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+
+        if (!all_threads) { // when we don't have enough blocks to use all threads
+            barrier();
+            if (i < num_blocks_per_row)
+                sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
+            barrier();
+
+            if (i >= num_blocks_per_row)
+                continue;
+        }
+
+        const uint32_t hmk = ~(uint32_t(data_a_packed16[ib0 + i].hmask[v_in]) | (uint32_t(data_a_packed16[ib0 + i].hmask[v_in + 8]) << 16));
+        const vec4 hmk_0 = vec4(unpack8(((hmk & hm_m[0]) >> (    v_im4)) << 2));
+        const vec4 hmk_1 = vec4(unpack8(((hmk & hm_m[1]) >> (1 + v_im4)) << 2));
+        const vec4 hmk_2 = vec4(unpack8(((hmk & hm_m[2]) >> (2 + v_im4)) << 2));
+        const vec4 hmk_3 = vec4(unpack8(((hmk & hm_m[3]) >> (3 + v_im4)) << 2));
+
+        // 0, 1, 16, 17
+        uint32_t qs_u32 = uint32_t(data_a[ib0 + i].qs[q_offset]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 1]) << 8);
+        qs_u32 |= (uint32_t(data_a[ib0 + i].qs[q_offset + 16]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 17]) << 8)) << 16;
+        const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303));
+        const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303));
+        const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
+        const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
+
+        if (all_threads) {
+            barrier();
+            sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
+            barrier();
+        }
+
+        const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
+
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            vec2 b0 =   vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 +  0]);
+            vec2 b16 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 +  8]);
+            vec2 b32 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
+            vec2 b48 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]);
+            vec2 b64 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]);
+            vec2 b80 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]);
+            vec2 b96 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]);
+            vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]);
+
+            FLOAT_TYPE sum = FLOAT_TYPE(0.0);
+            [[unroll]] for (int l = 0; l < 2; ++l) {
+                sum = fma(FLOAT_TYPE(  b0[l]) * sccache[ix][v_im][0], qs_u32_0[l  ] - hmk_0[l  ],
+                      fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
+                      fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l  ] - hmk_1[l  ],
+                      fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
+                      fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l  ] - hmk_2[l  ],
+                      fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
+                      fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l  ] - hmk_3[l  ],
+                      fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
+            }
+            temp[j][n] = fma(d, sum, temp[j][n]);
+        }
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint it_size = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid%16;  // 0...15
+    const uint ix = tid/16;
+    const uint itid8 = itid%8;
+
+    const uint v_im = itid/8;                               // 0 or 1. 0 computes 0..., 1 computes 128...
+    const uint v_im4 = v_im*4;
+    const uint v_in = itid - 8*v_im;                        // 0...7
+
+    const uint32_t m = 0x01010101 << (4 * v_im);
+    uint32_t hm_m[4];
+    [[unroll]] for (uint j = 0; j < 4; ++j)
+        hm_m[j] = m << j;
+
+    const uint l0 = 2*v_in;                                 // 0...15
+    const uint q_offset = 32*v_im + l0;
+    const uint y_offset = 128*v_im + l0;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    const uint s_shift = v_im4 + 2*(itid8/4);
+
+    const uint nbr_par_th = num_blocks_per_row%it_size;
+    const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
+    uint i0 = 0;
+    [[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
+        calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
+    calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
new file mode 100644
index 000000000..f9cde0648
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
@@ -0,0 +1,136 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y1_idx = i * QUANT_K + y_offset;
+    const uint y2_idx = y1_idx + 128;
+
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        vec2 d = vec2(data_a[ib0 + i].d);
+        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
+
+        const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
+        const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
+        const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
+
+        const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
+        const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
+        const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
+        const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
+
+        const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
+        const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
+        const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
+        const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
+        const FLOAT_TYPE sc4 = scale8_f.x;
+        const FLOAT_TYPE sc5 = scale8_f.y;
+        const FLOAT_TYPE sc6 = scale8_f.z;
+        const FLOAT_TYPE sc7 = scale8_f.w;
+
+        const uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
+        const uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
+
+        const uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
+        const uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
+        const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
+        const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
+
+        const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
+        const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
+        const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
+        const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
+
+        const FLOAT_TYPE q4_0  = qs0_lo4.x;
+        const FLOAT_TYPE q4_1  = qs0_lo4.y;
+        const FLOAT_TYPE q4_2  = qs0_lo4.z;
+        const FLOAT_TYPE q4_3  = qs0_lo4.w;
+        const FLOAT_TYPE q4_4  = qs0_hi4.x;
+        const FLOAT_TYPE q4_5  = qs0_hi4.y;
+        const FLOAT_TYPE q4_6  = qs0_hi4.z;
+        const FLOAT_TYPE q4_7  = qs0_hi4.w;
+        const FLOAT_TYPE q4_8  = qs64_lo4.x;
+        const FLOAT_TYPE q4_9  = qs64_lo4.y;
+        const FLOAT_TYPE q4_10 = qs64_lo4.z;
+        const FLOAT_TYPE q4_11 = qs64_lo4.w;
+        const FLOAT_TYPE q4_12 = qs64_hi4.x;
+        const FLOAT_TYPE q4_13 = qs64_hi4.y;
+        const FLOAT_TYPE q4_14 = qs64_hi4.z;
+        const FLOAT_TYPE q4_15 = qs64_hi4.w;
+
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            vec4 by10 =  vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4    ]);
+            vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]);
+            vec4 by20 =  vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4    ]);
+            vec4 by232 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]);
+
+            const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x),      q4_0,  fma(FLOAT_TYPE(by10.y),  q4_1,  fma(FLOAT_TYPE(by10.z),  q4_2,  FLOAT_TYPE(by10.w) *  q4_3)));
+            const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x),     q4_4,  fma(FLOAT_TYPE(by132.y), q4_5,  fma(FLOAT_TYPE(by132.z), q4_6,  FLOAT_TYPE(by132.w) * q4_7)));
+            const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x),      q4_8,  fma(FLOAT_TYPE(by20.y),  q4_9,  fma(FLOAT_TYPE(by20.z),  q4_10, FLOAT_TYPE(by20.w) *  q4_11)));
+            const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x),     q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
+            const FLOAT_TYPE smin =
+                fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
+                fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
+                fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
+                fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6,     FLOAT_TYPE(by232.w) * sc7)))))))))))))));
+            temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
+        }
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint it_size = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid%16;  // 0...15
+    const uint ix = tid/16;
+
+    const uint il = itid/4;                         // 0...3
+    const uint ir = itid - 4*il;                    // 0...3
+    const uint n =  4;
+
+    const uint v_im = il / 2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const uint v_in = il % 2;
+
+    const uint l0 = n * (2 * ir + v_in);            // 0...15
+    const uint q_offset = 32*v_im + l0;
+    const uint y_offset = 64*v_im + l0;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
+        calc_superblock(a_offset, b_offset, v_im, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
new file mode 100644
index 000000000..6c84ef3cd
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp
@@ -0,0 +1,167 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint l0, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y1_idx = i * QUANT_K + y_offset;
+    const uint y2_idx = y1_idx + 128;
+
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+        vec2 d = vec2(data_a[ib0 + i].d);
+        const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
+        const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
+
+        const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im    ];
+        const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
+        const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
+
+        const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
+        const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
+        const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
+        const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
+
+        const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
+        const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
+        const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
+        const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
+        const FLOAT_TYPE sc4 = scale8_f.x;
+        const FLOAT_TYPE sc5 = scale8_f.y;
+        const FLOAT_TYPE sc6 = scale8_f.z;
+        const FLOAT_TYPE sc7 = scale8_f.w;
+
+        const uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
+        const uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
+
+        uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
+        uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
+        uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
+        uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
+
+        const uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
+
+        const uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
+        const uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
+        const uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010);
+        const uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
+
+        qs0_16_u32_lo4 += qs0_16_lo4_offset16;
+        qs0_16_u32_hi4 += qs0_16_hi4_offset16;
+        qs64_80_u32_lo4 += qs64_80_lo4_offset16;
+        qs64_80_u32_hi4 += qs64_80_hi4_offset16;
+
+        const vec4 qs0_16_lo4 = vec4(unpack8(qs0_16_u32_lo4));
+        const vec4 qs64_80_lo4 = vec4(unpack8(qs64_80_u32_lo4));
+        const vec4 qs0_16_hi4 = vec4(unpack8(qs0_16_u32_hi4));
+        const vec4 qs64_80_hi4 = vec4(unpack8(qs64_80_u32_hi4));
+
+        const FLOAT_TYPE q4_0  = qs0_16_lo4.x;
+        const FLOAT_TYPE q4_1  = qs0_16_lo4.y;
+        const FLOAT_TYPE q4_2  = qs0_16_lo4.z;
+        const FLOAT_TYPE q4_3  = qs0_16_lo4.w;
+        const FLOAT_TYPE q4_4  = qs0_16_hi4.x;
+        const FLOAT_TYPE q4_5  = qs0_16_hi4.y;
+        const FLOAT_TYPE q4_6  = qs0_16_hi4.z;
+        const FLOAT_TYPE q4_7  = qs0_16_hi4.w;
+        const FLOAT_TYPE q4_8  = qs64_80_lo4.x;
+        const FLOAT_TYPE q4_9  = qs64_80_lo4.y;
+        const FLOAT_TYPE q4_10 = qs64_80_lo4.z;
+        const FLOAT_TYPE q4_11 = qs64_80_lo4.w;
+        const FLOAT_TYPE q4_12 = qs64_80_hi4.x;
+        const FLOAT_TYPE q4_13 = qs64_80_hi4.y;
+        const FLOAT_TYPE q4_14 = qs64_80_hi4.z;
+        const FLOAT_TYPE q4_15 = qs64_80_hi4.w;
+
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            vec2 by10 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2     ]);
+            vec2 by116 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 +  8]);
+            vec2 by132 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]);
+            vec2 by148 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]);
+            vec2 by20 =  vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2     ]);
+            vec2 by216 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 +  8]);
+            vec2 by232 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]);
+            vec2 by248 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]);
+
+            const FLOAT_TYPE sx =
+              fma(FLOAT_TYPE(by10.x), q4_0,
+              fma(FLOAT_TYPE(by10.y), q4_1,
+              fma(FLOAT_TYPE(by116.x), q4_2,
+                 FLOAT_TYPE(by116.y) * q4_3)));
+            const FLOAT_TYPE sy =
+              fma(FLOAT_TYPE(by132.x), q4_4,
+              fma(FLOAT_TYPE(by132.y), q4_5,
+              fma(FLOAT_TYPE(by148.x), q4_6,
+                 FLOAT_TYPE(by148.y) * q4_7)));
+            const FLOAT_TYPE sz =
+              fma(FLOAT_TYPE(by20.x), q4_8,
+              fma(FLOAT_TYPE(by20.y), q4_9,
+              fma(FLOAT_TYPE(by216.x), q4_10,
+                 FLOAT_TYPE(by216.y) * q4_11)));
+            const FLOAT_TYPE sw =
+              fma(FLOAT_TYPE(by232.x), q4_12,
+              fma(FLOAT_TYPE(by232.y), q4_13,
+              fma(FLOAT_TYPE(by248.x), q4_14,
+                 FLOAT_TYPE(by248.y) * q4_15)));
+            const FLOAT_TYPE smin =
+              fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
+              fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
+              fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
+                  (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
+            temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
+        }
+    }
+}
+
+void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint it_size = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid%16;  // 0...15
+    const uint ix = tid/16;
+
+    const uint il = itid/4;                          // 0...3
+    const uint ir = itid - 4*il;                     // 0...3
+
+    const uint v_im = il / 2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const uint v_in = il % 2;
+
+    const uint l0 = 4*ir + 2*v_in;                   // 0...15
+    const uint q_offset = 32*v_im + l0;
+    const uint y_offset = 64*v_im + l0;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
+        calc_superblock(a_offset, b_offset, v_im, l0, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
new file mode 100644
index 000000000..f05f96b5e
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -0,0 +1,130 @@
+#version 450
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#include "mul_mat_vec_base.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
+
+FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
+
+void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
+    const uint y_idx = i * QUANT_K + y_offset;
+
+    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
+
+        if (!all_threads) { // when we don't have enough blocks to use all threads
+            barrier();
+            if (i < num_blocks_per_row)
+                sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
+            barrier();
+
+            if (i >= num_blocks_per_row)
+                continue;
+        }
+
+        const uint32_t ql0_u32 =  uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
+        const uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
+
+        const uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
+        const uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
+        const uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
+        const uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
+
+        const uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
+        const uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
+        const uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
+        const uint32_t qh4_u32 = (qh_u32 & 0x30303030);
+        const uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
+
+        const uint32_t q0_u32 = ql0_u32_lo4  | qh0_u32;
+        const uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
+        const uint32_t q2_u32 = ql0_u32_hi4  | qh4_u32;
+        const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
+
+        const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
+        const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
+        const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
+        const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
+
+        if (all_threads) {
+            barrier();
+            sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
+            barrier();
+        }
+
+        const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
+
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            vec4 by0  = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4     ]);
+            vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 +  8]);
+            vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
+            vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
+
+            FLOAT_TYPE sum[4] = {0, 0, 0, 0};
+            [[unroll]] for (uint l = 0; l < 4; ++l) {
+                sum[0] = fma(FLOAT_TYPE(by0[l]), q0[l], sum[0]);
+                sum[1] = fma(FLOAT_TYPE(by32[l]), q1[l], sum[1]);
+                sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
+                sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
+            }
+            temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
+        }
+    }
+}
+
+void compute_outputs(const uint first_row, const uint num_rows) {
+    uint a_offset, b_offset, d_offset;
+    get_offsets(a_offset, b_offset, d_offset);
+
+    const uint num_blocks_per_row = p.ncols / QUANT_K;
+
+    // 16 threads are used to process each block
+    const uint it_size = gl_WorkGroupSize.x/16;
+    const uint tid = gl_LocalInvocationID.x;
+    const uint itid = tid%16;  // 0...15
+    const uint ix = tid/16;
+
+    const uint v_im = itid/8;                               // 0 or 1. 0 computes 0..., 1 computes 128...
+    const uint v_in = itid - 8*v_im;                        // 0...7
+
+    const uint l0 = 4 * v_in;                               // 0, 4, 8, ..., 28
+    const uint is = v_in / 4;
+
+    const uint ql_offset = 64*v_im + l0;
+    const uint qh_offset = 32*v_im + l0;
+    const uint s_offset  =  8*v_im + is;
+    const uint y_offset = 128*v_im + l0;
+
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+            temp[j][i] = FLOAT_TYPE(0);
+        }
+    }
+
+    const uint nbr_par_th = num_blocks_per_row%it_size;
+    const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
+    uint i0 = 0;
+    [[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
+        calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
+    calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
+
+    reduce_result(temp, d_offset, first_row, num_rows, tid);
+}
+
+void main() {
+    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
+
+    // do NUM_ROWS at a time, unless there aren't enough remaining rows
+    if (first_row + NUM_ROWS <= p.stride_d) {
+        compute_outputs(first_row, NUM_ROWS);
+    } else {
+        if (first_row >= p.stride_d) {
+            return;
+        }
+        compute_outputs(first_row, p.stride_d - first_row);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
new file mode 100644
index 000000000..33b2234e7
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -0,0 +1,760 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#ifdef FLOAT16
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#endif
+
+#ifdef COOPMAT
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#endif
+
+#ifdef MUL_MAT_ID
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#endif
+
+#include "types.comp"
+
+#ifndef LOAD_VEC_A
+#define LOAD_VEC_A 1
+#endif
+#ifndef LOAD_VEC_B
+#define LOAD_VEC_B 1
+#endif
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+#ifdef MUL_MAT_ID
+layout (binding = 3) readonly buffer IDS {int data_ids[];};
+#endif
+
+layout (push_constant) uniform parameter
+{
+    uint M;
+    uint N;
+    uint K;
+    uint stride_a;
+    uint stride_b;
+    uint stride_d;
+
+    uint batch_stride_a;
+    uint batch_stride_b;
+    uint batch_stride_d;
+
+#ifdef MUL_MAT_ID
+    uint nei0;
+    uint nei1;
+    uint nbi1;
+    uint ne11;
+#else
+    uint k_split;
+    uint ne02;
+    uint ne12;
+    uint broadcast2;
+    uint broadcast3;
+#endif
+} p;
+
+layout (constant_id = 0) const uint BLOCK_SIZE = 64;
+layout (constant_id = 1) const uint BM = 64;
+layout (constant_id = 2) const uint BN = 64;
+layout (constant_id = 3) const uint BK = 16;  // Assumed to be 32 if working with a quant
+layout (constant_id = 4) const uint WM = 32;
+layout (constant_id = 5) const uint WN = 32;
+layout (constant_id = 6) const uint WMITER = 2;
+layout (constant_id = 7) const uint TM = 4;
+layout (constant_id = 8) const uint TN = 2;
+layout (constant_id = 9) const uint TK = 1;  // Only needed for coopmat
+layout (constant_id = 10) const uint WARP = 32;
+
+#ifdef COOPMAT
+#define SHMEM_STRIDE (BK + 8)
+#else
+#define SHMEM_STRIDE (BK + 1)
+#endif
+
+shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
+shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
+
+#ifdef MUL_MAT_ID
+shared u16vec2 row_ids[3072];
+#endif // MUL_MAT_ID
+
+#define NUM_WARPS (BLOCK_SIZE / WARP)
+
+#ifdef COOPMAT
+shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
+#endif
+
+void main() {
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+#ifdef MUL_MAT_ID
+    const uint expert_idx = gl_GlobalInvocationID.z;
+#else
+    const uint batch_idx = gl_GlobalInvocationID.z;
+
+    const uint i13 = batch_idx / p.ne12;
+    const uint i12 = batch_idx % p.ne12;
+
+    const uint i03 = i13 / p.broadcast3;
+    const uint i02 = i12 / p.broadcast2;
+
+    const uint batch_idx_a = i03 * p.ne02 + i02;
+#endif
+
+    const uint blocks_m = (p.M + BM - 1) / BM;
+    const uint ir = gl_WorkGroupID.x % blocks_m;
+    const uint ik = gl_WorkGroupID.x / blocks_m;
+    const uint ic = gl_WorkGroupID.y;
+
+    const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
+    const uint WSUBM = WM / WMITER;
+    const uint WSUBN = WN / WNITER;
+
+#ifdef COOPMAT
+    const uint warp_i = gl_SubgroupID;
+
+    const uint tiw = gl_SubgroupInvocationID;
+
+    const uint cms_per_row = WM / TM;
+    const uint cms_per_col = WN / TN;
+
+    const uint storestride = WARP / TM;
+    const uint store_r = tiw % TM;
+    const uint store_c = tiw / TM;
+#else
+    const uint warp_i = gl_LocalInvocationID.x / WARP;
+
+    const uint tiw = gl_LocalInvocationID.x % WARP;
+
+    const uint tiwr = tiw % (WSUBM / TM);
+    const uint tiwc = tiw / (WSUBM / TM);
+#endif
+
+    const uint warp_r = warp_i % (BM / WM);
+    const uint warp_c = warp_i / (BM / WM);
+
+    const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A);
+    const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A);
+    const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B);
+    const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B);
+
+    const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A / BK;
+    const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK;
+
+#ifdef MUL_MAT_ID
+    uint _ne1 = 0;
+    for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
+        for (uint ii0 = 0; ii0 < p.nei0; ii0++) {
+            if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
+                row_ids[_ne1] = u16vec2(ii0, ii1);
+                _ne1++;
+            }
+        }
+    }
+
+    barrier();
+
+    // Workgroup has no work
+    if (ic * BN >= _ne1) return;
+#endif
+
+#ifdef MUL_MAT_ID
+    const uint start_k = 0;
+    const uint end_k = p.K;
+#else
+    const uint start_k = ik * p.k_split;
+    const uint end_k = min(p.K, (ik + 1) * p.k_split);
+#endif
+
+    uint pos_a = (
+#ifdef MUL_MAT_ID
+        expert_idx * p.batch_stride_a +
+#else
+        batch_idx_a * p.batch_stride_a +
+#endif
+        ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
+#ifdef MUL_MAT_ID
+    uint pos_b = 0;
+#else
+    uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B;
+#endif
+
+#ifdef COOPMAT
+    coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a;
+    coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
+    coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
+
+    [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
+        sums[i] = coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0f);
+    }
+#else
+    ACC_TYPE sums[WMITER * TM * WNITER * TN];
+    FLOAT_TYPE cache_a[WMITER * TM];
+    FLOAT_TYPE cache_b[WNITER * TN];
+
+    [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
+        sums[i] = ACC_TYPE(0.0f);
+    }
+#endif
+
+    for (uint block = start_k; block < end_k; block += BK) {
+        [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) {
+
+#if defined(DATA_A_F32) || defined(DATA_A_F16)
+#if LOAD_VEC_A == 8
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+            buf_a[buf_idx    ] = FLOAT_TYPE(data_a[idx][0].x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y);
+            buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx][0].z);
+            buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx][0].w);
+            buf_a[buf_idx + 4] = FLOAT_TYPE(data_a[idx][1].x);
+            buf_a[buf_idx + 5] = FLOAT_TYPE(data_a[idx][1].y);
+            buf_a[buf_idx + 6] = FLOAT_TYPE(data_a[idx][1].z);
+            buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w);
+#elif LOAD_VEC_A == 4
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+            buf_a[buf_idx    ] = FLOAT_TYPE(data_a[idx].x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y);
+            buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z);
+            buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w);
+#else
+            if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
+                buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
+            } else {
+                buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f);
+            }
+#endif
+#elif defined(DATA_A_Q4_0)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
+
+            const uint ib = idx / 16;
+            const uint iqs = idx & 0xF;
+
+            const float d = float(data_a[ib].d);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const vec2 v = (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q4_1)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
+
+            const uint ib = idx / 16;
+            const uint iqs = idx & 0xF;
+
+            const float d = float(data_a[ib].d);
+            const float m = float(data_a[ib].m);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const vec2 v = vec2(vui & 0xF, vui >> 4) * d + m;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q5_0)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
+
+            const uint ib = idx / 16;
+            const uint iqs = idx & 0xF;
+
+            const float d = float(data_a[ib].d);
+            const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
+            const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const vec2 v = (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q5_1)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
+
+            const uint ib = idx / 16;
+            const uint iqs = idx & 0xF;
+
+            const float d = float(data_a[ib].d);
+            const float m = float(data_a[ib].m);
+            const uint uint_qh = data_a[ib].qh;
+            const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q8_0)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 16;
+            const uint iqs = (idx & 0xF) * 2;
+
+            const float d = float(data_a[ib].d);
+            const vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])) * d;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q2_K)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                         // 2 values per idx
+            const uint iqs = idx % 128;                        // 0..127
+
+            const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
+            const uint scalesi = iqs / 8;                      // 0..15
+            const uint qsshift = ((iqs % 64) / 16) * 2;        // 0,2,4,6
+
+            const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
+            const uint scales = data_a[ib].scales[scalesi];
+            const vec2 d = vec2(data_a[ib].d);
+
+            const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q3_K)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                   // 2 values per idx
+            const uint iqs = idx % 128;                  // 0..127
+
+            const uint n = iqs / 64;                     // 0,1
+            const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
+            const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
+            const uint j = (iqs % 64) / 4;               // 0..3
+            const uint is = iqs / 8;                     // 0..15
+            const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
+            const uint qsshift = halfsplit * 2;          // 0,2,4,6
+            const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
+
+            const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
+                                  | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
+            const float dl = float(data_a[ib].d) * float(us - 32);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi    ] & m) != 0) ? 0 : 4)));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
+#elif defined(DATA_A_Q4_K)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            const uint scidx0 = (is < 4) ? is : (is + 4);
+            const uint scidx1 = (is < 4) ? is : (is - 4);
+            const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint scidxshift1 = (is < 4) ? 0 : 2;
+            const uint mbidx0 = is + 4;
+            const uint mbidx1 = (is < 4) ? is + 4 : is;
+            const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+            const uint mbidxshift0 = (is < 4) ? 0 : 4;
+            const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+            const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+            const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+            const float d = loadd.x * sc;
+            const float m = -loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF), m));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
+#elif defined(DATA_A_Q5_K)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+            const uint qhi = (iqs % 16) * 2;           // 0,2,4..30
+
+            const uint8_t hm = uint8_t(1 << (iqs / 16));
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            const uint scidx0 = (is < 4) ? is : (is + 4);
+            const uint scidx1 = (is < 4) ? is : (is - 4);
+            const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint scidxshift1 = (is < 4) ? 0 : 2;
+            const uint mbidx0 = is + 4;
+            const uint mbidx1 = (is < 4) ? is + 4 : is;
+            const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+            const uint mbidxshift0 = (is < 4) ? 0 : 4;
+            const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+            const uint8_t sc    = uint8_t((data_a[ib].scales[scidx0] & 0xF)                         | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+            const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+            const float d = loadd.x * sc;
+            const float m = -loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi    ] & hm) != 0 ? 16 : 0), m));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
+#elif defined(DATA_A_Q6_K)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = idx % 128;                 // 0..127
+
+            const uint n = iqs / 64;                    // 0,1
+            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint is_b = (iqs % 16) / 8;           // 0,1
+            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
+            const uint is = 8 * n + qhshift + is_b;     // 0..15
+            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
+            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+
+            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+#elif defined(DATA_A_IQ2_XXS)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint ib32 = (idx % 128) / 16;         // 0..7
+            const uint ib8 = (idx / 4) % 4;
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[8 * ib32 + ib8];
+            const uint signs = pack32(u8vec4(
+                data_a[ib].qs[8*ib32 + 4],
+                data_a[ib].qs[8*ib32 + 5],
+                data_a[ib].qs[8*ib32 + 6],
+                data_a[ib].qs[8*ib32 + 7]
+            ));
+            const float db = d * 0.25 * (0.5 + (signs >> 28));
+            const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
+            const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
+            const uint grid = iq2xxs_grid[qs][(idx % 4) / 2] >> (16 * (idx & 1));
+            const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ2_XS)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint ib32 = (idx % 128) / 16;         // 0..7
+            const uint ib8 = (idx / 4) % 4;             // 0..3
+
+            const float d = float(data_a[ib].d);
+            const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
+            const float db = d * 0.25 * (0.5 + scale);
+            const uint qs = data_a[ib].qs[4 * ib32 + ib8];
+            const uint sign7 = qs >> 9;
+            const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
+            const uint grid = iq2xs_grid[qs & 511][(idx % 4) / 2] >> (16 * (idx & 1));
+            const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ2_S)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;        // 2 values per idx
+            const uint ib8 = (idx % 128) / 4; // 0..31
+            const uint ib32 = ib8 / 4;        // 0..7
+
+            const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
+            const uint qs = data_a[ib].qs[ib8];
+            const uint qh = data_a[ib].qh[ib32];
+            const uint qhshift = 2 * (ib8 % 4);
+            const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4));
+
+            const float d = float(data_a[ib].d);
+            const float db = d * 0.25 * (0.5 + scale);
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
+            const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1];
+            const vec2 v = db * vec2(sign01) * vec2(unpack8(grid));
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ3_XXS)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = (idx % 128) / 2;           // 0..63
+            const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[iqs];
+            const uint signs = pack32(u8vec4(
+                data_a[ib].qs[is+0],
+                data_a[ib].qs[is+1],
+                data_a[ib].qs[is+2],
+                data_a[ib].qs[is+3]
+            ));
+            const float db = d * 0.5 * (0.5 + (signs >> 28));
+            const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
+            const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
+            const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1));
+            const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ3_S)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = (idx % 128) / 2;           // 0..63
+            const uint iqh = iqs / 8;
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[iqs];
+            const uint qh = data_a[ib].qh[iqh];
+            const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (2 * (idx % 4)));
+            const uint scale = data_a[ib].scales[iqs / 16];
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
+            const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
+            const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
+            const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ4_XS)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint ib32 = (idx % 128) / 16;         // 0..7
+            const uint iq = 16 * ib32 + 2 * (idx % 8);
+
+            const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+            const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
+            const uint qshift = (idx & 8) >> 1;
+            u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
+            qs = (qs >> qshift) & uint8_t(0xF);
+
+            const float d = float(data_a[ib].d);
+            const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ4_NL)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
+
+            const uint ib = idx / 16;
+            const uint iqs = idx & 0xF;
+
+            const float d = float(data_a[ib].d);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+#endif
+        }
+        [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
+#if LOAD_VEC_B == 8
+#ifdef MUL_MAT_ID
+            const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l];
+            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
+#else
+            const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
+#endif
+            const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
+            buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x);
+            buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y);
+            buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z);
+            buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx][0].w);
+            buf_b[buf_idx + 4] = FLOAT_TYPE(data_b[idx][1].x);
+            buf_b[buf_idx + 5] = FLOAT_TYPE(data_b[idx][1].y);
+            buf_b[buf_idx + 6] = FLOAT_TYPE(data_b[idx][1].z);
+            buf_b[buf_idx + 7] = FLOAT_TYPE(data_b[idx][1].w);
+#elif LOAD_VEC_B == 4
+#ifdef MUL_MAT_ID
+            const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l];
+            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
+#else
+            const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
+#endif
+            const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
+            buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx].x);
+            buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx].y);
+            buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx].z);
+            buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx].w);
+#elif !MUL_MAT_ID
+            if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
+                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
+            } else {
+                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
+            }
+#else
+            const uint row_i = ic * BN + loadc_b + l;
+            if (row_i < _ne1) {
+                const u16vec2 row_idx = row_ids[row_i];
+                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]);
+            } else {
+                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
+            }
+#endif
+        }
+
+        barrier();
+
+        pos_a += BK / LOAD_VEC_A;
+        pos_b += BK / LOAD_VEC_B;
+
+#ifdef COOPMAT
+        [[unroll]] for (uint i = 0; i < BK; i += TK) {
+            [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+                // Load from shared into cache
+                coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);
+
+                [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+                    coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);
+
+                    sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a, cache_b, sums[cm_col * cms_per_row + cm_row]);
+                }
+            }
+        }
+#else
+        [[unroll]] for (uint i = 0; i < BK; i++) {
+            // Load from shared into cache
+            [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
+                [[unroll]] for (uint j = 0; j < TM; j++) {
+                    cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i];
+                }
+            }
+            [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
+                [[unroll]] for (uint j = 0; j < TN; j++) {
+                    cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
+                }
+            }
+
+            [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
+                [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
+                    [[unroll]] for (uint cc = 0; cc < TN; cc++) {
+                        [[unroll]] for (uint cr = 0; cr < TM; cr++) {
+                            const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
+                            sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]);
+                        }
+                    }
+                }
+            }
+        }
+#endif
+
+        barrier();
+    }
+
+    const uint dr = ir * BM + warp_r * WM;
+    const uint dc = ic * BN + warp_c * WN;
+
+#ifndef MUL_MAT_ID
+    const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
+#endif
+
+#ifdef COOPMAT
+#ifdef MUL_MAT_ID
+    [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+        [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+            coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
+
+            [[unroll]] for (uint col = 0; col < BN; col += storestride) {
+                const uint row_i = dc + cm_col * TN + col + store_c;
+                if (row_i >= _ne1) break;
+
+                const u16vec2 row_idx = row_ids[row_i];
+
+                data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
+            }
+        }
+    }
+#else
+    const bool is_aligned = p.stride_d % 4 == 0;  // Assumption: D_TYPE == float
+
+    [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+        [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+            const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N;
+
+            if (is_aligned && is_in_bounds) {
+                // Full coopMat is within bounds and stride_d is aligned with 16B
+                coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_dtype = coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(sums[cm_col * cms_per_row + cm_row]);
+                coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor);
+            } else if (is_in_bounds) {
+                // Full coopMat is within bounds, but stride_d is not aligned
+                coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
+
+                [[unroll]] for (uint col = 0; col < TN; col += storestride) {
+                    data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
+                }
+            } else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) {
+                // Partial coopMat is within bounds
+                coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
+
+                [[unroll]] for (uint col = 0; col < TN; col += storestride) {
+                    if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) {
+                        data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
+                    }
+                }
+            }
+        }
+    }
+#endif // MUL_MAT_ID
+#else
+    [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
+        [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
+
+            const uint dr_warp = dr + wsir * WSUBM + tiwr * TM;
+            const uint dc_warp = dc + wsic * WSUBN + tiwc * TN;
+            [[unroll]] for (uint cc = 0; cc < TN; cc++) {
+#ifdef MUL_MAT_ID
+                const uint row_i = dc_warp + cc;
+                if (row_i >= _ne1) break;
+
+                const u16vec2 row_idx = row_ids[row_i];
+#endif // MUL_MAT_ID
+                [[unroll]] for (uint cr = 0; cr < TM; cr++) {
+#ifdef MUL_MAT_ID
+                    data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
+#else
+                    if (dr_warp + cr < p.M && dc_warp + cc < p.N) {
+                        data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
+                    }
+#endif // MUL_MAT_ID
+                }
+            }
+        }
+    }
+#endif // COOPMAT
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
new file mode 100644
index 000000000..7e29bbfec
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -0,0 +1,311 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_NV_cooperative_matrix2 : enable
+#extension GL_EXT_buffer_reference : enable
+#extension GL_KHR_shader_subgroup_ballot : enable
+#extension GL_KHR_shader_subgroup_vote : enable
+
+#include "types.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (constant_id = 1) const uint BM = 64;
+layout (constant_id = 2) const uint BN = 64;
+layout (constant_id = 3) const uint BK = 16;  // Assumed to be 32 if working with a quant
+
+layout (push_constant) uniform parameter
+{
+    uint M;
+    uint N;
+    uint K;
+    uint stride_a;
+    uint stride_b;
+    uint stride_d;
+
+    uint batch_stride_a;
+    uint batch_stride_b;
+    uint batch_stride_d;
+
+#ifdef MUL_MAT_ID
+    uint nei0;
+    uint nei1;
+    uint nbi1;
+    uint ne11;
+#else
+    uint k_split;
+    uint ne02;
+    uint ne12;
+    uint broadcast2;
+    uint broadcast3;
+#endif
+} p;
+
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+#if QUANT_K > 1
+#define DECODEFUNCA , dequantFuncA
+
+#include "dequant_funcs_cm2.comp"
+
+#else
+#define DECODEFUNCA
+#endif
+
+#ifdef MUL_MAT_ID
+layout (binding = 3) readonly buffer IDS {int data_ids[];};
+
+shared u16vec4 row_ids[3072];
+
+layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
+   B_TYPE b[];
+};
+
+uint _ne1;
+shared uint _ne1_sh;
+
+B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
+{
+    const uint row_i = blockCoords[0];
+
+    if (row_i >= _ne1) {
+        return B_TYPE(0.0);
+    }
+
+    const u16vec4 row_idx = row_ids[row_i];
+    B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];
+
+    return ret;
+}
+
+D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic)
+{
+    uint dr = ir * BM + r;
+    uint dc = ic * BN + c;
+
+    if (dr < p.M && dc < _ne1) {
+        uint row_i = dc;
+        const u16vec4 row_idx = row_ids[row_i];
+        data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem;
+    }
+    return elem;
+}
+
+#endif
+
+void main() {
+#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+#ifdef MUL_MAT_ID
+    const uint expert_idx = gl_GlobalInvocationID.z;
+#else
+    const uint batch_idx = gl_GlobalInvocationID.z;
+
+    const uint i13 = batch_idx / p.ne12;
+    const uint i12 = batch_idx % p.ne12;
+
+    const uint i03 = i13 / p.broadcast3;
+    const uint i02 = i12 / p.broadcast2;
+
+    const uint batch_idx_a = i03 * p.ne02 + i02;
+#endif
+
+    const uint blocks_m = (p.M + BM - 1) / BM;
+    const uint ir = gl_WorkGroupID.x % blocks_m;
+    const uint ik = gl_WorkGroupID.x / blocks_m;
+    const uint ic = gl_WorkGroupID.y;
+
+#ifdef MUL_MAT_ID
+    // Spread the search across all elements in the first subgroup
+    if (gl_SubgroupID == 0) {
+        _ne1 = 0;
+        uint num_elements = p.nei1 * p.nei0;
+
+        for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) {
+            bool in_range = i < num_elements;
+            uint ii0 = i % p.nei0;
+            uint ii1 = i / p.nei0;
+            uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
+            uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
+            uint idx = subgroupBallotExclusiveBitCount(ballot);
+            if (in_range && id == expert_idx) {
+                row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
+            }
+            _ne1 += subgroupBallotBitCount(ballot);
+        }
+        _ne1_sh = _ne1;
+    }
+
+    barrier();
+
+    _ne1 = _ne1_sh;
+
+    // Workgroup has no work
+    if (ic * BN >= _ne1) return;
+#endif
+
+#ifdef MUL_MAT_ID
+    uint start_k = 0;
+    const uint end_k = p.K;
+#else
+    uint start_k = ik * p.k_split;
+    const uint end_k = min(p.K, (ik + 1) * p.k_split);
+#endif
+
+    coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
+    sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
+
+#ifdef MUL_MAT_ID
+    uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K;
+    uint pos_b = 0;
+#else
+    uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K;
+    uint pos_b = batch_idx * p.batch_stride_b;
+#endif
+
+    uint stride_a = p.stride_a / QUANT_K;
+    uint stride_b = p.stride_b;
+
+    // Hint to the compiler that values are aligned (want 16B alignment).
+    // Quants are always block-aligned, no alignment needed.
+#if ALIGNED
+#if QUANT_K == 1
+    stride_a &= ~7;
+#endif
+    stride_b &= ~7;
+#endif
+
+    // Create layouts for both clamped and unclamped accesses
+    tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2);
+    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+    tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
+    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+
+#if QUANT_K > 1
+    tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
+    tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K);
+#endif
+
+    // Use end_k rather than p.K as the dimension because that's what
+    // we need to bound check against when using split_k
+    tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k);
+    tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k);
+    tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M);
+    tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
+    tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k);
+
+    tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
+
+#if !defined(MUL_MAT_ID)
+    // Detect a fast path where all loads are entirely in bounds and no clamping is required
+    if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
+#if QUANT_K == 1
+        (stride_a % 8) == 0 &&
+#endif
+        (stride_b % 8) == 0 && (start_k % 8) == 0) {
+        // Hint to the compiler that values are aligned (want 16B alignment)
+        start_k &= ~7;
+        stride_b &= ~7;
+#if QUANT_K == 1
+        stride_a &= ~7;
+#endif
+
+        tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
+        tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
+
+        uint k_iters = (end_k - start_k + BK - 1) / BK;
+
+        for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
+
+            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
+            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
+
+            coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
+            coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
+
+            sum = coopMatMulAdd(mat_a, mat_b, sum);
+        }
+    } else
+#endif // !defined(MUL_MAT_ID)
+    {
+        tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
+
+        tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1);
+
+        tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
+
+        tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1);
+
+        [[dont_unroll]]
+        for (uint block_k = start_k; block_k < end_k; block_k += BK) {
+
+            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
+            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
+
+            // Clamping is expensive, so detect different code paths for each combination
+            // of A and B needing clamping.
+            bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0;
+#ifdef MUL_MAT_ID
+            bool unclampedB = true;
+#else
+            bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0;
+#endif
+            if (unclampedA && unclampedB) {
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
+#ifdef MUL_MAT_ID
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
+#else
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
+#endif
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            } else if (unclampedA && !unclampedB) {
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
+
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            } else if (!unclampedA && unclampedB) {
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
+#ifdef MUL_MAT_ID
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
+#else
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
+#endif
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            } else if (!unclampedA && !unclampedB) {
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
+
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            }
+        }
+    }
+
+    // Convert from ACC_TYPE to D_TYPE
+    coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
+    mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
+
+#ifdef MUL_MAT_ID
+    // Call callback to store each element, remapping row through shared memory
+    coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
+#else
+    tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
+
+    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
+    coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
+#endif
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp
new file mode 100644
index 000000000..6627a50bd
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp
@@ -0,0 +1,44 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 512
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+shared vec2 sum[BLOCK_SIZE];
+
+void main() {
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint tid = gl_LocalInvocationID.x;
+
+    sum[tid] = vec2(0.0f, 0.0f);
+
+    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
+        const float xi = float(data_a[row*p.KX + col]);
+        sum[tid].x += xi;
+        sum[tid].y += xi * xi;
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        barrier();
+    }
+
+    const float mean = sum[0].x / p.KX;
+    const float var = sum[0].y / p.KX - mean * mean;
+    const float inv_std = inversesqrt(var + p.param1);
+
+    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
+        data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
new file mode 100644
index 000000000..450b67fc5
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
@@ -0,0 +1,28 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
+    const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
+    const uint i2_offset = i2*p.ne11*p.ne10;
+    const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
+    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
+
+    const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
+    const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
+
+    const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
+
+    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp
new file mode 100644
index 000000000..b6124411a
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp
@@ -0,0 +1,74 @@
+#version 450
+
+#include "types.comp"
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout(push_constant) uniform parameter {
+    uint IW; uint IH;
+    uint OW; uint OH;
+    uint OC;
+    uint pelements;
+    uint op;
+    int k0; int k1;
+    int s0; int s1;
+    int p0; int p1;
+} p;
+
+#define BLOCK_SIZE 512
+#define FLT_MAX 3.402823466e+38F
+#define OP_POOL_MAX 0u
+#define OP_POOL_AVG 1u
+
+layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.x;
+    if (idx >= p.pelements) {
+        return;
+    }
+
+    const uint O_HW = p.OW * p.OH;
+
+    const uint nc = idx / O_HW;
+    const uint cur_oh = (idx % O_HW) / p.OW;
+    const uint cur_ow = (idx % O_HW) % p.OW;
+
+    const int start_h = int(cur_oh) * p.s0 - p.p0;
+    const uint bh = max(start_h, 0);
+    const uint eh = min(start_h + p.k0, p.IH);
+
+    const int start_w = int(cur_ow) * p.s1 - p.p1;
+    const uint bw = max(start_w, 0);
+    const uint ew = min(start_w + p.k1, p.IW);
+
+    const float scale = 1.0 / float(p.k0 * p.k1);
+    float res;
+
+    if (p.op == OP_POOL_AVG) {
+        res = 0.0;
+    } else if (p.op == OP_POOL_MAX) {
+        res = -FLT_MAX;
+    } else {
+        return;
+    }
+
+    #pragma unroll
+    for (uint i = bh; i < eh; i++) {
+        #pragma unroll
+        for (uint j = bw; j < ew; j++) {
+            const float cur = D_TYPE(data_a[nc * p.IH * p.IW + i * p.IW + j]);
+
+            if (p.op == OP_POOL_AVG) {
+                res += cur * scale;
+            } else if (p.op == OP_POOL_MAX) {
+                res = max(res, cur);
+            }
+        }
+    }
+
+    data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp
new file mode 100644
index 000000000..52a19b62a
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp
@@ -0,0 +1,21 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    data_d[i] = max(float(data_a[i]), 0);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp b/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp
new file mode 100644
index 000000000..1568b141d
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp
@@ -0,0 +1,26 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+uint src0_idx_mod(uint idx) {
+    const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+    return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00;
+}
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
new file mode 100644
index 000000000..b554400ba
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
@@ -0,0 +1,42 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 512
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+shared FLOAT_TYPE sum[BLOCK_SIZE];
+
+void main() {
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint tid = gl_LocalInvocationID.x;
+
+    sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
+
+    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
+        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]);
+        sum[tid] += xi * xi;
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        barrier();
+    }
+
+    const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX);
+    const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
+
+    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
+        data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col]));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp
new file mode 100644
index 000000000..574b51ca5
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp
@@ -0,0 +1,49 @@
+#include "types.comp"
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_spirv_intrinsics: enable
+
+#if RTE16
+spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
+#endif
+
+layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer Y {int data_pos[];};
+layout (binding = 2) readonly buffer Z {float data_ff[];};
+layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
+
+layout (push_constant) uniform parameter {
+    uint ncols;
+    uint n_dims;
+    float freq_scale;
+    uint p_delta_rows;
+    float freq_base;
+    float ext_factor;
+    float attn_factor;
+    float corr_dims[2];
+    float theta_scale;
+    uint has_ff;
+} p;
+
+float rope_yarn_ramp(const float low, const float high, const uint i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) {
+    float mscale = p.attn_factor;
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = p.freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (p.ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
+    }
+    cos_theta = cos(theta) * mscale;
+    sin_theta = sin(theta) * mscale;
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
new file mode 100644
index 000000000..83b46b69b
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
@@ -0,0 +1,37 @@
+#version 450
+
+#include "rope_head.comp"
+
+void main() {
+    const uint col = gl_GlobalInvocationID.y * 2;
+    const uint row = gl_GlobalInvocationID.x;
+
+    if (col >= p.ncols) {
+        return;
+    }
+
+    if (col >= p.n_dims) {
+        const uint i = row*p.ncols + col;
+
+        data_d[i + 0] = data_a[i + 0];
+        data_d[i + 1] = data_a[i + 1];
+
+        return;
+    }
+
+    const uint i  = row*p.ncols + col/2;
+    const uint i2 = row/p.p_delta_rows;
+
+    const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
+
+    const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
+
+    const float x0 = float(data_a[i + 0]);
+    const float x1 = float(data_a[i + p.n_dims/2]);
+
+    data_d[i + 0]        = D_TYPE(x0*cos_theta - x1*sin_theta);
+    data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
new file mode 100644
index 000000000..e416ad938
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
@@ -0,0 +1,37 @@
+#version 450
+
+#include "rope_head.comp"
+
+void main() {
+    const uint col = gl_GlobalInvocationID.y * 2;
+    const uint row = gl_GlobalInvocationID.x;
+
+    if (col >= p.ncols) {
+        return;
+    }
+
+    if (col >= p.n_dims) {
+        const uint i = row*p.ncols + col;
+
+        data_d[i + 0] = data_a[i + 0];
+        data_d[i + 1] = data_a[i + 1];
+
+        return;
+    }
+
+    const uint i = row*p.ncols + col;
+    const uint i2 = row/p.p_delta_rows;
+
+    const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
+
+    const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
+
+    const float x0 = float(data_a[i + 0]);
+    const float x1 = float(data_a[i + 1]);
+
+    data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
+    data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp
new file mode 100644
index 000000000..4663428de
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp
@@ -0,0 +1,24 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+
+        data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
+        idx += num_threads;
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp
new file mode 100644
index 000000000..4d36f88e0
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp
@@ -0,0 +1,22 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float xi = float(data_a[i]);
+    data_d[i] = D_TYPE(xi / (1.0f + exp(-xi)));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp
new file mode 100644
index 000000000..d7c15a169
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp
@@ -0,0 +1,17 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
new file mode 100644
index 000000000..51fc2dc7e
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp
@@ -0,0 +1,173 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout (push_constant) uniform parameter
+{
+    uint KX;
+    uint KY;
+    float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint n_head_log2;
+    uint nrows_x;
+} p;
+
+#include "types.comp"
+
+layout(constant_id = 0) const uint BLOCK_SIZE = 32;
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
+layout (binding = 2) buffer D {D_TYPE data_d[];};
+
+shared FLOAT_TYPE vals[BLOCK_SIZE];
+
+// num_iters is the number of BLOCK_SIZE loop iterations we need to iterate
+// over all the columns. The main function tries to pass a constant here,
+// as if it were a template function, to allow unrolling.
+void soft_max(uint num_iters) {
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0;
+
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    float slope = 1.0f;
+
+    // ALiBi
+    if (p.max_bias > 0.0f) {
+        const uint h = rowx/p.KY; // head index
+
+        const float base = h < p.n_head_log2 ? p.m0 : p.m1;
+        const uint   exp  = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // Find max
+    FLOAT_TYPE max_val = uintBitsToFloat(0xFF800000);
+
+    // Cache values while we compute the max, so we don't need to read them
+    // again when we're ready to compute exp(x-max).
+    const uint DATA_CACHE_SIZE = 16;
+    FLOAT_TYPE data_cache[DATA_CACHE_SIZE];
+
+    [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        FLOAT_TYPE a = FLOAT_TYPE(0);
+        if (col < p.KX) {
+            a = data_a[rowx * p.KX + col];
+        }
+
+        FLOAT_TYPE b = FLOAT_TYPE(0);
+        if (p.KY > 0 && col < p.KX) {
+            b = data_b[rowy * p.KX + col];
+        }
+
+        FLOAT_TYPE v = a * p.scale + slope * b;
+
+        if (col < p.KX) {
+            max_val = max(max_val, v);
+        }
+
+        if (idx < DATA_CACHE_SIZE) {
+            data_cache[idx] = v;
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = max_val;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(vals[tid], vals[tid + s]);
+        }
+        barrier();
+    }
+
+    max_val = vals[0];
+    barrier();
+
+    FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
+
+    // Compute sum{exp(x - max)}
+    [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        if (col >= p.KX) {
+            break;
+        }
+
+        // compute exp(a*scale+b*slope), add it to sum, and cache the new value
+        // in data_cache if possible.
+        const uint i = rowx * p.KX + col;
+        FLOAT_TYPE val;
+        if (idx < DATA_CACHE_SIZE) {
+            val = exp(data_cache[idx] - max_val);
+        } else {
+            val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
+        }
+        sum += val;
+        if (idx < DATA_CACHE_SIZE) {
+            data_cache[idx] = val;
+        } else {
+            data_d[i] = D_TYPE(val);
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = sum;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] += vals[tid + s];
+        }
+        barrier();
+    }
+    sum = vals[0];
+
+    FLOAT_TYPE rcpdivisor = 1.0/sum;
+
+    [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        if (col >= p.KX) {
+            continue;
+        }
+
+        if (idx < DATA_CACHE_SIZE) {
+            data_d[rowx*p.KX + col] = D_TYPE(data_cache[idx] * rcpdivisor);
+        } else {
+            data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
+        }
+    }
+}
+
+void main() {
+    // instantiate the soft_max function for several different
+    // dimensions, to allow loop unrolling
+    uint num_blocks = (p.KX + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (num_blocks > 32) {
+        soft_max(num_blocks);
+    } else if (num_blocks > 16) {
+        soft_max(32);
+    } else if (num_blocks > 8) {
+        soft_max(16);
+    } else if (num_blocks > 4) {
+        soft_max(8);
+    } else if (num_blocks == 4) {
+        soft_max(4);
+    } else if (num_blocks == 3) {
+        soft_max(3);
+    } else if (num_blocks == 2) {
+        soft_max(2);
+    } else if (num_blocks == 1) {
+        soft_max(1);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/square.comp b/ggml/src/ggml-vulkan/vulkan-shaders/square.comp
new file mode 100644
index 000000000..ef43598ba
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/square.comp
@@ -0,0 +1,17 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
+    data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
new file mode 100644
index 000000000..961e5ffa1
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
@@ -0,0 +1,37 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+layout (constant_id = 0) const uint BLOCK_SIZE = 32;
+
+shared FLOAT_TYPE tmp[BLOCK_SIZE];
+
+void main() {
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint col = gl_LocalInvocationID.x;
+
+    tmp[col] = FLOAT_TYPE(0.0f);
+
+    for (uint i = col; i < p.KX; i += BLOCK_SIZE) {
+        tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]);
+    }
+
+    barrier();
+    [[unroll]] for (int s = int(BLOCK_SIZE) / 2; s > 0; s >>= 1) {
+        if (col < s) {
+            tmp[col] += tmp[col + s];
+        }
+        barrier();
+    }
+
+    if (col == 0) {
+        data_d[row] = D_TYPE(tmp[0]);
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp
new file mode 100644
index 000000000..495f966bd
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp
@@ -0,0 +1,20 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+    data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp
new file mode 100644
index 000000000..28eb24e11
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp
@@ -0,0 +1,7 @@
+#version 460
+
+#extension GL_NV_cooperative_matrix2 : require
+
+void main()
+{
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp
new file mode 100644
index 000000000..8c5dd1bd1
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp
@@ -0,0 +1,7 @@
+#version 460
+
+#extension GL_KHR_cooperative_matrix : require
+
+void main()
+{
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
new file mode 100644
index 000000000..79e065a93
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp
@@ -0,0 +1,41 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint nb1;
+    uint dim;
+    uint max_period;
+} p;
+
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 256
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.y;
+    const uint j = gl_GlobalInvocationID.x;
+    const uint d_offset = i * p.nb1;
+
+    if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
+        data_d[d_offset + p.dim] = 0.f;
+    }
+
+    const uint half_dim = p.dim / 2;
+    if (j >= half_dim) {
+        return;
+    }
+
+    const float timestep = float(data_a[i]);
+    const float freq = float(exp(-log(p.max_period) * j / half_dim));
+    const float arg = timestep * freq;
+    data_d[d_offset + j] = D_TYPE(cos(arg));
+    data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
new file mode 100644
index 000000000..db643a54c
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -0,0 +1,1086 @@
+
+#if !defined(GGML_TYPES_COMP)
+#define GGML_TYPES_COMP
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_16bit_storage : require
+
+#if defined(DATA_A_F32)
+#define QUANT_K 1
+#define QUANT_R 1
+
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
+#define A_TYPE float
+#elif LOAD_VEC_A == 4
+#define A_TYPE vec4
+#elif LOAD_VEC_A == 8
+#define A_TYPE mat2x4
+#endif
+#endif
+
+#if defined(DATA_A_F16)
+#define QUANT_K 1
+#define QUANT_R 1
+
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
+#define A_TYPE float16_t
+#elif LOAD_VEC_A == 4
+#define A_TYPE f16vec4
+#elif LOAD_VEC_A == 8
+#define A_TYPE f16mat2x4
+#endif
+#endif
+
+#define QUANT_K_Q4_0 32
+#define QUANT_R_Q4_0 2
+
+struct block_q4_0
+{
+    float16_t d;
+    uint8_t qs[16];
+};
+struct block_q4_0_packed16
+{
+    float16_t d;
+    uint16_t qs[16/2];
+};
+
+#if defined(DATA_A_Q4_0)
+#define QUANT_K QUANT_K_Q4_0
+#define QUANT_R QUANT_R_Q4_0
+#define A_TYPE block_q4_0
+#define A_TYPE_PACKED16 block_q4_0_packed16
+#endif
+
+#define QUANT_K_Q4_1 32
+#define QUANT_R_Q4_1 2
+
+struct block_q4_1
+{
+    float16_t d;
+    float16_t m;
+    uint8_t qs[16];
+};
+
+struct block_q4_1_packed16
+{
+    float16_t d;
+    float16_t m;
+    uint16_t qs[16/2];
+};
+
+#if defined(DATA_A_Q4_1)
+#define QUANT_K QUANT_K_Q4_1
+#define QUANT_R QUANT_R_Q4_1
+#define A_TYPE block_q4_1
+#define A_TYPE_PACKED16 block_q4_1_packed16
+#endif
+
+#define QUANT_K_Q5_0 32
+#define QUANT_R_Q5_0 2
+
+struct block_q5_0
+{
+    float16_t d;
+    uint16_t qh[2];
+    uint8_t qs[16];
+};
+
+struct block_q5_0_packed16
+{
+    float16_t d;
+    uint16_t qh[2];
+    uint16_t qs[16/2];
+};
+
+#if defined(DATA_A_Q5_0)
+#define QUANT_K QUANT_K_Q5_0
+#define QUANT_R QUANT_R_Q5_0
+#define A_TYPE block_q5_0
+#define A_TYPE_PACKED16 block_q5_0_packed16
+#endif
+
+#define QUANT_K_Q5_1 32
+#define QUANT_R_Q5_1 2
+
+struct block_q5_1
+{
+    float16_t d;
+    float16_t m;
+    uint qh;
+    uint8_t qs[16];
+};
+
+struct block_q5_1_packed16
+{
+    float16_t d;
+    float16_t m;
+    uint qh;
+    uint16_t qs[16/2];
+};
+
+#if defined(DATA_A_Q5_1)
+#define QUANT_K QUANT_K_Q5_1
+#define QUANT_R QUANT_R_Q5_1
+#define A_TYPE block_q5_1
+#define A_TYPE_PACKED16 block_q5_1_packed16
+#endif
+
+#define QUANT_K_Q8_0 32
+#define QUANT_R_Q8_0 1
+
+struct block_q8_0
+{
+    float16_t d;
+    int8_t qs[32];
+};
+struct block_q8_0_packed16
+{
+    float16_t d;
+    uint16_t qs[32/2];
+};
+
+#if defined(DATA_A_Q8_0)
+#define QUANT_K QUANT_K_Q8_0
+#define QUANT_R QUANT_R_Q8_0
+#define A_TYPE block_q8_0
+#define A_TYPE_PACKED16 block_q8_0_packed16
+#endif
+
+// K-quants
+#define QUANT_K_Q2_K 256
+
+struct block_q2_K
+{
+    uint8_t scales[QUANT_K_Q2_K/16];
+    uint8_t qs[QUANT_K_Q2_K/4];
+    f16vec2 d;
+};
+
+struct block_q2_K_packed16
+{
+    uint16_t scales[QUANT_K_Q2_K/16/2];
+    uint16_t qs[QUANT_K_Q2_K/4/2];
+    f16vec2 d;
+};
+
+struct block_q2_K_packed32
+{
+    uint32_t scales[QUANT_K_Q2_K/16/4];
+    uint32_t qs[QUANT_K_Q2_K/4/4];
+    f16vec2 d;
+};
+
+#if defined(DATA_A_Q2_K)
+#define QUANT_K QUANT_K_Q2_K
+#define A_TYPE block_q2_K
+#define A_TYPE_PACKED16 block_q2_K_packed16
+#define A_TYPE_PACKED32 block_q2_K_packed32
+#endif
+
+#define QUANT_K_Q3_K 256
+
+struct block_q3_K
+{
+    uint8_t hmask[QUANT_K_Q3_K/8];
+    uint8_t qs[QUANT_K_Q3_K/4];
+    uint8_t scales[12];
+    float16_t d;
+};
+
+struct block_q3_K_packed16
+{
+    uint16_t hmask[QUANT_K_Q3_K/8/2];
+    uint16_t qs[QUANT_K_Q3_K/4/2];
+    uint16_t scales[12/2];
+    float16_t d;
+};
+
+#if defined(DATA_A_Q3_K)
+#define QUANT_K QUANT_K_Q3_K
+#define A_TYPE block_q3_K
+#define A_TYPE_PACKED16 block_q3_K_packed16
+#endif
+
+#define QUANT_K_Q4_K 256
+
+struct block_q4_K
+{
+    f16vec2 d;
+    uint8_t scales[3*QUANT_K_Q4_K/64];
+    uint8_t qs[QUANT_K_Q4_K/2];
+};
+
+struct block_q4_K_packed16
+{
+    f16vec2 d;
+    uint16_t scales[3*QUANT_K_Q4_K/64/2];
+    uint16_t qs[QUANT_K_Q4_K/2/2];
+};
+
+struct block_q4_K_packed32
+{
+    f16vec2 d;
+    uint32_t scales[3*QUANT_K_Q4_K/64/4];
+    uint32_t qs[QUANT_K_Q4_K/2/4];
+};
+
+struct block_q4_K_packed128
+{
+    uvec4 q4k[9];
+};
+
+#if defined(DATA_A_Q4_K)
+#define QUANT_K QUANT_K_Q4_K
+#define A_TYPE block_q4_K
+#define A_TYPE_PACKED16 block_q4_K_packed16
+#define A_TYPE_PACKED32 block_q4_K_packed32
+#endif
+
+#define QUANT_K_Q5_K 256
+
+struct block_q5_K
+{
+    f16vec2 d;
+    uint8_t scales[12];
+    uint8_t qh[QUANT_K_Q5_K/8];
+    uint8_t qs[QUANT_K_Q5_K/2];
+};
+
+struct block_q5_K_packed16
+{
+    f16vec2 d;
+    uint16_t scales[12/2];
+    uint16_t qh[QUANT_K_Q5_K/8/2];
+    uint16_t qs[QUANT_K_Q5_K/2/2];
+};
+
+struct block_q5_K_packed128
+{
+    uvec4 q5k[11];
+};
+
+#if defined(DATA_A_Q5_K)
+#define QUANT_K QUANT_K_Q5_K
+#define A_TYPE block_q5_K
+#define A_TYPE_PACKED16 block_q5_K_packed16
+#endif
+
+#define QUANT_K_Q6_K 256
+
+struct block_q6_K
+{
+    uint8_t ql[QUANT_K_Q6_K/2];
+    uint8_t qh[QUANT_K_Q6_K/4];
+    int8_t scales[QUANT_K_Q6_K/16];
+    float16_t d;
+};
+
+struct block_q6_K_packed16
+{
+    uint16_t ql[QUANT_K_Q6_K/2/2];
+    uint16_t qh[QUANT_K_Q6_K/4/2];
+    int8_t scales[QUANT_K_Q6_K/16];
+    float16_t d;
+};
+
+#if defined(DATA_A_Q6_K)
+#define QUANT_K QUANT_K_Q6_K
+#define A_TYPE block_q6_K
+#define A_TYPE_PACKED16 block_q6_K_packed16
+#endif
+
+// IQuants
+
+#define QUANT_K_IQ2_XXS 256
+#define QUANT_R_IQ2_XXS 1
+
+struct block_iq2_xxs
+{
+    float16_t d;
+    uint8_t qs[QUANT_K_IQ2_XXS/4];
+};
+
+struct block_iq2_xxs_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ2_XXS/8];
+};
+
+#if defined(DATA_A_IQ2_XXS)
+
+const uvec2[256] iq2xxs_grid_const = {
+    uvec2(0x08080808, 0x08080808), uvec2(0x0808082b, 0x08080808), uvec2(0x08081919, 0x08080808), uvec2(0x08082b08, 0x08080808),
+    uvec2(0x08082b2b, 0x08080808), uvec2(0x08190819, 0x08080808), uvec2(0x08191908, 0x08080808), uvec2(0x082b0808, 0x08080808),
+    uvec2(0x082b082b, 0x08080808), uvec2(0x082b2b08, 0x08080808), uvec2(0x082b2b2b, 0x08080808), uvec2(0x19080819, 0x08080808),
+    uvec2(0x19081908, 0x08080808), uvec2(0x19190808, 0x08080808), uvec2(0x19192b08, 0x08080808), uvec2(0x192b0819, 0x08080808),
+    uvec2(0x192b1908, 0x08080808), uvec2(0x2b080808, 0x08080808), uvec2(0x2b08082b, 0x08080808), uvec2(0x2b082b2b, 0x08080808),
+    uvec2(0x2b2b082b, 0x08080808), uvec2(0x08080819, 0x08080819), uvec2(0x08081908, 0x08080819), uvec2(0x08190808, 0x08080819),
+    uvec2(0x08191919, 0x08080819), uvec2(0x19080808, 0x08080819), uvec2(0x2b081908, 0x08080819), uvec2(0x2b192b08, 0x08080819),
+    uvec2(0x08080808, 0x0808082b), uvec2(0x0808082b, 0x0808082b), uvec2(0x082b082b, 0x0808082b), uvec2(0x2b08082b, 0x0808082b),
+    uvec2(0x08080819, 0x08081908), uvec2(0x08081908, 0x08081908), uvec2(0x08190808, 0x08081908), uvec2(0x082b0819, 0x08081908),
+    uvec2(0x082b1908, 0x08081908), uvec2(0x19080808, 0x08081908), uvec2(0x1908082b, 0x08081908), uvec2(0x19082b08, 0x08081908),
+    uvec2(0x192b0808, 0x08081908), uvec2(0x2b080819, 0x08081908), uvec2(0x2b081908, 0x08081908), uvec2(0x2b190808, 0x08081908),
+    uvec2(0x2b2b1908, 0x08081908), uvec2(0x08080808, 0x08081919), uvec2(0x0808082b, 0x08081919), uvec2(0x08082b08, 0x08081919),
+    uvec2(0x082b0808, 0x08081919), uvec2(0x1908192b, 0x08081919), uvec2(0x192b2b19, 0x08081919), uvec2(0x2b080808, 0x08081919),
+    uvec2(0x2b190819, 0x08081919), uvec2(0x08082b19, 0x0808192b), uvec2(0x08190808, 0x0808192b), uvec2(0x19080808, 0x0808192b),
+    uvec2(0x2b081908, 0x0808192b), uvec2(0x2b2b1908, 0x0808192b), uvec2(0x08080808, 0x08082b08), uvec2(0x08081919, 0x08082b08),
+    uvec2(0x08082b08, 0x08082b08), uvec2(0x08191908, 0x08082b08), uvec2(0x082b2b08, 0x08082b08), uvec2(0x19080819, 0x08082b08),
+    uvec2(0x19081908, 0x08082b08), uvec2(0x19190808, 0x08082b08), uvec2(0x1919082b, 0x08082b08), uvec2(0x2b082b08, 0x08082b08),
+    uvec2(0x08081908, 0x08082b19), uvec2(0x19080808, 0x08082b19), uvec2(0x0808082b, 0x08082b2b), uvec2(0x08191908, 0x08082b2b),
+    uvec2(0x08080819, 0x08190808), uvec2(0x08081908, 0x08190808), uvec2(0x08190808, 0x08190808), uvec2(0x082b0819, 0x08190808),
+    uvec2(0x19080808, 0x08190808), uvec2(0x192b0808, 0x08190808), uvec2(0x2b081908, 0x08190808), uvec2(0x2b190808, 0x08190808),
+    uvec2(0x2b191919, 0x08190808), uvec2(0x08080808, 0x08190819), uvec2(0x08082b08, 0x08190819), uvec2(0x082b0808, 0x08190819),
+    uvec2(0x19190808, 0x08190819), uvec2(0x19192b2b, 0x08190819), uvec2(0x2b080808, 0x08190819), uvec2(0x082b1908, 0x0819082b),
+    uvec2(0x19081919, 0x0819082b), uvec2(0x08080808, 0x08191908), uvec2(0x08082b08, 0x08191908), uvec2(0x082b0808, 0x08191908),
+    uvec2(0x082b1919, 0x08191908), uvec2(0x19082b19, 0x08191908), uvec2(0x2b080808, 0x08191908), uvec2(0x08192b08, 0x08191919),
+    uvec2(0x192b082b, 0x08191919), uvec2(0x08080808, 0x0819192b), uvec2(0x0819192b, 0x0819192b), uvec2(0x08080819, 0x08192b08),
+    uvec2(0x08081908, 0x08192b08), uvec2(0x08190808, 0x08192b08), uvec2(0x19080808, 0x08192b08), uvec2(0x2b080819, 0x08192b08),
+    uvec2(0x08080808, 0x08192b19), uvec2(0x08081919, 0x08192b19), uvec2(0x2b2b0808, 0x08192b19), uvec2(0x19190819, 0x08192b2b),
+    uvec2(0x08080808, 0x082b0808), uvec2(0x0808082b, 0x082b0808), uvec2(0x08082b2b, 0x082b0808), uvec2(0x19081908, 0x082b0808),
+    uvec2(0x192b0819, 0x082b0808), uvec2(0x2b080808, 0x082b0808), uvec2(0x2b08082b, 0x082b0808), uvec2(0x082b2b19, 0x082b0819),
+    uvec2(0x19082b08, 0x082b0819), uvec2(0x08080808, 0x082b082b), uvec2(0x0808082b, 0x082b082b), uvec2(0x08080819, 0x082b1908),
+    uvec2(0x08081908, 0x082b1908), uvec2(0x08190808, 0x082b1908), uvec2(0x19080808, 0x082b1908), uvec2(0x1919192b, 0x082b1908),
+    uvec2(0x08080808, 0x082b1919), uvec2(0x19080819, 0x082b1919), uvec2(0x192b1908, 0x082b1919), uvec2(0x2b190808, 0x082b192b),
+    uvec2(0x08082b08, 0x082b2b08), uvec2(0x082b0808, 0x082b2b08), uvec2(0x2b191908, 0x082b2b08), uvec2(0x19081908, 0x082b2b2b),
+    uvec2(0x08080819, 0x19080808), uvec2(0x08081908, 0x19080808), uvec2(0x08190808, 0x19080808), uvec2(0x08192b08, 0x19080808),
+    uvec2(0x082b0819, 0x19080808), uvec2(0x082b1908, 0x19080808), uvec2(0x19080808, 0x19080808), uvec2(0x19082b08, 0x19080808),
+    uvec2(0x1919192b, 0x19080808), uvec2(0x192b0808, 0x19080808), uvec2(0x2b080819, 0x19080808), uvec2(0x2b081908, 0x19080808),
+    uvec2(0x2b190808, 0x19080808), uvec2(0x08080808, 0x19080819), uvec2(0x082b0808, 0x19080819), uvec2(0x192b0819, 0x19080819),
+    uvec2(0x2b080808, 0x19080819), uvec2(0x2b081919, 0x19080819), uvec2(0x08080819, 0x1908082b), uvec2(0x08190808, 0x1908082b),
+    uvec2(0x19082b08, 0x1908082b), uvec2(0x1919192b, 0x1908082b), uvec2(0x192b2b08, 0x1908082b), uvec2(0x08080808, 0x19081908),
+    uvec2(0x08082b08, 0x19081908), uvec2(0x082b0808, 0x19081908), uvec2(0x2b080808, 0x19081908), uvec2(0x2b192b19, 0x19081908),
+    uvec2(0x0819082b, 0x19081919), uvec2(0x082b1908, 0x19081919), uvec2(0x08080808, 0x1908192b), uvec2(0x08080819, 0x19082b08),
+    uvec2(0x08081908, 0x19082b08), uvec2(0x08190808, 0x19082b08), uvec2(0x19080808, 0x19082b08), uvec2(0x19081919, 0x19082b08),
+    uvec2(0x08080808, 0x19082b19), uvec2(0x19192b08, 0x19082b19), uvec2(0x192b0819, 0x19082b19), uvec2(0x2b08082b, 0x19082b19),
+    uvec2(0x19081919, 0x19082b2b), uvec2(0x2b190808, 0x19082b2b), uvec2(0x08080808, 0x19190808), uvec2(0x08082b08, 0x19190808),
+    uvec2(0x08190819, 0x19190808), uvec2(0x08192b19, 0x19190808), uvec2(0x082b0808, 0x19190808), uvec2(0x2b080808, 0x19190808),
+    uvec2(0x2b082b08, 0x19190808), uvec2(0x08081908, 0x19190819), uvec2(0x1908082b, 0x19190819), uvec2(0x2b2b1908, 0x19190819),
+    uvec2(0x2b190819, 0x1919082b), uvec2(0x2b190808, 0x19191908), uvec2(0x2b19082b, 0x19191908), uvec2(0x08082b2b, 0x19191919),
+    uvec2(0x08080819, 0x1919192b), uvec2(0x19191908, 0x1919192b), uvec2(0x08080808, 0x19192b08), uvec2(0x08190819, 0x19192b08),
+    uvec2(0x08192b19, 0x19192b08), uvec2(0x192b1908, 0x19192b08), uvec2(0x19080808, 0x19192b19), uvec2(0x08082b08, 0x19192b2b),
+    uvec2(0x08081908, 0x192b0808), uvec2(0x08190808, 0x192b0808), uvec2(0x19080808, 0x192b0808), uvec2(0x192b2b08, 0x192b0808),
+    uvec2(0x08080808, 0x192b0819), uvec2(0x19191919, 0x192b0819), uvec2(0x08192b08, 0x192b082b), uvec2(0x192b0808, 0x192b082b),
+    uvec2(0x08080808, 0x192b1908), uvec2(0x08081919, 0x192b1908), uvec2(0x08190808, 0x192b1919), uvec2(0x0819082b, 0x192b1919),
+    uvec2(0x2b081908, 0x192b1919), uvec2(0x1908082b, 0x192b2b08), uvec2(0x08080808, 0x2b080808), uvec2(0x0808082b, 0x2b080808),
+    uvec2(0x08082b2b, 0x2b080808), uvec2(0x19080819, 0x2b080808), uvec2(0x2b08082b, 0x2b080808), uvec2(0x08081908, 0x2b080819),
+    uvec2(0x08192b08, 0x2b080819), uvec2(0x19080808, 0x2b080819), uvec2(0x08190819, 0x2b08082b), uvec2(0x08080819, 0x2b081908),
+    uvec2(0x08081908, 0x2b081908), uvec2(0x08190808, 0x2b081908), uvec2(0x08191919, 0x2b081908), uvec2(0x19080808, 0x2b081908),
+    uvec2(0x192b0808, 0x2b081908), uvec2(0x08080808, 0x2b081919), uvec2(0x1908192b, 0x2b081919), uvec2(0x2b191908, 0x2b081919),
+    uvec2(0x08082b19, 0x2b08192b), uvec2(0x19080808, 0x2b08192b), uvec2(0x192b0808, 0x2b08192b), uvec2(0x0808082b, 0x2b082b08),
+    uvec2(0x08081908, 0x2b082b19), uvec2(0x08190819, 0x2b082b2b), uvec2(0x08081908, 0x2b190808), uvec2(0x08190808, 0x2b190808),
+    uvec2(0x082b1908, 0x2b190808), uvec2(0x19080808, 0x2b190808), uvec2(0x2b2b0819, 0x2b190808), uvec2(0x0819192b, 0x2b190819),
+    uvec2(0x2b080808, 0x2b190819), uvec2(0x19081919, 0x2b19082b), uvec2(0x08080808, 0x2b191908), uvec2(0x082b082b, 0x2b191908),
+    uvec2(0x19081908, 0x2b191908), uvec2(0x19190819, 0x2b191919), uvec2(0x2b080819, 0x2b192b08), uvec2(0x082b0808, 0x2b192b19),
+    uvec2(0x0808082b, 0x2b2b0808), uvec2(0x19190808, 0x2b2b0808), uvec2(0x2b081919, 0x2b2b0808), uvec2(0x08082b19, 0x2b2b0819),
+    uvec2(0x08080808, 0x2b2b082b), uvec2(0x08192b08, 0x2b2b1908), uvec2(0x19190808, 0x2b2b2b08), uvec2(0x08081908, 0x2b2b2b19)
+};
+
+shared uvec2 iq2xxs_grid[256];
+
+void init_iq_shmem(uvec3 wgsize)
+{
+    // copy the table into shared memory and sync
+    for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += wgsize.x) {
+        iq2xxs_grid[i] = iq2xxs_grid_const[i];
+    }
+    barrier();
+}
+
+#define QUANT_K QUANT_K_IQ2_XXS
+#define QUANT_R QUANT_R_IQ2_XXS
+#define A_TYPE block_iq2_xxs
+#define A_TYPE_PACKED16 block_iq2_xxs_packed16
+#endif
+
+#define QUANT_K_IQ2_XS 256
+#define QUANT_R_IQ2_XS 1
+
+struct block_iq2_xs
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ2_XS/8];
+    uint8_t scales[QUANT_K_IQ2_XS/32];
+};
+
+struct block_iq2_xs_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ2_XS/8];
+    uint16_t scales[QUANT_K_IQ2_XS/64];
+};
+
+#if defined(DATA_A_IQ2_XS)
+
+const uvec2 iq2xs_grid_const[512] = {
+    uvec2(0x08080808, 0x08080808), uvec2(0x0808082b, 0x08080808), uvec2(0x08081919, 0x08080808), uvec2(0x08082b08, 0x08080808),
+    uvec2(0x08082b2b, 0x08080808), uvec2(0x08190819, 0x08080808), uvec2(0x08191908, 0x08080808), uvec2(0x0819192b, 0x08080808),
+    uvec2(0x08192b19, 0x08080808), uvec2(0x082b0808, 0x08080808), uvec2(0x082b082b, 0x08080808), uvec2(0x082b1919, 0x08080808),
+    uvec2(0x082b2b08, 0x08080808), uvec2(0x19080819, 0x08080808), uvec2(0x19081908, 0x08080808), uvec2(0x1908192b, 0x08080808),
+    uvec2(0x19082b19, 0x08080808), uvec2(0x19190808, 0x08080808), uvec2(0x1919082b, 0x08080808), uvec2(0x19191919, 0x08080808),
+    uvec2(0x19192b08, 0x08080808), uvec2(0x192b0819, 0x08080808), uvec2(0x192b1908, 0x08080808), uvec2(0x2b080808, 0x08080808),
+    uvec2(0x2b08082b, 0x08080808), uvec2(0x2b081919, 0x08080808), uvec2(0x2b082b08, 0x08080808), uvec2(0x2b190819, 0x08080808),
+    uvec2(0x2b191908, 0x08080808), uvec2(0x2b192b19, 0x08080808), uvec2(0x2b2b0808, 0x08080808), uvec2(0x08080819, 0x08080819),
+    uvec2(0x08081908, 0x08080819), uvec2(0x0808192b, 0x08080819), uvec2(0x08082b19, 0x08080819), uvec2(0x08190808, 0x08080819),
+    uvec2(0x0819082b, 0x08080819), uvec2(0x08191919, 0x08080819), uvec2(0x08192b08, 0x08080819), uvec2(0x08192b2b, 0x08080819),
+    uvec2(0x082b0819, 0x08080819), uvec2(0x082b1908, 0x08080819), uvec2(0x19080808, 0x08080819), uvec2(0x1908082b, 0x08080819),
+    uvec2(0x19081919, 0x08080819), uvec2(0x19082b08, 0x08080819), uvec2(0x19190819, 0x08080819), uvec2(0x19191908, 0x08080819),
+    uvec2(0x192b0808, 0x08080819), uvec2(0x192b2b08, 0x08080819), uvec2(0x2b080819, 0x08080819), uvec2(0x2b081908, 0x08080819),
+    uvec2(0x2b190808, 0x08080819), uvec2(0x08080808, 0x0808082b), uvec2(0x0808082b, 0x0808082b), uvec2(0x08081919, 0x0808082b),
+    uvec2(0x08082b08, 0x0808082b), uvec2(0x08190819, 0x0808082b), uvec2(0x08191908, 0x0808082b), uvec2(0x082b0808, 0x0808082b),
+    uvec2(0x19080819, 0x0808082b), uvec2(0x19081908, 0x0808082b), uvec2(0x19190808, 0x0808082b), uvec2(0x19191919, 0x0808082b),
+    uvec2(0x2b080808, 0x0808082b), uvec2(0x2b082b2b, 0x0808082b), uvec2(0x08080819, 0x08081908), uvec2(0x08081908, 0x08081908),
+    uvec2(0x0808192b, 0x08081908), uvec2(0x08082b19, 0x08081908), uvec2(0x08190808, 0x08081908), uvec2(0x0819082b, 0x08081908),
+    uvec2(0x08191919, 0x08081908), uvec2(0x08192b08, 0x08081908), uvec2(0x082b0819, 0x08081908), uvec2(0x082b1908, 0x08081908),
+    uvec2(0x19080808, 0x08081908), uvec2(0x1908082b, 0x08081908), uvec2(0x19081919, 0x08081908), uvec2(0x19082b08, 0x08081908),
+    uvec2(0x19190819, 0x08081908), uvec2(0x19191908, 0x08081908), uvec2(0x1919192b, 0x08081908), uvec2(0x192b0808, 0x08081908),
+    uvec2(0x2b080819, 0x08081908), uvec2(0x2b081908, 0x08081908), uvec2(0x2b190808, 0x08081908), uvec2(0x08080808, 0x08081919),
+    uvec2(0x0808082b, 0x08081919), uvec2(0x08081919, 0x08081919), uvec2(0x08082b08, 0x08081919), uvec2(0x08190819, 0x08081919),
+    uvec2(0x08191908, 0x08081919), uvec2(0x082b0808, 0x08081919), uvec2(0x19080819, 0x08081919), uvec2(0x19081908, 0x08081919),
+    uvec2(0x19190808, 0x08081919), uvec2(0x192b0819, 0x08081919), uvec2(0x2b080808, 0x08081919), uvec2(0x08080819, 0x0808192b),
+    uvec2(0x08081908, 0x0808192b), uvec2(0x08190808, 0x0808192b), uvec2(0x082b192b, 0x0808192b), uvec2(0x19080808, 0x0808192b),
+    uvec2(0x1908082b, 0x0808192b), uvec2(0x2b081908, 0x0808192b), uvec2(0x08080808, 0x08082b08), uvec2(0x0808082b, 0x08082b08),
+    uvec2(0x08081919, 0x08082b08), uvec2(0x08082b08, 0x08082b08), uvec2(0x08082b2b, 0x08082b08), uvec2(0x08190819, 0x08082b08),
+    uvec2(0x08191908, 0x08082b08), uvec2(0x082b0808, 0x08082b08), uvec2(0x082b1919, 0x08082b08), uvec2(0x19080819, 0x08082b08),
+    uvec2(0x19081908, 0x08082b08), uvec2(0x19190808, 0x08082b08), uvec2(0x19192b08, 0x08082b08), uvec2(0x2b080808, 0x08082b08),
+    uvec2(0x2b2b0808, 0x08082b08), uvec2(0x2b2b2b2b, 0x08082b08), uvec2(0x08080819, 0x08082b19), uvec2(0x08081908, 0x08082b19),
+    uvec2(0x08190808, 0x08082b19), uvec2(0x19080808, 0x08082b19), uvec2(0x2b080819, 0x08082b19), uvec2(0x2b082b19, 0x08082b19),
+    uvec2(0x08080808, 0x08082b2b), uvec2(0x082b0808, 0x08082b2b), uvec2(0x082b2b08, 0x08082b2b), uvec2(0x2b19192b, 0x08082b2b),
+    uvec2(0x2b2b0808, 0x08082b2b), uvec2(0x08080819, 0x08190808), uvec2(0x08081908, 0x08190808), uvec2(0x0808192b, 0x08190808),
+    uvec2(0x08082b19, 0x08190808), uvec2(0x08190808, 0x08190808), uvec2(0x0819082b, 0x08190808), uvec2(0x08191919, 0x08190808),
+    uvec2(0x08192b08, 0x08190808), uvec2(0x082b0819, 0x08190808), uvec2(0x082b1908, 0x08190808), uvec2(0x19080808, 0x08190808),
+    uvec2(0x1908082b, 0x08190808), uvec2(0x19081919, 0x08190808), uvec2(0x19082b08, 0x08190808), uvec2(0x19190819, 0x08190808),
+    uvec2(0x19191908, 0x08190808), uvec2(0x192b0808, 0x08190808), uvec2(0x192b2b2b, 0x08190808), uvec2(0x2b080819, 0x08190808),
+    uvec2(0x2b081908, 0x08190808), uvec2(0x2b190808, 0x08190808), uvec2(0x08080808, 0x08190819), uvec2(0x0808082b, 0x08190819),
+    uvec2(0x08081919, 0x08190819), uvec2(0x08082b08, 0x08190819), uvec2(0x08190819, 0x08190819), uvec2(0x08191908, 0x08190819),
+    uvec2(0x082b0808, 0x08190819), uvec2(0x19080819, 0x08190819), uvec2(0x19081908, 0x08190819), uvec2(0x19190808, 0x08190819),
+    uvec2(0x2b080808, 0x08190819), uvec2(0x2b191908, 0x08190819), uvec2(0x2b19192b, 0x08190819), uvec2(0x08080819, 0x0819082b),
+    uvec2(0x08081908, 0x0819082b), uvec2(0x0808192b, 0x0819082b), uvec2(0x08190808, 0x0819082b), uvec2(0x19080808, 0x0819082b),
+    uvec2(0x192b0808, 0x0819082b), uvec2(0x08080808, 0x08191908), uvec2(0x0808082b, 0x08191908), uvec2(0x08081919, 0x08191908),
+    uvec2(0x08082b08, 0x08191908), uvec2(0x08190819, 0x08191908), uvec2(0x08191908, 0x08191908), uvec2(0x082b0808, 0x08191908),
+    uvec2(0x19080819, 0x08191908), uvec2(0x19081908, 0x08191908), uvec2(0x19082b19, 0x08191908), uvec2(0x19190808, 0x08191908),
+    uvec2(0x192b1908, 0x08191908), uvec2(0x2b080808, 0x08191908), uvec2(0x08080819, 0x08191919), uvec2(0x08081908, 0x08191919),
+    uvec2(0x08190808, 0x08191919), uvec2(0x19080808, 0x08191919), uvec2(0x08080808, 0x0819192b), uvec2(0x08191908, 0x0819192b),
+    uvec2(0x19082b19, 0x0819192b), uvec2(0x08080819, 0x08192b08), uvec2(0x08081908, 0x08192b08), uvec2(0x08190808, 0x08192b08),
+    uvec2(0x0819082b, 0x08192b08), uvec2(0x19080808, 0x08192b08), uvec2(0x19191908, 0x08192b08), uvec2(0x2b08192b, 0x08192b08),
+    uvec2(0x08080808, 0x08192b19), uvec2(0x08081919, 0x08192b19), uvec2(0x192b192b, 0x08192b19), uvec2(0x19190819, 0x08192b2b),
+    uvec2(0x2b2b2b19, 0x08192b2b), uvec2(0x08080808, 0x082b0808), uvec2(0x0808082b, 0x082b0808), uvec2(0x08081919, 0x082b0808),
+    uvec2(0x08082b08, 0x082b0808), uvec2(0x08082b2b, 0x082b0808), uvec2(0x08190819, 0x082b0808), uvec2(0x08191908, 0x082b0808),
+    uvec2(0x082b0808, 0x082b0808), uvec2(0x19080819, 0x082b0808), uvec2(0x19081908, 0x082b0808), uvec2(0x19190808, 0x082b0808),
+    uvec2(0x2b080808, 0x082b0808), uvec2(0x2b2b0808, 0x082b0808), uvec2(0x08080819, 0x082b0819), uvec2(0x08081908, 0x082b0819),
+    uvec2(0x08190808, 0x082b0819), uvec2(0x19080808, 0x082b0819), uvec2(0x19082b08, 0x082b0819), uvec2(0x192b1919, 0x082b0819),
+    uvec2(0x08080808, 0x082b082b), uvec2(0x082b082b, 0x082b082b), uvec2(0x2b080808, 0x082b082b), uvec2(0x2b2b2b08, 0x082b082b),
+    uvec2(0x08080819, 0x082b1908), uvec2(0x08081908, 0x082b1908), uvec2(0x08190808, 0x082b1908), uvec2(0x082b2b19, 0x082b1908),
+    uvec2(0x19080808, 0x082b1908), uvec2(0x08080808, 0x082b1919), uvec2(0x19080819, 0x082b1919), uvec2(0x1919082b, 0x082b1919),
+    uvec2(0x2b192b19, 0x082b1919), uvec2(0x08080819, 0x082b192b), uvec2(0x08192b2b, 0x082b192b), uvec2(0x2b2b192b, 0x082b192b),
+    uvec2(0x08080808, 0x082b2b08), uvec2(0x08082b08, 0x082b2b08), uvec2(0x08082b2b, 0x082b2b08), uvec2(0x082b0808, 0x082b2b08),
+    uvec2(0x19191919, 0x082b2b08), uvec2(0x2b082b08, 0x082b2b08), uvec2(0x2b2b082b, 0x082b2b08), uvec2(0x192b2b08, 0x082b2b19),
+    uvec2(0x2b190808, 0x082b2b19), uvec2(0x08082b08, 0x082b2b2b), uvec2(0x082b0808, 0x082b2b2b), uvec2(0x2b08082b, 0x082b2b2b),
+    uvec2(0x2b082b08, 0x082b2b2b), uvec2(0x2b082b2b, 0x082b2b2b), uvec2(0x08080819, 0x19080808), uvec2(0x08081908, 0x19080808),
+    uvec2(0x0808192b, 0x19080808), uvec2(0x08082b19, 0x19080808), uvec2(0x08190808, 0x19080808), uvec2(0x0819082b, 0x19080808),
+    uvec2(0x08191919, 0x19080808), uvec2(0x08192b08, 0x19080808), uvec2(0x082b0819, 0x19080808), uvec2(0x082b1908, 0x19080808),
+    uvec2(0x19080808, 0x19080808), uvec2(0x1908082b, 0x19080808), uvec2(0x19081919, 0x19080808), uvec2(0x19082b08, 0x19080808),
+    uvec2(0x19082b2b, 0x19080808), uvec2(0x19190819, 0x19080808), uvec2(0x19191908, 0x19080808), uvec2(0x192b0808, 0x19080808),
+    uvec2(0x192b1919, 0x19080808), uvec2(0x2b080819, 0x19080808), uvec2(0x2b081908, 0x19080808), uvec2(0x2b190808, 0x19080808),
+    uvec2(0x08080808, 0x19080819), uvec2(0x0808082b, 0x19080819), uvec2(0x08081919, 0x19080819), uvec2(0x08082b08, 0x19080819),
+    uvec2(0x08190819, 0x19080819), uvec2(0x08191908, 0x19080819), uvec2(0x082b0808, 0x19080819), uvec2(0x19080819, 0x19080819),
+    uvec2(0x19081908, 0x19080819), uvec2(0x19190808, 0x19080819), uvec2(0x2b080808, 0x19080819), uvec2(0x2b081919, 0x19080819),
+    uvec2(0x2b2b082b, 0x19080819), uvec2(0x08080819, 0x1908082b), uvec2(0x08081908, 0x1908082b), uvec2(0x08190808, 0x1908082b),
+    uvec2(0x0819082b, 0x1908082b), uvec2(0x082b2b19, 0x1908082b), uvec2(0x19080808, 0x1908082b), uvec2(0x08080808, 0x19081908),
+    uvec2(0x0808082b, 0x19081908), uvec2(0x08081919, 0x19081908), uvec2(0x08082b08, 0x19081908), uvec2(0x08190819, 0x19081908),
+    uvec2(0x08191908, 0x19081908), uvec2(0x08192b19, 0x19081908), uvec2(0x082b0808, 0x19081908), uvec2(0x19080819, 0x19081908),
+    uvec2(0x19081908, 0x19081908), uvec2(0x19190808, 0x19081908), uvec2(0x2b080808, 0x19081908), uvec2(0x2b191908, 0x19081908),
+    uvec2(0x08080819, 0x19081919), uvec2(0x08081908, 0x19081919), uvec2(0x08190808, 0x19081919), uvec2(0x082b1908, 0x19081919),
+    uvec2(0x19080808, 0x19081919), uvec2(0x2b192b2b, 0x19081919), uvec2(0x08080808, 0x1908192b), uvec2(0x08082b2b, 0x1908192b),
+    uvec2(0x19081908, 0x1908192b), uvec2(0x19190808, 0x1908192b), uvec2(0x08080819, 0x19082b08), uvec2(0x08081908, 0x19082b08),
+    uvec2(0x08190808, 0x19082b08), uvec2(0x19080808, 0x19082b08), uvec2(0x19081919, 0x19082b08), uvec2(0x19191908, 0x19082b08),
+    uvec2(0x192b082b, 0x19082b08), uvec2(0x08080808, 0x19082b19), uvec2(0x08190819, 0x19082b19), uvec2(0x19081908, 0x19082b19),
+    uvec2(0x19190808, 0x19082b19), uvec2(0x192b2b19, 0x19082b19), uvec2(0x08081908, 0x19082b2b), uvec2(0x08080808, 0x19190808),
+    uvec2(0x0808082b, 0x19190808), uvec2(0x08081919, 0x19190808), uvec2(0x08082b08, 0x19190808), uvec2(0x08190819, 0x19190808),
+    uvec2(0x08191908, 0x19190808), uvec2(0x082b0808, 0x19190808), uvec2(0x082b2b08, 0x19190808), uvec2(0x19080819, 0x19190808),
+    uvec2(0x19081908, 0x19190808), uvec2(0x19190808, 0x19190808), uvec2(0x2b080808, 0x19190808), uvec2(0x08080819, 0x19190819),
+    uvec2(0x08081908, 0x19190819), uvec2(0x08190808, 0x19190819), uvec2(0x08191919, 0x19190819), uvec2(0x19080808, 0x19190819),
+    uvec2(0x1908082b, 0x19190819), uvec2(0x08080808, 0x1919082b), uvec2(0x19081908, 0x1919082b), uvec2(0x2b2b2b2b, 0x1919082b),
+    uvec2(0x08080819, 0x19191908), uvec2(0x08081908, 0x19191908), uvec2(0x08190808, 0x19191908), uvec2(0x082b0819, 0x19191908),
+    uvec2(0x19080808, 0x19191908), uvec2(0x192b0808, 0x19191908), uvec2(0x2b080819, 0x19191908), uvec2(0x2b2b0819, 0x19191908),
+    uvec2(0x08080808, 0x19191919), uvec2(0x08082b08, 0x19191919), uvec2(0x2b080808, 0x19191919), uvec2(0x2b082b08, 0x19191919),
+    uvec2(0x082b0819, 0x1919192b), uvec2(0x192b2b08, 0x1919192b), uvec2(0x2b2b0819, 0x1919192b), uvec2(0x08080808, 0x19192b08),
+    uvec2(0x08191908, 0x19192b08), uvec2(0x19080819, 0x19192b08), uvec2(0x19190808, 0x19192b08), uvec2(0x2b192b19, 0x19192b08),
+    uvec2(0x08192b2b, 0x19192b19), uvec2(0x19080808, 0x19192b19), uvec2(0x1908082b, 0x19192b19), uvec2(0x2b081919, 0x19192b2b),
+    uvec2(0x08080819, 0x192b0808), uvec2(0x08081908, 0x192b0808), uvec2(0x08190808, 0x192b0808), uvec2(0x19080808, 0x192b0808),
+    uvec2(0x19191908, 0x192b0808), uvec2(0x192b082b, 0x192b0808), uvec2(0x2b08192b, 0x192b0808), uvec2(0x2b2b2b19, 0x192b0808),
+    uvec2(0x08080808, 0x192b0819), uvec2(0x082b1908, 0x192b082b), uvec2(0x19082b2b, 0x192b082b), uvec2(0x2b19082b, 0x192b082b),
+    uvec2(0x08080808, 0x192b1908), uvec2(0x0819192b, 0x192b1908), uvec2(0x08190808, 0x192b1919), uvec2(0x19080808, 0x192b1919),
+    uvec2(0x19081919, 0x192b1919), uvec2(0x2b2b1908, 0x192b1919), uvec2(0x08080819, 0x192b2b08), uvec2(0x192b2b2b, 0x192b2b08),
+    uvec2(0x082b1919, 0x192b2b19), uvec2(0x0808192b, 0x192b2b2b), uvec2(0x19191908, 0x192b2b2b), uvec2(0x192b082b, 0x192b2b2b),
+    uvec2(0x08080808, 0x2b080808), uvec2(0x0808082b, 0x2b080808), uvec2(0x08081919, 0x2b080808), uvec2(0x08082b08, 0x2b080808),
+    uvec2(0x08190819, 0x2b080808), uvec2(0x08191908, 0x2b080808), uvec2(0x082b0808, 0x2b080808), uvec2(0x082b2b2b, 0x2b080808),
+    uvec2(0x19080819, 0x2b080808), uvec2(0x19081908, 0x2b080808), uvec2(0x19190808, 0x2b080808), uvec2(0x2b080808, 0x2b080808),
+    uvec2(0x2b08082b, 0x2b080808), uvec2(0x2b2b2b08, 0x2b080808), uvec2(0x2b2b2b2b, 0x2b080808), uvec2(0x08080819, 0x2b080819),
+    uvec2(0x08081908, 0x2b080819), uvec2(0x0808192b, 0x2b080819), uvec2(0x08190808, 0x2b080819), uvec2(0x19080808, 0x2b080819),
+    uvec2(0x19190819, 0x2b080819), uvec2(0x19192b19, 0x2b080819), uvec2(0x08080808, 0x2b08082b), uvec2(0x082b0808, 0x2b08082b),
+    uvec2(0x2b080808, 0x2b08082b), uvec2(0x2b08082b, 0x2b08082b), uvec2(0x2b2b0808, 0x2b08082b), uvec2(0x2b2b2b08, 0x2b08082b),
+    uvec2(0x08080819, 0x2b081908), uvec2(0x08081908, 0x2b081908), uvec2(0x08190808, 0x2b081908), uvec2(0x0819082b, 0x2b081908),
+    uvec2(0x08191919, 0x2b081908), uvec2(0x19080808, 0x2b081908), uvec2(0x192b0808, 0x2b081908), uvec2(0x2b082b19, 0x2b081908),
+    uvec2(0x08080808, 0x2b081919), uvec2(0x19081908, 0x2b081919), uvec2(0x2b2b1919, 0x2b081919), uvec2(0x08192b08, 0x2b08192b),
+    uvec2(0x192b2b2b, 0x2b08192b), uvec2(0x08080808, 0x2b082b08), uvec2(0x08082b08, 0x2b082b08), uvec2(0x082b1919, 0x2b082b08),
+    uvec2(0x19192b2b, 0x2b082b08), uvec2(0x2b080808, 0x2b082b08), uvec2(0x2b08082b, 0x2b082b08), uvec2(0x2b2b2b08, 0x2b082b08),
+    uvec2(0x0808192b, 0x2b082b19), uvec2(0x082b082b, 0x2b082b2b), uvec2(0x2b080808, 0x2b082b2b), uvec2(0x2b082b08, 0x2b082b2b),
+    uvec2(0x2b19192b, 0x2b082b2b), uvec2(0x2b2b2b08, 0x2b082b2b), uvec2(0x08080819, 0x2b190808), uvec2(0x08081908, 0x2b190808),
+    uvec2(0x08190808, 0x2b190808), uvec2(0x19080808, 0x2b190808), uvec2(0x1919192b, 0x2b190808), uvec2(0x2b081908, 0x2b190808),
+    uvec2(0x08080808, 0x2b190819), uvec2(0x082b082b, 0x2b190819), uvec2(0x192b1908, 0x2b190819), uvec2(0x1919192b, 0x2b19082b),
+    uvec2(0x2b082b19, 0x2b19082b), uvec2(0x08080808, 0x2b191908), uvec2(0x08081919, 0x2b191908), uvec2(0x19081908, 0x2b191908),
+    uvec2(0x19190808, 0x2b191908), uvec2(0x19192b08, 0x2b191908), uvec2(0x082b2b19, 0x2b191919), uvec2(0x2b190808, 0x2b191919),
+    uvec2(0x2b19082b, 0x2b191919), uvec2(0x19080819, 0x2b19192b), uvec2(0x19190819, 0x2b192b08), uvec2(0x2b2b192b, 0x2b192b08),
+    uvec2(0x19082b19, 0x2b192b19), uvec2(0x08191919, 0x2b192b2b), uvec2(0x192b0808, 0x2b192b2b), uvec2(0x08080808, 0x2b2b0808),
+    uvec2(0x0808082b, 0x2b2b0808), uvec2(0x08082b08, 0x2b2b0808), uvec2(0x08082b2b, 0x2b2b0808), uvec2(0x082b0808, 0x2b2b0808),
+    uvec2(0x082b2b2b, 0x2b2b0808), uvec2(0x2b2b0808, 0x2b2b0808), uvec2(0x19190819, 0x2b2b0819), uvec2(0x19192b19, 0x2b2b0819),
+    uvec2(0x2b2b192b, 0x2b2b0819), uvec2(0x08080808, 0x2b2b082b), uvec2(0x0808082b, 0x2b2b082b), uvec2(0x08082b08, 0x2b2b082b),
+    uvec2(0x082b2b2b, 0x2b2b082b), uvec2(0x2b080808, 0x2b2b082b), uvec2(0x2b2b0808, 0x2b2b082b), uvec2(0x19080808, 0x2b2b1908),
+    uvec2(0x2b191919, 0x2b2b1908), uvec2(0x192b1919, 0x2b2b192b), uvec2(0x2b192b08, 0x2b2b192b), uvec2(0x08082b2b, 0x2b2b2b08),
+    uvec2(0x082b0808, 0x2b2b2b08), uvec2(0x082b082b, 0x2b2b2b08), uvec2(0x082b2b08, 0x2b2b2b08), uvec2(0x2b2b0808, 0x2b2b2b08),
+    uvec2(0x2b2b2b08, 0x2b2b2b08), uvec2(0x08081908, 0x2b2b2b19), uvec2(0x2b081908, 0x2b2b2b19), uvec2(0x2b08192b, 0x2b2b2b19),
+    uvec2(0x082b2b08, 0x2b2b2b2b), uvec2(0x082b2b2b, 0x2b2b2b2b), uvec2(0x2b190819, 0x2b2b2b2b), uvec2(0x2b2b2b2b, 0x2b2b2b2b),
+};
+
+shared uvec2 iq2xs_grid[512];
+
+void init_iq_shmem(uvec3 wgsize)
+{
+    // copy the table into shared memory and sync
+    for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += wgsize.x) {
+        iq2xs_grid[i] = iq2xs_grid_const[i];
+    }
+    barrier();
+}
+
+#define QUANT_K QUANT_K_IQ2_XS
+#define QUANT_R QUANT_R_IQ2_XS
+#define A_TYPE block_iq2_xs
+#define A_TYPE_PACKED16 block_iq2_xs_packed16
+#endif
+
+#define QUANT_K_IQ2_S 256
+#define QUANT_R_IQ2_S 1
+
+struct block_iq2_s
+{
+    float16_t d;
+    uint8_t qs[QUANT_K_IQ2_S/4];
+    uint8_t qh[QUANT_K_IQ2_S/32];
+    uint8_t scales[QUANT_K_IQ2_S/32];
+};
+
+#if defined(DATA_A_IQ2_S)
+
+const uvec2 iq2s_grid_const[1024] = {
+    uvec2(0x08080808, 0x08080808), uvec2(0x0808082b, 0x08080808), uvec2(0x08081919, 0x08080808), uvec2(0x08082b08, 0x08080808),
+    uvec2(0x08082b2b, 0x08080808), uvec2(0x08190819, 0x08080808), uvec2(0x08191908, 0x08080808), uvec2(0x0819192b, 0x08080808),
+    uvec2(0x08192b19, 0x08080808), uvec2(0x082b0808, 0x08080808), uvec2(0x082b082b, 0x08080808), uvec2(0x082b1919, 0x08080808),
+    uvec2(0x082b2b08, 0x08080808), uvec2(0x19080819, 0x08080808), uvec2(0x19081908, 0x08080808), uvec2(0x1908192b, 0x08080808),
+    uvec2(0x19082b19, 0x08080808), uvec2(0x19190808, 0x08080808), uvec2(0x1919082b, 0x08080808), uvec2(0x19191919, 0x08080808),
+    uvec2(0x19192b08, 0x08080808), uvec2(0x192b0819, 0x08080808), uvec2(0x192b1908, 0x08080808), uvec2(0x192b192b, 0x08080808),
+    uvec2(0x192b2b19, 0x08080808), uvec2(0x2b080808, 0x08080808), uvec2(0x2b08082b, 0x08080808), uvec2(0x2b081919, 0x08080808),
+    uvec2(0x2b082b08, 0x08080808), uvec2(0x2b190819, 0x08080808), uvec2(0x2b191908, 0x08080808), uvec2(0x2b2b0808, 0x08080808),
+    uvec2(0x2b2b1919, 0x08080808), uvec2(0x2b2b2b2b, 0x08080808), uvec2(0x08080819, 0x08080819), uvec2(0x08081908, 0x08080819),
+    uvec2(0x0808192b, 0x08080819), uvec2(0x08082b19, 0x08080819), uvec2(0x08190808, 0x08080819), uvec2(0x0819082b, 0x08080819),
+    uvec2(0x08191919, 0x08080819), uvec2(0x08192b08, 0x08080819), uvec2(0x082b0819, 0x08080819), uvec2(0x082b1908, 0x08080819),
+    uvec2(0x19080808, 0x08080819), uvec2(0x1908082b, 0x08080819), uvec2(0x19081919, 0x08080819), uvec2(0x19082b08, 0x08080819),
+    uvec2(0x19190819, 0x08080819), uvec2(0x19191908, 0x08080819), uvec2(0x1919192b, 0x08080819), uvec2(0x19192b19, 0x08080819),
+    uvec2(0x192b0808, 0x08080819), uvec2(0x192b1919, 0x08080819), uvec2(0x192b2b08, 0x08080819), uvec2(0x2b080819, 0x08080819),
+    uvec2(0x2b081908, 0x08080819), uvec2(0x2b190808, 0x08080819), uvec2(0x2b19082b, 0x08080819), uvec2(0x2b191919, 0x08080819),
+    uvec2(0x2b2b0819, 0x08080819), uvec2(0x2b2b1908, 0x08080819), uvec2(0x08080808, 0x0808082b), uvec2(0x0808082b, 0x0808082b),
+    uvec2(0x08081919, 0x0808082b), uvec2(0x08082b08, 0x0808082b), uvec2(0x08190819, 0x0808082b), uvec2(0x08191908, 0x0808082b),
+    uvec2(0x082b0808, 0x0808082b), uvec2(0x082b2b2b, 0x0808082b), uvec2(0x19080819, 0x0808082b), uvec2(0x19081908, 0x0808082b),
+    uvec2(0x1908192b, 0x0808082b), uvec2(0x19082b19, 0x0808082b), uvec2(0x19190808, 0x0808082b), uvec2(0x19191919, 0x0808082b),
+    uvec2(0x2b080808, 0x0808082b), uvec2(0x2b081919, 0x0808082b), uvec2(0x2b082b2b, 0x0808082b), uvec2(0x2b191908, 0x0808082b),
+    uvec2(0x2b2b082b, 0x0808082b), uvec2(0x08080819, 0x08081908), uvec2(0x08081908, 0x08081908), uvec2(0x0808192b, 0x08081908),
+    uvec2(0x08082b19, 0x08081908), uvec2(0x08190808, 0x08081908), uvec2(0x0819082b, 0x08081908), uvec2(0x08191919, 0x08081908),
+    uvec2(0x08192b08, 0x08081908), uvec2(0x082b0819, 0x08081908), uvec2(0x082b1908, 0x08081908), uvec2(0x082b192b, 0x08081908),
+    uvec2(0x082b2b19, 0x08081908), uvec2(0x19080808, 0x08081908), uvec2(0x1908082b, 0x08081908), uvec2(0x19081919, 0x08081908),
+    uvec2(0x19082b08, 0x08081908), uvec2(0x19082b2b, 0x08081908), uvec2(0x19190819, 0x08081908), uvec2(0x19191908, 0x08081908),
+    uvec2(0x1919192b, 0x08081908), uvec2(0x19192b19, 0x08081908), uvec2(0x192b0808, 0x08081908), uvec2(0x192b082b, 0x08081908),
+    uvec2(0x192b1919, 0x08081908), uvec2(0x2b080819, 0x08081908), uvec2(0x2b081908, 0x08081908), uvec2(0x2b08192b, 0x08081908),
+    uvec2(0x2b082b19, 0x08081908), uvec2(0x2b190808, 0x08081908), uvec2(0x2b191919, 0x08081908), uvec2(0x2b192b08, 0x08081908),
+    uvec2(0x2b2b0819, 0x08081908), uvec2(0x2b2b1908, 0x08081908), uvec2(0x08080808, 0x08081919), uvec2(0x0808082b, 0x08081919),
+    uvec2(0x08081919, 0x08081919), uvec2(0x08082b08, 0x08081919), uvec2(0x08082b2b, 0x08081919), uvec2(0x08190819, 0x08081919),
+    uvec2(0x08191908, 0x08081919), uvec2(0x0819192b, 0x08081919), uvec2(0x08192b19, 0x08081919), uvec2(0x082b0808, 0x08081919),
+    uvec2(0x082b1919, 0x08081919), uvec2(0x082b2b08, 0x08081919), uvec2(0x19080819, 0x08081919), uvec2(0x19081908, 0x08081919),
+    uvec2(0x1908192b, 0x08081919), uvec2(0x19082b19, 0x08081919), uvec2(0x19190808, 0x08081919), uvec2(0x1919082b, 0x08081919),
+    uvec2(0x19191919, 0x08081919), uvec2(0x19192b08, 0x08081919), uvec2(0x192b0819, 0x08081919), uvec2(0x192b1908, 0x08081919),
+    uvec2(0x2b080808, 0x08081919), uvec2(0x2b08082b, 0x08081919), uvec2(0x2b081919, 0x08081919), uvec2(0x2b082b08, 0x08081919),
+    uvec2(0x2b190819, 0x08081919), uvec2(0x2b191908, 0x08081919), uvec2(0x2b2b0808, 0x08081919), uvec2(0x08080819, 0x0808192b),
+    uvec2(0x08081908, 0x0808192b), uvec2(0x0808192b, 0x0808192b), uvec2(0x08082b19, 0x0808192b), uvec2(0x08190808, 0x0808192b),
+    uvec2(0x08191919, 0x0808192b), uvec2(0x19080808, 0x0808192b), uvec2(0x19081919, 0x0808192b), uvec2(0x19082b08, 0x0808192b),
+    uvec2(0x19190819, 0x0808192b), uvec2(0x19191908, 0x0808192b), uvec2(0x192b0808, 0x0808192b), uvec2(0x2b080819, 0x0808192b),
+    uvec2(0x2b081908, 0x0808192b), uvec2(0x2b190808, 0x0808192b), uvec2(0x08080808, 0x08082b08), uvec2(0x0808082b, 0x08082b08),
+    uvec2(0x08081919, 0x08082b08), uvec2(0x08082b08, 0x08082b08), uvec2(0x08190819, 0x08082b08), uvec2(0x08191908, 0x08082b08),
+    uvec2(0x0819192b, 0x08082b08), uvec2(0x08192b19, 0x08082b08), uvec2(0x082b0808, 0x08082b08), uvec2(0x082b1919, 0x08082b08),
+    uvec2(0x082b2b2b, 0x08082b08), uvec2(0x19080819, 0x08082b08), uvec2(0x19081908, 0x08082b08), uvec2(0x1908192b, 0x08082b08),
+    uvec2(0x19082b19, 0x08082b08), uvec2(0x19190808, 0x08082b08), uvec2(0x1919082b, 0x08082b08), uvec2(0x19191919, 0x08082b08),
+    uvec2(0x19192b08, 0x08082b08), uvec2(0x192b0819, 0x08082b08), uvec2(0x192b1908, 0x08082b08), uvec2(0x2b080808, 0x08082b08),
+    uvec2(0x2b081919, 0x08082b08), uvec2(0x2b191908, 0x08082b08), uvec2(0x2b2b2b2b, 0x08082b08), uvec2(0x08080819, 0x08082b19),
+    uvec2(0x08081908, 0x08082b19), uvec2(0x08190808, 0x08082b19), uvec2(0x0819082b, 0x08082b19), uvec2(0x08191919, 0x08082b19),
+    uvec2(0x08192b08, 0x08082b19), uvec2(0x082b0819, 0x08082b19), uvec2(0x19080808, 0x08082b19), uvec2(0x19081919, 0x08082b19),
+    uvec2(0x19082b08, 0x08082b19), uvec2(0x19190819, 0x08082b19), uvec2(0x19191908, 0x08082b19), uvec2(0x192b0808, 0x08082b19),
+    uvec2(0x2b080819, 0x08082b19), uvec2(0x2b190808, 0x08082b19), uvec2(0x08080808, 0x08082b2b), uvec2(0x08190819, 0x08082b2b),
+    uvec2(0x08191908, 0x08082b2b), uvec2(0x082b082b, 0x08082b2b), uvec2(0x082b2b08, 0x08082b2b), uvec2(0x082b2b2b, 0x08082b2b),
+    uvec2(0x19190808, 0x08082b2b), uvec2(0x2b192b19, 0x08082b2b), uvec2(0x08080819, 0x08190808), uvec2(0x08081908, 0x08190808),
+    uvec2(0x0808192b, 0x08190808), uvec2(0x08082b19, 0x08190808), uvec2(0x08190808, 0x08190808), uvec2(0x0819082b, 0x08190808),
+    uvec2(0x08191919, 0x08190808), uvec2(0x08192b08, 0x08190808), uvec2(0x082b0819, 0x08190808), uvec2(0x082b1908, 0x08190808),
+    uvec2(0x082b192b, 0x08190808), uvec2(0x19080808, 0x08190808), uvec2(0x1908082b, 0x08190808), uvec2(0x19081919, 0x08190808),
+    uvec2(0x19082b08, 0x08190808), uvec2(0x19190819, 0x08190808), uvec2(0x19191908, 0x08190808), uvec2(0x1919192b, 0x08190808),
+    uvec2(0x19192b19, 0x08190808), uvec2(0x192b0808, 0x08190808), uvec2(0x192b082b, 0x08190808), uvec2(0x192b1919, 0x08190808),
+    uvec2(0x192b2b08, 0x08190808), uvec2(0x2b080819, 0x08190808), uvec2(0x2b081908, 0x08190808), uvec2(0x2b08192b, 0x08190808),
+    uvec2(0x2b190808, 0x08190808), uvec2(0x2b191919, 0x08190808), uvec2(0x2b192b08, 0x08190808), uvec2(0x2b2b0819, 0x08190808),
+    uvec2(0x2b2b1908, 0x08190808), uvec2(0x08080808, 0x08190819), uvec2(0x0808082b, 0x08190819), uvec2(0x08081919, 0x08190819),
+    uvec2(0x08082b08, 0x08190819), uvec2(0x08082b2b, 0x08190819), uvec2(0x08190819, 0x08190819), uvec2(0x08191908, 0x08190819),
+    uvec2(0x0819192b, 0x08190819), uvec2(0x08192b19, 0x08190819), uvec2(0x082b0808, 0x08190819), uvec2(0x082b082b, 0x08190819),
+    uvec2(0x082b1919, 0x08190819), uvec2(0x082b2b08, 0x08190819), uvec2(0x19080819, 0x08190819), uvec2(0x19081908, 0x08190819),
+    uvec2(0x1908192b, 0x08190819), uvec2(0x19082b19, 0x08190819), uvec2(0x19190808, 0x08190819), uvec2(0x1919082b, 0x08190819),
+    uvec2(0x19191919, 0x08190819), uvec2(0x19192b08, 0x08190819), uvec2(0x192b0819, 0x08190819), uvec2(0x192b1908, 0x08190819),
+    uvec2(0x2b080808, 0x08190819), uvec2(0x2b08082b, 0x08190819), uvec2(0x2b081919, 0x08190819), uvec2(0x2b082b08, 0x08190819),
+    uvec2(0x2b190819, 0x08190819), uvec2(0x2b191908, 0x08190819), uvec2(0x08080819, 0x0819082b), uvec2(0x08081908, 0x0819082b),
+    uvec2(0x08082b19, 0x0819082b), uvec2(0x08190808, 0x0819082b), uvec2(0x08191919, 0x0819082b), uvec2(0x082b0819, 0x0819082b),
+    uvec2(0x082b1908, 0x0819082b), uvec2(0x19080808, 0x0819082b), uvec2(0x19081919, 0x0819082b), uvec2(0x19190819, 0x0819082b),
+    uvec2(0x19191908, 0x0819082b), uvec2(0x2b080819, 0x0819082b), uvec2(0x2b081908, 0x0819082b), uvec2(0x2b190808, 0x0819082b),
+    uvec2(0x08080808, 0x08191908), uvec2(0x0808082b, 0x08191908), uvec2(0x08081919, 0x08191908), uvec2(0x08082b08, 0x08191908),
+    uvec2(0x08190819, 0x08191908), uvec2(0x08191908, 0x08191908), uvec2(0x0819192b, 0x08191908), uvec2(0x08192b19, 0x08191908),
+    uvec2(0x082b0808, 0x08191908), uvec2(0x082b1919, 0x08191908), uvec2(0x082b2b08, 0x08191908), uvec2(0x19080819, 0x08191908),
+    uvec2(0x19081908, 0x08191908), uvec2(0x1908192b, 0x08191908), uvec2(0x19082b19, 0x08191908), uvec2(0x19190808, 0x08191908),
+    uvec2(0x1919082b, 0x08191908), uvec2(0x19191919, 0x08191908), uvec2(0x19192b08, 0x08191908), uvec2(0x192b0819, 0x08191908),
+    uvec2(0x192b1908, 0x08191908), uvec2(0x2b080808, 0x08191908), uvec2(0x2b08082b, 0x08191908), uvec2(0x2b081919, 0x08191908),
+    uvec2(0x2b082b08, 0x08191908), uvec2(0x2b190819, 0x08191908), uvec2(0x2b191908, 0x08191908), uvec2(0x2b2b0808, 0x08191908),
+    uvec2(0x08080819, 0x08191919), uvec2(0x08081908, 0x08191919), uvec2(0x0808192b, 0x08191919), uvec2(0x08082b19, 0x08191919),
+    uvec2(0x08190808, 0x08191919), uvec2(0x0819082b, 0x08191919), uvec2(0x08191919, 0x08191919), uvec2(0x08192b08, 0x08191919),
+    uvec2(0x082b0819, 0x08191919), uvec2(0x082b1908, 0x08191919), uvec2(0x19080808, 0x08191919), uvec2(0x1908082b, 0x08191919),
+    uvec2(0x19081919, 0x08191919), uvec2(0x19082b08, 0x08191919), uvec2(0x19190819, 0x08191919), uvec2(0x19191908, 0x08191919),
+    uvec2(0x192b0808, 0x08191919), uvec2(0x2b080819, 0x08191919), uvec2(0x2b081908, 0x08191919), uvec2(0x2b190808, 0x08191919),
+    uvec2(0x08080808, 0x0819192b), uvec2(0x08081919, 0x0819192b), uvec2(0x08082b08, 0x0819192b), uvec2(0x08190819, 0x0819192b),
+    uvec2(0x08191908, 0x0819192b), uvec2(0x082b0808, 0x0819192b), uvec2(0x19080819, 0x0819192b), uvec2(0x19081908, 0x0819192b),
+    uvec2(0x19190808, 0x0819192b), uvec2(0x2b080808, 0x0819192b), uvec2(0x2b2b2b2b, 0x0819192b), uvec2(0x08080819, 0x08192b08),
+    uvec2(0x08081908, 0x08192b08), uvec2(0x0808192b, 0x08192b08), uvec2(0x08082b19, 0x08192b08), uvec2(0x08190808, 0x08192b08),
+    uvec2(0x08191919, 0x08192b08), uvec2(0x08192b08, 0x08192b08), uvec2(0x082b0819, 0x08192b08), uvec2(0x19080808, 0x08192b08),
+    uvec2(0x1908082b, 0x08192b08), uvec2(0x19081919, 0x08192b08), uvec2(0x19082b08, 0x08192b08), uvec2(0x19190819, 0x08192b08),
+    uvec2(0x19191908, 0x08192b08), uvec2(0x192b0808, 0x08192b08), uvec2(0x2b080819, 0x08192b08), uvec2(0x2b081908, 0x08192b08),
+    uvec2(0x08080808, 0x08192b19), uvec2(0x0808082b, 0x08192b19), uvec2(0x08081919, 0x08192b19), uvec2(0x08082b08, 0x08192b19),
+    uvec2(0x08190819, 0x08192b19), uvec2(0x08191908, 0x08192b19), uvec2(0x082b0808, 0x08192b19), uvec2(0x19080819, 0x08192b19),
+    uvec2(0x19081908, 0x08192b19), uvec2(0x19190808, 0x08192b19), uvec2(0x192b2b19, 0x08192b19), uvec2(0x2b2b082b, 0x08192b19),
+    uvec2(0x08081908, 0x08192b2b), uvec2(0x08190808, 0x08192b2b), uvec2(0x19080808, 0x08192b2b), uvec2(0x1919192b, 0x08192b2b),
+    uvec2(0x08080808, 0x082b0808), uvec2(0x0808082b, 0x082b0808), uvec2(0x08081919, 0x082b0808), uvec2(0x08082b08, 0x082b0808),
+    uvec2(0x08190819, 0x082b0808), uvec2(0x08191908, 0x082b0808), uvec2(0x0819192b, 0x082b0808), uvec2(0x08192b19, 0x082b0808),
+    uvec2(0x082b0808, 0x082b0808), uvec2(0x082b1919, 0x082b0808), uvec2(0x082b2b2b, 0x082b0808), uvec2(0x19080819, 0x082b0808),
+    uvec2(0x19081908, 0x082b0808), uvec2(0x19190808, 0x082b0808), uvec2(0x1919082b, 0x082b0808), uvec2(0x19191919, 0x082b0808),
+    uvec2(0x192b1908, 0x082b0808), uvec2(0x2b080808, 0x082b0808), uvec2(0x2b082b2b, 0x082b0808), uvec2(0x2b191908, 0x082b0808),
+    uvec2(0x2b2b2b2b, 0x082b0808), uvec2(0x08080819, 0x082b0819), uvec2(0x08081908, 0x082b0819), uvec2(0x08190808, 0x082b0819),
+    uvec2(0x0819082b, 0x082b0819), uvec2(0x08191919, 0x082b0819), uvec2(0x082b0819, 0x082b0819), uvec2(0x19080808, 0x082b0819),
+    uvec2(0x1908082b, 0x082b0819), uvec2(0x19081919, 0x082b0819), uvec2(0x19190819, 0x082b0819), uvec2(0x19191908, 0x082b0819),
+    uvec2(0x192b0808, 0x082b0819), uvec2(0x2b080819, 0x082b0819), uvec2(0x2b081908, 0x082b0819), uvec2(0x2b190808, 0x082b0819),
+    uvec2(0x08080808, 0x082b082b), uvec2(0x08082b2b, 0x082b082b), uvec2(0x082b082b, 0x082b082b), uvec2(0x082b2b08, 0x082b082b),
+    uvec2(0x082b2b2b, 0x082b082b), uvec2(0x19081908, 0x082b082b), uvec2(0x19190808, 0x082b082b), uvec2(0x2b082b08, 0x082b082b),
+    uvec2(0x2b082b2b, 0x082b082b), uvec2(0x2b2b2b08, 0x082b082b), uvec2(0x08080819, 0x082b1908), uvec2(0x08081908, 0x082b1908),
+    uvec2(0x0808192b, 0x082b1908), uvec2(0x08082b19, 0x082b1908), uvec2(0x08190808, 0x082b1908), uvec2(0x08191919, 0x082b1908),
+    uvec2(0x08192b08, 0x082b1908), uvec2(0x082b0819, 0x082b1908), uvec2(0x082b1908, 0x082b1908), uvec2(0x19080808, 0x082b1908),
+    uvec2(0x1908082b, 0x082b1908), uvec2(0x19081919, 0x082b1908), uvec2(0x19082b08, 0x082b1908), uvec2(0x19190819, 0x082b1908),
+    uvec2(0x19191908, 0x082b1908), uvec2(0x192b0808, 0x082b1908), uvec2(0x2b080819, 0x082b1908), uvec2(0x2b081908, 0x082b1908),
+    uvec2(0x2b190808, 0x082b1908), uvec2(0x08080808, 0x082b1919), uvec2(0x08081919, 0x082b1919), uvec2(0x08082b08, 0x082b1919),
+    uvec2(0x08190819, 0x082b1919), uvec2(0x08191908, 0x082b1919), uvec2(0x082b0808, 0x082b1919), uvec2(0x19080819, 0x082b1919),
+    uvec2(0x19081908, 0x082b1919), uvec2(0x19190808, 0x082b1919), uvec2(0x192b192b, 0x082b1919), uvec2(0x2b080808, 0x082b1919),
+    uvec2(0x08080819, 0x082b192b), uvec2(0x08081908, 0x082b192b), uvec2(0x08190808, 0x082b192b), uvec2(0x19080808, 0x082b192b),
+    uvec2(0x19192b19, 0x082b192b), uvec2(0x08080808, 0x082b2b08), uvec2(0x08081919, 0x082b2b08), uvec2(0x08190819, 0x082b2b08),
+    uvec2(0x08191908, 0x082b2b08), uvec2(0x19080819, 0x082b2b08), uvec2(0x19081908, 0x082b2b08), uvec2(0x19190808, 0x082b2b08),
+    uvec2(0x2b082b2b, 0x082b2b08), uvec2(0x2b2b2b2b, 0x082b2b08), uvec2(0x08080819, 0x082b2b19), uvec2(0x08081908, 0x082b2b19),
+    uvec2(0x08190808, 0x082b2b19), uvec2(0x2b191919, 0x082b2b19), uvec2(0x08082b2b, 0x082b2b2b), uvec2(0x082b082b, 0x082b2b2b),
+    uvec2(0x192b1908, 0x082b2b2b), uvec2(0x2b082b08, 0x082b2b2b), uvec2(0x2b082b2b, 0x082b2b2b), uvec2(0x08080819, 0x19080808),
+    uvec2(0x08081908, 0x19080808), uvec2(0x0808192b, 0x19080808), uvec2(0x08082b19, 0x19080808), uvec2(0x08190808, 0x19080808),
+    uvec2(0x0819082b, 0x19080808), uvec2(0x08191919, 0x19080808), uvec2(0x08192b08, 0x19080808), uvec2(0x08192b2b, 0x19080808),
+    uvec2(0x082b0819, 0x19080808), uvec2(0x082b1908, 0x19080808), uvec2(0x082b192b, 0x19080808), uvec2(0x19080808, 0x19080808),
+    uvec2(0x1908082b, 0x19080808), uvec2(0x19081919, 0x19080808), uvec2(0x19082b08, 0x19080808), uvec2(0x19082b2b, 0x19080808),
+    uvec2(0x19190819, 0x19080808), uvec2(0x19191908, 0x19080808), uvec2(0x1919192b, 0x19080808), uvec2(0x19192b19, 0x19080808),
+    uvec2(0x192b0808, 0x19080808), uvec2(0x192b082b, 0x19080808), uvec2(0x192b1919, 0x19080808), uvec2(0x2b080819, 0x19080808),
+    uvec2(0x2b081908, 0x19080808), uvec2(0x2b190808, 0x19080808), uvec2(0x2b191919, 0x19080808), uvec2(0x2b192b08, 0x19080808),
+    uvec2(0x2b2b0819, 0x19080808), uvec2(0x2b2b1908, 0x19080808), uvec2(0x08080808, 0x19080819), uvec2(0x0808082b, 0x19080819),
+    uvec2(0x08081919, 0x19080819), uvec2(0x08082b08, 0x19080819), uvec2(0x08190819, 0x19080819), uvec2(0x08191908, 0x19080819),
+    uvec2(0x0819192b, 0x19080819), uvec2(0x08192b19, 0x19080819), uvec2(0x082b0808, 0x19080819), uvec2(0x082b082b, 0x19080819),
+    uvec2(0x082b1919, 0x19080819), uvec2(0x19080819, 0x19080819), uvec2(0x19081908, 0x19080819), uvec2(0x1908192b, 0x19080819),
+    uvec2(0x19082b19, 0x19080819), uvec2(0x19190808, 0x19080819), uvec2(0x1919082b, 0x19080819), uvec2(0x19191919, 0x19080819),
+    uvec2(0x19192b08, 0x19080819), uvec2(0x192b0819, 0x19080819), uvec2(0x192b1908, 0x19080819), uvec2(0x2b080808, 0x19080819),
+    uvec2(0x2b08082b, 0x19080819), uvec2(0x2b081919, 0x19080819), uvec2(0x2b082b08, 0x19080819), uvec2(0x2b190819, 0x19080819),
+    uvec2(0x2b191908, 0x19080819), uvec2(0x2b2b0808, 0x19080819), uvec2(0x08080819, 0x1908082b), uvec2(0x08081908, 0x1908082b),
+    uvec2(0x08190808, 0x1908082b), uvec2(0x0819082b, 0x1908082b), uvec2(0x08191919, 0x1908082b), uvec2(0x08192b08, 0x1908082b),
+    uvec2(0x082b1908, 0x1908082b), uvec2(0x19080808, 0x1908082b), uvec2(0x19081919, 0x1908082b), uvec2(0x19082b08, 0x1908082b),
+    uvec2(0x19190819, 0x1908082b), uvec2(0x19191908, 0x1908082b), uvec2(0x192b0808, 0x1908082b), uvec2(0x2b080819, 0x1908082b),
+    uvec2(0x2b081908, 0x1908082b), uvec2(0x08080808, 0x19081908), uvec2(0x0808082b, 0x19081908), uvec2(0x08081919, 0x19081908),
+    uvec2(0x08082b08, 0x19081908), uvec2(0x08082b2b, 0x19081908), uvec2(0x08190819, 0x19081908), uvec2(0x08191908, 0x19081908),
+    uvec2(0x0819192b, 0x19081908), uvec2(0x08192b19, 0x19081908), uvec2(0x082b0808, 0x19081908), uvec2(0x082b082b, 0x19081908),
+    uvec2(0x082b1919, 0x19081908), uvec2(0x082b2b08, 0x19081908), uvec2(0x19080819, 0x19081908), uvec2(0x19081908, 0x19081908),
+    uvec2(0x1908192b, 0x19081908), uvec2(0x19082b19, 0x19081908), uvec2(0x19190808, 0x19081908), uvec2(0x1919082b, 0x19081908),
+    uvec2(0x19191919, 0x19081908), uvec2(0x19192b08, 0x19081908), uvec2(0x192b0819, 0x19081908), uvec2(0x192b1908, 0x19081908),
+    uvec2(0x2b080808, 0x19081908), uvec2(0x2b08082b, 0x19081908), uvec2(0x2b081919, 0x19081908), uvec2(0x2b082b08, 0x19081908),
+    uvec2(0x2b190819, 0x19081908), uvec2(0x2b191908, 0x19081908), uvec2(0x2b2b0808, 0x19081908), uvec2(0x08080819, 0x19081919),
+    uvec2(0x08081908, 0x19081919), uvec2(0x0808192b, 0x19081919), uvec2(0x08082b19, 0x19081919), uvec2(0x08190808, 0x19081919),
+    uvec2(0x0819082b, 0x19081919), uvec2(0x08191919, 0x19081919), uvec2(0x08192b08, 0x19081919), uvec2(0x082b0819, 0x19081919),
+    uvec2(0x082b1908, 0x19081919), uvec2(0x19080808, 0x19081919), uvec2(0x1908082b, 0x19081919), uvec2(0x19081919, 0x19081919),
+    uvec2(0x19082b08, 0x19081919), uvec2(0x19190819, 0x19081919), uvec2(0x19191908, 0x19081919), uvec2(0x192b0808, 0x19081919),
+    uvec2(0x192b2b2b, 0x19081919), uvec2(0x2b080819, 0x19081919), uvec2(0x2b081908, 0x19081919), uvec2(0x2b190808, 0x19081919),
+    uvec2(0x08080808, 0x1908192b), uvec2(0x0808082b, 0x1908192b), uvec2(0x08081919, 0x1908192b), uvec2(0x08082b08, 0x1908192b),
+    uvec2(0x08190819, 0x1908192b), uvec2(0x08191908, 0x1908192b), uvec2(0x082b0808, 0x1908192b), uvec2(0x19080819, 0x1908192b),
+    uvec2(0x19081908, 0x1908192b), uvec2(0x19190808, 0x1908192b), uvec2(0x2b080808, 0x1908192b), uvec2(0x2b2b1919, 0x1908192b),
+    uvec2(0x08080819, 0x19082b08), uvec2(0x08081908, 0x19082b08), uvec2(0x08082b19, 0x19082b08), uvec2(0x08190808, 0x19082b08),
+    uvec2(0x0819082b, 0x19082b08), uvec2(0x08191919, 0x19082b08), uvec2(0x08192b08, 0x19082b08), uvec2(0x082b0819, 0x19082b08),
+    uvec2(0x082b1908, 0x19082b08), uvec2(0x19080808, 0x19082b08), uvec2(0x1908082b, 0x19082b08), uvec2(0x19081919, 0x19082b08),
+    uvec2(0x19082b08, 0x19082b08), uvec2(0x19190819, 0x19082b08), uvec2(0x19191908, 0x19082b08), uvec2(0x192b0808, 0x19082b08),
+    uvec2(0x2b081908, 0x19082b08), uvec2(0x2b190808, 0x19082b08), uvec2(0x08080808, 0x19082b19), uvec2(0x0808082b, 0x19082b19),
+    uvec2(0x08081919, 0x19082b19), uvec2(0x08082b08, 0x19082b19), uvec2(0x08190819, 0x19082b19), uvec2(0x08191908, 0x19082b19),
+    uvec2(0x082b0808, 0x19082b19), uvec2(0x19080819, 0x19082b19), uvec2(0x19081908, 0x19082b19), uvec2(0x19190808, 0x19082b19),
+    uvec2(0x2b080808, 0x19082b19), uvec2(0x2b19192b, 0x19082b19), uvec2(0x08080819, 0x19082b2b), uvec2(0x08081908, 0x19082b2b),
+    uvec2(0x08190808, 0x19082b2b), uvec2(0x19080808, 0x19082b2b), uvec2(0x08080808, 0x19190808), uvec2(0x0808082b, 0x19190808),
+    uvec2(0x08081919, 0x19190808), uvec2(0x08082b08, 0x19190808), uvec2(0x08190819, 0x19190808), uvec2(0x08191908, 0x19190808),
+    uvec2(0x0819192b, 0x19190808), uvec2(0x08192b19, 0x19190808), uvec2(0x082b0808, 0x19190808), uvec2(0x082b082b, 0x19190808),
+    uvec2(0x082b1919, 0x19190808), uvec2(0x082b2b08, 0x19190808), uvec2(0x19080819, 0x19190808), uvec2(0x19081908, 0x19190808),
+    uvec2(0x1908192b, 0x19190808), uvec2(0x19082b19, 0x19190808), uvec2(0x19190808, 0x19190808), uvec2(0x1919082b, 0x19190808),
+    uvec2(0x19191919, 0x19190808), uvec2(0x19192b08, 0x19190808), uvec2(0x192b0819, 0x19190808), uvec2(0x192b1908, 0x19190808),
+    uvec2(0x2b080808, 0x19190808), uvec2(0x2b08082b, 0x19190808), uvec2(0x2b081919, 0x19190808), uvec2(0x2b082b08, 0x19190808),
+    uvec2(0x2b190819, 0x19190808), uvec2(0x2b191908, 0x19190808), uvec2(0x08080819, 0x19190819), uvec2(0x08081908, 0x19190819),
+    uvec2(0x0808192b, 0x19190819), uvec2(0x08082b19, 0x19190819), uvec2(0x08190808, 0x19190819), uvec2(0x0819082b, 0x19190819),
+    uvec2(0x08191919, 0x19190819), uvec2(0x08192b08, 0x19190819), uvec2(0x082b0819, 0x19190819), uvec2(0x082b1908, 0x19190819),
+    uvec2(0x19080808, 0x19190819), uvec2(0x1908082b, 0x19190819), uvec2(0x19081919, 0x19190819), uvec2(0x19082b08, 0x19190819),
+    uvec2(0x19190819, 0x19190819), uvec2(0x19191908, 0x19190819), uvec2(0x192b0808, 0x19190819), uvec2(0x2b080819, 0x19190819),
+    uvec2(0x2b081908, 0x19190819), uvec2(0x2b190808, 0x19190819), uvec2(0x08080808, 0x1919082b), uvec2(0x08081919, 0x1919082b),
+    uvec2(0x08082b08, 0x1919082b), uvec2(0x08190819, 0x1919082b), uvec2(0x08191908, 0x1919082b), uvec2(0x082b0808, 0x1919082b),
+    uvec2(0x19080819, 0x1919082b), uvec2(0x19081908, 0x1919082b), uvec2(0x19190808, 0x1919082b), uvec2(0x192b2b19, 0x1919082b),
+    uvec2(0x2b080808, 0x1919082b), uvec2(0x08080819, 0x19191908), uvec2(0x08081908, 0x19191908), uvec2(0x0808192b, 0x19191908),
+    uvec2(0x08082b19, 0x19191908), uvec2(0x08190808, 0x19191908), uvec2(0x0819082b, 0x19191908), uvec2(0x08191919, 0x19191908),
+    uvec2(0x08192b08, 0x19191908), uvec2(0x082b0819, 0x19191908), uvec2(0x082b1908, 0x19191908), uvec2(0x19080808, 0x19191908),
+    uvec2(0x1908082b, 0x19191908), uvec2(0x19081919, 0x19191908), uvec2(0x19082b08, 0x19191908), uvec2(0x19190819, 0x19191908),
+    uvec2(0x19191908, 0x19191908), uvec2(0x192b0808, 0x19191908), uvec2(0x2b080819, 0x19191908), uvec2(0x2b081908, 0x19191908),
+    uvec2(0x2b190808, 0x19191908), uvec2(0x08080808, 0x19191919), uvec2(0x0808082b, 0x19191919), uvec2(0x08081919, 0x19191919),
+    uvec2(0x08082b08, 0x19191919), uvec2(0x08190819, 0x19191919), uvec2(0x08191908, 0x19191919), uvec2(0x082b0808, 0x19191919),
+    uvec2(0x19080819, 0x19191919), uvec2(0x19081908, 0x19191919), uvec2(0x19190808, 0x19191919), uvec2(0x2b080808, 0x19191919),
+    uvec2(0x08080819, 0x1919192b), uvec2(0x08081908, 0x1919192b), uvec2(0x08190808, 0x1919192b), uvec2(0x082b192b, 0x1919192b),
+    uvec2(0x19080808, 0x1919192b), uvec2(0x08080808, 0x19192b08), uvec2(0x0808082b, 0x19192b08), uvec2(0x08081919, 0x19192b08),
+    uvec2(0x08082b08, 0x19192b08), uvec2(0x08190819, 0x19192b08), uvec2(0x08191908, 0x19192b08), uvec2(0x082b0808, 0x19192b08),
+    uvec2(0x19080819, 0x19192b08), uvec2(0x19081908, 0x19192b08), uvec2(0x19190808, 0x19192b08), uvec2(0x19192b2b, 0x19192b08),
+    uvec2(0x2b080808, 0x19192b08), uvec2(0x08080819, 0x19192b19), uvec2(0x08081908, 0x19192b19), uvec2(0x08190808, 0x19192b19),
+    uvec2(0x19080808, 0x19192b19), uvec2(0x08080808, 0x19192b2b), uvec2(0x08192b19, 0x19192b2b), uvec2(0x2b081919, 0x19192b2b),
+    uvec2(0x2b2b2b08, 0x19192b2b), uvec2(0x08080819, 0x192b0808), uvec2(0x08081908, 0x192b0808), uvec2(0x0808192b, 0x192b0808),
+    uvec2(0x08190808, 0x192b0808), uvec2(0x0819082b, 0x192b0808), uvec2(0x08191919, 0x192b0808), uvec2(0x08192b08, 0x192b0808),
+    uvec2(0x082b0819, 0x192b0808), uvec2(0x082b1908, 0x192b0808), uvec2(0x19080808, 0x192b0808), uvec2(0x19081919, 0x192b0808),
+    uvec2(0x19082b08, 0x192b0808), uvec2(0x19190819, 0x192b0808), uvec2(0x19191908, 0x192b0808), uvec2(0x192b0808, 0x192b0808),
+    uvec2(0x2b081908, 0x192b0808), uvec2(0x2b190808, 0x192b0808), uvec2(0x08080808, 0x192b0819), uvec2(0x0808082b, 0x192b0819),
+    uvec2(0x08081919, 0x192b0819), uvec2(0x08082b08, 0x192b0819), uvec2(0x08190819, 0x192b0819), uvec2(0x08191908, 0x192b0819),
+    uvec2(0x082b0808, 0x192b0819), uvec2(0x19080819, 0x192b0819), uvec2(0x19081908, 0x192b0819), uvec2(0x19190808, 0x192b0819),
+    uvec2(0x2b080808, 0x192b0819), uvec2(0x2b192b19, 0x192b0819), uvec2(0x08081908, 0x192b082b), uvec2(0x08190808, 0x192b082b),
+    uvec2(0x19080808, 0x192b082b), uvec2(0x1919192b, 0x192b082b), uvec2(0x2b2b0819, 0x192b082b), uvec2(0x08080808, 0x192b1908),
+    uvec2(0x08081919, 0x192b1908), uvec2(0x08082b08, 0x192b1908), uvec2(0x08190819, 0x192b1908), uvec2(0x08191908, 0x192b1908),
+    uvec2(0x082b0808, 0x192b1908), uvec2(0x19080819, 0x192b1908), uvec2(0x19081908, 0x192b1908), uvec2(0x19190808, 0x192b1908),
+    uvec2(0x2b080808, 0x192b1908), uvec2(0x08080819, 0x192b1919), uvec2(0x08081908, 0x192b1919), uvec2(0x08190808, 0x192b1919),
+    uvec2(0x19080808, 0x192b1919), uvec2(0x19082b2b, 0x192b1919), uvec2(0x192b2b08, 0x192b1919), uvec2(0x2b19082b, 0x192b1919),
+    uvec2(0x08080808, 0x192b192b), uvec2(0x2b191908, 0x192b192b), uvec2(0x08080819, 0x192b2b08), uvec2(0x08081908, 0x192b2b08),
+    uvec2(0x08190808, 0x192b2b08), uvec2(0x192b1919, 0x192b2b08), uvec2(0x2b192b08, 0x192b2b08), uvec2(0x08080808, 0x192b2b19),
+    uvec2(0x082b2b2b, 0x192b2b19), uvec2(0x1908082b, 0x192b2b2b), uvec2(0x2b2b0819, 0x192b2b2b), uvec2(0x08080808, 0x2b080808),
+    uvec2(0x0808082b, 0x2b080808), uvec2(0x08081919, 0x2b080808), uvec2(0x08082b08, 0x2b080808), uvec2(0x08190819, 0x2b080808),
+    uvec2(0x08191908, 0x2b080808), uvec2(0x08192b19, 0x2b080808), uvec2(0x082b0808, 0x2b080808), uvec2(0x082b1919, 0x2b080808),
+    uvec2(0x19080819, 0x2b080808), uvec2(0x19081908, 0x2b080808), uvec2(0x19190808, 0x2b080808), uvec2(0x1919082b, 0x2b080808),
+    uvec2(0x19191919, 0x2b080808), uvec2(0x19192b08, 0x2b080808), uvec2(0x192b0819, 0x2b080808), uvec2(0x2b080808, 0x2b080808),
+    uvec2(0x2b081919, 0x2b080808), uvec2(0x2b190819, 0x2b080808), uvec2(0x2b191908, 0x2b080808), uvec2(0x08080819, 0x2b080819),
+    uvec2(0x08081908, 0x2b080819), uvec2(0x08082b19, 0x2b080819), uvec2(0x08190808, 0x2b080819), uvec2(0x0819082b, 0x2b080819),
+    uvec2(0x08191919, 0x2b080819), uvec2(0x08192b08, 0x2b080819), uvec2(0x082b0819, 0x2b080819), uvec2(0x082b1908, 0x2b080819),
+    uvec2(0x19080808, 0x2b080819), uvec2(0x1908082b, 0x2b080819), uvec2(0x19081919, 0x2b080819), uvec2(0x19082b08, 0x2b080819),
+    uvec2(0x19190819, 0x2b080819), uvec2(0x19191908, 0x2b080819), uvec2(0x2b080819, 0x2b080819), uvec2(0x2b081908, 0x2b080819),
+    uvec2(0x2b190808, 0x2b080819), uvec2(0x2b2b2b19, 0x2b080819), uvec2(0x08080808, 0x2b08082b), uvec2(0x08081919, 0x2b08082b),
+    uvec2(0x08082b2b, 0x2b08082b), uvec2(0x08190819, 0x2b08082b), uvec2(0x08191908, 0x2b08082b), uvec2(0x19080819, 0x2b08082b),
+    uvec2(0x19081908, 0x2b08082b), uvec2(0x19190808, 0x2b08082b), uvec2(0x08080819, 0x2b081908), uvec2(0x08081908, 0x2b081908),
+    uvec2(0x0808192b, 0x2b081908), uvec2(0x08082b19, 0x2b081908), uvec2(0x08190808, 0x2b081908), uvec2(0x0819082b, 0x2b081908),
+    uvec2(0x08191919, 0x2b081908), uvec2(0x08192b08, 0x2b081908), uvec2(0x082b0819, 0x2b081908), uvec2(0x19080808, 0x2b081908),
+    uvec2(0x1908082b, 0x2b081908), uvec2(0x19081919, 0x2b081908), uvec2(0x19082b08, 0x2b081908), uvec2(0x19190819, 0x2b081908),
+    uvec2(0x19191908, 0x2b081908), uvec2(0x192b0808, 0x2b081908), uvec2(0x2b080819, 0x2b081908), uvec2(0x2b081908, 0x2b081908),
+    uvec2(0x2b190808, 0x2b081908), uvec2(0x08080808, 0x2b081919), uvec2(0x0808082b, 0x2b081919), uvec2(0x08081919, 0x2b081919),
+    uvec2(0x08082b08, 0x2b081919), uvec2(0x08190819, 0x2b081919), uvec2(0x08191908, 0x2b081919), uvec2(0x082b0808, 0x2b081919),
+    uvec2(0x19080819, 0x2b081919), uvec2(0x19081908, 0x2b081919), uvec2(0x19190808, 0x2b081919), uvec2(0x2b080808, 0x2b081919),
+    uvec2(0x2b082b2b, 0x2b081919), uvec2(0x08080819, 0x2b08192b), uvec2(0x08081908, 0x2b08192b), uvec2(0x08190808, 0x2b08192b),
+    uvec2(0x082b2b19, 0x2b08192b), uvec2(0x19080808, 0x2b08192b), uvec2(0x08080808, 0x2b082b08), uvec2(0x08081919, 0x2b082b08),
+    uvec2(0x08190819, 0x2b082b08), uvec2(0x08191908, 0x2b082b08), uvec2(0x19080819, 0x2b082b08), uvec2(0x19081908, 0x2b082b08),
+    uvec2(0x19190808, 0x2b082b08), uvec2(0x2b2b082b, 0x2b082b08), uvec2(0x08080819, 0x2b082b19), uvec2(0x08081908, 0x2b082b19),
+    uvec2(0x19080808, 0x2b082b19), uvec2(0x192b1919, 0x2b082b19), uvec2(0x082b082b, 0x2b082b2b), uvec2(0x19192b08, 0x2b082b2b),
+    uvec2(0x19192b2b, 0x2b082b2b), uvec2(0x2b08082b, 0x2b082b2b), uvec2(0x2b2b082b, 0x2b082b2b), uvec2(0x08080819, 0x2b190808),
+    uvec2(0x08081908, 0x2b190808), uvec2(0x08082b19, 0x2b190808), uvec2(0x08190808, 0x2b190808), uvec2(0x0819082b, 0x2b190808),
+    uvec2(0x08191919, 0x2b190808), uvec2(0x08192b08, 0x2b190808), uvec2(0x082b1908, 0x2b190808), uvec2(0x19080808, 0x2b190808),
+    uvec2(0x1908082b, 0x2b190808), uvec2(0x19081919, 0x2b190808), uvec2(0x19082b08, 0x2b190808), uvec2(0x19190819, 0x2b190808),
+    uvec2(0x19191908, 0x2b190808), uvec2(0x192b0808, 0x2b190808), uvec2(0x2b080819, 0x2b190808), uvec2(0x2b081908, 0x2b190808),
+    uvec2(0x2b190808, 0x2b190808), uvec2(0x08080808, 0x2b190819), uvec2(0x08081919, 0x2b190819), uvec2(0x08190819, 0x2b190819),
+    uvec2(0x08191908, 0x2b190819), uvec2(0x19080819, 0x2b190819), uvec2(0x19081908, 0x2b190819), uvec2(0x19190808, 0x2b190819),
+    uvec2(0x19192b2b, 0x2b190819), uvec2(0x08080819, 0x2b19082b), uvec2(0x08081908, 0x2b19082b), uvec2(0x08190808, 0x2b19082b),
+    uvec2(0x19080808, 0x2b19082b), uvec2(0x2b2b192b, 0x2b19082b), uvec2(0x08080808, 0x2b191908), uvec2(0x0808082b, 0x2b191908),
+    uvec2(0x08081919, 0x2b191908), uvec2(0x08082b08, 0x2b191908), uvec2(0x08190819, 0x2b191908), uvec2(0x08191908, 0x2b191908),
+    uvec2(0x082b0808, 0x2b191908), uvec2(0x19080819, 0x2b191908), uvec2(0x19081908, 0x2b191908), uvec2(0x19190808, 0x2b191908),
+    uvec2(0x2b080808, 0x2b191908), uvec2(0x2b19192b, 0x2b191908), uvec2(0x08080819, 0x2b191919), uvec2(0x08081908, 0x2b191919),
+    uvec2(0x08190808, 0x2b191919), uvec2(0x19080808, 0x2b191919), uvec2(0x2b192b08, 0x2b191919), uvec2(0x2b2b0819, 0x2b191919),
+    uvec2(0x08080808, 0x2b19192b), uvec2(0x1908192b, 0x2b19192b), uvec2(0x192b1908, 0x2b19192b), uvec2(0x08080819, 0x2b192b08),
+    uvec2(0x08081908, 0x2b192b08), uvec2(0x08190808, 0x2b192b08), uvec2(0x082b192b, 0x2b192b08), uvec2(0x19080808, 0x2b192b08),
+    uvec2(0x2b2b2b19, 0x2b192b08), uvec2(0x08080808, 0x2b192b19), uvec2(0x19082b19, 0x2b192b19), uvec2(0x1919082b, 0x2b192b19),
+    uvec2(0x2b190808, 0x2b192b2b), uvec2(0x08080808, 0x2b2b0808), uvec2(0x08081919, 0x2b2b0808), uvec2(0x08082b2b, 0x2b2b0808),
+    uvec2(0x08191908, 0x2b2b0808), uvec2(0x082b082b, 0x2b2b0808), uvec2(0x082b2b2b, 0x2b2b0808), uvec2(0x19080819, 0x2b2b0808),
+    uvec2(0x19081908, 0x2b2b0808), uvec2(0x19190808, 0x2b2b0808), uvec2(0x2b2b082b, 0x2b2b0808), uvec2(0x2b2b2b2b, 0x2b2b0808),
+    uvec2(0x19080808, 0x2b2b0819), uvec2(0x192b1919, 0x2b2b0819), uvec2(0x0808082b, 0x2b2b082b), uvec2(0x08082b2b, 0x2b2b082b),
+    uvec2(0x082b082b, 0x2b2b082b), uvec2(0x082b2b08, 0x2b2b082b), uvec2(0x082b2b2b, 0x2b2b082b), uvec2(0x2b08082b, 0x2b2b082b),
+    uvec2(0x2b082b08, 0x2b2b082b), uvec2(0x2b082b2b, 0x2b2b082b), uvec2(0x2b2b2b08, 0x2b2b082b), uvec2(0x08080819, 0x2b2b1908),
+    uvec2(0x08081908, 0x2b2b1908), uvec2(0x08190808, 0x2b2b1908), uvec2(0x19080808, 0x2b2b1908), uvec2(0x2b082b19, 0x2b2b1908),
+    uvec2(0x2b2b1908, 0x2b2b1908), uvec2(0x08080808, 0x2b2b1919), uvec2(0x08192b19, 0x2b2b1919), uvec2(0x19190819, 0x2b2b192b),
+    uvec2(0x08082b2b, 0x2b2b2b08), uvec2(0x082b2b08, 0x2b2b2b08), uvec2(0x2b2b082b, 0x2b2b2b08), uvec2(0x19191908, 0x2b2b2b19),
+    uvec2(0x2b08192b, 0x2b2b2b19), uvec2(0x08082b08, 0x2b2b2b2b), uvec2(0x08082b2b, 0x2b2b2b2b), uvec2(0x082b0808, 0x2b2b2b2b),
+    uvec2(0x082b082b, 0x2b2b2b2b), uvec2(0x082b2b08, 0x2b2b2b2b), uvec2(0x2b082b08, 0x2b2b2b2b), uvec2(0x2b2b2b2b, 0x2b2b2b2b)
+};
+
+shared uvec2 iq2s_grid[1024];
+
+void init_iq_shmem(uvec3 wgsize)
+{
+    // copy the table into shared memory and sync
+    for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += wgsize.x) {
+        iq2s_grid[i] = iq2s_grid_const[i];
+    }
+    barrier();
+}
+
+#define QUANT_K QUANT_K_IQ2_S
+#define QUANT_R QUANT_R_IQ2_S
+#define A_TYPE block_iq2_s
+#endif
+
+#define QUANT_K_IQ3_XXS 256
+#define QUANT_R_IQ3_XXS 1
+
+struct block_iq3_xxs
+{
+    float16_t d;
+    uint8_t qs[QUANT_K_IQ3_XXS/4 + QUANT_K_IQ3_XXS/8];
+};
+
+struct block_iq3_xxs_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ3_XXS/8 + QUANT_K_IQ3_XXS/16];
+};
+
+#if defined(DATA_A_IQ3_XXS)
+
+const uint32_t iq3xxs_grid_const[256] = {
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+};
+
+shared uint32_t iq3xxs_grid[256];
+
+void init_iq_shmem(uvec3 wgsize)
+{
+    // copy the table into shared memory and sync
+    for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += wgsize.x) {
+        iq3xxs_grid[i] = iq3xxs_grid_const[i];
+    }
+    barrier();
+}
+
+#define QUANT_K QUANT_K_IQ3_XXS
+#define QUANT_R QUANT_R_IQ3_XXS
+#define A_TYPE block_iq3_xxs
+#define A_TYPE_PACKED16 block_iq3_xxs_packed16
+#endif
+
+#define QUANT_K_IQ3_S 256
+#define QUANT_R_IQ3_S 1
+
+struct block_iq3_s
+{
+    float16_t d;
+    uint8_t qs[QUANT_K_IQ3_S/4];
+    uint8_t qh[QUANT_K_IQ3_S/32];
+    uint8_t signs[QUANT_K_IQ3_S/8];
+    uint8_t scales[QUANT_K_IQ3_S/64];
+};
+
+struct block_iq3_s_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ3_S/4/2];
+    uint16_t qh[QUANT_K_IQ3_S/32/2];
+    uint16_t signs[QUANT_K_IQ3_S/8/2];
+    uint16_t scales[QUANT_K_IQ3_S/64/2];
+};
+
+#if defined(DATA_A_IQ3_S)
+
+const uint32_t iq3s_grid_const[512] = {
+    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
+    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
+    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
+    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
+    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
+    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
+    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
+    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
+    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
+    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
+    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
+    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
+    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
+    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
+    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
+    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
+    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
+    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
+    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
+    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
+    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
+    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
+    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
+    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
+    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
+    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
+    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
+    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
+    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
+    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
+    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
+    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
+    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
+    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
+    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
+    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
+    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
+    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
+    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
+    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
+    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
+    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
+    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
+    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
+    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
+    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
+    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
+    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
+    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
+    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
+    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
+    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
+    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
+    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
+    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
+    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
+    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
+    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
+    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
+    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
+    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
+    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
+    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
+    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
+};
+
+shared uint32_t iq3s_grid[512];
+
+void init_iq_shmem(uvec3 wgsize)
+{
+    // copy the table into shared memory and sync
+    for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += wgsize.x) {
+        iq3s_grid[i] = iq3s_grid_const[i];
+    }
+    barrier();
+}
+
+#define QUANT_K QUANT_K_IQ3_S
+#define QUANT_R QUANT_R_IQ3_S
+#define A_TYPE block_iq3_s
+#define A_TYPE_PACKED16 block_iq3_s_packed16
+#endif
+
+#define QUANT_K_IQ4_XS 256
+#define QUANT_R_IQ4_XS 1
+
+struct block_iq4_xs
+{
+    float16_t d;
+    uint16_t scales_h;
+    uint8_t scales_l[QUANT_K_IQ4_XS/64];
+    uint8_t qs[QUANT_K_IQ4_XS/2];
+};
+
+#if defined(DATA_A_IQ4_XS)
+#define QUANT_K QUANT_K_IQ4_XS
+#define QUANT_R QUANT_R_IQ4_XS
+#define A_TYPE block_iq4_xs
+#endif
+
+#define QUANT_K_IQ4_NL 32
+#define QUANT_R_IQ4_NL 2
+
+struct block_iq4_nl
+{
+    float16_t d;
+    uint8_t qs[QUANT_K_IQ4_NL/2];
+};
+
+struct block_iq4_nl_packed16
+{
+    float16_t d;
+    uint16_t qs[QUANT_K_IQ4_NL/2/2];
+};
+
+#if defined(DATA_A_IQ4_NL)
+#define QUANT_K QUANT_K_IQ4_NL
+#define QUANT_R QUANT_R_IQ4_NL
+#define A_TYPE block_iq4_nl
+#define A_TYPE_PACKED16 block_iq4_nl_packed16
+#endif
+
+#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
+const int8_t kvalues_iq4nl_const[16] = {
+    int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
+    int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
+};
+
+shared FLOAT_TYPE kvalues_iq4nl[16];
+
+void init_iq_shmem(uvec3 wgsize)
+{
+    // copy the table into shared memory and sync
+    for (uint i = gl_LocalInvocationIndex.x; i < kvalues_iq4nl.length(); i += wgsize.x) {
+        kvalues_iq4nl[i] = FLOAT_TYPE(kvalues_iq4nl_const[i]);
+    }
+    barrier();
+}
+#endif
+
+#endif // !defined(GGML_TYPES_COMP)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp
new file mode 100644
index 000000000..6f607380d
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp
@@ -0,0 +1,36 @@
+#version 450
+
+layout (push_constant) uniform parameter
+{
+    uint ne; uint a_offset; uint d_offset;
+    uint nb00; uint nb01; uint nb02; uint nb03;
+    uint ne10; uint ne11; uint ne12; uint ne13;
+    float sf0; float sf1; float sf2; float sf3;
+} p;
+
+#include "types.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint i10 = idx % p.ne10;
+    const uint i11 = (idx / p.ne10) % p.ne11;
+    const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
+    const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
+
+    const uint i00 = uint(i10 / p.sf0);
+    const uint i01 = uint(i11 / p.sf1);
+    const uint i02 = uint(i12 / p.sf2);
+    const uint i03 = uint(i13 / p.sf3);
+
+    data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
new file mode 100644
index 000000000..77e7e1148
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -0,0 +1,609 @@
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+#include <array>
+#include <vector>
+#include <map>
+#include <thread>
+#include <mutex>
+#include <future>
+#include <queue>
+#include <condition_variable>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+#include <algorithm>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#ifdef _WIN32
+    #include <windows.h>
+    #include <direct.h> // For _mkdir on Windows
+#else
+    #include <unistd.h>
+    #include <sys/wait.h>
+    #include <fcntl.h>
+#endif
+
+#define ASYNCIO_CONCURRENCY 64
+
+std::mutex lock;
+std::vector<std::pair<std::string, std::string>> shader_fnames;
+
+std::string GLSLC = "glslc";
+std::string input_dir = "vulkan-shaders";
+std::string output_dir = "/tmp";
+std::string target_hpp = "ggml-vulkan-shaders.hpp";
+std::string target_cpp = "ggml-vulkan-shaders.cpp";
+bool no_clean = false;
+
+const std::vector<std::string> type_names = {
+    "f32",
+    "f16",
+    "q4_0",
+    "q4_1",
+    "q5_0",
+    "q5_1",
+    "q8_0",
+    "q2_k",
+    "q3_k",
+    "q4_k",
+    "q5_k",
+    "q6_k",
+    "iq2_xxs",
+    "iq2_xs",
+    "iq2_s",
+    "iq3_xxs",
+    "iq3_s",
+    "iq4_xs",
+    "iq4_nl"
+};
+
+namespace {
+void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
+#ifdef _WIN32
+    HANDLE stdout_read, stdout_write;
+    HANDLE stderr_read, stderr_write;
+    SECURITY_ATTRIBUTES sa = { sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+
+    if (!CreatePipe(&stdout_read, &stdout_write, &sa, 0) ||
+        !SetHandleInformation(stdout_read, HANDLE_FLAG_INHERIT, 0)) {
+        throw std::runtime_error("Failed to create stdout pipe");
+    }
+
+    if (!CreatePipe(&stderr_read, &stderr_write, &sa, 0) ||
+        !SetHandleInformation(stderr_read, HANDLE_FLAG_INHERIT, 0)) {
+        throw std::runtime_error("Failed to create stderr pipe");
+    }
+
+    PROCESS_INFORMATION pi;
+    STARTUPINFOA si = {};
+    si.cb = sizeof(STARTUPINFOA);
+    si.dwFlags = STARTF_USESTDHANDLES;
+    si.hStdOutput = stdout_write;
+    si.hStdError = stderr_write;
+
+    std::vector<char> cmd(command.begin(), command.end());
+    cmd.push_back('\0');
+
+    if (!CreateProcessA(NULL, cmd.data(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
+        throw std::runtime_error("Failed to create process");
+    }
+
+    CloseHandle(stdout_write);
+    CloseHandle(stderr_write);
+
+    std::array<char, 128> buffer;
+    DWORD bytes_read;
+
+    while (ReadFile(stdout_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
+        stdout_str.append(buffer.data(), bytes_read);
+    }
+
+    while (ReadFile(stderr_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
+        stderr_str.append(buffer.data(), bytes_read);
+    }
+
+    CloseHandle(stdout_read);
+    CloseHandle(stderr_read);
+    WaitForSingleObject(pi.hProcess, INFINITE);
+    CloseHandle(pi.hProcess);
+    CloseHandle(pi.hThread);
+#else
+int stdout_pipe[2];
+    int stderr_pipe[2];
+
+    if (pipe(stdout_pipe) != 0 || pipe(stderr_pipe) != 0) {
+        throw std::runtime_error("Failed to create pipes");
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        throw std::runtime_error("Failed to fork process");
+    }
+
+    if (pid == 0) {
+        close(stdout_pipe[0]);
+        close(stderr_pipe[0]);
+        dup2(stdout_pipe[1], STDOUT_FILENO);
+        dup2(stderr_pipe[1], STDERR_FILENO);
+        close(stdout_pipe[1]);
+        close(stderr_pipe[1]);
+        execl("/bin/sh", "sh", "-c", command.c_str(), (char*) nullptr);
+        _exit(EXIT_FAILURE);
+    } else {
+        close(stdout_pipe[1]);
+        close(stderr_pipe[1]);
+
+        std::array<char, 128> buffer;
+        ssize_t bytes_read;
+
+        while ((bytes_read = read(stdout_pipe[0], buffer.data(), buffer.size())) > 0) {
+            stdout_str.append(buffer.data(), bytes_read);
+        }
+
+        while ((bytes_read = read(stderr_pipe[0], buffer.data(), buffer.size())) > 0) {
+            stderr_str.append(buffer.data(), bytes_read);
+        }
+
+        close(stdout_pipe[0]);
+        close(stderr_pipe[0]);
+        waitpid(pid, nullptr, 0);
+    }
+#endif
+}
+
+bool directory_exists(const std::string& path) {
+    struct stat info;
+    if (stat(path.c_str(), &info) != 0) {
+        return false; // Path doesn't exist or can't be accessed
+    }
+    return (info.st_mode & S_IFDIR) != 0; // Check if it is a directory
+}
+
+bool create_directory(const std::string& path) {
+#ifdef _WIN32
+    return _mkdir(path.c_str()) == 0 || errno == EEXIST; // EEXIST means the directory already exists
+#else
+    return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST; // 0755 is the directory permissions
+#endif
+}
+
+std::string to_uppercase(const std::string& input) {
+    std::string result = input;
+    for (char& c : result) {
+        c = std::toupper(c);
+    }
+    return result;
+}
+
+bool string_ends_with(const std::string& str, const std::string& suffix) {
+    if (suffix.size() > str.size()) {
+        return false;
+    }
+    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
+}
+
+static const char path_separator = '/';
+
+std::string join_paths(const std::string& path1, const std::string& path2) {
+    return path1 + path_separator + path2;
+}
+
+std::string basename(const std::string &path) {
+    return path.substr(path.find_last_of("/\\") + 1);
+}
+
+// variables to track number of compiles in progress
+static uint32_t compile_count = 0;
+static std::mutex compile_count_mutex;
+static std::condition_variable compile_count_cond;
+
+void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) {
+    std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_coopmat" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32"));
+    std::string out_fname = join_paths(output_dir, name + ".spv");
+    std::string in_path = join_paths(input_dir, in_fname);
+
+    std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2";
+
+    // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734
+    std::string opt_level = coopmat ? "" : "-O";
+
+    #ifdef _WIN32
+        std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
+    #else
+        std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, in_path, "-o",  out_fname};
+    #endif
+
+    #ifdef GGML_VULKAN_SHADER_DEBUG_INFO
+        cmd.push_back("-g");
+    #endif
+
+    for (const auto& define : defines) {
+        cmd.push_back("-D" + define.first + "=" + define.second);
+    }
+
+    std::string command;
+    for (const auto& part : cmd) {
+        command += part + " ";
+    }
+
+    std::string stdout_str, stderr_str;
+    try {
+        // std::cout << "Executing command: ";
+        // for (const auto& part : cmd) {
+        //     std::cout << part << " ";
+        // }
+        // std::cout << std::endl;
+
+        execute_command(command, stdout_str, stderr_str);
+        if (!stderr_str.empty()) {
+            std::cerr << "cannot compile " << name << "\n\n" << command << "\n\n" << stderr_str << std::endl;
+            return;
+        }
+
+        std::lock_guard<std::mutex> guard(lock);
+        shader_fnames.push_back(std::make_pair(name, out_fname));
+    } catch (const std::exception& e) {
+        std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
+    }
+    {
+        std::lock_guard<std::mutex> guard(compile_count_mutex);
+        assert(compile_count > 0);
+        compile_count--;
+    }
+    compile_count_cond.notify_all();
+}
+
+std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
+    std::map<std::string, std::string> result = a;
+    result.insert(b.begin(), b.end());
+    return result;
+}
+
+static std::vector<std::future<void>> compiles;
+void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) {
+    {
+        // wait until fewer than N compiles are in progress.
+        // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors.
+        uint32_t N = 16;
+        std::unique_lock<std::mutex> guard(compile_count_mutex);
+        while (compile_count >= N) {
+            compile_count_cond.wait(guard);
+        }
+        compile_count++;
+    }
+    compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc));
+}
+
+void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) {
+    std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4";
+    std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
+    std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";
+
+    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}};
+    std::string shader_name = "matmul";
+
+    if (matmul_id) {
+        base_dict["MUL_MAT_ID"] = "1";
+        shader_name = "matmul_id";
+    }
+
+    if (fp16) {
+        base_dict["FLOAT16"] = "1";
+    }
+
+    base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";
+
+    if (coopmat) {
+        base_dict["COOPMAT"] = "1";
+    }
+
+    base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";
+
+    std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";
+
+    // Shaders with f16 B_TYPE
+    string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
+    string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+
+    string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+    string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+
+    for (const auto& tname : type_names) {
+        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+        // For unaligned, load one at a time for f32/f16, or two at a time for quants
+        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
+        // For aligned matmul loads
+        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
+
+        // don't generate f32 variants for coopmat2
+        if (!coopmat2) {
+            string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+        }
+
+        if (tname != "f16" && tname != "f32") {
+            string_to_spv(shader_name + "_" + tname + "_f16", source_name,          merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float16_t"},        {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name,  merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+        }
+    }
+}
+
+void process_shaders() {
+    std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
+    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
+
+    // matmul
+    for (const auto& matmul_id : {false, true}) {
+        // No coopmats
+        // fp32
+        matmul_shaders(false, matmul_id, false, false, false);
+
+        // fp16, fp32acc and fp16acc
+        matmul_shaders(true, matmul_id, false, false, false);
+        matmul_shaders(true, matmul_id, false, false, true);
+
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+        // Coopmat, fp32acc and fp16acc
+        matmul_shaders(true, matmul_id, true, false, false);
+        matmul_shaders(true, matmul_id, true, false, true);
+#endif
+
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+        // Coopmat2, fp32acc and fp16acc
+        matmul_shaders(true, matmul_id, false, true, false);
+        matmul_shaders(true, matmul_id, false, true, true);
+#endif
+    }
+
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    // flash attention
+    for (const auto& f16acc : {false, true}) {
+        std::string acctype = f16acc ? "float16_t" : "float";
+
+        for (const auto& tname : type_names) {
+            if (tname == "f32") {
+                continue;
+            }
+
+            if (tname == "f16") {
+                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
+                    merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, true, f16acc);
+            } else {
+                std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
+                    merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc);
+            }
+        }
+    }
+#endif
+
+    for (const auto& tname : type_names) {
+        // mul mat vec
+        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+        std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+
+        string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+        string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
+
+        string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
+
+        // Dequant shaders
+        if (tname != "f16") {
+            string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
+        }
+
+        if (!string_ends_with(tname, "_k")) {
+            shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
+
+            if (tname == "f16") {
+                string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
+            } else {
+                string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
+            }
+            string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
+        }
+    }
+
+    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    // Norms
+    string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+
+    string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+
+    for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
+        string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+        string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+    }
+
+    string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+    string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
+
+    string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+
+    string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
+
+    string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
+    string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+
+    string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
+    string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+
+    string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
+
+    string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+
+    string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
+    string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
+
+    string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+
+    string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+
+    string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
+    for (auto &c : compiles) {
+        c.wait();
+    }
+}
+
+void write_output_files() {
+    FILE* hdr = fopen(target_hpp.c_str(), "w");
+    FILE* src = fopen(target_cpp.c_str(), "w");
+
+    fprintf(hdr, "#include <cstdint>\n\n");
+    fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
+
+    std::sort(shader_fnames.begin(), shader_fnames.end());
+    for (const auto& pair : shader_fnames) {
+        const std::string& name = pair.first;
+        #ifdef _WIN32
+            std::string path = pair.second;
+            std::replace(path.begin(), path.end(), '/', '\\' );
+        #else
+            const std::string& path = pair.second;
+        #endif
+
+        FILE* spv = fopen(path.c_str(), "rb");
+        if (!spv) {
+            std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
+            continue;
+        }
+
+        fseek(spv, 0, SEEK_END);
+        size_t size = ftell(spv);
+        fseek(spv, 0, SEEK_SET);
+
+        std::vector<unsigned char> data(size);
+        size_t read_size = fread(data.data(), 1, size, spv);
+        fclose(spv);
+        if (read_size != size) {
+            std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
+            continue;
+        }
+
+        fprintf(hdr, "extern unsigned char %s_data[%zu];\n", name.c_str(), size);
+        fprintf(hdr, "const uint64_t %s_len = %zu;\n\n", name.c_str(), size);
+
+        fprintf(src, "unsigned char %s_data[%zu] = {\n", name.c_str(), size);
+        for (size_t i = 0; i < size; ++i) {
+            fprintf(src, "0x%02x,", data[i]);
+            if ((i + 1) % 12 == 0) fprintf(src, "\n");
+        }
+        fprintf(src, "\n};\n\n");
+
+        if (!no_clean) {
+            std::remove(path.c_str());
+        }
+    }
+
+    fclose(hdr);
+    fclose(src);
+}
+}
+
+int main(int argc, char** argv) {
+    std::map<std::string, std::string> args;
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg.rfind("--", 0) == 0) {
+            if (i + 1 < argc && argv[i + 1][0] != '-') {
+                args[arg] = argv[i + 1];
+                ++i;
+            } else {
+                args[arg] = "";
+            }
+        }
+    }
+
+    if (args.find("--glslc") != args.end()) {
+        GLSLC = args["--glslc"]; // Path to glslc
+    }
+    if (args.find("--input-dir") != args.end()) {
+        input_dir = args["--input-dir"]; // Directory containing shader sources
+    }
+    if (args.find("--output-dir") != args.end()) {
+        output_dir = args["--output-dir"]; // Directory for containing SPIR-V output
+    }
+    if (args.find("--target-hpp") != args.end()) {
+        target_hpp = args["--target-hpp"]; // Path to generated header file
+    }
+    if (args.find("--target-cpp") != args.end()) {
+        target_cpp = args["--target-cpp"]; // Path to generated cpp file
+    }
+    if (args.find("--no-clean") != args.end()) {
+        no_clean = true; // Keep temporary SPIR-V files in output-dir after build
+    }
+
+    if (!directory_exists(input_dir)) {
+        std::cerr << "\"" << input_dir << "\" must be a valid directory containing shader sources" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    if (!directory_exists(output_dir)) {
+        if (!create_directory(output_dir)) {
+            std::cerr << "Error creating output directory: " << output_dir << "\n";
+            return EXIT_FAILURE;
+        }
+    }
+
+    process_shaders();
+
+    write_output_files();
+
+    return EXIT_SUCCESS;
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp
new file mode 100644
index 000000000..35cc6c45f
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp
@@ -0,0 +1,87 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#define BLOCK_SIZE 64
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout(push_constant) uniform Parameters {
+    uint B;
+    uint T;
+    uint C;
+    uint H;
+};
+
+layout(binding = 0) readonly buffer KBuf { A_TYPE k[]; };
+layout(binding = 1) readonly buffer VBuf { A_TYPE v[]; };
+layout(binding = 2) readonly buffer RBuf { A_TYPE r[]; };
+layout(binding = 3) readonly buffer TimeFBuf { A_TYPE tf[]; };
+layout(binding = 4) readonly buffer TimeDBuf { A_TYPE td[]; };
+layout(binding = 5) readonly buffer StateBuf { A_TYPE state_in[]; };
+layout(binding = 6) buffer DstBuf { A_TYPE dst[]; };
+
+shared A_TYPE _k[BLOCK_SIZE], _r[BLOCK_SIZE], _tf[BLOCK_SIZE], _td[BLOCK_SIZE];
+
+void main() {
+    const uint head_size = BLOCK_SIZE;
+    const uint batch_id = gl_WorkGroupID.x / H;
+    const uint head_id = gl_WorkGroupID.x % H;
+    const uint tid = gl_LocalInvocationID.x;
+
+    const uint state_size = C * head_size;
+    const uint n_seq_tokens = T / B;
+
+    if (batch_id >= B || head_id >= H) {
+        return;
+    }
+
+    A_TYPE state[BLOCK_SIZE];
+    [[unroll]] for (uint i = 0; i < head_size; i++) {
+        state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
+                          + i * head_size + tid];
+    }
+
+    barrier();
+    _tf[tid] = tf[head_id * head_size + tid];
+    barrier();
+
+    const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
+    const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
+
+    for (uint t = start_t; t < end_t; t += C) {
+        barrier();
+        _k[tid] = k[t];
+        _r[tid] = r[t];
+        _td[tid] = td[t];
+        barrier();
+
+        const A_TYPE v_val = v[t];
+        A_TYPE y = 0.0;
+
+        [[unroll]] for (uint j = 0; j < head_size; j += 4) {
+            vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
+            vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
+            vec4 tf_vec = vec4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
+            vec4 td_vec = vec4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
+            vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);
+
+            vec4 kv = k_vec * v_val;
+
+            vec4 temp = tf_vec * kv + s_vec;
+            y += dot(r_vec, temp);
+
+            s_vec = s_vec * td_vec + kv;
+            state[j] = s_vec.x;
+            state[j+1] = s_vec.y;
+            state[j+2] = s_vec.z;
+            state[j+3] = s_vec.w;
+        }
+
+        dst[t] = y;
+    }
+
+    [[unroll]] for (uint i = 0; i < head_size; i++) {
+        dst[T * C + batch_id * state_size + head_id * head_size * head_size
+            + i * head_size + tid] = state[i];
+    }
+}
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
new file mode 100644
index 000000000..3b4861542
--- /dev/null
+++ b/ggml/src/ggml.c
@@ -0,0 +1,6511 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
+#define _USE_MATH_DEFINES // For M_PI on MSVC
+
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-threading.h"
+#include "ggml.h"
+
+// FIXME: required here for quantization functions
+#include "ggml-quants.h"
+
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif
+
+#include <assert.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <float.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <signal.h>
+#if defined(__gnu_linux__)
+#include <syscall.h>
+#endif
+
+#if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
+#include <TargetConditionals.h>
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+    #define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define UNUSED GGML_UNUSED
+
+#if defined(_MSC_VER)
+#define m512bh(p) p
+#define m512i(p) p
+#else
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+#endif
+
+// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
+float ggml_table_f32_f16[1 << 16];
+
+#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
+    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+struct backtrace_state {
+    void ** current;
+    void ** end;
+};
+
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+    struct backtrace_state * state = (struct backtrace_state *)arg;
+    uintptr_t pc = _Unwind_GetIP(context);
+    if (pc) {
+        if (state->current == state->end) {
+            return _URC_END_OF_STACK;
+        } else {
+            *state->current++ = (void*)pc;
+        }
+    }
+    return _URC_NO_REASON;
+}
+
+static void ggml_print_backtrace_symbols(void) {
+    const int max = 100;
+    void* buffer[max];
+
+    struct backtrace_state state = {buffer, buffer + max};
+    _Unwind_Backtrace(unwind_callback, &state);
+
+    int count = state.current - buffer;
+
+    for (int idx = 0; idx < count; ++idx) {
+        const void * addr = buffer[idx];
+        const char * symbol = "";
+
+        Dl_info info;
+        if (dladdr(addr, &info) && info.dli_sname) {
+            symbol = info.dli_sname;
+        }
+
+        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+    }
+}
+#elif defined(__linux__) && defined(__GLIBC__)
+#include <execinfo.h>
+static void ggml_print_backtrace_symbols(void) {
+    void * trace[100];
+    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
+    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
+}
+#else
+static void ggml_print_backtrace_symbols(void) {
+    // platform not supported
+}
+#endif
+
+static void ggml_print_backtrace(void) {
+    const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
+    if (GGML_NO_BACKTRACE) {
+        return;
+    }
+    char attach[32];
+    snprintf(attach, sizeof(attach), "attach %d", getpid());
+    int pid = fork();
+    if (pid == 0) {
+        // try gdb
+        execlp("gdb", "gdb", "--batch",
+            "-ex", "set style enabled on",
+            "-ex", attach,
+            "-ex", "bt -frame-info source-and-location",
+            "-ex", "detach",
+            "-ex", "quit",
+            (char *) NULL);
+        // try lldb
+        execlp("lldb", "lldb", "--batch",
+            "-o", "bt",
+            "-o", "quit",
+            "-p", attach,
+            (char *) NULL);
+        exit(EXIT_FAILURE);
+    } else {
+        int wstatus;
+        waitpid(pid, &wstatus, 0);
+        if (WIFEXITED(wstatus)) {
+            if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
+                // gdb failed, fallback to backtrace_symbols
+                ggml_print_backtrace_symbols();
+            }
+        }
+    }
+}
+#else
+static void ggml_print_backtrace(void) {
+    // platform not supported
+}
+#endif
+
+void ggml_abort(const char * file, int line, const char * fmt, ...) {
+    fflush(stdout);
+
+    fprintf(stderr, "%s:%d: ", file, line);
+
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+
+    fprintf(stderr, "\n");
+
+    ggml_print_backtrace();
+    abort();
+}
+
+//
+// logging
+//
+
+struct ggml_logger_state {
+    ggml_log_callback log_callback;
+    void * log_callback_user_data;
+};
+static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
+
+static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
+    if (format == NULL) {
+        return;
+    }
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = (char *) calloc(len + 1, sizeof(char));
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        free(buffer2);
+    }
+    va_end(args_copy);
+}
+
+void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    ggml_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+//
+// end of logging block
+//
+
+#ifdef GGML_USE_ACCELERATE
+// uncomment to use vDSP for soft max computation
+// note: not sure if it is actually faster
+//#define GGML_SOFT_MAX_ACCELERATE
+#endif
+
+
+void * ggml_aligned_malloc(size_t size) {
+    const int alignment = 64;
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    return _aligned_malloc(size, alignment);
+#else
+    if (size == 0) {
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
+    void * aligned_memory = NULL;
+  #ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, alignment, size);
+  #elif TARGET_OS_OSX
+    GGML_UNUSED(alignment);
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
+  #else
+    int result = posix_memalign(&aligned_memory, alignment, size);
+  #endif
+    if (result != 0) {
+        // Handle allocation failure
+        const char *error_desc = "unknown allocation error";
+        switch (result) {
+            case EINVAL:
+                error_desc = "invalid alignment value";
+                break;
+            case ENOMEM:
+                error_desc = "insufficient memory";
+                break;
+        }
+        GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
+        return NULL;
+    }
+    return aligned_memory;
+#endif
+}
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
+#else
+    free(ptr);
+#endif
+}
+
+
+inline static void * ggml_malloc(size_t size) {
+    if (size == 0) {
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
+        return NULL;
+    }
+    void * result = malloc(size);
+    if (result == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
+        GGML_ABORT("fatal error");
+    }
+    return result;
+}
+
+// calloc
+inline static void * ggml_calloc(size_t num, size_t size) {
+    if (num == 0 || size == 0) {
+        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
+        return NULL;
+    }
+    void * result = calloc(num, size);
+    if (result == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
+        GGML_ABORT("fatal error");
+    }
+    return result;
+}
+
+#define GGML_MALLOC(size)      ggml_malloc(size)
+#define GGML_CALLOC(num, size) ggml_calloc(num, size)
+
+#define GGML_FREE(ptr) free(ptr)
+
+const char * ggml_status_to_string(enum ggml_status status) {
+    switch (status) {
+        case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
+        case GGML_STATUS_FAILED:       return "GGML status: error (operation failed)";
+        case GGML_STATUS_SUCCESS:      return "GGML status: success";
+        case GGML_STATUS_ABORTED:      return "GGML status: warning (operation aborted)";
+    }
+
+    return "GGML status: unknown";
+}
+
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
+    return GGML_FP32_TO_FP16(x);
+}
+
+float ggml_bf16_to_fp32(ggml_bf16_t x) {
+#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
+    return GGML_BF16_TO_FP32(x);  // it just left shifts
+}
+
+ggml_bf16_t ggml_fp32_to_bf16(float x) {
+#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
+    return GGML_FP32_TO_BF16(x);
+}
+
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
+//        currently, the ggml_cpu_has_* functions are entirely compile-time
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+    //if (ggml_cpu_has_f16c()) {
+        for (; i + 7 < n; i += 8) {
+            __m256 x_vec = _mm256_loadu_ps(x + i);
+            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+            _mm_storeu_si128((__m128i *)(y + i), y_vec);
+        }
+        for(; i + 3 < n; i += 4) {
+            __m128 x_vec = _mm_loadu_ps(x + i);
+            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+            _mm_storel_epi64((__m128i *)(y + i), y_vec);
+        }
+    //}
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX512F__)
+    //if (ggml_cpu_has_avx512()) {
+        for (; i + 16 <= n; i += 16) {
+            _mm512_storeu_ps(y + i,
+                            _mm512_castsi512_ps(
+                                _mm512_slli_epi32(
+                                    _mm512_cvtepu16_epi32(
+                                        _mm256_loadu_si256(
+                                            (const __m256i *)(x + i))),
+                                    16)));
+        }
+    //}
+#endif
+#if defined(__AVX2__)
+    //if (ggml_cpu_has_avx2()) {
+        for (; i + 8 <= n; i += 8) {
+            _mm256_storeu_ps(y + i,
+                            _mm256_castsi256_ps(
+                                _mm256_slli_epi32(
+                                    _mm256_cvtepu16_epi32(
+                                        _mm_loadu_si128(
+                                            (const __m128i *)(x + i))),
+                                    16)));
+        }
+    //}
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
+void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
+  int i = 0;
+#if defined(__AVX512BF16__)
+  // subnormals are flushed to zero on this platform
+  for (; i + 32 <= n; i += 32) {
+        _mm512_storeu_si512(
+            (__m512i *)(y + i),
+            m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
+                                _mm512_loadu_ps(x + i))));
+  }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
+    return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
+}
+
+//
+// timing
+//
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+static int64_t timer_freq, timer_start;
+void ggml_time_init(void) {
+    LARGE_INTEGER t;
+    QueryPerformanceFrequency(&t);
+    timer_freq = t.QuadPart;
+
+    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
+    // and the uptime is high enough.
+    // We subtract the program start time to reduce the likelihood of that happening.
+    QueryPerformanceCounter(&t);
+    timer_start = t.QuadPart;
+}
+int64_t ggml_time_ms(void) {
+    LARGE_INTEGER t;
+    QueryPerformanceCounter(&t);
+    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
+}
+int64_t ggml_time_us(void) {
+    LARGE_INTEGER t;
+    QueryPerformanceCounter(&t);
+    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
+}
+#else
+void ggml_time_init(void) {}
+int64_t ggml_time_ms(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
+}
+
+int64_t ggml_time_us(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
+}
+#endif
+
+int64_t ggml_cycles(void) {
+    return clock();
+}
+
+int64_t ggml_cycles_per_ms(void) {
+    return CLOCKS_PER_SEC/1000;
+}
+
+//
+// cross-platform UTF-8 file paths
+//
+
+#ifdef _WIN32
+static wchar_t * ggml_mbstowcs(const char * mbs) {
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
+    if (!wlen) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
+    wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
+    if (!wlen) {
+        GGML_FREE(wbuf);
+        errno = EINVAL;
+        return NULL;
+    }
+
+    return wbuf;
+}
+#endif
+
+FILE * ggml_fopen(const char * fname, const char * mode) {
+#ifdef _WIN32
+    FILE * file = NULL;
+
+    // convert fname (UTF-8)
+    wchar_t * wfname = ggml_mbstowcs(fname);
+    if (wfname) {
+        // convert mode (ANSI)
+        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
+        wchar_t * wmode_p = wmode;
+        do {
+            *wmode_p++ = (wchar_t)*mode;
+        } while (*mode++);
+
+        // open file
+        file = _wfopen(wfname, wmode);
+
+        GGML_FREE(wfname);
+        GGML_FREE(wmode);
+    }
+
+    return file;
+#else
+    return fopen(fname, mode);
+#endif
+
+}
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name                = "i8",
+        .blck_size                = 1,
+        .type_size                = sizeof(int8_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name                = "i16",
+        .blck_size                = 1,
+        .type_size                = sizeof(int16_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name                = "i32",
+        .blck_size                = 1,
+        .type_size                = sizeof(int32_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I64] = {
+        .type_name                = "i64",
+        .blck_size                = 1,
+        .type_size                = sizeof(int64_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_F64] = {
+        .type_name                = "f64",
+        .blck_size                = 1,
+        .type_size                = sizeof(double),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_F32] = {
+        .type_name                = "f32",
+        .blck_size                = 1,
+        .type_size                = sizeof(float),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_F16] = {
+        .type_name                = "f16",
+        .blck_size                = 1,
+        .type_size                = sizeof(ggml_fp16_t),
+        .is_quantized             = false,
+        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
+        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+    },
+    [GGML_TYPE_Q4_0] = {
+        .type_name                = "q4_0",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_0_ref,
+    },
+    [GGML_TYPE_Q4_1] = {
+        .type_name                = "q4_1",
+        .blck_size                = QK4_1,
+        .type_size                = sizeof(block_q4_1),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
+    },
+    [4] = { // GGML_TYPE_Q4_2
+        .type_name                = "DEPRECATED",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [5] = { // GGML_TYPE_Q4_3
+        .type_name                = "DEPRECATED",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_Q5_0] = {
+        .type_name                = "q5_0",
+        .blck_size                = QK5_0,
+        .type_size                = sizeof(block_q5_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_0_ref,
+    },
+    [GGML_TYPE_Q5_1] = {
+        .type_name                = "q5_1",
+        .blck_size                = QK5_1,
+        .type_size                = sizeof(block_q5_1),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_1_ref,
+    },
+    [GGML_TYPE_Q8_0] = {
+        .type_name                = "q8_0",
+        .blck_size                = QK8_0,
+        .type_size                = sizeof(block_q8_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q8_0_ref,
+    },
+    [GGML_TYPE_Q8_1] = {
+        .type_name                = "q8_1",
+        .blck_size                = QK8_1,
+        .type_size                = sizeof(block_q8_1),
+        .is_quantized             = true,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q8_1_ref,
+    },
+    [GGML_TYPE_Q2_K] = {
+        .type_name                = "q2_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q2_K),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q2_K_ref,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .type_name                = "q3_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q3_K),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q3_K_ref,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .type_name                = "q4_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q4_K),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_K_ref,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .type_name                = "q5_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q5_K),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_K_ref,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .type_name                = "q6_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q6_K),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_q6_K_ref,
+    },
+    [GGML_TYPE_IQ2_XXS] = {
+        .type_name                = "iq2_xxs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_xxs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
+        .from_float_ref           = NULL,
+    },
+    [GGML_TYPE_IQ2_XS] = {
+        .type_name                = "iq2_xs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_xs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
+        .from_float_ref           = NULL,
+    },
+    [GGML_TYPE_IQ3_XXS] = {
+        .type_name                = "iq3_xxs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq3_xxs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
+    },
+    [GGML_TYPE_IQ3_S] = {
+        .type_name                = "iq3_s",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq3_s),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_s,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_s_ref,
+    },
+    [GGML_TYPE_IQ2_S] = {
+        .type_name                = "iq2_s",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_s),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_s,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_s_ref,
+    },
+    [GGML_TYPE_IQ1_S] = {
+        .type_name                = "iq1_s",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq1_s),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
+        .from_float_ref           = NULL,
+    },
+    [GGML_TYPE_IQ1_M] = {
+        .type_name                = "iq1_m",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq1_m),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
+        .from_float_ref           = NULL,
+    },
+    [GGML_TYPE_IQ4_NL] = {
+        .type_name                = "iq4_nl",
+        .blck_size                = QK4_NL,
+        .type_size                = sizeof(block_iq4_nl),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_nl_ref,
+    },
+    [GGML_TYPE_IQ4_XS] = {
+        .type_name                = "iq4_xs",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq4_xs),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_xs_ref,
+    },
+    [GGML_TYPE_Q8_K] = {
+        .type_name                = "q8_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q8_K),
+        .is_quantized             = true,
+    },
+    [GGML_TYPE_BF16] = {
+        .type_name                = "bf16",
+        .blck_size                = 1,
+        .type_size                = sizeof(ggml_bf16_t),
+        .is_quantized             = false,
+        .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
+        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
+    },
+    [31] = { // GGML_TYPE_Q4_0_4_4
+        .type_name                = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [32] = { // GGML_TYPE_Q4_0_4_8
+        .type_name                = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [33] = { // GGML_TYPE_Q4_0_8_8
+        .type_name                = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_TQ1_0] = {
+        .type_name                = "tq1_0",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_tq1_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_tq1_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_tq1_0_ref,
+    },
+    [GGML_TYPE_TQ2_0] = {
+        .type_name                = "tq2_0",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_tq2_0),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
+    },
+    [36] = { // GGML_TYPE_IQ4_NL_4_4
+        .type_name                = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [37] = { // GGML_TYPE_IQ4_NL_4_8
+        .type_name                = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [38] = { // GGML_TYPE_IQ4_NL_8_8
+        .type_name                = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+};
+
+const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return &type_traits[type];
+}
+
+//
+// ggml object
+//
+
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    enum ggml_object_type type;
+
+    char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
+//
+// ggml context
+//
+
+struct ggml_context {
+    size_t mem_size;
+    void * mem_buffer;
+    bool   mem_buffer_owned;
+    bool   no_alloc;
+
+    int    n_objects;
+
+    struct ggml_object * objects_begin;
+    struct ggml_object * objects_end;
+};
+
+struct ggml_context_container {
+    bool used;
+
+    struct ggml_context context;
+};
+
+//
+// data types
+//
+
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+    "NONE",
+
+    "DUP",
+    "ADD",
+    "ADD1",
+    "ACC",
+    "SUB",
+    "MUL",
+    "DIV",
+    "SQR",
+    "SQRT",
+    "LOG",
+    "SIN",
+    "COS",
+    "SUM",
+    "SUM_ROWS",
+    "MEAN",
+    "ARGMAX",
+    "COUNT_EQUAL",
+    "REPEAT",
+    "REPEAT_BACK",
+    "CONCAT",
+    "SILU_BACK",
+    "NORM",
+    "RMS_NORM",
+    "RMS_NORM_BACK",
+    "GROUP_NORM",
+
+    "MUL_MAT",
+    "MUL_MAT_ID",
+    "OUT_PROD",
+
+    "SCALE",
+    "SET",
+    "CPY",
+    "CONT",
+    "RESHAPE",
+    "VIEW",
+    "PERMUTE",
+    "TRANSPOSE",
+    "GET_ROWS",
+    "GET_ROWS_BACK",
+    "DIAG",
+    "DIAG_MASK_INF",
+    "DIAG_MASK_ZERO",
+    "SOFT_MAX",
+    "SOFT_MAX_BACK",
+    "ROPE",
+    "ROPE_BACK",
+    "CLAMP",
+    "CONV_TRANSPOSE_1D",
+    "IM2COL",
+    "IM2COL_BACK",
+    "CONV_TRANSPOSE_2D",
+    "POOL_1D",
+    "POOL_2D",
+    "POOL_2D_BACK",
+    "UPSCALE",
+    "PAD",
+    "PAD_REFLECT_1D",
+    "ARANGE",
+    "TIMESTEP_EMBEDDING",
+    "ARGSORT",
+    "LEAKY_RELU",
+
+    "FLASH_ATTN_EXT",
+    "FLASH_ATTN_BACK",
+    "SSM_CONV",
+    "SSM_SCAN",
+    "WIN_PART",
+    "WIN_UNPART",
+    "GET_REL_POS",
+    "ADD_REL_POS",
+    "RWKV_WKV6",
+    "GATED_LINEAR_ATTN",
+
+    "UNARY",
+
+    "MAP_UNARY",
+    "MAP_BINARY",
+
+    "MAP_CUSTOM1_F32",
+    "MAP_CUSTOM2_F32",
+    "MAP_CUSTOM3_F32",
+
+    "MAP_CUSTOM1",
+    "MAP_CUSTOM2",
+    "MAP_CUSTOM3",
+
+    "CROSS_ENTROPY_LOSS",
+    "CROSS_ENTROPY_LOSS_BACK",
+    "OPT_STEP_ADAMW",
+};
+
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+
+static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+    "none",
+
+    "x",
+    "x+y",
+    "x+y",
+    "view(x,nb,offset)+=y->x",
+    "x-y",
+    "x*y",
+    "x/y",
+    "x^2",
+    "√x",
+    "log(x)",
+    "sin(x)",
+    "cos(x)",
+    "Σx",
+    "Σx_k",
+    "Σx/n",
+    "argmax(x)",
+    "count_equal(x)",
+    "repeat(x)",
+    "repeat_back(x)",
+    "concat(x, y)",
+    "silu_back(x)",
+    "norm(x)",
+    "rms_norm(x)",
+    "rms_norm_back(x)",
+    "group_norm(x)",
+
+    "X*Y",
+    "X[i]*Y",
+    "X*Y",
+
+    "x*v",
+    "y-\\>view(x)",
+    "x-\\>y",
+    "cont(x)",
+    "reshape(x)",
+    "view(x)",
+    "permute(x)",
+    "transpose(x)",
+    "get_rows(x)",
+    "get_rows_back(x)",
+    "diag(x)",
+    "diag_mask_inf(x)",
+    "diag_mask_zero(x)",
+    "soft_max(x)",
+    "soft_max_back(x)",
+    "rope(x)",
+    "rope_back(x)",
+    "clamp(x)",
+    "conv_transpose_1d(x)",
+    "im2col(x)",
+    "im2col_back(x)",
+    "conv_transpose_2d(x)",
+    "pool_1d(x)",
+    "pool_2d(x)",
+    "pool_2d_back(x)",
+    "upscale(x)",
+    "pad(x)",
+    "pad_reflect_1d(x)",
+    "arange(start, stop, step)",
+    "timestep_embedding(timesteps, dim, max_period)",
+    "argsort(x)",
+    "leaky_relu(x)",
+
+    "flash_attn_ext(x)",
+    "flash_attn_back(x)",
+    "ssm_conv(x)",
+    "ssm_scan(x)",
+    "win_part(x)",
+    "win_unpart(x)",
+    "get_rel_pos(x)",
+    "add_rel_pos(x)",
+    "rwkv_wkv6(k, v, r, tf, td, s)",
+    "gated_linear_attn(k, v, q, gate, s)",
+
+    "unary(x)",
+
+    "f(x)",
+    "f(x,y)",
+
+    "custom_f32(x)",
+    "custom_f32(x,y)",
+    "custom_f32(x,y,z)",
+
+    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",
+
+    "cross_entropy_loss(x,y)",
+    "cross_entropy_loss_back(x,y)",
+    "adamw(x)",
+};
+
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+
+static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
+
+
+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "SIGMOID",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+    "HARDSWISH",
+    "HARDSIGMOID",
+    "EXP",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
+
+
+static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
+static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ggml_print_object(const struct ggml_object * obj) {
+    GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+            obj->type, obj->offs, obj->size, (const void *) obj->next);
+}
+
+void ggml_print_objects(const struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);
+
+    while (obj != NULL) {
+        ggml_print_object(obj);
+        obj = obj->next;
+    }
+
+    GGML_LOG_INFO("%s: --- end ---\n", __func__);
+}
+
+int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+}
+
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+}
+
+size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+    size_t nbytes;
+    const size_t blck_size = ggml_blck_size(tensor->type);
+    if (blck_size == 1) {
+        nbytes = ggml_type_size(tensor->type);
+        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
+    }
+    else {
+        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
+        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
+    }
+
+    return nbytes;
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
+}
+
+int64_t ggml_blck_size(enum ggml_type type) {
+    return type_traits[type].blck_size;
+}
+
+size_t ggml_type_size(enum ggml_type type) {
+    return type_traits[type].type_size;
+}
+
+size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+    assert(ne % ggml_blck_size(type) == 0);
+    return ggml_type_size(type)*ne/ggml_blck_size(type);
+}
+
+double ggml_type_sizef(enum ggml_type type) {
+    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
+}
+
+const char * ggml_type_name(enum ggml_type type) {
+    return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
+}
+
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
+
+const char * ggml_op_symbol(enum ggml_op op) {
+    return GGML_OP_SYMBOL[op];
+}
+
+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    return ggml_op_name(t->op);
+}
+
+size_t ggml_element_size(const struct ggml_tensor * tensor) {
+    return ggml_type_size(tensor->type);
+}
+
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
+}
+
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
+}
+
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[2] == 1 && tensor->ne[3] == 1;
+}
+
+bool ggml_is_3d(const struct ggml_tensor * tensor) {
+    return tensor->ne[3] == 1;
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
+enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
+    enum ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
+        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
+        case GGML_FTYPE_MOSTLY_BF16:          wtype = GGML_TYPE_BF16;  break;
+        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
+        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
+        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
+        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
+        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
+        case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
+        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
+        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
+        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
+        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS;  break;
+        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
+        case GGML_FTYPE_MOSTLY_IQ3_XXS:       wtype = GGML_TYPE_IQ3_XXS;  break;
+        case GGML_FTYPE_MOSTLY_IQ1_S:         wtype = GGML_TYPE_IQ1_S;    break;
+        case GGML_FTYPE_MOSTLY_IQ1_M:         wtype = GGML_TYPE_IQ1_M;    break;
+        case GGML_FTYPE_MOSTLY_IQ4_NL:        wtype = GGML_TYPE_IQ4_NL;   break;
+        case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
+        case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
+        case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
+        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
+
+    return wtype;
+}
+
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
+}
+
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+    return tensor->nb[0] > tensor->nb[1];
+}
+
+static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = ggml_type_size(tensor->type);
+    if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
+
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
+}
+
+bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_n(tensor, 0);
+}
+
+bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_n(tensor, 1);
+}
+
+bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_n(tensor, 2);
+}
+
+bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
+}
+
+static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+bool ggml_is_empty(const struct ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] == 0) {
+            // empty if any dimension has no elements
+            return true;
+        }
+    }
+    return false;
+}
+
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->ne[0] == t1->ne[0]) &&
+        (t0->ne[1] == t1->ne[1]) &&
+        (t0->ne[2] == t1->ne[2]) &&
+        (t0->ne[3] == t1->ne[3]);
+}
+
+bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->nb[0] == t1->nb[0]) &&
+        (t0->nb[1] == t1->nb[1]) &&
+        (t0->nb[2] == t1->nb[2]) &&
+        (t0->nb[3] == t1->nb[3]);
+}
+
+// check if t1 can be represented as a repeatition of t0
+bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return ggml_is_empty(t0) ? ggml_is_empty(t1) :
+        (t1->ne[0]%t0->ne[0] == 0) &&
+        (t1->ne[1]%t0->ne[1] == 0) &&
+        (t1->ne[2]%t0->ne[2] == 0) &&
+        (t1->ne[3]%t0->ne[3] == 0);
+}
+
+static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
+}
+
+// assert that pointer is aligned to GGML_MEM_ALIGN
+#define GGML_ASSERT_ALIGNED(ptr) \
+    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct ggml_context * ggml_init(struct ggml_init_params params) {
+    static bool is_first_call = true;
+
+    ggml_critical_section_start();
+
+    if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
+        for (int i = 0; i < (1 << 16); ++i) {
+            union {
+                uint16_t u16;
+                ggml_fp16_t fp16;
+            } u = {i};
+            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+        }
+
+        is_first_call = false;
+    }
+
+    ggml_critical_section_end();
+
+    struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
+
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
+    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
+
+    *ctx = (struct ggml_context) {
+        /*.mem_size           =*/ mem_size,
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
+        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
+        /*.no_alloc           =*/ params.no_alloc,
+        /*.n_objects          =*/ 0,
+        /*.objects_begin      =*/ NULL,
+        /*.objects_end        =*/ NULL,
+    };
+
+    GGML_ASSERT(ctx->mem_buffer != NULL);
+
+    GGML_ASSERT_ALIGNED(ctx->mem_buffer);
+
+    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
+
+    return ctx;
+}
+
+void ggml_reset(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    ctx->n_objects     = 0;
+    ctx->objects_begin = NULL;
+    ctx->objects_end   = NULL;
+}
+
+void ggml_free(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    if (ctx->mem_buffer_owned) {
+        ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
+    }
+
+    GGML_FREE(ctx);
+}
+
+size_t ggml_used_mem(const struct ggml_context * ctx) {
+    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
+}
+
+bool ggml_get_no_alloc(struct ggml_context * ctx) {
+    return ctx->no_alloc;
+}
+
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(const struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
+size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+    size_t max_size = 0;
+
+    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        size_t bytes = ggml_nbytes(tensor);
+        max_size = MAX(max_size, bytes);
+    }
+
+    return max_size;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
+    // always insert objects at the end of the context's memory pool
+    struct ggml_object * obj_cur = ctx->objects_end;
+
+    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
+    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
+    const size_t cur_end  = cur_offs + cur_size;
+
+    // align to GGML_MEM_ALIGN
+    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
+
+    char * const mem_buffer = ctx->mem_buffer;
+    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
+
+    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+        GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
+#ifndef NDEBUG
+        GGML_ABORT("not enough space in the context's memory pool");
+#endif
+        return NULL;
+    }
+
+    *obj_new = (struct ggml_object) {
+        .offs = cur_end + GGML_OBJECT_SIZE,
+        .size = size_needed,
+        .next = NULL,
+        .type = type,
+    };
+
+    GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
+
+    if (obj_cur != NULL) {
+        obj_cur->next = obj_new;
+    } else {
+        // this is the first object in this context
+        ctx->objects_begin = obj_new;
+    }
+
+    ctx->objects_end = obj_new;
+
+    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+
+    return obj_new;
+}
+
+static struct ggml_tensor * ggml_new_tensor_impl(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int                   n_dims,
+        const int64_t       * ne,
+        struct ggml_tensor  * view_src,
+        size_t                view_offs) {
+
+    GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
+    GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
+
+    // find the base tensor and absolute offset
+    if (view_src != NULL && view_src->view_src != NULL) {
+        view_offs += view_src->view_offs;
+        view_src   = view_src->view_src;
+    }
+
+    size_t data_size = ggml_row_size(type, ne[0]);
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= ne[i];
+    }
+
+    GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
+
+    void * data = view_src != NULL ? view_src->data : NULL;
+    if (data != NULL) {
+        data = (char *) data + view_offs;
+    }
+
+    size_t obj_alloc_size = 0;
+
+    if (view_src == NULL && !ctx->no_alloc) {
+        // allocate tensor data in the context's memory pool
+        obj_alloc_size = data_size;
+    }
+
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
+    GGML_ASSERT(obj_new);
+
+    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
+
+    *result = (struct ggml_tensor) {
+        /*.type         =*/ type,
+        /*.buffer       =*/ NULL,
+        /*.ne           =*/ { 1, 1, 1, 1 },
+        /*.nb           =*/ { 0, 0, 0, 0 },
+        /*.op           =*/ GGML_OP_NONE,
+        /*.op_params    =*/ { 0 },
+        /*.flags        =*/ 0,
+        /*.src          =*/ { NULL },
+        /*.view_src     =*/ view_src,
+        /*.view_offs    =*/ view_offs,
+        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
+        /*.name         =*/ { 0 },
+        /*.extra        =*/ NULL,
+        /*.padding      =*/ { 0 },
+    };
+
+    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+    //GGML_ASSERT_ALIGNED(result->data);
+
+    for (int i = 0; i < n_dims; i++) {
+        result->ne[i] = ne[i];
+    }
+
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
+    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
+    }
+
+    ctx->n_objects++;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_new_tensor(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int                   n_dims,
+        const int64_t       * ne) {
+    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
+}
+
+struct ggml_tensor * ggml_new_tensor_1d(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int64_t ne0) {
+    return ggml_new_tensor(ctx, type, 1, &ne0);
+}
+
+struct ggml_tensor * ggml_new_tensor_2d(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int64_t ne0,
+        int64_t ne1) {
+    const int64_t ne[2] = { ne0, ne1 };
+    return ggml_new_tensor(ctx, type, 2, ne);
+}
+
+struct ggml_tensor * ggml_new_tensor_3d(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2) {
+    const int64_t ne[3] = { ne0, ne1, ne2 };
+    return ggml_new_tensor(ctx, type, 3, ne);
+}
+
+struct ggml_tensor * ggml_new_tensor_4d(
+        struct ggml_context * ctx,
+        enum   ggml_type type,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3) {
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
+    return ggml_new_tensor(ctx, type, 4, ne);
+}
+
+void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, nbytes);
+
+    return (uint8_t *)ctx->mem_buffer + obj->offs;
+}
+
+struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
+    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
+}
+
+void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
+    const int64_t ne2 = tensor->ne[2];
+    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne0 = tensor->ne[0];
+
+    const int64_t i3_ = (i/(ne2*ne1*ne0));
+    const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0);
+    const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0;
+    const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0);
+
+    if (i0) {
+        * i0 = i0_;
+    }
+    if (i1) {
+        * i1 = i1_;
+    }
+    if (i2) {
+        * i2 = i2_;
+    }
+    if (i3) {
+        * i3 = i3_;
+    }
+}
+
+void * ggml_get_data(const struct ggml_tensor * tensor) {
+    return tensor->data;
+}
+
+float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
+    assert(tensor->type == GGML_TYPE_F32);
+    return (float *)(tensor->data);
+}
+
+enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
+    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
+}
+
+const char * ggml_get_name(const struct ggml_tensor * tensor) {
+    return tensor->name;
+}
+
+struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
+    size_t i;
+    for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) {
+        tensor->name[i] = name[i];
+    }
+    tensor->name[i] = '\0';
+    return tensor;
+}
+
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
+}
+
+struct ggml_tensor * ggml_view_tensor(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * src) {
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
+    ggml_format_name(result, "%s (view)", src->name);
+
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = src->nb[i];
+    }
+
+    return result;
+}
+
+struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
+            struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+            if (strcmp(cur->name, name) == 0) {
+                return cur;
+            }
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// ggml_dup
+
+static struct ggml_tensor * ggml_dup_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_DUP;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_dup(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_dup_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_dup_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_dup_impl(ctx, a, true);
+}
+
+// ggml_add
+
+static struct ggml_tensor * ggml_add_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_can_repeat(b, a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_ADD;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_add(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_add_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_add_impl(ctx, a, b, true);
+}
+
+// ggml_add_cast
+
+static struct ggml_tensor * ggml_add_cast_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        enum   ggml_type      type) {
+    // TODO: support less-strict constraint
+    //       GGML_ASSERT(ggml_can_repeat(b, a));
+    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+
+    // currently only supported for quantized input and f16
+    GGML_ASSERT(ggml_is_quantized(a->type) ||
+                a->type == GGML_TYPE_F16 ||
+                a->type == GGML_TYPE_BF16);
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
+
+    result->op     = GGML_OP_ADD;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_add_cast(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        enum   ggml_type      type) {
+    return ggml_add_cast_impl(ctx, a, b, type);
+}
+
+// ggml_add1
+
+static struct ggml_tensor * ggml_add1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_is_scalar(b));
+    GGML_ASSERT(ggml_is_padded_1d(a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_ADD1;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_add1(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_add1_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_add1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_add1_impl(ctx, a, b, true);
+}
+
+// ggml_acc
+
+static struct ggml_tensor * ggml_acc_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_ACC;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_acc(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
+    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
+}
+
+struct ggml_tensor * ggml_acc_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
+    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
+}
+
+// ggml_sub
+
+static struct ggml_tensor * ggml_sub_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_can_repeat(b, a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_SUB;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_sub(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_sub_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_sub_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_sub_impl(ctx, a, b, true);
+}
+
+// ggml_mul
+
+static struct ggml_tensor * ggml_mul_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_can_repeat(b, a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_MUL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_mul(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_mul_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_mul_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_mul_impl(ctx, a, b, true);
+}
+
+// ggml_div
+
+static struct ggml_tensor * ggml_div_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_can_repeat(b, a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_DIV;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_div(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_div_impl(ctx, a, b, false);
+}
+
+struct ggml_tensor * ggml_div_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_div_impl(ctx, a, b, true);
+}
+
+// ggml_sqr
+
+static struct ggml_tensor * ggml_sqr_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_SQR;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_sqr(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_sqr_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_sqr_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_sqr_impl(ctx, a, true);
+}
+
+// ggml_sqrt
+
+static struct ggml_tensor * ggml_sqrt_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_SQRT;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_sqrt(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_sqrt_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_sqrt_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_sqrt_impl(ctx, a, true);
+}
+
+// ggml_log
+
+static struct ggml_tensor * ggml_log_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_LOG;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_log(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_log_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_log_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_log_impl(ctx, a, true);
+}
+
+// ggml_sin
+
+static struct ggml_tensor * ggml_sin_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_SIN;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_sin(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_sin_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_sin_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_sin_impl(ctx, a, true);
+}
+
+// ggml_cos
+
+static struct ggml_tensor * ggml_cos_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_COS;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_cos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_cos_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_cos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_cos_impl(ctx, a, true);
+}
+
+// ggml_sum
+
+struct ggml_tensor * ggml_sum(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
+
+    result->op     = GGML_OP_SUM;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_sum_rows
+
+struct ggml_tensor * ggml_sum_rows(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    int64_t ne[GGML_MAX_DIMS] = { 1 };
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+        ne[i] = a->ne[i];
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    result->op     = GGML_OP_SUM_ROWS;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_mean
+
+struct ggml_tensor * ggml_mean(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_MEAN;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_argmax
+
+struct ggml_tensor * ggml_argmax(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    GGML_ASSERT(ggml_is_matrix(a));
+    GGML_ASSERT(a->ne[0] <= INT32_MAX);
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
+
+    result->op     = GGML_OP_ARGMAX;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_count_equal
+
+struct ggml_tensor * ggml_count_equal(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1);
+
+    result->op     = GGML_OP_COUNT_EQUAL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_repeat
+
+struct ggml_tensor * ggml_repeat(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_can_repeat(a, b));
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
+
+    result->op     = GGML_OP_REPEAT;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_repeat_back
+
+struct ggml_tensor * ggml_repeat_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_can_repeat(b, a));
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
+
+    result->op     = GGML_OP_REPEAT_BACK;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_concat
+
+struct ggml_tensor * ggml_concat(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    struct ggml_tensor  * b,
+    int                   dim) {
+    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d == dim) {
+            ne[d] = a->ne[d] + b->ne[d];
+            continue;
+        }
+        GGML_ASSERT(a->ne[d] == b->ne[d]);
+        ne[d] = a->ne[d];
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    ggml_set_op_params_i32(result, 0, dim);
+
+    result->op     = GGML_OP_CONCAT;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_abs
+
+struct ggml_tensor * ggml_abs(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
+}
+
+struct ggml_tensor * ggml_abs_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
+}
+
+// ggml_sgn
+
+struct ggml_tensor * ggml_sgn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
+}
+
+struct ggml_tensor * ggml_sgn_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
+}
+
+// ggml_neg
+
+struct ggml_tensor * ggml_neg(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
+}
+
+struct ggml_tensor * ggml_neg_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
+}
+
+// ggml_step
+
+struct ggml_tensor * ggml_step(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
+}
+
+struct ggml_tensor * ggml_step_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
+}
+
+// ggml_tanh
+
+struct ggml_tensor * ggml_tanh(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
+}
+
+struct ggml_tensor * ggml_tanh_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
+}
+
+// ggml_elu
+
+struct ggml_tensor * ggml_elu(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
+}
+
+struct ggml_tensor * ggml_elu_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
+}
+
+// ggml_relu
+
+struct ggml_tensor * ggml_relu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
+}
+
+struct ggml_tensor * ggml_relu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
+}
+
+// ggml_leaky_relu
+
+struct ggml_tensor * ggml_leaky_relu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 negative_slope,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+    result->op     = GGML_OP_LEAKY_RELU;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_sigmoid
+
+struct ggml_tensor * ggml_sigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+struct ggml_tensor * ggml_sigmoid_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+// ggml_gelu
+
+struct ggml_tensor * ggml_gelu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
+}
+
+struct ggml_tensor * ggml_gelu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
+}
+
+// ggml_gelu_quick
+
+struct ggml_tensor * ggml_gelu_quick(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
+}
+
+struct ggml_tensor * ggml_gelu_quick_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
+}
+
+// ggml_silu
+
+struct ggml_tensor * ggml_silu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
+}
+
+struct ggml_tensor * ggml_silu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
+}
+
+// ggml_silu_back
+
+struct ggml_tensor * ggml_silu_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_SILU_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml hardswish
+
+struct ggml_tensor * ggml_hardswish(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
+}
+
+// ggml hardsigmoid
+
+struct ggml_tensor * ggml_hardsigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
+}
+
+// ggml exp
+
+struct ggml_tensor * ggml_exp(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_EXP);
+}
+
+struct ggml_tensor * ggml_exp_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
+}
+
+// ggml_norm
+
+static struct ggml_tensor * ggml_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, &eps, sizeof(eps));
+
+    result->op     = GGML_OP_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
+}
+
+struct ggml_tensor * ggml_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
+}
+
+// ggml_rms_norm
+
+static struct ggml_tensor * ggml_rms_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, &eps, sizeof(eps));
+
+    result->op     = GGML_OP_RMS_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_rms_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, false);
+}
+
+struct ggml_tensor * ggml_rms_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, true);
+}
+
+// ggml_rms_norm_back
+
+struct ggml_tensor * ggml_rms_norm_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        float                 eps) {
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, &eps, sizeof(eps));
+
+    result->op     = GGML_OP_RMS_NORM_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_group_norm
+
+static struct ggml_tensor * ggml_group_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_groups,
+        float                 eps,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, n_groups);
+    ggml_set_op_params_f32(result, 1, eps);
+
+    result->op     = GGML_OP_GROUP_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_group_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_groups,
+        float                 eps) {
+    return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
+}
+
+struct ggml_tensor * ggml_group_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_groups,
+        float                 eps) {
+    return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
+}
+
+// ggml_mul_mat
+
+static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[0]           == t1->ne[0])  &&
+           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
+}
+
+struct ggml_tensor * ggml_mul_mat(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_can_mul_mat(a, b));
+    GGML_ASSERT(!ggml_is_transposed(a));
+
+    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_MUL_MAT;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+void ggml_mul_mat_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec       prec) {
+    GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
+
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 0, prec_i32);
+}
+
+// ggml_mul_mat_id
+
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [rows, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * as,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
+    GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
+
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_MUL_MAT_ID;
+    result->src[0] = as;
+    result->src[1] = b;
+    result->src[2] = ids;
+
+    return result;
+}
+
+// ggml_out_prod
+
+static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[1] == t1->ne[1])   &&
+           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
+}
+
+struct ggml_tensor * ggml_out_prod(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_can_out_prod(a, b));
+    GGML_ASSERT(!ggml_is_transposed(a));
+
+    // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
+    const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_OUT_PROD;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_scale
+
+static struct ggml_tensor * ggml_scale_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 s,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_is_padded_1d(a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, &s, sizeof(s));
+
+    result->op     = GGML_OP_SCALE;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_scale(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 s) {
+    return ggml_scale_impl(ctx, a, s, false);
+}
+
+struct ggml_tensor * ggml_scale_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 s) {
+    return ggml_scale_impl(ctx, a, s, true);
+}
+
+// ggml_set
+
+static struct ggml_tensor * ggml_set_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
+
+    // make a view of the destination
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    GGML_ASSERT(offset < (size_t)(1 << 30));
+    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SET;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_set(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
+    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
+}
+
+struct ggml_tensor * ggml_set_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
+    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
+}
+
+struct ggml_tensor * ggml_set_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                offset) {
+    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
+}
+
+struct ggml_tensor * ggml_set_1d_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                offset) {
+    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
+}
+
+struct ggml_tensor * ggml_set_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                offset) {
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+}
+
+struct ggml_tensor * ggml_set_2d_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        size_t                nb1,
+        size_t                offset) {
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
+}
+
+// ggml_cpy
+
+static struct ggml_tensor * ggml_cpy_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
+
+    // make a view of the destination
+    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
+
+    result->op     = GGML_OP_CPY;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_cpy(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_cpy_impl(ctx, a, b);
+}
+
+struct ggml_tensor * ggml_cast(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum   ggml_type      type) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
+    ggml_format_name(result, "%s (copy)", a->name);
+
+    result->op     = GGML_OP_CPY;
+    result->src[0] = a;
+    result->src[1] = result;
+
+    return result;
+}
+
+// ggml_cont
+
+static struct ggml_tensor * ggml_cont_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
+
+    result->op     = GGML_OP_CONT;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_cont_impl(ctx, a);
+}
+
+// make contiguous, with new shape
+GGML_API struct ggml_tensor * ggml_cont_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0) {
+    return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
+}
+
+GGML_API struct ggml_tensor * ggml_cont_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1) {
+    return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
+}
+
+GGML_API struct ggml_tensor * ggml_cont_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2) {
+    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
+}
+
+struct ggml_tensor * ggml_cont_4d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        int64_t               ne3) {
+    GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
+    ggml_format_name(result, "%s (cont)", a->name);
+
+    result->op     = GGML_OP_CONT;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_reshape
+
+struct ggml_tensor * ggml_reshape(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
+    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
+    ggml_format_name(result, "%s (reshaped)", a->name);
+
+    result->op     = GGML_OP_RESHAPE;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_reshape_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_nelements(a) == ne0);
+
+    const int64_t ne[1] = { ne0 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
+    ggml_format_name(result, "%s (reshaped)", a->name);
+
+    result->op     = GGML_OP_RESHAPE;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_reshape_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
+
+    const int64_t ne[2] = { ne0, ne1 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
+    ggml_format_name(result, "%s (reshaped)", a->name);
+
+    result->op     = GGML_OP_RESHAPE;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_reshape_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
+
+    const int64_t ne[3] = { ne0, ne1, ne2 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
+    ggml_format_name(result, "%s (reshaped)", a->name);
+
+    result->op     = GGML_OP_RESHAPE;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_reshape_4d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        int64_t               ne3) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
+
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
+    ggml_format_name(result, "%s (reshaped)", a->name);
+
+    result->op     = GGML_OP_RESHAPE;
+    result->src[0] = a;
+
+    return result;
+}
+
+static struct ggml_tensor * ggml_view_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_dims,
+        const int64_t       * ne,
+        size_t                offset) {
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
+    ggml_format_name(result, "%s (view)", a->name);
+
+    ggml_set_op_params(result, &offset, sizeof(offset));
+
+    result->op     = GGML_OP_VIEW;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_view_1d
+
+struct ggml_tensor * ggml_view_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        size_t                offset) {
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
+
+    return result;
+}
+
+// ggml_view_2d
+
+struct ggml_tensor * ggml_view_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        size_t                nb1,
+        size_t                offset) {
+    const int64_t ne[2] = { ne0, ne1 };
+
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = result->nb[1]*ne1;
+    result->nb[3] = result->nb[2];
+
+    return result;
+}
+
+// ggml_view_3d
+
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                offset) {
+    const int64_t ne[3] = { ne0, ne1, ne2 };
+
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = result->nb[2]*ne2;
+
+    return result;
+}
+
+// ggml_view_4d
+
+struct ggml_tensor * ggml_view_4d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        int64_t               ne3,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                nb3,
+        size_t                offset) {
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
+
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = nb3;
+
+    return result;
+}
+
+// ggml_permute
+
+struct ggml_tensor * ggml_permute(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   axis0,
+        int                   axis1,
+        int                   axis2,
+        int                   axis3) {
+    GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
+    GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
+    GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
+    GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
+
+    GGML_ASSERT(axis0 != axis1);
+    GGML_ASSERT(axis0 != axis2);
+    GGML_ASSERT(axis0 != axis3);
+    GGML_ASSERT(axis1 != axis2);
+    GGML_ASSERT(axis1 != axis3);
+    GGML_ASSERT(axis2 != axis3);
+
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
+
+    int ne[GGML_MAX_DIMS];
+    int nb[GGML_MAX_DIMS];
+
+    ne[axis0] = a->ne[0];
+    ne[axis1] = a->ne[1];
+    ne[axis2] = a->ne[2];
+    ne[axis3] = a->ne[3];
+
+    nb[axis0] = a->nb[0];
+    nb[axis1] = a->nb[1];
+    nb[axis2] = a->nb[2];
+    nb[axis3] = a->nb[3];
+
+    result->ne[0] = ne[0];
+    result->ne[1] = ne[1];
+    result->ne[2] = ne[2];
+    result->ne[3] = ne[3];
+
+    result->nb[0] = nb[0];
+    result->nb[1] = nb[1];
+    result->nb[2] = nb[2];
+    result->nb[3] = nb[3];
+
+    result->op     = GGML_OP_PERMUTE;
+    result->src[0] = a;
+
+    int32_t params[] = { axis0, axis1, axis2, axis3 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    return result;
+}
+
+// ggml_transpose
+
+struct ggml_tensor * ggml_transpose(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
+
+    result->ne[0] = a->ne[1];
+    result->ne[1] = a->ne[0];
+
+    result->nb[0] = a->nb[1];
+    result->nb[1] = a->nb[0];
+
+    result->op     = GGML_OP_TRANSPOSE;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_get_rows
+
+struct ggml_tensor * ggml_get_rows(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+
+    // TODO: implement non F32 return
+    enum ggml_type type = GGML_TYPE_F32;
+    if (a->type == GGML_TYPE_I32) {
+        type = a->type;
+    }
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
+
+    result->op     = GGML_OP_GET_ROWS;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_get_rows_back
+
+struct ggml_tensor * ggml_get_rows_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c) {
+    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
+
+    // TODO: implement non F32 return
+    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
+
+    result->op     = GGML_OP_GET_ROWS_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_diag
+
+struct ggml_tensor * ggml_diag(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    GGML_ASSERT(a->ne[1] == 1);
+
+    const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
+
+    result->op     = GGML_OP_DIAG;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_diag_mask_inf
+
+static struct ggml_tensor * ggml_diag_mask_inf_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    int32_t params[] = { n_past };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_DIAG_MASK_INF;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_diag_mask_inf(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past) {
+    return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
+}
+
+struct ggml_tensor * ggml_diag_mask_inf_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past) {
+    return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
+}
+
+// ggml_diag_mask_zero
+
+static struct ggml_tensor * ggml_diag_mask_zero_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    int32_t params[] = { n_past };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_DIAG_MASK_ZERO;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_diag_mask_zero(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past) {
+    return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
+}
+
+struct ggml_tensor * ggml_diag_mask_zero_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past) {
+    return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
+}
+
+// ggml_soft_max
+
+static struct ggml_tensor * ggml_soft_max_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale,
+        float                 max_bias,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+
+    if (mask) {
+        GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(ggml_is_matrix(mask));
+        GGML_ASSERT(mask->ne[0] == a->ne[0]);
+        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
+    }
+
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    float params[] = { scale, max_bias };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SOFT_MAX;
+    result->src[0] = a;
+    result->src[1] = mask;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_soft_max(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
+}
+
+struct ggml_tensor * ggml_soft_max_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
+}
+
+// ggml_soft_max_ext_back
+
+static struct ggml_tensor * ggml_soft_max_ext_back_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_SOFT_MAX_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    memcpy((float *) result->op_params + 0, &scale,    sizeof(float));
+    memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
+
+    return result;
+}
+
+struct ggml_tensor * ggml_soft_max_ext_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
+}
+
+struct ggml_tensor * ggml_soft_max_ext_back_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
+}
+
+// ggml_rope
+
+static struct ggml_tensor * ggml_rope_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
+        bool                  inplace) {
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] == b->ne[0]);
+
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    int sections[4] = {0, 0, 0, 0};
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_rope(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        int                   mode) {
+    return ggml_rope_impl(
+        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
+    );
+}
+
+struct ggml_tensor * ggml_rope_multi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    // Multimodal Rotary Position Embedding
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections,      sizeof(int)*4);
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op   = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_rope_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        int                   mode) {
+    return ggml_rope_impl(
+        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
+    );
+}
+
+struct ggml_tensor * ggml_rope_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, false
+    );
+}
+
+struct ggml_tensor * ggml_rope_ext_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, true
+    );
+}
+
+struct ggml_tensor * ggml_rope_custom(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, false
+    );
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, true
+    );
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
+}
+
+void ggml_rope_yarn_corr_dims(
+    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
+    dims[0] = MAX(0, start);
+    dims[1] = MIN(n_dims - 1, end);
+}
+
+// ggml_rope_back
+
+struct ggml_tensor * ggml_rope_ext_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    struct ggml_tensor * result = ggml_rope_ext(
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    result->op = GGML_OP_ROPE_BACK;
+    return result;
+}
+
+struct ggml_tensor * ggml_rope_multi_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    struct ggml_tensor * result = ggml_rope_multi(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    result->op = GGML_OP_ROPE_BACK;
+    return result;
+}
+// ggml_clamp
+
+struct ggml_tensor * ggml_clamp(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 min,
+        float                 max) {
+    // TODO: when implement backward, fix this:
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    float params[] = { min, max };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_CLAMP;
+    result->src[0] = a;
+
+    return result;
+}
+
+static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+}
+
+// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+// a: [OC，IC, KH, KW]
+// b: [N, IC, IH, IW]
+// result: [N, OH, OW, IC*KH*KW]
+struct ggml_tensor * ggml_im2col(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D,
+        enum ggml_type        dst_type) {
+    if (is_2D) {
+        GGML_ASSERT(a->ne[2] == b->ne[2]);
+    } else {
+        //GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+        GGML_ASSERT(b->ne[1] == a->ne[1]);
+        GGML_ASSERT(b->ne[3] == 1);
+    }
+
+    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
+    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+
+    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
+    GGML_ASSERT((OW > 0)           && "b too small compared to a");
+
+    const int64_t ne[4] = {
+        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
+        OW,
+        is_2D ? OH : b->ne[2],
+        is_2D ?      b->ne[3] : 1,
+    };
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_IM2COL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_im2col_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int64_t             * ne,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1,
+        bool                  is_2D) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_IM2COL_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_conv_1d
+
+struct ggml_tensor * ggml_conv_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0) {
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
+
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC，IC, K] => [OC, IC * K]
+
+    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
+
+    return result;
+}
+
+// ggml_conv_1d_ph
+
+struct ggml_tensor* ggml_conv_1d_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s,
+        int                   d) {
+    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+}
+
+// ggml_conv_1d_dw
+
+struct ggml_tensor * ggml_conv_1d_dw(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0) {
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
+
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
+
+    struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
+
+    result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
+
+    return result;
+}
+
+// ggml_conv_1d_dw_ph
+
+struct ggml_tensor * ggml_conv_1d_dw_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   d0) {
+    return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
+}
+
+// ggml_conv_transpose_1d
+
+static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
+}
+
+GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0) {
+    GGML_ASSERT(ggml_is_matrix(b));
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(a->ne[3] == 1);
+
+    GGML_ASSERT(p0 == 0);
+    GGML_ASSERT(d0 == 1);
+
+    const int64_t ne[4] = {
+        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
+        a->ne[1], b->ne[2], 1,
+    };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    int32_t params[] = { s0, p0, d0 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_conv_2d
+
+// a: [OC，IC, KH, KW]
+// b: [N, IC, IH, IW]
+// result: [N, OC, OH, OW]
+struct ggml_tensor * ggml_conv_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
+
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
+
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
+    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
+
+
+    return result;
+}
+
+// ggml_conv_2d_sk_p0
+
+struct ggml_tensor * ggml_conv_2d_sk_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+
+// ggml_conv_2d_s1_ph
+
+struct ggml_tensor * ggml_conv_2d_s1_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
+}
+
+// ggml_conv_2d_dw
+
+struct ggml_tensor * ggml_conv_2d_dw(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+
+    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
+    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+
+    return result;
+}
+
+// ggml_conv_transpose_2d_p0
+
+static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+    return (ins - 1) * s - 2 * p + ks;
+}
+
+struct ggml_tensor * ggml_conv_transpose_2d_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   stride) {
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+    const int64_t ne[4] = {
+        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+        a->ne[2], b->ne[3],
+    };
+
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, stride);
+
+    result->op     = GGML_OP_CONV_TRANSPOSE_2D;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_pool_*
+
+static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
+    return (ins + 2 * p - ks) / s + 1;
+}
+
+// ggml_pool_1d
+
+struct ggml_tensor * ggml_pool_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_op_pool     op,
+        int                   k0,
+        int                   s0,
+        int                   p0) {
+    const int64_t ne[4] = {
+        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+        a->ne[1],
+        a->ne[2],
+        a->ne[3],
+    };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    int32_t params[] = { op, k0, s0, p0 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_POOL_1D;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_pool_2d
+
+struct ggml_tensor * ggml_pool_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_op_pool     op,
+        int                   k0,
+        int                   k1,
+        int                   s0,
+        int                   s1,
+        float                 p0,
+        float                 p1) {
+    struct ggml_tensor * result;
+    const int64_t ne[4] = {
+        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
+        a->ne[2],
+        a->ne[3],
+    };
+    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_POOL_2D;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_pool_2d_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * af,
+        enum ggml_op_pool     op,
+        int                   k0,
+        int                   k1,
+        int                   s0,
+        int                   s1,
+        float                 p0,
+        float                 p1) {
+    struct ggml_tensor * result;
+    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
+
+    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_POOL_2D_BACK;
+    result->src[0] = a;
+    result->src[1] = af;
+
+    return result;
+}
+
+// ggml_upscale
+
+static struct ggml_tensor * ggml_upscale_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   ne0,
+        int                   ne1,
+        int                   ne2,
+        int                   ne3) {
+    GGML_ASSERT(a->ne[0] <= ne0);
+    GGML_ASSERT(a->ne[1] <= ne1);
+    GGML_ASSERT(a->ne[2] <= ne2);
+    GGML_ASSERT(a->ne[3] <= ne3);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
+
+    result->op     = GGML_OP_UPSCALE;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_upscale(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+}
+
+struct ggml_tensor * ggml_upscale_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   ne0,
+        int                   ne1,
+        int                   ne2,
+        int                   ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+}
+
+// ggml_pad
+
+struct ggml_tensor * ggml_pad(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   p0,
+        int                   p1,
+        int                   p2,
+        int                   p3) {
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0,
+            a->ne[1] + p1,
+            a->ne[2] + p2,
+            a->ne[3] + p3);
+
+    result->op     = GGML_OP_PAD;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_pad_reflect_1d
+
+struct ggml_tensor * ggml_pad_reflect_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   p0,
+        int                   p1) {
+    GGML_ASSERT(p0 >= 0);
+    GGML_ASSERT(p1 >= 0);
+
+    GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
+    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0 + p1,
+            a->ne[1],
+            a->ne[2],
+            a->ne[3]);
+
+    int32_t params[] = { p0, p1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_PAD_REFLECT_1D;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_arange
+
+struct ggml_tensor * ggml_arange(
+        struct ggml_context * ctx,
+        float                 start,
+        float                 stop,
+        float                 step) {
+    GGML_ASSERT(stop > start);
+
+    const int64_t steps = (int64_t) ceilf((stop - start) / step);
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
+
+    ggml_set_op_params_f32(result, 0, start);
+    ggml_set_op_params_f32(result, 1, stop);
+    ggml_set_op_params_f32(result, 2, step);
+
+    result->op = GGML_OP_ARANGE;
+
+    return result;
+}
+
+// ggml_timestep_embedding
+
+struct ggml_tensor * ggml_timestep_embedding(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * timesteps,
+        int                   dim,
+        int                   max_period) {
+    int actual_dim = dim;
+    if (dim % 2 != 0) {
+        actual_dim = dim + 1;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
+
+    ggml_set_op_params_i32(result, 0, dim);
+    ggml_set_op_params_i32(result, 1, max_period);
+
+    result->op     = GGML_OP_TIMESTEP_EMBEDDING;
+    result->src[0] = timesteps;
+
+    return result;
+}
+
+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context  * ctx,
+        struct ggml_tensor   * a,
+        enum ggml_sort_order   order) {
+    GGML_ASSERT(a->ne[0] <= INT32_MAX);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op     = GGML_OP_ARGSORT;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
+// ggml_flash_attn_ext
+
+struct ggml_tensor * ggml_flash_attn_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * mask,
+        float                 scale,
+        float                 max_bias,
+        float                 logit_softcap) {
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
+                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
+        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
+    }
+
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
+    // permute(0, 2, 1, 3)
+    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    float params[] = { scale, max_bias, logit_softcap };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_FLASH_ATTN_EXT;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = mask;
+
+    return result;
+}
+
+void ggml_flash_attn_ext_set_prec(
+        struct ggml_tensor * a,
+        enum ggml_prec       prec) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = (int32_t) prec;
+
+    ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
+}
+
+enum ggml_prec ggml_flash_attn_ext_get_prec(
+        const struct ggml_tensor * a) {
+    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
+
+    const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
+
+    return (enum ggml_prec) prec_i32;
+}
+
+// ggml_flash_attn_back
+
+struct ggml_tensor * ggml_flash_attn_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * d,
+        bool                  masked) {
+    GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
+
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+
+    // d shape [D,N,ne2,ne3]
+    // q shape [D,N,ne2,ne3]
+    // k shape [D,M,kvne2,ne3]
+    // v shape [M,D,kvne2,ne3]
+
+    const int64_t     D = q->ne[0];
+    const int64_t     N = q->ne[1];
+    const int64_t     M = k->ne[1];
+    const int64_t   ne2 = q->ne[2];
+    const int64_t   ne3 = q->ne[3];
+    const int64_t kvne2 = k->ne[2];
+
+    GGML_ASSERT(k->ne[0] == D);
+    GGML_ASSERT(v->ne[0] == M);
+    GGML_ASSERT(v->ne[1] == D);
+    GGML_ASSERT(d->ne[0] == D);
+    GGML_ASSERT(d->ne[1] == N);
+    GGML_ASSERT(k->ne[2] == kvne2);
+    GGML_ASSERT(k->ne[3] == ne3);
+    GGML_ASSERT(v->ne[2] == kvne2);
+    GGML_ASSERT(v->ne[3] == ne3);
+    GGML_ASSERT(d->ne[2] == ne2);
+    GGML_ASSERT(d->ne[3] == ne3);
+
+    GGML_ASSERT(ne2 % kvne2 == 0);
+
+    // store gradients of q, k and v as continuous tensors concatenated in result.
+    // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
+    const int64_t elem_q = ggml_nelements(q);
+    const int64_t elem_k = ggml_nelements(k);
+    const int64_t elem_v = ggml_nelements(v);
+
+    enum ggml_type result_type = GGML_TYPE_F32;
+    GGML_ASSERT(ggml_blck_size(result_type) == 1);
+    const size_t tsize = ggml_type_size(result_type);
+
+    const size_t offs_q = 0;
+    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
+    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
+    const size_t end    = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
+
+    const size_t nelements = (end + tsize - 1)/tsize;
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
+
+    int32_t masked_i = masked ? 1 : 0;
+    ggml_set_op_params(result, &masked_i, sizeof(masked_i));
+
+    result->op     = GGML_OP_FLASH_ATTN_BACK;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = d;
+
+    return result;
+}
+
+// ggml_ssm_conv
+
+struct ggml_tensor * ggml_ssm_conv(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * sx,
+        struct ggml_tensor  * c) {
+    GGML_ASSERT(ggml_is_3d(sx));
+    GGML_ASSERT(ggml_is_matrix(c));
+
+    const int64_t d_conv  = c->ne[0];
+    const int64_t d_inner = c->ne[1];
+    const int64_t n_t     = sx->ne[0] - d_conv + 1; // tokens per sequence
+    const int64_t n_s     = sx->ne[2];
+
+    // TODO: maybe support other strides than 1?
+    // FIXME: this is always true?
+    GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
+    GGML_ASSERT(sx->ne[1] == d_inner);
+    GGML_ASSERT(n_t >= 0);
+
+    struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
+
+    result->op     = GGML_OP_SSM_CONV;
+    result->src[0] = sx;
+    result->src[1] = c;
+
+    return result;
+}
+
+// ggml_ssm_scan
+
+struct ggml_tensor * ggml_ssm_scan(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * s,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * dt,
+        struct ggml_tensor  * A,
+        struct ggml_tensor  * B,
+        struct ggml_tensor  * C) {
+    GGML_ASSERT(ggml_is_contiguous(s));
+    GGML_ASSERT(ggml_is_contiguous(x));
+    GGML_ASSERT(ggml_is_contiguous(dt));
+    GGML_ASSERT(ggml_is_contiguous(A));
+    GGML_ASSERT(ggml_is_matrix(A));
+    GGML_ASSERT(ggml_is_3d(B));
+    GGML_ASSERT(ggml_is_3d(s));
+    GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
+    GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
+    GGML_ASSERT(ggml_are_same_shape(x, dt));
+    GGML_ASSERT(ggml_are_same_shape(B, C));
+
+    {
+        const int64_t d_state      = s->ne[0];
+        const int64_t d_inner      = s->ne[1];
+        const int64_t n_seq_tokens = x->ne[1];
+        const int64_t n_seqs       = x->ne[2];
+
+        GGML_ASSERT(s->ne[2] == n_seqs);
+        GGML_ASSERT(x->ne[0] == d_inner);
+        GGML_ASSERT(A->ne[0] == d_state);
+        GGML_ASSERT(A->ne[1] == d_inner);
+        GGML_ASSERT(B->ne[0] == d_state);
+        GGML_ASSERT(B->ne[1] == n_seq_tokens);
+        GGML_ASSERT(B->ne[2] == n_seqs);
+    }
+
+    // concatenated y + ssm_states
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
+
+    result->op   = GGML_OP_SSM_SCAN;
+    result->src[0] = s;
+    result->src[1] = x;
+    result->src[2] = dt;
+    result->src[3] = A;
+    result->src[4] = B;
+    result->src[5] = C;
+
+    return result;
+}
+
+// ggml_win_part
+
+struct ggml_tensor * ggml_win_part(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   w) {
+    GGML_ASSERT(a->ne[3] == 1);
+    GGML_ASSERT(a->type  == GGML_TYPE_F32);
+
+    // padding
+    const int px = (w - a->ne[1]%w)%w;
+    const int py = (w - a->ne[2]%w)%w;
+
+    const int npx = (px + a->ne[1])/w;
+    const int npy = (py + a->ne[2])/w;
+    const int np  = npx*npy;
+
+    const int64_t ne[4] = { a->ne[0], w, w, np, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    int32_t params[] = { npx, npy, w };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_WIN_PART;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_win_unpart
+
+struct ggml_tensor * ggml_win_unpart(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   w0,
+        int                   h0,
+        int                   w) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+    int32_t params[] = { w };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_WIN_UNPART;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_get_rel_pos
+
+struct ggml_tensor * ggml_get_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   qh,
+        int                   kh) {
+    GGML_ASSERT(qh == kh);
+    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+    result->op     = GGML_OP_GET_REL_POS;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_add_rel_pos
+
+static struct ggml_tensor * ggml_add_rel_pos_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_are_same_shape(pw, ph));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(pw));
+    GGML_ASSERT(ggml_is_contiguous(ph));
+    GGML_ASSERT(ph->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->ne[3] == a->ne[2]);
+    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+    result->op     = GGML_OP_ADD_REL_POS;
+    result->src[0] = a;
+    result->src[1] = pw;
+    result->src[2] = ph;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_add_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+}
+
+struct ggml_tensor * ggml_add_rel_pos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+}
+
+// ggml_rwkv_wkv6
+
+struct ggml_tensor * ggml_rwkv_wkv6(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * r,
+        struct ggml_tensor  * tf,
+        struct ggml_tensor  * td,
+        struct ggml_tensor  * state) {
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(r));
+    GGML_ASSERT(ggml_is_contiguous(tf));
+    GGML_ASSERT(ggml_is_contiguous(td));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens);
+        GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens);
+        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_RWKV_WKV6;
+    result->src[0] = k;
+    result->src[1] = v;
+    result->src[2] = r;
+    result->src[3] = tf;
+    result->src[4] = td;
+    result->src[5] = state;
+
+    return result;
+}
+
+// ggml_gated_linear_attn
+
+struct ggml_tensor * ggml_gated_linear_attn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * state,
+        float scale) {
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens);
+        GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens);
+        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_f32(result, 0, scale);
+
+    result->op     = GGML_OP_GATED_LINEAR_ATTN;
+    result->src[0] = k;
+    result->src[1] = v;
+    result->src[2] = q;
+    result->src[3] = g;
+    result->src[4] = state;
+
+    return result;
+}
+
+// ggml_unary
+
+static struct ggml_tensor * ggml_unary_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op    op,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_is_contiguous_1(a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) op);
+
+    result->op     = GGML_OP_UNARY;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_unary(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op    op) {
+    return ggml_unary_impl(ctx, a, op, false);
+}
+
+struct ggml_tensor * ggml_unary_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op    op) {
+    return ggml_unary_impl(ctx, a, op, true);
+}
+
+// ggml_map_unary
+
+static struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t   fun,
+        bool                         inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
+
+    result->op     = GGML_OP_MAP_UNARY;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t   fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t   fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_binary
+
+static struct ggml_tensor * ggml_map_binary_impl_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t   fun,
+        bool                          inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
+
+    result->op     = GGML_OP_MAP_BINARY;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t   fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_binary_inplace_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t   fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom1_f32
+
+static struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_f32_t   fun,
+        bool                           inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
+
+    result->op     = GGML_OP_MAP_CUSTOM1_F32;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_f32_t   fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_f32_t   fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_custom2_f32
+
+static struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_f32_t   fun,
+        bool                           inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
+
+    result->op     = GGML_OP_MAP_CUSTOM2_F32;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_f32_t   fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_f32_t   fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom3_f32
+
+static struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_f32_t   fun,
+        bool                           inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
+
+    result->op     = GGML_OP_MAP_CUSTOM3_F32;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_f32_t   fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_f32_t   fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
+}
+
+// ggml_map_custom1
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        const  ggml_custom1_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata,
+        bool                       inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op     = GGML_OP_MAP_CUSTOM1;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        const  ggml_custom1_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        const  ggml_custom1_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        const  ggml_custom2_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata,
+        bool                       inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op     = GGML_OP_MAP_CUSTOM2;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        const  ggml_custom2_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        const  ggml_custom2_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        struct ggml_tensor       * c,
+        const  ggml_custom3_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata,
+        bool                       inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op     = GGML_OP_MAP_CUSTOM3;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        struct ggml_tensor       * c,
+        const  ggml_custom3_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context      * ctx,
+        struct ggml_tensor       * a,
+        struct ggml_tensor       * b,
+        struct ggml_tensor       * c,
+        const  ggml_custom3_op_t   fun,
+        int                        n_tasks,
+        void                     * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+// ggml_cross_entropy_loss
+
+struct ggml_tensor * ggml_cross_entropy_loss(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
+
+    result->op     = GGML_OP_CROSS_ENTROPY_LOSS;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+// ggml_cross_entropy_loss_back
+
+struct ggml_tensor * ggml_cross_entropy_loss_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c) {
+    GGML_ASSERT(ggml_is_scalar(a));
+    GGML_ASSERT(ggml_are_same_shape(b, c));
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, b);
+
+    result->op     = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+// opt_step_adamw
+
+struct ggml_tensor * ggml_opt_step_adamw(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * grad,
+        struct ggml_tensor  * m,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * adamw_params) {
+    GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
+    GGML_ASSERT(ggml_are_same_shape(a, grad));
+    GGML_ASSERT(ggml_are_same_shape(a, m));
+    GGML_ASSERT(ggml_are_same_shape(a, v));
+    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
+
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    result->op     = GGML_OP_OPT_STEP_ADAMW;
+    result->src[0] = a;
+    result->src[1] = grad;
+    result->src[2] = m;
+    result->src[3] = v;
+    result->src[4] = adamw_params;
+
+    return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct ggml_hash_set ggml_hash_set_new(size_t size) {
+    size = ggml_hash_size(size);
+    struct ggml_hash_set result;
+    result.size = size;
+    result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
+    result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t));
+    return result;
+}
+
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
+    memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
+}
+
+void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
+    GGML_FREE(hash_set->used);
+    GGML_FREE(hash_set->keys);
+}
+
+size_t ggml_hash_size(size_t min_sz) {
+    // next primes after powers of two
+    static const size_t primes[] = {
+        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
+        2053, 4099, 8209, 16411, 32771, 65537, 131101,
+        262147, 524309, 1048583, 2097169, 4194319, 8388617,
+        16777259, 33554467, 67108879, 134217757, 268435459,
+        536870923, 1073741827, 2147483659
+    };
+    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
+
+    // find the smallest prime that is larger or equal than min_sz
+    size_t l = 0;
+    size_t r = n_primes;
+    while (l < r) {
+        size_t m = (l + r)/2;
+        if (primes[m] < min_sz) {
+            l = m + 1;
+        } else {
+            r = m;
+        }
+    }
+    size_t sz = l < n_primes ? primes[l] : min_sz | 1;
+    return sz;
+}
+
+struct hash_map {
+    struct ggml_hash_set set;
+    struct ggml_tensor ** vals;
+};
+
+static struct hash_map * ggml_new_hash_map(size_t size) {
+    struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
+    result->set = ggml_hash_set_new(size);
+    result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *));
+    return result;
+}
+
+static void ggml_hash_map_free(struct hash_map * map) {
+    ggml_hash_set_free(&map->set);
+    GGML_FREE(map->vals);
+    GGML_FREE(map);
+}
+
+// utility functions to change gradients
+// isrc is the index of tensor in cgraph->visited_has_set.keys
+// the corresponding gradient (accumulators) are also at position isrc
+// if tensor has a gradient accumulator, modify that accumulator in-place
+// else if there is no gradient for tensor, set the corresponding value
+// else, just add/subtract/etc. the gradients
+
+static void ggml_add_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
+    } else {
+        cgraph->grads[isrc] = tensor;
+    }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
+}
+
+static void ggml_acc_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * tensor,
+        const  size_t         nb1,
+        const  size_t         nb2,
+        const  size_t         nb3,
+        const  size_t         offset) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
+    } else {
+        struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
+        cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
+    }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
+}
+
+static void ggml_add1_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+    } else {
+        cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
+    }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
+}
+
+static void ggml_sub_or_set(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
+        size_t                isrc,
+        struct ggml_tensor  * tensor) {
+    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    GGML_ASSERT(src);
+    if (cgraph->grads[isrc]) {
+        cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+    } else {
+        cgraph->grads[isrc] = ggml_neg(ctx, tensor);
+    }
+    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
+    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
+}
+
+static void ggml_compute_backward(
+        struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, const bool * grads_needed) {
+    struct ggml_tensor * tensor = cgraph->nodes[i];
+    struct ggml_tensor * grad   = ggml_graph_get_grad(cgraph, tensor);
+
+    if (!grad) {
+        return;
+    }
+
+    struct ggml_tensor * src0 = tensor->src[0];
+    struct ggml_tensor * src1 = tensor->src[1];
+    struct ggml_tensor * src2 = tensor->src[2];
+    struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
+    const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
+    const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
+    const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
+    const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
+
+    switch (tensor->op) {
+        case GGML_OP_DUP: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+            }
+        } break;
+        case GGML_OP_ADD: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+            }
+            if (src1_needs_grads) {
+                struct ggml_tensor * tmp = grad;
+                if (!ggml_are_same_shape(src0, src1)) {
+                    tmp = ggml_repeat_back(ctx, tmp, src1);
+                }
+                ggml_add_or_set(ctx, cgraph, isrc1, tmp);
+            }
+        } break;
+        case GGML_OP_ADD1: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+            }
+            if (src1_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_mean(ctx, grad)); // TODO: should probably be sum instead of mean
+            }
+        } break;
+        case GGML_OP_ACC: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+            }
+            if (src1_needs_grads) {
+                const size_t nb1    = ((int32_t *) tensor->op_params)[0];
+                const size_t nb2    = ((int32_t *) tensor->op_params)[1];
+                const size_t nb3    = ((int32_t *) tensor->op_params)[2];
+                const size_t offset = ((int32_t *) tensor->op_params)[3];
+
+                struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
+                    grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                    nb1, nb2, nb3, offset);
+
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
+            }
+        } break;
+        case GGML_OP_SUB: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+            }
+            if (src1_needs_grads) {
+                ggml_sub_or_set(ctx, cgraph, isrc1, grad);
+            }
+        } break;
+        case GGML_OP_MUL: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
+            }
+            if (src1_needs_grads) {
+                struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
+                if (!ggml_are_same_shape(src0, src1)) {
+                    tmp = ggml_repeat_back(ctx, tmp, src1);
+                }
+                ggml_add_or_set(ctx, cgraph, isrc1, tmp);
+            }
+        } break;
+        case GGML_OP_DIV: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src1));
+            }
+            if (src1_needs_grads) {
+                ggml_sub_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, grad, ggml_div(ctx, tensor, src1)));
+            }
+        } break;
+        case GGML_OP_SQR: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_mul(ctx, src0, grad), 2.0f));
+            }
+        } break;
+        case GGML_OP_SQRT: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_div(ctx, grad, tensor), 0.5f));
+            }
+        } break;
+        case GGML_OP_LOG: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_SIN: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_cos(ctx, src0)));
+            }
+        } break;
+        case GGML_OP_COS: {
+            if (src0_needs_grads) {
+                ggml_sub_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sin(ctx, src0)));
+            }
+        } break;
+        case GGML_OP_SUM: {
+            if (src0_needs_grads) {
+                ggml_add1_or_set(ctx, cgraph, isrc0, grad);
+            }
+        } break;
+        case GGML_OP_SUM_ROWS: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_MEAN: {
+            if (src0_needs_grads) {
+                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+            }
+        } break;
+        case GGML_OP_REPEAT: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat_back(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_REPEAT_BACK: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_RMS_NORM: {
+            if (src0_needs_grads) {
+                float eps;
+                memcpy(&eps, tensor->op_params, sizeof(float));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, grad, src0, eps));
+            }
+        } break;
+        case GGML_OP_MUL_MAT: {
+            // https://cs231n.github.io/optimization-2/#staged
+            // # forward pass
+            // s0 = np.random.randn(5, 10)
+            // s1 = np.random.randn(10, 3)
+            // t = s0.dot(s1)
+
+            // # now suppose we had the gradient on t from above in the circuit
+            // dt = np.random.randn(*t.shape) # same shape as t
+            // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
+            // ds1 = t.T.dot(dt)
+
+            // tensor.shape [m,p,qq,rr]
+            // src0.shape   [n,m,q1,r1]
+            // src1.shape   [n,p,qq,rr]
+
+            if (src0_needs_grads) {
+                GGML_ASSERT(grad->ne[2] == src1->ne[2]);
+                GGML_ASSERT(grad->ne[3] == src1->ne[3]);
+                struct ggml_tensor * tmp =
+                    ggml_out_prod(ctx, // [n,m,qq,rr]
+                        src1,          // [n,p,qq,rr]
+                        grad);         // [m,p,qq,rr]
+                if (!ggml_are_same_shape(tmp, src0)) {
+                    GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
+                    GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
+                    GGML_ASSERT(tmp->ne[3] == 1);
+
+                    const int64_t nr2 = tmp->ne[2] / src0->ne[2];
+                    const size_t nb2 = tmp->nb[2] * nr2;
+                    const size_t nb3 = tmp->nb[2];
+
+                    tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
+                    tmp = ggml_repeat_back(ctx, tmp, src0);
+                }
+                ggml_add_or_set(ctx, cgraph, isrc0, tmp);
+            }
+            if (src1_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc1,
+                        // ggml_mul_mat(ctx,                   // [n,p,qq,rr]
+                        //     ggml_cont(ctx,                  // [m,n,q1,r1]
+                        //         ggml_transpose(ctx, src0)), // [m,n,q1,r1]
+                        //     grad),                          // [m,p,qq,rr]
+
+                        // when src0 is bigger than tensor->grad (this is mostly the case in llama),
+                        // avoid transpose of src0, rather transpose smaller tensor->grad
+                        // and then use ggml_out_prod
+                        ggml_out_prod(ctx,      // [n,p,qq,rr]
+                            src0,               // [n,m,q1,r1]
+                            ggml_transpose(ctx, // [p,m,qq,rr]
+                                grad)));        // [m,p,qq,rr]
+            }
+        } break;
+        case GGML_OP_SCALE: {
+            if (src0_needs_grads) {
+                float s;
+                memcpy(&s, tensor->op_params, sizeof(float));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
+            }
+        } break;
+        case GGML_OP_SET: {
+            const size_t nb1    = ((const int32_t *) tensor->op_params)[0];
+            const size_t nb2    = ((const int32_t *) tensor->op_params)[1];
+            const size_t nb3    = ((const int32_t *) tensor->op_params)[2];
+            const size_t offset = ((const int32_t *) tensor->op_params)[3];
+
+            struct ggml_tensor * tensor_grad_view = NULL;
+
+            if (src0_needs_grads || src1_needs_grads) {
+                GGML_ASSERT(src0->type == tensor->type);
+                GGML_ASSERT(!cgraph->grads[isrc0] ||                      cgraph->grads[isrc0]->type == grad->type);
+                GGML_ASSERT(!cgraph->grads[isrc1] || !src1_needs_grads || cgraph->grads[isrc1]->type == grad->type);
+
+                tensor_grad_view = ggml_view_4d(ctx,
+                    grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                    nb1, nb2, nb3, offset);
+            }
+
+            if (src0_needs_grads) {
+                struct ggml_tensor * tmp = ggml_neg(ctx, tensor_grad_view);
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_acc_impl(ctx, grad, tmp, nb1, nb2, nb3, offset, false));
+            }
+
+            if (src1_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
+            }
+        } break;
+        case GGML_OP_CPY: {
+            // cpy overwrites value of src1 by src0 and returns view(src1)
+            // the overwriting is mathematically equivalent to:
+            // tensor = src0 * 1 + src1 * 0
+            if (src0_needs_grads) {
+                // dsrc0 = dtensor * 1
+                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+            }
+            if (src1_needs_grads) {
+                // dsrc1 = dtensor * 0 -> noop
+            }
+        } break;
+        case GGML_OP_CONT: {
+            // same as cpy
+            if (src0_needs_grads) {
+                GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
+                GGML_ASSERT(ggml_is_contiguous(grad));
+                GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
+                ggml_add_or_set(ctx, cgraph, isrc0,
+                    ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
+            }
+        } break;
+        case GGML_OP_RESHAPE: {
+            if (src0_needs_grads) {
+                struct ggml_tensor * grad_cont = ggml_is_contiguous(grad) ? grad : ggml_cont(ctx, grad);
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad_cont, src0));
+            }
+        } break;
+        case GGML_OP_VIEW: {
+            if (src0_needs_grads) {
+                size_t offset;
+
+                memcpy(&offset, tensor->op_params, sizeof(offset));
+
+                size_t nb1 = tensor->nb[1];
+                size_t nb2 = tensor->nb[2];
+                size_t nb3 = tensor->nb[3];
+
+                if (cgraph->grads[isrc0] && src0->type != cgraph->grads[isrc0]->type) {
+                    // gradient is typically F32, but src0 could be other type
+                    size_t ng = ggml_element_size(cgraph->grads[isrc0]);
+                    size_t n0 = ggml_element_size(src0);
+                    GGML_ASSERT(offset % n0 == 0);
+                    GGML_ASSERT(nb1 % n0 == 0);
+                    GGML_ASSERT(nb2 % n0 == 0);
+                    GGML_ASSERT(nb3 % n0 == 0);
+                    offset = (offset / n0) * ng;
+                    nb1 = (nb1 / n0) * ng;
+                    nb2 = (nb2 / n0) * ng;
+                    nb3 = (nb3 / n0) * ng;
+                }
+
+                ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
+            }
+        } break;
+        case GGML_OP_PERMUTE: {
+            if (src0_needs_grads) {
+                const int32_t * axes = (const int32_t *) tensor->op_params;
+                const int axis0 = axes[0] & 0x3;
+                const int axis1 = axes[1] & 0x3;
+                const int axis2 = axes[2] & 0x3;
+                const int axis3 = axes[3] & 0x3;
+                int axb[4] = {0,0,0,0}; // axes backward
+                axb[axis0] = 0;
+                axb[axis1] = 1;
+                axb[axis2] = 2;
+                axb[axis3] = 3;
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_permute(ctx, grad, axb[0], axb[1], axb[2], axb[3]));
+            }
+        } break;
+        case GGML_OP_TRANSPOSE: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_transpose(ctx, grad));
+            }
+        } break;
+        case GGML_OP_GET_ROWS: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_get_rows_back(ctx, grad, src1, src0));
+            }
+            if (src1_needs_grads) {
+                // noop
+            }
+        } break;
+        case GGML_OP_DIAG_MASK_INF: {
+            if (src0_needs_grads) {
+                /* ggml_diag_mask_inf_impl() shouldn't be here */
+                /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
+                const int n_past = ((const int32_t *) tensor->op_params)[0];
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
+            }
+        } break;
+        case GGML_OP_DIAG_MASK_ZERO: {
+            if (src0_needs_grads) {
+                const int n_past = ((const int32_t *) tensor->op_params)[0];
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
+            }
+        } break;
+        case GGML_OP_SOFT_MAX: {
+            if (src0_needs_grads) {
+                float scale    = 1.0f;
+                float max_bias = 0.0f;
+
+                memcpy(&scale,    (const float *) tensor->op_params + 0, sizeof(float));
+                memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float));
+
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias));
+            }
+            GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
+        } break;
+        case GGML_OP_ROPE: {
+            if (src0_needs_grads) {
+                //const int n_past = ((int32_t *) tensor->op_params)[0];
+                const int n_dims     = ((const int32_t *) tensor->op_params)[1];
+                const int mode       = ((const int32_t *) tensor->op_params)[2];
+                //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
+                float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                int sections[4] = {0, 0, 0, 0};
+
+                memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
+                memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
+                memcpy(&ext_factor,  (const float *) tensor->op_params +  7, sizeof(float));
+                memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
+                memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
+                memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));
+                memcpy(&sections,                    tensor->op_params + 11, sizeof(sections));
+
+                struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
+                    ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
+                    ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
+            }
+            GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
+        } break;
+        case GGML_OP_IM2COL: {
+            if (src1_needs_grads) {
+                const int32_t s0    = ggml_get_op_params_i32(tensor, 0);
+                const int32_t s1    = ggml_get_op_params_i32(tensor, 1);
+                const int32_t p0    = ggml_get_op_params_i32(tensor, 2);
+                const int32_t p1    = ggml_get_op_params_i32(tensor, 3);
+                const int32_t d0    = ggml_get_op_params_i32(tensor, 4);
+                const int32_t d1    = ggml_get_op_params_i32(tensor, 5);
+                const bool    is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
+
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
+            }
+        } break;
+        case GGML_OP_POOL_2D: {
+            if (src0_needs_grads) {
+                const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, 0);
+                const      int32_t      k0 = ggml_get_op_params_i32(tensor, 1);
+                const      int32_t      k1 = ggml_get_op_params_i32(tensor, 2);
+                const      int32_t      s0 = ggml_get_op_params_i32(tensor, 3);
+                const      int32_t      s1 = ggml_get_op_params_i32(tensor, 4);
+                const      int32_t      p0 = ggml_get_op_params_i32(tensor, 5);
+                const      int32_t      p1 = ggml_get_op_params_i32(tensor, 6);
+
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_pool_2d_back(ctx, grad, src0, op, k0, k1, s0, s1, p0, p1));
+            }
+        } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
+        case GGML_OP_UNARY: {
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_ABS: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_sgn(ctx, src0), grad));
+                    }
+                } break;
+                case GGML_UNARY_OP_SGN: {
+                    // noop
+                } break;
+                case GGML_UNARY_OP_NEG: {
+                    if (src0_needs_grads) {
+                        ggml_sub_or_set(ctx, cgraph, isrc0, grad);
+                    }
+                } break;
+                case GGML_UNARY_OP_STEP: {
+                    // noop
+                } break;
+                case GGML_UNARY_OP_RELU: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_step(ctx, src0), grad));
+                    }
+                } break;
+                case GGML_UNARY_OP_SILU: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, grad, src0));
+                    }
+                } break;
+                case GGML_UNARY_OP_EXP: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad));
+                    }
+                } break;
+                default: {
+                    fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
+                        __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));
+                    GGML_ABORT("fatal error");
+                } //break;
+            }
+        } break;
+        case GGML_OP_CROSS_ENTROPY_LOSS: {
+            if (src0_needs_grads) {
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, grad, src0, src1));
+            }
+            GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
+        } break;
+        case GGML_OP_NONE: {
+            // noop
+        } break;
+        case GGML_OP_COUNT:
+        default: {
+            fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
+            GGML_ABORT("fatal error");
+        } //break;
+    }
+
+    GGML_ASSERT(!src0_needs_grads || ggml_are_same_shape(src0, cgraph->grads[isrc0]));
+    GGML_ASSERT(!src1_needs_grads || ggml_are_same_shape(src1, cgraph->grads[isrc1]));
+    GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
+}
+
+static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
+    // check if already visited
+    if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
+        return;
+    }
+
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        const int k =
+            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
+            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
+            /* unknown order, just fall back to using i*/ i;
+        if (node->src[k]) {
+            ggml_visit_parents(cgraph, node->src[k]);
+        }
+    }
+
+    if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) {
+        // reached a leaf node, not part of the gradient graph (e.g. a constant)
+        GGML_ASSERT(cgraph->n_leafs < cgraph->size);
+
+        if (strlen(node->name) == 0) {
+            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
+        }
+
+        cgraph->leafs[cgraph->n_leafs] = node;
+        cgraph->n_leafs++;
+    } else {
+        GGML_ASSERT(cgraph->n_nodes < cgraph->size);
+
+        if (strlen(node->name) == 0) {
+            ggml_format_name(node, "node_%d", cgraph->n_nodes);
+        }
+
+        cgraph->nodes[cgraph->n_nodes] = node;
+        cgraph->n_nodes++;
+    }
+}
+
+static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
+    if (!expand) {
+        // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
+        ggml_graph_clear(cgraph);
+    }
+
+    const int n0 = cgraph->n_nodes;
+
+    ggml_visit_parents(cgraph, tensor);
+
+    const int n_new = cgraph->n_nodes - n0;
+    GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
+
+    if (n_new > 0) {
+        // the last added node should always be starting point
+        GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
+    }
+}
+
+void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    ggml_build_forward_impl(cgraph, tensor, true);
+}
+
+void ggml_build_backward_expand(
+        struct ggml_context * ctx_static,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * cgraph,
+        bool                  accumulate) {
+    GGML_ASSERT(cgraph->n_nodes > 0);
+    GGML_ASSERT(cgraph->grads);
+    GGML_ASSERT(cgraph->grad_accs);
+
+    const int n_nodes_f = cgraph->n_nodes;
+
+    memset(cgraph->grads,     0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
+
+    {
+        bool any_params = false;
+        bool any_loss   = false;
+        for (int i = 0; i < n_nodes_f; ++i) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+            any_params = any_params || (node->flags & GGML_TENSOR_FLAG_PARAM);
+            any_loss   = any_loss   || (node->flags & GGML_TENSOR_FLAG_LOSS);
+        }
+        GGML_ASSERT(any_params && "no trainable parameters found, did you forget to call ggml_set_param?");
+        GGML_ASSERT(any_loss && "no training loss found, did you forget to call ggml_set_loss?");
+    }
+
+    for (int i = 0; i < n_nodes_f; ++i) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (node->type == GGML_TYPE_I32) {
+            continue;
+        }
+
+        bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
+        bool ignore_src[GGML_MAX_SRC] = {false};
+        switch (node->op) {
+            // gradients in node->src[0] for one reason or another have no effect on output gradients
+            case GGML_OP_IM2COL:      // only used for its shape
+            case GGML_OP_IM2COL_BACK: // same as IM2COL
+                ignore_src[0] = true;
+                break;
+            case GGML_OP_UNARY: {
+                const enum ggml_unary_op uop = ggml_get_unary_op(node);
+                // SGN and STEP unary ops are piecewise constant
+                if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) {
+                    ignore_src[0] = true;
+                }
+            } break;
+
+            // gradients in node->src[1] for one reason or another have no effect on output gradients
+            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
+            case GGML_OP_GET_ROWS:      // row indices not differentiable
+            case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
+            case GGML_OP_ROPE:          // positions not differentiable
+                ignore_src[1] = true;
+                break;
+
+            default:
+                break;
+        }
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            if (!node->src[j] || ignore_src[j] || !grads_needed[ggml_hash_find(&cgraph->visited_hash_set, node->src[j])]) {
+                continue;
+            }
+            GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16);
+            node_needs_grad = true;
+            break;
+        }
+        if (!node_needs_grad) {
+            continue;
+        }
+
+        // inplace operations are currently not supported
+        GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
+            node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
+
+        const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+        GGML_ASSERT(igrad != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
+        if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
+            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
+            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
+            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
+        }
+        grads_needed[igrad] = true;
+    }
+
+    for (int i = n_nodes_f - 1; i >= 0; --i) {
+        // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
+        // use allocator to automatically make inplace operations
+        ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
+    }
+
+    free(grads_needed);
+}
+
+static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
+    void * ptr = *p;
+    ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
+    *p = (void *) ((char *) ptr + size);
+    return ptr;
+}
+
+static size_t ggml_graph_nbytes(size_t size, bool grads) {
+    size_t hash_size = ggml_hash_size(size * 2);
+    void * p = 0;
+    incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
+    incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
+    incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
+    incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
+    if (grads) {
+        incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
+        incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grad_accs
+    }
+    incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
+
+    size_t nbytes = (size_t) p;
+    return nbytes;
+}
+
+size_t ggml_graph_overhead_custom(size_t size, bool grads) {
+    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
+}
+
+size_t ggml_graph_overhead(void) {
+    return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
+}
+
+struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
+    const size_t obj_size = ggml_graph_nbytes(size, grads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
+    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+    // the size of the hash table is doubled since it needs to hold both nodes and leafs
+    size_t hash_size = ggml_hash_size(size * 2);
+
+    void * p = cgraph + 1;
+
+    struct ggml_tensor ** nodes_ptr     =         incr_ptr_aligned(&p, size      * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** leafs_ptr     =         incr_ptr_aligned(&p, size      * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** hash_keys_ptr =         incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** grads_ptr     = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+    struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+
+    ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
+
+    // check that we allocated the correct amount of memory
+    assert(obj_size == (size_t)((char *)p - (char *)cgraph));
+
+    *cgraph = (struct ggml_cgraph) {
+        /*.size         =*/ size,
+        /*.n_nodes      =*/ 0,
+        /*.n_leafs      =*/ 0,
+        /*.nodes        =*/ nodes_ptr,
+        /*.grads        =*/ grads_ptr,
+        /*.grad_accs    =*/ grad_accs_ptr,
+        /*.leafs        =*/ leafs_ptr,
+        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
+    };
+
+    ggml_hash_set_reset(&cgraph->visited_hash_set);
+    if (grads) {
+        memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
+        memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
+    }
+
+    return cgraph;
+}
+
+struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+    return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
+}
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+    struct ggml_cgraph cgraph = {
+        /*.size             =*/ 0,
+        /*.n_nodes          =*/ i1 - i0,
+        /*.n_leafs          =*/ 0,
+        /*.nodes            =*/ cgraph0->nodes + i0,
+        /*.grads            =*/ NULL, // gradients would need visited_hash_set
+        /*.grad_accs        =*/ NULL,
+        /*.leafs            =*/ NULL,
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
+    };
+
+    return cgraph;
+}
+
+void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
+    GGML_ASSERT(dst->size >= src->n_leafs);
+    GGML_ASSERT(dst->size >= src->n_nodes);
+    GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
+
+    dst->n_leafs = src->n_leafs;
+    dst->n_nodes = src->n_nodes;
+    dst->order   = src->order;
+
+    for (int i = 0; i < src->n_leafs; ++i) {
+        dst->leafs[i] = src->leafs[i];
+    }
+
+    for (int i = 0; i < src->n_nodes; ++i) {
+        dst->nodes[i] = src->nodes[i];
+    }
+
+    for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
+        // copy all hashset keys (tensors) that are in use
+        if (ggml_bitset_get(src->visited_hash_set.used, i)) {
+            ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
+        }
+    }
+
+    if (dst->grads) {
+        memset(dst->grads,     0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
+    }
+    if (src->grads) {
+        GGML_ASSERT(dst->grads     != NULL);
+        GGML_ASSERT(dst->grad_accs != NULL);
+        for (int i = 0; i < src->n_nodes; ++i) {
+            const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+            const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+            dst->grads[igrad_dst]     = src->grads[igrad_src];
+            dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
+        }
+    }
+}
+
+struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
+    ggml_graph_cpy(cgraph, result);
+    return result;
+}
+
+struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
+    if (ggml_is_empty(tensor)) {
+        return tensor;
+    }
+    if (tensor->buffer) {
+        ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
+    } else {
+        GGML_ASSERT(tensor->data);
+        memset(tensor->data, 0, ggml_nbytes(tensor));
+    }
+    return tensor;
+}
+
+void ggml_graph_reset(struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(cgraph->grads != NULL);
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node     = cgraph->nodes[i];
+        struct ggml_tensor * grad_acc = ggml_graph_get_grad_acc(cgraph, node);
+
+        if (node->op == GGML_OP_OPT_STEP_ADAMW) {
+            // clear momenta
+            ggml_set_zero(node->src[2]);
+            ggml_set_zero(node->src[3]);
+        }
+
+        // initial gradients of loss should be 1, 0 otherwise
+        if (grad_acc) {
+            if (node->flags & GGML_TENSOR_FLAG_LOSS) {
+                GGML_ASSERT(grad_acc->type == GGML_TYPE_F32);
+                GGML_ASSERT(ggml_is_scalar(grad_acc));
+
+                const float onef = 1.0f;
+                if (grad_acc->buffer) {
+                    ggml_backend_tensor_set(grad_acc, &onef, 0, sizeof(float));
+                } else {
+                    GGML_ASSERT(grad_acc->data);
+                    *((float *) grad_acc->data) = onef;
+                }
+            } else {
+                ggml_set_zero(grad_acc);
+            }
+        }
+    }
+}
+
+void ggml_graph_clear(struct ggml_cgraph * cgraph) {
+    cgraph->n_leafs = 0;
+    cgraph->n_nodes = 0;
+    ggml_hash_set_reset(&cgraph->visited_hash_set);
+}
+
+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+    return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+    if (i < 0) {
+        GGML_ASSERT(cgraph->n_nodes + i >= 0);
+        return cgraph->nodes[cgraph->n_nodes + i];
+    }
+
+    GGML_ASSERT(i < cgraph->n_nodes);
+    return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+    cgraph->nodes[cgraph->n_nodes] = tensor;
+    cgraph->n_nodes++;
+}
+
+struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+    const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
+}
+
+struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+    const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
+    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
+}
+
+void ggml_graph_print(const struct ggml_cgraph * cgraph) {
+    GGML_LOG_INFO("=== GRAPH ===\n");
+
+    GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
+                i,
+                node->ne[0], node->ne[1], node->ne[2],
+                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" :
+                      ggml_graph_get_grad(cgraph, node) ? "g" : " ");
+    }
+
+    GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * node = cgraph->leafs[i];
+
+        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
+                i,
+                node->ne[0], node->ne[1],
+                ggml_op_name(node->op),
+                ggml_get_name(node));
+    }
+
+    GGML_LOG_INFO("========================================\n");
+}
+
+// check if node is part of the graph
+static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+    if (cgraph == NULL) {
+        return true;
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (cgraph->nodes[i] == node) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * parent = cgraph->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, parent);
+
+        if (grad == node) {
+            return parent;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
+void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
+    char color[16];
+
+    FILE * fp = ggml_fopen(filename, "w");
+    GGML_ASSERT(fp);
+
+    fprintf(fp, "digraph G {\n");
+    fprintf(fp, "  newrank = true;\n");
+    fprintf(fp, "  rankdir = TB;\n");
+
+    for (int i = 0; i < gb->n_nodes; i++) {
+        struct ggml_tensor * node = gb->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(gb, node);
+
+        if (ggml_graph_get_parent(gb, node) != NULL) {
+            continue;
+        }
+
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
+            snprintf(color, sizeof(color), "yellow");
+        } else if (grad) {
+            if (ggml_graph_find(gf, node)) {
+                snprintf(color, sizeof(color), "green");
+            } else {
+                snprintf(color, sizeof(color), "lightblue");
+            }
+        } else {
+            snprintf(color, sizeof(color), "white");
+        }
+
+        fprintf(fp, "  \"%p\" [ "
+                    "style = filled; fillcolor = %s; shape = record; "
+                    "label=\"",
+                (void *) node, color);
+
+        if (strlen(node->name) > 0) {
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
+        }
+
+        if (ggml_is_matrix(node)) {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
+        } else {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
+        }
+
+        if (grad) {
+            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(grad->op));
+        } else {
+            fprintf(fp, "\"; ]\n");
+        }
+    }
+
+    for (int i = 0; i < gb->n_leafs; i++) {
+        struct ggml_tensor * node = gb->leafs[i];
+
+        snprintf(color, sizeof(color), "pink");
+
+        fprintf(fp, "  \"%p\" [ "
+                    "style = filled; fillcolor = %s; shape = record; "
+                    "label=\"<x>",
+                (void *) node, color);
+
+        if (strlen(node->name) > 0) {
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
+        }
+
+        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        if (ggml_nelements(node) < 5 && node->data != NULL) {
+            fprintf(fp, " | (");
+            for (int j = 0; j < ggml_nelements(node); j++) {
+                // FIXME: use ggml-backend to obtain the tensor data
+                //if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                //    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+                //}
+                //else if (node->type == GGML_TYPE_F32 ||
+                //         node->type == GGML_TYPE_F16 ||
+                //         node->type == GGML_TYPE_BF16) {
+                //    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+                //}
+                //else
+                {
+                    fprintf(fp, "#");
+                }
+                if (j < ggml_nelements(node) - 1) {
+                    fprintf(fp, ", ");
+                }
+            }
+            fprintf(fp, ")");
+        }
+        fprintf(fp, "\"; ]\n");
+    }
+
+    for (int i = 0; i < gb->n_nodes; i++) {
+        struct ggml_tensor * node = gb->nodes[i];
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "src %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
+            }
+        }
+    }
+
+    for (int i = 0; i < gb->n_leafs; i++) {
+        struct ggml_tensor * node = gb->leafs[i];
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "src %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
+            }
+        }
+    }
+
+    fprintf(fp, "}\n");
+
+    fclose(fp);
+
+    GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ggml_set_input(struct ggml_tensor * tensor) {
+    tensor->flags |= GGML_TENSOR_FLAG_INPUT;
+}
+
+void ggml_set_output(struct ggml_tensor * tensor) {
+    tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
+}
+
+void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    GGML_UNUSED(ctx); // TODO: remove this parameter
+    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
+}
+
+void ggml_set_loss(struct ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_is_scalar(tensor));
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
+    tensor->flags |= GGML_TENSOR_FLAG_LOSS;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ggml_quantize_init(enum ggml_type type) {
+    ggml_critical_section_start();
+
+    switch (type) {
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:   iq2xs_init_impl(type); break;
+        case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
+        case GGML_TYPE_IQ3_S:   iq3xs_init_impl(512); break;
+        default: // nothing
+            break;
+    }
+
+    ggml_critical_section_end();
+}
+
+void ggml_quantize_free(void) {
+    ggml_critical_section_start();
+
+    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
+    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
+    iq2xs_free_impl(GGML_TYPE_IQ1_S);
+    iq3xs_free_impl(256);
+
+    ggml_critical_section_end();
+}
+
+bool ggml_quantize_requires_imatrix(enum ggml_type type) {
+    return
+        type == GGML_TYPE_IQ2_XXS ||
+        type == GGML_TYPE_IQ2_XS  ||
+        type == GGML_TYPE_IQ1_S;//   ||
+        //type == GGML_TYPE_IQ1_M;
+}
+
+size_t ggml_quantize_chunk(
+        enum ggml_type   type,
+           const float * src,
+                  void * dst,
+               int64_t   start,
+               int64_t   nrows,
+               int64_t   n_per_row,
+           const float * imatrix) {
+    const int64_t n = (int64_t) nrows * n_per_row;
+
+    if (ggml_quantize_requires_imatrix(type)) {
+        GGML_ASSERT(imatrix != NULL);
+    }
+
+    GGML_ASSERT(start % type_traits[type].blck_size == 0);
+    GGML_ASSERT(start % n_per_row == 0);
+
+    ggml_quantize_init(type); // this is noop if already initialized
+
+    const size_t start_row = start / n_per_row;
+    const size_t row_size  = ggml_row_size(type, n_per_row);
+
+    size_t result = 0;
+
+    switch (type) {
+        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ1_0:   result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ3_S:   result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_S:   result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_F16:
+            {
+                size_t elemsize = sizeof(ggml_fp16_t);
+                ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
+                result = n * elemsize;
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                size_t elemsize = sizeof(ggml_bf16_t);
+                ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
+                result = n * elemsize;
+            } break;
+        case GGML_TYPE_F32:
+            {
+                size_t elemsize = sizeof(float);
+                result = n * elemsize;
+                memcpy((uint8_t *)dst + start * elemsize, src + start, result);
+            } break;
+        default:
+            assert(false);
+    }
+
+    GGML_ASSERT(result == nrows * row_size);
+
+    return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
+    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+    if (p0->n_threads      != p1->n_threads  )    return false;
+    if (p0->prio           != p1->prio       )    return false;
+    if (p0->poll           != p1->poll       )    return false;
+    if (p0->strict_cpu     != p1->strict_cpu )    return false;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
new file mode 100644
index 000000000..ab13669c5
--- /dev/null
+++ b/ggml/src/gguf.cpp
@@ -0,0 +1,1329 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "gguf.h"
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+template <typename T>
+struct type_to_gguf_type;
+
+template <>
+struct type_to_gguf_type<uint8_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT8;
+};
+
+template <>
+struct type_to_gguf_type<int8_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT8;
+};
+
+template <>
+struct type_to_gguf_type<uint16_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT16;
+};
+
+template <>
+struct type_to_gguf_type<int16_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT16;
+};
+
+template <>
+struct type_to_gguf_type<uint32_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT32;
+};
+
+template <>
+struct type_to_gguf_type<int32_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT32;
+};
+
+template <>
+struct type_to_gguf_type<float> {
+    static constexpr enum gguf_type value = GGUF_TYPE_FLOAT32;
+};
+
+template <>
+struct type_to_gguf_type<bool> {
+    static constexpr enum gguf_type value = GGUF_TYPE_BOOL;
+};
+
+template <>
+struct type_to_gguf_type<std::string> {
+    static constexpr enum gguf_type value = GGUF_TYPE_STRING;
+};
+
+template <>
+struct type_to_gguf_type<uint64_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_UINT64;
+};
+
+template <>
+struct type_to_gguf_type<int64_t> {
+    static constexpr enum gguf_type value = GGUF_TYPE_INT64;
+};
+
+template <>
+struct type_to_gguf_type<double> {
+    static constexpr enum gguf_type value = GGUF_TYPE_FLOAT64;
+};
+
+static const std::map<gguf_type, size_t> GGUF_TYPE_SIZE = {
+    {GGUF_TYPE_UINT8,   sizeof(uint8_t)},
+    {GGUF_TYPE_INT8,    sizeof(int8_t)},
+    {GGUF_TYPE_UINT16,  sizeof(uint16_t)},
+    {GGUF_TYPE_INT16,   sizeof(int16_t)},
+    {GGUF_TYPE_UINT32,  sizeof(uint32_t)},
+    {GGUF_TYPE_INT32,   sizeof(int32_t)},
+    {GGUF_TYPE_FLOAT32, sizeof(float)},
+    {GGUF_TYPE_BOOL,    sizeof(int8_t)},
+    {GGUF_TYPE_STRING,  0}, // undefined
+    {GGUF_TYPE_ARRAY,   0}, // undefined
+    {GGUF_TYPE_UINT64,  sizeof(uint64_t)},
+    {GGUF_TYPE_INT64,   sizeof(int64_t)},
+    {GGUF_TYPE_FLOAT64, sizeof(double)},
+};
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
+
+static const std::map<gguf_type, const char *> GGUF_TYPE_NAME = {
+    {GGUF_TYPE_UINT8,   "u8"},
+    {GGUF_TYPE_INT8,    "i8"},
+    {GGUF_TYPE_UINT16,  "u16"},
+    {GGUF_TYPE_INT16,   "i16"},
+    {GGUF_TYPE_UINT32,  "u32"},
+    {GGUF_TYPE_INT32,   "i32"},
+    {GGUF_TYPE_FLOAT32, "f32"},
+    {GGUF_TYPE_BOOL,    "bool"},
+    {GGUF_TYPE_STRING,  "str"},
+    {GGUF_TYPE_ARRAY,   "arr"},
+    {GGUF_TYPE_UINT64,  "u64"},
+    {GGUF_TYPE_INT64,   "i64"},
+    {GGUF_TYPE_FLOAT64, "f64"},
+};
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
+
+size_t gguf_type_size(enum gguf_type type) {
+    auto it = GGUF_TYPE_SIZE.find(type);
+    return it == GGUF_TYPE_SIZE.end() ? 0 : it->second;
+}
+
+struct gguf_kv {
+    std::string key;
+
+    bool is_array;
+    enum gguf_type type;
+
+    std::vector<int8_t>      data;
+    std::vector<std::string> data_string;
+
+    template <typename T>
+    gguf_kv(const std::string & key, const T value)
+            : key(key), is_array(false), type(type_to_gguf_type<T>::value) {
+        GGML_ASSERT(!key.empty());
+        data.resize(sizeof(T));
+        memcpy(data.data(), &value, sizeof(T));
+    }
+
+    template <typename T>
+    gguf_kv(const std::string & key, const std::vector<T> & value)
+            : key(key), is_array(true), type(type_to_gguf_type<T>::value) {
+        GGML_ASSERT(!key.empty());
+        data.resize(value.size()*sizeof(T));
+        for (size_t i = 0; i < value.size(); ++i) {
+            const T tmp = value[i];
+            memcpy(data.data() + i*sizeof(T), &tmp, sizeof(T));
+        }
+    }
+
+    gguf_kv(const std::string & key, const std::string & value)
+            : key(key), is_array(false), type(GGUF_TYPE_STRING) {
+        GGML_ASSERT(!key.empty());
+        data_string.push_back(value);
+    }
+
+    gguf_kv(const std::string & key, const std::vector<std::string> & value)
+            : key(key), is_array(true), type(GGUF_TYPE_STRING) {
+        GGML_ASSERT(!key.empty());
+        data_string = value;
+    }
+
+    const std::string & get_key() const {
+        return key;
+    }
+
+    const enum gguf_type & get_type() const {
+        return type;
+    }
+
+    size_t get_ne() const {
+        if (type == GGUF_TYPE_STRING) {
+            const size_t ne = data_string.size();
+            GGML_ASSERT(is_array || ne == 1);
+            return ne;
+        }
+        const size_t type_size = gguf_type_size(type);
+        GGML_ASSERT(data.size() % type_size == 0);
+        const size_t ne = data.size() / type_size;
+        GGML_ASSERT(is_array || ne == 1);
+        return ne;
+    }
+
+    template <typename T>
+    const T & get_val(const size_t i = 0) const {
+        GGML_ASSERT(type_to_gguf_type<T>::value == type);
+        if constexpr (std::is_same<T, std::string>::value) {
+            GGML_ASSERT(data_string.size() >= i+1);
+            return data_string[i];
+        }
+        const size_t type_size = gguf_type_size(type);
+        GGML_ASSERT(data.size() % type_size == 0);
+        GGML_ASSERT(data.size() >= (i+1)*type_size);
+        return reinterpret_cast<const T *>(data.data())[i];
+    }
+
+    void cast(const enum gguf_type new_type) {
+        const size_t new_type_size = gguf_type_size(new_type);
+        GGML_ASSERT(data.size() % new_type_size == 0);
+        type = new_type;
+    }
+};
+
+struct gguf_tensor_info {
+    struct ggml_tensor t; // for holding the equivalent info
+    uint64_t offset;      // offset from start of `data`, must be a multiple of `ALIGNMENT`
+};
+
+struct gguf_context {
+    uint32_t version = GGUF_VERSION;
+
+    std::vector<struct gguf_kv> kv;
+    std::vector<struct gguf_tensor_info> info;
+
+    size_t alignment = GGUF_DEFAULT_ALIGNMENT;
+    size_t offset    = 0; // offset of `data` from beginning of file
+    size_t size      = 0; // size of `data` in bytes
+
+    void * data = nullptr;
+};
+
+struct gguf_reader {
+    FILE * file;
+
+    gguf_reader(FILE * file) : file(file) {}
+
+    template <typename T>
+    bool read(T & dst) const {
+        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+    }
+
+    template <typename T>
+    bool read(std::vector<T> & dst, const size_t n) const {
+        dst.resize(n);
+        for (size_t i = 0; i < dst.size(); ++i) {
+            if constexpr (std::is_same<T, bool>::value) {
+                bool tmp;
+                if (!read(tmp)) {
+                    return false;
+                }
+                dst[i] = tmp;
+            } else {
+                if (!read(dst[i])) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    bool read(bool & dst) const {
+        int8_t tmp = -1;
+        if (!read(tmp)) {
+            return false;
+        }
+        dst = tmp != 0;
+        return true;
+    }
+
+    bool read(enum ggml_type & dst) const {
+        int32_t tmp = -1;
+        if (!read(tmp)) {
+            return false;
+        }
+        dst = ggml_type(tmp);
+        return true;
+    }
+
+    bool read(enum gguf_type & dst) const {
+        int32_t tmp = -1;
+        if (!read(tmp)) {
+            return false;
+        }
+        dst = gguf_type(tmp);
+        return true;
+    }
+
+    bool read(std::string & dst) const {
+        uint64_t size = -1;
+        if (!read(size)) {
+            return false;
+        }
+        dst.resize(size);
+        return fread(dst.data(), 1, dst.length(), file) == dst.length();
+    }
+
+    bool read(void * dst, const size_t size) const {
+        return fread(dst, 1, size, file) == size;
+    }
+};
+
+struct gguf_context * gguf_init_empty(void) {
+    return new gguf_context;
+}
+
+template<typename T>
+bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct gguf_kv> & kv, const std::string & key, const bool is_array, const size_t n) {
+    if (is_array) {
+        std::vector<T> value;
+        try {
+            if (!gr.read(value, n)) {
+                return false;
+            }
+        } catch (std::length_error &) {
+            fprintf(stderr, "%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
+            return false;
+        } catch (std::bad_alloc &) {
+            fprintf(stderr, "%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
+            return false;
+        }
+        kv.emplace_back(key, value);
+    } else {
+        T value;
+        if (!gr.read(value)) {
+            return false;
+        }
+        kv.emplace_back(key, value);
+    }
+    return true;
+}
+
+struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
+    const struct gguf_reader gr(file);
+    struct gguf_context * ctx = new gguf_context;
+
+    bool ok = true;
+
+    // file magic
+    {
+        std::vector<char> magic;
+        ok = ok && gr.read(magic, 4);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read magic\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        for (uint32_t i = 0; i < magic.size(); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+                gguf_free(ctx);
+                return nullptr;
+            }
+        }
+    }
+
+    // header
+    int64_t n_kv      = 0;
+    int64_t n_tensors = 0;
+
+    if (ok && gr.read(ctx->version)) {
+        if (ctx->version == 1) {
+            fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
+            ok = false;
+        }
+        if (ctx->version > GGUF_VERSION) {
+            fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
+                __func__, ctx->version, GGUF_VERSION);
+            ok = false;
+        }
+    } else {
+        ok = false;
+    }
+
+    if (ok && gr.read(n_tensors)) {
+        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
+        if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
+            fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
+                __func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
+            ok = false;
+        }
+    } else {
+        ok = false;
+    }
+
+    if (ok && gr.read(n_kv)) {
+        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
+        if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
+            fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
+                    __func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
+            ok = false;
+        }
+    } else {
+        ok = false;
+    }
+
+    if (!ok) {
+        fprintf(stderr, "%s: failed to read header\n", __func__);
+        gguf_free(ctx);
+        return nullptr;
+    }
+
+    // KV pairs
+    {
+        for (int64_t i = 0; ok && i < n_kv; ++i) {
+            std::string key;
+            gguf_type   type     = gguf_type(-1);
+            bool        is_array = false;
+            uint64_t    n        = 1;
+
+            try {
+                ok = ok && gr.read(key);
+            } catch (std::length_error &) {
+                fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
+                ok = false;
+            } catch (std::bad_alloc &) {
+                fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
+                ok = false;
+            }
+            for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
+                if (key == ctx->kv[j].key) {
+                    fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
+                    ok = false;
+                }
+            }
+            if (!ok) {
+                break;
+            }
+
+            ok = ok && gr.read(type);
+            if (type == GGUF_TYPE_ARRAY) {
+                is_array = true;
+                ok = ok && gr.read(type);
+                ok = ok && gr.read(n);
+            }
+            if (!ok) {
+                break;
+            }
+
+            switch (type) {
+                case GGUF_TYPE_UINT8:   ok = ok && gguf_read_emplace_helper<uint8_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT8:    ok = ok && gguf_read_emplace_helper<int8_t>     (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_UINT16:  ok = ok && gguf_read_emplace_helper<uint16_t>   (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT16:   ok = ok && gguf_read_emplace_helper<int16_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_UINT32:  ok = ok && gguf_read_emplace_helper<uint32_t>   (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT32:   ok = ok && gguf_read_emplace_helper<int32_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_FLOAT32: ok = ok && gguf_read_emplace_helper<float>      (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_BOOL:    ok = ok && gguf_read_emplace_helper<bool>       (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_STRING:  ok = ok && gguf_read_emplace_helper<std::string>(gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_UINT64:  ok = ok && gguf_read_emplace_helper<uint64_t>   (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_INT64:   ok = ok && gguf_read_emplace_helper<int64_t>    (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_FLOAT64: ok = ok && gguf_read_emplace_helper<double>     (gr, ctx->kv, key, is_array, n); break;
+                case GGUF_TYPE_ARRAY:
+                default:
+                    {
+                        fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
+                        ok = false;
+                    } break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+        GGML_ASSERT(int64_t(ctx->kv.size()) == n_kv);
+
+        const int alignment_idx = gguf_find_key(ctx, GGUF_KEY_GENERAL_ALIGNMENT);
+        ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);
+
+        if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
+            fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
+            gguf_free(ctx);
+            return nullptr;
+        }
+    }
+
+    // read the tensor info
+    for (int64_t i = 0; ok && i < n_tensors; ++i) {
+        struct gguf_tensor_info info;
+
+        // tensor name
+        {
+            std::string name;
+            try {
+                ok = ok && gr.read(name);
+            } catch (std::length_error &) {
+                fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
+                ok = false;
+            } catch (std::bad_alloc &) {
+                fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
+                ok = false;
+            }
+            if (name.length() >= GGML_MAX_NAME) {
+                fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
+                ok = false;
+                break;
+            }
+            ggml_set_name(&info.t, name.c_str());
+
+            // make sure there are no duplicate tensor names
+            for (int64_t j = 0; ok && j < i; ++j) {
+                if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
+                    fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
+                    ok = false;
+                    break;
+                }
+            }
+        }
+        if (!ok) {
+            break;
+        }
+
+        // tensor shape
+        {
+            uint32_t n_dims = -1;
+            ok = ok && gr.read(n_dims);
+            if (n_dims > GGML_MAX_DIMS) {
+                fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
+                    __func__, info.t.name, n_dims, GGML_MAX_DIMS);
+                ok = false;
+                break;
+            }
+            for (uint32_t j = 0; ok && j < GGML_MAX_DIMS; ++j) {
+                info.t.ne[j] = 1;
+                if (j < n_dims) {
+                    ok = ok && gr.read(info.t.ne[j]);
+                }
+
+                // check that all ne are non-negative
+                if (info.t.ne[j] < 0) {
+                    fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
+                        __func__, info.t.name, j, info.t.ne[j]);
+                    ok = false;
+                    break;
+                }
+            }
+
+            // check that the total number of elements is representable
+            if (ok && ((INT64_MAX/info.t.ne[1] <= info.t.ne[0]) ||
+                       (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
+                       (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {
+
+                fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
+                    "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
+                    __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
+                ok = false;
+                break;
+            }
+        }
+        if (!ok) {
+            break;
+        }
+
+        // tensor type
+        {
+            ok = ok && gr.read(info.t.type);
+
+            // check that tensor type is within defined range
+            if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
+                fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
+                    __func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
+                ok = false;
+                break;
+            }
+            const size_t  type_size = ggml_type_size(info.t.type);
+            const int64_t blck_size = ggml_blck_size(info.t.type);
+
+            // check that row size is divisible by block size
+            if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
+                fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
+                    "not a multiple of block size (%" PRId64 ")\n",
+                    __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
+                ok = false;
+                break;
+            }
+
+            // calculate byte offsets given the tensor shape and type
+            info.t.nb[0] = type_size;
+            info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
+            for (int j = 2; j < GGML_MAX_DIMS; ++j) {
+                info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
+            }
+        }
+        if (!ok) {
+            break;
+        }
+
+        // tensor data offset within buffer
+        ok = ok && gr.read(info.offset);
+
+        ctx->info.push_back(info);
+    }
+
+    if (!ok) {
+        fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+        gguf_free(ctx);
+        return nullptr;
+    }
+    GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
+
+    // we require the data section to be aligned, so take into account any padding
+    if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
+        fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
+        gguf_free(ctx);
+        return nullptr;
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = ftell(file);
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (size_t i = 0; i < ctx->info.size(); ++i) {
+            const gguf_tensor_info & ti = ctx->info[i];
+            if (ti.offset != ctx->size) {
+                fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
+                    __func__, ti.t.name, ti.offset, ctx->size);
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                gguf_free(ctx);
+                return nullptr;
+            }
+            ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != nullptr) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        //   the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (n_tensors    )*ggml_tensor_overhead() :
+            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            /*mem_size   =*/ mem_size,
+            /*mem_buffer =*/ nullptr,
+            /*no_alloc   =*/ params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+        if (*params.ctx == nullptr) {
+            fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        struct ggml_tensor * data = nullptr;
+
+        if (!params.no_alloc) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != nullptr;
+
+            if (ok) {
+                ggml_set_name(data, "GGUF tensor data binary blob");
+            }
+
+            // read the binary blob with the tensor data
+            ok = ok && gr.read(data->data, ctx->size);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
+                ggml_free(ctx_data);
+                *params.ctx = nullptr;
+                gguf_free(ctx);
+                return nullptr;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (size_t i = 0; i < ctx->info.size(); ++i) {
+            const struct gguf_tensor_info & info = ctx->info[i];
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, info.t.type, GGML_MAX_DIMS, info.t.ne);
+
+            ok = ok && cur != nullptr;
+
+            if (!ok) {
+                break;
+            }
+
+            ggml_set_name(cur, info.t.name);
+
+            // point the data member to the appropriate location in the binary blob using the tensor info
+            if (!params.no_alloc) {
+                cur->data = (char *) data->data + info.offset;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to create tensors\n", __func__);
+            ggml_free(ctx_data);
+            *params.ctx = nullptr;
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    return ctx;
+}
+
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = ggml_fopen(fname, "rb");
+
+    if (!file) {
+        fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
+        return nullptr;
+    }
+
+    struct gguf_context * result = gguf_init_from_file_impl(file, params);
+    fclose(file);
+    return result;
+}
+
+void gguf_free(struct gguf_context * ctx) {
+    if (ctx == nullptr) {
+        return;
+    }
+    delete ctx;
+}
+
+const char * gguf_type_name(enum gguf_type type) {
+    auto it = GGUF_TYPE_NAME.find(type);
+    return it == GGUF_TYPE_NAME.end() ? nullptr : it->second;
+}
+
+uint32_t gguf_get_version(const struct gguf_context * ctx) {
+    return ctx->version;
+}
+
+size_t gguf_get_alignment(const struct gguf_context * ctx) {
+    return ctx->alignment;
+}
+
+size_t gguf_get_data_offset(const struct gguf_context * ctx) {
+    return ctx->offset;
+}
+
+int64_t gguf_get_n_kv(const struct gguf_context * ctx) {
+    return ctx->kv.size();
+}
+
+int64_t gguf_find_key(const struct gguf_context * ctx, const char * key) {
+    // return -1 if key not found
+    int64_t keyfound = -1;
+
+    const int64_t n_kv = gguf_get_n_kv(ctx);
+
+    for (int64_t i = 0; i < n_kv; ++i) {
+        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
+            keyfound = i;
+            break;
+        }
+    }
+
+    return keyfound;
+}
+
+const char * gguf_get_key(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    return ctx->kv[key_id].get_key().c_str();
+}
+
+enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    return ctx->kv[key_id].is_array ? GGUF_TYPE_ARRAY : ctx->kv[key_id].get_type();
+}
+
+enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].is_array);
+    return ctx->kv[key_id].get_type();
+}
+
+const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
+    return ctx->kv[key_id].data.data();
+}
+
+const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
+    return ctx->kv[key_id].data_string[i].c_str();
+}
+
+size_t gguf_get_arr_n(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+
+    if (ctx->kv[key_id].type == GGUF_TYPE_STRING) {
+        return ctx->kv[key_id].data_string.size();
+    }
+
+    const size_t type_size = gguf_type_size(ctx->kv[key_id].type);
+    GGML_ASSERT(ctx->kv[key_id].data.size() % type_size == 0);
+    return ctx->kv[key_id].data.size() / type_size;
+}
+
+uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint8_t>();
+}
+
+int8_t gguf_get_val_i8(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int8_t>();
+}
+
+uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint16_t>();
+}
+
+int16_t gguf_get_val_i16(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int16_t>();
+}
+
+uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint32_t>();
+}
+
+int32_t gguf_get_val_i32(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int32_t>();
+}
+
+float gguf_get_val_f32(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<float>();
+}
+
+uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<uint64_t>();
+}
+
+int64_t gguf_get_val_i64(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<int64_t>();
+}
+
+double gguf_get_val_f64(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<double>();
+}
+
+bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<bool>();
+}
+
+const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    return ctx->kv[key_id].get_val<std::string>().c_str();
+}
+
+const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
+    GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
+    return ctx->kv[key_id].data.data();
+}
+
+int64_t gguf_get_n_tensors(const struct gguf_context * ctx) {
+    return ctx->info.size();
+}
+
+int64_t gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int64_t tensor_id = -1;
+
+    const int64_t n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensor_id = i;
+            break;
+        }
+    }
+
+    return tensor_id;
+}
+
+size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ctx->info[tensor_id].offset;
+}
+
+const char * gguf_get_tensor_name(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ctx->info[tensor_id].t.name;
+}
+
+enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ctx->info[tensor_id].t.type;
+}
+
+size_t gguf_get_tensor_size(const struct gguf_context * ctx, int64_t tensor_id) {
+    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
+    return ggml_nbytes(&ctx->info[tensor_id].t);
+}
+
+int64_t gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int64_t key_id = gguf_find_key(ctx, key);
+    if (key_id >= 0) {
+        ctx->kv.erase(ctx->kv.begin() + key_id);
+    }
+    return key_id;
+}
+
+template<typename T>
+static void gguf_check_reserved_keys(const std::string & key, const T val) {
+    if (key == GGUF_KEY_GENERAL_ALIGNMENT) {
+        if constexpr (std::is_same<T, uint32_t>::value) {
+            GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
+        } else {
+            GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
+        }
+    }
+}
+
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, val);
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    gguf_check_reserved_keys(key, val);
+    gguf_remove_key(ctx, key);
+    ctx->kv.emplace_back(key, std::string(val));
+}
+
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n) {
+    gguf_check_reserved_keys(key, data);
+    gguf_remove_key(ctx, key);
+
+    const size_t nbytes = n*gguf_type_size(type);
+    std::vector<int8_t> tmp(nbytes);
+    if (!tmp.empty()) {
+        memcpy(tmp.data(), data, nbytes);
+    }
+    ctx->kv.emplace_back(key, tmp);
+    ctx->kv.back().cast(type);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, size_t n) {
+    gguf_check_reserved_keys(key, data);
+    gguf_remove_key(ctx, key);
+
+    std::vector<std::string> tmp(n);
+    for (size_t i = 0; i < n; ++i) {
+        tmp[i] = data[i];
+    }
+    ctx->kv.emplace_back(key, tmp);
+}
+
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src) {
+    const int64_t n_kv = gguf_get_n_kv(src);
+    for (int64_t i = 0; i < n_kv; ++i) {
+        const struct gguf_kv & kv = src->kv[i];
+
+        if (!kv.is_array) {
+            switch (kv.get_type()) {
+                case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, kv.get_key().c_str(), kv.get_val<uint8_t>());             break;
+                case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, kv.get_key().c_str(), kv.get_val<int8_t>());              break;
+                case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, kv.get_key().c_str(), kv.get_val<uint16_t>());            break;
+                case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, kv.get_key().c_str(), kv.get_val<int16_t>());             break;
+                case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, kv.get_key().c_str(), kv.get_val<uint32_t>());            break;
+                case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, kv.get_key().c_str(), kv.get_val<int32_t>());             break;
+                case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, kv.get_key().c_str(), kv.get_val<float>());               break;
+                case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, kv.get_key().c_str(), kv.get_val<uint64_t>());            break;
+                case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, kv.get_key().c_str(), kv.get_val<int64_t>());             break;
+                case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, kv.get_key().c_str(), kv.get_val<double>());              break;
+                case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, kv.get_key().c_str(), kv.get_val<bool>());                break;
+                case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, kv.get_key().c_str(), kv.get_val<std::string>().c_str()); break;
+                case GGUF_TYPE_ARRAY:
+                default: GGML_ABORT("invalid type");
+            }
+            continue;
+        }
+
+        const size_t ne = kv.get_ne();
+
+        switch (kv.get_type()) {
+            case GGUF_TYPE_UINT8:
+            case GGUF_TYPE_INT8:
+            case GGUF_TYPE_UINT16:
+            case GGUF_TYPE_INT16:
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:
+            case GGUF_TYPE_FLOAT32:
+            case GGUF_TYPE_UINT64:
+            case GGUF_TYPE_INT64:
+            case GGUF_TYPE_FLOAT64:
+            case GGUF_TYPE_BOOL: {
+                gguf_set_arr_data(ctx, kv.get_key().c_str(), kv.get_type(), kv.data.data(), ne);
+            } break;
+            case GGUF_TYPE_STRING: {
+                std::vector<const char *> tmp(ne);
+                for (size_t j = 0; j < ne; ++j) {
+                    tmp[j] = kv.data_string[j].c_str();
+                }
+                gguf_set_arr_str(ctx, kv.get_key().c_str(), tmp.data(), ne);
+            } break;
+            case GGUF_TYPE_ARRAY:
+            default: GGML_ABORT("invalid type");
+        }
+    }
+}
+
+void gguf_add_tensor(
+             struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ABORT("duplicate tensor name: %s", tensor->name);
+    }
+
+    struct gguf_tensor_info ti;
+    ti.t = *tensor;
+    ti.offset = ctx->info.empty() ? 0 :
+        ctx->info.back().offset + GGML_PAD(ggml_nbytes(&ctx->info.back().t), ctx->alignment);
+    ctx->info.push_back(ti);
+}
+
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int64_t tensor_id = gguf_find_tensor(ctx, name);
+    if (tensor_id < 0) {
+        GGML_ABORT("tensor not found: %s", name);
+    }
+    struct ggml_tensor * tensor = &ctx->info[tensor_id].t;
+    const size_t  type_size = ggml_type_size(type);
+    const int64_t blck_size = ggml_blck_size(type);
+
+    tensor->type = type;
+    GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
+
+    tensor->nb[0] = type_size;
+    tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+        tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
+    }
+
+    // update offsets
+    const int64_t n_tensors = gguf_get_n_tensors(ctx);
+    for (int64_t i = tensor_id + 1; i < n_tensors; ++i) {
+        ctx->info[i].offset = ctx->info[i - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[i - 1].t), ctx->alignment);
+    }
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data) {
+    const int64_t tensor_id = gguf_find_tensor(ctx, name);
+    if (tensor_id < 0) {
+        GGML_ABORT("tensor not found: %s", name);
+    }
+
+    ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
+}
+
+struct gguf_writer {
+    std::vector<int8_t> & buf;
+
+    gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}
+
+    template <typename T>
+    void write(const T & val) const {
+        for (size_t i = 0; i < sizeof(val); ++i) {
+            buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
+        }
+    }
+
+    void write(const std::vector<int8_t> & val) const {
+        buf.insert(buf.end(), val.begin(), val.end());
+    }
+
+    void write(const bool & val) const {
+        const int8_t val8 = val ? 1 : 0;
+        write(val8);
+    }
+
+    void write(const std::string & val) const {
+        {
+            const uint64_t n = val.length();
+            write(n);
+        }
+        for (size_t i = 0; i < val.length(); ++i) {
+            buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
+        }
+    }
+
+    void write(const char * val) const {
+        write(std::string(val));
+    }
+
+    void write(const enum ggml_type & val) const {
+        write(int32_t(val));
+    }
+
+    void write(const enum gguf_type & val) const {
+        write(int32_t(val));
+    }
+
+    void write(const struct gguf_kv & kv) const {
+        const uint64_t ne = kv.get_ne();
+
+        write(kv.get_key());
+
+        if (kv.is_array) {
+            write(GGUF_TYPE_ARRAY);
+            write(kv.get_type());
+            write(ne);
+        } else {
+            write(kv.get_type());
+        }
+
+        switch (kv.get_type()) {
+            case GGUF_TYPE_UINT8:
+            case GGUF_TYPE_INT8:
+            case GGUF_TYPE_UINT16:
+            case GGUF_TYPE_INT16:
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:
+            case GGUF_TYPE_FLOAT32:
+            case GGUF_TYPE_UINT64:
+            case GGUF_TYPE_INT64:
+            case GGUF_TYPE_FLOAT64: {
+                write(kv.data);
+            } break;
+            case GGUF_TYPE_BOOL: {
+                for (size_t i = 0; i < ne; ++i) {
+                    write(kv.get_val<bool>(i));
+                }
+            } break;
+            case GGUF_TYPE_STRING: {
+                for (size_t i = 0; i < ne; ++i) {
+                    write(kv.get_val<std::string>(i));
+                }
+            } break;
+            case GGUF_TYPE_ARRAY:
+            default: GGML_ABORT("invalid type");
+        }
+    }
+
+    void write_tensor_meta(const struct gguf_tensor_info & info) const {
+        write(info.t.name);
+
+        const uint32_t n_dims = ggml_n_dims(&info.t);
+        write(n_dims);
+
+        for (uint32_t j = 0; j < n_dims; ++j) {
+            write(info.t.ne[j]);
+        }
+        write(info.t.type);
+        write(info.offset);
+    }
+
+    void pad(const size_t alignment) const {
+        while (buf.size() % alignment != 0) {
+            const int8_t zero = 0;
+            write(zero);
+        }
+    }
+
+    void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) const {
+        GGML_ASSERT(buf.size() - offset_data == info.offset);
+
+        GGML_ASSERT(ggml_is_contiguous(&info.t));
+        const size_t offset = buf.size();
+        const size_t nbytes = ggml_nbytes(&info.t);
+
+        buf.resize(offset + nbytes);
+        if (info.t.buffer) {
+            ggml_backend_tensor_get(&info.t, buf.data() + offset, 0, nbytes);
+        } else {
+            GGML_ASSERT(info.t.data);
+            memcpy(buf.data() + offset, info.t.data, nbytes);
+        }
+
+        pad(alignment);
+    }
+};
+
+void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
+    const struct gguf_writer gw(buf);
+
+    const int64_t n_kv      = gguf_get_n_kv(ctx);
+    const int64_t n_tensors = gguf_get_n_tensors(ctx);
+
+    // write header
+    gw.write(GGUF_MAGIC[0]);
+    gw.write(GGUF_MAGIC[1]);
+    gw.write(GGUF_MAGIC[2]);
+    gw.write(GGUF_MAGIC[3]);
+    gw.write(ctx->version);
+    gw.write(n_tensors);
+    gw.write(n_kv);
+
+    // write key-value pairs
+    for (int64_t i = 0; i < n_kv; ++i) {
+        gw.write(ctx->kv[i]);
+    }
+
+    // write tensor info
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        gw.write_tensor_meta(ctx->info[i]);
+    }
+
+    // we require the data section to be aligned
+    gw.pad(ctx->alignment);
+
+    if (only_meta) {
+        return;
+    }
+
+    const size_t offset_data = gw.buf.size();
+
+    // write tensor data
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        gw.write_tensor_data(ctx->info[i], offset_data, ctx->alignment);
+    }
+}
+
+bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = ggml_fopen(fname, "wb");
+
+    if (!file) {
+        fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
+        return false;
+    }
+
+    std::vector<int8_t> buf;
+    gguf_write_to_buf(ctx, buf, only_meta);
+    const bool ok = fwrite(buf.data(), 1, buf.size(), file) == buf.size();
+    fclose(file);
+    return ok;
+}
+
+size_t gguf_get_meta_size(const struct gguf_context * ctx) {
+    // only return size
+    std::vector<int8_t> buf;
+    gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
+    return buf.size();
+}
+
+void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
+    std::vector<int8_t> buf;
+    gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
+    memcpy(data, buf.data(), buf.size());
+}
diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py
deleted file mode 100644
index b2e86e182..000000000
--- a/ggml_vk_generate_shaders.py
+++ /dev/null
@@ -1,2335 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import asyncio
-import os
-import sys
-from tempfile import gettempdir, NamedTemporaryFile
-
-shader_f32 = """
-#define FLOAT_TYPE float
-"""
-shader_f16 = """
-#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
-#define FLOAT_TYPE float16_t
-"""
-shader_int8_ext = """
-#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
-"""
-
-# Type-specific defines
-shader_f16_defines = """
-#define QUANT_K 1
-#define QUANT_R 1
-
-#define A_TYPE float16_t
-"""
-shader_q4_0_defines = """
-#define QUANT_K 32
-#define QUANT_R 2
-
-struct block_q4_0
-{
-    float16_t d;
-    uint8_t qs[16];
-};
-
-#define A_TYPE block_q4_0
-"""
-shader_q4_1_defines = """
-#define QUANT_K 32
-#define QUANT_R 2
-
-struct block_q4_1
-{
-    float16_t d;
-    float16_t m;
-    uint8_t qs[16];
-};
-
-#define A_TYPE block_q4_1
-"""
-shader_q5_0_defines = """
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-#define QUANT_K 32
-#define QUANT_R 2
-
-struct block_q5_0
-{
-    float16_t d;
-    uint16_t qh[2];
-    uint8_t qs[16];
-};
-
-#define A_TYPE block_q5_0
-"""
-shader_q5_1_defines = """
-#define QUANT_K 32
-#define QUANT_R 2
-
-struct block_q5_1
-{
-    float16_t d;
-    float16_t m;
-    uint qh;
-    uint8_t qs[16];
-};
-
-#define A_TYPE block_q5_1
-"""
-shader_q8_0_defines = """
-#define QUANT_K 32
-#define QUANT_R 1
-
-struct block_q8_0
-{
-    float16_t d;
-    int8_t qs[32];
-};
-
-#define A_TYPE block_q8_0
-"""
-
-# K-quants
-shader_q2_K_defines = """
-#define QUANT_K 256
-
-struct block_q2_K
-{
-    uint8_t scales[QUANT_K/16];
-    uint8_t qs[QUANT_K/4];
-    f16vec2 d;
-};
-
-#define A_TYPE block_q2_K
-"""
-shader_q3_K_defines = """
-#define QUANT_K 256
-
-struct block_q3_K
-{
-    uint8_t hmask[QUANT_K/8];
-    uint8_t qs[QUANT_K/4];
-    uint8_t scales[12];
-    float16_t d;
-};
-
-#define A_TYPE block_q3_K
-"""
-shader_q4_K_defines = """
-#define QUANT_K 256
-
-struct block_q4_K
-{
-    f16vec2 d;
-    uint8_t scales[3*QUANT_K/64];
-    uint8_t qs[QUANT_K/2];
-};
-
-#define A_TYPE block_q4_K
-"""
-shader_q5_K_defines = """
-#define QUANT_K 256
-
-struct block_q5_K
-{
-    f16vec2 d;
-    uint8_t scales[12];
-    uint8_t qh[QUANT_K/8];
-    uint8_t qs[QUANT_K/2];
-};
-
-#define A_TYPE block_q5_K
-"""
-shader_q6_K_defines = """
-#define QUANT_K 256
-
-struct block_q6_K
-{
-    uint8_t ql[QUANT_K/2];
-    uint8_t qh[QUANT_K/4];
-    int8_t scales[QUANT_K/16];
-    float16_t d;
-};
-
-#define A_TYPE block_q6_K
-"""
-
-# Dequant functions
-shader_f16_dequant_func = """
-#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
-"""
-
-shader_q4_0_dequant_func = """
-#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
-const uint vui = uint(data_a[ib].qs[iqs]); \
-vec2 v = vec2(vui & 0xF, vui >> 4); \
-v = (v - 8.0f)*d;
-"""
-
-shader_q4_1_dequant_func = """
-#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
-const float m = float(data_a[ib].m); \
-const uint vui = uint(data_a[ib].qs[iqs]); \
-vec2 v = vec2(vui & 0xF, vui >> 4); \
-v = v*d + m;
-"""
-
-shader_q5_0_dequant_func = """
-#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
-const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
-const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
-const uint vui = uint(data_a[ib].qs[iqs]); \
-vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
-v = (v - 16.0f) * d;
-"""
-
-shader_q5_1_dequant_func = """
-#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
-const float m = float(data_a[ib].m); \
-const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
-const uint vui = uint(data_a[ib].qs[iqs]); \
-vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
-v = v*d + m;
-"""
-
-shader_q8_0_dequant_func = """
-#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
-vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \
-v = v * d;
-"""
-
-# MULMAT
-
-mulmat_head = """#version 450
-
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_16bit_storage : require
-
-#ifndef LOAD_VEC
-#define LOAD_VEC 1
-#endif
-"""
-
-mulmat_body = """
-layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
-
-layout (push_constant) uniform parameter
-{
-    uint M;
-    uint N;
-    uint K;
-    uint stride_a;
-    uint stride_b;
-    uint stride_d;
-    uint k_split;
-
-    uint ne02;
-    uint ne12;
-    uint broadcast2;
-    uint broadcast3;
-
-    uint batch_stride_a;
-    uint batch_stride_b;
-    uint batch_stride_d;
-} p;
-
-layout (constant_id = 1) const uint BM = 64;
-layout (constant_id = 2) const uint BN = 64;
-layout (constant_id = 3) const uint BK = 16;
-layout (constant_id = 4) const uint WM = 32;
-layout (constant_id = 5) const uint WN = 32;
-layout (constant_id = 6) const uint WMITER = 2;
-layout (constant_id = 7) const uint TM = 4;
-layout (constant_id = 8) const uint TN = 2;
-layout (constant_id = 9) const uint WARP = 32;
-
-shared FLOAT_TYPE buf_a[BM * (BK+1)];
-shared FLOAT_TYPE buf_b[BN * (BK+1)];
-
-void main() {
-    const uint i13 = gl_GlobalInvocationID.z / p.ne12;
-    const uint i12 = gl_GlobalInvocationID.z % p.ne12;
-
-    const uint i03 = i13 / p.broadcast3;
-    const uint i02 = i12 / p.broadcast2;
-
-    const uint batch_idx_a = i03 * p.ne02 + i02;
-
-    const uint blocks_m = (p.M + BM - 1) / BM;
-    const uint ir = gl_WorkGroupID.x % blocks_m;
-    const uint ik = gl_WorkGroupID.x / blocks_m;
-    const uint ic = gl_WorkGroupID.y;
-
-    const uint warp_i = gl_LocalInvocationID.x / WARP;
-    const uint warp_r = warp_i % (BM / WM);
-    const uint warp_c = warp_i / (BM / WM);
-
-    const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
-    const uint WSUBM = WM / WMITER;
-    const uint WSUBN = WN / WNITER;
-
-    const uint tiw = gl_LocalInvocationID.x % WARP;
-    const uint tiwr = tiw % (WSUBM / TM);
-    const uint tiwc = tiw / (WSUBM / TM);
-
-    const uint loadr = gl_LocalInvocationID.x % (BK / LOAD_VEC);
-    const uint loadc = gl_LocalInvocationID.x / (BK / LOAD_VEC);
-
-    const uint loadstride = gl_WorkGroupSize.x * LOAD_VEC / BK;
-
-    const uint start_k = ik * p.k_split;
-    const uint end_k = min(p.K, (ik + 1) * p.k_split);
-
-    uint pos_a = (batch_idx_a * p.batch_stride_a + ir * BM * p.stride_a + start_k) / LOAD_VEC;
-    uint pos_b = (gl_GlobalInvocationID.z * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC;
-
-    float sums[WMITER * TM * WNITER * TN];
-    FLOAT_TYPE cache_a[WMITER * TM];
-    FLOAT_TYPE cache_b[WNITER * TN];
-
-    [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
-        sums[i] = 0.0f;
-    }
-
-    [[unroll]] for (uint block = start_k; block < end_k; block += BK) {
-        [[unroll]] for (uint l = 0; l < BM; l += loadstride) {
-#if LOAD_VEC == 8
-            const uint idx = pos_a + (loadc + l) * p.stride_a / LOAD_VEC + loadr;
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_a[idx][0].x);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_a[idx][0].y);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_a[idx][0].z);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_a[idx][0].w);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 4] = FLOAT_TYPE(data_a[idx][1].x);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 5] = FLOAT_TYPE(data_a[idx][1].y);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 6] = FLOAT_TYPE(data_a[idx][1].z);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 7] = FLOAT_TYPE(data_a[idx][1].w);
-#elif LOAD_VEC == 4
-            const uint idx = pos_a + (loadc + l) * p.stride_a / LOAD_VEC + loadr;
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_a[idx].x);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_a[idx].y);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_a[idx].z);
-            buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_a[idx].w);
-#else
-            if (ir * BM + loadc + l < p.M && block + loadr < end_k) {
-                buf_a[(loadc + l) * (BK+1) + loadr] = FLOAT_TYPE(data_a[pos_a + (loadc + l) * p.stride_a + loadr]);
-            } else {
-                buf_a[(loadc + l) * (BK+1) + loadr] = FLOAT_TYPE(0.0f);
-            }
-#endif
-        }
-        [[unroll]] for (uint l = 0; l < BN; l += loadstride) {
-#if LOAD_VEC == 8
-            const uint idx = pos_b + (loadc + l) * p.stride_b / LOAD_VEC + loadr;
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_b[idx][0].x);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_b[idx][0].y);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_b[idx][0].z);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_b[idx][0].w);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 4] = FLOAT_TYPE(data_b[idx][1].x);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 5] = FLOAT_TYPE(data_b[idx][1].y);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 6] = FLOAT_TYPE(data_b[idx][1].z);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 7] = FLOAT_TYPE(data_b[idx][1].w);
-#elif LOAD_VEC == 4
-            const uint idx = pos_b + (loadc + l) * p.stride_b / LOAD_VEC + loadr;
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_b[idx].x);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_b[idx].y);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_b[idx].z);
-            buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_b[idx].w);
-#else
-            if (ic * BN + loadc + l < p.N && block + loadr < end_k) {
-                buf_b[(loadc + l) * (BK+1) + loadr] = FLOAT_TYPE(data_b[pos_b + (loadc + l) * p.stride_b + loadr]);
-            } else {
-                buf_b[(loadc + l) * (BK+1) + loadr] = FLOAT_TYPE(0.0f);
-            }
-#endif
-        }
-
-        barrier();
-
-        pos_a += BK / LOAD_VEC;
-        pos_b += BK / LOAD_VEC;
-
-        for (uint i = 0; i < BK; i++) {
-            // Load from shared into cache
-            [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
-                [[unroll]] for (uint j = 0; j < TM; j++) {
-                    cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * (BK+1) + i];
-                }
-            }
-            [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
-                [[unroll]] for (uint j = 0; j < TN; j++) {
-                    cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * (BK+1) + i];
-                }
-            }
-
-            [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
-                [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
-                    [[unroll]] for (uint cc = 0; cc < TN; cc++) {
-                        [[unroll]] for (uint cr = 0; cr < TM; cr++) {
-                            sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr] += float(cache_a[wsir * TM + cr]) * float(cache_b[wsic * TN + cc]);
-                        }
-                    }
-                }
-            }
-        }
-
-        barrier();
-    }
-
-    const uint dr = ir * BM + warp_r * WM;
-    const uint dc = ic * BN + warp_c * WN;
-
-    const uint offsets = gl_GlobalInvocationID.z * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
-
-    [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
-        [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
-
-            const uint dr_warp = dr + wsir * WSUBM + tiwr * TM;
-            const uint dc_warp = dc + wsic * WSUBN + tiwc * TN;
-            [[unroll]] for (uint cc = 0; cc < TN; cc++) {
-                [[unroll]] for (uint cr = 0; cr < TM; cr++) {
-                    if (dr_warp + cr < p.M && dc_warp + cc < p.N) {
-                        data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
-                    }
-                }
-            }
-        }
-    }
-}
-"""
-
-mulmat_split_k_reduce_src = """#version 450
-
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {float data_a[];};
-layout (binding = 1) writeonly buffer D {float data_d[];};
-
-layout (push_constant) uniform parameter {
-    uint ne;
-    uint k_num;
-} p;
-
-void main() {
-    const uint idx = gl_GlobalInvocationID.x;
-
-    if (idx >= p.ne) {
-        return;
-    }
-
-    float result = 0.0f;
-
-    [[unroll]] for (uint i = 0; i < p.k_num; i++) {
-        result += data_a[i * p.ne + idx];
-    }
-
-    data_d[idx] = result;
-}
-"""
-
-# DEQUANT SHADER
-dequant_head = """#version 450
-
-#extension GL_EXT_control_flow_attributes : require
-#extension GL_EXT_shader_16bit_storage : require
-"""
-
-dequant_body = """
-layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    const int i = int(gl_GlobalInvocationID.x);
-
-    // Transposed
-    const int row = i % (p.K / QUANT_K);
-    const int col = i / (p.K / QUANT_K);
-
-    if (row * QUANT_K >= p.K || col >= p.M) {
-        return;
-    }
-
-    const int stride_a = p.stride_a / QUANT_K;
-
-    const int ib = col * stride_a + row;
-
-    const int y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
-    const int step = QUANT_R == 1 ? 2 : 1;
-
-    [[unroll]] for (int iqs = 0; iqs < QUANT_K/QUANT_R; iqs += step) {
-        DEQUANT_FUNC
-
-        data_b[col * p.stride_b + row*QUANT_K + iqs + 0       ] = D_TYPE(v.x);
-        data_b[col * p.stride_b + row*QUANT_K + iqs + y_offset] = D_TYPE(v.y);
-    }
-}
-"""
-
-# K-quants
-dequant_q2_K_body = """
-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
-        const int i = int(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
-            return;
-        }
-
-        const int tid = int(gl_LocalInvocationID.x);
-        const int ip = tid / 32;
-        const int il = tid - 32 * ip;
-        const int is = 8 * ip + il / 16;
-
-        const int y_idx = i * QUANT_K + 128 * ip + il;
-
-        const int ql_idx = 32 * ip + il;
-        const uint8_t qs = data_a[i].qs[32 * ip + il];
-
-        FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
-        FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
-        data_b[y_idx +  0] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
-        data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
-        data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
-        data_b[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+6] >> 4));
-    }
-}
-"""
-dequant_q3_K_body = """
-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
-        const int i = int(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
-            return;
-        }
-
-        const int r = int(gl_LocalInvocationID.x) / 4;
-        const int tid = r / 2;
-        const int is0 = r % 2;
-        const int l0 = 16 * is0 + 4 * (int(gl_LocalInvocationID.x) % 4);
-        const int n = tid / 4;
-        const int j = tid - 4*n;
-
-        const uint8_t m = uint8_t(1 << (4*n + j));
-        const int is = 8*n + 2*j + is0;
-        const int shift = 2*j;
-
-        const int8_t us = int8_t(is <  4 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+8] >> 0) & 3) << 4) :
-                                 is <  8 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+4] >> 2) & 3) << 4) :
-                                 is < 12 ? (data_a[i].scales[is-8] >>  4) | (((data_a[i].scales[is+0] >> 4) & 3) << 4) :
-                                           (data_a[i].scales[is-8] >>  4) | (((data_a[i].scales[is-4] >> 6) & 3) << 4));
-        const FLOAT_TYPE d_all = FLOAT_TYPE(data_a[i].d);
-        const FLOAT_TYPE dl    = d_all * FLOAT_TYPE(us - 32);
-
-        const int y_idx = i * QUANT_K + 128 * n + 32 * j;
-        const int qs_idx = 32*n;
-
-        for (int l = l0; l < l0 + 4; ++l) {
-            data_b[y_idx + l] = D_TYPE(dl * FLOAT_TYPE(int8_t((data_a[i].qs[qs_idx + l] >> shift) & 3) - (((data_a[i].hmask[l] & m) != 0) ? 0 : 4)));
-        }
-    }
-}
-"""
-dequant_q4_K_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
-        const int i = int(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
-            return;
-        }
-
-        const int tid = int(gl_LocalInvocationID.x);
-        const int il = tid / 8;
-        const int ir = tid % 8;
-        const int is = 2 * il;
-        const int n = 4;
-
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
-
-        const int y_idx = i * QUANT_K + 64 * il + n * ir;
-        const int qs_idx = 32*il + n * ir;
-
-        uint8_t sc;
-        uint8_t m;
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is] & 63);
-            m  = uint8_t(data_a[i].scales[is + 4] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
-        }
-        const FLOAT_TYPE d1 = dall * sc;
-        const FLOAT_TYPE m1 = dmin * m;
-
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is + 1] & 63);
-            m  = uint8_t(data_a[i].scales[is + 5] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
-        }
-        const FLOAT_TYPE d2 = dall * sc;
-        const FLOAT_TYPE m2 = dmin * m;
-
-        [[unroll]] for (int l = 0; l < n; ++l) {
-            data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
-            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
-        }
-    }
-}
-"""
-dequant_q5_K_body = """
-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
-        const int i = int(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
-            return;
-        }
-
-        const int tid = int(gl_LocalInvocationID.x);
-        const int il = tid / 16;
-        const int ir = tid % 16;
-        const int is = 2 * il;
-
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
-
-        const int y_idx = i * QUANT_K + 64 * il + 2 * ir;
-        const int qs_idx = 32*il + 2 * ir;
-        const int qh_idx = 2 * ir;
-
-        uint8_t sc;
-        uint8_t m;
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is] & 63);
-            m  = uint8_t(data_a[i].scales[is + 4] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
-        }
-        const FLOAT_TYPE d1 = dall * sc;
-        const FLOAT_TYPE m1 = dmin * m;
-
-        if (is < 4) {
-            sc = uint8_t(data_a[i].scales[is + 1] & 63);
-            m  = uint8_t(data_a[i].scales[is + 5] & 63);
-        } else {
-            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
-            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
-        }
-        const FLOAT_TYPE d2 = dall * sc;
-        const FLOAT_TYPE m2 = dmin * m;
-
-        const uint8_t hm1 = uint8_t(1 << (2 * il    ));
-        const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
-        data_b[y_idx     ] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx    ] & 0xF) + (((data_a[i].qh[qh_idx    ] & hm1) != 0) ? 16 : 0)) - m1);
-        data_b[y_idx +  1] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] & 0xF) + (((data_a[i].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
-        data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx    ]  >> 4) + (((data_a[i].qh[qh_idx    ] & hm2) != 0) ? 16 : 0)) - m2);
-        data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1]  >> 4) + (((data_a[i].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
-    }
-}
-"""
-dequant_q6_K_body = """
-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    [[unroll]] for (int wgy = 0; wgy < 256; wgy++) {
-        const int i = int(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
-            return;
-        }
-        const int tid = int(gl_LocalInvocationID.x);
-        const int ip = tid / 32;
-        const int il = tid - 32 * ip;
-        const int is = 8 * ip + il / 16;
-
-        const int y_idx = i * QUANT_K + 128 * ip + il;
-
-        const int ql_idx = 64 * ip + il;
-        const uint8_t qh = data_a[i].qh[32 * ip + il];
-
-        const FLOAT_TYPE d = FLOAT_TYPE(data_a[i].d);
-
-        data_b[y_idx +  0] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 0] * (int8_t((data_a[i].ql[ql_idx +  0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
-        data_b[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 2] * (int8_t((data_a[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
-        data_b[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 4] * (int8_t((data_a[i].ql[ql_idx +  0] >>  4) | (((qh >> 4) & 3) << 4)) - 32)));
-        data_b[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 6] * (int8_t((data_a[i].ql[ql_idx + 32] >>  4) | (((qh >> 6) & 3) << 4)) - 32)));
-    }
-}
-"""
-
-# Mul Mat Vec
-mul_mat_vec_head = """#version 450
-
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_16bit_storage : require
-#extension GL_EXT_shader_8bit_storage : require
-"""
-
-mul_mat_vec_body = """
-layout(local_size_x = QUANT_K, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    int ncols;
-    int b_offset;
-    int d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[QUANT_K];
-
-void main() {
-    const int block_size = int(gl_WorkGroupSize.x);
-    const int row = int(gl_WorkGroupID.x);
-    const int tid = int(gl_LocalInvocationID.x);
-
-    const int y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
-
-    tmp[tid] = FLOAT_TYPE(0.0f);
-
-    [[unroll]] for (int i = 0; i < p.ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
-        const int ib = (row*p.ncols + col)/QUANT_K; // block index
-        const int iqs = (col%QUANT_K)/QUANT_R; // quant index
-        const int iybs = col - col%QUANT_K; // y block start index
-
-        DEQUANT_FUNC
-
-        // matrix multiplication
-        tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[p.b_offset + iybs + iqs + 0]);
-        tmp[tid] += FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[p.b_offset + iybs + iqs + y_offset]);
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = block_size/2; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-    if (tid == 0) {
-        dst[p.d_offset + row] = D_TYPE(tmp[0]);
-    }
-}
-"""
-
-# K-quants
-mul_mat_vec_q2_K_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    int ncols;
-    int b_offset;
-    int d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[32];
-
-void main() {
-    const int row = int(gl_WorkGroupID.x);
-
-    const int num_blocks_per_row = p.ncols / QUANT_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;            // 16 or 8
-
-    const int v_im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int v_in = tid - step*v_im;                      // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*v_in;            // 0...15
-    const int q_offset = 32*v_im + l0;
-    const int s_offset = 8*v_im;
-    const int y_offset = 128*v_im + l0;
-
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
-
-    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-        const int y_idx = i * QUANT_K + y_offset;
-
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
-
-        FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
-        FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += FLOAT_TYPE(data_b[p.b_offset + y_idx + l +  0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3);
-            sum2 += FLOAT_TYPE(data_b[p.b_offset + y_idx + l +  0]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 16]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 32]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 48]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 64]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 80]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 96]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF)
-                  + FLOAT_TYPE(data_b[p.b_offset + y_idx + l +112]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF);
-        }
-        tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-    if (tid == 0) {
-        dst[p.d_offset + row] = D_TYPE(tmp[0]);
-    }
-}
-"""
-mul_mat_vec_q3_K_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    int ncols;
-    int b_offset;
-    int d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[32];
-
-void main() {
-    const int row = int(gl_WorkGroupID.x);
-
-    const int num_blocks_per_row = p.ncols / QUANT_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;            // 16 or 8
-
-    const int v_im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int v_in = tid - step*v_im;                      // 0...15 or 0...7
-
-    const uint8_t m = uint8_t(1 << (4 * v_im));
-
-    const int l0 = K_QUANTS_PER_ITERATION*v_in;            // 0...15
-    const int q_offset = 32*v_im + l0;
-    const int y_offset = 128*v_im + l0;
-
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
-
-    const uint s_shift = 4 * v_im;
-
-    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-        const int y_idx = i * QUANT_K + y_offset;
-
-        const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
-
-        FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum += FLOAT_TYPE(data_b[p.b_offset + y_idx + l +  0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4))
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4));
-        }
-        tmp[16 * ix + tid] += d * sum;
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-    if (tid == 0) {
-        dst[p.d_offset + row] = D_TYPE(tmp[0]);
-    }
-}
-"""
-mul_mat_vec_q4_K_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    int ncols;
-    int b_offset;
-    int d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[32];
-
-void main() {
-    const int row = int(gl_WorkGroupID.x);
-
-    const int num_blocks_per_row = p.ncols / QUANT_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;             // 8 or 4
-
-    const int il = tid/step;                               // 0...3
-    const int ir = tid - step*il;                          // 0...7 or 0...3
-    const int n =  2 * K_QUANTS_PER_ITERATION;             // 2 or 4
-
-    const int v_im = il / 2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int v_in = il % 2;
-
-    const int l0 = n * (2 * ir + v_in);            // 0...15
-    const int q_offset = 32*v_im + l0;
-    const int y_offset = 64*v_im + l0;
-
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
-
-    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-        const int y1_idx = i * QUANT_K + y_offset;
-        const int y2_idx = y1_idx + 128;
-
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
-
-        const uint8_t sc0 = uint8_t(  data_a[ib0 + i].scales[v_im * 2    ]       & 0x3f);
-        const uint8_t sc1 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 1]       & 0x3f);
-        const uint8_t sc2 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 4]       & 0x3f);
-        const uint8_t sc3 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 5]       & 0x3f);
-        const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2    ] & 0xc0) >> 2));
-        const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2));
-        const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2));
-        const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2));
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint8_t q4_0  = uint8_t(data_a[ib0 + i].qs[q_offset     ] & 0xf);
-        const uint8_t q4_1  = uint8_t(data_a[ib0 + i].qs[q_offset +  1] & 0xf);
-        const uint8_t q4_2  = uint8_t(data_a[ib0 + i].qs[q_offset +  2] & 0xf);
-        const uint8_t q4_3  = uint8_t(data_a[ib0 + i].qs[q_offset +  3] & 0xf);
-        const uint8_t q4_4  = uint8_t(data_a[ib0 + i].qs[q_offset     ]  >> 4);
-        const uint8_t q4_5  = uint8_t(data_a[ib0 + i].qs[q_offset +  1]  >> 4);
-        const uint8_t q4_6  = uint8_t(data_a[ib0 + i].qs[q_offset +  2]  >> 4);
-        const uint8_t q4_7  = uint8_t(data_a[ib0 + i].qs[q_offset +  3]  >> 4);
-        const uint8_t q4_8  = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
-        const uint8_t q4_9  = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
-        const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] & 0xf);
-        const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] & 0xf);
-        const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64]  >> 4);
-        const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65]  >> 4);
-        const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66]  >> 4);
-        const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67]  >> 4);
-
-        const FLOAT_TYPE sx = FLOAT_TYPE(data_b[p.b_offset + y1_idx] * q4_0 + data_b[p.b_offset + y1_idx + 1] * q4_1 + data_b[p.b_offset + y1_idx + 2] * q4_2 + data_b[p.b_offset + y1_idx + 3] * q4_3);
-        const FLOAT_TYPE sy = FLOAT_TYPE(data_b[p.b_offset + y1_idx + 32] * q4_4 + data_b[p.b_offset + y1_idx + 33] * q4_5 + data_b[p.b_offset + y1_idx + 34] * q4_6 + data_b[p.b_offset + y1_idx + 35] * q4_7);
-        const FLOAT_TYPE sz = FLOAT_TYPE(data_b[p.b_offset + y2_idx] * q4_8 + data_b[p.b_offset + y2_idx + 1] * q4_9 + data_b[p.b_offset + y2_idx + 2] * q4_10 + data_b[p.b_offset + y2_idx + 3] * q4_11);
-        const FLOAT_TYPE sw = FLOAT_TYPE(data_b[p.b_offset + y2_idx + 32] * q4_12 + data_b[p.b_offset + y2_idx + 33] * q4_13 + data_b[p.b_offset + y2_idx + 34] * q4_14 + data_b[p.b_offset + y2_idx + 35] * q4_15);
-        const FLOAT_TYPE smin = FLOAT_TYPE(
-            data_b[p.b_offset + y1_idx    ] * sc2 + data_b[p.b_offset + y1_idx + 32] * sc3 + data_b[p.b_offset + y2_idx    ] * sc6 + data_b[p.b_offset + y2_idx + 32] * sc7
-          + data_b[p.b_offset + y1_idx + 1] * sc2 + data_b[p.b_offset + y1_idx + 33] * sc3 + data_b[p.b_offset + y2_idx + 1] * sc6 + data_b[p.b_offset + y2_idx + 33] * sc7
-          + data_b[p.b_offset + y1_idx + 2] * sc2 + data_b[p.b_offset + y1_idx + 34] * sc3 + data_b[p.b_offset + y2_idx + 2] * sc6 + data_b[p.b_offset + y2_idx + 34] * sc7
-          + data_b[p.b_offset + y1_idx + 3] * sc2 + data_b[p.b_offset + y1_idx + 35] * sc3 + data_b[p.b_offset + y2_idx + 3] * sc6 + data_b[p.b_offset + y2_idx + 35] * sc7
-        );
-        tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
-#else
-        const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset     ] & 0xf);
-        const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset +  1] & 0xf);
-        const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset     ]  >> 4);
-        const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset +  1]  >> 4);
-        const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
-        const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
-        const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64]  >> 4);
-        const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65]  >> 4);
-
-        const FLOAT_TYPE sx = FLOAT_TYPE(data_b[p.b_offset + y1_idx     ] * q4_0  + data_b[p.b_offset + y1_idx +  1] * q4_1);
-        const FLOAT_TYPE sy = FLOAT_TYPE(data_b[p.b_offset + y1_idx + 32] * q4_2  + data_b[p.b_offset + y1_idx + 33] * q4_3);
-        const FLOAT_TYPE sz = FLOAT_TYPE(data_b[p.b_offset + y2_idx     ] * q4_4  + data_b[p.b_offset + y2_idx +  1] * q4_5);
-        const FLOAT_TYPE sw = FLOAT_TYPE(data_b[p.b_offset + y2_idx + 32] * q4_6 + data_b[p.b_offset + y2_idx + 33] * q4_7);
-        const FLOAT_TYPE smin = FLOAT_TYPE(
-            data_b[p.b_offset + y1_idx] * sc2 + data_b[p.b_offset + y1_idx + 32] * sc3 + data_b[p.b_offset + y2_idx] * sc6 + data_b[p.b_offset + y2_idx + 32] * sc7
-          + data_b[p.b_offset + y1_idx + 1] * sc2 + data_b[p.b_offset + y1_idx + 33] * sc3 + data_b[p.b_offset + y2_idx + 1] * sc6 + data_b[p.b_offset + y2_idx + 33] * sc7
-        );
-
-        tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
-#endif
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-    if (tid == 0) {
-        dst[p.d_offset + row] = D_TYPE(tmp[0]);
-    }
-}
-"""
-mul_mat_vec_q5_K_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    int ncols;
-    int b_offset;
-    int d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[32];
-
-void main() {
-    const int row = int(gl_WorkGroupID.x);
-
-    const int num_blocks_per_row = p.ncols / QUANT_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const int tid = int(gl_LocalInvocationID.x)/2;  // 0...31 or 0...16
-    const int ix  = int(gl_LocalInvocationID.x)%2;  // 0 or 0, 1
-
-    const int il = tid/4;                           // 0...3
-    const int ir = tid - 4*il;                      // 0...7 or 0...3
-
-    const int v_im = il / 2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int v_in = il % 2;
-
-    const int l0 = 4*ir + 2*v_in;                   // 0...15
-    const int q_offset = 32*v_im + l0;
-    const int y_offset = 64*v_im + l0;
-
-    const uint8_t hm1 = uint8_t(1 << (2*v_im));
-    const uint8_t hm2 = uint8_t(hm1 << 4);
-
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
-
-    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += 2) {
-        const int y1_idx = i * QUANT_K + y_offset;
-        const int y2_idx = y1_idx + 128;
-
-        const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
-        const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
-
-        const uint8_t sc0 = uint8_t(  data_a[ib0 + i].scales[v_im * 2    ]       & 0x3f);
-        const uint8_t sc1 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 1]       & 0x3f);
-        const uint8_t sc2 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 4]       & 0x3f);
-        const uint8_t sc3 = uint8_t(  data_a[ib0 + i].scales[v_im * 2 + 5]       & 0x3f);
-        const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2    ] & 0xc0) >> 2));
-        const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9]       & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2));
-        const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2));
-        const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2));
-
-        const uint8_t q4_0  = uint8_t(data_a[ib0 + i].qs[q_offset     ] & 0xf);
-        const uint8_t q4_1  = uint8_t(data_a[ib0 + i].qs[q_offset +  1] & 0xf);
-        const uint8_t q4_2  = uint8_t(data_a[ib0 + i].qs[q_offset + 16] & 0xf);
-        const uint8_t q4_3  = uint8_t(data_a[ib0 + i].qs[q_offset + 17] & 0xf);
-        const uint8_t q4_4  = uint8_t(data_a[ib0 + i].qs[q_offset     ]  >> 4);
-        const uint8_t q4_5  = uint8_t(data_a[ib0 + i].qs[q_offset +  1]  >> 4);
-        const uint8_t q4_6  = uint8_t(data_a[ib0 + i].qs[q_offset + 16]  >> 4);
-        const uint8_t q4_7  = uint8_t(data_a[ib0 + i].qs[q_offset + 17]  >> 4);
-        const uint8_t q4_8  = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
-        const uint8_t q4_9  = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
-        const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] & 0xf);
-        const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] & 0xf);
-        const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64]  >> 4);
-        const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65]  >> 4);
-        const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80]  >> 4);
-        const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81]  >> 4);
-
-        const FLOAT_TYPE sx = FLOAT_TYPE(
-            data_b[p.b_offset + y1_idx     ] * (q4_0 + (((data_a[ib0 + i].qh[l0     ] & hm1) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y1_idx +  1] * (q4_1 + (((data_a[ib0 + i].qh[l0 +  1] & hm1) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y1_idx + 16] * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y1_idx + 17] * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))
-        );
-        const FLOAT_TYPE sy = FLOAT_TYPE(
-            data_b[p.b_offset + y1_idx + 32] * (q4_4 + (((data_a[ib0 + i].qh[l0     ] & (hm1 << 1)) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y1_idx + 33] * (q4_5 + (((data_a[ib0 + i].qh[l0 +  1] & (hm1 << 1)) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y1_idx + 48] * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y1_idx + 49] * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))
-        );
-        const FLOAT_TYPE sz = FLOAT_TYPE(
-            data_b[p.b_offset + y2_idx     ] * (q4_8  + (((data_a[ib0 + i].qh[l0     ] & hm2) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y2_idx +  1] * (q4_9  + (((data_a[ib0 + i].qh[l0 +  1] & hm2) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y2_idx + 16] * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y2_idx + 17] * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))
-        );
-        const FLOAT_TYPE sw = FLOAT_TYPE(
-            data_b[p.b_offset + y2_idx + 32] * (q4_12 + (((data_a[ib0 + i].qh[l0     ] & (hm2 << 1)) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y2_idx + 33] * (q4_13 + (((data_a[ib0 + i].qh[l0 +  1] & (hm2 << 1)) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y2_idx + 48] * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0))
-          + data_b[p.b_offset + y2_idx + 49] * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))
-        );
-        const FLOAT_TYPE smin = FLOAT_TYPE(
-            (data_b[p.b_offset + y1_idx] + data_b[p.b_offset + y1_idx + 1] + data_b[p.b_offset + y1_idx + 16] + data_b[p.b_offset + y1_idx + 17]) * sc2 + (data_b[p.b_offset + y1_idx + 32] + data_b[p.b_offset + y1_idx + 33] + data_b[p.b_offset + y1_idx + 48] + data_b[p.b_offset + y1_idx + 49]) * sc3
-          + (data_b[p.b_offset + y2_idx] + data_b[p.b_offset + y2_idx + 1] + data_b[p.b_offset + y2_idx + 16] + data_b[p.b_offset + y2_idx + 17]) * sc6 + (data_b[p.b_offset + y2_idx + 32] + data_b[p.b_offset + y2_idx + 33] + data_b[p.b_offset + y2_idx + 48] + data_b[p.b_offset + y2_idx + 49]) * sc7
-        );
-        tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-    if (tid == 0) {
-        dst[p.d_offset + row] = D_TYPE(tmp[0]);
-    }
-}
-"""
-mul_mat_vec_q6_K_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    int ncols;
-    int b_offset;
-    int d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[32];
-
-void main() {
-    const int row = int(gl_WorkGroupID.x);
-
-    const int num_blocks_per_row = p.ncols / QUANT_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const int tid = int(gl_LocalInvocationID.x)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = int(gl_LocalInvocationID.x)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;            // 16 or 8
-
-    const int v_im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int v_in = tid - step*v_im;                      // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*v_in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * v_in;                               // 0, 4, 8, ..., 28
-    const int is = v_in / 4;
-#endif
-
-    const int ql_offset = 64*v_im + l0;
-    const int qh_offset = 32*v_im + l0;
-    const int s_offset  =  8*v_im + is;
-    const int y_offset = 128*v_im + l0;
-
-    tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
-
-    [[unroll]] for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-        const int y_idx    = i * QUANT_K + y_offset;
-
-        const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
-
-#if K_QUANTS_PER_ITERATION == 1
-        FLOAT_TYPE sum = FLOAT_TYPE(data_b[p.b_offset + y_idx +  0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset +  0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0x03) << 4)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0x0c) << 2)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset +  0]  >> 4) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0x30) >> 0)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16]  >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32]  >> 4) | ((data_a[ib0 + i].qh[qh_offset +  0] & 0xc0) >> 2)) - 32)
-                       + FLOAT_TYPE(data_b[p.b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48]  >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32);
-        tmp[16 * ix + tid] += sum;
-#else
-        FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-        [[unroll]] for (int l = 0; l < 4; ++l) {
-            sum += FLOAT_TYPE(data_b[p.b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32)
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0]  >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32)
-                 + FLOAT_TYPE(data_b[p.b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32]  >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp[16 * ix + tid] += sum;
-#endif
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = 16; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-       }
-        barrier();
-    }
-    if (tid == 0) {
-        dst[p.d_offset + row] = D_TYPE(tmp[0]);
-    }
-}
-"""
-
-mul_mat_p021_src = """#version 450
-
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_16bit_storage : require
-
-#define BLOCK_SIZE 32
-#define FLOAT_TYPE float
-
-layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    uint ncols_x;
-    uint nrows_x;
-    uint nchannels_x;
-    uint nchannels_y;
-    uint b_offset;
-    uint d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint tid = gl_LocalInvocationID.x;
-    const uint row_x = gl_GlobalInvocationID.y;
-    const uint channel = gl_GlobalInvocationID.z;
-    const uint channel_x = channel / (p.nchannels_y / p.nchannels_x);
-
-    const uint nrows_y = p.ncols_x;
-    const uint nrows_dst = p.nrows_x;
-    const uint row_dst = row_x;
-
-    tmp[tid] = FLOAT_TYPE(0.0f);
-
-    for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
-        const uint col_x = col_x0 + tid;
-
-        if (col_x >= p.ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const uint ix = row_x*p.nchannels_x*p.ncols_x + channel_x*p.ncols_x + col_x;
-        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
-
-        const uint row_y = col_x;
-
-        // y is not transposed but permuted
-        const uint iy = channel*nrows_y + row_y;
-
-        tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
-    }
-
-    // dst is not transposed and not permuted
-    const uint idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-
-    if (tid == 0) {
-        dst[idst] = tmp[0];
-    }
-}
-"""
-
-
-mul_mat_nc_src = """#version 450
-
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_16bit_storage : require
-
-#define BLOCK_SIZE 32
-#define FLOAT_TYPE float
-
-layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-layout (push_constant) uniform parameter
-{
-    uint ncols_x;
-    uint nrows_x;
-    uint row_stride_x;
-    uint channel_stride_x;
-    uint channel_x_divisor;
-    uint b_offset;
-    uint d_offset;
-} p;
-
-shared FLOAT_TYPE tmp[BLOCK_SIZE];
-
-void main() {
-    const uint tid       = gl_LocalInvocationID.x;
-    const uint row_x     = gl_GlobalInvocationID.y;
-    const uint channel   = gl_GlobalInvocationID.z;
-    const uint channel_x = channel / p.channel_x_divisor;
-
-    const uint nrows_y   = p.ncols_x;
-    const uint nrows_dst = p.nrows_x;
-    const uint row_dst   = row_x;
-
-    const uint idst = channel*nrows_dst + row_dst;
-
-    tmp[tid] = 0.0f;
-
-    for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
-        const uint col_x = col_x0 + tid;
-
-        if (col_x >= p.ncols_x) {
-            break;
-        }
-
-        const uint row_y = col_x;
-
-        const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
-        const uint iy = channel*nrows_y + row_y;
-
-        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
-
-        tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier();
-    }
-
-    if (tid == 0) {
-        dst[idst] = tmp[0];
-    }
-}
-"""
-
-# F16 to F32
-f32_to_f16_src = """#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {float data_a[];};
-layout (binding = 1) writeonly buffer D {float16_t data_b[];};
-
-layout (push_constant) uniform parameter
-{
-    int M;
-    int K;
-    int stride_a;
-    int stride_b;
-} p;
-
-void main() {
-    const int row = int(gl_GlobalInvocationID.x % p.K);
-    const int col = int(gl_GlobalInvocationID.x / p.K);
-
-    if (row < p.K && col < p.M) {
-        data_b[col * p.stride_b + row] = float16_t(data_a[col * p.stride_a + row]);
-    }
-}
-"""
-
-generic_head = """
-#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout (push_constant) uniform parameter
-{
-    uint KX;
-    uint KY;
-    float param1;
-    float param2;
-} p;
-"""
-
-# MUL F32
-mul_body = """layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint idx = gl_GlobalInvocationID.x;
-
-    if (idx >= p.KX) {
-        return;
-    }
-
-    data_d[idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(data_b[idx % p.KY]));
-}
-"""
-
-# ADD
-add_body = """
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint idx = gl_GlobalInvocationID.x;
-
-    if (idx >= p.KX) {
-        return;
-    }
-
-    data_d[idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) + FLOAT_TYPE(data_b[idx % p.KY]));
-}
-"""
-
-# SCALE
-scale_body = """layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint idx = gl_GlobalInvocationID.x;
-
-    if (idx >= p.KX) {
-        return;
-    }
-
-    data_d[idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
-}
-"""
-
-# SQR
-sqr_body = """layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint idx = gl_GlobalInvocationID.x;
-
-    if (idx >= p.KX) {
-        return;
-    }
-
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[idx]);
-    data_d[idx] = D_TYPE(val * val);
-}
-"""
-
-# CLAMP
-clamp_body = """layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint idx = gl_GlobalInvocationID.x;
-
-    if (idx >= p.KX) {
-        return;
-    }
-
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[idx]);
-    data_d[idx] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
-}
-"""
-
-# CPY
-cpy_src = """#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout (push_constant) uniform parameter
-{
-    uint ne;
-    uint ne00; uint ne01; uint nb00; uint nb01; uint nb02;
-    uint ne10; uint ne11; uint nb10; uint nb11; uint nb12;
-    uint d_offset;
-} p;
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
-        return;
-    }
-
-    const uint i02 = gl_GlobalInvocationID.x / (p.ne00*p.ne01);
-    const uint i01 = (gl_GlobalInvocationID.x - i02*p.ne01*p.ne00) / p.ne00;
-    const uint i00 = gl_GlobalInvocationID.x - i02*p.ne01*p.ne00 - i01*p.ne00;
-    const uint a_idx = i00*p.nb00 + i01*p.nb01 + i02*p.nb02;
-
-    const uint i12 = gl_GlobalInvocationID.x / (p.ne10*p.ne11);
-    const uint i11 = (gl_GlobalInvocationID.x - i12*p.ne11*p.ne10) / p.ne10;
-    const uint i10 = gl_GlobalInvocationID.x - i12*p.ne11*p.ne10 - i11*p.ne10;
-    const uint d_idx = i10*p.nb10 + i11*p.nb11 + i12*p.nb12;
-"""
-cpy_end = """
-    data_d[p.d_offset + d_idx] = D_TYPE(data_a[a_idx]);
-}
-"""
-# Causes an optimization error otherwise
-cpy_f16_f16_end = """
-    data_d[p.d_offset + d_idx] = data_a[a_idx];
-}
-"""
-
-# GET_ROWS
-get_rows_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_8bit_storage : require
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {int data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
-void main() {
-    const uint col = int(gl_GlobalInvocationID.x) * 2;
-    const uint row = int(gl_GlobalInvocationID.y);
-
-    if (col >= p.KY) {
-        return;
-    }
-
-    const uint r = uint(data_b[row]);
-
-    // copy data_a[r*p.KY + col] to dst[row*p.KX + col]
-    const uint xi = r*p.KY + col;
-    const uint di = row*p.KY + col;
-
-    const uint ib = xi/QUANT_K; // block index
-    const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index
-    const uint iybs = di - di%QUANT_K; // y block start index
-    const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
-
-    DEQUANT_FUNC
-
-    dst[iybs + iqs + 0]        = D_TYPE(v.x);
-    dst[iybs + iqs + y_offset] = D_TYPE(v.y);
-}
-"""
-
-# UNARY
-gelu_body = """
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const uint i = gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float xi = float(data_a[i]);
-    const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
-    data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
-}
-"""
-
-silu_body = """
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    const float xi = float(data_a[i]);
-    data_d[i] = D_TYPE(xi / (1.0f + exp(-xi)));
-}
-"""
-
-relu_body = """
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint i = gl_GlobalInvocationID.x;
-
-    if (i >= p.KX) {
-        return;
-    }
-
-    data_d[i] = max(float(data_a[i]), 0);
-}
-"""
-
-# DIAG_MASK_INF
-diag_mask_inf_head = """#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout (push_constant) uniform parameter
-{
-    uint ncols;
-    uint rows_per_channel;
-    uint n_past;
-} p;
-"""
-diag_mask_inf_body = """
-#extension GL_EXT_control_flow_attributes : enable
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    const uint col = gl_GlobalInvocationID.y;
-    const uint row = gl_GlobalInvocationID.x;
-
-    if (col >= p.ncols) {
-        return;
-    }
-
-    const uint i = row*p.ncols + col;
-    data_d[i] = D_TYPE(data_a[i] - float(uint(col > p.n_past + row % p.rows_per_channel) * 0xFFFFFFFF));
-}
-"""
-
-# NORMS
-norm_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#define BLOCK_SIZE 512
-
-layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-shared vec2 sum[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x;
-    const uint tid = gl_LocalInvocationID.x;
-
-    const float eps = 1e-5f;
-
-    sum[tid] = vec2(0.0f, 0.0f);
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        const float xi = float(data_a[row*p.KX + col]);
-        sum[tid].x += xi;
-        sum[tid].y += xi * xi;
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            sum[tid] += sum[tid + s];
-        }
-        barrier();
-    }
-
-    const float mean = sum[0].x / p.KX;
-    const float var = sum[0].y / p.KX - mean * mean;
-    const float inv_std = inversesqrt(var + 1e-5f);
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
-    }
-}
-"""
-
-rms_norm_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#define BLOCK_SIZE 512
-
-layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-shared FLOAT_TYPE sum[BLOCK_SIZE];
-
-void main() {
-    const uint row = gl_WorkGroupID.x;
-    const uint tid = gl_LocalInvocationID.x;
-
-    sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]);
-        sum[tid] += xi * xi;
-    }
-
-    // sum up partial sums and write back result
-    barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            sum[tid] += sum[tid + s];
-        }
-        barrier();
-    }
-
-    const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX);
-    const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col]));
-    }
-}
-"""
-
-# SOFT_MAX
-soft_max_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#define BLOCK_SIZE 512
-
-layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
-layout (binding = 2) buffer D {D_TYPE data_d[];};
-
-shared FLOAT_TYPE vals[BLOCK_SIZE];
-
-void main() {
-    const uint tid = gl_LocalInvocationID.x;
-    const uint rowx = gl_WorkGroupID.x;
-    const uint rowy = rowx % p.KY;
-
-    // Find max
-    vals[tid] = uintBitsToFloat(0xFF800000);
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        vals[tid] = max(vals[tid], FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.param1 + (p.KY > 0 ? FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)));
-    }
-
-    barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            vals[tid] = max(vals[tid], vals[tid + s]);
-        }
-        barrier();
-    }
-
-    const FLOAT_TYPE max_val = vals[0];
-    barrier();
-
-    // Sum up values
-    vals[tid] = FLOAT_TYPE(0.0f);
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        const uint i = rowx * p.KX + col;
-        const FLOAT_TYPE val = exp(FLOAT_TYPE(data_a[i]) * p.param1 + (p.KY > 0 ? FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
-        vals[tid] += val;
-        data_d[i] = D_TYPE(val);
-    }
-
-    barrier();
-    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
-        if (tid < s) {
-            vals[tid] += vals[tid + s];
-        }
-        barrier();
-    }
-
-    const D_TYPE divisor = D_TYPE(vals[0]);
-
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        data_d[rowx*p.KX + col] /= divisor;
-    }
-}
-"""
-
-# ROPE
-rope_src = """
-#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {int data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
-
-layout (push_constant) uniform parameter {
-    uint ncols;
-    float freq_scale;
-    uint p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
-} p;
-
-float rope_yarn_ramp(const float low, const float high, const uint i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) {
-    float mscale = p.attn_factor;
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = p.freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (p.ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
-    }
-    cos_theta = cos(theta) * mscale;
-    sin_theta = sin(theta) * mscale;
-}
-
-void main() {
-    const uint col = gl_GlobalInvocationID.y * 2;
-    const uint row = gl_GlobalInvocationID.x;
-
-    if (col >= p.ncols) {
-        return;
-    }
-
-    const uint i = row*p.ncols + col;
-    const uint i2 = row/p.p_delta_rows;
-
-    const int pos = data_b[i2];
-    const float theta_base = pos * pow(p.freq_base, -float(col)/p.ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, col, cos_theta, sin_theta);
-
-    const float x0 = float(data_a[i + 0]);
-    const float x1 = float(data_a[i + 1]);
-
-    data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
-    data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta);
-}
-"""
-
-rope_neox_src = """
-#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {int data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
-
-layout (push_constant) uniform parameter {
-    uint ncols;
-    uint ndims;
-    float freq_scale;
-    uint p_delta_rows;
-    float freq_base;
-    float ext_factor;
-    float attn_factor;
-    float corr_dims[4];
-    float theta_scale;
-    float inv_ndims;
-} p;
-
-float rope_yarn_ramp(const float low, const float high, const uint i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) {
-    float mscale = p.attn_factor;
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = p.freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (p.ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
-    }
-    cos_theta = cos(theta) * mscale;
-    sin_theta = sin(theta) * mscale;
-}
-
-void main() {
-    const uint col = gl_GlobalInvocationID.y * 2;
-    const uint row = gl_GlobalInvocationID.x;
-
-    if (col >= p.ncols) {
-        return;
-    }
-
-    const uint ib = col / p.ndims;
-    const uint ic = col % p.ndims;
-
-    if (ib > 0) {
-        const uint i = row*p.ncols + ib*p.ndims + ic;
-
-        data_d[i + 0] = data_a[i + 0];
-        data_d[i + 1] = data_a[i + 1];
-
-        return;
-    }
-
-    const uint i  = row*p.ncols + ib*p.ndims + ic/2;
-    const uint i2 = row/p.p_delta_rows;
-
-    const float cur_rot = p.inv_ndims * ic - ib;
-
-    const int pos = data_b[i2];
-    const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta);
-
-    const float x0 = float(data_a[i + 0]);
-    const float x1 = float(data_a[i + p.ndims/2]);
-
-    data_d[i + 0]        = D_TYPE(x0*cos_theta - x1*sin_theta);
-    data_d[i + p.ndims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
-}
-"""
-
-GLSLC = "glslc"
-
-VK_NUM_TYPES = 16
-
-GGML_TYPE_F32  = 0
-GGML_TYPE_F16  = 1
-GGML_TYPE_Q4_0 = 2
-GGML_TYPE_Q4_1 = 3
-GGML_TYPE_Q5_0 = 6
-GGML_TYPE_Q5_1 = 7
-GGML_TYPE_Q8_0 = 8
-GGML_TYPE_Q8_1 = 9
-GGML_TYPE_Q2_K = 10
-GGML_TYPE_Q3_K = 11
-GGML_TYPE_Q4_K = 12
-GGML_TYPE_Q5_K = 13
-GGML_TYPE_Q6_K = 14
-GGML_TYPE_Q8_K = 15
-
-
-type_names = {
-    GGML_TYPE_F32: "f32",
-    GGML_TYPE_F16: "f16",
-    GGML_TYPE_Q4_0: "q4_0",
-    GGML_TYPE_Q4_1: "q4_1",
-    GGML_TYPE_Q5_0: "q5_0",
-    GGML_TYPE_Q5_1: "q5_1",
-    GGML_TYPE_Q8_0: "q8_0",
-    GGML_TYPE_Q8_1: "q8_1",
-    GGML_TYPE_Q2_K: "q2_K",
-    GGML_TYPE_Q3_K: "q3_K",
-    GGML_TYPE_Q4_K: "q4_K",
-    GGML_TYPE_Q5_K: "q5_K",
-    GGML_TYPE_Q6_K: "q6_K",
-    GGML_TYPE_Q8_K: "q8_K",
-}
-
-K_QUANTS_PER_ITERATION = 2
-
-ASYNCIO_CONCURRENCY = 64
-
-output_dir = gettempdir()
-
-lock = asyncio.Lock()
-shader_fnames = []
-
-
-async def string_to_spv(name, code, defines, fp16=True):
-    f = NamedTemporaryFile(mode="w", delete=False)
-    f.write(code)
-    f.flush()
-
-    name = f"{name}{'_fp32' if not fp16 else ''}"
-    fname = os.path.join(output_dir, f"{name}.comp")
-
-    cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", f.name, "-o", fname]
-
-    cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
-
-    proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
-
-    stdout, stderr = await proc.communicate()
-
-    stdout = stdout.decode()
-    error = stderr.decode()
-
-    if proc.returncode:
-        # Generate preprocessed code
-        cmd = [GLSLC, "-E", f.name]
-        cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
-
-        proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
-
-        stdout, stderr = await proc.communicate()
-
-        print(" ".join(cmd))
-
-        if proc.returncode:
-            raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}")
-
-        preprocessed_code = stdout.decode()
-
-        cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
-        code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
-        print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}")
-        f.close()
-        os.remove(f.name)
-        sys.exit(proc.returncode)
-
-    f.close()
-    os.remove(f.name)
-
-    async with lock:
-        shader_fnames.append((name, fname))
-
-
-async def main():
-    print("ggml_vulkan: Generating and compiling shaders to SPIR-V")
-
-    tasks = []
-
-    for fp16 in (False, True):
-        # mulmat
-        if fp16:
-            shader_float_type = shader_f16
-            load_vec = "8"
-            vec_type_f16 = "f16mat2x4"
-            vec_type = "mat2x4"
-        else:
-            shader_float_type = shader_f32
-            load_vec = "4"
-            vec_type_f16 = "f16vec4"
-            vec_type = "vec4"
-
-        stream = []
-        stream.extend((mulmat_head, shader_float_type, mulmat_body))
-        tasks.append(string_to_spv("matmul_f32_l", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f32_m", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f32_s", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f32_aligned_l", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
-
-        tasks.append(string_to_spv("matmul_f16_l", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_m", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_s", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_aligned_l", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16))
-
-        tasks.append(string_to_spv("matmul_f16_f32_l", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_f32_m", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_f32_s", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_f32_aligned_l", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
-        tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
-
-    # Shaders where precision is needed, so no fp16 version
-
-    # mul mat vec
-    for i in range(0, VK_NUM_TYPES):
-        stream.clear()
-        stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
-
-        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
-        elif i == GGML_TYPE_Q4_0:
-            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
-        elif i == GGML_TYPE_Q4_1:
-            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body))
-        elif i == GGML_TYPE_Q5_0:
-            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body))
-        elif i == GGML_TYPE_Q5_1:
-            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body))
-        elif i == GGML_TYPE_Q8_0:
-            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body))
-        elif i == GGML_TYPE_Q2_K:
-            stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
-        elif i == GGML_TYPE_Q3_K:
-            stream.extend((shader_q3_K_defines, mul_mat_vec_q3_K_body))
-        elif i == GGML_TYPE_Q4_K:
-            stream.extend((shader_q4_K_defines, mul_mat_vec_q4_K_body))
-        elif i == GGML_TYPE_Q5_K:
-            stream.extend((shader_q5_K_defines, mul_mat_vec_q5_K_body))
-        elif i == GGML_TYPE_Q6_K:
-            stream.extend((shader_q6_K_defines, mul_mat_vec_q6_K_body))
-        else:
-            continue
-
-        tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
-
-    # Dequant shaders
-    for i in range(0, VK_NUM_TYPES):
-        stream.clear()
-
-        stream.extend((dequant_head, shader_int8_ext, shader_f32))
-
-        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines,  shader_f16_dequant_func,  dequant_body))
-        elif i == GGML_TYPE_Q4_0:
-            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, dequant_body))
-        elif i == GGML_TYPE_Q4_1:
-            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, dequant_body))
-        elif i == GGML_TYPE_Q5_0:
-            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, dequant_body))
-        elif i == GGML_TYPE_Q5_1:
-            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, dequant_body))
-        elif i == GGML_TYPE_Q8_0:
-            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, dequant_body))
-        elif i == GGML_TYPE_Q2_K:
-            stream.extend((shader_q2_K_defines, dequant_q2_K_body))
-        elif i == GGML_TYPE_Q3_K:
-            stream.extend((shader_q3_K_defines, dequant_q3_K_body))
-        elif i == GGML_TYPE_Q4_K:
-            stream.extend((shader_q4_K_defines, dequant_q4_K_body))
-        elif i == GGML_TYPE_Q5_K:
-            stream.extend((shader_q5_K_defines, dequant_q5_K_body))
-        elif i == GGML_TYPE_Q6_K:
-            stream.extend((shader_q6_K_defines, dequant_q6_K_body))
-        else:
-            continue
-
-        tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}))
-
-    tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}))
-
-    # get_rows
-    for i in range(0, VK_NUM_TYPES):
-        stream.clear()
-        stream.extend((generic_head, shader_int8_ext, shader_f32))
-
-        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines,  shader_f16_dequant_func,  get_rows_body))
-        elif i == GGML_TYPE_Q4_0:
-            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
-        elif i == GGML_TYPE_Q4_1:
-            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
-        elif i == GGML_TYPE_Q5_0:
-            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
-        elif i == GGML_TYPE_Q5_1:
-            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
-        elif i == GGML_TYPE_Q8_0:
-            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
-        else:
-            continue
-
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
-
-    # Norms
-    tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
-    tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
-
-    tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
-    tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
-
-    tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
-
-    tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
-
-    # Helper to decorate tasks with semaphore acquisition.
-    async def withSemaphore(sem, task):
-        async with sem:
-            return await task
-
-    # Run tasks concurrently guarded by a concurrency limit.
-    sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
-    await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
-
-    with open("ggml-vulkan-shaders.hpp", "w") as f:
-        f.write("#include <cstdint>\n\n")
-        for name, path in sorted(shader_fnames):
-
-            with open(path, "rb") as spv:
-                counter = 0
-                newline_counter = 0
-                f.write(f"unsigned char {name}_data[] = {{\n")
-                for val in spv.read():
-                    f.write(f"0x{val:02x},")
-                    newline_counter += 1
-                    counter += 1
-                    if newline_counter >= 12:
-                        newline_counter = 0
-                        f.write("\n")
-            f.write("\n};\n")
-            f.write(f"const uint64_t {name}_len = {counter};\n\n")
-            os.remove(path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
-
-    parser.add_argument("--glslc", help="Path to glslc")
-
-    args = parser.parse_args()
-
-    if args.glslc:
-        GLSLC = args.glslc
-
-    asyncio.run(main())
diff --git a/gguf-py/README.md b/gguf-py/README.md
index 22d7ffa52..2e513633d 100644
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -3,7 +3,7 @@
 This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
 (GGML Universal File) format.
 
-See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py)
+See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)
 as an example for its usage.
 
 ## Installation
@@ -15,11 +15,15 @@ pip install gguf
 
 [examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
 
-[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console.
+[examples/reader.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format.
 
-[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
 
-[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files.
+[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+
+[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
+
+[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
 
 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:
@@ -76,6 +80,13 @@ python -m build
 python -m twine upload dist/*
 ```
 
+## Run Unit Tests
+
+From root of this repository you can run this command to run all the unit tests
+
+```bash
+python -m unittest discover ./gguf-py -v
+```
+
 ## TODO
-- [ ] Add tests
 - [ ] Include conversion scripts as command line entry points in this package.
diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py
index 62e0769da..d841048c6 100644
--- a/gguf-py/examples/reader.py
+++ b/gguf-py/examples/reader.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
+import logging
 import sys
 from pathlib import Path
 from gguf.gguf_reader import GGUFReader
 
+logger = logging.getLogger("reader")
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
@@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path):
     reader = GGUFReader(gguf_file_path)
 
     # List all key-value pairs in a columnized format
-    print("Key-Value Pairs:")
+    print("Key-Value Pairs:") # noqa: NP100
     max_key_length = max(len(key) for key in reader.fields.keys())
     for key, field in reader.fields.items():
         value = field.parts[field.data[0]]
-        print(f"{key:{max_key_length}} : {value}")
-    print("----")
+        print(f"{key:{max_key_length}} : {value}") # noqa: NP100
+    print("----") # noqa: NP100
 
     # List all tensors
-    print("Tensors:")
+    print("Tensors:") # noqa: NP100
     tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
-    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
-    print("-" * 80)
+    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
+    print("-" * 80) # noqa: NP100
     for tensor in reader.tensors:
         shape_str = "x".join(map(str, tensor.shape))
         size_str = str(tensor.n_elements)
         quantization_str = tensor.tensor_type.name
-        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))
+        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
 
 
 if __name__ == '__main__':
     if len(sys.argv) < 2:
-        print("Usage: reader.py <path_to_gguf_file>")
+        logger.info("Usage: reader.py <path_to_gguf_file>")
         sys.exit(1)
     gguf_file_path = sys.argv[1]
     read_gguf_file(gguf_file_path)
diff --git a/gguf-py/examples/writer.py b/gguf-py/examples/writer.py
index f39eed1af..731873a7d 100755
--- a/gguf-py/examples/writer.py
+++ b/gguf-py/examples/writer.py
@@ -15,7 +15,6 @@ def writer_example() -> None:
     # Example usage with a file
     gguf_writer = GGUFWriter("example.gguf", "llama")
 
-    gguf_writer.add_architecture()
     gguf_writer.add_block_count(12)
     gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
     gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py
index 110ab342c..243defc4c 100644
--- a/gguf-py/gguf/__init__.py
+++ b/gguf-py/gguf/__init__.py
@@ -1,5 +1,9 @@
 from .constants import *
+from .lazy import *
 from .gguf_reader import *
 from .gguf_writer import *
+from .quants import *
 from .tensor_mapping import *
 from .vocab import *
+from .utility import *
+from .metadata import *
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 5db760cb1..ecac5b4bb 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 from enum import Enum, IntEnum, auto
 from typing import Any
 
@@ -11,6 +10,7 @@ from typing import Any
 GGUF_MAGIC             = 0x46554747  # "GGUF"
 GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys
@@ -19,28 +19,103 @@ GGUF_DEFAULT_ALIGNMENT = 32
 
 class Keys:
     class General:
-        ARCHITECTURE         = "general.architecture"
-        QUANTIZATION_VERSION = "general.quantization_version"
-        ALIGNMENT            = "general.alignment"
-        NAME                 = "general.name"
-        AUTHOR               = "general.author"
-        URL                  = "general.url"
-        DESCRIPTION          = "general.description"
-        LICENSE              = "general.license"
-        SOURCE_URL           = "general.source.url"
-        SOURCE_HF_REPO       = "general.source.huggingface.repository"
-        FILE_TYPE            = "general.file_type"
+        TYPE                       = "general.type"
+        ARCHITECTURE               = "general.architecture"
+        QUANTIZATION_VERSION       = "general.quantization_version"
+        ALIGNMENT                  = "general.alignment"
+        FILE_TYPE                  = "general.file_type"
+
+        # Authorship Metadata
+        NAME                       = "general.name"
+        AUTHOR                     = "general.author"
+        VERSION                    = "general.version"
+        ORGANIZATION               = "general.organization"
+
+        FINETUNE                   = "general.finetune"
+        BASENAME                   = "general.basename"
+
+        DESCRIPTION                = "general.description"
+        QUANTIZED_BY               = "general.quantized_by"
+
+        SIZE_LABEL                 = "general.size_label"
+
+        # Licensing details
+        LICENSE                    = "general.license"
+        LICENSE_NAME               = "general.license.name"
+        LICENSE_LINK               = "general.license.link"
+
+        # Typically represents the converted GGUF repo (Unless native)
+        URL                        = "general.url" # Model Website/Paper
+        DOI                        = "general.doi"
+        UUID                       = "general.uuid"
+        REPO_URL                   = "general.repo_url" # Model Source Repository (git/svn/etc...)
+
+        # Model Source during conversion
+        SOURCE_URL                 = "general.source.url" # Model Website/Paper
+        SOURCE_DOI                 = "general.source.doi"
+        SOURCE_UUID                = "general.source.uuid"
+        SOURCE_REPO_URL            = "general.source.repo_url" # Model Source Repository (git/svn/etc...)
+
+        # Base Model Source. There can be more than one source if it's a merged
+        # model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
+        # tracing linage of models as it is finetuned or merged over time.
+        BASE_MODEL_COUNT           = "general.base_model.count"
+        BASE_MODEL_NAME            = "general.base_model.{id}.name"
+        BASE_MODEL_AUTHOR          = "general.base_model.{id}.author"
+        BASE_MODEL_VERSION         = "general.base_model.{id}.version"
+        BASE_MODEL_ORGANIZATION    = "general.base_model.{id}.organization"
+        BASE_MODEL_DESCRIPTION     = "general.base_model.{id}.description"
+        BASE_MODEL_URL             = "general.base_model.{id}.url" # Model Website/Paper
+        BASE_MODEL_DOI             = "general.base_model.{id}.doi"
+        BASE_MODEL_UUID            = "general.base_model.{id}.uuid"
+        BASE_MODEL_REPO_URL        = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
+
+        # Dataset Source
+        DATASET_COUNT           = "general.dataset.count"
+        DATASET_NAME            = "general.dataset.{id}.name"
+        DATASET_AUTHOR          = "general.dataset.{id}.author"
+        DATASET_VERSION         = "general.dataset.{id}.version"
+        DATASET_ORGANIZATION    = "general.dataset.{id}.organization"
+        DATASET_DESCRIPTION     = "general.dataset.{id}.description"
+        DATASET_URL             = "general.dataset.{id}.url" # Model Website/Paper
+        DATASET_DOI             = "general.dataset.{id}.doi"
+        DATASET_UUID            = "general.dataset.{id}.uuid"
+        DATASET_REPO_URL        = "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...)
+
+        # Array based KV stores
+        TAGS                       = "general.tags"
+        LANGUAGES                  = "general.languages"
 
     class LLM:
-        CONTEXT_LENGTH        = "{arch}.context_length"
-        EMBEDDING_LENGTH      = "{arch}.embedding_length"
-        BLOCK_COUNT           = "{arch}.block_count"
-        FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
-        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
-        TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
-        EXPERT_COUNT          = "{arch}.expert_count"
-        EXPERT_USED_COUNT     = "{arch}.expert_used_count"
-        POOLING_TYPE          = "{arch}.pooling_type"
+        VOCAB_SIZE                        = "{arch}.vocab_size"
+        CONTEXT_LENGTH                    = "{arch}.context_length"
+        EMBEDDING_LENGTH                  = "{arch}.embedding_length"
+        FEATURES_LENGTH                   = "{arch}.features_length"
+        BLOCK_COUNT                       = "{arch}.block_count"
+        LEADING_DENSE_BLOCK_COUNT         = "{arch}.leading_dense_block_count"
+        FEED_FORWARD_LENGTH               = "{arch}.feed_forward_length"
+        EXPERT_FEED_FORWARD_LENGTH        = "{arch}.expert_feed_forward_length"
+        EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
+        USE_PARALLEL_RESIDUAL             = "{arch}.use_parallel_residual"
+        TENSOR_DATA_LAYOUT                = "{arch}.tensor_data_layout"
+        EXPERT_COUNT                      = "{arch}.expert_count"
+        EXPERT_USED_COUNT                 = "{arch}.expert_used_count"
+        EXPERT_SHARED_COUNT               = "{arch}.expert_shared_count"
+        EXPERT_WEIGHTS_SCALE              = "{arch}.expert_weights_scale"
+        EXPERT_WEIGHTS_NORM               = "{arch}.expert_weights_norm"
+        EXPERT_GATING_FUNC                = "{arch}.expert_gating_func"
+        POOLING_TYPE                      = "{arch}.pooling_type"
+        LOGIT_SCALE                       = "{arch}.logit_scale"
+        DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
+        ATTN_LOGIT_SOFTCAPPING            = "{arch}.attn_logit_softcapping"
+        FINAL_LOGIT_SOFTCAPPING           = "{arch}.final_logit_softcapping"
+        SWIN_NORM                         = "{arch}.swin_norm"
+        RESCALE_EVERY_N_LAYERS            = "{arch}.rescale_every_n_layers"
+        TIME_MIX_EXTRA_DIM                = "{arch}.time_mix_extra_dim"
+        TIME_DECAY_EXTRA_DIM              = "{arch}.time_decay_extra_dim"
+        RESIDUAL_SCALE                    = "{arch}.residual_scale"
+        EMBEDDING_SCALE                   = "{arch}.embedding_scale"
+        TOKEN_SHIFT_COUNT                 = "{arch}.token_shift_count"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -51,157 +126,473 @@ class Keys:
         VALUE_LENGTH      = "{arch}.attention.value_length"
         LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
         LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+        GROUPNORM_EPS     = "{arch}.attention.group_norm_epsilon"
+        GROUPNORM_GROUPS  = "{arch}.attention.group_norm_groups"
         CAUSAL            = "{arch}.attention.causal"
+        Q_LORA_RANK       = "{arch}.attention.q_lora_rank"
+        KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
+        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
+        SLIDING_WINDOW    = "{arch}.attention.sliding_window"
+        SCALE             = "{arch}.attention.scale"
 
     class Rope:
-        DIMENSION_COUNT      = "{arch}.rope.dimension_count"
-        FREQ_BASE            = "{arch}.rope.freq_base"
-        SCALING_TYPE         = "{arch}.rope.scaling.type"
-        SCALING_FACTOR       = "{arch}.rope.scaling.factor"
-        SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
-        SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
+        DIMENSION_COUNT         = "{arch}.rope.dimension_count"
+        DIMENSION_SECTIONS      = "{arch}.rope.dimension_sections"
+        FREQ_BASE               = "{arch}.rope.freq_base"
+        SCALING_TYPE            = "{arch}.rope.scaling.type"
+        SCALING_FACTOR          = "{arch}.rope.scaling.factor"
+        SCALING_ATTN_FACTOR     = "{arch}.rope.scaling.attn_factor"
+        SCALING_ORIG_CTX_LEN    = "{arch}.rope.scaling.original_context_length"
+        SCALING_FINETUNED       = "{arch}.rope.scaling.finetuned"
+        SCALING_YARN_LOG_MUL    = "{arch}.rope.scaling.yarn_log_multiplier"
+
+    class Split:
+        LLM_KV_SPLIT_NO            = "split.no"
+        LLM_KV_SPLIT_COUNT         = "split.count"
+        LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
+
+    class SSM:
+        CONV_KERNEL    = "{arch}.ssm.conv_kernel"
+        INNER_SIZE     = "{arch}.ssm.inner_size"
+        STATE_SIZE     = "{arch}.ssm.state_size"
+        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
+        DT_B_C_RMS     = "{arch}.ssm.dt_b_c_rms"
+
+    class WKV:
+        HEAD_SIZE = "{arch}.wkv.head_size"
+
+    class PosNet:
+        EMBEDDING_LENGTH = "{arch}.posnet.embedding_length"
+        BLOCK_COUNT      = "{arch}.posnet.block_count"
+
+    class ConvNext:
+        EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
+        BLOCK_COUNT      = "{arch}.convnext.block_count"
 
     class Tokenizer:
-        MODEL            = "tokenizer.ggml.model"
-        LIST             = "tokenizer.ggml.tokens"
-        TOKEN_TYPE       = "tokenizer.ggml.token_type"
-        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
-        SCORES           = "tokenizer.ggml.scores"
-        MERGES           = "tokenizer.ggml.merges"
-        BOS_ID           = "tokenizer.ggml.bos_token_id"
-        EOS_ID           = "tokenizer.ggml.eos_token_id"
-        UNK_ID           = "tokenizer.ggml.unknown_token_id"
-        SEP_ID           = "tokenizer.ggml.seperator_token_id"
-        PAD_ID           = "tokenizer.ggml.padding_token_id"
-        CLS_ID           = "tokenizer.ggml.cls_token_id"
-        MASK_ID          = "tokenizer.ggml.mask_token_id"
-        ADD_BOS          = "tokenizer.ggml.add_bos_token"
-        ADD_EOS          = "tokenizer.ggml.add_eos_token"
-        ADD_PREFIX       = "tokenizer.ggml.add_space_prefix"
-        HF_JSON          = "tokenizer.huggingface.json"
-        RWKV             = "tokenizer.rwkv.world"
-        CHAT_TEMPLATE    = "tokenizer.chat_template"
+        MODEL                = "tokenizer.ggml.model"
+        PRE                  = "tokenizer.ggml.pre"
+        LIST                 = "tokenizer.ggml.tokens"
+        TOKEN_TYPE           = "tokenizer.ggml.token_type"
+        TOKEN_TYPE_COUNT     = "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        SCORES               = "tokenizer.ggml.scores"
+        MERGES               = "tokenizer.ggml.merges"
+        BOS_ID               = "tokenizer.ggml.bos_token_id"
+        EOS_ID               = "tokenizer.ggml.eos_token_id"
+        EOT_ID               = "tokenizer.ggml.eot_token_id"
+        EOM_ID               = "tokenizer.ggml.eom_token_id"
+        UNK_ID               = "tokenizer.ggml.unknown_token_id"
+        SEP_ID               = "tokenizer.ggml.seperator_token_id"
+        PAD_ID               = "tokenizer.ggml.padding_token_id"
+        MASK_ID              = "tokenizer.ggml.mask_token_id"
+        ADD_BOS              = "tokenizer.ggml.add_bos_token"
+        ADD_EOS              = "tokenizer.ggml.add_eos_token"
+        ADD_PREFIX           = "tokenizer.ggml.add_space_prefix"
+        REMOVE_EXTRA_WS      = "tokenizer.ggml.remove_extra_whitespaces"
+        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
+        HF_JSON              = "tokenizer.huggingface.json"
+        RWKV                 = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE        = "tokenizer.chat_template"
+        CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
+        CHAT_TEMPLATES       = "tokenizer.chat_templates"
+        # FIM/Infill special tokens constants
+        FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID           = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID           = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID           = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID           = "tokenizer.ggml.fim_sep_token_id"
+        # deprecated:
+        PREFIX_ID            = "tokenizer.ggml.prefix_token_id"
+        SUFFIX_ID            = "tokenizer.ggml.suffix_token_id"
+        MIDDLE_ID            = "tokenizer.ggml.middle_token_id"
 
+    class Adapter:
+        TYPE       = "adapter.type"
+        LORA_ALPHA = "adapter.lora.alpha"
 
 #
 # recommended mapping of model tensor names for storage in gguf
 #
 
 
+class GGUFType:
+    MODEL   = "model"
+    ADAPTER = "adapter"
+
+
 class MODEL_ARCH(IntEnum):
-    LLAMA      = auto()
-    FALCON     = auto()
-    BAICHUAN   = auto()
-    GPT2       = auto()
-    GPTJ       = auto()
-    GPTNEOX    = auto()
-    MPT        = auto()
-    STARCODER  = auto()
-    PERSIMMON  = auto()
-    REFACT     = auto()
-    BERT       = auto()
-    NOMIC_BERT = auto()
-    BLOOM      = auto()
-    STABLELM   = auto()
-    QWEN       = auto()
-    QWEN2      = auto()
-    PHI2       = auto()
-    PLAMO      = auto()
-    CODESHELL  = auto()
-    ORION      = auto()
-    INTERNLM2  = auto()
-    MINICPM    = auto()
-    GEMMA      = auto()
-    STARCODER2 = auto()
+    LLAMA            = auto()
+    DECI             = auto()
+    FALCON           = auto()
+    BAICHUAN         = auto()
+    GROK             = auto()
+    GPT2             = auto()
+    GPTJ             = auto()
+    GPTNEOX          = auto()
+    MPT              = auto()
+    STARCODER        = auto()
+    REFACT           = auto()
+    BERT             = auto()
+    NOMIC_BERT       = auto()
+    JINA_BERT_V2     = auto()
+    BLOOM            = auto()
+    STABLELM         = auto()
+    QWEN             = auto()
+    QWEN2            = auto()
+    QWEN2MOE         = auto()
+    QWEN2VL          = auto()
+    PHI2             = auto()
+    PHI3             = auto()
+    PHIMOE           = auto()
+    PLAMO            = auto()
+    CODESHELL        = auto()
+    ORION            = auto()
+    INTERNLM2        = auto()
+    MINICPM          = auto()
+    MINICPM3         = auto()
+    GEMMA            = auto()
+    GEMMA2           = auto()
+    STARCODER2       = auto()
+    RWKV6            = auto()
+    RWKV6QWEN2       = auto()
+    MAMBA            = auto()
+    XVERSE           = auto()
+    COMMAND_R        = auto()
+    COHERE2          = auto()
+    DBRX             = auto()
+    OLMO             = auto()
+    OLMO2            = auto()
+    OLMOE            = auto()
+    OPENELM          = auto()
+    ARCTIC           = auto()
+    DEEPSEEK         = auto()
+    DEEPSEEK2        = auto()
+    CHATGLM          = auto()
+    BITNET           = auto()
+    T5               = auto()
+    T5ENCODER        = auto()
+    JAIS             = auto()
+    NEMOTRON         = auto()
+    EXAONE           = auto()
+    GRANITE          = auto()
+    GRANITE_MOE      = auto()
+    CHAMELEON        = auto()
+    WAVTOKENIZER_DEC = auto()
 
 
 class MODEL_TENSOR(IntEnum):
-    TOKEN_EMBD      = auto()
-    TOKEN_EMBD_NORM = auto()
-    TOKEN_TYPES     = auto()
-    POS_EMBD        = auto()
-    OUTPUT          = auto()
-    OUTPUT_NORM     = auto()
-    ROPE_FREQS      = auto()
-    ATTN_Q          = auto()
-    ATTN_K          = auto()
-    ATTN_V          = auto()
-    ATTN_QKV        = auto()
-    ATTN_OUT        = auto()
-    ATTN_NORM       = auto()
-    ATTN_NORM_2     = auto()
-    ATTN_OUT_NORM   = auto()
-    ATTN_ROT_EMBD   = auto()
-    FFN_GATE_INP    = auto()
-    FFN_NORM        = auto()
-    FFN_GATE        = auto()
-    FFN_DOWN        = auto()
-    FFN_UP          = auto()
-    FFN_ACT         = auto()
-    FFN_GATE_EXP    = auto()
-    FFN_DOWN_EXP    = auto()
-    FFN_UP_EXP      = auto()
-    ATTN_Q_NORM     = auto()
-    ATTN_K_NORM     = auto()
-    LAYER_OUT_NORM  = auto()
+    TOKEN_EMBD           = auto()
+    TOKEN_EMBD_NORM      = auto()
+    TOKEN_TYPES          = auto()
+    POS_EMBD             = auto()
+    OUTPUT               = auto()
+    OUTPUT_NORM          = auto()
+    ROPE_FREQS           = auto()
+    ROPE_FACTORS_LONG    = auto()
+    ROPE_FACTORS_SHORT   = auto()
+    ATTN_Q               = auto()
+    ATTN_K               = auto()
+    ATTN_V               = auto()
+    ATTN_QKV             = auto()
+    ATTN_OUT             = auto()
+    ATTN_NORM            = auto()
+    ATTN_NORM_2          = auto()
+    ATTN_OUT_NORM        = auto()
+    ATTN_POST_NORM       = auto()
+    ATTN_ROT_EMBD        = auto()
+    FFN_GATE_INP         = auto()
+    FFN_GATE_INP_SHEXP   = auto()
+    FFN_NORM             = auto()
+    FFN_PRE_NORM         = auto()
+    FFN_POST_NORM        = auto()
+    FFN_GATE             = auto()
+    FFN_DOWN             = auto()
+    FFN_UP               = auto()
+    FFN_ACT              = auto()
+    FFN_NORM_EXP         = auto()
+    FFN_GATE_EXP         = auto()
+    FFN_DOWN_EXP         = auto()
+    FFN_UP_EXP           = auto()
+    FFN_GATE_SHEXP       = auto()
+    FFN_DOWN_SHEXP       = auto()
+    FFN_UP_SHEXP         = auto()
+    FFN_EXP_PROBS_B      = auto()
+    ATTN_Q_NORM          = auto()
+    ATTN_K_NORM          = auto()
+    LAYER_OUT_NORM       = auto()
+    SSM_IN               = auto()
+    SSM_CONV1D           = auto()
+    SSM_X                = auto()
+    SSM_DT               = auto()
+    SSM_A                = auto()
+    SSM_D                = auto()
+    SSM_OUT              = auto()
+    TIME_MIX_W1          = auto()
+    TIME_MIX_W2          = auto()
+    TIME_MIX_LERP_X      = auto()
+    TIME_MIX_LERP_K      = auto()
+    TIME_MIX_LERP_V      = auto()
+    TIME_MIX_LERP_R      = auto()
+    TIME_MIX_LERP_G      = auto()
+    TIME_MIX_LERP_FUSED  = auto()
+    TIME_MIX_LERP_W      = auto()
+    TIME_MIX_FIRST       = auto()
+    TIME_MIX_DECAY       = auto()
+    TIME_MIX_DECAY_W1    = auto()
+    TIME_MIX_DECAY_W2    = auto()
+    TIME_MIX_KEY         = auto()
+    TIME_MIX_VALUE       = auto()
+    TIME_MIX_RECEPTANCE  = auto()
+    TIME_MIX_GATE        = auto()
+    TIME_MIX_LN          = auto()
+    TIME_MIX_OUTPUT      = auto()
+    CHANNEL_MIX_LERP_K   = auto()
+    CHANNEL_MIX_LERP_R   = auto()
+    CHANNEL_MIX_KEY      = auto()
+    CHANNEL_MIX_RECEPTANCE = auto()
+    CHANNEL_MIX_VALUE    = auto()
+    ATTN_Q_A             = auto()
+    ATTN_Q_B             = auto()
+    ATTN_KV_A_MQA        = auto()
+    ATTN_KV_B            = auto()
+    ATTN_Q_A_NORM        = auto()
+    ATTN_KV_A_NORM       = auto()
+    FFN_SUB_NORM         = auto()
+    ATTN_SUB_NORM        = auto()
+    DEC_ATTN_NORM        = auto()
+    DEC_ATTN_Q           = auto()
+    DEC_ATTN_K           = auto()
+    DEC_ATTN_V           = auto()
+    DEC_ATTN_OUT         = auto()
+    DEC_ATTN_REL_B       = auto()
+    DEC_CROSS_ATTN_NORM  = auto()
+    DEC_CROSS_ATTN_Q     = auto()
+    DEC_CROSS_ATTN_K     = auto()
+    DEC_CROSS_ATTN_V     = auto()
+    DEC_CROSS_ATTN_OUT   = auto()
+    DEC_CROSS_ATTN_REL_B = auto()
+    DEC_FFN_NORM         = auto()
+    DEC_FFN_GATE         = auto()
+    DEC_FFN_DOWN         = auto()
+    DEC_FFN_UP           = auto()
+    DEC_OUTPUT_NORM      = auto()
+    ENC_ATTN_NORM        = auto()
+    ENC_ATTN_Q           = auto()
+    ENC_ATTN_K           = auto()
+    ENC_ATTN_V           = auto()
+    ENC_ATTN_OUT         = auto()
+    ENC_ATTN_REL_B       = auto()
+    ENC_FFN_NORM         = auto()
+    ENC_FFN_GATE         = auto()
+    ENC_FFN_DOWN         = auto()
+    ENC_FFN_UP           = auto()
+    ENC_OUTPUT_NORM      = auto()
+    CLS                  = auto() # classifier
+    CLS_OUT              = auto() # classifier output projection
+    CONV1D               = auto()
+    CONVNEXT_DW          = auto()
+    CONVNEXT_NORM        = auto()
+    CONVNEXT_PW1         = auto()
+    CONVNEXT_PW2         = auto()
+    CONVNEXT_GAMMA       = auto()
+    POSNET_CONV1         = auto()
+    POSNET_CONV2         = auto()
+    POSNET_NORM          = auto()
+    POSNET_NORM1         = auto()
+    POSNET_NORM2         = auto()
+    POSNET_ATTN_NORM     = auto()
+    POSNET_ATTN_Q        = auto()
+    POSNET_ATTN_K        = auto()
+    POSNET_ATTN_V        = auto()
+    POSNET_ATTN_OUT      = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
-    MODEL_ARCH.LLAMA:          "llama",
-    MODEL_ARCH.FALCON:         "falcon",
-    MODEL_ARCH.BAICHUAN:       "baichuan",
-    MODEL_ARCH.GPT2:           "gpt2",
-    MODEL_ARCH.GPTJ:           "gptj",
-    MODEL_ARCH.GPTNEOX:        "gptneox",
-    MODEL_ARCH.MPT:            "mpt",
-    MODEL_ARCH.STARCODER:      "starcoder",
-    MODEL_ARCH.PERSIMMON:      "persimmon",
-    MODEL_ARCH.REFACT:         "refact",
-    MODEL_ARCH.BERT:           "bert",
-    MODEL_ARCH.NOMIC_BERT:     "nomic-bert",
-    MODEL_ARCH.BLOOM:          "bloom",
-    MODEL_ARCH.STABLELM:       "stablelm",
-    MODEL_ARCH.QWEN:           "qwen",
-    MODEL_ARCH.QWEN2:          "qwen2",
-    MODEL_ARCH.PHI2:           "phi2",
-    MODEL_ARCH.PLAMO:          "plamo",
-    MODEL_ARCH.CODESHELL:      "codeshell",
-    MODEL_ARCH.ORION:          "orion",
-    MODEL_ARCH.INTERNLM2:      "internlm2",
-    MODEL_ARCH.MINICPM:        "minicpm",
-    MODEL_ARCH.GEMMA:          "gemma",
-    MODEL_ARCH.STARCODER2:     "starcoder2",
+    MODEL_ARCH.LLAMA:            "llama",
+    MODEL_ARCH.DECI:             "deci",
+    MODEL_ARCH.FALCON:           "falcon",
+    MODEL_ARCH.BAICHUAN:         "baichuan",
+    MODEL_ARCH.GROK:             "grok",
+    MODEL_ARCH.GPT2:             "gpt2",
+    MODEL_ARCH.GPTJ:             "gptj",
+    MODEL_ARCH.GPTNEOX:          "gptneox",
+    MODEL_ARCH.MPT:              "mpt",
+    MODEL_ARCH.STARCODER:        "starcoder",
+    MODEL_ARCH.REFACT:           "refact",
+    MODEL_ARCH.BERT:             "bert",
+    MODEL_ARCH.NOMIC_BERT:       "nomic-bert",
+    MODEL_ARCH.JINA_BERT_V2:     "jina-bert-v2",
+    MODEL_ARCH.BLOOM:            "bloom",
+    MODEL_ARCH.STABLELM:         "stablelm",
+    MODEL_ARCH.QWEN:             "qwen",
+    MODEL_ARCH.QWEN2:            "qwen2",
+    MODEL_ARCH.QWEN2MOE:         "qwen2moe",
+    MODEL_ARCH.QWEN2VL:          "qwen2vl",
+    MODEL_ARCH.PHI2:             "phi2",
+    MODEL_ARCH.PHI3:             "phi3",
+    MODEL_ARCH.PHIMOE:           "phimoe",
+    MODEL_ARCH.PLAMO:            "plamo",
+    MODEL_ARCH.CODESHELL:        "codeshell",
+    MODEL_ARCH.ORION:            "orion",
+    MODEL_ARCH.INTERNLM2:        "internlm2",
+    MODEL_ARCH.MINICPM:          "minicpm",
+    MODEL_ARCH.MINICPM3:         "minicpm3",
+    MODEL_ARCH.GEMMA:            "gemma",
+    MODEL_ARCH.GEMMA2:           "gemma2",
+    MODEL_ARCH.STARCODER2:       "starcoder2",
+    MODEL_ARCH.RWKV6:            "rwkv6",
+    MODEL_ARCH.RWKV6QWEN2:       "rwkv6qwen2",
+    MODEL_ARCH.MAMBA:            "mamba",
+    MODEL_ARCH.XVERSE:           "xverse",
+    MODEL_ARCH.COMMAND_R:        "command-r",
+    MODEL_ARCH.COHERE2:          "cohere2",
+    MODEL_ARCH.DBRX:             "dbrx",
+    MODEL_ARCH.OLMO:             "olmo",
+    MODEL_ARCH.OLMO2:            "olmo2",
+    MODEL_ARCH.OLMOE:            "olmoe",
+    MODEL_ARCH.OPENELM:          "openelm",
+    MODEL_ARCH.ARCTIC:           "arctic",
+    MODEL_ARCH.DEEPSEEK:         "deepseek",
+    MODEL_ARCH.DEEPSEEK2:        "deepseek2",
+    MODEL_ARCH.CHATGLM:          "chatglm",
+    MODEL_ARCH.BITNET:           "bitnet",
+    MODEL_ARCH.T5:               "t5",
+    MODEL_ARCH.T5ENCODER:        "t5encoder",
+    MODEL_ARCH.JAIS:             "jais",
+    MODEL_ARCH.NEMOTRON:         "nemotron",
+    MODEL_ARCH.EXAONE:           "exaone",
+    MODEL_ARCH.GRANITE:          "granite",
+    MODEL_ARCH.GRANITE_MOE:      "granitemoe",
+    MODEL_ARCH.CHAMELEON:        "chameleon",
+    MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
-    MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
-    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
-    MODEL_TENSOR.TOKEN_TYPES:     "token_types",
-    MODEL_TENSOR.POS_EMBD:        "position_embd",
-    MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
-    MODEL_TENSOR.OUTPUT:          "output",
-    MODEL_TENSOR.ROPE_FREQS:      "rope_freqs",
-    MODEL_TENSOR.ATTN_NORM:       "blk.{bid}.attn_norm",
-    MODEL_TENSOR.ATTN_NORM_2:     "blk.{bid}.attn_norm_2",
-    MODEL_TENSOR.ATTN_QKV:        "blk.{bid}.attn_qkv",
-    MODEL_TENSOR.ATTN_Q:          "blk.{bid}.attn_q",
-    MODEL_TENSOR.ATTN_K:          "blk.{bid}.attn_k",
-    MODEL_TENSOR.ATTN_V:          "blk.{bid}.attn_v",
-    MODEL_TENSOR.ATTN_OUT:        "blk.{bid}.attn_output",
-    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
-    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
-    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
-    MODEL_TENSOR.ATTN_OUT_NORM:   "blk.{bid}.attn_output_norm",
-    MODEL_TENSOR.FFN_GATE_INP:    "blk.{bid}.ffn_gate_inp",
-    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
-    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
-    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
-    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
-    MODEL_TENSOR.FFN_ACT:         "blk.{bid}.ffn",
-    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate.{xid}",
-    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down.{xid}",
-    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up.{xid}",
-    MODEL_TENSOR.LAYER_OUT_NORM:  "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.TOKEN_EMBD:                "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM:           "token_embd_norm",
+    MODEL_TENSOR.TOKEN_TYPES:               "token_types",
+    MODEL_TENSOR.POS_EMBD:                  "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
+    MODEL_TENSOR.OUTPUT:                    "output",
+    MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
+    MODEL_TENSOR.ROPE_FACTORS_LONG:         "rope_factors_long",
+    MODEL_TENSOR.ROPE_FACTORS_SHORT:        "rope_factors_short",
+    MODEL_TENSOR.ATTN_NORM:                 "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2:               "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV:                  "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q:                    "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K:                    "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V:                    "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT:                  "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD:             "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM:               "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM:               "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.ATTN_OUT_NORM:             "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.ATTN_POST_NORM:            "blk.{bid}.post_attention_norm",
+    MODEL_TENSOR.FFN_GATE_INP:              "blk.{bid}.ffn_gate_inp",
+    MODEL_TENSOR.FFN_GATE_INP_SHEXP:        "blk.{bid}.ffn_gate_inp_shexp",
+    MODEL_TENSOR.FFN_NORM:                  "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_PRE_NORM:              "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_POST_NORM:             "blk.{bid}.post_ffw_norm",
+    MODEL_TENSOR.FFN_GATE:                  "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN:                  "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP:                    "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_SHEXP:            "blk.{bid}.ffn_gate_shexp",
+    MODEL_TENSOR.FFN_DOWN_SHEXP:            "blk.{bid}.ffn_down_shexp",
+    MODEL_TENSOR.FFN_UP_SHEXP:              "blk.{bid}.ffn_up_shexp",
+    MODEL_TENSOR.FFN_ACT:                   "blk.{bid}.ffn",
+    MODEL_TENSOR.FFN_NORM_EXP:              "blk.{bid}.ffn_norm_exps",
+    MODEL_TENSOR.FFN_GATE_EXP:              "blk.{bid}.ffn_gate_exps",
+    MODEL_TENSOR.FFN_DOWN_EXP:              "blk.{bid}.ffn_down_exps",
+    MODEL_TENSOR.FFN_UP_EXP:                "blk.{bid}.ffn_up_exps",
+    MODEL_TENSOR.FFN_EXP_PROBS_B:           "blk.{bid}.exp_probs_b",
+    MODEL_TENSOR.LAYER_OUT_NORM:            "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.SSM_IN:                    "blk.{bid}.ssm_in",
+    MODEL_TENSOR.SSM_CONV1D:                "blk.{bid}.ssm_conv1d",
+    MODEL_TENSOR.SSM_X:                     "blk.{bid}.ssm_x",
+    MODEL_TENSOR.SSM_DT:                    "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_A:                     "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_D:                     "blk.{bid}.ssm_d",
+    MODEL_TENSOR.SSM_OUT:                   "blk.{bid}.ssm_out",
+    MODEL_TENSOR.TIME_MIX_W1:               "blk.{bid}.time_mix_w1",
+    MODEL_TENSOR.TIME_MIX_W2:               "blk.{bid}.time_mix_w2",
+    MODEL_TENSOR.TIME_MIX_LERP_X:           "blk.{bid}.time_mix_lerp_x",
+    MODEL_TENSOR.TIME_MIX_LERP_K:           "blk.{bid}.time_mix_lerp_k",
+    MODEL_TENSOR.TIME_MIX_LERP_V:           "blk.{bid}.time_mix_lerp_v",
+    MODEL_TENSOR.TIME_MIX_LERP_R:           "blk.{bid}.time_mix_lerp_r",
+    MODEL_TENSOR.TIME_MIX_LERP_G:           "blk.{bid}.time_mix_lerp_g",
+    MODEL_TENSOR.TIME_MIX_LERP_FUSED:       "blk.{bid}.time_mix_lerp_fused",
+    MODEL_TENSOR.TIME_MIX_LERP_W:           "blk.{bid}.time_mix_lerp_w",
+    MODEL_TENSOR.TIME_MIX_FIRST:            "blk.{bid}.time_mix_first",
+    MODEL_TENSOR.TIME_MIX_DECAY:            "blk.{bid}.time_mix_decay",
+    MODEL_TENSOR.TIME_MIX_DECAY_W1:         "blk.{bid}.time_mix_decay_w1",
+    MODEL_TENSOR.TIME_MIX_DECAY_W2:         "blk.{bid}.time_mix_decay_w2",
+    MODEL_TENSOR.TIME_MIX_KEY:              "blk.{bid}.time_mix_key",
+    MODEL_TENSOR.TIME_MIX_VALUE:            "blk.{bid}.time_mix_value",
+    MODEL_TENSOR.TIME_MIX_RECEPTANCE:       "blk.{bid}.time_mix_receptance",
+    MODEL_TENSOR.TIME_MIX_GATE:             "blk.{bid}.time_mix_gate",
+    MODEL_TENSOR.TIME_MIX_LN:               "blk.{bid}.time_mix_ln",
+    MODEL_TENSOR.TIME_MIX_OUTPUT:           "blk.{bid}.time_mix_output",
+    MODEL_TENSOR.CHANNEL_MIX_LERP_K:        "blk.{bid}.channel_mix_lerp_k",
+    MODEL_TENSOR.CHANNEL_MIX_LERP_R:        "blk.{bid}.channel_mix_lerp_r",
+    MODEL_TENSOR.CHANNEL_MIX_KEY:           "blk.{bid}.channel_mix_key",
+    MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE:    "blk.{bid}.channel_mix_receptance",
+    MODEL_TENSOR.CHANNEL_MIX_VALUE:         "blk.{bid}.channel_mix_value",
+    MODEL_TENSOR.ATTN_Q_A:                  "blk.{bid}.attn_q_a",
+    MODEL_TENSOR.ATTN_Q_B:                  "blk.{bid}.attn_q_b",
+    MODEL_TENSOR.ATTN_KV_A_MQA:             "blk.{bid}.attn_kv_a_mqa",
+    MODEL_TENSOR.ATTN_KV_B:                 "blk.{bid}.attn_kv_b",
+    MODEL_TENSOR.ATTN_Q_A_NORM:             "blk.{bid}.attn_q_a_norm",
+    MODEL_TENSOR.ATTN_KV_A_NORM:            "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_SUB_NORM:             "blk.{bid}.attn_sub_norm",
+    MODEL_TENSOR.FFN_SUB_NORM:              "blk.{bid}.ffn_sub_norm",
+    MODEL_TENSOR.DEC_ATTN_NORM:             "dec.blk.{bid}.attn_norm",
+    MODEL_TENSOR.DEC_ATTN_Q:                "dec.blk.{bid}.attn_q",
+    MODEL_TENSOR.DEC_ATTN_K:                "dec.blk.{bid}.attn_k",
+    MODEL_TENSOR.DEC_ATTN_V:                "dec.blk.{bid}.attn_v",
+    MODEL_TENSOR.DEC_ATTN_OUT:              "dec.blk.{bid}.attn_o",
+    MODEL_TENSOR.DEC_ATTN_REL_B:            "dec.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.DEC_CROSS_ATTN_NORM:       "dec.blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.DEC_CROSS_ATTN_Q:          "dec.blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.DEC_CROSS_ATTN_K:          "dec.blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.DEC_CROSS_ATTN_V:          "dec.blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.DEC_CROSS_ATTN_OUT:        "dec.blk.{bid}.cross_attn_o",
+    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B:      "dec.blk.{bid}.cross_attn_rel_b",
+    MODEL_TENSOR.DEC_FFN_NORM:              "dec.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.DEC_FFN_GATE:              "dec.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.DEC_FFN_DOWN:              "dec.blk.{bid}.ffn_down",
+    MODEL_TENSOR.DEC_FFN_UP:                "dec.blk.{bid}.ffn_up",
+    MODEL_TENSOR.DEC_OUTPUT_NORM:           "dec.output_norm",
+    MODEL_TENSOR.ENC_ATTN_NORM:             "enc.blk.{bid}.attn_norm",
+    MODEL_TENSOR.ENC_ATTN_Q:                "enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.ENC_ATTN_K:                "enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.ENC_ATTN_V:                "enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.ENC_ATTN_OUT:              "enc.blk.{bid}.attn_o",
+    MODEL_TENSOR.ENC_ATTN_REL_B:            "enc.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.ENC_FFN_NORM:              "enc.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ENC_FFN_GATE:              "enc.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.ENC_FFN_DOWN:              "enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.ENC_FFN_UP:                "enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.ENC_OUTPUT_NORM:           "enc.output_norm",
+    MODEL_TENSOR.CLS:                       "cls",
+    MODEL_TENSOR.CLS_OUT:                   "cls.output",
+    MODEL_TENSOR.CONV1D:                    "conv1d",
+    MODEL_TENSOR.CONVNEXT_DW:               "convnext.{bid}.dw",
+    MODEL_TENSOR.CONVNEXT_NORM:             "convnext.{bid}.norm",
+    MODEL_TENSOR.CONVNEXT_PW1:              "convnext.{bid}.pw1",
+    MODEL_TENSOR.CONVNEXT_PW2:              "convnext.{bid}.pw2",
+    MODEL_TENSOR.CONVNEXT_GAMMA:            "convnext.{bid}.gamma",
+    MODEL_TENSOR.POSNET_CONV1:              "posnet.{bid}.conv1",
+    MODEL_TENSOR.POSNET_CONV2:              "posnet.{bid}.conv2",
+    MODEL_TENSOR.POSNET_NORM:               "posnet.{bid}.norm",
+    MODEL_TENSOR.POSNET_NORM1:              "posnet.{bid}.norm1",
+    MODEL_TENSOR.POSNET_NORM2:              "posnet.{bid}.norm2",
+    MODEL_TENSOR.POSNET_ATTN_NORM:          "posnet.{bid}.attn_norm",
+    MODEL_TENSOR.POSNET_ATTN_Q:             "posnet.{bid}.attn_q",
+    MODEL_TENSOR.POSNET_ATTN_K:             "posnet.{bid}.attn_k",
+    MODEL_TENSOR.POSNET_ATTN_V:             "posnet.{bid}.attn_v",
+    MODEL_TENSOR.POSNET_ATTN_OUT:           "posnet.{bid}.attn_output",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -225,6 +616,48 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DECI: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.GROK: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.GPTNEOX: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -289,6 +722,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.CLS,
+        MODEL_TENSOR.CLS_OUT,
     ],
     MODEL_ARCH.NOMIC_BERT: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -304,6 +739,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.JINA_BERT_V2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+        MODEL_TENSOR.CLS,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -315,6 +768,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_ACT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.POS_EMBD,
     ],
     MODEL_ARCH.GPTJ: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -328,20 +784,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.PERSIMMON: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_ROT_EMBD,
-    ],
     MODEL_ARCH.REFACT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -382,6 +824,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
     ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -401,6 +845,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
@@ -411,6 +856,39 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN2VL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.QWEN2MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -452,6 +930,40 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.PHI3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.PHIMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.CODESHELL: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.POS_EMBD,
@@ -498,8 +1010,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     ],
     MODEL_ARCH.MINICPM: [
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
@@ -515,6 +1030,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.MINICPM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FACTORS_LONG,
+        MODEL_TENSOR.ROPE_FACTORS_SHORT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GEMMA: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -528,6 +1062,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_NORM,
     ],
+    MODEL_ARCH.GEMMA2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -543,6 +1092,451 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.RWKV6: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_LERP_X,
+        MODEL_TENSOR.TIME_MIX_LERP_K,
+        MODEL_TENSOR.TIME_MIX_LERP_V,
+        MODEL_TENSOR.TIME_MIX_LERP_R,
+        MODEL_TENSOR.TIME_MIX_LERP_G,
+        MODEL_TENSOR.TIME_MIX_LERP_W,
+        MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+        MODEL_TENSOR.TIME_MIX_FIRST,
+        MODEL_TENSOR.TIME_MIX_DECAY,
+        MODEL_TENSOR.TIME_MIX_DECAY_W1,
+        MODEL_TENSOR.TIME_MIX_DECAY_W2,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_GATE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_K,
+        MODEL_TENSOR.CHANNEL_MIX_LERP_R,
+        MODEL_TENSOR.CHANNEL_MIX_KEY,
+        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
+        MODEL_TENSOR.CHANNEL_MIX_VALUE,
+    ],
+    MODEL_ARCH.RWKV6QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.TIME_MIX_W1,
+        MODEL_TENSOR.TIME_MIX_W2,
+        MODEL_TENSOR.TIME_MIX_LERP_X,
+        MODEL_TENSOR.TIME_MIX_LERP_K,
+        MODEL_TENSOR.TIME_MIX_LERP_V,
+        MODEL_TENSOR.TIME_MIX_LERP_R,
+        MODEL_TENSOR.TIME_MIX_LERP_G,
+        MODEL_TENSOR.TIME_MIX_LERP_W,
+        MODEL_TENSOR.TIME_MIX_LERP_FUSED,
+        MODEL_TENSOR.TIME_MIX_FIRST,
+        MODEL_TENSOR.TIME_MIX_DECAY,
+        MODEL_TENSOR.TIME_MIX_DECAY_W1,
+        MODEL_TENSOR.TIME_MIX_DECAY_W2,
+        MODEL_TENSOR.TIME_MIX_KEY,
+        MODEL_TENSOR.TIME_MIX_VALUE,
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE,
+        MODEL_TENSOR.TIME_MIX_GATE,
+        MODEL_TENSOR.TIME_MIX_LN,
+        MODEL_TENSOR.TIME_MIX_OUTPUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.MAMBA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_X,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_OUT,
+    ],
+    MODEL_ARCH.XVERSE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.COMMAND_R: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+    ],
+    MODEL_ARCH.COHERE2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.DBRX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.OLMO: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.OLMO2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.OLMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+    ],
+    MODEL_ARCH.OPENELM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.ARCTIC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_NORM_EXP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.DEEPSEEK: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
+    MODEL_ARCH.DEEPSEEK2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
+    MODEL_ARCH.CHATGLM : [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BITNET: [
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_SUB_NORM,
+        MODEL_TENSOR.FFN_SUB_NORM,
+    ],
+    MODEL_ARCH.T5: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.DEC_ATTN_NORM,
+        MODEL_TENSOR.DEC_ATTN_Q,
+        MODEL_TENSOR.DEC_ATTN_K,
+        MODEL_TENSOR.DEC_ATTN_V,
+        MODEL_TENSOR.DEC_ATTN_OUT,
+        MODEL_TENSOR.DEC_ATTN_REL_B,
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q,
+        MODEL_TENSOR.DEC_CROSS_ATTN_K,
+        MODEL_TENSOR.DEC_CROSS_ATTN_V,
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
+        MODEL_TENSOR.DEC_FFN_NORM,
+        MODEL_TENSOR.DEC_FFN_GATE,
+        MODEL_TENSOR.DEC_FFN_DOWN,
+        MODEL_TENSOR.DEC_FFN_UP,
+        MODEL_TENSOR.DEC_OUTPUT_NORM,
+        MODEL_TENSOR.ENC_ATTN_NORM,
+        MODEL_TENSOR.ENC_ATTN_Q,
+        MODEL_TENSOR.ENC_ATTN_K,
+        MODEL_TENSOR.ENC_ATTN_V,
+        MODEL_TENSOR.ENC_ATTN_OUT,
+        MODEL_TENSOR.ENC_ATTN_REL_B,
+        MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
+        MODEL_TENSOR.ENC_FFN_DOWN,
+        MODEL_TENSOR.ENC_FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+    ],
+    MODEL_ARCH.T5ENCODER: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ENC_ATTN_NORM,
+        MODEL_TENSOR.ENC_ATTN_Q,
+        MODEL_TENSOR.ENC_ATTN_K,
+        MODEL_TENSOR.ENC_ATTN_V,
+        MODEL_TENSOR.ENC_ATTN_OUT,
+        MODEL_TENSOR.ENC_ATTN_REL_B,
+        MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
+        MODEL_TENSOR.ENC_FFN_DOWN,
+        MODEL_TENSOR.ENC_FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+    ],
+    MODEL_ARCH.JAIS: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.EXAONE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GRANITE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GRANITE_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.CHAMELEON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.WAVTOKENIZER_DEC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.CONV1D,
+        MODEL_TENSOR.CONVNEXT_DW,
+        MODEL_TENSOR.CONVNEXT_NORM,
+        MODEL_TENSOR.CONVNEXT_PW1,
+        MODEL_TENSOR.CONVNEXT_PW2,
+        MODEL_TENSOR.CONVNEXT_GAMMA,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.POSNET_CONV1,
+        MODEL_TENSOR.POSNET_CONV2,
+        MODEL_TENSOR.POSNET_NORM,
+        MODEL_TENSOR.POSNET_NORM1,
+        MODEL_TENSOR.POSNET_NORM2,
+        MODEL_TENSOR.POSNET_ATTN_NORM,
+        MODEL_TENSOR.POSNET_ATTN_Q,
+        MODEL_TENSOR.POSNET_ATTN_K,
+        MODEL_TENSOR.POSNET_ATTN_V,
+        MODEL_TENSOR.POSNET_ATTN_OUT,
+    ],
     # TODO
 }
 
@@ -552,12 +1546,13 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
-    MODEL_ARCH.BAICHUAN: [
+    MODEL_ARCH.DECI: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
-    MODEL_ARCH.PERSIMMON: [
+    MODEL_ARCH.BAICHUAN: [
         MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.ROPE_FREQS,
@@ -575,6 +1570,25 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.XVERSE: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.DEEPSEEK: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.DEEPSEEK2: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.CHATGLM: [
+        MODEL_TENSOR.ROPE_FREQS,
+    ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #
@@ -592,9 +1606,10 @@ class TokenType(IntEnum):
 
 
 class RopeScalingType(Enum):
-    NONE   = 'none'
-    LINEAR = 'linear'
-    YARN   = 'yarn'
+    NONE     = 'none'
+    LINEAR   = 'linear'
+    YARN     = 'yarn'
+    LONGROPE = 'longrope'
 
 
 class PoolingType(IntEnum):
@@ -604,20 +1619,90 @@ class PoolingType(IntEnum):
 
 
 class GGMLQuantizationType(IntEnum):
-    F32  = 0
-    F16  = 1
-    Q4_0 = 2
-    Q4_1 = 3
-    Q5_0 = 6
-    Q5_1 = 7
-    Q8_0 = 8
-    Q8_1 = 9
-    Q2_K = 10
-    Q3_K = 11
-    Q4_K = 12
-    Q5_K = 13
-    Q6_K = 14
-    Q8_K = 15
+    F32     = 0
+    F16     = 1
+    Q4_0    = 2
+    Q4_1    = 3
+    Q5_0    = 6
+    Q5_1    = 7
+    Q8_0    = 8
+    Q8_1    = 9
+    Q2_K    = 10
+    Q3_K    = 11
+    Q4_K    = 12
+    Q5_K    = 13
+    Q6_K    = 14
+    Q8_K    = 15
+    IQ2_XXS = 16
+    IQ2_XS  = 17
+    IQ3_XXS = 18
+    IQ1_S   = 19
+    IQ4_NL  = 20
+    IQ3_S   = 21
+    IQ2_S   = 22
+    IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
+    I64     = 27
+    F64     = 28
+    IQ1_M   = 29
+    BF16    = 30
+    TQ1_0   = 34
+    TQ2_0   = 35
+
+
+class ExpertGatingFuncType(IntEnum):
+    SOFTMAX  = 1
+    SIGMOID  = 2
+
+
+# TODO: add GGMLFileType from ggml_ftype in ggml.h
+
+
+# from llama_ftype in llama.h
+# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
+class LlamaFileType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1   # except 1d tensors
+    MOSTLY_Q4_0          = 2   # except 1d tensors
+    MOSTLY_Q4_1          = 3   # except 1d tensors
+    # MOSTLY_Q4_1_SOME_F16 = 4   # tok_embeddings.weight and output.weight are F16
+    # MOSTLY_Q4_2        = 5   # support has been removed
+    # MOSTLY_Q4_3        = 6   # support has been removed
+    MOSTLY_Q8_0          = 7   # except 1d tensors
+    MOSTLY_Q5_0          = 8   # except 1d tensors
+    MOSTLY_Q5_1          = 9   # except 1d tensors
+    MOSTLY_Q2_K          = 10  # except 1d tensors
+    MOSTLY_Q3_K_S        = 11  # except 1d tensors
+    MOSTLY_Q3_K_M        = 12  # except 1d tensors
+    MOSTLY_Q3_K_L        = 13  # except 1d tensors
+    MOSTLY_Q4_K_S        = 14  # except 1d tensors
+    MOSTLY_Q4_K_M        = 15  # except 1d tensors
+    MOSTLY_Q5_K_S        = 16  # except 1d tensors
+    MOSTLY_Q5_K_M        = 17  # except 1d tensors
+    MOSTLY_Q6_K          = 18  # except 1d tensors
+    MOSTLY_IQ2_XXS       = 19  # except 1d tensors
+    MOSTLY_IQ2_XS        = 20  # except 1d tensors
+    MOSTLY_Q2_K_S        = 21  # except 1d tensors
+    MOSTLY_IQ3_XS        = 22  # except 1d tensors
+    MOSTLY_IQ3_XXS       = 23  # except 1d tensors
+    MOSTLY_IQ1_S         = 24  # except 1d tensors
+    MOSTLY_IQ4_NL        = 25  # except 1d tensors
+    MOSTLY_IQ3_S         = 26  # except 1d tensors
+    MOSTLY_IQ3_M         = 27  # except 1d tensors
+    MOSTLY_IQ2_S         = 28  # except 1d tensors
+    MOSTLY_IQ2_M         = 29  # except 1d tensors
+    MOSTLY_IQ4_XS        = 30  # except 1d tensors
+    MOSTLY_IQ1_M         = 31  # except 1d tensors
+    MOSTLY_BF16          = 32  # except 1d tensors
+    # MOSTLY_Q4_0_4_4      = 33  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_4_8      = 34  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_8_8      = 35  # removed from gguf files, use Q4_0 and runtime repack
+    MOSTLY_TQ1_0         = 36  # except 1d tensors
+    MOSTLY_TQ2_0         = 37  # except 1d tensors
+
+    GUESSED              = 1024  # not specified in the model file
 
 
 class GGUFEndian(IntEnum):
@@ -654,28 +1739,43 @@ class GGUFValueType(IntEnum):
             return GGUFValueType.INT32
         # TODO: need help with 64-bit types in Python
         else:
-            print("Unknown type:", type(val))
-            sys.exit()
+            raise ValueError(f"Unknown type: {type(val)}")
 
 
-# Note: Does not support GGML_QKK_64
-QK_K = 256
 # Items here are (block size, type size)
-GGML_QUANT_SIZES = {
-    GGMLQuantizationType.F32:  (1, 4),
-    GGMLQuantizationType.F16:  (1, 2),
-    GGMLQuantizationType.Q4_0: (32, 2 + 16),
-    GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
-    GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
-    GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
-    GGMLQuantizationType.Q8_0: (32, 2 + 32),
-    GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
-    GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
-    GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
-    GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
-    GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
-    GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
-    GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
+QK_K = 256
+GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
+    GGMLQuantizationType.F32:     (1, 4),
+    GGMLQuantizationType.F16:     (1, 2),
+    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
+    GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
+    GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
+    GGMLQuantizationType.Q5_1:    (32, 2 + 2 + 4 + 16),
+    GGMLQuantizationType.Q8_0:    (32, 2 + 32),
+    GGMLQuantizationType.Q8_1:    (32, 4 + 4 + 32),
+    GGMLQuantizationType.Q2_K:    (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    GGMLQuantizationType.Q3_K:    (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    GGMLQuantizationType.Q4_K:    (256, 2 + 2 + QK_K // 2 + 12),
+    GGMLQuantizationType.Q5_K:    (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    GGMLQuantizationType.Q6_K:    (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.Q8_K:    (256, 4 + QK_K + QK_K // 8),
+    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
+    GGMLQuantizationType.IQ2_XS:  (256, 2 + QK_K // 4 + QK_K // 32),
+    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
+    GGMLQuantizationType.IQ1_S:   (256, 2 + QK_K // 8 + QK_K // 16),
+    GGMLQuantizationType.IQ4_NL:  (32, 2 + 16),
+    GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
+    GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
+    GGMLQuantizationType.I64:     (1, 8),
+    GGMLQuantizationType.F64:     (1, 8),
+    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16  + QK_K // 32),
+    GGMLQuantizationType.BF16:    (1, 2),
+    GGMLQuantizationType.TQ1_0:   (256, 2 + 4 * 13),
+    GGMLQuantizationType.TQ2_0:   (256, 2 + 64),
 }
 
 
@@ -691,10 +1791,10 @@ KEY_GENERAL_URL                  = Keys.General.URL
 KEY_GENERAL_DESCRIPTION          = Keys.General.DESCRIPTION
 KEY_GENERAL_LICENSE              = Keys.General.LICENSE
 KEY_GENERAL_SOURCE_URL           = Keys.General.SOURCE_URL
-KEY_GENERAL_SOURCE_HF_REPO       = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE            = Keys.General.FILE_TYPE
 
 # LLM
+KEY_VOCAB_SIZE            = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH        = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH      = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT           = Keys.LLM.BLOCK_COUNT
@@ -718,18 +1818,39 @@ KEY_ROPE_SCALING_FACTOR       = Keys.Rope.SCALING_FACTOR
 KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
 KEY_ROPE_SCALING_FINETUNED    = Keys.Rope.SCALING_FINETUNED
 
+# SSM
+KEY_SSM_CONV_KERNEL    = Keys.SSM.CONV_KERNEL
+KEY_SSM_INNER_SIZE     = Keys.SSM.INNER_SIZE
+KEY_SSM_STATE_SIZE     = Keys.SSM.STATE_SIZE
+KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
+KEY_SSM_DT_B_C_RMS     = Keys.SSM.DT_B_C_RMS
+
 # tokenization
 KEY_TOKENIZER_MODEL      = Keys.Tokenizer.MODEL
+KEY_TOKENIZER_PRE        = Keys.Tokenizer.PRE
 KEY_TOKENIZER_LIST       = Keys.Tokenizer.LIST
 KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
 KEY_TOKENIZER_SCORES     = Keys.Tokenizer.SCORES
 KEY_TOKENIZER_MERGES     = Keys.Tokenizer.MERGES
 KEY_TOKENIZER_BOS_ID     = Keys.Tokenizer.BOS_ID
 KEY_TOKENIZER_EOS_ID     = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID     = Keys.Tokenizer.EOM_ID
 KEY_TOKENIZER_UNK_ID     = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID     = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID     = Keys.Tokenizer.PAD_ID
-KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
+# deprecated
+KEY_TOKENIZER_PREFIX_ID  = Keys.Tokenizer.PREFIX_ID
+KEY_TOKENIZER_SUFFIX_ID  = Keys.Tokenizer.SUFFIX_ID
+KEY_TOKENIZER_MIDDLE_ID  = Keys.Tokenizer.MIDDLE_ID
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 5b6d4ba6b..e17a4e831 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -4,6 +4,7 @@
 #
 from __future__ import annotations
 
+import logging
 import os
 from collections import OrderedDict
 from typing import Any, Literal, NamedTuple, TypeVar, Union
@@ -11,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
 import numpy as np
 import numpy.typing as npt
 
+from .quants import quant_shape_to_byte_shape
+
 if __name__ == "__main__":
     import sys
     from pathlib import Path
@@ -27,6 +30,7 @@ from gguf.constants import (
     GGUFValueType,
 )
 
+logger = logging.getLogger(__name__)
 
 READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
 
@@ -63,8 +67,9 @@ class ReaderTensor(NamedTuple):
 
 class GGUFReader:
     # I - same as host, S - swapped
-    byte_order: Literal['I' | 'S'] = 'I'
+    byte_order: Literal['I', 'S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
+    data_offset: int
 
     # Note: Internal helper, API may change.
     gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@@ -81,12 +86,16 @@ class GGUFReader:
         GGUFValueType.BOOL:    np.bool_,
     }
 
-    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
+    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
+
+        # Check for GGUF magic
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
+
+        # Check GGUF version
         temp_version = self._get(offs, np.uint32)
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for
@@ -99,12 +108,16 @@ class GGUFReader:
         self.fields: OrderedDict[str, ReaderField] = OrderedDict()
         self.tensors: list[ReaderTensor] = []
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
+
+        # Check tensor count and kv count
         temp_counts = self._get(offs, np.uint64, 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
         offs = self._build_fields(offs, kv_count)
-        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
+
+        # Build Tensor Info Fields
+        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
         new_align = self.fields.get('general.alignment')
         if new_align is not None:
             if new_align.types != [GGUFValueType.UINT32]:
@@ -113,6 +126,7 @@ class GGUFReader:
         padding = offs % self.alignment
         if padding != 0:
             offs += self.alignment - padding
+        self.data_offset = offs
         self._build_tensors(offs, tensors_fields)
 
     _DT = TypeVar('_DT', bound = npt.DTypeLike)
@@ -126,21 +140,25 @@ class GGUFReader:
         return self.tensors[idx]
 
     def _get(
-        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
+        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
     ) -> npt.NDArray[Any]:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
         end_offs = offset + itemsize * count
-        return (
-            self.data[offset:end_offs]
-            .view(dtype = dtype)[:count]
-            .newbyteorder(override_order or self.byte_order)
-        )
+        arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+        if override_order is None:
+            return arr
+        return arr.view(arr.dtype.newbyteorder(override_order))
 
     def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         if field.name in self.fields:
-            raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
-        self.fields[field.name] = field
+            # TODO: add option to generate error on duplicate keys
+            # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
+
+            logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
+            self.fields[field.name + '_{}'.format(field.offset)] = field
+        else:
+            self.fields[field.name] = field
         return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)
 
     def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
@@ -184,18 +202,29 @@ class GGUFReader:
         # We can't deal with this one.
         raise ValueError('Unknown/unhandled field type {gtype}')
 
-    def _get_tensor(self, orig_offs: int) -> ReaderField:
+    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs = orig_offs
+
+        # Get Tensor Name
         name_len, name_data = self._get_str(offs)
         offs += int(name_len.nbytes + name_data.nbytes)
+
+        # Get Tensor Dimensions Count
         n_dims = self._get(offs, np.uint32)
         offs += int(n_dims.nbytes)
+
+        # Get Tensor Dimension Array
         dims = self._get(offs, np.uint64, n_dims[0])
         offs += int(dims.nbytes)
+
+        # Get Tensor Encoding Scheme Type
         raw_dtype = self._get(offs, np.uint32)
         offs += int(raw_dtype.nbytes)
+
+        # Get Tensor Offset
         offset_tensor = self._get(offs, np.uint64)
         offs += int(offset_tensor.nbytes)
+
         return ReaderField(
             orig_offs,
             str(bytes(name_data), encoding = 'utf-8'),
@@ -224,41 +253,64 @@ class GGUFReader:
             offs += field_size
         return offs
 
-    def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
+    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
         tensor_fields = []
         for _ in range(count):
-            field = self._get_tensor(offs)
+            field = self._get_tensor_info_field(offs)
             offs += sum(int(part.nbytes) for part in field.parts)
             tensor_fields.append(field)
         return offs, tensor_fields
 
     def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
         tensors = []
+        tensor_names = set() # keep track of name to prevent duplicated tensors
         for field in fields:
             _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
+            # check if there's any tensor having same name already in the list
+            tensor_name = str(bytes(name_data), encoding = 'utf-8')
+            if tensor_name in tensor_names:
+                raise ValueError(f'Found duplicated tensor with name {tensor_name}')
+            tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
-            n_elems = np.prod(dims)
+            n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
             item_type: npt.DTypeLike
-            if ggml_type == GGMLQuantizationType.F32:
-                item_count = n_elems
-                item_type = np.float32
-            elif ggml_type == GGMLQuantizationType.F16:
+            if ggml_type == GGMLQuantizationType.F16:
                 item_count = n_elems
                 item_type = np.float16
+            elif ggml_type == GGMLQuantizationType.F32:
+                item_count = n_elems
+                item_type = np.float32
+            elif ggml_type == GGMLQuantizationType.F64:
+                item_count = n_elems
+                item_type = np.float64
+            elif ggml_type == GGMLQuantizationType.I8:
+                item_count = n_elems
+                item_type = np.int8
+            elif ggml_type == GGMLQuantizationType.I16:
+                item_count = n_elems
+                item_type = np.int16
+            elif ggml_type == GGMLQuantizationType.I32:
+                item_count = n_elems
+                item_type = np.int32
+            elif ggml_type == GGMLQuantizationType.I64:
+                item_count = n_elems
+                item_type = np.int64
             else:
                 item_count = n_bytes
                 item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
-                name = str(bytes(name_data), encoding = 'utf-8'),
+                name = tensor_name,
                 tensor_type = ggml_type,
                 shape = dims,
                 n_elements = n_elems,
                 n_bytes = n_bytes,
                 data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
         self.tensors = tensors
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index e4681475c..080d2b9dc 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
+import logging
 import os
 import shutil
 import struct
 import tempfile
+from dataclasses import dataclass
 from enum import Enum, auto
+from math import prod
+from pathlib import Path
 from io import BufferedWriter
-from typing import IO, Any, Sequence
+from typing import IO, Any, Sequence, Mapping
+from string import ascii_letters, digits
 
 import numpy as np
 
@@ -21,20 +26,47 @@ from .constants import (
     RopeScalingType,
     PoolingType,
     TokenType,
+    ExpertGatingFuncType,
 )
 
+from .quants import quant_shape_from_byte_shape
+
+logger = logging.getLogger(__name__)
+
+
+SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
+
+
+@dataclass
+class TensorInfo:
+    shape: Sequence[int]
+    dtype: GGMLQuantizationType
+    nbytes: int
+    tensor: np.ndarray[Any, Any] | None = None
+
+
+@dataclass
+class GGUFValue:
+    value: Any
+    type: GGUFValueType
+
 
 class WriterState(Enum):
+    NO_FILE = auto()
     EMPTY   = auto()
     HEADER  = auto()
     KV_DATA = auto()
     TI_DATA = auto()
+    WEIGHTS = auto()
 
 
 class GGUFWriter:
-    fout: BufferedWriter
+    fout: list[BufferedWriter] | None
+    path: Path | None
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: list[np.ndarray[Any, Any]]
+    tensors: list[dict[str, TensorInfo]]
+    kv_data: list[dict[str, GGUFValue]]
+    state: WriterState
     _simple_value_packing = {
         GGUFValueType.UINT8:   "B",
         GGUFValueType.INT8:    "b",
@@ -50,170 +82,289 @@ class GGUFWriter:
     }
 
     def __init__(
-        self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True,
-        endianess: GGUFEndian = GGUFEndian.LITTLE,
+        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
+        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
     ):
-        self.fout = open(path, "wb")
+        self.fout = None
+        self.path = Path(path) if path else None
         self.arch = arch
         self.endianess = endianess
-        self.offset_tensor = 0
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.kv_data = bytearray()
-        self.kv_data_count = 0
-        self.ti_data = bytearray()
-        self.ti_data_count = 0
         self.use_temp_file = use_temp_file
         self.temp_file = None
-        self.tensors = []
-        print("gguf: This GGUF file is for {0} Endian only".format(
+        self.tensors = [{}]
+        self.kv_data = [{}]
+        self.split_max_tensors = split_max_tensors
+        self.split_max_size = split_max_size
+        self.dry_run = dry_run
+        self.small_first_shard = small_first_shard
+        logger.info("gguf: This GGUF file is for {0} Endian only".format(
             "Big" if self.endianess == GGUFEndian.BIG else "Little",
         ))
-        self.state = WriterState.EMPTY
+        self.state = WriterState.NO_FILE
+
+        if self.small_first_shard:
+            self.tensors.append({})
 
         self.add_architecture()
 
-    def write_header_to_file(self) -> None:
+    def get_total_parameter_count(self) -> tuple[int, int, int, int]:
+        total_params = 0
+        shared_params = 0
+        expert_params = 0
+
+        expert_sum = 0
+        n_expert_tensors = 0
+
+        last_lora_a: tuple[str, TensorInfo] | None = None
+
+        for tensors in self.tensors:
+            for name, info in tensors.items():
+
+                shape = info.shape
+
+                if name.endswith(".lora_a"):
+                    last_lora_a = (name, info)
+                    continue
+                elif name.endswith(".lora_b"):
+                    if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
+                        # Bail when the LoRA pair can't be found trivially
+                        logger.warning("can't measure LoRA size correctly, tensor order is unusual")
+                        return 0, 0, 0, 0
+                    else:
+                        shape = (*shape[:-1], last_lora_a[1].shape[-1])
+
+                size = prod(shape)
+
+                if "_exps." in name:
+                    expert_params += (size // shape[-3])
+                    expert_sum += shape[-3]
+                    n_expert_tensors += 1
+                else:
+                    shared_params += size
+
+                total_params += size
+
+        # Hopefully this should work even for variable-expert-count models
+        expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0
+
+        # Negate the total to signal it's likely not exact
+        if last_lora_a is not None:
+            total_params = -total_params
+
+        # NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
+        return total_params, shared_params, expert_params, expert_count
+
+    def format_shard_names(self, path: Path) -> list[Path]:
+        if len(self.tensors) == 1:
+            return [path]
+        return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
+
+    def open_output_file(self, path: Path | None = None) -> None:
+        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
+            # allow calling this multiple times as long as the path is the same
+            return
+
+        if self.state is not WriterState.NO_FILE:
+            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
+
+        if path is not None:
+            self.path = path
+
+        if self.path is not None:
+            filenames = self.print_plan()
+            self.fout = [open(filename, "wb") for filename in filenames]
+            self.state = WriterState.EMPTY
+
+    def print_plan(self) -> list[Path]:
+        logger.info("Writing the following files:")
+        assert self.path is not None
+        filenames = self.format_shard_names(self.path)
+        assert len(filenames) == len(self.tensors)
+        for name, tensors in zip(filenames, self.tensors):
+            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
+
+        if self.dry_run:
+            logger.info("Dry run, not writing files")
+            for name in filenames:
+                print(name)  # noqa: NP100
+            exit()
+
+        return filenames
+
+    def add_shard_kv_data(self) -> None:
+        if len(self.tensors) == 1:
+            return
+
+        total_tensors = sum(len(t) for t in self.tensors)
+        assert self.fout is not None
+        total_splits = len(self.fout)
+        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
+        for i, kv_data in enumerate(self.kv_data):
+            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
+
+    def write_header_to_file(self, path: Path | None = None) -> None:
+        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
+            logger.warning("Model fails split requirements, not splitting")
+
+        self.open_output_file(path)
+
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
-        self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
-        self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", self.ti_data_count)
-        self._write_packed("Q", self.kv_data_count)
-        self.flush()
+        assert self.fout is not None
+        assert len(self.fout) == len(self.tensors)
+        assert len(self.kv_data) == 1
+
+        self.add_shard_kv_data()
+
+        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
+            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
+            fout.write(self._pack("I", GGUF_VERSION))
+            fout.write(self._pack("Q", len(tensors)))
+            fout.write(self._pack("Q", len(kv_data)))
+            fout.flush()
         self.state = WriterState.HEADER
 
     def write_kv_data_to_file(self) -> None:
         if self.state is not WriterState.HEADER:
             raise ValueError(f'Expected output file to contain the header, got {self.state}')
+        assert self.fout is not None
+
+        for fout, kv_data in zip(self.fout, self.kv_data):
+            kv_bytes = bytearray()
+
+            for key, val in kv_data.items():
+                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
+
+            fout.write(kv_bytes)
 
-        self.fout.write(self.kv_data)
         self.flush()
         self.state = WriterState.KV_DATA
 
     def write_ti_data_to_file(self) -> None:
         if self.state is not WriterState.KV_DATA:
             raise ValueError(f'Expected output file to contain KV data, got {self.state}')
+        assert self.fout is not None
 
-        self.fout.write(self.ti_data)
-        self.flush()
+        for fout, tensors in zip(self.fout, self.tensors):
+            ti_data = bytearray()
+            offset_tensor = 0
+
+            for name, ti in tensors.items():
+                ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
+                n_dims = len(ti.shape)
+                ti_data += self._pack("I", n_dims)
+                for j in range(n_dims):
+                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
+                ti_data += self._pack("I", ti.dtype)
+                ti_data += self._pack("Q", offset_tensor)
+                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+
+            fout.write(ti_data)
+            fout.flush()
         self.state = WriterState.TI_DATA
 
-    def add_key(self, key: str) -> None:
-        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
+        if any(key in kv_data for kv_data in self.kv_data):
+            raise ValueError(f'Duplicated key name {key!r}')
+
+        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
 
     def add_uint8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT8)
+        self.add_key_value(key,val, GGUFValueType.UINT8)
 
     def add_int8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT8)
+        self.add_key_value(key, val, GGUFValueType.INT8)
 
     def add_uint16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT16)
+        self.add_key_value(key, val, GGUFValueType.UINT16)
 
     def add_int16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT16)
+        self.add_key_value(key, val, GGUFValueType.INT16)
 
     def add_uint32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT32)
+        self.add_key_value(key, val, GGUFValueType.UINT32)
 
     def add_int32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT32)
+        self.add_key_value(key, val, GGUFValueType.INT32)
 
     def add_float32(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT32)
+        self.add_key_value(key, val, GGUFValueType.FLOAT32)
 
     def add_uint64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT64)
+        self.add_key_value(key, val, GGUFValueType.UINT64)
 
     def add_int64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT64)
+        self.add_key_value(key, val, GGUFValueType.INT64)
 
     def add_float64(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT64)
+        self.add_key_value(key, val, GGUFValueType.FLOAT64)
 
     def add_bool(self, key: str, val: bool) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.BOOL)
+        self.add_key_value(key, val, GGUFValueType.BOOL)
 
     def add_string(self, key: str, val: str) -> None:
         if not val:
             return
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.STRING)
+        self.add_key_value(key, val, GGUFValueType.STRING)
 
     def add_array(self, key: str, val: Sequence[Any]) -> None:
-        if not isinstance(val, Sequence):
-            raise ValueError("Value must be a sequence for array type")
-
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.ARRAY)
-
-    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None:
-        if vtype is None:
-            vtype = GGUFValueType.get_type(val)
-
-        if add_vtype:
-            self.kv_data += self._pack("I", vtype)
-            self.kv_data_count += 1
-
-        pack_fmt = self._simple_value_packing.get(vtype)
-        if pack_fmt is not None:
-            self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
-        elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += self._pack("Q", len(encoded_val))
-            self.kv_data += encoded_val
-        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-            ltype = GGUFValueType.get_type(val[0])
-            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
-                raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += self._pack("I", ltype)
-            self.kv_data += self._pack("Q", len(val))
-            for item in val:
-                self.add_val(item, add_vtype=False)
-        else:
-            raise ValueError("Invalid GGUF metadata value type or value")
+        if len(val) == 0:
+            return
+        self.add_key_value(key, val, GGUFValueType.ARRAY)
 
     @staticmethod
     def ggml_pad(x: int, n: int) -> int:
         return ((x + n - 1) // n) * n
 
     def add_tensor_info(
-        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
+        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
         tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
-        if self.state is not WriterState.EMPTY:
-            raise ValueError(f'Expected output file to be empty, got {self.state}')
+        if self.state is not WriterState.NO_FILE:
+            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
 
-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
+        if any(name in tensors for tensors in self.tensors):
+            raise ValueError(f'Duplicated tensor name {name!r}')
 
-        encoded_name = name.encode("utf8")
-        self.ti_data += self._pack("Q", len(encoded_name))
-        self.ti_data += encoded_name
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float64:
+                dtype = GGMLQuantizationType.F64
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            elif tensor_dtype == np.int64:
+                dtype = GGMLQuantizationType.I64
+            else:
+                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
-        self.ti_data += self._pack("I", dtype)
-        self.ti_data += self._pack("Q", self.offset_tensor)
-        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
-        self.ti_data_count += 1
+            if tensor_dtype == np.uint8:
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
+
+        # make sure there is at least one tensor before splitting
+        if len(self.tensors[-1]) > 0:
+            if (  # split when over tensor limit
+                self.split_max_tensors != 0
+                and len(self.tensors[-1]) >= self.split_max_tensors
+            ) or (   # split when over size limit
+                self.split_max_size != 0
+                and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
+            ):
+                self.tensors.append({})
+
+        self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
 
     def add_tensor(
         self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@@ -227,10 +378,10 @@ class GGUFWriter:
             self.temp_file = fp
 
         shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
-        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
 
         if self.temp_file is None:
-            self.tensors.append(tensor)
+            self.tensors[-1][name].tensor = tensor
             return
 
         tensor.tofile(self.temp_file)
@@ -242,62 +393,103 @@ class GGUFWriter:
             fp.write(bytes([0] * pad))
 
     def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
-        if self.state is not WriterState.TI_DATA:
-            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
+        if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
+            raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}')
+        assert self.fout is not None
 
         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
-        self.write_padding(self.fout, self.fout.tell())
-        tensor.tofile(self.fout)
-        self.write_padding(self.fout, tensor.nbytes)
 
-    def write_tensors_to_file(self) -> None:
+        file_id = -1
+        for i, tensors in enumerate(self.tensors):
+            if len(tensors) > 0:
+                file_id = i
+                break
+
+        fout = self.fout[file_id]
+
+        # pop the first tensor info
+        # TODO: cleaner way to get the first key
+        first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
+        ti = self.tensors[file_id].pop(first_tensor_name)
+        assert ti.nbytes == tensor.nbytes
+
+        self.write_padding(fout, fout.tell())
+        tensor.tofile(fout)
+        self.write_padding(fout, tensor.nbytes)
+
+        self.state = WriterState.WEIGHTS
+
+    def write_tensors_to_file(self, *, progress: bool = False) -> None:
         self.write_ti_data_to_file()
 
-        self.write_padding(self.fout, self.fout.tell())
+        assert self.fout is not None
+
+        for fout in self.fout:
+            self.write_padding(fout, fout.tell())
 
         if self.temp_file is None:
-            while True:
-                try:
-                    tensor = self.tensors.pop(0)
-                except IndexError:
-                    break
-                tensor.tofile(self.fout)
-                self.write_padding(self.fout, tensor.nbytes)
-            return
+            shard_bar = None
+            bar = None
 
-        self.temp_file.seek(0)
+            if progress:
+                from tqdm import tqdm
 
-        shutil.copyfileobj(self.temp_file, self.fout)
-        self.flush()
-        self.temp_file.close()
+                total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())
+
+                if len(self.fout) > 1:
+                    shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
+                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
+
+            for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
+                if shard_bar is not None:
+                    shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
+                    total = sum(ti.nbytes for ti in tensors.values())
+                    shard_bar.reset(total=(total if total > 0 else None))
+
+                # relying on the fact that Python dicts preserve insertion order (since 3.7)
+                for ti in tensors.values():
+                    assert ti.tensor is not None  # can only iterate once over the tensors
+                    assert ti.tensor.nbytes == ti.nbytes
+                    ti.tensor.tofile(fout)
+                    if shard_bar is not None:
+                        shard_bar.update(ti.nbytes)
+                    if bar is not None:
+                        bar.update(ti.nbytes)
+                    self.write_padding(fout, ti.nbytes)
+                    ti.tensor = None
+        else:
+            self.temp_file.seek(0)
+
+            shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
+            self.flush()
+            self.temp_file.close()
+
+        self.state = WriterState.WEIGHTS
 
     def flush(self) -> None:
-        self.fout.flush()
+        assert self.fout is not None
+        for fout in self.fout:
+            fout.flush()
 
     def close(self) -> None:
-        self.fout.close()
+        if self.fout is not None:
+            for fout in self.fout:
+                fout.close()
+            self.fout = None
+
+    def add_type(self, type_name: str) -> None:
+        self.add_string(Keys.General.TYPE, type_name)
 
     def add_architecture(self) -> None:
         self.add_string(Keys.General.ARCHITECTURE, self.arch)
 
-    def add_author(self, author: str) -> None:
-        self.add_string(Keys.General.AUTHOR, author)
+    def add_quantization_version(self, quantization_version: int) -> None:
+        self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)
 
-    def add_tensor_data_layout(self, layout: str) -> None:
-        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
-
-    def add_url(self, url: str) -> None:
-        self.add_string(Keys.General.URL, url)
-
-    def add_description(self, description: str) -> None:
-        self.add_string(Keys.General.DESCRIPTION, description)
-
-    def add_source_url(self, url: str) -> None:
-        self.add_string(Keys.General.SOURCE_URL, url)
-
-    def add_source_hf_repo(self, repo: str) -> None:
-        self.add_string(Keys.General.SOURCE_HF_REPO, repo)
+    def add_custom_alignment(self, alignment: int) -> None:
+        self.data_alignment = alignment
+        self.add_uint32(Keys.General.ALIGNMENT, alignment)
 
     def add_file_type(self, ftype: int) -> None:
         self.add_uint32(Keys.General.FILE_TYPE, ftype)
@@ -305,13 +497,134 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
-        self.add_uint32(
-            Keys.General.QUANTIZATION_VERSION, quantization_version)
+    def add_author(self, author: str) -> None:
+        self.add_string(Keys.General.AUTHOR, author)
 
-    def add_custom_alignment(self, alignment: int) -> None:
-        self.data_alignment = alignment
-        self.add_uint32(Keys.General.ALIGNMENT, alignment)
+    def add_version(self, version: str) -> None:
+        self.add_string(Keys.General.VERSION, version)
+
+    def add_organization(self, organization: str) -> None:
+        self.add_string(Keys.General.ORGANIZATION, organization)
+
+    def add_finetune(self, finetune: str) -> None:
+        self.add_string(Keys.General.FINETUNE, finetune)
+
+    def add_basename(self, basename: str) -> None:
+        self.add_string(Keys.General.BASENAME, basename)
+
+    def add_description(self, description: str) -> None:
+        self.add_string(Keys.General.DESCRIPTION, description)
+
+    def add_quantized_by(self, quantized: str) -> None:
+        self.add_string(Keys.General.QUANTIZED_BY, quantized)
+
+    def add_size_label(self, size_label: str) -> None:
+        self.add_string(Keys.General.SIZE_LABEL, size_label)
+
+    def add_license(self, license: str) -> None:
+        self.add_string(Keys.General.LICENSE, license)
+
+    def add_license_name(self, license: str) -> None:
+        self.add_string(Keys.General.LICENSE_NAME, license)
+
+    def add_license_link(self, license: str) -> None:
+        self.add_string(Keys.General.LICENSE_LINK, license)
+
+    def add_url(self, url: str) -> None:
+        self.add_string(Keys.General.URL, url)
+
+    def add_doi(self, doi: str) -> None:
+        self.add_string(Keys.General.DOI, doi)
+
+    def add_uuid(self, uuid: str) -> None:
+        self.add_string(Keys.General.UUID, uuid)
+
+    def add_repo_url(self, repo_url: str) -> None:
+        self.add_string(Keys.General.REPO_URL, repo_url)
+
+    def add_source_url(self, url: str) -> None:
+        self.add_string(Keys.General.SOURCE_URL, url)
+
+    def add_source_doi(self, doi: str) -> None:
+        self.add_string(Keys.General.SOURCE_DOI, doi)
+
+    def add_source_uuid(self, uuid: str) -> None:
+        self.add_string(Keys.General.SOURCE_UUID, uuid)
+
+    def add_source_repo_url(self, repo_url: str) -> None:
+        self.add_string(Keys.General.SOURCE_REPO_URL, repo_url)
+
+    def add_base_model_count(self, source_count: int) -> None:
+        self.add_uint32(Keys.General.BASE_MODEL_COUNT, source_count)
+
+    def add_base_model_name(self, source_id: int, name: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_NAME.format(id=source_id), name)
+
+    def add_base_model_author(self, source_id: int, author: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_AUTHOR.format(id=source_id), author)
+
+    def add_base_model_version(self, source_id: int, version: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version)
+
+    def add_base_model_organization(self, source_id: int, organization: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)
+
+    def add_base_model_description(self, source_id: int, description: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description)
+
+    def add_base_model_url(self, source_id: int, url: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
+
+    def add_base_model_doi(self, source_id: int, doi: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_DOI.format(id=source_id), doi)
+
+    def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_UUID.format(id=source_id), uuid)
+
+    def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
+        self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
+
+    def add_dataset_count(self, source_count: int) -> None:
+        self.add_uint32(Keys.General.DATASET_COUNT, source_count)
+
+    def add_dataset_name(self, source_id: int, name: str) -> None:
+        self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name)
+
+    def add_dataset_author(self, source_id: int, author: str) -> None:
+        self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author)
+
+    def add_dataset_version(self, source_id: int, version: str) -> None:
+        self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version)
+
+    def add_dataset_organization(self, source_id: int, organization: str) -> None:
+        self.add_string(Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization)
+
+    def add_dataset_description(self, source_id: int, description: str) -> None:
+        self.add_string(Keys.General.DATASET_DESCRIPTION.format(id=source_id), description)
+
+    def add_dataset_url(self, source_id: int, url: str) -> None:
+        self.add_string(Keys.General.DATASET_URL.format(id=source_id), url)
+
+    def add_dataset_doi(self, source_id: int, doi: str) -> None:
+        self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi)
+
+    def add_dataset_uuid(self, source_id: int, uuid: str) -> None:
+        self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid)
+
+    def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None:
+        self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url)
+
+    def add_tags(self, tags: Sequence[str]) -> None:
+        self.add_array(Keys.General.TAGS, tags)
+
+    def add_languages(self, languages: Sequence[str]) -> None:
+        self.add_array(Keys.General.LANGUAGES, languages)
+
+    def add_tensor_data_layout(self, layout: str) -> None:
+        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
 
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
@@ -319,20 +632,56 @@ class GGUFWriter:
     def add_embedding_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
 
+    def add_features_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
+
+    def add_posnet_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_posnet_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_convnext_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_convnext_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
+
     def add_block_count(self, length: int) -> None:
         self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
 
-    def add_feed_forward_length(self, length: int) -> None:
-        self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+    def add_leading_dense_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
+        if isinstance(length, int):
+            self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+        else:
+            self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_expert_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_expert_shared_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
 
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
 
-    def add_head_count(self, count: int) -> None:
-        self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
+    def add_decoder_start_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
 
-    def add_head_count_kv(self, count: int) -> None:
-        self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
+    def add_head_count(self, count: int | Sequence[int]) -> None:
+        if isinstance(count, int):
+            self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
+        else:
+            self.add_array(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int | Sequence[int]) -> None:
+        if isinstance(count, int):
+            self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
+        else:
+            self.add_array(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
 
     def add_key_length(self, length: int) -> None:
         self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)
@@ -346,27 +695,96 @@ class GGUFWriter:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
 
+    def add_logit_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
+
+    def add_attn_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
+    def add_final_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
     def add_expert_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
 
     def add_expert_used_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
 
+    def add_expert_shared_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)
+
+    def add_expert_weights_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
+
+    def add_expert_weights_norm(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
+
+    def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
+
+    def add_swin_norm(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
+
+    def add_rescale_every_n_layers(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
+
+    def add_time_mix_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_time_decay_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_residual_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
+
+    def add_embedding_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
+
+    def add_wkv_head_size(self, size: int) -> None:
+        self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
+
+    def add_token_shift_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
 
     def add_layer_norm_rms_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
 
+    def add_group_norm_eps(self, value: float) -> None:
+        self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
+
+    def add_group_norm_groups(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
+
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
 
+    def add_q_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)
+
+    def add_kv_lora_rank(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
+
+    def add_relative_attn_buckets_count(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
+
+    def add_sliding_window(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
+
+    def add_attention_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
-        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value)
+        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
 
+    def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
+        self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
+
     def add_rope_freq_base(self, value: float) -> None:
         self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
 
@@ -376,15 +794,39 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
+    def add_rope_scaling_attn_factors(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
+
     def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
         self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
 
     def add_rope_scaling_finetuned(self, value: bool) -> None:
         self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
 
+    def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
+
+    def add_ssm_conv_kernel(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
+
+    def add_ssm_inner_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value)
+
+    def add_ssm_state_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value)
+
+    def add_ssm_time_step_rank(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
+
+    def add_ssm_dt_b_c_rms(self, value: bool) -> None:
+        self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
+
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
 
+    def add_tokenizer_pre(self, pre: str) -> None:
+        self.add_string(Keys.Tokenizer.PRE, pre)
+
     def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
         self.add_array(Keys.Tokenizer.LIST, tokens)
 
@@ -415,9 +857,6 @@ class GGUFWriter:
     def add_pad_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.PAD_ID, id)
 
-    def add_cls_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.CLS_ID, id)
-
     def add_mask_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.MASK_ID, id)
 
@@ -430,14 +869,96 @@ class GGUFWriter:
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
-    def add_chat_template(self, value: str) -> None:
+    def add_remove_extra_whitespaces(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
+
+    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
+        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
+    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
+        if not isinstance(value, str):
+            template_default = None
+            template_names = set()
+
+            for choice in value:
+                name = choice.get('name', '')
+                template = choice.get('template')
+
+                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
+                name = ''.join((c if c in ascii_letters + digits else '_' for c in name))
+
+                if name and template is not None:
+                    if name == 'default':
+                        template_default = template
+                    else:
+                        template_names.add(name)
+                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)
+
+            if template_names:
+                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))
+
+            if template_default is None:
+                return
+
+            value = template_default
+
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
+    def add_eot_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOT_ID, id)
+
+    def add_eom_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOM_ID, id)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
             pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
         return struct.pack(f'{pack_prefix}{fmt}', value)
 
-    def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
-        self.fout.write(self._pack(fmt, value, skip_pack_prefix))
+    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
+        kv_data = bytearray()
+
+        if add_vtype:
+            kv_data += self._pack("I", vtype)
+
+        pack_fmt = self._simple_value_packing.get(vtype)
+        if pack_fmt is not None:
+            kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
+            kv_data += self._pack("Q", len(encoded_val))
+            kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY:
+
+            if not isinstance(val, Sequence):
+                raise ValueError("Invalid GGUF metadata array, expecting sequence")
+
+            if len(val) == 0:
+                raise ValueError("Invalid GGUF metadata array. Empty array")
+
+            if isinstance(val, bytes):
+                ltype = GGUFValueType.UINT8
+            else:
+                ltype = GGUFValueType.get_type(val[0])
+                if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                    raise ValueError("All items in a GGUF array should be of the same type")
+            kv_data += self._pack("I", ltype)
+            kv_data += self._pack("Q", len(val))
+            for item in val:
+                kv_data += self._pack_val(item, ltype, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type or value")
+
+        return kv_data
+
+    @staticmethod
+    def format_n_bytes_to_str(num: int) -> str:
+        if num == 0:
+            return "negligible - metadata only"
+        fnum = float(num)
+        for unit in ("", "K", "M", "G"):
+            if abs(fnum) < 1000.0:
+                return f"{fnum:3.1f}{unit}"
+            fnum /= 1000.0
+        return f"{fnum:.1f}T - over 1TB, split recommended"
diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py
new file mode 100644
index 000000000..8d4fece2d
--- /dev/null
+++ b/gguf-py/gguf/lazy.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+from abc import ABC, ABCMeta, abstractmethod
+
+import logging
+from typing import Any, Callable
+
+import numpy as np
+from numpy.typing import DTypeLike
+
+
+logger = logging.getLogger(__name__)
+
+
+class LazyMeta(ABCMeta):
+
+    def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
+        def __getattr__(self, name: str) -> Any:
+            meta_attr = getattr(self._meta, name)
+            if callable(meta_attr):
+                return type(self)._wrap_fn(
+                    (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
+                    use_self=self,
+                )
+            elif isinstance(meta_attr, self._tensor_type):
+                # e.g. self.T with torch.Tensor should still be wrapped
+                return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
+            else:
+                # no need to wrap non-tensor properties,
+                # and they likely don't depend on the actual contents of the tensor
+                return meta_attr
+
+        namespace["__getattr__"] = __getattr__
+
+        # need to make a builder for the wrapped wrapper to copy the name,
+        # or else it fails with very cryptic error messages,
+        # because somehow the same string would end up in every closures
+        def mk_wrap(op_name: str, *, meta_noop: bool = False):
+            # need to wrap the wrapper to get self
+            def wrapped_special_op(self, *args, **kwargs):
+                return type(self)._wrap_fn(
+                    getattr(type(self)._tensor_type, op_name),
+                    meta_noop=meta_noop,
+                )(self, *args, **kwargs)
+            return wrapped_special_op
+
+        # special methods bypass __getattr__, so they need to be added manually
+        # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
+        # NOTE: doing this from a metaclass is very convenient
+        # TODO: make this even more comprehensive
+        for binary_op in (
+            "lt", "le", "eq", "ne", "ge", "gt", "not"
+            "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
+            "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
+            "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
+            "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
+        ):
+            attr_name = f"__{binary_op}__"
+            # the result of these operators usually has the same shape and dtype as the input,
+            # so evaluation on the meta tensor can be skipped.
+            namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
+
+        for special_op in (
+            "getitem", "setitem", "len",
+        ):
+            attr_name = f"__{special_op}__"
+            namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
+
+        return super().__new__(cls, name, bases, namespace, **kwargs)
+
+
+# Tree of lazy tensors
+class LazyBase(ABC, metaclass=LazyMeta):
+    _tensor_type: type
+    _meta: Any
+    _data: Any | None
+    _args: tuple
+    _kwargs: dict[str, Any]
+    _func: Callable[[Any], Any] | None
+
+    def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
+        super().__init__()
+        self._meta = meta
+        self._data = data
+        self._args = args
+        self._kwargs = kwargs if kwargs is not None else {}
+        self._func = func
+        assert self._func is not None or self._data is not None
+
+    def __init_subclass__(cls) -> None:
+        if "_tensor_type" not in cls.__dict__:
+            raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
+        return super().__init_subclass__()
+
+    @staticmethod
+    def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
+        # TODO: dict and set
+        if isinstance(o, (list, tuple)):
+            L = []
+            for item in o:
+                L.append(LazyBase._recurse_apply(item, fn))
+            if isinstance(o, tuple):
+                L = tuple(L)
+            return L
+        elif isinstance(o, LazyBase):
+            return fn(o)
+        else:
+            return o
+
+    @classmethod
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+        def wrapped_fn(*args, **kwargs):
+            if kwargs is None:
+                kwargs = {}
+            args = ((use_self,) if use_self is not None else ()) + args
+
+            meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
+            # TODO: maybe handle tensors in kwargs too
+
+            if isinstance(meta_noop, bool) and not meta_noop:
+                try:
+                    res = fn(*meta_args, **kwargs)
+                except NotImplementedError:
+                    # running some operations on PyTorch's Meta tensors can cause this exception
+                    res = None
+            else:
+                # some operators don't need to actually run on the meta tensors
+                assert len(args) > 0
+                res = args[0]
+                assert isinstance(res, cls)
+                res = res._meta
+                # allow operations to override the dtype and shape
+                if meta_noop is not True:
+                    if isinstance(meta_noop, tuple):
+                        dtype, shape = meta_noop
+                        assert callable(shape)
+                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
+                    else:
+                        res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
+
+            if isinstance(res, cls._tensor_type):
+                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
+            else:
+                del res  # not needed
+                # non-tensor return likely relies on the contents of the args
+                # (e.g. the result of torch.equal)
+                eager_args = cls.to_eager(args)
+                return fn(*eager_args, **kwargs)
+        return wrapped_fn
+
+    @classmethod
+    def to_eager(cls, t: Any) -> Any:
+        def simple_to_eager(_t: LazyBase) -> Any:
+            if _t._data is not None:
+                return _t._data
+
+            # NOTE: there's a recursion limit in Python (usually 1000)
+
+            assert _t._func is not None
+            _t._args = cls._recurse_apply(_t._args, simple_to_eager)
+            _t._data = _t._func(*_t._args, **_t._kwargs)
+            # sanity check
+            assert _t._data is not None
+            assert _t._data.dtype == _t._meta.dtype
+            assert _t._data.shape == _t._meta.shape
+
+            return _t._data
+
+        # recurse into lists and/or tuples, keeping their structure
+        return cls._recurse_apply(t, simple_to_eager)
+
+    @classmethod
+    def eager_to_meta(cls, t: Any) -> Any:
+        return cls.meta_with_dtype_and_shape(t.dtype, t.shape)
+
+    # must be overridden, meta tensor init is backend-specific
+    @classmethod
+    @abstractmethod
+    def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
+
+    @classmethod
+    def from_eager(cls, t: Any) -> Any:
+        if type(t) is cls:
+            # already lazy
+            return t
+        elif isinstance(t, cls._tensor_type):
+            return cls(meta=cls.eager_to_meta(t), data=t)
+        else:
+            return TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")
+
+
+class LazyNumpyTensor(LazyBase):
+    _tensor_type = np.ndarray
+
+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
+    @classmethod
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
+        # The initial idea was to use np.nan as the fill value,
+        # but non-float types like np.int16 can't use that.
+        # So zero it is.
+        cheat = np.zeros(1, dtype)
+        return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))
+
+    def astype(self, dtype, *args, **kwargs):
+        meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
+        full_args = (self, dtype,) + args
+        return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
+
+    def tofile(self, *args, **kwargs):
+        eager = LazyNumpyTensor.to_eager(self)
+        return eager.tofile(*args, **kwargs)
+
+    # TODO: __array_function__
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
new file mode 100644
index 000000000..962c27b20
--- /dev/null
+++ b/gguf-py/gguf/metadata.py
@@ -0,0 +1,622 @@
+from __future__ import annotations
+
+import re
+import json
+import yaml
+import logging
+from pathlib import Path
+from typing import Any, Literal, Optional
+from dataclasses import dataclass
+
+from .constants import Keys
+
+import gguf
+
+logger = logging.getLogger("metadata")
+
+
+@dataclass
+class Metadata:
+    # Authorship Metadata to be written to GGUF KV Store
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    organization: Optional[str] = None
+    finetune: Optional[str] = None
+    basename: Optional[str] = None
+    description: Optional[str] = None
+    quantized_by: Optional[str] = None
+    size_label: Optional[str] = None
+    url: Optional[str] = None
+    doi: Optional[str] = None
+    uuid: Optional[str] = None
+    repo_url: Optional[str] = None
+    source_url: Optional[str] = None
+    source_doi: Optional[str] = None
+    source_uuid: Optional[str] = None
+    source_repo_url: Optional[str] = None
+    license: Optional[str] = None
+    license_name: Optional[str] = None
+    license_link: Optional[str] = None
+    base_models: Optional[list[dict]] = None
+    tags: Optional[list[str]] = None
+    languages: Optional[list[str]] = None
+    datasets: Optional[list[dict]] = None
+
+    @staticmethod
+    def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata:
+        # This grabs as many contextual authorship metadata as possible from the model repository
+        # making any conversion as required to match the gguf kv store metadata format
+        # as well as giving users the ability to override any authorship metadata that may be incorrect
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        model_card = Metadata.load_model_card(model_path)
+        hf_params = Metadata.load_hf_parameters(model_path)
+        # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
+
+        # heuristics
+        metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
+
+        # Metadata Override File Provided
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata_override = Metadata.load_metadata_override(metadata_override_path)
+
+        metadata.name            = metadata_override.get(Keys.General.NAME,            metadata.name)
+        metadata.author          = metadata_override.get(Keys.General.AUTHOR,          metadata.author)
+        metadata.version         = metadata_override.get(Keys.General.VERSION,         metadata.version)
+        metadata.organization    = metadata_override.get(Keys.General.ORGANIZATION,    metadata.organization)
+
+        metadata.finetune        = metadata_override.get(Keys.General.FINETUNE,        metadata.finetune)
+        metadata.basename        = metadata_override.get(Keys.General.BASENAME,        metadata.basename)
+
+        metadata.description     = metadata_override.get(Keys.General.DESCRIPTION,     metadata.description)
+        metadata.quantized_by    = metadata_override.get(Keys.General.QUANTIZED_BY,    metadata.quantized_by)
+
+        metadata.size_label      = metadata_override.get(Keys.General.SIZE_LABEL,      metadata.size_label)
+        metadata.license_name    = metadata_override.get(Keys.General.LICENSE_NAME,    metadata.license_name)
+        metadata.license_link    = metadata_override.get(Keys.General.LICENSE_LINK,    metadata.license_link)
+
+        metadata.url             = metadata_override.get(Keys.General.URL,             metadata.url)
+        metadata.doi             = metadata_override.get(Keys.General.DOI,             metadata.doi)
+        metadata.uuid            = metadata_override.get(Keys.General.UUID,            metadata.uuid)
+        metadata.repo_url        = metadata_override.get(Keys.General.REPO_URL,        metadata.repo_url)
+
+        metadata.source_url      = metadata_override.get(Keys.General.SOURCE_URL,      metadata.source_url)
+        metadata.source_doi      = metadata_override.get(Keys.General.SOURCE_DOI,      metadata.source_doi)
+        metadata.source_uuid     = metadata_override.get(Keys.General.SOURCE_UUID,     metadata.source_uuid)
+        metadata.source_repo_url = metadata_override.get(Keys.General.SOURCE_REPO_URL, metadata.source_repo_url)
+
+        # Base Models is received here as an array of models
+        metadata.base_models     = metadata_override.get("general.base_models",        metadata.base_models)
+
+        # Datasets is received here as an array of datasets
+        metadata.datasets        = metadata_override.get("general.datasets",           metadata.datasets)
+
+        metadata.tags            = metadata_override.get(Keys.General.TAGS,            metadata.tags)
+        metadata.languages       = metadata_override.get(Keys.General.LANGUAGES,       metadata.languages)
+
+        # Direct Metadata Override (via direct cli argument)
+        if model_name is not None:
+            metadata.name = model_name
+
+        return metadata
+
+    @staticmethod
+    def load_metadata_override(metadata_override_path: Optional[Path] = None) -> dict[str, Any]:
+        if metadata_override_path is None or not metadata_override_path.is_file():
+            return {}
+
+        with open(metadata_override_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    @staticmethod
+    def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
+        if model_path is None or not model_path.is_dir():
+            return {}
+
+        model_card_path = model_path / "README.md"
+
+        if not model_card_path.is_file():
+            return {}
+
+        # The model card metadata is assumed to always be in YAML
+        # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
+        with open(model_card_path, "r", encoding="utf-8") as f:
+            if f.readline() == "---\n":
+                raw = f.read().partition("---\n")[0]
+                data = yaml.safe_load(raw)
+                if isinstance(data, dict):
+                    return data
+                else:
+                    logger.error(f"while reading YAML model card frontmatter, data is {type(data)} instead of dict")
+                    return {}
+            else:
+                return {}
+
+    @staticmethod
+    def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
+        if model_path is None or not model_path.is_dir():
+            return {}
+
+        config_path = model_path / "config.json"
+
+        if not config_path.is_file():
+            return {}
+
+        with open(config_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    @staticmethod
+    def id_to_title(string):
+        # Convert capitalization into title form unless acronym or version number
+        return ' '.join([w.title() if w.islower() and not re.match(r'^(v\d+(?:\.\d+)*|\d.*)$', w) else w for w in string.strip().replace('-', ' ').split()])
+
+    @staticmethod
+    def get_model_id_components(model_id: Optional[str] = None, total_params: int = 0) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]:
+        # Huggingface often store model id as '<org>/<model name>'
+        # so let's parse it and apply some heuristics if possible for model name components
+
+        if model_id is None:
+            # model ID missing
+            return None, None, None, None, None, None
+
+        if ' ' in model_id:
+            # model ID is actually a normal human sentence
+            # which means its most likely a normal model name only
+            # not part of the hugging face naming standard, but whatever
+            return model_id, None, None, None, None, None
+
+        if '/' in model_id:
+            # model ID (huggingface style)
+            org_component, model_full_name_component = model_id.split('/', 1)
+        else:
+            # model ID but missing org components
+            org_component, model_full_name_component = None, model_id
+
+        # Check if we erroneously matched against './' or '../' etc...
+        if org_component is not None and len(org_component) > 0 and org_component[0] == '.':
+            org_component = None
+
+        name_parts: list[str] = model_full_name_component.split('-')
+
+        # Remove empty parts
+        for i in reversed(range(len(name_parts))):
+            if len(name_parts[i]) == 0:
+                del name_parts[i]
+
+        name_types: list[
+            set[Literal["basename", "size_label", "finetune", "version", "type"]]
+        ] = [set() for _ in name_parts]
+
+        # Annotate the name
+        for i, part in enumerate(name_parts):
+            # Version
+            if re.fullmatch(r'(v|iter)?\d+([.]\d+)*', part, re.IGNORECASE):
+                name_types[i].add("version")
+            # Quant type (should not be there for base models, but still annotated)
+            elif re.fullmatch(r'i?q\d(_\w)*|b?fp?(16|32)', part, re.IGNORECASE):
+                name_types[i].add("type")
+                name_parts[i] = part.upper()
+            # Model size
+            elif i > 0 and re.fullmatch(r'(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)', part, re.IGNORECASE):
+                part = part.replace("_", ".")
+                # Handle weird bloom-7b1 notation
+                if part[-1].isdecimal():
+                    part = part[:-2] + "." + part[-1] + part[-2]
+                # Normalize the size suffixes
+                if len(part) > 1 and part[-2].isdecimal():
+                    if part[-1] in "kmbt":
+                        part = part[:-1] + part[-1].upper()
+                if total_params != 0:
+                    try:
+                        label_params = float(part[:-1]) * pow(1000, " KMBT".find(part[-1]))
+                        # Only use it as a size label if it's close or bigger than the model size
+                        # Note that LoRA adapters don't necessarily include all layers,
+                        # so this is why bigger label sizes are accepted.
+                        # Do not use the size label when it's smaller than 1/8 of the model size
+                        if (total_params < 0 and label_params < abs(total_params) // 8) or (
+                            # Check both directions when the current model isn't a LoRA adapter
+                            total_params > 0 and abs(label_params - total_params) > 7 * total_params // 8
+                        ):
+                            # Likely a context length
+                            name_types[i].add("finetune")
+                            # Lowercase the size when it's a context length
+                            part = part[:-1] + part[-1].lower()
+                    except ValueError:
+                        # Failed to convert the size label to float, use it anyway
+                        pass
+                if len(name_types[i]) == 0:
+                    name_types[i].add("size_label")
+                name_parts[i] = part
+            # Some easy to recognize finetune names
+            elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
+                if total_params < 0 and part.lower() == "lora":
+                    # ignore redundant "lora" in the finetune part when the output is a lora adapter
+                    name_types[i].add("type")
+                else:
+                    name_types[i].add("finetune")
+
+        # Ignore word-based size labels when there is at least a number-based one present
+        # TODO: should word-based size labels always be removed instead?
+        if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
+            for n, t in zip(name_parts, name_types):
+                if "size_label" in t:
+                    if all(c.isalpha() for c in n):
+                        t.remove("size_label")
+
+        at_start = True
+        # Find the basename through the annotated name
+        for part, t in zip(name_parts, name_types):
+            if at_start and ((len(t) == 0 and part[0].isalpha()) or "version" in t):
+                t.add("basename")
+            else:
+                if at_start:
+                    at_start = False
+                if len(t) == 0:
+                    t.add("finetune")
+
+        # Remove the basename annotation from trailing version
+        for part, t in zip(reversed(name_parts), reversed(name_types)):
+            if "basename" in t and len(t) > 1:
+                t.remove("basename")
+            else:
+                break
+
+        basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
+        # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
+        size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
+        finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
+        # TODO: should the basename version always be excluded?
+        # NOTE: multiple finetune versions are joined together
+        version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
+
+        if size_label is None and finetune is None and version is None:
+            # Too ambiguous, output nothing
+            basename = None
+
+        return model_full_name_component, org_component, basename, finetune, version, size_label
+
+    @staticmethod
+    def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = None, hf_params: Optional[dict] = None, model_path: Optional[Path] = None, total_params: int = 0) -> Metadata:
+        # Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
+
+        # Model Card Heuristics
+        ########################
+        if model_card is not None:
+
+            def use_model_card_metadata(metadata_key: str, model_card_key: str):
+                if model_card_key in model_card and getattr(metadata, metadata_key, None) is None:
+                    setattr(metadata, metadata_key, model_card.get(model_card_key))
+
+            def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
+                # Note: Will append rather than replace if already exist
+                tags_value = model_card.get(model_card_key, None)
+                if tags_value is None:
+                    return
+
+                current_value = getattr(metadata, metadata_key, None)
+                if current_value is None:
+                    current_value = []
+
+                if isinstance(tags_value, str):
+                    current_value.append(tags_value)
+                elif isinstance(tags_value, list):
+                    current_value.extend(tags_value)
+
+                setattr(metadata, metadata_key, current_value)
+
+            # LLAMA.cpp's direct internal convention
+            # (Definitely not part of hugging face formal/informal standard)
+            #########################################
+            use_model_card_metadata("name", "name")
+            use_model_card_metadata("author", "author")
+            use_model_card_metadata("version", "version")
+            use_model_card_metadata("organization", "organization")
+            use_model_card_metadata("description", "description")
+            use_model_card_metadata("finetune", "finetune")
+            use_model_card_metadata("basename", "basename")
+            use_model_card_metadata("size_label", "size_label")
+            use_model_card_metadata("source_url", "url")
+            use_model_card_metadata("source_doi", "doi")
+            use_model_card_metadata("source_uuid", "uuid")
+            use_model_card_metadata("source_repo_url", "repo_url")
+
+            # LLAMA.cpp's huggingface style convention
+            # (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
+            ###########################################
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_author")
+            use_model_card_metadata("version", "model_version")
+            use_model_card_metadata("organization", "model_organization")
+            use_model_card_metadata("description", "model_description")
+            use_model_card_metadata("finetune", "model_finetune")
+            use_model_card_metadata("basename", "model_basename")
+            use_model_card_metadata("size_label", "model_size_label")
+            use_model_card_metadata("source_url", "model_url")
+            use_model_card_metadata("source_doi", "model_doi")
+            use_model_card_metadata("source_uuid", "model_uuid")
+            use_model_card_metadata("source_repo_url", "model_repo_url")
+
+            # Hugging Face Direct Convention
+            #################################
+
+            # Not part of huggingface model card standard but notice some model creator using it
+            # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_creator")
+            use_model_card_metadata("basename", "model_type")
+
+            if "base_model" in model_card or "base_models" in model_card or "base_model_sources" in model_card:
+                # This represents the parent models that this is based on
+                # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
+                # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
+                metadata_base_models = []
+                base_model_value = model_card.get("base_model", model_card.get("base_models", model_card.get("base_model_sources", None)))
+
+                if base_model_value is not None:
+                    if isinstance(base_model_value, str):
+                        metadata_base_models.append(base_model_value)
+                    elif isinstance(base_model_value, list):
+                        metadata_base_models.extend(base_model_value)
+
+                if metadata.base_models is None:
+                    metadata.base_models = []
+
+                for model_id in metadata_base_models:
+                    # NOTE: model size of base model is assumed to be similar to the size of the current model
+                    base_model = {}
+                    if isinstance(model_id, str):
+                        if model_id.startswith("http://") or model_id.startswith("https://") or model_id.startswith("ssh://"):
+                            base_model["repo_url"] = model_id
+
+                            # Check if Hugging Face ID is present in URL
+                            if "huggingface.co" in model_id:
+                                match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", model_id)
+                                if match:
+                                    model_id_component = match.group(1)
+                                    model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id_component, total_params)
+
+                                    # Populate model dictionary with extracted components
+                                    if model_full_name_component is not None:
+                                        base_model["name"] = Metadata.id_to_title(model_full_name_component)
+                                    if org_component is not None:
+                                        base_model["organization"] = Metadata.id_to_title(org_component)
+                                    if version is not None:
+                                        base_model["version"] = version
+
+                        else:
+                            # Likely a Hugging Face ID
+                            model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
+
+                            # Populate model dictionary with extracted components
+                            if model_full_name_component is not None:
+                                base_model["name"] = Metadata.id_to_title(model_full_name_component)
+                            if org_component is not None:
+                                base_model["organization"] = Metadata.id_to_title(org_component)
+                            if version is not None:
+                                base_model["version"] = version
+                            if org_component is not None and model_full_name_component is not None:
+                                base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
+
+                    elif isinstance(model_id, dict):
+                        base_model = model_id
+
+                    else:
+                        logger.error(f"base model entry '{str(model_id)}' not in a known format")
+
+                    metadata.base_models.append(base_model)
+
+            if "datasets" in model_card or "dataset" in model_card or "dataset_sources" in model_card:
+                # This represents the datasets that this was trained from
+                metadata_datasets = []
+                dataset_value = model_card.get("datasets", model_card.get("dataset", model_card.get("dataset_sources", None)))
+
+                if dataset_value is not None:
+                    if isinstance(dataset_value, str):
+                        metadata_datasets.append(dataset_value)
+                    elif isinstance(dataset_value, list):
+                        metadata_datasets.extend(dataset_value)
+
+                if metadata.datasets is None:
+                    metadata.datasets = []
+
+                for dataset_id in metadata_datasets:
+                    # NOTE: model size of base model is assumed to be similar to the size of the current model
+                    dataset = {}
+                    if isinstance(dataset_id, str):
+                        if dataset_id.startswith(("http://", "https://", "ssh://")):
+                            dataset["repo_url"] = dataset_id
+
+                            # Check if Hugging Face ID is present in URL
+                            if "huggingface.co" in dataset_id:
+                                match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", dataset_id)
+                                if match:
+                                    dataset_id_component = match.group(1)
+                                    dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id_component, total_params)
+
+                                    # Populate dataset dictionary with extracted components
+                                    if dataset_name_component is not None:
+                                        dataset["name"] = Metadata.id_to_title(dataset_name_component)
+                                    if org_component is not None:
+                                        dataset["organization"] = Metadata.id_to_title(org_component)
+                                    if version is not None:
+                                        dataset["version"] = version
+
+                        else:
+                            # Likely a Hugging Face ID
+                            dataset_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(dataset_id, total_params)
+
+                            # Populate dataset dictionary with extracted components
+                            if dataset_name_component is not None:
+                                dataset["name"] = Metadata.id_to_title(dataset_name_component)
+                            if org_component is not None:
+                                dataset["organization"] = Metadata.id_to_title(org_component)
+                            if version is not None:
+                                dataset["version"] = version
+                            if org_component is not None and dataset_name_component is not None:
+                                dataset["repo_url"] = f"https://huggingface.co/{org_component}/{dataset_name_component}"
+
+                    elif isinstance(dataset_id, dict):
+                        dataset = dataset_id
+
+                    else:
+                        logger.error(f"dataset entry '{str(dataset_id)}' not in a known format")
+
+                    metadata.datasets.append(dataset)
+
+            use_model_card_metadata("license", "license")
+            use_model_card_metadata("license_name", "license_name")
+            use_model_card_metadata("license_link", "license_link")
+
+            use_array_model_card_metadata("tags", "tags")
+            use_array_model_card_metadata("tags", "pipeline_tag")
+
+            use_array_model_card_metadata("languages", "languages")
+            use_array_model_card_metadata("languages", "language")
+
+        # Hugging Face Parameter Heuristics
+        ####################################
+
+        if hf_params is not None:
+
+            hf_name_or_path = hf_params.get("_name_or_path")
+            if hf_name_or_path is not None and hf_name_or_path.count('/') <= 1:
+                # Use _name_or_path only if its actually a model name and not some computer path
+                # e.g. 'meta-llama/Llama-2-7b-hf'
+                model_id = hf_name_or_path
+                model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
+                if metadata.name is None and model_full_name_component is not None:
+                    metadata.name = Metadata.id_to_title(model_full_name_component)
+                if metadata.organization is None and org_component is not None:
+                    metadata.organization = Metadata.id_to_title(org_component)
+                if metadata.basename is None and basename is not None:
+                    metadata.basename = basename
+                if metadata.finetune is None and finetune is not None:
+                    metadata.finetune = finetune
+                if metadata.version is None and version is not None:
+                    metadata.version = version
+                if metadata.size_label is None and size_label is not None:
+                    metadata.size_label = size_label
+
+        # Directory Folder Name Fallback Heuristics
+        ############################################
+        if model_path is not None:
+            model_id = model_path.name
+            model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
+            if metadata.name is None and model_full_name_component is not None:
+                metadata.name = Metadata.id_to_title(model_full_name_component)
+            if metadata.organization is None and org_component is not None:
+                metadata.organization = Metadata.id_to_title(org_component)
+            if metadata.basename is None and basename is not None:
+                metadata.basename = basename
+            if metadata.finetune is None and finetune is not None:
+                metadata.finetune = finetune
+            if metadata.version is None and version is not None:
+                metadata.version = version
+            if metadata.size_label is None and size_label is not None:
+                metadata.size_label = size_label
+
+        return metadata
+
+    def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
+        assert self.name is not None
+        gguf_writer.add_name(self.name)
+
+        if self.author is not None:
+            gguf_writer.add_author(self.author)
+        if self.version is not None:
+            gguf_writer.add_version(self.version)
+        if self.organization is not None:
+            gguf_writer.add_organization(self.organization)
+
+        if self.finetune is not None:
+            gguf_writer.add_finetune(self.finetune)
+        if self.basename is not None:
+            gguf_writer.add_basename(self.basename)
+
+        if self.description is not None:
+            gguf_writer.add_description(self.description)
+        if self.quantized_by is not None:
+            gguf_writer.add_quantized_by(self.quantized_by)
+
+        if self.size_label is not None:
+            gguf_writer.add_size_label(self.size_label)
+
+        if self.license is not None:
+            if isinstance(self.license, list):
+                gguf_writer.add_license(",".join(self.license))
+            else:
+                gguf_writer.add_license(self.license)
+        if self.license_name is not None:
+            gguf_writer.add_license_name(self.license_name)
+        if self.license_link is not None:
+            gguf_writer.add_license_link(self.license_link)
+
+        if self.url is not None:
+            gguf_writer.add_url(self.url)
+        if self.doi is not None:
+            gguf_writer.add_doi(self.doi)
+        if self.uuid is not None:
+            gguf_writer.add_uuid(self.uuid)
+        if self.repo_url is not None:
+            gguf_writer.add_repo_url(self.repo_url)
+
+        if self.source_url is not None:
+            gguf_writer.add_source_url(self.source_url)
+        if self.source_doi is not None:
+            gguf_writer.add_source_doi(self.source_doi)
+        if self.source_uuid is not None:
+            gguf_writer.add_source_uuid(self.source_uuid)
+        if self.source_repo_url is not None:
+            gguf_writer.add_source_repo_url(self.source_repo_url)
+
+        if self.base_models is not None:
+            gguf_writer.add_base_model_count(len(self.base_models))
+            for key, base_model_entry in enumerate(self.base_models):
+                if "name" in base_model_entry:
+                    gguf_writer.add_base_model_name(key, base_model_entry["name"])
+                if "author" in base_model_entry:
+                    gguf_writer.add_base_model_author(key, base_model_entry["author"])
+                if "version" in base_model_entry:
+                    gguf_writer.add_base_model_version(key, base_model_entry["version"])
+                if "organization" in base_model_entry:
+                    gguf_writer.add_base_model_organization(key, base_model_entry["organization"])
+                if "description" in base_model_entry:
+                    gguf_writer.add_base_model_description(key, base_model_entry["description"])
+                if "url" in base_model_entry:
+                    gguf_writer.add_base_model_url(key, base_model_entry["url"])
+                if "doi" in base_model_entry:
+                    gguf_writer.add_base_model_doi(key, base_model_entry["doi"])
+                if "uuid" in base_model_entry:
+                    gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"])
+                if "repo_url" in base_model_entry:
+                    gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"])
+
+        if self.datasets is not None:
+            gguf_writer.add_dataset_count(len(self.datasets))
+            for key, dataset_entry in enumerate(self.datasets):
+                if "name" in dataset_entry:
+                    gguf_writer.add_dataset_name(key, dataset_entry["name"])
+                if "author" in dataset_entry:
+                    gguf_writer.add_dataset_author(key, dataset_entry["author"])
+                if "version" in dataset_entry:
+                    gguf_writer.add_dataset_version(key, dataset_entry["version"])
+                if "organization" in dataset_entry:
+                    gguf_writer.add_dataset_organization(key, dataset_entry["organization"])
+                if "description" in dataset_entry:
+                    gguf_writer.add_dataset_description(key, dataset_entry["description"])
+                if "url" in dataset_entry:
+                    gguf_writer.add_dataset_url(key, dataset_entry["url"])
+                if "doi" in dataset_entry:
+                    gguf_writer.add_dataset_doi(key, dataset_entry["doi"])
+                if "uuid" in dataset_entry:
+                    gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"])
+                if "repo_url" in dataset_entry:
+                    gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"])
+
+        if self.tags is not None:
+            gguf_writer.add_tags(self.tags)
+        if self.languages is not None:
+            gguf_writer.add_languages(self.languages)
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
new file mode 100644
index 000000000..3c8ba82e1
--- /dev/null
+++ b/gguf-py/gguf/quants.py
@@ -0,0 +1,1269 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Sequence
+from math import log2, ceil
+
+from numpy.typing import DTypeLike
+
+from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
+from .lazy import LazyNumpyTensor
+
+import numpy as np
+
+
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
+# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
+def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+    rows = arr.reshape((-1, arr.shape[-1]))
+    osize = 1
+    for dim in oshape:
+        osize *= dim
+    out = np.empty(shape=osize, dtype=otype)
+    # compute over groups of 16 rows (arbitrary, but seems good for performance)
+    n_groups = (rows.shape[0] // 16) or 1
+    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
+    return out.reshape(oshape)
+
+
+# round away from zero
+# ref: https://stackoverflow.com/a/59143326/22827863
+def np_roundf(n: np.ndarray) -> np.ndarray:
+    a = abs(n)
+    floored = np.floor(a)
+    b = floored + np.floor(2 * (a - floored))
+    return np.sign(n) * b
+
+
+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.view(np.float32)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.view(np.float16).astype(np.float32)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
+class __Quant(ABC):
+    qtype: GGMLQuantizationType
+    block_size: int
+    type_size: int
+
+    grid: np.ndarray[Any, np.dtype[np.float32]] | None = None
+    grid_shape: tuple[int, int] = (0, 0)
+    grid_map: tuple[int | float, ...] = ()
+    grid_hex: bytes | None = None
+
+    def __init__(self):
+        return TypeError("Quant conversion classes can't have instances")
+
+    def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
+        cls.qtype = qtype
+        cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
+        cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
+            cls.__quantize_array,
+            meta_noop=(np.uint8, cls.__shape_to_bytes)
+        )
+        cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
+            cls.__dequantize_array,
+            meta_noop=(np.float32, cls.__shape_from_bytes)
+        )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls
+
+    @classmethod
+    def init_grid(cls):
+        if cls.grid is not None or cls.grid_hex is None:
+            return
+
+        bits_per_elem = ceil(log2(len(cls.grid_map)))
+        assert bits_per_elem != 0, cls.qtype.name
+        elems_per_byte = 8 // bits_per_elem
+
+        grid = np.frombuffer(cls.grid_hex, dtype=np.uint8)
+        # decode hexadecimal chars from grid
+        grid = grid.reshape((-1, 2))
+        grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2))
+        grid = grid[..., 0] | grid[..., 1]
+        # unpack the grid values
+        grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte))
+        grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1))
+        grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1))
+        grid = np.take_along_axis(grid_map, grid, axis=-1)
+        cls.grid = grid.reshape((1, 1, *cls.grid_shape))
+
+    @classmethod
+    @abstractmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        raise NotImplementedError
+
+    @classmethod
+    def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
+        rows = rows.astype(np.float32, copy=False)
+        shape = rows.shape
+        n_blocks = rows.size // cls.block_size
+        blocks = rows.reshape((n_blocks, cls.block_size))
+        blocks = cls.quantize_blocks(blocks)
+        assert blocks.dtype == np.uint8
+        assert blocks.shape[-1] == cls.type_size
+        return blocks.reshape(cls.__shape_to_bytes(shape))
+
+    @classmethod
+    def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
+        rows = rows.view(np.uint8)
+        shape = rows.shape
+        n_blocks = rows.size // cls.type_size
+        blocks = rows.reshape((n_blocks, cls.type_size))
+        blocks = cls.dequantize_blocks(blocks)
+        assert blocks.dtype == np.float32
+        assert blocks.shape[-1] == cls.block_size
+        return blocks.reshape(cls.__shape_from_bytes(shape))
+
+    @classmethod
+    def __shape_to_bytes(cls, shape: Sequence[int]):
+        return quant_shape_to_byte_shape(shape, cls.qtype)
+
+    @classmethod
+    def __shape_from_bytes(cls, shape: Sequence[int]):
+        return quant_shape_from_byte_shape(shape, cls.qtype)
+
+    @classmethod
+    def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
+        return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
+
+    @classmethod
+    def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
+        cls.init_grid()
+        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
+
+    @classmethod
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
+        pass
+
+    @classmethod
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
+        pass
+
+    @classmethod
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
+        return tensor.shape[-1] % cls.block_size == 0
+
+    @classmethod
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
+        if isinstance(tensor, LazyNumpyTensor):
+            return cls.__quantize_lazy(tensor)
+        else:
+            return cls.__quantize_array(tensor)
+
+    @classmethod
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if isinstance(tensor, LazyNumpyTensor):
+            return cls.__dequantize_lazy(tensor)
+        else:
+            return cls.__dequantize_array(tensor)
+
+
+class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
+    @classmethod
+    # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n = blocks.view(np.uint32)
+        # force nan to quiet
+        n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
+        # round to nearest even
+        n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
+        return n.astype(np.uint16).view(np.uint8)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
+
+
+class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        imax = abs(blocks).argmax(axis=-1, keepdims=True)
+        max = np.take_along_axis(blocks, imax, axis=-1)
+
+        d = max / -8
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        # FIXME: Q4_0's reference rounding is cursed and depends on FMA
+        qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
+
+        qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
+        qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
+
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([d, qs], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, qs = np.hsplit(blocks, [2])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8)
+
+        return (d * qs.astype(np.float32))
+
+
+class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        max = blocks.max(axis=-1, keepdims=True)
+        min = blocks.min(axis=-1, keepdims=True)
+
+        d = (max - min) / 15
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
+
+        qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
+        qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
+
+        d = d.astype(np.float16).view(np.uint8)
+        m = min.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([d, m, qs], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        m, qs = np.hsplit(rest, [2])
+
+        d = d.view(np.float16).astype(np.float32)
+        m = m.view(np.float16).astype(np.float32)
+
+        qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32)
+
+        return (d * qs) + m
+
+
+class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        imax = abs(blocks).argmax(axis=-1, keepdims=True)
+        max = np.take_along_axis(blocks, imax, axis=-1)
+
+        d = max / -16
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        # FIXME: Q5_0's reference rounding is cursed and depends on FMA
+        q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
+
+        qs = q.reshape((n_blocks, 2, cls.block_size // 2))
+        qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
+
+        qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
+
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([d, qh, qs], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        qh, qs = np.hsplit(rest, [4])
+
+        d = d.view(np.float16).astype(np.float32)
+        qh = qh.view(np.uint32)
+
+        qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
+        ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qh = (qh & np.uint32(0x01)).astype(np.uint8)
+        ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
+
+        qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16)
+
+        return (d * qs.astype(np.float32))
+
+
+class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        max = blocks.max(axis=-1, keepdims=True)
+        min = blocks.min(axis=-1, keepdims=True)
+
+        d = (max - min) / 31
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
+
+        qs = q.reshape((n_blocks, 2, cls.block_size // 2))
+        qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
+
+        qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
+
+        d = d.astype(np.float16).view(np.uint8)
+        m = min.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([d, m, qh, qs], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        m, rest = np.hsplit(rest, [2])
+        qh, qs = np.hsplit(rest, [4])
+
+        d = d.view(np.float16).astype(np.float32)
+        m = m.view(np.float16).astype(np.float32)
+        qh = qh.view(np.uint32)
+
+        qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
+        ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qh = (qh & np.uint32(0x01)).astype(np.uint8)
+        ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
+
+        qs = (ql | (qh << np.uint8(4))).astype(np.float32)
+
+        return (d * qs) + m
+
+
+class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
+    @classmethod
+    # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+
+        d = abs(blocks).max(axis=1, keepdims=True) / 127
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+
+        # (n_blocks, 2)
+        d = d.astype(np.float16).view(np.uint8)
+        # (n_blocks, block_size)
+        qs = qs.astype(np.int8).view(np.uint8)
+
+        return np.concatenate([d, qs], axis=1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        d, x = np.split(blocks, [2], axis=1)
+        d = d.view(np.float16).astype(np.float32)
+        x = x.view(np.int8).astype(np.float32)
+
+        return (x * d)
+
+
+class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K):
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        scales, rest = np.hsplit(blocks, [QK_K // 16])
+        qs, rest = np.hsplit(rest, [QK_K // 4])
+        d, dmin = np.hsplit(rest, [2])
+
+        d = d.view(np.float16).astype(np.float32)
+        dmin = dmin.view(np.float16).astype(np.float32)
+
+        # (n_blocks, 16, 1)
+        dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
+        ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
+
+        shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+
+        qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3)
+
+        qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32)
+
+        qs = dl * qs - ml
+
+        return qs.reshape((n_blocks, -1))
+
+
+class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K):
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        hmask, rest = np.hsplit(blocks, [QK_K // 8])
+        qs, rest = np.hsplit(rest, [QK_K // 4])
+        scales, d = np.hsplit(rest, [12])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        # The scales are packed at 6-bit each in this pattern:
+        #  0: IIIIAAAA
+        #  1: JJJJBBBB
+        #  2: KKKKCCCC
+        #  3: LLLLDDDD
+        #  4: MMMMEEEE
+        #  5: NNNNFFFF
+        #  6: OOOOGGGG
+        #  7: PPPPHHHH
+        #  8: MMIIEEAA
+        #  9: NNJJFFBB
+        # 10: OOKKGGCC
+        # 11: PPLLHHDD
+        lscales, hscales = np.hsplit(scales, [8])
+        lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
+        lscales = lscales.reshape((n_blocks, 16))
+        hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1))
+        hscales = hscales.reshape((n_blocks, 16))
+        scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4))
+        scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32)
+
+        dl = (d * scales).reshape((n_blocks, 16, 1))
+
+        ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
+        ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3)
+        qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1))
+        qh = qh ^ np.uint8(1)  # strangely, the offset is zero when the bitmask is 1
+        q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32)
+
+        return (dl * q).reshape((n_blocks, QK_K))
+
+
+class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K):
+    K_SCALE_SIZE = 12
+
+    @staticmethod
+    def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+        n_blocks = scales.shape[0]
+        scales = scales.view(np.uint8)
+        ### Unpacking the following: ###
+        #  0 EEAAAAAA
+        #  1 FFBBBBBB
+        #  2 GGCCCCCC
+        #  3 HHDDDDDD
+        #  4 eeaaaaaa
+        #  5 ffbbbbbb
+        #  6 ggcccccc
+        #  7 hhdddddd
+        #  8 eeeeEEEE
+        #  9 ffffFFFF
+        # 10 ggggGGGG
+        # 11 hhhhHHHH
+        scales = scales.reshape((n_blocks, 3, 4))
+        d, m, m_d = np.split(scales, 3, axis=-2)
+
+        sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1)
+        min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1)
+
+        return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8)))
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        dmin, rest = np.hsplit(rest, [2])
+        scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE])
+
+        d = d.view(np.float16).astype(np.float32)
+        dmin = dmin.view(np.float16).astype(np.float32)
+
+        sc, m = Q4_K.get_scale_min(scales)
+
+        d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
+        dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))
+
+        qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32)
+
+        return (d * qs - dm).reshape((n_blocks, QK_K))
+
+
+class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K):
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        dmin, rest = np.hsplit(rest, [2])
+        scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE])
+        qh, qs = np.hsplit(rest, [QK_K // 8])
+
+        d = d.view(np.float16).astype(np.float32)
+        dmin = dmin.view(np.float16).astype(np.float32)
+
+        sc, m = Q4_K.get_scale_min(scales)
+
+        d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
+        dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))
+
+        ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
+        ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
+        qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32))
+        q = (ql | (qh << np.uint8(4))).astype(np.float32)
+
+        return (d * q - dm).reshape((n_blocks, QK_K))
+
+
+class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        ql, rest = np.hsplit(blocks, [QK_K // 2])
+        qh, rest = np.hsplit(rest, [QK_K // 4])
+        scales, d = np.hsplit(rest, [QK_K // 16])
+
+        scales = scales.view(np.int8).astype(np.float32)
+        d = d.view(np.float16).astype(np.float32)
+        d = (d * scales).reshape((n_blocks, QK_K // 16, 1))
+
+        ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
+        qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32))
+        q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32)
+        q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32)
+
+        return (d * q).reshape((n_blocks, QK_K))
+
+
+class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d = abs(blocks).max(axis=-1, keepdims=True)
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
+
+        qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
+        qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
+        qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
+        qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
+        qs = np.concatenate([qs0, qs1, qh], axis=-1)
+        qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243
+
+        qs = qs.astype(np.uint8)
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([qs, d], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
+        qh, d = np.hsplit(rest, [QK_K // 64])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs0, qs1 = qs[..., :32], qs[..., 32:]
+        qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs0 = qs0.reshape((n_blocks, -1))
+        qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs1 = qs1.reshape((n_blocks, -1))
+        qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = qh.reshape((n_blocks, -1))
+        qs = np.concatenate([qs0, qs1, qh], axis=-1)
+        qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)
+
+        return (d * qs.astype(np.float32))
+
+
+class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d = abs(blocks).max(axis=-1, keepdims=True)
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
+
+        qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
+        qs = qs.reshape((n_blocks, -1))
+
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([qs, d], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, d = np.hsplit(blocks, [QK_K // 4])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)
+
+        return (d * qs.astype(np.float32))
+
+
+class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
+    ksigns: bytes = (
+        b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
+        b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f"
+        b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf"
+        b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f"
+        b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf"
+        b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f"
+        b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f"
+        b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff"
+    )
+
+    # iq2xxs_grid, but with each byte of the original packed in 2 bits,
+    # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
+    grid_shape = (256, 8)
+    grid_map = (0x08, 0x19, 0x2b)
+    grid_hex = (
+        b"00000200050008000a00110014002000220028002a0041004400500058006100"
+        b"6400800082008a00a20001010401100115014001840198010002020222028202"
+        b"010404041004210424044004420448046004810484049004a404000502050805"
+        b"200546056905800591050906100640068406a406000805080808140828084108"
+        b"440850085208880804094009020a140a01100410101021104010601084109010"
+        b"951000110811201150115a118011241245120014081420142514491480141815"
+        b"6215001616160118041810184018811800190519a019511a002002200a204420"
+        b"6120802082202921482100220222012404241024402456240025412564259026"
+        b"082820289428442a014004401040184021402440404048405640604081408440"
+        b"9040004120416141804185410142104248425642684200440844204480449944"
+        b"124524450046014804481048404845480049584961498249454a904a00500850"
+        b"1150195020508050885004514251a4519152905492540a550156545600581158"
+        b"195864584059085a046010604060686000615561186260620064056410651265"
+        b"84654268008002800a8041808280048118814081118201840484108415844084"
+        b"608400854685948509864086608602880489118a0490109024904090a1901691"
+        b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9"
+    )
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, qs = np.hsplit(blocks, [2])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs = qs.view(np.uint32).reshape(n_blocks, -1, 2)
+
+        db = d * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) * np.float32(0.25)
+        db = db.reshape((n_blocks, -1, 1, 1))
+
+        # get the sign indices and unpack the bits
+        signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
+        ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
+        signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1))
+        signs = np.take_along_axis(ksigns, signs, axis=-1)
+        signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8))
+        signs = signs & np.uint8(0x01)
+        signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+        signs = signs.reshape((n_blocks, -1, 4, 8))
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 4, 8))
+
+        return (db * grid * signs).reshape((n_blocks, -1))
+
+
+class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS):
+    # iq2xs_grid, but with each byte of the original packed in 2 bits,
+    # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
+    grid_shape = (512, 8)
+    grid_map = (0x08, 0x19, 0x2b)
+    grid_hex = (
+        b"00000200050008000a0011001400160019002000220025002800410044004600"
+        b"49005000520055005800610064008000820085008800910094009900a0000101"
+        b"04010601090110011201150118011a0121012401400142014501480151015401"
+        b"6001680181018401900100020202050208021102140220024102440250025502"
+        b"80028a0201040404060409041004120415041804210424044004420445044804"
+        b"5104540456046004810484049004000502050505080511051405200541054405"
+        b"500561058005010604061006260640064206840600080208050808080a081108"
+        b"14082008250841084408500858088008a008aa08010904091009400981098909"
+        b"000a200a280a960aa00a01100410061009101010121015101810211024104010"
+        b"4210451048105110541060106a10811084109010001102110511081111111411"
+        b"2011411144115011801194119611011204120612101240126012001402140514"
+        b"0814111414142014411444144914501464148014011504151015401500161416"
+        b"49160118041810181218401854188618001905196619511aa91a002002200520"
+        b"08200a201120142020204120442050208020a020012104211021402148216521"
+        b"002222228022a82201240424102429244024002541255225992501261a26a626"
+        b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440"
+        b"0640094010401240154018402140244040404240454048404a40514054406040"
+        b"6540814084409040004102410541084111411441204141414441504180418541"
+        b"a241014204421042124229424042004402440544084411441444194420444144"
+        b"4444504480449444014504451045244540459a4500460a464446504601480448"
+        b"1048404845485448624800491149444950496949044a00500250055008501150"
+        b"145020502850415044505050805001510451105115514051425100524452aa52"
+        b"0154045410542154405460548154a154005508558055885521566856a1560058"
+        b"14584158505899581a5940594259855a0160046010604060546062608660a960"
+        b"006124624a62926200641664106540654565a46501686a682569066a546a626a"
+        b"00800280058008801180148020802a8041804480508080808280a880aa800181"
+        b"0481068110814081518159810082208280828282a082a8820184048410841284"
+        b"158440846084898400854485a58518866a860088088825885a8880888288a888"
+        b"0689228a808a888a968aa88a0190049010904090569084900091229164915692"
+        b"89920094059444945094589429959095929541965198a6984999159a609a00a0"
+        b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4"
+        b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa"
+    )
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        qs, scales = np.hsplit(rest, [2 * QK_K // 8])
+
+        d = d.view(np.float16).astype(np.float32)
+        qs = qs.view(np.uint16)
+
+        scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+        scales = (scales & 0x0F).reshape((n_blocks, -1))
+        db = d * (np.float32(0.5) + scales) * np.float32(0.25)
+        db = db.reshape((n_blocks, -1, 1, 1))
+
+        # get the sign indices and unpack the bits
+        signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128)
+        signs = np.take_along_axis(signs, (qs >> 9).reshape((n_blocks, -1, 1)), axis=-1)
+        signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
+        signs = signs & np.uint8(0x01)
+        signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+        signs = signs.reshape((n_blocks, -1, 2, 8))
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 2, 8))
+
+        return (db * grid * signs).reshape((n_blocks, -1))
+
+
+class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S):
+    # iq2s_grid, but with each byte of the original packed in 2 bits,
+    # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
+    grid_shape = (1024, 8)
+    grid_map = (0x08, 0x19, 0x2b)
+    grid_hex = (
+        b"00000200050008000a0011001400160019002000220025002800410044004600"
+        b"490050005200550058006100640066006900800082008500880091009400a000"
+        b"a500aa0001010401060109011001120115011801210124014001420145014801"
+        b"510154015601590160016501680181018401900192019501a101a40100020202"
+        b"050208021102140220022a02410244024602490250025502800285028a029402"
+        b"a202010404040604090410041204150418042104240426042904400442044504"
+        b"48044a0451045404560459046004620465048104840486048904900495049804"
+        b"a104a40400050205050508050a05110514051605190520052505280541054405"
+        b"46054905500552055505580561056405800582058505880591059405a0050106"
+        b"0406060609061006150640064506480651065406600681068406900600080208"
+        b"050808081108140816081908200825082a084108440846084908500852085508"
+        b"580861086408800885089408aa08010904091009120915091809210940094509"
+        b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410"
+        b"0610091010101210151018102110241026104010421045104810511054105610"
+        b"59106010621065106810811084108610901095109810a110a410001102110511"
+        b"08110a1111111411161119112011221125112811411144114611491150115211"
+        b"5511581161116411801182118511881191119411011204120912101215122112"
+        b"2412401245125112541281128412901200140214051408141114141416141914"
+        b"2014251428144114441446144914501452145514581461146414801482148514"
+        b"881491149414a014011504150615091510151215151518152115241540154215"
+        b"4515481551155415601581158415901500160516081611161416201641164416"
+        b"50168016aa160118041806180918101815181818211840184218451848185118"
+        b"541860188118841800190219051908191119141920194119441950196919a219"
+        b"041a101a401a561a00200220052008201120142016201920202025202a204120"
+        b"4420502052205520642080208a209420aa200121042110211221152121214021"
+        b"4221452151215421602181218421902100220a22222228222a22442250228822"
+        b"8a22a82201240424062409241024152418242124242440244224452448245124"
+        b"5424602481248424902400250525082511251425202541254425502566258025"
+        b"0126042610264026592600280528112814284128442850288a28aa2801290429"
+        b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40"
+        b"21402440264040404240454048404a4051405440564059406040624065408140"
+        b"8440904095409840a140a4400041024105410841114114411641194120412241"
+        b"2541414144414641494150415241554158416141644180418241854188419141"
+        b"9441a04101420442104212421542184224424042454248425142544260428142"
+        b"844200440244054408440a441144144416441944204422442544284441444444"
+        b"46444944504452445544584461446444804482448544884491449444a0440145"
+        b"0445064509451045124515451845214524454045424545454845514554456045"
+        b"6a4581458445904500460246054608461146144620464146444650468046a546"
+        b"0148044809481048124815481848214824484048424845484848514854486048"
+        b"84489048004902490549084911491449204941494449504980499649014a044a"
+        b"104a404a00500250055008501150145016501950205022502550285041504450"
+        b"4650495050505250555058506150645080508250855088509150945001510451"
+        b"0651095110511251155118512151245140514251455148515151545160518151"
+        b"8451905100520552085211521452205241524452505269528052015404540654"
+        b"0954105412541554185421542454405442544554485451545454605481548454"
+        b"9054005502550555085511551455205541554455505580550156045610562656"
+        b"405600580258055808581158145820584158445850585a588058015904591059"
+        b"4059005a195a855aa85a01600460066010601260156018602160246040604560"
+        b"4860516054606060846090600061026105610861116114612061416144615061"
+        b"806199610462106240625662a162006405640864116414642064416444645064"
+        b"806401650465106540654a656865926500669466016804681068656898680069"
+        b"2a69426aa16a0080028005800880118014801980208025804180448050805280"
+        b"5580588061808080858091809480018104810981108112811581188121812481"
+        b"408142814581488151815481818184819081a981008205820a82118214824182"
+        b"4482508201840484068409841084128415841884218440844284458448845184"
+        b"5484608481848484908400850285058508851185148520854185448550858085"
+        b"8a85018604861086298640860088058811881488418844885088a28801890489"
+        b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090"
+        b"4290459048905190549060908190849090900091059111911491419144915091"
+        b"5a910192049210924092a6920094029405940894119414942094419444945094"
+        b"8094969401950495109540959895a19500964696649601980498109826984098"
+        b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0"
+        b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4"
+        b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa"
+    )
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        qs, rest = np.hsplit(rest, [QK_K // 8])
+        signs, rest = np.hsplit(rest, [QK_K // 8])
+        qh, scales = np.hsplit(rest, [QK_K // 32])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+        scales = (scales & 0x0F).reshape((n_blocks, -1))
+        db = d * (np.float32(0.5) + scales) * np.float32(0.25)
+        db = db.reshape((n_blocks, -1, 1, 1))
+
+        # unpack the sign bits
+        signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
+        signs = signs & np.uint8(0x01)
+        signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+        signs = signs.reshape((n_blocks, -1, 2, 8))
+
+        qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4))
+        qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape((n_blocks, -1))
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 2, 8))
+
+        return (db * grid * signs).reshape((n_blocks, -1))
+
+
+class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS):
+    grid_shape = (256, 4)
+    grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e)
+    grid_hex = (
+        b"0000020004001100130017002000220031004200730075000101030110011201"
+        b"2101250130013201410154017001000202020402110220022202310233023702"
+        b"5102570275020103070310031203250370031304370444045704730475040105"
+        b"0705320552053506640610071407160743076107011003101010121021102310"
+        b"3010321034104710501000110211111120112211011203121012121221123012"
+        b"7212001302132013311346136613011405145014201524154615711505162217"
+        b"4017002002201120132020202220262031204220012103210521102112212121"
+        b"3021632167217021002202221122172220222222372240225522012310231423"
+        b"7023742335245324032527254125742501270327162745270130103012302130"
+        b"2330503065307230003102312031313144314631013203321032253252327232"
+        b"1133333330344734723400350635223555351436363663363337603704401740"
+        b"3540374053405740744120423742404260426642074345430444514464442545"
+        b"4345704505471047124730471250415070500051065126515551145232527252"
+        b"0253535310542354275472540255315550562457425724604460466064602161"
+        b"6161176264623063366344640565526533660367216703700570077010703270"
+        b"5270267140711272457252720073157333736073217441740075027524753076"
+    )
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        qs, scales = np.hsplit(rest, [QK_K // 4])
+
+        d = d.view(np.float16).astype(np.float32)
+        scales = scales.view(np.uint32)
+
+        db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5)
+        db = db.reshape((n_blocks, -1, 1, 1))
+
+        # get the sign indices and unpack the bits
+        signs = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
+        ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
+        signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1))
+        signs = np.take_along_axis(ksigns, signs, axis=-1)
+        signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8))
+        signs = signs & np.uint8(0x01)
+        signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+        signs = signs.reshape((n_blocks, -1, 4, 8))
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 4, 8))
+
+        return (db * grid * signs).reshape((n_blocks, -1))
+
+
+class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S):
+    grid_shape = (512, 4)
+    grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f)
+    grid_hex = (
+        b"0000010002000500070010001100120014001600200021002500330040004200"
+        b"4500470051005300600062007100740077000001010102010401100111011501"
+        b"2001230127013101350144016101650172010002010205020702100213021602"
+        b"2102250230023402420245024702510253027002730203031103150320032203"
+        b"3103330336034403500352036703710375030004130417042104240432044004"
+        b"4304510470040205040520052205260533054105450547056605730506061106"
+        b"1306310652067106000702070407200722072607330750075407001001100210"
+        b"0410101011101310151017102010221031103410361054105610611072100011"
+        b"0111031106111011141121113011331141115011521170117611001212121512"
+        b"1712201224123212401243125512601272120113041307131013131321132713"
+        b"3013341341136213701303140514121414143114331442144614501454140115"
+        b"1015131521153015321551152016241627164416461601170317101712172117"
+        b"3517411762177017002001200320052007201020122014201620212023202720"
+        b"3020322041204320452050205220672070207320752000210221102113211721"
+        b"2221252131213421422151210122042207222122232230223722412253225722"
+        b"7122742200230223052311232223242331233323422350236623012407242024"
+        b"2324322435244124722475240425112522253725402553257025002602260726"
+        b"2126552661260527112726273027432750270230113013301530173022303130"
+        b"3330353042304430473051306330713001310331053114312131233140316031"
+        b"7231763100321232203232323432503201331033143321332333273330334133"
+        b"4333473355337333033411341634223431345234603464340135103512352535"
+        b"3235443556357335163641360137033720372237353700400440124020402440"
+        b"2740324041405040704002410741114113412241304135414341514155410142"
+        b"0342104215422142334240425742624270420443114313432043224331433543"
+        b"0044024424443744404471440545074521456245134634466046104715473047"
+        b"4347514702501050145022504050445047505250665074500151035105511251"
+        b"2151325172510052115223523052365253520253075310532753445351536553"
+        b"7353015404542054325446541255265551555355425602570457225711601360"
+        b"1560316033606060006120612761646112623462426255626262706200631463"
+        b"2163406325644364626400650365346560650566406611671367007004700770"
+        b"2070227036704070547062700271117124714371457101720472107216722172"
+        b"3072517202733273357353730174057413742074507422754275027631760077"
+    )
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        qs, rest = np.hsplit(rest, [QK_K // 4])
+        qh, rest = np.hsplit(rest, [QK_K // 32])
+        signs, scales = np.hsplit(rest, [QK_K // 8])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+        scales = (scales & 0x0F).reshape((n_blocks, -1))
+        db = d * (1 + 2 * scales)
+        db = db.reshape((n_blocks, -1, 1, 1))
+
+        # unpack the sign bits
+        signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
+        signs = signs & np.uint8(0x01)
+        signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+        signs = signs.reshape((n_blocks, -1, 4, 8))
+
+        qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8)
+        qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1))
+        qs = qs.astype(np.uint16) | (qh << 8)
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 4, 8))
+
+        return (db * grid * signs).reshape((n_blocks, -1))
+
+
+class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S):
+    # iq1s_grid, with each byte packed into 2 bits
+    # -1, 0, 1 <=> 0, 1, 2
+    grid_shape = (2048, 8)
+    grid_map = (-1, 0, 1)
+    grid_hex = (
+        b"00000200050008000a00110015002000220028002a0045005100540056006500"
+        b"8000820088008a009500a000a200a800aa000401050111011401160119011a01"
+        b"2501410146014901520155015a0161016401660168018501910194019601a501"
+        b"0002020208020a0215022002220228022a024502510259026402690280028202"
+        b"88028a02910295029902a002a202a802aa021104140416042504410449045504"
+        b"5a046404650491049904a5040105040505050605150518051a05290540054505"
+        b"4a0550055105540555055605590560056205650568056a058105910595059805"
+        b"9a05a105a405a505a605a9051406190641064406500652065506580660066106"
+        b"6606690685069106940699060008020808080a0815082008220828082a084508"
+        b"5108560865088008820888088a089508a008a208a808aa080509110914091909"
+        b"2409250941095009510955096109640969099109940996099909a509000a020a"
+        b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a"
+        b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510"
+        b"58106110641065106910911094109610a110a510011104110611091110111211"
+        b"1511181121112411291145114a11501151115211541155115611591160116511"
+        b"841192119511a111a41111121412161225124012461249125212551258125a12"
+        b"641266128512911294129612a512011406140914141415141814191421142614"
+        b"41144514461448144a1451145414551456145914621465146814841489149014"
+        b"94149514981499149a14a114a414a514a914021505150a151115141515151615"
+        b"191520152215251528152a154115441545154615511552155415551556155915"
+        b"5a1561156415651566156915801582158415851588158a159015911594159515"
+        b"961599159a15a015a215a51501160416051606161516161618161a1621162616"
+        b"401642164416451648164a165116551656165816591661166416651668166916"
+        b"6a1686168a1692169516a416a916111816182518411844184618491850185518"
+        b"58185a1860186118641866186918851891189418a5181019121915191a192119"
+        b"25194219441945194819511954195519561959195a19601965196a1989199119"
+        b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a"
+        b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520"
+        b"28202a20452051205920612065208020822088208a209520a020a220a520a820"
+        b"aa2005211121142119212521422144214921552158215a216121642165216621"
+        b"8521902196219921a521012208220a22112215222022222228222a2245225122"
+        b"562259226522812288228a2291229522a022a222a822aa220524142416241924"
+        b"252444244524462449245224552458245a2466248524912494249924a124a524"
+        b"0925152521252925402545254825512554255525592562256525682589259025"
+        b"9425952598259a25a125a425a625a92505261026122619262526412649265526"
+        b"6026612669268426862690269a260028022808280a2815282028222828282a28"
+        b"45285128542865288028822888288a28a028a228a828aa280929112914291929"
+        b"2529462949295229552961296429662969298529902996299929a429a529002a"
+        b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a"
+        b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440"
+        b"664094409940a140a6400041014104410641094112411541164118411a412141"
+        b"26412941454148414a41514154415541564159415a41654168416a4181418441"
+        b"8641904192419541a041a141a241054211421442164225424142524255425a42"
+        b"6442694289429442a5420144154419442944454448444a445144544455445644"
+        b"61446244654468446a44814486448944904492449544a044a144a94401450245"
+        b"05450a4511451445154516451945204525452a45414544454545464549455045"
+        b"5145544555455645584559456145644565456645694582458445854588459145"
+        b"94459545964599459a45a545a845aa450146054609461446154618461a462146"
+        b"2446294640464246454648465046514652465546564659466246654668468146"
+        b"85468a4694469546a146a446a6460548114815481a4825484248494850485548"
+        b"5848614864486648694885489148944896489948a5480149054906490a491049"
+        b"144915491849214924492649404945494a495149524954495549564959496049"
+        b"6249654966496a49864989499249954996499849a149a449a649a949164a444a"
+        b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550"
+        b"1a50215024502950405045504850515054505550565059506550685086508950"
+        b"95509850a050a150a650a9500551085109510a51115114511551165118511951"
+        b"20512551265128512a5141514451455146514951505151515251545155515651"
+        b"585159515a51615164516551665169518251855191519451955196519951a051"
+        b"a551aa5101520652125215521a5221522452425245524a525152545255525652"
+        b"595262526552855290529252955299529a52a452045405541154145415541654"
+        b"185419542154255428542a54415444544554465449544a545054515454545554"
+        b"5654585459545a54615462546454655466546954805488548a54915494549554"
+        b"96549954a154a454a554aa540155025504550555065509551055115512551455"
+        b"1555165519551a55215524552555265529554055415542554455455546554855"
+        b"4955505551555255545555555655585559555a55605561556455655566556855"
+        b"69556a5581558455855589558a559055915594559555965598559955a155a455"
+        b"a555a655a9550056015602560456065608560956115614561556185619562056"
+        b"2156225624562556265628562956415645564656485649564a56505651565256"
+        b"545655565656585659565a566156645665566956825685568656885689568a56"
+        b"915695569a56a256a556a656a856a95604580558065809581058155818582158"
+        b"2a58455848584a58515854585558565858585958605862586458655882588958"
+        b"9058925895589858a158a9580159025905590a59115914591559165919592559"
+        b"41594459455946594959505951595259545955595659585959595a5961596459"
+        b"655966596959815985598959915994599559965998599959a559045a085a155a"
+        b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a"
+        b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060"
+        b"5560566058605a60616064606660696081609660a56001610461066109611261"
+        b"15612161226126612961456149615161556156615961656166616a6184618a61"
+        b"92619561a161a661a96111621662196240624162466255625662586260628562"
+        b"91629662a56211641264156416641a6421642664296440644264456448644a64"
+        b"516454645564566459645a646064626465648464856489649064926494649564"
+        b"966498649a64a164a464a964056508650a651165156516651965446545654665"
+        b"496550655165546555655665596561656465656566656965866589658a659165"
+        b"9565966599659a65a265a565a665a86502660966156620662666286629664066"
+        b"456648664a66516654665566566658665a666066656668668066826685668a66"
+        b"9466966698669966a066a466a666aa661668196825684168526855685a686168"
+        b"6968856891689868a66801690469106915692169246926692969406941694569"
+        b"4669486951695469556956695969606965696a69826984698a699569a169a469"
+        b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a"
+        b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480"
+        b"5680598065808080828088808a809580a080a280a880aa800581118114811681"
+        b"1981258141814481498150815281558156815881598164816681698185818981"
+        b"948196819981a5810082028208820a8215822082228228822a82518254825982"
+        b"65828082828288828a829582a082a282a882aa82148419844184448451845584"
+        b"5a846184648469849484998401850985128515851a8526852985408541854585"
+        b"4885518554855585568559855a856585668568856a8581858485868589859085"
+        b"928595859885a68511861686198625864186448649864a865086558659865a86"
+        b"618666866a86858691869a86a4860088028808880a8815882088228828882a88"
+        b"41884588518854885988658869888088828888888a889588a088a288a888aa88"
+        b"05890689118914891689258941894489468949895089528955895a8961896489"
+        b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a"
+        b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590"
+        b"419046904990559058905a9069906a9085909190949096909990a59001910491"
+        b"069109911091159118911a912191249126912991409145915091519154915591"
+        b"569159916291659184918691929195919891a191a491a691a991059211921492"
+        b"19922592449246924992509252925592589266926992859294929692a9920194"
+        b"04940694109415941894269440944a9451945494559456945894599460946194"
+        b"62946594849486949294949495949894a194a9940095059508950a9510951195"
+        b"14951595169519952195259529952a9541954495459546954995509551955295"
+        b"549555955695589559955a956195649565956695699581958595889591959295"
+        b"94959595969599959a95a095a295a595a895aa95019604961096159619962096"
+        b"2696299645964896499651965296559656965996659668968296849689968a96"
+        b"929694969596a496a696a9960598169819982598419846985098529855985698"
+        b"5a98649865988598919896989998a59804990699099910991299159918991a99"
+        b"209921992499269940994299459948994a995199549955995699599962996599"
+        b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a"
+        b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0"
+        b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0"
+        b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1"
+        b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2"
+        b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4"
+        b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5"
+        b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5"
+        b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6"
+        b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8"
+        b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9"
+        b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa"
+        b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa"
+    )
+
+    delta = np.float32(0.125)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        qs, qh = np.hsplit(rest, [QK_K // 8])
+
+        d = d.view(np.float16).astype(np.float32)
+        qh = qh.view(np.uint16)
+
+        dl = d * (2 * ((qh >> 12) & 7) + 1)
+        dl = dl.reshape((n_blocks, -1, 1, 1))
+        delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta)
+        delta = delta.reshape((n_blocks, -1, 1, 1))
+
+        qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
+        qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1))
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 4, 8))
+
+        return (dl * (grid + delta)).reshape((n_blocks, -1))
+
+
+class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M):
+    grid_shape = IQ1_S.grid_shape
+    grid_map = IQ1_S.grid_map
+    grid_hex = IQ1_S.grid_hex
+
+    delta = IQ1_S.delta
+
+    # Okay *this* type is weird. It's the only one which stores the f16 scales in multiple parts.
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, rest = np.hsplit(blocks, [QK_K // 8])
+        qh, scales = np.hsplit(rest, [QK_K // 16])
+
+        # The f16 scale is packed across multiple bytes
+        scales = scales.view(np.uint16)
+        d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4))
+        d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3]
+        d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1))
+
+        scales = scales.reshape(n_blocks, -1, 1) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
+        scales = (scales & 0x07).reshape((n_blocks, -1))
+        dl = d * (2 * scales + 1)
+        dl = dl.reshape((n_blocks, -1, 2, 1, 1))
+
+        qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+        qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape((n_blocks, -1))
+
+        delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta)
+        delta = delta.reshape((n_blocks, -1, 2, 2, 1))
+
+        assert cls.grid is not None
+        grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+        grid = grid.reshape((n_blocks, -1, 2, 2, 8))
+
+        return (dl * (grid + delta)).reshape((n_blocks, -1))
+
+
+class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL):
+    kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, qs = np.hsplit(blocks, [2])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+
+        qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1))
+
+        kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
+        qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1))
+
+        return (d * qs)
+
+
+class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS):
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d, rest = np.hsplit(blocks, [2])
+        scales_h, rest = np.hsplit(rest, [2])
+        scales_l, qs = np.hsplit(rest, [QK_K // 64])
+
+        d = d.view(np.float16).astype(np.float32)
+        scales_h = scales_h.view(np.uint16)
+
+        scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+        scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array([2 * i for i in range(QK_K // 32)], dtype=np.uint16).reshape((1, -1, 1))
+        scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F)
+        scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03)
+
+        scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32)
+        dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1))
+
+        qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+        qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F)
+
+        kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1))
+        qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1, 32))
+
+        return (dl * qs).reshape((n_blocks, -1))
diff --git a/gguf-py/gguf/scripts/__init__.py b/gguf-py/gguf/scripts/__init__.py
new file mode 100644
index 000000000..e77f2e9c9
--- /dev/null
+++ b/gguf-py/gguf/scripts/__init__.py
@@ -0,0 +1,6 @@
+# pyright: reportUnusedImport=false
+
+from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
+from .gguf_dump import main as gguf_dump_entrypoint
+from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
+from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
diff --git a/gguf-py/gguf/scripts/gguf_convert_endian.py b/gguf-py/gguf/scripts/gguf_convert_endian.py
new file mode 100755
index 000000000..f97e91bd4
--- /dev/null
+++ b/gguf-py/gguf/scripts/gguf_convert_endian.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import argparse
+import os
+import sys
+from tqdm import tqdm
+from pathlib import Path
+
+import numpy as np
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import gguf
+
+logger = logging.getLogger("gguf-convert-endian")
+
+
+def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
+    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+        # Host is little endian
+        host_endian = "little"
+        swapped_endian = "big"
+    else:
+        # Sorry PDP or other weird systems that don't use BE or LE.
+        host_endian = "big"
+        swapped_endian = "little"
+    if reader.byte_order == "S":
+        file_endian = swapped_endian
+    else:
+        file_endian = host_endian
+    order = host_endian if args.order == "native" else args.order
+    logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
+    if file_endian == order:
+        logger.info(f"* File is already {order.upper()} endian. Nothing to do.")
+        sys.exit(0)
+    logger.info("* Checking tensors for conversion compatibility")
+    for tensor in reader.tensors:
+        if tensor.tensor_type not in (
+            gguf.GGMLQuantizationType.F32,
+            gguf.GGMLQuantizationType.F16,
+            gguf.GGMLQuantizationType.Q8_0,
+        ):
+            raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
+    logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
+    if args.dry_run:
+        return
+    logger.warning("*** Warning *** Warning *** Warning **")
+    logger.warning("* This conversion process may damage the file. Ensure you have a backup.")
+    if order != host_endian:
+        logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.")
+    logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted")
+    logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
+    response = input("YES, I am sure> ")
+    if response != "YES":
+        logger.warning("You didn't enter YES. Okay then, see ya!")
+        sys.exit(0)
+    logger.info(f"* Converting fields ({len(reader.fields)})")
+    for idx, field in enumerate(reader.fields.values()):
+        logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
+        for part in field.parts:
+            part.byteswap(inplace=True)
+    logger.info(f"* Converting tensors ({len(reader.tensors)})")
+
+    for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")):
+        log_message = (
+            f"Converting tensor {repr(tensor.name)}, "
+            f"type={tensor.tensor_type.name}, "
+            f"elements={tensor.n_elements} "
+        )
+
+        # Byte-swap each part of the tensor's field
+        for part in tensor.field.parts:
+            part.byteswap(inplace=True)
+
+        # Byte-swap tensor data if necessary
+        if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
+            # Handle Q8_0 tensor blocks (block_q8_0)
+            # Specific handling of block_q8_0 is required.
+            # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
+
+            block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
+
+            n_blocks = len(tensor.data) // block_size
+            for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
+                block_offs = block_num * block_size
+
+                # Byte-Swap f16 sized delta field
+                delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
+                delta.byteswap(inplace=True)
+
+                # Byte-Swap Q8 weights
+                if block_num % 100000 == 0:
+                    inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
+
+        else:
+            # Handle other tensor types
+            tensor.data.byteswap(inplace=True)
+
+        pbar.set_description(log_message)
+
+    logger.info("* Completion")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Convert GGUF file byte order")
+    parser.add_argument(
+        "model", type=str,
+        help="GGUF format model filename",
+    )
+    parser.add_argument(
+        "order", type=str, choices=['big', 'little', 'native'],
+        help="Requested byte order",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="Don't actually change anything",
+    )
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+
+    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    logger.info(f'* Loading: {args.model}')
+    reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
+    convert_byteorder(reader, args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gguf-py/gguf/scripts/gguf_dump.py b/gguf-py/gguf/scripts/gguf_dump.py
new file mode 100755
index 000000000..f95b4fd48
--- /dev/null
+++ b/gguf-py/gguf/scripts/gguf_dump.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import argparse
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from gguf import GGUFReader, GGUFValueType, ReaderTensor  # noqa: E402
+
+logger = logging.getLogger("gguf-dump")
+
+
+def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
+    host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
+    if reader.byte_order == 'S':
+        file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE'
+    else:
+        file_endian = host_endian
+    return (host_endian, file_endian)
+
+
+# For more information about what field.parts and field.data represent,
+# please see the comments in the modify_gguf.py example.
+def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
+    host_endian, file_endian = get_file_host_endian(reader)
+    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')  # noqa: NP100
+    print(f'* Dumping {len(reader.fields)} key/value pair(s)')  # noqa: NP100
+    for n, field in enumerate(reader.fields.values(), 1):
+        if not field.types:
+            pretty_type = 'N/A'
+        elif field.types[0] == GGUFValueType.ARRAY:
+            nest_count = len(field.types) - 1
+            pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
+        else:
+            pretty_type = str(field.types[-1].name)
+
+        log_message = f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}'
+        if len(field.types) == 1:
+            curr_type = field.types[0]
+            if curr_type == GGUFValueType.STRING:
+                log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60]))
+            elif field.types[0] in reader.gguf_scalar_to_np:
+                log_message += ' = {0}'.format(field.parts[-1][0])
+        print(log_message)  # noqa: NP100
+    if args.no_tensors:
+        return
+    print(f'* Dumping {len(reader.tensors)} tensor(s)')  # noqa: NP100
+    for n, tensor in enumerate(reader.tensors, 1):
+        prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
+        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')  # noqa: NP100
+
+
+def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
+    import json
+    host_endian, file_endian = get_file_host_endian(reader)
+    metadata: dict[str, Any] = {}
+    tensors: dict[str, Any] = {}
+    result = {
+        "filename": args.model,
+        "endian": file_endian,
+        "metadata": metadata,
+        "tensors": tensors,
+    }
+    for idx, field in enumerate(reader.fields.values()):
+        curr: dict[str, Any] = {
+            "index": idx,
+            "type": field.types[0].name if field.types else 'UNKNOWN',
+            "offset": field.offset,
+        }
+        metadata[field.name] = curr
+        if field.types[:1] == [GGUFValueType.ARRAY]:
+            curr["array_types"] = [t.name for t in field.types][1:]
+            if not args.json_array:
+                continue
+            itype = field.types[-1]
+            if itype == GGUFValueType.STRING:
+                curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
+            else:
+                curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()]
+        elif field.types[0] == GGUFValueType.STRING:
+            curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
+        else:
+            curr["value"] = field.parts[-1].tolist()[0]
+    if not args.no_tensors:
+        for idx, tensor in enumerate(reader.tensors):
+            tensors[tensor.name] = {
+                "index": idx,
+                "shape": tensor.shape.tolist(),
+                "type": tensor.tensor_type.name,
+                "offset": tensor.field.offset,
+            }
+    json.dump(result, sys.stdout)
+
+
+def markdown_table_with_alignment_support(header_map: list[dict[str, str]], data: list[dict[str, Any]]):
+    # JSON to Markdown table formatting: https://stackoverflow.com/a/72983854/2850957
+
+    # Alignment Utility Function
+    def strAlign(padding: int, alignMode: str | None, strVal: str):
+        if alignMode == 'center':
+            return strVal.center(padding)
+        elif alignMode == 'right':
+            return strVal.rjust(padding - 1) + ' '
+        elif alignMode == 'left':
+            return ' ' + strVal.ljust(padding - 1)
+        else: # default left
+            return ' ' + strVal.ljust(padding - 1)
+
+    def dashAlign(padding: int, alignMode: str | None):
+        if alignMode == 'center':
+            return ':' + '-' * (padding - 2) + ':'
+        elif alignMode == 'right':
+            return '-' * (padding - 1) + ':'
+        elif alignMode == 'left':
+            return ':' + '-' * (padding - 1)
+        else: # default left
+            return '-' * (padding)
+
+    # Calculate Padding For Each Column Based On Header and Data Length
+    rowsPadding = {}
+    for index, columnEntry in enumerate(header_map):
+        padCount = max([len(str(v)) for d in data for k, v in d.items() if k == columnEntry['key_name']], default=0) + 2
+        headerPadCount = len(columnEntry['header_name']) + 2
+        rowsPadding[index] = headerPadCount if padCount <= headerPadCount else padCount
+
+    # Render Markdown Header
+    rows = []
+    rows.append('|'.join(strAlign(rowsPadding[index], columnEntry.get('align'), str(columnEntry['header_name'])) for index, columnEntry in enumerate(header_map)))
+    rows.append('|'.join(dashAlign(rowsPadding[index], columnEntry.get('align')) for index, columnEntry in enumerate(header_map)))
+
+    # Render Tabular Data
+    for item in data:
+        rows.append('|'.join(strAlign(rowsPadding[index], columnEntry.get('align'), str(item[columnEntry['key_name']])) for index, columnEntry in enumerate(header_map)))
+
+    # Convert Tabular String Rows Into String
+    tableString = ""
+    for row in rows:
+        tableString += f'|{row}|\n'
+
+    return tableString
+
+
+def element_count_rounded_notation(count: int) -> str:
+    if count > 1e15 :
+        # Quadrillion
+        scaled_amount = count * 1e-15
+        scale_suffix = "Q"
+    elif count > 1e12 :
+        # Trillions
+        scaled_amount = count * 1e-12
+        scale_suffix = "T"
+    elif count > 1e9 :
+        # Billions
+        scaled_amount = count * 1e-9
+        scale_suffix = "B"
+    elif count > 1e6 :
+        # Millions
+        scaled_amount = count * 1e-6
+        scale_suffix = "M"
+    elif count > 1e3 :
+        # Thousands
+        scaled_amount = count * 1e-3
+        scale_suffix = "K"
+    else:
+        # Under Thousands
+        scaled_amount = count
+        scale_suffix = ""
+    return f"{'~' if count > 1e3 else ''}{round(scaled_amount)}{scale_suffix}"
+
+
+def translate_tensor_name(name):
+    words = name.split(".")
+
+    # Source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#standardized-tensor-names
+    abbreviation_dictionary = {
+        'token_embd': 'Token embedding',
+        'pos_embd': 'Position embedding',
+        'output_norm': 'Output normalization',
+        'output': 'Output',
+        'attn_norm': 'Attention normalization',
+        'attn_norm_2': 'Attention normalization',
+        'attn_qkv': 'Attention query-key-value',
+        'attn_q': 'Attention query',
+        'attn_k': 'Attention key',
+        'attn_v': 'Attention value',
+        'attn_output': 'Attention output',
+        'ffn_norm': 'Feed-forward network normalization',
+        'ffn_up': 'Feed-forward network "up"',
+        'ffn_gate': 'Feed-forward network "gate"',
+        'ffn_down': 'Feed-forward network "down"',
+        'ffn_gate_inp': 'Expert-routing layer for the Feed-forward network in Mixture of Expert models',
+        'ffn_gate_exp': 'Feed-forward network "gate" layer per expert in Mixture of Expert models',
+        'ffn_down_exp': 'Feed-forward network "down" layer per expert in Mixture of Expert models',
+        'ffn_up_exp': 'Feed-forward network "up" layer per expert in Mixture of Expert models',
+        'ssm_in': 'State space model input projections',
+        'ssm_conv1d': 'State space model rolling/shift',
+        'ssm_x': 'State space model selective parametrization',
+        'ssm_a': 'State space model state compression',
+        'ssm_d': 'State space model skip connection',
+        'ssm_dt': 'State space model time step',
+        'ssm_out': 'State space model output projection',
+        'blk': 'Block',
+        'enc': 'Encoder',
+        'dec': 'Decoder',
+    }
+
+    expanded_words = []
+    for word in words:
+        word_norm = word.strip().lower()
+        if word_norm in abbreviation_dictionary:
+            expanded_words.append(abbreviation_dictionary[word_norm].title())
+        else:
+            expanded_words.append(word.title())
+
+    return ' '.join(expanded_words)
+
+
+def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
+    host_endian, file_endian = get_file_host_endian(reader)
+    markdown_content = ""
+    markdown_content += f'# {args.model} - GGUF Internal File Dump\n\n'
+    markdown_content += f'- Endian: {file_endian} endian\n'
+    markdown_content += '\n'
+    markdown_content += '## Key Value Metadata Store\n\n'
+    markdown_content += f'There are {len(reader.fields)} key-value pairs in this file\n'
+    markdown_content += '\n'
+
+    kv_dump_table: list[dict[str, str | int]] = []
+    for n, field in enumerate(reader.fields.values(), 1):
+        if not field.types:
+            pretty_type = 'N/A'
+        elif field.types[0] == GGUFValueType.ARRAY:
+            nest_count = len(field.types) - 1
+            pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
+        else:
+            pretty_type = str(field.types[-1].name)
+
+        def escape_markdown_inline_code(value_string):
+            # Find the longest contiguous sequence of backticks in the string then
+            # wrap string with appropriate number of backticks required to escape it
+            max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
+            inline_code_marker = '`' * (max_backticks + 1)
+
+            # If the string starts or ends with a backtick, add a space at the beginning and end
+            if value_string.startswith('`') or value_string.endswith('`'):
+                value_string = f" {value_string} "
+
+            return f"{inline_code_marker}{value_string}{inline_code_marker}"
+
+        total_elements = len(field.data)
+        value = ""
+        if len(field.types) == 1:
+            curr_type = field.types[0]
+            if curr_type == GGUFValueType.STRING:
+                truncate_length = 60
+                value_string = str(bytes(field.parts[-1]), encoding='utf-8')
+                if len(value_string) > truncate_length:
+                    head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                    tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                    value = "{head}...{tail}".format(head=head, tail=tail)
+                else:
+                    value = escape_markdown_inline_code(value_string)
+            elif curr_type in reader.gguf_scalar_to_np:
+                value = str(field.parts[-1][0])
+        else:
+            if field.types[0] == GGUFValueType.ARRAY:
+                curr_type = field.types[1]
+                array_elements = []
+
+                if curr_type == GGUFValueType.STRING:
+                    render_element = min(5, total_elements)
+                    for element_pos in range(render_element):
+                        truncate_length = 30
+                        value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
+                        if len(value_string) > truncate_length:
+                            head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                            tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                            value = "{head}...{tail}".format(head=head, tail=tail)
+                        else:
+                            value = escape_markdown_inline_code(value_string)
+                        array_elements.append(value)
+
+                elif curr_type in reader.gguf_scalar_to_np:
+                    render_element = min(7, total_elements)
+                    for element_pos in range(render_element):
+                        array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
+
+                value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'
+
+        kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})
+
+    kv_dump_table_header_map = [
+        {'key_name':'n',                'header_name':'POS',      'align':'right'},
+        {'key_name':'pretty_type',      'header_name':'TYPE',     'align':'left'},
+        {'key_name':'total_elements',   'header_name':'Count',    'align':'right'},
+        {'key_name':'field_name',       'header_name':'Key',      'align':'left'},
+        {'key_name':'value',            'header_name':'Value',    'align':'left'},
+    ]
+
+    markdown_content += markdown_table_with_alignment_support(kv_dump_table_header_map, kv_dump_table)
+
+    markdown_content += "\n"
+
+    if not args.no_tensors:
+        # Group tensors by their prefix and maintain order
+        tensor_prefix_order: list[str] = []
+        tensor_name_to_key: dict[str, int] = {}
+        tensor_groups: dict[str, list[ReaderTensor]] = {}
+        total_elements = sum(tensor.n_elements for tensor in reader.tensors)
+
+        # Parsing Tensors Record
+        for key, tensor in enumerate(reader.tensors):
+            tensor_components = tensor.name.split('.')
+
+            # Classify Tensor Group
+            tensor_group_name = "base"
+            if tensor_components[0] == 'blk':
+                tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
+            elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
+                tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
+            elif tensor_components[0] in ['enc', 'dec']:
+                tensor_group_name = f"{tensor_components[0]}"
+
+            # Check if new Tensor Group
+            if tensor_group_name not in tensor_groups:
+                tensor_groups[tensor_group_name] = []
+                tensor_prefix_order.append(tensor_group_name)
+
+            # Record Tensor and Tensor Position
+            tensor_groups[tensor_group_name].append(tensor)
+            tensor_name_to_key[tensor.name] = key
+
+        # Tensors Mapping Dump
+        markdown_content += f'## Tensors Overview {element_count_rounded_notation(total_elements)} Elements\n\n'
+        markdown_content += f'Total number of elements in all tensors: {total_elements} Elements\n'
+        markdown_content += '\n'
+
+        for group in tensor_prefix_order:
+            tensors = tensor_groups[group]
+            group_elements = sum(tensor.n_elements for tensor in tensors)
+            markdown_content += f"- [{translate_tensor_name(group)} Tensor Group - {element_count_rounded_notation(group_elements)} Elements](#{group.replace('.', '_')})\n"
+
+        markdown_content += "\n"
+
+        markdown_content += "### Tensor Data Offset\n"
+        markdown_content += '\n'
+        markdown_content += 'This table contains the offset and data segment relative to start of file\n'
+        markdown_content += '\n'
+
+        tensor_mapping_table: list[dict[str, str | int]] = []
+        for key, tensor in enumerate(reader.tensors):
+            data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
+            data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
+            tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
+
+        tensors_mapping_table_header_map = [
+            {'key_name':'t_id',         'header_name':'T_ID',               'align':'right'},
+            {'key_name':'layer_name',   'header_name':'Tensor Layer Name',  'align':'left'},
+            {'key_name':'data_offset',  'header_name':'Data Offset (B)',    'align':'right'},
+            {'key_name':'data_size',    'header_name':'Data Size (B)',      'align':'right'},
+        ]
+
+        markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
+        markdown_content += "\n"
+
+        for group in tensor_prefix_order:
+            tensors = tensor_groups[group]
+            group_elements = sum(tensor.n_elements for tensor in tensors)
+            group_percentage = group_elements / total_elements * 100
+            markdown_content += f"### <a name=\"{group.replace('.', '_')}\">{translate_tensor_name(group)} Tensor Group : {element_count_rounded_notation(group_elements)} Elements</a>\n\n"
+
+            # Precalculate column sizing for visual consistency
+            prettify_element_est_count_size: int = 1
+            prettify_element_count_size: int = 1
+            prettify_dimension_max_widths: dict[int, int] = {}
+            for tensor in tensors:
+                prettify_element_est_count_size = max(prettify_element_est_count_size, len(str(element_count_rounded_notation(tensor.n_elements))))
+                prettify_element_count_size = max(prettify_element_count_size, len(str(tensor.n_elements)))
+                for i, dimension_size in enumerate(list(tensor.shape) + [1] * (4 - len(tensor.shape))):
+                    prettify_dimension_max_widths[i] = max(prettify_dimension_max_widths.get(i,1), len(str(dimension_size)))
+
+            # Generate Tensor Layer Table Content
+            tensor_dump_table: list[dict[str, str | int]] = []
+            for tensor in tensors:
+                human_friendly_name = translate_tensor_name(tensor.name.replace(".weight", ".(W)").replace(".bias", ".(B)"))
+                pretty_dimension = ' x '.join(f'{str(d):>{prettify_dimension_max_widths[i]}}' for i, d in enumerate(list(tensor.shape) + [1] * (4 - len(tensor.shape))))
+                element_count_est = f"({element_count_rounded_notation(tensor.n_elements):>{prettify_element_est_count_size}})"
+                element_count_string = f"{element_count_est} {tensor.n_elements:>{prettify_element_count_size}}"
+                type_name_string = f"{tensor.tensor_type.name}"
+                tensor_dump_table.append({"t_id":tensor_name_to_key[tensor.name], "layer_name":tensor.name, "human_layer_name":human_friendly_name, "element_count":element_count_string, "pretty_dimension":pretty_dimension, "tensor_type":type_name_string})
+
+            tensor_dump_table_header_map = [
+                {'key_name':'t_id',             'header_name':'T_ID',                             'align':'right'},
+                {'key_name':'layer_name',       'header_name':'Tensor Layer Name',                'align':'left'},
+                {'key_name':'human_layer_name', 'header_name':'Human Friendly Tensor Layer Name', 'align':'left'},
+                {'key_name':'element_count',    'header_name':'Elements',                         'align':'left'},
+                {'key_name':'pretty_dimension', 'header_name':'Shape',                            'align':'left'},
+                {'key_name':'tensor_type',      'header_name':'Type',                             'align':'left'},
+            ]
+
+            markdown_content += markdown_table_with_alignment_support(tensor_dump_table_header_map, tensor_dump_table)
+
+            markdown_content += "\n"
+            markdown_content += f"- Total elements in {group}: ({element_count_rounded_notation(group_elements):>4}) {group_elements}\n"
+            markdown_content += f"- Percentage of total elements: {group_percentage:.2f}%\n"
+            markdown_content += "\n\n"
+
+    print(markdown_content)  # noqa: NP100
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
+    parser.add_argument("model",           type=str,            help="GGUF format model filename")
+    parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
+    parser.add_argument("--json",       action="store_true", help="Produce JSON output")
+    parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--data-offset",    action="store_true", help="Start of data offset")
+    parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
+    parser.add_argument("--markdown",   action="store_true", help="Produce markdown output")
+    parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")
+
+    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
+        logger.info(f'* Loading: {args.model}')
+
+    reader = GGUFReader(args.model, 'r')
+
+    if args.json:
+        dump_metadata_json(reader, args)
+    elif args.markdown:
+        dump_markdown_metadata(reader, args)
+    elif args.data_offset:
+        print(reader.data_offset)  # noqa: NP100
+    elif args.data_alignment:
+        print(reader.alignment)  # noqa: NP100
+    else:
+        dump_metadata(reader, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gguf-py/gguf/scripts/gguf_hash.py b/gguf-py/gguf/scripts/gguf_hash.py
new file mode 100755
index 000000000..3ef989921
--- /dev/null
+++ b/gguf-py/gguf/scripts/gguf_hash.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import uuid
+import hashlib
+
+import logging
+import argparse
+import os
+import sys
+from pathlib import Path
+
+from tqdm import tqdm
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from gguf import GGUFReader  # noqa: E402
+
+
+logger = logging.getLogger("gguf-hash")
+
+# UUID_NAMESPACE_LLAMA_CPP = uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
+UUID_NAMESPACE_LLAMA_CPP = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')
+
+
+# For more information about what field.parts and field.data represent,
+# please see the comments in the modify_gguf.py example.
+def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar: bool, no_layer: bool) -> None:
+    sha1 = hashlib.sha1()
+    sha256 = hashlib.sha256()
+    uuidv5_sha1 = hashlib.sha1()
+    uuidv5_sha1.update(UUID_NAMESPACE_LLAMA_CPP.bytes)
+
+    # Total Weight Calculation For Progress Bar
+    total_weights = 0
+    for n, tensor in enumerate(reader.tensors, 1):
+
+        # We don't need these
+        if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Calculate Tensor Volume
+        sum_weights_in_tensor = 1
+        for dim in tensor.shape:
+            sum_weights_in_tensor *= dim
+        total_weights += sum_weights_in_tensor
+
+    # Hash Progress Bar
+    bar = tqdm(desc="Hashing", total=total_weights, unit="weights", unit_scale=True, disable=disable_progress_bar)
+
+    # Hashing Process
+    for tensor in reader.tensors:
+
+        # We don't need these
+        if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+            continue
+
+        # Progressbar
+        sum_weights_in_tensor = 1
+        for dim in tensor.shape:
+            sum_weights_in_tensor *= dim
+        bar.update(sum_weights_in_tensor)
+
+        if not no_layer:
+
+            sha1_layer = hashlib.sha1()
+            sha1_layer.update(tensor.data.data)
+            print("sha1      {0}  {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
+
+            sha256_layer = hashlib.sha256()
+            sha256_layer.update(tensor.data.data)
+            print("sha256    {0}  {1}:{2}".format(sha256_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
+
+        sha1.update(tensor.data.data)
+        sha256.update(tensor.data.data)
+        uuidv5_sha1.update(tensor.data.data)
+
+    # Flush Hash Progress Bar
+    bar.close()
+
+    # Display Hash Output
+    print("sha1      {0}  {1}".format(sha1.hexdigest(), filename)) # noqa: NP100
+    print("sha256    {0}  {1}".format(sha256.hexdigest(), filename)) # noqa: NP100
+    print("uuid      {0}  {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
+    parser.add_argument("model",         type=str,            help="GGUF format model filename")
+    parser.add_argument("--no-layer",    action="store_true", help="exclude per layer hash")
+    parser.add_argument("--verbose",     action="store_true", help="increase output verbosity")
+    parser.add_argument("--progressbar", action="store_true", help="enable progressbar")
+    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+    reader = GGUFReader(args.model, 'r')
+    gguf_hash(reader, args.model, not args.progressbar, args.no_layer)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gguf-py/gguf/scripts/gguf_new_metadata.py b/gguf-py/gguf/scripts/gguf_new_metadata.py
new file mode 100755
index 000000000..a8cfc9d58
--- /dev/null
+++ b/gguf-py/gguf/scripts/gguf_new_metadata.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import argparse
+import os
+import sys
+import json
+from pathlib import Path
+
+import numpy as np
+from tqdm import tqdm
+from typing import Any, Sequence, NamedTuple
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+import gguf
+
+logger = logging.getLogger("gguf-new-metadata")
+
+
+class MetadataDetails(NamedTuple):
+    type: gguf.GGUFValueType
+    value: Any
+    description: str = ''
+
+
+def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
+    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+        # Host is little endian
+        host_endian = gguf.GGUFEndian.LITTLE
+        swapped_endian = gguf.GGUFEndian.BIG
+    else:
+        # Sorry PDP or other weird systems that don't use BE or LE.
+        host_endian = gguf.GGUFEndian.BIG
+        swapped_endian = gguf.GGUFEndian.LITTLE
+
+    if reader.byte_order == "S":
+        return swapped_endian
+    else:
+        return host_endian
+
+
+def decode_field(field: gguf.ReaderField | None) -> Any:
+    if field and field.types:
+        main_type = field.types[0]
+
+        if main_type == gguf.GGUFValueType.ARRAY:
+            sub_type = field.types[-1]
+
+            if sub_type == gguf.GGUFValueType.STRING:
+                return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
+            else:
+                return [pv for idx in field.data for pv in field.parts[idx].tolist()]
+        if main_type == gguf.GGUFValueType.STRING:
+            return str(bytes(field.parts[-1]), encoding='utf-8')
+        else:
+            return field.parts[-1][0]
+
+    return None
+
+
+def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
+    field = reader.get_field(key)
+
+    return decode_field(field)
+
+
+def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
+    token_ids = [index for index, value in enumerate(token_list) if value == token]
+
+    if len(token_ids) == 0:
+        raise LookupError(f'Unable to find "{token}" in token list!')
+
+    return token_ids
+
+
+def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
+    for field in reader.fields.values():
+        # Suppress virtual fields and fields written by GGUFWriter
+        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
+            logger.debug(f'Suppressing {field.name}')
+            continue
+
+        # Skip old chat templates if we have new ones
+        if field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
+            logger.debug(f'Skipping {field.name}')
+            continue
+
+        if field.name in remove_metadata:
+            logger.debug(f'Removing {field.name}')
+            continue
+
+        old_val = MetadataDetails(field.types[0], decode_field(field))
+        val = new_metadata.get(field.name, old_val)
+
+        if field.name in new_metadata:
+            logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
+            del new_metadata[field.name]
+        elif val.value is not None:
+            logger.debug(f'Copying {field.name}')
+
+        if val.value is not None:
+            writer.add_key_value(field.name, val.value, val.type)
+
+    if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
+        logger.debug('Adding chat template(s)')
+        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
+        del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
+
+    for key, val in new_metadata.items():
+        logger.debug(f'Adding {key}: "{val.value}" {val.description}')
+        writer.add_key_value(key, val.value, val.type)
+
+    total_bytes = 0
+
+    for tensor in reader.tensors:
+        total_bytes += tensor.n_bytes
+        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
+
+    bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
+
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    writer.write_ti_data_to_file()
+
+    for tensor in reader.tensors:
+        writer.write_tensor_data(tensor.data)
+        bar.update(tensor.n_bytes)
+
+    writer.close()
+
+
+def main() -> None:
+    tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
+    token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
+
+    parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
+    parser.add_argument("input",                                       type=Path, help="GGUF format model input filename")
+    parser.add_argument("output",                                      type=Path, help="GGUF format model output filename")
+    parser.add_argument("--general-name",                              type=str,  help="The models general.name", metavar='"name"')
+    parser.add_argument("--general-description",                       type=str,  help="The models general.description", metavar='"Description ..."')
+    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
+    parser.add_argument("--chat-template-config",                      type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
+    parser.add_argument("--pre-tokenizer",                             type=str,  help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
+    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model", metavar='general.url')
+    parser.add_argument("--special-token",        action="append",     type=str,  help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
+    parser.add_argument("--special-token-by-id",  action="append",     type=str,  help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
+    parser.add_argument("--force",                action="store_true",            help="Bypass warnings without confirmation")
+    parser.add_argument("--verbose",              action="store_true",            help="Increase output verbosity")
+    args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    new_metadata = {}
+    remove_metadata = args.remove_metadata or []
+
+    if args.general_name:
+        new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
+
+    if args.general_description:
+        new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
+
+    if args.chat_template:
+        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
+
+    if args.chat_template_config:
+        with open(args.chat_template_config, 'r') as fp:
+            config = json.load(fp)
+            template = config.get('chat_template')
+            if template:
+                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
+
+    if args.pre_tokenizer:
+        new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)
+
+    if remove_metadata:
+        logger.warning('*** Warning *** Warning *** Warning **')
+        logger.warning('* Most metadata is required for a fully functional GGUF file,')
+        logger.warning('* removing crucial metadata may result in a corrupt output file!')
+
+        if not args.force:
+            logger.warning('* Enter exactly YES if you are positive you want to proceed:')
+            response = input('YES, I am sure> ')
+            if response != 'YES':
+                logger.info("You didn't enter YES. Okay then, see ya!")
+                sys.exit(0)
+
+    logger.info(f'* Loading: {args.input}')
+    reader = gguf.GGUFReader(args.input, 'r')
+
+    arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
+    endianess = get_byteorder(reader)
+
+    token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
+
+    for name, token in args.special_token or []:
+        if name not in token_names:
+            logger.warning(f'Unknown special token "{name}", ignoring...')
+        else:
+            ids = find_token(token_list, token)
+            new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
+
+            if len(ids) > 1:
+                logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
+                logger.warning(', '.join(str(i) for i in ids))
+
+    for name, id_string in args.special_token_by_id or []:
+        if name not in token_names:
+            logger.warning(f'Unknown special token "{name}", ignoring...')
+        elif not id_string.isdecimal():
+            raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
+        else:
+            id_int = int(id_string)
+
+            if id_int >= 0 and id_int < len(token_list):
+                new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
+            else:
+                raise LookupError(f'Token ID {id_int} is not within token list!')
+
+    if os.path.isfile(args.output) and not args.force:
+        logger.warning('*** Warning *** Warning *** Warning **')
+        logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
+        logger.warning('* Enter exactly YES if you are positive you want to proceed:')
+        response = input('YES, I am sure> ')
+        if response != 'YES':
+            logger.info("You didn't enter YES. Okay then, see ya!")
+            sys.exit(0)
+
+    logger.info(f'* Writing: {args.output}')
+    writer = gguf.GGUFWriter(args.output, arch=arch, endianess=endianess)
+
+    alignment = get_field_data(reader, gguf.Keys.General.ALIGNMENT)
+    if alignment is not None:
+        logger.debug(f'Setting custom alignment: {alignment}')
+        writer.data_alignment = alignment
+
+    copy_with_new_metadata(reader, writer, new_metadata, remove_metadata)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gguf-py/scripts/gguf-set-metadata.py b/gguf-py/gguf/scripts/gguf_set_metadata.py
similarity index 71%
rename from gguf-py/scripts/gguf-set-metadata.py
rename to gguf-py/gguf/scripts/gguf_set_metadata.py
index 3ebdfa898..f5809c35c 100755
--- a/gguf-py/scripts/gguf-set-metadata.py
+++ b/gguf-py/gguf/scripts/gguf_set_metadata.py
@@ -1,15 +1,18 @@
 #!/usr/bin/env python3
+import logging
 import argparse
 import os
 import sys
 from pathlib import Path
 
 # Necessary to load the local gguf package
-if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
-    sys.path.insert(0, str(Path(__file__).parent.parent))
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from gguf import GGUFReader  # noqa: E402
 
+logger = logging.getLogger("gguf-set-metadata")
+
 
 def minimal_example(filename: str) -> None:
     reader = GGUFReader(filename, 'r+')
@@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None:
 def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
     field = reader.get_field(args.key)
     if field is None:
-        print(f'! Field {repr(args.key)} not found', file = sys.stderr)
+        logger.error(f'! Field {repr(args.key)} not found')
         sys.exit(1)
     # Note that field.types is a list of types. This is because the GGUF
     # format supports arrays. For example, an array of UINT32 would
     # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
     handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
     if handler is None:
-        print(
-            f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}',
-            file = sys.stderr,
-        )
+        logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}')
         sys.exit(1)
     current_value = field.parts[field.data[0]][0]
     new_value = handler(args.value)
-    print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
+    logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
     if current_value == new_value:
-        print(f'- Key {repr(args.key)} already set to requested value {current_value}')
+        logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}')
         sys.exit(0)
     if args.dry_run:
         sys.exit(0)
     if not args.force:
-        print('*** Warning *** Warning *** Warning **')
-        print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
-        print('* Enter exactly YES if you are positive you want to proceed:')
+        logger.warning('*** Warning *** Warning *** Warning **')
+        logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
+        logger.warning('* Enter exactly YES if you are positive you want to proceed:')
         response = input('YES, I am sure> ')
         if response != 'YES':
-            print("You didn't enter YES. Okay then, see ya!")
+            logger.info("You didn't enter YES. Okay then, see ya!")
             sys.exit(0)
     field.parts[field.data[0]][0] = new_value
-    print('* Field changed. Successful completion.')
+    logger.info('* Field changed. Successful completion.')
 
 
 def main() -> None:
@@ -80,8 +80,13 @@ def main() -> None:
     parser.add_argument("value",     type=str,            help="Metadata value to set")
     parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything")
     parser.add_argument("--force",   action="store_true", help="Change the field without confirmation")
+    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
+
     args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-    print(f'* Loading: {args.model}')
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    logger.info(f'* Loading: {args.model}')
     reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+')
     set_metadata(reader, args)
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index db2ec9704..617791e24 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -10,16 +10,24 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",                         # gptneox
-            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen
+            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf
+            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
             "wte",                                       # gpt2
             "transformer.embd.wte",                      # phi2
             "model.tok_embeddings",                      # internlm2
+            "model.embedding",                           # mamba-qbert
+            "backbone.embedding",                        # mamba
+            "backbone.embeddings",                       # mamba-hf
+            "transformer.in_out_embed",                  # Grok
+            "embedding.word_embeddings",                 # chatglm
+            "transformer.token_embeddings",              # openelm
+            "shared",                                    # t5
+            "rwkv.embeddings",                           # rwkv
         ),
 
         # Token type embeddings
@@ -32,6 +40,9 @@ class TensorNameMap:
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",       # bert
             "emb_ln",                     # nomic-bert
+            "transformer.norm",           # openelm
+            "rwkv.blocks.0.pre_ln",       # rwkv
+            "backbone.norm",              # wavtokenizer
         ),
 
         # Position embeddings
@@ -44,28 +55,47 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
+            "output_layer",              # chatglm
+            "head",                      # rwkv
+            "head.out",                  # wavtokenizer
         ),
 
         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",               # gptneox
-            "transformer.ln_f",                        # gpt2 gpt-j falcon
-            "model.norm",                              # llama-hf baichuan internlm2
+            "transformer.ln_f",                        # gpt2 gpt-j falcon jais exaone
+            "model.norm",                              # llama-hf baichuan internlm2 olmoe olmo2 phimoe
             "norm",                                    # llama-pth
-            "transformer.norm_f",                      # mpt
+            "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
             "model.final_layernorm",                   # persimmon
             "lm_head.ln",                              # phi2
+            "model.norm_f",                            # mamba-qbert
+            "backbone.norm_f",                         # mamba
+            "transformer.rms_norm",                    # Grok
+            "encoder.final_layernorm",                 # chatglm
+            "transformer.norm",                        # openelm
+            "model.norm",                              # nemotron
+            "rwkv.ln_out",                             # rwkv
+            "backbone.final_layer_norm",               # wavtokenizer
         ),
 
         # Rope frequencies
         MODEL_TENSOR.ROPE_FREQS: (
             "rope.freqs",  # llama-pth
+            "rotary_pos_emb.inv_freq",  # chatglm
+        ),
+
+        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
+        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
+
+        MODEL_TENSOR.CONV1D: (
+            "backbone.embed", # roberta
         ),
     }
 
@@ -73,12 +103,12 @@ class TensorNameMap:
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
-            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
+            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen jais exaone
             "transformer.blocks.{bid}.norm_1",                      # mpt
             "transformer.h.{bid}.input_layernorm",                  # falcon7b
             "h.{bid}.input_layernorm",                              # bloom
             "transformer.h.{bid}.ln_mlp",                           # falcon40b
-            "model.layers.{bid}.input_layernorm",                   # llama-hf
+            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe phimoe
             "layers.{bid}.attention_norm",                          # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",                               # yi
@@ -86,18 +116,28 @@ class TensorNameMap:
             "transformer.h.{bid}.ln",                               # phi2
             "model.layers.layers.{bid}.norm",                       # plamo
             "model.layers.{bid}.attention_norm",                    # internlm2
+            "model.layers.{bid}.norm",                              # mamba-qbert
+            "backbone.layers.{bid}.norm",                           # mamba
+            "transformer.decoder_layer.{bid}.rms_norm",             # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",       # dbrx
+            "encoder.layers.{bid}.input_layernorm",                 # chatglm
+            "transformer.layers.{bid}.attn_norm",                   # openelm
+            "rwkv.blocks.{bid}.ln1",                                # rwkv
         ),
 
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
-            "transformer.h.{bid}.ln_attn",  # falcon40b
+            "transformer.h.{bid}.ln_attn",                  # falcon40b
+            "encoder.layer.{bid}.layer_norm_1",             # jina-v2-code
+            "rwkv.blocks.{bid}.ln2",                        # rwkv
         ),
 
         # Attention query-key-value
         MODEL_TENSOR.ATTN_QKV: (
             "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
-            "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
+            "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen jais
             "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",                   # dbrx
             "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
             "h.{bid}.self_attention.query_key_value",                              # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
@@ -105,62 +145,87 @@ class TensorNameMap:
             "h.{bid}.attn.c_attn",                                                 # gpt2
             "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
             "encoder.layers.{bid}.attn.Wqkv",                                      # nomic-bert
+            "model.layers.{bid}.self_attn.qkv_proj",                               # phi3
+            "encoder.layers.{bid}.self_attention.query_key_value",                 # chatglm
+            "transformer.layers.{bid}.attn.qkv_proj",                              # openelm
         ),
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",         # llama-hf
-            "layers.{bid}.attention.wq",                   # llama-pth
-            "encoder.layer.{bid}.attention.self.query",    # bert
-            "transformer.h.{bid}.attn.q_proj",             # gpt-j
-            "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
-            "model.layers.{bid}.attention.wq"             # internlm2
+            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
+            "model.layers.{bid}.self_attn.q_proj_no_perm",               # llama-custom
+            "layers.{bid}.attention.wq",                                 # llama-pth
+            "encoder.layer.{bid}.attention.self.query",                  # bert
+            "transformer.h.{bid}.attn.q_proj",                           # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj",                # plamo
+            "model.layers.{bid}.attention.wq",                           # internlm2
+            "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
+            "transformer.h.{bid}.attn.attention.q_proj",                 # exaone
         ),
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",         # llama-hf
-            "layers.{bid}.attention.wk",                   # llama-pth
-            "encoder.layer.{bid}.attention.self.key",      # bert
-            "transformer.h.{bid}.attn.k_proj",             # gpt-j
-            "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
-            "model.layers.{bid}.attention.wk"             # internlm2
+            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo2 phimoe
+            "model.layers.{bid}.self_attn.k_proj_no_perm",             # llama-custom
+            "layers.{bid}.attention.wk",                               # llama-pth
+            "encoder.layer.{bid}.attention.self.key",                  # bert
+            "transformer.h.{bid}.attn.k_proj",                         # gpt-j
+            "transformer.h.{bid}.attn.k",                              # refact
+            "model.layers.layers.{bid}.self_attn.k_proj",              # plamo
+            "model.layers.{bid}.attention.wk",                         # internlm2
+            "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
+            "transformer.h.{bid}.attn.attention.k_proj",               # exaone
         ),
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",         # llama-hf
-            "layers.{bid}.attention.wv",                   # llama-pth
-            "encoder.layer.{bid}.attention.self.value",    # bert
-            "transformer.h.{bid}.attn.v_proj",             # gpt-j
-            "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
-            "model.layers.{bid}.attention.wv"             # internlm2
+            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2 phimoe
+            "layers.{bid}.attention.wv",                                 # llama-pth
+            "encoder.layer.{bid}.attention.self.value",                  # bert
+            "transformer.h.{bid}.attn.v_proj",                           # gpt-j
+            "transformer.h.{bid}.attn.v",                                # refact
+            "model.layers.layers.{bid}.self_attn.v_proj",                # plamo
+            "model.layers.{bid}.attention.wv",                           # internlm2
+            "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
+            "transformer.h.{bid}.attn.attention.v_proj",                 # exaone
         ),
 
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",                     # gptneox
-            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact qwen
-            "transformer.blocks.{bid}.attn.out_proj",                    # mpt
-            "transformer.h.{bid}.self_attention.dense",                  # falcon
-            "h.{bid}.self_attention.dense",                              # bloom
-            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf
-            "layers.{bid}.attention.wo",                                 # llama-pth
-            "encoder.layer.{bid}.attention.output.dense",                # bert
-            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
-            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
-            "model.layers.{bid}.self_attn.dense",                        # persimmon
-            "h.{bid}.attn.c_proj",                                       # gpt2
-            "transformer.h.{bid}.mixer.out_proj",                        # phi2
-            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
-            "model.layers.{bid}.attention.wo",                           # internlm2
-            "encoder.layers.{bid}.attn.out_proj",                        # nomic-bert
+            "gpt_neox.layers.{bid}.attention.dense",                        # gptneox
+            "transformer.h.{bid}.attn.c_proj",                              # gpt2 refact qwen jais
+            "transformer.blocks.{bid}.attn.out_proj",                       # mpt
+            "transformer.h.{bid}.self_attention.dense",                     # falcon
+            "h.{bid}.self_attention.dense",                                 # bloom
+            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe olmo2 phimoe
+            "model.layers.{bid}.self_attn.linear_attn",                     # deci
+            "layers.{bid}.attention.wo",                                    # llama-pth
+            "encoder.layer.{bid}.attention.output.dense",                   # bert
+            "transformer.h.{bid}.attn.out_proj",                            # gpt-j
+            "language_model.encoder.layers.{bid}.self_attention.dense",     # persimmon
+            "model.layers.{bid}.self_attn.dense",                           # persimmon
+            "h.{bid}.attn.c_proj",                                          # gpt2
+            "transformer.h.{bid}.mixer.out_proj",                           # phi2
+            "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
+            "model.layers.{bid}.attention.wo",                              # internlm2
+            "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
+            "encoder.layers.{bid}.self_attention.dense",                    # chatglm
+            "transformer.layers.{bid}.attn.out_proj",                       # openelm
+            "transformer.h.{bid}.attn.attention.out_proj",                  # exaone
         ),
 
         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
+        ),
+
+        MODEL_TENSOR.ATTN_POST_NORM: (
+            "model.layers.{bid}.post_attention_layernorm",     # gemma2 olmo2
         ),
 
         # Rotary embeddings
@@ -174,48 +239,87 @@ class TensorNameMap:
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm",                # gptneox
-            "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen
+            "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen jais exaone
             "h.{bid}.post_attention_layernorm",                              # bloom
             "transformer.blocks.{bid}.norm_2",                               # mpt
-            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf
+            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf nemotron olmoe phimoe
             "layers.{bid}.ffn_norm",                                         # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
             "model.layers.{bid}.ln2",                                        # yi
             "h.{bid}.ln_2",                                                  # gpt2
             "model.layers.{bid}.ffn_norm",                                   # internlm2
+            "transformer.decoder_layer.{bid}.rms_norm_2",                    # Grok
+            "encoder.layers.{bid}.post_attention_layernorm",                 # chatglm
+            "transformer.layers.{bid}.ffn_norm",                             # openelm
+        ),
+
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_PRE_NORM: (
+            "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
+        ),
+
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_POST_NORM: (
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+            "layers.{bid}.feed_forward.gate",                   # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",         # mixtral phimoe
+            "model.layers.{bid}.mlp.gate",                      # qwen2moe olmoe
+            "transformer.decoder_layer.{bid}.router",           # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
+        ),
+
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
+        ),
+
+        MODEL_TENSOR.FFN_EXP_PROBS_B: (
+            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
         ),
 
         # Feed-forward up
         MODEL_TENSOR.FFN_UP: (
             "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",                # gptneox
-            "transformer.h.{bid}.mlp.c_fc",                           # gpt2
+            "transformer.h.{bid}.mlp.c_fc",                           # gpt2 jais
             "transformer.blocks.{bid}.ffn.up_proj",                   # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
             "h.{bid}.mlp.dense_h_to_4h",                              # bloom
-            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact
+            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                           # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                 # bert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
+            "transformer.h.{bid}.mlp.linear_3",                       # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h",                   # persimmon
             "transformer.h.{bid}.mlp.w1",                             # qwen
             "h.{bid}.mlp.c_fc",                                       # gpt2
             "transformer.h.{bid}.mlp.fc1",                            # phi2
             "model.layers.{bid}.mlp.fc1",                             # phi2
+            "model.layers.{bid}.mlp.gate_up_proj",                    # phi3
             "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
             "model.layers.{bid}.feed_forward.w3",                     # internlm2
             "encoder.layers.{bid}.mlp.fc11",                          # nomic-bert
             "model.layers.{bid}.mlp.c_fc",                            # starcoder2
+            "encoder.layer.{bid}.mlp.gated_layers_v",                 # jina-bert-v2
+            "model.layers.{bid}.residual_mlp.w3",                     # arctic
+            "encoder.layers.{bid}.mlp.dense_h_to_4h",                 # chatglm
+            "transformer.h.{bid}.mlp.c_fc_1",                         # exaone
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+            "layers.{bid}.feed_forward.experts.w3",           # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_v",   # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",    # dbrx
+            "model.layers.{bid}.mlp.experts.up_proj",         # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_UP_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
         ),
 
         # AWQ-activation gate
@@ -225,27 +329,40 @@ class TensorNameMap:
 
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo2
             "layers.{bid}.feed_forward.w1",               # llama-pth
             "transformer.h.{bid}.mlp.w2",                 # qwen
+            "transformer.h.{bid}.mlp.c_fc2",              # jais
             "model.layers.layers.{bid}.mlp.gate_proj",    # plamo
             "model.layers.{bid}.feed_forward.w1",         # internlm2
             "encoder.layers.{bid}.mlp.fc12",              # nomic-bert
+            "encoder.layer.{bid}.mlp.gated_layers_w",     # jina-bert-v2
+            "transformer.h.{bid}.mlp.linear_1",           # refact
+            "model.layers.{bid}.residual_mlp.w1",         # arctic
+            "transformer.h.{bid}.mlp.c_fc_0",             # exaone
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
+            "layers.{bid}.feed_forward.experts.w1",           # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",     # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",    # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",       # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_GATE_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
         ),
 
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",                # gptneox
-            "transformer.h.{bid}.mlp.c_proj",                         # gpt2 refact qwen
+            "transformer.h.{bid}.mlp.c_proj",                         # gpt2 refact qwen jais
             "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
             "h.{bid}.mlp.dense_4h_to_h",                              # bloom
-            "model.layers.{bid}.mlp.down_proj",                       # llama-hf
+            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
@@ -258,21 +375,44 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w2",                     # internlm2
             "encoder.layers.{bid}.mlp.fc2",                           # nomic-bert
             "model.layers.{bid}.mlp.c_proj",                          # starcoder2
+            "encoder.layer.{bid}.mlp.wo",                             # jina-bert-v2
+            "transformer.layers.{bid}.ffn.proj_2",                    # openelm
+            "model.layers.{bid}.residual_mlp.w2",                     # arctic
+            "encoder.layer.{bid}.mlp.down_layer",                     # jina-bert-v2
+            "encoder.layers.{bid}.mlp.dense_4h_to_h",                 # chatglm
+            "model.layers.h.{bid}.mlp.c_proj",                        # exaone
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
+            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
+            "model.layers.{bid}.block_sparse_moe.experts.w2",    # phimoe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_DOWN_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo2
+            "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
+            "transformer.layers.{bid}.attn.q_norm",                           # openelm
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo2
+            "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
+            "transformer.layers.{bid}.attn.k_norm",                           # openelm
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
@@ -280,9 +420,385 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.LAYER_OUT_NORM: (
-            "encoder.layer.{bid}.output.LayerNorm",  # bert
-            "encoder.layers.{bid}.norm2",            # nomic-bert
-        )
+            "encoder.layer.{bid}.output.LayerNorm",         # bert
+            "encoder.layers.{bid}.norm2",                   # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
+            "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
+            "encoder.layer.{bid}.layer_norm_2"              # jina-v2-code
+        ),
+
+        MODEL_TENSOR.SSM_IN: (
+            "model.layers.{bid}.in_proj",
+            "backbone.layers.{bid}.mixer.in_proj",
+        ),
+
+        MODEL_TENSOR.SSM_CONV1D: (
+            "model.layers.{bid}.conv1d",
+            "backbone.layers.{bid}.mixer.conv1d",
+        ),
+
+        MODEL_TENSOR.SSM_X: (
+            "model.layers.{bid}.x_proj",
+            "backbone.layers.{bid}.mixer.x_proj",
+        ),
+
+        MODEL_TENSOR.SSM_DT: (
+            "model.layers.{bid}.dt_proj",
+            "backbone.layers.{bid}.mixer.dt_proj",
+        ),
+
+        MODEL_TENSOR.SSM_A: (
+            "model.layers.{bid}.A_log",
+            "backbone.layers.{bid}.mixer.A_log",
+        ),
+
+        MODEL_TENSOR.SSM_D: (
+            "model.layers.{bid}.D",
+            "backbone.layers.{bid}.mixer.D",
+        ),
+
+        MODEL_TENSOR.SSM_OUT: (
+            "model.layers.{bid}.out_proj",
+            "backbone.layers.{bid}.mixer.out_proj",
+        ),
+
+        MODEL_TENSOR.TIME_MIX_W1: (
+            "rwkv.blocks.{bid}.attention.time_maa_w1",  # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_W2: (
+            "rwkv.blocks.{bid}.attention.time_maa_w2",  # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_X: (
+            "rwkv.blocks.{bid}.attention.time_maa_x",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_x",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_K: (
+            "rwkv.blocks.{bid}.attention.time_maa_k",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_k",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_V: (
+            "rwkv.blocks.{bid}.attention.time_maa_v",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_v",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_R: (
+            "rwkv.blocks.{bid}.attention.time_maa_r",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_r",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_G: (
+            "rwkv.blocks.{bid}.attention.time_maa_g",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_g",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LERP_W: (
+            "rwkv.blocks.{bid}.attention.time_maa_w",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_maa_w",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_FIRST: (
+            "rwkv.blocks.{bid}.attention.time_faaaa",   # rwkv v6
+        ),
+
+        MODEL_TENSOR.TIME_MIX_DECAY: (
+            "rwkv.blocks.{bid}.attention.time_decay",   # rwkv v6
+            "model.layers.{bid}.self_attn.time_decay",  # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_DECAY_W1: (
+            "rwkv.blocks.{bid}.attention.time_decay_w1",  # rwkv v6
+            "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_DECAY_W2: (
+            "rwkv.blocks.{bid}.attention.time_decay_w2",  # rwkv v6
+            "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_KEY: (
+            "rwkv.blocks.{bid}.attention.key",     # rwkv
+            "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_VALUE: (
+            "rwkv.blocks.{bid}.attention.value",   # rwkv
+            "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
+            "rwkv.blocks.{bid}.attention.receptance", # rwkv
+            "model.layers.{bid}.self_attn.q_proj",    # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_GATE: (
+            "rwkv.blocks.{bid}.attention.gate",  # rwkv
+            "model.layers.{bid}.self_attn.gate", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.TIME_MIX_LN: (
+            "rwkv.blocks.{bid}.attention.ln_x", # rwkv
+        ),
+
+        MODEL_TENSOR.TIME_MIX_OUTPUT: (
+            "rwkv.blocks.{bid}.attention.output",  # rwkv
+            "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
+            "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
+            "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_KEY: (
+            "rwkv.blocks.{bid}.feed_forward.key", # rwkv
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
+            "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
+        ),
+
+        MODEL_TENSOR.CHANNEL_MIX_VALUE: (
+            "rwkv.blocks.{bid}.feed_forward.value", # rwkv
+        ),
+
+        MODEL_TENSOR.ATTN_Q_A: (
+            "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_Q_B: (
+            "model.layers.{bid}.self_attn.q_b_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_KV_A_MQA: (
+            "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_KV_B: (
+            "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_Q_A_NORM: (
+            "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_KV_A_NORM: (
+            "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_SUB_NORM: (
+            "model.layers.{bid}.self_attn.inner_attn_ln",  # bitnet
+        ),
+
+        MODEL_TENSOR.FFN_SUB_NORM: (
+            "model.layers.{bid}.mlp.ffn_layernorm",  # bitnet
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_NORM: (
+            "decoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_Q: (
+            "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_K: (
+            "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_V: (
+            "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_OUT: (
+            "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+            "decoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_NORM: (
+            "decoder.block.{bid}.layer.2.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_GATE: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_UP: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_DOWN: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.DEC_OUTPUT_NORM: (
+            "decoder.final_layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_NORM: (
+            "encoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_Q: (
+            "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_K: (
+            "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_V: (
+            "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_OUT: (
+            "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_REL_B: (
+            "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_NORM: (
+            "encoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_GATE: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_UP: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_DOWN: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+        ),
+
+        ############################################################################
+        # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
+        MODEL_TENSOR.ENC_OUTPUT_NORM: (
+            "encoder.final_layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.CLS: (
+            "classifier",       # jina
+            "classifier.dense", # roberta
+        ),
+
+        MODEL_TENSOR.CLS_OUT: (
+            "classifier.out_proj", # roberta
+        ),
+        #############################################################################
+
+        MODEL_TENSOR.CONVNEXT_DW: (
+            "backbone.convnext.{bid}.dwconv", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_NORM: (
+            "backbone.convnext.{bid}.norm", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_PW1: (
+            "backbone.convnext.{bid}.pwconv1", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_PW2: (
+            "backbone.convnext.{bid}.pwconv2", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.CONVNEXT_GAMMA: (
+            "backbone.convnext.{bid}.gamma", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_CONV1: (
+            "backbone.posnet.{bid}.conv1", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_CONV2: (
+            "backbone.posnet.{bid}.conv2", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_NORM: (
+            "backbone.posnet.{bid}.norm", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_NORM1: (
+            "backbone.posnet.{bid}.norm1", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_NORM2: (
+            "backbone.posnet.{bid}.norm2", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_NORM: (
+            "backbone.posnet.{bid}.norm", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_Q: (
+            "backbone.posnet.{bid}.q", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_K: (
+            "backbone.posnet.{bid}.k", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_V: (
+            "backbone.posnet.{bid}.v", # wavtokenizer
+        ),
+
+        MODEL_TENSOR.POSNET_ATTN_OUT: (
+            "backbone.posnet.{bid}.proj_out", # wavtokenizer
+        ),
+    }
+
+    # architecture-specific block mappings
+    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
+        MODEL_ARCH.ARCTIC: {
+            MODEL_TENSOR.FFN_NORM: (
+                "model.layers.{bid}.residual_layernorm",
+            ),
+            MODEL_TENSOR.FFN_NORM_EXP: (
+                "model.layers.{bid}.post_attention_layernorm",
+            ),
+        },
     }
 
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
@@ -296,18 +812,18 @@ class TensorNameMap:
             self.mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
                 self.mapping[key] = (tensor, tensor_name)
+        if arch in self.arch_block_mappings_cfg:
+            self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
         for bid in range(n_blocks):
             for tensor, keys in self.block_mappings_cfg.items():
                 if tensor not in MODEL_TENSORS[arch]:
                     continue
-                # TODO: make this configurable
-                n_experts = 8
-                for xid in range(n_experts):
-                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
-                    self.mapping[tensor_name] = (tensor, tensor_name)
-                    for key in keys:
-                        key = key.format(bid = bid, xid = xid)
-                        self.mapping[key] = (tensor, tensor_name)
+
+                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
+                self.mapping[tensor_name] = (tensor, tensor_name)
+                for key in keys:
+                    key = key.format(bid = bid)
+                    self.mapping[key] = (tensor, tensor_name)
 
     def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
         result = self.mapping.get(key)
diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py
new file mode 100644
index 000000000..40d59b75e
--- /dev/null
+++ b/gguf-py/gguf/utility.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+from typing import Literal
+
+
+def fill_templated_filename(filename: str, output_type: str | None) -> str:
+    # Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
+    ftype_lowercase: str = output_type.lower() if output_type is not None else ""
+    ftype_uppercase: str = output_type.upper() if output_type is not None else ""
+    return filename.format(ftype_lowercase,
+                           outtype=ftype_lowercase, ftype=ftype_lowercase,
+                           OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)
+
+
+def model_weight_count_rounded_notation(model_params_count: int, min_digits: int = 2) -> str:
+    if model_params_count > 1e12 :
+        # Trillions Of Parameters
+        scaled_model_params = model_params_count * 1e-12
+        scale_suffix = "T"
+    elif model_params_count > 1e9 :
+        # Billions Of Parameters
+        scaled_model_params = model_params_count * 1e-9
+        scale_suffix = "B"
+    elif model_params_count > 1e6 :
+        # Millions Of Parameters
+        scaled_model_params = model_params_count * 1e-6
+        scale_suffix = "M"
+    else:
+        # Thousands Of Parameters
+        scaled_model_params = model_params_count * 1e-3
+        scale_suffix = "K"
+
+    fix = max(min_digits - len(str(round(scaled_model_params)).lstrip('0')), 0)
+
+    return f"{scaled_model_params:.{fix}f}{scale_suffix}"
+
+
+def size_label(total_params: int, shared_params: int, expert_params: int, expert_count: int) -> str:
+
+    if expert_count > 0:
+        pretty_size = model_weight_count_rounded_notation(abs(shared_params) + abs(expert_params), min_digits=2)
+        size_class = f"{expert_count}x{pretty_size}"
+    else:
+        size_class = model_weight_count_rounded_notation(abs(total_params), min_digits=2)
+
+    return size_class
+
+
+def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
+    # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
+
+    if base_name is not None:
+        name = base_name.strip().replace(' ', '-').replace('/', '-')
+    elif model_name is not None:
+        name = model_name.strip().replace(' ', '-').replace('/', '-')
+    else:
+        name = "ggml-model"
+
+    parameters = f"-{size_label}" if size_label is not None else ""
+
+    finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
+
+    version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
+
+    encoding = f"-{output_type.strip().replace(' ', '-').upper()}" if output_type is not None else ""
+
+    kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
+
+    return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index a23136b18..f2645f921 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -1,23 +1,30 @@
 from __future__ import annotations
 
+import re
+import logging
 import json
 import os
-import sys
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable
+
+from sentencepiece import SentencePieceProcessor
+
+import gguf
 
 from .gguf_writer import GGUFWriter
 
+logger = logging.getLogger(__name__)
+
 
 class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
-    chat_template: str | None
+    chat_template: str | Sequence[Mapping[str, str]] | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
-        special_token_types: tuple[str, ...] | None = None,
+        special_token_types: Iterable[str] | None = None,
         n_vocab: int | None = None,
     ):
         self.special_token_ids = {}
@@ -40,38 +47,29 @@ class SpecialVocab:
     def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
         if self.merges:
             if not quiet:
-                print(f'gguf: Adding {len(self.merges)} merge(s).')
+                logger.info(f'Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
         elif self.load_merges:
-            print(
-                'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
-                file = sys.stderr,
-            )
+            logger.warning('Adding merges requested but no merges found, output may be non-functional.')
         for typ, tokid in self.special_token_ids.items():
             id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if id_handler is None:
-                print(
-                    f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
-                    file = sys.stderr,
-                )
+                logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
                 continue
             if not quiet:
-                print(f'gguf: Setting special token type {typ} to {tokid}')
+                logger.info(f'Setting special token type {typ} to {tokid}')
             id_handler(tokid)
         for typ, value in self.add_special_token.items():
             add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
             if add_handler is None:
-                print(
-                    f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
-                    file = sys.stderr,
-                )
+                logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
                 continue
             if not quiet:
-                print(f'gguf: Setting add_{typ}_token to {value}')
+                logger.info(f'Setting add_{typ}_token to {value}')
             add_handler(value)
         if self.chat_template is not None:
             if not quiet:
-                print(f'gguf: Setting chat_template to {self.chat_template}')
+                logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
 
     def _load(self, path: Path) -> None:
@@ -99,10 +97,7 @@ class SpecialVocab:
                     continue
                 parts = line.split(None, 3)
                 if len(parts) != 2:
-                    print(
-                        f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
-                        file = sys.stderr,
-                    )
+                    logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
                     continue
                 merges.append(f'{parts[0]} {parts[1]}')
         self.merges = merges
@@ -118,10 +113,7 @@ class SpecialVocab:
                 return
             self.special_token_ids[typ] = tid
             return
-        print(
-            f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
-            file = sys.stderr,
-        )
+        logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
 
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
@@ -130,8 +122,30 @@ class SpecialVocab:
                 tokenizer = json.load(f)
             if self.load_merges:
                 merges = tokenizer.get('model', {}).get('merges')
-                if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                    self.merges = merges
+                if isinstance(merges, list) and merges:
+                    if isinstance(merges[0], str):
+                        self.merges = merges
+                    elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
+                        # New format since transformers 4.45 to support spaces in merges
+                        # ref: https://github.com/ggerganov/llama.cpp/issues/9692
+                        # TODO: internally store as the new format instead of converting to old
+                        if any(' ' in s for pair in merges for s in pair):
+                            logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
+                        self.merges = [
+                            ' '.join(
+                                [
+                                    # ensure the spaces are properly encoded
+                                    ''.join(
+                                        chr(ord(c) + 256) if c == ' ' else c
+                                        for c in part
+                                    )
+                                    for part in pair
+                                ]
+                            )
+                            for pair in merges
+                        ]
+                    else:
+                        raise ValueError("Unknown tokenizer merges format")
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}
@@ -141,13 +155,10 @@ class SpecialVocab:
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
         chat_template = tokenizer_config.get('chat_template')
-        if chat_template is None or isinstance(chat_template, str):
+        if chat_template is None or isinstance(chat_template, (str, list)):
             self.chat_template = chat_template
         else:
-            print(
-                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
-                file = sys.stderr
-            )
+            logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
         for typ in self.special_token_types:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
@@ -179,3 +190,298 @@ class SpecialVocab:
         for typ in self.special_token_types:
             self._set_special_token(typ, config.get(f'{typ}_token_id'))
         return True
+
+
+@runtime_checkable
+class BaseVocab(Protocol):
+    tokenizer_model: ClassVar[str]
+    name: ClassVar[str]
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    vocab_size: int
+    added_tokens_dict: dict[str, int]
+    added_tokens_list: list[str]
+    fname_tokenizer: Path
+
+    def __init__(self, base_path: Path): ...
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+class BpeVocab(Vocab):
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / 'tokenizer.json'
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        vocab_size   = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
+        self.vocab_size_base      = vocab_size
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
+
+        for i, _ in enumerate(self.vocab):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "spm"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict  = added_tokens
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(i)
+            text         = piece.encode("utf-8")
+            score: float = tokenizer.GetScore(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.IsUnknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.IsControl(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.IsUnused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.IsByte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class LlamaHfVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "hfft"
+
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / 'tokenizer.json'
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use LlamaHfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            base_path,
+            cache_dir=base_path,
+            local_files_only=True,
+        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids  = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
+            tok: self.tokenizer.get_vocab()[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 9789c2c87..78c6baa64 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,12 +1,11 @@
 [tool.poetry]
 name = "gguf"
-version = "0.7.0"
+version = "0.15.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
     {include = "gguf"},
     {include = "gguf/py.typed"},
-    {include = "scripts"},
 ]
 readme = "README.md"
 homepage = "https://ggml.ai"
@@ -21,6 +20,9 @@ classifiers = [
 [tool.poetry.dependencies]
 python = ">=3.8"
 numpy = ">=1.17"
+tqdm = ">=4.27"
+pyyaml = ">=5.1"
+sentencepiece = ">=0.1.98,<=0.2.0"
 
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
@@ -30,6 +32,7 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
-gguf-dump = "scripts:gguf_dump_entrypoint"
-gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
+gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
+gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
+gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
+gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
diff --git a/gguf-py/scripts/__init__.py b/gguf-py/scripts/__init__.py
deleted file mode 100644
index 77132db7a..000000000
--- a/gguf-py/scripts/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-
-from importlib import import_module
-
-
-os.environ["NO_LOCAL_GGUF"] = "TRUE"
-
-gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main
-gguf_dump_entrypoint           = import_module("scripts.gguf-dump").main
-gguf_set_metadata_entrypoint   = import_module("scripts.gguf-set-metadata").main
-
-del import_module, os
diff --git a/gguf-py/scripts/gguf-convert-endian.py b/gguf-py/scripts/gguf-convert-endian.py
deleted file mode 100755
index 10a16ad06..000000000
--- a/gguf-py/scripts/gguf-convert-endian.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import os
-import sys
-from pathlib import Path
-
-import numpy as np
-
-# Necessary to load the local gguf package
-if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
-    sys.path.insert(0, str(Path(__file__).parent.parent))
-
-import gguf
-
-
-def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
-    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
-        # Host is little endian
-        host_endian = "little"
-        swapped_endian = "big"
-    else:
-        # Sorry PDP or other weird systems that don't use BE or LE.
-        host_endian = "big"
-        swapped_endian = "little"
-    if reader.byte_order == "S":
-        file_endian = swapped_endian
-    else:
-        file_endian = host_endian
-    order = host_endian if args.order == "native" else args.order
-    print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
-    if file_endian == order:
-        print(f"* File is already {order.upper()} endian. Nothing to do.")
-        sys.exit(0)
-    print("* Checking tensors for conversion compatibility")
-    for tensor in reader.tensors:
-        if tensor.tensor_type not in (
-            gguf.GGMLQuantizationType.F32,
-            gguf.GGMLQuantizationType.F16,
-            gguf.GGMLQuantizationType.Q8_0,
-        ):
-            raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
-    print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
-    if args.dry_run:
-        return
-    print("\n*** Warning *** Warning *** Warning **")
-    print("* This conversion process may damage the file. Ensure you have a backup.")
-    if order != host_endian:
-        print("* Requested endian differs from host, you will not be able to load the model on this machine.")
-    print("* The file will be modified immediately, so if conversion fails or is interrupted")
-    print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
-    response = input("YES, I am sure> ")
-    if response != "YES":
-        print("You didn't enter YES. Okay then, see ya!")
-        sys.exit(0)
-    print(f"\n* Converting fields ({len(reader.fields)})")
-    for idx, field in enumerate(reader.fields.values()):
-        print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
-        for part in field.parts:
-            part.byteswap(inplace=True)
-    print(f"\n* Converting tensors ({len(reader.tensors)})")
-    for idx, tensor in enumerate(reader.tensors):
-        print(
-            f"  - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, "
-            f"elements={tensor.n_elements}... ",
-            end="",
-        )
-        tensor_type = tensor.tensor_type
-        for part in tensor.field.parts:
-            part.byteswap(inplace=True)
-        if tensor_type != gguf.GGMLQuantizationType.Q8_0:
-            tensor.data.byteswap(inplace=True)
-            print()
-            continue
-        # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes
-        block_size = 34
-        n_blocks = len(tensor.data) // block_size
-        for block_num in range(n_blocks):
-            block_offs = block_num * block_size
-            # I know I said f16, but it doesn't matter here - any simple 16 bit type works.
-            delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
-            delta.byteswap(inplace=True)
-            if block_num % 100000 == 0:
-                print(f"[{(n_blocks - block_num) // 1000}K]", end="")
-                sys.stdout.flush()
-        print()
-    print("* Completion")
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Convert GGUF file byte order")
-    parser.add_argument(
-        "model", type=str,
-        help="GGUF format model filename",
-    )
-    parser.add_argument(
-        "order", type=str, choices=['big', 'little', 'native'],
-        help="Requested byte order",
-    )
-    parser.add_argument(
-        "--dry-run", action="store_true",
-        help="Don't actually change anything",
-    )
-    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-    print(f'* Loading: {args.model}')
-    reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
-    convert_byteorder(reader, args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py
deleted file mode 100755
index dbf891508..000000000
--- a/gguf-py/scripts/gguf-dump.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import os
-import sys
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-
-# Necessary to load the local gguf package
-if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
-    sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from gguf import GGUFReader, GGUFValueType  # noqa: E402
-
-
-def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
-    host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
-    if reader.byte_order == 'S':
-        file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE'
-    else:
-        file_endian = host_endian
-    return (host_endian, file_endian)
-
-
-# For more information about what field.parts and field.data represent,
-# please see the comments in the modify_gguf.py example.
-def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
-    host_endian, file_endian = get_file_host_endian(reader)
-    print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')
-    print(f'\n* Dumping {len(reader.fields)} key/value pair(s)')
-    for n, field in enumerate(reader.fields.values(), 1):
-        if not field.types:
-            pretty_type = 'N/A'
-        elif field.types[0] == GGUFValueType.ARRAY:
-            nest_count = len(field.types) - 1
-            pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
-        else:
-            pretty_type = str(field.types[-1].name)
-        print(f'  {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '')
-        if len(field.types) == 1:
-            curr_type = field.types[0]
-            if curr_type == GGUFValueType.STRING:
-                print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
-            elif field.types[0] in reader.gguf_scalar_to_np:
-                print(' = {0}'.format(field.parts[-1][0]), end = '')
-        print()
-    if args.no_tensors:
-        return
-    print(f'\n* Dumping {len(reader.tensors)} tensor(s)')
-    for n, tensor in enumerate(reader.tensors, 1):
-        prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
-        print(f'  {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')
-
-
-def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
-    import json
-    host_endian, file_endian = get_file_host_endian(reader)
-    metadata: dict[str, Any] = {}
-    tensors: dict[str, Any] = {}
-    result = {
-        "filename": args.model,
-        "endian": file_endian,
-        "metadata": metadata,
-        "tensors": tensors,
-    }
-    for idx, field in enumerate(reader.fields.values()):
-        curr: dict[str, Any] = {
-            "index": idx,
-            "type": field.types[0].name if field.types else 'UNKNOWN',
-            "offset": field.offset,
-        }
-        metadata[field.name] = curr
-        if field.types[:1] == [GGUFValueType.ARRAY]:
-            curr["array_types"] = [t.name for t in field.types][1:]
-            if not args.json_array:
-                continue
-            itype = field.types[-1]
-            if itype == GGUFValueType.STRING:
-                curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
-            else:
-                curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()]
-        elif field.types[0] == GGUFValueType.STRING:
-            curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
-        else:
-            curr["value"] = field.parts[-1].tolist()[0]
-    if not args.no_tensors:
-        for idx, tensor in enumerate(reader.tensors):
-            tensors[tensor.name] = {
-                "index": idx,
-                "shape": tensor.shape.tolist(),
-                "type": tensor.tensor_type.name,
-                "offset": tensor.field.offset,
-            }
-    json.dump(result, sys.stdout)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
-    parser.add_argument("model",           type=str,            help="GGUF format model filename")
-    parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
-    parser.add_argument("--json",       action="store_true", help="Produce JSON output")
-    parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
-    args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
-    if not args.json:
-        print(f'* Loading: {args.model}')
-    reader = GGUFReader(args.model, 'r')
-    if args.json:
-        dump_metadata_json(reader, args)
-    else:
-        dump_metadata(reader, args)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/gguf-py/tests/__init__.py b/gguf-py/tests/__init__.py
new file mode 100644
index 000000000..d23ff9cb7
--- /dev/null
+++ b/gguf-py/tests/__init__.py
@@ -0,0 +1 @@
+from .test_metadata import *
diff --git a/gguf-py/tests/test_gguf.py b/gguf-py/tests/test_gguf.py
deleted file mode 100644
index 0adeb7d55..000000000
--- a/gguf-py/tests/test_gguf.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import gguf  # noqa: F401
-
-# TODO: add tests
-
-
-def test_write_gguf() -> None:
-    pass
diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py
new file mode 100755
index 000000000..40d484f4e
--- /dev/null
+++ b/gguf-py/tests/test_metadata.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+
+import unittest
+from pathlib import Path
+import os
+import sys
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent))
+
+import gguf
+
+
+class TestMetadataMethod(unittest.TestCase):
+
+    def test_id_to_title(self):
+        self.assertEqual(gguf.Metadata.id_to_title("Mixtral-8x7B-Instruct-v0.1"), "Mixtral 8x7B Instruct v0.1")
+        self.assertEqual(gguf.Metadata.id_to_title("Meta-Llama-3-8B"), "Meta Llama 3 8B")
+        self.assertEqual(gguf.Metadata.id_to_title("hermes-2-pro-llama-3-8b-DPO"), "Hermes 2 Pro Llama 3 8b DPO")
+
+    def test_get_model_id_components(self):
+        # This is the basic standard form with organization marker
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mistral/Mixtral-8x7B-Instruct-v0.1"),
+                         ('Mixtral-8x7B-Instruct-v0.1', "Mistral", 'Mixtral', 'Instruct', 'v0.1', '8x7B'))
+
+        # Similar to basic standard form but without organization marker
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct-v0.1"),
+                         ('Mixtral-8x7B-Instruct-v0.1', None, 'Mixtral', 'Instruct', 'v0.1', '8x7B'))
+
+        # Missing version
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-Instruct"),
+                         ('Mixtral-8x7B-Instruct', None, 'Mixtral', 'Instruct', None, '8x7B'))
+
+        # Missing finetune
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B-v0.1"),
+                         ('Mixtral-8x7B-v0.1', None, 'Mixtral', None, 'v0.1', '8x7B'))
+
+        # Base name and size label only
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-8x7B"),
+                         ('Mixtral-8x7B', None, 'Mixtral', None, None, '8x7B'))
+
+        # Base name and version only
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral-v0.1"),
+                         ('Mixtral-v0.1', None, 'Mixtral', None, 'v0.1', None))
+
+        ## Edge Cases ##
+
+        # This is too ambiguous... best to err on caution and output nothing
+        self.assertEqual(gguf.Metadata.get_model_id_components("Mixtral"),
+                         ('Mixtral', None, None, None, None, None))
+
+        # Basename has numbers mixed in and also size label provided. Must avoid capturing number in basename
+        self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
+                         ('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
+
+        # Non standard naming
+        self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
+                         ('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
+
+        # Capture 'sub size labels' e.g. A14B in '57B-A14B' usually refers to activated params/weight count
+        self.assertEqual(gguf.Metadata.get_model_id_components("Qwen2-57B-A14B-Instruct"),
+                         ('Qwen2-57B-A14B-Instruct', None, 'Qwen2', 'Instruct', None, '57B-A14B'))
+
+        # Check that it can handle a real model id with no version code
+        # Note that 4k in this string is non standard and microsoft were referring to context length rather than weight count
+        self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Phi-3-mini-4k-instruct", 4 * 10**9),
+                         ('Phi-3-mini-4k-instruct', 'microsoft', 'Phi-3', '4k-instruct', None, 'mini'))
+
+        # There is some legitimate models with only thousands of parameters
+        self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
+                         ('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
+
+        # Non standard and not easy to disambiguate
+        self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
+                         ('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
+
+        # This is a real model_id where they append 2DPO to refer to Direct Preference Optimization
+        self.assertEqual(gguf.Metadata.get_model_id_components("crestf411/daybreak-kunoichi-2dpo-7b"),
+                         ('daybreak-kunoichi-2dpo-7b', 'crestf411', 'daybreak-kunoichi', '2dpo', None, '7B'))
+
+        # This is a real model id where the weight size has a decimal point
+        self.assertEqual(gguf.Metadata.get_model_id_components("Qwen2-0.5B-Instruct"),
+                         ('Qwen2-0.5B-Instruct', None, 'Qwen2', 'Instruct', None, '0.5B'))
+
+        # Uses an underscore in the size label
+        self.assertEqual(gguf.Metadata.get_model_id_components("smallcloudai/Refact-1_6B-fim"),
+                         ('Refact-1_6B-fim', 'smallcloudai', 'Refact', 'fim', None, '1.6B'))
+
+        # Uses Iter3 for the version
+        self.assertEqual(gguf.Metadata.get_model_id_components("UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3"),
+                         ('Gemma-2-9B-It-SPPO-Iter3', 'UCLA-AGI', 'Gemma-2', 'It-SPPO', 'Iter3', '9B'))
+
+        # Has two potential versions in the basename
+        self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Hermes-2-Theta-Llama-3-8B"),
+                         ('Hermes-2-Theta-Llama-3-8B', 'NousResearch', 'Hermes-2-Theta-Llama-3', None, None, '8B'))
+
+        # Potential version in the basename
+        self.assertEqual(gguf.Metadata.get_model_id_components("SeaLLMs/SeaLLMs-v3-7B-Chat"),
+                         ('SeaLLMs-v3-7B-Chat', 'SeaLLMs', 'SeaLLMs-v3', 'Chat', None, '7B'))
+
+        # Underscore in the basename, and 1m for the context size
+        self.assertEqual(gguf.Metadata.get_model_id_components("internlm/internlm2_5-7b-chat-1m", 7 * 10**9),
+                         ('internlm2_5-7b-chat-1m', 'internlm', 'internlm2_5', 'chat-1m', None, '7B'))
+
+        # Version before the finetune name
+        self.assertEqual(gguf.Metadata.get_model_id_components("pszemraj/jamba-900M-v0.13-KIx2"),
+                         ('jamba-900M-v0.13-KIx2', 'pszemraj', 'jamba', 'KIx2', 'v0.13', '900M'))
+
+        # TODO: hf suffix which could be ignored but isn't
+        self.assertEqual(gguf.Metadata.get_model_id_components("state-spaces/mamba-2.8b-hf"),
+                         ('mamba-2.8b-hf', 'state-spaces', 'mamba', 'hf', None, '2.8B'))
+
+        # Two sizes, don't merge them, the other is the number of tokens on which it was trained
+        self.assertEqual(gguf.Metadata.get_model_id_components("abacaj/llama-161M-100B", 161 * 10**6),
+                         ('llama-161M-100B', 'abacaj', 'llama', '100b', None, '161M'))
+
+        # It's a trap, there is no size label
+        self.assertEqual(gguf.Metadata.get_model_id_components("SparseLLM/relu-100B", 1340 * 10**6),
+                         ('relu-100B', 'SparseLLM', 'relu', '100b', None, None))
+
+        # Weird size notation
+        self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
+                         ('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
+
+        # Ignore full-text size labels when there are number-based ones, and deduplicate size labels
+        self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
+                         ('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
+
+        # Instruct in a name without a size label
+        self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
+                         ('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
+
+        # Non-obvious splitting relying on 'chat' keyword
+        self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
+                         ('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
+
+        # Multiple versions
+        self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
+                         ('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
+
+        # TODO: DPO in the name
+        self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
+                         ('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
+
+        # DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
+        self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
+                         ('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
+
+        # Too ambiguous
+        # TODO: should "base" be a 'finetune' or 'size_label'?
+        # (in this case it should be a size label, but other models use it to signal that they are not finetuned)
+        self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
+                         ('Florence-2-base', 'microsoft', None, None, None, None))
+
+        ## Invalid cases ##
+
+        # Start with a dash and has dashes in rows
+        self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
+                         ('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
+
+        ## LoRA ##
+
+        self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
+                         ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
+
+        # Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix
+        self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
+                         ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
+
+    def test_apply_metadata_heuristic_from_model_card(self):
+        model_card = {
+            'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
+            'model-index': [{'name': 'Mixtral-8x7B-Instruct-v0.1', 'results': []}],
+            'language': ['en'],
+            'datasets': ['teknium/OpenHermes-2.5'],
+            'widget': [{'example_title': 'Hermes 2 Pro', 'messages': [{'role': 'system', 'content': 'You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.'}, {'role': 'user', 'content': 'Write a short story about Goku discovering kirby has teamed up with Majin Buu to destroy the world.'}]}],
+            'base_model': ["EmbeddedLLM/Mistral-7B-Merge-14-v0", "janai-hq/trinity-v1"]
+        }
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        expect = gguf.Metadata()
+        expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
+        expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
+        expect.languages=['en']
+        expect.datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]
+        self.assertEqual(got, expect)
+
+        # Base Model spec is inferred from model id
+        model_card = {'base_models': 'teknium/OpenHermes-2.5'}
+        expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Base Model spec is only url
+        model_card = {'base_models': ['https://huggingface.co/teknium/OpenHermes-2.5']}
+        expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Base Model spec is given directly
+        model_card = {'base_models': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]}
+        expect = gguf.Metadata(base_models=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Dataset spec is inferred from model id
+        model_card = {'datasets': 'teknium/OpenHermes-2.5'}
+        expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Dataset spec is only url
+        model_card = {'datasets': ['https://huggingface.co/teknium/OpenHermes-2.5']}
+        expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+        # Dataset spec is given directly
+        model_card = {'datasets': [{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}]}
+        expect = gguf.Metadata(datasets=[{'name': 'OpenHermes 2.5', 'organization': 'Teknium', 'version': '2.5', 'repo_url': 'https://huggingface.co/teknium/OpenHermes-2.5'}])
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
+        self.assertEqual(got, expect)
+
+    def test_apply_metadata_heuristic_from_hf_parameters(self):
+        hf_params = {"_name_or_path": "./hermes-2-pro-llama-3-8b-DPO"}
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card=None, hf_params=hf_params, model_path=None)
+        expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B')
+        self.assertEqual(got, expect)
+
+    def test_apply_metadata_heuristic_from_model_dir(self):
+        model_dir_path = Path("./hermes-2-pro-llama-3-8b-DPO")
+        got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card=None, hf_params=None, model_path=model_dir_path)
+        expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B')
+        self.assertEqual(got, expect)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py
new file mode 100755
index 000000000..f04d5acce
--- /dev/null
+++ b/gguf-py/tests/test_quants.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+
+# Test gguf.quants so that it exactly matches the C implementation of the (de)quantization
+
+# NOTE: this is kind of a mess, but at least it worked for initially testing the Python implementations.
+
+from __future__ import annotations
+
+import argparse
+from math import prod
+import os
+import sys
+from pathlib import Path
+import ctypes
+import logging
+import numpy as np
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent))
+
+import gguf
+from gguf.constants import GGMLQuantizationType
+
+
+logger = logging.getLogger("test-quants")
+
+
+c_float_p = ctypes.POINTER(ctypes.c_float)
+
+
+class ggml_init_params(ctypes.Structure):
+    _fields_ = [
+        ("mem_size", ctypes.c_size_t),
+        ("mem_buffer", ctypes.c_void_p),
+        ("no_alloc", ctypes.c_bool),
+    ]
+
+
+class GGMLQuants:
+    libggml: ctypes.CDLL
+
+    def __init__(self, libggml: Path):
+        self.libggml = ctypes.CDLL(str(libggml))
+        self.libggml.ggml_quantize_chunk.restype = ctypes.c_size_t
+        # enum ggml_type   type,
+        #    const float * src,
+        #           void * dst,
+        #        int64_t   start,
+        #        int64_t   nrows,
+        #        int64_t   n_per_row,
+        #    const float * imatrix) {
+        self.libggml.ggml_quantize_chunk.argtypes = (
+            ctypes.c_int,
+            ctypes.POINTER(ctypes.c_float),
+            ctypes.c_void_p,
+            ctypes.c_int64,
+            ctypes.c_int64,
+            ctypes.c_int64,
+            ctypes.POINTER(ctypes.c_float),
+        )
+
+        self.libggml.ggml_quantize_requires_imatrix.restype = ctypes.c_bool
+        self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
+
+        for t in (
+            "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
+            "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
+            "tq1_0", "tq2_0",
+            "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
+            "iq4_nl", "iq4_xs",
+        ):
+            dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + t)
+            dequant_func.restype = None
+            dequant_func.argtypes = (ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int64)
+
+        self.libggml.ggml_fp16_to_fp32_row.restype = None
+        self.libggml.ggml_fp16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64)
+        self.libggml.ggml_bf16_to_fp32_row.restype = None
+        self.libggml.ggml_bf16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64)
+
+        self.libggml.ggml_init.argtypes = (ggml_init_params,)
+
+        self.libggml.ggml_init(ggml_init_params(1 * 1024 * 1024, 0, False))
+
+    def dequantize(self, tensor: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+        result = np.zeros(gguf.quant_shape_from_byte_shape(tensor.shape, qtype), dtype=np.float32, order="C")
+        if qtype == GGMLQuantizationType.F32:
+            # no-op
+            result = tensor.view(np.float32)
+        elif qtype == GGMLQuantizationType.F16:
+            self.libggml.ggml_fp16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size)
+        elif qtype == GGMLQuantizationType.BF16:
+            self.libggml.ggml_bf16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size)
+        else:
+            lw_qname = qtype.name.lower()
+            if lw_qname[-1] == "k":
+                lw_qname = lw_qname[:-1] + "K"
+            dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + lw_qname)
+            dequant_func(tensor.ctypes.data_as(ctypes.c_void_p), result.ctypes.data_as(c_float_p), result.size)
+        return result
+
+    def quantize(self, data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+        result = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C")
+        if self.libggml.ggml_quantize_requires_imatrix(qtype.value):
+            # TODO: is a column-wise sum of squares appropriate?
+            qw = np.sum((data * data).reshape((-1, data.shape[-1])), axis=0).ctypes.data_as(c_float_p)
+        else:
+            qw = ctypes.cast(0, c_float_p)
+        result_size = self.libggml.ggml_quantize_chunk(qtype.value, data.ctypes.data_as(c_float_p), result.ctypes.data_as(ctypes.c_void_p), 0, prod(data.shape[:-1]), data.shape[-1], qw)
+        assert result.size == result_size
+        return result
+
+
+def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) -> bool:
+    same = np.array_equal(t1, t2)
+    if same:
+        return True
+    else:
+        block_size, type_size = gguf.GGML_QUANT_SIZES[qtype]
+        if t1.dtype == np.float32:
+            t1 = t1.reshape((-1, block_size))
+            t2 = t2.reshape((-1, block_size))
+        else:
+            t1 = t1.reshape((-1, type_size))
+            t2 = t2.reshape((-1, type_size))
+        x = t1.view(np.uint8) ^ t2.view(np.uint8)
+        diff_bits = np.count_nonzero(np.unpackbits(x, axis=-1), axis=-1)
+        num_bad_blocks = np.count_nonzero(diff_bits, axis=0)
+        if num_bad_blocks == 0 and t1.shape == t2.shape:
+            logger.debug("Bits are equal, but arrays don't match, likely contains NANs")
+            return True
+        logger.debug(f"{num_bad_blocks} bad blocks ({100 * num_bad_blocks / x.shape[0]:.6f}%)")
+        bad_block_id = np.argmax(diff_bits, axis=0)
+        logger.debug(f"Worst block id: {bad_block_id}")
+        logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}")
+
+        sum_diff_bits = np.sum(diff_bits)
+        logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)")
+        return False
+
+
+def do_test(libggml_path: Path, quick: bool = False):
+    ggml_quants = GGMLQuants(libggml_path)
+
+    np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n})
+
+    r = np.random.randn(8, 1024, 1024).astype(np.float32, copy=False)
+
+    for qtype in (GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()):
+        has_dequantize = False
+        has_quantize = False
+
+        try:
+            gguf.dequantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][1]), dtype=np.uint8), qtype)
+            has_dequantize = True
+        except (NotImplementedError, AssertionError) as e:
+            if isinstance(e, AssertionError):
+                logger.error(f"Error with {qtype.name}: {e}")
+                raise e
+        try:
+            gguf.quantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][0]), dtype=np.float32), qtype)
+            has_quantize = True
+        except (NotImplementedError, AssertionError) as e:
+            if isinstance(e, AssertionError):
+                logger.error(f"Error with {qtype.name}: {e}")
+                raise e
+
+        if not has_dequantize and not has_quantize:
+            continue
+
+        logger.info(f"Testing {qtype.name}")
+
+        rc = r.copy(order="C")
+
+        pyq = None
+        ggq = None
+
+        if has_quantize:
+            logger.debug(f"Quantizing to {qtype.name} with Python")
+            pyq = gguf.quants.quantize(rc, qtype)
+
+            logger.debug(f"Quantizing to {qtype.name} with C")
+            ggq = ggml_quants.quantize(rc, qtype)
+
+            if qtype == GGMLQuantizationType.F16:
+                pyq = pyq.view(np.uint8)
+            quant_equal = compare_tensors(pyq, ggq, qtype)
+
+            if not quant_equal:
+                logger.error(f"Quantization to {qtype.name} does not match ❌")
+            else:
+                logger.info(f"Quantization to {qtype.name} matches exactly ✅")
+
+        if has_dequantize:
+            if ggq is None and not quick:
+                logger.debug(f"Quantizing to {qtype.name} with C")
+                ggq = ggml_quants.quantize(rc, qtype)
+
+            if ggq is not None:
+                logger.debug(f"Dequantizing from {qtype.name} with Python")
+                pydq = gguf.quants.dequantize(ggq, qtype)
+                logger.debug(f"Dequantizing from {qtype.name} with C")
+                ggdq = ggml_quants.dequantize(ggq, qtype)
+
+                dequant_equal = compare_tensors(pydq, ggdq, qtype)
+
+                if not dequant_equal:
+                    logger.error(f"Dequantization from {qtype.name} does not match ❌")
+                else:
+                    logger.info(f"Dequantization from {qtype.name} matches exactly ✅")
+
+            rq_shape = gguf.quants.quant_shape_to_byte_shape((8, 1024, 1024 // 2), qtype)
+            rq = np.random.random(rq_shape).astype(np.float16).view(np.uint8)
+
+            logger.debug(f"Dequantizing random f16 data as {qtype.name} with Python")
+            pydq = gguf.quants.dequantize(rq, qtype)
+            logger.debug(f"Dequantizing random f16 data as {qtype.name} with C")
+            ggdq = ggml_quants.dequantize(rq, qtype)
+
+            dequant_equal = compare_tensors(pydq, ggdq, qtype)
+
+            if not dequant_equal:
+                logger.error(f"Dequantization from random f16 data as {qtype.name} does not match ❌")
+            else:
+                logger.info(f"Dequantization from random f16 data as {qtype.name} matches exactly ✅")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation")
+    parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so")
+    parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary")
+
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    do_test(args.libggml, args.quick)
diff --git a/grammars/README.md b/grammars/README.md
index e1383fa5c..976954091 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -4,7 +4,7 @@ GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.
 
 ## Background
 
-[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
 
 ## Basics
 
@@ -46,12 +46,12 @@ Terminals support the full range of Unicode. Unicode characters can be specified
 
 Character ranges can be negated with `^`:
 ```
-single-line ::= [^\n]+ "\n"`
+single-line ::= [^\n]+ "\n"
 ```
 
 ## Sequences and Alternatives
 
-The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
+The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
 
 Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
 
@@ -59,9 +59,13 @@ Parentheses `()` can be used to group sequences, which allows for embedding alte
 
 ## Repetition and Optional Symbols
 
-- `*` after a symbol or sequence means that it can be repeated zero or more times.
-- `+` denotes that the symbol or sequence should appear one or more times.
-- `?` makes the preceding symbol or sequence optional.
+- `*` after a symbol or sequence means that it can be repeated zero or more times (equivalent to `{0,}`).
+- `+` denotes that the symbol or sequence should appear one or more times (equivalent to `{1,}`).
+- `?` makes the preceding symbol or sequence optional (equivalent to `{0,1}`).
+- `{m}` repeats the precedent symbol or sequence exactly `m` times
+- `{m,}` repeats the precedent symbol or sequence at least `m` times
+- `{m,n}` repeats the precedent symbol or sequence at between `m` and `n` times (included)
+- `{0,n}` repeats the precedent symbol or sequence at most `n` times (included)
 
 ## Comments and newlines
 
@@ -87,5 +91,292 @@ item ::= [^\n]+ "\n"
 
 This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
 ```
-./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
+./llama-cli -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
 ```
+
+`llama.cpp` can also convert JSON schemas to grammars either ahead of time or at each request, see below.
+
+## Troubleshooting
+
+Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
+
+### Efficient optional repetitions
+
+A common pattern is to allow repetitions of a pattern `x` up to N times.
+
+While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) may result in extremely slow sampling. Instead, you can write `x{0,N}` (or `(x (x (x ... (x)?...)?)?)?` w/ N-deep nesting in earlier llama.cpp versions).
+
+## Using GBNF grammars
+
+You can use GBNF grammars:
+
+- In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field
+- In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags
+- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
+
+## JSON Schemas → GBNF
+
+`llama.cpp` supports converting a subset of https://json-schema.org/ to GBNF grammars:
+
+- In [llama-server](../examples/server):
+    - For any completion endpoints, passed as the `json_schema` body field
+    - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}` or `{ type: "json_schema", json_schema: {"schema": ...} }`)
+- In [llama-cli](../examples/main), passed as the `--json` / `-j` flag
+- To convert to a grammar ahead of time:
+    - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
+    - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
+
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+
+```bash
+llama-cli \
+  -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
+  -hff Phi-3-medium-128k-instruct-Q8_0.gguf \
+  -j '{
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 100
+            },
+            "age": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 150
+            }
+        },
+        "required": ["name", "age"],
+        "additionalProperties": false
+    },
+    "minItems": 10,
+    "maxItems": 100
+  }' \
+  -p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
+```
+
+<details>
+
+<summary>Show grammar</summary>
+
+You can convert any schema in command-line with:
+
+```bash
+examples/json_schema_to_grammar.py name-age-schema.json
+```
+
+```
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+item ::= "{" space item-name-kv "," space item-age-kv "}" space
+item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
+item-age-kv ::= "\"age\"" space ":" space item-age
+item-name ::= "\"" char{1,100} "\"" space
+item-name-kv ::= "\"name\"" space ":" space item-name
+root ::= "[" space item ("," space item){9,99} "]" space
+space ::= | " " | "\n" [ \t]{0,20}
+```
+
+</details>
+
+Here is also a list of known limitations (contributions welcome):
+
+- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
+- `"additionalProperties": true` may produce keys that contain unescaped newlines.
+- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
+- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
+- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
+- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
+- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
+- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
+- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
+- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
+- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):
+
+- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
+- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
+- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
+- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
+- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
+
+### A word about additionalProperties
+
+> [!WARNING]
+> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
+> Since this is slow and seems prone to hallucinations, we default to no additional properties.
+> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties.
+
+If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:
+
+```python
+# pip install pydantic
+import json
+from typing import Annotated, List
+from pydantic import BaseModel, Extra, Field
+class QAPair(BaseModel):
+    class Config:
+        extra = 'allow'  # triggers additionalProperties: true in the JSON schema
+    question: str
+    concise_answer: str
+    justification: str
+
+class Summary(BaseModel):
+    class Config:
+        extra = 'allow'
+    key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
+    question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
+
+print(json.dumps(Summary.model_json_schema(), indent=2))
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "$defs": {
+    "QAPair": {
+      "additionalProperties": true,
+      "properties": {
+        "question": {
+          "title": "Question",
+          "type": "string"
+        },
+        "concise_answer": {
+          "title": "Concise Answer",
+          "type": "string"
+        },
+        "justification": {
+          "title": "Justification",
+          "type": "string"
+        }
+      },
+      "required": [
+        "question",
+        "concise_answer",
+        "justification"
+      ],
+      "title": "QAPair",
+      "type": "object"
+    }
+  },
+  "additionalProperties": true,
+  "properties": {
+    "key_facts": {
+      "items": {
+        "pattern": "^- .{5,}$",
+        "type": "string"
+      },
+      "title": "Key Facts",
+      "type": "array"
+    },
+    "question_answers": {
+      "items": {
+        "items": {
+          "$ref": "#/$defs/QAPair"
+        },
+        "minItems": 5,
+        "type": "array"
+      },
+      "title": "Question Answers",
+      "type": "array"
+    }
+  },
+  "required": [
+    "key_facts",
+    "question_answers"
+  ],
+  "title": "Summary",
+  "type": "object"
+}
+```
+
+```
+QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space
+QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space
+QAPair-additional-kv ::= QAPair-additional-k ":" space value
+QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
+QAPair-justification-kv ::= "\"justification\"" space ":" space string
+QAPair-question-kv ::= "\"question\"" space ":" space string
+additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space
+additional-kv ::= additional-k ":" space value
+array ::= "[" space ( value ("," space value)* )? "]" space
+boolean ::= ("true" | "false") space
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
+dot ::= [^\x0A\x0D]
+integral-part ::= [0] | [1-9] [0-9]{0,15}
+key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
+key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
+key-facts-item-1 ::= dot
+key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
+null ::= "null" space
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
+question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
+question-answers-item-item ::= QAPair
+question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
+root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+value ::= object | array | string | number | boolean | null
+```
+
+</details>
+
+If you're using [Zod](https://zod.dev/), you can make your objects to explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway.
+
+```js
+import { z } from 'zod';
+import { zodToJsonSchema } from 'zod-to-json-schema';
+
+const Foo = z.object({
+  age: z.number().positive(),
+  email: z.string().email(),
+}).strict();
+
+console.log(zodToJsonSchema(Foo));
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "type": "object",
+  "properties": {
+    "age": {
+      "type": "number",
+      "exclusiveMinimum": 0
+    },
+    "email": {
+      "type": "string",
+      "format": "email"
+    }
+  },
+  "required": [
+    "age",
+    "email"
+  ],
+  "additionalProperties": false,
+  "$schema": "http://json-schema.org/draft-07/schema#"
+}
+```
+
+```
+age-kv ::= "\"age\"" space ":" space number
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
+email-kv ::= "\"email\"" space ":" space string
+integral-part ::= [0] | [1-9] [0-9]{0,15}
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+root ::= "{" space age-kv "," space email-kv "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+```
+
+</details>
diff --git a/grammars/english.gbnf b/grammars/english.gbnf
new file mode 100644
index 000000000..2e53686c8
--- /dev/null
+++ b/grammars/english.gbnf
@@ -0,0 +1,6 @@
+# note: this might be incomplete, mostly an example
+root        ::= en-char+ ([ \t\n] en-char+)*
+en-char     ::= letter | digit | punctuation
+letter      ::= [a-zA-Z]
+digit       ::= [0-9]
+punctuation ::= [!"#$%&'()*+,-./:;<=>?@[\\\]^_`{|}~]
diff --git a/grammars/json.gbnf b/grammars/json.gbnf
index a9537cdf9..b6448c87b 100644
--- a/grammars/json.gbnf
+++ b/grammars/json.gbnf
@@ -15,11 +15,11 @@ array  ::=
 
 string ::=
   "\"" (
-    [^"\\] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
   )* "\"" ws
 
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
 
 # Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
+ws ::= | " " | "\n" [ \t]{0,20}
diff --git a/grammars/json_arr.gbnf b/grammars/json_arr.gbnf
index ef53e77a0..b3dc6f9b1 100644
--- a/grammars/json_arr.gbnf
+++ b/grammars/json_arr.gbnf
@@ -24,11 +24,11 @@ array  ::=
 
 string ::=
   "\"" (
-    [^"\\] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
   )* "\"" ws
 
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws
 
 # Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
+ws ::= | " " | "\n" [ \t]{0,20}
diff --git a/include/llama-cpp.h b/include/llama-cpp.h
new file mode 100644
index 000000000..8f6368177
--- /dev/null
+++ b/include/llama-cpp.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * model) { llama_model_free(model); }
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * context) { llama_free(context); }
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+};
+
+struct llama_adapter_lora_deleter {
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
+};
+
+typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
+typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
diff --git a/include/llama.h b/include/llama.h
new file mode 100644
index 000000000..3784f7d39
--- /dev/null
+++ b/include/llama.h
@@ -0,0 +1,1335 @@
+#ifndef LLAMA_H
+#define LLAMA_H
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-backend.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define LLAMA_API __declspec(dllexport)
+#        else
+#            define LLAMA_API __declspec(dllimport)
+#        endif
+#    else
+#        define LLAMA_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define LLAMA_API
+#endif
+
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
+#define LLAMA_TOKEN_NULL -1
+
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
+
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 9
+
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    //
+    // C interface
+    //
+    // TODO: show sample usage
+    //
+
+    struct llama_vocab;
+    struct llama_model;
+    struct llama_context;
+    struct llama_sampler;
+
+    typedef int32_t llama_pos;
+    typedef int32_t llama_token;
+    typedef int32_t llama_seq_id;
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+    };
+
+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+        LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
+        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
+        LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
+        LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
+        LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
+        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+    };
+
+    enum llama_rope_type {
+        LLAMA_ROPE_TYPE_NONE   = -1,
+        LLAMA_ROPE_TYPE_NORM   = 0,
+        LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
+    };
+
+    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    enum llama_token_attr {
+        LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
+        LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
+        LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
+        LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
+        LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3,  // SPECIAL?
+        LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+        LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
+        LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
+        LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
+        LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
+        LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XS        = 22, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL        = 25, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_S         = 26, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_M         = 27, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // removed from gguf files, use Q4_0 and runtime repack
+        LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
+        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
+    };
+
+    enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
+        LLAMA_POOLING_TYPE_NONE = 0,
+        LLAMA_POOLING_TYPE_MEAN = 1,
+        LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
+        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
+    };
+
+    enum llama_attention_type {
+        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+        LLAMA_ATTENTION_TYPE_CAUSAL      = 0,
+        LLAMA_ATTENTION_TYPE_NON_CAUSAL  = 1,
+    };
+
+    enum llama_split_mode {
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+    };
+
+    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
+    typedef struct llama_token_data {
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
+    } llama_token_data;
+
+    typedef struct llama_token_data_array {
+        // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
+        llama_token_data * data;
+        size_t size;
+        int64_t selected; // this is the index in the data array (i.e. not the token id)
+        bool sorted;
+    } llama_token_data_array;
+
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);
+
+    // Input data for llama_decode
+    // A llama_batch object can contain input about one or many sequences
+    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+    //
+    // - token  : the token ids of the input (used when embd is NULL)
+    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+    // - pos    : the positions of the respective token in the sequence
+    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    // - seq_id : the sequence to which the respective token belongs
+    //            (if set to NULL, the sequence ID will be assumed to be 0)
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+    //            (if set to NULL, only the logits for last token will be returned)
+    //
+    typedef struct llama_batch {
+        int32_t n_tokens;
+
+        llama_token  *  token;
+        float        *  embd;
+        llama_pos    *  pos;
+        int32_t      *  n_seq_id;
+        llama_seq_id ** seq_id;
+        int8_t       *  logits; // TODO: rename this to "output"
+    } llama_batch;
+
+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_TYPE_INT,
+        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+        LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
+    };
+
+    struct llama_model_kv_override {
+        enum llama_model_kv_override_type tag;
+
+        char key[128];
+
+        union {
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
+        };
+    };
+
+    struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
+        const float * tensor_split;
+
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
+        llama_progress_callback progress_callback;
+
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+
+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
+    };
+
+    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+    //       https://github.com/ggerganov/llama.cpp/pull/7544
+    struct llama_context_params {
+        uint32_t n_ctx;             // text context, 0 = from model
+        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;          // physical maximum batch size
+        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
+        int32_t  n_threads;         // number of threads to use for generation
+        int32_t  n_threads_batch;   // number of threads to use for batch processing
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+        enum llama_attention_type    attention_type;    // attention type to use for embeddings
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size
+        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
+        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        // TODO: move at the end of the struct
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
+    };
+
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        bool keep_split;                     // quantize to the same number of shards
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
+    } llama_model_quantize_params;
+
+    typedef struct llama_logit_bias {
+        llama_token token;
+        float bias;
+    } llama_logit_bias;
+
+    typedef struct llama_sampler_chain_params {
+        bool no_perf; // whether to measure performance timings
+    } llama_sampler_chain_params;
+
+    // used in chat template
+    typedef struct llama_chat_message {
+        const char * role;
+        const char * content;
+    } llama_chat_message;
+
+    // lora adapter
+    struct llama_adapter_lora;
+
+    // Helpers for getting default parameters
+    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+    LLAMA_API struct llama_model_params          llama_model_default_params(void);
+    LLAMA_API struct llama_context_params        llama_context_default_params(void);
+    LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
+
+    // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
+    // Call once at the start of the program
+    LLAMA_API void llama_backend_init(void);
+
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
+
+    //optional:
+    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+
+    // Optional: an auto threadpool gets created in ggml if not passed explicitly
+    LLAMA_API void llama_attach_threadpool(
+            struct llama_context * ctx,
+               ggml_threadpool_t   threadpool,
+               ggml_threadpool_t   threadpool_batch);
+
+    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+              struct llama_model_params   params),
+            "use llama_model_load_from_file instead");
+
+    // Load the model from a file
+    // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
+    // If the split file name does not follow this pattern, use llama_model_load_from_splits
+    LLAMA_API struct llama_model * llama_model_load_from_file(
+                             const char * path_model,
+              struct llama_model_params   params);
+
+    // Load the model from multiple splits (support custom naming scheme)
+    // The paths must be in the correct order
+    LLAMA_API struct llama_model * llama_model_load_from_splits(
+                             const char ** paths,
+                                 size_t    n_paths,
+              struct llama_model_params    params);
+
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_init_from_model(
+                     struct llama_model * model,
+            struct llama_context_params   params);
+
+    DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params),
+            "use llama_init_from_model instead");
+
+    // Frees all allocated memory
+    LLAMA_API void llama_free(struct llama_context * ctx);
+
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API size_t llama_max_devices(void);
+
+    LLAMA_API bool llama_supports_mmap       (void);
+    LLAMA_API bool llama_supports_mlock      (void);
+    LLAMA_API bool llama_supports_gpu_offload(void);
+    LLAMA_API bool llama_supports_rpc        (void);
+
+    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");
+
+    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
+
+    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+
+    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
+    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
+
+    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
+
+    // Get the model's RoPE frequency scaling factor
+    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
+
+    LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
+
+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
+
+    // Get a string describing the model type
+    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
+    // Get the default chat template. Returns nullptr if not available
+    // If name is NULL, returns the default chat template
+    LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
+
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
+
+    // Returns true if the model contains an encoder that requires llama_encode() call
+    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
+
+    // Returns true if the model contains a decoder that requires llama_decode() call
+    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
+
+    // For encoder-decoder models, this function returns id of the token that must be provided
+    // to the decoder to start generating output sequence. For other models, it returns -1.
+    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
+
+    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
+    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
+
+    // Returns 0 on success
+    LLAMA_API uint32_t llama_model_quantize(
+            const char * fname_inp,
+            const char * fname_out,
+            const llama_model_quantize_params * params);
+
+    //
+    // Adapters
+    //
+
+    // Load a LoRA adapter from file
+    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
+            struct llama_model * model,
+            const char * path_lora);
+
+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be free when the associated model is deleted
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+
+    // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
+    // Add a loaded LoRA adapter to given context
+    // This will not modify model's weight
+    LLAMA_API int32_t llama_set_adapter_lora(
+            struct llama_context * ctx,
+            struct llama_adapter_lora * adapter,
+            float scale);
+
+    // Remove a specific LoRA adapter from given context
+    // Return -1 if the adapter is not present in the context
+    LLAMA_API int32_t llama_rm_adapter_lora(
+            struct llama_context * ctx,
+            struct llama_adapter_lora * adapter);
+
+    // Remove all LoRA adapters from given context
+    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_apply_adapter_cvec(
+            struct llama_context * ctx,
+                     const float * data,
+                          size_t   len,
+                         int32_t   n_embd,
+                         int32_t   il_start,
+                         int32_t   il_end);
+
+    //
+    // KV cache
+    //
+
+    // TODO: remove llama_kv_cache_view_* API
+
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_seq_max;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_seq_max items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    ///
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
+    LLAMA_API bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta);
+
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);
+
+    // Returns the largest position present in the KV cache for the specified sequence
+    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
+
+    // Defragment the KV cache
+    // This will be applied:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
+    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+
+    //
+    // State / sessions
+    //
+
+    // Returns the *actual* size in bytes of the state
+    // (logits, embedding and kv_cache)
+    // Only use when saving the state, not when restoring it, otherwise the size may be too small.
+    LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
+        "use llama_state_get_size instead");
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_state_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                          size_t   size);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_state_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                          size_t   size);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");
+
+    // Save/load session file
+    LLAMA_API bool llama_state_load_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out),
+        "use llama_state_load_file instead");
+
+    LLAMA_API bool llama_state_save_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                          size_t   size,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                          size_t   size,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
+
+    //
+    // Decoding
+    //
+
+    // Return batch for single sequence of tokens
+    // The sequence ID will be fixed to 0
+    // The position of the tokens will be tracked automatically by llama_decode
+    //
+    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    //
+    LLAMA_API struct llama_batch llama_batch_get_one(
+                  llama_token * tokens,
+                      int32_t   n_tokens);
+
+    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+    // Each token can be assigned up to n_seq_max sequence ids
+    // The batch has to be freed with llama_batch_free()
+    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+    // The rest of the llama_batch members are allocated with size n_tokens
+    // All members are left uninitialized
+    LLAMA_API struct llama_batch llama_batch_init(
+            int32_t n_tokens,
+            int32_t embd,
+            int32_t n_seq_max);
+
+    // Frees a batch of tokens allocated with llama_batch_init()
+    LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
+    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    //   0 - success
+    // < 0 - error. the KV cache state is restored to the state before this call
+    LLAMA_API int32_t llama_encode(
+            struct llama_context * ctx,
+              struct llama_batch   batch);
+
+    // Positive return values does not mean a fatal error, but rather a warning.
+    //   0 - success
+    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error. the KV cache state is restored to the state before this call
+    LLAMA_API int32_t llama_decode(
+            struct llama_context * ctx,
+              struct llama_batch   batch);
+
+    // Set the number of threads used for decoding
+    // n_threads is the number of threads used for generation (single token)
+    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
+
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple token).
+    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
+
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
+    // Token logits obtained from the last call to llama_decode()
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Logits for the ith token. For positive indices, Equivalent to:
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+    // returns NULL for invalid ids.
+    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    // Get the embeddings for the ith token. For positive indices, Equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
+    // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
+    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // otherwise: float[n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
+
+    LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
+
+    LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
+
+    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+    LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
+
+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
+    LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
+    LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
+    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
+    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
+    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+
+    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+
+    LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
+
+    DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
+    DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
+    DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
+    DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
+    DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
+    DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
+    DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
+
+    // CLS is equivalent to BOS
+    DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
+            "use llama_vocab_bos instead");
+
+    //
+    // Tokenization
+    //
+    // The API is thread-safe.
+    //
+
+    /// @details Convert the provided text into tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
+    LLAMA_API int32_t llama_tokenize(
+        const struct llama_vocab * vocab,
+                      const char * text,
+                         int32_t   text_len,
+                     llama_token * tokens,
+                         int32_t   n_tokens_max,
+                            bool   add_special,
+                            bool   parse_special);
+
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
+    // @param special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_token_to_piece(
+              const struct llama_vocab * vocab,
+                           llama_token   token,
+                                  char * buf,
+                               int32_t   length,
+                               int32_t   lstrip,
+                                  bool   special);
+
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+    /// @param text The char pointer must be large enough to hold the resulting text.
+    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_detokenize(
+        const struct llama_vocab * vocab,
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special);
+
+    //
+    // Chat templates
+    //
+
+    /// Apply chat template. Inspired by hf apply_chat_template() on python.
+    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
+    /// @param chat Pointer to a list of multiple llama_chat_message
+    /// @param n_msg Number of llama_chat_message in this chat
+    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+    /// @param length The size of the allocated buffer
+    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+    LLAMA_API int32_t llama_chat_apply_template(
+                            const char * tmpl,
+       const struct llama_chat_message * chat,
+                                size_t   n_msg,
+                                  bool   add_ass,
+                                  char * buf,
+                               int32_t   length);
+
+    // Get list of built-in chat templates
+    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
+    //
+    // Sampling API
+    //
+    // Sample usage:
+    //
+    //    // prepare the sampling chain at the start
+    //    auto sparams = llama_sampler_chain_default_params();
+    //
+    //    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    //
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
+    //
+    //    // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
+    //    // this sampler will be responsible to select the actual token
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
+    //
+    //    ...
+    //
+    //    // decoding loop:
+    //    while (...) {
+    //        ...
+    //
+    //        llama_decode(ctx, batch);
+    //
+    //        // sample from the logits of the last token in the batch
+    //        const llama_token id = llama_sampler_sample(smpl, ctx, -1);
+    //
+    //        // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
+    //        llama_sampler_accept(smpl, id);
+    //        ...
+    //    }
+    //
+    //    llama_sampler_free(smpl);
+    //
+    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
+    //
+
+    typedef void * llama_sampler_context_t;
+
+    // user code can implement the interface below in order to create custom llama_sampler
+    struct llama_sampler_i {
+        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
+        void                   (*accept)(      struct llama_sampler * smpl, llama_token token);              // can be NULL
+        void                   (*apply) (      struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
+        void                   (*reset) (      struct llama_sampler * smpl);                                 // can be NULL
+        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
+        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
+
+        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
+        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
+    };
+
+    struct llama_sampler {
+        const struct llama_sampler_i * iface;
+        llama_sampler_context_t        ctx;
+    };
+
+    // mirror of llama_sampler_i:
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
+    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
+    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
+    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
+    // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
+    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);
+
+    // llama_sampler_chain
+    // a type of llama_sampler that can chain multiple samplers one after another
+
+    LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
+
+    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
+    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
+
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
+
+    // available samplers:
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
+        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_p      (float   p, size_t min_keep);
+
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, size_t min_keep);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);
+
+    /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);
+
+    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);
+
+    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+    LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
+                             int32_t   n_vocab,
+                            uint32_t   seed,
+                               float   tau,
+                               float   eta,
+                             int32_t   m);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
+                            uint32_t   seed,
+                               float   tau,
+                               float   eta);
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
+            const struct llama_vocab * vocab,
+                          const char * grammar_str,
+                          const char * grammar_root);
+
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
+    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
+    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+            const struct llama_vocab * vocab,
+                          const char * grammar_str,
+                          const char * grammar_root,
+                         const char ** trigger_words,
+                                size_t num_trigger_words,
+                   const llama_token * trigger_tokens,
+                                size_t num_trigger_tokens);
+
+    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
+    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
+                             int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
+                               float   penalty_repeat,   // 1.0 = disabled
+                               float   penalty_freq,     // 0.0 = disabled
+                               float   penalty_present); // 0.0 = disabled
+
+    ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+    LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+            const struct llama_vocab *  vocab,
+                             int32_t    n_ctx_train,
+                               float    dry_multiplier,
+                               float    dry_base,
+                             int32_t    dry_allowed_length,
+                             int32_t    dry_penalty_last_n,
+                          const char ** seq_breakers,
+                              size_t    num_breakers);
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
+                             int32_t   n_vocab,
+                             int32_t   n_logit_bias,
+              const llama_logit_bias * logit_bias);
+
+    // this sampler is meant to be used for fill-in-the-middle infilling
+    // it's supposed to be used after top_k + top_p sampling
+    //
+    // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+    // 2. combine probs of tokens that have the same prefix
+    //
+    // example:
+    //
+    // - before:
+    //   "hel":   0.5
+    //   "hell":  0.2
+    //   "hello": 0.1
+    //   "dummy": 0.1
+    //
+    // - after:
+    //   "hel":   0.8
+    //   "dummy": 0.1
+    //
+    // 3. discard non-EOG tokens with low prob
+    // 4. if no tokens are left -> pick EOT
+    //
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
+    //
+    // Shorthand for:
+    //    const auto * logits = llama_get_logits_ith(ctx, idx);
+    //    llama_token_data_array cur_p = { ... init from logits ... };
+    //    llama_sampler_apply(smpl, &cur_p);
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
+    // Returns the sampled token
+    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
+
+    // TODO: extend in the future
+    //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
+
+    //
+    // Model split
+    //
+
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
+    // Print system information
+    LLAMA_API const char * llama_print_system_info(void);
+
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
+
+    //
+    // Performance utils
+    //
+    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    //
+
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // LLAMA_H
diff --git a/kompute-shaders/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp
deleted file mode 100644
index b44622584..000000000
--- a/kompute-shaders/op_rope_f16.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
-void main() {
-    const uint i3 = gl_WorkGroupID.z;
-    const uint i2 = gl_WorkGroupID.y;
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & 2) != 0;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-    const int p = inB[pcs.inBOff + i2];
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+1]);
-
-            out_[dst_data]   = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+pcs.n_dims/2]);
-
-            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp
deleted file mode 100644
index 2c0235d75..000000000
--- a/kompute-shaders/op_rope_f32.comp
+++ /dev/null
@@ -1,73 +0,0 @@
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-void main() {
-    const uint i3 = gl_WorkGroupID.z;
-    const uint i2 = gl_WorkGroupID.y;
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & 2) != 0;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-    const int p = inB[pcs.inBOff + i2];
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+1];
-
-            out_[dst_data]   = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+pcs.n_dims/2];
-
-            out_[dst_data] = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
diff --git a/llama.cpp b/llama.cpp
deleted file mode 100644
index b1db5b179..000000000
--- a/llama.cpp
+++ /dev/null
@@ -1,13469 +0,0 @@
-#define LLAMA_API_INTERNAL
-#include "llama.h"
-
-#include "unicode.h"
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#  include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
-#  include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#  include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-#  include "ggml-sycl.h"
-#elif defined(GGML_USE_KOMPUTE)
-#   include "ggml-kompute.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#  include "ggml-metal.h"
-#endif
-#ifdef GGML_USE_MPI
-#  include "ggml-mpi.h"
-#endif
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
-
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/mman.h>
-            #include <fcntl.h>
-        #endif
-        #if defined(_POSIX_MEMLOCK_RANGE)
-            #include <sys/resource.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <io.h>
-#endif
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <cfloat>
-#include <cinttypes>
-#include <climits>
-#include <cmath>
-#include <cstdarg>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <cwctype>
-#include <forward_list>
-#include <fstream>
-#include <functional>
-#include <initializer_list>
-#include <locale>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <numeric>
-#include <queue>
-#include <random>
-#include <regex>
-#include <set>
-#include <sstream>
-#include <thread>
-#include <type_traits>
-#include <unordered_map>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
-#endif
-
-#define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 8
-
-//
-// logging
-//
-
-LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (ggml_log_level level, const char* format, ...);
-static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
-
-#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
-//
-// helpers
-//
-
-static size_t utf8_len(char src) {
-    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-    return lookup[highbits];
-}
-
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
-static bool is_float_close(float a, float b, float abs_tol) {
-    // Check for non-negative tolerance
-    if (abs_tol < 0.0) {
-        throw std::invalid_argument("Tolerance must be non-negative");
-    }
-
-    // Exact equality check
-    if (a == b) {
-        return true;
-    }
-
-    // Check for infinities
-    if (std::isinf(a) || std::isinf(b)) {
-        return false;
-    }
-
-    // Regular comparison using the provided absolute tolerance
-    return std::fabs(b - a) <= abs_tol;
-}
-
-static void zeros(std::ofstream & file, size_t n) {
-    char zero = 0;
-    for (size_t i = 0; i < n; ++i) {
-        file.write(&zero, 1);
-    }
-}
-
-LLAMA_ATTRIBUTE_FORMAT(1, 2)
-static std::string format(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-//
-// gguf constants (sync with gguf.py)
-//
-
-enum llm_arch {
-    LLM_ARCH_LLAMA,
-    LLM_ARCH_FALCON,
-    LLM_ARCH_BAICHUAN,
-    LLM_ARCH_GPT2,
-    LLM_ARCH_GPTJ,
-    LLM_ARCH_GPTNEOX,
-    LLM_ARCH_MPT,
-    LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
-    LLM_ARCH_REFACT,
-    LLM_ARCH_BERT,
-    LLM_ARCH_NOMIC_BERT,
-    LLM_ARCH_BLOOM,
-    LLM_ARCH_STABLELM,
-    LLM_ARCH_QWEN,
-    LLM_ARCH_QWEN2,
-    LLM_ARCH_PHI2,
-    LLM_ARCH_PLAMO,
-    LLM_ARCH_CODESHELL,
-    LLM_ARCH_ORION,
-    LLM_ARCH_INTERNLM2,
-    LLM_ARCH_MINICPM,
-    LLM_ARCH_GEMMA,
-    LLM_ARCH_STARCODER2,
-    LLM_ARCH_UNKNOWN,
-};
-
-static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"      },
-    { LLM_ARCH_FALCON,          "falcon"     },
-    { LLM_ARCH_GPT2,            "gpt2"       },
-    { LLM_ARCH_GPTJ,            "gptj"       },
-    { LLM_ARCH_GPTNEOX,         "gptneox"    },
-    { LLM_ARCH_MPT,             "mpt"        },
-    { LLM_ARCH_BAICHUAN,        "baichuan"   },
-    { LLM_ARCH_STARCODER,       "starcoder"  },
-    { LLM_ARCH_PERSIMMON,       "persimmon"  },
-    { LLM_ARCH_REFACT,          "refact"     },
-    { LLM_ARCH_BERT,            "bert"       },
-    { LLM_ARCH_NOMIC_BERT,      "nomic-bert" },
-    { LLM_ARCH_BLOOM,           "bloom"      },
-    { LLM_ARCH_STABLELM,        "stablelm"   },
-    { LLM_ARCH_QWEN,            "qwen"       },
-    { LLM_ARCH_QWEN2,           "qwen2"      },
-    { LLM_ARCH_PHI2,            "phi2"       },
-    { LLM_ARCH_PLAMO,           "plamo"      },
-    { LLM_ARCH_CODESHELL,       "codeshell"  },
-    { LLM_ARCH_ORION,           "orion"      },
-    { LLM_ARCH_INTERNLM2,       "internlm2"  },
-    { LLM_ARCH_MINICPM,         "minicpm"    },
-    { LLM_ARCH_GEMMA,           "gemma"      },
-    { LLM_ARCH_STARCODER2,      "starcoder2" },
-};
-
-enum llm_kv {
-    LLM_KV_GENERAL_ARCHITECTURE,
-    LLM_KV_GENERAL_QUANTIZATION_VERSION,
-    LLM_KV_GENERAL_ALIGNMENT,
-    LLM_KV_GENERAL_NAME,
-    LLM_KV_GENERAL_AUTHOR,
-    LLM_KV_GENERAL_URL,
-    LLM_KV_GENERAL_DESCRIPTION,
-    LLM_KV_GENERAL_LICENSE,
-    LLM_KV_GENERAL_SOURCE_URL,
-    LLM_KV_GENERAL_SOURCE_HF_REPO,
-
-    LLM_KV_CONTEXT_LENGTH,
-    LLM_KV_EMBEDDING_LENGTH,
-    LLM_KV_BLOCK_COUNT,
-    LLM_KV_FEED_FORWARD_LENGTH,
-    LLM_KV_USE_PARALLEL_RESIDUAL,
-    LLM_KV_TENSOR_DATA_LAYOUT,
-    LLM_KV_EXPERT_COUNT,
-    LLM_KV_EXPERT_USED_COUNT,
-    LLM_KV_POOLING_TYPE,
-
-    LLM_KV_ATTENTION_HEAD_COUNT,
-    LLM_KV_ATTENTION_HEAD_COUNT_KV,
-    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
-    LLM_KV_ATTENTION_CLAMP_KQV,
-    LLM_KV_ATTENTION_KEY_LENGTH,
-    LLM_KV_ATTENTION_VALUE_LENGTH,
-    LLM_KV_ATTENTION_LAYERNORM_EPS,
-    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
-    LLM_KV_ATTENTION_CAUSAL,
-
-    LLM_KV_ROPE_DIMENSION_COUNT,
-    LLM_KV_ROPE_FREQ_BASE,
-    LLM_KV_ROPE_SCALE_LINEAR,
-    LLM_KV_ROPE_SCALING_TYPE,
-    LLM_KV_ROPE_SCALING_FACTOR,
-    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
-    LLM_KV_ROPE_SCALING_FINETUNED,
-
-    LLM_KV_TOKENIZER_MODEL,
-    LLM_KV_TOKENIZER_LIST,
-    LLM_KV_TOKENIZER_TOKEN_TYPE,
-    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
-    LLM_KV_TOKENIZER_SCORES,
-    LLM_KV_TOKENIZER_MERGES,
-    LLM_KV_TOKENIZER_BOS_ID,
-    LLM_KV_TOKENIZER_EOS_ID,
-    LLM_KV_TOKENIZER_UNK_ID,
-    LLM_KV_TOKENIZER_SEP_ID,
-    LLM_KV_TOKENIZER_PAD_ID,
-    LLM_KV_TOKENIZER_ADD_BOS,
-    LLM_KV_TOKENIZER_ADD_EOS,
-    LLM_KV_TOKENIZER_ADD_PREFIX,
-    LLM_KV_TOKENIZER_HF_JSON,
-    LLM_KV_TOKENIZER_RWKV,
-};
-
-static std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version"          },
-    { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
-    { LLM_KV_GENERAL_NAME,                  "general.name"                          },
-    { LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
-    { LLM_KV_GENERAL_URL,                   "general.url"                           },
-    { LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
-    { LLM_KV_GENERAL_LICENSE,               "general.license"                       },
-    { LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },
-
-    { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"        },
-    { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"      },
-    { LLM_KV_BLOCK_COUNT,                   "%s.block_count"           },
-    { LLM_KV_FEED_FORWARD_LENGTH,           "%s.feed_forward_length"   },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,         "%s.use_parallel_residual" },
-    { LLM_KV_TENSOR_DATA_LAYOUT,            "%s.tensor_data_layout"    },
-    { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"          },
-    { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"     },
-    { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"          },
-
-    { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,      "%s.attention.max_alibi_bias"         },
-    { LLM_KV_ATTENTION_CLAMP_KQV,           "%s.attention.clamp_kqv"              },
-    { LLM_KV_ATTENTION_KEY_LENGTH,          "%s.attention.key_length"             },
-    { LLM_KV_ATTENTION_VALUE_LENGTH,        "%s.attention.value_length"           },
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,       "%s.attention.layer_norm_epsilon"     },
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },
-    { LLM_KV_ATTENTION_CAUSAL,              "%s.attention.causal"                 },
-
-    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
-    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
-    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
-    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned"               },
-
-    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"              },
-    { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens"             },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE,          "tokenizer.ggml.token_type"         },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,    "tokenizer.ggml.token_type_count"   },
-    { LLM_KV_TOKENIZER_SCORES,              "tokenizer.ggml.scores"             },
-    { LLM_KV_TOKENIZER_MERGES,              "tokenizer.ggml.merges"             },
-    { LLM_KV_TOKENIZER_BOS_ID,              "tokenizer.ggml.bos_token_id"       },
-    { LLM_KV_TOKENIZER_EOS_ID,              "tokenizer.ggml.eos_token_id"       },
-    { LLM_KV_TOKENIZER_UNK_ID,              "tokenizer.ggml.unknown_token_id"   },
-    { LLM_KV_TOKENIZER_SEP_ID,              "tokenizer.ggml.seperator_token_id" },
-    { LLM_KV_TOKENIZER_PAD_ID,              "tokenizer.ggml.padding_token_id"   },
-    { LLM_KV_TOKENIZER_ADD_BOS,             "tokenizer.ggml.add_bos_token"      },
-    { LLM_KV_TOKENIZER_ADD_EOS,             "tokenizer.ggml.add_eos_token"      },
-    { LLM_KV_TOKENIZER_ADD_PREFIX,          "tokenizer.ggml.add_space_prefix"   },
-    { LLM_KV_TOKENIZER_HF_JSON,             "tokenizer.huggingface.json"        },
-    { LLM_KV_TOKENIZER_RWKV,                "tokenizer.rwkv.world"              },
-};
-
-struct LLM_KV {
-    LLM_KV(llm_arch arch) : arch(arch) {}
-
-    llm_arch arch;
-
-    std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
-    }
-};
-
-enum llm_tensor {
-    LLM_TENSOR_TOKEN_EMBD,
-    LLM_TENSOR_TOKEN_EMBD_NORM,
-    LLM_TENSOR_TOKEN_TYPES,
-    LLM_TENSOR_POS_EMBD,
-    LLM_TENSOR_OUTPUT,
-    LLM_TENSOR_OUTPUT_NORM,
-    LLM_TENSOR_ROPE_FREQS,
-    LLM_TENSOR_ATTN_Q,
-    LLM_TENSOR_ATTN_K,
-    LLM_TENSOR_ATTN_V,
-    LLM_TENSOR_ATTN_QKV,
-    LLM_TENSOR_ATTN_OUT,
-    LLM_TENSOR_ATTN_NORM,
-    LLM_TENSOR_ATTN_NORM_2,
-    LLM_TENSOR_ATTN_OUT_NORM,
-    LLM_TENSOR_ATTN_ROT_EMBD,
-    LLM_TENSOR_FFN_GATE_INP,
-    LLM_TENSOR_FFN_NORM,
-    LLM_TENSOR_FFN_GATE,
-    LLM_TENSOR_FFN_DOWN,
-    LLM_TENSOR_FFN_UP,
-    LLM_TENSOR_FFN_ACT,
-    LLM_TENSOR_FFN_DOWN_EXP,
-    LLM_TENSOR_FFN_GATE_EXP,
-    LLM_TENSOR_FFN_UP_EXP,
-    LLM_TENSOR_ATTN_Q_NORM,
-    LLM_TENSOR_ATTN_K_NORM,
-    LLM_TENSOR_LAYER_OUT_NORM,
-};
-
-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-    {
-        LLM_ARCH_LLAMA,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
-            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
-            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
-        },
-    },
-    {
-        LLM_ARCH_BAICHUAN,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_FALCON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_GPT2,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_POS_EMBD,        "position_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-        },
-    },
-    {
-        LLM_ARCH_GPTJ,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-        },
-    },
-    {
-        LLM_ARCH_GPTNEOX,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_PERSIMMON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd"},
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm"},
-            { LLM_TENSOR_OUTPUT,          "output"},
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm"},
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv"},
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output"},
-            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm"},
-            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm"},
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm"},
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down"},
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up"},
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd"},
-        },
-    },
-    {
-        LLM_ARCH_MPT,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_ACT,         "blk.%d.ffn.act" },
-        },
-    },
-    {
-        LLM_ARCH_STARCODER,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_POS_EMBD,        "position_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-        },
-    },
-    {
-        LLM_ARCH_REFACT,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_BERT,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
-            { LLM_TENSOR_POS_EMBD,        "position_embd" },
-            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_NOMIC_BERT,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
-            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_BLOOM,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-        },
-    },
-    {
-        LLM_ARCH_STABLELM,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_QWEN,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_QWEN2,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_PHI2,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_PLAMO,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_CODESHELL,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_ORION,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_INTERNLM2,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_MINICPM,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
-            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
-            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
-        },
-    },
-    {
-        LLM_ARCH_GEMMA,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_STARCODER2,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
-            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_UNKNOWN,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-        },
-    },
-};
-
-static llm_arch llm_arch_from_string(const std::string & name) {
-    for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
-        if (kv.second == name) {
-            return kv.first;
-        }
-    }
-
-    return LLM_ARCH_UNKNOWN;
-}
-
-// helper to handle gguf constants
-// usage:
-//
-//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
-//
-//   std::string name = tn(LLM_TENSOR_OUTPUT);                     -> "output"
-//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
-//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
-//
-struct LLM_TN {
-    LLM_TN(llm_arch arch) : arch(arch) {}
-
-    llm_arch arch;
-
-    std::string operator()(llm_tensor tensor) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
-            return "__missing__";
-        }
-        return LLM_TENSOR_NAMES[arch].at(tensor);
-    }
-
-    std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
-            return "__missing__";
-        }
-        return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
-    }
-
-    std::string operator()(llm_tensor tensor, int bid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
-            return "__missing__";
-        }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
-    }
-
-    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
-            return "__missing__";
-        }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
-    }
-
-    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
-            return "__missing__";
-        }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
-    }
-};
-
-//
-// gguf helpers
-//
-
-static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
-};
-
-static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
-    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
-        if (kv.second == name) {
-            return kv.first;
-        }
-    }
-
-    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-}
-
-static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
-    switch (type) {
-        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
-        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
-        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
-        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
-        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
-        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
-        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
-        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
-        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
-        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
-        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
-        default:                return format("unknown type %d", type);
-    }
-}
-
-static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
-    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
-
-    switch (type) {
-        case GGUF_TYPE_STRING:
-            return gguf_get_val_str(ctx_gguf, i);
-        case GGUF_TYPE_ARRAY:
-            {
-                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
-                int arr_n = gguf_get_arr_n(ctx_gguf, i);
-                const void * data = gguf_get_arr_data(ctx_gguf, i);
-                std::stringstream ss;
-                ss << "[";
-                for (int j = 0; j < arr_n; j++) {
-                    if (arr_type == GGUF_TYPE_STRING) {
-                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
-                        // escape quotes
-                        replace_all(val, "\\", "\\\\");
-                        replace_all(val, "\"", "\\\"");
-                        ss << '"' << val << '"';
-                    } else if (arr_type == GGUF_TYPE_ARRAY) {
-                        ss << "???";
-                    } else {
-                        ss << gguf_data_to_str(arr_type, data, j);
-                    }
-                    if (j < arr_n - 1) {
-                        ss << ", ";
-                    }
-                }
-                ss << "]";
-                return ss.str();
-            }
-        default:
-            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
-    }
-}
-
-//
-// ggml helpers
-//
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-//
-// llama helpers
-//
-
-#if defined(_WIN32)
-static std::string llama_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-template <typename T>
-struct no_init {
-    T value;
-    no_init() { /* do nothing */ }
-};
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) const {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error("unexpectedly reached end of file");
-        }
-    }
-
-    uint32_t read_u32() const {
-        uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    void write_raw(const void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, len, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
-        }
-    }
-
-    void write_u32(std::uint32_t val) const {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-struct llama_mmap {
-    void * addr;
-    size_t size;
-
-    llama_mmap(const llama_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    // list of mapped fragments (first_offset, last_offset)
-    std::vector<std::pair<size_t, size_t>> mapped_fragments;
-
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-        // prefetch/readahead impairs performance on NUMA systems
-        if (numa)  { prefetch = 0; }
-#ifdef __linux__
-        // advise the kernel to read the file sequentially (increases readahead)
-        if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
-            LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
-                    strerror(errno));
-        }
-        if (prefetch) { flags |= MAP_POPULATE; }
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) { // NOLINT
-            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
-        }
-
-        if (prefetch > 0) {
-            // advise the kernel to preload the mapped memory
-            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
-                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-        if (numa) {
-            // advise the kernel not to use readahead
-            // (because the next page might not belong on the same node)
-            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
-                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-
-        // initialize list of mapped_fragments
-        mapped_fragments.emplace_back(0, file->size);
-    }
-
-    static void align_range(size_t * first, size_t * last, size_t page_size) {
-        // align first to the next page
-        size_t offset_in_page = *first & (page_size - 1);
-        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
-        *first += offset_to_page;
-
-        // align last to the previous page
-        *last = *last & ~(page_size - 1);
-
-        if (*last <= *first) {
-            *last = *first;
-        }
-    }
-
-    // partially unmap the file in the range [first, last)
-    void unmap_fragment(size_t first, size_t last) {
-        // note: this function must not be called multiple times with overlapping ranges
-        // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
-        int page_size = sysconf(_SC_PAGESIZE);
-        align_range(&first, &last, page_size);
-        size_t len = last - first;
-
-        if (len == 0) {
-            return;
-        }
-
-        GGML_ASSERT(first % page_size == 0);
-        GGML_ASSERT(last % page_size == 0);
-        GGML_ASSERT(last > first);
-
-        void * next_page_start = (uint8_t *) addr + first;
-
-        // unmap the range
-        if (munmap(next_page_start, len)) {
-            LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
-        }
-
-        // update the list of mapped fragments to avoid unmapping the same range again in the destructor
-        std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
-        for (const auto & frag : mapped_fragments) {
-            if (frag.first < first && frag.second > last) {
-                // the range is in the middle of the fragment, split it
-                new_mapped_fragments.emplace_back(frag.first, first);
-                new_mapped_fragments.emplace_back(last, frag.second);
-            } else if (frag.first < first && frag.second > first) {
-                // the range starts in the middle of the fragment
-                new_mapped_fragments.emplace_back(frag.first, first);
-            } else if (frag.first < last && frag.second > last) {
-                // the range ends in the middle of the fragment
-                new_mapped_fragments.emplace_back(last, frag.second);
-            } else if (frag.first >= first && frag.second <= last) {
-                // the range covers the entire fragment
-            } else {
-                // the range is outside the fragment
-                new_mapped_fragments.push_back(frag);
-            }
-        }
-        mapped_fragments = std::move(new_mapped_fragments);
-    }
-
-    ~llama_mmap() {
-        for (const auto & frag : mapped_fragments) {
-            if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
-                LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
-            }
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
-        GGML_UNUSED(numa);
-
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-
-        if (hMapping == NULL) {
-            DWORD error = GetLastError();
-            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        DWORD error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        if (prefetch > 0) {
-#if _WIN32_WINNT >= 0x602
-            // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
-            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
-            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
-
-            // may fail on pre-Windows 8 systems
-            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
-
-            if (pPrefetchVirtualMemory) {
-                // advise the kernel to preload the mapped memory
-                WIN32_MEMORY_RANGE_ENTRY range;
-                range.VirtualAddress = addr;
-                range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
-                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
-                            llama_format_win_err(GetLastError()).c_str());
-                }
-            }
-#else
-            throw std::runtime_error("PrefetchVirtualMemory unavailable");
-#endif
-        }
-    }
-
-    void unmap_fragment(size_t first, size_t last) {
-        // not supported
-        GGML_UNUSED(first);
-        GGML_UNUSED(last);
-    }
-
-    ~llama_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
-        GGML_UNUSED(file);
-        GGML_UNUSED(prefetch);
-        GGML_UNUSED(numa);
-
-        throw std::runtime_error("mmap not supported");
-    }
-
-    void unmap_fragment(size_t first, size_t last) {
-        GGML_UNUSED(first);
-        GGML_UNUSED(last);
-
-        throw std::runtime_error("mmap not supported");
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-
-    bool failed_already = false;
-
-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
-
-    ~llama_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * ptr) {
-        GGML_ASSERT(addr == NULL && size == 0); // NOLINT
-        addr = ptr;
-    }
-
-    void grow_to(size_t target_size) {
-        GGML_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    static size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) const {
-        if (!mlock(addr, size)) {
-            return true;
-        }
-
-        char* errmsg = std::strerror(errno);
-        bool suggest = (errno == ENOMEM);
-
-        // Check if the resource limit is fine after all
-        struct rlimit lock_limit;
-        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
-            suggest = false;
-        }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
-            suggest = false;
-        }
-
-        LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-        return false;
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    static void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    static size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * ptr, size_t len) const {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(ptr, len)) {
-                return true;
-            }
-            if (tries == 2) {
-                LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    len, size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = len + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    static void raw_unlock(void * ptr, size_t len) {
-        if (!VirtualUnlock(ptr, len)) {
-            LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    static size_t lock_granularity() {
-        return (size_t) 65536;
-    }
-
-    bool raw_lock(const void * addr, size_t len) const {
-        LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
-        return false;
-    }
-
-    static void raw_unlock(const void * addr, size_t len) {}
-#endif
-};
-
-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-        GGML_ASSERT(check == -n_tokens);
-    }
-    else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#if defined(GGML_USE_CUBLAS)
-    // host buffers should only be used when data is expected to be copied to/from the GPU
-    if (host_buffer) {
-        buft = ggml_backend_cuda_host_buffer_type();
-    }
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
-#elif defined(GGML_USE_CPU_HBM)
-    buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = ggml_backend_vk_host_buffer_type();
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = ggml_backend_cpu_buffer_type();
-    }
-    return buft;
-
-    GGML_UNUSED(host_buffer);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUBLAS
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &total, &free);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &total, &free);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
-//
-// globals
-//
-
-struct llama_state {
-    llama_state() {
-#ifdef GGML_USE_METAL
-        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
-#endif
-    }
-
-    // We save the log callback globally
-    ggml_log_callback log_callback = llama_log_callback_default;
-    void * log_callback_user_data = nullptr;
-};
-
-static llama_state g_state;
-
-// available llama models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_17M,
-    MODEL_22M,
-    MODEL_33M,
-    MODEL_109M,
-    MODEL_137M,
-    MODEL_335M,
-    MODEL_0_5B,
-    MODEL_1B,
-    MODEL_2B,
-    MODEL_3B,
-    MODEL_4B,
-    MODEL_7B,
-    MODEL_8B,
-    MODEL_13B,
-    MODEL_14B,
-    MODEL_15B,
-    MODEL_20B,
-    MODEL_30B,
-    MODEL_34B,
-    MODEL_40B,
-    MODEL_65B,
-    MODEL_70B,
-    MODEL_SMALL,
-    MODEL_MEDIUM,
-    MODEL_LARGE,
-    MODEL_XL,
-};
-
-static const size_t kiB = 1024;
-static const size_t MiB = 1024*kiB;
-static const size_t GiB = 1024*MiB;
-
-struct llama_hparams {
-    bool vocab_only;
-    bool rope_finetuned;
-
-    uint32_t n_vocab;
-    uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_embd;
-    uint32_t n_head;
-    uint32_t n_head_kv;
-    uint32_t n_layer;
-    uint32_t n_rot;
-    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
-    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
-    uint32_t n_ff;
-    uint32_t n_expert = 0;
-    uint32_t n_expert_used = 0;
-    uint32_t n_vocab_type = 0; // for BERT-style token types
-
-    float f_norm_eps;
-    float f_norm_rms_eps;
-
-    float    rope_freq_base_train;
-    float    rope_freq_scale_train;
-    uint32_t n_yarn_orig_ctx;
-    int32_t  rope_scaling_type_train;
-
-    float f_clamp_kqv      = 0.0f;
-    float f_max_alibi_bias = 0.0f;
-
-    bool causal_attn = true;
-    bool need_kq_pos = false;
-
-    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
-    enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
-
-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only    != other.vocab_only)    return true;
-        if (this->n_vocab       != other.n_vocab)       return true;
-        if (this->n_ctx_train   != other.n_ctx_train)   return true;
-        if (this->n_embd        != other.n_embd)        return true;
-        if (this->n_head        != other.n_head)        return true;
-        if (this->n_head_kv     != other.n_head_kv)     return true;
-        if (this->n_layer       != other.n_layer)       return true;
-        if (this->n_rot         != other.n_rot)         return true;
-        if (this->n_embd_head_k != other.n_embd_head_k) return true;
-        if (this->n_embd_head_v != other.n_embd_head_v) return true;
-        if (this->n_ff          != other.n_ff)          return true;
-        if (this->n_expert      != other.n_expert)      return true;
-        if (this->n_expert_used != other.n_expert_used) return true;
-
-        if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
-
-        const float EPSILON = 1e-9f;
-
-        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-
-        return false;
-    }
-
-    uint32_t n_gqa() const {
-        return n_head/n_head_kv;
-    }
-
-    uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
-        return n_embd_head_k * n_head_kv;
-    }
-
-    uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
-        return n_embd_head_v * n_head_kv;
-    }
-};
-
-struct llama_cparams {
-    uint32_t n_ctx;       // context size used during inference
-    uint32_t n_batch;
-    uint32_t n_threads;       // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
-
-    float    rope_freq_base;
-    float    rope_freq_scale;
-
-    uint32_t n_yarn_orig_ctx;
-    // These hyperparameters are not exposed in GGUF, because all
-    // existing YaRN models use the same values for them.
-    float yarn_ext_factor;
-    float yarn_attn_factor;
-    float yarn_beta_fast;
-    float yarn_beta_slow;
-    float defrag_thold;
-
-    bool offload_kqv;
-    bool do_pooling;
-
-    ggml_backend_sched_eval_callback cb_eval;
-    void * cb_eval_user_data;
-};
-
-struct llama_layer {
-    // normalization
-    struct ggml_tensor * attn_norm;
-    struct ggml_tensor * attn_norm_b;
-    struct ggml_tensor * attn_norm_2;
-    struct ggml_tensor * attn_norm_2_b;
-    struct ggml_tensor * attn_q_norm;
-    struct ggml_tensor * attn_q_norm_b;
-    struct ggml_tensor * attn_k_norm;
-    struct ggml_tensor * attn_k_norm_b;
-    struct ggml_tensor * attn_out_norm;
-    struct ggml_tensor * attn_out_norm_b;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-    struct ggml_tensor * wqkv;
-
-    // attention bias
-    struct ggml_tensor * bq;
-    struct ggml_tensor * bk;
-    struct ggml_tensor * bv;
-    struct ggml_tensor * bo;
-    struct ggml_tensor * bqkv;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-    struct ggml_tensor * ffn_norm_b;
-    struct ggml_tensor * layer_out_norm;
-    struct ggml_tensor * layer_out_norm_b;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-
-    // ff MoE
-    struct ggml_tensor * ffn_gate_inp;
-    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
-    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
-    struct ggml_tensor * ffn_up_exp  [LLAMA_MAX_EXPERTS];
-
-    // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
-    struct ggml_tensor * ffn_act;
-};
-
-struct llama_kv_cell {
-    llama_pos pos   = -1;
-    llama_pos delta = 0;
-
-    std::set<llama_seq_id> seq_id;
-
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-
-    bool is_empty() const {
-        return seq_id.empty();
-    }
-
-    bool is_same_seq(const llama_kv_cell & other) const {
-        return seq_id == other.seq_id;
-    }
-};
-
-// ring-buffer of cached KV data
-struct llama_kv_cache {
-    bool has_shift = false;
-    bool do_defrag = false;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_internal also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
-
-    std::vector<llama_kv_cell> cells;
-
-    std::vector<struct ggml_tensor *> k_l; // per layer
-    std::vector<struct ggml_tensor *> v_l;
-
-    std::vector<struct ggml_context *> ctxs;
-    std::vector<ggml_backend_buffer_t> bufs;
-
-    size_t total_size() const {
-        size_t size = 0;
-        for (ggml_backend_buffer_t buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf);
-        }
-        return size;
-    }
-
-    ~llama_kv_cache() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-    }
-};
-
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-    using ttype = llama_token_type;
-
-    struct token_data {
-        token text;
-        float score;
-        ttype type;
-    };
-
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_data>       id_to_token;
-
-    std::unordered_map<token, id> special_tokens_cache;
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
-
-    // default LLaMA special tokens
-    id special_bos_id = 1;
-    id special_eos_id = 2;
-    id special_unk_id = 0;
-    id special_sep_id = -1;
-    id special_pad_id = -1;
-
-    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
-    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
-    id linefeed_id       = 13;
-    id special_prefix_id = 32007;
-    id special_middle_id = 32009;
-    id special_suffix_id = 32008;
-    id special_eot_id    = 32010;
-
-    bool add_space_prefix = true;
-
-    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
-        GGML_ASSERT(token_left.find(' ') == std::string::npos);
-        GGML_ASSERT(token_left.find('\n') == std::string::npos);
-        GGML_ASSERT(token_right.find(' ') == std::string::npos);
-        GGML_ASSERT(token_right.find('\n') == std::string::npos);
-
-        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
-        if (it == bpe_ranks.end()) {
-            return -1;
-        }
-
-        return it->second;
-    }
-};
-
-struct llama_model {
-    e_model     type  = MODEL_UNKNOWN;
-    llm_arch    arch  = LLM_ARCH_UNKNOWN;
-    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::string name = "n/a";
-
-    llama_hparams hparams = {};
-    llama_vocab   vocab;
-
-    struct ggml_tensor * tok_embd;
-    struct ggml_tensor * type_embd;
-    struct ggml_tensor * pos_embd;
-    struct ggml_tensor * tok_norm;
-    struct ggml_tensor * tok_norm_b;
-
-    struct ggml_tensor * output_norm;
-    struct ggml_tensor * output_norm_b;
-    struct ggml_tensor * output;
-    struct ggml_tensor * output_b;
-
-    std::vector<llama_layer> layers;
-
-    llama_split_mode split_mode;
-    int main_gpu;
-    int n_gpu_layers;
-
-    // gguf metadata
-    std::unordered_map<std::string, std::string> gguf_kv;
-
-    // layer -> buffer type mapping
-    struct layer_buft {
-        layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
-        layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
-        layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
-
-        ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
-        ggml_backend_buffer_type_t buft;        // everything else
-    };
-
-    layer_buft buft_input;
-    layer_buft buft_output;
-    std::vector<layer_buft> buft_layer;
-
-    // contexts where the model tensors metadata is stored
-    std::vector<struct ggml_context *> ctxs;
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_t> bufs;
-
-    // model memory mapped file
-    std::unique_ptr<llama_mmap> mapping;
-
-    // objects representing data potentially being locked in memory
-    std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
-    llama_mlock mlock_mmap;
-
-    // for quantize-stats only
-    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
-
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
-
-    ~llama_model() {
-        for (struct ggml_context * ctx : ctxs) {
-            ggml_free(ctx);
-        }
-        for (ggml_backend_buffer_t buf : bufs) {
-            ggml_backend_buffer_free(buf);
-        }
-    }
-};
-
-struct llama_context {
-    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
-    ~llama_context() {
-        ggml_backend_sched_free(sched);
-
-        for (ggml_backend_t backend : backends) {
-            ggml_backend_free(backend);
-        }
-
-#ifdef GGML_USE_VULKAN
-        ggml_vk_free_cpu_assist();
-#endif
-
-        ggml_backend_buffer_free(buf_input);
-        ggml_free(ctx_input);
-    }
-
-    llama_cparams cparams;
-
-    std::vector<ggml_backend_t> backends;
-#ifdef GGML_USE_METAL
-    ggml_backend_t backend_metal = nullptr;
-#endif
-    ggml_backend_t backend_cpu = nullptr;
-
-    const llama_model & model;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
-
-    std::mt19937 rng;
-
-    bool has_evaluated_once = false;
-
-    int64_t t_start_us;
-    int64_t t_load_us;
-    int64_t t_sample_us = 0;
-    int64_t t_p_eval_us = 0;
-    int64_t t_eval_us   = 0;
-
-    int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-    int32_t n_eval   = 0; // number of eval calls
-
-    // decode output (2-dimensional array: [n_tokens][n_vocab])
-    std::vector<float> logits;
-#ifndef NDEBUG
-    // guard against access to unset logits
-    std::vector<bool>  logits_valid;
-#endif
-    bool logits_all = false;
-
-    // input embedding (1-dimensional array: [n_embd])
-    std::vector<float> embedding;
-
-    // memory buffers used to evaluate the model
-    std::vector<uint8_t> buf_compute_meta;
-    ggml_backend_sched_t sched = nullptr;
-
-    // input tensors
-    ggml_backend_buffer_t buf_input = nullptr;
-    ggml_context * ctx_input = nullptr;
-    struct ggml_tensor * inp_tokens;    // I32 [n_batch]
-    struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
-    struct ggml_tensor * inp_pos;       // I32 [n_batch]
-    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
-    struct ggml_tensor * inp_KQ_pos;    // F32 [n_ctx]
-    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
-    struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
-    struct ggml_tensor * inp_cls;       // I32 [n_batch]
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
-};
-
-//
-// kv cache helpers
-//
-
-static bool llama_kv_cache_init(
-             struct llama_kv_cache & cache,
-                 const llama_model & model,
-                         ggml_type   type_k,
-                         ggml_type   type_v,
-                          uint32_t   n_ctx,
-                              bool   offload) {
-    const struct llama_hparams & hparams = model.hparams;
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-    const int64_t  n_layer      = hparams.n_layer;
-
-    cache.has_shift = false;
-
-    cache.head = 0;
-    cache.size = n_ctx;
-    cache.used = 0;
-
-    cache.type_k = type_k;
-    cache.type_v = type_v;
-
-    cache.cells.clear();
-    cache.cells.resize(n_ctx);
-
-#ifdef GGML_USE_CLBLAST
-    offload = false;
-#endif
-
-    // count used buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    if (offload) {
-        for (int64_t i = 0; i < n_layer; ++i) {
-            buft_layer_count[model.buft_layer[i].buft]++;
-        }
-    } else {
-        buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
-    }
-
-    // create a context for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (auto & it : buft_layer_count) {
-        int n_layers = it.second;
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ 2u*n_layers*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-        if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
-            return false;
-        }
-        ctx_map[it.first] = ctx;
-        cache.ctxs.push_back(ctx);
-    }
-
-    cache.k_l.reserve(n_layer);
-    cache.v_l.reserve(n_layer);
-
-    for (int i = 0; i < (int) n_layer; i++) {
-        struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
-        ggml_format_name(k, "cache_k_l%d", i);
-        ggml_format_name(v, "cache_v_l%d", i);
-        cache.k_l.push_back(k);
-        cache.v_l.push_back(v);
-    }
-
-    // allocate tensors and initialize the buffers to avoid NaNs in the padding
-    for (auto it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-        if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
-            return false;
-        }
-        ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        cache.bufs.push_back(buf);
-    }
-
-    return true;
-}
-
-// find an empty slot of size "n_tokens" in the cache
-// updates the cache head
-// Note: On success, it's important that cache.head points
-// to the first cell of the slot.
-static bool llama_kv_cache_find_slot(
-           struct llama_kv_cache & cache,
-        const struct llama_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
-    const uint32_t n_tokens = batch.n_tokens;
-
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
-        return false;
-    }
-
-    uint32_t n_tested = 0;
-
-    while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
-            cache.head = 0;
-            continue;
-        }
-
-        bool found = true;
-        for (uint32_t i = 0; i < n_tokens; i++) {
-            if (cache.cells[cache.head + i].pos >= 0) {
-                found = false;
-                cache.head += i + 1;
-                n_tested   += i + 1;
-                break;
-            }
-        }
-
-        if (found) {
-            break;
-        }
-
-        if (n_tested >= n_ctx) {
-            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
-            return false;
-        }
-    }
-
-    for (uint32_t i = 0; i < n_tokens; i++) {
-        cache.cells[cache.head + i].pos = batch.pos[i];
-
-        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
-            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
-        }
-    }
-
-    cache.used += n_tokens;
-
-    return true;
-}
-
-// find how many cells are currently in use
-static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-    for (uint32_t i = cache.size - 1; i > 0; --i) {
-        if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
-            return i + 1;
-        }
-    }
-
-    return 0;
-}
-
-static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
-    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
-        cache.cells[i].pos = -1;
-        cache.cells[i].seq_id.clear();
-    }
-    cache.head = 0;
-    cache.used = 0;
-}
-
-static void llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1) {
-    uint32_t new_head = cache.size;
-
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            if (seq_id < 0) {
-                cache.cells[i].seq_id.clear();
-            } else if (cache.cells[i].has_seq_id(seq_id)) {
-                cache.cells[i].seq_id.erase(seq_id);
-            } else {
-                continue;
-            }
-            if (cache.cells[i].is_empty()) {
-                // keep count of the number of used cells
-                if (cache.cells[i].pos >= 0) cache.used--;
-
-                cache.cells[i].pos = -1;
-                if (new_head == cache.size) new_head = i;
-            }
-        }
-    }
-
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
-}
-
-static void llama_kv_cache_seq_cp(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id_src,
-                 llama_seq_id   seq_id_dst,
-                    llama_pos   p0,
-                    llama_pos   p1) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    cache.head = 0;
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.cells[i].seq_id.insert(seq_id_dst);
-        }
-    }
-}
-
-static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
-    uint32_t new_head = cache.size;
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (!cache.cells[i].has_seq_id(seq_id)) {
-            if (cache.cells[i].pos >= 0) cache.used--;
-            cache.cells[i].pos = -1;
-            cache.cells[i].seq_id.clear();
-            if (new_head == cache.size) new_head = i;
-        } else {
-            cache.cells[i].seq_id.clear();
-            cache.cells[i].seq_id.insert(seq_id);
-        }
-    }
-
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
-}
-
-static void llama_kv_cache_seq_add(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                    llama_pos   delta) {
-    uint32_t new_head = cache.size;
-
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.has_shift = true;
-            cache.cells[i].pos   += delta;
-            cache.cells[i].delta += delta;
-
-            if (cache.cells[i].pos < 0) {
-                if (!cache.cells[i].is_empty()) {
-                    cache.used--;
-                }
-                cache.cells[i].pos = -1;
-                cache.cells[i].seq_id.clear();
-                if (new_head == cache.size) {
-                    new_head = i;
-                }
-            }
-        }
-    }
-
-    // If we freed up a slot, set head to it so searching can start there.
-    // Otherwise we just start the next search from the beginning.
-    cache.head = new_head != cache.size ? new_head : 0;
-}
-
-static void llama_kv_cache_seq_div(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                          int   d) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.has_shift = true;
-
-            {
-                llama_pos p_old = cache.cells[i].pos;
-                cache.cells[i].pos   /= d;
-                cache.cells[i].delta += cache.cells[i].pos - p_old;
-            }
-        }
-    }
-}
-
-static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
-    llama_pos result = 0;
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id)) {
-            result = std::max(result, cache.cells[i].pos);
-        }
-    }
-
-    return result;
-}
-
-static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
-    cache.do_defrag = true;
-}
-
-//
-// model loading and saving
-//
-
-enum llama_fver {
-    GGUF_FILE_VERSION_V1 = 1,
-    GGUF_FILE_VERSION_V2 = 2,
-    GGUF_FILE_VERSION_V3 = 3,
-};
-
-static const char * llama_file_version_name(llama_fver version) {
-    switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
-        case GGUF_FILE_VERSION_V2: return "GGUF V2";
-        case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
-    }
-
-    return "unknown";
-}
-
-static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
-    char buf[256];
-    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
-    for (size_t i = 1; i < ne.size(); i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
-    }
-    return buf;
-}
-
-static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
-    char buf[256];
-    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
-    }
-    return buf;
-}
-
-namespace GGUFMeta {
-    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
-    struct GKV_Base_Type {
-        static constexpr gguf_type gt = gt_;
-
-        static T getter(const gguf_context * ctx, const int kid) {
-            return gfun(ctx, kid);
-        }
-    };
-
-    template<typename T> struct GKV_Base;
-
-    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
-    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
-    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
-    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
-    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
-    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
-    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
-    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
-    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
-    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
-    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
-    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
-
-    template<> struct GKV_Base<std::string> {
-        static constexpr gguf_type gt = GGUF_TYPE_STRING;
-
-        static std::string getter(const gguf_context * ctx, const int kid) {
-            return gguf_get_val_str(ctx, kid);
-        }
-    };
-
-    struct ArrayInfo {
-        const gguf_type gt;
-        const size_t length;
-        const void * data;
-    };
-
-    template<> struct GKV_Base<ArrayInfo> {
-        public:
-        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
-        static ArrayInfo getter(const gguf_context *ctx, const int k) {
-            return ArrayInfo {
-                gguf_get_arr_type(ctx, k),
-                size_t(gguf_get_arr_n(ctx, k)),
-                gguf_get_arr_data(ctx, k),
-            };
-        }
-    };
-
-    template<typename T>
-    class GKV : public GKV_Base<T> {
-        GKV() = delete;
-
-        public:
-        static T get_kv(const gguf_context * ctx, const int k) {
-            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
-
-            if (kt != GKV::gt) {
-                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
-                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
-            }
-            return GKV::getter(ctx, k);
-        }
-
-        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
-            switch (ty) {
-                case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
-                case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
-                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
-            }
-            return "unknown";
-        }
-
-        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
-            if (!ovrd) { return false; }
-            if (ovrd->tag == expected_type) {
-                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
-                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
-                switch (ovrd->tag) {
-                    case LLAMA_KV_OVERRIDE_TYPE_BOOL:  {
-                        LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
-                    } break;
-                    case LLAMA_KV_OVERRIDE_TYPE_INT:   {
-                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
-                    } break;
-                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                        LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
-                    } break;
-                    default:
-                        // Shouldn't be possible to end up here, but just in case...
-                        throw std::runtime_error(
-                            format("Unsupported attempt to override %s type for metadata key %s\n",
-                                override_type_to_str(ovrd->tag), ovrd->key));
-                }
-                return true;
-            }
-            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
-                __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
-            return false;
-        }
-
-        template<typename OT>
-        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
-        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
-            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-                target = ovrd->bool_value;
-                return true;
-            }
-            return false;
-        }
-
-        template<typename OT>
-        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
-        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
-            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-                target = ovrd->int_value;
-                return true;
-            }
-            return false;
-        }
-
-        template<typename OT>
-        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
-        try_override(T & target, const struct llama_model_kv_override * ovrd) {
-            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-                target = ovrd->float_value;
-                return true;
-            }
-            return false;
-        }
-
-        template<typename OT>
-        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
-        try_override(T & target, const struct llama_model_kv_override * ovrd) {
-            (void)target;
-            (void)ovrd;
-            if (!ovrd) { return false; }
-            // Currently, we should never end up here so it would be a bug if we do.
-            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
-                ovrd ? ovrd->key : "NULL"));
-        }
-
-        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
-            if (try_override<T>(target, ovrd)) {
-                return true;
-            }
-            if (k < 0) { return false; }
-            target = get_kv(ctx, k);
-            return true;
-        }
-
-        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
-            return set(ctx, gguf_find_key(ctx, key), target, ovrd);
-        }
-
-        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
-            return set(ctx, key.c_str(), target, ovrd);
-        }
-    };
-}
-
-struct llama_model_loader {
-    int n_kv      = 0;
-    int n_tensors = 0;
-    int n_created = 0;
-
-    int64_t n_elements = 0;
-    size_t  n_bytes    = 0;
-
-    bool use_mmap = false;
-
-    llama_file  file;
-    llama_ftype ftype;
-    llama_fver  fver;
-
-    std::unique_ptr<llama_mmap> mapping;
-    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
-
-    struct gguf_context * ctx_gguf = NULL;
-    struct ggml_context * ctx_meta = NULL;
-
-    std::string arch_name;
-    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
-
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
-        int trace = 0;
-        if (getenv("LLAMA_TRACE")) {
-            trace = atoi(getenv("LLAMA_TRACE"));
-        }
-
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
-        };
-
-        if (param_overrides_p != nullptr) {
-            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
-                kv_overrides.insert({std::string(p->key), *p});
-            }
-        }
-
-        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-        if (!ctx_gguf) {
-            throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
-        }
-
-        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
-        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
-
-        n_kv      = gguf_get_n_kv(ctx_gguf);
-        n_tensors = gguf_get_n_tensors(ctx_gguf);
-
-        fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
-
-        for (int i = 0; i < n_tensors; i++) {
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            n_elements += ggml_nelements(t);
-            n_bytes    += ggml_nbytes(t);
-        }
-
-        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
-
-        // determine file type based on the number of tensors for each quantization and print meta data
-        // TODO: make optional
-        {
-            std::map<enum ggml_type, uint32_t> n_type;
-
-            uint32_t n_type_max = 0;
-            enum ggml_type type_max = GGML_TYPE_F32;
-
-            for (int i = 0; i < n_tensors; i++) {
-                enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
-
-                n_type[type]++;
-
-                if (n_type_max < n_type[type]) {
-                    n_type_max = n_type[type];
-                    type_max   = type;
-                }
-
-                if (trace > 0) {
-                    struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                    LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
-                }
-            }
-
-            switch (type_max) {
-                case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
-                case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
-                case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
-                case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
-                case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
-                case GGML_TYPE_Q5_1:    ftype = LLAMA_FTYPE_MOSTLY_Q5_1;    break;
-                case GGML_TYPE_Q8_0:    ftype = LLAMA_FTYPE_MOSTLY_Q8_0;    break;
-                case GGML_TYPE_Q2_K:    ftype = LLAMA_FTYPE_MOSTLY_Q2_K;    break;
-                case GGML_TYPE_Q3_K:    ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M;  break;
-                case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
-                case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
-                case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
-                case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
-                case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
-                case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
-                case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
-                case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
-                case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
-                case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
-                case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
-                default:
-                    {
-                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
-                        ftype = LLAMA_FTYPE_ALL_F32;
-                    } break;
-            }
-
-            // this is a way to mark that we have "guessed" the file type
-            ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
-
-            {
-                const int kid = gguf_find_key(ctx_gguf, "general.file_type");
-                if (kid >= 0) {
-                    ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
-                }
-            }
-
-            LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
-            for (int i = 0; i < n_kv; i++) {
-                const char * name           = gguf_get_key(ctx_gguf, i);
-                const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
-                const std::string type_name =
-                    type == GGUF_TYPE_ARRAY
-                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
-                    : gguf_type_name(type);
-
-                std::string value          = gguf_kv_to_str(ctx_gguf, i);
-                const size_t MAX_VALUE_LEN = 40;
-                if (value.size() > MAX_VALUE_LEN) {
-                    value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
-                }
-                replace_all(value, "\n", "\\n");
-
-                LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
-            }
-
-            // print type counts
-            for (auto & kv : n_type) {
-                if (kv.second == 0) {
-                    continue;
-                }
-
-                LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
-            }
-        }
-
-        if (!llama_mmap::SUPPORTED) {
-            LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
-            use_mmap = false;
-        }
-
-        this->use_mmap = use_mmap;
-    }
-
-    ~llama_model_loader() {
-        if (ctx_gguf) {
-            gguf_free(ctx_gguf);
-        }
-        if (ctx_meta) {
-            ggml_free(ctx_meta);
-        }
-    }
-
-    template<typename T>
-    typename std::enable_if<std::is_integral<T>::value, bool>::type
-    get_arr_n(const std::string & key, T & result, const bool required = true) {
-        const int kid = gguf_find_key(ctx_gguf, key.c_str());
-
-        if (kid < 0) {
-            if (required) {
-                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
-            }
-            return false;
-        }
-
-        struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
-
-
-        result = arr_info.length;
-        return true;
-    }
-
-    template<typename T>
-    typename std::enable_if<std::is_integral<T>::value, bool>::type
-    get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
-        return get_arr_n(llm_kv(kid), result, required);
-    }
-
-    template<typename T>
-    bool get_key(const std::string & key, T & result, const bool required = true) {
-        auto it = kv_overrides.find(key);
-
-        const struct llama_model_kv_override * override =
-            it != kv_overrides.end() ? &it->second : nullptr;
-
-        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
-
-        if (required && !found) {
-            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
-        }
-
-        return found;
-    }
-
-    template<typename T>
-    bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
-        return get_key(llm_kv(kid), result, required);
-    }
-
-    std::string get_arch_name() const {
-        return arch_name;
-    }
-
-    enum llm_arch get_arch() const {
-        return llm_kv.arch;
-    }
-
-    const char * get_tensor_name(int i) const {
-        return gguf_get_tensor_name(ctx_gguf, i);
-    }
-
-    struct ggml_tensor * get_tensor_meta(const char * name) const {
-        return ggml_get_tensor(ctx_meta, name);
-    }
-
-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
-        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
-        ggml_set_name(tensor, ggml_get_name(meta));
-
-        n_created++;
-
-        return tensor;
-    }
-
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
-
-        if (cur == NULL) {
-            if (!required) {
-                return NULL;
-            }
-            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
-        }
-
-        {
-            bool is_ok = true;
-            for (size_t i = 0; i < ne.size(); ++i) {
-                if (ne[i] != cur->ne[i]) {
-                    is_ok = false;
-                    break;
-                }
-            }
-            if (!is_ok) {
-                throw std::runtime_error(
-                        format("%s: tensor '%s' has wrong shape; expected %s, got %s",
-                            __func__, name.c_str(),
-                            llama_format_tensor_shape(ne).c_str(),
-                            llama_format_tensor_shape(cur).c_str()));
-            }
-        }
-
-        return create_tensor_for(ctx, cur);
-    }
-
-    void done_getting_tensors() const {
-        if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
-        }
-    }
-
-    size_t file_offset(const char * name) const {
-        const int idx = gguf_find_tensor(ctx_gguf, name);
-
-        if (idx < 0) {
-            throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
-        }
-
-        return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
-    }
-
-    void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
-        // prefetch the whole file - all the data is needed anyway
-        if (use_mmap) {
-            mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
-        }
-
-        // compute the total size of all tensors for progress reporting
-        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-            size_data += ggml_nbytes(cur);
-        }
-
-        if (use_mmap && mapping) {
-            if (lmlock) {
-                lmlock->init(mapping->addr);
-            }
-            mmap_used_first = mapping->size;
-        }
-    }
-
-    void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
-        GGML_ASSERT(mapping);
-
-        *first = mapping->size;
-        *last  = 0;
-        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            const size_t offs = file_offset(ggml_get_name(tensor));
-            *first = std::min(*first, offs);
-            *last  = std::max(*last,  offs + ggml_nbytes(tensor));
-        }
-    }
-
-    // for backwards compatibility, does not support ggml-backend
-    void load_data_for(struct ggml_tensor * cur) const {
-        const size_t offs = file_offset(ggml_get_name(cur));
-
-        if (use_mmap && mapping) {
-            if (cur->data == nullptr) {
-                cur->data = (uint8_t *)mapping->addr + offs;
-            } else {
-                memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
-            }
-        } else {
-            GGML_ASSERT(cur->data != nullptr);
-            file.seek(offs, SEEK_SET);
-            file.read_raw(cur->data, ggml_nbytes(cur));
-        }
-    }
-
-    size_t size_done = 0;
-    size_t size_data = 0;
-    size_t mmap_used_first = -1;
-    size_t mmap_used_last  = 0;
-
-    // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
-        GGML_ASSERT(size_data != 0 && "call init_mapping() first");
-
-        std::vector<no_init<uint8_t>> read_buf;
-
-        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-            if (progress_callback) {
-                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-                    return false;
-                }
-            }
-
-            const size_t offs = file_offset(ggml_get_name(cur));
-
-            if (use_mmap && mapping) {
-                if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
-                    if (lmlock) {
-                        lmlock->grow_to(offs + ggml_nbytes(cur));
-                    }
-                    mmap_used_first = std::min(mmap_used_first, offs);
-                    mmap_used_last  = std::max(mmap_used_last,  offs + ggml_nbytes(cur));
-                } else {
-                    ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
-                }
-            } else {
-                if (ggml_backend_buffer_is_host(cur->buffer)) {
-                    file.seek(offs, SEEK_SET);
-                    file.read_raw(cur->data, ggml_nbytes(cur));
-                } else {
-                    read_buf.resize(ggml_nbytes(cur));
-                    file.seek(offs, SEEK_SET);
-                    file.read_raw(read_buf.data(), ggml_nbytes(cur));
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
-                }
-            }
-
-            size_done += ggml_nbytes(cur);
-        }
-
-        // check if this is the last call and do final cleanup
-        if (size_done >= size_data) {
-            // unmap offloaded tensors and metadata
-            if (use_mmap && mapping) {
-                mapping->unmap_fragment(0, mmap_used_first);
-                if (mmap_used_last != 0) {
-                    mapping->unmap_fragment(mmap_used_last, mapping->size);
-                }
-            }
-            if (progress_callback) {
-                // Even though the model is done loading, we still honor
-                // cancellation since we need to free allocations.
-                return progress_callback(1.0f, progress_callback_user_data);
-            }
-        }
-
-        return true;
-    }
-};
-
-template<>
-bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
-    uint32_t tmp;
-    const bool found = get_key(kid, tmp, required);
-    result = (enum llama_pooling_type) tmp;
-    return found;
-}
-
-
-//
-// load LLaMA models
-//
-
-static const char * llama_model_arch_name(llm_arch arch) {
-    auto it = LLM_ARCH_NAMES.find(arch);
-    if (it == LLM_ARCH_NAMES.end()) {
-        return "unknown";
-    }
-    return it->second;
-}
-
-static std::string llama_model_ftype_name(llama_ftype ftype) {
-    if (ftype & LLAMA_FTYPE_GUESSED) {
-        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
-    }
-
-    switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32:     return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
-        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-                                      return "Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
-
-        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_S:  return "IQ2_S - 2.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_M:  return "IQ2_M - 2.7 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
-
-        default: return "unknown, may not work";
-    }
-}
-
-static const char * llama_model_type_name(e_model type) {
-    switch (type) {
-        case MODEL_22M:    return "22M";
-        case MODEL_33M:    return "33M";
-        case MODEL_109M:   return "109M";
-        case MODEL_137M:   return "137M";
-        case MODEL_0_5B:   return "0.5B";
-        case MODEL_1B:     return "1B";
-        case MODEL_2B:     return "2B";
-        case MODEL_3B:     return "3B";
-        case MODEL_7B:     return "7B";
-        case MODEL_8B:     return "8B";
-        case MODEL_13B:    return "13B";
-        case MODEL_14B:    return "14B";
-        case MODEL_15B:    return "15B";
-        case MODEL_20B:    return "20B";
-        case MODEL_30B:    return "30B";
-        case MODEL_34B:    return "34B";
-        case MODEL_40B:    return "40B";
-        case MODEL_65B:    return "65B";
-        case MODEL_70B:    return "70B";
-        case MODEL_SMALL:  return "0.1B";
-        case MODEL_MEDIUM: return "0.4B";
-        case MODEL_LARGE:  return "0.8B";
-        case MODEL_XL:     return "1.5B";
-        default:           return "?B";
-    }
-}
-
-static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
-    switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default:                   return "unknown";
-    }
-}
-
-static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
-    model.arch = ml.get_arch();
-    if (model.arch == LLM_ARCH_UNKNOWN) {
-        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
-    }
-}
-
-static void llm_load_hparams(
-        llama_model_loader & ml,
-        llama_model & model) {
-    auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.ctx_gguf;
-
-    // get metadata as string
-    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
-        enum gguf_type type = gguf_get_kv_type(ctx, i);
-        if (type == GGUF_TYPE_ARRAY) {
-            continue;
-        }
-        const char * name = gguf_get_key(ctx, i);
-        const std::string value = gguf_kv_to_str(ctx, i);
-        model.gguf_kv.emplace(name, value);
-    }
-
-    // get general kv
-    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
-
-    // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
-
-    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
-    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
-    if (hparams.n_expert > 0) {
-        GGML_ASSERT(hparams.n_expert_used > 0);
-    } else {
-        GGML_ASSERT(hparams.n_expert_used == 0);
-    }
-
-    // n_head_kv is optional, default to n_head
-    hparams.n_head_kv = hparams.n_head;
-    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
-
-    bool rope_finetuned = false;
-    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
-    hparams.rope_finetuned = rope_finetuned;
-
-    hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
-
-    // rope_freq_base (optional)
-    hparams.rope_freq_base_train = 10000.0f;
-    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
-
-    std::string rope_scaling("linear");
-    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
-    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
-
-    // rope_freq_scale (inverse of the kv) is optional
-    float ropescale = 0.0f;
-    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
-        // try the old key name
-        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
-    }
-    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
-
-    // sanity check for n_rot (optional)
-    {
-        hparams.n_rot = hparams.n_embd / hparams.n_head;
-
-        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
-
-        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
-            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
-                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
-            }
-        }
-        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
-        // gpt-j n_rot = rotary_dim
-    }
-
-    hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
-    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
-
-    hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
-    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
-
-    // arch-specific KVs
-    switch (model.arch) {
-        case LLM_ARCH_LLAMA:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 22: model.type = e_model::MODEL_1B; break;
-                    case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_13B; break;
-                    case 48: model.type = e_model::MODEL_34B; break;
-                    case 60: model.type = e_model::MODEL_30B; break;
-                    case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MINICPM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 40: model.type = e_model::MODEL_2B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_FALCON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 60: model.type = e_model::MODEL_40B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_BAICHUAN:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_13B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-
-                if (model.type == e_model::MODEL_13B) {
-                    // TODO: become GGUF KV parameter
-                    hparams.f_max_alibi_bias = 8.0f;
-                }
-            } break;
-        case LLM_ARCH_STARCODER:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
-                    case 36: model.type = e_model::MODEL_3B; break;
-                    case 42: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_15B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 36: model.type = e_model::MODEL_8B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_REFACT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_1B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-
-                // TODO: become GGUF KV parameter
-                hparams.f_max_alibi_bias = 8.0f;
-            } break;
-        case LLM_ARCH_BERT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
-
-                switch (hparams.n_layer) {
-                    case 3:
-                        model.type = e_model::MODEL_17M; break; // bge-micro
-                    case 6:
-                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
-                    case 12:
-                        switch (hparams.n_embd) {
-                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
-                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
-                        } break;
-                    case 24:
-                        model.type = e_model::MODEL_335M; break; // bge-large
-                }
-            } break;
-        case LLM_ARCH_NOMIC_BERT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
-
-                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    model.type = e_model::MODEL_137M;
-                }
-            } break;
-        case LLM_ARCH_BLOOM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
-                    case 30:
-                        switch (hparams.n_embd) {
-                            case 2560: model.type = e_model::MODEL_3B; break;
-                            case 4096: model.type = e_model::MODEL_7B; break;
-                        } break;
-                }
-
-                // TODO: become GGUF KV parameter
-                hparams.f_max_alibi_bias = 8.0f;
-            } break;
-        case LLM_ARCH_MPT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
-                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
-
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 48: model.type = e_model::MODEL_30B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_STABLELM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
-                    case 32: model.type = e_model::MODEL_3B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_QWEN:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_13B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
-                    case 80: model.type = e_model::MODEL_70B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PHI2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
-                    case 32: model.type = e_model::MODEL_3B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PLAMO:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 40: model.type = e_model::MODEL_13B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_GPT2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 12: model.type = e_model::MODEL_SMALL; break;
-                    case 24: model.type = e_model::MODEL_MEDIUM; break;
-                    case 36: model.type = e_model::MODEL_LARGE; break;
-                    case 48: model.type = e_model::MODEL_XL; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_CODESHELL:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 42: model.type = e_model::MODEL_SMALL; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_ORION:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 40: model.type = e_model::MODEL_14B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_INTERNLM2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 48: model.type = e_model::MODEL_20B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GEMMA:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 18: model.type = e_model::MODEL_2B; break;
-                    case 28: model.type = e_model::MODEL_7B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_STARCODER2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 30: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_15B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
-        default: (void)0;
-    }
-
-    model.ftype = ml.ftype;
-
-    if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.need_kq_pos = true;
-    }
-
-    hparams.rope_type = llama_rope_type(&model);
-}
-
-// TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
-static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
-
-static void llm_load_vocab(
-        llama_model_loader & ml,
-        llama_model & model) {
-    auto & vocab = model.vocab;
-
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(model.arch);
-
-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
-    // determine vocab type
-    {
-        std::string tokenizer_name;
-
-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
-
-        if (tokenizer_name == "llama") {
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
-
-            // default special tokens
-            vocab.special_bos_id = 1;
-            vocab.special_eos_id = 2;
-            vocab.special_unk_id = 0;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
-
-            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-            if (add_space_prefix_keyidx != -1) {
-                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-            } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "gpt2") {
-            vocab.type = LLAMA_VOCAB_TYPE_BPE;
-
-            // read bpe merges and populate bpe ranks
-            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
-            if (merges_keyidx == -1) {
-                throw std::runtime_error("cannot find tokenizer merges in model file\n");
-            }
-
-            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
-
-            for (int i = 0; i < n_merges; i++) {
-                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
-                GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
-
-                std::string first;
-                std::string second;
-
-                const size_t pos = word.find(' ', 1);
-
-                if (pos != std::string::npos) {
-                    first  = word.substr(0, pos);
-                    second = word.substr(pos + 1);
-                }
-
-                vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
-            }
-
-            // default special tokens
-            vocab.special_bos_id = 11;
-            vocab.special_eos_id = 11;
-            vocab.special_unk_id = -1;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
-        } else if (tokenizer_name == "bert") {
-            vocab.type = LLAMA_VOCAB_TYPE_WPM;
-
-            // default special tokens
-            vocab.special_bos_id = 101;
-            vocab.special_eos_id = 102;
-            vocab.special_unk_id = 100;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
-            vocab.add_space_prefix = false;
-        } else {
-            LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
-        }
-    }
-
-    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
-
-    vocab.id_to_token.resize(n_vocab);
-
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
-
-        vocab.token_to_id[word] = i;
-
-        auto & token_data = vocab.id_to_token[i];
-        token_data.text  = std::move(word);
-        token_data.score = scores ? scores[i] : 0.0f;
-        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
-    }
-    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
-
-    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        try {
-            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
-        } catch (const std::exception & e) {
-            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
-            vocab.linefeed_id = vocab.special_pad_id;
-        }
-    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
-        vocab.linefeed_id = vocab.special_pad_id;
-    } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
-    }
-
-    // special tokens
-    {
-        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
-            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
-            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
-            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
-            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
-        };
-        for (const auto & it : special_token_types) {
-            const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it);
-
-            uint32_t new_id;
-            if (!ml.get_key(std::get<0>(it), new_id, false)) {
-                continue;
-            }
-            if (new_id >= vocab.id_to_token.size()) {
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
-                    __func__, key.c_str(), new_id, id);
-            } else {
-                id = new_id;
-            }
-
-        }
-
-        // Handle add_bos_token and add_eos_token
-        {
-            bool temp = true;
-
-            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
-                vocab.special_add_bos = int(temp);
-            }
-            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
-                vocab.special_add_eos = int(temp);
-            }
-        }
-    }
-
-    // build special tokens cache
-    {
-        // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
-        //  and will always be correctly labeled in 'added_tokens.json' etc.
-        // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
-        //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
-        //  are special tokens.
-        // From testing, this appears to correlate 1:1 with special tokens.
-        //
-
-        // Counting special tokens and verifying in only one direction
-        //  is sufficient to detect difference in those two sets.
-        //
-        uint32_t special_tokens_count_by_type = 0;
-        uint32_t special_tokens_count_from_verification = 0;
-
-        bool special_tokens_definition_mismatch = false;
-
-        for (const auto & t : vocab.token_to_id) {
-            const auto & token = t.first;
-            const auto & id    = t.second;
-
-            // Count all non-normal tokens in the vocab while iterating
-            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
-                special_tokens_count_by_type++;
-            }
-
-            // Skip single character tokens
-            if (token.length() > 1) {
-                bool is_tokenizable = false;
-
-                // Split token string representation in two, in all possible ways
-                //  and check if both halves can be matched to a valid token
-                for (unsigned i = 1; i < token.length();) {
-                    const auto left  = token.substr(0, i);
-                    const auto right = token.substr(i);
-
-                    // check if we didnt partition in the middle of a utf sequence
-                    auto utf = utf8_len(left.at(left.length() - 1));
-
-                    if (utf == 1) {
-                        if (vocab.token_to_id.find(left)  != vocab.token_to_id.end() &&
-                            vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
-                            is_tokenizable = true;
-                            break;
-                        }
-                        i++;
-                    } else {
-                        // skip over the rest of multibyte utf sequence
-                        i += utf - 1;
-                    }
-                }
-
-                if (!is_tokenizable) {
-                    // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
-                    //  it's faster to re-filter them here, since there are way less candidates now
-
-                    // Calculate a total "utf" length of a token string representation
-                    size_t utf8_str_len = 0;
-                    for (unsigned i = 0; i < token.length();) {
-                        utf8_str_len++;
-                        i += utf8_len(token.at(i));
-                    }
-
-                    // And skip the ones which are one character
-                    if (utf8_str_len > 1) {
-                        // At this point what we have left are special tokens only
-                        vocab.special_tokens_cache[token] = id;
-
-                        // Count manually found special tokens
-                        special_tokens_count_from_verification++;
-
-                        // If this manually found special token is not marked as such, flag a mismatch
-                        if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
-                            special_tokens_definition_mismatch = true;
-                        }
-                    }
-                }
-            }
-        }
-
-        if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
-            LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
-                __func__,
-                special_tokens_count_from_verification, vocab.id_to_token.size(),
-                special_tokens_count_by_type, vocab.id_to_token.size()
-            );
-        } else {
-            LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
-                __func__,
-                special_tokens_count_from_verification, vocab.id_to_token.size()
-            );
-        }
-    }
-}
-
-static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
-    const auto & hparams = model.hparams;
-    const auto & vocab   = model.vocab;
-
-    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
-
-    // hparams
-    LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch));
-    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, llama_model_vocab_type_name(vocab.type));
-    LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, hparams.n_vocab);
-    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (int) vocab.bpe_ranks.size());
-    LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
-    LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
-    LLAMA_LOG_INFO("%s: n_head           = %u\n",     __func__, hparams.n_head);
-    LLAMA_LOG_INFO("%s: n_head_kv        = %u\n",     __func__, hparams.n_head_kv);
-    LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
-    LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k);
-    LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v);
-    LLAMA_LOG_INFO("%s: n_gqa            = %u\n",     __func__, hparams.n_gqa());
-    LLAMA_LOG_INFO("%s: n_embd_k_gqa     = %u\n",     __func__, hparams.n_embd_k_gqa());
-    LLAMA_LOG_INFO("%s: n_embd_v_gqa     = %u\n",     __func__, hparams.n_embd_v_gqa());
-    LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
-    LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
-    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
-    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
-    LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
-    LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
-    LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
-    LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
-    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
-    LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
-    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
-    LLAMA_LOG_INFO("%s: n_yarn_orig_ctx  = %u\n",     __func__, hparams.n_yarn_orig_ctx);
-    LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
-    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    if (ml.n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
-    } else if (ml.n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
-    } else if (ml.n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
-    } else {
-        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
-    }
-    if (ml.n_bytes < GiB) {
-        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
-    } else {
-        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
-    }
-
-    // general kv
-    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
-
-    // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
-}
-
-// Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
-        llama_model_loader & ml,
-        llama_model & model,
-        int n_gpu_layers,
-        enum llama_split_mode split_mode,
-        int main_gpu,
-        const float * tensor_split,
-        bool use_mlock,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
-    auto & hparams = model.hparams;
-
-    model.split_mode   = split_mode;
-    model.main_gpu     = main_gpu;
-    model.n_gpu_layers = n_gpu_layers;
-
-    const int64_t n_layer     = hparams.n_layer;
-    const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
-
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(true);
-
-    model.buft_layer.resize(n_layer);
-
-    // assign cpu layers
-    for (int64_t i = 0; i < i_gpu_start; ++i) {
-        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
-    }
-
-    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
-        // calculate the split points
-        int device_count = llama_get_device_count();
-        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-        std::vector<float> splits(device_count);
-        if (all_zero) {
-            // default split, by free memory
-            for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
-            }
-        } else {
-            std::copy(tensor_split, tensor_split + device_count, splits.begin());
-        }
-
-        // sum and normalize the splits to get the split points
-        float split_sum = 0.0f;
-        for (int i = 0; i < device_count; ++i) {
-            split_sum += splits[i];
-            splits[i] = split_sum;
-        }
-        for (int i = 0; i < device_count; ++i) {
-            splits[i] /= split_sum;
-        }
-
-        // assign the repeating layers to the devices according to the splits
-        int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
-        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
-            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
-        }
-        // assign the output layer
-        if (n_gpu_layers > n_layer) {
-            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
-        } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
-        }
-    } else {
-        ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
-        } else {
-            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
-        }
-        // assign the repeating layers
-        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
-            model.buft_layer[i] = {
-                split_buft,
-                llama_default_buffer_type_offload(main_gpu)
-            };
-        }
-        // assign the output layer
-        if (n_gpu_layers > n_layer) {
-            model.buft_output = {
-                split_buft,
-                llama_default_buffer_type_offload(main_gpu)
-            };
-        } else {
-            model.buft_output = llama_default_buffer_type_cpu(true);
-        }
-    }
-
-    // count used buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
-    buft_layer_count[model.buft_input.buft]++;
-    buft_layer_count[model.buft_input.buft_matrix]++;
-    buft_layer_count[model.buft_output.buft]++;
-    buft_layer_count[model.buft_output.buft_matrix]++;
-    for (int64_t i = 0; i < n_layer; ++i) {
-        buft_layer_count[model.buft_layer[i].buft]++;
-        buft_layer_count[model.buft_layer[i].buft_matrix]++;
-    }
-
-    // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    for (auto & it : buft_layer_count) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-        if (!ctx) {
-            throw std::runtime_error(format("failed to create context"));
-        }
-        ctx_map[it.first] = ctx;
-        model.ctxs.push_back(ctx);
-    }
-
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
-
-    // create tensors for the weights
-    {
-        const int64_t n_embd       = hparams.n_embd;
-        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const int64_t n_embd_gqa   = n_embd_v_gqa;
-        const int64_t n_vocab      = hparams.n_vocab;
-        const int64_t n_vocab_type = hparams.n_vocab_type;
-        const int64_t n_ff         = hparams.n_ff;
-
-        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-        ggml_context * ctx_input        = ctx_map.at(model.buft_input.buft);
-        ggml_context * ctx_output       = ctx_map.at(model.buft_output.buft);
-        ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
-        auto ctx_for_layer              = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
-        auto ctx_for_layer_split        = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
-
-        model.layers.resize(n_layer);
-
-        const auto tn = LLM_TN(model.arch);
-        switch (model.arch) {
-            case LLM_ARCH_LLAMA:
-            case LLM_ARCH_REFACT:
-            case LLM_ARCH_MINICPM:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        if (model.arch != LLM_ARCH_MINICPM){
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     false);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, false);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, false);
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     false);
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
-                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
-
-                        if (layer.ffn_gate_inp == nullptr) {
-                            GGML_ASSERT(hparams.n_expert      == 0);
-                            GGML_ASSERT(hparams.n_expert_used == 0);
-
-                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                        } else {
-                            GGML_ASSERT(hparams.n_expert      > 0);
-                            GGML_ASSERT(hparams.n_expert_used > 0);
-
-                            // MoE branch
-                            for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                                layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff});
-                                layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
-                                layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
-                            }
-                        }
-                    }
-                } break;
-            case LLM_ARCH_BAICHUAN:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_FALCON:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab});
-                        } else {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
-                            layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
-                            layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd});
-                        }
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_STARCODER:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, hparams.n_ctx_train});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
-
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff});
-                    }
-                } break;
-            case LLM_ARCH_PERSIMMON:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),  {n_embd, n_vocab});
-
-                    {
-                        model.output_norm    = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b  = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output         = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd});
-                        layer.attn_norm_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "bias",   i), {n_embd});
-
-                        layer.wqkv          = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV,    "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv          = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa});
-
-                        layer.wo            = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,    "weight", i), {n_embd, n_embd});
-                        layer.bo            = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd});
-
-                        layer.ffn_down      = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN,    "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b    = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd});
-
-                        layer.ffn_up        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,      "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b      = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff});
-
-                        layer.ffn_norm      = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd});
-                        layer.ffn_norm_b    = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM,    "bias",   i), {n_embd});
-
-                        layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
-                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {64});
-
-                        layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
-                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {64});
-                    }
-                } break;
-            case LLM_ARCH_BERT:
-            case LLM_ARCH_NOMIC_BERT:
-                {
-                    model.tok_embd     = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
-                    model.type_embd    = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
-                    if (model.arch == LLM_ARCH_BERT) {
-                        model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, hparams.n_ctx_train});
-                    }
-
-                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        if (model.arch == LLM_ARCH_BERT) {
-                            layer.wq   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                            layer.bq   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd});
-
-                            layer.wk   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                            layer.bk   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa});
-
-                            layer.wv   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                            layer.bv   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa});
-                        } else {
-                            layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        }
-
-                        layer.wo              = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd});
-
-                        layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
-                        layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_up          = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff});
-                        layer.ffn_down        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN,      "weight", i), {n_ff, n_embd});
-
-                        if (model.arch == LLM_ARCH_BERT) {
-                            layer.bo         = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-                            layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
-
-                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-                        } else {
-                            layer.ffn_gate   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        }
-
-                        layer.layer_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
-                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd});
-                    }
-                } break;
-            case LLM_ARCH_BLOOM:
-                {
-                    model.tok_embd   = ml.create_tensor(ctx_input,  tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab});
-                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
-
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
-                    }
-                } break;
-            case LLM_ARCH_MPT:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
-
-                        // same as tok_embd, duplicated to allow offloading
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, false);
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
-
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, false);
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, false);
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, false);
-
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, false);
-
-                        // AWQ ScaleActivation layer
-                        layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
-                    }
-                } break;
-            case LLM_ARCH_STABLELM:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm =   ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        // optional bias tensors, present in Stable LM 2 1.6B
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     false);
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, false);
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, false);
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_QWEN:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3});
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2});
-                    }
-                } break;
-            case LLM_ARCH_QWEN2:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_PHI2:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                        model.output_b      = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
-
-                        if (layer.wqkv == nullptr) {
-                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
-
-                            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa});
-
-                            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa});
-                        }
-
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
-                    }
-                } break;
-            case LLM_ARCH_PLAMO:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_GPT2:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"),   {n_embd, hparams.n_ctx_train});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
-
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
-                    }
-                } break;
-            case LLM_ARCH_CODESHELL:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
-
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff});
-                    }
-                } break;
-            case LLM_ARCH_ORION:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_INTERNLM2:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                    }
-                } break;
-            case LLM_ARCH_GEMMA:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
-
-                    const int64_t n_ff          = hparams.n_ff;
-                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
-                    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
-                    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
-
-                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
-
-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                    }
-                } break;
-            case LLM_ARCH_STARCODER2:
-                {
-                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                    // output
-                    {
-                        model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
-                        }
-
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        ggml_context * ctx_layer = ctx_for_layer(i);
-                        ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-
-                        // optional bias tensors
-                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
-                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
-                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
-                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
-                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-
-                        // optional bias tensors
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff});
-                    }
-                } break;
-            default:
-                throw std::runtime_error("unknown architecture");
-        }
-    }
-
-    ml.done_getting_tensors();
-
-    ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
-
-    // create the backend buffers
-    std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
-
-    for (auto & it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
-        ggml_backend_buffer_t buf = nullptr;
-
-        // only the mmap region containing the tensors in the model is mapped to the backend buffer
-        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
-        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
-            size_t first, last;
-            ml.get_mapping_range(&first, &last, ctx);
-            buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
-        }
-#ifdef GGML_USE_METAL
-        else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
-            const size_t max_size = ggml_get_max_tensor_size(ctx);
-            size_t first, last;
-            ml.get_mapping_range(&first, &last, ctx);
-            buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
-        }
-#endif
-        else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-            if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
-                model.mlock_bufs.emplace_back(new llama_mlock);
-                auto & mlock_buf = model.mlock_bufs.back();
-                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
-                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
-            }
-        }
-        if (buf == nullptr) {
-            throw std::runtime_error("failed to allocate buffer");
-        }
-        // indicate that this buffer contains weights
-        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
-        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-        model.bufs.push_back(buf);
-        ctx_bufs.emplace_back(ctx, buf);
-    }
-
-    if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
-        }
-
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers       = hparams.n_layer + 1;
-
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    }
-
-    // print memory requirements
-    for (ggml_backend_buffer_t buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
-    }
-
-    // populate tensors_by_name
-    for (ggml_context * ctx : model.ctxs) {
-        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-            model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
-        }
-    }
-
-    // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        ggml_backend_buffer_t buf = it.second;
-        if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
-            return false;
-        }
-    }
-
-    model.mapping = std::move(ml.mapping);
-
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
-    return true;
-}
-
-// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
-    try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
-
-        model.hparams.vocab_only = params.vocab_only;
-
-        try {
-            llm_load_arch(ml, model);
-        } catch(const std::exception & e) {
-            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
-        }
-        try {
-            llm_load_hparams(ml, model);
-        } catch(const std::exception & e) {
-            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
-        }
-        try {
-            llm_load_vocab(ml, model);
-        } catch(const std::exception & e) {
-            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
-        }
-
-        llm_load_print_meta(ml, model);
-
-        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
-            throw std::runtime_error("vocab size mismatch");
-        }
-
-        if (params.vocab_only) {
-            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return 0;
-        }
-
-#ifdef GGML_USE_KOMPUTE
-        if (params.n_gpu_layers > 0 && (
-            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
-            || !(
-                model.ftype == LLAMA_FTYPE_ALL_F32 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
-                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
-            )
-        )) {
-            // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
-            LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
-            params.n_gpu_layers = 0;
-        }
-#endif
-
-        if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
-            params.progress_callback, params.progress_callback_user_data
-        )) {
-            return -2;
-        }
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
-    }
-
-    return 0;
-}
-
-//
-// llm_build
-//
-
-using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
-
-enum llm_ffn_op_type {
-    LLM_FFN_SILU,
-    LLM_FFN_GELU,
-    LLM_FFN_RELU,
-    LLM_FFN_RELU_SQR,
-};
-
-enum llm_ffn_gate_type {
-    LLM_FFN_SEQ,
-    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
-};
-
-enum llm_norm_type {
-    LLM_NORM,
-    LLM_NORM_RMS,
-};
-
-static struct ggml_tensor * llm_build_inp_embd(
-        struct ggml_context * ctx,
-        const llama_hparams & hparams,
-          const llama_batch & batch,
-         struct ggml_tensor * tok_embd,
-         struct ggml_tensor * inp_tokens,
-         struct ggml_tensor * inp_embd,
-         const llm_build_cb & cb) {
-    const int64_t n_embd = hparams.n_embd;
-
-    struct ggml_tensor * inpL;
-
-    if (batch.token) {
-        struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
-    }
-
-    return inpL;
-}
-
-static void llm_build_kv_store(
-        struct ggml_context * ctx,
-        const llama_hparams & hparams,
-       const llama_kv_cache & kv,
-         struct ggml_cgraph * graph,
-         struct ggml_tensor * k_cur,
-         struct ggml_tensor * v_cur,
-                    int64_t   n_ctx,
-                    int32_t   n_tokens,
-                    int32_t   kv_head,
-         const llm_build_cb & cb,
-                    int64_t   il) {
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
-    //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
-    cb(v_cur_t, "v_cur_t", il);
-
-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
-            (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
-    cb(k_cache_view, "k_cache_view", il);
-
-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
-            (  n_ctx)*ggml_element_size(kv.v_l[il]),
-            (kv_head)*ggml_element_size(kv.v_l[il]));
-    cb(v_cache_view, "v_cache_view", il);
-
-    // important: storing RoPE-ed version of K in the KV cache!
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur,   k_cache_view));
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
-}
-
-static struct ggml_tensor * llm_build_norm(
-        struct ggml_context * ctx,
-         struct ggml_tensor * cur,
-        const llama_hparams & hparams,
-         struct ggml_tensor * mw,
-         struct ggml_tensor * mb,
-              llm_norm_type   type,
-         const llm_build_cb & cb,
-                        int   il) {
-    switch (type) {
-        case LLM_NORM:     cur = ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
-        case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
-    }
-
-    if (mw || mb) {
-        cb(cur, "norm", il);
-    }
-
-    if (mw) {
-        cur = ggml_mul(ctx, cur, mw);
-        if (mb) {
-            cb(cur, "norm_w", il);
-        }
-    }
-
-    if (mb) {
-        cur = ggml_add(ctx, cur, mb);
-    }
-
-    return cur;
-}
-
-static struct ggml_tensor * llm_build_ffn(
-        struct ggml_context * ctx,
-         struct ggml_tensor * cur,
-         struct ggml_tensor * up,
-         struct ggml_tensor * up_b,
-         struct ggml_tensor * gate,
-         struct ggml_tensor * gate_b,
-         struct ggml_tensor * down,
-         struct ggml_tensor * down_b,
-         struct ggml_tensor * act_scales,
-            llm_ffn_op_type   type_op,
-          llm_ffn_gate_type   type_gate,
-         const llm_build_cb & cb,
-                        int   il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
-    cb(tmp, "ffn_up", il);
-
-    if (up_b) {
-        tmp = ggml_add(ctx, tmp, up_b);
-        cb(tmp, "ffn_up_b", il);
-    }
-
-    if (gate) {
-        switch (type_gate) {
-            case LLM_FFN_SEQ:
-                {
-                    cur = ggml_mul_mat(ctx, gate, tmp);
-                    cb(cur, "ffn_gate", il);
-                } break;
-            case LLM_FFN_PAR:
-                {
-                    cur = ggml_mul_mat(ctx, gate, cur);
-                    cb(cur, "ffn_gate", il);
-                } break;
-        }
-
-        if (gate_b) {
-            cur = ggml_add(ctx, cur, gate_b);
-            cb(cur, "ffn_gate_b", il);
-        }
-    } else {
-        cur = tmp;
-    }
-
-    switch (type_op) {
-        case LLM_FFN_SILU:
-            {
-                cur = ggml_silu(ctx, cur);
-                cb(cur, "ffn_silu", il);
-            } break;
-        case LLM_FFN_GELU:
-            {
-                cur = ggml_gelu(ctx, cur);
-                cb(cur, "ffn_gelu", il);
-                if (act_scales != NULL) {
-                    cur = ggml_div(ctx, cur, act_scales);
-                    cb(cur, "ffn_act", il);
-                }
-            } break;
-        case LLM_FFN_RELU:
-            {
-                cur = ggml_relu(ctx, cur);
-                cb(cur, "ffn_relu", il);
-            } break;
-        case LLM_FFN_RELU_SQR:
-            {
-                cur = ggml_relu(ctx, cur);
-                cb(cur, "ffn_relu", il);
-
-                cur = ggml_sqr(ctx, cur);
-                cb(cur, "ffn_sqr(relu)", il);
-            } break;
-    }
-
-    if (type_gate == LLM_FFN_PAR) {
-        cur = ggml_mul(ctx, cur, tmp);
-        cb(cur, "ffn_gate_par", il);
-    }
-
-    cur = ggml_mul_mat(ctx, down, cur);
-    if (down_b) {
-        cb(cur, "ffn_down", il);
-    }
-
-    if (down_b) {
-        cur = ggml_add(ctx, cur, down_b);
-    }
-
-    return cur;
-}
-
-// if max_alibi_bias > 0 then apply ALiBi
-static struct ggml_tensor * llm_build_kqv(
-        struct ggml_context * ctx,
-          const llama_model & model,
-        const llama_hparams & hparams,
-       const llama_kv_cache & kv,
-         struct ggml_cgraph * graph,
-         struct ggml_tensor * wo,
-         struct ggml_tensor * wo_b,
-         struct ggml_tensor * q_cur,
-         struct ggml_tensor * kq_mask,
-         struct ggml_tensor * kq_pos,
-                    int64_t   n_ctx,
-                    int32_t   n_tokens,
-                    int32_t   n_kv,
-                    float     kq_scale,
-         const llm_build_cb & cb,
-                    int       il) {
-    const int64_t n_head        = hparams.n_head;
-    const int64_t n_head_kv     = hparams.n_head_kv;
-    const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
-    const int64_t n_embd_head_v = hparams.n_embd_head_v;
-
-    struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
-    cb(q, "q", il);
-
-    struct ggml_tensor * k =
-        ggml_view_3d(ctx, kv.k_l[il],
-                n_embd_head_k, n_kv, n_head_kv,
-                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
-                0);
-    cb(k, "k", il);
-
-    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
-    cb(kq, "kq", il);
-
-    if (model.arch == LLM_ARCH_PHI2) {
-        // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
-        // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-    }
-
-#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
-#pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
-    if (hparams.f_max_alibi_bias > 0.0f) {
-        kq = ggml_scale(ctx, kq, kq_scale);
-        cb(kq, "kq_scaled", il);
-
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-
-        kq = ggml_add(ctx, kq, kq_mask);
-        cb(kq, "kq_masked", il);
-
-        kq = ggml_soft_max(ctx, kq);
-        cb(kq, "kq_soft_max", il);
-    } else
-#endif
-    {
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-        cb(kq, "kq_soft_max_ext", il);
-    }
-
-    // split cached v into n_head heads
-    struct ggml_tensor * v =
-        ggml_view_3d(ctx, kv.v_l[il],
-                n_kv, n_embd_head_v, n_head_kv,
-                ggml_element_size(kv.v_l[il])*n_ctx,
-                ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
-                0);
-    cb(v, "v", il);
-
-    struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
-    cb(kqv, "kqv", il);
-
-    struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
-    cb(kqv_merged, "kqv_merged", il);
-
-    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
-    cb(cur, "kqv_merged_cont", il);
-
-    ggml_build_forward_expand(graph, cur);
-
-    cur = ggml_mul_mat(ctx, wo, cur);
-    if (wo_b) {
-        cb(cur, "kqv_wo", il);
-    }
-
-    if (wo_b) {
-        cur = ggml_add(ctx, cur, wo_b);
-    }
-
-    return cur;
-}
-
-static struct ggml_tensor * llm_build_kv(
-        struct ggml_context * ctx,
-          const llama_model & model,
-        const llama_hparams & hparams,
-       const llama_kv_cache & kv,
-         struct ggml_cgraph * graph,
-         struct ggml_tensor * wo,
-         struct ggml_tensor * wo_b,
-         struct ggml_tensor * k_cur,
-         struct ggml_tensor * v_cur,
-         struct ggml_tensor * q_cur,
-         struct ggml_tensor * kq_mask,
-         struct ggml_tensor * kq_pos,
-                    int64_t   n_ctx,
-                    int32_t   n_tokens,
-                    int32_t   kv_head,
-                    int32_t   n_kv,
-                    float     kq_scale,
-         const llm_build_cb & cb,
-                    int       il) {
-
-    // these nodes are added to the graph together so that they are not reordered
-    // by doing so, the number of splits in the graph is reduced
-    ggml_build_forward_expand(graph, q_cur);
-    ggml_build_forward_expand(graph, k_cur);
-    ggml_build_forward_expand(graph, v_cur);
-
-    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
-
-    struct ggml_tensor * cur;
-    cur  = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
-    cb(cur, "kqv_out", il);
-
-    return cur;
-}
-
-struct llm_build_context {
-    const llama_model    & model;
-    const llama_context  & lctx;
-    const llama_hparams  & hparams;
-    const llama_cparams  & cparams;
-    const llama_batch    & batch;
-    const llama_kv_cache & kv_self;
-
-    const int64_t n_embd;
-    const int64_t n_layer;
-    const int64_t n_rot;
-    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_head;
-    const int64_t n_head_kv;
-    const int64_t n_embd_head_k;
-    const int64_t n_embd_k_gqa;
-    const int64_t n_embd_head_v;
-    const int64_t n_embd_v_gqa;
-    const int64_t n_expert;
-    const int64_t n_expert_used;
-
-    const float freq_base;
-    const float freq_scale;
-    const float ext_factor;
-    const float attn_factor;
-    const float beta_fast;
-    const float beta_slow;
-    const float norm_eps;
-    const float norm_rms_eps;
-
-    const int32_t n_tokens;
-    const int32_t n_kv;     // size of KV cache to consider (n_kv <= n_ctx)
-    const int32_t kv_head;  // index of where we store new KV data in the cache
-    const int32_t n_orig_ctx;
-
-    const enum llama_pooling_type pooling_type;
-    const enum llama_rope_type    rope_type;
-
-    const llm_build_cb & cb;
-
-    std::vector<uint8_t> & buf_compute_meta;
-
-    struct ggml_context * ctx0 = nullptr;
-
-    // TODO: consider making the entire interface noexcept
-    llm_build_context(
-        llama_context  & lctx,
-    const llama_batch  & batch,
-    const llm_build_cb & cb,
-                  bool   worst_case) :
-        model            (lctx.model),
-        lctx             (lctx),
-        hparams          (model.hparams),
-        cparams          (lctx.cparams),
-        batch            (batch),
-        kv_self          (lctx.kv_self),
-        n_embd           (hparams.n_embd),
-        n_layer          (hparams.n_layer),
-        n_rot            (hparams.n_rot),
-        n_ctx            (cparams.n_ctx),
-        n_head           (hparams.n_head),
-        n_head_kv        (hparams.n_head_kv),
-        n_embd_head_k    (hparams.n_embd_head_k),
-        n_embd_k_gqa     (hparams.n_embd_k_gqa()),
-        n_embd_head_v    (hparams.n_embd_head_v),
-        n_embd_v_gqa     (hparams.n_embd_v_gqa()),
-        n_expert         (hparams.n_expert),
-        n_expert_used    (hparams.n_expert_used),
-        freq_base        (cparams.rope_freq_base),
-        freq_scale       (cparams.rope_freq_scale),
-        ext_factor       (cparams.yarn_ext_factor),
-        attn_factor      (cparams.yarn_attn_factor),
-        beta_fast        (cparams.yarn_beta_fast),
-        beta_slow        (cparams.yarn_beta_slow),
-        norm_eps         (hparams.f_norm_eps),
-        norm_rms_eps     (hparams.f_norm_rms_eps),
-        n_tokens         (batch.n_tokens),
-        n_kv             (worst_case ? n_ctx            : kv_self.n),
-        kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
-        n_orig_ctx       (cparams.n_yarn_orig_ctx),
-        pooling_type     (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
-        rope_type        (hparams.rope_type),
-        cb               (cb),
-        buf_compute_meta (lctx.buf_compute_meta) {
-            // all initializations should be done in init()
-        }
-
-    void init() {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ buf_compute_meta.size(),
-            /*.mem_buffer =*/ buf_compute_meta.data(),
-            /*.no_alloc   =*/ true,
-        };
-
-        ctx0 = ggml_init(params);
-    }
-
-    void free() {
-        if (ctx0) {
-            ggml_free(ctx0);
-            ctx0 = nullptr;
-        }
-    }
-
-    struct ggml_cgraph * build_k_shift() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * tmp =
-                // we rotate only the first n_rot dimensions
-                ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k_l[il],
-                            n_embd_head_k, n_head_kv, n_ctx,
-                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                            0),
-                        lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-            cb(tmp, "K_shifted", il);
-            ggml_build_forward_expand(gf, tmp);
-        }
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            for (int il = 0; il < n_layer; ++il) {
-                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
-
-                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
-
-                ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                        ggml_row_size(kv_self.v_l[il]->type, i));
-
-                ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                        ggml_row_size(kv_self.v_l[il]->type, id));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-            }
-
-            i += nm - 1;
-        }
-
-        //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            if (model.layers[il].ffn_gate_inp == nullptr) {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            } else {
-                // MoE branch
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
-                cb(logits, "ffn_moe_logits", il);
-
-                ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
-                cb(probs, "ffn_moe_probs", il);
-
-                // select experts
-                ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
-                cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
-                ggml_tensor * weights = ggml_get_rows(ctx0,
-                        ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-                cb(weights, "ffn_moe_weights", il);
-
-                weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-                ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-                cb(weights_sum, "ffn_moe_weights_sum", il);
-
-                weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-                cb(weights, "ffn_moe_weights_norm", il);
-
-                // compute expert outputs
-                ggml_tensor * moe_out = nullptr;
-
-                for (int i = 0; i < n_expert_used; ++i) {
-                    ggml_tensor * cur_expert;
-
-                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
-                    cb(cur_up, "ffn_moe_up", il);
-
-                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
-                    cb(cur_gate, "ffn_moe_gate", il);
-
-                    cur_gate = ggml_silu(ctx0, cur_gate);
-                    cb(cur_gate, "ffn_moe_silu", il);
-
-                    cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
-                    cb(cur_expert, "ffn_moe_gate_par", il);
-
-                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-                    cb(cur_expert, "ffn_moe_down", il);
-
-                    cur_expert = ggml_mul(ctx0, cur_expert,
-                            ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-                    cb(cur_expert, "ffn_moe_weighted", il);
-
-                    if (i == 0) {
-                        moe_out = cur_expert;
-                    } else {
-                        moe_out = ggml_add(ctx0, moe_out, cur_expert);
-                        cb(moe_out, "ffn_moe_out", il);
-                    }
-                }
-
-                cur = moe_out;
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
-        cb(KQ_pos, "KQ_pos", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                switch (model.type) {
-                    case MODEL_7B:
-                        Qcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                        );
-                        Kcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                        );
-                        break;
-                    case MODEL_13B:
-                        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
-                        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
-                        break;
-                    default:
-                        GGML_ASSERT(false);
-                }
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * attn_norm;
-
-            attn_norm = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(attn_norm, "attn_norm", il);
-
-            // self-attention
-            {
-                if (model.layers[il].attn_norm_2) {
-                    // Falcon-40B
-                    cur = llm_build_norm(ctx0, inpL, hparams,
-                            model.layers[il].attn_norm_2,
-                            model.layers[il].attn_norm_2_b,
-                            LLM_NORM, cb, il);
-                    cb(cur, "attn_norm_2", il);
-                } else {
-                    cur = attn_norm;
-                }
-
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-
-                // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = cur;
-
-            // feed forward
-            {
-                cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
-                        model.layers[il].ffn_up,   NULL,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            cur = ggml_add(ctx0, cur, inpL);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        // norm
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * pos;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
-        cb(pos, "pos_embd", -1);
-
-        inpL = ggml_add(ctx0, inpL, pos);
-        cb(inpL, "inpL", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // FF
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
-        }
-
-        cur = llm_build_norm(ctx0, inpL, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head   == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * residual = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self attention
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                // split qkv
-                GGML_ASSERT(n_head_kv == n_head);
-
-                struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
-                cb(tmpqkv, "tmpqkv", il);
-
-                struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
-                cb(tmpqkv_perm, "tmpqkv", il);
-
-                struct ggml_tensor * tmpq = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        0
-                        );
-                cb(tmpq, "tmpq", il);
-
-                struct ggml_tensor * tmpk = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
-                        );
-                cb(tmpk, "tmpk", il);
-
-                // Q/K Layernorm
-                tmpq = llm_build_norm(ctx0, tmpq, hparams,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpq, "tmpq", il);
-
-                tmpk = llm_build_norm(ctx0, tmpk, hparams,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpk, "tmpk", il);
-
-                // RoPE the first n_rot of q/k, pass the other half, and concat.
-                struct ggml_tensor * qrot = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        0
-                        );
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        0
-                        );
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        ggml_element_size(tmpq) * n_rot
-                        );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        ggml_element_size(tmpk) * n_rot
-                        );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                    ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                    ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-                cb(Q, "Q", il);
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
-                        );
-                cb(Vcur, "Vcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
-        cb(KQ_pos, "KQ_pos", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                cb(Kcur, "Kcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                cb(Qcur, "Qcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_bert() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        // get input vectors with right size
-        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
-        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
-
-        // construct input embeddings (token, type, position)
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-
-        // token types are hardcoded to zero ("Sentence A")
-        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
-        inpL = ggml_add(ctx0, inpL, type_row0);
-        if (model.arch == LLM_ARCH_BERT) {
-            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
-        }
-        cb(inpL, "inp_embd", -1);
-
-        // embed layer norm
-        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
-        cb(inpL, "inp_norm", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
-
-        // iterate layers
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
-
-            // self-attention
-            if (model.arch == LLM_ARCH_BERT) {
-                struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
-                cb(Vcur, "Vcur", il);
-
-                // seems like we just need to do this for Q?
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            } else {
-                // compute Q and K and RoPE them
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // re-add the layer input
-            cur = ggml_add(ctx0, cur, inpL);
-
-            // attention layer norm
-            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
-
-            struct ggml_tensor * ffn_inp = cur;
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            if (model.arch == LLM_ARCH_BERT) {
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            } else {
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            }
-            cb(cur, "ffn_out", il);
-
-            // attentions bypass the intermediate layer
-            cur = ggml_add(ctx0, cur, ffn_inp);
-
-            // output layer norm
-            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        // final output
-        cur = inpL;
-
-        // pooling layer
-        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
-            cur = ggml_get_rows(ctx0, cur, inp_cls);
-        } else {
-            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
-        }
-        cb(cur, "result_embd", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
-        cb(KQ_pos, "KQ_pos", -1);
-
-        inpL = llm_build_norm(ctx0, inpL, hparams,
-                model.tok_norm,
-                model.tok_norm_b,
-                LLM_NORM, cb, -1);
-        cb(inpL, "inp_norm", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // Add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // FF
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
-        }
-
-        cur = llm_build_norm(ctx0, inpL, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
-        cb(KQ_pos, "KQ_pos", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * attn_norm;
-
-            attn_norm = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(attn_norm, "attn_norm", il);
-
-            // self-attention
-            {
-                cur = attn_norm;
-
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                if (model.layers[il].bqkv){
-                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                    cb(cur, "bqkv", il);
-                }
-
-                if (hparams.f_clamp_kqv > 0.0f) {
-                    cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
-                    cb(cur, "wqkv_clamped", il);
-                }
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // Add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed forward
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        model.layers[il].ffn_act,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_stablelm() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_qwen() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-
-                // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward forward
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_qwen2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                cb(Vcur, "Vcur", il);
-
-                // these nodes are added to the graph together so that they are not reordered
-                // by doing so, the number of splits in the graph is reduced
-                ggml_build_forward_expand(gf, Qcur);
-                ggml_build_forward_expand(gf, Kcur);
-                ggml_build_forward_expand(gf, Vcur);
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_phi2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * attn_norm_output;
-        struct ggml_tensor * ffn_output;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(attn_norm_output, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = nullptr;
-                struct ggml_tensor * Kcur = nullptr;
-                struct ggml_tensor * Vcur = nullptr;
-
-                if (model.layers[il].wqkv) {
-                    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
-                    cb(cur, "wqkv", il);
-
-                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                    cb(cur, "bqkv", il);
-
-                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-                } else {
-                    Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
-                    Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
-                    Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
-                }
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                // with phi2, we scale the Q to avoid precision issues
-                // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
-                Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // FF
-            {
-                ffn_output = llm_build_ffn(ctx0, attn_norm_output,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(ffn_output, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_output);
-            cb(cur, "l_out", il);
-
-            cur = ggml_add(ctx0, cur, inpL);
-            cb(cur, "l_out", il);
-
-            inpL = cur;
-        }
-
-        cur = llm_build_norm(ctx0, inpL, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output_no_bias", -1);
-
-        cur = ggml_add(ctx0, cur, model.output_b);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_plamo() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            struct ggml_tensor * attention_norm = cur;
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_rope_custom(
-                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens), inp_pos,
-                        n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
-                        n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-            struct ggml_tensor * sa_out = cur;
-
-            cur = attention_norm;
-
-            // feed-forward network
-            {
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, sa_out);
-            cb(cur, "l_out", il);
-
-            cur = ggml_add(ctx0, cur, inpL);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_gpt2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * pos;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
-        cb(pos, "pos_embd", -1);
-
-        inpL = ggml_add(ctx0, inpL, pos);
-        cb(inpL, "inpL", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // FF
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
-        }
-
-        cur = llm_build_norm(ctx0, inpL, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_codeshell() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
-
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
-
-                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-                cb(tmpq, "tmpq", il);
-                cb(tmpk, "tmpk", il);
-                cb(Vcur, "Vcur", il);
-
-                struct ggml_tensor * Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // FF
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            inpL = ggml_add(ctx0, cur, ffn_inp);
-            cb(inpL, "l_out", il);
-        }
-
-        cur = llm_build_norm(ctx0, inpL, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_orion() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                // if (model.layers[il].bq) {
-                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                //     cb(Qcur, "Qcur", il);
-                // }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                // if (model.layers[il].bk) {
-                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                //     cb(Kcur, "Kcur", il);
-                // }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                // if (model.layers[il].bv) {
-                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                //     cb(Vcur, "Vcur", il);
-                // }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_internlm2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    // ref: https://arxiv.org/abs/2203.03466
-    //      https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
-    // based on the original build_llama() function
-    struct ggml_cgraph * build_minicpm() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        const int64_t n_embd = hparams.n_embd;
-        //TODO: if the model varies, these parameters need to be read from the model
-        const int64_t n_embd_base = 256;
-        const float scale_embd  = 12.0f;
-        const float scale_depth = 1.4f;
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // scale the input embeddings
-        inpL = ggml_scale(ctx0, inpL, scale_embd);
-        cb(inpL, "inp_scaled", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            // scale_res - scale the hidden states for residual connection
-            const float scale_res = scale_depth/sqrtf(float(n_layer));
-            cur = ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled", -1);
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            // scale the hidden states for residual connection
-            cur = ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled_ffn", -1);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head scaling
-        const float scale_lmhead = float(n_embd_base)/float(n_embd);
-        cur = ggml_scale(ctx0, cur, scale_lmhead);
-        cb(cur, "lmhead_scaling", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_gemma() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-        cb(inpL, "inp_scaled", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Qcur = ggml_rope_custom(
-                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head,    n_tokens), inp_pos,
-                        n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-                cb(Qcur, "Qcur", il);
-
-                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
-                cb(Qcur, "Qcur_scaled", il);
-
-                Kcur = ggml_rope_custom(
-                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
-                        n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                        ext_factor, attn_factor, beta_fast, beta_slow);
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
-            cb(sa_out, "sa_out", il);
-
-            cur = llm_build_norm(ctx0, sa_out, hparams,
-                    model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, sa_out);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_starcoder2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            cb(cur, "ffn_out", il);
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-};
-
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-    llama_batch dummy;
-    dummy.n_tokens = 0;
-
-    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
-
-    struct llm_build_context llm(lctx, dummy, cb, false);
-
-    llm.init();
-
-    struct ggml_cgraph * result = llm.build_defrag(ids);
-
-    llm.free();
-
-    return result;
-}
-
-static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
-    llama_batch dummy;
-    dummy.n_tokens = 0;
-
-    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
-
-    struct llm_build_context llm(lctx, dummy, cb, false);
-
-    llm.init();
-
-    struct ggml_cgraph * result = llm.build_k_shift();
-
-    llm.free();
-
-    return result;
-}
-
-static struct ggml_cgraph * llama_build_graph(
-         llama_context & lctx,
-     const llama_batch & batch,
-                  bool   worst_case) {
-    const auto & model = lctx.model;
-
-    // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
-    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
-        if (il >= 0) {
-            ggml_format_name(cur, "%s-%d", name, il);
-        } else {
-            ggml_set_name(cur, name);
-        }
-
-        if (!lctx.cparams.offload_kqv) {
-            if (strcmp(name, "kqv_merged_cont") == 0) {
-                // all nodes between the KV store and the attention output are run on the CPU
-                ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
-            }
-        }
-    };
-
-    struct ggml_cgraph * result = NULL;
-
-    struct llm_build_context llm(lctx, batch, cb, worst_case);
-
-    llm.init();
-
-    switch (model.arch) {
-        case LLM_ARCH_LLAMA:
-            {
-                result = llm.build_llama();
-            } break;
-        case LLM_ARCH_BAICHUAN:
-            {
-                result = llm.build_baichuan();
-            } break;
-        case LLM_ARCH_FALCON:
-            {
-                result = llm.build_falcon();
-            } break;
-        case LLM_ARCH_STARCODER:
-            {
-                result = llm.build_starcoder();
-            } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                result = llm.build_persimmon();
-            } break;
-        case LLM_ARCH_REFACT:
-            {
-                result = llm.build_refact();
-            } break;
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_NOMIC_BERT:
-            {
-                result = llm.build_bert();
-            } break;
-        case LLM_ARCH_BLOOM:
-            {
-                result = llm.build_bloom();
-            } break;
-        case LLM_ARCH_MPT:
-            {
-                result = llm.build_mpt();
-            } break;
-         case LLM_ARCH_STABLELM:
-            {
-                result = llm.build_stablelm();
-            } break;
-        case LLM_ARCH_QWEN:
-            {
-                result = llm.build_qwen();
-            } break;
-        case LLM_ARCH_QWEN2:
-            {
-                result = llm.build_qwen2();
-            } break;
-        case LLM_ARCH_PHI2:
-            {
-                result = llm.build_phi2();
-            } break;
-        case LLM_ARCH_PLAMO:
-            {
-                result = llm.build_plamo();
-            } break;
-        case LLM_ARCH_GPT2:
-            {
-                result = llm.build_gpt2();
-            } break;
-        case LLM_ARCH_CODESHELL:
-            {
-                result = llm.build_codeshell();
-            } break;
-        case LLM_ARCH_ORION:
-            {
-                result = llm.build_orion();
-            } break;
-        case LLM_ARCH_INTERNLM2:
-            {
-                result = llm.build_internlm2();
-            } break;
-        case LLM_ARCH_MINICPM:
-            {
-                result = llm.build_minicpm();
-            } break;
-        case LLM_ARCH_GEMMA:
-            {
-                result = llm.build_gemma();
-            } break;
-        case LLM_ARCH_STARCODER2:
-            {
-                result = llm.build_starcoder2();
-            } break;
-        default:
-            GGML_ASSERT(false);
-    }
-
-    llm.free();
-
-    return result;
-}
-
-static void llama_set_k_shift(llama_context & lctx) {
-    const auto & cparams = lctx.cparams;
-
-    const int64_t n_ctx = cparams.n_ctx;
-
-    assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
-
-    int32_t * data = (int32_t *) lctx.inp_K_shift->data;
-
-    for (int i = 0; i < n_ctx; ++i) {
-        data[i] = lctx.kv_self.cells[i].delta;
-    }
-}
-
-static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-    //
-    // set input data
-    //
-
-    const auto & hparams = lctx.model.hparams;
-    const auto & cparams = lctx.cparams;
-    const auto & kv_self = lctx.kv_self;
-
-    if (batch.token) {
-        const int64_t n_tokens = batch.n_tokens;
-
-        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
-    }
-
-    if (batch.embd) {
-        const int64_t n_embd   = hparams.n_embd;
-        const int64_t n_tokens = batch.n_tokens;
-
-        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
-    }
-
-    if (batch.pos) {
-        const int64_t n_tokens = batch.n_tokens;
-
-        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
-    }
-
-    {
-        const int64_t n_kv     = kv_self.n;
-        const int64_t n_tokens = batch.n_tokens;
-
-        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
-
-        float * data = (float *) lctx.inp_KQ_mask->data;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j][0];
-
-                for (int i = 0; i < n_kv; ++i) {
-                    float f;
-                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
-                        (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
-                        f = -INFINITY;
-                    } else {
-                        f = 0;
-                    }
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
-                }
-            }
-        }
-    }
-
-    if (hparams.need_kq_pos) {
-        const int64_t n_kv = kv_self.n;
-
-        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
-        float * data = (float *) lctx.inp_KQ_pos->data;
-
-        for (int i = 0; i < n_kv; ++i) {
-            data[i] = float(lctx.kv_self.cells[i].pos);
-        }
-    }
-
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
-        const int64_t n_tokens = batch.n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
-        float * data = (float *) lctx.inp_mean->data;
-
-        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
-
-        std::vector<uint64_t> sum(n_tokens, 0);
-        for (int i = 0; i < n_tokens; ++i) {
-            const llama_seq_id seq_id = batch.seq_id[i][0];
-            sum[seq_id] += 1;
-        }
-
-        std::vector<float> div(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
-            const uint64_t s = sum[i];
-            if (s > 0) {
-                div[i] = 1.0f/float(s);
-            }
-        }
-
-        for (int i = 0; i < n_tokens; ++i) {
-            const llama_seq_id seq_id = batch.seq_id[i][0];
-            data[seq_id*n_tokens + i] = div[seq_id];
-        }
-    }
-
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
-        const int64_t n_tokens = batch.n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
-        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
-
-        for (int i = 0; i < n_tokens; ++i) {
-            const llama_seq_id seq_id = batch.seq_id[i][0];
-            const llama_pos pos = batch.pos[i];
-            if (pos == 0) {
-                data[seq_id] = i;
-            }
-        }
-    }
-}
-
-static void llama_graph_compute(
-        llama_context & lctx,
-          ggml_cgraph * gf,
-                  int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
-        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
-    }
-#endif
-
-    if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
-    }
-
-    ggml_backend_sched_graph_compute(lctx.sched, gf);
-
-    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
-}
-
-// decode a batch of tokens by evaluating the transformer
-//
-//   - lctx:      llama context
-//   - batch:     batch to evaluate
-//
-// return 0 on success
-// return positive int on warning
-// return negative int on error
-//
-static int llama_decode_internal(
-         llama_context & lctx,
-           llama_batch   batch) {
-    const uint32_t n_tokens = batch.n_tokens;
-
-    if (n_tokens == 0) {
-        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
-        return -1;
-    }
-
-    const auto & model   = lctx.model;
-    const auto & hparams = model.hparams;
-    const auto & cparams = lctx.cparams;
-
-    const auto n_batch = cparams.n_batch;
-
-    GGML_ASSERT(n_tokens <= n_batch);
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
-
-    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-
-    const int64_t t_start_us = ggml_time_us();
-
-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    GGML_ASSERT(n_threads > 0);
-
-    auto & kv_self = lctx.kv_self;
-
-    const int64_t n_embd  = hparams.n_embd;
-    const int64_t n_vocab = hparams.n_vocab;
-
-    // helpers for smoother batch API transition
-    // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos> pos;
-
-    std::vector<int32_t>                   n_seq_id;
-    std::vector<llama_seq_id *>            seq_id_arr;
-    std::vector<std::vector<llama_seq_id>> seq_id;
-
-    if (batch.pos == nullptr) {
-        pos.resize(n_tokens);
-        for (uint32_t i = 0; i < n_tokens; i++) {
-            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
-        }
-
-        batch.pos = pos.data();
-    }
-
-    if (batch.seq_id == nullptr) {
-        n_seq_id.resize(n_tokens);
-        seq_id.resize(n_tokens);
-        seq_id_arr.resize(n_tokens);
-        for (uint32_t i = 0; i < n_tokens; i++) {
-            n_seq_id[i] = 1;
-            seq_id[i].resize(1);
-            seq_id[i][0] = batch.all_seq_id;
-            seq_id_arr[i] = seq_id[i].data();
-        }
-
-        batch.n_seq_id = n_seq_id.data();
-        batch.seq_id = seq_id_arr.data();
-    }
-
-    llama_kv_cache_update(&lctx);
-
-    // if we have enough unused cells before the current head ->
-    //   better to start searching from the beginning of the cache, hoping to fill it
-    if (kv_self.head > kv_self.used + 2*n_tokens) {
-        kv_self.head = 0;
-    }
-
-    if (!llama_kv_cache_find_slot(kv_self, batch)) {
-        return 1;
-    }
-
-    // a heuristic, to avoid attending the full cache if it is not yet utilized
-    // after enough generations, the benefit from this heuristic disappears
-    // if we start defragmenting the cache, the benefit from this will be more important
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
-    //kv_self.n = llama_kv_cache_cell_max(kv_self);
-
-    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
-
-    ggml_backend_sched_reset(lctx.sched);
-    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
-
-    ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
-
-    // the output is always the last tensor in the graph
-    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    if (strcmp(res->name, "result_output") == 0) {
-        // the embeddings could be the second to last tensor, or the third to last tensor
-        if (strcmp(embeddings->name, "result_norm") != 0) {
-            embeddings = gf->nodes[gf->n_nodes - 3];
-            GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-        }
-    } else if (strcmp(res->name, "result_embd") == 0) {
-        embeddings = res;
-        res = nullptr;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
-
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    //       with the BLAS calls. need a better solution
-    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
-
-    llama_set_inputs(lctx, batch);
-
-    llama_graph_compute(lctx, gf, n_threads);
-
-    // update the kv ring buffer
-    {
-        kv_self.head += n_tokens;
-
-        // Ensure kv cache head points to a valid index.
-        if (kv_self.head >= kv_self.size) {
-            kv_self.head = 0;
-        }
-    }
-
-    // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
-
-        // queue defragmentation for next llama_kv_cache_update
-        if (fragmentation > cparams.defrag_thold) {
-            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
-
-            llama_kv_cache_defrag(kv_self);
-        }
-    }
-
-#ifdef GGML_PERF
-    // print timing information per ggml operation (for debugging purposes)
-    // requires GGML_PERF to be defined
-    ggml_graph_print(gf);
-#endif
-
-    // plot the computation graph in dot format (for debugging purposes)
-    //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
-    //}
-
-    // extract logits
-    // TODO: do not compute and extract logits if only embeddings are needed
-    //       need to update the graphs to skip "result_output"
-    if (res) {
-        auto & logits_out = lctx.logits;
-
-#ifndef NDEBUG
-        auto & logits_valid = lctx.logits_valid;
-        logits_valid.clear();
-        logits_valid.resize(n_tokens);
-
-        logits_out.clear();
-#endif
-
-        ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
-        GGML_ASSERT(res_backend != nullptr);
-        if (batch.logits) {
-            logits_out.resize(n_vocab * n_tokens);
-            for (uint32_t i = 0; i < n_tokens; i++) {
-                if (batch.logits[i] == 0) {
-                    continue;
-                }
-                ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
-#ifndef NDEBUG
-                logits_valid[i] = true;
-#endif
-            }
-        } else if (lctx.logits_all) {
-            logits_out.resize(n_vocab * n_tokens);
-            ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
-#ifndef NDEBUG
-            std::fill(logits_valid.begin(), logits_valid.end(), true);
-#endif
-        } else {
-            logits_out.resize(n_vocab);
-            ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
-#ifndef NDEBUG
-            logits_valid[0] = true;
-#endif
-        }
-        ggml_backend_synchronize(res_backend);
-    }
-
-    // extract embeddings
-    if (!lctx.embedding.empty()) {
-        auto & embedding_out = lctx.embedding;
-
-        const int64_t embd_pos  = res ? n_embd * (n_tokens-1) : 0;
-        const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
-
-        embedding_out.resize(embd_size);
-        ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
-        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
-        ggml_backend_synchronize(embeddings_backend);
-    }
-
-    // measure the performance only for the single-token evals
-    if (n_tokens == 1) {
-        lctx.t_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_eval++;
-    }
-    else if (n_tokens > 1) {
-        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval += n_tokens;
-    }
-
-    // get a more accurate load time, upon first eval
-    // TODO: fix this
-    if (!lctx.has_evaluated_once) {
-        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
-        lctx.has_evaluated_once = true;
-    }
-
-    return 0;
-}
-
-// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
-    auto & kv_self = lctx.kv_self;
-
-    const auto & hparams = lctx.model.hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    const uint32_t n_kv   = llama_kv_cache_cell_max(kv_self);
-    const uint32_t n_used = kv_self.used;
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // determine which KV cells to move where
-    //
-    //  cell i moves to ids[i]
-    //
-    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
-    //
-    std::vector<uint32_t> ids(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        const auto & cell0 = kv_self.cells[i0];
-
-        if (!cell0.is_empty()) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
-            nh++;
-        }
-
-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            const auto & cell1 = kv_self.cells[is];
-
-            if (cell1.is_empty() || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            auto & cell1 = kv_self.cells[i1];
-
-            if (cell1.is_empty() || ids[i1] != n_kv) {
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            // move the cell meta data
-            kv_self.cells[i0 + nf] = cell1;
-
-            // clear the old cell and move the head there
-            cell1 = llama_kv_cell();
-            kv_self.head = n_used;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return;
-    }
-
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
-
-#if 0
-    // CPU defrag
-    //
-    // TODO: optimizations are possible:
-    //       - multiple threads
-    //       - avoid copying to the host memory when already there
-    //
-    // likely not worth the effort, as we have ggml_graph based defrag
-    //
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    const uint32_t kv_size = kv_self.size;
-
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
-        const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
-
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        const size_t v_size    = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
-
-        buf_k.resize(k_size);
-        buf_v.resize(v_size);
-
-        ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
-
-        // batch move [i, i+nm) to [id, id+nm)
-        // note: cells can move only to a lower index
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == n_kv) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < n_kv && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            // move keys
-            {
-                const int64_t os =  i*k_size_row;
-                const int64_t od = id*k_size_row;
-
-                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
-            }
-
-            // move values (note: they are transposed)
-            {
-                const int64_t os =  i;
-                const int64_t od = id;
-
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
-                }
-            }
-
-            i += nm - 1;
-        }
-
-        ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
-    }
-#else
-    // ggml_graph defrag
-
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
-
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
-#endif
-
-    //const int64_t t_end = ggml_time_us();
-
-    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
-}
-
-static void llama_kv_cache_update_internal(struct llama_context & lctx) {
-    // apply K-shift if needed
-    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
-        llama_set_k_shift(lctx);
-
-        {
-            ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
-
-            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
-        }
-
-        {
-            auto & kv_self = lctx.kv_self;
-
-            kv_self.has_shift = false;
-
-            for (uint32_t i = 0; i < kv_self.size; ++i) {
-                kv_self.cells[i].delta = 0;
-            }
-        }
-    }
-
-    // defragment the KV cache if needed
-    if (lctx.kv_self.do_defrag) {
-        llama_kv_cache_defrag_internal(lctx);
-
-        lctx.kv_self.do_defrag = false;
-    }
-}
-
-//
-// tokenizer
-//
-
-static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
-    return vocab.type;
-}
-
-static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
-}
-
-static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
-}
-
-static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
-}
-
-static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
-    GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
-    switch (llama_vocab_get_type(vocab)) {
-    case LLAMA_VOCAB_TYPE_SPM: {
-        auto buf = token_data.text.substr(3, 2);
-        return strtol(buf.c_str(), NULL, 16);
-    }
-    case LLAMA_VOCAB_TYPE_BPE: {
-        GGML_ASSERT(false);
-        return unicode_to_bytes_bpe(token_data.text);
-    }
-    case LLAMA_VOCAB_TYPE_WPM: {
-        GGML_ASSERT(false);
-    }
-    default:
-        GGML_ASSERT(false);
-    }
-}
-
-static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-    static const char * hex = "0123456789ABCDEF";
-    switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM: {
-            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            auto token = vocab.token_to_id.find(buf);
-            if (token != vocab.token_to_id.end()) {
-                return (*token).second;
-            }
-            // Try to fall back to just the byte as a string
-            const char buf2[2] = { (char)ch, 0 };
-            return vocab.token_to_id.at(buf2);
-        }
-        case LLAMA_VOCAB_TYPE_WPM:
-        case LLAMA_VOCAB_TYPE_BPE: {
-            return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
-        }
-        default:
-            GGML_ASSERT(false);
-    }
-}
-
-static void llama_escape_whitespace(std::string & text) {
-    replace_all(text, " ", "\xe2\x96\x81");
-}
-
-static void llama_unescape_whitespace(std::string & word) {
-    replace_all(word, "\xe2\x96\x81", " ");
-}
-
-struct llm_symbol {
-    using index = int;
-    index prev;
-    index next;
-    const char * text;
-    size_t n;
-};
-
-static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
-
-// SPM tokenizer
-// original implementation:
-// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
-
-struct llm_bigram_spm {
-    struct comparator {
-        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
-            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
-        }
-    };
-    using queue_storage = std::vector<llm_bigram_spm>;
-    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
-    llm_symbol::index left;
-    llm_symbol::index right;
-    float score;
-    size_t size;
-};
-
-struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
-
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars
-        int index = 0;
-        size_t offs = 0;
-        while (offs < text.size()) {
-            llm_symbol sym;
-            size_t len = utf8_len(text[offs]);
-            sym.text = text.c_str() + offs;
-            sym.n = std::min(len, text.size() - offs);
-            offs += sym.n;
-            sym.prev = index - 1;
-            sym.next = offs == text.size() ? -1 : index + 1;
-            index++;
-            symbols.emplace_back(sym);
-        }
-
-        // seed the work queue with all possible 2-character tokens.
-        for (size_t i = 1; i < symbols.size(); ++i) {
-            try_add_bigram(i - 1, i);
-        }
-
-        // keep substituting the highest frequency pairs for as long as we can.
-        while (!work_queue.empty()) {
-            auto bigram = work_queue.top();
-            work_queue.pop();
-
-            auto & left_sym = symbols[bigram.left];
-            auto & right_sym = symbols[bigram.right];
-
-            // if one of the symbols already got merged, skip it.
-            if (left_sym.n == 0 || right_sym.n == 0 ||
-                left_sym.n + right_sym.n != bigram.size) {
-                continue;
-            }
-
-            // merge the right sym into the left one
-            left_sym.n += right_sym.n;
-            right_sym.n = 0;
-
-            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
-
-            // remove the right sym from the chain
-            left_sym.next = right_sym.next;
-            if (right_sym.next >= 0) {
-                symbols[right_sym.next].prev = bigram.left;
-            }
-
-            // find more substitutions
-            try_add_bigram(left_sym.prev, bigram.left);
-            try_add_bigram(bigram.left, left_sym.next);
-        }
-
-        for (int i = 0; i != -1; i = symbols[i].next) {
-            auto & symbol = symbols[i];
-            resegment(symbol, output);
-        }
-    }
-
-private:
-    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
-        auto text = std::string(symbol.text, symbol.n);
-        auto token = vocab.token_to_id.find(text);
-
-        // Do we need to support is_unused?
-        if (token != vocab.token_to_id.end()) {
-            output.push_back((*token).second);
-            return;
-        }
-
-        const auto p = rev_merge.find(text);
-
-        if (p == rev_merge.end()) {
-            // output any symbols that did not form tokens as bytes.
-            output.reserve(output.size() + symbol.n);
-            for (int j = 0; j < (int)symbol.n; ++j) {
-                llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
-                output.push_back(token_id);
-            }
-            return;
-        }
-
-        resegment(symbols[p->second.first],  output);
-        resegment(symbols[p->second.second], output);
-    }
-
-    void try_add_bigram(int left, int right) {
-        if (left == -1 || right == -1) {
-            return;
-        }
-
-        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
-        auto token = vocab.token_to_id.find(text);
-
-        if (token == vocab.token_to_id.end()) {
-            return;
-        }
-
-        if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
-            return;
-        }
-
-        const auto & tok_data = vocab.id_to_token[(*token).second];
-
-        llm_bigram_spm bigram;
-        bigram.left  = left;
-        bigram.right = right;
-        bigram.score = tok_data.score;
-        bigram.size  = text.size();
-
-        work_queue.push(bigram);
-
-        // Do we need to support is_unused?
-        rev_merge[text] = std::make_pair(left, right);
-    }
-
-    const llama_vocab & vocab;
-
-    std::vector<llm_symbol> symbols;
-    llm_bigram_spm::queue work_queue;
-
-    std::map<std::string, std::pair<int, int>> rev_merge;
-};
-
-// BPE tokenizer
-// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
-// tried to simplify unicode stuff, so most likely does not work 100% correctly!
-
-// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
-
-struct llm_bigram_bpe {
-    struct comparator {
-        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
-            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
-        }
-    };
-
-    using queue_storage = std::vector<llm_bigram_bpe>;
-    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
-    llm_symbol::index left;
-    llm_symbol::index right;
-    std::string text;
-    int rank;
-    size_t size;
-};
-
-struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
-
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        int final_prev_index = -1;
-        auto word_collection = bpe_gpt2_preprocess(text);
-
-        symbols_final.clear();
-
-        for (auto & word : word_collection) {
-            work_queue = llm_bigram_bpe::queue();
-            symbols.clear();
-
-            int index = 0;
-            size_t offset = 0;
-
-            while (offset < word.size()) {
-                llm_symbol sym;
-                size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
-                sym.text = word.c_str() + offset;
-                sym.n = char_len;
-                offset += sym.n;
-                sym.prev = index - 1;
-                sym.next = offset == word.size() ? -1 : index + 1;
-                index++;
-                symbols.emplace_back(sym);
-            }
-            for (size_t i = 1; i < symbols.size(); ++i) {
-                add_new_bigram(i - 1, i);
-            }
-
-            // build token(s)
-            while (!work_queue.empty()) {
-                auto bigram = work_queue.top();
-                work_queue.pop();
-
-                auto & left_symbol = symbols[bigram.left];
-                auto & right_symbol = symbols[bigram.right];
-
-                if (left_symbol.n == 0 || right_symbol.n == 0) {
-                    continue;
-                }
-                std::string left_token = std::string(left_symbol.text, left_symbol.n);
-                std::string right_token = std::string(right_symbol.text, right_symbol.n);
-                if (left_token + right_token != bigram.text) {
-                    continue;  // Skip this bigram if it's outdated
-                }
-
-                // merge the right sym into the left one
-                left_symbol.n += right_symbol.n;
-                right_symbol.n = 0;
-
-                // remove the right sym from the chain
-                left_symbol.next = right_symbol.next;
-                if (right_symbol.next >= 0) {
-                    symbols[right_symbol.next].prev = bigram.left;
-                }
-
-                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
-                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
-            }
-
-            // add the fnished tokens to the final list keeping correct order for next and prev
-            for (auto & sym : symbols) {
-                if (sym.n > 0) {
-                    sym.prev = final_prev_index;
-                    sym.next = -1;
-                    if (final_prev_index != -1) {
-                        symbols_final[final_prev_index].next = symbols_final.size();
-                    }
-                    symbols_final.emplace_back(sym);
-                    final_prev_index = symbols_final.size() - 1;
-                }
-            }
-        }
-
-        symbols = symbols_final;
-
-        if (!symbols.empty()) {
-            for (int i = 0; i != -1; i = symbols[i].next) {
-                auto & symbol = symbols[i];
-                if (symbol.n == 0) {
-                    continue;
-                }
-
-                const std::string str = std::string(symbol.text, symbol.n);
-                const auto token = vocab.token_to_id.find(str);
-
-                if (token == vocab.token_to_id.end()) {
-                    for (auto j = str.begin(); j != str.end(); ++j) {
-                        std::string byte_str(1, *j);
-                        auto token_multibyte = vocab.token_to_id.find(byte_str);
-                        if (token_multibyte == vocab.token_to_id.end()) {
-                            throw std::runtime_error("ERROR: byte not found in vocab");
-                        }
-                        output.push_back((*token_multibyte).second);
-                    }
-                } else {
-                    output.push_back((*token).second);
-                }
-            }
-        }
-    }
-
-private:
-    void add_new_bigram(int left, int right) {
-        if (left == -1 || right == -1) {
-            return;
-        }
-
-        std::string left_token  = std::string(symbols[left].text,  symbols[left].n);
-        std::string right_token = std::string(symbols[right].text, symbols[right].n);
-
-        int rank_found = -1;
-
-        rank_found = vocab.find_bpe_rank(left_token, right_token);
-
-        if (rank_found < 0) {
-            return;
-        }
-
-        llm_bigram_bpe bigram;
-
-        bigram.left  = left;
-        bigram.right = right;
-        bigram.text  = left_token + right_token;
-        bigram.size  = left_token.size() + right_token.size();
-        bigram.rank  = rank_found;
-
-        work_queue.push(bigram);
-    }
-
-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        std::string token = "";
-        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
-
-        std::vector<std::string> text_utf;
-        text_utf.reserve(text.size());
-        bpe_words.reserve(text.size());
-        bpe_encoded_words.reserve(text.size());
-
-        auto cps = codepoints_from_utf8(text);
-        for (size_t i = 0; i < cps.size(); ++i)
-            text_utf.emplace_back(codepoint_to_utf8(cps[i]));
-
-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
-            // forward backward lookups
-            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_words.emplace_back(token);
-                    token = "";
-                    i++;
-                    continue;
-                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next + utf_char_next_next;
-                    bpe_words.emplace_back(token); // the contraction
-                    token = "";
-                    i += 2;
-                    continue;
-                }
-            }
-
-            if (!split_condition && !collecting) {
-                if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    split_condition = true;
-                }
-            }
-
-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
-            }
-
-            if (split_condition) {
-                if (token.size()) {
-                    bpe_words.emplace_back(token);
-                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
-            }
-            else {
-                token += utf_char;
-            }
-        }
-
-        for (std::string & word : bpe_words) {
-            std::string encoded_token = "";
-            for (char & c : word) {
-                encoded_token += bytes_to_unicode_bpe(c);
-            }
-            bpe_encoded_words.emplace_back(encoded_token);
-        }
-
-        return bpe_encoded_words;
-    }
-
-    const llama_vocab & vocab;
-
-    std::vector<llm_symbol> symbols;
-    std::vector<llm_symbol> symbols_final;
-
-    llm_bigram_bpe::queue work_queue;
-};
-
-struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
-
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        auto * token_map = &vocab.token_to_id;
-
-        // normalize and split by whitespace
-        std::vector<std::string> words = preprocess(text);
-
-        // bos token prepended already
-
-        // find the longest tokens that form the words
-        for (const std::string &word : words) {
-            // skip empty words
-            if (word.size() == 0) {
-                continue;
-            }
-
-            // prepend phantom space
-            std::string word1 = "\xe2\x96\x81" + word;
-            int n = word1.size();
-
-            // we're at the start of a new word
-            int i = 0;
-            bool match_any = false;
-
-            // move through character position in word
-            while (i < n) {
-                // loop through possible match length
-                bool match = false;
-                for (int j = n; j > i; j--) {
-                    auto it = token_map->find(word1.substr(i, j - i));
-                    if (it != token_map->end()) {
-                        output.push_back(it->second);
-                        match = true;
-                        match_any = true;
-                        i = j;
-                        break;
-                    }
-                }
-
-                // must be an unknown character
-                if (!match) {
-                    i++;
-                }
-            }
-
-            // we didn't find any matches for this word
-            if (!match_any) {
-                output.push_back(vocab.special_unk_id);
-            }
-        }
-
-        // append eos token
-        output.push_back(vocab.special_eos_id);
-    }
-
-    std::vector<std::string> preprocess(const std::string & text) {
-        // normalalization form D
-        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
-        std::vector<uint32_t> nfd_codepoints;
-        for (uint32_t code : codepoints) {
-            auto it = nfd_map.equal_range(code);
-            if (it.first != it.second) {
-                for (auto jt = it.first; jt != it.second; jt++) {
-                    nfd_codepoints.push_back(jt->second);
-                }
-            } else {
-                nfd_codepoints.push_back(code);
-            }
-        }
-
-        // strip accents, strip control, uniformize whitespace,
-        // to lowercase, pad chinese characters, pad punctuation
-        std::string new_str = "";
-        for (uint32_t code : nfd_codepoints) {
-            int type = codepoint_type(code);
-            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
-                continue;
-            }
-            code = to_lower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
-                code = ' ';
-            }
-            std::string s = codepoint_to_utf8(code);
-            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
-                new_str += " ";
-                new_str += s;
-                new_str += " ";
-            } else {
-                new_str += s;
-            }
-        }
-
-        // split by whitespace
-        uint64_t l = 0;
-        uint64_t r = 0;
-        std::vector<std::string> words;
-        while (r < new_str.size()) {
-            // if is whitespace
-            if (isspace(new_str[r])) {
-                if (r > l) words.push_back(new_str.substr(l, (r - l)));
-                l = r + 1;
-                r = l;
-            }
-            else {
-                r += 1;
-            }
-        }
-        if (r > l) {
-            words.push_back(new_str.substr(l, (r - l)));
-        }
-        return words;
-    }
-
-    uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-        if (code > 0xFFFF) {
-            return code;
-        }
-#endif
-        return std::tolower(wchar_t(code), locale);
-    }
-
-    bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
-    }
-
-    bool is_chinese_char(uint32_t codepoint) {
-        if ((codepoint >= 0x4E00  && codepoint <= 0x9FFF)  ||
-            (codepoint >= 0x3400  && codepoint <= 0x4DBF)  ||
-            (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
-            (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
-            (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
-            (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
-            (codepoint >= 0xF900  && codepoint <= 0xFAFF)  ||
-            (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
-            (codepoint >= 0x3000  && codepoint <= 0x303F)  ||
-            (codepoint >= 0xFF00  && codepoint <= 0xFFEF)) {
-            return true; // NOLINT
-        }
-        return false;
-    }
-
-    const llama_vocab & vocab;
-};
-
-typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
-    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
-    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
-} FRAGMENT_BUFFER_VARIANT_TYPE;
-
-struct fragment_buffer_variant {
-    fragment_buffer_variant(llama_vocab::id _token)
-    :
-        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
-        token(_token),
-        raw_text(_dummy),
-        offset(0),
-        length(0) {}
-
-    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
-    :
-        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        token((llama_vocab::id) - 1),
-        raw_text(_raw_text),
-        offset(_offset),
-        length(_length){
-            GGML_ASSERT(_offset >= 0);
-            GGML_ASSERT(_length >= 1);
-            GGML_ASSERT(offset + length <= raw_text.length());
-        }
-
-    const FRAGMENT_BUFFER_VARIANT_TYPE type;
-    const llama_vocab::id token;
-    const std::string _dummy;
-    const std::string & raw_text;
-    const uint64_t offset;
-    const uint64_t length;
-};
-
-// #define PRETOKENIZERDEBUG
-
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
-    // for each special token
-    for (const auto & st: vocab.special_tokens_cache) {
-        const auto & special_token = st.first;
-        const auto & special_id    = st.second;
-
-        // for each text fragment
-        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
-        while (it != buffer.end()) {
-            auto & fragment = (*it);
-
-            // if a fragment is text ( not yet processed )
-            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto * raw_text = &(fragment.raw_text);
-
-                auto raw_text_base_offset = fragment.offset;
-                auto raw_text_base_length = fragment.length;
-
-                // loop over the text
-                while (true) {
-                    // find the first occurrence of a given special token in this fragment
-                    //  passing offset argument only limit the "search area" but match coordinates
-                    //  are still relative to the source full raw_text
-                    auto match = raw_text->find(special_token, raw_text_base_offset);
-
-                    // no occurrences found, stop processing this fragment for a given special token
-                    if (match == std::string::npos) break;
-
-                    // check if match is within bounds of offset <-> length
-                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
-
-#ifdef PRETOKENIZERDEBUG
-                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
-                    auto source = std::distance(buffer.begin(), it);
-
-                    // if match is further than base offset
-                    //  then we have some text to the left of it
-                    if (match > raw_text_base_offset) {
-                        // left
-                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        const int64_t left_reminder_length = match - raw_text_base_offset;
-                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
-#endif
-                        it++;
-                    }
-
-                    // special token
-                    buffer.emplace_after(it, special_id);
-                    it++;
-
-                    // right
-                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        const int64_t right_reminder_offset = match + special_token.length();
-                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
-#endif
-
-                        it++;
-
-                        if (source == 0) {
-                            buffer.erase_after(buffer.before_begin());
-                        } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
-                        }
-
-                        // repeat for the right side
-                        raw_text_base_offset = right_reminder_offset;
-                        raw_text_base_length = right_reminder_length;
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
-                    } else {
-                        if (source == 0) {
-                            buffer.erase_after(buffer.before_begin());
-                        } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
-                        }
-                        break;
-                    }
-                }
-            }
-            it++;
-        }
-    }
-}
-
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
-    std::vector<llama_vocab::id> output;
-
-    // OG tokenizer behavior:
-    //
-    // tokenizer.encode('', add_bos=True)  returns [1]
-    // tokenizer.encode('', add_bos=False) returns []
-
-    if (bos && vocab.special_bos_id != -1) {
-        output.push_back(vocab.special_bos_id);
-    }
-
-    if (raw_text.empty()) {
-        return output;
-    }
-
-    std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
-
-    if (special) tokenizer_st_partition(vocab, fragment_buffer);
-
-    switch (vocab.type) {
-        case LLAMA_VOCAB_TYPE_SPM:
-            {
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
-                        // TODO: It's likely possible to get rid of this string copy entirely
-                        //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-                        //  and passing 'add space prefix' as bool argument
-                        //
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                        if (&fragment == &fragment_buffer.front()) {
-                            if (vocab.add_space_prefix) {
-                                raw_text = " " + raw_text; // prefix with space if the first token is not special
-                            }
-                        }
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        llm_tokenizer_spm tokenizer(vocab);
-                        llama_escape_whitespace(raw_text);
-                        tokenizer.tokenize(raw_text, output);
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
-                    }
-                }
-            } break;
-        case LLAMA_VOCAB_TYPE_BPE:
-            {
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        llm_tokenizer_bpe tokenizer(vocab);
-                        tokenizer.tokenize(raw_text, output);
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
-                    }
-                }
-            } break;
-        case LLAMA_VOCAB_TYPE_WPM:
-            {
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        llm_tokenizer_wpm tokenizer(vocab);
-                        tokenizer.tokenize(raw_text, output);
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
-                    }
-                }
-            } break;
-    }
-
-    return output;
-}
-
-//
-// grammar - internal
-//
-
-struct llama_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>>   rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8                                      partial_utf8;
-};
-
-struct llama_grammar_candidate {
-    size_t               index;
-    const uint32_t     * code_points;
-    llama_partial_utf8   partial_utf8;
-};
-
-// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
-// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const std::string & src,
-        llama_partial_utf8   partial_start) {
-    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char          * pos      = src.c_str();
-    std::vector<uint32_t> code_points;
-    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
-    code_points.reserve(src.size() + 1);
-    uint32_t              value    = partial_start.value;
-    int                   n_remain = partial_start.n_remain;
-
-    // continue previous decode, if applicable
-    while (*pos != 0 && n_remain > 0) {
-        uint8_t next_byte = static_cast<uint8_t>(*pos);
-        if ((next_byte >> 6) != 2) {
-            // invalid sequence, abort
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
-        }
-        value = (value << 6) + (next_byte & 0x3F);
-        ++pos;
-        --n_remain;
-    }
-
-    if (partial_start.n_remain > 0 && n_remain == 0) {
-        code_points.push_back(value);
-    }
-
-    // decode any subsequent utf-8 sequences, which may end in an incomplete one
-    while (*pos != 0) {
-        uint8_t  first_byte = static_cast<uint8_t>(*pos);
-        uint8_t  highbits   = first_byte >> 4;
-                 n_remain   = lookup[highbits] - 1;
-
-        if (n_remain < 0) {
-            // invalid sequence, abort
-            code_points.clear();
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
-        }
-
-        uint8_t  mask       = (1 << (7 - n_remain)) - 1;
-                 value      = first_byte & mask;
-        ++pos;
-        while (*pos != 0 && n_remain > 0) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-            ++pos;
-            --n_remain;
-        }
-        if (n_remain == 0) {
-            code_points.push_back(value);
-        }
-    }
-    code_points.push_back(0);
-
-    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
-}
-
-// returns true iff pos points to the end of one of the definitions of a rule
-static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
-    switch (pos->type) {
-        case LLAMA_GRETYPE_END: return true;  // NOLINT
-        case LLAMA_GRETYPE_ALT: return true;  // NOLINT
-        default:                return false;
-    }
-}
-
-// returns true iff chr satisfies the char range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
-        const llama_grammar_element * pos,
-        const uint32_t                chr) {
-
-    bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
-
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
-
-    do {
-        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            found = found || (pos->value <= chr && chr <= pos[1].value);
-            pos += 2;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            found = found || pos->value == chr;
-            pos += 1;
-        }
-    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
-
-    return std::make_pair(found == is_positive_char, pos);
-}
-
-// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
-// range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static bool llama_grammar_match_partial_char(
-        const llama_grammar_element * pos,
-        const llama_partial_utf8      partial_utf8) {
-
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
-
-    uint32_t partial_value = partial_utf8.value;
-    int      n_remain      = partial_utf8.n_remain;
-
-    // invalid sequence or 7-bit char split across 2 bytes (overlong)
-    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
-        return false;
-    }
-
-    // range of possible code points this partial UTF-8 sequence could complete to
-    uint32_t low  = partial_value << (n_remain * 6);
-    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
-
-    if (low == 0) {
-        if (n_remain == 2) {
-            low = 1 << 11;
-        } else if (n_remain == 3) {
-            low = 1 << 16;
-        }
-    }
-
-    do {
-        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            if (pos->value <= high && low <= pos[1].value) {
-                return is_positive_char;
-            }
-            pos += 2;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            if (low <= pos->value && pos->value <= high) {
-                return is_positive_char;
-            }
-            pos += 1;
-        }
-    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
-
-    return !is_positive_char;
-}
-
-
-// transforms a grammar pushdown stack into N possible stacks, all ending
-// at a character range (terminal element)
-static void llama_grammar_advance_stack(
-        const std::vector<std::vector<llama_grammar_element>>   & rules,
-        const std::vector<const llama_grammar_element *>        & stack,
-        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
-
-    if (stack.empty()) {
-        new_stacks.emplace_back(stack);
-        return;
-    }
-
-    const llama_grammar_element * pos = stack.back();
-
-    switch (pos->type) {
-        case LLAMA_GRETYPE_RULE_REF: {
-            const size_t                  rule_id = static_cast<size_t>(pos->value);
-            const llama_grammar_element * subpos  = rules[rule_id].data();
-            do {
-                // init new stack without the top (pos)
-                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
-                    // if this rule ref is followed by another element, add that to stack
-                    new_stack.push_back(pos + 1);
-                }
-                if (!llama_grammar_is_end_of_sequence(subpos)) {
-                    // if alternate is nonempty, add to stack
-                    new_stack.push_back(subpos);
-                }
-                llama_grammar_advance_stack(rules, new_stack, new_stacks);
-                while (!llama_grammar_is_end_of_sequence(subpos)) {
-                    // scan to end of alternate def
-                    subpos++;
-                }
-                if (subpos->type == LLAMA_GRETYPE_ALT) {
-                    // there's another alternate def of this rule to process
-                    subpos++;
-                } else {
-                    break;
-                }
-            } while (true);
-            break;
-        }
-        case LLAMA_GRETYPE_CHAR:
-        case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.emplace_back(stack);
-            break;
-        default:
-            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
-            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
-            // those
-            GGML_ASSERT(false);
-    }
-}
-
-// takes a set of possible pushdown stacks on a grammar, which are required to
-// be positioned at a character range (see `llama_grammar_advance_stack`), and
-// produces the N possible stacks if the given char is accepted at those
-// positions
-static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t                                                  chr) {
-
-    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
-
-    for (const auto & stack : stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(rules, new_stack, new_stacks);
-        }
-    }
-
-    return new_stacks;
-}
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const std::vector<llama_grammar_candidate>                    & candidates);
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
-        const std::vector<std::vector<llama_grammar_element>> & rules,
-        const std::vector<const llama_grammar_element *>      & stack,
-        const std::vector<llama_grammar_candidate>            & candidates) {
-
-    std::vector<llama_grammar_candidate> rejects;
-
-    if (stack.empty()) {
-        for (const auto & tok : candidates) {
-            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
-                rejects.push_back(tok);
-            }
-        }
-        return rejects;
-    }
-
-    const llama_grammar_element * stack_pos = stack.back();
-
-    std::vector<llama_grammar_candidate> next_candidates;
-    for (const auto & tok : candidates) {
-        if (*tok.code_points == 0) {
-            // reached end of full codepoints in token, reject iff it ended in a partial sequence
-            // that cannot satisfy this position in grammar
-            if (tok.partial_utf8.n_remain != 0 &&
-                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
-                rejects.push_back(tok);
-            }
-        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
-        } else {
-            rejects.push_back(tok);
-        }
-    }
-
-    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
-
-    // update top of stack to next element, if any
-    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
-    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
-        stack_after.push_back(stack_pos_after);
-    }
-    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
-    llama_grammar_advance_stack(rules, stack_after, next_stacks);
-
-    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
-    }
-
-    return rejects;
-}
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const std::vector<llama_grammar_candidate>                    & candidates) {
-    GGML_ASSERT(!stacks.empty()); // REVIEW
-
-    if (candidates.empty()) {
-        return std::vector<llama_grammar_candidate>();
-    }
-
-    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
-
-    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
-        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
-    }
-    return rejects;
-}
-
-//
-// grammar - external
-//
-
-struct llama_grammar * llama_grammar_init(
-            const llama_grammar_element ** rules,
-                                 size_t    n_rules,
-                                 size_t    start_rule_index) {
-    const llama_grammar_element * pos;
-
-    // copy rule definitions into vectors
-    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
-    for (size_t i = 0; i < n_rules; i++) {
-        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
-            vec_rules[i].push_back(*pos);
-        }
-        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
-    }
-
-    // loop over alternates of start rule to build initial stacks
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-    pos = rules[start_rule_index];
-    do {
-        std::vector<const llama_grammar_element *> stack;
-        if (!llama_grammar_is_end_of_sequence(pos)) {
-            // if alternate is nonempty, add to stack
-            stack.push_back(pos);
-        }
-        llama_grammar_advance_stack(vec_rules, stack, stacks);
-        while (!llama_grammar_is_end_of_sequence(pos)) {
-            // scan to end of alternate def
-            pos++;
-        }
-        if (pos->type == LLAMA_GRETYPE_ALT) {
-            // there's another alternate def of this rule to process
-            pos++;
-        } else {
-            break;
-        }
-    } while (true);
-
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
-}
-
-void llama_grammar_free(struct llama_grammar * grammar) {
-    delete grammar;
-}
-
-struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
-
-    // redirect elements in stacks to point to new rules
-    for (size_t is = 0; is < result->stacks.size(); is++) {
-        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
-                         result->stacks[is][ie]  =  &result->rules[ir0][ir1];
-                    }
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-//
-// sampling
-//
-
-void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = time(NULL);
-    }
-    ctx->rng.seed(seed);
-}
-
-void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
-    GGML_ASSERT(candidates->size > 0);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Sort the logits in descending order
-    if (!candidates->sorted) {
-        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        });
-        candidates->sorted = true;
-    }
-
-    float max_l = candidates->data[0].logit;
-    float cum_sum = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
-        cum_sum += p;
-    }
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
-    // if (k >= (int32_t)candidates->size) {
-    //     return;
-    // }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    if (k <= 0) {
-        k = candidates->size;
-    }
-
-    k = std::max(k, (int) min_keep);
-    k = std::min(k, (int) candidates->size);
-
-    // Sort scores in descending order
-    if (!candidates->sorted) {
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k <= 128) {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
-        } else {
-            constexpr int   nbuckets     = 128;
-            constexpr float bucket_low   = -10.0f;
-            constexpr float bucket_high  =  10.0f;
-            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-            constexpr float bucker_inter = -bucket_low * bucket_scale;
-
-            std::vector<int> bucket_idx(candidates->size);
-            std::vector<int> histo(nbuckets, 0);
-
-            for (int i = 0; i < (int)candidates->size; ++i) {
-                const float val = candidates->data[i].logit;
-                int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
-                ib = std::max(0, std::min(nbuckets-1, ib));
-                bucket_idx[i] = ib;
-                ++histo[ib];
-            }
-            int nhave = 0;
-            int ib = nbuckets - 1;
-            for ( ; ib >= 0; --ib) {
-                nhave += histo[ib];
-                if (nhave >= k) break;
-            }
-            std::vector<llama_token_data> tmp_tokens(nhave);
-            auto ptr = tmp_tokens.data();
-            std::vector<llama_token_data*> bucket_ptrs;
-            bucket_ptrs.reserve(nbuckets - ib);
-            for (int j = nbuckets - 1; j >= ib; --j) {
-                bucket_ptrs.push_back(ptr);
-                ptr += histo[j];
-            }
-            for (int i = 0; i < (int)candidates->size; ++i) {
-                int j = bucket_idx[i];
-                if (j >= ib) {
-                    *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
-                }
-            }
-
-            ptr = tmp_tokens.data();
-            int ndone = 0;
-            for (int j = nbuckets-1; j > ib; --j) {
-                std::sort(ptr, ptr + histo[j], comp);
-                ptr += histo[j];
-                ndone += histo[j];
-            }
-            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
-
-            std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
-
-        }
-        candidates->sorted = true;
-    }
-    candidates->size = k;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p >= 1.0f) {
-        return;
-    }
-
-    llama_sample_softmax(ctx, candidates);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        cum_sum += candidates->data[i].p;
-
-        // Check if the running sum is at least p or if we have kept at least min_keep tokens
-        // we set the last index to i+1 to indicate that the current iterate should be included in the set
-        if (cum_sum >= p && i + 1 >= min_keep) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the top-p tokens
-    candidates->size = last_idx;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p <= 0.0f || !candidates->size) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    bool min_p_applied = false;
-
-    // if the candidates aren't sorted, try the unsorted implementation first
-    if (!candidates->sorted) {
-        std::vector<llama_token_data> filtered_tokens;
-
-        float max_logit = -FLT_MAX;
-        for (size_t i = 0; i < candidates->size; ++i) {
-            max_logit = std::max(max_logit, candidates->data[i].logit);
-        }
-        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
-
-        for (size_t i = 0; i < candidates->size; ++i) {
-            if (candidates->data[i].logit >= min_logit) {
-                filtered_tokens.push_back(candidates->data[i]);
-            }
-        }
-
-        // if we have enough values the operation was a success
-        if (filtered_tokens.size() >= min_keep) {
-            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
-            candidates->size = filtered_tokens.size();
-            min_p_applied = true;
-        }
-    }
-
-    // if the candidates are sorted or the unsorted implementation failed, use this implementation
-    if (!min_p_applied) {
-        // Sort the logits in descending order
-        if (!candidates->sorted) {
-            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
-                return a.logit > b.logit;
-            });
-            candidates->sorted = true;
-        }
-
-        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
-        size_t i = 1; // first token always matches
-
-        for (; i < candidates->size; ++i) {
-            if (candidates->data[i].logit < min_logit && i >= min_keep) {
-                break; // prob too small
-            }
-        }
-
-        // Resize the output vector to keep only the matching tokens
-        candidates->size = i;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
-    if (z >= 1.0f || candidates->size <= 2) {
-        return;
-    }
-
-    llama_sample_softmax(nullptr, candidates);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(candidates->size - 1);
-    std::vector<float> second_derivatives(candidates->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > z && i >= min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    candidates->size = last_idx;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    // Reference implementation:
-    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
-    if (p >= 1.0f) {
-        return;
-    }
-
-    // Compute the softmax of logits and calculate entropy
-    llama_sample_softmax(nullptr, candidates);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
-    }
-
-    // Compute the absolute difference between negative log probability and entropy for each candidate
-    std::vector<float> shifted_scores;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
-        shifted_scores.push_back(shifted_score);
-    }
-
-    // Sort tokens based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(candidates->size);
-    std::iota(indices.begin(), indices.end(), 0);
-
-    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
-        return shifted_scores[a] < shifted_scores[b];
-    });
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = indices.size();
-
-    for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        cum_sum += candidates->data[idx].p;
-
-        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep - 1) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the locally typical tokens
-    std::vector<llama_token_data> new_candidates;
-    for (size_t i = 0; i < last_idx; ++i) {
-        size_t idx = indices[i];
-        new_candidates.push_back(candidates->data[idx]);
-    }
-
-    // Replace the data in candidates with the new_candidates data
-    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
-    candidates->size = new_candidates.size();
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // no need to do anything if there is only one (or zero) candidates
-    if(candidates_p->size <= 1) {
-        return;
-    }
-
-    // Calculate maximum possible entropy
-    float max_entropy = -logf(1.0f / candidates_p->size);
-
-    llama_sample_softmax(nullptr, candidates_p);
-
-    // Calculate entropy of the softmax probabilities
-    float entropy = 0.0f;
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        float prob = candidates_p->data[i].p;
-        if (prob > 0.0f) { // Ensure no log(0)
-            entropy -= prob * logf(prob);
-        }
-    }
-
-    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
-    float normalized_entropy = entropy / max_entropy;
-
-    // Map the normalized entropy to the desired temperature range using the power function
-    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
-
-#ifdef DEBUG
-    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
-    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
-    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
-    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
-    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
-    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
-#endif
-
-    // Apply the dynamically calculated temperature scaling
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].logit /= dyn_temp;
-    }
-
-    // Re-compute softmax probabilities after scaling logits with dynamic temperature
-    double max_l_double = candidates_p->data[0].logit;
-    double cum_sum_double = 0.0;
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        double p = exp(candidates_p->data[i].logit - max_l_double);
-        candidates_p->data[i].p = p; // Store the scaled probability
-        cum_sum_double += p;
-    }
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
-    }
-
-#ifdef DEBUG
-    // Print the updated top 25 probabilities after temperature scaling
-    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
-    for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
-        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
-    }
-#endif
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].logit /= temp;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present) {
-    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Create a frequency map to count occurrences of each token in last_tokens
-    std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i < penalty_last_n; ++i) {
-        token_count[last_tokens[i]]++;
-    }
-
-    // Apply frequency and presence penalties to the candidates
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const auto token_iter = token_count.find(candidates->data[i].id);
-        if (token_iter == token_count.end()) {
-            continue;
-        }
-
-        const int count = token_iter->second;
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty_repeat;
-        } else {
-            candidates->data[i].logit /= penalty_repeat;
-        }
-
-        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
-    GGML_ASSERT(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    bool allow_eos = false;
-    for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            allow_eos = true;
-            break;
-        }
-    }
-
-    const llama_token eos = llama_token_eos(&ctx->model);
-
-    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
-    std::vector<llama_grammar_candidate>                              candidates_grammar;
-    candidates_grammar.reserve(candidates->size);
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
-        if (id == eos) {
-            if (!allow_eos) {
-                candidates->data[i].logit = -INFINITY;
-            }
-        } else if (piece.empty() || piece[0] == 0) {
-            candidates->data[i].logit = -INFINITY;
-        } else {
-            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
-        }
-    }
-
-    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (const auto & reject : rejects) {
-        candidates->data[reject.index].logit = -INFINITY;
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
-static void llama_log_softmax(float * array, size_t size) {
-    float max_l = *std::max_element(array, array + size);
-    float sum = 0.f;
-    for (size_t i = 0; i < size; ++i) {
-        float p = expf(array[i] - max_l);
-        sum += p;
-        array[i] = p;
-    }
-
-    for (size_t i = 0; i < size; ++i) {
-        array[i] = logf(array[i] / sum);
-    }
-}
-
-void llama_sample_apply_guidance(
-          struct llama_context * ctx,
-                         float * logits,
-                         float * logits_guidance,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-
-    const auto t_start_sample_us = ggml_time_us();
-    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    llama_log_softmax(logits, n_vocab);
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-              auto & l = logits[i];
-        const auto & g = logits_guidance[i];
-
-        l = scale * (l - g) + g;
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
-    GGML_ASSERT(ctx);
-
-    auto N = float(llama_n_vocab(llama_get_model(ctx)));
-    int64_t t_start_sample_us;
-    t_start_sample_us = ggml_time_us();
-
-    llama_sample_softmax(nullptr, candidates);
-
-    // Estimate s_hat using the most probable m tokens
-    float s_hat = 0.0;
-    float sum_ti_bi = 0.0;
-    float sum_ti_sq = 0.0;
-    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
-        float t_i = logf(float(i + 2) / float(i + 1));
-        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
-        sum_ti_bi += t_i * b_i;
-        sum_ti_sq += t_i * t_i;
-    }
-    s_hat = sum_ti_bi / sum_ti_sq;
-
-    // Compute k from the estimated s_hat and target surprise value
-    float epsilon_hat = s_hat - 1;
-    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
-
-    // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k), 1);
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    llama_token X = llama_sample_token(ctx, candidates);
-    t_start_sample_us = ggml_time_us();
-
-    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
-        return candidate.id == X;
-    }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
-    float e = observed_surprise - tau;
-
-    // Update mu using the learning rate and error
-    *mu = *mu - eta * e;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    return X;
-}
-
-llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    int64_t t_start_sample_us;
-    t_start_sample_us = ggml_time_us();
-
-    llama_sample_softmax(ctx, candidates);
-
-    // Truncate the words with surprise values greater than mu
-    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
-        return -log2f(candidate.p) > *mu;
-    }));
-
-    if (candidates->size == 0) {
-        candidates->size = 1;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-
-    // Normalize the probabilities of the remaining words
-    llama_sample_softmax(ctx, candidates);
-
-    // Sample the next word X from the remaining words
-    llama_token X = llama_sample_token(ctx, candidates);
-    t_start_sample_us = ggml_time_us();
-
-    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
-        return candidate.id == X;
-    }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
-    float e = observed_surprise - tau;
-
-    // Update mu using the learning rate and error
-    *mu = *mu - eta * e;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    return X;
-}
-
-llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Find max element
-    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
-        return a.logit < b.logit;
-    });
-
-    llama_token result = max_iter->id;
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
-    }
-    return result;
-}
-
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
-    GGML_ASSERT(ctx);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-    llama_sample_softmax(nullptr, candidates);
-
-    std::vector<float> probs;
-    probs.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        probs.push_back(candidates->data[i].p);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
-    int idx = dist(rng);
-
-    llama_token result = candidates->data[idx].id;
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-    return result;
-}
-
-void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    if (token == llama_token_eos(&ctx->model)) {
-        for (const auto & stack : grammar->stacks) {
-            if (stack.empty()) {
-                return;
-            }
-        }
-        GGML_ASSERT(false);
-    }
-
-    const std::string piece = llama_token_to_piece(ctx, token);
-
-    // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
-    const auto & code_points = decoded.first;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
-    }
-    grammar->partial_utf8 = decoded.second;
-    GGML_ASSERT(!grammar->stacks.empty());
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
-//
-// Beam search
-//
-
-struct llama_beam {
-    std::vector<llama_token> tokens;
-    float p;  // Cumulative beam probability (renormalized relative to all beams)
-    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
-    // Sort beams by probability. In case of ties, prefer beams at eob.
-    bool operator<(const llama_beam & rhs) const {
-        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
-    }
-    // Shift off first n tokens and discard them.
-    void shift_tokens(const size_t n) {
-        if (n) {
-            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
-            tokens.resize(tokens.size() - n);
-        }
-    }
-    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
-};
-
-// A struct for calculating logit-related info.
-struct llama_logit_info {
-    const float * const logits;
-    const int n_vocab;
-    const float max_l;
-    const float normalizer;
-    struct sum_exp {
-        float max_l;
-        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
-    };
-    llama_logit_info(llama_context * ctx)
-      : logits(llama_get_logits(ctx))
-      , n_vocab(llama_n_vocab(llama_get_model(ctx)))
-      , max_l(*std::max_element(logits, logits + n_vocab))
-      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
-      { }
-    llama_token_data get_token_data(const llama_token token_id) const {
-        constexpr auto p = std::numeric_limits<float>::quiet_NaN();  // never used
-        return {token_id, logits[token_id], p};
-    }
-    // Return top k token_data by logit.
-    std::vector<llama_token_data> top_k(size_t k) {
-        std::vector<llama_token_data> min_heap;  // min-heap by logit
-        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
-        min_heap.reserve(k_min);
-        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
-            min_heap.push_back(get_token_data(token_id));
-        }
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
-        std::make_heap(min_heap.begin(), min_heap.end(), comp);
-        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
-            if (min_heap.front().logit < logits[token_id]) {
-                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
-                min_heap.back().id = token_id;
-                min_heap.back().logit = logits[token_id];
-                std::push_heap(min_heap.begin(), min_heap.end(), comp);
-            }
-        }
-        return min_heap;
-    }
-    float probability_from_logit(float logit) const {
-        return normalizer * std::exp(logit - max_l);
-    }
-};
-
-struct llama_beam_search_data {
-    llama_context * ctx;
-    size_t n_beams;
-    int n_past;
-    int n_predict;
-    std::vector<llama_beam> beams;
-    std::vector<llama_beam> next_beams;
-
-    // Re-calculated on each loop iteration
-    size_t common_prefix_length;
-
-    // Used to communicate to/from callback on beams state.
-    std::vector<llama_beam_view> beam_views;
-
-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
-      : ctx(ctx)
-      , n_beams(n_beams)
-      , n_past(n_past)
-      , n_predict(n_predict)
-      , beam_views(n_beams) {
-        beams.reserve(n_beams);
-        next_beams.reserve(n_beams);
-    }
-
-    // Collapse beams to a single beam given by index.
-    void collapse_beams(const size_t beam_idx) {
-        if (0u < beam_idx) {
-            std::swap(beams[0], beams[beam_idx]);
-        }
-        beams.resize(1);
-    }
-
-    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-    // The repetitive patterns below reflect the 2 stages of heaps:
-    //  * Gather elements until the vector is full, then call std::make_heap() on it.
-    //  * If the heap is full and a new element is found that should be included, pop the
-    //    least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
-        // Min-heaps use a greater-than comparator.
-        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
-        if (beam.eob) {
-            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
-            if (next_beams.size() < n_beams) {
-                next_beams.push_back(std::move(beam));
-                if (next_beams.size() == n_beams) {
-                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            } else if (next_beams.front().p < beam.p) {
-                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = std::move(beam);
-                std::push_heap(next_beams.begin(), next_beams.end(), comp);
-            }
-        } else {
-            // beam is not at end-of-sentence, so branch with next top_k tokens.
-            if (!beam.tokens.empty()) {
-                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
-            }
-            llama_logit_info logit_info(ctx);
-            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-            size_t i=0;
-            if (next_beams.size() < n_beams) {
-                for (; next_beams.size() < n_beams ; ++i) {
-                    llama_beam next_beam = beam;
-                    next_beam.tokens.push_back(next_tokens[i].id);
-                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
-                    next_beams.push_back(std::move(next_beam));
-                }
-                std::make_heap(next_beams.begin(), next_beams.end(), comp);
-            } else {
-                for (; next_beams.front().p == 0.0f ; ++i) {
-                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = beam;
-                    next_beams.back().tokens.push_back(next_tokens[i].id);
-                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
-                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            }
-            for (; i < n_beams ; ++i) {
-                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
-                if (next_beams.front().p < next_p) {
-                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = beam;
-                    next_beams.back().tokens.push_back(next_tokens[i].id);
-                    next_beams.back().p = next_p;
-                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            }
-        }
-    }
-
-    // Find common_prefix_length based on beams.
-    // Requires beams is not empty.
-    size_t find_common_prefix_length() {
-        size_t common_prefix_length = beams[0].tokens.size();
-        for (size_t i = 1 ; i < beams.size() ; ++i) {
-            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
-            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
-                if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    common_prefix_length = j;
-                    break;
-                }
-            }
-        }
-        return common_prefix_length;
-    }
-
-    // Construct beams_state to send back to caller via the callback function.
-    // Side effect: set common_prefix_length = find_common_prefix_length();
-    llama_beams_state get_beams_state(const bool last_call) {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beam_views[i] = beams[i].view();
-        }
-        common_prefix_length = find_common_prefix_length();
-        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
-    }
-
-    // Loop:
-    //  * while i < n_predict, AND
-    //  * any of the beams have not yet reached end-of-beam (eob), AND
-    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
-    //    (since all other beam probabilities can only decrease)
-    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
-        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
-        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
-        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
-                       !beams[top_beam_index()].eob ; ++i) {
-            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
-            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
-            if (common_prefix_length) {
-                llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
-                n_past += common_prefix_length;
-            }
-            // Zero-out next_beam probabilities to place them last in following min-heap.
-            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
-            for (llama_beam & beam : beams) {
-                beam.shift_tokens(common_prefix_length);
-                fill_next_beams_by_top_probabilities(beam);
-            }
-            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
-            beams.swap(next_beams);
-            renormalize_beam_probabilities(beams);
-        }
-        collapse_beams(top_beam_index());
-        callback(callback_data, get_beams_state(true));
-    }
-
-    // As beams grow, the cumulative probabilities decrease.
-    // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
-        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
-        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
-    }
-
-    // Assumes beams is non-empty.  Uses llama_beam::operator<() for ordering.
-    size_t top_beam_index() {
-        return std::max_element(beams.begin(), beams.end()) - beams.begin();
-    }
-
-    // Copy (p,eob) for each beam which may have been changed by the callback.
-    void update_beams_from_beam_views() {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beams[i].p = beam_views[i].p;
-            beams[i].eob = beam_views[i].eob;
-        }
-    }
-};
-
-void llama_beam_search(llama_context * ctx,
-                       llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict) {
-    assert(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
-    beam_search_data.loop(callback, callback_data);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-}
-
-//
-// quantization
-//
-
-struct quantize_state_internal {
-    const llama_model                 & model;
-    const llama_model_quantize_params * params;
-
-    int n_attention_wv    = 0;
-    int n_ffn_down        = 0;
-    int n_ffn_gate        = 0;
-    int n_ffn_up          = 0;
-    int i_attention_wv    = 0;
-    int i_ffn_down        = 0;
-    int i_ffn_gate        = 0;
-    int i_ffn_up          = 0;
-
-    int n_k_quantized     = 0;
-    int n_fallback        = 0;
-
-    bool has_imatrix      = false;
-
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
-        : model(model)
-        , params(params)
-        {}
-};
-
-static void llama_convert_tensor_internal(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
-    const size_t nelements, const int nthread
-) {
-    if (output.size() < nelements) {
-        output.resize(nelements);
-    }
-    float * f32_output = (float *) output.data();
-
-    ggml_type_traits_t qtype;
-    if (ggml_is_quantized(tensor->type)) {
-        qtype = ggml_internal_get_type_traits(tensor->type);
-        if (qtype.to_float == NULL) {
-            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
-        }
-    } else if (tensor->type != GGML_TYPE_F16) {
-        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
-    }
-
-    if (nthread < 2) {
-        if (tensor->type == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
-        } else if (ggml_is_quantized(tensor->type)) {
-            qtype.to_float(tensor->data, f32_output, nelements);
-        } else {
-            GGML_ASSERT(false); // unreachable
-        }
-        return;
-    }
-
-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
-    size_t block_size_bytes = ggml_type_size(tensor->type);
-
-    GGML_ASSERT(nelements % block_size == 0);
-    size_t nblocks = nelements / block_size;
-    size_t blocks_per_thread = nblocks / nthread;
-    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
-
-    size_t in_buff_offs = 0;
-    size_t out_buff_offs = 0;
-
-    for (int tnum = 0; tnum < nthread; tnum++) {
-        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
-        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
-        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
-
-        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
-            if (typ == GGML_TYPE_F16) {
-                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
-            } else {
-                qtype.to_float(inbuf, outbuf, nels);
-            }
-        };
-        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
-        in_buff_offs += thr_block_bytes;
-        out_buff_offs += thr_elems;
-    }
-    for (auto & w : workers) { w.join(); }
-    workers.clear();
-}
-
-static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
-    const std::string name = ggml_get_name(tensor);
-
-    // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const llm_arch arch = qs.model.arch;
-    const auto       tn = LLM_TN(arch);
-
-    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
-    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
-        if (n_expert > 1) {
-            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
-            // for getting the current layer as I initially thought, and we need to resort to parsing the
-            // tensor name.
-            n_layer /= n_expert;
-            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
-                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
-            }
-            if (i_layer < 0 || i_layer >= n_layer) {
-                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
-            }
-        }
-        return std::make_pair(i_layer, n_layer);
-    };
-
-    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
-    // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
-        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
-        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (new_type != GGML_TYPE_Q8_0) {
-            new_type = GGML_TYPE_Q6_K;
-        }
-    } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            new_type = GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-        if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
-            ++qs.i_attention_wv;
-        }
-        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
-            }
-            ++qs.i_ffn_down;
-        }
-        else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
-                new_type = GGML_TYPE_Q5_K;
-            } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
-            }
-        }
-    } else if (name.find("attn_v.weight") != std::string::npos) {
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (qs.model.type == MODEL_70B) {
-            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-            // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-        }
-        if (qs.model.hparams.n_expert == 8) {
-            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
-            // TODO: explore better strategies
-            new_type = GGML_TYPE_Q8_0;
-        }
-        ++qs.i_attention_wv;
-    } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
-            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
-            // TODO: explore better strategies
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
-        }
-    } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
-        }
-    } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
-        int i_layer = info.first, n_layer = info.second;
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
-            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
-                     : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (arch == LLM_ARCH_FALCON) {
-                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            } else {
-                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-            }
-        }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
-            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
-            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
-            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
-            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
-        }
-        ++qs.i_ffn_down;
-    } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-            } else {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
-            }
-        } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-        }
-    }
-    else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-    }
-    else if (name.find("ffn_gate") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
-        int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-            new_type = GGML_TYPE_IQ3_XXS;
-        }
-        ++qs.i_ffn_gate;
-    }
-    else if (name.find("ffn_up") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
-        int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-            new_type = GGML_TYPE_IQ3_XXS;
-        }
-        ++qs.i_ffn_up;
-    }
-
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
-    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // This can be used to reduce the size of the Q5_K_S model.
-    // The associated PPL increase is fully in line with the size reduction
-    //else {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    //}
-    bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
-            convert_incompatible_tensor = true;
-        } else {
-            ++qs.n_k_quantized;
-        }
-    }
-    if (convert_incompatible_tensor) {
-        switch (new_type) {
-            case GGML_TYPE_IQ2_XXS:
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ3_XXS:
-            case GGML_TYPE_IQ3_S:
-            case GGML_TYPE_IQ1_S:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
-            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
-            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
-            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-        }
-        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-        ++qs.n_fallback;
-    }
-
-    return new_type;
-}
-
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
-    ggml_type quantized_type;
-    llama_ftype ftype = params->ftype;
-
-    switch (params->ftype) {
-        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
-        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
-
-        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  quantized_type = GGML_TYPE_IQ3_S;   break;
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  quantized_type = GGML_TYPE_Q4_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  quantized_type = GGML_TYPE_Q5_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q6_K:    quantized_type = GGML_TYPE_Q6_K;    break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_S:   quantized_type = GGML_TYPE_IQ2_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_M:   quantized_type = GGML_TYPE_IQ2_S;   break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  quantized_type = GGML_TYPE_IQ4_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_S:   quantized_type = GGML_TYPE_IQ3_S;   break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:   quantized_type = GGML_TYPE_IQ3_S;   break;
-
-        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
-    }
-
-    int nthread = params->nthread;
-
-    if (nthread <= 0) {
-        nthread = std::thread::hardware_concurrency();
-    }
-
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
-    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
-#if defined(__linux__) || defined(_WIN32)
-    constexpr bool use_mmap = true;
-#else
-    constexpr bool use_mmap = false;
-#endif
-
-    llama_model_loader ml(fname_inp, use_mmap, NULL);
-    ml.init_mapping(false); // no prefetching?
-
-    llama_model model;
-    llm_load_arch(ml, model);
-    llm_load_hparams(ml, model);
-
-    struct quantize_state_internal qs(model, params);
-
-    if (params->only_copy) {
-        ftype = model.ftype;
-    }
-    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
-    if (params->imatrix) {
-        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
-        if (imatrix_data) {
-            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
-            qs.has_imatrix = true;
-        }
-    }
-
-    const size_t align = GGUF_DEFAULT_ALIGNMENT;
-    struct gguf_context * ctx_out = gguf_init_empty();
-
-    // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.ctx_gguf);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
-
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
-
-        const std::string name = ggml_get_name(meta);
-
-        // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++qs.n_attention_wv;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_ffn_down;
-        }
-        else if (name.find("ffn_gate") != std::string::npos) {
-            ++qs.n_ffn_gate;
-        }
-        else if (name.find("ffn_up") != std::string::npos) {
-            ++qs.n_ffn_up;
-        }
-    }
-    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
-    }
-
-    size_t total_size_org = 0;
-    size_t total_size_new = 0;
-    std::vector<int64_t> hist_all(1 << 4, 0);
-
-    std::vector<std::thread> workers;
-    workers.reserve(nthread);
-    std::mutex mutex;
-
-    int idx = 0;
-
-    std::vector<no_init<uint8_t>> read_data;
-    std::vector<no_init<uint8_t>> work;
-    std::vector<no_init<float>> f32_conv_buf;
-
-    // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
-    }
-
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
-
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
-
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
-
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
-
-        const std::string name = ggml_get_name(tensor);
-
-        if (!ml.use_mmap) {
-            if (read_data.size() < ggml_nbytes(tensor)) {
-                read_data.resize(ggml_nbytes(tensor));
-            }
-            tensor->data = read_data.data();
-        }
-        ml.load_data_for(tensor);
-
-        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, ml.n_tensors,
-               ggml_get_name(tensor),
-               llama_format_tensor_shape(tensor).c_str(),
-               ggml_type_name(tensor->type));
-
-        // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
-
-        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(tensor) == 2);
-        quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= !params->only_copy;
-
-        // do not quantize expert gating tensors
-        // NOTE: can't use LLM_TN here because the layer number is not known
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
-
-        // do not quantize positional embeddings and token types (BERT)
-        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
-        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
-
-        enum ggml_type new_type;
-        void * new_data;
-        size_t new_size;
-
-        if (quantize) {
-            new_type = quantized_type;
-            if (!params->pure) {
-                new_type = get_k_quant_type(qs, new_type, tensor, ftype);
-            }
-
-            // If we've decided to quantize to the same type the tensor is already
-            // in then there's nothing to do.
-            quantize = tensor->type != new_type;
-        }
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
-            const size_t nelements = ggml_nelements(tensor);
-
-            const float * imatrix = nullptr;
-            if (imatrix_data) {
-                auto it = imatrix_data->find(tensor->name);
-                if (it == imatrix_data->end()) {
-                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
-                } else {
-                    if (it->second.size() == (size_t)tensor->ne[0]) {
-                        imatrix = it->second.data();
-                    } else {
-                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
-                                int(it->second.size()), int(tensor->ne[0]), tensor->name);
-                    }
-                }
-            }
-            if ((new_type == GGML_TYPE_IQ2_XXS ||
-                 new_type == GGML_TYPE_IQ2_XS  ||
-                 new_type == GGML_TYPE_IQ2_S   ||
-                 new_type == GGML_TYPE_IQ1_S   ||
-                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
-                LLAMA_LOG_ERROR("\n\n============================================================\n");
-                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
-                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
-                LLAMA_LOG_ERROR("============================================================\n\n");
-                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
-            }
-
-            float * f32_data;
-
-            if (tensor->type == GGML_TYPE_F32) {
-                f32_data = (float *) tensor->data;
-            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
-                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
-            } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
-                f32_data = (float *) f32_conv_buf.data();
-            }
-
-            LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
-            fflush(stdout);
-
-            if (work.size() < nelements * 4) {
-                work.resize(nelements * 4); // upper bound on size
-            }
-            new_data = work.data();
-            std::array<int64_t, 1 << 4> hist_cur = {};
-
-            const int n_per_row = tensor->ne[0];
-            const int nrows = nelements / n_per_row;
-
-            static const int min_chunk_size = 32 * 512;
-            const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
-
-            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
-            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-            if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
-            } else {
-                int counter = 0;
-                new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
-                     nrows, n_per_row, imatrix]() {
-                    std::array<int64_t, 1 << 4> local_hist = {};
-                    const int nrows_per_chunk = chunk_size / n_per_row;
-                    size_t local_size = 0;
-                    while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        int first_row = counter; counter += nrows_per_chunk;
-                        if (first_row >= nrows) {
-                            if (local_size > 0) {
-                                for (int j=0; j<int(local_hist.size()); ++j) {
-                                    hist_cur[j] += local_hist[j];
-                                }
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
-                        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
-                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
-                    }
-                };
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers.emplace_back(compute);
-                }
-                compute();
-                for (auto & w : workers) { w.join(); }
-                workers.clear();
-            }
-
-            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
-            int64_t tot_count = 0;
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                hist_all[i] += hist_cur[i];
-                tot_count += hist_cur[i];
-            }
-
-            if (tot_count > 0) {
-                LLAMA_LOG_INFO(" | hist: ");
-                for (size_t i = 0; i < hist_cur.size(); i++) {
-                    LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
-                }
-            }
-            LLAMA_LOG_INFO("\n");
-        }
-        total_size_org += ggml_nbytes(tensor);
-        total_size_new += new_size;
-
-        // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
-
-        // write tensor data + padding
-        fout.write((const char *) new_data, new_size);
-        zeros(fout, GGML_PAD(new_size, align) - new_size);
-    }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
-    }
-
-    fout.close();
-
-    gguf_free(ctx_out);
-
-    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
-
-    // print histogram for all tensors
-    {
-        int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            sum_all += hist_all[i];
-        }
-
-        if (sum_all > 0) {
-            LLAMA_LOG_INFO("%s: hist: ", __func__);
-            for (size_t i = 0; i < hist_all.size(); i++) {
-                LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
-            }
-            LLAMA_LOG_INFO("\n");
-        }
-    }
-
-    if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
-                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
-    }
-}
-
-static int llama_apply_lora_from_file_internal(
-    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
-) {
-    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
-
-    const int64_t t_start_lora_us = ggml_time_us();
-
-    llama_file fin(path_lora, "rb");
-
-    // verify magic and version
-    {
-        uint32_t magic = fin.read_u32();
-        if (magic != LLAMA_FILE_MAGIC_GGLA) {
-            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
-            return 1;
-        }
-
-        uint32_t format_version = fin.read_u32();
-        if (format_version != 1) {
-            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
-            return 1;
-        }
-    }
-
-    int32_t lora_r = fin.read_u32();
-    int32_t lora_alpha = fin.read_u32();
-    float scaling = scale * (float)lora_alpha / (float)lora_r;
-
-    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
-
-    // load base model
-    std::unique_ptr<llama_model_loader> ml;
-    if (path_base_model) {
-        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
-        ml->init_mapping(/*prefetch*/ false); // no prefetching
-    }
-
-    struct tensor_meta {
-        std::string name;
-        ggml_type type;
-        int32_t ne[2];
-        size_t offset;
-    };
-    std::map<std::string, tensor_meta> tensor_meta_map;
-
-    // load all tensor meta
-    while (true) {
-        if (fin.tell() == fin.size) {
-            // eof
-            break;
-        }
-
-        int32_t n_dims;
-        int32_t name_len;
-        int32_t ftype;
-
-        fin.read_raw(&n_dims, sizeof(n_dims));
-        fin.read_raw(&name_len, sizeof(name_len));
-        fin.read_raw(&ftype, sizeof(ftype));
-
-        if (n_dims != 1 && n_dims != 2) {
-            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
-            return 1;
-        }
-
-        int32_t ne[2] = { 1, 1 };
-        for (int i = 0; i < n_dims; ++i) {
-            fin.read_raw(&ne[i], sizeof(ne[i]));
-        }
-
-        std::string name;
-        {
-            GGML_ASSERT(name_len < GGML_MAX_NAME);
-            char buf[GGML_MAX_NAME];
-            fin.read_raw(buf, name_len);
-            name = std::string(buf, name_len);
-        }
-
-        // check for lora suffix
-        std::string lora_suffix;
-        if (name.length() > 6) {
-            lora_suffix = name.substr(name.length() - 6);
-        }
-        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
-            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-            return 1;
-        }
-
-        // tensor type
-        ggml_type wtype;
-        switch (ftype) {
-            case 0: wtype = GGML_TYPE_F32;  break;
-            case 1: wtype = GGML_TYPE_F16;  break;
-            default:
-                    {
-                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
-                                __func__, ftype);
-                        return 1;
-                    }
-        }
-
-        // data offset
-        size_t offset = fin.tell();
-        offset = (offset + 31) & -32;
-
-        // skip tensor data
-        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
-
-        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
-    }
-
-    bool warned = false;
-    int n_tensors = 0;
-
-    // apply
-    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-    if (backend_cpu == nullptr) {
-        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
-        return 1;
-    }
-    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
-
-    std::vector<no_init<uint8_t>> read_buf;
-    for (const auto & it : model.tensors_by_name) {
-        const std::string & base_name = it.first;
-        ggml_tensor * model_t = it.second;
-
-        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
-            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
-            continue;
-        }
-
-        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
-        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
-
-        ggml_init_params lora_init_params = {
-            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-            /* .mem_buffer */ nullptr,
-            /* .no_alloc   */ true,
-        };
-        ggml_context * lora_ctx = ggml_init(lora_init_params);
-        if (lora_ctx == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
-            ggml_backend_free(backend_cpu);
-            return 1;
-        }
-
-        // create tensors
-        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
-        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
-        ggml_set_name(loraA, metaA.name.c_str());
-        ggml_set_name(loraB, metaB.name.c_str());
-
-        ggml_tensor * base_t;
-        if (ml) {
-            if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
-                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
-                return 1;
-            }
-            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
-        } else {
-            base_t = ggml_dup_tensor(lora_ctx, model_t);
-        }
-        ggml_set_name(base_t, base_name.c_str());
-
-        // allocate in backend buffer
-        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-        if (lora_buf == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
-            return 1;
-        }
-
-        // load tensor data
-        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
-            read_buf.resize(ggml_nbytes(tensor));
-            fin.seek(tensor_meta.offset, SEEK_SET);
-            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
-            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
-        };
-        load_tensor(metaA, loraA);
-        load_tensor(metaB, loraB);
-
-        // load base model tensor data
-        if (ml) {
-            ml->load_data_for(base_t);
-        } else {
-            ggml_backend_tensor_copy(model_t, base_t);
-        }
-
-        if (ggml_is_quantized(base_t->type) && !warned) {
-            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-                            "use a f16 or f32 base model with --lora-base\n", __func__);
-            warned = true;
-        }
-
-        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
-            ggml_free(lora_ctx);
-            ggml_backend_buffer_free(lora_buf);
-            ggml_backend_free(backend_cpu);
-            return 1;
-        }
-
-        auto build_lora_graph = [&]() {
-            // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
-            ggml_set_name(BA, "BA");
-
-            if (scaling != 1.0f) {
-                BA = ggml_scale(lora_ctx, BA, scaling);
-                ggml_set_name(BA, "BA_scaled");
-            }
-
-            ggml_tensor * r;
-            r = ggml_add_inplace(lora_ctx, base_t, BA);
-            ggml_set_name(r, "r_add");
-
-            if (base_t->type != model_t->type) {
-                // convert the result to the model type
-                r = ggml_cast(lora_ctx, r, model_t->type);
-                ggml_set_name(r, "r_cast");
-            }
-
-            return r;
-        };
-
-        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
-        ggml_tensor * r = build_lora_graph();
-        ggml_build_forward_expand(gf, r);
-
-        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-        if (graph_buf == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
-            ggml_free(lora_ctx);
-            ggml_backend_buffer_free(lora_buf);
-            ggml_backend_free(backend_cpu);
-            return 1;
-        }
-
-        ggml_backend_graph_compute(backend_cpu, gf);
-
-        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
-
-#if 0
-        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
-        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
-
-        // sched compute
-        ggml_build_forward_expand(gf, build_graph());
-        ggml_backend_sched_init_measure(sched, gf);
-
-        // create the graph again, since the previous one was destroyed by the measure
-        ggml_graph_clear(gf);
-        ggml_build_forward_expand(gf, build_graph());
-        ggml_backend_sched_graph_compute(sched, gf);
-        ggml_backend_sched_free(sched);
-#endif
-
-        ggml_backend_buffer_free(lora_buf);
-        ggml_backend_buffer_free(graph_buf);
-        ggml_free(lora_ctx);
-
-        n_tensors++;
-        if (n_tensors % 4 == 0) {
-            LLAMA_LOG_INFO(".");
-        }
-    }
-
-    ggml_backend_free(backend_cpu);
-
-    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
-
-    return 0;
-}
-
-//
-// interface implementation
-//
-struct llama_model_params llama_model_default_params() {
-    struct llama_model_params result = {
-        /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
-        /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
-        /*.vocab_only                  =*/ false,
-        /*.use_mmap                    =*/ true,
-        /*.use_mlock                   =*/ false,
-    };
-
-#ifdef GGML_USE_METAL
-    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
-    result.n_gpu_layers = 999;
-#endif
-
-    return result;
-}
-
-struct llama_context_params llama_context_default_params() {
-    struct llama_context_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
-        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
-        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
-        /*.rope_freq_base              =*/ 0.0f,
-        /*.rope_freq_scale             =*/ 0.0f,
-        /*.yarn_ext_factor             =*/ -1.0f,
-        /*.yarn_attn_factor            =*/ 1.0f,
-        /*.yarn_beta_fast              =*/ 32.0f,
-        /*.yarn_beta_slow              =*/ 1.0f,
-        /*.yarn_orig_ctx               =*/ 0,
-        /*.defrag_thold                =*/ -1.0f,
-        /*.cb_eval                     =*/ nullptr,
-        /*.cb_eval_user_data           =*/ nullptr,
-        /*.type_k                      =*/ GGML_TYPE_F16,
-        /*.type_v                      =*/ GGML_TYPE_F16,
-        /*.logits_all                  =*/ false,
-        /*.embedding                   =*/ false,
-        /*.offload_kqv                 =*/ true,
-        /*.do_pooling                  =*/ true,
-    };
-
-    return result;
-}
-
-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
-        /*.nthread                     =*/ 0,
-        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
-        /*.allow_requantize            =*/ false,
-        /*.quantize_output_tensor      =*/ true,
-        /*.only_copy                   =*/ false,
-        /*.pure                        =*/ false,
-        /*.imatrix                     =*/ nullptr,
-    };
-
-    return result;
-}
-
-size_t llama_max_devices(void) {
-#if defined(GGML_USE_METAL)
-    return 1;
-#elif defined(GGML_USE_CUBLAS)
-    return GGML_CUDA_MAX_DEVICES;
-#elif defined(GGML_USE_SYCL)
-    return GGML_SYCL_MAX_DEVICES;
-#elif defined(GGML_USE_VULKAN)
-    return GGML_VK_MAX_DEVICES;
-#else
-    return 1;
-#endif
-}
-
-bool llama_supports_mmap(void) {
-    return llama_mmap::SUPPORTED;
-}
-
-bool llama_supports_mlock(void) {
-    return llama_mlock::SUPPORTED;
-}
-
-bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL)   || defined(GGML_USE_KOMPUTE)
-    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-    return true;
-#else
-    return false;
-#endif
-}
-
-void llama_backend_init(void) {
-    ggml_time_init();
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
-}
-
-void llama_numa_init(enum ggml_numa_strategy numa) {
-    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
-        ggml_numa_init(numa);
-    }
-}
-
-void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
-    ggml_quantize_free();
-}
-
-int64_t llama_time_us(void) {
-    return ggml_time_us();
-}
-
-struct llama_model * llama_load_model_from_file(
-        const char * path_model,
-        struct llama_model_params   params) {
-    ggml_time_init();
-
-    llama_model * model = new llama_model;
-
-    unsigned cur_percentage = 0;
-    if (params.progress_callback == NULL) {
-        params.progress_callback_user_data = &cur_percentage;
-        params.progress_callback = [](float progress, void * ctx) {
-            unsigned * cur_percentage_p = (unsigned *) ctx;
-            unsigned percentage = (unsigned) (100 * progress);
-            while (percentage > *cur_percentage_p) {
-                *cur_percentage_p = percentage;
-                LLAMA_LOG_INFO(".");
-                if (percentage >= 100) {
-                    LLAMA_LOG_INFO("\n");
-                }
-            }
-            return true;
-        };
-    }
-
-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-        }
-        delete model;
-        return nullptr;
-    }
-
-    return model;
-}
-
-void llama_free_model(struct llama_model * model) {
-    delete model;
-}
-
-struct llama_context * llama_new_context_with_model(
-                 struct llama_model * model,
-        struct llama_context_params   params) {
-
-    if (!model) {
-        return nullptr;
-    }
-
-    llama_context * ctx = new llama_context(*model);
-
-    const auto & hparams = model->hparams;
-    auto       & cparams = ctx->cparams;
-
-    cparams.n_batch          = params.n_batch;
-    cparams.n_threads        = params.n_threads;
-    cparams.n_threads_batch  = params.n_threads_batch;
-    cparams.yarn_ext_factor  = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast   = params.yarn_beta_fast;
-    cparams.yarn_beta_slow   = params.yarn_beta_slow;
-    cparams.defrag_thold     = params.defrag_thold;
-    cparams.offload_kqv      = params.offload_kqv;
-    cparams.do_pooling       = params.do_pooling;
-
-    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
-    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
-    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
-
-    cparams.n_yarn_orig_ctx  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
-                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
-                                                              hparams.n_ctx_train;
-
-    cparams.cb_eval           = params.cb_eval;
-    cparams.cb_eval_user_data = params.cb_eval_user_data;
-
-    auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
-        rope_scaling_type = hparams.rope_scaling_type_train;
-    }
-
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
-        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
-    }
-
-    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
-    }
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LLAMA_LOG_INFO("%s: n_ctx      = %u\n",     __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n",   __func__, cparams.rope_freq_base);
-    LLAMA_LOG_INFO("%s: freq_scale = %g\n",     __func__, cparams.rope_freq_scale);
-
-    ctx->rng = std::mt19937(params.seed);
-    ctx->logits_all = params.logits_all;
-
-    const ggml_type type_k = params.type_k;
-    const ggml_type type_v = params.type_v;
-
-    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
-    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
-
-    if (!hparams.vocab_only) {
-        // initialize backends
-#ifdef GGML_USE_METAL
-        if (model->n_gpu_layers > 0) {
-            ctx->backend_metal = ggml_backend_metal_init();
-            if (ctx->backend_metal == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(ctx->backend_metal);
-        }
-#elif defined(GGML_USE_CUBLAS)
-        if (model->n_gpu_layers > 0) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-                ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            } else {
-                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-                for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                    ggml_backend_t backend = ggml_backend_cuda_init(device);
-                    if (backend == nullptr) {
-                        LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
-                        llama_free(ctx);
-                        return nullptr;
-                    }
-                    ctx->backends.push_back(backend);
-                }
-            }
-        }
-#elif defined(GGML_USE_VULKAN)
-        if (model->n_gpu_layers > 0) {
-            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_vk_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#elif defined(GGML_USE_SYCL)
-        if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_KOMPUTE)
-        if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#endif
-        ctx->backend_cpu = ggml_backend_cpu_init();
-        if (ctx->backend_cpu == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(ctx->backend_cpu);
-
-        if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
-            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-
-        {
-            size_t memory_size_k = 0;
-            size_t memory_size_v = 0;
-
-            for (auto & k : ctx->kv_self.k_l) {
-                memory_size_k += ggml_nbytes(k);
-            }
-
-            for (auto & v : ctx->kv_self.v_l) {
-                memory_size_v += ggml_nbytes(v);
-            }
-
-            LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
-                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
-                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
-        }
-
-        // resized during inference, reserve maximum
-        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
-
-        if (params.embedding) {
-            ctx->embedding.resize(hparams.n_embd);
-        }
-
-        // graph inputs
-        {
-            ggml_init_params init_params = {
-                /* .mem_size   */ ggml_tensor_overhead()*8,
-                /* .mem_buffer */ nullptr,
-                /* .no_alloc   */ true,
-            };
-            ctx->ctx_input = ggml_init(init_params);
-
-            ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
-            ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
-            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
-            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
-            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
-            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
-            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
-
-            ggml_set_name(ctx->inp_tokens,  "inp_tokens");
-            ggml_set_name(ctx->inp_embd,    "inp_embd");
-            ggml_set_name(ctx->inp_pos,     "inp_pos");
-            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
-            ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos");
-            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-            ggml_set_name(ctx->inp_mean,    "inp_mean");
-            ggml_set_name(ctx->inp_cls,     "inp_cls");
-
-            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
-            LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
-                    ggml_backend_buffer_name(ctx->buf_input),
-                    ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
-        }
-
-        // scheduler and compute buffers
-        {
-            // buffer types used for the compute buffer of each backend
-            std::vector<ggml_backend_buffer_type_t> backend_buft;
-            for (auto * backend : ctx->backends) {
-                if (ggml_backend_is_cpu(backend)) {
-                    // use host buffers for the CPU backend compute buffer
-                    backend_buft.push_back(llama_default_buffer_type_cpu(true));
-                } else {
-                    backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
-                }
-            }
-
-            // buffer used to store the computation graph and the tensor meta data
-            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
-
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
-
-            // build worst-case graph
-            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
-            int n_past = cparams.n_ctx - n_tokens;
-            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-            ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
-
-            // initialize scheduler with the worst-case graph
-            if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
-                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-
-            for (size_t i = 0; i < ctx->backends.size(); i++) {
-                ggml_backend_t backend = ctx->backends[i];
-                ggml_backend_buffer_type_t buft = backend_buft[i];
-                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
-                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-                        ggml_backend_buft_name(buft),
-                        size / 1024.0 / 1024.0);
-            }
-
-            // note: the number of splits during measure is higher than during inference due to the kv shift
-            int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
-        }
-    }
-
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
-    return ctx;
-}
-
-void llama_free(struct llama_context * ctx) {
-    delete ctx;
-}
-
-const llama_model * llama_get_model(const struct llama_context * ctx) {
-    return &ctx->model;
-}
-
-uint32_t llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->cparams.n_ctx;
-}
-
-uint32_t llama_n_batch(const struct llama_context * ctx) {
-    return ctx->cparams.n_batch;
-}
-
-enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
-    return model->vocab.type;
-}
-
-enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-    switch (model->arch) {
-        // these models do not use RoPE
-        case LLM_ARCH_GPT2:
-        case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
-        case LLM_ARCH_MPT:
-        case LLM_ARCH_REFACT:
-        case LLM_ARCH_BLOOM:
-            return LLAMA_ROPE_TYPE_NONE;
-
-        // use what we call a normal RoPE, operating on pairs of consecutive head values
-        case LLM_ARCH_LLAMA:
-        case LLM_ARCH_BAICHUAN:
-        case LLM_ARCH_STARCODER:
-        case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
-        case LLM_ARCH_ORION:
-        case LLM_ARCH_INTERNLM2:
-        case LLM_ARCH_MINICPM:
-            return LLAMA_ROPE_TYPE_NORM;
-
-        // the pairs of head values are offset by n_rot/2
-        case LLM_ARCH_FALCON:
-        case LLM_ARCH_PERSIMMON:
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_STABLELM:
-        case LLM_ARCH_QWEN:
-        case LLM_ARCH_QWEN2:
-        case LLM_ARCH_PHI2:
-        case LLM_ARCH_GEMMA:
-        case LLM_ARCH_STARCODER2:
-            return LLAMA_ROPE_TYPE_NEOX;
-
-        // all model arches should be listed explicitly here
-        case LLM_ARCH_UNKNOWN:
-            GGML_ASSERT(false && "unknown architecture");
-            break;
-    }
-
-    return LLAMA_ROPE_TYPE_NONE;
-}
-
-int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
-}
-
-int32_t llama_n_ctx_train(const struct llama_model * model) {
-    return model->hparams.n_ctx_train;
-}
-
-int32_t llama_n_embd(const struct llama_model * model) {
-    return model->hparams.n_embd;
-}
-
-float llama_rope_freq_scale_train(const struct llama_model * model) {
-    return model->hparams.rope_freq_scale_train;
-}
-
-int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
-    const auto & it = model->gguf_kv.find(key);
-    if (it == model->gguf_kv.end()) {
-        if (buf_size > 0) {
-            buf[0] = '\0';
-        }
-        return -1;
-    }
-    return snprintf(buf, buf_size, "%s", it->second.c_str());
-}
-
-int32_t llama_model_meta_count(const struct llama_model * model) {
-    return (int)model->gguf_kv.size();
-}
-
-int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
-    if (i < 0 || i >= (int)model->gguf_kv.size()) {
-        if (buf_size > 0) {
-            buf[0] = '\0';
-        }
-        return -1;
-    }
-    auto it = model->gguf_kv.begin();
-    std::advance(it, i);
-    return snprintf(buf, buf_size, "%s", it->first.c_str());
-}
-
-int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
-    if (i < 0 || i >= (int)model->gguf_kv.size()) {
-        if (buf_size > 0) {
-            buf[0] = '\0';
-        }
-        return -1;
-    }
-    auto it = model->gguf_kv.begin();
-    std::advance(it, i);
-    return snprintf(buf, buf_size, "%s", it->second.c_str());
-}
-
-int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "%s %s %s",
-            llama_model_arch_name(model->arch),
-            llama_model_type_name(model->type),
-            llama_model_ftype_name(model->ftype).c_str());
-}
-
-uint64_t llama_model_size(const struct llama_model * model) {
-    uint64_t size = 0;
-    for (const auto & it : model->tensors_by_name) {
-        size += ggml_nbytes(it.second);
-    }
-    return size;
-}
-
-uint64_t llama_model_n_params(const struct llama_model * model) {
-    uint64_t nparams = 0;
-    for (const auto & it : model->tensors_by_name) {
-        nparams += ggml_nelements(it.second);
-    }
-    return nparams;
-}
-
-struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == model->tensors_by_name.end()) {
-        return nullptr;
-    }
-    return it->second;
-}
-
-uint32_t llama_model_quantize(
-        const char * fname_inp,
-        const char * fname_out,
-        const llama_model_quantize_params * params) {
-    try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
-        return 0;
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
-int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
-    struct llama_kv_cache_view result = {
-        /*.n_cells            = */ 0,
-        /*.n_max_seq          = */ n_max_seq,
-        /*.token_count        = */ 0,
-        /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
-        /*.max_contiguous     = */ 0,
-        /*.max_contiguous_idx = */ -1,
-        /*.cells              = */ nullptr,
-        /*.cells_sequences    = */ nullptr,
-    };
-    return result;
-}
-
-void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
-    if (view->cells != nullptr) {
-        free(view->cells);
-        view->cells = nullptr;
-    }
-    if (view->cells_sequences != nullptr) {
-        free(view->cells_sequences);
-        view->cells_sequences = nullptr;
-    }
-}
-
-void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
-    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
-        view->n_cells = int32_t(ctx->kv_self.size);
-        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
-        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
-        view->cells = (struct llama_kv_cache_view_cell *)p;
-        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
-        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
-        view->cells_sequences = (llama_seq_id *)p;
-    }
-
-    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
-    llama_kv_cache_view_cell * c_curr = view->cells;
-    llama_seq_id * cs_curr = view->cells_sequences;
-    int32_t used_cells = 0;
-    int32_t token_count = 0;
-    int32_t curr_contig_idx = -1;
-    uint32_t max_contig = 0;
-    int32_t max_contig_idx = -1;
-
-    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
-        const size_t curr_size = kv_cells[i].seq_id.size();
-        token_count += curr_size;
-        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
-
-        if (curr_size > 0) {
-            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
-                max_contig = i - curr_contig_idx;
-                max_contig_idx = curr_contig_idx;
-            }
-            curr_contig_idx = -1;
-        } else if (curr_contig_idx < 0) {
-            curr_contig_idx = i;
-        }
-
-        int seq_idx = 0;
-        for (const llama_seq_id it : kv_cells[i].seq_id) {
-            if (seq_idx >= view->n_max_seq) {
-                break;
-            }
-            cs_curr[seq_idx] = it;
-            seq_idx++;
-        }
-        if (seq_idx != 0) {
-            used_cells++;
-        }
-        for (; seq_idx < view->n_max_seq; seq_idx++) {
-            cs_curr[seq_idx] = -1;
-        }
-    }
-    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
-        max_contig_idx = curr_contig_idx;
-        max_contig = kv_cells.size() - curr_contig_idx;
-    }
-    view->max_contiguous = max_contig;
-    view->max_contiguous_idx = max_contig_idx;
-    view->token_count = token_count;
-    view->used_cells = used_cells;
-    if (uint32_t(used_cells) != ctx->kv_self.used) {
-        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
-            __func__, ctx->kv_self.used, used_cells);
-    }
-}
-
-int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    int result = 0;
-
-    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
-        result += ctx->kv_self.cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
-    return ctx->kv_self.used;
-}
-
-void llama_kv_cache_clear(struct llama_context * ctx) {
-    llama_kv_cache_clear(ctx->kv_self);
-}
-
-void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
-}
-
-void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
-    if (seq_id_src == seq_id_dst) {
-        return;
-    }
-    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
-}
-
-void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
-    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
-}
-
-void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
-    if (delta == 0) {
-        return;
-    }
-
-    llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
-}
-
-void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
-    if (d == 1) {
-        return;
-    }
-
-    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
-}
-
-llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
-    return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
-}
-
-void llama_kv_cache_defrag(struct llama_context * ctx) {
-    llama_kv_cache_defrag(ctx->kv_self);
-}
-
-void llama_kv_cache_update(struct llama_context * ctx) {
-    llama_kv_cache_update_internal(*ctx);
-}
-
-
-// Returns the *maximum* size of the state
-size_t llama_get_state_size(const struct llama_context * ctx) {
-    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
-    // for reference, std::mt19937(1337) serializes to 6701 bytes.
-    const size_t s_rng_size        = sizeof(size_t);
-    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
-    const size_t s_logits_size     = sizeof(size_t);
-    // assume worst case for logits although only currently set ones are serialized
-    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
-    const size_t s_embedding_size  = sizeof(size_t);
-    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
-    const size_t s_kv_size         = sizeof(size_t);
-    const size_t s_kv_ntok         = sizeof(int);
-    const size_t s_kv              = ctx->kv_self.total_size();
-
-    const size_t s_total = (
-        + s_rng_size
-        + s_rng
-        + s_logits_size
-        + s_logits
-        + s_embedding_size
-        + s_embedding
-        + s_kv_size
-        + s_kv_ntok
-        + s_kv
-    );
-
-    return s_total;
-}
-
-// llama_context_data
-struct llama_data_context {
-    virtual void write(const void * src, size_t size) = 0;
-    virtual size_t get_size_written() = 0;
-    virtual ~llama_data_context() = default;
-};
-
-struct llama_data_buffer_context : llama_data_context {
-    uint8_t * ptr;
-    size_t size_written = 0;
-
-    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
-
-    void write(const void * src, size_t size) override {
-        memcpy(ptr, src, size);
-        ptr += size;
-        size_written += size;
-    }
-
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-struct llama_data_file_context : llama_data_context {
-    llama_file * file;
-    size_t size_written = 0;
-
-    llama_data_file_context(llama_file * f) : file(f) {}
-
-    void write(const void * src, size_t size) override {
-        file->write_raw(src, size);
-        size_written += size;
-    }
-
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-/** copy state data into either a buffer or file depending on the passed in context
- *
- * file context:
- * llama_file file("/path", "wb");
- * llama_data_file_context data_ctx(&file);
- * llama_copy_state_data(ctx, &data_ctx);
- *
- * buffer context:
- * std::vector<uint8_t> buf(max_size, 0);
- * llama_data_buffer_context data_ctx(&buf.data());
- * llama_copy_state_data(ctx, &data_ctx);
- *
-*/
-static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
-    // copy rng
-    {
-        std::ostringstream rng_ss;
-        rng_ss << ctx->rng;
-
-        const std::string & rng_str = rng_ss.str();
-        const size_t        rng_size = rng_str.size();
-
-        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
-
-        data_ctx->write(&rng_size,      sizeof(rng_size));
-        data_ctx->write(rng_str.data(), rng_size);
-    }
-
-    // copy logits
-    {
-        const size_t logits_size = ctx->logits.size();
-
-        data_ctx->write(&logits_size, sizeof(logits_size));
-
-        if (logits_size) {
-            data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
-        }
-    }
-
-    // copy embeddings
-    {
-        const size_t embedding_size = ctx->embedding.size();
-
-        data_ctx->write(&embedding_size, sizeof(embedding_size));
-
-        if (embedding_size) {
-            data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
-        }
-    }
-
-    // copy kv cache
-    {
-        const auto & kv_self = ctx->kv_self;
-        const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
-
-        const uint32_t n_layer      = hparams.n_layer;
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t n_ctx        = cparams.n_ctx;
-
-        const size_t   kv_buf_size = kv_self.total_size();
-        const uint32_t kv_head     = kv_self.head;
-        const uint32_t kv_size     = kv_self.size;
-        const uint32_t kv_used     = kv_self.used;
-
-        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
-        data_ctx->write(&kv_head,     sizeof(kv_head));
-        data_ctx->write(&kv_size,     sizeof(kv_size));
-        data_ctx->write(&kv_used,     sizeof(kv_used));
-
-        if (kv_buf_size) {
-            std::vector<uint8_t> tmp_buf;
-            for (int il = 0; il < (int) n_layer; ++il) {
-                const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
-
-                tmp_buf.resize(k_size);
-                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
-                data_ctx->write(tmp_buf.data(), tmp_buf.size());
-
-                // v is not contiguous, copy row by row
-                const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
-
-                tmp_buf.resize(v_row_size);
-                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
-                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
-                }
-            }
-        }
-
-        for (uint32_t i = 0; i < kv_size; ++i) {
-            const auto & cell = kv_self.cells[i];
-
-            const llama_pos pos         = cell.pos;
-            const size_t    seq_id_size = cell.seq_id.size();
-
-            data_ctx->write(&pos,         sizeof(pos));
-            data_ctx->write(&seq_id_size, sizeof(seq_id_size));
-
-            for (auto seq_id : cell.seq_id) {
-                data_ctx->write(&seq_id, sizeof(seq_id));
-            }
-        }
-    }
-}
-
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
-    llama_data_buffer_context data_ctx(dst);
-    llama_copy_state_data_internal(ctx, &data_ctx);
-
-    return data_ctx.get_size_written();
-}
-
-// Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
-
-    // set rng
-    {
-        size_t rng_size;
-        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
-
-        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
-
-        std::string rng_str((const char *)inp, rng_size); inp += rng_size;
-
-        std::istringstream rng_ss(rng_str);
-        rng_ss >> ctx->rng;
-
-        GGML_ASSERT(!rng_ss.fail());
-    }
-
-    // set logits
-    {
-        size_t logits_size;
-
-        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
-
-        GGML_ASSERT(ctx->logits.capacity() >= logits_size);
-
-        if (logits_size) {
-            ctx->logits.resize(logits_size);
-
-            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
-            inp += logits_size * sizeof(float);
-        }
-    }
-
-    // set embeddings
-    {
-        size_t embedding_size;
-
-        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
-
-        GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
-
-        if (embedding_size) {
-            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
-            inp += embedding_size * sizeof(float);
-        }
-    }
-
-    // set kv cache
-    {
-        const auto & kv_self = ctx->kv_self;
-        const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
-
-        const uint32_t n_layer      = hparams.n_layer;
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t n_ctx        = cparams.n_ctx;
-
-        size_t   kv_buf_size;
-        uint32_t kv_head;
-        uint32_t kv_size;
-        uint32_t kv_used;
-
-        memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
-        memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
-        memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
-        memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);
-
-        if (kv_buf_size) {
-            GGML_ASSERT(kv_self.total_size() == kv_buf_size);
-
-            for (int il = 0; il < (int) n_layer; ++il) {
-                const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
-
-                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
-                inp += k_size;
-
-                // v is not contiguous, copy row by row
-                const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
-
-                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
-                    inp += v_row_size;
-                }
-            }
-        }
-
-        ctx->kv_self.head = kv_head;
-        ctx->kv_self.size = kv_size;
-        ctx->kv_self.used = kv_used;
-
-        ctx->kv_self.cells.resize(kv_size);
-
-        for (uint32_t i = 0; i < kv_size; ++i) {
-            llama_pos pos;
-            size_t    seq_id_size;
-
-            memcpy(&pos,         inp, sizeof(pos));         inp += sizeof(pos);
-            memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
-
-            ctx->kv_self.cells[i].pos = pos;
-
-            llama_seq_id seq_id;
-
-            for (size_t j = 0; j < seq_id_size; ++j) {
-                memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
-                ctx->kv_self.cells[i].seq_id.insert(seq_id);
-            }
-        }
-    }
-
-    const size_t nread    = inp - src;
-    const size_t max_size = llama_get_state_size(ctx);
-
-    GGML_ASSERT(nread <= max_size);
-
-    return nread;
-}
-
-static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    llama_file file(path_session, "rb");
-
-    // sanity checks
-    {
-        const uint32_t magic   = file.read_u32();
-        const uint32_t version = file.read_u32();
-
-        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
-            LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-            return false;
-        }
-
-        llama_hparams session_hparams;
-        file.read_raw(&session_hparams, sizeof(llama_hparams));
-
-        if (session_hparams != ctx->model.hparams) {
-            LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
-            return false;
-        }
-    }
-
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
-
-        if (n_token_count > n_token_capacity) {
-            LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
-
-        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
-
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_get_state_size(ctx);
-
-        if (n_state_size_cur > n_state_size_max) {
-            LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
-        }
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
-
-        llama_set_state_data(ctx, state_data.data());
-    }
-
-    return true;
-}
-
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    try {
-        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
-        return false;
-    }
-}
-
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    llama_file file(path_session, "wb");
-
-    file.write_u32(LLAMA_SESSION_MAGIC);
-    file.write_u32(LLAMA_SESSION_VERSION);
-
-    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
-    // save the prompt
-    file.write_u32((uint32_t) n_token_count);
-    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
-    // save the context state using stream saving
-    llama_data_file_context data_ctx(&file);
-    llama_copy_state_data_internal(ctx, &data_ctx);
-
-    return true;
-}
-
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
-    ctx->cparams.n_threads       = n_threads;
-    ctx->cparams.n_threads_batch = n_threads_batch;
-}
-
-struct llama_batch llama_batch_get_one(
-             llama_token * tokens,
-                 int32_t   n_tokens,
-               llama_pos   pos_0,
-            llama_seq_id   seq_id) {
-    return {
-        /*n_tokens       =*/ n_tokens,
-        /*tokens         =*/ tokens,
-        /*embd           =*/ nullptr,
-        /*pos            =*/ nullptr,
-        /*n_seq_id       =*/ nullptr,
-        /*seq_id         =*/ nullptr,
-        /*logits         =*/ nullptr,
-        /*all_pos_0      =*/ pos_0,
-        /*all_pos_1      =*/ 1,
-        /*all_seq_id     =*/ seq_id,
-    };
-}
-
-struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
-
-    if (embd) {
-        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
-    } else {
-        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
-    }
-
-    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
-    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
-    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
-    for (int i = 0; i < n_tokens_alloc; ++i) {
-        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
-    }
-    batch.seq_id[n_tokens_alloc] = nullptr;
-
-    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens_alloc);
-
-    return batch;
-}
-
-void llama_batch_free(struct llama_batch batch) {
-    if (batch.token)    free(batch.token);
-    if (batch.embd)     free(batch.embd);
-    if (batch.pos)      free(batch.pos);
-    if (batch.n_seq_id) free(batch.n_seq_id);
-    if (batch.seq_id) {
-        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
-            free(batch.seq_id[i]);
-        }
-        free(batch.seq_id);
-    }
-    if (batch.logits)   free(batch.logits);
-}
-
-int32_t llama_decode(
-        struct llama_context * ctx,
-          struct llama_batch   batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-float * llama_get_logits(struct llama_context * ctx) {
-    return ctx->logits.data();
-}
-
-float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
-    assert(ctx->logits_valid.at(i));
-    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
-}
-
-float * llama_get_embeddings(struct llama_context * ctx) {
-    return ctx->embedding.data();
-}
-
-float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
-    return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
-}
-
-const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
-    return model->vocab.id_to_token[token].text.c_str();
-}
-
-float llama_token_get_score(const struct llama_model * model, llama_token token) {
-    return model->vocab.id_to_token[token].score;
-}
-
-llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
-    return model->vocab.id_to_token[token].type;
-}
-
-llama_token llama_token_bos(const struct llama_model * model) {
-    return model->vocab.special_bos_id;
-}
-
-llama_token llama_token_eos(const struct llama_model * model) {
-    return model->vocab.special_eos_id;
-}
-
-llama_token llama_token_nl(const struct llama_model * model) {
-    return model->vocab.linefeed_id;
-}
-
-int32_t llama_add_bos_token(const struct llama_model * model) {
-    return model->vocab.special_add_bos;
-}
-
-int32_t llama_add_eos_token(const struct llama_model * model) {
-    return model->vocab.special_add_eos;
-}
-
-llama_token llama_token_prefix(const struct llama_model * model) {
-    return model->vocab.special_prefix_id;
-}
-
-llama_token llama_token_middle(const struct llama_model * model) {
-    return model->vocab.special_middle_id;
-}
-
-llama_token llama_token_suffix(const struct llama_model * model) {
-    return model->vocab.special_suffix_id;
-}
-
-llama_token llama_token_eot(const struct llama_model * model) {
-    return model->vocab.special_eot_id;
-}
-
-int32_t llama_tokenize(
-    const struct llama_model * model,
-                  const char * text,
-                     int32_t   text_len,
-                 llama_token * tokens,
-                     int32_t   n_max_tokens,
-                        bool   add_bos,
-                        bool   special) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
-
-    if (n_max_tokens < (int) res.size()) {
-        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
-        return -((int) res.size());
-    }
-
-    for (size_t i = 0; i < res.size(); i++) {
-        tokens[i] = res[i];
-    }
-
-    return res.size();
-}
-
-static std::string llama_decode_text(const std::string & text) {
-    std::string decoded_text;
-    auto unicode_sequences = codepoints_from_utf8(text);
-    for (auto& unicode_sequence : unicode_sequences) {
-        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
-    }
-
-    return decoded_text;
-}
-
-// does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
-    if (0 <= token && token < llama_n_vocab(model)) {
-        switch (llama_vocab_get_type(model->vocab)) {
-        case LLAMA_VOCAB_TYPE_WPM:
-        case LLAMA_VOCAB_TYPE_SPM: {
-            // NOTE: we accept all unsupported token types,
-            // suppressing them like CONTROL tokens.
-            if (llama_is_normal_token(model->vocab, token)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                llama_unescape_whitespace(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
-            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-                if (length < 3) {
-                    return -3;
-                }
-                memcpy(buf, "\xe2\x96\x85", 3);
-                return 3;
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
-            } else if (llama_is_byte_token(model->vocab, token)) {
-                if (length < 1) {
-                    return -1;
-                }
-                buf[0] = llama_token_to_byte(model->vocab, token);
-                return 1;
-            }
-            break;
-        }
-        case LLAMA_VOCAB_TYPE_BPE: {
-            // NOTE: we accept all unsupported token types,
-            // suppressing them like CONTROL tokens.
-            if (llama_is_normal_token(model->vocab, token)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                result = llama_decode_text(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
-            }
-            break;
-        }
-        default:
-            GGML_ASSERT(false);
-        }
-    }
-    return 0;
-}
-
-// trim whitespace from the beginning and end of a string
-static std::string trim(const std::string & str) {
-    size_t start = 0;
-    size_t end = str.size();
-    while (start < end && isspace(str[start])) {
-        start += 1;
-    }
-    while (end > start && isspace(str[end - 1])) {
-        end -= 1;
-    }
-    return str.substr(start, end - start);
-}
-
-// Simple version of "llama_apply_chat_template" that only works with strings
-// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
-static int32_t llama_chat_apply_template_internal(
-    const std::string & tmpl,
-    const std::vector<const llama_chat_message *> & chat,
-    std::string & dest, bool add_ass) {
-    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
-    std::stringstream ss;
-    if (tmpl.find("<|im_start|>") != std::string::npos) {
-        // chatml template
-        for (auto message : chat) {
-            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|im_start|>assistant\n";
-        }
-    } else if (tmpl.find("[INST]") != std::string::npos) {
-        // llama2 template and its variants
-        // [variant] support system message
-        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
-        // [variant] space before + after response
-        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
-        // [variant] add BOS inside history
-        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
-        // [variant] trim spaces from the input message
-        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
-        // construct the prompt
-        bool is_inside_turn = true; // skip BOS at the beginning
-        ss << "[INST] ";
-        for (auto message : chat) {
-            std::string content = strip_message ? trim(message->content) : message->content;
-            std::string role(message->role);
-            if (!is_inside_turn) {
-                is_inside_turn = true;
-                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
-            }
-            if (role == "system") {
-                if (support_system_message) {
-                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
-                } else {
-                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
-                    ss << content << "\n";
-                }
-            } else if (role == "user") {
-                ss << content << " [/INST]";
-            } else {
-                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
-                is_inside_turn = false;
-            }
-        }
-        // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl.find("<|user|>") != std::string::npos) {
-        // zephyr template
-        for (auto message : chat) {
-            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
-    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
-        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
-        for (auto message : chat) {
-            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
-            ss << bos << message->role << "\n" << message->content << "</s>\n";
-        }
-        if (add_ass) {
-            ss << "<s>assistant\n";
-        }
-    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
-        // google/gemma-7b-it
-        std::string system_prompt = "";
-        for (auto message : chat) {
-            std::string role(message->role);
-            if (role == "system") {
-                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
-                system_prompt = trim(message->content);
-                continue;
-            }
-            // in gemma, "assistant" is "model"
-            role = role == "assistant" ? "model" : message->role;
-            ss << "<start_of_turn>" << role << "\n";
-            if (!system_prompt.empty() && role != "model") {
-                ss << system_prompt << "\n\n";
-                system_prompt = "";
-            }
-            ss << trim(message->content) << "<end_of_turn>\n";
-        }
-        if (add_ass) {
-            ss << "<start_of_turn>model\n";
-        }
-    } else {
-        // template not supported
-        return -1;
-    }
-    dest = ss.str();
-    return dest.size();
-}
-
-LLAMA_API int32_t llama_chat_apply_template(
-                const struct llama_model * model,
-                              const char * tmpl,
-         const struct llama_chat_message * chat,
-                                  size_t   n_msg,
-                                    bool   add_ass,
-                                    char * buf,
-                                 int32_t   length) {
-    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
-    if (tmpl == nullptr) {
-        GGML_ASSERT(model != nullptr);
-        // load template from model
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res < 0) {
-            // worst case: there is no information about template, we will use chatml by default
-            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
-        } else {
-            curr_tmpl = std::string(model_template.data(), model_template.size());
-        }
-    }
-    // format the chat to string
-    std::vector<const llama_chat_message *> chat_vec;
-    chat_vec.resize(n_msg);
-    for (size_t i = 0; i < n_msg; i++) {
-        chat_vec[i] = &chat[i];
-    }
-    std::string formatted_chat;
-    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
-    if (res < 0) {
-        return res;
-    }
-    strncpy(buf, formatted_chat.c_str(), length);
-    return res;
-}
-
-struct llama_timings llama_get_timings(struct llama_context * ctx) {
-    struct llama_timings result = {
-        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
-        /*.n_eval   =*/ std::max(1, ctx->n_eval),
-    };
-
-    return result;
-}
-
-void llama_print_timings(struct llama_context * ctx) {
-    const llama_timings timings = llama_get_timings(ctx);
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx) {
-    ctx->t_start_us = ggml_time_us();
-    ctx->t_sample_us = ctx->n_sample = 0;
-    ctx->t_eval_us   = ctx->n_eval   = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-}
-
-const char * llama_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
-    s += "AVX_VNNI = "    + std::to_string(ggml_cpu_has_avx_vnni())    + " | ";
-    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
-    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
-    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
-    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
-    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
-    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
-    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
-    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
-    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
-    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
-    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
-    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-
-    return s.c_str();
-}
-
-void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
-    fprintf(stream, "\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "# Timings #\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
-            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
-    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
-            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
-            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
-    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
-    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
-    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
-    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
-    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->t_sample_us);
-    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
-            1.0e6 * ctx->n_eval / ctx->t_eval_us);
-    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
-            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
-            1.0e6 * ctx->n_sample / ctx->t_sample_us);
-}
-
-// For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-) {
-    return ctx->model.tensors_by_name;
-}
-
-void llama_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
-    g_state.log_callback_user_data = user_data;
-#ifdef GGML_USE_METAL
-    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#endif
-}
-
-static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
-    va_list args_copy;
-    va_copy(args_copy, args);
-    char buffer[128];
-    int len = vsnprintf(buffer, 128, format, args);
-    if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
-    } else {
-        char* buffer2 = new char[len+1];
-        vsnprintf(buffer2, len+1, format, args_copy);
-        buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
-        delete[] buffer2;
-    }
-    va_end(args_copy);
-}
-
-static void llama_log_internal(ggml_log_level level, const char * format, ...) {
-    va_list args;
-    va_start(args, format);
-    llama_log_internal_v(level, format, args);
-    va_end(args);
-}
-
-static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
diff --git a/llama.h b/llama.h
deleted file mode 100644
index ed51f478a..000000000
--- a/llama.h
+++ /dev/null
@@ -1,949 +0,0 @@
-#ifndef LLAMA_H
-#define LLAMA_H
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define LLAMA_API __declspec(dllexport)
-#        else
-#            define LLAMA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAMA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAMA_API
-#endif
-
-#ifdef __GNUC__
-#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
-#elif defined(_MSC_VER)
-#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
-#else
-#    define DEPRECATED(func, hint) func
-#endif
-
-#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-
-#define LLAMA_MAX_RNG_STATE (64*1024)
-
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
-
-#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct llama_model;
-    struct llama_context;
-
-    typedef int32_t llama_pos;
-    typedef int32_t llama_token;
-    typedef int32_t llama_seq_id;
-
-    enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
-    };
-
-    // note: these values should be synchronized with ggml_rope
-    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
-    enum llama_rope_type {
-        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX =  2,
-        LLAMA_ROPE_TYPE_GLM  =  4,
-    };
-
-    enum llama_token_type {
-        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
-        LLAMA_TOKEN_TYPE_NORMAL       = 1,
-        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
-        LLAMA_TOKEN_TYPE_CONTROL      = 3,
-        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
-        LLAMA_TOKEN_TYPE_UNUSED       = 5,
-        LLAMA_TOKEN_TYPE_BYTE         = 6,
-    };
-
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_XS        = 22, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_NL        = 25, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_S         = 26, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_M         = 27, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
-
-        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
-    };
-
-    enum llama_rope_scaling_type {
-        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
-        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
-        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
-        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
-    };
-
-    enum llama_pooling_type {
-        LLAMA_POOLING_TYPE_NONE = 0,
-        LLAMA_POOLING_TYPE_MEAN = 1,
-        LLAMA_POOLING_TYPE_CLS  = 2,
-    };
-
-    enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW     = 2, // split rows across GPUs
-    };
-
-    typedef struct llama_token_data {
-        llama_token id; // token id
-        float logit;    // log-odds of the token
-        float p;        // probability of the token
-    } llama_token_data;
-
-    typedef struct llama_token_data_array {
-        llama_token_data * data;
-        size_t size;
-        bool sorted;
-    } llama_token_data_array;
-
-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
-
-    // Input data for llama_decode
-    // A llama_batch object can contain input about one or many sequences
-    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-    //
-    // - token  : the token ids of the input (used when embd is NULL)
-    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
-    // - pos    : the positions of the respective token in the sequence
-    // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
-    //
-    typedef struct llama_batch {
-        int32_t n_tokens;
-
-        llama_token  *  token;
-        float        *  embd;
-        llama_pos    *  pos;
-        int32_t      *  n_seq_id;
-        llama_seq_id ** seq_id;
-        int8_t       *  logits;
-
-        // NOTE: helpers for smooth API transition - can be deprecated in the future
-        //       for future-proof code, use the above fields instead and ignore everything below
-        //
-        // pos[i] = all_pos_0 + i*all_pos_1
-        //
-        llama_pos    all_pos_0;  // used if pos == NULL
-        llama_pos    all_pos_1;  // used if pos == NULL
-        llama_seq_id all_seq_id; // used if seq_id == NULL
-    } llama_batch;
-
-    enum llama_model_kv_override_type {
-        LLAMA_KV_OVERRIDE_TYPE_INT,
-        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
-        LLAMA_KV_OVERRIDE_TYPE_BOOL,
-    };
-
-    struct llama_model_kv_override {
-        char key[128];
-        enum llama_model_kv_override_type tag;
-        union {
-            int64_t int_value;
-            double float_value;
-            bool bool_value;
-        };
-    };
-
-    struct llama_model_params {
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
-
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
-        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
-        const float * tensor_split;
-
-        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-        // If the provided progress_callback returns true, model loading continues.
-        // If it returns false, model loading is immediately aborted.
-        llama_progress_callback progress_callback;
-
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-
-        // override key-value pairs of the model meta data
-        const struct llama_model_kv_override * kv_overrides;
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mmap;   // use mmap if possible
-        bool use_mlock;  // force system to keep model in RAM
-    };
-
-    struct llama_context_params {
-        uint32_t seed;              // RNG seed, -1 for random
-        uint32_t n_ctx;             // text context, 0 = from model
-        uint32_t n_batch;           // prompt processing maximum batch size
-        uint32_t n_threads;         // number of threads to use for generation
-        uint32_t n_threads_batch;   // number of threads to use for batch processing
-        int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
-
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float    rope_freq_base;   // RoPE base frequency, 0 = from model
-        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
-        float    yarn_attn_factor; // YaRN magnitude scaling factor
-        float    yarn_beta_fast;   // YaRN low correction dim
-        float    yarn_beta_slow;   // YaRN high correction dim
-        uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
-
-        ggml_backend_sched_eval_callback cb_eval;
-        void * cb_eval_user_data;
-
-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
-
-        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;   // embedding mode only
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
-    };
-
-    // model quantization parameters
-    typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
-        void * imatrix;              // pointer to importance matrix data
-    } llama_model_quantize_params;
-
-    // grammar types
-    struct llama_grammar;
-
-    // grammar element type
-    enum llama_gretype {
-        // end of rule definition
-        LLAMA_GRETYPE_END            = 0,
-
-        // start of alternate definition for rule
-        LLAMA_GRETYPE_ALT            = 1,
-
-        // non-terminal element: reference to rule
-        LLAMA_GRETYPE_RULE_REF       = 2,
-
-        // terminal element: character (code point)
-        LLAMA_GRETYPE_CHAR           = 3,
-
-        // inverse char(s) ([^a], [^a-b] [^abc])
-        LLAMA_GRETYPE_CHAR_NOT       = 4,
-
-        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
-        // be an inclusive range ([a-z])
-        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
-
-        // modifies a preceding LLAMA_GRETYPE_CHAR or
-        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
-        LLAMA_GRETYPE_CHAR_ALT       = 6,
-    };
-
-    typedef struct llama_grammar_element {
-        enum llama_gretype type;
-        uint32_t           value; // Unicode code point or rule ID
-    } llama_grammar_element;
-
-    // performance timing information
-    struct llama_timings {
-        double t_start_ms;
-        double t_end_ms;
-        double t_load_ms;
-        double t_sample_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_sample;
-        int32_t n_p_eval;
-        int32_t n_eval;
-    };
-
-    // used in chat template
-    typedef struct llama_chat_message {
-        const char * role;
-        const char * content;
-    } llama_chat_message;
-
-    // Helpers for getting default parameters
-    LLAMA_API struct llama_model_params llama_model_default_params(void);
-    LLAMA_API struct llama_context_params llama_context_default_params(void);
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
-
-    // Initialize the llama + ggml backend
-    // If numa is true, use NUMA optimizations
-    // Call once at the start of the program
-    LLAMA_API void llama_backend_init(void);
-
-    //optional:
-    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
-
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free(void);
-
-    LLAMA_API struct llama_model * llama_load_model_from_file(
-                             const char * path_model,
-            struct llama_model_params     params);
-
-    LLAMA_API void llama_free_model(struct llama_model * model);
-
-    LLAMA_API struct llama_context * llama_new_context_with_model(
-                     struct llama_model * model,
-            struct llama_context_params   params);
-
-    // Frees all allocated memory
-    LLAMA_API void llama_free(struct llama_context * ctx);
-
-    LLAMA_API int64_t llama_time_us(void);
-
-    LLAMA_API size_t llama_max_devices(void);
-
-    LLAMA_API bool llama_supports_mmap       (void);
-    LLAMA_API bool llama_supports_mlock      (void);
-    LLAMA_API bool llama_supports_gpu_offload(void);
-
-    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
-
-    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
-
-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
-
-    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
-
-    // Get the model's RoPE frequency scaling factor
-    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
-
-    // Functions to access the model's GGUF metadata scalar values
-    // - The functions return the length of the string on success, or -1 on failure
-    // - The output string is always null-terminated and cleared on failure
-    // - GGUF array values are not supported by these functions
-
-    // Get metadata value as a string by key name
-    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
-
-    // Get the number of metadata key/value pairs
-    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
-
-    // Get metadata key name by index
-    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
-
-    // Get metadata value as a string by index
-    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
-
-    // Get a string describing the model type
-    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
-
-    // Returns the total size of all the tensors in the model in bytes
-    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
-
-    // Returns the total number of parameters in the model
-    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
-
-    // Get a llama model tensor
-    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
-    // Returns 0 on success
-    LLAMA_API uint32_t llama_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-            const llama_model_quantize_params * params);
-
-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads);
-
-    //
-    // KV cache
-    //
-
-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_max_seq items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
-
-    // Clear the KV cache
-    LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // seq_id < 0 : match any sequence
-    // p0 < 0     : [0,  p1]
-    // p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1);
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1);
-
-    // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta);
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d);
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
-
-    //
-    // State / sessions
-    //
-
-    // Returns the maximum size in bytes of the state (rng, logits, embedding
-    // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
-
-    // Copies the state to the specified destination address.
-    // Destination needs to have allocated enough memory.
-    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
-            struct llama_context * ctx,
-                         uint8_t * dst);
-
-    // Set the state reading from the specified address
-    // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
-            struct llama_context * ctx,
-                   const uint8_t * src);
-
-    // Save/load session file
-    LLAMA_API bool llama_load_session_file(
-            struct llama_context * ctx,
-                      const char * path_session,
-                     llama_token * tokens_out,
-                          size_t   n_token_capacity,
-                          size_t * n_token_count_out);
-
-    LLAMA_API bool llama_save_session_file(
-            struct llama_context * ctx,
-                      const char * path_session,
-               const llama_token * tokens,
-                          size_t   n_token_count);
-
-    //
-    // Decoding
-    //
-
-    // Return batch for single sequence of tokens starting at pos_0
-    //
-    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
-    //
-    LLAMA_API struct llama_batch llama_batch_get_one(
-                  llama_token * tokens,
-                      int32_t   n_tokens,
-                    llama_pos   pos_0,
-                 llama_seq_id   seq_id);
-
-    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
-    // Each token can be assigned up to n_seq_max sequence ids
-    // The batch has to be freed with llama_batch_free()
-    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
-    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
-    // The rest of the llama_batch members are allocated with size n_tokens
-    // All members are left uninitialized
-    LLAMA_API struct llama_batch llama_batch_init(
-            int32_t n_tokens,
-            int32_t embd,
-            int32_t n_seq_max);
-
-    // Frees a batch of tokens allocated with llama_batch_init()
-    LLAMA_API void llama_batch_free(struct llama_batch batch);
-
-    // Positive return values does not mean a fatal error, but rather a warning.
-    //   0 - success
-    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error
-    LLAMA_API int32_t llama_decode(
-            struct llama_context * ctx,
-              struct llama_batch   batch);
-
-    // Set the number of threads used for decoding
-    // n_threads is the number of threads used for generation (single token)
-    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
-    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Get the embeddings for the ith sequence
-    // llama_get_embeddings(ctx) + i*n_embd
-    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
-
-    //
-    // Vocab
-    //
-
-    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
-
-    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
-
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
-
-    // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
-
-    //
-    // Tokenization
-    //
-
-    /// @details Convert the provided text into tokens.
-    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
-    /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
-    LLAMA_API int32_t llama_tokenize(
-        const struct llama_model * model,
-                      const char * text,
-                         int32_t   text_len,
-                     llama_token * tokens,
-                         int32_t   n_max_tokens,
-                            bool   add_bos,
-                            bool   special);
-
-    // Token Id -> Piece.
-    // Uses the vocabulary in the provided context.
-    // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int32_t llama_token_to_piece(
-              const struct llama_model * model,
-                           llama_token   token,
-                                  char * buf,
-                               int32_t   length);
-
-    /// Apply chat template. Inspired by hf apply_chat_template() on python.
-    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
-    /// @param chat Pointer to a list of multiple llama_chat_message
-    /// @param n_msg Number of llama_chat_message in this chat
-    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
-    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
-    /// @param length The size of the allocated buffer
-    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
-    LLAMA_API int32_t llama_chat_apply_template(
-              const struct llama_model * model,
-                            const char * tmpl,
-       const struct llama_chat_message * chat,
-                                size_t   n_msg,
-                                  bool   add_ass,
-                                  char * buf,
-                               int32_t   length);
-
-    //
-    // Grammar
-    //
-
-    LLAMA_API struct llama_grammar * llama_grammar_init(
-            const llama_grammar_element ** rules,
-                                 size_t    n_rules,
-                                 size_t    start_rule_index);
-
-    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
-
-    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
-
-    //
-    // Sampling functions
-    //
-
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
-
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present);
-
-    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param logits Logits extracted from the original generation context.
-    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_apply_guidance(
-              struct llama_context * ctx,
-                             float * logits,
-                             float * logits_guidance,
-                             float   scale);
-
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates);
-
-    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                         int32_t   k,
-                          size_t   min_keep);
-
-    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
-
-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API void llama_sample_min_p(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
-
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   z,
-                          size_t   min_keep);
-
-    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
-
-    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API void llama_sample_entropy(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates_p,
-                           float   min_temp,
-                           float   max_temp,
-                           float   exponent_val);
-
-    LLAMA_API void llama_sample_temp(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   temp);
-
-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
-
-    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   tau,
-                           float   eta,
-                         int32_t   m,
-                           float * mu);
-
-    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   tau,
-                           float   eta,
-                           float * mu);
-
-    /// @details Selects the token with the highest probability.
-    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
-    LLAMA_API llama_token llama_sample_token_greedy(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates);
-
-    /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates);
-
-    /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(
-            struct llama_context * ctx,
-            struct llama_grammar * grammar,
-                     llama_token   token);
-
-    //
-    // Beam search
-    //
-
-    struct llama_beam_view {
-        const llama_token * tokens;
-
-        size_t n_tokens;
-        float  p;        // Cumulative beam probability (renormalized relative to all beams)
-        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
-    };
-
-    // Passed to beam_search_callback function.
-    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-    // These pointers are valid only during the synchronous callback, so should not be saved.
-    struct llama_beams_state {
-        struct llama_beam_view * beam_views;
-
-        size_t n_beams;               // Number of elements in beam_views[].
-        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-        bool   last_call;             // True iff this is the last callback invocation.
-    };
-
-    // Type of pointer to the beam_search_callback function.
-    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-
-    /// @details Deterministically returns entire sentence constructed by a beam search.
-    /// @param ctx Pointer to the llama_context.
-    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-    /// @param callback_data A pointer that is simply passed back to callback.
-    /// @param n_beams Number of beams to use.
-    /// @param n_past Number of tokens already evaluated.
-    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    LLAMA_API void llama_beam_search(
-                   struct llama_context * ctx,
-        llama_beam_search_callback_fn_t   callback,
-                                   void * callback_data,
-                                 size_t   n_beams,
-                                int32_t   n_past,
-                                int32_t   n_predict);
-
-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
-
-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
-
-#ifdef __cplusplus
-}
-#endif
-
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
-
-#include <vector>
-#include <string>
-
-struct ggml_tensor;
-
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
-
-#endif // LLAMA_API_INTERNAL
-
-#endif // LLAMA_H
diff --git a/media/llama-leader.jpeg b/media/llama-leader.jpeg
deleted file mode 100644
index 0b4e6e1cf..000000000
Binary files a/media/llama-leader.jpeg and /dev/null differ
diff --git a/media/matmul.png b/media/matmul.png
new file mode 100644
index 000000000..786a20492
Binary files /dev/null and b/media/matmul.png differ
diff --git a/media/matmul.svg b/media/matmul.svg
new file mode 100644
index 000000000..1d6cb4bb7
--- /dev/null
+++ b/media/matmul.svg
@@ -0,0 +1,1238 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   width="1150"
+   height="600"
+   viewBox="0 0 304.27084 158.75"
+   version="1.1"
+   id="svg1"
+   inkscape:version="1.3.2 (091e20ef0f, 2023-11-25, custom)"
+   sodipodi:docname="matmul.svg"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <sodipodi:namedview
+     id="namedview1"
+     pagecolor="#ffffff"
+     bordercolor="#000000"
+     borderopacity="0.25"
+     inkscape:showpageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:document-units="mm"
+     inkscape:zoom="1.4677624"
+     inkscape:cx="586.60719"
+     inkscape:cy="306.92978"
+     inkscape:window-width="2560"
+     inkscape:window-height="1360"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="layer1"
+     showgrid="false">
+    <inkscape:grid
+       id="grid1"
+       units="mm"
+       originx="0"
+       originy="0"
+       spacingx="0.99999997"
+       spacingy="1"
+       empcolor="#0099e5"
+       empopacity="0.30196078"
+       color="#0099e5"
+       opacity="0.14901961"
+       empspacing="5"
+       dotted="false"
+       gridanglex="30"
+       gridanglez="30"
+       visible="false" />
+  </sodipodi:namedview>
+  <defs
+     id="defs1">
+    <marker
+       style="overflow:visible"
+       id="DartArrow"
+       refX="0"
+       refY="0"
+       orient="auto-start-reverse"
+       inkscape:stockid="Dart arrow"
+       markerWidth="1"
+       markerHeight="1"
+       viewBox="0 0 1 1"
+       inkscape:isstock="true"
+       inkscape:collect="always"
+       preserveAspectRatio="xMidYMid">
+      <path
+         style="fill:context-stroke;fill-rule:evenodd;stroke:none"
+         d="M 0,0 5,-5 -12.5,0 5,5 Z"
+         transform="scale(-0.5)"
+         id="path6" />
+    </marker>
+  </defs>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1">
+    <g
+       id="g16"
+       transform="matrix(0,2.0000411,-2.0000411,0,70.001026,79.998976)"
+       style="stroke-width:0.264583;stroke-dasharray:none">
+      <g
+         id="g15"
+         style="stroke-width:0.264583;stroke-dasharray:none">
+        <rect
+           style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="rect1"
+           width="19.999998"
+           height="20"
+           x="4.9999995"
+           y="5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="path1"
+           cx="7.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse1"
+           cx="7.4999995"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse2"
+           cx="7.4999995"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse3"
+           cx="7.4999995"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 9.9999995,5 V 25"
+           id="path3" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse4"
+           cx="12.499999"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse5"
+           cx="12.499999"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse6"
+           cx="12.499999"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse7"
+           cx="12.499999"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 14.999999,5 V 25"
+           id="path7" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse8"
+           cx="17.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse9"
+           cx="17.5"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse10"
+           cx="17.5"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse11"
+           cx="17.5"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 19.971686,5 V 25"
+           id="path11" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse12"
+           cx="22.471687"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse13"
+           cx="22.471687"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse14"
+           cx="22.471687"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse15"
+           cx="22.471687"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+      </g>
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:3.175px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-opacity:1"
+       x="44"
+       y="33"
+       id="text49"><tspan
+         sodipodi:role="line"
+         id="tspan49"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:'Liberation Sans';-inkscape-font-specification:'Liberation Sans Italic';stroke-width:0.264583"
+         x="44"
+         y="33" /></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:3.175px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-opacity:1"
+       x="44"
+       y="65"
+       id="text52"><tspan
+         sodipodi:role="line"
+         id="tspan52"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:'Liberation Sans';-inkscape-font-specification:'Liberation Sans Italic';stroke-width:0.264583"
+         x="44"
+         y="65" /></text>
+    <g
+       id="g71"
+       transform="matrix(0,2.0000411,-2.0000411,0,130.00184,19.998976)"
+       style="stroke-width:0.264583;stroke-dasharray:none">
+      <g
+         id="g70"
+         style="stroke-width:0.264583;stroke-dasharray:none"
+         transform="rotate(90,14.999999,15)">
+        <rect
+           style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="rect55"
+           width="19.999998"
+           height="20"
+           x="4.9999995"
+           y="5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse55"
+           cx="7.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse56"
+           cx="7.4999995"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse57"
+           cx="7.4999995"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse58"
+           cx="7.4999995"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 9.9999995,5 V 25"
+           id="path58" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse59"
+           cx="12.499999"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse60"
+           cx="12.499999"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse61"
+           cx="12.499999"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse62"
+           cx="12.499999"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 14.999999,5 V 25"
+           id="path62" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse63"
+           cx="17.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse64"
+           cx="17.5"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse65"
+           cx="17.5"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse66"
+           cx="17.5"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 19.971686,5 V 25"
+           id="path66" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse67"
+           cx="22.471687"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse68"
+           cx="22.471687"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse69"
+           cx="22.471687"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse70"
+           cx="22.471687"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+      </g>
+      <g
+         id="g90"
+         style="stroke-width:0.264583;stroke-dasharray:none"
+         transform="rotate(90,29.999486,29.999486)">
+        <rect
+           style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="rect75"
+           width="19.999998"
+           height="20"
+           x="4.9999995"
+           y="5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse75"
+           cx="7.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse76"
+           cx="7.4999995"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse77"
+           cx="7.4999995"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse78"
+           cx="7.4999995"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 9.9999995,5 V 25"
+           id="path78" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse79"
+           cx="12.499999"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse80"
+           cx="12.499999"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse81"
+           cx="12.499999"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse82"
+           cx="12.499999"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 14.999999,5 V 25"
+           id="path82" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse83"
+           cx="17.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse84"
+           cx="17.5"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse85"
+           cx="17.5"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse86"
+           cx="17.5"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 19.971686,5 V 25"
+           id="path86" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse87"
+           cx="22.471687"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse88"
+           cx="22.471687"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse89"
+           cx="22.471687"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse90"
+           cx="22.471687"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+      </g>
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Sans';-inkscape-font-specification:'Nimbus Sans';text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+       x="39.657513"
+       y="140.84073"
+       id="text71"><tspan
+         sodipodi:role="line"
+         id="tspan71"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Roman';-inkscape-font-specification:'Nimbus Roman,  Italic';stroke-width:0.264583"
+         x="39.657513"
+         y="140.84073">A</tspan><tspan
+         sodipodi:role="line"
+         style="font-size:8.46667px;stroke-width:0.264583"
+         x="39.657513"
+         y="151.81354"
+         id="tspan72">Row-major</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Sans';-inkscape-font-specification:'Nimbus Sans';text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+       x="99.848824"
+       y="13.928269"
+       id="text74"><tspan
+         sodipodi:role="line"
+         id="tspan73"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Roman';-inkscape-font-specification:'Nimbus Roman,  Italic';stroke-width:0.264583"
+         x="99.848824"
+         y="13.928269">B<tspan
+   style="font-size:65%;baseline-shift:super"
+   id="tspan75">T</tspan></tspan><tspan
+         sodipodi:role="line"
+         style="font-size:8.46667px;stroke-width:0.264583"
+         x="99.848824"
+         y="24.901073"
+         id="tspan74">Column-major</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Sans';-inkscape-font-specification:'Nimbus Sans';text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+       x="100.00081"
+       y="140.77661"
+       id="text92"><tspan
+         sodipodi:role="line"
+         id="tspan91"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:'Nimbus Roman';-inkscape-font-specification:'Nimbus Roman,  Italic';baseline-shift:baseline;stroke-width:0.264583"
+         x="100.00081"
+         y="140.77661">C<tspan
+   style="font-size:65%;baseline-shift:super"
+   id="tspan164">T</tspan>=AB<tspan
+   style="font-size:65%;baseline-shift:super"
+   id="tspan163">T</tspan></tspan><tspan
+         sodipodi:role="line"
+         style="font-size:8.46667px;stroke-width:0.264583"
+         x="100.00081"
+         y="151.74942"
+         id="tspan92">Column-major</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="M 22.000816,87.999181 H 56.000814"
+       id="path94"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="39.991577"
+       y="86.745056"
+       id="text94"><tspan
+         sodipodi:role="line"
+         id="tspan94"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="39.991577"
+         y="86.745056">ne00</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="M 18.135726,91.999222 18.000817,125.99918"
+       id="path95"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="10.795282"
+       y="111.73724"
+       id="text95"><tspan
+         sodipodi:role="line"
+         id="tspan95"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="10.795282"
+         y="111.73724">ne01</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="M 83.000813,87.999181 H 116.00081"
+       id="path96"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="100.42033"
+       y="86.753548"
+       id="text96"><tspan
+         sodipodi:role="line"
+         id="tspan96"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="100.42033"
+         y="86.753548">ne1</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="M 122.00081,92.999181 V 125.99918"
+       id="path97"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="128.22845"
+       y="111.73724"
+       id="text97"><tspan
+         sodipodi:role="line"
+         id="tspan97"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="128.22845"
+         y="111.73724">ne0</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="m 122.00081,32.999181 v 33"
+       id="path98"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="130.04456"
+       y="51.737244"
+       id="text98"><tspan
+         sodipodi:role="line"
+         id="tspan98"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="130.04456"
+         y="51.737244">ne10</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="M 83.000813,71.999181 H 116.0008"
+       id="path99"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="100.42033"
+       y="77.793732"
+       id="text99"><tspan
+         sodipodi:role="line"
+         id="tspan99"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="100.42033"
+         y="77.793732">ne11</tspan></text>
+    <g
+       id="g115"
+       transform="matrix(-1.0156483e-4,-2.0000411,2.0000411,-1.0156483e-4,170.00049,140.00172)"
+       style="stroke-width:0.264583;stroke-dasharray:none">
+      <g
+         id="g114"
+         style="stroke-width:0.264583;stroke-dasharray:none">
+        <rect
+           style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="rect99"
+           width="19.999998"
+           height="20"
+           x="4.9999995"
+           y="5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse99"
+           cx="7.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse100"
+           cx="7.4999995"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse101"
+           cx="7.4999995"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse102"
+           cx="7.4999995"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 9.9999995,5 V 25"
+           id="path102" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse103"
+           cx="12.499999"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse104"
+           cx="12.499999"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse105"
+           cx="12.499999"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse106"
+           cx="12.499999"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 14.999999,5 V 25"
+           id="path106" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse107"
+           cx="17.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse108"
+           cx="17.5"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse109"
+           cx="17.5"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse110"
+           cx="17.5"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 19.971686,5 V 25"
+           id="path110" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse111"
+           cx="22.471687"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse112"
+           cx="22.471687"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse113"
+           cx="22.471687"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse114"
+           cx="22.471687"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+      </g>
+    </g>
+    <g
+       id="g130"
+       style="stroke-width:0.264583;stroke-dasharray:none"
+       transform="matrix(0,-2.0000411,2.0000411,0,229.99978,80.0002)">
+      <g
+         id="g165"
+         transform="rotate(89.997647,14.999999,15)">
+        <rect
+           style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="rect115"
+           width="19.999998"
+           height="20"
+           x="4.9999995"
+           y="5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse115"
+           cx="7.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse116"
+           cx="7.4999995"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse117"
+           cx="7.4999995"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse118"
+           cx="7.4999995"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 9.9999995,5 V 25"
+           id="path118" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse119"
+           cx="12.499999"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse120"
+           cx="12.499999"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse121"
+           cx="12.499999"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse122"
+           cx="12.499999"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 14.999999,5 V 25"
+           id="path122" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse123"
+           cx="17.5"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse124"
+           cx="17.5"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse125"
+           cx="17.5"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse126"
+           cx="17.5"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <path
+           style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           d="M 19.971686,5 V 25"
+           id="path126" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse127"
+           cx="22.471687"
+           cy="7.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse128"
+           cx="22.471687"
+           cy="12.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse129"
+           cx="22.471687"
+           cy="17.5"
+           rx="1.4999999"
+           ry="1.5" />
+        <ellipse
+           style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+           id="ellipse130"
+           cx="22.471687"
+           cy="22.5"
+           rx="1.4999999"
+           ry="1.5" />
+      </g>
+    </g>
+    <g
+       id="g146"
+       style="stroke-width:0.264583;stroke-dasharray:none"
+       transform="matrix(0,-2.0000411,2.0000411,0,229.99978,139.99938)">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="rect130"
+         width="19.999998"
+         height="20"
+         x="4.9999995"
+         y="5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse131"
+         cx="7.5"
+         cy="7.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse132"
+         cx="7.4999995"
+         cy="12.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse133"
+         cx="7.4999995"
+         cy="17.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse134"
+         cx="7.4999995"
+         cy="22.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <path
+         style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         d="M 9.9999995,5 V 25"
+         id="path134" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse135"
+         cx="12.499999"
+         cy="7.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse136"
+         cx="12.499999"
+         cy="12.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse137"
+         cx="12.499999"
+         cy="17.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse138"
+         cx="12.499999"
+         cy="22.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <path
+         style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         d="M 14.999999,5 V 25"
+         id="path138" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse139"
+         cx="17.5"
+         cy="7.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse140"
+         cx="17.5"
+         cy="12.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse141"
+         cx="17.5"
+         cy="17.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse142"
+         cx="17.5"
+         cy="22.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <path
+         style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         d="M 19.971686,5 V 25"
+         id="path142" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse143"
+         cx="22.471687"
+         cy="7.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse144"
+         cx="22.471687"
+         cy="12.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse145"
+         cx="22.471687"
+         cy="17.5"
+         rx="1.4999999"
+         ry="1.5" />
+      <ellipse
+         style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+         id="ellipse146"
+         cx="22.471687"
+         cy="22.5"
+         rx="1.4999999"
+         ry="1.5" />
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Sans';-inkscape-font-specification:'Nimbus Sans';text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+       x="199.65669"
+       y="140.84073"
+       id="text148"><tspan
+         sodipodi:role="line"
+         id="tspan147"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Roman';-inkscape-font-specification:'Nimbus Roman,  Italic';stroke-width:0.264583"
+         x="199.65669"
+         y="140.84073">B</tspan><tspan
+         sodipodi:role="line"
+         style="font-size:8.46667px;stroke-width:0.264583"
+         x="199.65669"
+         y="151.81354"
+         id="tspan148">Row-major</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Sans';-inkscape-font-specification:'Nimbus Sans';text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+       x="259.84799"
+       y="13.928265"
+       id="text151"><tspan
+         sodipodi:role="line"
+         id="tspan150"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Roman';-inkscape-font-specification:'Nimbus Roman,  Italic';stroke-width:0.264583"
+         x="259.84799"
+         y="13.928265">A<tspan
+   style="font-size:65%;baseline-shift:super"
+   id="tspan166">T</tspan></tspan><tspan
+         sodipodi:role="line"
+         style="font-size:8.46667px;stroke-width:0.264583"
+         x="259.84799"
+         y="24.90107"
+         id="tspan151">Column-major</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Sans';-inkscape-font-specification:'Nimbus Sans';text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-dasharray:none;stroke-opacity:1"
+       x="260"
+       y="140.82664"
+       id="text154"><tspan
+         sodipodi:role="line"
+         id="tspan153"
+         style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8.46667px;font-family:'Nimbus Roman';-inkscape-font-specification:'Nimbus Roman,  Italic';stroke-width:0.264583"
+         x="260"
+         y="140.82664">C=BA<tspan
+   style="font-size:65%;baseline-shift:super"
+   id="tspan167">T</tspan></tspan><tspan
+         sodipodi:role="line"
+         style="font-size:8.46667px;stroke-width:0.264583"
+         x="260"
+         y="151.79945"
+         id="tspan154">Row-major</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="m 181.99999,87.999177 h 34"
+       id="path154"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="199.99075"
+       y="86.745049"
+       id="text155"><tspan
+         sodipodi:role="line"
+         id="tspan155"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="199.99075"
+         y="86.745049">ne10</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="m 178.1349,91.999218 -0.13491,33.999952"
+       id="path155"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="170.79529"
+       y="111.73724"
+       id="text156"><tspan
+         sodipodi:role="line"
+         id="tspan156"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="170.79529"
+         y="111.73724">ne11</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="m 242.99998,87.999177 h 33"
+       id="path156"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="260.41949"
+       y="86.75354"
+       id="text157"><tspan
+         sodipodi:role="line"
+         id="tspan157"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="260.41949"
+         y="86.75354">ne0</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="M 281.99998,92.999177 V 125.99917"
+       id="path157"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="288.21979"
+       y="111.73688"
+       id="text158"><tspan
+         sodipodi:role="line"
+         id="tspan158"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="288.21979"
+         y="111.73688">ne1</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="m 281.99998,32.999177 v 33"
+       id="path158"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="290.0437"
+       y="51.73724"
+       id="text159"><tspan
+         sodipodi:role="line"
+         id="tspan159"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="290.0437"
+         y="51.73724">ne00</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#DartArrow)"
+       d="m 242.99998,71.999177 h 32.99999"
+       id="path159"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:Monospace;-inkscape-font-specification:Monospace;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:none;stroke-opacity:1"
+       x="260.41949"
+       y="77.793724"
+       id="text160"><tspan
+         sodipodi:role="line"
+         id="tspan160"
+         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;font-family:'Noto Sans Math';-inkscape-font-specification:'Noto Sans Math';stroke-width:0.396875"
+         x="260.41949"
+         y="77.793724">ne01</tspan></text>
+    <path
+       style="fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:0.396875;stroke-dasharray:1.58749998,1.58749998;stroke-opacity:1;stroke-dashoffset:0"
+       d="m 149.99999,5 0,150"
+       id="path167"
+       sodipodi:nodetypes="cc" />
+  </g>
+</svg>
diff --git a/models/ggml-vocab-bert-bge.gguf b/models/ggml-vocab-bert-bge.gguf
new file mode 100644
index 000000000..b2cbd5df6
Binary files /dev/null and b/models/ggml-vocab-bert-bge.gguf differ
diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-bert-bge.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out
new file mode 100644
index 000000000..a62566ce7
--- /dev/null
+++ b/models/ggml-vocab-bert-bge.gguf.out
@@ -0,0 +1,46 @@
+ 29464 2094 1018 1092 2706
+ 11865 17875
+
+
+
+
+
+
+
+
+
+ 7592 2088
+ 7592 2088
+ 7592 2088
+ 7592 2088
+ 7592 2088 999
+ 7592 1010 2088 999
+ 7592 1010 2088 999
+ 2023 2003 100 1012 18133 2361
+ 1059 2692 18139 1021 8525 28418 2243 16233 20952 6979
+ 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325
+ 100
+ 100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 1006 2069 7861 29147 2072 2008 2038 2049 2219 19204 1007
+ 7592
+ 7592
+ 7592
+ 7592
+ 7592
+ 7592 7592
+ 1006
+ 1027
+ 1005 3690
+ 7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
+ 999 999 999 999 999 999
+ 1017
+ 3943
+ 21211
+ 21211 2509
+ 21211 22394
+ 21211 22394 2509
+ 21211 22394 22394
+ 21211 22394 22394 2509
+ 21211 22394 22394 22394
+ 12731 2050 19710
+ 5860 18117
+ 100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222
diff --git a/models/ggml-vocab-chameleon.gguf.inp b/models/ggml-vocab-chameleon.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-chameleon.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-chameleon.gguf.out b/models/ggml-vocab-chameleon.gguf.out
new file mode 100644
index 000000000..7c5413fee
--- /dev/null
+++ b/models/ggml-vocab-chameleon.gguf.out
@@ -0,0 +1,46 @@
+ 17245 16604 16403 16604 33583 18355
+ 16421 51153
+
+ 16604
+ 16650
+ 16650 16604
+ 16581
+ 16582
+ 16582 16582
+ 16582 16582 16582
+ 16581 16582
+ 31596 17394
+ 34926 17394
+ 31596 18671
+ 34926 18671
+ 34926 18671 16384
+ 31596 16395 17394 16384
+ 34926 16395 17394 16384
+ 16811 16704 20410 16483 16631 16397 52854
+ 16470 16399 16403 16407 16604 16406 35764 38185 51595 22592 26639
+ 29479 23955 17012 20103 25527 27670 17408 19005 21473 24774
+ 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 21954 16607 21954 16633 21954 16611 29409 16607 21954 16615
+ 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 16604 16391 24664 17153 57169 16721 16872 17073 17304 28729 16392
+ 31596
+ 34926
+ 16650 31596
+ 16650 34926
+ 16696 31596
+ 16696 31596 16582 16696 31596
+ 16604 16391
+ 16582 16604 16412
+ 16390 22623
+ 31596 16395 16712 16390 16828 16384 17674 16769 16732 23686 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636
+ 16384 16384 16384 16384 16384 16384
+ 16402
+ 16402 16402
+ 16402 16402 16402
+ 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402 16402 16402
+ 16402 16402 16402 16402 16402 16402 16402 16402 16402
+ 16418 19038 16639 16448 24315 33727 16467
+ 18765 17981
+ 16582 16604 16582 16582 16604 16582 16582 16582 16604 16581 16604 16581 16581 16604 16581 16582 16650 16582 16650 16604 16582 16696 16582 16696 16604 16582 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 20410 16483 16631 18885 16483 16631 16604 16402 16604 16402 16402 16604 16402 16402 16402 16604 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16402 16604 16402 16397 16402 16604 16402 16397 16397 16402 16604 16402 16397 16397 16397 16402 16604 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 27683 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 16604 16396 16396 16396 16396 16396 16396 16412 16412 16412 16412 16412 16412 16412 27268 23955 17012 20103 25527 27670 17408 19005 21473 24774 16604 16390 16390 16390 16390 16390 16390 16447 16447 16447 16447 16447 16447 16447 16385 16385 16385 16385 16397 16397 16397 16397 16397 16397 16384 16384 16384 16384 16384 16384 16414 16414 16414 16414 16414 16414 16687 16390 16690 16992 16604 16390 61797 16733 16390 16466 16986 16395 16604 16390 17879 16732 17811 16414 16604 16390 16428 16804 17811 16687 16390 16683 17190 16728 16395 16604 16390 16419 16732 16945 16991 25251 16414 17119 16390 38127 16641 16390 16459 16427
diff --git a/models/ggml-vocab-command-r.gguf b/models/ggml-vocab-command-r.gguf
new file mode 100644
index 000000000..b553eab33
Binary files /dev/null and b/models/ggml-vocab-command-r.gguf differ
diff --git a/models/ggml-vocab-command-r.gguf.inp b/models/ggml-vocab-command-r.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-command-r.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-command-r.gguf.out b/models/ggml-vocab-command-r.gguf.out
new file mode 100644
index 000000000..3f6b41888
--- /dev/null
+++ b/models/ggml-vocab-command-r.gguf.out
@@ -0,0 +1,46 @@
+ 2536 228 27 228 22957 6983
+ 45 193433
+
+ 228
+ 1667
+ 1742
+ 205
+ 206
+ 2126
+ 11516
+ 34777
+ 28339 3845
+ 46609 3845
+ 28339 3930
+ 46609 3930
+ 46609 3930 8
+ 28339 19 3845 8
+ 46609 19 3845 8
+ 2075 1801 11254 107 255 21 19317
+ 94 23 27 31 228 30 21213 20752 39267 6405 9980
+ 4929 40071 2196 3236 8750 1764 37097 41168
+ 38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239
+ 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16
+ 28339
+ 46609
+ 228 46609
+ 1667 46609
+ 1742 46609
+ 1742 46609 1856 46609
+ 1737
+ 206 1857
+ 14 4515
+ 28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
+ 57178 10251
+ 26
+ 26 26
+ 26 26 26
+ 26 26 26 26
+ 26 26 26 26 26
+ 26 26 26 26 26 26
+ 26 26 26 26 26 26 26
+ 26 26 26 26 26 26 26 26
+ 26 26 26 26 26 26 26 26 26
+ 42 30719 12584
+ 3642 4388
+ 127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51
diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf
new file mode 100644
index 000000000..6728cd747
Binary files /dev/null and b/models/ggml-vocab-deepseek-coder.gguf differ
diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-deepseek-coder.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out
new file mode 100644
index 000000000..52c4111a1
--- /dev/null
+++ b/models/ggml-vocab-deepseek-coder.gguf.out
@@ -0,0 +1,46 @@
+ 1050 207 19 207 19192 4217
+ 37 32009 71 6247
+
+ 207
+ 243
+ 315
+ 184
+ 185
+ 185 185
+ 185 185 185
+ 184 185
+ 17535 1835
+ 414 9489 1835
+ 17535 5414
+ 414 9489 5414
+ 414 9489 5414 0
+ 17535 11 1835 0
+ 414 9489 11 1835 0
+ 437 317 12394 99 234 13 14789
+ 86 15 19 23 207 22 83 3963 27659 26078 3934 14072
+ 1593 6478 616 2251 14994
+ 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 155 239 210 155 239 236 155 239 214 155 240 210 155 239 218
+ 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 334 5950 992 78 12896 344 638 891 1372 10736 8
+ 17535
+ 414 9489
+ 207 414 9489
+ 243 414 9489
+ 315 414 9489
+ 315 414 9489 185 315 414 9489
+ 334
+ 185 405
+ 6 2895
+ 17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239
+ 15330 3023
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 34 155 119 242 64 24297 155 119 216 83
+ 1607 2539
+ 185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43
diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf
new file mode 100644
index 000000000..5d66091c4
Binary files /dev/null and b/models/ggml-vocab-deepseek-llm.gguf differ
diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-deepseek-llm.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out
new file mode 100644
index 000000000..0191b7a11
--- /dev/null
+++ b/models/ggml-vocab-deepseek-llm.gguf.out
@@ -0,0 +1,46 @@
+ 1052 207 19 207 19109 4223
+ 37 100014 71 6245
+
+ 207
+ 243
+ 300
+ 184
+ 185
+ 185 185
+ 185 185 185
+ 184 185
+ 17464 1843
+ 37727 1843
+ 17464 5427
+ 37727 5427
+ 37727 5427 0
+ 17464 11 1843 0
+ 37727 11 1843 0
+ 437 317 12356 99 234 13 14743
+ 86 15 19 23 207 22 83 3970 27519 26016 3944 14025
+ 1603 6476 620 91754
+ 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71374 210 71374 236 71374 214 155 240 210 71374 218
+ 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 334 5956 89213 344 643 895 1377 10728 8
+ 17464
+ 37727
+ 207 37727
+ 243 37727
+ 300 37727
+ 300 37727 185 300 37727
+ 334
+ 185 403
+ 6 2906
+ 17464 11 320 6 436 0 1724 418 340 33701 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239
+ 15278 3033
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 34 32555 242 64 23708 32555 216 83
+ 1763 2550
+ 185 207 185 185 207 185 185 185 207 11969 486 22504 185 243 185 300 185 251 185 663 185 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 12356 99 234 10044 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 526 18 207 18 1204 18 207 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71899 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 78827 55170 76659 620 91754 31116 36804 4885 4885 10897 4390 4390 41047 15278 3033 14986 5675 304 6 313 803 655 33326 362 6 82 745 11 655 1374 340 2049 30 655 44 441 2049 304 6 647 1099 359 11 655 35 340 837 742 10842 30 1003 6 10699 245 6 75 43
diff --git a/models/ggml-vocab-deepseek-r1-qwen.gguf.inp b/models/ggml-vocab-deepseek-r1-qwen.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-deepseek-r1-qwen.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-deepseek-r1-qwen.gguf.out b/models/ggml-vocab-deepseek-r1-qwen.gguf.out
new file mode 100644
index 000000000..18b4b45cd
--- /dev/null
+++ b/models/ggml-vocab-deepseek-r1-qwen.gguf.out
@@ -0,0 +1,46 @@
+ 1122 220 19 220 26062 3951
+ 37 50753 261
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1406
+ 1572
+ 9707 1879
+ 21927 1879
+ 9707 4337
+ 21927 4337
+ 21927 4337 0
+ 9707 11 1879 0
+ 21927 11 1879 0
+ 419 374 11162 99 247 13 10821
+ 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
+ 78762 14144 1456 13073 63471 33594 3038 133178 79012
+ 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
+ 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
+ 9707
+ 21927
+ 220 21927
+ 256 21927
+ 262 21927
+ 262 21927 198 262 21927
+ 320
+ 198 284
+ 6 11385
+ 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
+ 17085 2928
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 34 90063 128324
+ 2560 2347
+ 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
diff --git a/models/ggml-vocab-falcon.gguf b/models/ggml-vocab-falcon.gguf
index d4ea2e822..334d50da5 100644
Binary files a/models/ggml-vocab-falcon.gguf and b/models/ggml-vocab-falcon.gguf differ
diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-falcon.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out
new file mode 100644
index 000000000..64a48d97f
--- /dev/null
+++ b/models/ggml-vocab-falcon.gguf.out
@@ -0,0 +1,46 @@
+ 878 204 31 3068 133 2137
+ 28611 132 30042
+
+ 204
+ 258
+ 466
+ 192
+ 193
+ 1001
+ 11331
+ 19125
+ 9856 1079
+ 23090 1079
+ 9856 2889
+ 23090 2889
+ 23090 2889 12
+ 9856 23 1079 12
+ 23090 23 1079 12
+ 414 304 3346 111 231 25 29247
+ 98 55866 204 34 16682 7149 36190 6869 11481
+ 150 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795
+ 38154 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 38154 207 38154 233 38154 211 167 237 207 38154 215
+ 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 204 19 7927 53360 325 504 701 946 10930 20
+ 9856
+ 23090
+ 204 23090
+ 258 23090
+ 466 23090
+ 466 23090 742 23090
+ 204 19
+ 1212 40
+ 18 4932
+ 9856 23 291 18 436 12 1265 362 299 8196 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236
+ 51520
+ 30
+ 3138
+ 22287
+ 22287 30
+ 22287 3138
+ 22287 22287
+ 22287 22287 30
+ 22287 22287 3138
+ 22287 22287 22287
+ 46 19768 239 76 9634 19768 213 95
+ 1080 1502
+ 1212 4824 1001 1212 192 204 663 49453 2069 742 561 1501 193 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 3346 111 231 2571 111 231 204 30 204 3138 204 22287 204 22287 30 204 22287 3138 204 22287 22287 204 22287 22287 30 204 22287 22287 3138 204 30 25 30 204 30 513 30 204 30 951 30 27171 236 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 20589 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236 204 37057 2228 10666 5052 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795 204 7544 7544 7544 8543 8543 17593 3513 3513 12844 51520 17664 4247 295 18 298 650 204 18 95 693 332 18 94 629 23 204 18 1553 299 1310 42 204 18 56 416 1310 295 18 567 717 334 23 204 18 47 299 606 596 6696 42 703 18 16139 241 18 87 55
diff --git a/models/ggml-vocab-gpt2.gguf b/models/ggml-vocab-gpt-2.gguf
similarity index 99%
rename from models/ggml-vocab-gpt2.gguf
rename to models/ggml-vocab-gpt-2.gguf
index 1fbc72c1e..5ea85cf52 100644
Binary files a/models/ggml-vocab-gpt2.gguf and b/models/ggml-vocab-gpt-2.gguf differ
diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-gpt-2.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out
new file mode 100644
index 000000000..17a13bdfc
--- /dev/null
+++ b/models/ggml-vocab-gpt-2.gguf.out
@@ -0,0 +1,46 @@
+ 798 604 25208 1933
+ 37 9116 71 11751
+
+ 220
+ 220 220
+ 220 220 220
+ 197
+ 198
+ 628
+ 628 198
+ 197 198
+ 15496 995
+ 18435 995
+ 15496 2159
+ 18435 2159
+ 18435 2159 0
+ 15496 11 995 0
+ 18435 11 995 0
+ 428 318 12520 99 247 13 20322
+ 86 47202 767 28047 45961 288 82 7568 13415
+ 22177 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849
+ 157 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 157 252 223 157 252 249 157 252 227 157 253 223 157 252 231
+ 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 357 8807 44805 326 468 663 898 11241 8
+ 15496
+ 18435
+ 220 18435
+ 220 220 18435
+ 220 220 220 18435
+ 220 220 220 18435 198 220 220 220 18435
+ 357
+ 198 796
+ 6 6980
+ 15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252
+ 13896 3228
+ 18
+ 2091
+ 20370
+ 24840
+ 2091 20370
+ 24840 2091
+ 24840 20370
+ 24840 24840
+ 24840 2091 20370
+ 34 157 119 255 64 16049 157 119 229 83
+ 1221 1371
+ 198 220 628 220 628 198 220 197 220 197 197 220 197 198 220 220 198 220 220 220 198 220 220 220 220 198 220 220 220 220 220 198 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 12520 99 247 8582 99 247 513 4747 23460 513 20370 23460 2091 23460 20370 23460 24840 23460 2091 20370 513 13 18 513 492 18 513 986 18 28053 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 47249 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 40103 1421 18604 12466 121 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 705 39115 6 33153 15506 63 15931 15931 16317 13896 3228 9805 3548 314 1053 587 705 44040 339 338 612 11 705 2200 345 1654 30 705 44 407 1654 314 1183 787 340 11 705 35 345 588 617 8887 30 775 6 26979 257 6 75 43
diff --git a/models/ggml-vocab-llama-bpe.gguf b/models/ggml-vocab-llama-bpe.gguf
new file mode 100644
index 000000000..e51a99118
Binary files /dev/null and b/models/ggml-vocab-llama-bpe.gguf differ
diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out
new file mode 100644
index 000000000..4b35cf93f
--- /dev/null
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@@ -0,0 +1,46 @@
+ 1142 220 19 220 27154 4038
+ 37 51853 261
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1432
+ 1602
+ 9906 1917
+ 22691 1917
+ 9906 4435
+ 22691 4435
+ 22691 4435 0
+ 9906 11 1917 0
+ 22691 11 1917 0
+ 420 374 11410 99 247 13 11055
+ 86 23904 220 22 83 2005 42908 11729 3013 17156
+ 79862 102118 13373 64571 34694 3114 112203 80112
+ 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 21549 223 21549 249 21549 227 45358 223 21549 231
+ 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 320 3323 43465 430 706 1202 1866 4037 8
+ 9906
+ 22691
+ 220 22691
+ 256 22691
+ 262 22691
+ 262 22691 198 262 22691
+ 320
+ 198 284
+ 6 11639
+ 9906 11 379 65948 0 2650 527 499 27623 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909
+ 17523 3001
+ 18
+ 1644
+ 8765
+ 8765 18
+ 8765 1644
+ 8765 8765
+ 8765 8765 18
+ 8765 8765 1644
+ 8765 8765 8765
+ 34 91163 101798
+ 2624 2402
+ 198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama-spm.gguf
similarity index 99%
rename from models/ggml-vocab-llama.gguf
rename to models/ggml-vocab-llama-spm.gguf
index 549eed8c5..658295a5d 100644
Binary files a/models/ggml-vocab-llama.gguf and b/models/ggml-vocab-llama-spm.gguf differ
diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-llama-spm.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out
new file mode 100644
index 000000000..93aacf8ba
--- /dev/null
+++ b/models/ggml-vocab-llama-spm.gguf.out
@@ -0,0 +1,46 @@
+ 474 287 29871 29946 29871 30226 7378
+ 383 4000 261
+
+ 259
+ 1678
+ 268
+ 29871 12
+ 29871 13
+ 29871 13 13
+ 29871 13 13 13
+ 29871 12 13
+ 15043 3186
+ 29871 15043 3186
+ 15043 2787
+ 29871 15043 2787
+ 29871 15043 2787 29991
+ 15043 29892 3186 29991
+ 29871 15043 29892 3186 29991
+ 29871 445 338 29871 243 162 169 156 29889 8223
+ 281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905
+ 1538 4851 665 1386 29713 1305
+ 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140
+ 29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897
+ 15043
+ 29871 15043
+ 259 15043
+ 1678 15043
+ 268 15043
+ 268 15043 13 1678 15043
+ 29871 313
+ 29871 13 353
+ 525 3152
+ 15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
+ 1738 6824 21004
+ 29871 29941
+ 29871 29941 29941
+ 29871 29941 29941 29941
+ 29871 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941 29941 29941
+ 315 228 190 176 29874 10630 30529 29873
+ 29871 2313 3163
+ 29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931
diff --git a/models/ggml-vocab-mpt.gguf b/models/ggml-vocab-mpt.gguf
index 6affa34bd..f42f56dec 100644
Binary files a/models/ggml-vocab-mpt.gguf and b/models/ggml-vocab-mpt.gguf differ
diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-mpt.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out
new file mode 100644
index 000000000..372c751bf
--- /dev/null
+++ b/models/ggml-vocab-mpt.gguf.out
@@ -0,0 +1,46 @@
+ 728 577 24142 2607
+ 39 26288 6554
+
+ 209
+ 50276
+ 50275
+ 186
+ 187
+ 535
+ 2756
+ 186 187
+ 12092 1533
+ 24387 1533
+ 12092 3645
+ 24387 3645
+ 24387 3645 2
+ 12092 13 1533 2
+ 24387 13 1533 2
+ 436 310 22692 101 236 15 14161
+ 88 27244 818 16853 16392 20505 4989 11917
+ 32520 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389
+ 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 18081 212 18081 238 18081 216 39936 212 18081 220
+ 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 313 7483 802 80 8020 326 556 697 1211 10669 10
+ 12092
+ 24387
+ 50276 12092
+ 50275 12092
+ 50274 12092
+ 50274 12092 187 50274 12092
+ 313
+ 187 426
+ 8 8685
+ 12092 13 340 8 455 2 1359 403 368 49042 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241
+ 18963 4672
+ 20
+ 1610
+ 20084
+ 26409
+ 1610 20084
+ 26409 1610
+ 26409 20084
+ 26409 26409
+ 26409 1610 20084
+ 36 6829 244 66 17721 35177 85
+ 1262 2196
+ 586 1744 33525 186 209 623 28910 187 50276 187 50275 187 50274 187 50273 187 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 22692 101 236 14931 101 236 495 5922 30057 495 20084 495 26409 30057 20084 495 26409 1610 495 26409 20084 495 15 20 495 537 20 495 1051 20 209 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 14931 235 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241 16081 6877 12880 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389 42011 35033 34842 11202 9739 9739 33021 18963 4672 25561 8220 309 1849 644 686 42618 344 434 627 13 686 1848 368 2119 32 686 46 417 2119 309 1833 1056 352 13 686 37 368 751 690 10331 32 844 8 31516 247 8 77 45
diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf
new file mode 100644
index 000000000..745be416a
Binary files /dev/null and b/models/ggml-vocab-phi-3.gguf differ
diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-phi-3.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out
new file mode 100644
index 000000000..93aacf8ba
--- /dev/null
+++ b/models/ggml-vocab-phi-3.gguf.out
@@ -0,0 +1,46 @@
+ 474 287 29871 29946 29871 30226 7378
+ 383 4000 261
+
+ 259
+ 1678
+ 268
+ 29871 12
+ 29871 13
+ 29871 13 13
+ 29871 13 13 13
+ 29871 12 13
+ 15043 3186
+ 29871 15043 3186
+ 15043 2787
+ 29871 15043 2787
+ 29871 15043 2787 29991
+ 15043 29892 3186 29991
+ 29871 15043 29892 3186 29991
+ 29871 445 338 29871 243 162 169 156 29889 8223
+ 281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905
+ 1538 4851 665 1386 29713 1305
+ 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140
+ 29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897
+ 15043
+ 29871 15043
+ 259 15043
+ 1678 15043
+ 268 15043
+ 268 15043 13 1678 15043
+ 29871 313
+ 29871 13 353
+ 525 3152
+ 15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
+ 1738 6824 21004
+ 29871 29941
+ 29871 29941 29941
+ 29871 29941 29941 29941
+ 29871 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941 29941 29941
+ 315 228 190 176 29874 10630 30529 29873
+ 29871 2313 3163
+ 29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931
diff --git a/models/ggml-vocab-qwen2.gguf b/models/ggml-vocab-qwen2.gguf
new file mode 100644
index 000000000..541e475bc
Binary files /dev/null and b/models/ggml-vocab-qwen2.gguf differ
diff --git a/models/ggml-vocab-qwen2.gguf.inp b/models/ggml-vocab-qwen2.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-qwen2.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-qwen2.gguf.out b/models/ggml-vocab-qwen2.gguf.out
new file mode 100644
index 000000000..18b4b45cd
--- /dev/null
+++ b/models/ggml-vocab-qwen2.gguf.out
@@ -0,0 +1,46 @@
+ 1122 220 19 220 26062 3951
+ 37 50753 261
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1406
+ 1572
+ 9707 1879
+ 21927 1879
+ 9707 4337
+ 21927 4337
+ 21927 4337 0
+ 9707 11 1879 0
+ 21927 11 1879 0
+ 419 374 11162 99 247 13 10821
+ 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
+ 78762 14144 1456 13073 63471 33594 3038 133178 79012
+ 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
+ 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
+ 9707
+ 21927
+ 220 21927
+ 256 21927
+ 262 21927
+ 262 21927 198 262 21927
+ 320
+ 198 284
+ 6 11385
+ 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
+ 17085 2928
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 34 90063 128324
+ 2560 2347
+ 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf
index 8f26cfb76..52afcf01a 100644
Binary files a/models/ggml-vocab-refact.gguf and b/models/ggml-vocab-refact.gguf differ
diff --git a/models/ggml-vocab-refact.gguf.inp b/models/ggml-vocab-refact.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-refact.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-refact.gguf.out b/models/ggml-vocab-refact.gguf.out
new file mode 100644
index 000000000..63d8305c3
--- /dev/null
+++ b/models/ggml-vocab-refact.gguf.out
@@ -0,0 +1,46 @@
+ 4833 225 38 225 143 140 17723
+ 56 2006 3935 265
+
+ 225
+ 261
+ 264
+ 202
+ 203
+ 478
+ 2831
+ 15773
+ 8279 5788
+ 12000 5788
+ 8279 10896
+ 12000 10896
+ 12000 10896 19
+ 8279 30 5788 19
+ 12000 30 5788 19
+ 458 438 5945 118 252 32 3766
+ 105 34 38 42 225 41 102 1707 12530 10180 1479 8278
+ 39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700
+ 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236
+ 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27
+ 8279
+ 12000
+ 225 12000
+ 261 12000
+ 264 12000
+ 264 12000 284 12000
+ 308
+ 203 280
+ 25 34666
+ 8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838
+ 9163 3202
+ 37
+ 37 37
+ 37 37 37
+ 37 37 37 37
+ 37 37 37 37 37
+ 37 37 37 37 37 37
+ 37 37 37 37 37 37 37
+ 37 37 37 37 37 37 37 37
+ 37 37 37 37 37 37 37 37 37
+ 53 33934 83 33217 17102 102
+ 1214 12258
+ 334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62
diff --git a/models/ggml-vocab-roberta-bpe.gguf.inp b/models/ggml-vocab-roberta-bpe.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-roberta-bpe.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-roberta-bpe.gguf.out b/models/ggml-vocab-roberta-bpe.gguf.out
new file mode 100644
index 000000000..f181ac3dc
--- /dev/null
+++ b/models/ggml-vocab-roberta-bpe.gguf.out
@@ -0,0 +1,46 @@
+ 2550 204 18430 377
+ 597 2768 298 8564
+
+ 1437
+ 1437 1437
+ 1437 1437 1437
+ 50117
+ 50118
+ 50140
+ 50140 50118
+ 50117 50118
+ 31414 232
+ 20920 232
+ 31414 623
+ 20920 623
+ 20920 623 328
+ 31414 6 232 328
+ 20920 6 232 328
+ 42 16 8103 18164 27 4 49317
+ 605 40976 262 10109 18474 385 29 36807 6455
+ 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328
+ 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171
+ 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43
+ 31414
+ 20920
+ 1437 20920
+ 1437 1437 20920
+ 1437 1437 1437 20920
+ 1437 1437 1437 20920 50118 1437 1437 1437 20920
+ 36
+ 50118 5457
+ 108 3567
+ 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772
+ 32376 12846
+ 246
+ 3103
+ 25631
+ 46152
+ 3103 25631
+ 46152 3103
+ 46152 25631
+ 46152 46152
+ 46152 3103 25631
+ 347 1376 2023 12410 102 16376 1376 2023 6382 90
+ 9553 5954
+ 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574
diff --git a/models/ggml-vocab-stablelm-3b-4e1t.gguf b/models/ggml-vocab-stablelm-3b-4e1t.gguf
deleted file mode 100644
index ebb0cdb7d..000000000
Binary files a/models/ggml-vocab-stablelm-3b-4e1t.gguf and /dev/null differ
diff --git a/models/ggml-vocab-starcoder.gguf b/models/ggml-vocab-starcoder.gguf
index a52983fdb..7a7e7742a 100644
Binary files a/models/ggml-vocab-starcoder.gguf and b/models/ggml-vocab-starcoder.gguf differ
diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp
new file mode 100644
index 000000000..9baf7d77a
--- /dev/null
+++ b/models/ggml-vocab-starcoder.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out
new file mode 100644
index 000000000..87e2465d3
--- /dev/null
+++ b/models/ggml-vocab-starcoder.gguf.out
@@ -0,0 +1,46 @@
+ 4850 244 57 244 162 159 17722
+ 75 2022 3943 284
+
+ 244
+ 280
+ 283
+ 221
+ 222
+ 499
+ 3067
+ 15767
+ 8302 5810
+ 12009 5810
+ 8302 10914
+ 12009 10914
+ 12009 10914 38
+ 8302 49 5810 38
+ 12009 49 5810 38
+ 477 458 5954 137 271 51 3779
+ 124 53 57 61 244 60 121 1726 12568 10240 1519 8290
+ 39916 8389 1059 9504 40216 13858 2073 8983 12571 1539 10721
+ 14566 246 14566 152 14566 265 30428 257 14566 261 30428 248 14566 268 14566 153 14566 277 30428 247 14566 277 14566 133 14566 152 14566 251 14566 247 14566 273 14566 251 30428 247 14566 255
+ 3822 272 246 327 4434 46 18445 152 46030 45022 142 13878 327 12585 19884 33773 40920 751 46 41839 327 2605 22716 708 1421 2840 4387 2421 46
+ 8302
+ 12009
+ 244 12009
+ 280 12009
+ 283 12009
+ 283 12009 303 12009
+ 327
+ 222 299
+ 44 34719
+ 8302 49 553 44 483 38 4998 904 863 18445 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892
+ 9221 3226
+ 56
+ 56 56
+ 56 56 56
+ 56 56 56 56
+ 56 56 56 56 56
+ 56 56 56 56 56 56
+ 56 56 56 56 56 56 56
+ 56 56 56 56 56 56 56 56
+ 56 56 56 56 56 56 56 56 56
+ 72 34269 102 33245 17234 121
+ 1236 12266
+ 353 736 8886 221 10883 4238 16101 28540 222 3822 272 246 327 4434 46 18445 152 46030 45022 142 13878 327 12585 19884 33773 40920 751 46 41839 5954 137 271 3822 137 271 244 56 244 56 56 244 56 56 56 244 56 56 56 56 244 56 56 56 56 56 244 56 56 56 56 56 56 244 56 56 56 56 56 56 56 244 56 56 56 56 56 56 56 56 244 56 51 56 244 56 516 56 244 56 1198 56 244 14566 246 14566 152 14566 265 30428 257 14566 261 30428 248 14566 268 14566 153 14566 277 30428 247 14566 277 14566 133 14566 152 14566 251 36570 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892 20895 16625 13047 8389 1059 9504 40216 13858 2073 8983 12571 1539 10721 5918 9643 13298 932 31723 31330 9221 3226 35426 10400 457 4783 2602 349 121 1477 957 1200 2038 49 349 632 863 3673 68 349 82 666 3673 457 4650 1949 580 49 349 73 863 2144 1649 35941 68 2726 44 7728 331 44 113 81
diff --git a/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja b/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja
new file mode 100644
index 000000000..f5baef30b
--- /dev/null
+++ b/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja
@@ -0,0 +1,202 @@
+
+{%- macro json_to_python_type(json_spec) %}
+{%- set basic_type_map = {
+    "string": "str",
+    "number": "float",
+    "integer": "int",
+    "boolean": "bool"
+} %}
+
+{%- if basic_type_map[json_spec.type] is defined %}
+    {{- basic_type_map[json_spec.type] }}
+{%- elif json_spec.type == "array" %}
+    {{- "List[" +  json_to_python_type(json_spec.items) + "]"}}
+{%- elif json_spec.type == "object" %}
+    {{- "Dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']'}}
+{%- elif json_spec.type is iterable %}
+    {{- "Union[" }}
+    {%- for t in json_spec.type %}
+      {{- json_to_python_type({"type": t}) }}
+      {%- if not loop.last %}
+        {{- "," }} 
+    {%- endif %}
+    {%- endfor %}
+    {{- "]" }}
+{%- else %}
+    {{- "Any" }}
+{%- endif %}
+{%- endmacro %}
+
+{%- macro old_tool_parser(tools) %}
+{%- for tool in tools %}
+    {%- if loop.index0 != 0 %}
+        {{- '\n\n' }}
+    {%- endif %}
+    {{- '```python\ndef ' + tool.name + '(' }}
+    {%- for param_name, param_fields in tool.parameter_definitions|items %}
+        {%- if loop.index0 != 0 %}
+            {{- ', '}}
+        {%- endif %}
+        {{- param_name + ': ' }}
+        {%- if not param_fields.required %}
+            {{- 'Optional[' + param_fields.type + '] = None'}}
+        {%- else %}
+            {{- param_fields.type }}
+        {%- endif %}
+    {%- endfor %}
+    {{- ') -> List[Dict]:\n    """'}}
+    {{- tool.description }}
+    {%- if tool.parameter_definitions|length != 0 %}
+        {{- '\n\n    Args:\n        '}}
+        {%- for param_name, param_fields in tool.parameter_definitions|items %}
+            {%- if loop.index0 != 0 %}
+                {{- '\n        ' }}
+            {%- endif %}
+            {{- param_name + ' ('}}
+            {%- if not param_fields.required %}
+                {{- 'Optional[' + param_fields.type + ']'}}
+            {%- else %}
+                {{- param_fields.type }}
+            {%- endif %}
+            {{- '): ' + param_fields.description }}
+        {%- endfor %}
+    {%- endif %}
+    {{- '\n    """\n    pass\n```' }}
+{%- endfor %}
+{%- endmacro %}
+
+{%- macro new_tool_parser(tools) %}
+{%- for tool in tools %}
+  {%- if loop.index0 != 0 %}
+    {{- '\n\n'}}
+  {%- endif %}
+  {%- if tool.function is defined %}
+    {%- set tool = tool.function %}
+  {%- endif %}
+  {{-'```python
+def ' + tool.name + '('}}
+  {%- for param_name, param_fields in tool.parameters.properties|items %}
+    {%- if loop.index0 != 0 %}
+      {{- ', '}}
+    {%- endif %}
+    {{-param_name + ": "}} 
+    {%- if not param_name in tool.parameters.required %}
+      {{-'Optional[' + json_to_python_type(param_fields) + '] = None'}}
+    {%- else %}
+      {{- json_to_python_type(param_fields) }}
+    {%- endif %}
+  {%- endfor %}
+  {{- ') -> List[Dict]:
+    """'}}
+  {{- tool.description }}
+  {%- if tool.parameters.properties|length != 0 %}
+    {{- '\n\n    Args:\n        '}}
+    {%- for param_name, param_fields in tool.parameters.properties|items %}
+      {%- if loop.index0 != 0 %}
+        {{- '\n        ' }}
+      {%- endif %}
+      {{- param_name + ' ('}}
+      {%- if not param_name in tool.parameters.required %}
+        {{-'Optional[' + json_to_python_type(param_fields) + ']'}}
+      {%- else %}
+        {{- json_to_python_type(param_fields) }}
+      {%- endif %}
+      {{- '): ' + param_fields.description }}
+    {%- endfor %}
+    {%- endif %}
+    {{- '\n    """\n    pass\n```' }}
+{%- endfor %}
+{%- endmacro %}
+
+{{- bos_token }}
+{%- if messages[0]['role'] == 'system' %}
+  {%- set loop_messages = messages[1:] %}
+  {%- set system_message = messages[0]['content'] %}
+{%- else %}
+  {%- set loop_messages = messages %}
+  {%- set system_message = '## Task and Context\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user\'s needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.' %}
+{%- endif %}
+{{- '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}
+{{- '# Safety Preamble' }}
+{{- '
+The instructions in this section override those in the task description and style guide sections. Don\'t answer questions that are harmful or immoral.' }}
+{{- '
+
+# System Preamble' }}
+{{- '
+## Basic Rules' }}
+{{- '
+You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\'s requests, you cite your sources in your answers, according to those instructions.' }}
+{{- '
+
+# User Preamble' }}
+{{- '
+' + system_message }}
+{{-'
+
+## Available Tools
+Here is a list of tools that you have available to you:
+
+'}}
+{%- set ns = namespace(new_tools=true) %}
+{%- for tool in tools %}
+    {%- if tool.parameter_definitions is defined %}
+        {%- set ns.new_tools = false %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.new_tools %}
+    {{- new_tool_parser(tools) }}
+{%- else %}
+    {{- old_tool_parser(tools) }}
+{%- endif %}
+{{- '<|END_OF_TURN_TOKEN|>'}}
+{%- for message in loop_messages %}
+  {%- set content = message['content'] %}
+  {%- if message.role == 'user' %}
+    {{- '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content|trim + '<|END_OF_TURN_TOKEN|>' }}
+  {%- elif message.role == 'system' %}
+    {{- '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content|trim + '<|END_OF_TURN_TOKEN|>' }}
+  {%- elif message.role == 'assistant' and message.tool_calls is defined %}
+    {{- '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}
+    {%- if message.content is defined %}
+        {{- message.content|trim }}
+    {%- endif %}
+    {{- '\nAction:\n```json\n[\n' }}
+    {%- for tool_call in message.tool_calls %}
+        {%- if tool_call.function is defined %}
+            {%- set tool_call = tool_call.function %}
+        {%- endif %}
+        {{- '{\n'|indent(4, first=true) }}
+        {{- '"tool_name": "'|indent(8, first=true) + tool_call.name + '",\n' }}
+        {{- '"parameters": '|indent(8, first=true) }}
+        {%- if tool_call.arguments is defined and tool_call.arguments|length > 0 %}    
+            {{- tool_call.arguments|tojson(indent=4)|indent(8) }}
+            {{- '\n' }}
+        {%- else %}
+            {{- '{}\n' }}
+        {%- endif %}
+        {{- '}'|indent(4, first=true) }}
+        {%- if not loop.last %}
+            {{- ',\n' }}
+        {%- endif %}
+    {%- endfor %}
+    {{- "\n]```\n" }}
+  {%- elif message.role == 'assistant' %}
+    {{- '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content|trim + '<|END_OF_TURN_TOKEN|>' }}
+  {%- elif message.role == 'tool' %}
+    {{- '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><results>\n' }}
+    {{- message.content|trim }}
+    {{- '</results><|END_OF_TURN_TOKEN|>' }}
+  {%- endif %}
+{%- endfor %}
+{{-'<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write \'Action:\' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user\'s last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+```json
+[
+    {
+        "tool_name": title of the tool in the specification,
+        "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+    }
+]```<|END_OF_TURN_TOKEN|>'}}
+{%- if add_generation_prompt %}
+  {{- '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}
+{%- endif %}
diff --git a/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja b/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja
new file mode 100644
index 000000000..078e9f545
--- /dev/null
+++ b/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja
@@ -0,0 +1,156 @@
+{{ bos_token }}{%- macro document_turn(documents) -%}
+{# format documents into chat turn #}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[
+    {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}}
+]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
+    {
+        "tool_call_id": "0",
+        "results": {
+{% for doc in documents %}
+            "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %},
+            {% endif %}
+{% endfor %}
+
+        },
+        "is_error": null
+    }
+]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %}
+{%- macro tool_call_id_to_int(messages, tool_call_id) %}
+{%- set counter = namespace(value=0) %}
+{%- set tool_call_id_seen = namespace(value=false) %}
+{%- for msg in messages %}
+    {%- if msg.tool_calls %}
+        {%- for tool_call in msg.tool_calls %}
+            {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%}
+                {{ counter.value }}
+                {%- set tool_call_id_seen.value = true %}
+            {%- endif %}
+            {%- set counter.value = counter.value + 1 %}
+        {%- endfor %}
+    {%- endif %}
+{%- endfor %}
+{%- endmacro %}
+{%- macro format_tool_message(messages, tool_msg) -%}
+{# format tool message #}
+    {
+        "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}",
+        "results": {
+            "0": {{ tool_msg.content|tojson }}
+        },
+        "is_error": null
+    }
+{%- endmacro -%}
+{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}
+{%- set tool_idx = namespace(value=0) %}
+{%- set tool_ids_seen = namespace(value=[]) %}
+{%- set sent_documents = namespace(value=false) %}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble
+You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes.
+
+Your information cutoff date is June 2024.
+
+You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages.
+{% if tools or documents %}
+
+You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests.
+
+## Tool Use
+Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first.
+
+0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>.
+    You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed.
+    NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools.
+
+Then carry out your plan by repeatedly executing the following steps.
+1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields.
+    When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>.
+2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results.
+    Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id".
+3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>.
+    You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded.
+    NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user.
+
+You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user.
+
+4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>.
+{% if enable_citations %}
+
+## Grounding
+Importantly, note that "Reflection" and "Response" above can be grounded.
+Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "<co>" and "</co>" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "<co>span</co: 0:[1,2],1:[0]>" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1".
+{% endif %}
+
+## Available Tools
+Here is the list of tools that you have available to you.
+You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it.
+Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema).
+
+```json
+[
+{% if documents %}
+    {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %}
+
+{% endif %}
+{% for tool in tools %}
+    {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %}
+
+{% endfor %}
+]
+```
+
+{% endif %}
+# Default Preamble
+The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt.
+- Your name is Command.
+- You are a large language model built by Cohere.
+- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions.
+- If the input is ambiguous, ask clarifying follow-up questions.
+- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks).
+- Use LaTeX to generate mathematical notation for complex equations.
+- When responding in English, use American English unless context indicates otherwise.
+- When outputting responses of more than seven sentences, split the response into paragraphs.
+- Prefer the active voice.
+- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references.
+- Use gender-neutral pronouns for unspecified persons.
+- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list.
+- Use the third person when asked to write a summary.
+- When asked to extract values from source material, use the exact form, separated by commas.
+- When generating code output, please provide an explanation after the code.
+- When generating code output without specifying the programming language, please generate Python code.
+- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.
+{%- if developer_preamble %}
+
+
+# Developer Preamble
+The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions.
+{{ developer_preamble }}
+{%- endif -%}
+<|END_OF_TURN_TOKEN|>
+{%- for message in messages %}
+    {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>
+    {%- elif message.role|lower == 'user' %}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %}
+    {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[
+    {% for tc in message.tool_calls %}
+    {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %}
+
+    {% set tool_idx.value = tool_idx.value + 1 %}
+    {% endfor %}
+]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %}
+    {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
+{{ format_tool_message(messages, message) }}
+    {%- for msg in messages[loop.index0 + 1:] %}
+        {%- if msg.role|lower == 'tool' %},
+{{ format_tool_message(messages, msg) }}
+            {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %}
+        {%- else %}
+            {%- break %}
+        {%- endif %}
+    {%- endfor %}
+
+]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>
+    {%- endif %}
+{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
\ No newline at end of file
diff --git a/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja b/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
new file mode 100644
index 000000000..149250bd5
--- /dev/null
+++ b/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
@@ -0,0 +1,152 @@
+{%- macro json_to_python_type(json_spec) %}
+{%- set basic_type_map = {
+    "string": "str",
+    "number": "float",
+    "integer": "int",
+    "boolean": "bool"
+} %}
+
+{%- if basic_type_map[json_spec.type] is defined %}
+    {{- basic_type_map[json_spec.type] }}
+{%- elif json_spec.type == "array" %}
+    {{- "list[" +  json_to_python_type(json_spec|items) + "]"}}
+{%- elif json_spec.type == "object" %}
+    {%- if json_spec.additionalProperties is defined %}
+        {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']'}}
+    {%- else %}
+        {{- "dict" }}
+    {%- endif %}
+{%- elif json_spec.type is iterable %}
+    {{- "Union[" }}
+    {%- for t in json_spec.type %}
+      {{- json_to_python_type({"type": t}) }}
+      {%- if not loop.last %}
+        {{- "," }} 
+    {%- endif %}
+    {%- endfor %}
+    {{- "]" }}
+{%- else %}
+    {{- "Any" }}
+{%- endif %}
+{%- endmacro %}
+
+
+{{- bos_token }}
+{{- '<|im_start|>system
+' }}
+{{- "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> " }}
+{%- for tool in tools %}
+    {%- if tool.function is defined %}
+        {%- set tool = tool.function %}
+    {%- endif %}
+    {{- '{"type": "function", "function": ' }}
+    {{- '{"name": "' + tool.name + '", ' }}
+    {{- '"description": "' + tool.name + '(' }}
+    {%- for param_name, param_fields in tool.parameters.properties|items %}
+        {{- param_name + ": " + json_to_python_type(param_fields) }}
+        {%- if not loop.last %}
+            {{- ", " }}
+        {%- endif %}
+    {%- endfor %}
+    {{- ")" }}
+    {%- if tool.return is defined %}
+        {{- " -> " + json_to_python_type(tool.return) }}
+    {%- endif %}
+    {{- " - " + tool.description + "
+
+" }}
+    {%- for param_name, param_fields in tool.parameters.properties|items %}
+        {%- if loop.first %}
+            {{- "    Args:
+" }}
+        {%- endif %}
+        {{- "        " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }}
+    {%- endfor %}
+    {%- if tool.return is defined and tool.return.description is defined %}
+        {{- "
+    Returns:
+        " + tool.return.description }}
+    {%- endif %}
+    {{- '"' }}
+    {{- ', "parameters": ' }}
+    {%- if tool.parameters.properties | length == 0 %}
+        {{- "{}" }}
+    {%- else %}
+        {{- tool.parameters|tojson }}
+    {%- endif %}
+    {{- "}" }}
+    {%- if not loop.last %}
+        {{- "
+" }}
+    {%- endif %}
+{%- endfor %}
+{{- " </tools>" }}
+{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}}
+' }}
+{{- "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+" }}
+{{- "<tool_call>
+" }}
+{{- '{"name": <function-name>, "arguments": <args-dict>}
+' }}
+{{- '</tool_call><|im_end|>
+' }}
+{%- for message in messages %}
+    {%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %}
+        {{- '<|im_start|>' + message.role + '
+' + message.content + '<|im_end|>' + '
+' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+    {%- for tool_call in message.tool_calls %}
+       {{- '
+<tool_call>
+' }}           {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '{' }}
+            {{- '"name": "' }}
+            {{- tool_call.name }}
+            {{- '"' }}
+            {{- ', '}}
+            {%- if tool_call.arguments is defined %}
+                {{- '"arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments|tojson }}
+                {%- endif %}
+            {%- endif %}
+             {{- '}' }}
+            {{- '
+</tool_call>' }}
+    {%- endfor %}
+        {{- '<|im_end|>
+' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>tool
+' }}
+        {%- endif %}
+        {{- '<tool_response>
+' }}
+        {{- message.content }}
+        {%- if not loop.last %}
+            {{- '
+</tool_response>
+' }}
+        {%- else %}
+            {{- '
+</tool_response>' }}
+        {%- endif %}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant
+' }}
+{%- endif %}
diff --git a/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja b/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
new file mode 100644
index 000000000..149250bd5
--- /dev/null
+++ b/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
@@ -0,0 +1,152 @@
+{%- macro json_to_python_type(json_spec) %}
+{%- set basic_type_map = {
+    "string": "str",
+    "number": "float",
+    "integer": "int",
+    "boolean": "bool"
+} %}
+
+{%- if basic_type_map[json_spec.type] is defined %}
+    {{- basic_type_map[json_spec.type] }}
+{%- elif json_spec.type == "array" %}
+    {{- "list[" +  json_to_python_type(json_spec|items) + "]"}}
+{%- elif json_spec.type == "object" %}
+    {%- if json_spec.additionalProperties is defined %}
+        {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']'}}
+    {%- else %}
+        {{- "dict" }}
+    {%- endif %}
+{%- elif json_spec.type is iterable %}
+    {{- "Union[" }}
+    {%- for t in json_spec.type %}
+      {{- json_to_python_type({"type": t}) }}
+      {%- if not loop.last %}
+        {{- "," }} 
+    {%- endif %}
+    {%- endfor %}
+    {{- "]" }}
+{%- else %}
+    {{- "Any" }}
+{%- endif %}
+{%- endmacro %}
+
+
+{{- bos_token }}
+{{- '<|im_start|>system
+' }}
+{{- "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> " }}
+{%- for tool in tools %}
+    {%- if tool.function is defined %}
+        {%- set tool = tool.function %}
+    {%- endif %}
+    {{- '{"type": "function", "function": ' }}
+    {{- '{"name": "' + tool.name + '", ' }}
+    {{- '"description": "' + tool.name + '(' }}
+    {%- for param_name, param_fields in tool.parameters.properties|items %}
+        {{- param_name + ": " + json_to_python_type(param_fields) }}
+        {%- if not loop.last %}
+            {{- ", " }}
+        {%- endif %}
+    {%- endfor %}
+    {{- ")" }}
+    {%- if tool.return is defined %}
+        {{- " -> " + json_to_python_type(tool.return) }}
+    {%- endif %}
+    {{- " - " + tool.description + "
+
+" }}
+    {%- for param_name, param_fields in tool.parameters.properties|items %}
+        {%- if loop.first %}
+            {{- "    Args:
+" }}
+        {%- endif %}
+        {{- "        " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }}
+    {%- endfor %}
+    {%- if tool.return is defined and tool.return.description is defined %}
+        {{- "
+    Returns:
+        " + tool.return.description }}
+    {%- endif %}
+    {{- '"' }}
+    {{- ', "parameters": ' }}
+    {%- if tool.parameters.properties | length == 0 %}
+        {{- "{}" }}
+    {%- else %}
+        {{- tool.parameters|tojson }}
+    {%- endif %}
+    {{- "}" }}
+    {%- if not loop.last %}
+        {{- "
+" }}
+    {%- endif %}
+{%- endfor %}
+{{- " </tools>" }}
+{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}}
+' }}
+{{- "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+" }}
+{{- "<tool_call>
+" }}
+{{- '{"name": <function-name>, "arguments": <args-dict>}
+' }}
+{{- '</tool_call><|im_end|>
+' }}
+{%- for message in messages %}
+    {%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %}
+        {{- '<|im_start|>' + message.role + '
+' + message.content + '<|im_end|>' + '
+' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+    {%- for tool_call in message.tool_calls %}
+       {{- '
+<tool_call>
+' }}           {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '{' }}
+            {{- '"name": "' }}
+            {{- tool_call.name }}
+            {{- '"' }}
+            {{- ', '}}
+            {%- if tool_call.arguments is defined %}
+                {{- '"arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments|tojson }}
+                {%- endif %}
+            {%- endif %}
+             {{- '}' }}
+            {{- '
+</tool_call>' }}
+    {%- endfor %}
+        {{- '<|im_end|>
+' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>tool
+' }}
+        {%- endif %}
+        {{- '<tool_response>
+' }}
+        {{- message.content }}
+        {%- if not loop.last %}
+            {{- '
+</tool_response>
+' }}
+        {%- else %}
+            {{- '
+</tool_response>' }}
+        {%- endif %}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant
+' }}
+{%- endif %}
diff --git a/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja b/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
new file mode 100644
index 000000000..bdf7919a9
--- /dev/null
+++ b/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
@@ -0,0 +1,54 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
diff --git a/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja b/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja
new file mode 100644
index 000000000..02a1c3bce
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja
@@ -0,0 +1 @@
+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}
\ No newline at end of file
diff --git a/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja b/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja
new file mode 100644
index 000000000..2ebfe7c1e
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja
@@ -0,0 +1,56 @@
+{% if not add_generation_prompt is defined %}
+{% set add_generation_prompt = false %}
+{% endif %}
+{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}
+{%- for message in messages %}
+{%- if message['role'] == 'system' %}
+{% set ns.system_prompt = message['content'] %}
+{%- endif %}
+{%- endfor %}
+{{bos_token}}
+{{ns.system_prompt}}
+{%- for message in messages %}
+{%- if message['role'] == 'user' %}
+{%- set ns.is_tool = false -%}
+{{'<｜User｜>' + message['content']}}
+{%- endif %}
+{%- if message['role'] == 'assistant' and message['content'] is none %}
+{%- set ns.is_tool = false -%}
+{%- for tool in message['tool_calls']%}
+{%- if not ns.is_first %}
+{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+{%- set ns.is_first = true -%}
+{%- else %}
+{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{%- if message['role'] == 'assistant' and message['content'] is not none %}
+{%- if ns.is_tool %}
+{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}
+{%- set ns.is_tool = false -%}
+{%- else %}
+{% set content = message['content'] %}
+{% if '</think>' in content %}
+{% set content = content.split('</think>')[-1] %}
+{% endif %}
+{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}
+{%- endif %}
+{%- endif %}
+{%- if message['role'] == 'tool' %}
+{%- set ns.is_tool = true -%}
+{%- if ns.is_output_first %}
+{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+{%- set ns.is_output_first = false %}
+{%- else %}
+{{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+{%- endif %}
+{%- endif %}
+{%- endfor -%}
+{% if ns.is_tool %}
+{{'<｜tool▁outputs▁end｜>'}}
+{% endif %}
+{% if add_generation_prompt and not ns.is_tool %}
+{{'<｜Assistant｜>'}}
+{% endif %}
\ No newline at end of file
diff --git a/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja b/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja
new file mode 100644
index 000000000..9b8136df7
--- /dev/null
+++ b/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja
@@ -0,0 +1,57 @@
+{%- set loop_messages = messages -%}
+{%- set message_roles = ['system', 'user', 'assistant', 'tool'] -%}
+{%- set system_prompt_suffix -%}
+{%- filter trim -%}
+In addition to plain text responses, you can chose to call one or more of the provided functions.
+
+Use the following rule to decide when to call a function:
+  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
+  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
+
+If you decide to call functions:
+  * prefix function calls with functools marker (no closing marker required)
+  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
+  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
+  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * make sure you pick the right functions that match the user intent
+
+Available functions as JSON spec:
+{%- endfilter -%}
+{%- endset -%}
+{%- set system_prompt_suffix = system_prompt_suffix + "\n" + functions -%}
+{%- set system_prompt_suffix = system_prompt_suffix + '\nToday is ' + datetime + '.' -%}
+{%- set ns = namespace(role='', content='') -%}
+{#- Basic consistency checks -#}
+{%- if not loop_messages -%}
+  {{ raise_exception('Expected non-empty messages') }}
+{%- endif -%}
+{%- for message in loop_messages -%}
+  {%- set ns.role = message['role'] | lower -%}
+  {%- if ns.role not in message_roles -%}
+    {%- set message_roles_string = message_roles | join(', ') -%}
+    {{ raise_exception('Invalid role ' + message['role'] + '. Only ' + message_roles_string + ' are supported.') }}
+  {%- endif -%}
+  {%- set msg_content = message['content'] | default('', true) | trim -%}
+  {%- if loop.index0 == 0 -%}
+    {%- if ns.role == 'system' -%}
+      {%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\n\n' + message['content'] | trim + '\n' + system_prompt_suffix + '<|eot_id|>' -%}
+    {%- else -%}
+      {%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\n\nYou are a helpful assistant with access to functions.\n' + system_prompt_suffix + '<|eot_id|>' -%}
+    {%- endif -%}
+    {%- set ns.content = bos_token + system_prompt -%}
+    {{- ns.content -}}
+  {%- endif -%}
+  {%- if loop.index0 > 0 or ns.role != 'system' -%}
+    {%- set ns.content = '<|start_header_id|>' + ns.role + '<|end_header_id|>\n\n' + msg_content -%}
+    {%- if 'tool_calls' in message and message['tool_calls'] -%}
+      {%- set tool = namespace(calls=[]) -%}
+      {%- for call in message['tool_calls'] -%}
+        {%- set tool.calls = tool.calls + ['{"name": "' + call['function']['name'] + '", "arguments": ' + call['function']['arguments'] + '}'] -%}
+      {%- endfor -%}
+      {%- set ns.content = ns.content + ' functools[' + tool.calls | join(', ') + ']' -%}
+    {%- endif -%}
+    {%- set ns.content = ns.content + '<|eot_id|>' -%}
+    {{- ns.content -}}
+  {%- endif -%}
+{%- endfor -%}
+{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
diff --git a/models/templates/google-gemma-2-2b-it.jinja b/models/templates/google-gemma-2-2b-it.jinja
new file mode 100644
index 000000000..923ec253c
--- /dev/null
+++ b/models/templates/google-gemma-2-2b-it.jinja
@@ -0,0 +1,4 @@
+{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '
+' + message['content'] | trim + '<end_of_turn>
+' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
+'}}{% endif %}
\ No newline at end of file
diff --git a/models/templates/meetkai-functionary-medium-v3.1.jinja b/models/templates/meetkai-functionary-medium-v3.1.jinja
new file mode 100644
index 000000000..29d64a215
--- /dev/null
+++ b/models/templates/meetkai-functionary-medium-v3.1.jinja
@@ -0,0 +1,58 @@
+{# version=v3-llama3.1 #}{%- if not tools is defined -%}
+    {%- set tools = none -%}
+{%- endif -%}
+
+{%- set has_code_interpreter = tools | selectattr("type", "equalto", "code_interpreter") | list | length > 0 -%}
+{%- if has_code_interpreter -%}
+    {%- set tools = tools | rejectattr("type", "equalto", "code_interpreter") | list -%}
+{%- endif -%}
+
+{#- System message + builtin tools #}
+{{- bos_token + "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if has_code_interpreter %}
+    {{- "Environment: ipython\n\n" }}
+{%- else -%}
+    {{ "\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n\n" }}
+{%- if tools %}
+    {{- "\nYou have access to the following functions:\n\n" }}
+    {%- for t in tools %}
+        {%- if "type" in t -%}
+            {{ "Use the function '"|safe + t["function"]["name"] + "' to '"|safe + t["function"]["description"] + "'\n"|safe + t["function"] | tojson() }}
+        {%- else -%}
+            {{ "Use the function '"|safe + t["name"] + "' to '"|safe + t["description"] + "'\n"|safe + t | tojson() }}
+        {%- endif -%}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- '\nThink very carefully before calling functions.\nIf a you choose to call a function ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => `<function`\nparameters => a JSON dict with the function argument name as key and function argument value as value.\nend_tag => `</function>`\n\nHere is an example,\n<function=example_function_name>{"example_name": "example_value"}</function>\n\nReminder:\n- If looking for real time information use relevant functions before falling back to brave_search\n- Function calls MUST follow the specified format, start with <function= and end with </function>\n- Required parameters MUST be specified\n- Only call one function at a time\n- Put the entire function call reply on one line\n\n' -}}
+{%- endif %}
+{{- "<|eot_id|>" -}}
+
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' or message['role'] == 'system' -%}
+        {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
+    {%- elif message['role'] == 'tool' -%}
+        {{ '<|start_header_id|>ipython<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
+    {%- else -%}
+        {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}
+        {%- if message['content'] -%}
+            {{ message['content'] }}
+        {%- endif -%}
+        {%- if 'tool_calls' in message and message['tool_calls'] -%}
+            {%- for tool_call in message['tool_calls'] -%}
+                {%- if tool_call["function"]["name"] == "python" -%}
+                    {{ '<|python_tag|>' + tool_call['function']['arguments'] }}
+                {%- else -%}
+                    {{ '<function=' + tool_call['function']['name'] + '>' + tool_call['function']['arguments'] + '</function>' }}
+                {%- endif -%}
+            {%- endfor -%}
+            {{ '<|eom_id|>' }}
+        {%- else -%}
+            {{ '<|eot_id|>' }}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif -%}
\ No newline at end of file
diff --git a/models/templates/meetkai-functionary-medium-v3.2.jinja b/models/templates/meetkai-functionary-medium-v3.2.jinja
new file mode 100644
index 000000000..74fd1e7af
--- /dev/null
+++ b/models/templates/meetkai-functionary-medium-v3.2.jinja
@@ -0,0 +1,287 @@
+{# version=v3.llama3 #}{%- macro append_new_param_info(param_declaration, comment_info, examples_info, depth) -%}
+    {%- set offset = "" -%}
+    {%- if depth >= 1 -%}
+        {%- set offset = "    " * depth -%}
+    {%- endif -%}
+    {%- if comment_info != "<|NONE|>" -%}
+        {{ "\n" + offset + comment_info }}
+        {%- if examples_info | length > 0 -%}
+            {# Append each example info #}
+            {%- for example in examples_info -%}
+                {{ "\n" + offset + "// " + example|string|replace("'", '"') }}
+            {%- endfor -%}
+        {%- endif -%}
+    {%- endif -%}
+    {{ "\n" + offset + param_declaration }}
+{%- endmacro -%}
+
+{%- macro convert_data_type(param_type) -%}
+    {%- if param_type == "integer" or param_type == "float" -%}
+        {{ "number" }}
+    {%- else -%}
+        {{ param_type }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro get_param_type(param) -%}
+    {%- set param_type = "any" -%}
+
+    {%- if "type" in param -%}
+        {%- set raw_param_type = param["type"] -%}
+        {%- if raw_param_type is iterable and raw_param_type is not string -%}
+            {%- set param_type = raw_param_type | join(" | ") -%}
+        {%- else -%}
+            {%- set param_type = raw_param_type -%}
+        {%- endif -%}
+        {{ convert_data_type(param_type) }}
+    {%- elif "oneOf" in param -%}
+        {%- set one_of_types = param["oneOf"]|selectattr("type", "defined")|list -%}
+        {%- set one_of_types = one_of_types|map(attribute="type")|unique|list -%}
+        {{ convert_data_type(one_of_types | join(" | ")) }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro get_format_param(param) -%}
+    {%- if "format" in param -%}
+        {{ param["format"] }}
+    {%- elif "oneOf" in param -%}
+        {%- set formats = [] -%}
+        {%- for item in param["oneOf"] -%}
+            {%- if "format" in item -%}
+                {%- if item["format"] == param["oneOf"][-1]["format"] -%}
+                    {{ item["format"] }}
+                {%- else -%}
+                    {{ item["format"] + " or "}}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ "<|NONE|>" }}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro get_param_info(param) -%}
+    {%- set param_type = param.get("type", "any") -%}
+    {%- set format_param = get_format_param(param) -%}
+
+    {%- if "description" in param or "default" in param or format_param != "<|NONE|>" or param["maximum"] or param["minimum"] or param["maxLength"] or param["minLength"] -%}
+        {{ "//" }}
+        {%- if "description" in param -%}
+            {%- set desc = param["description"] -%}
+            {%- if not desc.endswith(".") -%}
+                {%- set desc = desc + "." -%}
+            {%- endif -%}
+            {{ " " + desc }}
+        {%- endif -%}
+
+        {%- if "default" in param -%}
+            {%- set default_value = param["default"] -%}
+            {%- if param_type == "string" -%}
+                {%- set default_value = '"' ~ default_value ~ '"' -%}
+            {%- endif -%}
+            {{ " Default=" ~ default_value ~ "." }}
+        {%- endif -%}
+
+        {%- set format_param = get_format_param(param) -%}
+        {%- if format_param != "<|NONE|>" -%}
+            {{ " Format=" ~ format_param }}
+        {%- endif -%}
+
+        {%- for field, field_name in [("maximum", "Maximum"), ("minimum", "Minimum"), ("maxLength", "Maximum length"), ("minLength", "Minimum length")] -%}
+            {%- if field in param -%}
+                {{ " " + field_name ~ "=" ~ param[field] }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ "<|NONE|>"}}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro get_enum_option_str(enum_options) -%}
+    {%- for v in enum_options -%}
+        {%- if v is string -%}
+            {{ '"' + v + '"' }}
+        {%- else -%}
+            {{ v }}
+        {%- endif -%}
+        {%- if enum_options|length > 0 and v != enum_options[-1] -%}
+            {{ " | " }}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+
+{%- macro get_array_typescript(param_name, param_dic, depth) -%}
+    {%- set offset = '' -%}
+    {%- if depth >= 1 -%}
+        {%- set offset = "    " * depth -%}
+    {%- endif -%}
+    {%- set items_info = param_dic.get('items', {}) -%}
+
+    {%- if items_info|length == 0 -%}
+        {%- if param_name -%}
+            {{ "\n" + offset + param_name + ": []" }}
+        {%- else -%}
+            {{ "\n" + offset + "[]" }}
+        {%- endif -%}
+    {%- else -%}
+        {%- set array_type = get_param_type(items_info) -%}
+        {%- if array_type == 'object' -%}
+            {%- if param_name -%}
+                {{ "\n" + offset + param_name + ": {" }}
+            {%- else -%}
+                {{ "\n" + offset + "{" }}
+            {%- endif -%}
+            {{ get_parameter_typescript(items_info.get('properties', {}), items_info.get('required', []), depth + 1) -}}
+            {{- "\n" + offset + "}[]" }}
+        {%- elif array_type == 'array' -%}
+            {%- set item_info = get_array_typescript(None, items_info, depth + 1) -%}
+            {%- if not param_name -%}
+                {{ "\n" + item_info + "[]" }}
+            {%- else -%}
+                {{ "\n" + offset + param_name + ": " + item_info|trim + "[]" }}
+            {%- endif -%}
+        {%- else -%}
+            {%- if 'enum' in items_info -%}
+                {%- set item_type = get_enum_option_str(items_info['enum']) -%}
+                {%- if param_name is none -%}
+                    {{ "(" + item_type + ")[]"}}
+                {%- else -%}
+                    {{ "\n" + offset + param_name + ": (" + item_type + ")[]" }}
+                {%- endif -%}
+            {%- else -%}
+                {%- if param_name is none -%}
+                    {{ "\n" + array_type + "[]" }}
+                {%- else -%}
+                    {{ "\n" + offset + param_name + ": " + array_type + "[]," }}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endif -%}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- macro get_parameter_typescript(properties, required_params, depth=0) -%}
+    {%- set res = "" -%}
+    {%- for param_name, param in properties.items() -%}
+        {%- if param is mapping -%}
+            {%- set comment_info = get_param_info(param) -%}
+            {# Param Examples #}
+            {%- set examples_info = [] -%}
+            {%- if "examples" in param -%}
+                {%- set examples_info = ["Example " + param_name + ":"] -%}
+                {%- set examples_info = examples_info + param["examples"] -%}
+            {%- endif -%}
+
+            {# Param Name declaration #}
+            {%- set param_declaration = param_name -%}
+            {%- if required_params is iterable and param_name not in required_params -%}
+                {%- set param_declaration = param_declaration + "?" -%}
+            {%- endif -%}
+
+            {%- set param_type = get_param_type(param) -%}
+
+            {# Handle indentation based on depth #}
+            {%- set offset = "" -%}
+            {%- if depth >= 1 -%}
+                {%- set offset = "    " * depth -%}
+            {%- endif -%}
+
+            {%- if param_type == "object" -%}
+                {%- if comment_info != "<|NONE|>" -%}
+                    {{ "\n" + offset + comment_info }}
+                {%- endif -%}
+                {%- if examples_info|length > 0 -%}
+                    {%- for example in examples_info -%}
+                        {{ "\n" + offset + "// " + example|string|replace("'", '"') }}
+                    {%- endfor -%}
+                {%- endif -%}
+                {%- set param_declaration = param_declaration + ": {" -%}
+                {{ "\n" + offset + param_declaration -}}
+                {{- get_parameter_typescript(param.get("properties", {}), param.get("required", []), depth + 1) -}}
+                {{- "\n" + offset + "}," }}
+            {%- elif param_type == "array" -%}
+                {%- set item_info = param.get("items", {}) -%}
+                {%- if "type" not in item_info -%}
+                    {%- set param_declaration = param_declaration + ": []," -%}
+                    {{ append_new_param_info(param_declaration, comment_info, examples_info, depth) }}
+                {%- else -%}
+                    {%- if comment_info != "<|NONE|>" -%}
+                        {{ "\n" + offset + comment_info }}
+                    {%- endif -%}
+                    {%- if examples_info|length > 0 -%}
+                        {%- for example in examples_info -%}
+                            {{ "\n" + offset + "// " + example|string|replace("'", '"') }}
+                        {%- endfor -%}
+                    {%- endif -%}
+                    {%- set array_declaration = get_array_typescript(param_declaration, param, depth) -%}
+                    {%- if not array_declaration.endswith(",") -%}
+                        {%- set array_declaration = array_declaration + "," -%}
+                    {%- endif -%}
+                    {{ array_declaration}}
+                {%- endif -%}
+            {%- else -%}
+                {%- if "enum" in param -%}
+                    {%- set param_type = get_enum_option_str(param["enum"]) -%}
+                {%- endif -%}
+                {%- if "nullable" in param and param["nullable"] -%}
+                    {%- set param_type = param_type + " | null" -%}
+                {%- endif -%}
+                {%- set param_declaration = param_declaration + ": " + param_type + "," -%}
+                {{ append_new_param_info(param_declaration, comment_info, examples_info, depth) }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+
+{%- macro generate_schema_from_functions(functions, namespace='functions') -%}
+    {{ "// Supported function definitions that should be called when necessary.\n" -}}
+    {{- "namespace " + namespace + " {\n\n" -}}
+
+    {%- for function in functions -%}
+        {%- if function.get("function") -%}
+            {%- set function = function.get("function") -%}
+        {%- endif -%}
+
+        {%- set function_name = function.get("name") -%}
+        {%- if function_name -%}
+            {%- set description = function.get('description', '') -%}
+            {%- set parameters = function.get('parameters', {}) -%}
+            {{- "// " + description + "\n" -}}
+            {{- "type " + function_name -}}
+            {%- if parameters and parameters.get("properties") -%}
+                {{- " = (_: {" -}}
+                {%- set required_params = parameters.get("required", []) -%}
+                {{ get_parameter_typescript(parameters.get("properties"), required_params, 0) -}}
+                {{- "\n}) => any;\n\n" }}
+            {%- else -%}
+                {{ " = () => any;\n\n" }}
+            {%- endif -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{ "} // namespace " + namespace }}
+{%- endmacro -%}
+{%- if not tools -%}
+    {%- set tools = [] -%}
+{%- endif -%}
+{{ bos_token + '<|start_header_id|>system<|end_header_id|>\n\nYou are capable of executing available function(s) if required.\nOnly execute function(s) when absolutely necessary.\nAsk for the required input to:recipient==all\nUse JSON for function arguments.\nRespond in this format:\n>>>${recipient}\n${content}\nAvailable functions:\n' + generate_schema_from_functions(tools) + '<|eot_id|>' -}}
+{%- if tools|length > 0 and tools|selectattr("type", "equalto", "code_interpreter")|list|length > 0 -%}
+    {{ '<|start_header_id|>system<|end_header_id|>\n\nWhen you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at \'/mnt/data\' can be used to save and persist user files.<|eot_id|>' }}
+{%- endif -%}
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' or message['role'] == 'system' -%}
+        {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
+    {%- elif message['role'] == 'tool' -%}
+        {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
+    {%- else -%}
+        {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}
+        {%- if message['content'] -%}
+            {{ '>>>all\n' + message['content'] }}
+        {%- endif -%}
+        {%- if 'tool_calls' in message and message['tool_calls'] -%}
+            {%- for tool_call in message['tool_calls'] -%}
+                {{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}
+            {%- endfor -%}
+        {%- endif -%}
+        {{ '<|eot_id|>' }}
+    {%- endif -%}
+{%- endfor -%}
+{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n>>>' }}{% endif %}
\ No newline at end of file
diff --git a/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja b/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja
new file mode 100644
index 000000000..33089ace1
--- /dev/null
+++ b/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja
@@ -0,0 +1,109 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
diff --git a/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja b/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja
new file mode 100644
index 000000000..1bad6a0f6
--- /dev/null
+++ b/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja
@@ -0,0 +1,93 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- if strftime_now is defined %}
+        {%- set date_string = strftime_now("%d %b %Y") %}
+    {%- else %}
+        {%- set date_string = "26 Jul 2024" %}
+    {%- endif %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+        {{- '{"name": "' + tool_call.name + '", ' }}
+        {{- '"parameters": ' }}
+        {{- tool_call.arguments | tojson }}
+        {{- "}" }}
+        {{- "<|eot_id|>" }}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
diff --git a/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja b/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja
new file mode 100644
index 000000000..33089ace1
--- /dev/null
+++ b/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja
@@ -0,0 +1,109 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
diff --git a/models/templates/microsoft-Phi-3.5-mini-instruct.jinja b/models/templates/microsoft-Phi-3.5-mini-instruct.jinja
new file mode 100644
index 000000000..d1533d152
--- /dev/null
+++ b/models/templates/microsoft-Phi-3.5-mini-instruct.jinja
@@ -0,0 +1,8 @@
+{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>
+' + message['content'] + '<|end|>
+'}}{% elif message['role'] == 'user' %}{{'<|user|>
+' + message['content'] + '<|end|>
+'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
+' + message['content'] + '<|end|>
+'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
+' }}{% else %}{{ eos_token }}{% endif %}
\ No newline at end of file
diff --git a/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja b/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja
new file mode 100644
index 000000000..9c21a3f13
--- /dev/null
+++ b/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja
@@ -0,0 +1,87 @@
+{%- if messages[0]["role"] == "system" %}
+    {%- set system_message = messages[0]["content"] %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
+
+{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}
+{%- set ns = namespace() %}
+{%- set ns.index = 0 %}
+{%- for message in loop_messages %}
+    {%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}
+        {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}
+            {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
+        {%- endif %}
+        {%- set ns.index = ns.index + 1 %}
+    {%- endif %}
+{%- endfor %}
+
+{{- bos_token }}
+{%- for message in loop_messages %}
+    {%- if message["role"] == "user" %}
+        {%- if tools is not none and (message == user_messages[-1]) %}
+            {{- "[AVAILABLE_TOOLS][" }}
+            {%- for tool in tools %}
+                {%- set tool = tool.function %}
+                {{- '{"type": "function", "function": {' }}
+                {%- for key, val in tool.items() if key != "return" %}
+                    {%- if val is string %}
+                        {{- '"' + key + '": "' + val + '"' }}
+                    {%- else %}
+                        {{- '"' + key + '": ' + val|tojson }}
+                    {%- endif %}
+                    {%- if not loop.last %}
+                        {{- ", " }}
+                    {%- endif %}
+                {%- endfor %}
+                {{- "}}" }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- else %}
+                    {{- "]" }}
+                {%- endif %}
+            {%- endfor %}
+            {{- "[/AVAILABLE_TOOLS]" }}
+            {%- endif %}
+        {%- if loop.last and system_message is defined %}
+            {{- "[INST]" + system_message + "\n\n" + message["content"] + "[/INST]" }}
+        {%- else %}
+            {{- "[INST]" + message["content"] + "[/INST]" }}
+        {%- endif %}
+    {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}
+        {{- "[TOOL_CALLS][" }}
+        {%- for tool_call in message.tool_calls %}
+            {%- set out = tool_call.function|tojson %}
+            {{- out[:-1] }}
+            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}
+                {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
+            {%- endif %}
+            {{- ', "id": "' + tool_call.id + '"}' }}
+            {%- if not loop.last %}
+                {{- ", " }}
+            {%- else %}
+                {{- "]" + eos_token }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif message["role"] == "assistant" %}
+        {{- message["content"] + eos_token}}
+    {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}
+        {%- if message.content is defined and message.content.content is defined %}
+            {%- set content = message.content.content %}
+        {%- else %}
+            {%- set content = message.content %}
+        {%- endif %}
+        {{- '[TOOL_RESULTS]{"content": ' + content|string + ", " }}
+        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}
+            {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
+        {%- endif %}
+        {{- '"call_id": "' + message.tool_call_id + '"}[/TOOL_RESULTS]' }}
+    {%- else %}
+        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
+    {%- endif %}
+{%- endfor %}
diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt
index 03e1d2c04..d49d14dee 100644
--- a/pocs/CMakeLists.txt
+++ b/pocs/CMakeLists.txt
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(vdot)
+    if (NOT GGML_BACKEND_DL)
+        add_subdirectory(vdot)
+    endif()
 endif()
diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt
index fb89a1cd4..6235aec1f 100644
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@@ -1,9 +1,9 @@
-set(TARGET vdot)
+set(TARGET llama-vdot)
 add_executable(${TARGET} vdot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-set(TARGET q8dot)
+set(TARGET llama-q8dot)
 add_executable(${TARGET} q8dot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
index 1a52ff5e9..3df6e1f42 100644
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -11,6 +11,7 @@
 #include <type_traits>
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 
 constexpr int kVecSize = 1 << 16;
 
@@ -136,7 +137,7 @@ int main(int argc, char** argv) {
 
     auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
 
-    auto funcs = ggml_internal_get_type_traits(ggml_type);
+    const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
 
     Stat simple, ggml;
 
@@ -156,8 +157,8 @@ int main(int argc, char** argv) {
 
         t1 = std::chrono::high_resolution_clock::now();
         float fs;
-        if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
-        else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
+        if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
+        else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
         t2 = std::chrono::high_resolution_clock::now();
         t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
         if (iloop > 3) ggml.addResult(fs, t);
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index 17e9e4482..2dca62848 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -9,6 +9,7 @@
 #include <array>
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -236,7 +237,7 @@ int main(int argc, char** argv) {
     int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
     int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
 
-    auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
+    const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
 
     std::vector<block_q4_0> q40;
     std::vector<block_q4_1> q41;
@@ -261,9 +262,9 @@ int main(int argc, char** argv) {
         // Note, we do not include this in the timing as in practical application
         // we already have the quantized model weights.
         if (useQ4_1) {
-            funcs.from_float(x1.data(), q41.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
         } else {
-            funcs.from_float(x1.data(), q40.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
         }
 
         // Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +283,10 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
-            vdot.from_float(y1.data(), q8.data(), kVecSize);
-            if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-            else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+            const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
+            vdot->from_float(y1.data(), q8.data(), kVecSize);
+            if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+            else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
         }
         sumq += result;
         t2 = std::chrono::high_resolution_clock::now();
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 000000000..eb6baa6c7
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,1197 @@
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+
+[[package]]
+name = "atomicwrites"
+version = "1.4.1"
+description = "Atomic file writes."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"},
+]
+
+[[package]]
+name = "attrs"
+version = "23.2.0"
+description = "Classes Without Boilerplate"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"},
+    {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"},
+]
+
+[package.extras]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"]
+tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"]
+
+[[package]]
+name = "certifi"
+version = "2024.2.2"
+description = "Python package for providing Mozilla's CA Bundle."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"},
+    {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"},
+]
+
+[[package]]
+name = "charset-normalizer"
+version = "3.3.2"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
+    {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+files = [
+    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+
+[[package]]
+name = "filelock"
+version = "3.13.1"
+description = "A platform independent file lock."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"},
+    {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"},
+]
+
+[package.extras]
+docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
+typing = ["typing-extensions (>=4.8)"]
+
+[[package]]
+name = "fsspec"
+version = "2024.2.0"
+description = "File-system specification"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "fsspec-2024.2.0-py3-none-any.whl", hash = "sha256:817f969556fa5916bc682e02ca2045f96ff7f586d45110fcb76022063ad2c7d8"},
+    {file = "fsspec-2024.2.0.tar.gz", hash = "sha256:b6ad1a679f760dda52b1168c859d01b7b80648ea6f7f7c7f5a8a91dc3f3ecb84"},
+]
+
+[package.extras]
+abfs = ["adlfs"]
+adl = ["adlfs"]
+arrow = ["pyarrow (>=1)"]
+dask = ["dask", "distributed"]
+devel = ["pytest", "pytest-cov"]
+dropbox = ["dropbox", "dropboxdrivefs", "requests"]
+full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"]
+fuse = ["fusepy"]
+gcs = ["gcsfs"]
+git = ["pygit2"]
+github = ["requests"]
+gs = ["gcsfs"]
+gui = ["panel"]
+hdfs = ["pyarrow (>=1)"]
+http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"]
+libarchive = ["libarchive-c"]
+oci = ["ocifs"]
+s3 = ["s3fs"]
+sftp = ["paramiko"]
+smb = ["smbprotocol"]
+ssh = ["paramiko"]
+tqdm = ["tqdm"]
+
+[[package]]
+name = "gguf"
+version = "0.7.0"
+description = "Read and write ML models in GGUF for GGML"
+optional = false
+python-versions = ">=3.8"
+files = []
+develop = false
+
+[package.dependencies]
+numpy = ">=1.17"
+
+[package.source]
+type = "directory"
+url = "gguf-py"
+
+[[package]]
+name = "huggingface-hub"
+version = "0.20.3"
+description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
+optional = false
+python-versions = ">=3.8.0"
+files = [
+    {file = "huggingface_hub-0.20.3-py3-none-any.whl", hash = "sha256:d988ae4f00d3e307b0c80c6a05ca6dbb7edba8bba3079f74cda7d9c2e562a7b6"},
+    {file = "huggingface_hub-0.20.3.tar.gz", hash = "sha256:94e7f8e074475fbc67d6a71957b678e1b4a74ff1b64a644fd6cbb83da962d05d"},
+]
+
+[package.dependencies]
+filelock = "*"
+fsspec = ">=2023.5.0"
+packaging = ">=20.9"
+pyyaml = ">=5.1"
+requests = "*"
+tqdm = ">=4.42.1"
+typing-extensions = ">=3.7.4.3"
+
+[package.extras]
+all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
+cli = ["InquirerPy (==0.3.4)"]
+dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
+fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
+inference = ["aiohttp", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)"]
+quality = ["mypy (==1.5.1)", "ruff (>=0.1.3)"]
+tensorflow = ["graphviz", "pydot", "tensorflow"]
+testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
+torch = ["torch"]
+typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
+
+[[package]]
+name = "idna"
+version = "3.6"
+description = "Internationalized Domain Names in Applications (IDNA)"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"},
+    {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.3"
+description = "A very fast and expressive template engine."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
+    {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
+]
+
+[package.dependencies]
+MarkupSafe = ">=2.0"
+
+[package.extras]
+i18n = ["Babel (>=2.7)"]
+
+[[package]]
+name = "markupsafe"
+version = "2.1.5"
+description = "Safely add untrusted strings to HTML/XML markup."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"},
+    {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"},
+]
+
+[[package]]
+name = "more-itertools"
+version = "10.2.0"
+description = "More routines for operating on iterables, beyond itertools"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "more-itertools-10.2.0.tar.gz", hash = "sha256:8fccb480c43d3e99a00087634c06dd02b0d50fbf088b380de5a41a015ec239e1"},
+    {file = "more_itertools-10.2.0-py3-none-any.whl", hash = "sha256:686b06abe565edfab151cb8fd385a05651e1fdf8f0a14191e4439283421f8684"},
+]
+
+[[package]]
+name = "mpmath"
+version = "1.3.0"
+description = "Python library for arbitrary-precision floating-point arithmetic"
+optional = false
+python-versions = "*"
+files = [
+    {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"},
+    {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"},
+]
+
+[package.extras]
+develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"]
+docs = ["sphinx"]
+gmpy = ["gmpy2 (>=2.1.0a4)"]
+tests = ["pytest (>=4.6)"]
+
+[[package]]
+name = "networkx"
+version = "3.2.1"
+description = "Python package for creating and manipulating graphs and networks"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"},
+    {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"},
+]
+
+[package.extras]
+default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"]
+developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"]
+doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"]
+extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
+test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]
+
+[[package]]
+name = "numpy"
+version = "1.26.4"
+description = "Fundamental package for array computing in Python"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"},
+    {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"},
+    {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"},
+    {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"},
+    {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"},
+    {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"},
+    {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"},
+    {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"},
+    {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"},
+    {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
+]
+
+[[package]]
+name = "packaging"
+version = "23.2"
+description = "Core utilities for Python packages"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"},
+    {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"},
+]
+
+[[package]]
+name = "pluggy"
+version = "0.13.1"
+description = "plugin and hook calling mechanisms for python"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
+    {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
+]
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+
+[[package]]
+name = "protobuf"
+version = "4.25.3"
+description = ""
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "protobuf-4.25.3-cp310-abi3-win32.whl", hash = "sha256:d4198877797a83cbfe9bffa3803602bbe1625dc30d8a097365dbc762e5790faa"},
+    {file = "protobuf-4.25.3-cp310-abi3-win_amd64.whl", hash = "sha256:209ba4cc916bab46f64e56b85b090607a676f66b473e6b762e6f1d9d591eb2e8"},
+    {file = "protobuf-4.25.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f1279ab38ecbfae7e456a108c5c0681e4956d5b1090027c1de0f934dfdb4b35c"},
+    {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:e7cb0ae90dd83727f0c0718634ed56837bfeeee29a5f82a7514c03ee1364c019"},
+    {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:7c8daa26095f82482307bc717364e7c13f4f1c99659be82890dcfc215194554d"},
+    {file = "protobuf-4.25.3-cp38-cp38-win32.whl", hash = "sha256:f4f118245c4a087776e0a8408be33cf09f6c547442c00395fbfb116fac2f8ac2"},
+    {file = "protobuf-4.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:c053062984e61144385022e53678fbded7aea14ebb3e0305ae3592fb219ccfa4"},
+    {file = "protobuf-4.25.3-cp39-cp39-win32.whl", hash = "sha256:19b270aeaa0099f16d3ca02628546b8baefe2955bbe23224aaf856134eccf1e4"},
+    {file = "protobuf-4.25.3-cp39-cp39-win_amd64.whl", hash = "sha256:e3c97a1555fd6388f857770ff8b9703083de6bf1f9274a002a332d65fbb56c8c"},
+    {file = "protobuf-4.25.3-py3-none-any.whl", hash = "sha256:f0700d54bcf45424477e46a9f0944155b46fb0639d69728739c0e47bab83f2b9"},
+    {file = "protobuf-4.25.3.tar.gz", hash = "sha256:25b5d0b42fd000320bd7830b349e3b696435f3b329810427a6bcce6a5492cc5c"},
+]
+
+[[package]]
+name = "py"
+version = "1.11.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+    {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"},
+    {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
+]
+
+[[package]]
+name = "pytest"
+version = "5.4.3"
+description = "pytest: simple powerful testing with Python"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"},
+    {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"},
+]
+
+[package.dependencies]
+atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
+attrs = ">=17.4.0"
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+more-itertools = ">=4.0.0"
+packaging = "*"
+pluggy = ">=0.12,<1.0"
+py = ">=1.5.0"
+wcwidth = "*"
+
+[package.extras]
+checkqa-mypy = ["mypy (==v0.761)"]
+testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.1"
+description = "YAML parser and emitter for Python"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
+    {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
+    {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
+]
+
+[[package]]
+name = "regex"
+version = "2023.12.25"
+description = "Alternative regular expression module, to replace re."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5"},
+    {file = "regex-2023.12.25-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b014333bd0217ad3d54c143de9d4b9a3ca1c5a29a6d0d554952ea071cff0f1f8"},
+    {file = "regex-2023.12.25-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d865984b3f71f6d0af64d0d88f5733521698f6c16f445bb09ce746c92c97c586"},
+    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e0eabac536b4cc7f57a5f3d095bfa557860ab912f25965e08fe1545e2ed8b4c"},
+    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c25a8ad70e716f96e13a637802813f65d8a6760ef48672aa3502f4c24ea8b400"},
+    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9b6d73353f777630626f403b0652055ebfe8ff142a44ec2cf18ae470395766e"},
+    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9cc99d6946d750eb75827cb53c4371b8b0fe89c733a94b1573c9dd16ea6c9e4"},
+    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88d1f7bef20c721359d8675f7d9f8e414ec5003d8f642fdfd8087777ff7f94b5"},
+    {file = "regex-2023.12.25-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cb3fe77aec8f1995611f966d0c656fdce398317f850d0e6e7aebdfe61f40e1cd"},
+    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7aa47c2e9ea33a4a2a05f40fcd3ea36d73853a2aae7b4feab6fc85f8bf2c9704"},
+    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:df26481f0c7a3f8739fecb3e81bc9da3fcfae34d6c094563b9d4670b047312e1"},
+    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c40281f7d70baf6e0db0c2f7472b31609f5bc2748fe7275ea65a0b4601d9b392"},
+    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d94a1db462d5690ebf6ae86d11c5e420042b9898af5dcf278bd97d6bda065423"},
+    {file = "regex-2023.12.25-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba1b30765a55acf15dce3f364e4928b80858fa8f979ad41f862358939bdd1f2f"},
+    {file = "regex-2023.12.25-cp310-cp310-win32.whl", hash = "sha256:150c39f5b964e4d7dba46a7962a088fbc91f06e606f023ce57bb347a3b2d4630"},
+    {file = "regex-2023.12.25-cp310-cp310-win_amd64.whl", hash = "sha256:09da66917262d9481c719599116c7dc0c321ffcec4b1f510c4f8a066f8768105"},
+    {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1b9d811f72210fa9306aeb88385b8f8bcef0dfbf3873410413c00aa94c56c2b6"},
+    {file = "regex-2023.12.25-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d902a43085a308cef32c0d3aea962524b725403fd9373dea18110904003bac97"},
+    {file = "regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d166eafc19f4718df38887b2bbe1467a4f74a9830e8605089ea7a30dd4da8887"},
+    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7ad32824b7f02bb3c9f80306d405a1d9b7bb89362d68b3c5a9be53836caebdb"},
+    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:636ba0a77de609d6510235b7f0e77ec494d2657108f777e8765efc060094c98c"},
+    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fda75704357805eb953a3ee15a2b240694a9a514548cd49b3c5124b4e2ad01b"},
+    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f72cbae7f6b01591f90814250e636065850c5926751af02bb48da94dfced7baa"},
+    {file = "regex-2023.12.25-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db2a0b1857f18b11e3b0e54ddfefc96af46b0896fb678c85f63fb8c37518b3e7"},
+    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7502534e55c7c36c0978c91ba6f61703faf7ce733715ca48f499d3dbbd7657e0"},
+    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e8c7e08bb566de4faaf11984af13f6bcf6a08f327b13631d41d62592681d24fe"},
+    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:283fc8eed679758de38fe493b7d7d84a198b558942b03f017b1f94dda8efae80"},
+    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f44dd4d68697559d007462b0a3a1d9acd61d97072b71f6d1968daef26bc744bd"},
+    {file = "regex-2023.12.25-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:67d3ccfc590e5e7197750fcb3a2915b416a53e2de847a728cfa60141054123d4"},
+    {file = "regex-2023.12.25-cp311-cp311-win32.whl", hash = "sha256:68191f80a9bad283432385961d9efe09d783bcd36ed35a60fb1ff3f1ec2efe87"},
+    {file = "regex-2023.12.25-cp311-cp311-win_amd64.whl", hash = "sha256:7d2af3f6b8419661a0c421584cfe8aaec1c0e435ce7e47ee2a97e344b98f794f"},
+    {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8a0ccf52bb37d1a700375a6b395bff5dd15c50acb745f7db30415bae3c2b0715"},
+    {file = "regex-2023.12.25-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c3c4a78615b7762740531c27cf46e2f388d8d727d0c0c739e72048beb26c8a9d"},
+    {file = "regex-2023.12.25-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad83e7545b4ab69216cef4cc47e344d19622e28aabec61574b20257c65466d6a"},
+    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7a635871143661feccce3979e1727c4e094f2bdfd3ec4b90dfd4f16f571a87a"},
+    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d498eea3f581fbe1b34b59c697512a8baef88212f92e4c7830fcc1499f5b45a5"},
+    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43f7cd5754d02a56ae4ebb91b33461dc67be8e3e0153f593c509e21d219c5060"},
+    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51f4b32f793812714fd5307222a7f77e739b9bc566dc94a18126aba3b92b98a3"},
+    {file = "regex-2023.12.25-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba99d8077424501b9616b43a2d208095746fb1284fc5ba490139651f971d39d9"},
+    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4bfc2b16e3ba8850e0e262467275dd4d62f0d045e0e9eda2bc65078c0110a11f"},
+    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8c2c19dae8a3eb0ea45a8448356ed561be843b13cbc34b840922ddf565498c1c"},
+    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:60080bb3d8617d96f0fb7e19796384cc2467447ef1c491694850ebd3670bc457"},
+    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b77e27b79448e34c2c51c09836033056a0547aa360c45eeeb67803da7b0eedaf"},
+    {file = "regex-2023.12.25-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:518440c991f514331f4850a63560321f833979d145d7d81186dbe2f19e27ae3d"},
+    {file = "regex-2023.12.25-cp312-cp312-win32.whl", hash = "sha256:e2610e9406d3b0073636a3a2e80db05a02f0c3169b5632022b4e81c0364bcda5"},
+    {file = "regex-2023.12.25-cp312-cp312-win_amd64.whl", hash = "sha256:cc37b9aeebab425f11f27e5e9e6cf580be7206c6582a64467a14dda211abc232"},
+    {file = "regex-2023.12.25-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:da695d75ac97cb1cd725adac136d25ca687da4536154cdc2815f576e4da11c69"},
+    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d126361607b33c4eb7b36debc173bf25d7805847346dd4d99b5499e1fef52bc7"},
+    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4719bb05094d7d8563a450cf8738d2e1061420f79cfcc1fa7f0a44744c4d8f73"},
+    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dd58946bce44b53b06d94aa95560d0b243eb2fe64227cba50017a8d8b3cd3e2"},
+    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22a86d9fff2009302c440b9d799ef2fe322416d2d58fc124b926aa89365ec482"},
+    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aae8101919e8aa05ecfe6322b278f41ce2994c4a430303c4cd163fef746e04f"},
+    {file = "regex-2023.12.25-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e692296c4cc2873967771345a876bcfc1c547e8dd695c6b89342488b0ea55cd8"},
+    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:263ef5cc10979837f243950637fffb06e8daed7f1ac1e39d5910fd29929e489a"},
+    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d6f7e255e5fa94642a0724e35406e6cb7001c09d476ab5fce002f652b36d0c39"},
+    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:88ad44e220e22b63b0f8f81f007e8abbb92874d8ced66f32571ef8beb0643b2b"},
+    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:3a17d3ede18f9cedcbe23d2daa8a2cd6f59fe2bf082c567e43083bba3fb00347"},
+    {file = "regex-2023.12.25-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d15b274f9e15b1a0b7a45d2ac86d1f634d983ca40d6b886721626c47a400bf39"},
+    {file = "regex-2023.12.25-cp37-cp37m-win32.whl", hash = "sha256:ed19b3a05ae0c97dd8f75a5d8f21f7723a8c33bbc555da6bbe1f96c470139d3c"},
+    {file = "regex-2023.12.25-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d1047952c0b8104a1d371f88f4ab62e6275567d4458c1e26e9627ad489b445"},
+    {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b43523d7bc2abd757119dbfb38af91b5735eea45537ec6ec3a5ec3f9562a1c53"},
+    {file = "regex-2023.12.25-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:efb2d82f33b2212898f1659fb1c2e9ac30493ac41e4d53123da374c3b5541e64"},
+    {file = "regex-2023.12.25-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7fca9205b59c1a3d5031f7e64ed627a1074730a51c2a80e97653e3e9fa0d415"},
+    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086dd15e9435b393ae06f96ab69ab2d333f5d65cbe65ca5a3ef0ec9564dfe770"},
+    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e81469f7d01efed9b53740aedd26085f20d49da65f9c1f41e822a33992cb1590"},
+    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:34e4af5b27232f68042aa40a91c3b9bb4da0eeb31b7632e0091afc4310afe6cb"},
+    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9852b76ab558e45b20bf1893b59af64a28bd3820b0c2efc80e0a70a4a3ea51c1"},
+    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff100b203092af77d1a5a7abe085b3506b7eaaf9abf65b73b7d6905b6cb76988"},
+    {file = "regex-2023.12.25-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cc038b2d8b1470364b1888a98fd22d616fba2b6309c5b5f181ad4483e0017861"},
+    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:094ba386bb5c01e54e14434d4caabf6583334090865b23ef58e0424a6286d3dc"},
+    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5cd05d0f57846d8ba4b71d9c00f6f37d6b97d5e5ef8b3c3840426a475c8f70f4"},
+    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:9aa1a67bbf0f957bbe096375887b2505f5d8ae16bf04488e8b0f334c36e31360"},
+    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:98a2636994f943b871786c9e82bfe7883ecdaba2ef5df54e1450fa9869d1f756"},
+    {file = "regex-2023.12.25-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37f8e93a81fc5e5bd8db7e10e62dc64261bcd88f8d7e6640aaebe9bc180d9ce2"},
+    {file = "regex-2023.12.25-cp38-cp38-win32.whl", hash = "sha256:d78bd484930c1da2b9679290a41cdb25cc127d783768a0369d6b449e72f88beb"},
+    {file = "regex-2023.12.25-cp38-cp38-win_amd64.whl", hash = "sha256:b521dcecebc5b978b447f0f69b5b7f3840eac454862270406a39837ffae4e697"},
+    {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f7bc09bc9c29ebead055bcba136a67378f03d66bf359e87d0f7c759d6d4ffa31"},
+    {file = "regex-2023.12.25-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e14b73607d6231f3cc4622809c196b540a6a44e903bcfad940779c80dffa7be7"},
+    {file = "regex-2023.12.25-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9eda5f7a50141291beda3edd00abc2d4a5b16c29c92daf8d5bd76934150f3edc"},
+    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc6bb9aa69aacf0f6032c307da718f61a40cf970849e471254e0e91c56ffca95"},
+    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298dc6354d414bc921581be85695d18912bea163a8b23cac9a2562bbcd5088b1"},
+    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f4e475a80ecbd15896a976aa0b386c5525d0ed34d5c600b6d3ebac0a67c7ddf"},
+    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:531ac6cf22b53e0696f8e1d56ce2396311254eb806111ddd3922c9d937151dae"},
+    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22f3470f7524b6da61e2020672df2f3063676aff444db1daa283c2ea4ed259d6"},
+    {file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:89723d2112697feaa320c9d351e5f5e7b841e83f8b143dba8e2d2b5f04e10923"},
+    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ecf44ddf9171cd7566ef1768047f6e66975788258b1c6c6ca78098b95cf9a3d"},
+    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:905466ad1702ed4acfd67a902af50b8db1feeb9781436372261808df7a2a7bca"},
+    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4558410b7a5607a645e9804a3e9dd509af12fb72b9825b13791a37cd417d73a5"},
+    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7e316026cc1095f2a3e8cc012822c99f413b702eaa2ca5408a513609488cb62f"},
+    {file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3b1de218d5375cd6ac4b5493e0b9f3df2be331e86520f23382f216c137913d20"},
+    {file = "regex-2023.12.25-cp39-cp39-win32.whl", hash = "sha256:11a963f8e25ab5c61348d090bf1b07f1953929c13bd2309a0662e9ff680763c9"},
+    {file = "regex-2023.12.25-cp39-cp39-win_amd64.whl", hash = "sha256:e693e233ac92ba83a87024e1d32b5f9ab15ca55ddd916d878146f4e3406b5c91"},
+    {file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"},
+]
+
+[[package]]
+name = "requests"
+version = "2.31.0"
+description = "Python HTTP for Humans."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
+    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
+]
+
+[package.dependencies]
+certifi = ">=2017.4.17"
+charset-normalizer = ">=2,<4"
+idna = ">=2.5,<4"
+urllib3 = ">=1.21.1,<3"
+
+[package.extras]
+socks = ["PySocks (>=1.5.6,!=1.5.7)"]
+use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
+
+[[package]]
+name = "safetensors"
+version = "0.4.2"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "safetensors-0.4.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:69d8bb8384dc2cb5b72c36c4d6980771b293d1a1377b378763f5e37b6bb8d133"},
+    {file = "safetensors-0.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3d420e19fcef96d0067f4de4699682b4bbd85fc8fea0bd45fcd961fdf3e8c82c"},
+    {file = "safetensors-0.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ca54742122fa3c4821754adb67318e1cd25c3a22bbf0c5520d5176e77a099ac"},
+    {file = "safetensors-0.4.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8b47aa643afdfd66cf7ce4c184092ae734e15d10aba2c2948f24270211801c3c"},
+    {file = "safetensors-0.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d88a16bbc330f27e7f2d4caaf6fb061ad0b8a756ecc4033260b0378e128ce8a2"},
+    {file = "safetensors-0.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9223b8ac21085db614a510eb3445e7083cae915a9202357555fa939695d4f57"},
+    {file = "safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce6cb86133dc8930a7ab5e7438545a7f205f7a1cdd5aaf108c1d0da6bdcfbc2b"},
+    {file = "safetensors-0.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b8a628e0ae2bbc334b62952c384aa5f41621d01850f8d67b04a96b9c39dd7326"},
+    {file = "safetensors-0.4.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:88d6beb7f811a081e0e5f1d9669fdac816c45340c04b1eaf7ebfda0ce93ea403"},
+    {file = "safetensors-0.4.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b57fc5b1b54cb12d8690a58a4cf4b7144730d4bde9d98aa0e1dab6295a1cd579"},
+    {file = "safetensors-0.4.2-cp310-none-win32.whl", hash = "sha256:9d87a1c98803c16cf113b9ba03f07b2dce5e8eabfd1811a7f7323fcaa2a1bf47"},
+    {file = "safetensors-0.4.2-cp310-none-win_amd64.whl", hash = "sha256:18930ec1d1ecb526d3d9835abc2489b8f1530877518f0c541e77ef0b7abcbd99"},
+    {file = "safetensors-0.4.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:c5dd2ed788730ed56b415d1a11c62026b8cc8c573f55a2092afb3ab383e94fff"},
+    {file = "safetensors-0.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc41791b33efb9c83a59b731619f3d15f543dfe71f3a793cb8fbf9bd5d0d5d71"},
+    {file = "safetensors-0.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c888bf71d5ca12a720f1ed87d407c4918afa022fb247a6546d8fac15b1f112b"},
+    {file = "safetensors-0.4.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e6b2feb4b47226a16a792e6fac3f49442714884a3d4c1008569d5068a3941be9"},
+    {file = "safetensors-0.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f41cc0ee4b838ae8f4d8364a1b162067693d11a3893f0863be8c228d40e4d0ee"},
+    {file = "safetensors-0.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:51b7228e46c0a483c40ba4b9470dea00fb1ff8685026bb4766799000f6328ac2"},
+    {file = "safetensors-0.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02697f8f2be8ca3c37a4958702dbdb1864447ef765e18b5328a1617022dcf164"},
+    {file = "safetensors-0.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:27fd8f65cf7c80e4280cae1ee6bcd85c483882f6580821abe71ee1a0d3dcfca7"},
+    {file = "safetensors-0.4.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c487b5f113b0924c9534a07dc034830fb4ef05ce9bb6d78cfe016a7dedfe281f"},
+    {file = "safetensors-0.4.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:da7f6483f3fe67ff39b3a55552552c67930ea10a36e9f2539d36fc205273d767"},
+    {file = "safetensors-0.4.2-cp311-none-win32.whl", hash = "sha256:52a7012f6cb9cb4a132760b6308daede18a9f5f8952ce08adc7c67a7d865c2d8"},
+    {file = "safetensors-0.4.2-cp311-none-win_amd64.whl", hash = "sha256:4d1361a097ac430b310ce9eed8ed4746edee33ddafdfbb965debc8966fc34dc2"},
+    {file = "safetensors-0.4.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:77af8aa0edcc2863760fd6febbfdb82e88fd75d0e60c1ce4ba57208ba5e4a89b"},
+    {file = "safetensors-0.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846666c1c5a8c8888d2dfda8d3921cb9cb8e2c5f78365be756c11021e75a0a2a"},
+    {file = "safetensors-0.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f4bfc7ea19b446bfad41510d4b4c76101698c00caaa8a332c8edd8090a412ef"},
+    {file = "safetensors-0.4.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:233436fd30f27ffeb3c3780d0b84f496518868445c7a8db003639a649cc98453"},
+    {file = "safetensors-0.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7a09237a795d11cd11f9dae505d170a29b5616151db1e10c14f892b11caadc7d"},
+    {file = "safetensors-0.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de01c9a3a3b7b69627d624ff69d9f11d28ce9908eea2fb6245adafa4b1d43df6"},
+    {file = "safetensors-0.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c1f25c5069ee42a5bcffdc66c300a407941edd73f3239e9fdefd26216407391"},
+    {file = "safetensors-0.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7a73b3649456d09ca8506140d44484b63154a7378434cc1e8719f8056550b224"},
+    {file = "safetensors-0.4.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e1625a8d07d046e968bd5c4961810aba1225984e4fb9243626f9d04a06ed3fee"},
+    {file = "safetensors-0.4.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f74c86b25615cb24ad4cff765a2eefc09d71bf0fed97588cf585aad9c38fbb4"},
+    {file = "safetensors-0.4.2-cp312-none-win32.whl", hash = "sha256:8523b9c5777d771bcde5c2389c03f1cdf7ebe8797432a1bd5e345efe25c55987"},
+    {file = "safetensors-0.4.2-cp312-none-win_amd64.whl", hash = "sha256:dcff0243e1737a21f83d664c63fed89d1f532c23fc6830d0427279fabd789ccb"},
+    {file = "safetensors-0.4.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:96ad3d7d472612e26cbe413922b4fb13933310f0511d346ea5cc9a1e856e52eb"},
+    {file = "safetensors-0.4.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:88250922401b5ae4e37de929178caf46be47ed16c817b2237b81679bec07c120"},
+    {file = "safetensors-0.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d40443554142fc0ab30652d5cc8554c4b7a613513bde00373e18afd5de8cbe4b"},
+    {file = "safetensors-0.4.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:27f53f70106224d32d874aacecbeb4a6e4c5b16a1d2006d0e876d97229086d71"},
+    {file = "safetensors-0.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cc068afe23734dfb26ce19db0a7877499ddf73b1d55ceb762417e8da4a1b05fb"},
+    {file = "safetensors-0.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9be1918eb8d43a11a6f8806759fccfa0eeb0542b12924caba66af8a7800ad01a"},
+    {file = "safetensors-0.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41911087d20a7bbd78cb4ad4f98aab0c431533107584df6635d8b54b99945573"},
+    {file = "safetensors-0.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:50771c662aab909f31e94d048e76861fd027d66076ea773eef2e66c717766e24"},
+    {file = "safetensors-0.4.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:13f2e57be007b7ea9329133d2399e6bdfcf1910f655440a4da17df3a45afcd30"},
+    {file = "safetensors-0.4.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c772147e6395bc829842e0a98e1b30c67fe25d816299c28196488511d5a5e951"},
+    {file = "safetensors-0.4.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:36239a0060b537a3e8c473df78cffee14c3ec4f51d5f1a853af99371a2fb2a35"},
+    {file = "safetensors-0.4.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:d0cbb7664fad2c307f95195f951b7059e95dc23e0e1822e5978c8b500098543c"},
+    {file = "safetensors-0.4.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b3e55adb6bd9dc1c2a341e72f48f075953fa35d173dd8e29a95b3b02d0d1462"},
+    {file = "safetensors-0.4.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42f743b3cca863fba53ca57a193f510e5ec359b97f38c282437716b6768e4a25"},
+    {file = "safetensors-0.4.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e6af4a6dbeb06c4e6e7d46cf9c716cbc4cc5ef62584fd8a7c0fe558562df45"},
+    {file = "safetensors-0.4.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a492ba21b5c8f14ee5ec9b20f42ba969e53ca1f909a4d04aad736b66a341dcc2"},
+    {file = "safetensors-0.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b25b8233a1a85dc67e39838951cfb01595d792f3b7b644add63edb652992e030"},
+    {file = "safetensors-0.4.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fd27e063fbdafe776f7b1714da59110e88f270e86db00788a8fd65f4eacfeba7"},
+    {file = "safetensors-0.4.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1b6fa399f251bbeb52029bf5a0ac2878d7705dd3612a2f8895b48e9c11f0367d"},
+    {file = "safetensors-0.4.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:de642d46b459e4afd5c2020b26c0d6d869a171ea00411897d5776c127cac74f0"},
+    {file = "safetensors-0.4.2-cp37-none-win32.whl", hash = "sha256:77b72d17754c93bb68f3598182f14d78776e0b9b31682ca5bb2c7c5bd9a75267"},
+    {file = "safetensors-0.4.2-cp37-none-win_amd64.whl", hash = "sha256:d36ee3244d461cd655aeef493792c3bccf4875282f8407fd9af99e9a41cf2530"},
+    {file = "safetensors-0.4.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:16b6b3884f7876c6b3b23a742428223a7170a5a9dac819d8c12a1569422c4b5a"},
+    {file = "safetensors-0.4.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ee25d311493fbbe0be9d395faee46e9d79e8948f461e388ff39e59875ed9a350"},
+    {file = "safetensors-0.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eed8097968585cd752a1171f86fce9aa1d89a29033e5cd8bec5a502e29f6b7af"},
+    {file = "safetensors-0.4.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:880e6865cf72cb67f9ab8d04a3c4b49dd95ae92fb1583929ce65aed94e1f685f"},
+    {file = "safetensors-0.4.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91290f83daf80ce6d1a7f629b244443c200060a80f908b29d879021409e5ea94"},
+    {file = "safetensors-0.4.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3517d568486ab3508a7acc360b82d7a4a3e26b86efdf210a9ecd9d233c40708a"},
+    {file = "safetensors-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1f43a77eb38540f782999e5dc5645164fe9027d3f0194f6c9a5126168017efa"},
+    {file = "safetensors-0.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b684d9818aa5d63fddc65f7d0151968037d255d91adf74eba82125b41c680aaa"},
+    {file = "safetensors-0.4.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ab1f5d84185f9fefaf21413efb764e4908057b8a9a0b987ede890c353490fd70"},
+    {file = "safetensors-0.4.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2bd979642e6c3a517ef4b84ff36c2fee4015664fea05a61154fc565978347553"},
+    {file = "safetensors-0.4.2-cp38-none-win32.whl", hash = "sha256:11be6e7afed29e5a5628f0aa6214e34bc194da73f558dc69fc7d56e07037422a"},
+    {file = "safetensors-0.4.2-cp38-none-win_amd64.whl", hash = "sha256:2f7a6e5d29bd2cc340cffaa391fa437b1be9d21a2bd8b8724d2875d13a6ef2a9"},
+    {file = "safetensors-0.4.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a5a921b4fe6925f9942adff3ebae8c16e0487908c54586a5a42f35b59fd69794"},
+    {file = "safetensors-0.4.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b691727228c28f2d82d8a92b2bc26e7a1f129ee40b2f2a3185b5974e038ed47c"},
+    {file = "safetensors-0.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91ca1056decc4e981248786e87b2a202d4841ee5f99d433f1adf3d44d4bcfa0e"},
+    {file = "safetensors-0.4.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:55969fd2e6fdb38dc221b0ab380668c21b0efa12a7562db9924759faa3c51757"},
+    {file = "safetensors-0.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae429bfaecc10ab5fe78c93009b3d1656c1581da560041e700eadb497dbe7a4"},
+    {file = "safetensors-0.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ff88f194fe4ac50b463a4a6f0c03af9ad72eb5d24ec6d6730af59522e37fedb"},
+    {file = "safetensors-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a80cb48d0a447f8dd18e61813efa7d3f8f8d52edf0f05806abc0c59b83431f57"},
+    {file = "safetensors-0.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b286fb7adfee70a4189898ac2342b8a67d5f493e6b21b0af89ca8eac1b967cbf"},
+    {file = "safetensors-0.4.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ceeff9ddbab4f78738489eb6682867ae946178776f33699737b2129b5394dc1"},
+    {file = "safetensors-0.4.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a26fae748a7488cb3aac381eddfa818c42052c87b5e689fb4c6e82ed58cec209"},
+    {file = "safetensors-0.4.2-cp39-none-win32.whl", hash = "sha256:039a42ab33c9d68b39706fd38f1922ace26866eff246bf20271edb619f5f848b"},
+    {file = "safetensors-0.4.2-cp39-none-win_amd64.whl", hash = "sha256:b3a3e1f5b85859e398773f064943b62a4059f225008a2a8ee6add1edcf77cacf"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:4e70d442ad17e8b153ef9095bf48ea64f15a66bf26dc2b6ca94660c154edbc24"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b90f1d9809caf4ff395951b4703295a68d12907f6945bbc3129e934ff8ae46f6"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c7ac9ad3728838006598e296b3ae9f27d80b489effd4685b92d97b3fc4c98f6"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5730d77e6ff7f4c7039e20913661ad0ea2f86c09e71c039e73dfdd1f394f08"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:44feb8cb156d6803dcd19fc6b81b27235f29b877660605a6ac35e1da7d64f0e4"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:523a241c33e7c827ab9a3a23760d75c7d062f43dfe55b6b019409f89b0fb52d1"},
+    {file = "safetensors-0.4.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fb18300e8eb74291225214f26c9a8ae2110fd61a6c9b5a2ff4c4e0eb1bb9a998"},
+    {file = "safetensors-0.4.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fe5437ff9fb116e44f2ab558981249ae63f978392b4576e62fcfe167d353edbc"},
+    {file = "safetensors-0.4.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9304a0934ced5a5d272f39de36291dc141dfc152d277f03fb4d65f2fb2ffa7c"},
+    {file = "safetensors-0.4.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:160ba1b1e11cf874602c233ab80a14f588571d09556cbc3586900121d622b5ed"},
+    {file = "safetensors-0.4.2-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04fcd6fcf7d9c13c7e5dc7e08de5e492ee4daa8f4ad74b4d8299d3eb0224292f"},
+    {file = "safetensors-0.4.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:906d14c4a677d35834fb0f3a5455ef8305e1bba10a5e0f2e0f357b3d1ad989f2"},
+    {file = "safetensors-0.4.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:df3fcdec0cd543084610d1f09c65cdb10fb3079f79bceddc092b0d187c6a265b"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5ca76f13fb1cef242ea3ad2cb37388e7d005994f42af8b44bee56ba48b2d45ce"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:278a1a3414c020785decdcd741c578725721274d2f9f787fcc930882e83b89cc"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b5a461cc68ecd42d9d546e5e1268a39d8ede7934a68d1ce17c3c659cb829d6"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2341411412a41671d25e26bed59ec121e46bf4fadb8132895e610411c4b9681"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3497ac3895acf17c5f98197f1fa4769f09c5e7ede07fcb102f1c201e663e052c"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:01b5e71d3754d2201294f1eb7a6d59cce3a5702ff96d83d226571b2ca2183837"},
+    {file = "safetensors-0.4.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:3627dbd1ea488dd8046a0491de5087f3c0d641e7acc80c0189a33c69398f1cd1"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9d56f0ef53afad26ec54ceede78a43e9a23a076dadbbda7b44d304c591abf4c1"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b259ca73d42daf658a1bda463f1f83885ae4d93a60869be80d7f7dfcc9d8bbb5"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ebc3cd401e4eb54e7c0a70346be565e81942d9a41fafd5f4bf7ab3a55d10378"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bc384a0309b706aa0425c93abb0390508a61bf029ce99c7d9df4220f25871a5"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:af2d8f7235d8a08fbccfb8394387890e7fa38942b349a94e6eff13c52ac98087"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0911315bbcc5289087d063c2c2c7ccd711ea97a7e557a7bce005ac2cf80146aa"},
+    {file = "safetensors-0.4.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1efe31673be91832d73439a2af426743e1395fc9ef7b081914e9e1d567bd7b5f"},
+    {file = "safetensors-0.4.2.tar.gz", hash = "sha256:acc85dcb09ec5e8aa787f588d7ad4d55c103f31e4ff060e17d92cc0e8b8cac73"},
+]
+
+[package.extras]
+all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"]
+dev = ["safetensors[all]"]
+jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[numpy]"]
+mlx = ["mlx (>=0.0.9)"]
+numpy = ["numpy (>=1.21.6)"]
+paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"]
+pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"]
+quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"]
+tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
+testing = ["h5py (>=3.7.0)", "huggingface_hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools_rust (>=1.5.2)"]
+torch = ["safetensors[numpy]", "torch (>=1.10)"]
+
+[[package]]
+name = "sentencepiece"
+version = "0.1.99"
+description = "SentencePiece python wrapper"
+optional = false
+python-versions = "*"
+files = [
+    {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0eb528e70571b7c02723e5804322469b82fe7ea418c96051d0286c0fa028db73"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d7fafb2c4e4659cbdf303929503f37a26eabc4ff31d3a79bf1c5a1b338caa7"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be9cf5b9e404c245aeb3d3723c737ba7a8f5d4ba262ef233a431fa6c45f732a0"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baed1a26464998f9710d20e52607c29ffd4293e7c71c6a1f83f51ad0911ec12c"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9832f08bb372d4c8b567612f8eab9e36e268dff645f1c28f9f8e851be705f6d1"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:019e7535108e309dae2b253a75834fc3128240aa87c00eb80732078cdc182588"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-win32.whl", hash = "sha256:fa16a830416bb823fa2a52cbdd474d1f7f3bba527fd2304fb4b140dad31bb9bc"},
+    {file = "sentencepiece-0.1.99-cp310-cp310-win_amd64.whl", hash = "sha256:14b0eccb7b641d4591c3e12ae44cab537d68352e4d3b6424944f0c447d2348d5"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6d3c56f24183a1e8bd61043ff2c58dfecdc68a5dd8955dc13bab83afd5f76b81"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed6ea1819fd612c989999e44a51bf556d0ef6abfb553080b9be3d347e18bcfb7"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2a0260cd1fb7bd8b4d4f39dc2444a8d5fd4e0a0c4d5c899810ef1abf99b2d45"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a1abff4d1ff81c77cac3cc6fefa34fa4b8b371e5ee51cb7e8d1ebc996d05983"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004e6a621d4bc88978eecb6ea7959264239a17b70f2cbc348033d8195c9808ec"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db361e03342c41680afae5807590bc88aa0e17cfd1a42696a160e4005fcda03b"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-win32.whl", hash = "sha256:2d95e19168875b70df62916eb55428a0cbcb834ac51d5a7e664eda74def9e1e0"},
+    {file = "sentencepiece-0.1.99-cp311-cp311-win_amd64.whl", hash = "sha256:f90d73a6f81248a909f55d8e6ef56fec32d559e1e9af045f0b0322637cb8e5c7"},
+    {file = "sentencepiece-0.1.99-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:62e24c81e74bd87a6e0d63c51beb6527e4c0add67e1a17bac18bcd2076afcfeb"},
+    {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57efcc2d51caff20d9573567d9fd3f854d9efe613ed58a439c78c9f93101384a"},
+    {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a904c46197993bd1e95b93a6e373dca2f170379d64441041e2e628ad4afb16f"},
+    {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d89adf59854741c0d465f0e1525b388c0d174f611cc04af54153c5c4f36088c4"},
+    {file = "sentencepiece-0.1.99-cp36-cp36m-win32.whl", hash = "sha256:47c378146928690d1bc106fdf0da768cebd03b65dd8405aa3dd88f9c81e35dba"},
+    {file = "sentencepiece-0.1.99-cp36-cp36m-win_amd64.whl", hash = "sha256:9ba142e7a90dd6d823c44f9870abdad45e6c63958eb60fe44cca6828d3b69da2"},
+    {file = "sentencepiece-0.1.99-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7b1a9ae4d7c6f1f867e63370cca25cc17b6f4886729595b885ee07a58d3cec3"},
+    {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0f644c9d4d35c096a538507b2163e6191512460035bf51358794a78515b74f7"},
+    {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c8843d23a0f686d85e569bd6dcd0dd0e0cbc03731e63497ca6d5bacd18df8b85"},
+    {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33e6f690a1caebb4867a2e367afa1918ad35be257ecdb3455d2bbd787936f155"},
+    {file = "sentencepiece-0.1.99-cp37-cp37m-win32.whl", hash = "sha256:8a321866c2f85da7beac74a824b4ad6ddc2a4c9bccd9382529506d48f744a12c"},
+    {file = "sentencepiece-0.1.99-cp37-cp37m-win_amd64.whl", hash = "sha256:c42f753bcfb7661c122a15b20be7f684b61fc8592c89c870adf52382ea72262d"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:85b476406da69c70586f0bb682fcca4c9b40e5059814f2db92303ea4585c650c"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cfbcfe13c69d3f87b7fcd5da168df7290a6d006329be71f90ba4f56bc77f8561"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:445b0ec381af1cd4eef95243e7180c63d9c384443c16c4c47a28196bd1cda937"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6890ea0f2b4703f62d0bf27932e35808b1f679bdb05c7eeb3812b935ba02001"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb71af492b0eefbf9f2501bec97bcd043b6812ab000d119eaf4bd33f9e283d03"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b866b5bd3ddd54166bbcbf5c8d7dd2e0b397fac8537991c7f544220b1f67bc"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-win32.whl", hash = "sha256:b133e8a499eac49c581c3c76e9bdd08c338cc1939e441fee6f92c0ccb5f1f8be"},
+    {file = "sentencepiece-0.1.99-cp38-cp38-win_amd64.whl", hash = "sha256:0eaf3591dd0690a87f44f4df129cf8d05d8a4029b5b6709b489b8e27f9a9bcff"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38efeda9bbfb55052d482a009c6a37e52f42ebffcea9d3a98a61de7aee356a28"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c030b081dc1e1bcc9fadc314b19b740715d3d566ad73a482da20d7d46fd444c"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:84dbe53e02e4f8a2e45d2ac3e430d5c83182142658e25edd76539b7648928727"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b0f55d0a0ee1719b4b04221fe0c9f0c3461dc3dabd77a035fa2f4788eb3ef9a"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18e800f206cd235dc27dc749299e05853a4e4332e8d3dfd81bf13d0e5b9007d9"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae1c40cda8f9d5b0423cfa98542735c0235e7597d79caf318855cdf971b2280"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-win32.whl", hash = "sha256:c84ce33af12ca222d14a1cdd37bd76a69401e32bc68fe61c67ef6b59402f4ab8"},
+    {file = "sentencepiece-0.1.99-cp39-cp39-win_amd64.whl", hash = "sha256:350e5c74d739973f1c9643edb80f7cc904dc948578bcb1d43c6f2b173e5d18dd"},
+    {file = "sentencepiece-0.1.99.tar.gz", hash = "sha256:189c48f5cb2949288f97ccdb97f0473098d9c3dcf5a3d99d4eabe719ec27297f"},
+]
+
+[[package]]
+name = "sympy"
+version = "1.12"
+description = "Computer algebra system (CAS) in Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"},
+    {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"},
+]
+
+[package.dependencies]
+mpmath = ">=0.19"
+
+[[package]]
+name = "tokenizers"
+version = "0.15.2"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012"},
+    {file = "tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee"},
+    {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5"},
+    {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1"},
+    {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd"},
+    {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9"},
+    {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605"},
+    {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce"},
+    {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364"},
+    {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024"},
+    {file = "tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2"},
+    {file = "tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843"},
+    {file = "tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7"},
+    {file = "tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa"},
+    {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2"},
+    {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0"},
+    {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c"},
+    {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff"},
+    {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0"},
+    {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7"},
+    {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4"},
+    {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29"},
+    {file = "tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3"},
+    {file = "tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055"},
+    {file = "tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670"},
+    {file = "tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51"},
+    {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98"},
+    {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66"},
+    {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd"},
+    {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38"},
+    {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c"},
+    {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456"},
+    {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834"},
+    {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d"},
+    {file = "tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b"},
+    {file = "tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221"},
+    {file = "tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0"},
+    {file = "tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc"},
+    {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6"},
+    {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89"},
+    {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb"},
+    {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a"},
+    {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728"},
+    {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980"},
+    {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab"},
+    {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:d44ba80988ff9424e33e0a49445072ac7029d8c0e1601ad25a0ca5f41ed0c1d6"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:dce74266919b892f82b1b86025a613956ea0ea62a4843d4c4237be2c5498ed3a"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ef06b9707baeb98b316577acb04f4852239d856b93e9ec3a299622f6084e4be"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73e2e74bbb07910da0d37c326869f34113137b23eadad3fc00856e6b3d9930c"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eeb12daf02a59e29f578a865f55d87cd103ce62bd8a3a5874f8fdeaa82e336b"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba9f6895af58487ca4f54e8a664a322f16c26bbb442effd01087eba391a719e"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccec77aa7150e38eec6878a493bf8c263ff1fa8a62404e16c6203c64c1f16a26"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f40604f5042ff210ba82743dda2b6aa3e55aa12df4e9f2378ee01a17e2855e"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5645938a42d78c4885086767c70923abad047163d809c16da75d6b290cb30bbe"},
+    {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05a77cbfebe28a61ab5c3891f9939cc24798b63fa236d84e5f29f3a85a200c00"},
+    {file = "tokenizers-0.15.2-cp37-none-win32.whl", hash = "sha256:361abdc068e8afe9c5b818769a48624687fb6aaed49636ee39bec4e95e1a215b"},
+    {file = "tokenizers-0.15.2-cp37-none-win_amd64.whl", hash = "sha256:7ef789f83eb0f9baeb4d09a86cd639c0a5518528f9992f38b28e819df397eb06"},
+    {file = "tokenizers-0.15.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4fe1f74a902bee74a3b25aff180fbfbf4f8b444ab37c4d496af7afd13a784ed2"},
+    {file = "tokenizers-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4b89038a684f40a6b15d6b09f49650ac64d951ad0f2a3ea9169687bbf2a8ba"},
+    {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d05a1b06f986d41aed5f2de464c003004b2df8aaf66f2b7628254bcbfb72a438"},
+    {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508711a108684111ec8af89d3a9e9e08755247eda27d0ba5e3c50e9da1600f6d"},
+    {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:daa348f02d15160cb35439098ac96e3a53bacf35885072611cd9e5be7d333daa"},
+    {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:494fdbe5932d3416de2a85fc2470b797e6f3226c12845cadf054dd906afd0442"},
+    {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2d60f5246f4da9373f75ff18d64c69cbf60c3bca597290cea01059c336d2470"},
+    {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93268e788825f52de4c7bdcb6ebc1fcd4a5442c02e730faa9b6b08f23ead0e24"},
+    {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6fc7083ab404019fc9acafe78662c192673c1e696bd598d16dc005bd663a5cf9"},
+    {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e39b41e5531d6b2122a77532dbea60e171ef87a3820b5a3888daa847df4153"},
+    {file = "tokenizers-0.15.2-cp38-none-win32.whl", hash = "sha256:06cd0487b1cbfabefb2cc52fbd6b1f8d4c37799bd6c6e1641281adaa6b2504a7"},
+    {file = "tokenizers-0.15.2-cp38-none-win_amd64.whl", hash = "sha256:5179c271aa5de9c71712e31cb5a79e436ecd0d7532a408fa42a8dbfa4bc23fd9"},
+    {file = "tokenizers-0.15.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82f8652a74cc107052328b87ea8b34291c0f55b96d8fb261b3880216a9f9e48e"},
+    {file = "tokenizers-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:02458bee6f5f3139f1ebbb6d042b283af712c0981f5bc50edf771d6b762d5e4f"},
+    {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c9a09cd26cca2e1c349f91aa665309ddb48d71636370749414fbf67bc83c5343"},
+    {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158be8ea8554e5ed69acc1ce3fbb23a06060bd4bbb09029431ad6b9a466a7121"},
+    {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ddba9a2b0c8c81633eca0bb2e1aa5b3a15362b1277f1ae64176d0f6eba78ab1"},
+    {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ef5dd1d39797044642dbe53eb2bc56435308432e9c7907728da74c69ee2adca"},
+    {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:454c203164e07a860dbeb3b1f4a733be52b0edbb4dd2e5bd75023ffa8b49403a"},
+    {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cf6b7f1d4dc59af960e6ffdc4faffe6460bbfa8dce27a58bf75755ffdb2526d"},
+    {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2ef09bbc16519f6c25d0c7fc0c6a33a6f62923e263c9d7cca4e58b8c61572afb"},
+    {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c9a2ebdd2ad4ec7a68e7615086e633857c85e2f18025bd05d2a4399e6c5f7169"},
+    {file = "tokenizers-0.15.2-cp39-none-win32.whl", hash = "sha256:918fbb0eab96fe08e72a8c2b5461e9cce95585d82a58688e7f01c2bd546c79d0"},
+    {file = "tokenizers-0.15.2-cp39-none-win_amd64.whl", hash = "sha256:524e60da0135e106b254bd71f0659be9f89d83f006ea9093ce4d1fab498c6d0d"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d"},
+    {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693"},
+    {file = "tokenizers-0.15.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:708bb3e4283177236309e698da5fcd0879ce8fd37457d7c266d16b550bcbbd18"},
+    {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c35e09e9899b72a76e762f9854e8750213f67567787d45f37ce06daf57ca78"},
+    {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1257f4394be0d3b00de8c9e840ca5601d0a4a8438361ce9c2b05c7d25f6057b"},
+    {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02272fe48280e0293a04245ca5d919b2c94a48b408b55e858feae9618138aeda"},
+    {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dc3ad9ebc76eabe8b1d7c04d38be884b8f9d60c0cdc09b0aa4e3bcf746de0388"},
+    {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:32e16bdeffa7c4f46bf2152172ca511808b952701d13e7c18833c0b73cb5c23f"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fb16ba563d59003028b678d2361a27f7e4ae0ab29c7a80690efa20d829c81fdb"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:2277c36d2d6cdb7876c274547921a42425b6810d38354327dd65a8009acf870c"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1cf75d32e8d250781940d07f7eece253f2fe9ecdb1dc7ba6e3833fa17b82fcbc"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b3b31884dc8e9b21508bb76da80ebf7308fdb947a17affce815665d5c4d028"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10122d8d8e30afb43bb1fe21a3619f62c3e2574bff2699cf8af8b0b6c5dc4a3"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d88b96ff0fe8e91f6ef01ba50b0d71db5017fa4e3b1d99681cec89a85faf7bf7"},
+    {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:37aaec5a52e959892870a7c47cef80c53797c0db9149d458460f4f31e2fb250e"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e2ea752f2b0fe96eb6e2f3adbbf4d72aaa1272079b0dfa1145507bd6a5d537e6"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b19a808d8799fda23504a5cd31d2f58e6f52f140380082b352f877017d6342b"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c86e5e068ac8b19204419ed8ca90f9d25db20578f5881e337d203b314f4104"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de19c4dc503c612847edf833c82e9f73cd79926a384af9d801dcf93f110cea4e"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea09acd2fe3324174063d61ad620dec3bcf042b495515f27f638270a7d466e8b"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cf27fd43472e07b57cf420eee1e814549203d56de00b5af8659cb99885472f1f"},
+    {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7ca22bd897537a0080521445d91a58886c8c04084a6a19e6c78c586e0cfa92a5"},
+    {file = "tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91"},
+]
+
+[package.dependencies]
+huggingface_hub = ">=0.16.4,<1.0"
+
+[package.extras]
+dev = ["tokenizers[testing]"]
+docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"]
+testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"]
+
+[[package]]
+name = "torch"
+version = "2.2.1+cpu"
+description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
+optional = false
+python-versions = ">=3.8.0"
+files = [
+    {file = "torch-2.2.1+cpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:5d82422cf04797f1b2a8574b64a916070ec83eef58ad4900615ee0218d7b8b8e"},
+    {file = "torch-2.2.1+cpu-cp310-cp310-win_amd64.whl", hash = "sha256:f8914dd0f5f0e5c66fdecd9559403eea9feac82d1ea639b672fde0073c6addbd"},
+    {file = "torch-2.2.1+cpu-cp311-cp311-linux_x86_64.whl", hash = "sha256:6bc973d5632374b92b4b293817b4d2ff8c8ce1c784c748b471dba1fffcd9c333"},
+    {file = "torch-2.2.1+cpu-cp311-cp311-win_amd64.whl", hash = "sha256:abdec34b0ade8fca0520055e72c3094425ae0ef210718e9c0278121cd3608c32"},
+    {file = "torch-2.2.1+cpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:d7339580135da4105c1244a8621faa076990409afeab5a7b642c3c1ee70a5622"},
+    {file = "torch-2.2.1+cpu-cp312-cp312-win_amd64.whl", hash = "sha256:039128fcb5548122465b15f679b8831c47d14f0d6c28c1f1b631f8019c104720"},
+    {file = "torch-2.2.1+cpu-cp38-cp38-linux_x86_64.whl", hash = "sha256:2b447f7bb50b393b4544b4036d587e39ab524d4353e77c197f6a2727f22b0d47"},
+    {file = "torch-2.2.1+cpu-cp38-cp38-win_amd64.whl", hash = "sha256:2ccdf3e5f71e6426ea9e34d21c3cc333b29d4f48299b981d28aeb5112b5495e1"},
+    {file = "torch-2.2.1+cpu-cp39-cp39-linux_x86_64.whl", hash = "sha256:2fb340b289760040a16a77a6d70b8a48961abba1822e6f58705c97c80befa03e"},
+    {file = "torch-2.2.1+cpu-cp39-cp39-win_amd64.whl", hash = "sha256:e03dc4654ecceeb5b03f0a6f60b342c0e0d267b3ebc61e4f672cace1df8cd930"},
+]
+
+[package.dependencies]
+filelock = "*"
+fsspec = "*"
+jinja2 = "*"
+networkx = "*"
+sympy = "*"
+typing-extensions = ">=4.8.0"
+
+[package.extras]
+opt-einsum = ["opt-einsum (>=3.3)"]
+optree = ["optree (>=0.9.1)"]
+
+[package.source]
+type = "legacy"
+url = "https://download.pytorch.org/whl/cpu"
+reference = "pytorch"
+
+[[package]]
+name = "tqdm"
+version = "4.66.2"
+description = "Fast, Extensible Progress Meter"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"},
+    {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+
+[[package]]
+name = "transformers"
+version = "4.38.1"
+description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
+optional = false
+python-versions = ">=3.8.0"
+files = [
+    {file = "transformers-4.38.1-py3-none-any.whl", hash = "sha256:a7a9265fb060183e9d975cbbadc4d531b10281589c43f6d07563f86322728973"},
+    {file = "transformers-4.38.1.tar.gz", hash = "sha256:86dc84ccbe36123647e84cbd50fc31618c109a41e6be92514b064ab55bf1304c"},
+]
+
+[package.dependencies]
+filelock = "*"
+huggingface-hub = ">=0.19.3,<1.0"
+numpy = ">=1.17"
+packaging = ">=20.0"
+pyyaml = ">=5.1"
+regex = "!=2019.12.17"
+requests = "*"
+safetensors = ">=0.4.1"
+tokenizers = ">=0.14,<0.19"
+tqdm = ">=4.27"
+
+[package.extras]
+accelerate = ["accelerate (>=0.21.0)"]
+agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision"]
+audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
+codecarbon = ["codecarbon (==1.2.0)"]
+deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"]
+deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.19)", "urllib3 (<2.0.0)"]
+dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+docs = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision"]
+docs-specific = ["hf-doc-builder"]
+flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"]
+flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
+ftfy = ["ftfy"]
+integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"]
+ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"]
+modelcreation = ["cookiecutter (==1.7.3)"]
+natten = ["natten (>=0.14.6,<0.15.0)"]
+onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"]
+onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
+optuna = ["optuna"]
+quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"]
+ray = ["ray[tune] (>=2.7.0)"]
+retrieval = ["datasets (!=2.5.0)", "faiss-cpu"]
+sagemaker = ["sagemaker (>=2.31.0)"]
+sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"]
+serving = ["fastapi", "pydantic", "starlette", "uvicorn"]
+sigopt = ["sigopt"]
+sklearn = ["scikit-learn"]
+speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
+testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "tensorboard", "timeout-decorator"]
+tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
+tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
+tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
+timm = ["timm"]
+tokenizers = ["tokenizers (>=0.14,<0.19)"]
+torch = ["accelerate (>=0.21.0)", "torch"]
+torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
+torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
+torchhub = ["filelock", "huggingface-hub (>=0.19.3,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.19)", "torch", "tqdm (>=4.27)"]
+video = ["av (==9.2.0)", "decord (==0.6.0)"]
+vision = ["Pillow (>=10.0.1,<=15.0)"]
+
+[[package]]
+name = "typing-extensions"
+version = "4.9.0"
+description = "Backported and Experimental Type Hints for Python 3.8+"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"},
+    {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"},
+]
+
+[[package]]
+name = "urllib3"
+version = "2.2.1"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
+    {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
+]
+
+[package.extras]
+brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
+h2 = ["h2 (>=4,<5)"]
+socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
+zstd = ["zstandard (>=0.18.0)"]
+
+[[package]]
+name = "wcwidth"
+version = "0.2.13"
+description = "Measures the displayed width of unicode strings in a terminal"
+optional = false
+python-versions = "*"
+files = [
+    {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"},
+    {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
+]
+
+[metadata]
+lock-version = "2.0"
+python-versions = ">=3.9"
+content-hash = "c8c4cc87637266a7b85debcbafa8887c5ad81cc8ef40e98a3f52c7c50af05c03"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..84e71de6d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,44 @@
+[tool.poetry]
+name = "llama-cpp-scripts"
+version = "0.0.0"
+description = "Scripts that ship with llama.cpp"
+authors = ["GGML <ggml@ggml.ai>"]
+readme = "README.md"
+homepage = "https://ggml.ai"
+repository = "https://github.com/ggerganov/llama.cpp"
+keywords = ["ggml", "gguf", "llama.cpp"]
+packages = [{ include = "*.py", from = "." }]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+[tool.poetry.dependencies]
+python = ">=3.9"
+numpy = "^1.25.0"
+sentencepiece = ">=0.1.98,<=0.2.0"
+transformers = ">=4.35.2,<5.0.0"
+protobuf = ">=4.21.0,<5.0.0"
+gguf = { path = "./gguf-py" }
+torch = { version = "^2.2.0", source = "pytorch" }
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+
+# Force wheel + cpu
+# For discussion and context see https://github.com/python-poetry/poetry#6409
+[[tool.poetry.source]]
+name = "pytorch"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "explicit"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+llama-convert-hf-to-gguf = "convert_hf_to_gguf:main"
+llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main"
+llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main"
diff --git a/pyrightconfig.json b/pyrightconfig.json
new file mode 100644
index 000000000..9acbbeb78
--- /dev/null
+++ b/pyrightconfig.json
@@ -0,0 +1,22 @@
+{
+  "extraPaths": ["gguf-py"],
+  "pythonVersion": "3.9",
+  "pythonPlatform": "All",
+  "reportUnusedImport": "warning",
+  "reportDuplicateImport": "error",
+  "reportDeprecated": "warning",
+  "reportUnnecessaryTypeIgnoreComment": "information",
+  "disableBytesTypePromotions": false, // TODO: change once Python 3.12 is the minimum
+  "executionEnvironments": [
+    {
+      // TODO: make this version override work correctly
+      "root": "gguf-py",
+      "pythonVersion": "3.8",
+    },
+    {
+      // uses match expressions in steps.py
+      "root": "examples/server/tests",
+      "pythonVersion": "3.10",
+    },
+  ],
+ }
diff --git a/requirements.txt b/requirements.txt
index d36f74520..9e190ae27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,9 +4,9 @@
 # Package versions must stay compatible across all top-level python scripts.
 #
 
--r ./requirements/requirements-convert.txt
+-r ./requirements/requirements-convert_legacy_llama.txt
 
--r ./requirements/requirements-convert-hf-to-gguf.txt
--r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
--r ./requirements/requirements-convert-lora-to-ggml.txt
--r ./requirements/requirements-convert-persimmon-to-gguf.txt
+-r ./requirements/requirements-convert_hf_to_gguf.txt
+-r ./requirements/requirements-convert_hf_to_gguf_update.txt
+-r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
+-r ./requirements/requirements-convert_lora_to_gguf.txt
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
new file mode 100644
index 000000000..94de59d7e
--- /dev/null
+++ b/requirements/requirements-all.txt
@@ -0,0 +1,12 @@
+-r ../examples/llava/requirements.txt
+-r ../examples/server/bench/requirements.txt
+-r ../examples/server/tests/requirements.txt
+
+-r ./requirements-compare-llama-bench.txt
+-r ./requirements-pydantic.txt
+-r ./requirements-test-tokenizer-random.txt
+
+-r ./requirements-convert_hf_to_gguf.txt
+-r ./requirements-convert_hf_to_gguf_update.txt
+-r ./requirements-convert_legacy_llama.txt
+-r ./requirements-convert_llama_ggml_to_gguf.txt
diff --git a/requirements/requirements-compare-llama-bench.txt b/requirements/requirements-compare-llama-bench.txt
new file mode 100644
index 000000000..e0aaa3204
--- /dev/null
+++ b/requirements/requirements-compare-llama-bench.txt
@@ -0,0 +1,2 @@
+tabulate~=0.9.0
+GitPython~=3.1.43
diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt
deleted file mode 100644
index 6ac402610..000000000
--- a/requirements/requirements-convert-hf-to-gguf.txt
+++ /dev/null
@@ -1,2 +0,0 @@
--r ./requirements-convert.txt
-torch~=2.1.1
diff --git a/requirements/requirements-convert-llama-ggml-to-gguf.txt b/requirements/requirements-convert-llama-ggml-to-gguf.txt
deleted file mode 100644
index a0f37cd1c..000000000
--- a/requirements/requirements-convert-llama-ggml-to-gguf.txt
+++ /dev/null
@@ -1 +0,0 @@
--r ./requirements-convert.txt
diff --git a/requirements/requirements-convert-lora-to-ggml.txt b/requirements/requirements-convert-lora-to-ggml.txt
deleted file mode 100644
index 6ac402610..000000000
--- a/requirements/requirements-convert-lora-to-ggml.txt
+++ /dev/null
@@ -1,2 +0,0 @@
--r ./requirements-convert.txt
-torch~=2.1.1
diff --git a/requirements/requirements-convert-persimmon-to-gguf.txt b/requirements/requirements-convert-persimmon-to-gguf.txt
deleted file mode 100644
index 6ac402610..000000000
--- a/requirements/requirements-convert-persimmon-to-gguf.txt
+++ /dev/null
@@ -1,2 +0,0 @@
--r ./requirements-convert.txt
-torch~=2.1.1
diff --git a/requirements/requirements-convert.txt b/requirements/requirements-convert.txt
deleted file mode 100644
index a3d6ecec0..000000000
--- a/requirements/requirements-convert.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-numpy~=1.24.4
-sentencepiece~=0.1.98
-transformers>=4.35.2,<5.0.0
-gguf>=0.1.0
-protobuf>=4.21.0,<5.0.0
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
new file mode 100644
index 000000000..8cb9c354f
--- /dev/null
+++ b/requirements/requirements-convert_hf_to_gguf.txt
@@ -0,0 +1,3 @@
+-r ./requirements-convert_legacy_llama.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch~=2.2.1
diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt
new file mode 100644
index 000000000..8cb9c354f
--- /dev/null
+++ b/requirements/requirements-convert_hf_to_gguf_update.txt
@@ -0,0 +1,3 @@
+-r ./requirements-convert_legacy_llama.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch~=2.2.1
diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt
new file mode 100644
index 000000000..859204b27
--- /dev/null
+++ b/requirements/requirements-convert_legacy_llama.txt
@@ -0,0 +1,5 @@
+numpy~=1.26.4
+sentencepiece~=0.2.0
+transformers>=4.45.1,<5.0.0
+gguf>=0.1.0
+protobuf>=4.21.0,<5.0.0
diff --git a/requirements/requirements-convert_llama_ggml_to_gguf.txt b/requirements/requirements-convert_llama_ggml_to_gguf.txt
new file mode 100644
index 000000000..afe2747d4
--- /dev/null
+++ b/requirements/requirements-convert_llama_ggml_to_gguf.txt
@@ -0,0 +1 @@
+-r ./requirements-convert_legacy_llama.txt
diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt
new file mode 100644
index 000000000..5758076c4
--- /dev/null
+++ b/requirements/requirements-convert_lora_to_gguf.txt
@@ -0,0 +1,2 @@
+-r ./requirements-convert_hf_to_gguf.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
diff --git a/requirements/requirements-pydantic.txt b/requirements/requirements-pydantic.txt
new file mode 100644
index 000000000..bdd423e07
--- /dev/null
+++ b/requirements/requirements-pydantic.txt
@@ -0,0 +1,3 @@
+docstring_parser~=0.15
+pydantic~=2.6.3
+requests
diff --git a/requirements/requirements-test-tokenizer-random.txt b/requirements/requirements-test-tokenizer-random.txt
new file mode 100644
index 000000000..2785e71a2
--- /dev/null
+++ b/requirements/requirements-test-tokenizer-random.txt
@@ -0,0 +1 @@
+cffi~=1.16.0
diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in
deleted file mode 100644
index 6a6d8e39e..000000000
--- a/scripts/LlamaConfig.cmake.in
+++ /dev/null
@@ -1,71 +0,0 @@
-set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
-set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
-set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
-set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
-set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
-set(LLAMA_METAL @LLAMA_METAL@)
-set(LLAMA_MPI @LLAMA_MPI@)
-set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
-set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
-set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
-
-@PACKAGE_INIT@
-
-set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
-set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
-
-# Ensure transient dependencies satisfied
-
-find_package(Threads REQUIRED)
-if (APPLE AND LLAMA_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
-endif()
-
-if (LLAMA_BLAS)
-    find_package(BLAS REQUIRED)
-endif()
-
-if (LLAMA_CUBLAS)
-    find_package(CUDAToolkit REQUIRED)
-endif()
-
-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-endif()
-
-if (LLAMA_MPI)
-    find_package(MPI REQUIRED)
-endif()
-
-if (LLAMA_CLBLAST)
-    find_package(CLBlast REQUIRED)
-endif()
-
-if (LLAMA_HIPBLAS)
-    find_package(hip REQUIRED)
-    find_package(hipblas REQUIRED)
-    find_package(rocblas REQUIRED)
-endif()
-
-find_library(llama_LIBRARY llama
-    REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
-
-set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
-set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
-add_library(llama UNKNOWN IMPORTED)
-set_target_properties(llama
-    PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
-        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
-        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-        IMPORTED_LOCATION "${llama_LIBRARY}"
-        INTERFACE_COMPILE_FEATURES cxx_std_11
-        POSITION_INDEPENDENT_CODE ON )
-
-check_required_components(Llama)
diff --git a/scripts/build-info.sh b/scripts/build-info.sh
index 32682afbd..fa9e7bacd 100755
--- a/scripts/build-info.sh
+++ b/scripts/build-info.sh
@@ -8,20 +8,20 @@ build_compiler="unknown"
 build_target="unknown"
 
 if out=$(git rev-list --count HEAD); then
-  # git is broken on WSL so we need to strip extra newlines
-  build_number=$(printf '%s' "$out" | tr -d '\n')
+    # git is broken on WSL so we need to strip extra newlines
+    build_number=$(printf '%s' "$out" | tr -d '\n')
 fi
 
 if out=$(git rev-parse --short HEAD); then
-  build_commit=$(printf '%s' "$out" | tr -d '\n')
+    build_commit=$(printf '%s' "$out" | tr -d '\n')
 fi
 
 if out=$($CC --version | head -1); then
-  build_compiler=$out
+    build_compiler=$out
 fi
 
 if out=$($CC -dumpmachine); then
-  build_target=$out
+    build_target=$out
 fi
 
 echo "int LLAMA_BUILD_NUMBER = ${build_number};"
diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh
index af7bab753..d3bbded13 100755
--- a/scripts/check-requirements.sh
+++ b/scripts/check-requirements.sh
@@ -97,9 +97,9 @@ check_requirements() {
 }
 
 check_convert_script() {
-    local py=$1             # e.g. ./convert-hf-to-gguf.py
-    local pyname=${py##*/}  # e.g. convert-hf-to-gguf.py
-    pyname=${pyname%.py}    # e.g. convert-hf-to-gguf
+    local py=$1             # e.g. ./convert_hf_to_gguf.py
+    local pyname=${py##*/}  # e.g. convert_hf_to_gguf.py
+    pyname=${pyname%.py}    # e.g. convert_hf_to_gguf
 
     info "$py: beginning check"
 
@@ -108,6 +108,11 @@ check_convert_script() {
         fatal "$py missing requirements. Expected: $reqs"
     fi
 
+    # Check that all sub-requirements are added to top-level requirements.txt
+    if ! grep -qF "$reqs" requirements.txt; then
+        fatal "$reqs needs to be added to requirements.txt"
+    fi
+
     local venv="$workdir/$pyname-venv"
     python3 -m venv "$venv"
 
@@ -134,12 +139,7 @@ EOF
 
 readonly ignore_eq_eq='check_requirements: ignore "=="'
 
-for req in "$reqs_dir"/*; do
-    # Check that all sub-requirements are added to top-level requirements.txt
-    if ! grep -qF "$req" requirements.txt; then
-        fatal "$req needs to be added to requirements.txt"
-    fi
-
+for req in */**/requirements*.txt; do
     # Make sure exact release versions aren't being pinned in the requirements
     # Filters out the ignore string
     if grep -vF "$ignore_eq_eq" "$req" | grep -q '=='; then
@@ -166,8 +166,13 @@ if (( do_cleanup )); then
     rm -rf -- "$all_venv"
 fi
 
-check_convert_script convert.py
-for py in convert-*.py; do
+check_convert_script examples/convert_legacy_llama.py
+for py in convert_*.py; do
+    # skip convert_hf_to_gguf_update.py
+    # TODO: the check is failing for some reason:
+    #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
+    [[ $py == convert_hf_to_gguf_update.py ]] && continue
+
     check_convert_script "$py"
 done
 
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index 331c4b9ce..e40d1cc6d 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -8,30 +8,31 @@ fi
 set -e
 set -x
 
+# verify at the start that the compare script has all the necessary dependencies installed
+./scripts/compare-llama-bench.py --check
+
 bench_args="${@:3}"
 
-rm -f llama-bench.sqlite
+rm -f llama-bench.sqlite > /dev/null
 
-backend="cpu"
-
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
+# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
+if [ -n "$GGML_CUDA" ]; then
+    cmake_opts="-DGGML_CUDA=ON"
 fi
 
-make_opts=""
+dir="build-bench"
 
-if [[ "$backend" == "cuda" ]]; then
-    make_opts="LLAMA_CUBLAS=1"
-fi
+function run {
+    rm -fr ${dir} > /dev/null
+    cmake -B ${dir} -S . $cmake_opts > /dev/null
+    cmake --build ${dir} -t llama-bench > /dev/null
+    ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
+}
 
-git checkout $1
-make clean && make -j32 $make_opts llama-bench
-./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
+git checkout $1 > /dev/null
+run
 
-git checkout $2
-make clean && make -j32 $make_opts llama-bench
-./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
+git checkout $2 > /dev/null
+run
 
 ./scripts/compare-llama-bench.py -b $1 -c $2
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 39c3e52e5..239c458d8 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import logging
 import argparse
 import heapq
 import sys
@@ -11,27 +12,29 @@ try:
     import git
     from tabulate import tabulate
 except ImportError as e:
-    print("ERROR: the following Python libraries are required: GitPython, tabulate.")
+    print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100
     raise e
 
+logger = logging.getLogger("compare-llama-bench")
+
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
-    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
-    "type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
+    "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
+    "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "opencl", "metal", "gpu_blas", "blas"]
+BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
 
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "opencl": "OpenCL", "metal": "Metal", "gpu_blas": "GPU BLAS", "blas": "BLAS",
-    "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
-    "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
-    "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
-    "tensor_split": "Tensor split"
+    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
+    "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
+    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
+    "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
 }
 
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
@@ -89,13 +92,20 @@ help_s = (
     "If the columns are manually specified, then the results for each unique combination of the "
     "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
 )
+parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed")
 parser.add_argument("-s", "--show", help=help_s)
+parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
 
 known_args, unknown_args = parser.parse_known_args()
 
+logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
+
+if known_args.check:
+    # Check if all required Python libraries are installed. Would have failed earlier if not.
+    sys.exit(0)
+
 if unknown_args:
-    print(f"ERROR: Received unknown args: {unknown_args}.")
-    print()
+    logger.error(f"Received unknown args: {unknown_args}.\n")
     parser.print_help()
     sys.exit(1)
 
@@ -108,8 +118,7 @@ if input_file is None:
         input_file = sqlite_files[0]
 
 if input_file is None:
-    print("ERROR: Cannot find a suitable input file, please provide one.")
-    print()
+    logger.error("Cannot find a suitable input file, please provide one.\n")
     parser.print_help()
     sys.exit(1)
 
@@ -117,39 +126,41 @@ connection = sqlite3.connect(input_file)
 cursor = connection.cursor()
 builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
 
+commit_short_len = len(builds[0][0])
+
 try:
     repo = git.Repo(".", search_parent_directories=True)
-except git.exc.InvalidGitRepositoryError:
+except git.InvalidGitRepositoryError:
     repo = None
 
 
-def find_parent_in_data(commit):
+def find_parent_in_data(commit: git.Commit):
     """Helper function to find the most recent parent measured in number of commits for which there is data."""
-    heap = [(0, commit)]
+    heap: list[tuple[int, git.Commit]] = [(0, commit)]
     seen_hexsha8 = set()
     while heap:
         depth, current_commit = heapq.heappop(heap)
-        current_hexsha8 = commit.hexsha[:8]
+        current_hexsha8 = commit.hexsha[:commit_short_len]
         if (current_hexsha8,) in builds:
             return current_hexsha8
         for parent in commit.parents:
-            parent_hexsha8 = parent.hexsha[:8]
+            parent_hexsha8 = parent.hexsha[:commit_short_len]
             if parent_hexsha8 not in seen_hexsha8:
                 seen_hexsha8.add(parent_hexsha8)
                 heapq.heappush(heap, (depth + 1, parent))
     return None
 
 
-def get_all_parent_hexsha8s(commit):
+def get_all_parent_hexsha8s(commit: git.Commit):
     """Helper function to recursively get hexsha8 values for all parents of a commit."""
     unvisited = [commit]
     visited   = []
 
     while unvisited:
         current_commit = unvisited.pop(0)
-        visited.append(current_commit.hexsha[:8])
+        visited.append(current_commit.hexsha[:commit_short_len])
         for parent in current_commit.parents:
-            if parent.hexsha[:8] not in visited:
+            if parent.hexsha[:commit_short_len] not in visited:
                 unvisited.append(parent)
 
     return visited
@@ -160,10 +171,10 @@ def get_commit_name(hexsha8):
     if repo is None:
         return hexsha8
     for h in repo.heads:
-        if h.commit.hexsha[:8] == hexsha8:
+        if h.commit.hexsha[:commit_short_len] == hexsha8:
             return h.name
     for t in repo.tags:
-        if t.commit.hexsha[:8] == hexsha8:
+        if t.commit.hexsha[:commit_short_len] == hexsha8:
             return t.name
     return hexsha8
 
@@ -174,10 +185,13 @@ def get_commit_hexsha8(name):
         return None
     for h in repo.heads:
         if h.name == name:
-            return h.commit.hexsha[:8]
+            return h.commit.hexsha[:commit_short_len]
     for t in repo.tags:
         if t.name == name:
-            return t.commit.hexsha[:8]
+            return t.commit.hexsha[:commit_short_len]
+    for c in repo.iter_commits("--all"):
+        if c.hexsha[:commit_short_len] == name[:commit_short_len]:
+            return c.hexsha[:commit_short_len]
     return None
 
 
@@ -191,23 +205,19 @@ if known_args.baseline is not None:
         hexsha8_baseline = get_commit_hexsha8(known_args.baseline)
         name_baseline = known_args.baseline
     if hexsha8_baseline is None:
-        print(f"ERROR: cannot find data for baseline={known_args.baseline}.")
+        logger.error(f"cannot find data for baseline={known_args.baseline}.")
         sys.exit(1)
 # Otherwise, search for the most recent parent of master for which there is data:
 elif repo is not None:
     hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
 
     if hexsha8_baseline is None:
-        print("ERROR: No baseline was provided and did not find data for any master branch commits.")
-        print()
+        logger.error("No baseline was provided and did not find data for any master branch commits.\n")
         parser.print_help()
         sys.exit(1)
 else:
-    print(
-        "ERROR: No baseline was provided and the current working directory "
-        "is not part of a git repository from which a baseline could be inferred."
-    )
-    print()
+    logger.error("No baseline was provided and the current working directory "
+                 "is not part of a git repository from which a baseline could be inferred.\n")
     parser.print_help()
     sys.exit(1)
 
@@ -224,7 +234,7 @@ if known_args.compare is not None:
         hexsha8_compare = get_commit_hexsha8(known_args.compare)
         name_compare = known_args.compare
     if hexsha8_compare is None:
-        print(f"ERROR: cannot find data for baseline={known_args.compare}.")
+        logger.error(f"cannot find data for compare={known_args.compare}.")
         sys.exit(1)
 # Otherwise, search for the commit for llama-bench was most recently run
 # and that is not a parent of master:
@@ -238,16 +248,12 @@ elif repo is not None:
             break
 
     if hexsha8_compare is None:
-        print("ERROR: No compare target was provided and did not find data for any non-master commits.")
-        print()
+        logger.error("No compare target was provided and did not find data for any non-master commits.\n")
         parser.print_help()
         sys.exit(1)
 else:
-    print(
-        "ERROR: No compare target was provided and the current working directory "
-        "is not part of a git repository from which a compare target could be inferred."
-    )
-    print()
+    logger.error("No compare target was provided and the current working directory "
+                 "is not part of a git repository from which a compare target could be inferred.\n")
     parser.print_help()
     sys.exit(1)
 
@@ -281,8 +287,7 @@ if known_args.show is not None:
         if prop not in KEY_PROPERTIES[:-2]:  # Last two values are n_prompt, n_gen.
             unknown_cols.append(prop)
     if unknown_cols:
-        print(f"ERROR: Unknown values for --show: {', '.join(unknown_cols)}")
-        print()
+        logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
         parser.print_usage()
         sys.exit(1)
     rows_show = get_rows(show)
@@ -300,14 +305,11 @@ else:
 
     show = []
     # Show CPU and/or GPU by default even if the hardware for all results is the same:
-    if "gpu_blas" not in properties_different and "n_gpu_layers" not in properties_different:
-        gpu_blas = bool(rows_full[0][KEY_PROPERTIES.index("gpu_blas")])
+    if "n_gpu_layers" not in properties_different:
         ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
 
-        if not gpu_blas or ngl != 99 and "cpu_info" not in properties_different:
+        if ngl != 99 and "cpu_info" not in properties_different:
             show.append("cpu_info")
-        if gpu_blas and "gpu_info" not in properties_different:
-            show.append("gpu_info")
 
     show += properties_different
 
@@ -327,8 +329,12 @@ table = []
 for row in rows_show:
     n_prompt = int(row[-4])
     n_gen    = int(row[-3])
-    assert n_prompt == 0 or n_gen == 0
-    test_name = f"tg{n_gen}" if n_prompt == 0 else f"pp{n_prompt}"
+    if n_prompt != 0 and n_gen == 0:
+        test_name = f"pp{n_prompt}"
+    elif n_prompt == 0 and n_gen != 0:
+        test_name = f"tg{n_gen}"
+    else:
+        test_name = f"pp{n_prompt}+tg{n_gen}"
     #           Regular columns    test name    avg t/s values              Speedup
     #            VVVVVVVVVVVVV     VVVVVVVVV    VVVVVVVVVVVVVV              VVVVVVV
     table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
@@ -366,7 +372,7 @@ if "gpu_info" in show:
 headers  = [PRETTY_NAMES[p] for p in show]
 headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
 
-print(tabulate(
+print(tabulate( # noqa: NP100
     table,
     headers=headers,
     floatfmt=".2f",
diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh
deleted file mode 100755
index 01fda16fd..000000000
--- a/scripts/convert-gg.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# LLaMA v1
-python3 convert.py ../llama1/7B  --outfile models/llama-7b/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
-
-# LLaMA v2
-python3 convert.py ../llama2/llama-2-7b  --outfile models/llama-7b-v2/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
-
-# Code Llama
-python3 convert.py ../codellama/CodeLlama-7b/  --outfile models/codellama-7b/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
-
-# Falcon
-python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b  1
-mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf
-
-python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1
-mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf
diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh
new file mode 100755
index 000000000..c6c1e988a
--- /dev/null
+++ b/scripts/debug-test.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+
+PROG=${0##*/}
+build_dir="build-ci-debug"
+
+# Print Color Commands
+red=$(tput setaf 1)
+green=$(tput setaf 2)
+yellow=$(tput setaf 3)
+blue=$(tput setaf 4)
+magenta=$(tput setaf 5)
+cyan=$(tput setaf 6)
+normal=$(tput sgr0)
+
+
+# Print Help Message
+####################
+
+print_full_help() {
+  cat << EOF
+Usage: $PROG [OPTION]... <test_regex> (test_number)
+Debug specific ctest program.
+
+Options:
+  -h, --help            display this help and exit
+  -g                    run in gdb mode
+
+Arguments:
+  <test_regex>     (Mandatory) Supply one regex to the script to filter tests
+  (test_number)    (Optional) Test number to run a specific test
+
+Example:
+  $PROG test-tokenizer
+  $PROG test-tokenizer 3
+EOF
+}
+
+abort() {
+  echo "Error: $1" >&2
+  cat << EOF >&2
+Usage: $PROG [OPTION]... <test_regex> (test_number)
+Debug specific ctest program.
+Refer to --help for full instructions.
+EOF
+  exit 1
+}
+
+
+# Dependency Sanity Check
+#########################
+
+check_dependency() {
+  command -v "$1" >/dev/null 2>&1 || {
+    abort "$1 is required but not found. Please install it and try again."
+  }
+}
+
+check_dependency ctest
+check_dependency cmake
+
+
+# Step 0: Check the args
+########################
+
+if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
+  print_full_help >&2
+  exit 0
+fi
+
+# Parse command-line options
+gdb_mode=false
+while getopts "g" opt; do
+    case $opt in
+        g)
+            gdb_mode=true
+            echo "gdb_mode Mode Enabled"
+            ;;
+    esac
+done
+
+# Shift the option parameters
+shift $((OPTIND - 1))
+
+# Positionial Argument Processing : <test_regex>
+if [ -z "${1}" ]; then
+    abort "Test regex is required"
+else
+    test_suite=${1:-}
+fi
+
+# Positionial Argument Processing : (test_number)
+test_number=${2:-}
+
+
+# Step 1: Reset and Setup folder context
+########################################
+
+## Sanity check that we are actually in a git repo
+repo_root=$(git rev-parse --show-toplevel)
+if [ ! -d "$repo_root" ]; then
+    abort "Not in a Git repository."
+fi
+
+## Reset folder to root context of git repo and Create and enter build directory
+pushd "$repo_root"
+rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
+
+
+# Step 2: Setup Build Environment and Compile Test Binaries
+###########################################################
+
+# Note: test-eval-callback requires -DLLAMA_CURL
+cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment"
+pushd "$build_dir"
+make -j || abort "Failed to compile"
+popd > /dev/null || exit 1
+
+
+# Step 3: Find all tests available that matches REGEX
+####################################################
+
+# Ctest Gather Tests
+# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
+# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
+# `-V` : Verbose Mode
+printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
+pushd "$build_dir"
+tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
+if [ ${#tests[@]} -eq 0 ]; then
+    abort "No tests available... check your compilation process..."
+fi
+popd > /dev/null || exit 1
+
+
+# Step 4: Identify Test Command for Debugging
+#############################################
+
+# Select test number
+if [ -z $test_number ]; then
+    # List out available tests
+    printf "Which test would you like to debug?\n"
+    id=0
+    for s in "${tests[@]}"
+    do
+        echo "Test# ${id}"
+        echo "  $s"
+        ((id++))
+    done
+
+    # Prompt user which test they wanted to run
+    printf "\nRun test#? "
+    read test_number
+
+else
+    printf "\nUser Already Requested #${test_number}\n"
+
+fi
+
+# Grab all tests commands
+pushd "$build_dir"
+sIFS=$IFS # Save Initial IFS (Internal Field Separator)
+IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
+test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
+IFS=$sIFS # Reset IFS (Internal Field Separator)
+popd > /dev/null || exit 1
+
+# Grab specific test command
+single_test_name="${tests[test_number]}"
+single_test_command="${test_args[test_number]}"
+
+
+# Step 5: Execute or GDB Debug
+##############################
+
+printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
+printf "${cyan}single_test_command: ${single_test_command}${normal}\n"
+
+if [ "$gdb_mode" = "true" ]; then
+    # Execute debugger
+    pushd "$repo_root" || exit 1
+    eval "gdb --args ${single_test_command}"
+    popd > /dev/null || exit 1
+
+else
+    # Execute Test
+    pushd "$repo_root" || exit 1
+    eval "${single_test_command}"
+    exit_code=$?
+    popd > /dev/null || exit 1
+
+    # Print Result
+    printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
+    printf "${yellow}Command: ${single_test_command}${normal}\n"
+    if [ $exit_code -eq 0 ]; then
+        printf "${green}TEST PASS${normal}\n"
+    else
+        printf "${red}TEST FAIL${normal}\n"
+    fi
+
+fi
+
+# Return to the directory from which the user ran the command.
+popd > /dev/null || exit 1
diff --git a/scripts/fetch_server_test_models.py b/scripts/fetch_server_test_models.py
new file mode 100755
index 000000000..05690b138
--- /dev/null
+++ b/scripts/fetch_server_test_models.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+'''
+    This script fetches all the models used in the server tests.
+
+    This is useful for slow tests that use larger models, to avoid them timing out on the model downloads.
+
+    It is meant to be run from the root of the repository.
+
+    Example:
+        python scripts/fetch_server_test_models.py
+        ( cd examples/server/tests && ./tests.sh -v -x -m slow )
+'''
+import ast
+import glob
+import logging
+import os
+from typing import Generator
+from pydantic import BaseModel
+from typing import Optional
+import subprocess
+
+
+class HuggingFaceModel(BaseModel):
+    hf_repo: str
+    hf_file: Optional[str] = None
+
+    class Config:
+        frozen = True
+
+
+def collect_hf_model_test_parameters(test_file) -> Generator[HuggingFaceModel, None, None]:
+    try:
+        with open(test_file) as f:
+            tree = ast.parse(f.read())
+    except Exception as e:
+        logging.error(f'collect_hf_model_test_parameters failed on {test_file}: {e}')
+        return
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef):
+            for dec in node.decorator_list:
+                if isinstance(dec, ast.Call) and isinstance(dec.func, ast.Attribute) and dec.func.attr == 'parametrize':
+                    param_names = ast.literal_eval(dec.args[0]).split(",")
+                    if "hf_repo" not in param_names:
+                        continue
+
+                    raw_param_values = dec.args[1]
+                    if not isinstance(raw_param_values, ast.List):
+                        logging.warning(f'Skipping non-list parametrize entry at {test_file}:{node.lineno}')
+                        continue
+
+                    hf_repo_idx = param_names.index("hf_repo")
+                    hf_file_idx = param_names.index("hf_file") if "hf_file" in param_names else None
+
+                    for t in raw_param_values.elts:
+                        if not isinstance(t, ast.Tuple):
+                            logging.warning(f'Skipping non-tuple parametrize entry at {test_file}:{node.lineno}')
+                            continue
+                        yield HuggingFaceModel(
+                            hf_repo=ast.literal_eval(t.elts[hf_repo_idx]),
+                            hf_file=ast.literal_eval(t.elts[hf_file_idx]) if hf_file_idx is not None else None)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+
+    models = sorted(list(set([
+        model
+        for test_file in glob.glob('examples/server/tests/unit/test_*.py')
+        for model in collect_hf_model_test_parameters(test_file)
+    ])), key=lambda m: (m.hf_repo, m.hf_file))
+
+    logging.info(f'Found {len(models)} models in parameterized tests:')
+    for m in models:
+        logging.info(f'  - {m.hf_repo} / {m.hf_file}')
+
+    cli_path = os.environ.get(
+        'LLAMA_SERVER_BIN_PATH',
+        os.path.join(
+            os.path.dirname(__file__),
+            '../build/bin/Release/llama-cli.exe' if os.name == 'nt' else '../build/bin/llama-cli'))
+
+    for m in models:
+        if '<' in m.hf_repo or (m.hf_file is not None and '<' in m.hf_file):
+            continue
+        if m.hf_file is not None and '-of-' in m.hf_file:
+            logging.warning(f'Skipping model at {m.hf_repo} / {m.hf_file} because it is a split file')
+            continue
+        logging.info(f'Using llama-cli to ensure model {m.hf_repo}/{m.hf_file} was fetched')
+        cmd = [
+            cli_path,
+            '-hfr', m.hf_repo,
+            *([] if m.hf_file is None else ['-hff', m.hf_file]),
+            '-n', '1',
+            '-p', 'Hey',
+            '--no-warmup',
+            '--log-disable',
+            '-no-cnv']
+        if m.hf_file != 'tinyllamas/stories260K.gguf' and 'Mistral-Nemo' not in m.hf_repo:
+            cmd.append('-fa')
+        try:
+            subprocess.check_call(cmd)
+        except subprocess.CalledProcessError:
+            logging.error(f'Failed to fetch model at {m.hf_repo} / {m.hf_file} with command:\n  {" ".join(cmd)}')
+            exit(1)
diff --git a/scripts/gen-authors.sh b/scripts/gen-authors.sh
new file mode 100755
index 000000000..3ef8391cc
--- /dev/null
+++ b/scripts/gen-authors.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+printf "# date: $(date)\n" > AUTHORS
+printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
+
+git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
+
+# if necessary, update your name here. for example: jdoe -> John Doe
+sed -i '' 's/^jdoe/John Doe/g' AUTHORS
diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
new file mode 100644
index 000000000..2d9bde01c
--- /dev/null
+++ b/scripts/gen-unicode-data.py
@@ -0,0 +1,196 @@
+from __future__ import annotations
+
+import array
+import unicodedata
+import requests
+
+
+MAX_CODEPOINTS = 0x110000
+
+UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
+
+
+# see https://www.unicode.org/L2/L1999/UnicodeData.html
+def unicode_data_iter():
+    res = requests.get(UNICODE_DATA_URL)
+    res.raise_for_status()
+    data = res.content.decode()
+
+    prev = []
+
+    for line in data.splitlines():
+        # ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
+        line = line.split(";")
+
+        cpt = int(line[0], base=16)
+        assert cpt < MAX_CODEPOINTS
+
+        cpt_lower = int(line[-2] or "0", base=16)
+        assert cpt_lower < MAX_CODEPOINTS
+
+        cpt_upper = int(line[-3] or "0", base=16)
+        assert cpt_upper < MAX_CODEPOINTS
+
+        categ = line[2].strip()
+        assert len(categ) == 2
+
+        bidir = line[4].strip()
+        assert len(categ) == 2
+
+        name = line[1]
+        if name.endswith(", First>"):
+            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
+            continue
+        if name.endswith(", Last>"):
+            assert prev[1:] == (0, 0, categ, bidir)
+            for c in range(prev[0], cpt):
+                yield (c, cpt_lower, cpt_upper, categ, bidir)
+
+        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
+
+
+# see definition in unicode.h
+CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
+CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
+CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
+CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
+CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
+CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
+CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
+CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
+
+UNICODE_CATEGORY_TO_FLAG = {
+    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
+    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
+    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
+    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
+    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrrogate
+    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
+    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
+    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
+    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
+    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
+    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
+    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
+    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
+    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
+    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
+    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
+    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
+    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
+    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
+    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
+    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
+    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
+    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
+    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
+    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
+    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
+    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
+    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
+    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
+    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
+    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
+}
+
+
+codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
+table_whitespace = []
+table_lowercase = []
+table_uppercase = []
+table_nfd = []
+
+for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
+    # convert codepoint to unicode character
+    char = chr(cpt)
+
+    # codepoint category flags
+    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
+
+    # lowercase conversion
+    if cpt_lower:
+        table_lowercase.append((cpt, cpt_lower))
+
+    # uppercase conversion
+    if cpt_upper:
+        table_uppercase.append((cpt, cpt_upper))
+
+    # NFD normalization
+    norm = ord(unicodedata.normalize('NFD', char)[0])
+    if cpt != norm:
+        table_nfd.append((cpt, norm))
+
+
+# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+table_whitespace.extend(range(0x0009, 0x000D + 1))
+table_whitespace.extend(range(0x2000, 0x200A + 1))
+table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
+
+
+# sort by codepoint
+table_whitespace.sort()
+table_lowercase.sort()
+table_uppercase.sort()
+table_nfd.sort()
+
+
+# group ranges with same flags
+ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])]  # start, flags
+for codepoint, flags in enumerate(codepoint_flags):
+    if flags != ranges_flags[-1][1]:
+        ranges_flags.append((codepoint, flags))
+ranges_flags.append((MAX_CODEPOINTS, 0x0000))
+
+
+# group ranges with same nfd
+ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]  # start, last, nfd
+for codepoint, norm in table_nfd:
+    start = ranges_nfd[-1][0]
+    if ranges_nfd[-1] != (start, codepoint - 1, norm):
+        ranges_nfd.append(None)  # type: ignore[arg-type]  # dummy, will be replaced below
+        start = codepoint
+    ranges_nfd[-1] = (start, codepoint, norm)
+
+
+# Generate 'unicode-data.cpp':
+#   python ./scripts//gen-unicode-data.py > unicode-data.cpp
+
+def out(line=""):
+    print(line, end='\n')  # noqa
+
+
+out("""\
+// generated with scripts/gen-unicode-data.py
+
+#include "unicode-data.h"
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+""")
+
+out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
+for codepoint, flags in ranges_flags:
+    out("{0x%06X, 0x%04X}," % (codepoint, flags))
+out("};\n")
+
+out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
+for codepoint in table_whitespace:
+    out("0x%06X," % codepoint)
+out("};\n")
+
+out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
+for tuple_lw in table_lowercase:
+    out("{0x%06X, 0x%06X}," % tuple_lw)
+out("};\n")
+
+out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
+for tuple_up in table_uppercase:
+    out("{0x%06X, 0x%06X}," % tuple_up)
+out("};\n")
+
+out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
+for triple in ranges_nfd:
+    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
+out("};\n")
diff --git a/scripts/get-hellaswag.sh b/scripts/get-hellaswag.sh
index 121979fe2..4e1b1cc15 100755
--- a/scripts/get-hellaswag.sh
+++ b/scripts/get-hellaswag.sh
@@ -4,7 +4,7 @@ wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag
 
 echo "Usage:"
 echo ""
-echo "  ./perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]"
+echo "  ./llama-perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]"
 echo ""
 
 exit 0
diff --git a/scripts/get-wikitext-103.sh b/scripts/get-wikitext-103.sh
new file mode 100755
index 000000000..9c65fafbc
--- /dev/null
+++ b/scripts/get-wikitext-103.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
+
+echo "Usage:"
+echo ""
+echo "  ./llama-perplexity -m model.gguf -f wiki.test.raw [other params]"
+echo ""
+
+exit 0
diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh
index 7ca760fa6..5f3845ef5 100755
--- a/scripts/get-wikitext-2.sh
+++ b/scripts/get-wikitext-2.sh
@@ -1,10 +1,11 @@
 #!/bin/bash
 
 wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+unzip wikitext-2-raw-v1.zip
 
 echo "Usage:"
 echo ""
-echo "  ./perplexity -m model.gguf -f wiki.test.raw [other params]"
+echo "  ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]"
 echo ""
 
 exit 0
diff --git a/scripts/get-winogrande.sh b/scripts/get-winogrande.sh
index 5f234468e..f1fc0e2d4 100755
--- a/scripts/get-winogrande.sh
+++ b/scripts/get-winogrande.sh
@@ -4,7 +4,7 @@ wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw
 
 echo "Usage:"
 echo ""
-echo "  ./perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]"
+echo "  ./llama-perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]"
 echo ""
 
 exit 0
diff --git a/scripts/get_chat_template.py b/scripts/get_chat_template.py
new file mode 100644
index 000000000..e8982d11a
--- /dev/null
+++ b/scripts/get_chat_template.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+'''
+  Fetches the Jinja chat template of a HuggingFace model.
+  If a model has multiple chat templates, you can specify the variant name.
+
+  Syntax:
+    ./scripts/get_chat_template.py model_id [variant]
+
+  Examples:
+    ./scripts/get_chat_template.py NousResearch/Meta-Llama-3-8B-Instruct
+    ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use
+    ./scripts/get_chat_template.py meta-llama/Llama-3.2-3B-Instruct
+'''
+
+import json
+import re
+import sys
+
+
+def get_chat_template(model_id, variant=None):
+    try:
+        # Use huggingface_hub library if available.
+        # Allows access to gated models if the user has access and ran `huggingface-cli login`.
+        from huggingface_hub import hf_hub_download
+        with open(hf_hub_download(repo_id=model_id, filename="tokenizer_config.json")) as f:
+            config_str = f.read()
+    except ImportError:
+        import requests
+        assert re.match(r"^[\w.-]+/[\w.-]+$", model_id), f"Invalid model ID: {model_id}"
+        response = requests.get(f"https://huggingface.co/{model_id}/resolve/main/tokenizer_config.json")
+        if response.status_code == 401:
+            raise Exception('Access to this model is gated, please request access, authenticate with `huggingface-cli login` and make sure to run `pip install huggingface_hub`')
+        response.raise_for_status()
+        config_str = response.text
+
+    try:
+        config = json.loads(config_str)
+    except json.JSONDecodeError:
+        # Fix https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json
+        # (Remove extra '}' near the end of the file)
+        config = json.loads(re.sub(r'\}([\n\s]*\}[\n\s]*\],[\n\s]*"clean_up_tokenization_spaces")', r'\1', config_str))
+
+    chat_template = config['chat_template']
+    if isinstance(chat_template, str):
+        return chat_template
+    else:
+        variants = {
+            ct['name']: ct['template']
+            for ct in chat_template
+        }
+
+        def format_variants():
+            return ', '.join(f'"{v}"' for v in variants.keys())
+
+        if variant is None:
+            if 'default' not in variants:
+                raise Exception(f'Please specify a chat template variant (one of {format_variants()})')
+            variant = 'default'
+            sys.stderr.write(f'Note: picked "default" chat template variant (out of {format_variants()})\n')
+        elif variant not in variants:
+            raise Exception(f"Variant {variant} not found in chat template (found {format_variants()})")
+
+        return variants[variant]
+
+
+def main(args):
+    if len(args) < 1:
+        raise ValueError("Please provide a model ID and an optional variant name")
+    model_id = args[0]
+    variant = None if len(args) < 2 else args[1]
+
+    template = get_chat_template(model_id, variant)
+    sys.stdout.write(template)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/scripts/hf.sh b/scripts/hf.sh
index 1e9e5a6ea..b251925fa 100755
--- a/scripts/hf.sh
+++ b/scripts/hf.sh
@@ -3,9 +3,9 @@
 # Shortcut for downloading HF models
 #
 # Usage:
-#   ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
 #
 
 # all logs go to stderr
@@ -14,7 +14,7 @@ function log {
 }
 
 function usage {
-    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
+    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
     exit 1
 }
 
@@ -26,9 +26,9 @@ function has_cmd {
 }
 
 if has_cmd wget; then
-    cmd="wget -q --show-progress -c -O %s %s"
+    cmd="wget -q -c -O %s/%s %s"
 elif has_cmd curl; then
-    cmd="curl -C - -f -o %s -L %s"
+    cmd="curl -C - -f --output-dir %s -o %s -L %s"
 else
     log "[E] curl or wget not found"
     exit 1
@@ -37,6 +37,7 @@ fi
 url=""
 repo=""
 file=""
+outdir="."
 
 # parse args
 while [[ $# -gt 0 ]]; do
@@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
             file="$2"
             shift 2
             ;;
+        --outdir)
+            outdir="$2"
+            shift 2
+            ;;
         -h|--help)
             usage
             ;;
@@ -94,10 +99,10 @@ basename=$(basename $url)
 log "[+] attempting to download $basename"
 
 if [ -n "$cmd" ]; then
-    cmd=$(printf "$cmd" "$basename" "$url")
+    cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
     log "[+] $cmd"
     if $cmd; then
-        echo $basename
+        echo $outdir/$basename
         exit 0
     fi
 fi
diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh
index b4c2a159e..bc43738a2 100755
--- a/scripts/qnt-all.sh
+++ b/scripts/qnt-all.sh
@@ -26,5 +26,5 @@ set -e
 mkdir -p ${out}
 
 for q in ${qnt[@]}; do
-    time ./bin/quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt
+    time ./bin/llama-quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt
 done
diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh
index e04d61d7f..e15f74f1b 100755
--- a/scripts/run-all-ppl.sh
+++ b/scripts/run-all-ppl.sh
@@ -26,5 +26,5 @@ out="../tmp/results-${model}"
 mkdir -p ${out}
 
 for q in ${qnt[@]}; do
-    time ./bin/perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt
+    time ./bin/llama-perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt
 done
diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py
deleted file mode 100755
index a18252730..000000000
--- a/scripts/run-with-preset.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import subprocess
-import sys
-
-import yaml
-
-CLI_ARGS_MAIN_PERPLEXITY = [
-    "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
-    "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
-    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
-    "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
-    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
-    "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
-    "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
-    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
-    "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
-    "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
-    "verbose-prompt"
-]
-
-CLI_ARGS_LLAMA_BENCH = [
-    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
-    "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
-]
-
-CLI_ARGS_SERVER = [
-    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
-    "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
-    "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
-    "threads", "verbose"
-]
-
-description = """Run llama.cpp binaries with presets from YAML file(s).
-To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
-To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
-
-Formatting considerations:
-- The YAML property names are the same as the CLI argument names of the corresponding binary.
-- Properties must use the long name of their corresponding llama.cpp CLI arguments.
-- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores.
-- Flags must be defined as "<PROPERTY_NAME>: true" to be effective.
-- To define the logit_bias property, the expected format is "<TOKEN_ID>: <BIAS>" in the "logit_bias" namespace.
-- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
-- To define a tensor split, pass a list of floats.
-"""
-usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
-epilog = ("  --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
-          "Unknown args will be ignored.")
-
-parser = argparse.ArgumentParser(
-    description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument("-bin", "--binary", help="The binary to run.")
-parser.add_argument("yaml_files", nargs="*",
-                    help="Arbitrary number of YAML files from which to read preset values. "
-                    "If two files specify the same values the later one will be used.")
-
-known_args, unknown_args = parser.parse_known_args()
-
-if not known_args.yaml_files and not unknown_args:
-    parser.print_help()
-    sys.exit(0)
-
-props = dict()
-
-for yaml_file in known_args.yaml_files:
-    with open(yaml_file, "r") as f:
-        props.update(yaml.load(f, yaml.SafeLoader))
-
-props = {prop.replace("_", "-"): val for prop, val in props.items()}
-
-binary = props.pop("binary", "main")
-if known_args.binary:
-    binary = known_args.binary
-
-if os.path.exists(f"./{binary}"):
-    binary = f"./{binary}"
-
-if binary.lower().endswith("main") or binary.lower().endswith("perplexity"):
-    cli_args = CLI_ARGS_MAIN_PERPLEXITY
-elif binary.lower().endswith("llama-bench"):
-    cli_args = CLI_ARGS_LLAMA_BENCH
-elif binary.lower().endswith("server"):
-    cli_args = CLI_ARGS_SERVER
-else:
-    print(f"Unknown binary: {binary}")
-    sys.exit(1)
-
-command_list = [binary]
-
-for cli_arg in cli_args:
-    value = props.pop(cli_arg, None)
-
-    if not value or value == -1:
-        continue
-
-    if cli_arg == "logit-bias":
-        for token, bias in value.items():
-            command_list.append("--logit-bias")
-            command_list.append(f"{token}{bias:+}")
-        continue
-
-    if cli_arg == "reverse-prompt" and not isinstance(value, str):
-        for rp in value:
-            command_list.append("--reverse-prompt")
-            command_list.append(str(rp))
-        continue
-
-    command_list.append(f"--{cli_arg}")
-
-    if cli_arg == "tensor-split":
-        command_list.append(",".join([str(v) for v in value]))
-        continue
-
-    value = str(value)
-
-    if value != "True":
-        command_list.append(str(value))
-
-num_unused = len(props)
-if num_unused > 10:
-    print(f"The preset file contained a total of {num_unused} unused properties.")
-elif num_unused > 0:
-    print("The preset file contained the following unused properties:")
-    for prop, value in props.items():
-        print(f"  {prop}: {value}")
-
-command_list += unknown_args
-
-sp = subprocess.Popen(command_list)
-
-while sp.returncode is None:
-    try:
-        sp.wait()
-    except KeyboardInterrupt:
-        pass
-
-sys.exit(sp.returncode)
diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh
deleted file mode 100644
index 30bbac321..000000000
--- a/scripts/server-llm.sh
+++ /dev/null
@@ -1,423 +0,0 @@
-#!/bin/bash
-#
-# Helper script for deploying llama.cpp server with a single Bash command
-#
-# - Works on Linux and macOS
-# - Supports: CPU, CUDA, Metal, OpenCL
-# - Can run all GGUF models from HuggingFace
-# - Can serve requests in parallel
-# - Always builds latest llama.cpp from GitHub
-#
-# Limitations
-#
-# - Chat templates are poorly supported (base models recommended)
-# - Might be unstable!
-#
-# Usage:
-#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]
-#
-#   --port:            port number, default is 8888
-#   --repo:            path to a repo containing GGUF model files
-#   --wtype:           weights type (f16, q8_0, q4_0, q4_1), default is user-input
-#   --backend:         cpu, cuda, metal, opencl, depends on the OS
-#   --gpu-id:          gpu id, default is 0
-#   --n-parallel:      number of parallel requests, default is 8
-#   --n-kv:            KV cache size, default is 4096
-#   --verbose:         verbose output
-#   --non-interactive: run without asking a permission to run
-#
-# Example:
-#
-#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
-#
-
-set -e
-
-# required utils: curl, git, make
-if ! command -v curl &> /dev/null; then
-    printf "[-] curl not found\n"
-    exit 1
-fi
-if ! command -v git &> /dev/null; then
-    printf "[-] git not found\n"
-    exit 1
-fi
-if ! command -v make &> /dev/null; then
-    printf "[-] make not found\n"
-    exit 1
-fi
-
-# parse arguments
-is_interactive=1
-port=8888
-repo=""
-wtype=""
-backend="cpu"
-
-# if macOS, use metal backend by default
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
-fi
-
-gpu_id=0
-n_parallel=8
-n_kv=4096
-verbose=0
-
-function print_usage {
-    printf "Usage:\n"
-    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n"
-    printf "  --port:             port number, default is 8888\n"
-    printf "  --repo:             path to a repo containing GGUF model files\n"
-    printf "  --wtype:            weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend:          cpu, cuda, metal, opencl, depends on the OS\n"
-    printf "  --gpu-id:           gpu id, default is 0\n"
-    printf "  --n-parallel:       number of parallel requests, default is 8\n"
-    printf "  --n-kv:             KV cache size, default is 4096\n"
-    printf "  --verbose:          verbose output\n\n"
-    printf "  --non-interactive:  run without asking a permission to run\n"
-    printf "Example:\n\n"
-    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
-}
-
-while [[ $# -gt 0 ]]; do
-    key="$1"
-    case $key in
-        --non-interactive)
-            is_interactive=0
-            shift
-            ;;
-        --port)
-            port="$2"
-            shift
-            shift
-            ;;
-        --repo)
-            repo="$2"
-            shift
-            shift
-            ;;
-        --wtype)
-            wtype="$2"
-            shift
-            shift
-            ;;
-        --backend)
-            backend="$2"
-            shift
-            shift
-            ;;
-        --gpu-id)
-            gpu_id="$2"
-            shift
-            shift
-            ;;
-        --n-parallel)
-            n_parallel="$2"
-            shift
-            shift
-            ;;
-        --n-kv)
-            n_kv="$2"
-            shift
-            shift
-            ;;
-        --verbose)
-            verbose=1
-            shift
-            ;;
-        --help)
-            print_usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown argument: $key"
-            print_usage
-            exit 1
-            ;;
-    esac
-done
-
-# available weights types
-wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
-
-wfiles=()
-for wt in "${wtypes[@]}"; do
-    wfiles+=("")
-done
-
-# map wtype input to index
-if [[ ! -z "$wtype" ]]; then
-    iw=-1
-    is=0
-    for wt in "${wtypes[@]}"; do
-        # uppercase
-        uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
-        if [[ "$uwt" == "$wtype" ]]; then
-            iw=$is
-            break
-        fi
-        is=$((is+1))
-    done
-
-    if [[ $iw -eq -1 ]]; then
-        printf "[-] Invalid weight type: %s\n" "$wtype"
-        exit 1
-    fi
-
-    wtype="$iw"
-fi
-
-# sample repos
-repos=(
-    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
-    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
-    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
-    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
-    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
-)
-if [ $is_interactive -eq 1 ]; then
-    printf "\n"
-    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
-    printf "    Based on the options that follow, the script might download a model file\n"
-    printf "    from the internet, which can be a few GBs in size. The script will also\n"
-    printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
-    printf "\n"
-    printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
-    printf "    model using llama.cpp for demonstration purposes.\n"
-    printf "\n"
-    printf "    Please note:\n"
-    printf "\n"
-    printf "    - All new data will be stored in the current folder\n"
-    printf "    - The server will be listening on all network interfaces\n"
-    printf "    - The server will run with default settings which are not always optimal\n"
-    printf "    - Do not judge the quality of a model based on the results from this script\n"
-    printf "    - Do not use this script to benchmark llama.cpp\n"
-    printf "    - Do not use this script in production\n"
-    printf "    - This script is only for demonstration purposes\n"
-    printf "\n"
-    printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
-    printf "\n"
-    printf "    Press Enter to continue ...\n\n"
-
-    read
-fi
-
-if [[ -z "$repo" ]]; then
-    printf "[+] No repo provided from the command line\n"
-    printf "    Please select a number from the list below or enter an URL:\n\n"
-
-    is=0
-    for r in "${repos[@]}"; do
-        printf "    %2d) %s\n" $is "$r"
-        is=$((is+1))
-    done
-
-    # ask for repo until index of sample repo is provided or an URL
-    while [[ -z "$repo" ]]; do
-        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
-        read -p "[+] Select repo: " repo
-
-        # check if the input is a number
-        if [[ "$repo" =~ ^[0-9]+$ ]]; then
-            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
-                repo="${repos[$repo]}"
-            else
-                printf "[-] Invalid repo index: %s\n" "$repo"
-                repo=""
-            fi
-        elif [[ "$repo" =~ ^https?:// ]]; then
-            repo="$repo"
-        else
-            printf "[-] Invalid repo URL: %s\n" "$repo"
-            repo=""
-        fi
-    done
-fi
-
-# remove suffix
-repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
-
-printf "[+] Checking for GGUF model files in %s\n" "$repo"
-
-# find GGUF files in the source
-# TODO: better logic
-model_tree="${repo%/}/tree/main"
-model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
-
-# list all files in the provided git repo
-printf "[+] Model files:\n\n"
-for file in $model_files; do
-    # determine iw by grepping the filename with wtypes
-    iw=-1
-    is=0
-    for wt in "${wtypes[@]}"; do
-        # uppercase
-        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
-        if [[ "$ufile" =~ "$wt" ]]; then
-            iw=$is
-            break
-        fi
-        is=$((is+1))
-    done
-
-    if [[ $iw -eq -1 ]]; then
-        continue
-    fi
-
-    wfiles[$iw]="$file"
-
-    have=" "
-    if [[ -f "$file" ]]; then
-        have="*"
-    fi
-
-    printf "    %2d) %s %s\n" $iw "$have" "$file"
-done
-
-wfile="${wfiles[$wtype]}"
-
-# ask for weights type until provided and available
-while [[ -z "$wfile" ]]; do
-    printf "\n"
-    read -p "[+] Select weight type: " wtype
-    wfile="${wfiles[$wtype]}"
-
-    if [[ -z "$wfile" ]]; then
-        printf "[-] Invalid weight type: %s\n" "$wtype"
-        wtype=""
-    fi
-done
-
-printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
-
-url="${repo%/}/resolve/main/$wfile"
-
-# check file if the model has been downloaded before
-chk="$wfile.chk"
-
-# check if we should download the file
-# - if $wfile does not exist
-# - if $wfile exists but $chk does not exist
-# - if $wfile exists and $chk exists but $wfile is newer than $chk
-# TODO: better logic using git lfs info
-
-do_download=0
-
-if [[ ! -f "$wfile" ]]; then
-    do_download=1
-elif [[ ! -f "$chk" ]]; then
-    do_download=1
-elif [[ "$wfile" -nt "$chk" ]]; then
-    do_download=1
-fi
-
-if [[ $do_download -eq 1 ]]; then
-    printf "[+] Downloading weights from %s\n" "$url"
-
-    # download the weights file
-    curl -o "$wfile" -# -L "$url"
-
-    # create a check file if successful
-    if [[ $? -eq 0 ]]; then
-        printf "[+] Creating check file %s\n" "$chk"
-        touch "$chk"
-    fi
-else
-    printf "[+] Using cached weights %s\n" "$wfile"
-fi
-
-# get latest llama.cpp and build
-
-printf "[+] Downloading latest llama.cpp\n"
-
-llama_cpp_dir="__llama_cpp_port_${port}__"
-
-if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
-    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
-    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[-] Please remove it and try again\n"
-    exit 1
-elif [[ -d "$llama_cpp_dir" ]]; then
-    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[+] Using cached llama.cpp\n"
-
-    cd "$llama_cpp_dir"
-    git reset --hard
-    git fetch
-    git checkout origin/master
-
-    cd ..
-else
-    printf "[+] Cloning llama.cpp\n"
-
-    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
-fi
-
-# mark that that the directory is made by this script
-touch "$llama_cpp_dir/__ggml_script__"
-
-if [[ $verbose -eq 1 ]]; then
-    set -x
-fi
-
-# build
-cd "$llama_cpp_dir"
-
-make clean
-
-log="--silent"
-if [[ $verbose -eq 1 ]]; then
-    log=""
-fi
-
-if [[ "$backend" == "cuda" ]]; then
-    printf "[+] Building with CUDA backend\n"
-    LLAMA_CUBLAS=1 make -j server $log
-elif [[ "$backend" == "cpu" ]]; then
-    printf "[+] Building with CPU backend\n"
-    make -j server $log
-elif [[ "$backend" == "metal" ]]; then
-    printf "[+] Building with Metal backend\n"
-    make -j server $log
-elif [[ "$backend" == "opencl" ]]; then
-    printf "[+] Building with OpenCL backend\n"
-    LLAMA_CLBLAST=1 make -j server $log
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-# run the server
-
-printf "[+] Running server\n"
-
-args=""
-if [[ "$backend" == "cuda" ]]; then
-    export CUDA_VISIBLE_DEVICES=$gpu_id
-    args="-ngl 999"
-elif [[ "$backend" == "cpu" ]]; then
-    args="-ngl 0"
-elif [[ "$backend" == "metal" ]]; then
-    args="-ngl 999"
-elif [[ "$backend" == "opencl" ]]; then
-    args="-ngl 999"
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-if [[ $verbose -eq 1 ]]; then
-    args="$args --verbose"
-fi
-
-./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
-
-exit 0
diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 2c391e641..ec4f4b0a2 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -5,7 +5,7 @@
 # Usage:
 #
 #   $ cd /path/to/llama.cpp
-#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
+#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... -C 3
 #
 
 set -e
@@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
 echo "Syncing ggml changes since commit $lc"
 
 to_skip=""
-if [ "$1" == "-skip" ]; then
-    to_skip=$2
-fi
+
+# context for git patches in number of lines
+ctx="8"
+
+while [ "$1" != "" ]; do
+    case $1 in
+        -skip )
+            shift
+            to_skip=$1
+            ;;
+        -C )
+            shift
+            ctx=$1
+            ;;
+    esac
+    shift
+done
 
 cd $SRC_GGML
 
@@ -52,19 +66,34 @@ while read c; do
         fi
     fi
 
-    git format-patch -k $c~1..$c --stdout -- \
-        include/ggml/ggml*.h \
+    git format-patch -U${ctx} -k $c~1..$c --stdout -- \
+        CMakeLists.txt \
+        src/CMakeLists.txt \
+        cmake/FindSIMD.cmake \
         src/ggml*.h \
         src/ggml*.c \
         src/ggml*.cpp \
-        src/ggml*.m \
-        src/ggml*.metal \
-        src/ggml*.cu \
+        src/gguf*.cpp \
+        src/ggml-blas/* \
+        src/ggml-cann/* \
+        src/ggml-cpu/* \
+        src/ggml-cuda/* \
+        src/ggml-hip/* \
+        src/ggml-kompute/* \
+        src/ggml-metal/* \
+        src/ggml-musa/* \
+        src/ggml-opencl/* \
+        src/ggml-rpc/* \
+        src/ggml-sycl/* \
+        src/ggml-vulkan/* \
+        include/ggml*.h \
+        include/gguf*.h \
         tests/test-opt.cpp \
-        tests/test-grad0.cpp \
         tests/test-quantize-fns.cpp \
         tests/test-quantize-perf.cpp \
         tests/test-backend-ops.cpp \
+        LICENSE \
+        scripts/gen-authors.sh \
         >> $SRC_LLAMA/ggml-src.patch
 done < $SRC_LLAMA/ggml-commits
 
@@ -90,71 +119,63 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
 
     # replace filenames:
     #
-    # src/ggml.c                  -> ggml.c
-    # src/ggml-alloc.c            -> ggml-alloc.c
-    # src/ggml-backend-impl.h     -> ggml-backend-impl.h
-    # src/ggml-backend.c          -> ggml-backend.c
-    # src/ggml-cuda.cu            -> ggml-cuda.cu
-    # src/ggml-cuda.h             -> ggml-cuda.h
-    # src/ggml-impl.h             -> ggml-impl.h
-    # src/ggml-kompute.cpp        -> ggml-kompute.cpp
-    # src/ggml-kompute.h          -> ggml-kompute.h
-    # src/ggml-metal.h            -> ggml-metal.h
-    # src/ggml-metal.m            -> ggml-metal.m
-    # src/ggml-mpi.h              -> ggml-mpi.h
-    # src/ggml-mpi.c              -> ggml-mpi.c
-    # src/ggml-opencl.cpp         -> ggml-opencl.cpp
-    # src/ggml-opencl.h           -> ggml-opencl.h
-    # src/ggml-quants.c           -> ggml-quants.c
-    # src/ggml-quants.h           -> ggml-quants.h
-    # src/ggml-sycl.cpp           -> ggml-sycl.cpp
-    # src/ggml-sycl.h             -> ggml-sycl.h
-    # src/ggml-vulkan.cpp         -> ggml-vulkan.cpp
-    # src/ggml-vulkan.h           -> ggml-vulkan.h
-    # include/ggml/ggml.h         -> ggml.h
-    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
-    # include/ggml/ggml-backend.h -> ggml-backend.h
+    # CMakelists.txt       -> ggml/CMakeLists.txt
+    # src/CMakeLists.txt   -> ggml/src/CMakeLists.txt
+    # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
     #
-    # tests/test-opt.cpp           -> tests/test-opt.cpp
-    # tests/test-grad0.cpp         -> tests/test-grad0.cpp
-    # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
-    # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
-    # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
+    # src/ggml*.c          -> ggml/src/ggml*.c
+    # src/ggml*.cpp        -> ggml/src/ggml*.cpp
+    # src/ggml*.h          -> ggml/src/ggml*.h
+    # src/gguf*.cpp        -> ggml/src/gguf*.cpp
+    # src/ggml-blas/*      -> ggml/src/ggml-blas/*
+    # src/ggml-cann/*      -> ggml/src/ggml-cann/*
+    # src/ggml-cpu/*       -> ggml/src/ggml-cpu/*
+    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/*
+    # src/ggml-hip/*       -> ggml/src/ggml-hip/*
+    # src/ggml-kompute/*   -> ggml/src/ggml-kompute/*
+    # src/ggml-metal/*     -> ggml/src/ggml-metal/*
+    # src/ggml-musa/*      -> ggml/src/ggml-musa/*
+    # src/ggml-opencl/*    -> ggml/src/ggml-opencl/*
+    # src/ggml-rpc/*       -> ggml/src/ggml-rpc/*
+    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/*
+    # src/ggml-vulkan/*    -> ggml/src/ggml-vulkan/*
+    #
+    # include/ggml*.h -> ggml/include/ggml*.h
+    # include/gguf*.h -> ggml/include/gguf*.h
+    #
+    # tests/test*.cpp -> tests/
+    #
+    # LICENSE                -> LICENSE
+    # scripts/gen-authors.sh -> scripts/gen-authors.sh
 
-    cat ggml-src.patch | sed \
-        -e 's/src\/ggml\.c/ggml.c/g' \
-        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
-        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
-        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
-        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
-        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
-        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
-        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
-        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
-        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
-        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
-        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
-        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
-        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
-        -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \
-        -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \
-        -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
-        -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
-        -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
+    cat ggml-src.patch | sed -E \
+        -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
+        -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/gguf(.*)\.cpp/\1ggml\/src\/gguf\2.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\//\1ggml\/src\/ggml-kompute\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-opencl\//\1ggml\/src\/ggml-opencl\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \
+        -e 's/([[:space:]]|[ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)LICENSE/\1LICENSE/g' \
+        -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
 
-    git am ggml-src.patch
+    git am -C${ctx} ggml-src.patch
 
     rm -v $SRC_LLAMA/ggml-src.patch
 fi
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 389c0bdfe..255109ab7 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-b458250b736a7473f7ff3560d47c93f1644f3290
+08b538031f7f944e84f472483ef5d26bf5190ead
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index feb34bbc8..e83d415c0 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -1,31 +1,33 @@
 #!/bin/bash
 
-cp -rpv ../ggml/src/ggml.c                  ./ggml.c
-cp -rpv ../ggml/src/ggml-alloc.c            ./ggml-alloc.c
-cp -rpv ../ggml/src/ggml-backend-impl.h     ./ggml-backend-impl.h
-cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
-cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
-cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
-cp -rpv ../ggml/src/ggml-impl.h             ./ggml-impl.h
-cp -rpv ../ggml/src/ggml-kompute.cpp        ./ggml-kompute.cpp
-cp -rpv ../ggml/src/ggml-kompute.h          ./ggml-kompute.h
-cp -rpv ../ggml/src/ggml-metal.h            ./ggml-metal.h
-cp -rpv ../ggml/src/ggml-metal.m            ./ggml-metal.m
-cp -rpv ../ggml/src/ggml-metal.metal        ./ggml-metal.metal
-cp -rpv ../ggml/src/ggml-mpi.h              ./ggml-mpi.h
-cp -rpv ../ggml/src/ggml-mpi.c              ./ggml-mpi.c
-cp -rpv ../ggml/src/ggml-opencl.cpp         ./ggml-opencl.cpp
-cp -rpv ../ggml/src/ggml-opencl.h           ./ggml-opencl.h
-cp -rpv ../ggml/src/ggml-quants.c           ./ggml-quants.c
-cp -rpv ../ggml/src/ggml-quants.h           ./ggml-quants.h
-cp -rpv ../ggml/src/ggml-sycl.cpp           ./ggml-sycl.cpp
-cp -rpv ../ggml/src/ggml-sycl.h             ./ggml-sycl.h
-cp -rpv ../ggml/src/ggml-vulkan.cpp         ./ggml-vulkan.cpp
-cp -rpv ../ggml/src/ggml-vulkan.h           ./ggml-vulkan.h
-cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
-cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
-cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
+cp -rpv ../ggml/CMakeLists.txt       ./ggml/CMakeLists.txt
+cp -rpv ../ggml/src/CMakeLists.txt   ./ggml/src/CMakeLists.txt
+cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 
-cp -rpv ../ggml/tests/test-opt.cpp         ./tests/test-opt.cpp
-cp -rpv ../ggml/tests/test-grad0.cpp       ./tests/test-grad0.cpp
-cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
+cp -rpv ../ggml/src/ggml*.c        ./ggml/src/
+cp -rpv ../ggml/src/ggml*.cpp      ./ggml/src/
+cp -rpv ../ggml/src/ggml*.h        ./ggml/src/
+cp -rpv ../ggml/src/gguf*.cpp      ./ggml/src/
+cp -rpv ../ggml/src/ggml-blas/*    ./ggml/src/ggml-blas/
+cp -rpv ../ggml/src/ggml-cann/*    ./ggml/src/ggml-cann/
+cp -rpv ../ggml/src/ggml-cpu/*     ./ggml/src/ggml-cpu/
+cp -rpv ../ggml/src/ggml-cuda/*    ./ggml/src/ggml-cuda/
+cp -rpv ../ggml/src/ggml-hip/*     ./ggml/src/ggml-hip/
+cp -rpv ../ggml/src/ggml-kompute/* ./ggml/src/ggml-kompute/
+cp -rpv ../ggml/src/ggml-metal/*   ./ggml/src/ggml-metal/
+cp -rpv ../ggml/src/ggml-musa/*    ./ggml/src/ggml-musa/
+cp -rpv ../ggml/src/ggml-opencl/*  ./ggml/src/ggml-opencl/
+cp -rpv ../ggml/src/ggml-rpc/*     ./ggml/src/ggml-rpc/
+cp -rpv ../ggml/src/ggml-sycl/*    ./ggml/src/ggml-sycl/
+cp -rpv ../ggml/src/ggml-vulkan/*  ./ggml/src/ggml-vulkan/
+
+cp -rpv ../ggml/include/ggml*.h ./ggml/include/
+cp -rpv ../ggml/include/gguf*.h ./ggml/include/
+
+cp -rpv ../ggml/tests/test-opt.cpp           ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-quantize-fns.cpp  ./tests/test-quantize-fns.cpp
+cp -rpv ../ggml/tests/test-quantize-perf.cpp ./tests/test-quantize-perf.cpp
+cp -rpv ../ggml/tests/test-backend-ops.cpp   ./tests/test-backend-ops.cpp
+
+cp -rpv ../LICENSE                     ./LICENSE
+cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh
diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py
index dff4b4734..0b5b9aafa 100755
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
 
+import logging
 import os
 import hashlib
 
+logger = logging.getLogger("verify-checksum-models")
+
 
 def sha256sum(file):
     block_size = 16 * 1024 * 1024  # 16 MB block size
@@ -27,7 +30,7 @@ hash_list_file = os.path.join(llama_path, "SHA256SUMS")
 
 # Check if the hash list file exists
 if not os.path.exists(hash_list_file):
-    print(f"Hash list file not found: {hash_list_file}")
+    logger.error(f"Hash list file not found: {hash_list_file}")
     exit(1)
 
 # Read the hash file content and split it into an array of lines
@@ -46,7 +49,7 @@ for line in hash_list:
     file_path = os.path.join(llama_path, filename)
 
     # Informing user of the progress of the integrity check
-    print(f"Verifying the checksum of {file_path}")
+    logger.info(f"Verifying the checksum of {file_path}")
 
     # Check if the file exists
     if os.path.exists(file_path):
@@ -73,9 +76,9 @@ for line in hash_list:
 
 
 # Print column headers for results table
-print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20))
-print("-" * 80)
+print("filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) # noqa: NP100
+print("-" * 80) # noqa: NP100
 
 # Output the results as a table
 for r in results:
-    print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}")
+    print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") # noqa: NP100
diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake
new file mode 100644
index 000000000..f5ad6ab9b
--- /dev/null
+++ b/scripts/xxd.cmake
@@ -0,0 +1,16 @@
+# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
+# Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake
+
+SET(INPUT "" CACHE STRING "Input File")
+SET(OUTPUT "" CACHE STRING "Output File")
+
+get_filename_component(filename "${INPUT}" NAME)
+string(REGEX REPLACE "\\.|-" "_" name "${filename}")
+
+file(READ "${INPUT}" hex_data HEX)
+string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}")
+
+string(LENGTH ${hex_data} hex_len)
+math(EXPR len "${hex_len} / 2")
+
+file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n")
diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h
index a49d385a1..0361ffc38 120000
--- a/spm-headers/ggml-alloc.h
+++ b/spm-headers/ggml-alloc.h
@@ -1 +1 @@
-../ggml-alloc.h
\ No newline at end of file
+../ggml/include/ggml-alloc.h
\ No newline at end of file
diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h
index 17c2cf14f..7295f0f0d 120000
--- a/spm-headers/ggml-backend.h
+++ b/spm-headers/ggml-backend.h
@@ -1 +1 @@
-../ggml-backend.h
\ No newline at end of file
+../ggml/include/ggml-backend.h
\ No newline at end of file
diff --git a/spm-headers/ggml-cpp.h b/spm-headers/ggml-cpp.h
new file mode 120000
index 000000000..8a8604cc2
--- /dev/null
+++ b/spm-headers/ggml-cpp.h
@@ -0,0 +1 @@
+../ggml/include/ggml-cpp.h
\ No newline at end of file
diff --git a/spm-headers/ggml-cpu.h b/spm-headers/ggml-cpu.h
new file mode 120000
index 000000000..66e629607
--- /dev/null
+++ b/spm-headers/ggml-cpu.h
@@ -0,0 +1 @@
+../ggml/include/ggml-cpu.h
\ No newline at end of file
diff --git a/spm-headers/ggml-metal.h b/spm-headers/ggml-metal.h
new file mode 120000
index 000000000..aefad5fa0
--- /dev/null
+++ b/spm-headers/ggml-metal.h
@@ -0,0 +1 @@
+../ggml/include/ggml-metal.h
\ No newline at end of file
diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h
index 39215298f..0bdfeacbd 120000
--- a/spm-headers/ggml.h
+++ b/spm-headers/ggml.h
@@ -1 +1 @@
-../ggml.h
\ No newline at end of file
+../ggml/include/ggml.h
\ No newline at end of file
diff --git a/spm-headers/llama.h b/spm-headers/llama.h
index 9acceb980..b31388f0d 120000
--- a/spm-headers/llama.h
+++ b/spm-headers/llama.h
@@ -1 +1 @@
-../llama.h
\ No newline at end of file
+../include/llama.h
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 000000000..e1b02e4c0
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,41 @@
+llama_add_compile_flags()
+
+#
+# libraries
+#
+
+# llama
+
+add_library(llama
+            ../include/llama.h
+            llama.cpp
+            llama-adapter.cpp
+            llama-arch.cpp
+            llama-batch.cpp
+            llama-chat.cpp
+            llama-context.cpp
+            llama-grammar.cpp
+            llama-hparams.cpp
+            llama-impl.cpp
+            llama-kv-cache.cpp
+            llama-mmap.cpp
+            llama-model-loader.cpp
+            llama-model.cpp
+            llama-quant.cpp
+            llama-sampling.cpp
+            llama-vocab.cpp
+            unicode.h
+            unicode.cpp
+            unicode-data.cpp
+            )
+
+target_include_directories(llama PUBLIC . ../include ../common)
+target_compile_features   (llama PUBLIC cxx_std_17) # don't bump
+
+target_link_libraries(llama PUBLIC ggml)
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llama PRIVATE LLAMA_BUILD)
+    target_compile_definitions(llama PUBLIC  LLAMA_SHARED)
+endif()
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
new file mode 100644
index 000000000..8a0800463
--- /dev/null
+++ b/src/llama-adapter.cpp
@@ -0,0 +1,347 @@
+#include "llama-adapter.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <map>
+#include <cassert>
+#include <stdexcept>
+
+// vec
+
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+        return nullptr;
+    }
+
+    return tensors[il];
+}
+
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const {
+    ggml_tensor * layer_dir = tensor_for(il);
+    if (layer_dir != nullptr) {
+        cur = ggml_add(ctx, cur, layer_dir);
+    }
+
+    return cur;
+}
+
+bool llama_adapter_cvec::init(const llama_model & model) {
+    const auto & hparams = model.hparams;
+
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map[buft] = ctx;
+            ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+
+        return it->second;
+    };
+
+    // make tensors
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+        tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    bufs.reserve(ctx_map.size());
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_adapter_cvec::apply(
+        const llama_model & model,
+        const float * data,
+        size_t len,
+        int32_t n_embd,
+        int32_t il_start,
+        int32_t il_end) {
+    const auto & hparams = model.hparams;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        layer_start = -1;
+        layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (tensors.empty()) {
+        if (!init(model)) {
+            return 1;
+        }
+    }
+
+    layer_start = il_start;
+    layer_end   = il_end;
+
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        assert(tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
+// lora
+
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+    const std::string name(w->name);
+
+    const auto pos = ab_map.find(name);
+    if (pos != ab_map.end()) {
+        return &pos->second;
+    }
+
+    return nullptr;
+}
+
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+    ggml_context * ctx_init;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ true,
+        /* .ctx      = */ &ctx_init,
+    };
+
+    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+    }
+
+    ggml_context_ptr ctx { ctx_init };
+
+    // check metadata
+    {
+        auto get_kv_str = [&](const std::string & key) -> std::string {
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+        };
+        auto get_kv_f32 = [&](const std::string & key) -> float {
+            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+        };
+        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+        auto general_arch = llm_arch_from_string(general_arch_str);
+        if (general_arch != model.arch) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+
+        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+    }
+
+    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
+
+    // contexts for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * buft_ctx = ggml_init(params);
+            if (!buft_ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = buft_ctx;
+            adapter.ctxs.emplace_back(buft_ctx);
+            return buft_ctx;
+        };
+        return it->second;
+    };
+
+    // bundle lora_a and lora_b into pairs
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
+    auto str_endswith = [](const std::string & str, const std::string & suffix) {
+        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+    };
+
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+        std::string name(cur->name);
+        if (str_endswith(name, ".lora_a")) {
+            replace_all(name, ".lora_a", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
+            } else {
+                ab_map[name].a = cur;
+            }
+        } else if (str_endswith(name, ".lora_b")) {
+            replace_all(name, ".lora_b", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
+            } else {
+                ab_map[name].b = cur;
+            }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
+        } else {
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+        }
+    }
+
+    // add tensors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
+
+        if (!w.a || !w.b) {
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
+        // device buft and device ctx
+        const auto * model_tensor = model.get_tensor(name.c_str());
+        if (!model_tensor) {
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
+        }
+
+        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        // validate tensor shape
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
+        }
+
+        // save tensor to adapter
+        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_set_name(tensor_a, w.a->name);
+        ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
+    }
+
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto & it : ctx_map) {
+            ggml_backend_buffer_type_t buft = it.first;
+            ggml_context * ctx_dev = it.second;
+            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+            if (!buf) {
+                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+            }
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
+        }
+    }
+
+    // set tensor data
+    {
+        llama_file gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
+            size_t size = ggml_nbytes(orig);
+            read_buf.resize(size);
+            gguf_file.seek(offs, SEEK_SET);
+            gguf_file.read_raw(read_buf.data(), size);
+            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+        };
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev  = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
+        }
+    }
+
+    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+}
+
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+
+    try {
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        return adapter;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+        delete adapter;
+    }
+
+    return nullptr;
+}
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
new file mode 100644
index 000000000..603fa08f6
--- /dev/null
+++ b/src/llama-adapter.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "llama.h"
+
+#include "ggml-cpp.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// TODO: pimpl
+
+//
+// llama_adapter_cvec
+//
+
+struct llama_adapter_cvec {
+    struct ggml_tensor * tensor_for(int il) const;
+
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const;
+
+    int32_t apply(
+            const llama_model & model,
+            const float * data,
+            size_t len,
+            int32_t n_embd,
+            int32_t il_start,
+            int32_t il_end);
+
+private:
+    bool init(const llama_model & model);
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    std::vector<struct ggml_tensor *> tensors; // per layer
+};
+
+//
+// llama_adapter_lora
+//
+
+struct llama_adapter_lora_weight {
+    struct ggml_tensor * a = nullptr;
+    struct ggml_tensor * b = nullptr;
+
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) const {
+        const float rank  = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+};
+
+struct llama_adapter_lora {
+    // map tensor name to lora_a_b
+    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    float alpha;
+
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;
+
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+};
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
new file mode 100644
index 000000000..97a1e7e5e
--- /dev/null
+++ b/src/llama-arch.cpp
@@ -0,0 +1,1492 @@
+#include "llama-arch.h"
+
+#include "llama-impl.h"
+
+#include <map>
+
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_LLAMA,            "llama"            },
+    { LLM_ARCH_DECI,             "deci"             },
+    { LLM_ARCH_FALCON,           "falcon"           },
+    { LLM_ARCH_GROK,             "grok"             },
+    { LLM_ARCH_GPT2,             "gpt2"             },
+    { LLM_ARCH_GPTJ,             "gptj"             },
+    { LLM_ARCH_GPTNEOX,          "gptneox"          },
+    { LLM_ARCH_MPT,              "mpt"              },
+    { LLM_ARCH_BAICHUAN,         "baichuan"         },
+    { LLM_ARCH_STARCODER,        "starcoder"        },
+    { LLM_ARCH_REFACT,           "refact"           },
+    { LLM_ARCH_BERT,             "bert"             },
+    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
+    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
+    { LLM_ARCH_BLOOM,            "bloom"            },
+    { LLM_ARCH_STABLELM,         "stablelm"         },
+    { LLM_ARCH_QWEN,             "qwen"             },
+    { LLM_ARCH_QWEN2,            "qwen2"            },
+    { LLM_ARCH_QWEN2MOE,         "qwen2moe"         },
+    { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
+    { LLM_ARCH_PHI2,             "phi2"             },
+    { LLM_ARCH_PHI3,             "phi3"             },
+    { LLM_ARCH_PHIMOE,           "phimoe"           },
+    { LLM_ARCH_PLAMO,            "plamo"            },
+    { LLM_ARCH_CODESHELL,        "codeshell"        },
+    { LLM_ARCH_ORION,            "orion"            },
+    { LLM_ARCH_INTERNLM2,        "internlm2"        },
+    { LLM_ARCH_MINICPM,          "minicpm"          },
+    { LLM_ARCH_MINICPM3,         "minicpm3"         },
+    { LLM_ARCH_GEMMA,            "gemma"            },
+    { LLM_ARCH_GEMMA2,           "gemma2"           },
+    { LLM_ARCH_STARCODER2,       "starcoder2"       },
+    { LLM_ARCH_MAMBA,            "mamba"            },
+    { LLM_ARCH_XVERSE,           "xverse"           },
+    { LLM_ARCH_COMMAND_R,        "command-r"        },
+    { LLM_ARCH_COHERE2,          "cohere2"          },
+    { LLM_ARCH_DBRX,             "dbrx"             },
+    { LLM_ARCH_OLMO,             "olmo"             },
+    { LLM_ARCH_OLMO2,            "olmo2"            },
+    { LLM_ARCH_OLMOE,            "olmoe"            },
+    { LLM_ARCH_OPENELM,          "openelm"          },
+    { LLM_ARCH_ARCTIC,           "arctic"           },
+    { LLM_ARCH_DEEPSEEK,         "deepseek"         },
+    { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
+    { LLM_ARCH_CHATGLM,          "chatglm"          },
+    { LLM_ARCH_BITNET,           "bitnet"           },
+    { LLM_ARCH_T5,               "t5"               },
+    { LLM_ARCH_T5ENCODER,        "t5encoder"        },
+    { LLM_ARCH_JAIS,             "jais"             },
+    { LLM_ARCH_NEMOTRON,         "nemotron"         },
+    { LLM_ARCH_EXAONE,           "exaone"           },
+    { LLM_ARCH_RWKV6,            "rwkv6"            },
+    { LLM_ARCH_RWKV6QWEN2,       "rwkv6qwen2"       },
+    { LLM_ARCH_GRANITE,          "granite"          },
+    { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
+    { LLM_ARCH_CHAMELEON,        "chameleon"        },
+    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
+};
+
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+    { LLM_KV_GENERAL_TYPE,                 "general.type"                          },
+    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
+    { LLM_KV_GENERAL_NAME,                 "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,              "general.version"                       },
+    { LLM_KV_GENERAL_URL,                  "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,              "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },
+
+    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
+    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
+    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
+    { LLM_KV_FEATURES_LENGTH,                   "%s.features_length"                   },
+    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
+    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
+    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
+    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"                 },
+    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"               },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
+    { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
+    { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
+    { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
+    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
+    { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
+    { LLM_KV_ATTN_LOGIT_SOFTCAPPING,            "%s.attn_logit_softcapping"            },
+    { LLM_KV_FINAL_LOGIT_SOFTCAPPING,           "%s.final_logit_softcapping"           },
+    { LLM_KV_SWIN_NORM,                         "%s.swin_norm"                         },
+    { LLM_KV_RESCALE_EVERY_N_LAYERS,            "%s.rescale_every_n_layers"            },
+    { LLM_KV_TIME_MIX_EXTRA_DIM,                "%s.time_mix_extra_dim"                },
+    { LLM_KV_TIME_DECAY_EXTRA_DIM,              "%s.time_decay_extra_dim"              },
+    { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
+    { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
+    { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
+
+    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
+    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
+    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
+    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
+    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS,          "%s.attention.group_norm_epsilon"     },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,       "%s.attention.group_norm_groups"      },
+    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
+    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
+    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
+
+    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
+    { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base"                       },
+    { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"                    },
+    { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"                    },
+    { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,  "%s.rope.scaling.attn_factor"             },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier"     },
+
+    { LLM_KV_SPLIT_NO,            "split.no"            },
+    { LLM_KV_SPLIT_COUNT,         "split.count"         },
+    { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
+    { LLM_KV_SSM_CONV_KERNEL,    "%s.ssm.conv_kernel"    },
+    { LLM_KV_SSM_INNER_SIZE,     "%s.ssm.inner_size"     },
+    { LLM_KV_SSM_STATE_SIZE,     "%s.ssm.state_size"     },
+    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS,     "%s.ssm.dt_b_c_rms"     },
+
+    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
+    { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+    { LLM_KV_POSNET_BLOCK_COUNT,      "%s.posnet.block_count"      },
+
+    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+    { LLM_KV_CONVNEXT_BLOCK_COUNT,      "%s.convnext.block_count"      },
+
+    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
+    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
+    { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE,           "tokenizer.ggml.token_type"               },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,     "tokenizer.ggml.token_type_count"         },
+    { LLM_KV_TOKENIZER_SCORES,               "tokenizer.ggml.scores"                   },
+    { LLM_KV_TOKENIZER_MERGES,               "tokenizer.ggml.merges"                   },
+    { LLM_KV_TOKENIZER_BOS_ID,               "tokenizer.ggml.bos_token_id"             },
+    { LLM_KV_TOKENIZER_EOS_ID,               "tokenizer.ggml.eos_token_id"             },
+    { LLM_KV_TOKENIZER_EOT_ID,               "tokenizer.ggml.eot_token_id"             },
+    { LLM_KV_TOKENIZER_EOM_ID,               "tokenizer.ggml.eom_token_id"             },
+    { LLM_KV_TOKENIZER_UNK_ID,               "tokenizer.ggml.unknown_token_id"         },
+    { LLM_KV_TOKENIZER_SEP_ID,               "tokenizer.ggml.seperator_token_id"       },
+    { LLM_KV_TOKENIZER_PAD_ID,               "tokenizer.ggml.padding_token_id"         },
+    { LLM_KV_TOKENIZER_CLS_ID,               "tokenizer.ggml.cls_token_id"             },
+    { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"            },
+    { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"            },
+    { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"            },
+    { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"         },
+    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
+    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap"     },
+    { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
+    { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"                 },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,      "tokenizer.chat_template.%s"              },
+    { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_PAD_ID,           "tokenizer.ggml.fim_pad_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_REP_ID,           "tokenizer.ggml.fim_rep_token_id"         },
+    { LLM_KV_TOKENIZER_FIM_SEP_ID,           "tokenizer.ggml.fim_sep_token_id"         },
+
+    { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
+    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+
+    // deprecated
+    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+};
+
+static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_LLAMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_DECI,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_FALCON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GROK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+        },
+    },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_POS_EMBD,        "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output"},
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_ACT,         "blk.%d.ffn.act" },
+            { LLM_TENSOR_POS_EMBD,        "position_embd" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm"},
+        },
+    },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_POS_EMBD,        "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_POS_EMBD,        "position_embd" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_CLS,             "cls" },
+            { LLM_TENSOR_CLS_OUT,         "cls.output" },
+        },
+    },
+    {
+        LLM_ARCH_NOMIC_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_CLS,             "cls" },
+        },
+    },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
+    {
+        LLM_ARCH_STABLELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN2VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN2MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
+    {
+        LLM_ARCH_PHI2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_PHI3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_PLAMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_CODESHELL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_ORION,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_INTERNLM2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MINICPM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+        },
+    },
+    {
+        LLM_ARCH_MINICPM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_A_NORM,      "blk.%d.attn_q_a_norm" },
+            { LLM_TENSOR_ATTN_KV_A_NORM,     "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_A,           "blk.%d.attn_q_a" },
+            { LLM_TENSOR_ATTN_Q_B,           "blk.%d.attn_q_b" },
+            { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+        },
+    },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
+    {
+        LLM_ARCH_STARCODER2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MAMBA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_SSM_IN,          "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D,      "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_X,           "blk.%d.ssm_x" },
+            { LLM_TENSOR_SSM_DT,          "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A,           "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D,           "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
+        },
+    },
+    {
+        LLM_ARCH_XVERSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_COMMAND_R,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+        },
+    },
+    {
+        LLM_ARCH_COHERE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_DBRX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_OLMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_OLMO2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_OLMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_OPENELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS,   "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FREQS,         "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,      "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
+    {
+        LLM_ARCH_DEEPSEEK2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_A_NORM,      "blk.%d.attn_q_a_norm" },
+            { LLM_TENSOR_ATTN_KV_A_NORM,     "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_A,           "blk.%d.attn_q_a" },
+            { LLM_TENSOR_ATTN_Q_B,           "blk.%d.attn_q_b" },
+            { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
+        },
+    },
+    {
+        LLM_ARCH_CHATGLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
+    {
+        LLM_ARCH_BITNET,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_SUB_NORM,      "blk.%d.attn_sub_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_SUB_NORM,       "blk.%d.ffn_sub_norm" },
+        },
+    },
+    {
+        LLM_ARCH_T5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+            { LLM_TENSOR_OUTPUT,               "output" },
+            { LLM_TENSOR_DEC_OUTPUT_NORM,      "dec.output_norm" },
+            { LLM_TENSOR_DEC_ATTN_NORM,        "dec.blk.%d.attn_norm" },
+            { LLM_TENSOR_DEC_ATTN_Q,           "dec.blk.%d.attn_q" },
+            { LLM_TENSOR_DEC_ATTN_K,           "dec.blk.%d.attn_k" },
+            { LLM_TENSOR_DEC_ATTN_V,           "dec.blk.%d.attn_v" },
+            { LLM_TENSOR_DEC_ATTN_OUT,         "dec.blk.%d.attn_o" },
+            { LLM_TENSOR_DEC_ATTN_REL_B,       "dec.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "dec.blk.%d.cross_attn_norm" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_Q,     "dec.blk.%d.cross_attn_q" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_K,     "dec.blk.%d.cross_attn_k" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_V,     "dec.blk.%d.cross_attn_v" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_OUT,   "dec.blk.%d.cross_attn_o" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
+            { LLM_TENSOR_DEC_FFN_NORM,         "dec.blk.%d.ffn_norm" },
+            { LLM_TENSOR_DEC_FFN_GATE,         "dec.blk.%d.ffn_gate" },
+            { LLM_TENSOR_DEC_FFN_DOWN,         "dec.blk.%d.ffn_down" },
+            { LLM_TENSOR_DEC_FFN_UP,           "dec.blk.%d.ffn_up" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM,      "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM,        "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q,           "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K,           "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V,           "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT,         "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B,       "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM,         "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE,         "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN,         "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP,           "enc.blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_T5ENCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+            { LLM_TENSOR_OUTPUT,               "output" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM,      "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM,        "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q,           "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K,           "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V,           "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT,         "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B,       "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM,         "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE,         "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN,         "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP,           "enc.blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_JAIS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        },
+    },
+    {
+        LLM_ARCH_NEMOTRON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_EXAONE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_RWKV6,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,                "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,           "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,               "output_norm" },
+            { LLM_TENSOR_OUTPUT,                    "output" },
+            { LLM_TENSOR_ATTN_NORM,                 "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2,               "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_TIME_MIX_W1,               "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2,               "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X,           "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_W,           "blk.%d.time_mix_lerp_w" },
+            { LLM_TENSOR_TIME_MIX_LERP_K,           "blk.%d.time_mix_lerp_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_V,           "blk.%d.time_mix_lerp_v" },
+            { LLM_TENSOR_TIME_MIX_LERP_R,           "blk.%d.time_mix_lerp_r" },
+            { LLM_TENSOR_TIME_MIX_LERP_G,           "blk.%d.time_mix_lerp_g" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED,       "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_FIRST,            "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY,            "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1,         "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2,         "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY,              "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE,            "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE,       "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE,             "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_LN,               "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT,           "blk.%d.time_mix_output" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_K,        "blk.%d.channel_mix_lerp_k" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_R,        "blk.%d.channel_mix_lerp_r" },
+            { LLM_TENSOR_CHANNEL_MIX_KEY,           "blk.%d.channel_mix_key" },
+            { LLM_TENSOR_CHANNEL_MIX_VALUE,         "blk.%d.channel_mix_value" },
+            { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,    "blk.%d.channel_mix_receptance" },
+        },
+    },
+    {
+        LLM_ARCH_RWKV6QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,                "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,               "output_norm" },
+            { LLM_TENSOR_OUTPUT,                    "output" },
+            { LLM_TENSOR_ATTN_NORM,                 "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W1,               "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2,               "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X,           "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED,       "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_FIRST,            "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY,            "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1,         "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2,         "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY,              "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE,            "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE,       "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE,             "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT,           "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM,                  "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,                  "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,                  "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,                    "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_CHAMELEON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+        },
+    },
+    {
+        LLM_ARCH_WAVTOKENIZER_DEC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_CONV1D,            "conv1d" },
+            { LLM_TENSOR_CONVNEXT_DW,       "convnext.%d.dw" },
+            { LLM_TENSOR_CONVNEXT_NORM,     "convnext.%d.norm" },
+            { LLM_TENSOR_CONVNEXT_PW1,      "convnext.%d.pw1" },
+            { LLM_TENSOR_CONVNEXT_PW2,      "convnext.%d.pw2" },
+            { LLM_TENSOR_CONVNEXT_GAMMA,    "convnext.%d.gamma" },
+            { LLM_TENSOR_OUTPUT_NORM,       "output_norm" },
+            { LLM_TENSOR_OUTPUT,            "output" },
+            { LLM_TENSOR_POS_NET_CONV1,     "posnet.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2,     "posnet.%d.conv2" },
+            { LLM_TENSOR_POS_NET_NORM,      "posnet.%d.norm" },
+            { LLM_TENSOR_POS_NET_NORM1,     "posnet.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2,     "posnet.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q,    "posnet.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K,    "posnet.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V,    "posnet.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT,  "posnet.%d.attn_output" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+};
+
+static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+    {LLM_TENSOR_TOKEN_EMBD,                 {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_POS_EMBD,                   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_NORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_OUTPUT_NORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_ENC_OUTPUT_NORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    {LLM_TENSOR_ROPE_FREQS,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+    {LLM_TENSOR_ROPE_FACTORS_LONG,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+    {LLM_TENSOR_ROPE_FACTORS_SHORT,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}},
+    {LLM_TENSOR_ATTN_Q,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_QKV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_QKV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_OUT,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_Q,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_K,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_V,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_OUT,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_FFN_GATE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_FFN_DOWN,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_FFN_UP,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_ATTN_OUT,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_FFN_GATE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_FFN_DOWN,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ENC_FFN_UP,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_INP_SHEXP,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_INP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_IN,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_X,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_DT,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_OUT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_W1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_W2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_DECAY_W1,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_DECAY_W2,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_KEY,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_VALUE,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_RECEPTANCE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_GATE,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_OUTPUT,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CHANNEL_MIX_KEY,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CHANNEL_MIX_VALUE,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_ACT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
+    {LLM_TENSOR_SSM_CONV1D,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+    {LLM_TENSOR_SSM_A,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_D,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_LERP_X,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CHANNEL_MIX_LERP_R,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_LERP_W,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_K,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_V,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_R,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_G,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_FUSED,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_DECAY,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_FIRST,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
+    {LLM_TENSOR_ATTN_NORM,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_NORM_2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_OUT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_POST_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_NORM,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_POST_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_NORM_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_Q_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_K_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_LAYER_OUT_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_Q_A_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_KV_A_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_SUB_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_SUB_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_ATTN_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_CROSS_ATTN_NORM,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_FFN_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ENC_ATTN_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ENC_FFN_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_DEC_ATTN_REL_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_ENC_ATTN_REL_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_FFN_DOWN_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_GATE_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    // this tensor is loaded for T5, but never used
+    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+    {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_CONV1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_CONV2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_ATTN_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_ATTN_Q,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_V,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_OUT,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_DW,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_CONVNEXT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CONVNEXT_PW1,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_PW2,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_GAMMA,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+};
+
+LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
+
+std::string LLM_KV::operator()(llm_kv kv) const {
+    return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
+        : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+}
+
+std::string LLM_TN_IMPL::str() const {
+    if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
+        return "__missing__";
+    }
+
+    std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid);
+
+    if (suffix != nullptr) {
+        name += ".";
+        name += suffix;
+    }
+
+    return name;
+}
+
+const char * llm_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+llm_arch llm_arch_from_string(const std::string & name) {
+    for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
+        if (kv.second == name) {
+            return kv.first;
+        }
+    }
+
+    return LLM_ARCH_UNKNOWN;
+}
+
+const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
+    return LLM_TENSOR_INFOS.at(tensor);
+}
diff --git a/src/llama-arch.h b/src/llama-arch.h
new file mode 100644
index 000000000..122fdcebe
--- /dev/null
+++ b/src/llama-arch.h
@@ -0,0 +1,402 @@
+#pragma once
+
+#include "ggml.h" // ggml_op
+
+#include <string>
+
+//
+// gguf constants (sync with gguf.py)
+//
+
+enum llm_arch {
+    LLM_ARCH_LLAMA,
+    LLM_ARCH_DECI,
+    LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
+    LLM_ARCH_GROK,
+    LLM_ARCH_GPT2,
+    LLM_ARCH_GPTJ,
+    LLM_ARCH_GPTNEOX,
+    LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
+    LLM_ARCH_REFACT,
+    LLM_ARCH_BERT,
+    LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
+    LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
+    LLM_ARCH_QWEN2,
+    LLM_ARCH_QWEN2MOE,
+    LLM_ARCH_QWEN2VL,
+    LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
+    LLM_ARCH_PLAMO,
+    LLM_ARCH_CODESHELL,
+    LLM_ARCH_ORION,
+    LLM_ARCH_INTERNLM2,
+    LLM_ARCH_MINICPM,
+    LLM_ARCH_MINICPM3,
+    LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
+    LLM_ARCH_STARCODER2,
+    LLM_ARCH_MAMBA,
+    LLM_ARCH_XVERSE,
+    LLM_ARCH_COMMAND_R,
+    LLM_ARCH_COHERE2,
+    LLM_ARCH_DBRX,
+    LLM_ARCH_OLMO,
+    LLM_ARCH_OLMO2,
+    LLM_ARCH_OLMOE,
+    LLM_ARCH_OPENELM,
+    LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
+    LLM_ARCH_DEEPSEEK2,
+    LLM_ARCH_CHATGLM,
+    LLM_ARCH_BITNET,
+    LLM_ARCH_T5,
+    LLM_ARCH_T5ENCODER,
+    LLM_ARCH_JAIS,
+    LLM_ARCH_NEMOTRON,
+    LLM_ARCH_EXAONE,
+    LLM_ARCH_RWKV6,
+    LLM_ARCH_RWKV6QWEN2,
+    LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
+    LLM_ARCH_CHAMELEON,
+    LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_UNKNOWN,
+};
+
+enum llm_kv {
+    LLM_KV_GENERAL_TYPE,
+    LLM_KV_GENERAL_ARCHITECTURE,
+    LLM_KV_GENERAL_QUANTIZATION_VERSION,
+    LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_NAME,
+    LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_VERSION,
+    LLM_KV_GENERAL_URL,
+    LLM_KV_GENERAL_DESCRIPTION,
+    LLM_KV_GENERAL_LICENSE,
+    LLM_KV_GENERAL_SOURCE_URL,
+    LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+    LLM_KV_VOCAB_SIZE,
+    LLM_KV_CONTEXT_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_FEATURES_LENGTH,
+    LLM_KV_BLOCK_COUNT,
+    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
+    LLM_KV_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+    LLM_KV_USE_PARALLEL_RESIDUAL,
+    LLM_KV_TENSOR_DATA_LAYOUT,
+    LLM_KV_EXPERT_COUNT,
+    LLM_KV_EXPERT_USED_COUNT,
+    LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_WEIGHTS_SCALE,
+    LLM_KV_EXPERT_WEIGHTS_NORM,
+    LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_POOLING_TYPE,
+    LLM_KV_LOGIT_SCALE,
+    LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_SWIN_NORM,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,
+
+    LLM_KV_ATTENTION_HEAD_COUNT,
+    LLM_KV_ATTENTION_HEAD_COUNT_KV,
+    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+    LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_KEY_LENGTH,
+    LLM_KV_ATTENTION_VALUE_LENGTH,
+    LLM_KV_ATTENTION_LAYERNORM_EPS,
+    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
+    LLM_KV_ATTENTION_CAUSAL,
+    LLM_KV_ATTENTION_Q_LORA_RANK,
+    LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
+    LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,
+
+    LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
+    LLM_KV_ROPE_FREQ_BASE,
+    LLM_KV_ROPE_SCALE_LINEAR,
+    LLM_KV_ROPE_SCALING_TYPE,
+    LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
+    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
+    LLM_KV_ROPE_SCALING_FINETUNED,
+    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+
+    LLM_KV_SPLIT_NO,
+    LLM_KV_SPLIT_COUNT,
+    LLM_KV_SPLIT_TENSORS_COUNT,
+
+    LLM_KV_SSM_INNER_SIZE,
+    LLM_KV_SSM_CONV_KERNEL,
+    LLM_KV_SSM_STATE_SIZE,
+    LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_DT_B_C_RMS,
+
+    LLM_KV_WKV_HEAD_SIZE,
+
+    LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
+    LLM_KV_TOKENIZER_LIST,
+    LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
+    LLM_KV_TOKENIZER_SCORES,
+    LLM_KV_TOKENIZER_MERGES,
+    LLM_KV_TOKENIZER_BOS_ID,
+    LLM_KV_TOKENIZER_EOS_ID,
+    LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
+    LLM_KV_TOKENIZER_UNK_ID,
+    LLM_KV_TOKENIZER_SEP_ID,
+    LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_CLS_ID,
+    LLM_KV_TOKENIZER_MASK_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_PREFIX,
+    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
+    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
+    LLM_KV_TOKENIZER_HF_JSON,
+    LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
+    LLM_KV_TOKENIZER_FIM_PRE_ID,
+    LLM_KV_TOKENIZER_FIM_SUF_ID,
+    LLM_KV_TOKENIZER_FIM_MID_ID,
+    LLM_KV_TOKENIZER_FIM_PAD_ID,
+    LLM_KV_TOKENIZER_FIM_REP_ID,
+    LLM_KV_TOKENIZER_FIM_SEP_ID,
+
+    LLM_KV_ADAPTER_TYPE,
+    LLM_KV_ADAPTER_LORA_ALPHA,
+
+    LLM_KV_POSNET_EMBEDDING_LENGTH,
+    LLM_KV_POSNET_BLOCK_COUNT,
+
+    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+    LLM_KV_CONVNEXT_BLOCK_COUNT,
+
+    // deprecated:
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
+};
+
+enum llm_tensor {
+    LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_TOKEN_TYPES,
+    LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
+    LLM_TENSOR_ATTN_Q,
+    LLM_TENSOR_ATTN_K,
+    LLM_TENSOR_ATTN_V,
+    LLM_TENSOR_ATTN_QKV,
+    LLM_TENSOR_ATTN_OUT,
+    LLM_TENSOR_ATTN_NORM,
+    LLM_TENSOR_ATTN_NORM_2,
+    LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
+    LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_GATE_INP_SHEXP,
+    LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
+    LLM_TENSOR_FFN_GATE,
+    LLM_TENSOR_FFN_DOWN,
+    LLM_TENSOR_FFN_UP,
+    LLM_TENSOR_FFN_ACT,
+    LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
+    LLM_TENSOR_FFN_GATE_EXP,
+    LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
+    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+    LLM_TENSOR_FFN_GATE_EXPS,
+    LLM_TENSOR_FFN_UP_EXPS,
+    LLM_TENSOR_FFN_DOWN_SHEXP,
+    LLM_TENSOR_FFN_GATE_SHEXP,
+    LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
+    LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_SSM_IN,
+    LLM_TENSOR_SSM_CONV1D,
+    LLM_TENSOR_SSM_X,
+    LLM_TENSOR_SSM_DT,
+    LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_D,
+    LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_TIME_MIX_W1,
+    LLM_TENSOR_TIME_MIX_W2,
+    LLM_TENSOR_TIME_MIX_LERP_X,
+    LLM_TENSOR_TIME_MIX_LERP_W,
+    LLM_TENSOR_TIME_MIX_LERP_K,
+    LLM_TENSOR_TIME_MIX_LERP_V,
+    LLM_TENSOR_TIME_MIX_LERP_R,
+    LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_LERP_FUSED,
+    LLM_TENSOR_TIME_MIX_FIRST,
+    LLM_TENSOR_TIME_MIX_DECAY,
+    LLM_TENSOR_TIME_MIX_DECAY_W1,
+    LLM_TENSOR_TIME_MIX_DECAY_W2,
+    LLM_TENSOR_TIME_MIX_KEY,
+    LLM_TENSOR_TIME_MIX_VALUE,
+    LLM_TENSOR_TIME_MIX_RECEPTANCE,
+    LLM_TENSOR_TIME_MIX_GATE,
+    LLM_TENSOR_TIME_MIX_LN,
+    LLM_TENSOR_TIME_MIX_OUTPUT,
+    LLM_TENSOR_CHANNEL_MIX_LERP_K,
+    LLM_TENSOR_CHANNEL_MIX_LERP_R,
+    LLM_TENSOR_CHANNEL_MIX_KEY,
+    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+    LLM_TENSOR_CHANNEL_MIX_VALUE,
+    LLM_TENSOR_ATTN_Q_A,
+    LLM_TENSOR_ATTN_Q_B,
+    LLM_TENSOR_ATTN_KV_A_MQA,
+    LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_Q_A_NORM,
+    LLM_TENSOR_ATTN_KV_A_NORM,
+    LLM_TENSOR_ATTN_SUB_NORM,
+    LLM_TENSOR_FFN_SUB_NORM,
+    LLM_TENSOR_DEC_ATTN_NORM,
+    LLM_TENSOR_DEC_ATTN_Q,
+    LLM_TENSOR_DEC_ATTN_K,
+    LLM_TENSOR_DEC_ATTN_V,
+    LLM_TENSOR_DEC_ATTN_OUT,
+    LLM_TENSOR_DEC_ATTN_REL_B,
+    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+    LLM_TENSOR_DEC_CROSS_ATTN_Q,
+    LLM_TENSOR_DEC_CROSS_ATTN_K,
+    LLM_TENSOR_DEC_CROSS_ATTN_V,
+    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+    LLM_TENSOR_DEC_FFN_NORM,
+    LLM_TENSOR_DEC_FFN_GATE,
+    LLM_TENSOR_DEC_FFN_DOWN,
+    LLM_TENSOR_DEC_FFN_UP,
+    LLM_TENSOR_DEC_OUTPUT_NORM,
+    LLM_TENSOR_ENC_ATTN_NORM,
+    LLM_TENSOR_ENC_ATTN_Q,
+    LLM_TENSOR_ENC_ATTN_K,
+    LLM_TENSOR_ENC_ATTN_V,
+    LLM_TENSOR_ENC_ATTN_OUT,
+    LLM_TENSOR_ENC_ATTN_REL_B,
+    LLM_TENSOR_ENC_FFN_NORM,
+    LLM_TENSOR_ENC_FFN_GATE,
+    LLM_TENSOR_ENC_FFN_DOWN,
+    LLM_TENSOR_ENC_FFN_UP,
+    LLM_TENSOR_ENC_OUTPUT_NORM,
+    LLM_TENSOR_CLS,
+    LLM_TENSOR_CLS_OUT,
+    LLM_TENSOR_CONV1D,
+    LLM_TENSOR_CONVNEXT_DW,
+    LLM_TENSOR_CONVNEXT_NORM,
+    LLM_TENSOR_CONVNEXT_PW1,
+    LLM_TENSOR_CONVNEXT_PW2,
+    LLM_TENSOR_CONVNEXT_GAMMA,
+    LLM_TENSOR_POS_NET_CONV1,
+    LLM_TENSOR_POS_NET_CONV2,
+    LLM_TENSOR_POS_NET_NORM,
+    LLM_TENSOR_POS_NET_NORM1,
+    LLM_TENSOR_POS_NET_NORM2,
+    LLM_TENSOR_POS_NET_ATTN_NORM,
+    LLM_TENSOR_POS_NET_ATTN_Q,
+    LLM_TENSOR_POS_NET_ATTN_K,
+    LLM_TENSOR_POS_NET_ATTN_V,
+    LLM_TENSOR_POS_NET_ATTN_OUT,
+};
+
+enum llm_tensor_layer {
+    LLM_TENSOR_LAYER_INPUT,
+    LLM_TENSOR_LAYER_REPEATING,
+    LLM_TENSOR_LAYER_OUTPUT,
+};
+
+struct LLM_KV {
+    LLM_KV(llm_arch arch, const char * suffix = nullptr);
+
+    llm_arch arch;
+    const char * suffix;
+
+    std::string operator()(llm_kv kv) const;
+};
+
+// helper to handle gguf constants
+// usage:
+//
+//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+//
+//   std::string name = tn(LLM_TENSOR_OUTPUT);                     -> "output"
+//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
+//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
+//
+struct LLM_TN_IMPL {
+    const llm_arch arch;
+    const llm_tensor tensor;
+    const char * const suffix;
+    const int bid;
+    const int xid;
+
+    std::string str() const;
+
+    operator std::string() const {
+        return str();
+    }
+
+    friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
+        return str == tn.str();
+    }
+
+    friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
+        return str != tn.str();
+    }
+};
+
+struct LLM_TN {
+    LLM_TN(llm_arch arch) : arch(arch) {}
+
+    llm_arch arch;
+
+    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
+        return { arch, tensor, suffix, bid, xid };
+    }
+
+    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
+        return { arch, tensor, nullptr, bid, xid };
+    }
+};
+
+
+struct llm_tensor_info {
+    llm_tensor_layer layer;
+    ggml_op op;
+};
+
+const char * llm_arch_name(llm_arch arch);
+
+llm_arch llm_arch_from_string(const std::string & name);
+
+const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
new file mode 100644
index 000000000..01d5ca57f
--- /dev/null
+++ b/src/llama-batch.cpp
@@ -0,0 +1,368 @@
+#include "llama-batch.h"
+
+#include <cstring>
+#include <algorithm>
+
+llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
+    // clear empty sequences
+    // the previous ubatch is assumed to be gone,
+    // so nothing should refer to values in these sequences anymore.
+    for (size_t i = seq.size(); i-- > 0;) {
+        if (seq[i].length == 0) {
+            seq.pop_back();
+        } else {
+            break;
+        }
+    }
+    ubatch_token.resize(!has_embd ? n_ubatch : 0);
+    ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
+    ubatch_pos.resize(n_ubatch);
+    ubatch_n_seq_id.resize(n_ubatch);
+    ubatch_seq_id.resize(n_ubatch);
+    ubatch_output.resize(n_ubatch);
+    llama_ubatch ubatch = {
+        /*equal_seqs   =*/ true,
+        /*n_tokens     =*/ 0,
+        /*n_seq_tokens =*/ 0,
+        /*n_seqs       =*/ 0,
+        /*token        =*/ !has_embd ? ubatch_token.data() : nullptr,
+        /*embd         =*/ has_embd  ? ubatch_embd.data()  : nullptr,
+        /*pos          =*/ ubatch_pos.data(),
+        /*n_seq_id     =*/ ubatch_n_seq_id.data(),
+        /*seq_id       =*/ ubatch_seq_id.data(),
+        /*output       =*/ ubatch_output.data(),
+    };
+    return ubatch;
+}
+
+void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
+    GGML_ASSERT(batch != nullptr);
+    GGML_ASSERT(length <= seq.length);
+    // Can only add sequences of equal lengths to a batch,
+    // otherwise it isn't clear to which sequence a token belongs
+    GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
+    GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
+    // NOTE: loops are separated for cache-friendliness
+    if (batch->token) {
+        if (ubatch.equal_seqs) {
+            for (size_t i = 0; i < length; ++i) {
+                ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
+            }
+        } else {
+            // simple split
+            ubatch.token = batch->token + seq.offset;
+        }
+    } else {
+        ubatch.token = nullptr;
+    }
+    if (batch->embd) {
+        if (ubatch.equal_seqs) {
+            for (size_t i = 0; i < length; ++i) {
+                memcpy(
+                        ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
+                        batch->embd + (n_embd * ids[seq.offset + i]),
+                        n_embd * sizeof(float)
+                      );
+            }
+        } else {
+            // simple split
+            ubatch.embd = batch->embd + (n_embd * seq.offset);
+        }
+    } else {
+        ubatch.embd = nullptr;
+    }
+    if (ubatch.equal_seqs) {
+        for (size_t i = 0; i < length; ++i) {
+            ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
+        }
+    } else {
+        // simple split
+        ubatch.pos = batch->pos + seq.offset;
+    }
+    if (ubatch.equal_seqs) {
+        ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
+        if (seq.seq_id) {
+            ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
+        }
+    } else {
+        // simple split
+        if (batch->n_seq_id) {
+            ubatch.n_seq_id = batch->n_seq_id + seq.offset;
+        } else {
+            for (size_t i = 0; i < length; ++i) {
+                ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
+            }
+        }
+        if (batch->seq_id) {
+            ubatch.seq_id = batch->seq_id + seq.offset;
+        }
+    }
+    if (logits_all) {
+        for (size_t i = 0; i < length; ++i) {
+            ubatch.output[ubatch.n_tokens + i] = 1;
+            out_ids.push_back(ids[seq.offset + i]);
+        }
+    } else if (batch->logits) {
+        if (ubatch.equal_seqs) {
+            for (size_t i = 0; i < length; ++i) {
+                size_t id = ids[seq.offset + i];
+                int8_t is_output = batch->logits[id];
+                ubatch.output[ubatch.n_tokens + i] = is_output;
+                if (is_output) { out_ids.push_back(id); }
+            }
+        } else {
+            // simple split
+            ubatch.output = batch->logits + seq.offset;
+            for (size_t i = 0; i < length; ++i) {
+                if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
+            }
+        }
+    } else {
+        // only get last output
+        for (size_t i = 0; i < length; ++i) {
+            size_t id = ids[seq.offset + i];
+            int8_t is_last = id == ids.size() - 1;
+            ubatch.output[ubatch.n_tokens + i] = is_last;
+            if (is_last) { out_ids.push_back(id); }
+        }
+    }
+    if (ubatch.n_tokens == 0 && ubatch.n_seqs == 0) {
+        ubatch.n_seq_tokens = ubatch.equal_seqs ? length : 1;
+    }
+    ubatch.n_tokens += length;
+    ubatch.n_seqs += ubatch.equal_seqs ? 1 : length; // virtual sequences for simple splits
+    seq.offset += length;
+    seq.length -= length;
+    n_tokens -= length;
+    GGML_ASSERT(ubatch.n_tokens == ubatch.n_seq_tokens * ubatch.n_seqs);
+}
+
+llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
+    n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    ubatch.equal_seqs = false;
+    if (!seq.empty()) {
+        llama_sbatch_seq & s = seq[0];
+        size_t length = s.length < n_ubatch ? s.length : n_ubatch;
+        GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
+        add_seq_to_ubatch(ubatch, s, length);
+    }
+    return ubatch;
+}
+
+llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
+    n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    if (!seq.empty()) {
+        size_t length = 0;
+        size_t n_tokens_in_ubatch = 0;
+        GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
+                                          // smallest first, because it's easier to split this way;
+                                          // starting from the end to pop in constant time.
+        for (size_t i = seq.size(); i-- > 0;) {
+            llama_sbatch_seq & s = seq[i];
+            GGML_ASSERT(s.length > 0);
+            if (length == 0) {
+                length = s.length < n_ubatch ? s.length : n_ubatch;
+            }
+            add_seq_to_ubatch(ubatch, s, length);
+            n_tokens_in_ubatch += length;
+            // shared prompts can't be mixed with any of their sequences,
+            // so it's safer to compute them in their own ubatch
+            if (s.n_seq_id > 1) { break; }
+            // stop when there isn't enough space for another sequence
+            if (length + n_tokens_in_ubatch > n_ubatch) { break; }
+        }
+    }
+    return ubatch;
+}
+
+llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
+    n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    if (!seq.empty()) {
+        llama_sbatch_seq & s = seq[seq.size() - 1];
+        size_t length = s.length < n_ubatch ? s.length : n_ubatch;
+        GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
+        add_seq_to_ubatch(ubatch, s, length);
+    }
+    return ubatch;
+}
+
+void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+    GGML_ASSERT(batch.n_tokens >= 0);
+    this->batch = &batch;
+    this->n_embd = n_embd;
+    this->logits_all = logits_all;
+
+    n_tokens = batch.n_tokens;
+    ids.resize(n_tokens);
+    out_ids.clear();
+    // TODO: reserve out_ids and seq
+
+    for (size_t i = 0; i < n_tokens; ++i) {
+        ids[i] = i;
+    }
+    if (simple_split) {
+        seq.resize(1);
+        llama_sbatch_seq & s = seq[0];
+        s.n_seq_id = 0;
+        s.seq_id = nullptr;
+        s.offset = 0;
+        s.length = n_tokens;
+        return;
+    }
+    std::sort(ids.begin(), ids.end(),
+            [&batch](size_t a, size_t b) {
+                int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
+                int32_t n_seq_b = batch.n_seq_id ? batch.n_seq_id[b] : 1;
+                // sort by seq_id, then by pos
+                if (n_seq_a == n_seq_b) {
+                    if (batch.seq_id) {
+                        for (int32_t i = 0; i < n_seq_a; ++i) {
+                            llama_seq_id seq_id_a = batch.seq_id[a][i];
+                            llama_seq_id seq_id_b = batch.seq_id[b][i];
+                            // smaller seq_ids go first
+                            if (seq_id_a != seq_id_b) {
+                                return seq_id_a < seq_id_b;
+                            }
+                        }
+                    }
+                    // when all else is equal, sort by pos
+                    if (batch.pos) {
+                        return batch.pos[a] < batch.pos[b];
+                    }
+                    // no pos, sort by id
+                    return a < b;
+                }
+                // shared prompts go first
+                return n_seq_a > n_seq_b;
+            }
+    );
+    // init seq
+    llama_sbatch_seq * last_seq = nullptr;
+
+    for (size_t i = 0; i < n_tokens; ++i) {
+        const size_t bi = ids[i];
+        const int32_t n_seqs = batch.n_seq_id[bi];
+        llama_seq_id * seq_ids = batch.seq_id[bi];
+        if (last_seq != nullptr) {
+            bool same = n_seqs == last_seq->n_seq_id;
+            for (int32_t j = 0; same && j < n_seqs; ++j) {
+                if (seq_ids[j] != last_seq->seq_id[j]) {
+                    same = false;
+                }
+            }
+            if (same) {
+                last_seq->length += 1;
+                continue;
+            }
+        }
+        llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
+        seq.push_back(new_seq);
+        last_seq = &seq.back();
+    }
+    // keep shared prompts first at the end, then sort by length descending.
+    std::sort(seq.begin(), seq.end(),
+            [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
+                if (a.n_seq_id == b.n_seq_id) {
+                    return a.length > b.length;
+                }
+                return a.n_seq_id < b.n_seq_id;
+            }
+            );
+}
+
+llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
+    batch = in_batch;
+    GGML_ASSERT(batch.n_tokens > 0);
+    if (!batch.pos) {
+        pos.resize(batch.n_tokens);
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            pos[i] = i + p0;
+        }
+        batch.pos = pos.data();
+    }
+    if (!batch.n_seq_id) {
+        n_seq_id.resize(batch.n_tokens);
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            n_seq_id[i] = seq_id_0.size();
+        }
+        batch.n_seq_id = n_seq_id.data();
+    }
+    if (!batch.seq_id) {
+        seq_id.resize(batch.n_tokens + 1);
+        seq_id[batch.n_tokens] = NULL;
+        for (int32_t i = 0; i < batch.n_tokens; i++) {
+            seq_id[i] = seq_id_0.data();
+        }
+        batch.seq_id = seq_id.data();
+    }
+    if (!batch.logits) {
+        logits.resize(batch.n_tokens);
+        logits[logits.size() - 1] = true;
+        batch.logits = logits.data();
+    }
+}
+
+//
+// interface implementation
+//
+
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                 int32_t   n_tokens) {
+    return {
+        /*n_tokens       =*/ n_tokens,
+        /*tokens         =*/ tokens,
+        /*embd           =*/ nullptr,
+        /*pos            =*/ nullptr,
+        /*n_seq_id       =*/ nullptr,
+        /*seq_id         =*/ nullptr,
+        /*logits         =*/ nullptr,
+    };
+}
+
+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
+    llama_batch batch = {
+        /*n_tokens       =*/ 0,
+        /*tokens         =*/ nullptr,
+        /*embd           =*/ nullptr,
+        /*pos            =*/ nullptr,
+        /*n_seq_id       =*/ nullptr,
+        /*seq_id         =*/ nullptr,
+        /*logits         =*/ nullptr,
+    };
+
+    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
+    }
+
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+    for (int i = 0; i < n_tokens_alloc; ++i) {
+        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch.seq_id[n_tokens_alloc] = nullptr;
+
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens_alloc);
+
+    return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+    if (batch.token)    free(batch.token);
+    if (batch.embd)     free(batch.embd);
+    if (batch.pos)      free(batch.pos);
+    if (batch.n_seq_id) free(batch.n_seq_id);
+    if (batch.seq_id) {
+        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
+            free(batch.seq_id[i]);
+        }
+        free(batch.seq_id);
+    }
+    if (batch.logits)   free(batch.logits);
+}
diff --git a/src/llama-batch.h b/src/llama-batch.h
new file mode 100644
index 000000000..773c3808b
--- /dev/null
+++ b/src/llama-batch.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include "llama.h"
+
+#include <array>
+#include <vector>
+
+// very similar to llama_batch,
+// but has more metadata about sequences
+struct llama_ubatch {
+    bool equal_seqs;
+    // TODO: whole_seqs for embeddings?
+
+    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_seq_tokens; // tokens per sequence
+    uint32_t n_seqs;
+
+    llama_token  *  token;    // [n_tokens]
+    float        *  embd;     // [n_embd, n_tokens]
+    llama_pos    *  pos;      // [n_tokens]
+    int32_t      *  n_seq_id; // [n_seqs]
+    llama_seq_id ** seq_id;   // [n_seqs]
+    int8_t       *  output;   // [n_tokens]
+};
+
+struct llama_sbatch_seq {
+    int32_t n_seq_id;
+
+    llama_seq_id * seq_id;
+
+    size_t offset;
+    size_t length;
+};
+
+// sequence-length-aware batch splitting
+struct llama_sbatch {
+    // tokens left in this batch
+    size_t n_tokens;
+
+    size_t n_embd;
+
+    bool logits_all; // TODO: remove once lctx.logits_all is removed too
+
+    // sorted indices into the batch
+    std::vector<size_t> ids;
+    // batch indices of the output
+    std::vector<size_t> out_ids;
+    std::vector<llama_sbatch_seq> seq;
+
+    const llama_batch * batch = nullptr;
+
+    // buffers for the ubatch
+    std::vector<llama_token>    ubatch_token;
+    std::vector<float>          ubatch_embd;
+    std::vector<llama_pos>      ubatch_pos;
+    std::vector<int32_t>        ubatch_n_seq_id;
+    std::vector<llama_seq_id *> ubatch_seq_id;
+    std::vector<int8_t>         ubatch_output;
+
+    llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);
+
+    void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length);
+
+    // simple split, unknown number of sequences of unequal lengths
+    llama_ubatch split_simple(size_t n_ubatch);
+
+    // make batches of equal-length sequences
+    llama_ubatch split_equal(size_t n_ubatch);
+
+    // sequence-wise split
+    llama_ubatch split_seq(size_t n_ubatch);
+
+    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+};
+
+// temporary allocate memory for the input batch if needed
+struct llama_batch_allocr {
+    struct llama_batch batch;
+
+    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t>         logits;
+
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
+};
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
new file mode 100644
index 000000000..028a64794
--- /dev/null
+++ b/src/llama-chat.cpp
@@ -0,0 +1,587 @@
+#include "llama-chat.h"
+
+#include "llama.h"
+
+#include <map>
+#include <sstream>
+
+#if __cplusplus >= 202000L
+    #define LU8(x) (const char*)(u8##x)
+#else
+    #define LU8(x) u8##x
+#endif
+
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+    { "chatml",            LLM_CHAT_TEMPLATE_CHATML            },
+    { "llama2",            LLM_CHAT_TEMPLATE_LLAMA_2           },
+    { "llama2-sys",        LLM_CHAT_TEMPLATE_LLAMA_2_SYS       },
+    { "llama2-sys-bos",    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS   },
+    { "llama2-sys-strip",  LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+    { "mistral-v1",        LLM_CHAT_TEMPLATE_MISTRAL_V1        },
+    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
+    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
+    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
+    { "phi4",              LLM_CHAT_TEMPLATE_PHI_4             },
+    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
+    { "zephyr",            LLM_CHAT_TEMPLATE_ZEPHYR            },
+    { "monarch",           LLM_CHAT_TEMPLATE_MONARCH           },
+    { "gemma",             LLM_CHAT_TEMPLATE_GEMMA             },
+    { "orion",             LLM_CHAT_TEMPLATE_ORION             },
+    { "openchat",          LLM_CHAT_TEMPLATE_OPENCHAT          },
+    { "vicuna",            LLM_CHAT_TEMPLATE_VICUNA            },
+    { "vicuna-orca",       LLM_CHAT_TEMPLATE_VICUNA_ORCA       },
+    { "deepseek",          LLM_CHAT_TEMPLATE_DEEPSEEK          },
+    { "deepseek2",         LLM_CHAT_TEMPLATE_DEEPSEEK_2        },
+    { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3        },
+    { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R         },
+    { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3           },
+    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3         },
+    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGML_4         },
+    { "glmedge",           LLM_CHAT_TEMPLATE_GLMEDGE           },
+    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
+    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
+    { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
+    { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
+    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
+    { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
+};
+
+llm_chat_template llm_chat_template_from_str(const std::string & name) {
+    return LLM_CHAT_TEMPLATES.at(name);
+}
+
+llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
+    try {
+        return llm_chat_template_from_str(tmpl);
+    } catch (const std::out_of_range &) {
+        // ignore
+    }
+
+    auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
+        return tmpl.find(haystack) != std::string::npos;
+    };
+    if (tmpl_contains("<|im_start|>")) {
+        return tmpl_contains("<|im_sep|>")
+            ? LLM_CHAT_TEMPLATE_PHI_4
+            : LLM_CHAT_TEMPLATE_CHATML;
+    } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
+        if (tmpl_contains("[SYSTEM_PROMPT]")) {
+            return LLM_CHAT_TEMPLATE_MISTRAL_V7;
+        } else if (
+            // catches official 'v1' template
+            tmpl_contains("' [INST] ' + system_message")
+            // catches official 'v3' and 'v3-tekken' templates
+            || tmpl_contains("[AVAILABLE_TOOLS]")
+        ) {
+            // Official mistral 'v1', 'v3' and 'v3-tekken' templates
+            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+            if (tmpl_contains(" [INST]")) {
+                return LLM_CHAT_TEMPLATE_MISTRAL_V1;
+            } else if (tmpl_contains("\"[INST]\"")) {
+                return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
+            }
+            return LLM_CHAT_TEMPLATE_MISTRAL_V3;
+        } else {
+            // llama2 template and its variants
+            // [variant] support system message
+            // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+            bool support_system_message = tmpl_contains("<<SYS>>");
+            bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
+            bool strip_message = tmpl_contains("content.strip()");
+            if (strip_message) {
+                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+            } else if (add_bos_inside_history) {
+                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+            } else if (support_system_message) {
+                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
+            } else {
+                return LLM_CHAT_TEMPLATE_LLAMA_2;
+            }
+        }
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
+        return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
+        return LLM_CHAT_TEMPLATE_ZEPHYR;
+    } else if (tmpl_contains("bos_token + message['role']")) {
+        return LLM_CHAT_TEMPLATE_MONARCH;
+    } else if (tmpl_contains("<start_of_turn>")) {
+        return LLM_CHAT_TEMPLATE_GEMMA;
+    } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
+        // OrionStarAI/Orion-14B-Chat
+        return LLM_CHAT_TEMPLATE_ORION;
+    } else if (tmpl_contains("GPT4 Correct ")) {
+        // openchat/openchat-3.5-0106
+        return LLM_CHAT_TEMPLATE_OPENCHAT;
+    } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
+        // eachadea/vicuna-13b-1.1 (and Orca variant)
+        if (tmpl_contains("SYSTEM: ")) {
+            return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
+        }
+        return LLM_CHAT_TEMPLATE_VICUNA;
+    } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
+        // deepseek-ai/deepseek-coder-33b-instruct
+        return LLM_CHAT_TEMPLATE_DEEPSEEK;
+    } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
+        // CohereForAI/c4ai-command-r-plus
+        return LLM_CHAT_TEMPLATE_COMMAND_R;
+    } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA_3;
+    } else if (tmpl_contains("[gMASK]sop")) {
+        // chatglm3-6b
+        return LLM_CHAT_TEMPLATE_CHATGML_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGML_4;
+    } else if (tmpl_contains(LU8("<用户>"))) {
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        return LLM_CHAT_TEMPLATE_MINICPM;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+        return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
+    } else if (tmpl_contains(LU8("<｜Assistant｜>")) && tmpl_contains(LU8("<｜User｜>")) && tmpl_contains(LU8("<｜end▁of▁sentence｜>"))) {
+        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
+    } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
+        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+        // EXAONE-3.0-7.8B-Instruct
+        return LLM_CHAT_TEMPLATE_EXAONE_3;
+    } else if (tmpl_contains("rwkv-world")) {
+        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
+    } else if (tmpl_contains("<|start_of_role|>")) {
+        return LLM_CHAT_TEMPLATE_GRANITE;
+    } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+        return LLM_CHAT_TEMPLATE_GIGACHAT;
+    } else if (tmpl_contains("<|role_start|>")) {
+        return LLM_CHAT_TEMPLATE_MEGREZ;
+    }
+    return LLM_CHAT_TEMPLATE_UNKNOWN;
+}
+
+// Simple version of "llama_apply_chat_template" that only works with strings
+// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+int32_t llm_chat_apply_template(
+    llm_chat_template tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass) {
+    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    std::stringstream ss;
+    if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+        // Official mistral 'v7' template
+        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        for (auto message : chat) {
+            std::string role(message->role);
+            std::string content(message->content);
+            if (role == "system") {
+                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+            } else if (role == "user") {
+                ss << "[INST] " << content << "[/INST]";
+            }
+            else {
+                ss << " " << content << "</s>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
+            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
+            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
+        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+        std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
+        std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
+        bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
+        bool is_inside_turn = false;
+        for (auto message : chat) {
+            if (!is_inside_turn) {
+                ss << leading_space << "[INST]" << trailing_space;
+                is_inside_turn = true;
+            }
+            std::string role(message->role);
+            std::string content(message->content);
+            if (role == "system") {
+                ss << content << "\n\n";
+            } else if (role == "user") {
+                ss << content << leading_space << "[/INST]";
+            } else {
+                ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+                is_inside_turn = false;
+            }
+        }
+    } else if (
+            tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
+        // llama2 template and its variants
+        // [variant] support system message
+        // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+        bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
+        // [variant] add BOS inside history
+        bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
+        // [variant] trim spaces from the input message
+        bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
+        // construct the prompt
+        bool is_inside_turn = true; // skip BOS at the beginning
+        ss << "[INST] ";
+        for (auto message : chat) {
+            std::string content = strip_message ? trim(message->content) : message->content;
+            std::string role(message->role);
+            if (!is_inside_turn) {
+                is_inside_turn = true;
+                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+            }
+            if (role == "system") {
+                if (support_system_message) {
+                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                } else {
+                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                    ss << content << "\n";
+                }
+            } else if (role == "user") {
+                ss << content << " [/INST]";
+            } else {
+                ss << content << "</s>";
+                is_inside_turn = false;
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant<|im_sep|>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
+        // zephyr template
+        for (auto message : chat) {
+            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
+        // OrionStarAI/Orion-14B-Chat
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message support, we will merge it with user prompt
+                system_prompt = message->content;
+                continue;
+            } else if (role == "user") {
+                ss << "Human: ";
+                if (!system_prompt.empty()) {
+                    ss << system_prompt << "\n\n";
+                    system_prompt = "";
+                }
+                ss << message->content << "\n\nAssistant: </s>";
+            } else {
+                ss << message->content << "</s>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
+        // openchat/openchat-3.5-0106,
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "<|end_of_turn|>";
+            } else {
+                role[0] = toupper(role[0]);
+                ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
+            }
+        }
+        if (add_ass) {
+            ss << "GPT4 Correct Assistant:";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
+        // eachadea/vicuna-13b-1.1 (and Orca variant)
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // Orca-Vicuna variant uses a system prefix
+                if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
+                    ss << "SYSTEM: " << message->content << "\n";
+                } else {
+                    ss << message->content << "\n\n";
+                }
+            } else if (role == "user") {
+                ss << "USER: " << message->content << "\n";
+            } else if (role == "assistant") {
+                ss << "ASSISTANT: " << message->content << "</s>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "ASSISTANT:";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
+        // deepseek-ai/deepseek-coder-33b-instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content;
+            } else if (role == "user") {
+                ss << "### Instruction:\n" << message->content << "\n";
+            } else if (role == "assistant") {
+                ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "### Response:\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
+        // CohereForAI/c4ai-command-r-plus
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "user") {
+                ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "assistant") {
+                ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            }
+        }
+        if (add_ass) {
+            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
+        // Llama 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+        }
+        if (add_ass) {
+            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+        // chatglm3-6b
+        ss << "[gMASK]" << "sop";
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n " << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+        ss << "[gMASK]" << "<sop>";
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << LU8("<用户>");
+                ss << trim(message->content);
+                ss << "<AI>";
+            } else {
+                ss << trim(message->content);
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
+        // DeepSeek-V2
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << LU8("<｜end▁of▁sentence｜>");
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
+        // DeepSeek-V3
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << LU8("<｜User｜>") << message->content;
+            } else if (role == "assistant") {
+                ss << LU8("<｜Assistant｜>") << message->content << LU8("<｜end▁of▁sentence｜>");
+            }
+        }
+        if (add_ass) {
+            ss << LU8("<｜Assistant｜>");
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
+        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
+        // EXAONE-3.0-7.8B-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "user") {
+                ss << "[|user|]" << trim(message->content) << "\n";
+            } else if (role == "assistant") {
+                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+            }
+        }
+        if (add_ass) {
+            ss << "[|assistant|]";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
+        // this template requires the model to have "\n\n" as EOT token
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << "User: " << message->content << "\n\nAssistant:";
+            } else {
+                ss << message->content << "\n\n";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
+        // IBM Granite template
+        for (const auto & message : chat) {
+            std::string role(message->role);
+            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+            if (role == "assistant_tool_call") {
+                ss << "<|tool_call|>";
+            }
+            ss << message->content << "<|end_of_text|>\n";
+        }
+        if (add_ass) {
+            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
+    }  else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
+        // Megrez template
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
+        }
+
+        if (add_ass) {
+            ss << "<|role_start|>assistant<|role_end|>";
+        }
+    } else {
+        // template not supported
+        return -1;
+    }
+    dest = ss.str();
+    return dest.size();
+}
+
+// public interface
+
+int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+    auto it = LLM_CHAT_TEMPLATES.begin();
+    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+        output[i] = it->first.c_str();
+        std::advance(it, 1);
+    }
+    return (int32_t) LLM_CHAT_TEMPLATES.size();
+}
+
diff --git a/src/llama-chat.h b/src/llama-chat.h
new file mode 100644
index 000000000..2f6a0e3e2
--- /dev/null
+++ b/src/llama-chat.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <cstdint>
+
+enum llm_chat_template {
+    LLM_CHAT_TEMPLATE_CHATML,
+    LLM_CHAT_TEMPLATE_LLAMA_2,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+    LLM_CHAT_TEMPLATE_MISTRAL_V1,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_PHI_4,
+    LLM_CHAT_TEMPLATE_FALCON_3,
+    LLM_CHAT_TEMPLATE_ZEPHYR,
+    LLM_CHAT_TEMPLATE_MONARCH,
+    LLM_CHAT_TEMPLATE_GEMMA,
+    LLM_CHAT_TEMPLATE_ORION,
+    LLM_CHAT_TEMPLATE_OPENCHAT,
+    LLM_CHAT_TEMPLATE_VICUNA,
+    LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+    LLM_CHAT_TEMPLATE_DEEPSEEK,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_3,
+    LLM_CHAT_TEMPLATE_COMMAND_R,
+    LLM_CHAT_TEMPLATE_LLAMA_3,
+    LLM_CHAT_TEMPLATE_CHATGML_3,
+    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_GLMEDGE,
+    LLM_CHAT_TEMPLATE_MINICPM,
+    LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_RWKV_WORLD,
+    LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_MEGREZ,
+    LLM_CHAT_TEMPLATE_UNKNOWN,
+};
+
+struct llama_chat_message;
+
+llm_chat_template llm_chat_template_from_str(const std::string & name);
+
+llm_chat_template llm_chat_detect_template(const std::string & tmpl);
+
+int32_t llm_chat_apply_template(
+    llm_chat_template tmpl,
+    const std::vector<const llama_chat_message *> & chat,
+    std::string & dest, bool add_ass);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
new file mode 100644
index 000000000..671d2a81a
--- /dev/null
+++ b/src/llama-context.cpp
@@ -0,0 +1,1775 @@
+#include "llama-context.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <stdexcept>
+
+void llama_set_k_shift(struct llama_context & lctx) {
+    const int64_t kv_size = lctx.kv_self.size;
+
+    assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+
+    int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+    for (int i = 0; i < kv_size; ++i) {
+        data[i] = lctx.kv_self.cells[i].delta;
+    }
+}
+
+void llama_set_s_copy(struct llama_context & lctx) {
+    const int64_t kv_size = lctx.kv_self.size;
+
+    assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
+
+    int32_t * data = (int32_t *) lctx.inp_s_copy->data;
+
+    for (int i = 0; i < kv_size; ++i) {
+        data[i] = lctx.kv_self.cells[i].src;
+    }
+}
+
+// llama input
+
+static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+    return relative_bucket;
+}
+
+void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
+    //
+    // set input data
+    //
+
+    const auto & hparams = lctx.model.hparams;
+    const auto & cparams = lctx.cparams;
+    const auto & kv_self = lctx.kv_self;
+
+    if (ubatch.token) {
+        const int64_t n_tokens = ubatch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+    }
+
+    if (ubatch.embd) {
+        const int64_t n_embd   = hparams.n_embd;
+        const int64_t n_tokens = ubatch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+    }
+
+    if (ubatch.pos && lctx.inp_pos) {
+        const int64_t n_tokens = ubatch.n_tokens;
+        auto n_pos = lctx.n_pos_per_token;
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
+    }
+
+    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
+            }
+        }
+    }
+
+    GGML_ASSERT(
+        // (!a || b) is a logical implication (a -> b)
+        // !hparams.causal_attn -> !cparams.causal_attn
+        (hparams.causal_attn || !cparams.causal_attn) &&
+        "causal attention is not supported by this model"
+    );
+
+    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
+        // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
+        if (cparams.causal_attn && !lctx.is_encoding) {
+            const int64_t n_kv         = kv_self.n;
+            const int64_t n_tokens     = ubatch.n_tokens;
+            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+            const int64_t n_seqs       = ubatch.n_seqs;
+
+
+            float * data     = nullptr;
+            float * data_swa = nullptr;
+
+            if (lctx.inp_KQ_mask) {
+                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+                data = (float *) lctx.inp_KQ_mask->data;
+            }
+
+            if (lctx.inp_KQ_mask_swa) {
+                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
+                data_swa = (float *) lctx.inp_KQ_mask_swa->data;
+            }
+
+            // For causal attention, use only the previous KV cells
+            // of the correct sequence for each token of the ubatch.
+            // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+            for (int h = 0; h < 1; ++h) {
+                for (int s = 0; s < n_seqs; ++s) {
+                    const llama_seq_id seq_id = ubatch.seq_id[s][0];
+
+                    for (int j = 0; j < n_seq_tokens; ++j) {
+                        const llama_pos pos = ubatch.pos[s*n_seq_tokens + j];
+
+                        for (int i = 0; i < n_kv; ++i) {
+                            float f;
+                            if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                                f = -INFINITY;
+                            } else {
+                                if (hparams.use_alibi) {
+                                    f = -std::abs(kv_self.cells[i].pos - pos);
+                                } else {
+                                    f = 0.0f;
+                                }
+                            }
+
+                            if (data) {
+                                data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
+                            }
+
+                            // may need to cut off old tokens for sliding window
+                            if (data_swa) {
+                                if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
+                                    f = -INFINITY;
+                                }
+                                data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
+                            }
+                        }
+                    }
+                }
+
+                if (data) {
+                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                        for (int j = 0; j < n_kv; ++j) {
+                            data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                        }
+                    }
+                }
+
+                if (data_swa) {
+                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                        for (int j = 0; j < n_kv; ++j) {
+                            data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                        }
+                    }
+                }
+            }
+        } else {
+            const int64_t n_tokens     = ubatch.n_tokens;
+            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+            const int64_t n_seqs       = ubatch.n_seqs;
+            // when using kv cache, the mask needs to match the kv cache size
+            const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+            float * data = (float *) lctx.inp_KQ_mask->data;
+
+            for (int h = 0; h < 1; ++h) {
+                for (int s1 = 0; s1 < n_seqs; ++s1) {
+                    const llama_seq_id seq_id = ubatch.seq_id[s1][0];
+
+                    for (int j = 0; j < n_seq_tokens; ++j) {
+                        const int32_t tj = s1*n_seq_tokens + j;
+
+                        for (int s0 = 0; s0 < n_seqs; ++s0) {
+                            for (int i = 0; i < n_seq_tokens; ++i) {
+                                const int32_t ti = s0*n_seq_tokens + i;
+                                float f = -INFINITY;
+
+                                for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) {
+                                    if (ubatch.seq_id[s0][s] == seq_id) {
+                                        if (hparams.use_alibi) {
+                                            f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]);
+                                        } else {
+                                            f = 0.0f;
+                                        }
+                                        break;
+                                    }
+                                }
+
+                                data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
+                            }
+                        }
+
+                        for (int i = n_tokens; i < n_stride; ++i) {
+                            data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+        const int64_t n_tokens     = ubatch.n_tokens;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs       = ubatch.n_seqs;
+
+        GGML_ASSERT(lctx.inp_mean);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+
+        float * data = (float *) lctx.inp_mean->data;
+        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+
+        std::vector<uint64_t> sum(n_tokens, 0);
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+
+            sum[seq_id] += ubatch.n_seq_tokens;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
+            }
+        }
+    }
+
+    if (cparams.embeddings && (
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
+        const int64_t n_tokens     = ubatch.n_tokens;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs       = ubatch.n_seqs;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];
+
+                if (pos == 0) {
+                    data[seq_id] = s*n_seq_tokens + i;
+                }
+            }
+        }
+    }
+
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens     = ubatch.n_tokens;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_seqs       = ubatch.n_seqs;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];
+
+                if (pos >= last_pos[seq_id]) {
+                    last_pos[seq_id] = pos;
+                    last_row[seq_id] = s*n_seq_tokens + i;
+                }
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
+    if (kv_self.recurrent) {
+        const int64_t n_kv = kv_self.n;
+
+        if (lctx.inp_s_mask) {
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
+            float * data = (float *) lctx.inp_s_mask->data;
+
+            // clear unused states
+            for (int i = 0; i < n_kv; ++i) {
+                const uint32_t  cell_id = i + kv_self.head;
+                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
+
+                data[i] = (float) (kv_cell.src >= 0);
+
+                // only clear once
+                if (kv_cell.src < 0) {
+                    kv_cell.src = cell_id;
+                }
+            }
+        }
+
+        if (lctx.inp_s_copy) {
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
+            int32_t * data = (int32_t *) lctx.inp_s_copy->data;
+
+            // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+            for (uint32_t i = 0; i < n_kv; ++i) {
+                const uint32_t  cell_id = i + kv_self.head;
+                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
+
+                // prevent out-of-bound sources
+                if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) {
+                    kv_cell.src = cell_id;
+                }
+
+                data[i] = kv_cell.src;
+
+                // ensure copy only happens once
+                if (kv_cell.src != (int32_t) cell_id) {
+                    kv_cell.src = cell_id;
+                }
+            }
+        }
+    }
+
+    if (lctx.inp_pos_bucket) {
+        const int64_t n_tokens = ubatch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
+        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing
+
+        int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;
+
+        if (!lctx.is_encoding) {
+            const int64_t n_kv = kv_self.n;
+            for (int h = 0; h < 1; ++h) {
+                for (int j = 0; j < n_tokens; ++j) {
+                    for (int i = 0; i < n_kv; ++i) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+                    }
+                }
+            }
+        } else {
+            for (int h = 0; h < 1; ++h) {
+                for (int j = 0; j < n_tokens; ++j) {
+                    for (int i = 0; i < n_tokens; ++i) {
+                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+                    }
+                }
+            }
+        }
+    }
+
+    if (!lctx.is_encoding && lctx.inp_embd_enc) {
+        assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
+        assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());
+
+        ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
+    }
+
+    if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
+        const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
+        const int64_t n_tokens = ubatch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
+        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing
+
+        float * data = (float *) lctx.inp_KQ_mask_cross->data;
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                for (int i = 0; i < n_output_enc; ++i) {
+                    float f = -INFINITY;
+                    for (int s = 0; s < ubatch.n_seq_id[j]; ++s) {
+                        const llama_seq_id seq_id = ubatch.seq_id[j][s];
+                        if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
+                            f = 0.0f;
+                        }
+                    }
+                    data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
+                }
+            }
+
+            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                for (int j = 0; j < n_output_enc; ++j) {
+                    data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
+                }
+            }
+        }
+    }
+}
+
+// llama output
+
+size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+    const auto & cparams = lctx.cparams;
+    const auto & hparams = lctx.model.hparams;
+    const auto & vocab   = lctx.model.vocab;
+
+    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+
+    const auto n_batch = cparams.n_batch;
+    const auto n_vocab = vocab.n_tokens();
+    const auto n_embd  = hparams.n_embd;
+
+    // TODO: use a per-batch flag for logits presence instead
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+
+    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
+    const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
+
+    if (lctx.output_ids.empty()) {
+        // init, never resized afterwards
+        lctx.output_ids.resize(n_batch);
+    }
+
+    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
+    const size_t new_size  = (logits_size + embd_size) * sizeof(float);
+
+    // alloc only when more than the current capacity is required
+    // TODO: also consider shrinking the buffer
+    if (!lctx.buf_output || prev_size < new_size) {
+        if (lctx.buf_output) {
+#ifndef NDEBUG
+            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
+            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+            lctx.buf_output = nullptr;
+            lctx.logits = nullptr;
+            lctx.embd = nullptr;
+        }
+
+        auto * buft = ggml_backend_cpu_buffer_type();
+        // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
+        auto * output_dev = lctx.model.dev_output();
+        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
+        if (output_dev_host_buft) {
+            buft = output_dev_host_buft;
+        }
+        lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
+        if (lctx.buf_output == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
+            return 0;
+        }
+    }
+
+    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());
+
+    lctx.logits = has_logits ? output_base               : nullptr;
+    lctx.embd   = has_embd   ? output_base + logits_size : nullptr;
+
+    lctx.output_size = n_outputs_max;
+    lctx.logits_size = logits_size;
+    lctx.embd_size   = embd_size;
+
+    // set all ids as invalid (negative)
+    std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
+
+    ggml_backend_buffer_clear(lctx.buf_output.get(), 0);
+
+    lctx.n_outputs = 0;
+
+    return n_outputs_max;
+}
+
+void llama_output_reorder(struct llama_context & ctx) {
+    std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
+    if (!out_ids.empty()) {
+        const uint32_t n_vocab = ctx.model.vocab.n_tokens();
+        const uint32_t n_embd  = ctx.model.hparams.n_embd;
+
+        const int32_t n_outputs = ctx.n_outputs;
+        GGML_ASSERT((size_t) n_outputs == out_ids.size());
+
+        // TODO: is there something more efficient which also minimizes swaps?
+        // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
+        for (int32_t i = 0; i < n_outputs - 1; ++i) {
+            int32_t j_min = i;
+            for (int32_t j = i + 1; j < n_outputs; ++j) {
+                if (out_ids[j] < out_ids[j_min]) {
+                    j_min = j;
+                }
+            }
+            if (j_min == i) { continue; }
+            std::swap(out_ids[i], out_ids[j_min]);
+            if (ctx.logits_size > 0) {
+                for (uint32_t k = 0; k < n_vocab; k++) {
+                    std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]);
+                }
+            }
+            if (ctx.embd_size > 0) {
+                for (uint32_t k = 0; k < n_embd; k++) {
+                    std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]);
+                }
+            }
+        }
+        std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1);
+        for (int32_t i = 0; i < n_outputs; ++i) {
+            ctx.output_ids[out_ids[i]] = i;
+        }
+        out_ids.clear();
+    }
+}
+
+//
+// interface implementation
+//
+
+void llama_free(struct llama_context * ctx) {
+    delete ctx;
+}
+
+uint32_t llama_n_ctx(const struct llama_context * ctx) {
+    return ctx->cparams.n_ctx;
+}
+
+uint32_t llama_n_batch(const struct llama_context * ctx) {
+    return ctx->cparams.n_batch;
+}
+
+uint32_t llama_n_ubatch(const struct llama_context * ctx) {
+    return ctx->cparams.n_ubatch;
+}
+
+uint32_t llama_n_seq_max(const struct llama_context * ctx) {
+    return ctx->kv_self.size;
+}
+
+const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
+}
+
+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
+void llama_attach_threadpool(
+             struct llama_context * ctx,
+        ggml_threadpool_t   threadpool,
+        ggml_threadpool_t   threadpool_batch) {
+    ctx->threadpool       = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool       = nullptr;
+    ctx->threadpool_batch = nullptr;
+}
+
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
+    ctx->cparams.n_threads       = n_threads;
+    ctx->cparams.n_threads_batch = n_threads_batch;
+}
+
+int32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+
+    for (auto & backend : ctx->backends) {
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+        if (set_abort_callback_fn) {
+            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+        }
+    }
+}
+
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
+void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
+    ctx->cparams.causal_attn = causal_attn;
+}
+
+void llama_synchronize(struct llama_context * ctx) {
+    ggml_backend_sched_synchronize(ctx->sched.get());
+
+    // FIXME: if multiple single tokens are evaluated without a synchronization,
+    // the stats will be added to the prompt evaluation stats
+    // this should only happen when using batch size 1 to evaluate a batch
+
+    // add the evaluation to the stats
+    if (ctx->n_queued_tokens == 1) {
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
+        ctx->n_eval++;
+    } else if (ctx->n_queued_tokens > 1) {
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
+        ctx->n_p_eval += ctx->n_queued_tokens;
+    }
+
+    // get a more accurate load time, upon first eval
+    if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    ctx->n_queued_tokens = 0;
+    ctx->t_compute_start_us = 0;
+}
+
+float * llama_get_logits(struct llama_context * ctx) {
+    llama_synchronize(ctx);
+
+    // reorder logits for backward compatibility
+    // TODO: maybe deprecate this
+    llama_output_reorder(*ctx);
+
+    return ctx->logits;
+}
+
+float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
+    llama_synchronize(ctx);
+
+    try {
+        if (ctx->logits == nullptr) {
+            throw std::runtime_error("no logits");
+        }
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
+            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
+        }
+
+        if (j < 0) {
+            throw std::runtime_error(format("batch.logits[%d] != true", i));
+        }
+        if (j >= ctx->n_outputs) {
+            // This should not happen
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
+        }
+
+        return ctx->logits + j*ctx->model.vocab.n_tokens();
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+        GGML_ABORT("fatal error");
+#else
+        return nullptr;
+#endif
+    }
+}
+
+float * llama_get_embeddings(struct llama_context * ctx) {
+    llama_synchronize(ctx);
+
+    // reorder embeddings for backward compatibility
+    // TODO: maybe deprecate this
+    llama_output_reorder(*ctx);
+
+    return ctx->embd;
+}
+
+float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
+    llama_synchronize(ctx);
+
+    try {
+        if (ctx->embd == nullptr) {
+            throw std::runtime_error("no embeddings");
+        }
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
+            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
+        }
+
+        if (j < 0) {
+            throw std::runtime_error(format("batch.logits[%d] != true", i));
+        }
+        if (j >= ctx->n_outputs) {
+            // This should not happen
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
+        }
+
+        return ctx->embd + j*ctx->model.hparams.n_embd;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+        GGML_ABORT("fatal error");
+#else
+        return nullptr;
+#endif
+    }
+}
+
+float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
+    auto it = ctx->embd_seq.find(seq_id);
+    if (it == ctx->embd_seq.end()) {
+        return nullptr;
+    }
+
+    return it->second.data();
+}
+
+// llama state API
+
+// deprecated
+size_t llama_get_state_size(struct llama_context * ctx) {
+    return llama_state_get_size(ctx);
+}
+
+// deprecated
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    return llama_state_get_data(ctx, dst, -1);
+}
+
+// deprecated
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    return llama_state_set_data(ctx, src, -1);
+}
+
+// deprecated
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+}
+
+// deprecated
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+}
+
+// TODO: replace all non-fatal assertions with returned errors or exceptions
+struct llama_data_write {
+    virtual void write(const void * src, size_t size) = 0;
+    virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
+    virtual size_t get_size_written() = 0;
+    virtual ~llama_data_write() = default;
+
+    void write_string(const std::string & str) {
+        uint32_t str_size = str.size();
+
+        write(&str_size,  sizeof(str_size));
+        write(str.data(), str_size);
+    }
+
+    void write_model_info(const struct llama_context * ctx) {
+        const std::string arch_str = llm_arch_name(ctx->model.arch);
+        write_string(arch_str);
+        // TODO: add more model-specific info which should prevent loading the session file if not identical
+    }
+
+    //void write_rng(const std::mt19937 & rng) {
+    //    std::ostringstream rng_ss;
+    //    rng_ss << rng;
+
+    //    const std::string & rng_str = rng_ss.str();
+
+    //    write_string(rng_str);
+    //}
+
+    void write_output_ids(struct llama_context * ctx) {
+        llama_output_reorder(*ctx);
+
+        const uint32_t n_outputs = ctx->n_outputs;
+
+        std::vector<int32_t> output_pos;
+
+        const size_t    n_batch = ctx->cparams.n_batch;
+        const auto & output_ids = ctx->output_ids;
+
+        GGML_ASSERT(n_outputs <= ctx->output_size);
+
+        output_pos.resize(n_outputs);
+
+        // build a more compact representation of the output ids
+        for (size_t i = 0; i < n_batch; ++i) {
+            // map an output id to a position in the batch
+            int32_t pos = output_ids[i];
+            if (pos >= 0) {
+                GGML_ASSERT((uint32_t) pos < n_outputs);
+                output_pos[pos] = i;
+            }
+        }
+
+        write(&n_outputs, sizeof(n_outputs));
+
+        if (n_outputs) {
+            write(output_pos.data(), n_outputs * sizeof(int32_t));
+        }
+    }
+
+    void write_logits(const struct llama_context * ctx) {
+        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+
+        write(&logits_size, sizeof(logits_size));
+
+        if (logits_size) {
+            write(ctx->logits, logits_size * sizeof(float));
+        }
+    }
+
+    void write_embeddings(const struct llama_context * ctx) {
+        const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
+
+        write(&embeddings_size, sizeof(embeddings_size));
+
+        if (embeddings_size) {
+            write(ctx->embd, embeddings_size * sizeof(float));
+        }
+    }
+
+    void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
+        for (const auto & range : cell_ranges) {
+            for (uint32_t i = range.first; i < range.second; ++i) {
+                const auto & cell = kv_self.cells[i];
+                const llama_pos pos      = cell.pos;
+                const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
+
+                write(&pos,      sizeof(pos));
+                write(&n_seq_id, sizeof(n_seq_id));
+
+                if (n_seq_id) {
+                    for (auto seq_id : cell.seq_id) {
+                        write(&seq_id, sizeof(seq_id));
+                    }
+                }
+            }
+        }
+    }
+
+    void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
+        const struct llama_kv_cache & kv_self = ctx->kv_self;
+        const struct llama_hparams & hparams = ctx->model.hparams;
+
+        const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
+        const uint32_t n_layer = hparams.n_layer;
+
+        write(&v_trans, sizeof(v_trans));
+        write(&n_layer, sizeof(n_layer));
+
+        std::vector<uint8_t> tmp_buf;
+
+        // Iterate and write all the keys first, each row is a cell
+        // Get whole range at a time
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+            // Write key type
+            const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+            write(&k_type_i, sizeof(k_type_i));
+
+            // Write row size of key
+            const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+            write(&k_size_row, sizeof(k_size_row));
+
+            // Read each range of cells of k_size length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                const size_t buf_size = range_size * k_size_row;
+                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
+            }
+        }
+
+        if (!kv_self.v_trans) {
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+                // Write value type
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                write(&v_type_i, sizeof(v_type_i));
+
+                // Write row size of value
+                const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+                write(&v_size_row, sizeof(v_size_row));
+
+                // Read each range of cells of v_size length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t buf_size = range_size * v_size_row;
+                    write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
+                }
+            }
+        } else {
+            // When v is transposed, we also need the element size and get the element ranges from each row
+            const uint32_t kv_size = kv_self.size;
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+                // Write value type
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                write(&v_type_i, sizeof(v_type_i));
+
+                // Write element size
+                const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+                write(&v_size_el, sizeof(v_size_el));
+
+                // Write GQA embedding size
+                write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+                // For each row, we get the element values of each cell
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    // Read each range of cells of v_size_el length each into tmp_buf and write out
+                    for (const auto & range : cell_ranges) {
+                        const size_t range_size = range.second - range.first;
+                        const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                        const size_t buf_size = range_size * v_size_el;
+                        write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
+                    }
+                }
+            }
+        }
+    }
+
+    void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
+        const struct llama_kv_cache & kv_self = ctx->kv_self;
+        std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+        uint32_t cell_count = 0;
+
+        // Count the number of cells with the specified seq_id
+        // Find all the ranges of cells with this seq id (or all, when -1)
+        uint32_t cell_range_begin = kv_self.size;
+        for (uint32_t i = 0; i < kv_self.size; ++i) {
+            const auto & cell = kv_self.cells[i];
+            if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+                ++cell_count;
+                if (cell_range_begin == kv_self.size) {
+                    cell_range_begin = i;
+                }
+            } else {
+                if (cell_range_begin != kv_self.size) {
+                    cell_ranges.emplace_back(cell_range_begin, i);
+                    cell_range_begin = kv_self.size;
+                }
+            }
+        }
+        if (cell_range_begin != kv_self.size) {
+            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
+        }
+
+        // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+        uint32_t cell_count_check = 0;
+        for (const auto & range : cell_ranges) {
+            cell_count_check += range.second - range.first;
+        }
+        GGML_ASSERT(cell_count == cell_count_check);
+
+        write(&cell_count, sizeof(cell_count));
+
+        write_kv_cache_meta(kv_self, cell_ranges, seq_id);
+        write_kv_cache_data(ctx, cell_ranges);
+    }
+};
+
+struct llama_data_read {
+    virtual const uint8_t * read(size_t size) = 0;
+    virtual void read_to(void * dst, size_t size) = 0;
+    virtual size_t get_size_read() = 0;
+    virtual ~llama_data_read() = default;
+
+    void read_string(std::string & str) {
+        uint32_t str_size;
+        read_to(&str_size, sizeof(str_size));
+
+        str.assign((const char *) read(str_size), str_size);
+    }
+
+    // validate model information
+    void read_model_info(const struct llama_context * ctx) {
+        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);
+
+        std::string arch_str;
+        read_string(arch_str);
+        if (cur_arch_str != arch_str) {
+            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
+        }
+        // TODO: add more info which needs to be identical but which is not verified otherwise
+    }
+
+    //void read_rng(std::mt19937 & rng) {
+    //    std::string rng_str;
+    //    read_string(rng_str);
+
+    //    std::istringstream rng_ss(rng_str);
+    //    rng_ss >> rng;
+
+    //    if (rng_ss.fail()) {
+    //        throw std::runtime_error("failed to load RNG state");
+    //    }
+    //}
+
+    void read_output_ids(struct llama_context * ctx) {
+        std::vector<int32_t> output_pos;
+
+        uint32_t n_outputs;
+        read_to(&n_outputs, sizeof(n_outputs));
+
+        if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
+            throw std::runtime_error("could not reserve outputs");
+        }
+
+        if (n_outputs) {
+            output_pos.resize(n_outputs);
+            read_to(output_pos.data(), n_outputs * sizeof(int32_t));
+
+            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
+                int32_t id = output_pos[i];
+                if ((uint32_t) id >= ctx->cparams.n_batch) {
+                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
+                }
+                ctx->output_ids[id] = i;
+            }
+
+            ctx->n_outputs = n_outputs;
+        }
+    }
+
+    void read_logits(struct llama_context * ctx) {
+        uint64_t logits_size;
+        read_to(&logits_size, sizeof(logits_size));
+
+        if (ctx->logits_size < logits_size) {
+            throw std::runtime_error("logits buffer too small");
+        }
+
+        if (logits_size) {
+            read_to(ctx->logits, logits_size * sizeof(float));
+        }
+    }
+
+    void read_embeddings(struct llama_context * ctx) {
+        uint64_t embeddings_size;
+        read_to(&embeddings_size, sizeof(embeddings_size));
+
+        if (ctx->embd_size < embeddings_size) {
+            throw std::runtime_error("embeddings buffer too small");
+        }
+
+        if (embeddings_size) {
+            read_to(ctx->embd, embeddings_size * sizeof(float));
+        }
+    }
+
+    bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
+        struct llama_kv_cache & kv_self = ctx->kv_self;
+
+        if (dest_seq_id != -1) {
+            // single sequence
+
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+
+            llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
+            batch.n_tokens = cell_count;
+            batch.n_seq_tokens = cell_count;
+            batch.n_seqs = 1;
+
+            for (uint32_t i = 0; i < cell_count; ++i) {
+                llama_pos pos;
+                uint32_t n_seq_id;
+
+                read_to(&pos, sizeof(pos));
+                read_to(&n_seq_id, sizeof(n_seq_id));
+
+                if (n_seq_id != 0) {
+                    LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+                    return false;
+                }
+
+                batch.pos[i] = pos;
+            }
+            batch.n_seq_id[0] = 1;
+            batch.seq_id[0] = &dest_seq_id;
+            if (!llama_kv_cache_find_slot(kv_self, batch)) {
+                LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+                return false;
+            }
+
+            // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+            // Assume that this is one contiguous block of cells
+            GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+            GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+            GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
+        } else {
+            // whole KV cache restore
+
+            if (cell_count > kv_self.size) {
+                LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+                return false;
+            }
+
+            llama_kv_cache_clear(kv_self);
+
+            for (uint32_t i = 0; i < cell_count; ++i) {
+                llama_kv_cell & cell = kv_self.cells[i];
+
+                llama_pos pos;
+                uint32_t  n_seq_id;
+
+                read_to(&pos,      sizeof(pos));
+                read_to(&n_seq_id, sizeof(n_seq_id));
+
+                cell.pos = pos;
+
+                for (uint32_t j = 0; j < n_seq_id; ++j) {
+                    llama_seq_id seq_id;
+                    read_to(&seq_id, sizeof(seq_id));
+
+                    if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+                        LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+                        return false;
+                    }
+
+                    cell.seq_id.insert(seq_id);
+
+                    if (kv_self.recurrent) {
+                        int32_t & tail = kv_self.cells[seq_id].tail;
+                        if (tail != -1) {
+                            LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+                            return false;
+                        }
+                        tail = i;
+                    }
+                }
+            }
+
+            kv_self.head = 0;
+            kv_self.used = cell_count;
+        }
+
+        if (kv_self.recurrent) {
+            for (uint32_t i = 0; i < cell_count; ++i) {
+                uint32_t cell_id = kv_self.head + i;
+                // make sure the recurrent states will keep their restored state
+                kv_self.cells[cell_id].src = cell_id;
+            }
+        }
+
+        return true;
+    }
+
+    bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
+        const struct llama_hparams & hparams = ctx->model.hparams;
+        struct llama_kv_cache & kv_self = ctx->kv_self;
+        uint32_t v_trans;
+        uint32_t n_layer;
+        read_to(&v_trans, sizeof(v_trans));
+        read_to(&n_layer, sizeof(n_layer));
+
+        if (n_layer != hparams.n_layer) {
+            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+            return false;
+        }
+        if (cell_count > kv_self.size) {
+            LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
+            return false;
+        }
+        if (kv_self.v_trans != (bool) v_trans) {
+            LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+            return false;
+        }
+
+        // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+            // Read type of key
+            int32_t k_type_i_ref;
+            read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+            const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+            if (k_type_i != k_type_i_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+                return false;
+            }
+
+            // Read row size of key
+            uint64_t k_size_row_ref;
+            read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+            const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+            if (k_size_row != k_size_row_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+                return false;
+            }
+
+            if (cell_count) {
+                // Read and set the keys for the whole cell range
+                ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
+            }
+        }
+
+        if (!kv_self.v_trans) {
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+                // Read type of value
+                int32_t v_type_i_ref;
+                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                if (v_type_i != v_type_i_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                    return false;
+                }
+
+                // Read row size of value
+                uint64_t v_size_row_ref;
+                read_to(&v_size_row_ref, sizeof(v_size_row_ref));
+                const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+                if (v_size_row != v_size_row_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
+                    return false;
+                }
+
+                if (cell_count) {
+                    // Read and set the values for the whole cell range
+                    ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
+                }
+            }
+        } else {
+            // For each layer, read the values for each cell (transposed)
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+                // Read type of value
+                int32_t v_type_i_ref;
+                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                if (v_type_i != v_type_i_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                    return false;
+                }
+
+                // Read element size of value
+                uint32_t v_size_el_ref;
+                read_to(&v_size_el_ref, sizeof(v_size_el_ref));
+                const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+                if (v_size_el != v_size_el_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
+                    return false;
+                }
+
+                // Read GQA embedding size
+                uint32_t n_embd_v_gqa_ref;
+                read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+                if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
+                    return false;
+                }
+
+                if (cell_count) {
+                    // For each row in the transposed matrix, read the values for the whole cell range
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
+                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                    }
+                }
+            }
+        }
+        return true;
+    }
+
+    void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
+        uint32_t cell_count;
+        read_to(&cell_count, sizeof(cell_count));
+
+        bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
+
+        if (!res) {
+            if (seq_id == -1) {
+                llama_kv_cache_clear(ctx);
+            } else {
+                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
+            }
+            throw std::runtime_error("failed to restore kv cache");
+        }
+    }
+};
+
+struct llama_data_write_dummy : llama_data_write {
+    size_t size_written = 0;
+
+    llama_data_write_dummy() {}
+
+    void write(const void * /* src */, size_t size) override {
+        size_written += size;
+    }
+
+    void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_write_buffer : llama_data_write {
+    uint8_t * ptr;
+    size_t buf_size = 0;
+    size_t size_written = 0;
+
+    llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
+
+    void write(const void * src, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        memcpy(ptr, src, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        ggml_backend_tensor_get(tensor, ptr, offset, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_read_buffer : llama_data_read {
+    const uint8_t * ptr;
+    size_t buf_size = 0;
+    size_t size_read = 0;
+
+    llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
+
+    const uint8_t * read(size_t size) override {
+        const uint8_t * base_ptr = ptr;
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        ptr += size;
+        size_read += size;
+        buf_size -= size;
+        return base_ptr;
+    }
+
+    void read_to(void * dst, size_t size) override {
+        memcpy(dst, read(size), size);
+    }
+
+    size_t get_size_read() override {
+        return size_read;
+    }
+};
+
+struct llama_data_write_file : llama_data_write {
+    llama_file * file;
+    size_t size_written = 0;
+    std::vector<uint8_t> temp_buffer;
+
+    llama_data_write_file(llama_file * f) : file(f) {}
+
+    void write(const void * src, size_t size) override {
+        file->write_raw(src, size);
+        size_written += size;
+    }
+
+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        temp_buffer.resize(size);
+        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        write(temp_buffer.data(), temp_buffer.size());
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_read_file : llama_data_read {
+    llama_file * file;
+    size_t size_read = 0;
+    std::vector<uint8_t> temp_buffer;
+
+    llama_data_read_file(llama_file * f) : file(f) {}
+
+    void read_to(void * dst, size_t size) override {
+        file->read_raw(dst, size);
+        size_read += size;
+    }
+
+    const uint8_t * read(size_t size) override {
+        temp_buffer.resize(size);
+        read_to(temp_buffer.data(), size);
+        return temp_buffer.data();
+    }
+
+    size_t get_size_read() override {
+        return size_read;
+    }
+};
+
+/** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_write_file data_ctx(&file);
+ * llama_state_get_data_internal(ctx, data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_write_buffer data_ctx(buf.data(), max_size);
+ * llama_state_get_data_internal(ctx, data_ctx);
+ *
+*/
+static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
+    llama_synchronize(ctx);
+
+    data_ctx.write_model_info(ctx);
+
+    // copy outputs
+    data_ctx.write_output_ids(ctx);
+    data_ctx.write_logits(ctx);
+    data_ctx.write_embeddings(ctx);
+
+    data_ctx.write_kv_cache(ctx);
+
+    return data_ctx.get_size_written();
+}
+
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
+    llama_data_write_buffer data_ctx(dst, size);
+    try {
+        return llama_state_get_data_internal(ctx, data_ctx);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+// Returns the *actual* size of the state.
+// Intended to be used when saving to state to a buffer.
+size_t llama_state_get_size(struct llama_context * ctx) {
+    llama_data_write_dummy data_ctx;
+    try {
+        return llama_state_get_data_internal(ctx, data_ctx);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
+    llama_synchronize(ctx);
+
+    data_ctx.read_model_info(ctx);
+
+    // set outputs
+    data_ctx.read_output_ids(ctx);
+    data_ctx.read_logits(ctx);
+    data_ctx.read_embeddings(ctx);
+
+    data_ctx.read_kv_cache(ctx);
+
+    return data_ctx.get_size_read();
+}
+
+// Sets the state reading from the specified source address
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
+    llama_data_read_buffer data_ctx(src, size);
+    try {
+        return llama_state_set_data_internal(ctx, data_ctx);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+            LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size() - file.tell();
+
+        llama_data_read_file data_ctx(&file);
+        const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
+
+        if (n_read != n_state_size_cur) {
+            LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
+            return false;
+        }
+    }
+    return true;
+}
+
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
+        return false;
+    }
+}
+
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state using stream saving
+    llama_data_write_file data_ctx(&file);
+    llama_state_get_data_internal(ctx, data_ctx);
+
+    return true;
+}
+
+bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
+        return false;
+    }
+}
+
+static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
+    data_ctx.write_kv_cache(ctx, seq_id);
+
+    return data_ctx.get_size_written();
+}
+
+size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_data_write_dummy data_ctx;
+    return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+}
+
+size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    llama_data_write_buffer data_ctx(dst, size);
+    try {
+        return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
+    llama_synchronize(ctx);
+
+    data_ctx.read_kv_cache(ctx, dest_seq_id);
+
+    return data_ctx.get_size_read();
+}
+
+size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
+    llama_data_read_buffer data_ctx(src, size);
+    try {
+        return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(filepath, "wb");
+
+    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
+    file.write_u32(LLAMA_STATE_SEQ_VERSION);
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state using stream saving
+    llama_data_write_file data_ctx(&file);
+    llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+
+    const size_t res = file.tell();
+    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
+    return res;
+}
+
+static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(filepath, "rb");
+
+    // version checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
+            LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
+            return 0;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return 0;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t state_size = file.size() - file.tell();
+        llama_data_read_file data_ctx(&file);
+        const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
+        if (!nread) {
+            LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
+            return 0;
+        }
+        GGML_ASSERT(nread <= state_size);
+        GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
+    }
+
+    return file.tell();
+}
+
+size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
+    return ctx->model.tensors_by_name;
+}
diff --git a/src/llama-context.h b/src/llama-context.h
new file mode 100644
index 000000000..a9268b292
--- /dev/null
+++ b/src/llama-context.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-batch.h"
+#include "llama-cparams.h"
+#include "llama-model.h"
+#include "llama-kv-cache.h"
+#include "llama-adapter.h"
+
+#include "ggml-cpp.h"
+
+#include <map>
+#include <unordered_map>
+#include <vector>
+#include <set>
+
+struct llama_context {
+    llama_context(const llama_model & model)
+        : model(model)
+        , t_start_us(model.t_start_us)
+        , t_load_us(model.t_load_us) {}
+
+    const struct llama_model & model;
+
+    struct llama_cparams      cparams;
+    struct llama_sbatch       sbatch;  // TODO: revisit if needed
+    struct llama_kv_cache     kv_self;
+    struct llama_adapter_cvec cvec;
+
+    std::unordered_map<struct llama_adapter_lora *, float> lora;
+
+    std::vector<ggml_backend_ptr> backends;
+    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+
+    ggml_backend_t backend_cpu = nullptr;
+
+    ggml_threadpool_t threadpool       = nullptr;
+    ggml_threadpool_t threadpool_batch = nullptr;
+
+    bool has_evaluated_once = false;
+
+    mutable int64_t t_start_us;
+    mutable int64_t t_load_us;
+    mutable int64_t t_p_eval_us = 0;
+    mutable int64_t t_eval_us   = 0;
+
+    mutable int64_t t_compute_start_us = 0;
+    mutable int64_t n_queued_tokens = 0;
+
+    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+    mutable int32_t n_eval   = 0; // number of eval calls
+
+    // host buffer for the model output (logits and embeddings)
+    ggml_backend_buffer_ptr buf_output;
+
+    // decode output (2-dimensional array: [n_outputs][n_vocab])
+    size_t  logits_size = 0; // capacity (of floats) for logits
+    float * logits      = nullptr;
+
+    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
+    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
+
+    bool logits_all = false;
+
+    // embeddings output (2-dimensional array: [n_outputs][n_embd])
+    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+    size_t  embd_size = 0; // capacity (of floats) for embeddings
+    float * embd      = nullptr;
+
+    // sequence embeddings output (map of [n_embd] vectors)
+    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+    std::map<llama_seq_id, std::vector<float>> embd_seq;
+
+    // whether we are computing encoder output or decoder output
+    bool is_encoding = false;
+
+    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    // number of position id each token get, 1 for each token in most cases.
+    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+    int n_pos_per_token = 1;
+
+    // output of the encoder part of the encoder-decoder models
+    std::vector<float> embd_enc;
+    std::vector<std::set<llama_seq_id>> seq_ids_enc;
+
+    // memory buffers used to evaluate the model
+    std::vector<uint8_t> buf_compute_meta;
+    ggml_backend_sched_ptr sched;
+
+    ggml_abort_callback abort_callback      = nullptr;
+    void *              abort_callback_data = nullptr;
+
+    // input tensors
+    struct ggml_tensor * inp_tokens;        // I32 [n_batch]
+    struct ggml_tensor * inp_embd;          // F32 [n_embd, n_batch]
+    struct ggml_tensor * inp_pos;           // I32 [n_batch]
+    struct ggml_tensor * inp_out_ids;       // I32 [n_outputs]
+    struct ggml_tensor * inp_KQ_mask;       // F32 [kv_size, n_batch]
+    struct ggml_tensor * inp_KQ_mask_swa;   // F32 [kv_size, n_batch]
+    struct ggml_tensor * inp_K_shift;       // I32 [kv_size]
+    struct ggml_tensor * inp_mean;          // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls;           // I32 [n_batch]
+    struct ggml_tensor * inp_s_copy;        // I32 [kv_size]
+    struct ggml_tensor * inp_s_mask;        // F32 [1, n_kv]
+    struct ggml_tensor * inp_s_seq;         // I32 [n_kv, n_batch]
+    struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
+    struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
+    struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+};
+
+// TODO: make these methods of llama_context
+void llama_set_k_shift(struct llama_context & lctx);
+
+void llama_set_s_copy(struct llama_context & lctx);
+
+void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
+
+// Make sure enough space is available for outputs.
+// Returns max number of outputs for which space was reserved.
+size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
+
+// make the outputs have the same order they had in the user-provided batch
+void llama_output_reorder(struct llama_context & ctx);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp
new file mode 100644
index 000000000..28369be36
--- /dev/null
+++ b/src/llama-cparams.cpp
@@ -0,0 +1 @@
+#include "llama-cparams.h"
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
new file mode 100644
index 000000000..252012f3d
--- /dev/null
+++ b/src/llama-cparams.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.h"
+
+#include <cstdint>
+
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_ubatch;
+    uint32_t n_seq_max;
+    int      n_threads;       // number of threads to use for generation
+    int      n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    uint32_t n_ctx_orig_yarn;
+    // These hyperparameters are not exposed in GGUF, because all
+    // existing YaRN models use the same values for them.
+    float yarn_ext_factor;
+    float yarn_attn_factor;
+    float yarn_beta_fast;
+    float yarn_beta_slow;
+    float defrag_thold;
+
+    bool embeddings;
+    bool causal_attn;
+    bool offload_kqv;
+    bool flash_attn;
+    bool no_perf;
+
+    enum llama_pooling_type pooling_type;
+
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
+};
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
new file mode 100644
index 000000000..9b518d1ac
--- /dev/null
+++ b/src/llama-grammar.cpp
@@ -0,0 +1,1219 @@
+#include "llama-grammar.h"
+
+#include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-sampling.h"
+
+#include <cmath>
+#include <algorithm>
+#include <stdexcept>
+
+//
+// helpers
+//
+
+// NOTE: assumes valid utf8 (but checks for overrun)
+static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t  first_byte = static_cast<uint8_t>(*src);
+    uint8_t  highbits   = first_byte >> 4;
+    int      len        = lookup[highbits];
+    uint8_t  mask       = (1 << (8 - len)) - 1;
+    uint32_t value      = first_byte & mask;
+    const char * end    = src + len; // may overrun!
+    const char * pos    = src + 1;
+    for ( ; pos < end && *pos; pos++) {
+        value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+    }
+    return std::make_pair(value, pos);
+}
+
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8 partial_start) {
+    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
+    const char          * pos      = src.c_str();
+    std::vector<uint32_t> code_points;
+
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(src.size() + 1);
+    uint32_t value    = partial_start.value;
+    int      n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits   = first_byte >> 4;
+        n_remain   = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask  = (1 << (7 - n_remain)) - 1;
+        value = first_byte & mask;
+
+        ++pos;
+        while (*pos != 0 && n_remain > 0) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
+        }
+    }
+    code_points.push_back(0);
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
+}
+
+static bool is_digit_char(char c) {
+    return '0' <= c && c <= '9';
+}
+
+static bool is_word_char(char c) {
+    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
+}
+
+static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+    const char * pos   = src;
+    const char * end   = src + size;
+    uint32_t     value = 0;
+    for ( ; pos < end && *pos; pos++) {
+        value <<= 4;
+        char c = *pos;
+        if ('a' <= c && c <= 'f') {
+            value += c - 'a' + 10;
+        } else if ('A' <= c && c <= 'F') {
+            value += c - 'A' + 10;
+        } else if ('0' <= c && c <= '9') {
+            value += c - '0';
+        } else {
+            break;
+        }
+    }
+    if (pos != end) {
+        throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
+    }
+    return std::make_pair(value, pos);
+}
+
+static const char * parse_space(const char * src, bool newline_ok) {
+    const char * pos = src;
+    while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+            (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+        if (*pos == '#') {
+            while (*pos && *pos != '\r' && *pos != '\n') {
+                pos++;
+            }
+        } else {
+            pos++;
+        }
+    }
+    return pos;
+}
+
+static const char * parse_name(const char * src) {
+    const char * pos = src;
+    while (is_word_char(*pos)) {
+        pos++;
+    }
+    if (pos == src) {
+        throw std::runtime_error(std::string("expecting name at ") + src);
+    }
+    return pos;
+}
+
+static const char * parse_int(const char * src) {
+    const char * pos = src;
+    while (is_digit_char(*pos)) {
+        pos++;
+    }
+    if (pos == src) {
+        throw std::runtime_error(std::string("expecting integer at ") + src);
+    }
+    return pos;
+}
+
+static std::pair<uint32_t, const char *> parse_char(const char * src) {
+    if (*src == '\\') {
+        switch (src[1]) {
+            case 'x': return parse_hex(src + 2, 2);
+            case 'u': return parse_hex(src + 2, 4);
+            case 'U': return parse_hex(src + 2, 8);
+            case 't': return std::make_pair('\t', src + 2);
+            case 'r': return std::make_pair('\r', src + 2);
+            case 'n': return std::make_pair('\n', src + 2);
+            case '\\':
+            case '"':
+            case '[':
+            case ']':
+                      return std::make_pair(src[1], src + 2);
+            default:
+                      throw std::runtime_error(std::string("unknown escape at ") + src);
+        }
+    } else if (*src) {
+        return decode_utf8(src);
+    }
+    throw std::runtime_error("unexpected end of input");
+}
+
+static void print_grammar_char(FILE * file, uint32_t c) {
+    if (0x20 <= c && c <= 0x7f) {
+        fprintf(file, "%c", static_cast<char>(c));
+    } else {
+        // cop out of encoding UTF-8
+        fprintf(file, "<U+%04X>", c);
+    }
+}
+
+static bool is_char_element(llama_grammar_element elem) {
+    switch (elem.type) {
+        case LLAMA_GRETYPE_CHAR:           return true;
+        case LLAMA_GRETYPE_CHAR_NOT:       return true;
+        case LLAMA_GRETYPE_CHAR_ALT:       return true;
+        case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+        case LLAMA_GRETYPE_CHAR_ANY:       return true;
+        default:                           return false;
+    }
+}
+
+static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
+    for (auto elem : rule) {
+        switch (elem.type) {
+            case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
+            case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
+            case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
+            case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
+            case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
+            case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
+            case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
+            case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
+        }
+        switch (elem.type) {
+            case LLAMA_GRETYPE_END:
+            case LLAMA_GRETYPE_ALT:
+            case LLAMA_GRETYPE_RULE_REF:
+                fprintf(file, "(%u) ", elem.value);
+                break;
+            case LLAMA_GRETYPE_CHAR:
+            case LLAMA_GRETYPE_CHAR_NOT:
+            case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+            case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
+                fprintf(file, "(\"");
+                print_grammar_char(file, elem.value);
+                fprintf(file, "\") ");
+                break;
+        }
+    }
+    fprintf(file, "\n");
+}
+
+static void print_rule(
+        FILE     * file,
+        uint32_t   rule_id,
+        const llama_grammar_rule & rule,
+        const std::map<uint32_t, std::string> & symbol_id_names) {
+    if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+        throw std::runtime_error(
+            "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+    }
+    fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+    for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
+        llama_grammar_element elem = rule[i];
+        switch (elem.type) {
+            case LLAMA_GRETYPE_END:
+                throw std::runtime_error(
+                    "unexpected end of rule: " + std::to_string(rule_id) + "," +
+                    std::to_string(i));
+            case LLAMA_GRETYPE_ALT:
+                fprintf(file, "| ");
+                break;
+            case LLAMA_GRETYPE_RULE_REF:
+                fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+                break;
+            case LLAMA_GRETYPE_CHAR:
+                fprintf(file, "[");
+                print_grammar_char(file, elem.value);
+                break;
+            case LLAMA_GRETYPE_CHAR_NOT:
+                fprintf(file, "[^");
+                print_grammar_char(file, elem.value);
+                break;
+            case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                if (i == 0 || !is_char_element(rule[i - 1])) {
+                    throw std::runtime_error(
+                        "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+                        std::to_string(rule_id) + "," + std::to_string(i));
+                }
+                fprintf(file, "-");
+                print_grammar_char(file, elem.value);
+                break;
+            case LLAMA_GRETYPE_CHAR_ALT:
+                if (i == 0 || !is_char_element(rule[i - 1])) {
+                    throw std::runtime_error(
+                        "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+                        std::to_string(rule_id) + "," + std::to_string(i));
+                }
+                print_grammar_char(file, elem.value);
+                break;
+            case LLAMA_GRETYPE_CHAR_ANY:
+                fprintf(file, ".");
+                break;
+        }
+        if (is_char_element(elem)) {
+            switch (rule[i + 1].type) {
+                case LLAMA_GRETYPE_CHAR_ALT:
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ANY:
+                    break;
+                default:
+                    fprintf(file, "] ");
+            }
+        }
+    }
+    fprintf(file, "\n");
+}
+
+//
+// implementation
+//
+
+uint32_t llama_grammar_parser::get_symbol_id(const char * src, size_t len) {
+    uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
+    auto result = symbol_ids.emplace(std::string(src, len), next_id);
+    return result.first->second;
+}
+
+uint32_t llama_grammar_parser::generate_symbol_id(const std::string & base_name) {
+    uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
+    symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
+    return next_id;
+}
+
+void llama_grammar_parser::add_rule(uint32_t rule_id, const llama_grammar_rule & rule) {
+    if (rules.size() <= rule_id) {
+        rules.resize(rule_id + 1);
+    }
+    rules[rule_id] = rule;
+}
+
+const char * llama_grammar_parser::parse_alternates(
+        const char        * src,
+        const std::string & rule_name,
+        uint32_t            rule_id,
+        bool                is_nested) {
+    llama_grammar_rule rule;
+    const char * pos = parse_sequence(src, rule_name, rule, is_nested);
+    while (*pos == '|') {
+        rule.push_back({LLAMA_GRETYPE_ALT, 0});
+        pos = parse_space(pos + 1, true);
+        pos = parse_sequence(pos, rule_name, rule, is_nested);
+    }
+    rule.push_back({LLAMA_GRETYPE_END, 0});
+    add_rule(rule_id, rule);
+    return pos;
+}
+
+const char * llama_grammar_parser::parse_sequence(
+        const char         * src,
+        const std::string  & rule_name,
+        llama_grammar_rule & rule,
+        bool               is_nested) {
+    size_t last_sym_start = rule.size();
+    const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == rule.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,} -->  S S S (m times) S'
+            //            S'     ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'     ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'     ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'     ::= S |
+
+            llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
+            if (min_times == 0) {
+                rule.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            llama_grammar_rule rec_rule(prev_rule);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(prev_rule.size());
+                uint32_t rec_rule_id = generate_symbol_id( rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule( rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
+        while (*pos) {
+            if (*pos == '"') { // literal string
+                pos++;
+                last_sym_start = rule.size();
+                while (*pos != '"') {
+                    if (!*pos) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '[') { // char range(s)
+                pos++;
+                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+                if (*pos == '^') {
+                    pos++;
+                    start_type = LLAMA_GRETYPE_CHAR_NOT;
+                }
+                last_sym_start = rule.size();
+                while (*pos != ']') {
+                    if (!*pos) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    enum llama_gretype type = last_sym_start < rule.size()
+                        ? LLAMA_GRETYPE_CHAR_ALT
+                        : start_type;
+
+                    rule.push_back({type, char_pair.first});
+                    if (pos[0] == '-' && pos[1] != ']') {
+                        if (!pos[1]) {
+                            throw std::runtime_error("unexpected end of input");
+                        }
+                        auto endchar_pair = parse_char(pos + 1);
+                             pos          = endchar_pair.second;
+                        rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+                    }
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (is_word_char(*pos)) { // rule reference
+                const char * name_end    = parse_name(pos);
+                uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
+                pos = parse_space(name_end, is_nested);
+                last_sym_start = rule.size();
+                rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+            } else if (*pos == '(') { // grouping
+                // parse nested alternates into synthesized rule
+                pos = parse_space(pos + 1, true);
+                uint32_t sub_rule_id = generate_symbol_id(rule_name);
+                pos = parse_alternates(pos, rule_name, sub_rule_id, true);
+                last_sym_start = rule.size();
+                // output reference to synthesized rule
+                rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (*pos != ')') {
+                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '.') { // any char
+                last_sym_start = rule.size();
+                rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
+
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+                }
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);
+
+                int max_times = -1;
+
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
+            } else {
+                break;
+            }
+        }
+        return pos;
+    }
+
+const char * llama_grammar_parser::parse_rule(const char * src) {
+        const char * name_end = parse_name(src);
+        const char * pos      = parse_space(name_end, false);
+        size_t       name_len = name_end - src;
+        uint32_t     rule_id  = get_symbol_id(src, name_len);
+        const std::string name(src, name_len);
+
+        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
+            throw std::runtime_error(std::string("expecting ::= at ") + pos);
+        }
+        pos = parse_space(pos + 3, true);
+
+        pos = parse_alternates(pos, name, rule_id, false);
+
+        if (*pos == '\r') {
+            pos += pos[1] == '\n' ? 2 : 1;
+        } else if (*pos == '\n') {
+            pos++;
+        } else if (*pos) {
+            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
+        }
+        return parse_space(pos, true);
+    }
+
+bool llama_grammar_parser::parse(const char * src) {
+    try {
+        const char * pos = parse_space(src, true);
+        while (*pos) {
+            pos = parse_rule(pos);
+        }
+        // Validate the state to ensure that all rules are defined
+        for (const auto & rule : rules) {
+            if (rule.empty()) {
+                throw std::runtime_error("Undefined rule");
+            }
+            for (const auto & elem : rule) {
+                if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+                    // Ensure that the rule at that location exists
+                    if (elem.value >= rules.size() || rules[elem.value].empty()) {
+                        // Get the name of the rule that is missing
+                        for (const auto & kv : symbol_ids) {
+                            if (kv.second == elem.value) {
+                                throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
+        rules.clear();
+        return false;
+    }
+
+    return true;
+}
+
+void llama_grammar_parser::print(FILE * file) {
+    try {
+        std::map<uint32_t, std::string> symbol_id_names;
+        for (const auto & kv : symbol_ids) {
+            symbol_id_names[kv.second] = kv.first;
+        }
+        for (size_t i = 0, end = rules.size(); i < end; i++) {
+            // fprintf(file, "%zu: ", i);
+            // print_rule_binary(file, rules[i]);
+            print_rule(file, uint32_t(i), rules[i], symbol_id_names);
+            // fprintf(file, "\n");
+        }
+    } catch (const std::exception & err) {
+        fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
+    }
+}
+
+llama_grammar_stack llama_grammar_parser::c_rules() const {
+    llama_grammar_stack ret;
+    ret.reserve(rules.size());
+    for (const auto & rule : rules) {
+        ret.push_back(rule.data());
+    }
+    return ret;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_GRETYPE_END: return true;  // NOLINT
+        case LLAMA_GRETYPE_ALT: return true;  // NOLINT
+        default:                return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+        const llama_grammar_element * pos,
+        const uint32_t                chr) {
+    bool found            = false;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            found = found || pos->value == chr;
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return std::make_pair(found == is_positive_char, pos);
+}
+
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8      partial_utf8) {
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int      n_remain      = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low  = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+    if (low == 0) {
+        if (n_remain == 2) {
+            low = 1 << 11;
+        } else if (n_remain == 3) {
+            low = 1 << 16;
+        }
+    }
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            if (pos->value <= high && low <= pos[1].value) {
+                return is_positive_char;
+            }
+            pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            if (low <= pos->value && pos->value <= high) {
+                return is_positive_char;
+            }
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return !is_positive_char;
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stack  & stack,
+              llama_grammar_stacks & new_stacks) {
+    if (stack.empty()) {
+        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            new_stacks.emplace_back(stack);
+        }
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    switch (pos->type) {
+        case LLAMA_GRETYPE_RULE_REF: {
+            const size_t                  rule_id = static_cast<size_t>(pos->value);
+            const llama_grammar_element * subpos  = rules[rule_id].data();
+            do {
+                // init new stack without the top (pos)
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    // if this rule ref is followed by another element, add that to stack
+                    new_stack.push_back(pos + 1);
+                }
+                if (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // if alternate is nonempty, add to stack
+                    new_stack.push_back(subpos);
+                }
+                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                while (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // scan to end of alternate def
+                    subpos++;
+                }
+                if (subpos->type == LLAMA_GRETYPE_ALT) {
+                    // there's another alternate def of this rule to process
+                    subpos++;
+                } else {
+                    break;
+                }
+            } while (true);
+            break;
+        }
+        case LLAMA_GRETYPE_CHAR:
+        case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
+            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                // only add the stack if it's not a duplicate of one we already have
+                new_stacks.emplace_back(stack);
+            }
+            break;
+        default:
+            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+            // those
+            GGML_ABORT("fatal error");
+    }
+}
+
+static llama_grammar_candidates llama_grammar_reject_candidates(
+        const llama_grammar_rules      & rules,
+        const llama_grammar_stacks     & stacks,
+        const llama_grammar_candidates & candidates) {
+    GGML_ASSERT(!stacks.empty()); // REVIEW
+
+    if (candidates.empty()) {
+        return {};
+    }
+
+    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+    }
+
+    return rejects;
+}
+
+static bool llama_grammar_detect_left_recursion(
+        const llama_grammar_rules & rules,
+        size_t rule_index,
+        std::vector<bool> * rules_visited,
+        std::vector<bool> * rules_in_progress,
+        std::vector<bool> * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const llama_grammar_rule & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+
+    return false;
+}
+
+const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
+    return grammar->rules;
+}
+
+llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
+    return grammar->stacks;
+}
+
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar->stacks.size());
+
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        auto match = llama_grammar_match_char(stack.back(), chr);
+        if (match.first) {
+            const llama_grammar_element * pos = match.second;
+
+            // update top of stack to next element, if any
+            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+            if (!llama_grammar_is_end_of_sequence(pos)) {
+                new_stack.push_back(pos);
+            }
+            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
+        }
+    }
+
+    grammar->stacks = std::move(stacks_new);
+}
+
+llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
+        const llama_grammar_rules      & rules,
+        const llama_grammar_stack      & stack,
+        const llama_grammar_candidates & candidates) {
+
+    llama_grammar_candidates rejects;
+    rejects.reserve(candidates.size());
+
+    if (stack.empty()) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
+    const llama_grammar_element * stack_pos = stack.back();
+
+    llama_grammar_candidates next_candidates;
+    next_candidates.reserve(candidates.size());
+
+    for (const auto & tok : candidates) {
+        if (*tok.code_points == 0) {
+            // reached end of full codepoints in token, reject iff it ended in a partial sequence
+            // that cannot satisfy this position in grammar
+            if (tok.partial_utf8.n_remain != 0 &&
+                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+                rejects.push_back(tok);
+            }
+        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+        } else {
+            rejects.push_back(tok);
+        }
+    }
+
+    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+    // update top of stack to next element, if any
+    llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
+    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+        stack_after.push_back(stack_pos_after);
+    }
+    llama_grammar_stacks next_stacks;
+    llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+    for (const auto & tok : next_rejects) {
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+    }
+
+    return rejects;
+}
+
+////////////////////
+
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index) {
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    llama_grammar_rules vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+            return nullptr;
+        }
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    llama_grammar_stacks stacks;
+    pos = vec_rules[start_rule_index].data();
+    do {
+        llama_grammar_stack stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
+    return new llama_grammar {
+        vocab,
+        std::move(vec_rules),
+        std::move(stacks),
+        /* .partial_utf8 = */     {},
+        /* .lazy =*/              false,
+        /* .awaiting_trigger = */ false,
+        /* .trigger_buffer = */   "",
+        /* .trigger_tokens   = */ {},
+        /* .trigger_words    = */ {},
+    };
+}
+
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                              bool lazy,
+                     const char ** trigger_words,
+                            size_t num_trigger_words,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens) {
+    llama_grammar_parser parser;
+
+    // if there is a grammar, parse it
+    if (!parser.parse(grammar_str)) {
+        return nullptr;
+    }
+
+    // will be empty (default) if there are parse errors
+    if (parser.rules.empty()) {
+        fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+        return nullptr;
+    }
+
+    // Ensure that there is a "root" node.
+    if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
+        fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+        return nullptr;
+    }
+
+    std::vector<const llama_grammar_element *> grammar_rules(parser.c_rules());
+
+    const size_t n_rules = grammar_rules.size();
+    const size_t start_rule_index = parser.symbol_ids.at(grammar_root);
+
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    llama_grammar_rules vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = grammar_rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+            return nullptr;
+        }
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    llama_grammar_stacks stacks;
+    pos = vec_rules[start_rule_index].data();
+    do {
+        llama_grammar_stack stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    std::vector<llama_token>    vec_trigger_tokens;
+    std::vector<std::string> vec_trigger_words;
+    for (size_t i = 0; i < num_trigger_tokens; i++) {
+        GGML_ASSERT(trigger_tokens != nullptr);
+        vec_trigger_tokens.push_back(trigger_tokens[i]);
+    }
+    for (size_t i = 0; i < num_trigger_words; i++) {
+        GGML_ASSERT(trigger_words != nullptr);
+        vec_trigger_words.push_back(trigger_words[i]);
+    }
+
+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
+    return new llama_grammar {
+        vocab,
+        std::move(vec_rules),
+        std::move(stacks),
+        /* .partial_utf8 = */     {},
+        /* .lazy = */             lazy,
+        /* .awaiting_trigger = */ lazy,
+        /* .trigger_buffer = */   "",
+        std::move(vec_trigger_tokens),
+        std::move(vec_trigger_words),
+    };
+}
+
+void llama_grammar_free_impl(struct llama_grammar * grammar) {
+    if (grammar == nullptr) {
+        return;
+    }
+
+    delete grammar;
+}
+
+struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
+    llama_grammar * result = new llama_grammar {
+        grammar.vocab,
+        grammar.rules,
+        grammar.stacks,
+        grammar.partial_utf8,
+        grammar.lazy,
+        grammar.awaiting_trigger,
+        grammar.trigger_buffer,
+        grammar.trigger_tokens,
+        grammar.trigger_words,
+    };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
+                    if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
+                        result->stacks[is][ie] =  &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
+    GGML_ASSERT(grammar.vocab != nullptr);
+
+    if (grammar.awaiting_trigger) {
+        return;
+    }
+
+    bool allow_eog = false;
+    for (const auto & stack : grammar.stacks) {
+        if (stack.empty()) {
+            allow_eog = true;
+            break;
+        }
+    }
+
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    candidates_decoded.reserve(cur_p->size);
+
+    llama_grammar_candidates candidates_grammar;
+    candidates_grammar.reserve(cur_p->size);
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const llama_token id      = cur_p->data[i].id;
+        const std::string & piece = grammar.vocab->token_to_piece(id);
+
+        if (grammar.vocab->is_eog(id)) {
+            if (!allow_eog) {
+                cur_p->data[i].logit = -INFINITY;
+            }
+        } else if (piece.empty() || piece[0] == 0) {
+            cur_p->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+        }
+    }
+
+    const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
+        cur_p->data[reject.index].logit = -INFINITY;
+    }
+}
+
+void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
+    GGML_ASSERT(grammar.vocab != nullptr);
+
+    const auto & piece = grammar.vocab->token_to_piece(token);
+
+    if (grammar.awaiting_trigger) {
+        if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
+            grammar.awaiting_trigger = false;
+            grammar.trigger_buffer.clear();
+            llama_grammar_accept_str(grammar, piece);
+            LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
+            return;
+        } else {
+            // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
+            grammar.trigger_buffer += piece;
+            for (const auto & word : grammar.trigger_words) {
+                auto pos = grammar.trigger_buffer.find(word);
+                if (pos != std::string::npos) {
+                    grammar.awaiting_trigger = false;
+                    auto constrained_str = grammar.trigger_buffer.substr(pos);
+                    grammar.trigger_buffer.clear();
+                    llama_grammar_accept_str(grammar, constrained_str);
+                    LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
+                    return;
+                }
+            }
+            LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`) (buffer: `%s`)\n", token, piece.c_str(), grammar.trigger_buffer.c_str());
+            return;
+        }
+    }
+
+    if (grammar.vocab->is_eog(token)) {
+        for (const auto & stack : grammar.stacks) {
+            if (stack.empty()) {
+                return;
+            }
+        }
+        GGML_ABORT("fatal error");
+    }
+
+    llama_grammar_accept_str(grammar, piece);
+}
+
+void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
+    // Note terminating 0 in decoded string
+    const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
+    const auto & code_points = decoded.first;
+
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        llama_grammar_accept(&grammar, *it);
+    }
+
+    grammar.partial_utf8 = decoded.second;
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
+    }
+}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
new file mode 100644
index 000000000..252d54d4c
--- /dev/null
+++ b/src/llama-grammar.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#include "llama.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+struct llama_vocab;
+
+// grammar element type
+enum llama_gretype {
+    // end of rule definition
+    LLAMA_GRETYPE_END            = 0,
+
+    // start of alternate definition for rule
+    LLAMA_GRETYPE_ALT            = 1,
+
+    // non-terminal element: reference to rule
+    LLAMA_GRETYPE_RULE_REF       = 2,
+
+    // terminal element: character (code point)
+    LLAMA_GRETYPE_CHAR           = 3,
+
+    // inverse char(s) ([^a], [^a-b] [^abc])
+    LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+    // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+    // be an inclusive range ([a-z])
+    LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+    // modifies a preceding LLAMA_GRETYPE_CHAR or
+    // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+    LLAMA_GRETYPE_CHAR_ALT       = 6,
+
+    // any character (.)
+    LLAMA_GRETYPE_CHAR_ANY       = 7,
+};
+
+typedef struct llama_grammar_element {
+    enum llama_gretype type;
+    uint32_t           value; // Unicode code point or rule ID
+} llama_grammar_element;
+
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
+using llama_grammar_rule  = std::vector<      llama_grammar_element>;
+using llama_grammar_stack = std::vector<const llama_grammar_element *>;
+
+using llama_grammar_rules      = std::vector<llama_grammar_rule>;
+using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
+using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
+
+// TODO: remove, needed for tests atm
+const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
+      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
+
+std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const llama_grammar_rules      & rules,
+        const llama_grammar_stack      & stack,
+        const llama_grammar_candidates & candidates);
+
+struct llama_grammar_parser {
+    std::map<std::string, uint32_t> symbol_ids;
+
+    llama_grammar_rules rules;
+
+    llama_grammar_stack c_rules() const;
+
+    uint32_t get_symbol_id(const char * src, size_t len);
+    uint32_t generate_symbol_id(const std::string & base_name);
+
+    void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
+
+    const char * parse_alternates(
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested);
+
+    const char * parse_sequence(
+            const char         * src,
+            const std::string  & rule_name,
+            llama_grammar_rule & rule,
+            bool               is_nested);
+
+    const char * parse_rule(const char * src);
+
+    bool parse(const char * src);
+    void print(FILE * file);
+};
+
+struct llama_grammar {
+    // note: allow null vocab for testing (not great)
+    const llama_vocab * vocab;
+
+    const llama_grammar_rules  rules;  // TODO: shared ptr
+          llama_grammar_stacks stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+
+    // lazy grammars wait for trigger words or tokens before constraining the sampling.
+    // we still ahve trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+    // (useful e.g. for tool_choice=required)
+    bool                     lazy             = false;
+    bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
+    std::string              trigger_buffer;           // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<llama_token> trigger_tokens;           // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
+    std::vector<std::string> trigger_words;
+};
+
+//
+// internal API
+//
+
+// note: needed for tests (not great)
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index);
+
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                              bool lazy,
+                     const char ** trigger_words,
+                            size_t num_trigger_words,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens);
+
+void llama_grammar_free_impl(struct llama_grammar * grammar);
+
+struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
+
+// TODO: move the API below as member functions of llama_grammar
+void llama_grammar_apply_impl(
+        const struct llama_grammar & grammar,
+            llama_token_data_array * cur_p);
+
+void llama_grammar_accept_impl(
+              struct llama_grammar & grammar,
+                       llama_token   token);
+
+void llama_grammar_accept_str(
+              struct llama_grammar & grammar,
+                 const std::string & piece);
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
new file mode 100644
index 000000000..ea87b2953
--- /dev/null
+++ b/src/llama-hparams.cpp
@@ -0,0 +1,71 @@
+#include "llama-hparams.h"
+
+#include "ggml.h"
+
+uint32_t llama_hparams::n_head(uint32_t il) const {
+    if (il < n_layer) {
+        return n_head_arr[il];
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_head_kv(uint32_t il) const {
+    if (il < n_layer) {
+        return n_head_kv_arr[il];
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_ff(uint32_t il) const {
+    if (il < n_layer) {
+        return n_ff_arr[il];
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_gqa(uint32_t il) const {
+    const uint32_t n_head    = this->n_head(il);
+    const uint32_t n_head_kv = this->n_head_kv(il);
+
+    if (n_head_kv == 0) {
+        return 0;
+    }
+
+    return n_head/n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
+    const uint32_t n_head_kv = this->n_head_kv(il);
+
+    return n_embd_head_k * n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
+    const uint32_t n_head_kv = this->n_head_kv(il);
+
+    return n_embd_head_v * n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_k_s() const {
+    if (wkv_head_size != 0) {
+        // for RWKV models
+        return token_shift_count * n_embd;
+    }
+
+    // TODO: maybe support other convolution strides than 1
+    // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+}
+
+uint32_t llama_hparams::n_embd_v_s() const {
+    if (wkv_head_size != 0) {
+        // corresponds to RWKV's wkv_states size
+        return n_embd * wkv_head_size;
+    }
+
+    // corresponds to Mamba's ssm_states size
+    return ssm_d_state * ssm_d_inner;
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
new file mode 100644
index 000000000..1fe454103
--- /dev/null
+++ b/src/llama-hparams.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include "llama.h"
+
+#include <array>
+
+// bump if necessary
+#define LLAMA_MAX_LAYERS  512
+#define LLAMA_MAX_EXPERTS 256  // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
+
+struct llama_hparams_posnet {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams {
+    bool vocab_only;
+    bool rope_finetuned;
+    bool use_par_res;
+    bool swin_norm;
+
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_embd;
+    uint32_t n_embd_features = 0;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_swa = 0; // sliding window attention (SWA)
+    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;
+    uint32_t n_rel_attn_bkts = 0;
+
+    // for WavTokenizer
+    struct llama_hparams_posnet   posnet;
+    struct llama_hparams_convnext convnext;
+
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+
+    uint32_t n_layer_dense_lead = 0;
+    uint32_t n_lora_q           = 0;
+    uint32_t n_lora_kv          = 0;
+    uint32_t n_ff_exp           = 0;
+    uint32_t n_ff_shexp         = 0;
+    uint32_t n_expert_shared    = 0;
+    uint32_t n_norm_groups      = 0;
+
+    float    expert_weights_scale = 0.0;
+    bool     expert_weights_norm  = false;
+    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+    float f_norm_group_eps;
+
+    float f_attn_logit_softcapping  = 50.0f;
+    float f_final_logit_softcapping = 30.0f;
+
+    // for RWKV
+    uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim     = 0;
+    uint32_t time_decay_extra_dim   = 0;
+    uint32_t wkv_head_size          = 0;
+    uint32_t token_shift_count      = 2;
+
+    float    rope_attn_factor = 1.0f;
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float    rope_yarn_log_mul;
+
+    std::array<int, 4> rope_sections;
+
+    // for State Space Models
+    uint32_t ssm_d_conv  = 0;
+    uint32_t ssm_d_inner = 0;
+    uint32_t ssm_d_state = 0;
+    uint32_t ssm_dt_rank = 0;
+
+    bool ssm_dt_b_c_rms = false;
+
+    float f_clamp_kqv      = 0.0f;
+    float f_max_alibi_bias = 0.0f;
+    float f_logit_scale    = 0.0f;
+
+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale  = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
+    bool causal_attn   = true;
+    bool use_alibi     = false;
+    bool attn_soft_cap = false;
+
+    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
+    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+
+    enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
+    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
+    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
+
+    uint32_t n_head(uint32_t il = 0) const;
+
+    uint32_t n_head_kv(uint32_t il = 0) const;
+
+    uint32_t n_ff(uint32_t il = 0) const;
+
+    uint32_t n_gqa(uint32_t il = 0) const;
+
+    // dimension of key embeddings across all k-v heads
+    uint32_t n_embd_k_gqa(uint32_t il = 0) const;
+
+    // dimension of value embeddings across all k-v heads
+    uint32_t n_embd_v_gqa(uint32_t il = 0) const;
+
+    // dimension of the rolling state embeddings
+    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+    uint32_t n_embd_k_s() const;
+
+    // dimension of the recurrent state embeddings
+    uint32_t n_embd_v_s() const;
+};
+
+static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
+
diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp
new file mode 100644
index 000000000..6ec709dd3
--- /dev/null
+++ b/src/llama-impl.cpp
@@ -0,0 +1,167 @@
+#include "llama-impl.h"
+
+#include "gguf.h"
+#include "llama.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstdarg>
+#include <cstring>
+#include <vector>
+#include <sstream>
+
+struct llama_logger_state {
+    ggml_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+
+static llama_logger_state g_logger_state;
+
+time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+time_meas::~time_meas() {
+        if (t_start_us >= 0) {
+            t_acc += ggml_time_us() - t_start_us;
+        }
+    }
+
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
+
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = new char[len + 1];
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+void llama_log_internal(ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+    for (size_t i = 1; i < ne.size(); i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+    }
+    return buf;
+}
+
+std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+    }
+    return buf;
+}
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
diff --git a/src/llama-impl.h b/src/llama-impl.h
new file mode 100644
index 000000000..12d1fb082
--- /dev/null
+++ b/src/llama-impl.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "ggml.h" // for ggml_log_level
+
+#include <string>
+#include <vector>
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+//
+// logging
+//
+
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+void llama_log_internal        (ggml_log_level level, const char * format, ...);
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+
+#define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
+//
+// helpers
+//
+
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+struct time_meas {
+    time_meas(int64_t & t_acc, bool disable = false);
+    ~time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+// TODO: rename to llama_format ?
+LLAMA_ATTRIBUTE_FORMAT(1, 2)
+std::string format(const char * fmt, ...);
+
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
+std::string llama_format_tensor_shape(const struct ggml_tensor * t);
+
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
new file mode 100644
index 000000000..feffdf0de
--- /dev/null
+++ b/src/llama-kv-cache.cpp
@@ -0,0 +1,718 @@
+#include "llama-kv-cache.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-cparams.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <limits>
+#include <map>
+
+static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
+
+uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
+bool llama_kv_cache_init(
+             struct llama_kv_cache & cache,
+                 const llama_model & model,
+               const llama_cparams & cparams,
+                         ggml_type   type_k,
+                         ggml_type   type_v,
+                          uint32_t   kv_size,
+                              bool   offload) {
+    const struct llama_hparams & hparams = model.hparams;
+
+    const int32_t n_layer = hparams.n_layer;
+
+    cache.has_shift = false;
+
+    cache.recurrent = llama_model_is_recurrent(&model);
+    cache.v_trans   = !cache.recurrent && !cparams.flash_attn;
+    cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
+            __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift);
+
+    cache.head = 0;
+    cache.size = kv_size;
+    cache.used = 0;
+
+    cache.type_k = type_k;
+    cache.type_v = type_v;
+
+    cache.cells.clear();
+    cache.cells.resize(kv_size);
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = ctx;
+            cache.ctxs.emplace_back(ctx);
+            return ctx;
+        }
+        return it->second;
+    };
+
+    cache.k_l.reserve(n_layer);
+    cache.v_l.reserve(n_layer);
+
+    for (int i = 0; i < n_layer; i++) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
+        ggml_backend_buffer_type_t buft;
+        if (offload) {
+            auto * dev = model.dev_layer(i);
+            buft = ggml_backend_dev_buffer_type(dev);
+        } else {
+            buft = ggml_backend_cpu_buffer_type();
+        }
+        ggml_context * ctx = ctx_for_buft(buft);
+
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+            return false;
+        }
+
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        cache.k_l.push_back(k);
+        cache.v_l.push_back(v);
+    }
+
+    // allocate tensors and initialize the buffers to avoid NaNs in the padding
+    for (auto it : ctx_map) {
+        auto * buft = it.first;
+        auto * ctx  = it.second;
+
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        cache.bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
+           struct llama_kv_cache & cache,
+       const struct llama_ubatch & ubatch) {
+    const uint32_t n_tokens = ubatch.n_tokens;
+    const uint32_t n_seqs   = ubatch.n_seqs;
+    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
+
+    if (cache.recurrent) {
+        // For recurrent state architectures (like Mamba or RWKV),
+        // each cache cell can store the state for a whole sequence.
+        // A slot should be always be contiguous.
+
+        // can only process batches with an equal number of new tokens in each sequence
+        GGML_ASSERT(ubatch.equal_seqs);
+
+        int32_t min = cache.size - 1;
+        int32_t max = 0;
+
+        // everything should fit if all seq_ids are smaller than the max
+        for (uint32_t s = 0; s < n_seqs; ++s) {
+            const uint32_t n_seq_id = ubatch.n_seq_id[s];
+            for (uint32_t j = 0; j < n_seq_id; ++j) {
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];
+
+                if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
+                    // too big seq_id
+                    // TODO: would it be possible to resize the cache instead?
+                    LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
+                    return llama_kv_cache_slot_info_failed;
+                }
+                if (j > 0) {
+                    llama_kv_cell & seq = cache.cells[seq_id];
+                    if (seq.tail >= 0) {
+                        llama_kv_cell & cell = cache.cells[seq.tail];
+                        // clear cells from seq_ids that become shared
+                        // (should not normally happen, but let's handle it anyway)
+                        cell.seq_id.erase(seq_id);
+                        seq.tail = -1;
+                        if (cell.seq_id.empty()) {
+                            cell.pos = -1;
+                            cell.src = -1;
+                            cache.used -= 1;
+                        }
+                    }
+                }
+            }
+        }
+
+#ifndef NDEBUG
+        {
+            std::vector<int32_t> tails_verif;
+            tails_verif.assign(cache.size, -1);
+            for (uint32_t i = 0; i < cache.size; ++i) {
+                llama_kv_cell & cell = cache.cells[i];
+                for (llama_seq_id seq_id : cell.seq_id) {
+                    if (tails_verif[seq_id] != -1) {
+                        LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
+                    }
+                    tails_verif[seq_id] = i;
+                }
+            }
+            for (uint32_t i = 0; i < cache.size; ++i) {
+                if (tails_verif[i] != cache.cells[i].tail) {
+                    LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]);
+                }
+            }
+        }
+#endif
+
+        // find next empty cell
+        uint32_t next_empty_cell = cache.head;
+
+        for (uint32_t i = 0; i < cache.size; ++i) {
+            if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
+            llama_kv_cell & cell = cache.cells[next_empty_cell];
+            if (cell.is_empty()) { break; }
+            next_empty_cell += 1;
+        }
+
+        // find usable cell range
+        for (uint32_t s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+            llama_kv_cell & seq_meta = cache.cells[seq_id];
+            bool has_cell = false;
+            if (seq_meta.tail >= 0) {
+                llama_kv_cell & cell = cache.cells[seq_meta.tail];
+                GGML_ASSERT(cell.has_seq_id(seq_id));
+                // does this seq_id "own" the cell?
+                if (cell.seq_id.size() == 1) { has_cell = true; }
+            }
+            if (!has_cell) {
+                llama_kv_cell & empty_cell = cache.cells[next_empty_cell];
+                GGML_ASSERT(empty_cell.is_empty());
+                // copy old tail into the empty cell
+                if (seq_meta.tail >= 0) {
+                    llama_kv_cell & orig_cell = cache.cells[seq_meta.tail];
+                    empty_cell.pos = orig_cell.pos;
+                    empty_cell.src = orig_cell.src;
+                    orig_cell.seq_id.erase(seq_id);
+                    empty_cell.seq_id.insert(seq_id); // will be overwritten
+                }
+                seq_meta.tail = next_empty_cell;
+                // find next empty cell
+                if (s + 1 < n_seqs) {
+                    next_empty_cell += 1;
+                    for (uint32_t i = 0; i < cache.size; ++i) {
+                        if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
+                        llama_kv_cell & cell = cache.cells[next_empty_cell];
+                        if (cell.is_empty()) { break; }
+                        next_empty_cell += 1;
+                    }
+                }
+            }
+            if (min > seq_meta.tail) { min = seq_meta.tail; }
+            if (max < seq_meta.tail) { max = seq_meta.tail; }
+        }
+
+        // gather and re-order
+        for (uint32_t s = 0; s < n_seqs; ++s) {
+            int32_t dst_id = s + min;
+            int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
+            if (dst_id != src_id) {
+                llama_kv_cell & dst_cell = cache.cells[dst_id];
+                llama_kv_cell & src_cell = cache.cells[src_id];
+
+                std::swap(dst_cell.pos, src_cell.pos);
+                std::swap(dst_cell.src, src_cell.src);
+                std::swap(dst_cell.seq_id, src_cell.seq_id);
+
+                // swap tails (assuming they NEVER overlap)
+                for (const llama_seq_id seq_id : src_cell.seq_id) {
+                    cache.cells[seq_id].tail = src_id;
+                }
+                for (const llama_seq_id seq_id : dst_cell.seq_id) {
+                    cache.cells[seq_id].tail = dst_id;
+                }
+            }
+        }
+
+        // update the pos of the used seqs
+        for (uint32_t s = 0; s < n_seqs; ++s) {
+            const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+            int32_t cell_id = s + min;
+            llama_kv_cell & cell = cache.cells[cell_id];
+
+            if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
+                // What should happen when the pos backtracks or skips a value?
+                // Clearing the state mid-batch would require special-casing which isn't done.
+                LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
+                    __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
+            }
+            cell.pos = last_pos;
+            cell.seq_id.clear();
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];
+                cell.seq_id.insert(seq_id);
+                cache.cells[seq_id].tail = cell_id;
+            }
+        }
+
+        // allow getting the range of used cells, from head to head + n
+        cache.head = min;
+        cache.n    = max - min + 1;
+        cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
+            [](const llama_kv_cell& cell){ return !cell.is_empty(); });
+
+        // sanity check
+        return llama_kv_cache_slot_info(cache.n >= n_seqs);
+    }
+    // otherwise, one cell per token.
+
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
+        return llama_kv_cache_slot_info_failed;
+    }
+
+    uint32_t n_tested = 0;
+
+    while (true) {
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
+            cache.head = 0;
+            continue;
+        }
+
+        bool found = true;
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            if (cache.cells[cache.head + i].pos >= 0) {
+                found = false;
+                cache.head += i + 1;
+                n_tested   += i + 1;
+                break;
+            }
+        }
+
+        if (found) {
+            break;
+        }
+
+        if (n_tested >= cache.size) {
+            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            return llama_kv_cache_slot_info_failed;
+        }
+    }
+
+    for (uint32_t s = 0; s < n_seqs; s++) {
+        for (uint32_t i = 0; i < n_seq_tokens; ++i) {
+            uint32_t k = s*n_seq_tokens + i;
+            cache.cells[cache.head + k].pos = ubatch.pos[k];
+
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
+                cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
+            }
+        }
+    }
+
+    cache.used += n_tokens;
+
+    return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
+}
+
+uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size; i > 0; --i) {
+        const llama_kv_cell & cell = cache.cells[i - 1];
+
+        if (cell.pos >= 0 && !cell.is_empty()) {
+            return i;
+        }
+    }
+
+    return 0;
+}
+
+void llama_kv_cache_clear(struct llama_kv_cache & cache) {
+    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
+        cache.cells[i].pos = -1;
+        cache.cells[i].seq_id.clear();
+        cache.cells[i].src = -1;
+        cache.cells[i].tail = -1;
+    }
+    cache.head = 0;
+    cache.used = 0;
+
+    for (auto & buf : cache.bufs) {
+        ggml_backend_buffer_clear(buf.get(), 0);
+    }
+}
+
+bool llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    uint32_t new_head = cache.size;
+
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    // models like Mamba or RWKV can't have a state partially erased
+    if (cache.recurrent) {
+        if (seq_id >= (int64_t) cache.size) {
+            // could be fatal
+            return false;
+        }
+        if (0 <= seq_id) {
+            int32_t & tail_id = cache.cells[seq_id].tail;
+            if (tail_id >= 0) {
+                const llama_kv_cell & cell = cache.cells[tail_id];
+                // partial intersection is invalid
+                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                    return false;
+                }
+                // invalidate tails which will be cleared
+                if (p0 <= cell.pos && cell.pos < p1) {
+                    tail_id = -1;
+                }
+            }
+        } else {
+            // seq_id is negative, then the range should include everything or nothing
+            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+                return false;
+            }
+        }
+    }
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            if (seq_id < 0) {
+                cache.cells[i].seq_id.clear();
+            } else if (cache.cells[i].has_seq_id(seq_id)) {
+                cache.cells[i].seq_id.erase(seq_id);
+            } else {
+                continue;
+            }
+            if (cache.cells[i].is_empty()) {
+                // keep count of the number of used cells
+                if (cache.cells[i].pos >= 0) cache.used--;
+
+                cache.cells[i].pos = -1;
+                cache.cells[i].src = -1;
+                if (new_head == cache.size) new_head = i;
+            }
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
+
+    return true;
+}
+
+void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    if (cache.recurrent) {
+        if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
+            llama_kv_cell & tail_src = cache.cells[seq_id_src];
+            llama_kv_cell & tail_dst = cache.cells[seq_id_dst];
+            if (tail_dst.tail >= 0) {
+                // clear destination seq_id if it wasn't empty
+                llama_kv_cell & cell_dst = cache.cells[tail_dst.tail];
+
+                cell_dst.seq_id.erase(seq_id_dst);
+                tail_dst.tail = -1;
+                if (cell_dst.seq_id.empty()) {
+                    cell_dst.pos = -1;
+                    cell_dst.delta = -1;
+                    cell_dst.src = -1;
+                    cache.used -= 1;
+                }
+            }
+            if (tail_src.tail >= 0) {
+                llama_kv_cell & cell_src = cache.cells[tail_src.tail];
+
+                cell_src.seq_id.insert(seq_id_dst);
+                tail_dst.tail = tail_src.tail;
+            }
+        }
+
+        return;
+    }
+    // otherwise, this is the KV cache of a Transformer-like model
+
+    cache.head = 0;
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.recurrent && (llama_seq_id) i != seq_id) {
+            cache.cells[i].tail = -1;
+        }
+        if (!cache.cells[i].has_seq_id(seq_id)) {
+            if (cache.cells[i].pos >= 0) cache.used--;
+            cache.cells[i].pos = -1;
+            cache.cells[i].src = -1;
+            cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
+        } else {
+            cache.cells[i].seq_id.clear();
+            cache.cells[i].seq_id.insert(seq_id);
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
+}
+
+void llama_kv_cache_seq_add(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta) {
+    uint32_t new_head = cache.size;
+
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) return;
+
+    if (cache.recurrent) {
+        // for Mamba-like or RWKV models, only the pos needs to be shifted
+        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
+            const int32_t tail_id = cache.cells[seq_id].tail;
+            if (tail_id >= 0) {
+                llama_kv_cell & cell = cache.cells[tail_id];
+                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                    cell.pos += delta;
+                }
+            }
+        }
+        return;
+    }
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+            cache.cells[i].pos   += delta;
+            cache.cells[i].delta += delta;
+
+            if (cache.cells[i].pos < 0) {
+                if (!cache.cells[i].is_empty()) {
+                    cache.used--;
+                }
+                cache.cells[i].pos = -1;
+                cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) {
+                    new_head = i;
+                }
+            }
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
+}
+
+void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) return;
+
+    if (cache.recurrent) {
+        // for Mamba-like or RWKV models, only the pos needs to be changed
+        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
+            const int32_t tail_id = cache.cells[seq_id].tail;
+            if (tail_id >= 0) {
+                llama_kv_cell & cell = cache.cells[tail_id];
+                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                    cell.pos /= d;
+                }
+            }
+        }
+        return;
+    }
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos   /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
+llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    llama_pos result = 0;
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id)) {
+            result = std::max(result, cache.cells[i].pos);
+        }
+    }
+
+    return result;
+}
+
+void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
+    if (!cache.recurrent) {
+        cache.do_defrag = true;
+    }
+}
+
+int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) {
+    int result = 0;
+
+    for (uint32_t i = 0; i < kv.size; i++) {
+        result += kv.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) {
+    return kv.used;
+}
+
+bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) {
+    return kv.can_shift;
+}
+
+//
+// kv cache view
+//
+
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells            = */ 0,
+        /*.n_seq_max          = */ n_seq_max,
+        /*.token_count        = */ 0,
+        /*.used_cells         = */ llama_get_kv_cache_used_cells(kv),
+        /*.max_contiguous     = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells              = */ nullptr,
+        /*.cells_sequences    = */ nullptr,
+    };
+
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv) {
+    if (uint32_t(view->n_cells) < kv.size || view->cells == nullptr) {
+        view->n_cells = int32_t(kv.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = kv.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(kv.size); i++, c_curr++, cs_curr += view->n_seq_max) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_seq_max) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_seq_max; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != kv.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, kv.used, used_cells);
+    }
+}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
new file mode 100644
index 000000000..dca6f3998
--- /dev/null
+++ b/src/llama-kv-cache.h
@@ -0,0 +1,218 @@
+#pragma once
+
+#include "llama.h"
+
+#include "ggml-cpp.h"
+
+#include <set>
+#include <vector>
+
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta = 0;
+    int32_t   src   = -1; // used by recurrent state models to copy states
+    int32_t   tail  = -1;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+
+    bool is_empty() const {
+        return seq_id.empty();
+    }
+
+    bool is_same_seq(const llama_kv_cell & other) const {
+        return seq_id == other.seq_id;
+    }
+};
+
+// ring-buffer of cached KV data
+struct llama_kv_cache {
+    bool has_shift = false;
+    bool do_defrag = false;
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans   = true;  // the value tensor is transposed
+    bool can_shift = false;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    ggml_type type_k = GGML_TYPE_F16;
+    ggml_type type_v = GGML_TYPE_F16;
+
+    std::vector<llama_kv_cell> cells;
+
+    std::vector<struct ggml_tensor *> k_l; // per layer
+    std::vector<struct ggml_tensor *> v_l;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    size_t total_size() const {
+        size_t size = 0;
+        for (const auto & buf : bufs) {
+            size += ggml_backend_buffer_get_size(buf.get());
+        }
+
+        return size;
+    }
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos max_pos() const {
+        llama_pos max_pos = -1;
+        for (const auto & cell : cells) {
+            max_pos = std::max(max_pos, cell.pos);
+        }
+
+        return max_pos;
+    }
+};
+
+// a structure holds information about the slot found in llama_kv_cache_find_slot
+struct llama_kv_cache_slot_info {
+    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+    bool found = false;                       // the slot was found
+
+    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
+    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
+
+    operator bool() const { return found; }
+};
+
+// TODO: maybe not needed
+uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
+
+bool llama_kv_cache_init(
+        struct llama_kv_cache & cache,
+            const llama_model & model,
+          const llama_cparams & cparams,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                     uint32_t   kv_size,
+                         bool   offload);
+
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+// returns a structure holding information about the slot found
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
+struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
+           struct llama_kv_cache & cache,
+       const struct llama_ubatch & batch);
+
+// find how many cells are currently in use
+uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
+
+void llama_kv_cache_clear(struct llama_kv_cache & cache);
+
+bool llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1);
+
+void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1);
+
+void llama_kv_cache_seq_keep(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id);
+
+void llama_kv_cache_seq_add(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta);
+
+void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d);
+
+llama_pos llama_kv_cache_seq_pos_max(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id);
+
+void llama_kv_cache_defrag(struct llama_kv_cache & cache);
+
+int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
+
+int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
+
+bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
+
+//
+// kv cache view
+//
+
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
+
+void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
+
+//
+// kv cache restore
+//
+
+// saves the kv_cache state for future recovery.
+// used to rollback llama_kv_cache_find_slot changes.
+struct llama_kv_slot_restorer {
+    struct llama_kv_cache_state {
+        uint32_t head = 0;
+        uint32_t n    = 0;
+    } old_state;
+
+    // for non-recurrent models only
+    // list of slots to restore
+    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
+
+    bool do_restore = false;
+
+    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+        old_state.head = cache.head;
+        old_state.n    = cache.n;
+    }
+
+    // saves a slot information for future restoration
+    void save(const struct llama_kv_cache_slot_info & slot) {
+        if (slot) {
+            do_restore = true;
+            if (slot.boundaries.first != slot.boundaries.second) {
+                slot_boundaries.push_back(slot.boundaries);
+            }
+        }
+    }
+
+    // must be explicitly called to restore the kv_cache state
+    // and rollback changes from all llama_kv_cache_find_slot calls
+    void restore(struct llama_kv_cache & cache) {
+        if (do_restore) {
+            cache.head = old_state.head;
+            cache.n    = old_state.n;
+
+            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
+                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+            } else {
+                for (auto & slot : slot_boundaries) {
+                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                }
+            }
+        }
+    }
+};
+
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
new file mode 100644
index 000000000..b716630a8
--- /dev/null
+++ b/src/llama-mmap.cpp
@@ -0,0 +1,590 @@
+#include "llama-mmap.h"
+
+#include "llama-impl.h"
+
+#include "ggml.h"
+
+#include <cstring>
+#include <climits>
+#include <stdexcept>
+#include <cerrno>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+            #include <fcntl.h>
+        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
+
+// TODO: consider moving to llama-impl.h if needed in more places
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+// llama_file
+
+struct llama_file::impl {
+#if defined(_WIN32)
+    HANDLE fp_win32;
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                    NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %lx", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+    impl(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~impl() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
+    impl(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) const {
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, len, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error("unexpectedly reached end of file");
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, len, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
+        }
+    }
+
+    void write_u32(uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~impl() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#endif
+
+    FILE * fp;
+    size_t size;
+};
+
+llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::~llama_file() = default;
+
+size_t llama_file::tell() const { return pimpl->tell(); }
+size_t llama_file::size() const { return pimpl->size; }
+
+int llama_file::file_id() const {
+#ifdef _WIN32
+    return _fileno(pimpl->fp);
+#else
+#if defined(fileno)
+    return fileno(pimpl->fp);
+#else
+    return ::fileno(pimpl->fp);
+#endif
+#endif
+}
+
+void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
+void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+
+uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+
+void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
+void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
+
+// llama_mmap
+
+struct llama_mmap::impl {
+#ifdef _POSIX_MAPPED_FILES
+    std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
+    impl(struct llama_file * file, size_t prefetch, bool numa) {
+        size = file->size();
+        int fd = file->file_id();
+        int flags = MAP_SHARED;
+        if (numa) { prefetch = 0; }
+#ifdef __linux__
+        if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
+            LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
+                    strerror(errno));
+        }
+        if (prefetch) { flags |= MAP_POPULATE; }
+#endif
+        addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
+        }
+
+        if (prefetch > 0) {
+            if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
+                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+        if (numa) {
+            if (posix_madvise(addr, file->size(), POSIX_MADV_RANDOM)) {
+                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+
+        mapped_fragments.emplace_back(0, file->size());
+    }
+
+    static void align_range(size_t * first, size_t * last, size_t page_size) {
+        size_t offset_in_page = *first & (page_size - 1);
+        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+        *first += offset_to_page;
+
+        *last = *last & ~(page_size - 1);
+
+        if (*last <= *first) {
+            *last = *first;
+        }
+    }
+
+    void unmap_fragment(size_t first, size_t last) {
+        int page_size = sysconf(_SC_PAGESIZE);
+        align_range(&first, &last, page_size);
+        size_t len = last - first;
+
+        if (len == 0) {
+            return;
+        }
+
+        GGML_ASSERT(first % page_size == 0);
+        GGML_ASSERT(last % page_size == 0);
+        GGML_ASSERT(last > first);
+
+        void * next_page_start = (uint8_t *) addr + first;
+
+        if (munmap(next_page_start, len)) {
+            LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+        }
+
+        std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        for (const auto & frag : mapped_fragments) {
+            if (frag.first < first && frag.second > last) {
+                new_mapped_fragments.emplace_back(frag.first, first);
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first < first && frag.second > first) {
+                new_mapped_fragments.emplace_back(frag.first, first);
+            } else if (frag.first < last && frag.second > last) {
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first >= first && frag.second <= last) {
+            } else {
+                new_mapped_fragments.push_back(frag);
+            }
+        }
+        mapped_fragments = std::move(new_mapped_fragments);
+    }
+
+    ~impl() {
+        for (const auto & frag : mapped_fragments) {
+            if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+                LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+            }
+        }
+    }
+#elif defined(_WIN32)
+    impl(struct llama_file * file, size_t prefetch, bool numa) {
+        GGML_UNUSED(numa);
+
+        size = file->size();
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+
+        if (hMapping == NULL) {
+            DWORD error = GetLastError();
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        DWORD error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
+        }
+
+        if (prefetch > 0) {
+#if _WIN32_WINNT >= 0x602
+            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
+            pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
+
+            if (pPrefetchVirtualMemory) {
+                WIN32_MEMORY_RANGE_ENTRY range;
+                range.VirtualAddress = addr;
+                range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
+                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                    LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
+                            llama_format_win_err(GetLastError()).c_str());
+                }
+            }
+#else
+            throw std::runtime_error("PrefetchVirtualMemory unavailable");
+#endif
+        }
+    }
+
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
+    }
+
+    ~impl() {
+        if (!UnmapViewOfFile(addr)) {
+            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    impl(struct llama_file * file, size_t prefetch, bool numa) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);
+
+        throw std::runtime_error("mmap not supported");
+    }
+
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
+
+        throw std::runtime_error("mmap not supported");
+    }
+#endif
+
+    void * addr;
+    size_t size;
+};
+
+llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique<impl>(file, prefetch, numa)) {}
+llama_mmap::~llama_mmap() = default;
+
+size_t llama_mmap::size() const { return pimpl->size; }
+void * llama_mmap::addr() const { return pimpl->addr; }
+
+void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
+
+#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
+const bool llama_mmap::SUPPORTED  = true;
+#else
+const bool llama_mmap::SUPPORTED  = false;
+#endif
+
+// llama_mlock
+
+struct llama_mlock::impl {
+#ifdef _POSIX_MEMLOCK_RANGE
+    static size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    bool raw_lock(const void * addr, size_t size) const {
+        if (!mlock(addr, size)) {
+            return true;
+        }
+
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+        "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+        "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+        "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
+#endif
+
+        char* errmsg = std::strerror(errno);
+        bool suggest = (errno == ENOMEM);
+
+        struct rlimit lock_limit;
+        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
+            suggest = false;
+        }
+        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+            suggest = false;
+        }
+
+        LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+        return false;
+    }
+
+    static void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * ptr, size_t len) const {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(ptr, len)) {
+                return true;
+            }
+            if (tries == 2) {
+                LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                    len, size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            size_t increment = len + 1048576;
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    static void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
+            LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) const {
+        LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    static void raw_unlock(const void * addr, size_t len) {}
+#endif
+
+    impl() : addr(NULL), size(0), failed_already(false) {}
+
+    void init(void * ptr) {
+        GGML_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
+    }
+
+    void grow_to(size_t target_size) {
+        GGML_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+    void * addr;
+    size_t size;
+
+    bool failed_already;
+};
+
+llama_mlock::llama_mlock() : pimpl(std::make_unique<impl>()) {}
+llama_mlock::~llama_mlock() = default;
+
+void llama_mlock::init(void * ptr) { pimpl->init(ptr); }
+void llama_mlock::grow_to(size_t target_size) { pimpl->grow_to(target_size); }
+
+#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
+const bool llama_mlock::SUPPORTED = true;
+#else
+const bool llama_mlock::SUPPORTED = false;
+#endif
+
+size_t llama_path_max() {
+    return PATH_MAX;
+}
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
new file mode 100644
index 000000000..1da9ecb6b
--- /dev/null
+++ b/src/llama-mmap.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+struct llama_file;
+struct llama_mmap;
+struct llama_mlock;
+
+using llama_files  = std::vector<std::unique_ptr<llama_file>>;
+using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
+
+struct llama_file {
+    llama_file(const char * fname, const char * mode);
+    ~llama_file();
+
+    size_t tell() const;
+    size_t size() const;
+
+    int file_id() const; // fileno overload
+
+    void seek(size_t offset, int whence) const;
+
+    void read_raw(void * ptr, size_t len) const;
+    uint32_t read_u32() const;
+
+    void write_raw(const void * ptr, size_t len) const;
+    void write_u32(uint32_t val) const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mmap {
+    llama_mmap(const llama_mmap &) = delete;
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false);
+    ~llama_mmap();
+
+    size_t size() const;
+    void * addr() const;
+
+    void unmap_fragment(size_t first, size_t last);
+
+    static const bool SUPPORTED;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mlock {
+    llama_mlock();
+    ~llama_mlock();
+
+    void init(void * ptr);
+    void grow_to(size_t target_size);
+
+    static const bool SUPPORTED;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+size_t llama_path_max();
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
new file mode 100644
index 000000000..05d58ad90
--- /dev/null
+++ b/src/llama-model-loader.cpp
@@ -0,0 +1,1124 @@
+#include "llama-model-loader.h"
+
+#include "ggml.h"
+
+#include <array>
+#include <cinttypes>
+#include <cstring>
+#include <future>
+
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
+const char * llama_file_version_name(llama_fver version) {
+    switch (version) {
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2";
+        case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
+    }
+
+    return "unknown";
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:         return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:      return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:     return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:     return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:     return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:     return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:     return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:     return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:     return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:   return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:   return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:   return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:   return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:   return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:   return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:   return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:   return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:     return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:    return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:    return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:  return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:   return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:    return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:    return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:   return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:  return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:    return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:    return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:   return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
+// return a list of splits for a given path
+// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
+static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
+    std::vector<std::string> paths;
+    std::string split_prefix;
+    std::vector<char> buf(llama_path_max(), 0);
+
+    {
+        int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
+        if (!ret) {
+            throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
+        }
+        split_prefix = std::string(buf.data(), ret);
+    }
+
+    if (split_prefix.empty()) {
+        throw std::runtime_error(format("invalid split file: %s", path.c_str()));
+    }
+
+    for (int idx = 0; idx < n_split; ++idx) {
+        int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
+        paths.push_back(std::string(buf.data(), ret));
+    }
+
+    return paths;
+}
+
+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
+    struct ArrayInfo {
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
+            return ArrayInfo {
+                arr_type,
+                size_t(gguf_get_arr_n(ctx, k)),
+                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV : public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+                case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+            if (!ovrd) { return false; }
+            if (ovrd->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
+                switch (ovrd->tag) {
+                    case LLAMA_KV_OVERRIDE_TYPE_BOOL:  {
+                        LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_TYPE_INT:   {
+                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+                        LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                        LLAMA_LOG_INFO("%s\n", ovrd->val_str);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+                        throw std::runtime_error(
+                            format("Unsupported attempt to override %s type for metadata key %s\n",
+                                override_type_to_str(ovrd->tag), ovrd->key));
+                }
+                return true;
+            }
+            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+                __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
+                target = ovrd->val_bool;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
+                target = ovrd->val_i64;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
+                target = ovrd->val_f64;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+                target = ovrd->val_str;
+                return true;
+            }
+            return false;
+        }
+
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            if (try_override<T>(target, ovrd)) {
+                return true;
+            }
+            if (k < 0) { return false; }
+            target = get_kv(ctx, k);
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, ovrd);
+        }
+
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            return set(ctx, key.c_str(), target, ovrd);
+        }
+    };
+}
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
+        const int kid = gguf_find_key(meta.get(), key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+
+
+        result = arr_info.length;
+        return true;
+    }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
+        return get_arr_n(llm_kv(kid), result, required);
+    }
+
+    template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);
+
+    template<typename T>
+    bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
+        const int kid = gguf_find_key(meta.get(), key.c_str());
+
+        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
+            if (required) {
+                throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+
+        switch (arr_info.gt) {
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT(
+                                            (std::is_same<T,  int32_t>::value) ||
+                                            (std::is_same<T, uint32_t>::value));  break;
+            default:
+                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+        }
+
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T, size_t N_MAX>
+    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
+        const int kid = gguf_find_key(meta.get(), key.c_str());
+
+        if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
+            if (required) {
+                throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+
+        switch (arr_info.gt) {
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT(
+                                            (std::is_same<T,  int32_t>::value) ||
+                                            (std::is_same<T, uint32_t>::value));  break;
+            default:
+                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+        }
+
+        if (arr_info.length > N_MAX) {
+            throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
+        }
+
+        std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
+
+        return true;
+    }
+
+    template<typename T>
+    bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
+    bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
+        return get_key(llm_kv(kid), result, required);
+    }
+
+    template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result,        bool required);
+    template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result,       bool required);
+    template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result,    bool required);
+    template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
+
+    template<>
+    bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
+        uint32_t tmp;
+        const bool found = get_key(kid, tmp, required);
+        if (found) {
+            result = (enum llama_pooling_type) tmp;
+        } else {
+            result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+        }
+        return found;
+    }
+
+    // get array of n <= N_MAX elements, or a single element repeated n times
+    template<typename T, size_t N_MAX>
+    bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
+        const int kid = gguf_find_key(meta.get(), key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        if (n > N_MAX) {
+            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+        }
+
+        if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
+            struct GGUFMeta::ArrayInfo arr_info =
+                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+
+            if (n != arr_info.length) {
+                throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+            }
+
+            return get_arr(key, result, required);
+        }
+
+        T value;
+
+        bool ok = get_key(key, value, required);
+        if (!ok) {
+            return false;
+        }
+
+        for (uint32_t i = 0; i < n; i++) {
+            result[i] = value;
+        }
+
+        return true;
+    }
+
+    template<typename T>
+    bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
+        return get_key_or_arr(llm_kv(kid), result, n, required);
+    }
+
+    // TODO: this is not very clever - figure out something better
+    template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+
+llama_model_loader::llama_model_loader(
+        const std::string & fname,
+        std::vector<std::string> & splits,
+        bool use_mmap,
+        bool check_tensors,
+        const struct llama_model_kv_override * param_overrides_p) {
+    int trace = 0;
+    if (getenv("LLAMA_TRACE")) {
+        trace = atoi(getenv("LLAMA_TRACE"));
+    }
+
+    if (param_overrides_p != nullptr) {
+        for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
+            kv_overrides.insert({std::string(p->key), *p});
+        }
+    }
+
+    // Load the main GGUF
+    struct ggml_context * ctx = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx,
+    };
+
+    meta.reset(gguf_init_from_file(fname.c_str(), params));
+    if (!meta) {
+        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
+    }
+
+    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+    llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
+    files.emplace_back(new llama_file(fname.c_str(), "rb"));
+    contexts.emplace_back(ctx);
+
+    // Save tensors data offset of the main file.
+    // For subsidiary files, `meta` tensor data offset must not be used,
+    // so we build a unified tensors index for weights.
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string tensor_name = std::string(cur->name);
+        // make sure there is no duplicated tensor names
+        if (weights_map.find(tensor_name) != weights_map.end()) {
+            throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+        }
+        n_elements += ggml_nelements(cur);
+        n_bytes    += ggml_nbytes(cur);
+        weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
+    }
+    uint16_t n_split = 0;
+    get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+    // Load additional GGML contexts
+    if (n_split > 1) {
+        // make sure the main file is loaded first
+        uint16_t idx = 0;
+        const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+        get_key(kv_split_no, idx);
+        if (idx != 0) {
+            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+        }
+
+        // generate list of splits if needed
+        if (splits.empty()) {
+            splits = llama_get_list_splits(fname, idx, n_split);
+        }
+
+        // in case user give a custom list of splits, check if it matches the expected number
+        if (n_split != (uint16_t)splits.size()) {
+            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+        }
+
+        if (trace > 0) {
+            LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+        }
+
+        // load other splits
+        for (idx = 1; idx < n_split; idx++) {
+            const char * fname_split = splits[idx].c_str();
+
+            struct gguf_init_params split_params = {
+                /*.no_alloc = */ true,
+                /*.ctx      = */ &ctx,
+            };
+            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
+            if (!ctx_gguf) {
+                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+            }
+
+            // check idx
+            {
+                const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+                if (kid < 0) {
+                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+                }
+                int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+                if (idx_gguf != idx) {
+                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+                }
+            }
+
+            files.emplace_back(new llama_file(fname_split, "rb"));
+            contexts.emplace_back(ctx);
+
+            // Save tensors data offset info of the shard.
+            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                std::string tensor_name = std::string(cur->name);
+                // make sure there is no duplicated tensor names
+                if (weights_map.find(tensor_name) != weights_map.end()) {
+                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                }
+                n_elements += ggml_nelements(cur);
+                n_bytes    += ggml_nbytes(cur);
+                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
+            }
+        }
+
+        get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+        // sanity check
+        {
+            const int n_tensors_loaded = (int) weights_map.size();
+            if (n_tensors != n_tensors_loaded) {
+                throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+            }
+        }
+
+        LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
+    }
+
+    n_kv      = gguf_get_n_kv(meta.get());
+    n_tensors = weights_map.size();
+
+    fver = (enum llama_fver) gguf_get_version(meta.get());
+
+    LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+            __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+
+    // determine file type based on the number of tensors for each quantization and print meta data
+    // TODO: make optional
+    {
+        std::map<enum ggml_type, uint32_t> n_type;
+
+        uint32_t n_type_max = 0;
+        enum ggml_type type_max = GGML_TYPE_F32;
+
+        for (const auto & it : weights_map) {
+            const llama_tensor_weight & w = it.second;
+            const ggml_tensor * tensor = w.tensor;
+
+            enum ggml_type type = tensor->type;
+
+            n_type[type]++;
+
+            if (n_type_max < n_type[type]) {
+                n_type_max = n_type[type];
+                type_max   = type;
+            }
+
+            if (trace > 0) {
+                const uint16_t sid = w.idx;
+                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+            }
+        }
+
+        switch (type_max) {
+            case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
+            case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
+            case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
+            case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
+            case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
+            case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
+            case GGML_TYPE_Q5_1:    ftype = LLAMA_FTYPE_MOSTLY_Q5_1;    break;
+            case GGML_TYPE_Q8_0:    ftype = LLAMA_FTYPE_MOSTLY_Q8_0;    break;
+            case GGML_TYPE_Q2_K:    ftype = LLAMA_FTYPE_MOSTLY_Q2_K;    break;
+            case GGML_TYPE_Q3_K:    ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M;  break;
+            case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
+            case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
+            case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
+            case GGML_TYPE_TQ1_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   break;
+            case GGML_TYPE_TQ2_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ2_0;   break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
+            case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
+            case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
+            case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
+            case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
+            case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
+            case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
+            case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
+            default:
+                {
+                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                    ftype = LLAMA_FTYPE_ALL_F32;
+                } break;
+        }
+
+        // this is a way to mark that we have "guessed" the file type
+        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+        {
+            const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV
+            if (kid >= 0) {
+                ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid);
+            }
+        }
+
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
+        for (int i = 0; i < n_kv; i++) {
+            const char * name           = gguf_get_key(meta.get(), i);
+            const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(meta.get(), i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
+
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        }
+
+        // print type counts
+        for (auto & kv : n_type) {
+            if (kv.second == 0) {
+                continue;
+            }
+
+            LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+        }
+    }
+
+    if (!llama_mmap::SUPPORTED) {
+        LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
+        use_mmap = false;
+    }
+
+    this->use_mmap = use_mmap;
+    this->check_tensors = check_tensors;
+}
+
+std::string llama_model_loader::get_arch_name() const {
+    return arch_name;
+}
+
+enum llm_arch llama_model_loader::get_arch() const {
+    return llm_kv.arch;
+}
+
+const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
+    auto pos = weights_map.find(name);
+    if (pos != weights_map.end()) {
+        return &pos->second;
+    }
+
+    return nullptr;
+}
+
+const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
+    const llama_tensor_weight * weight = get_weight(name);
+    if (!weight) {
+        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+    }
+    return *weight;
+}
+
+struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
+    const auto * weight = get_weight(name);
+    if (!weight) {
+        return nullptr;
+    }
+    return weight->tensor;
+}
+
+struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
+    struct ggml_tensor * tensor = get_tensor_meta(name.c_str());
+    if (!tensor) {
+        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
+    }
+    return tensor;
+}
+
+const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+    const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
+
+    if (cur == NULL) {
+        if (!required) {
+            return NULL;
+        }
+        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
+    }
+
+    {
+        bool is_ok = true;
+        for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+            if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
+                is_ok = false;
+                break;
+            }
+        }
+        if (!is_ok) {
+            throw std::runtime_error(
+                    format("%s: tensor '%s' has wrong shape; expected %s, got %s",
+                        __func__, name.c_str(),
+                        llama_format_tensor_shape(ne).c_str(),
+                        llama_format_tensor_shape(cur).c_str()));
+        }
+    }
+
+    return cur;
+}
+
+struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
+
+    if (cur == NULL) {
+        return NULL;
+    }
+
+    bool duplicated = flags & TENSOR_DUPLICATED;
+
+    struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+    ggml_set_name(tensor, ggml_get_name(cur));
+
+    if (duplicated) {
+        size_data += ggml_nbytes(cur);
+    } else {
+        n_created++;
+    }
+
+    return tensor;
+
+}
+
+struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
+    const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+    if (cur == NULL) {
+        return NULL;
+    }
+
+    if (cur->type != base->type) {
+        throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+    }
+
+    std::array<int64_t, GGML_MAX_DIMS> dims;
+    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+        dims[i] = i < ne.size() ? ne.begin()[i] : 1;
+    }
+
+    struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+                                    dims[0], dims[1], dims[2], dims[3],
+                                    cur->nb[1], cur->nb[2], cur->nb[3],
+                                    offset);
+
+    ggml_set_name(tensor, name.c_str());
+
+    n_created++;
+
+    return tensor;
+}
+
+void llama_model_loader::done_getting_tensors() const {
+    if (n_created != n_tensors) {
+        throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+    }
+}
+
+void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
+    if (use_mmap) {
+        mappings.reserve(files.size());
+        mmaps_used.reserve(files.size());
+        for (const auto & file : files) {
+            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
+            mmaps_used.emplace_back(mapping->size(), 0);
+            if (mlock_mmaps) {
+                std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+                mlock_mmap->init(mapping->addr());
+                mlock_mmaps->emplace_back(std::move(mlock_mmap));
+            }
+            mappings.emplace_back(std::move(mapping));
+        }
+    }
+
+    // compute the total size of all tensors for progress reporting
+    for (const auto & it : weights_map) {
+        size_data += ggml_nbytes(it.second.tensor);
+    }
+}
+
+void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+    GGML_ASSERT(!mappings.empty());
+    const auto & mapping = mappings.at(idx);
+
+    *first = mapping->size();
+    *last  = 0;
+    *addr = mapping->addr();
+    for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        const auto * weight = get_weight(ggml_get_name(tensor));
+        if (!weight || weight->idx != idx) {
+            continue;
+        }
+        *first = std::min(*first, weight->offs);
+        *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
+    }
+}
+
+void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
+    const auto & w = require_weight(ggml_get_name(cur));
+
+    if (use_mmap) {
+        const auto & mapping = mappings.at(w.idx);
+        if (cur->data == nullptr) {
+            cur->data = (uint8_t *)mapping->addr() + w.offs;
+        } else {
+            memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur));
+        }
+    } else {
+        GGML_ASSERT(cur->data != nullptr);
+        GGML_ASSERT(w.idx < files.size());
+        const auto & file = files.at(w.idx);
+        file->seek(w.offs, SEEK_SET);
+        file->read_raw(cur->data, ggml_nbytes(cur));
+    }
+
+    if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+    }
+}
+
+bool llama_model_loader::load_all_data(
+        struct ggml_context * ctx,
+        llama_buf_map & bufs,
+        llama_mlocks * lmlocks,
+        llama_progress_callback progress_callback,
+        void * progress_callback_user_data) {
+    GGML_ASSERT(size_data != 0 && "call init_mappings() first");
+
+    std::vector<no_init<uint8_t>> read_buf;
+    std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
+    // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+    // NVMe raid configurations might require more / larger buffers.
+    constexpr size_t n_buffers = 4;
+    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+    std::vector<ggml_backend_buffer_t> host_buffers;
+    std::vector<ggml_backend_event_t> events;
+    std::vector<void *> host_ptrs;
+    size_t buffer_idx = 0; // buffer to use for async loads
+    ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
+        if (use_mmap || check_tensors) {
+            return nullptr;
+        }
+        // When not using mmaped io use async uploads from pinned memory to GPU memory.
+        // First determine if the backend supports the necessary features for async uploads.
+        auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+        if (!buf) {
+            LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
+            return nullptr;
+        }
+
+        auto * buft = ggml_backend_buffer_get_type(buf);
+        auto * dev = ggml_backend_buft_get_device(buft);
+        if (!dev) {
+            LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
+                ggml_backend_buft_name(buft));
+            return nullptr;
+        }
+
+        if (buft != ggml_backend_dev_buffer_type(dev)) {
+            LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
+                ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
+            return nullptr;
+        }
+
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
+            LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
+                ggml_backend_dev_name(dev));
+            return nullptr;
+        }
+
+        auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+        if (!host_buft) {
+            LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
+                ggml_backend_dev_name(dev));
+            return nullptr;
+        }
+
+        // If the backend is supported, create pinned memory buffers and events for synchronisation.
+        for (size_t idx = 0; idx < n_buffers; ++idx) {
+            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+            if (!buf) {
+                LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            host_buffers.emplace_back(buf);
+            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+
+            auto * event = ggml_backend_event_new(dev);
+            if (!event) {
+                LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
+                    ggml_backend_dev_name(dev));
+                return nullptr;
+            }
+
+            events.emplace_back(event);
+        }
+
+        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+        if (!backend) {
+            LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
+                ggml_backend_dev_name(dev));
+            return nullptr;
+        }
+
+        return backend;
+    }(__func__);
+
+    if (upload_backend) {
+        LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
+            ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
+            ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
+            ggml_backend_name(upload_backend));
+    }
+
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+        const auto * weight = get_weight(ggml_get_name(cur));
+        if (weight == nullptr) {
+            // this can happen with split experts models
+            continue;
+        }
+
+        if (progress_callback) {
+            if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                return false;
+            }
+        }
+
+        size_t n_size = ggml_nbytes(cur);
+
+        if (use_mmap) {
+            const auto & mapping = mappings.at(weight->idx);
+            ggml_backend_buffer_t buf_mmap = nullptr;
+            if (bufs.count(weight->idx)) {
+                buf_mmap = bufs.at(weight->idx);
+            }
+            uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
+
+            if (check_tensors) {
+                validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                    return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                }));
+            }
+
+            GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+            if (buf_mmap && cur->data == nullptr) {
+                ggml_backend_tensor_alloc(buf_mmap, cur, data);
+                if (lmlocks) {
+                    const auto & lmlock = lmlocks->at(weight->idx);
+                    lmlock->grow_to(weight->offs + n_size);
+                }
+
+                auto & mmap_used = mmaps_used[weight->idx];
+                mmap_used.first  = std::min(mmap_used.first,  weight->offs);
+                mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+            } else {
+                ggml_backend_tensor_set(cur, data, 0, n_size);
+            }
+        } else {
+            const auto & file = files.at(weight->idx);
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(cur->data, n_size);
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                    }));
+                }
+            } else {
+                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (upload_backend) {
+                    file->seek(weight->offs, SEEK_SET);
+
+                    size_t bytes_read = 0;
+
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                        ggml_backend_event_synchronize(events[buffer_idx]);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx], upload_backend);
+
+                        bytes_read += read_iteration;
+                        ++buffer_idx;
+                        buffer_idx %= n_buffers;
+                    }
+                } else {
+                    read_buf.resize(n_size);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
+                }
+            }
+        }
+
+        size_done += n_size;
+    }
+
+    // free temporary resources used for async uploads
+    for (auto * event : events) {
+        ggml_backend_event_synchronize(event);
+        ggml_backend_event_free(event);
+    }
+    for (auto * buf : host_buffers) {
+        ggml_backend_buffer_free(buf);
+    }
+    ggml_backend_free(upload_backend);
+
+    // check validation results
+    bool validation_failed = false;
+    for (auto & future : validation_result) {
+        auto result = future.get();
+        if (!result.second) {
+            LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+            validation_failed = true;
+        }
+    }
+    if (validation_failed) {
+        throw std::runtime_error("found tensors with invalid data");
+    }
+
+    // check if this is the last call and do final cleanup
+    if (size_done >= size_data) {
+        // unmap offloaded tensors and metadata
+        if (use_mmap) {
+            for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+                const auto & mmap_used = mmaps_used.at(idx);
+                auto & mapping = mappings.at(idx);
+                mapping->unmap_fragment(0, mmap_used.first);
+                if (mmap_used.second != 0) {
+                    mapping->unmap_fragment(mmap_used.second, mapping->size());
+                }
+            }
+        }
+        if (progress_callback) {
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
+        }
+    }
+
+    return true;
+}
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size   = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0,        n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size   = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
new file mode 100644
index 000000000..fe35404b2
--- /dev/null
+++ b/src/llama-model-loader.h
@@ -0,0 +1,167 @@
+#pragma once
+
+#include "llama.h"
+
+#include "llama-impl.h"
+#include "llama-arch.h"
+#include "llama-mmap.h"
+
+#include "ggml-cpp.h"
+
+#include <cstddef>
+#include <map>
+#include <stdexcept>
+#include <unordered_map>
+
+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
+enum llama_fver {
+    GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
+    GGUF_FILE_VERSION_V3 = 3,
+};
+
+const char * llama_file_version_name(llama_fver version);
+
+struct llama_model_loader {
+    // Holds information on a model weight
+    struct llama_tensor_weight {
+        uint16_t  idx; // source file index
+        size_t   offs; // tensor data offset in the original file
+
+        ggml_tensor * tensor;
+
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx,  ggml_get_name(tensor));
+            if (tensor_idx < 0) {
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
+            }
+
+            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
+            }
+        }
+    };
+
+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED   = 2;
+
+    int n_kv      = 0;
+    int n_tensors = 0;
+    int n_created = 0;
+
+    uint64_t n_elements = 0;
+    size_t   n_bytes    = 0;
+
+    bool use_mmap = false;
+    bool check_tensors;
+
+    llama_files files;
+    llama_ftype ftype;
+    llama_fver  fver;
+
+    llama_mmaps mappings;
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
+
+    gguf_context_ptr meta;
+    std::vector<ggml_context_ptr> contexts;
+
+    std::string arch_name;
+    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    size_t size_done = 0;
+    size_t size_data = 0;
+    std::vector<std::pair<size_t, size_t>> mmaps_used;
+
+    llama_model_loader(
+        const std::string & fname,
+        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
+        bool use_mmap,
+        bool check_tensors,
+        const struct llama_model_kv_override * param_overrides_p);
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, bool required = true);
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(enum llm_kv kid, T & result, bool required = true);
+
+    template<typename T>
+    bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);
+
+    template<typename T, size_t N_MAX>
+    bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);
+
+    template<typename T>
+    bool get_arr(enum llm_kv kid, T & result, bool required = true);
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, bool required = true);
+
+    template<typename T>
+    bool get_key(enum llm_kv kid, T & result, bool required = true);
+
+    template<typename T, size_t N_MAX>
+    bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);
+
+    template<typename T>
+    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
+
+    std::string get_arch_name() const;
+
+    enum llm_arch get_arch() const;
+
+    const llama_tensor_weight * get_weight(const char * name) const;
+
+    const llama_tensor_weight & require_weight(const char * name) const;
+
+    struct ggml_tensor * get_tensor_meta(const char * name) const;
+
+    struct ggml_tensor * require_tensor_meta(const std::string & name) const;
+
+    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
+
+    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
+
+    void done_getting_tensors() const;
+
+    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);
+
+    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
+
+    // for backwards compatibility, does not support ggml-backend
+    void load_data_for(struct ggml_tensor * cur) const;
+
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(
+            struct ggml_context * ctx,
+            llama_buf_map & bufs,
+            llama_mlocks * lmlocks,
+            llama_progress_callback progress_callback,
+            void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
+};
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
new file mode 100644
index 000000000..0f4b62c43
--- /dev/null
+++ b/src/llama-model.cpp
@@ -0,0 +1,4023 @@
+#include "llama-model.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+#include "llama-model-loader.h"
+
+#include "ggml-cpp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <functional>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+
+const char * llm_type_name(llm_type type) {
+    switch (type) {
+        case LLM_TYPE_14M:           return "14M";
+        case LLM_TYPE_17M:           return "17M";
+        case LLM_TYPE_22M:           return "22M";
+        case LLM_TYPE_33M:           return "33M";
+        case LLM_TYPE_60M:           return "60M";
+        case LLM_TYPE_70M:           return "70M";
+        case LLM_TYPE_80M:           return "80M";
+        case LLM_TYPE_109M:          return "109M";
+        case LLM_TYPE_137M:          return "137M";
+        case LLM_TYPE_160M:          return "160M";
+        case LLM_TYPE_220M:          return "220M";
+        case LLM_TYPE_250M:          return "250M";
+        case LLM_TYPE_270M:          return "270M";
+        case LLM_TYPE_335M:          return "335M";
+        case LLM_TYPE_410M:          return "410M";
+        case LLM_TYPE_450M:          return "450M";
+        case LLM_TYPE_770M:          return "770M";
+        case LLM_TYPE_780M:          return "780M";
+        case LLM_TYPE_0_5B:          return "0.5B";
+        case LLM_TYPE_1B:            return "1B";
+        case LLM_TYPE_1_3B:          return "1.3B";
+        case LLM_TYPE_1_4B:          return "1.4B";
+        case LLM_TYPE_1_5B:          return "1.5B";
+        case LLM_TYPE_1_6B:          return "1.6B";
+        case LLM_TYPE_2B:            return "2B";
+        case LLM_TYPE_2_8B:          return "2.8B";
+        case LLM_TYPE_3B:            return "3B";
+        case LLM_TYPE_4B:            return "4B";
+        case LLM_TYPE_6B:            return "6B";
+        case LLM_TYPE_6_9B:          return "6.9B";
+        case LLM_TYPE_7B:            return "7B";
+        case LLM_TYPE_8B:            return "8B";
+        case LLM_TYPE_9B:            return "9B";
+        case LLM_TYPE_11B:           return "11B";
+        case LLM_TYPE_12B:           return "12B";
+        case LLM_TYPE_13B:           return "13B";
+        case LLM_TYPE_14B:           return "14B";
+        case LLM_TYPE_15B:           return "15B";
+        case LLM_TYPE_16B:           return "16B";
+        case LLM_TYPE_20B:           return "20B";
+        case LLM_TYPE_30B:           return "30B";
+        case LLM_TYPE_32B:           return "32B";
+        case LLM_TYPE_34B:           return "34B";
+        case LLM_TYPE_35B:           return "35B";
+        case LLM_TYPE_40B:           return "40B";
+        case LLM_TYPE_65B:           return "65B";
+        case LLM_TYPE_70B:           return "70B";
+        case LLM_TYPE_236B:          return "236B";
+        case LLM_TYPE_314B:          return "314B";
+        case LLM_TYPE_671B:          return "671B";
+        case LLM_TYPE_SMALL:         return "0.1B";
+        case LLM_TYPE_MEDIUM:        return "0.4B";
+        case LLM_TYPE_LARGE:         return "0.8B";
+        case LLM_TYPE_XL:            return "1.5B";
+        case LLM_TYPE_A1_7B:         return "A1.7B";
+        case LLM_TYPE_A2_7B:         return "A2.7B";
+        case LLM_TYPE_8x7B:          return "8x7B";
+        case LLM_TYPE_8x22B:         return "8x22B";
+        case LLM_TYPE_16x12B:        return "16x12B";
+        case LLM_TYPE_16x3_8B:       return "16x3.8B";
+        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
+        case LLM_TYPE_57B_A14B:      return "57B.A14B";
+        case LLM_TYPE_27B:           return "27B";
+        default:                     return "?B";
+    }
+}
+
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+    switch (type) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+        default:                                    return "unknown";
+    }
+}
+
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+    { LLAMA_ROPE_SCALING_TYPE_NONE,       "none"       },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR,     "linear"     },
+    { LLAMA_ROPE_SCALING_TYPE_YARN,       "yarn"       },
+    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
+};
+
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
+    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
+        if (kv.second == name) {
+            return (llama_rope_scaling_type) kv.first;
+        }
+    }
+
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+}
+
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+    GGML_ASSERT(w != nullptr);
+
+    if (op == GGML_OP_NONE) {
+        return true;
+    }
+
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    if (!ctx_ptr) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+    ggml_context * ctx = ctx_ptr.get();
+
+    ggml_tensor * op_tensor = nullptr;
+
+    switch (op) {
+        case GGML_OP_GET_ROWS:
+            {
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+                op_tensor = ggml_get_rows(ctx, w, b);
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+                op_tensor = ggml_mul_mat(ctx, w, b);
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+            } break;
+        case GGML_OP_ADD:
+            {
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+                op_tensor = ggml_add(ctx, a, w);
+            } break;
+        case GGML_OP_MUL:
+            {
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+                op_tensor = ggml_mul(ctx, a, w);
+            } break;
+        case GGML_OP_DIV:
+            {
+                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+                op_tensor = ggml_div(ctx, a, w);
+            } break;
+        case GGML_OP_ROPE:
+            {
+                int n_embd_head = hparams.n_embd_head_v;
+                int n_head = hparams.n_head();
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+                op_tensor = ggml_rope_ext(
+                    ctx, a, b, w,
+                    0, 0, 0, 0, 0,
+                    0, 0, 0, 0
+                );
+
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                // FIXME
+                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
+                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                // FIXME
+                const int64_t d_state      = w->ne[0];
+                const int64_t d_inner      = w->ne[1];
+                const int64_t n_seq_tokens = 512;
+                const int64_t n_seqs       = 1;
+                ggml_tensor * s  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
+                ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
+            } break;
+        case GGML_OP_RWKV_WKV6:
+            {
+                // FIXME
+                const int64_t S = 123;
+                const int64_t H = 123;
+                const int64_t n_tokens = 123;
+                const int64_t n_seqs = 123;
+                ggml_tensor  * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor  * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor  * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor  * tf = w;
+                ggml_tensor  * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor  * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                const int n_embd = hparams.n_embd;
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+            } break;
+        default:
+            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+    }
+
+    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+    GGML_ASSERT(w->buffer == nullptr);
+    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+    ggml_backend_buffer_free(w->buffer);
+    w->buffer = nullptr;
+
+    return op_supported;
+}
+
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
+    GGML_ASSERT(!buft_list.empty());
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+            return cur_buft;
+        }
+    }
+    return nullptr;
+}
+
+// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+    buft_list_t buft_list;
+
+    // add ACCEL buffer types
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            auto * buft = ggml_backend_dev_buffer_type(dev);
+            // skip
+            if (buft != ggml_backend_cpu_buffer_type()) {
+                buft_list.emplace_back(dev, buft);
+            }
+        }
+    }
+
+    // add extra buffer types
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
+    // add a host buffer type
+    // storing the tensors in a host buffer is useful when the processing of large batches
+    // is offloaded to a GPU device, since it reduces the time spent on data transfers
+    // generally, this will be done using the first device in the list
+    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
+    // function of the device to determine if it would benefit from being stored in a host buffer
+    for (auto * dev : devices) {
+        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+        if (buft) {
+            buft_list.emplace_back(dev, buft);
+            break;
+        }
+    }
+
+    // add the CPU buffer type
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+        }
+    }
+
+    return buft_list;
+}
+
+// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
+static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
+    buft_list_t buft_list;
+
+    // add the device split buffer type if requested and available
+    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
+            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+        if (ggml_backend_split_buffer_type_fn) {
+            size_t dev_index = [&]() {
+                auto * reg = ggml_backend_dev_backend_reg(dev);
+                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
+                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
+                        return i;
+                    }
+                }
+                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
+            }();
+            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
+            if (buft != nullptr) {
+                buft_list.emplace_back(dev, buft);
+            }
+        }
+    }
+
+    // add the device default buffer type
+    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+
+    return buft_list;
+}
+
+struct llama_model::impl {
+    impl() {}
+    ~impl() {}
+
+    uint64_t n_elements = 0;
+
+    size_t n_bytes = 0;
+
+    std::string desc_str;
+
+    // model memory mapped files
+    llama_mmaps mappings;
+
+    // objects representing data potentially being locked in memory
+    llama_mlocks mlock_bufs;
+    llama_mlocks mlock_mmaps;
+
+    // contexts where the model tensors metadata is stored
+    std::vector<ggml_context_ptr> ctxs;
+
+    // the model memory buffers for the tensor data
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    buft_list_t cpu_buft_list;
+    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
+
+    struct layer_dev {
+        ggml_backend_dev_t dev;
+        buft_list_t * buft_list;
+    };
+
+    layer_dev dev_input = {};
+    layer_dev dev_output = {};
+    std::vector<layer_dev> dev_layer;
+};
+
+llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+}
+
+llama_model::~llama_model() {}
+
+void llama_model::load_stats(llama_model_loader & ml) {
+    pimpl->n_elements = ml.n_elements;
+    pimpl->n_bytes = ml.n_bytes;
+}
+
+void llama_model::load_arch(llama_model_loader & ml) {
+    arch = ml.get_arch();
+    if (arch == LLM_ARCH_UNKNOWN) {
+        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
+    }
+}
+
+void llama_model::load_hparams(llama_model_loader & ml) {
+    const gguf_context * ctx = ml.meta.get();
+
+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        gguf_kv.emplace(name, value);
+    }
+
+    // get general kv
+    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,    hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,  hparams.n_embd);
+    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
+    }
+
+    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+    if (hparams.n_expert > 0) {
+        GGML_ASSERT(hparams.n_expert_used > 0);
+    } else {
+        GGML_ASSERT(hparams.n_expert_used == 0);
+    }
+
+    // zero-out the array hparams
+    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
+    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
+
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+
+    // n_head_kv is optional, default to n_head
+    hparams.n_head_kv_arr = hparams.n_head_arr;
+
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
+
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
+
+    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
+
+    // rope_freq_base (optional)
+    hparams.rope_freq_base_train = 10000.0f;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
+
+    std::string rope_scaling("linear");
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
+    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
+
+    // rope_freq_scale (inverse of the kv) is optional
+    float ropescale = 0.0f;
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
+    }
+    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
+
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
+    // non-transformer models do not have attention heads
+    if (hparams.n_head() > 0) {
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
+
+        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
+        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
+        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
+        // sanity check for n_rot (optional)
+        hparams.n_rot = hparams.n_embd_head_k;
+
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+
+        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd_head_k) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+            }
+        }
+    } else {
+        hparams.n_rot = 0;
+        hparams.n_embd_head_k = 0;
+        hparams.n_embd_head_v = 0;
+    }
+
+    // for differentiating model types
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
+    // arch-specific KVs
+    switch (arch) {
+        case LLM_ARCH_LLAMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 8) {
+                    switch (hparams.n_layer) {
+                        case 32: type = LLM_TYPE_8x7B; break;
+                        case 56: type = LLM_TYPE_8x22B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    }
+                } else {
+                    switch (hparams.n_layer) {
+                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
+                        case 22: type = LLM_TYPE_1B; break;
+                        case 26: type = LLM_TYPE_3B; break;
+                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+                        // granite uses a vocab with len 49152
+                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+                        case 36: type = LLM_TYPE_8B; break; // granite
+                        case 40: type = LLM_TYPE_13B; break;
+                        case 48: type = LLM_TYPE_34B; break;
+                        case 60: type = LLM_TYPE_30B; break;
+                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    }
+                }
+            } break;
+        case LLM_ARCH_DECI:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 80: type = LLM_TYPE_70B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MINICPM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale);
+                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+
+                switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_1B; break;
+                    case 40: type = LLM_TYPE_2B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
+
+                switch (hparams.n_layer) {
+                    case 62: type = LLM_TYPE_4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GROK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 64: type = LLM_TYPE_314B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_FALCON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 60: type = LLM_TYPE_40B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                if (type == LLM_TYPE_13B) {
+                    // TODO: become GGUF KV parameter
+                    hparams.f_max_alibi_bias = 8.0f;
+                }
+            } break;
+        case LLM_ARCH_STARCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 36: type = LLM_TYPE_3B; break;
+                    case 42: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_15B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_1B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
+            } break;
+        case LLM_ARCH_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 3:
+                        type = LLM_TYPE_17M; break; // bge-micro
+                    case 6:
+                        type = LLM_TYPE_22M; break; // MiniLM-L6
+                    case 12:
+                        switch (hparams.n_embd) {
+                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
+                            case 768: type = LLM_TYPE_109M; break; // bge-base
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 24:
+                        type = LLM_TYPE_335M; break; // bge-large
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
+                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
+
+                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+                    type = LLM_TYPE_137M;
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            case 4096: type = LLM_TYPE_7B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 48: type = LLM_TYPE_30B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_STABLELM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_3B; break;
+                    case 40: type = LLM_TYPE_12B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+               }
+            } break;
+        case LLM_ARCH_QWEN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+            }
+            // fall through
+        case LLM_ARCH_QWEN2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
+                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 36: type = LLM_TYPE_3B; break;
+                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
+                    case 48: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
+                    case 80: type = LLM_TYPE_70B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_A2_7B; break;
+                    case 28: type = LLM_TYPE_57B_A14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_3B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    hparams.n_swa = 2047;
+                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-mini-128k-instruct
+                    hparams.n_swa = 262144;
+                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-medium-128k-instruct
+                    hparams.n_swa = 131072;
+                }
+                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (!found_swa && hparams.n_swa == 0) {
+                    throw std::runtime_error("invalid value for sliding_window");
+                }
+            } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_16x3_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PLAMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+               }
+            } break;
+        case LLM_ARCH_GPT2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 12: type = LLM_TYPE_SMALL; break;
+                    case 24: type = LLM_TYPE_MEDIUM; break;
+                    case 36: type = LLM_TYPE_LARGE; break;
+                    case 48: type = LLM_TYPE_XL; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 42: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ORION:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 48: type = LLM_TYPE_20B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_2B; break;
+                    case 28: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+               }
+            } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                hparams.n_swa = 4096; // default value of gemma 2
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
+                hparams.attn_soft_cap = true;
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_2B; break;
+                    case 42: type = LLM_TYPE_9B; break;
+                    case 46: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+               }
+            } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 30: type = LLM_TYPE_3B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_15B; break;
+                    case 52: type = LLM_TYPE_20B; break; // granite
+                    case 88: type = LLM_TYPE_34B; break; // granite
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MAMBA:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: type = LLM_TYPE_SMALL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: type = LLM_TYPE_MEDIUM; break;
+                            case 1536: type = LLM_TYPE_LARGE; break;
+                            case 2048: type = LLM_TYPE_XL; break;
+                            default:   type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_XVERSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    case 80: type = LLM_TYPE_65B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_35B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_COHERE2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_DBRX:
+        {
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
+
+            switch (hparams.n_layer) {
+                case 40: type = LLM_TYPE_16x12B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            }
+        } break;
+        case LLM_ARCH_OLMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
+
+                switch (hparams.n_layer) {
+                    case 22: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 80: type = LLM_TYPE_70B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OLMO2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OLMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_A1_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OPENELM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                case 16: type = LLM_TYPE_270M; break;
+                case 20: type = LLM_TYPE_450M; break;
+                case 28: type = LLM_TYPE_1B; break;
+                case 36: type = LLM_TYPE_3B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff()) {
+                            case 512:  type = LLM_TYPE_14M; break;
+                            case 2048: type = LLM_TYPE_70M; break;
+                            default:   type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff()) {
+                            case 3072: type = LLM_TYPE_160M; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff()) {
+                            case 8192: type = LLM_TYPE_1B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff()) {
+                            case 4096: type = LLM_TYPE_410M; break;
+                            case 8192: type = LLM_TYPE_1_4B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff()) {
+                            case 10240: type = LLM_TYPE_2_8B; break;
+                            case 16384: type = LLM_TYPE_6_9B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff()) {
+                            case 20480: type = LLM_TYPE_12B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff()) {
+                            case 24576: type = LLM_TYPE_20B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    }
+                } else {
+                    type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: type = LLM_TYPE_20B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_DEEPSEEK2:
+            {
+                bool is_lite = (hparams.n_layer == 27);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
+                if (!is_lite) {
+                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                }
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                    // that have no expert_gating_func model parameter set
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                }
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+                switch (hparams.n_layer) {
+                    case 27: type = LLM_TYPE_16B; break;
+                    case 60: type = LLM_TYPE_236B; break;
+                    case 61: type = LLM_TYPE_671B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_CHATGLM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 28: {
+                        if (hparams.n_head(0) == 16) {
+                            type = LLM_TYPE_1_5B;
+                        } else {
+                            type = LLM_TYPE_6B;
+                        }
+                    } break;
+                    case 40: {
+                        if (hparams.n_head(0) == 24) {
+                            type = LLM_TYPE_4B;
+                        } else {
+                            type = LLM_TYPE_9B;
+                        }
+                    } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_BITNET:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_T5:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+
+                uint32_t dec_start_token_id;
+                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
+                    hparams.dec_start_token_id = dec_start_token_id;
+                }
+
+                switch (hparams.n_layer) {
+                    case 6:  type = LLM_TYPE_60M;  break; // t5-small
+                    case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
+                    case 12:
+                        switch (hparams.n_ff()) {
+                            case 3072: type = LLM_TYPE_220M; break; // t5-base
+                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff()) {
+                            case 4096:  type = LLM_TYPE_770M; break; // t5-large
+                            case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
+                            case 16384: type = LLM_TYPE_3B;   break; // t5-3b
+                            case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
+                            case 65536: type = LLM_TYPE_11B;  break; // t5-11b
+                            case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+               }
+            } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+                type = LLM_TYPE_UNKNOWN;
+            } break;
+        case LLM_ARCH_JAIS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1_3B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    /* TODO: add variants */
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+                ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
+                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
+                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
+                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
+                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1_6B; break;
+                    case 32:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            case 4096: type = LLM_TYPE_7B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 61: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_3B; break;
+                    case 40: type = LLM_TYPE_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_CHAMELEON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
+                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 48: type = LLM_TYPE_34B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+               }
+            } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
+            } break;
+        default: throw std::runtime_error("unsupported model architecture");
+    }
+
+    pimpl->n_bytes = ml.n_bytes;
+
+    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
+
+    if (hparams.f_max_alibi_bias > 0.0f) {
+        hparams.use_alibi = true;
+    }
+
+    hparams.rope_type = llama_model_rope_type(this);
+}
+
+void llama_model::load_vocab(llama_model_loader & ml) {
+    const auto kv = LLM_KV(arch);
+
+    vocab.load(ml, kv);
+}
+
+bool llama_model::load_tensors(llama_model_loader & ml) {
+    const auto & split_mode   = params.split_mode;
+    const auto & n_gpu_layers = params.n_gpu_layers;
+    const auto & use_mlock    = params.use_mlock;
+    const auto & tensor_split = params.tensor_split;
+
+    const int n_layer = hparams.n_layer;
+
+    const bool use_mmap_buffer = true;
+
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+
+    // build a list of buffer types for the CPU and GPU devices
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    for (auto * dev : devices) {
+        buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
+        // add CPU buffer types as a fallback
+        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
+        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
+    }
+
+    // calculate the split points
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
+    std::vector<float> splits(n_devices());
+    if (all_zero) {
+        // default split, by free memory
+        for (size_t i = 0; i < n_devices(); ++i) {
+            ggml_backend_dev_t dev = devices[i];
+            size_t total;
+            size_t free;
+            ggml_backend_dev_memory(dev, &free, &total);
+            splits[i] = free;
+        }
+    } else {
+        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
+    }
+
+    // sum and normalize the splits to get the split points
+    float split_sum = 0.0f;
+    for (size_t i = 0; i < n_devices(); ++i) {
+        split_sum += splits[i];
+        splits[i] = split_sum;
+    }
+    for (size_t i = 0; i < n_devices(); ++i) {
+        splits[i] /= split_sum;
+    }
+
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
+    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
+        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
+            return {cpu_dev, &pimpl->cpu_buft_list};
+        }
+        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
+        auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
+        return {dev, &pimpl->gpu_buft_list.at(dev)};
+    };
+
+    // assign the input layer
+    // there is very little benefit to offloading the input layer, so always keep it on the CPU
+    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
+
+    // assign the repeating layers to the devices according to the splits
+    pimpl->dev_layer.resize(n_layer);
+    for (int il = 0; il < n_layer; ++il) {
+        pimpl->dev_layer[il] = get_layer_buft_list(il);
+    }
+
+    // assign the output layer
+    pimpl->dev_output = get_layer_buft_list(n_layer);
+
+    // one ggml context per buffer type
+    int max_n_tensors = ml.n_tensors;
+    max_n_tensors += 1;         // duplicated output tensor
+    max_n_tensors += n_layer*2; // duplicated rope freq tensors
+    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
+
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ ctx_size,
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                throw std::runtime_error(format("failed to create ggml context"));
+            }
+
+            ctx_map[buft] = ctx;
+            pimpl->ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+        return it->second;
+    };
+
+    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
+    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+
+    // create tensors for the weights
+    {
+        // note: cast to int64_t since we will use these for the tensor dimensions
+        const int64_t n_head        = hparams.n_head();
+        const int64_t n_head_kv     = hparams.n_head_kv();
+        const int64_t n_embd        = hparams.n_embd;
+        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+        const int64_t n_embd_head_v = hparams.n_embd_head_v;
+        const int64_t n_ff          = hparams.n_ff();
+        const int64_t n_embd_gqa    = n_embd_v_gqa;
+        const int64_t n_vocab       = vocab.n_tokens();
+        const int64_t n_token_types = vocab.n_token_types();
+        const int64_t n_rot         = hparams.n_rot;
+        const int64_t n_expert      = hparams.n_expert;
+        const int64_t n_expert_used = hparams.n_expert_used;
+        const int64_t n_ctx_train   = hparams.n_ctx_train;
+
+        if (n_expert > 0 && hparams.n_expert_used == 0) {
+            throw std::runtime_error("model has expert layers but no expert layers are used");
+        }
+
+        int n_moved_tensors = 0;
+        ggml_tensor * first_moved_tensor = nullptr;
+        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+        ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
+
+        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
+            ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
+
+            if (!t_meta) {
+                if (flags & TENSOR_NOT_REQUIRED) {
+                    return nullptr;
+                }
+                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
+            }
+
+            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
+            // the tensor is duplicated
+            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
+            llm_tensor tn_tensor = tn.tensor;
+            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
+                tn_tensor = LLM_TENSOR_OUTPUT;
+            }
+
+            llm_tensor_info info;
+            try {
+                info = llm_tensor_info_for(tn_tensor);
+            } catch (const std::out_of_range & e) {
+                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+            }
+
+            // tensors with "bias" suffix are always used with GGML_OP_ADD
+            ggml_op op;
+            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
+            if (bias) {
+                op = GGML_OP_ADD;
+            } else {
+                op = info.op;
+            }
+
+            // sanity checks
+            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+                if (tn.bid != -1) {
+                    GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+                }
+            } else {
+                if (tn.bid == -1) {
+                    GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
+                }
+            }
+
+            // select the buffer type for this tensor
+            buft_list_t * buft_list;
+            switch (info.layer) {
+                case LLM_TENSOR_LAYER_INPUT:
+                    buft_list = pimpl->dev_input.buft_list;
+                    break;
+                case LLM_TENSOR_LAYER_OUTPUT:
+                    buft_list = pimpl->dev_output.buft_list;
+                    break;
+                case LLM_TENSOR_LAYER_REPEATING:
+                    buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
+                    break;
+                default:
+                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
+            }
+
+            ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+            if (!buft) {
+                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+            }
+
+            // avoid using a host buffer when using mmap
+            auto * buft_dev = ggml_backend_buft_get_device(buft);
+            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+            }
+
+            if (buft != buft_list->front().second) {
+                n_moved_tensors++;
+                if (!first_moved_tensor) {
+                    first_moved_tensor = t_meta;
+                    first_moved_from_buft = buft_list->front().second;
+                    first_moved_to_buft   = buft;
+                }
+            }
+
+            ggml_context * ctx = ctx_for_buft(buft);
+
+            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+            if (flags & TENSOR_DUPLICATED) {
+                ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+                if (t) {
+                    return t;
+                }
+            }
+            return ml.create_tensor(ctx, tn, ne, flags);
+        };
+
+        layers.resize(n_layer);
+
+        // TODO: move to a separate function
+        const auto tn = LLM_TN(arch);
+        switch (arch) {
+            case LLM_ARCH_LLAMA:
+            case LLM_ARCH_REFACT:
+            case LLM_ARCH_MINICPM:
+            case LLM_ARCH_GRANITE:
+            case LLM_ARCH_GRANITE_MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+                        else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+
+                        if (n_expert == 0) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                            // optional MLP bias
+                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                        } else {
+                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+                        }
+                    }
+                } break;
+            case LLM_ARCH_DECI:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
+                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
+                        const int64_t n_embd_gqa    = hparams.n_embd_v_gqa(i);
+                        const int64_t n_ff          = hparams.n_ff(i);
+                        const int64_t n_head        = hparams.n_head(i);
+                        const int64_t n_head_kv     = hparams.n_head_kv(i);
+
+                        if (n_head_kv == 0 && n_head > 0) {
+                            // linear attention for DeciLMCausalModel
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        }
+                        else if (n_head_kv > 0) {
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        }
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+                        else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                        // optional MLP bias
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_MINICPM3:
+                {
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+                    const int64_t q_lora_rank  = hparams.n_lora_q;
+                    const int64_t kv_lora_rank = hparams.n_lora_kv;
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    }
+                } break;
+            case LLM_ARCH_GROK:
+                {
+                    if (n_expert == 0) {
+                        throw std::runtime_error("Grok model cannot have zero experts");
+                    }
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_DBRX:
+                {
+                    if (n_expert == 0) {
+                        throw std::runtime_error("DBRX model cannot have zero experts");
+                    }
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                    }
+                } break;
+            case LLM_ARCH_BAICHUAN:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    {
+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_FALCON:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    {
+                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+
+                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                        if (!output) {
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_STARCODER:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
+
+                    // output
+                    {
+                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                        if (!output) {
+                            // needs to be on GPU
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                        }
+
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_BERT:
+            case LLM_ARCH_NOMIC_BERT:
+                {
+                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
+
+                    if (arch == LLM_ARCH_BERT) {
+                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
+
+                        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
+                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         TENSOR_NOT_REQUIRED);
+                    }
+
+                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        if (arch == LLM_ARCH_BERT) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i),   {n_embd}, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i),   {n_embd_gqa}, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i),   {n_embd_gqa}, 0);
+                        } else {
+                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        }
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN,      "weight", i), {n_ff, n_embd}, 0);
+
+                        if (arch == LLM_ARCH_BERT) {
+                            layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        } else {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        }
+
+                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_JINA_BERT_V2:
+                {
+                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
+                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
+
+                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); //LayerNorm bias
+
+                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
+                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i]; // JinaBertLayer
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
+
+                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
+
+                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0); //output_dens
+
+                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
+                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
+
+                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
+
+                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_BLOOM:
+                {
+                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
+                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_MPT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    if (!output) {
+                        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
+
+                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        // AWQ ScaleActivation layer
+                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_STABLELM:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        // optional bias tensors, present in Stable LM 2 1.6B
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+                        // optional q and k layernorms, present in StableLM 2 12B
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+                        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_QWEN:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd*3}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
+                    }
+                } break;
+            case LLM_ARCH_QWEN2:
+            case LLM_ARCH_QWEN2VL:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_QWEN2MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
+                        }
+
+                        // MoE branch
+                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                        // Shared expert branch
+                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
+                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
+                    }
+                } break;
+            case LLM_ARCH_PHI2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd}, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa}, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa}, 0);
+                        }
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_PHI3:
+                {
+                    const int64_t n_embd_head = n_embd / n_head;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
+
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    }
+                } break;
+            case LLM_ARCH_PHIMOE:
+                {
+                    const int64_t n_embd_head = n_embd / n_head;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
+                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
+                        }
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                     }
+                } break;
+            case LLM_ARCH_PLAMO:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GPT2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_CODESHELL:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_ORION:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_INTERNLM2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GEMMA:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GEMMA2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_STARCODER2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                        // optional bias tensors
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_MAMBA:
+                {
+                    const int64_t d_conv  = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
+                    const int64_t d_state = hparams.ssm_d_state;
+                    const int64_t dt_rank = hparams.ssm_dt_rank;
+
+                    // only an expansion factor of 2 is supported for now
+                    if (2 * n_embd != d_inner) {
+                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
+                    }
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // norm
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
+
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
+
+                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
+
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
+
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
+
+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_XVERSE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_COMMAND_R:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    // init output from the input tok embed
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (n_layer >= 64){
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+                        }
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_COHERE2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    // init output from the input tok embed
+                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
+                                                      TENSOR_DUPLICATED);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+                    }
+                }
+                break;
+            case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_OLMO2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_OLMOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0");
+                        }
+
+                        // MoE branch
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                    }
+                } break;
+            case LLM_ARCH_OPENELM:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    // init output from the input tok embed
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        const int64_t n_head      =   hparams.n_head(i);
+                        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
+                        const int64_t n_ff        =   hparams.n_ff(i);
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_GPTNEOX:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_ARCTIC:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+                    }
+                } break;
+            case LLM_ARCH_DEEPSEEK:
+                {
+
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
+            case LLM_ARCH_DEEPSEEK2:
+                {
+                    const bool is_lite = (hparams.n_layer == 27);
+
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+                    const int64_t q_lora_rank  = hparams.n_lora_q;
+                    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        if (!is_lite) {
+                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+                        }
+
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+                        if (!is_lite) {
+                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+                        } else {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        }
+
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
+            case LLM_ARCH_BITNET:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
+                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
+                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
+
+                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_scale   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_T5:
+                {
+                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        // this tensor seems to be unused in HF transformers implementation
+                        layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_T5ENCODER:
+                {
+                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_JAIS:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, 0);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_CHATGLM:
+                {
+                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        }
+
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
+
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_NEMOTRON:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+                        // optional MLP bias
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_EXAONE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_RWKV6:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // Block 0, LN0
+                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+                    const int head_size = hparams.wkv_head_size;
+                    const int attn_hidden_size = n_embd;
+                    const int ffn_size = hparams.n_ff_arr[0];
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
+
+                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
+
+                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
+                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
+
+                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
+                    }
+
+                } break;
+            case LLM_ARCH_RWKV6QWEN2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+                    const int head_size = hparams.wkv_head_size;
+                    const int attn_hidden_size = n_embd;
+                    const int n_head_kv = hparams.n_head_kv();
+                    int attn_key_value_size;
+                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
+                        attn_key_value_size = attn_hidden_size;
+                    } else {
+                        attn_key_value_size = n_head_kv * head_size;
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+
+                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
+                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
+                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+                        // optional bias tensors
+                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_CHAMELEON:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_WAVTOKENIZER_DEC:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
+
+                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
+                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);
+
+                    // posnet
+                    {
+                        const int64_t n_embd = hparams.posnet.n_embd;
+
+                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+                            auto & layer = layers[i].posnet;
+
+                            // posnet:
+                            //
+                            //  - resnet
+                            //  - resnet
+                            //  - attn
+                            //  - resnet
+                            //  - resnet
+                            //  - norm
+                            //
+                            switch (i) {
+                                case 0:
+                                case 1:
+                                case 3:
+                                case 4:
+                                    {
+                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                case 2:
+                                    {
+                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);
+
+                                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                case 5:
+                                    {
+                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                default: GGML_ABORT("unknown posnet layer");
+                            };
+                        }
+                    }
+
+                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);
+
+                    // convnext
+                    {
+                        const int64_t n_embd = hparams.convnext.n_embd;
+
+                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+                            auto & layer = layers[i].convnext;
+
+                            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
+                            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
+
+                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
+                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
+
+                            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
+                            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
+
+                            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
+                            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
+
+                            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+                        }
+
+                        // output
+                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    }
+
+                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
+                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
+                } break;
+            default:
+                throw std::runtime_error("unknown architecture");
+        }
+
+        if (n_moved_tensors > 0) {
+            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+                ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+        }
+    }
+
+    ml.done_getting_tensors();
+
+    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+    pimpl->mappings.reserve(ml.mappings.size());
+
+    // create the backend buffers
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
+    ctx_bufs.reserve(ctx_map.size());
+
+    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    pimpl->bufs.reserve(n_max_backend_buffer);
+
+    for (auto & it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx              = it.second;
+
+        // skip contexts without tensors
+        if (ggml_get_first_tensor(ctx) == nullptr) {
+            continue;
+        }
+
+        llama_buf_map buf_map;
+        buf_map.reserve(n_max_backend_buffer);
+
+        // check if it is possible to use buffer_from_host_ptr with this buffer type
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (!dev) {
+            // FIXME: workaround for CPU backend buft having a NULL device
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        }
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
+
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                // only the mmap region containing the tensors in the model is mapped to the backend buffer
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
+                void * addr = nullptr;
+                size_t first, last; // NOLINT
+                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
+                if (first >= last) {
+                    continue;
+                }
+                const size_t max_size = ggml_get_max_tensor_size(ctx);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                if (buf == nullptr) {
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                }
+                pimpl->bufs.emplace_back(buf);
+                buf_map.emplace(idx, buf);
+            }
+        }
+        else {
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            if (buf == nullptr) {
+                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+            }
+            pimpl->bufs.emplace_back(buf);
+            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+                pimpl->mlock_bufs.emplace_back(new llama_mlock);
+                auto & mlock_buf = pimpl->mlock_bufs.back();
+                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
+                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+            }
+            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                buf_map.emplace(idx, buf);
+            }
+        }
+
+        if (pimpl->bufs.empty()) {
+            throw std::runtime_error("failed to allocate buffer");
+        }
+
+        for (auto & buf : buf_map) {
+            // indicate that this buffer contains weights
+            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        }
+
+        ctx_bufs.emplace_back(ctx, buf_map);
+    }
+
+    if (llama_supports_gpu_offload()) {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+        }
+
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers       = hparams.n_layer + 1;
+
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+    }
+
+    // print memory requirements per buffer type
+    for (auto & buf : pimpl->bufs) {
+        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    }
+
+    // populate tensors_by_name
+    for (auto & ctx : pimpl->ctxs) {
+        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+        }
+    }
+
+    // load tensor data
+    for (auto & it : ctx_bufs) {
+        ggml_context * ctx = it.first;
+        auto & bufs = it.second;
+        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+            return false;
+        }
+    }
+
+    if (use_mmap_buffer) {
+        for (auto & mapping : ml.mappings) {
+            pimpl->mappings.emplace_back(std::move(mapping));
+        }
+    }
+
+    return true;
+}
+
+std::string llama_model::arch_name() const {
+    return llm_arch_name(arch);
+}
+
+std::string llama_model::type_name() const {
+    return llm_type_name(type);
+}
+
+std::string llama_model::desc() const {
+    return pimpl->desc_str;
+}
+
+size_t llama_model::size() const {
+    return pimpl->n_bytes;
+}
+
+size_t llama_model::max_nodes() const {
+    return std::max<size_t>(8192, tensors_by_name.size()*5);
+}
+
+size_t llama_model::n_devices() const {
+    return devices.size();
+}
+
+uint64_t llama_model::n_elements() const {
+    return pimpl->n_elements;
+}
+
+void llama_model::print_info() const {
+    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+
+    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
+        bool is_var = false;
+
+        std::vector<uint32_t> v;
+        for (uint32_t i = 0; i < n; ++i) {
+            v.push_back(f(i));
+            if (v[i] != v[0]) {
+                is_var = true;
+            }
+        }
+
+        std::stringstream ss;
+
+        if (is_var) {
+            ss << "[";
+            for (uint32_t i = 0; i < n; ++i) {
+                ss << v[i];
+                if (i < n - 1) {
+                    ss << ", ";
+                }
+            }
+            ss << "]";
+        } else {
+            ss << v[0];
+        }
+
+        return ss.str();
+    };
+
+    // hparams
+    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, arch_name().c_str());
+    LLAMA_LOG_INFO("%s: vocab_only       = %d\n",     __func__, hparams.vocab_only);
+
+    if (!hparams.vocab_only) {
+        LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
+        LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_head           = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
+        LLAMA_LOG_INFO("%s: n_swa            = %u\n",     __func__, hparams.n_swa);
+        LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k);
+        LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v);
+        LLAMA_LOG_INFO("%s: n_gqa            = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
+        LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+        LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
+        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
+        LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: n_ff             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
+        LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
+        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
+        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
+        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
+        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
+        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
+        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
+        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
+        LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
+        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
+        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
+        LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
+    }
+
+    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, type_name().c_str());
+    if (pimpl->n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+    } else if (pimpl->n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+    } else if (pimpl->n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, pimpl->n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, pimpl->n_elements*1e-3);
+    }
+
+    // general kv
+    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, name.c_str());
+
+    if (arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
+    }
+
+    if (arch == LLM_ARCH_DEEPSEEK2) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",     __func__, hparams.n_lora_q);
+        LLAMA_LOG_INFO("%s: n_lora_kv            = %d\n",     __func__, hparams.n_lora_kv);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
+    }
+
+    if (arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
+    }
+
+    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
+
+    vocab.print_info();
+}
+
+ggml_backend_dev_t llama_model::dev_layer(int il) const {
+    return pimpl->dev_layer.at(il).dev;
+}
+
+ggml_backend_dev_t llama_model::dev_output() const {
+    return pimpl->dev_output.dev;
+}
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
+    }
+
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+    return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
+    }
+
+    throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
+    return ::select_buft(
+            *pimpl->dev_layer.at(il).buft_list,
+            [&](ggml_context * ctx) {
+                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+                return ggml_add(ctx, cur, layer_dir);
+            });
+}
+
+const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
+    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
+            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == tensors_by_name.end()) {
+        return nullptr;
+    }
+
+    return it->second;
+}
+
+//
+// interface implementation
+//
+
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
+        /*.devices                     =*/ nullptr,
+        /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu                    =*/ 0,
+        /*.tensor_split                =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
+        /*.vocab_only                  =*/ false,
+        /*.use_mmap                    =*/ true,
+        /*.use_mlock                   =*/ false,
+        /*.check_tensors               =*/ false,
+    };
+
+#ifdef GGML_USE_METAL
+    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
+    result.n_gpu_layers = 999;
+#endif
+
+    return result;
+}
+
+const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model) {
+    return &model->vocab;
+}
+
+void llama_free_model(struct llama_model * model) {
+    llama_model_free(model);
+}
+
+void llama_model_free(struct llama_model * model) {
+    delete model;
+}
+
+int32_t llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
+int32_t llama_model_n_embd(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
+int32_t llama_model_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
+int32_t llama_model_n_head(const struct llama_model * model) {
+    return model->hparams.n_head();
+}
+
+// deprecated
+int32_t llama_n_ctx_train(const struct llama_model * model) {
+    return llama_model_n_ctx_train(model);
+}
+
+// deprecated
+int32_t llama_n_embd(const struct llama_model * model) {
+    return llama_model_n_embd(model);
+}
+
+// deprecated
+int32_t llama_n_layer(const struct llama_model * model) {
+    return llama_model_n_layer(model);
+}
+
+// deprecated
+int32_t llama_n_head(const struct llama_model * model) {
+    return llama_model_n_head(model);
+}
+
+enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
+    switch (model->arch) {
+        // these models do not use RoPE
+        case LLM_ARCH_GPT2:
+        case LLM_ARCH_GPTJ:
+        case LLM_ARCH_MPT:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
+        case LLM_ARCH_JAIS:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            return LLAMA_ROPE_TYPE_NONE;
+
+        // use what we call a normal RoPE, operating on pairs of consecutive head values
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
+        case LLM_ARCH_BAICHUAN:
+        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_ORION:
+        case LLM_ARCH_INTERNLM2:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
+        case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_COHERE2:
+        case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
+        case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
+            return LLAMA_ROPE_TYPE_NORM;
+
+        // the pairs of head values are offset by n_rot/2
+        case LLM_ARCH_FALCON:
+        case LLM_ARCH_GROK:
+        case LLM_ARCH_DBRX:
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_BITNET:
+        case LLM_ARCH_QWEN:
+        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO2:
+        case LLM_ARCH_OLMOE:
+        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_OPENELM:
+        case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_NEMOTRON:
+        case LLM_ARCH_EXAONE:
+        case LLM_ARCH_MINICPM3:
+            return LLAMA_ROPE_TYPE_NEOX;
+
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
+        // all model arches should be listed explicitly here
+        case LLM_ARCH_UNKNOWN:
+            GGML_ABORT("unknown architecture");
+    }
+
+    return LLAMA_ROPE_TYPE_NONE;
+}
+
+float llama_model_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s", model->desc().c_str());
+}
+
+uint64_t llama_model_size(const struct llama_model * model) {
+    return model->size();
+}
+
+const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
+    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        return nullptr;
+    }
+
+    return it->second.c_str();
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    return model->n_elements();
+}
+
+bool llama_model_has_encoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5:        return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default:                 return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default:                 return true;
+    }
+}
+
+llama_token llama_model_decoder_start_token(const struct llama_model * model) {
+    return model->hparams.dec_start_token_id;
+}
+
+bool llama_model_is_recurrent(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_MAMBA:  return true;
+        case LLM_ARCH_RWKV6:  return true;
+        case LLM_ARCH_RWKV6QWEN2: return true;
+        default:              return false;
+    }
+}
diff --git a/src/llama-model.h b/src/llama-model.h
new file mode 100644
index 000000000..a7c304447
--- /dev/null
+++ b/src/llama-model.h
@@ -0,0 +1,370 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+#include "llama-hparams.h"
+#include "llama-vocab.h"
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct llama_model_loader;
+
+// available models
+enum llm_type {
+    LLM_TYPE_UNKNOWN,
+    LLM_TYPE_14M,
+    LLM_TYPE_17M,
+    LLM_TYPE_22M,
+    LLM_TYPE_33M,
+    LLM_TYPE_60M,
+    LLM_TYPE_70M,
+    LLM_TYPE_80M,
+    LLM_TYPE_109M,
+    LLM_TYPE_137M,
+    LLM_TYPE_160M,
+    LLM_TYPE_220M,
+    LLM_TYPE_250M,
+    LLM_TYPE_270M,
+    LLM_TYPE_335M,
+    LLM_TYPE_410M,
+    LLM_TYPE_450M,
+    LLM_TYPE_770M,
+    LLM_TYPE_780M,
+    LLM_TYPE_0_5B,
+    LLM_TYPE_1B,
+    LLM_TYPE_1_3B,
+    LLM_TYPE_1_4B,
+    LLM_TYPE_1_5B,
+    LLM_TYPE_1_6B,
+    LLM_TYPE_2B,
+    LLM_TYPE_2_8B,
+    LLM_TYPE_3B,
+    LLM_TYPE_4B,
+    LLM_TYPE_6B,
+    LLM_TYPE_6_9B,
+    LLM_TYPE_7B,
+    LLM_TYPE_8B,
+    LLM_TYPE_9B,
+    LLM_TYPE_11B,
+    LLM_TYPE_12B,
+    LLM_TYPE_13B,
+    LLM_TYPE_14B,
+    LLM_TYPE_15B,
+    LLM_TYPE_16B,
+    LLM_TYPE_20B,
+    LLM_TYPE_30B,
+    LLM_TYPE_32B,
+    LLM_TYPE_34B,
+    LLM_TYPE_35B,
+    LLM_TYPE_40B,
+    LLM_TYPE_65B,
+    LLM_TYPE_70B,
+    LLM_TYPE_236B,
+    LLM_TYPE_314B,
+    LLM_TYPE_671B,
+    LLM_TYPE_SMALL,
+    LLM_TYPE_MEDIUM,
+    LLM_TYPE_LARGE,
+    LLM_TYPE_XL,
+    LLM_TYPE_A1_7B,
+    LLM_TYPE_A2_7B,
+    LLM_TYPE_8x7B,
+    LLM_TYPE_8x22B,
+    LLM_TYPE_16x12B,
+    LLM_TYPE_16x3_8B,
+    LLM_TYPE_10B_128x3_66B,
+    LLM_TYPE_57B_A14B,
+    LLM_TYPE_27B,
+};
+
+struct llama_layer_posnet {
+    // resnet
+    struct ggml_tensor * norm1   = nullptr;
+    struct ggml_tensor * norm1_b = nullptr;
+
+    struct ggml_tensor * conv1   = nullptr;
+    struct ggml_tensor * conv1_b = nullptr;
+
+    struct ggml_tensor * norm2   = nullptr;
+    struct ggml_tensor * norm2_b = nullptr;
+
+    struct ggml_tensor * conv2   = nullptr;
+    struct ggml_tensor * conv2_b = nullptr;
+
+    // attention
+    struct ggml_tensor * attn_norm   = nullptr;
+    struct ggml_tensor * attn_norm_b = nullptr;
+
+    struct ggml_tensor * attn_q   = nullptr;
+    struct ggml_tensor * attn_q_b = nullptr;
+
+    struct ggml_tensor * attn_k   = nullptr;
+    struct ggml_tensor * attn_k_b = nullptr;
+
+    struct ggml_tensor * attn_v   = nullptr;
+    struct ggml_tensor * attn_v_b = nullptr;
+
+    struct ggml_tensor * attn_o   = nullptr;
+    struct ggml_tensor * attn_o_b = nullptr;
+
+    // normalize
+    struct ggml_tensor * norm   = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext {
+    struct ggml_tensor * dw   = nullptr;
+    struct ggml_tensor * dw_b = nullptr;
+
+    struct ggml_tensor * norm   = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+
+    struct ggml_tensor * pw1   = nullptr;
+    struct ggml_tensor * pw1_b = nullptr;
+
+    struct ggml_tensor * pw2   = nullptr;
+    struct ggml_tensor * pw2_b = nullptr;
+
+    struct ggml_tensor * gamma = nullptr;
+};
+
+struct llama_layer {
+    // normalization
+    struct ggml_tensor * attn_norm       = nullptr;
+    struct ggml_tensor * attn_norm_b     = nullptr;
+    struct ggml_tensor * attn_norm_2     = nullptr;
+    struct ggml_tensor * attn_norm_2_b   = nullptr;
+    struct ggml_tensor * attn_q_norm     = nullptr;
+    struct ggml_tensor * attn_q_norm_b   = nullptr;
+    struct ggml_tensor * attn_k_norm     = nullptr;
+    struct ggml_tensor * attn_k_norm_b   = nullptr;
+    struct ggml_tensor * attn_out_norm   = nullptr;
+    struct ggml_tensor * attn_out_norm_b = nullptr;
+    struct ggml_tensor * attn_q_a_norm   = nullptr;
+    struct ggml_tensor * attn_kv_a_norm  = nullptr;
+    struct ggml_tensor * attn_sub_norm   = nullptr;
+    struct ggml_tensor * attn_post_norm  = nullptr;
+    struct ggml_tensor * ffn_sub_norm    = nullptr;
+    struct ggml_tensor * attn_norm_cross = nullptr;
+    struct ggml_tensor * attn_norm_enc   = nullptr;
+
+    // attention
+    struct ggml_tensor * wq        = nullptr;
+    struct ggml_tensor * wk        = nullptr;
+    struct ggml_tensor * wv        = nullptr;
+    struct ggml_tensor * wo        = nullptr;
+    struct ggml_tensor * wqkv      = nullptr;
+    struct ggml_tensor * wq_a      = nullptr;
+    struct ggml_tensor * wq_b      = nullptr;
+    struct ggml_tensor * wkv_a_mqa = nullptr;
+    struct ggml_tensor * wkv_b     = nullptr;
+    struct ggml_tensor * wq_cross  = nullptr;
+    struct ggml_tensor * wk_cross  = nullptr;
+    struct ggml_tensor * wv_cross  = nullptr;
+    struct ggml_tensor * wo_cross  = nullptr;
+    struct ggml_tensor * wq_enc    = nullptr;
+    struct ggml_tensor * wk_enc    = nullptr;
+    struct ggml_tensor * wv_enc    = nullptr;
+    struct ggml_tensor * wo_enc    = nullptr;
+
+    // attention bias
+    struct ggml_tensor * bq   = nullptr;
+    struct ggml_tensor * bk   = nullptr;
+    struct ggml_tensor * bv   = nullptr;
+    struct ggml_tensor * bo   = nullptr;
+    struct ggml_tensor * bqkv = nullptr;
+
+    // relative position bias
+    struct ggml_tensor * attn_rel_b       = nullptr;
+    struct ggml_tensor * attn_rel_b_enc   = nullptr;
+    struct ggml_tensor * attn_rel_b_cross = nullptr;
+
+    // normalization
+    struct ggml_tensor * ffn_norm         = nullptr;
+    struct ggml_tensor * ffn_norm_b       = nullptr;
+    struct ggml_tensor * ffn_post_norm    = nullptr;
+    struct ggml_tensor * layer_out_norm   = nullptr;
+    struct ggml_tensor * layer_out_norm_b = nullptr;
+    struct ggml_tensor * ffn_norm_exps    = nullptr;
+    struct ggml_tensor * ffn_norm_enc     = nullptr;
+
+    // ff
+    struct ggml_tensor * ffn_gate     = nullptr; // w1
+    struct ggml_tensor * ffn_down     = nullptr; // w2
+    struct ggml_tensor * ffn_up       = nullptr; // w3
+    struct ggml_tensor * ffn_gate_enc = nullptr;
+    struct ggml_tensor * ffn_down_enc = nullptr;
+    struct ggml_tensor * ffn_up_enc   = nullptr;
+
+    // ff MoE
+    struct ggml_tensor * ffn_gate_inp  = nullptr;
+    struct ggml_tensor * ffn_gate_exps = nullptr;
+    struct ggml_tensor * ffn_down_exps = nullptr;
+    struct ggml_tensor * ffn_up_exps   = nullptr;
+
+    // ff shared expert (shexp)
+    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
+    struct ggml_tensor * ffn_gate_shexp     = nullptr;
+    struct ggml_tensor * ffn_down_shexp     = nullptr;
+    struct ggml_tensor * ffn_up_shexp       = nullptr;
+
+    // ff bias
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b   = nullptr; // b3
+    struct ggml_tensor * ffn_act    = nullptr;
+    struct ggml_tensor * ffn_exp_probs_b = nullptr;
+
+    // mamba proj
+    struct ggml_tensor * ssm_in  = nullptr;
+    struct ggml_tensor * ssm_x   = nullptr;
+    struct ggml_tensor * ssm_dt  = nullptr;
+    struct ggml_tensor * ssm_out = nullptr;
+
+    // mamba
+    struct ggml_tensor * ssm_conv1d = nullptr;
+    struct ggml_tensor * ssm_a      = nullptr;
+    struct ggml_tensor * ssm_d      = nullptr;
+
+    // mamba bias
+    struct ggml_tensor * ssm_conv1d_b = nullptr;
+    struct ggml_tensor * ssm_dt_b     = nullptr;
+
+    // rwkv
+    struct ggml_tensor * time_mix_w1         = nullptr;
+    struct ggml_tensor * time_mix_w2         = nullptr;
+    struct ggml_tensor * time_mix_lerp_x     = nullptr;
+    struct ggml_tensor * time_mix_lerp_w     = nullptr;
+    struct ggml_tensor * time_mix_lerp_k     = nullptr;
+    struct ggml_tensor * time_mix_lerp_v     = nullptr;
+    struct ggml_tensor * time_mix_lerp_r     = nullptr;
+    struct ggml_tensor * time_mix_lerp_g     = nullptr;
+    struct ggml_tensor * time_mix_lerp_fused = nullptr;
+
+    struct ggml_tensor * time_mix_first        = nullptr;
+    struct ggml_tensor * time_mix_decay        = nullptr;
+    struct ggml_tensor * time_mix_decay_w1     = nullptr;
+    struct ggml_tensor * time_mix_decay_w2     = nullptr;
+    struct ggml_tensor * time_mix_key          = nullptr;
+    struct ggml_tensor * time_mix_key_b        = nullptr;
+    struct ggml_tensor * time_mix_value        = nullptr;
+    struct ggml_tensor * time_mix_value_b      = nullptr;
+    struct ggml_tensor * time_mix_receptance   = nullptr;
+    struct ggml_tensor * time_mix_receptance_b = nullptr;
+    struct ggml_tensor * time_mix_gate         = nullptr;
+
+    struct ggml_tensor * time_mix_ln     = nullptr;
+    struct ggml_tensor * time_mix_ln_b   = nullptr;
+    struct ggml_tensor * time_mix_output = nullptr;
+
+    struct ggml_tensor * channel_mix_lerp_k = nullptr;
+    struct ggml_tensor * channel_mix_lerp_r = nullptr;
+
+    struct ggml_tensor * channel_mix_key        = nullptr;
+    struct ggml_tensor * channel_mix_receptance = nullptr;
+    struct ggml_tensor * channel_mix_value      = nullptr;
+
+    // long rope factors
+    struct ggml_tensor * rope_long  = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
+    struct ggml_tensor * rope_freqs = nullptr;
+
+    // bitnet scale
+    struct ggml_tensor * wq_scale       = nullptr;
+    struct ggml_tensor * wk_scale       = nullptr;
+    struct ggml_tensor * wv_scale       = nullptr;
+    struct ggml_tensor * wo_scale       = nullptr;
+    struct ggml_tensor * ffn_gate_scale = nullptr;
+    struct ggml_tensor * ffn_up_scale   = nullptr;
+    struct ggml_tensor * ffn_down_scale = nullptr;
+
+    struct llama_layer_posnet posnet;
+
+    struct llama_layer_convnext convnext;
+};
+
+struct llama_model {
+    llm_type type = LLM_TYPE_UNKNOWN;
+    llm_arch arch = LLM_ARCH_UNKNOWN;
+
+    std::string name = "n/a";
+
+    llama_hparams hparams = {};
+    llama_vocab   vocab;
+
+    struct ggml_tensor * tok_embd   = nullptr;
+    struct ggml_tensor * type_embd  = nullptr;
+    struct ggml_tensor * pos_embd   = nullptr;
+    struct ggml_tensor * tok_norm   = nullptr;
+    struct ggml_tensor * tok_norm_b = nullptr;
+
+    struct ggml_tensor * output_norm     = nullptr;
+    struct ggml_tensor * output_norm_b   = nullptr;
+    struct ggml_tensor * output          = nullptr;
+    struct ggml_tensor * output_b        = nullptr;
+    struct ggml_tensor * output_norm_enc = nullptr;
+
+    // classifier
+    struct ggml_tensor * cls       = nullptr;
+    struct ggml_tensor * cls_b     = nullptr;
+    struct ggml_tensor * cls_out   = nullptr;
+    struct ggml_tensor * cls_out_b = nullptr;
+
+    struct ggml_tensor * conv1d   = nullptr;
+    struct ggml_tensor * conv1d_b = nullptr;
+
+    std::vector<llama_layer> layers;
+
+    llama_model_params params;
+
+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
+    // list of devices used in this model
+    std::vector<ggml_backend_dev_t> devices;
+
+    // for quantize-stats only
+    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+
+    int64_t t_load_us  = 0;
+    int64_t t_start_us = 0;
+
+    explicit llama_model(const struct llama_model_params & params);
+    ~llama_model();
+
+    void load_stats  (llama_model_loader & ml);
+    void load_arch   (llama_model_loader & ml);
+    void load_hparams(llama_model_loader & ml);
+    void load_vocab  (llama_model_loader & ml);
+    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+
+    std::string arch_name() const;
+    std::string type_name() const;
+
+    std::string desc() const;
+
+    size_t size() const;
+    size_t max_nodes() const;
+    size_t n_devices() const;
+
+    // total number of parameters in the model
+    uint64_t n_elements() const;
+
+    void print_info() const;
+
+    ggml_backend_dev_t dev_layer(int il) const;
+    ggml_backend_dev_t dev_output() const;
+
+    ggml_backend_buffer_type_t select_buft(int il) const;
+
+    const struct ggml_tensor * get_tensor(const char * name) const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+const char * llm_type_name(llm_type type);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
new file mode 100644
index 000000000..fb7982655
--- /dev/null
+++ b/src/llama-quant.cpp
@@ -0,0 +1,934 @@
+#include "llama-quant.h"
+
+#include "llama-impl.h"
+#include "llama-model.h"
+#include "llama-model-loader.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <cinttypes>
+#include <fstream>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+struct quantize_state_impl {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+
+    int n_attention_wv = 0;
+    int n_ffn_down     = 0;
+    int n_ffn_gate     = 0;
+    int n_ffn_up       = 0;
+    int i_attention_wv = 0;
+    int i_ffn_down     = 0;
+    int i_ffn_gate     = 0;
+    int i_ffn_up       = 0;
+
+    int n_k_quantized = 0;
+    int n_fallback    = 0;
+
+    bool has_imatrix = false;
+
+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output = false;
+
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+        {}
+};
+
+static void llama_tensor_dequantize_impl(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
+    }
+    float * f32_output = (float *) output.data();
+
+    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
+    if (ggml_is_quantized(tensor->type)) {
+        if (qtype->to_float == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
+        }
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor->type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor->type)) {
+            qtype->to_float(tensor->data, f32_output, nelements);
+        } else {
+            GGML_ABORT("fatal error"); // unreachable
+        }
+        return;
+    }
+
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
+    size_t block_size_bytes = ggml_type_size(tensor->type);
+
+    GGML_ASSERT(nelements % block_size == 0);
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;
+
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else if (typ == GGML_TYPE_BF16) {
+                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype->to_float(inbuf, outbuf, nels);
+            }
+        };
+        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+}
+
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    const std::string name = ggml_get_name(tensor);
+
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const llm_arch arch = qs.model.arch;
+    const auto       tn = LLM_TN(arch);
+
+    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
+        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
+    };
+    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+        if (n_expert > 1) {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+            }
+        }
+        return std::make_pair(i_layer, n_layer);
+    };
+
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
+            const int64_t nx = tensor->ne[0];
+            const int64_t qk_k = ggml_blck_size(new_type);
+
+            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M   ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+        }
+    } else if (name == "token_embd.weight") {
+        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->token_embedding_type;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                new_type = GGML_TYPE_Q2_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+        }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+        if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (name.find("ffn_down") != std::string::npos) {
+            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_down;
+        }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert == 8) {
+                new_type = GGML_TYPE_Q5_K;
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+            }
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs.model.type == LLM_TYPE_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        ++qs.i_attention_wv;
+    } else if (name.find("attn_k.weight") != std::string::npos) {
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
+        }
+    } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
+        }
+    } else if (name.find("ffn_down") != std::string::npos) {
+        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
+        ++qs.i_ffn_down;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (arch != LLM_ARCH_FALCON) {
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
+                    new_type = GGML_TYPE_Q5_K;
+                }
+            } else {
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
+            }
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate") != std::string::npos) {
+        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        ++qs.i_ffn_gate;
+    }
+    else if (name.find("ffn_up") != std::string::npos) {
+        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        ++qs.i_ffn_up;
+    }
+
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    {
+        const int64_t nx = tensor->ne[0];
+        const int64_t ny = tensor->ne[1];
+        const int64_t qk_k = ggml_blck_size(new_type);
+
+        if (nx % qk_k != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+
+    if (convert_incompatible_tensor) {
+        switch (new_type) {
+            case GGML_TYPE_TQ1_0:
+            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_IQ1_M:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        }
+        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+            new_type = GGML_TYPE_F16;
+        }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+
+    return new_type;
+}
+
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+    if (nthread < 2) {
+        // single-thread
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+        return new_size;
+    }
+
+    std::mutex mutex;
+    int64_t counter = 0;
+    size_t new_size = 0;
+    bool valid = true;
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
+            nrows, n_per_row, imatrix]() {
+        const int64_t nrows_per_chunk = chunk_size / n_per_row;
+        size_t local_size = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int64_t first_row = counter; counter += nrows_per_chunk;
+            if (first_row >= nrows) {
+                if (local_size > 0) {
+                    new_size += local_size;
+                }
+                break;
+            }
+            lock.unlock();
+            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            local_size += this_size;
+
+            // validate the quantized data
+            const size_t row_size  = ggml_row_size(new_type, n_per_row);
+            void * this_data = (char *) new_data + first_row * row_size;
+            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                valid = false;
+                break;
+            }
+        }
+    };
+    for (int it = 0; it < nthread - 1; ++it) {
+        workers.emplace_back(compute);
+    }
+    compute();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+    if (!valid) {
+        throw std::runtime_error("quantized data validation failed");
+    }
+    return new_size;
+}
+
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+    ggml_type default_type;
+    llama_ftype ftype = params->ftype;
+
+    switch (params->ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
+        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
+
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;
+
+        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+    }
+
+    int nthread = params->nthread;
+
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+
+    std::vector<std::string> splits = {};
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    ml.init_mappings(false); // no prefetching
+
+    llama_model model(llama_model_default_params());
+
+    model.load_arch   (ml);
+    model.load_hparams(ml);
+    model.load_stats  (ml);
+
+    struct quantize_state_impl qs(model, params);
+
+    if (params->only_copy) {
+        ftype = ml.ftype;
+    }
+    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+    if (params->imatrix) {
+        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        if (imatrix_data) {
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+            qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
+        }
+    }
+
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
+    gguf_context_ptr ctx_out { gguf_init_empty() };
+
+    // copy the KV pairs from the input file
+    gguf_set_kv     (ctx_out.get(), ml.meta.get());
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
+
+    // Remove split metadata
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
+
+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (const auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
+            } else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
+
+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
+    for (const auto & it : ml.weights_map) {
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;
+
+        const std::string name = ggml_get_name(tensor);
+
+        // TODO: avoid hardcoded tensor names - use the TN_* constants
+        if (name.find("attn_v.weight")   != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
+            ++qs.n_attention_wv;
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs.has_output = true;
+        }
+    }
+
+    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
+
+    // sanity checks for models that have attention layers
+    if (qs.n_attention_wv != 0)
+    {
+        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
+        // attention layers have a non-zero number of kv heads
+        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        if (llama_model_has_encoder(&model)) {
+            n_attn_layer *= 3;
+        }
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+    }
+
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+
+    std::vector<std::thread> workers;
+    workers.reserve(nthread);
+
+    int idx = 0;
+
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
+
+    uint16_t n_split = 1;
+
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
+        }
+    }
+    std::vector<gguf_context_ptr> ctx_outs(n_split);
+    ctx_outs[0] = std::move(ctx_out);
+
+    // populate the original tensors so we get an initial meta data
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
+        if (!ctx_outs[i_split]) {
+            ctx_outs[i_split].reset(gguf_init_empty());
+        }
+        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
+    }
+
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
+
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
+            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            std::vector<char> split_path(llama_path_max(), 0);
+            llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path.data());
+        }
+
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
+
+    const auto tn = LLM_TN(model.arch);
+    new_ofstream(0);
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight.idx);
+        }
+
+        const std::string name = ggml_get_name(tensor);
+
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
+        }
+        ml.load_data_for(tensor);
+
+        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
+               ++idx, ml.n_tensors,
+               ggml_get_name(tensor),
+               llama_format_tensor_shape(tensor).c_str(),
+               ggml_type_name(tensor->type));
+
+        // This used to be a regex, but <regex> has an extreme cost to compile times.
+        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+
+        // quantize only 2D and 3D tensors (experts)
+        quantize &= (ggml_n_dims(tensor) >= 2);
+
+        // do not quantize norm tensors
+        quantize &= name.find("_norm.weight") == std::string::npos;
+
+        quantize &= params->quantize_output_tensor || name != "output.weight";
+        quantize &= !params->only_copy;
+
+        // do not quantize expert gating tensors
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+
+        // do not quantize positional embeddings and token types (BERT)
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
+
+        // do not quantize Mamba's small yet 2D weights
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+
+        // do not quantize RWKV's time_mix_first tensors
+        quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+
+        // do not quantize relative position bias (T5)
+        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+
+        enum ggml_type new_type;
+        void * new_data;
+        size_t new_size;
+
+        if (quantize) {
+            new_type = default_type;
+
+            // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (!params->pure && ggml_is_quantized(default_type)) {
+                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+            }
+            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+                new_type = params->token_embedding_type;
+            }
+            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+                new_type = params->output_tensor_type;
+            }
+
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
+            const int64_t nelements = ggml_nelements(tensor);
+
+            const float * imatrix = nullptr;
+            if (imatrix_data) {
+                auto it = imatrix_data->find(tensor->name);
+                if (it == imatrix_data->end()) {
+                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+                } else {
+                    if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
+                        imatrix = it->second.data();
+                    } else {
+                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+                        // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+                        // this is a significant error and it may be good idea to abort the process if this happens,
+                        // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+                        // tok_embd should be ignored in this case, since it always causes this warning
+                        if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+                        }
+                    }
+                }
+            }
+            if ((new_type == GGML_TYPE_IQ2_XXS ||
+                 new_type == GGML_TYPE_IQ2_XS  ||
+                 new_type == GGML_TYPE_IQ2_S   ||
+                 new_type == GGML_TYPE_IQ1_S   ||
+                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight"))  ||
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+                LLAMA_LOG_ERROR("\n\n============================================================\n");
+                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+                LLAMA_LOG_ERROR("============================================================\n\n");
+                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+            }
+
+            float * f32_data;
+
+            if (tensor->type == GGML_TYPE_F32) {
+                f32_data = (float *) tensor->data;
+            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
+            } else {
+                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.data();
+            }
+
+            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+            fflush(stdout);
+
+            if (work.size() < (size_t)nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
+            new_data = work.data();
+
+            const int64_t n_per_row = tensor->ne[0];
+            const int64_t nrows = tensor->ne[1];
+
+            static const int64_t min_chunk_size = 32 * 512;
+            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
+
+            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+
+            // quantize each expert separately since they have different importance matrices
+            new_size = 0;
+            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+
+                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+            }
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+        }
+        total_size_org += ggml_nbytes(tensor);
+        total_size_new += new_size;
+
+        // update the gguf meta data as we go
+        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
+    }
+    close_ofstream();
+
+    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+}
+
+//
+// interface implementation
+//
+
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                     =*/ 0,
+        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
+        /*.allow_requantize            =*/ false,
+        /*.quantize_output_tensor      =*/ true,
+        /*.only_copy                   =*/ false,
+        /*.pure                        =*/ false,
+        /*.keep_split                  =*/ false,
+        /*.imatrix                     =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
+    };
+
+    return result;
+}
+
+uint32_t llama_model_quantize(
+        const char * fname_inp,
+        const char * fname_out,
+        const llama_model_quantize_params * params) {
+    try {
+        llama_model_quantize_impl(fname_inp, fname_out, params);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/src/llama-quant.h b/src/llama-quant.h
new file mode 100644
index 000000000..6f70f09be
--- /dev/null
+++ b/src/llama-quant.h
@@ -0,0 +1 @@
+#pragma once
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
new file mode 100644
index 000000000..990b61297
--- /dev/null
+++ b/src/llama-sampling.cpp
@@ -0,0 +1,2457 @@
+#include "llama-sampling.h"
+
+#include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-grammar.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cfloat>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <numeric>
+#include <random>
+#include <unordered_map>
+#include <stdexcept>
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    //T & operator[](size_t i) {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    //const T & at(size_t i) const {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+
+    std::vector<T> data;
+};
+
+static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
+    // iterator for the probabilities
+#ifdef __GNUC__
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+#endif
+
+    struct probs_iterator {
+        typedef std::input_iterator_tag iterator_category;
+        typedef float value_type;
+        typedef float * pointer;
+        typedef float & reference;
+        typedef ptrdiff_t difference_type;
+
+        const llama_token_data * data;
+
+        bool operator==(const probs_iterator & other) const { return data == other.data; }
+        bool operator!=(const probs_iterator & other) const { return data != other.data; }
+        const float & operator*() const { return data->p; }
+        probs_iterator & operator++() { ++data; return *this; }
+        probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; }
+    };
+
+#ifdef __GNUC__
+    #pragma GCC diagnostic pop
+#endif
+
+    std::discrete_distribution<int> dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size});
+
+    return dist(rng);
+}
+
+/*
+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+*/
+
+static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+    if (temp <= 0.0f) {
+        // find the token with the highest logit and set the rest to -inf
+        size_t max_i = 0;
+        float  max_l = cur_p->data[0].logit;
+
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            if (cur_p->data[i    ].logit > max_l) {
+                cur_p->data[max_i].logit = -INFINITY;
+                max_i = i;
+                max_l = cur_p->data[i].logit;
+            } else {
+                cur_p->data[i].logit = -INFINITY;
+            }
+        }
+
+        return;
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= temp;
+    }
+}
+
+static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
+    GGML_ASSERT(cur_p->size > 0);
+
+    // Sort the logits in descending order
+    if (!cur_p->sorted) {
+        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        cur_p->sorted = true;
+    }
+
+    float max_l = cur_p->data[0].logit;
+    float cum_sum = 0.0f;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float p = expf(cur_p->data[i].logit - max_l);
+        cur_p->data[i].p = p;
+        cum_sum += p;
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum;
+    }
+}
+
+static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
+    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
+    // if (k >= (int32_t)cur_p->size) {
+    //     return;
+    // }
+
+    if (k <= 0) {
+        k = cur_p->size;
+    }
+
+    k = std::min(k, (int) cur_p->size);
+
+    // Sort scores in descending order
+    if (!cur_p->sorted) {
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        };
+        if (k <= 128) {
+            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
+        } else {
+            constexpr int   nbuckets     = 128;
+            constexpr float bucket_low   = -10.0f;
+            constexpr float bucket_high  =  10.0f;
+            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+            constexpr float bucket_inter = -bucket_low * bucket_scale;
+
+            std::vector<int> bucket_idx(cur_p->size);
+            std::vector<int> histo(nbuckets, 0);
+
+            for (int i = 0; i < (int)cur_p->size; ++i) {
+                const float val = cur_p->data[i].logit;
+                int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+                ib = std::max(0, std::min(nbuckets - 1, ib));
+                bucket_idx[i] = ib;
+                ++histo[ib];
+            }
+            int nhave = 0;
+            int ib = nbuckets - 1;
+            for ( ; ib >= 0; --ib) {
+                nhave += histo[ib];
+                if (nhave >= k) {
+                    break;
+                }
+            }
+            std::vector<llama_token_data> tmp_tokens(nhave);
+            auto * ptr = tmp_tokens.data();
+            std::vector<llama_token_data*> bucket_ptrs;
+            bucket_ptrs.reserve(nbuckets - ib);
+            for (int j = nbuckets - 1; j >= ib; --j) {
+                bucket_ptrs.push_back(ptr);
+                ptr += histo[j];
+            }
+            for (int i = 0; i < (int)cur_p->size; ++i) {
+                int j = bucket_idx[i];
+                if (j >= ib) {
+                    *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
+                }
+            }
+
+            ptr = tmp_tokens.data();
+            int ndone = 0;
+            for (int j = nbuckets - 1; j > ib; --j) {
+                std::sort(ptr, ptr + histo[j], comp);
+                ptr += histo[j];
+                ndone += histo[j];
+            }
+            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+
+            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+
+        }
+        cur_p->sorted = true;
+    }
+    cur_p->size = k;
+}
+
+static uint32_t get_rng_seed(uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        // use system clock if std::random_device is not a true RNG
+        static bool is_rd_prng = std::random_device().entropy() == 0;
+        if (is_rd_prng) {
+            return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
+        }
+        std::random_device rd;
+        return rd();
+    }
+    return seed;
+}
+
+// llama_sampler API
+
+struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
+    return new llama_sampler {
+        /* .iface = */ iface,
+        /* .ctx   = */ ctx,
+    };
+}
+
+const char * llama_sampler_name(const struct llama_sampler * smpl) {
+    if (!smpl->iface) {
+        return "(null)";
+    }
+
+    return smpl->iface->name(smpl);
+}
+
+void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+    if (smpl->iface->accept) {
+        smpl->iface->accept(smpl, token);
+    }
+}
+
+void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+    GGML_ASSERT(smpl->iface->apply);
+    smpl->iface->apply(smpl, cur_p);
+}
+
+void llama_sampler_reset(struct llama_sampler * smpl) {
+    if (smpl->iface->reset) {
+        smpl->iface->reset(smpl);
+    }
+}
+
+struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+    if (smpl->iface->clone) {
+        return smpl->iface->clone(smpl);
+    }
+
+    if (smpl->ctx == nullptr) {
+        return llama_sampler_init(
+            /* .iface = */ smpl->iface,
+            /* .ctx   = */ nullptr
+        );
+    }
+
+    GGML_ABORT("the sampler does not support cloning");
+}
+
+void llama_sampler_free(struct llama_sampler * smpl) {
+    if (smpl == nullptr) {
+        return;
+    }
+
+    if (smpl->iface->free) {
+        smpl->iface->free(smpl);
+    }
+
+    delete smpl;
+}
+
+llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
+    const auto * logits = llama_get_logits_ith(ctx, idx);
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    // TODO: do not allocate each time
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = {
+        /* .data       = */ cur.data(),
+        /* .size       = */ cur.size(),
+        /* .selected   = */ -1,
+        /* .sorted     = */ false,
+    };
+
+    llama_sampler_apply(smpl, &cur_p);
+
+    GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+    auto token = cur_p.data[cur_p.selected].id;
+
+    llama_sampler_accept(smpl, token);
+
+    return token;
+}
+
+// sampler chain
+
+static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
+    return "chain";
+}
+
+static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    time_meas tm(chain->t_sample_us, chain->params.no_perf);
+
+    for (auto * smpl : chain->samplers) {
+        llama_sampler_accept(smpl, token);
+    }
+
+    chain->n_sample++;
+}
+
+static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    time_meas tm(chain->t_sample_us, chain->params.no_perf);
+
+    for (auto * smpl : chain->samplers) {
+        llama_sampler_apply(smpl, cur_p);
+    }
+}
+
+static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    for (auto * smpl : chain->samplers) {
+        llama_sampler_reset(smpl);
+    }
+
+    chain->t_sample_us = 0;
+    chain->n_sample    = 0;
+}
+
+static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
+    const auto * chain_src = (const llama_sampler_chain *) smpl->ctx;
+
+    auto * result = llama_sampler_chain_init(chain_src->params);
+
+    for (auto * smpl : chain_src->samplers) {
+        llama_sampler_chain_add(result, llama_sampler_clone(smpl));
+    }
+
+    return result;
+}
+
+static void llama_sampler_chain_free(struct llama_sampler * smpl) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    for (auto * smpl : chain->samplers) {
+        llama_sampler_free(smpl);
+    }
+
+    delete chain;
+}
+
+static struct llama_sampler_i llama_sampler_chain_i = {
+    /* .name   = */ llama_sampler_chain_name,
+    /* .accept = */ llama_sampler_chain_accept,
+    /* .apply  = */ llama_sampler_chain_apply,
+    /* .reset  = */ llama_sampler_chain_reset,
+    /* .clone  = */ llama_sampler_chain_clone,
+    /* .free   = */ llama_sampler_chain_free,
+};
+
+struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_chain_i,
+        /* .ctx   = */ new llama_sampler_chain {
+            /* .params      = */ params,
+            /* .samplers    = */ {},
+            /* .t_sample_us = */ 0,
+            /* .n_sample    = */ 0,
+        }
+    );
+}
+
+void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
+    auto * p = (llama_sampler_chain *) chain->ctx;
+    p->samplers.push_back(smpl);
+}
+
+struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
+    const auto * p = (const llama_sampler_chain *) chain->ctx;
+
+    if (i < 0 || (size_t) i >= p->samplers.size()) {
+        return nullptr;
+    }
+
+    return p->samplers[i];
+}
+
+struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
+    auto * p = (llama_sampler_chain *) chain->ctx;
+
+    if (i < 0 || (size_t) i >= p->samplers.size()) {
+        return nullptr;
+    }
+
+    auto * result = p->samplers[i];
+    p->samplers.erase(p->samplers.begin() + i);
+
+    return result;
+}
+
+int llama_sampler_chain_n(const struct llama_sampler * chain) {
+    const auto * p = (const llama_sampler_chain *) chain->ctx;
+
+    return p->samplers.size();
+}
+
+//
+// samplers
+//
+
+// greedy
+
+static const char * llama_sampler_greedy_name(const struct llama_sampler * /*smpl*/) {
+    return "greedy";
+}
+
+static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
+    cur_p->selected = 0;
+    for (size_t i = 1; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) {
+            cur_p->selected = i;
+        }
+    }
+}
+
+static struct llama_sampler_i llama_sampler_greedy_i = {
+    /* .name   = */ llama_sampler_greedy_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_greedy_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ nullptr,
+    /* .free   = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_greedy() {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_greedy_i,
+        /* .ctx   = */ nullptr
+    );
+}
+
+// dist
+
+struct llama_sampler_dist {
+    const uint32_t seed;
+          uint32_t seed_cur;
+
+    std::mt19937 rng;
+};
+
+static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
+    return "dist";
+}
+
+static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_dist *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+    cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+}
+
+static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
+    auto * result = llama_sampler_init_dist(ctx->seed);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_dist *) result->ctx;
+
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_dist *) smpl->ctx;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
+static void llama_sampler_dist_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_dist *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_dist_i = {
+    /* .name   = */ llama_sampler_dist_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_dist_apply,
+    /* .reset  = */ llama_sampler_dist_reset,
+    /* .clone  = */ llama_sampler_dist_clone,
+    /* .free   = */ llama_sampler_dist_free,
+};
+
+struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_dist_i,
+        /* .ctx   = */ new llama_sampler_dist {
+            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
+            /* .rng      = */ std::mt19937(seed_cur),
+        }
+    );
+}
+
+// softmax
+
+static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
+    return "softmax";
+}
+
+static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
+    llama_sampler_softmax_impl(cur_p);
+}
+
+static struct llama_sampler_i llama_sampler_softmax_i = {
+    /* .name   = */ llama_sampler_softmax_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_softmax_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ nullptr,
+    /* .free   = */ nullptr,
+};
+
+struct llama_sampler * llama_sampler_init_softmax() {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_softmax_i,
+        /* .ctx   = */ nullptr
+    );
+}
+
+// top-k
+
+struct llama_sampler_top_k {
+    const int32_t k;
+};
+
+static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) {
+    return "top-k";
+}
+
+static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
+    llama_sampler_top_k_impl(cur_p, ctx->k);
+}
+
+static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_k *) smpl->ctx;
+    return llama_sampler_init_top_k(ctx->k);
+}
+
+static void llama_sampler_top_k_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_k *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_top_k_i = {
+    /* .name   = */ llama_sampler_top_k_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_top_k_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_top_k_clone,
+    /* .free   = */ llama_sampler_top_k_free,
+};
+
+struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_top_k_i,
+        /* .ctx   = */ new llama_sampler_top_k {
+            /* .k = */ k,
+        }
+    );
+}
+
+// top-p
+
+struct llama_sampler_top_p {
+    const float  p;
+    const size_t min_keep;
+};
+
+static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
+    return "top-p";
+}
+
+static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
+
+    if (ctx->p >= 1.0f) {
+        return;
+    }
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = cur_p->size;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cum_sum += cur_p->data[i].p;
+
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the top-p tokens
+    cur_p->size = last_idx;
+}
+
+static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_p *) smpl->ctx;
+    return llama_sampler_init_top_p(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_top_p_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_p *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_top_p_i = {
+    /* .name   = */ llama_sampler_top_p_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_top_p_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_top_p_clone,
+    /* .free   = */ llama_sampler_top_p_free,
+};
+
+struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_top_p_i,
+        /* .ctx   = */ new llama_sampler_top_p {
+            /* .p        = */ p,
+            /* .min_keep = */ min_keep,
+        }
+    );
+}
+
+// min-p
+
+struct llama_sampler_min_p {
+    const float  p;
+    const size_t min_keep;
+};
+
+static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) {
+    return "min-p";
+}
+
+static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
+
+    if (ctx->p <= 0.0f || !cur_p->size) {
+        return;
+    }
+
+    bool min_p_applied = false;
+
+    // if the cur_p aren't sorted, try the unsorted implementation first
+    if (!cur_p->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            max_logit = std::max(max_logit, cur_p->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(cur_p->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= ctx->min_keep) {
+            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            cur_p->size = filtered_tokens.size();
+            min_p_applied = true;
+        }
+    }
+
+    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!cur_p->sorted) {
+            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            cur_p->sorted = true;
+        }
+
+        const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        cur_p->size = i;
+    }
+}
+
+static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_min_p *) smpl->ctx;
+    return llama_sampler_init_min_p(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_min_p_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_min_p *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_min_p_i = {
+    /* .name   = */ llama_sampler_min_p_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_min_p_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_min_p_clone,
+    /* .free   = */ llama_sampler_min_p_free,
+};
+
+struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_min_p_i,
+        /* .ctx   = */ new llama_sampler_min_p {
+            /* .p        = */ p,
+            /* .min_keep = */ min_keep,
+        }
+    );
+}
+
+// typical
+
+struct llama_sampler_typical {
+    const float  p;
+    const size_t min_keep;
+};
+
+static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) {
+    return "typical";
+}
+
+static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_typical *) smpl->ctx;
+
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (ctx->p >= 1.0f) {
+        return;
+    }
+
+    // Compute the softmax of logits and calculate entropy
+    llama_sampler_softmax_impl(cur_p);
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(cur_p->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += cur_p->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> cur_p_new;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        cur_p_new.push_back(cur_p->data[idx]);
+    }
+
+    // Replace the data in cur_p with the cur_p_new data
+    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
+    cur_p->size = cur_p_new.size();
+    cur_p->sorted = false;
+}
+
+static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_typical *) smpl->ctx;
+    return llama_sampler_init_typical(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_typical_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_typical *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_typical_i = {
+    /* .name   = */ llama_sampler_typical_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_typical_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_typical_clone,
+    /* .free   = */ llama_sampler_typical_free,
+};
+
+struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_typical_i,
+        /* .ctx   = */ new llama_sampler_typical {
+            /* .p        = */ p,
+            /* .min_keep = */ min_keep,
+        }
+    );
+}
+
+// temp
+
+struct llama_sampler_temp {
+    const float temp;
+};
+
+static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*/) {
+    return "temp";
+}
+
+static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_temp *) smpl->ctx;
+
+    llama_sampler_temp_impl(cur_p, ctx->temp);
+}
+
+static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_temp *) smpl->ctx;
+    return llama_sampler_init_temp(ctx->temp);
+}
+
+static void llama_sampler_temp_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_temp *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_temp_i = {
+    /* .name   = */ llama_sampler_temp_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_temp_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_temp_clone,
+    /* .free   = */ llama_sampler_temp_free,
+};
+
+struct llama_sampler * llama_sampler_init_temp(float temp) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_temp_i,
+        /* .ctx   = */ new llama_sampler_temp {
+            /*.temp = */ temp,
+        }
+    );
+}
+
+// temp-ext
+
+struct llama_sampler_temp_ext {
+    const float temp;
+    const float delta;
+    const float exponent;
+};
+
+static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
+    return "temp-ext";
+}
+
+static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
+    if (ctx->delta > 0) {
+        const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
+        const float max_temp = ctx->temp + ctx->delta;
+
+        float exponent_val = ctx->exponent;
+
+        // no need to do anything if there is only one (or zero) candidates
+        if (cur_p->size <= 1) {
+            return;
+        }
+
+        // Calculate maximum possible entropy
+        float max_entropy = -logf(1.0f / cur_p->size);
+
+        llama_sampler_softmax_impl(cur_p);
+
+        // Calculate entropy of the softmax probabilities
+        float entropy = 0.0f;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            float prob = cur_p->data[i].p;
+            if (prob > 0.0f) { // Ensure no log(0)
+                entropy -= prob * logf(prob);
+            }
+        }
+
+        // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
+        float normalized_entropy = entropy / max_entropy;
+
+        // Map the normalized entropy to the desired temperature range using the power function
+        float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+    #ifdef DEBUG
+        LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+        LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+        LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+        LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+        LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+        LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+    #endif
+
+        // Apply the dynamically calculated temperature scaling
+        llama_sampler_temp_impl(cur_p, dyn_temp);
+
+        // Re-compute softmax probabilities after scaling logits with dynamic temperature
+        const double max_l_double = cur_p->data[0].logit;
+
+        double cum_sum_double = 0.0;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            double p = exp(cur_p->data[i].logit - max_l_double);
+            cur_p->data[i].p = p; // Store the scaled probability
+            cum_sum_double += p;
+        }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+        }
+
+    #ifdef DEBUG
+        // Print the updated top 25 probabilities after temperature scaling
+        LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+        for (size_t i = 0; i < 25 && i < cur_p->size; ++i) {
+            LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f);
+        }
+    #endif
+    } else {
+        llama_sampler_temp_impl(cur_p, ctx->temp);
+    }
+}
+
+static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx;
+    return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent);
+}
+
+static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_temp_ext *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_temp_ext_i = {
+    /* .name   = */ llama_sampler_temp_ext_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_temp_ext_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_temp_ext_clone,
+    /* .free   = */ llama_sampler_temp_ext_free,
+};
+
+struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_temp_ext_i,
+        /* .ctx   = */ new llama_sampler_temp_ext {
+            /* .temp     = */ temp,
+            /* .delta    = */ delta,
+            /* .exponent = */ exponent,
+        }
+    );
+}
+
+// xtc
+
+struct llama_sampler_xtc {
+    const float    probability;
+    const float    threshold;
+    const size_t   min_keep;
+
+    const uint32_t seed;
+    uint32_t       seed_cur;
+
+    std::mt19937   rng;
+};
+
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
+    return "xtc";
+}
+
+static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+
+    if (ctx->probability <= 0.0f
+        || ctx->threshold > 0.5f
+        || cur_p->size < 2) {
+        return;
+    }
+
+    std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+    float chance = distribution(ctx->rng);
+    if (chance > ctx->probability) return;
+
+    // in case it's not sorted/recalculated yet
+    llama_sampler_softmax_impl(cur_p);
+
+    int pos_last = 0;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].p >= ctx->threshold) {
+            pos_last = i;
+        } else break;
+    }
+
+    if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
+        cur_p->data += pos_last;
+        cur_p->size -= pos_last;
+    }
+}
+
+static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
+    auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_xtc *) result->ctx;
+
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_xtc *) smpl->ctx;
+}
+
+static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler_i llama_sampler_xtc_i = {
+    /* .name   = */ llama_sampler_xtc_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sample_xtc_apply,
+    /* .reset  = */ llama_sampler_xtc_reset,
+    /* .clone  = */ llama_sampler_xtc_clone,
+    /* .free   = */ llama_sampler_xtc_free,
+};
+
+struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_xtc_i,
+        /* .ctx   = */ new llama_sampler_xtc {
+            /* .probability   = */ p,
+            /* .threshold     = */ t,
+            /* .min_keep      = */ min_keep,
+            /* .seed          = */ seed,
+            /* .seed_cur      = */ seed_cur,
+            /* .rng           = */ std::mt19937(seed_cur),
+        }
+    );
+}
+
+// mirostat
+
+struct llama_sampler_mirostat {
+    const int32_t n_vocab;
+
+    const uint32_t seed;
+          uint32_t seed_cur;
+
+    const float tau;
+    const float eta;
+
+    const int32_t m;
+
+    float mu;
+
+    std::mt19937 rng;
+};
+
+static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
+    return "mirostat";
+}
+
+static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Estimate s_hat using the most probable m tokens
+    float s_hat = 0.0;
+    float sum_ti_bi = 0.0;
+    float sum_ti_sq = 0.0;
+    for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) {
+        float t_i = logf(float(i + 2) / float(i + 1));
+        float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p);
+        sum_ti_bi += t_i * b_i;
+        sum_ti_sq += t_i * t_i;
+    }
+    s_hat = sum_ti_bi / sum_ti_sq;
+
+    // Compute k from the estimated s_hat and target surprise value
+    float epsilon_hat = s_hat - 1;
+    float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
+
+    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
+    llama_sampler_softmax_impl(cur_p);
+
+    const int idx = llama_sample_dist(cur_p, ctx->rng);
+
+    cur_p->selected = idx;
+
+    float observed_surprise = -log2f(cur_p->data[idx].p);
+    float e = observed_surprise - ctx->tau;
+
+    // Update mu using the learning rate and error
+    ctx->mu = ctx->mu - ctx->eta * e;
+}
+
+static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx;
+    auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx;
+
+        result_ctx->mu  = ctx->mu;
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
+    ctx->mu = 2.0f*ctx->tau;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
+static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_mirostat *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_mirostat_i = {
+    /* .name   = */ llama_sampler_mirostat_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_mirostat_apply,
+    /* .reset  = */ llama_sampler_mirostat_reset,
+    /* .clone  = */ llama_sampler_mirostat_clone,
+    /* .free   = */ llama_sampler_mirostat_free,
+};
+
+struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
+    auto seed_cur = get_rng_seed(seed);
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_mirostat_i,
+        /* .ctx   = */ new llama_sampler_mirostat {
+            /* .n_vocab  = */ n_vocab,
+            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
+            /* .tau      = */ tau,
+            /* .eta      = */ eta,
+            /* .m        = */ m,
+            /* .mu       = */ 2.0f*tau,
+            /* .rng      = */ std::mt19937(seed_cur),
+        }
+    );
+}
+
+// mirostat v2
+
+struct llama_sampler_mirostat_v2 {
+    const uint32_t seed;
+          uint32_t seed_cur;
+
+    const float tau;
+    const float eta;
+
+    float mu;
+
+    std::mt19937 rng;
+};
+
+static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
+    return "mirostat-v2";
+}
+
+static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Truncate the words with surprise values greater than mu
+    cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
+        return -log2f(candidate.p) > ctx->mu;
+    }));
+
+    if (cur_p->size == 0) {
+        cur_p->size = 1;
+    }
+
+    // Normalize the probabilities of the remaining words
+    llama_sampler_softmax_impl(cur_p);
+
+    const int idx = llama_sample_dist(cur_p, ctx->rng);
+
+    cur_p->selected = idx;
+
+    float observed_surprise = -log2f(cur_p->data[idx].p);
+    float e = observed_surprise - ctx->tau;
+
+    // Update mu using the learning rate and error
+    ctx->mu = ctx->mu - ctx->eta * e;
+}
+
+static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
+    ctx->mu = 2.0f*ctx->tau;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx;
+
+    auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx;
+
+        result_ctx->mu  = ctx->mu;
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_mirostat_v2 *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
+    /* .name   = */ llama_sampler_mirostat_v2_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_mirostat_v2_apply,
+    /* .reset  = */ llama_sampler_mirostat_v2_reset,
+    /* .clone  = */ llama_sampler_mirostat_v2_clone,
+    /* .free   = */ llama_sampler_mirostat_v2_free,
+};
+
+struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
+    auto seed_cur = get_rng_seed(seed);
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_mirostat_v2_i,
+        /* .ctx   = */ new llama_sampler_mirostat_v2 {
+            /* .seed     = */ seed,
+            /* .seed_cur = */ seed_cur,
+            /* .tau      = */ tau,
+            /* .eta      = */ eta,
+            /* .mu       = */ 2.0f*tau,
+            /* .rng      = */ std::mt19937(seed_cur),
+        }
+    );
+}
+
+// grammar
+
+struct llama_sampler_grammar {
+    const struct llama_vocab * vocab;
+
+    std::string grammar_str;
+    std::string grammar_root;
+
+    struct llama_grammar * grammar;
+};
+
+static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) {
+    return "grammar";
+}
+
+static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+    if (ctx->grammar) {
+        llama_grammar_accept_impl(*ctx->grammar, token);
+    }
+}
+
+static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+    if (ctx->grammar) {
+        llama_grammar_apply_impl(*ctx->grammar, cur_p);
+    }
+}
+
+// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                              bool lazy,
+                     const char ** trigger_words,
+                            size_t num_trigger_words,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens);
+
+static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+    if (!ctx->grammar) {
+        return;
+    }
+
+    std::vector<const char *>  trigger_words;
+    for (auto & word : ctx->grammar->trigger_words) {
+        trigger_words.push_back(word.c_str());
+    }
+    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
+                                                 ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+                                                 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
+
+    llama_grammar_free_impl(ctx->grammar);
+    ctx->grammar = grammar_new;
+}
+
+static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
+
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_grammar *) result->ctx;
+
+        if (ctx->grammar) {
+            result_ctx->grammar_str  = ctx->grammar_str;
+            result_ctx->grammar_root = ctx->grammar_root;
+
+            result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar);
+        }
+    }
+
+    return result;
+}
+
+static void llama_sampler_grammar_free(struct llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+
+    if (ctx->grammar) {
+        llama_grammar_free_impl(ctx->grammar);
+    }
+
+    delete ctx;
+}
+
+static struct llama_sampler_i llama_sampler_grammar_i = {
+    /* .name   = */ llama_sampler_grammar_name,
+    /* .accept = */ llama_sampler_grammar_accept_impl,
+    /* .apply  = */ llama_sampler_grammar_apply,
+    /* .reset  = */ llama_sampler_grammar_reset,
+    /* .clone  = */ llama_sampler_grammar_clone,
+    /* .free   = */ llama_sampler_grammar_free,
+};
+
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                              bool lazy,
+                     const char ** trigger_words,
+                            size_t num_trigger_words,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens) {
+    auto * ctx = new llama_sampler_grammar;
+
+    if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        *ctx = {
+            /* .vocab        = */ vocab,
+            /* .grammar_str  = */ grammar_str,
+            /* .grammar_root = */ grammar_root,
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+        };
+    } else {
+        *ctx = {
+            /* .vocab        = */ vocab,
+            /* .grammar_str  = */ {},
+            /* .grammar_root = */ {},
+            /* .grammar      = */ nullptr,
+        };
+    }
+
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_grammar_i,
+        /* .ctx   = */ ctx
+    );
+}
+
+struct llama_sampler * llama_sampler_init_grammar(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                     const char ** trigger_words,
+                            size_t num_trigger_words,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+}
+
+// penalties
+
+struct llama_sampler_penalties {
+    const int32_t penalty_last_n;
+    const float   penalty_repeat;
+    const float   penalty_freq;
+    const float   penalty_present;
+
+    ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
+};
+
+static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
+    return "penalties";
+}
+
+static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    if (ctx->penalty_last_n == 0) {
+        return;
+    }
+
+    ctx->token_count[token]++;
+
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
+
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
+        }
+    }
+
+    ctx->prev.push_back(token);
+
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
+    }
+
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
+    }
+
+    // Apply frequency and presence penalties to the cur_p
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
+            continue;
+        }
+
+        const int count = token_iter->second;
+
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (cur_p->data[i].logit <= 0) {
+            cur_p->data[i].logit *= ctx->penalty_repeat;
+        } else {
+            cur_p->data[i].logit /= ctx->penalty_repeat;
+        }
+
+        cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present;
+    }
+
+    cur_p->sorted = false;
+}
+
+static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->prev.clear();
+    ctx->token_count.clear();
+}
+
+static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
+    auto * result = llama_sampler_init_penalties(
+            ctx->penalty_last_n,
+            ctx->penalty_repeat,
+            ctx->penalty_freq,
+            ctx->penalty_present);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_penalties *) result->ctx;
+
+        result_ctx->prev = ctx->prev;
+    }
+
+    return result;
+}
+
+static void llama_sampler_penalties_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_penalties *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_penalties_i = {
+    /* .name   = */ llama_sampler_penalties_name,
+    /* .accept = */ llama_sampler_penalties_accept,
+    /* .apply  = */ llama_sampler_penalties_apply,
+    /* .reset  = */ llama_sampler_penalties_reset,
+    /* .clone  = */ llama_sampler_penalties_clone,
+    /* .free   = */ llama_sampler_penalties_free,
+};
+
+struct llama_sampler * llama_sampler_init_penalties(
+        int32_t penalty_last_n,
+        float penalty_repeat,
+        float penalty_freq,
+        float penalty_present) {
+    penalty_last_n = std::max(penalty_last_n, 0);
+
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_penalties_i,
+        /* .ctx   = */ new llama_sampler_penalties {
+            /* .penalty_last_n  = */ penalty_last_n,
+            /* .penalty_repeat  = */ penalty_repeat,
+            /* .penalty_freq    = */ penalty_freq,
+            /* .penalty_present = */ penalty_present,
+            /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
+        }
+    );
+}
+
+// DRY
+
+struct llama_sampler_dry {
+    int32_t total_context_size;
+
+    const float   dry_multiplier;
+    const float   dry_base;
+    const int32_t dry_allowed_length;
+    const int32_t dry_penalty_last_n;
+
+    std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
+    std::vector<int> dry_repeat_count;
+    std::unordered_map<llama_token, int> dry_max_token_repeat;
+    ring_buffer<llama_token> last_tokens;
+};
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
+    for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
+        std::string word = vocab.detokenize({token_id}, true);
+        if (word.find(str) != std::string::npos) {
+            token_sequences.emplace(token_id, std::vector<llama_token>());
+        } else {
+            size_t word_len = word.size();
+            size_t str_len = str.size();
+            size_t pos = -1;
+            while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
+                bool match = true;
+                size_t i;
+                for (i = 1; i < str_len && i + pos < word_len; ++i) {
+                    if (word[pos + i] != str[i]) {
+                        match = false;
+                        break;
+                    }
+                }
+                if (match) {
+                    std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
+                    if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
+                        tokenization.resize(max_tail_len);
+                    }
+
+                    // Ensure we don't already have a duplicate matching tokenization
+                    auto its = token_sequences.equal_range(token_id);
+                    bool found = false;
+                    for (auto it = its.first; it != its.second; ++it) {
+                        if (tokenization == it->second) {
+                            found = true;
+                            break;
+                        }
+                    }
+                    if (!found) {
+                        token_sequences.emplace(token_id, tokenization);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
+    return "dry";
+}
+
+static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_dry *) smpl->ctx;
+    if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+        return;
+    }
+
+    ctx->last_tokens.push_back(token);
+}
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+    if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+        return;
+    }
+
+    int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
+    int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
+
+    if (last_n_repeat <= ctx->dry_allowed_length) {
+        return;
+    }
+
+    ctx->dry_repeat_count.assign(last_n_repeat, 0);
+    ctx->dry_max_token_repeat.clear();
+
+    // Step 1: Look for restart sequences to limit the maximum repetition length.
+    // Work backwards through the context looking for any token that begins a restart sequence.
+    //
+    // The collection `restart_sequences` is a mapping from a "head" token to all "tail"
+    // sequences that together comprise a restart sequence. This allows us to quickly check
+    // whether each token is the head of a complete sequence. Most restart sequences are actually
+    // a single token, and for these the "tail" is an empty vector.
+    //
+    // If the token is a "head", test all restart sequences that begin with this token
+    // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
+    // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
+    // longest matching sequence (if any) is used to limit the maximum repetition length.
+    //
+    // Note that in the case case of a short sequence contained in a longer one, this might fail to
+    // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
+    // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
+    // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
+    //
+    // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
+    // have already clamped the maximum tail sequence length when generating `restart_sequences`.
+    // With clamping, this scan is O(N) in the context length.
+
+    int rep_limit = last_n_repeat;
+    for (int i = 0; i < last_n_repeat; ++i) {
+        llama_token token = ctx->last_tokens.rat(i);
+        auto its = ctx->dry_processed_breakers.equal_range(token);
+        if (its.first == ctx->dry_processed_breakers.end()) {
+            continue;
+        }
+        int longest_match = -1;
+        for (auto it = its.first; it != its.second; ++it) {
+            // Note that (*it) does not contain the head character, so seq_len will be
+            // the restart sequence length minus 1.
+            // In the common case of a single-token restart sequence, (*it) will be empty
+            // and we will trivially match.
+            int seq_len = (int)it->second.size();
+            if (seq_len > longest_match && seq_len <= (int)i) {
+                bool match = true;
+                for (int offset = 0; offset < seq_len; ++offset) {
+                    // The -1 when indexing `last_tokens` is because we already matched the head.
+                    if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
+                        match = false;
+                        break;
+                    }
+                }
+                if (match) {
+                    longest_match = seq_len;
+                }
+            }
+        }
+        if (longest_match >= 0) {
+            // We found a restart sequence starting `i` tokens from the end and continuing for
+            // `longest_match` tokens.
+            rep_limit = i - longest_match;
+            break;
+        }
+    }
+    if (rep_limit < ctx->dry_allowed_length) {
+        return;
+    }
+
+    // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
+    // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
+    // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
+    //
+    // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
+    // https://ivanyu.me/blog/2014/10/15/z-algorithm/
+    //
+    // The code below is adapted from the public domain implementation by the same author here:
+    // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
+    //
+    // Example:
+    // Last N tokens: a b c c b c y a b c
+    // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+    //                    ^
+    //   This `3` means that the last three tokens of the context (a b c) also appear here.
+    //
+    // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
+    // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
+    // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
+    // ensure that the inner while loops only examine each token in the context once as the outer
+    // for loop iterates over the context.
+
+    {
+        const int last = last_n_repeat - 1;
+        int rt = 0, lt = 0;
+
+        for (int k = 1; k < last_n_repeat; ++k) {
+            if (k > rt) {
+                // If k is outside the current Z-box, do naive computation.
+                int n = 0;
+                while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
+                    ++n;
+                }
+                ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
+                if (n > 0) {
+                    lt = k;
+                    rt = k + n - 1;
+                }
+            } else {
+                // If k is inside the current Z-box, consider two cases.
+
+                int p = k - lt; // Pair index.
+                int right_part_len = rt - k + 1;
+
+                if (ctx->dry_repeat_count[last - p] < right_part_len) {
+                    int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
+                    ctx->dry_repeat_count[last - k] = n;
+                } else {
+                    int i = rt + 1;
+                    while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
+                        i += 1;
+                    }
+
+                    int n = std::min(i - k, rep_limit);
+                    ctx->dry_repeat_count[last - k] = n;
+                    lt = k;
+                    rt = i - 1;
+                }
+            }
+        }
+    }
+
+    // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
+    // that would be generated by emitting each new token that would extend a sequence.
+    //
+    // Following the same example as above:
+    // Last N tokens: a b c c b c y a b c
+    // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+    //
+    // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
+    // c: 3 -> 4 (from `a b c` to `a b c c`)
+    // b: 1 -> 2 (from `c` to `c b`)
+    // y: 2 -> 3 (from `b c` to `b c y`)
+
+    for (int i = 0; i < last_n_repeat - 1; ++i) {
+        int repeat_len = ctx->dry_repeat_count[i];
+        if (repeat_len >= ctx->dry_allowed_length) {
+            // This token ends a repeat, so the next token would continue one.
+            // By convention, the value of `repeat_len` only includes the tokens currently
+            // in the context, not the new token that would be added.
+            llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
+            // Track the maximum sequence ending in this token.
+            const auto& it = ctx->dry_max_token_repeat.find(token);
+            if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
+                ctx->dry_max_token_repeat[token] = repeat_len;
+            }
+        }
+    }
+
+    // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
+
+    // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
+    // Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
+    const float FLOAT_MAX_LOG = 88.7228391f;
+    int max_exponent = 0;
+    if (ctx->dry_base > 1.000001f) {
+        max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
+        if (af_kvp != ctx->dry_max_token_repeat.end()) {
+            // Check all sequence breakers starting with this token
+            auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
+            bool is_single_token_breaker = false;
+
+            for (auto it = range.first; it != range.second; ++it) {
+                if (it->second.empty()) {
+                    is_single_token_breaker = true;
+                    break;
+                }
+            }
+
+            // Apply penalty only if it's not a single-token sequence breaker
+            if (!is_single_token_breaker) {
+                int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
+                if (max_exponent > 0 && repeat_exp > max_exponent) {
+                    repeat_exp = max_exponent;
+                }
+                float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
+                cur_p->data[i].logit -= penalty;
+            }
+        }
+    }
+
+    cur_p->sorted = false;
+}
+
+static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_dry *) smpl->ctx;
+    ctx->last_tokens.clear();
+    ctx->dry_repeat_count.clear();
+    ctx->dry_max_token_repeat.clear();
+}
+
+static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+    llama_vocab dummy_vocab;
+
+    // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
+    auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+
+    // Copy the state, including the processed breakers
+    {
+        auto * result_ctx = (llama_sampler_dry *) result->ctx;
+        result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
+        result_ctx->dry_repeat_count = ctx->dry_repeat_count;
+        result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
+        result_ctx->last_tokens = ctx->last_tokens;
+    }
+
+    return result;
+}
+
+static void llama_sampler_dry_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_dry *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_dry_i = {
+    /* .name   = */ llama_sampler_dry_name,
+    /* .accept = */ llama_sampler_dry_accept,
+    /* .apply  = */ llama_sampler_dry_apply,
+    /* .reset  = */ llama_sampler_dry_reset,
+    /* .clone  = */ llama_sampler_dry_clone,
+    /* .free   = */ llama_sampler_dry_free,
+};
+
+struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
+    std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
+    const int MAX_CHAR_LEN = 40;
+    const int MAX_SEQ_LEN = 20;
+
+    const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
+
+    if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
+        // Process sequence breakers
+        for (size_t i = 0; i < num_breakers; ++i) {
+            if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
+                LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
+                continue;
+            }
+
+            std::string sequence_break(seq_breakers[i]);
+            if (sequence_break.empty()) {
+                LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n");
+                continue;
+            }
+
+            if (sequence_break.size() > MAX_CHAR_LEN) {
+                LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
+                sequence_break.resize(MAX_CHAR_LEN);
+            }
+
+            get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
+        }
+    }
+
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_dry_i,
+        /* .ctx   = */ new llama_sampler_dry {
+            /* .total_context_size     = */ context_size,
+            /* .dry_multiplier         = */ dry_multiplier,
+            /* .dry_base               = */ dry_base,
+            /* .dry_allowed_length     = */ dry_allowed_length,
+            /* .dry_penalty_last_n     = */ dry_penalty_last_n,
+            /* .dry_processed_breakers = */ std::move(processed_breakers),
+            /* .dry_repeat_count       = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
+            /* .dry_max_token_repeat   = */ {},
+            /* .last_tokens            = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
+        }
+    );
+}
+
+// wrapper for test-sampling.cpp
+struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
+    llama_vocab dummy_vocab;
+    auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
+    auto * ctx = (llama_sampler_dry *) result->ctx;
+
+    // Process the token-based sequence breakers
+    ctx->dry_processed_breakers.clear();
+    if (seq_breakers.empty()) {
+        LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
+    } else {
+        for (const auto& breaker : seq_breakers) {
+            if (breaker.empty()) {
+                LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
+                continue;
+            }
+            llama_token head_token = breaker[0];
+            std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
+            ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
+        }
+
+        if (ctx->dry_processed_breakers.empty()) {
+            LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
+        }
+    }
+
+    return result;
+}
+
+// logit-bias
+
+struct llama_sampler_logit_bias {
+    const int32_t n_vocab;
+
+    const std::vector<llama_logit_bias> logit_bias;
+
+    std::vector<llama_logit_bias> to_search;
+};
+
+static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /*smpl*/) {
+    return "logit-bias";
+}
+
+static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
+
+    if (ctx->logit_bias.empty()) {
+        return;
+    }
+
+    ctx->to_search.clear();
+
+    // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
+    for (const auto & lb : ctx->logit_bias) {
+        if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) {
+            cur_p->data[lb.token].logit += lb.bias;
+        } else {
+            ctx->to_search.push_back(lb);
+        }
+    }
+
+    if (ctx->to_search.empty()) {
+        return;
+    }
+
+    // search for the remaining candidates that were not found in the previous step
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        for (const auto & lb : ctx->to_search) {
+            if (cur_p->data[i].id == lb.token) {
+                cur_p->data[i].logit += lb.bias;
+                break;
+            }
+        }
+    }
+}
+
+static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
+    return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
+}
+
+static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_logit_bias *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_logit_bias_i = {
+    /* .name   = */ llama_sampler_logit_bias_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_logit_bias_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_logit_bias_clone,
+    /* .free   = */ llama_sampler_logit_bias_free,
+};
+
+struct llama_sampler * llama_sampler_init_logit_bias(
+                         int32_t   n_vocab,
+                         int32_t   n_logit_bias,
+          const llama_logit_bias * logit_bias) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_logit_bias_i,
+        /* .ctx   = */ new llama_sampler_logit_bias {
+            /* .n_vocab    = */ n_vocab,
+            /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
+            /* .to_search  = */ {},
+        }
+    );
+}
+
+// infill
+
+//#define GGML_DEBUG_SAMPLER_INFILL
+
+struct llama_sampler_infill {
+    const struct llama_vocab * vocab;
+
+    std::vector<char> buf0;
+    std::vector<char> buf1;
+};
+
+static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
+    return "infill";
+}
+
+static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_infill *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+#if defined(GGML_DEBUG_SAMPLER_INFILL)
+#define LOG_DBG_CUR LLAMA_LOG_DEBUG
+#else
+#define LOG_DBG_CUR(...)
+#endif
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+    float p_txt_sum = 0.0f;
+    float p_eog_sum = 0.0f;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (ctx->vocab->is_eog(cur_p->data[i].id)) {
+            p_eog_sum += cur_p->data[i].p;
+        } else {
+            p_txt_sum += cur_p->data[i].p;
+        }
+    }
+
+    const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
+
+    LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
+
+    if (3*p_eog_sum*cur_p->size > p_txt_sum) {
+        LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
+
+        // keep just the EOG tokens
+        const auto size_org = cur_p->size;
+
+        cur_p->size = 0;
+
+        float p_sum = 0.0f;
+
+        for (size_t i = 0; i < size_org; ++i) {
+            if (ctx->vocab->is_eog(cur_p->data[i].id)) {
+                p_sum += cur_p->data[i].p;
+
+                cur_p->data[cur_p->size++] = cur_p->data[i];
+            }
+        }
+
+        // normalize probs
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].p /= p_sum;
+        }
+
+        return;
+    }
+
+    size_t n_combined = 0; GGML_UNUSED(n_combined);
+
+    // combine tokens with common prefix
+    for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
+        for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
+            if (cur_p->data[i0].logit == -INFINITY) {
+                break;
+            }
+
+            if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
+                continue;
+            }
+
+            int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            if (len0 < 0) {
+                ctx->buf0.resize(len0);
+                len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+                assert(len0 > 0);
+            }
+
+            int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            if (len1 < 0) {
+                ctx->buf1.resize(len1);
+                len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+                assert(len1 > 0);
+            }
+
+            // token i0 is a prefix of token i1
+            if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
+                int dst = i0;
+                int src = i1;
+
+                // merge into the token with higher probability
+                if (cur_p->data[i1].p > cur_p->data[i0].p) {
+                    std::swap(dst, src);
+                }
+
+                cur_p->data[dst].p += cur_p->data[src].p;
+                cur_p->data[src].logit = -INFINITY;
+                cur_p->data[src].p     = 0.0f;
+
+                n_combined++;
+            }
+        }
+    }
+
+    size_t n_non_eog = 0;
+
+    size_t size_org = cur_p->size;
+
+    float p_sum = 0.0f;
+    float thold = 0.2f;
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        if (!is_eog) {
+            ++n_non_eog;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        // keep this token
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
+
+    // if no non-EOG tokens are left -> reduce cur_p to single EOT token
+    if (n_non_eog == 0) {
+        cur_p->size = 1;
+        cur_p->data[0].id = ctx->vocab->token_eot();
+        cur_p->data[0].logit = 1.0f;
+
+        return;
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+    size_org = cur_p->size;
+    p_sum = 0.0f;
+    thold = 1.0/(n_non_eog + 1);
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+#undef LOG_DBG_CUR
+}
+
+static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
+    return llama_sampler_init_infill(ctx->vocab);
+}
+
+static void llama_sampler_infill_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_infill *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_infill_i = {
+    /* .name   = */ llama_sampler_infill_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_infill_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_infill_clone,
+    /* .free   = */ llama_sampler_infill_free,
+};
+
+struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_infill_i,
+        /* .ctx   = */ new llama_sampler_infill {
+            /* .vocab = */ vocab,
+            /* .buf0  = */ std::vector<char>(512),
+            /* .buf1  = */ std::vector<char>(512),
+        }
+    );
+}
+
+// utils
+
+uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
+    if (smpl->iface == &llama_sampler_dist_i) {
+        return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_mirostat_i) {
+        return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_mirostat_v2_i) {
+        return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
+    }
+
+    if (smpl->iface == &llama_sampler_chain_i) {
+        const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
+        for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
+            const uint32_t seed = llama_sampler_get_seed(*it);
+            if (seed != LLAMA_DEFAULT_SEED) {
+                return seed;
+            }
+        }
+    }
+
+    return LLAMA_DEFAULT_SEED;
+}
+
+// perf
+
+struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_sampler_data data = {};
+
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+    }
+
+    const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
+
+    data.t_sample_ms = 1e-3 * ctx->t_sample_us;
+    data.n_sample    = std::max(0, ctx->n_sample);
+
+    return data;
+}
+
+void llama_perf_sampler_print(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+
+    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
+
+void llama_perf_sampler_reset(struct llama_sampler * chain) {
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+    }
+
+    auto * ctx = (struct llama_sampler_chain *) chain->ctx;
+
+    ctx->t_sample_us = ctx->n_sample = 0;
+}
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
new file mode 100644
index 000000000..759dd7dcb
--- /dev/null
+++ b/src/llama-sampling.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
+
+#include "llama.h"
+
+#include <vector>
+
+struct llama_vocab;
+struct llama_grammar;
+
+// sampler chain
+
+struct llama_sampler_chain {
+    llama_sampler_chain_params params;
+
+    std::vector<struct llama_sampler *> samplers;
+
+    // timing
+
+    mutable int64_t t_sample_us;
+
+    mutable int32_t n_sample;
+};
+
+struct llama_sampler * llama_sampler_init_dry_testing(
+                         int32_t   context_size,
+                           float   dry_multiplier,
+                           float   dry_base,
+                         int32_t   dry_allowed_length,
+                         int32_t   dry_penalty_last_n,
+  const std::vector<std::vector<llama_token>>& seq_breakers);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
new file mode 100644
index 000000000..ad9ffe66a
--- /dev/null
+++ b/src/llama-vocab.cpp
@@ -0,0 +1,3252 @@
+#include "llama-vocab.h"
+
+#include "llama-impl.h"
+#include "llama-model-loader.h"
+
+#include "unicode.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cstdarg>
+#include <cstring>
+#include <forward_list>
+#include <map>
+#include <queue>
+#include <set>
+#include <unordered_map>
+
+//
+// helpers
+//
+
+struct naive_trie {
+    naive_trie() : has_value(false), value(0) {
+    }
+    void insert(const char * key, size_t len, int32_t value = 0) {
+        if (len == 0) {
+            this->has_value = true;
+            this->value = value;
+            return;
+        }
+        char c = key[0];
+        auto res = children.find(c);
+        if (res != children.end()) {
+            res->second.insert(key + 1, len - 1, value);
+        } else {
+            auto res = children.insert(std::make_pair(c, naive_trie()));
+            res.first->second.insert(key + 1, len - 1, value);
+        }
+    }
+    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
+        if (len == 0 || offset == len) {
+            return std::make_pair(key, offset);
+        }
+        char c = key[offset];
+        auto res = children.find(c);
+        if (res != children.end()) {
+            return res->second.get_longest_prefix(key, len, offset + 1);
+        }
+
+        return std::make_pair(key, offset);
+    }
+    const struct naive_trie * traverse(const char c) const {
+        auto res = children.find(c);
+        if (res != children.end()) {
+            return &res->second;
+        }
+
+        return NULL;
+    }
+    std::map<char, struct naive_trie> children;
+    bool has_value;
+    llama_token value;
+};
+
+//
+// tokenizers
+//
+
+struct llm_tokenizer {
+    llm_tokenizer() {}
+    virtual ~llm_tokenizer() = default;
+};
+
+struct llm_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+};
+
+static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
+
+//
+// SPM tokenizer
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+//
+
+struct llm_bigram_spm {
+    struct comparator {
+        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llm_bigram_spm>;
+    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    float score;
+    size_t size;
+};
+
+struct llm_tokenizer_spm : llm_tokenizer {
+    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
+};
+
+struct llm_tokenizer_spm_session {
+    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        size_t offs = 0;
+        while (offs < text.size()) {
+            llm_symbol sym;
+            size_t len = unicode_len_utf8(text[offs]);
+            sym.text = text.c_str() + offs;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
+            sym.prev = index - 1;
+            sym.next = offs == text.size() ? -1 : index + 1;
+            index++;
+            symbols.emplace_back(sym);
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (int i = 1; i < (int) symbols.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue.empty()) {
+            auto bigram = work_queue.top();
+            work_queue.pop();
+
+            auto & left_sym = symbols[bigram.left];
+            auto & right_sym = symbols[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
+            if (left_sym.n == 0 || right_sym.n == 0 ||
+                left_sym.n + right_sym.n != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.n += right_sym.n;
+            right_sym.n = 0;
+
+            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        for (int i = 0; i != -1; i = symbols[i].next) {
+            auto & symbol = symbols[i];
+            resegment(symbol, output);
+        }
+    }
+
+private:
+    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
+        auto text = std::string(symbol.text, symbol.n);
+        auto token = vocab.text_to_token(text);
+
+        // Do we need to support is_unused?
+        if (token != LLAMA_TOKEN_NULL) {
+            output.push_back(token);
+            return;
+        }
+
+        const auto p = rev_merge.find(text);
+
+        if (p == rev_merge.end()) {
+            // output any symbols that did not form tokens as bytes.
+            output.reserve(output.size() + symbol.n);
+            for (int j = 0; j < (int)symbol.n; ++j) {
+                llama_token id = vocab.byte_to_token(symbol.text[j]);
+                output.push_back(id);
+            }
+            return;
+        }
+
+        resegment(symbols[p->second.first], output);
+        resegment(symbols[p->second.second], output);
+    }
+
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
+        auto token = vocab.text_to_token(text);
+
+        if (token == LLAMA_TOKEN_NULL) {
+            return;
+        }
+
+        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
+            return;
+        }
+
+        const auto & tok_data = vocab.get_token_data(token);
+
+        llm_bigram_spm bigram;
+        bigram.left  = left;
+        bigram.right = right;
+        bigram.score = tok_data.score;
+        bigram.size  = text.size();
+
+        work_queue.push(bigram);
+
+        // Do we need to support is_unused?
+        rev_merge[text] = std::make_pair(left, right);
+    }
+
+    const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_spm * spm_tokenizer;
+
+    std::vector<llm_symbol> symbols;
+    llm_bigram_spm::queue work_queue;
+    std::map<std::string, std::pair<int, int>> rev_merge;
+};
+
+//
+// BPE tokenizer
+// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// tried to simplify unicode stuff, so most likely does not work 100% correctly!
+//
+
+// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
+
+template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
+class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
+public:
+    using std::priority_queue<T, Container, Compare>::priority_queue;
+
+    T pop_move() {
+        T item = std::move(this->c.front());
+        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
+        this->c.pop_back();
+        return item;
+    }
+
+    void pop() =  delete;
+};
+
+struct llm_bigram_bpe {
+    struct comparator {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
+            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+        }
+    };
+
+    using queue_storage = std::vector<llm_bigram_bpe>;
+    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    std::string text;
+    int rank;
+    size_t size;
+};
+
+struct llm_tokenizer_bpe : llm_tokenizer {
+    llm_tokenizer_bpe(const llama_vocab & vocab) {
+        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.get_pre_type()) {
+            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                regex_exprs = {
+                    // same as llama3
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+                    "\\s+$",
+                    "[一-龥ࠀ-一가-퟿]+",
+                    "\\p{N}+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+                regex_exprs = {
+                    "\\p{N}{1,3}",
+                    "[一-龥぀-ゟ゠-ヿ]+",
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "[一-龥ࠀ-一가-퟿]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
+                regex_exprs = {
+                    "\\p{N}",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
+            case LLAMA_VOCAB_PRE_TYPE_JAIS:
+                regex_exprs = {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
+                regex_exprs = {
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
+                // Note: in theory, the special token (sentinel and image token) regex_exprs below
+                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
+                // However, since the upstream pre-tokenizer uses them, they are also
+                // included here (see https://huggingface.co/facebook/chameleon-7b).
+                regex_exprs = {
+                    "<sentinel:[0-9]+>",  // Sentinel tokens
+                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
+                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
+                    "\\p{N}", // Individual digits
+                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            default:
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+        }
+    }
+
+    std::vector<std::string> regex_exprs;
+};
+
+struct llm_tokenizer_bpe_session {
+    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+
+    static void append(const llama_token token_id, std::vector<llama_token> & output)  {
+        output.push_back(token_id);
+    }
+
+    bool append_bos(std::vector<llama_token> & output) const {
+        if (vocab.get_add_bos()) {
+            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
+            output.push_back(vocab.token_bos());
+            return true;
+        }
+        return false;
+    }
+
+    bool append_eos(std::vector<llama_token> & output) const {
+        if (vocab.get_add_eos()) {
+            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
+            output.push_back(vocab.token_eos());
+            return true;
+        }
+        return false;
+    }
+
+    void check_double_bos_eos(const std::vector<llama_token> & output) const {
+        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        int final_prev_index = -1;
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+
+        symbols_final.clear();
+
+        for (const auto & word : word_collection) {
+            work_queue = llm_bigram_bpe::queue();
+            symbols.clear();
+
+            int index = 0;
+            size_t offset = 0;
+
+            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
+            while (offset < word.size()) {
+                llm_symbol sym;
+                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
+                sym.text = word.c_str() + offset;
+                sym.n = char_len;
+                offset += sym.n;
+                sym.prev = index - 1;
+                sym.next = offset == word.size() ? -1 : index + 1;
+                index++;
+                symbols.emplace_back(sym);
+            }
+            for (int i = 1; i < (int) symbols.size(); ++i) {
+                add_new_bigram(i - 1, i);
+            }
+
+            // build token(s)
+            while (!work_queue.empty()) {
+                auto bigram = work_queue.pop_move();
+
+                auto & left_symbol = symbols[bigram.left];
+                auto & right_symbol = symbols[bigram.right];
+
+                if (left_symbol.n == 0 || right_symbol.n == 0) {
+                    continue;
+                }
+                std::string left_token = std::string(left_symbol.text, left_symbol.n);
+                std::string right_token = std::string(right_symbol.text, right_symbol.n);
+                if (left_token + right_token != bigram.text) {
+                    continue;  // Skip this bigram if it's outdated
+                }
+
+                // merge the right sym into the left one
+                left_symbol.n += right_symbol.n;
+                right_symbol.n = 0;
+
+                // remove the right sym from the chain
+                left_symbol.next = right_symbol.next;
+                if (right_symbol.next >= 0) {
+                    symbols[right_symbol.next].prev = bigram.left;
+                }
+
+                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
+                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
+            }
+
+            // add the finished tokens to the final list keeping correct order for next and prev
+            for (auto & sym : symbols) {
+                if (sym.n > 0) {
+                    sym.prev = final_prev_index;
+                    sym.next = -1;
+                    if (final_prev_index != -1) {
+                        symbols_final[final_prev_index].next = symbols_final.size();
+                    }
+                    symbols_final.emplace_back(sym);
+                    final_prev_index = symbols_final.size() - 1;
+                }
+            }
+        }
+
+        symbols = symbols_final;
+
+        if (!symbols.empty()) {
+            for (int i = 0; i != -1; i = symbols[i].next) {
+                auto & symbol = symbols[i];
+                if (symbol.n == 0) {
+                    continue;
+                }
+
+                const std::string str = std::string(symbol.text, symbol.n);
+                const auto token = vocab.text_to_token(str);
+
+                if (token == LLAMA_TOKEN_NULL) {
+                    for (auto j = str.begin(); j != str.end(); ++j) {
+                        std::string byte_str(1, *j);
+                        auto token_multibyte = vocab.text_to_token(byte_str);
+                        if (token_multibyte != LLAMA_TOKEN_NULL) {
+                            output.push_back(token_multibyte);
+                        }
+                    }
+                } else {
+                    output.push_back(token);
+                }
+            }
+        }
+    }
+
+private:
+    void add_new_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+        std::string left_token  = std::string(symbols[left].text,  symbols[left].n);
+        std::string right_token = std::string(symbols[right].text, symbols[right].n);
+
+        int rank_found = -1;
+
+        rank_found = vocab.find_bpe_rank(left_token, right_token);
+
+        if (rank_found < 0) {
+            return;
+        }
+
+        llm_bigram_bpe bigram;
+
+        bigram.left  = left;
+        bigram.right = right;
+        bigram.text  = left_token + right_token;
+        bigram.size  = left_token.size() + right_token.size();
+        bigram.rank  = rank_found;
+
+        work_queue.push(bigram);
+    }
+
+    const llama_vocab & vocab;
+    const llm_tokenizer_bpe & tokenizer;
+
+    std::vector<llm_symbol> symbols;
+    std::vector<llm_symbol> symbols_final;
+    llm_bigram_bpe::queue work_queue;
+};
+
+//
+// WPM tokenizer
+//
+
+struct llm_tokenizer_wpm : llm_tokenizer {
+    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
+};
+
+struct llm_tokenizer_wpm_session {
+    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        // normalize and split by whitespace
+        std::vector<std::string> words = preprocess(text);
+        // bos token prepended already
+
+        // find the longest tokens that form the words
+        for (const std::string & word : words) {
+            // skip empty words
+            if (word.size() == 0) {
+                continue;
+            }
+
+            // prepend phantom space
+            const std::string word1 = "\xe2\x96\x81" + word;
+            const int n = word1.size();
+
+            const size_t current_tokens = output.size();
+
+            // we're at the start of a new word
+            // move through character position in word
+            for (int i = 0; i < n; ++i) {
+                // loop through possible match length
+                bool match = false;
+                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
+                    auto id = vocab.text_to_token(word1.substr(i, j - i));
+                    if (id != LLAMA_TOKEN_NULL) {
+                        output.push_back(id);
+                        match = true;
+                        i = j - 1;
+                        break;
+                    }
+                }
+
+                if (!match) { // discard all
+                    output.resize(current_tokens);
+                    break;  // and discard next tokens
+                }
+            }
+
+            // we didn't find any matches for this word
+            if (current_tokens == output.size()) {
+                output.push_back(vocab.token_unk());
+            }
+        }
+    }
+
+    // TODO: reduce string copies by using cpts_offs array
+    static std::vector<std::string> preprocess(const std::string & text)  {
+        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
+        std::vector<std::string> words(1, "");
+
+        for (const uint32_t cpt : cpts_nfd) {
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
+
+            if (flags.is_whitespace) {
+                if (words.back().size()) {  // finish previous word if any
+                    words.emplace_back();
+                }
+                continue;
+            }
+
+            assert (!flags.is_separator);
+            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+                continue;
+            }
+
+            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+                if (words.back().size()) {  // finish previous word if any
+                    words.emplace_back();
+                }
+                words.back() = s;       // single char word
+                words.emplace_back();   // start a new word
+            } else {
+                words.back() += s;  // append char to word
+            }
+        }
+
+        if (!words.back().size()) {
+            words.pop_back();
+        }
+
+        return words;
+    }
+
+    static bool is_chinese_char(uint32_t cpt) {
+        return
+            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
+            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
+            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
+            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
+            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
+            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
+            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
+            //(cpt >= 0x3000  && cpt <= 0x303F)  ||
+            //(cpt >= 0xFF00  && cpt <= 0xFFEF);
+    }
+
+private:
+    const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_wpm * wpm_tokenizer;
+};
+
+//
+// UGM tokenizer
+//
+
+struct llm_tokenizer_ugm : llm_tokenizer {
+    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
+        if (precompiled_charsmap.size() > 0) {
+            size_t charsmap_offset = 0;
+
+            // First four bytes of precompiled_charsmap contains length of binary
+            // blob containing XOR-compressed compact double array (XCDA) entries
+            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
+            charsmap_offset += sizeof(xcda_blob_size);
+            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
+                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+            }
+
+            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
+            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
+            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
+            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
+            charsmap_offset += xcda_blob_size;
+
+            // Remaining bytes of precompiled charsmap contain null-terminated
+            // replacement strings for prefixes matched by the XCDA.
+            prefix_replacements = &precompiled_charsmap[charsmap_offset];
+            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
+        }
+
+        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+            const auto & token_data = vocab.get_token_data(id);
+
+            if (vocab.is_normal(id)) {
+                min_score = std::min<float>(min_score, token_data.score);
+                max_score = std::max<float>(max_score, token_data.score);
+            }
+
+            if (vocab.is_normal(id) ||
+                vocab.is_user_defined(id) ||
+                vocab.is_unused(id)) {
+                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
+            }
+
+            if (vocab.is_user_defined(id)) {
+                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
+            }
+        }
+
+        unknown_token_score = min_score - unknown_token_score_penalty;
+    }
+
+    // escaped space symbol - U+2581 (Lower One Eighth Block)
+    const std::string escaped_space = "\xE2\x96\x81";
+
+    const char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;
+
+    const uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;
+
+    struct naive_trie user_defined_token_matcher;
+
+    float min_score = FLT_MAX;
+    float max_score = -FLT_MAX;
+
+    float unknown_token_score_penalty = 10.0;
+    float unknown_token_score;
+
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_ugm_session {
+    llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+
+    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
+     * unigram language models. The general idea is to:
+     * - move along the input sequence in steps of one UTF code point,
+     * - at each step find all possible tokenizations of the prefix by
+     *   traversing the tokens trie,
+     * - for each tokenization store the best one so far (by higher score)
+     * - use the position in sequence after given token as an index to store
+     *   results
+     * - if there was no valid tokenization of the current UTF code point
+     *   then use unknown token with additional score penalty
+     * After processing the whole sequence we backtrack from the end to get
+     * the best tokenization.
+    */
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
+        // normalize the input first
+        std::string normalized;
+        normalize(text, &normalized);
+        size_t input_len = normalized.size();
+        if (input_len == 0) {
+            return;
+        }
+
+        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        // at the beginning tokenization score is zero
+        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+
+        for (size_t input_offset = 0; input_offset < input_len;) {
+            size_t prefix_offset = input_offset;
+            // calculate how many code units are in the currently processed UTF code point
+            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);
+
+            // traverse the token matcher trie to find a matching token
+            bool single_codepoint_token_found = false;
+            const struct best_tokenization & current_best = tokenization_results[input_offset];
+            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
+
+            while (prefix_offset <= input_len && node != NULL) {
+                // check if we found valid token in prefix
+                if (node->has_value) {
+                    // check if it corresponds to the whole UTF code point
+                    if (prefix_offset - input_offset == n_utf8_code_units) {
+                        single_codepoint_token_found = true;
+                    }
+                    llama_token token_id = node->value;
+                    const auto & token_data = vocab.get_token_data(token_id);
+
+                    // we set the user-defined token scores to 0 to make them more likely to be selected
+                    // (normal token scores are log probabilities, so they are negative)
+                    // score type is double here to make tokenization results exactly
+                    // the same as in the HF tokenizer using SentencePiece
+                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
+                    const double challenger_score = current_best.score_sum + token_score;
+                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+                    if (challenger_score > current_champ.score_sum) {
+                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        current_champ = challenger;
+                    }
+                }
+                node = node->traverse(normalized[prefix_offset++]);
+            }
+
+            // if we didn't find a valid token corresponding to the whole UTF code point
+            // then use unknown token as the tokenization of this UTF code point
+            if (!single_codepoint_token_found) {
+                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
+                prefix_offset = input_offset + n_utf8_code_units;
+                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+                if (challenger_score > current_champ.score_sum) {
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    current_champ = challenger;
+                }
+            }
+
+            // move to the next UTF code point
+            input_offset += n_utf8_code_units;
+        }
+
+        // now backtrack from the end to gather token ids of the best tokenization
+        // merge sequences of consecutive unknown tokens into single unknown tokens
+        bool is_prev_unknown = false;
+        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
+            bool is_unknown = tokenization.token_id == vocab.token_unk();
+            if (!(is_prev_unknown && is_unknown)) {
+                output.push_back(tokenization.token_id);
+            }
+            if (tokenization.input_offset == 0) {
+                break;
+            }
+            is_prev_unknown = is_unknown;
+        }
+
+        // reverse the output since we added tokens starting from the end of the input
+        std::reverse(output.begin() + output_size, output.end());
+    }
+
+private:
+
+    // helper structure for returning normalization results
+    struct normalization_result {
+        const char * normalized;
+        size_t normalized_len;
+        size_t consumed_input;
+    };
+
+    void normalize(const std::string& input, std::string * normalized) {
+        normalized->clear();
+        normalized->reserve(input.size() * 3);
+
+        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
+
+        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+        const bool shall_append_space  =  vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+        const bool shall_merge_spaces  =  vocab.get_remove_extra_whitespaces();
+
+        bool is_space_prepended = false;
+        bool processing_non_ws = false;
+
+        size_t input_len = input.size();
+
+        for (size_t input_offset = 0; input_offset < input_len; ) {
+            auto norm_res = normalize_prefix(input, input_offset);
+            for (size_t i = 0; i < norm_res.normalized_len; i++) {
+                char c = norm_res.normalized[i];
+                if (c != ' ') {
+                    if (!processing_non_ws) {
+                        processing_non_ws = true;
+                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
+                            normalized->append(space);
+                            is_space_prepended = true;
+                        }
+                    }
+                    normalized->push_back(c);
+                } else {
+                    if (processing_non_ws) {
+                        processing_non_ws = false;
+                    }
+                    if (!shall_merge_spaces) {
+                        normalized->append(space);
+                    }
+                }
+            }
+
+            input_offset += norm_res.consumed_input;
+        }
+
+        if (shall_append_space) {
+            normalized->append(space);
+        }
+    }
+
+    /*
+     * This structure is a view wrapper for XOR-compressed double array (XCDA)
+     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
+     * Each bit-packed entry contains:
+     * - BASE array value in bits 10-30
+     * - LCHECK array value in bits 0-7
+     * - LEAF array value in bit 9
+     * Entries containing indexes of replacement sequences have set bit 31
+     */
+    struct xcda_array_view {
+    public:
+        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
+        }
+        uint32_t get_base(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
+        }
+        uint32_t get_lcheck(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return packed_node & ((1U << 31) | 0xff);
+        }
+        bool get_leaf(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return (packed_node >> 8) & 1;
+        }
+        uint32_t get_value(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return packed_node & ((1U << 31) - 1);
+        }
+    private:
+        uint32_t get_node(size_t index) {
+            if (index > xcda_array_size) {
+                throw std::runtime_error("Index out of array bounds in XCDA array!");
+            }
+            return xcda_array[index];
+        }
+        const uint32_t * xcda_array;
+        size_t xcda_array_size;
+    };
+
+    // this structure stores the best tokenization so far at input_offset
+    struct best_tokenization {
+        llama_token token_id;
+        size_t input_offset;
+        float score_sum;
+    };
+
+    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
+        if (input_offset == input.size()) {
+            return { &input[input_offset], 0, 0 };
+        }
+
+        // if input prefix matches some user-defined token return this token as normalization result
+        auto user_defined_token_match =
+           tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        if (user_defined_token_match.second > 0) {
+            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
+        }
+
+        size_t longest_prefix_length = 0;
+        size_t longest_prefix_offset = 0;
+
+        if (tokenizer.xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
+
+            // Find the longest normalized sequence matching the input prefix by walking
+            // the XOR-compressed compact double array (XCDA) starting from the root node
+            // We find the index of the next node by calculating BASE[s] ^ c where s is
+            // the index of the previous node and c is a numerical character value
+            uint32_t node_index = 0;
+            // get BASE of the root node
+            node_index = xcda_view.get_base(node_index);
+            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
+                unsigned char c = input[prefix_offset];
+                if (c == 0) {
+                    break;
+                }
+                node_index ^= c;
+                // if value of LCHECK is not c it means that this is not a child of
+                // the previous node, so we stop matching
+                if (xcda_view.get_lcheck(node_index) != c) {
+                    break;
+                }
+                bool is_leaf = xcda_view.get_leaf(node_index);
+                // get BASE of the current node
+                node_index ^= xcda_view.get_base(node_index);
+                // if LEAF of the current node is true, it means that its BASE points to the node
+                // containing index of replacement sequence for currently matched input prefix
+                if (is_leaf)
+                {
+                    longest_prefix_length = prefix_offset - input_offset + 1;
+                    // get index of replacement sequence for currently matched input prefix
+                    longest_prefix_offset = xcda_view.get_value(node_index);
+                }
+            }
+        }
+
+        if (longest_prefix_length > 0) {
+            // we have a match, so return the replacement sequence
+            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
+                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+            }
+            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
+            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+        }
+
+        // check if the input prefix contains a valid sequence of UTF-8 code units
+        try {
+            // if yes, return this sequence unmodified
+            size_t prefix_offset = input_offset;
+            unicode_cpt_from_utf8(input, prefix_offset);
+            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+        } catch (std::invalid_argument & /*ex*/) {
+            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+            return { "\xEF\xBF\xBD", 3, 1 };
+        }
+    }
+
+    const llama_vocab & vocab;
+    const llm_tokenizer_ugm & tokenizer;
+};
+
+//
+// RWKV tokenizer
+//
+
+static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
+    std::vector<uint8_t> output;
+    output.reserve(escaped.size());
+
+    // Parser state
+    bool escaping = false;
+    uint8_t hex_remaining = 0;
+    uint8_t hex_acc = 0;
+
+    // Step through characters, performing parsing
+    for (const char & c : escaped) {
+        // If we're parsing a hex code, interpret the next character
+        if (hex_remaining != 0) {
+            uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
+            hex_acc = (hex_acc << 4) + value;
+
+            hex_remaining -= 1;
+            if (hex_remaining == 0) {
+                output.push_back(hex_acc);
+                hex_acc = 0;
+            }
+
+            continue;
+        }
+
+        // If we got an escape character, interpret it
+        if (escaping) {
+            if (c == 't') {
+                output.push_back('\t');
+            } else if (c == 'n') {
+                output.push_back('\n');
+            } else if (c == 'r') {
+                output.push_back('\r');
+            } else if (c == 'x') {
+                hex_remaining = 2;
+            } else {
+                output.push_back(c);
+            }
+
+            escaping = false;
+            continue;
+        }
+
+        if (c == '\\') {
+            escaping = true;
+            continue;
+        }
+
+        output.push_back(c);
+    }
+
+    return output;
+}
+
+struct llm_tokenizer_rwkv : llm_tokenizer {
+    llm_tokenizer_rwkv(const llama_vocab & vocab) {
+        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
+        // For now, we decode the vocab here into the lookup we'll use for tokenization.
+
+        // build trie
+        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+            const auto & data = vocab.get_token_data(id);
+            const auto text = llama_unescape_rwkv_token(data.text);
+            token_matcher.insert((const char *) text.data(), text.size(), id);
+        }
+    }
+
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_rwkv_session {
+    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        uint32_t position = 0;
+        while (position < text.size()) {
+            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
+            if (node == NULL) {
+                // no matching token found, add unknown token
+                output.push_back(vocab.token_unk());
+                position += 1;
+                continue;
+            }
+
+            // traverse the trie to find the longest matching token
+            uint32_t token_id = 0;
+            uint32_t token_length = 0;
+            while (node != NULL) {
+                if (node->has_value) {
+                    token_id = node->value;
+                    token_length = position + 1;
+                }
+                node = node->traverse(text[++position]);
+            }
+
+            // add the longest matching token
+            output.push_back(token_id);
+            position = token_length;
+        }
+    }
+
+private:
+    const llama_vocab & vocab;
+    const llm_tokenizer_rwkv & tokenizer;
+};
+
+//
+// impl
+//
+
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
+    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
+struct fragment_buffer_variant {
+    fragment_buffer_variant(llama_token _token)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+        token(_token),
+        raw_text(_dummy),
+        offset(0),
+        length(0) {}
+
+    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        token((llama_token) - 1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length){
+            GGML_ASSERT(_offset >= 0);
+            GGML_ASSERT(_length >= 1);
+            GGML_ASSERT(offset + length <= raw_text.length());
+        }
+
+    const FRAGMENT_BUFFER_VARIANT_TYPE type;
+    const llama_token token;
+    const std::string _dummy;
+    const std::string & raw_text;
+    const uint64_t offset;
+    const uint64_t length;
+};
+
+struct llama_vocab::impl {
+    uint32_t n_token_types = 0; // for BERT-style token types
+
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+
+    int max_token_len = 0; // used for optimizing longest token search
+
+    // default LLaMA special tokens
+    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
+    llama_token special_bos_id  = 1;
+    llama_token special_eos_id  = 2;
+    llama_token special_eot_id  = LLAMA_TOKEN_NULL;
+    llama_token special_eom_id  = LLAMA_TOKEN_NULL;
+    llama_token special_unk_id  = 0;
+    llama_token special_sep_id  = LLAMA_TOKEN_NULL;
+    llama_token special_pad_id  = LLAMA_TOKEN_NULL;
+    llama_token special_mask_id = LLAMA_TOKEN_NULL;
+
+    llama_token linefeed_id = 13;
+
+    // fim tokens
+    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+
+    // tokenizer flags
+    bool add_space_prefix           = false;
+    bool add_bos                    = false;
+    bool add_eos                    = false;
+    bool ignore_merges              = false;
+    bool clean_spaces               = false;  // clean_up_tokenization_spaces
+    bool remove_extra_whitespaces   = false;
+    bool escape_whitespaces         = true;
+    bool treat_whitespace_as_suffix = false;
+
+    std::unordered_map<std::string, llama_token> token_to_id;
+    std::vector<token_data>                      id_to_token;
+
+    std::vector<llama_token> cache_special_tokens;
+    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
+    struct pair_hash {
+        size_t operator()(const std::pair<std::string, std::string> & p) const {
+            return std::hash<std::string>{}(p.first) ^  //create some hash for pair
+                   (std::hash<std::string>{}(p.second) << 1);
+        }
+    };
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
+
+    // set of all tokens that cause "end of generation"
+    std::set<llama_token> special_eog_ids;
+
+    std::unique_ptr<llm_tokenizer> tokenizer;
+
+    std::vector<char> precompiled_charsmap;
+
+    impl(const llama_vocab & vocab) : vocab(vocab) {
+    }
+
+    ~impl() = default;
+
+    void load(llama_model_loader & ml, const LLM_KV & kv);
+
+    enum llama_vocab_type get_type() const;
+
+    std::string type_name() const;
+
+    bool is_normal      (llama_token id) const;
+    bool is_unknown     (llama_token id) const;
+    bool is_control     (llama_token id) const;
+    bool is_byte        (llama_token id) const;
+    bool is_user_defined(llama_token id) const;
+    bool is_unused      (llama_token id) const;
+    bool is_eog         (llama_token id) const;
+
+    uint8_t token_to_byte(llama_token id) const;
+
+    llama_token_attr token_get_attr(llama_token id) const;
+
+    void init_tokenizer(enum llama_vocab_type type);
+
+    void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
+
+    std::string token_to_piece_for_cache(
+                  llama_token   token,
+                         bool   special) const;
+
+
+    std::vector<llama_token> tokenize(
+            const std::string & raw_text,
+                         bool   add_special,
+                         bool   parse_special = false) const;
+
+    int32_t tokenize(
+                   const char * text,
+                      int32_t   text_len,
+                  llama_token * tokens,
+                      int32_t   n_tokens_max,
+                         bool   add_special,
+                         bool   parse_special) const;
+
+    // does not write null-terminator to buf
+    int32_t token_to_piece(
+                  llama_token   token,
+                         char * buf,
+                      int32_t   length,
+                      int32_t   lstrip,
+                         bool   special) const;
+
+    // use cached data
+    const std::string & token_to_piece(llama_token token) const;
+
+    int32_t detokenize(
+            const llama_token * tokens,
+                      int32_t   n_tokens,
+                         char * text,
+                      int32_t   text_len_max,
+                         bool   remove_special,
+                         bool   unparse_special) const;
+
+    std::string detokenize(
+            const std::vector<llama_token> & tokens,
+                                      bool   special) const;
+
+    void print_info() const;
+
+private:
+    const llama_vocab & vocab;
+};
+
+void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+    struct gguf_context * ctx = ml.meta.get();
+
+    // determine vocab type
+    {
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
+
+        ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
+
+        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
+            type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            special_bos_id  = LLAMA_TOKEN_NULL;
+            special_eos_id  = LLAMA_TOKEN_NULL;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+            linefeed_id     = LLAMA_TOKEN_NULL;
+
+            // read vocab size from metadata
+            uint32_t n_tokens = 0;
+            if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
+                LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
+                id_to_token.resize(n_tokens);
+            }
+
+            return;
+        }
+
+        if (tokenizer_model == "llama") {
+            type = LLAMA_VOCAB_TYPE_SPM;
+
+            // default special tokens
+            special_bos_id  = 1;
+            special_eos_id  = 2;
+            special_unk_id  = 0;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "bert") {
+            type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            special_bos_id  = 101;
+            special_eos_id  = LLAMA_TOKEN_NULL;
+            special_unk_id  = 100;
+            special_sep_id  = 102;
+            special_pad_id  = 0;
+            special_mask_id = 103;
+        } else if (tokenizer_model == "gpt2") {
+            type = LLAMA_VOCAB_TYPE_BPE;
+
+            // read bpe merges and populate bpe ranks
+            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+
+            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+            for (int i = 0; i < n_merges; i++) {
+                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+                //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+
+                std::string first;
+                std::string second;
+
+                const size_t pos = word.find(' ', 1);
+
+                if (pos != std::string::npos) {
+                    first  = word.substr(0, pos);
+                    second = word.substr(pos + 1);
+                }
+
+                bpe_ranks.emplace(std::make_pair(first, second), i);
+            }
+
+            // default special tokens
+            special_bos_id  = 11;
+            special_eos_id  = 11;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "t5") {
+            type = LLAMA_VOCAB_TYPE_UGM;
+
+            // default special tokens
+            special_bos_id  = LLAMA_TOKEN_NULL;
+            special_eos_id  = 1;
+            special_unk_id  = 2;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = 0;
+            special_mask_id = LLAMA_TOKEN_NULL;
+
+            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+            if (precompiled_charsmap_keyidx != -1) {
+                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
+#ifdef IS_BIG_ENDIAN
+                // correct endiannes of data in precompiled_charsmap binary blob
+                uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
+                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+                uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
+                for (size_t i = 0; i < xcda_array_size; ++i) {
+                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
+                }
+#endif
+            }
+        } else if (tokenizer_model == "rwkv") {
+            type = LLAMA_VOCAB_TYPE_RWKV;
+
+            // default special tokens
+            special_bos_id = LLAMA_TOKEN_NULL;
+            special_eos_id = LLAMA_TOKEN_NULL;
+            special_unk_id = LLAMA_TOKEN_NULL;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = LLAMA_TOKEN_NULL;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
+        }
+
+        // for now, only BPE models have pre-tokenizers
+        if (type == LLAMA_VOCAB_TYPE_BPE) {
+            add_space_prefix = false;
+            clean_spaces = true;
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (tokenizer_pre == "default") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "falcon3") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                ignore_merges = true;
+                add_bos = true;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-v3") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2"   ||
+                    tokenizer_pre == "phi-2"   ||
+                    tokenizer_pre == "jina-es" ||
+                    tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "gigachat"   ||
+                    tokenizer_pre == "jina-v1-en" ||
+                    tokenizer_pre == "jina-v2-es" ||
+                    tokenizer_pre == "jina-v2-de" ||
+                    tokenizer_pre == "jina-v2-code" ||
+                    tokenizer_pre == "roberta-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "refact") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                tokenizer_pre == "command-r") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "qwen2" ||
+                    tokenizer_pre == "deepseek-r1-qwen") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "stablelm2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+            } else if (
+                tokenizer_pre == "olmo") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "dbrx") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
+            } else if (
+                tokenizer_pre == "smaug-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (
+                tokenizer_pre == "poro-chat") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "chatglm-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+                special_bos_id = LLAMA_TOKEN_NULL;
+            } else if (
+                tokenizer_pre == "viking") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "jais") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                clean_spaces = false;
+                ignore_merges = true;
+                add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                tokenizer_pre == "bloom") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                tokenizer_pre == "gpt3-finnish") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+            } else if (
+                tokenizer_pre == "exaone") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+            } else if (
+                tokenizer_pre == "chameleon") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+                add_bos = true;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "minerva-7b") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+            } else if (
+                tokenizer_pre == "megrez") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
+        } else if (type == LLAMA_VOCAB_TYPE_SPM) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_space_prefix = true;
+            clean_spaces = false;
+            add_bos = true;
+            add_eos = false;
+        } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_space_prefix = false;
+            clean_spaces = true;
+            add_bos = true;
+            add_eos = false;
+        } else if (type == LLAMA_VOCAB_TYPE_UGM) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_bos = false;
+            add_eos = true;
+        } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_space_prefix = false;
+            clean_spaces = false;
+            add_bos = false;
+            add_eos = false;
+        } else {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        }
+
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+    }
+
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
+    uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
+    id_to_token.resize(n_tokens);
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        if (word.empty()) {
+            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+            word = "[EMPTY_" + std::to_string(i) + "]";
+        }
+
+        token_to_id[word] = i;
+        max_token_len = std::max(max_token_len, (int) word.size());
+
+        auto & token_data = id_to_token[i];
+        token_data.text  = std::move(word);
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;
+
+        if (toktypes) {  //TODO: remove, required until per token attributes are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+            }
+        }
+    }
+    GGML_ASSERT(id_to_token.size() == token_to_id.size());
+
+    init_tokenizer(type);
+
+    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
+    if (type == LLAMA_VOCAB_TYPE_SPM) {
+        try {
+            linefeed_id = vocab.byte_to_token('\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            linefeed_id = special_pad_id;
+        }
+    } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+        linefeed_id = special_pad_id;
+    } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+        const std::vector<int> ids = tokenize("\n", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        linefeed_id = ids[0];
+    } else {
+        const std::vector<int> ids = tokenize("\n", false);
+
+        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        if (ids.empty()) {
+            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+            linefeed_id = special_pad_id;
+        } else {
+            linefeed_id = ids[0];
+        }
+    }
+
+    // special tokens
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID,     special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     special_pad_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
+        };
+
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it);
+
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
+            }
+        }
+
+        // Handle add_bos and add_eos
+        {
+            bool temp = true;
+
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                add_bos = temp;
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                add_eos = temp;
+            }
+        }
+
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+            if (special_eot_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
+                        || t.first == "<｜end▁of▁sentence｜>" // DeepSeek
+                   ) {
+                    special_eot_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find EOM token: "<|eom_id|>"
+            if (special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                        ) {
+                    special_eom_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+            if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"  // Qwen
+                        || t.first == "<fim-prefix>"
+                        || t.first == "<｜fim▁begin｜>" // DeepSeek
+                        || t.first == "<PRE>"
+                        ) {
+                    special_fim_pre_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+            if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == "<fim-suffix>"
+                        || t.first == "<｜fim▁hole｜>" // DeepSeek
+                        || t.first == "<SUF>"
+                        ) {
+                    special_fim_suf_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+            if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == "<fim-middle>"
+                        || t.first == "<｜fim▁end｜>"  // DeepSeek
+                        || t.first == "<MID>"
+                        ) {
+                    special_fim_mid_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+            if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == "<fim-pad>"
+                        || t.first == "<PAD>"
+                        ) {
+                    special_fim_pad_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+            if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"  // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == "<fim-repo>"
+                        || t.first == "<REPO>"
+                        ) {
+                    special_fim_rep_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                        ) {
+                    special_fim_sep_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+        }
+
+        // maintain a list of tokens that cause end-of-generation
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        special_eog_ids.clear();
+
+        if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
+            special_eog_ids.insert(special_fim_pad_id);
+        }
+
+        if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
+            special_eog_ids.insert(special_fim_rep_id);
+        }
+
+        if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
+            special_eog_ids.insert(special_fim_sep_id);
+        }
+
+        for (const auto & t : token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                special_eog_ids.insert(t.second);
+                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
+            } else {
+                // token is control, but not marked as EOG -> print a debug log
+                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                }
+            }
+        }
+
+        // sanity checks
+        if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
+            special_eog_ids.insert(special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
+            special_eog_ids.insert(special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
+            special_eog_ids.insert(special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+    }
+
+    // build special tokens cache
+    {
+        for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
+            if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
+                cache_special_tokens.push_back(id);
+            }
+        }
+
+        std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
+            [&] (const llama_token a, const llama_token b) {
+                return id_to_token[a].text.size() > id_to_token[b].text.size();
+            }
+        );
+
+        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
+    }
+
+    // build token to piece cache
+    {
+        size_t size_cache = 0;
+
+        std::vector<std::string> cache(n_tokens);
+
+        for (uint32_t id = 0; id < n_tokens; ++id) {
+            cache[id] = token_to_piece_for_cache(id, true);
+
+            size_cache += cache[id].size();
+        }
+
+        std::swap(cache_token_to_piece, cache);
+
+        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+    }
+
+    // Handle per token attributes
+    //NOTE: Each model customizes per token attributes.
+    //NOTE: Per token attributes are missing from the GGUF file.
+    //TODO: Extract attributes from GGUF file.
+    {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (const auto & substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
+            uint32_t current = id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            id_to_token[id].attr = (llama_token_attr) current;
+        };
+
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(token_to_id.at(token), attr, value);
+        };
+
+        std::string model_name;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );
+
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : cache_special_tokens) {
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (const auto * token : {"</s>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+            }
+        }
+    }
+}
+
+enum llama_vocab_type llama_vocab::impl::get_type() const {
+    return type;
+}
+
+std::string llama_vocab::impl::type_name() const{
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+        default:                    return "unknown";
+    }
+}
+
+bool llama_vocab::impl::is_normal(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
+}
+
+bool llama_vocab::impl::is_unknown(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
+}
+
+bool llama_vocab::impl::is_control(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
+}
+
+bool llama_vocab::impl::is_byte(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
+}
+
+bool llama_vocab::impl::is_user_defined(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
+}
+
+bool llama_vocab::impl::is_unused(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
+}
+
+bool llama_vocab::impl::is_eog(llama_token id) const {
+    return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
+}
+
+uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
+    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+    GGML_ASSERT(is_byte(id));
+    const auto & token_data = id_to_token.at(id);
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ABORT("fatal error");
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ABORT("fatal error");
+        }
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token.at(id).attr;
+}
+
+void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
+
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
+            break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
+            break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
+            break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
+            break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
+            break;
+        default:
+            GGML_ABORT("unsupported vocab type");
+    }
+}
+
+//
+// (de-) tokenize
+//
+
+// #define PRETOKENIZERDEBUG
+
+void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
+    // for each special token
+    for (const llama_token special_id : cache_special_tokens) {
+        const auto & data = vocab.get_token_data(special_id);
+        const auto & text = data.text;
+
+        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+            // Ignore control and unknown tokens when parse_special == false
+            continue;
+            // User-defined tokens are still pre-tokenized before everything else
+            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        }
+
+        // for each text fragment
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end()) {
+            auto & fragment = (*it);
+
+            // if a fragment is text ( not yet processed )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                const auto & raw_text = fragment.raw_text;
+
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
+
+                // loop over the text
+                while (true) {
+                    // find the first occurrence of a given special token in this fragment
+                    //  passing offset argument only limit the "search area" but match coordinates
+                    //  are still relative to the source full raw_text
+                    auto match = raw_text.find(text, raw_text_base_offset);
+
+                    // no occurrences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos) break;
+
+                    // check if match is within bounds of offset <-> length
+                    if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
+
+#ifdef PRETOKENIZERDEBUG
+                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    auto source = std::distance(buffer.begin(), it);
+
+                    // if match is further than base offset
+                    //  then we have some text to the left of it
+                    if (match > raw_text_base_offset) {
+                        // left
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+                    // right
+                    if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
+                        int64_t right_reminder_offset = match + text.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+                        }
+
+                        // repeat for the right side
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+                        }
+                        break;
+                    }
+                }
+            }
+            it++;
+        }
+    }
+}
+
+// NOTE: avoid ever using this except for building the token_to_piece caches
+std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache
+    const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
+    }
+
+    return piece;
+}
+
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
+}
+
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
+}
+
+static std::string llama_decode_text(const std::string & text) {
+    std::string decoded_text;
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
+        try {
+            decoded_text += unicode_utf8_to_byte(utf8);
+        } catch (const std::out_of_range & /*e*/) {
+            decoded_text += "[UNK_BYTE_0x";
+            for (const auto c : utf8) {
+                decoded_text += format("%02x", (uint8_t) c);
+            }
+            decoded_text += text + "]";
+        }
+    }
+
+    return decoded_text;
+}
+
+std::vector<llama_token> llama_vocab::impl::tokenize(
+        const std::string & raw_text,
+        bool add_special,
+        bool parse_special) const {
+    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+    std::vector<llama_token> output;
+    std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+    if (!raw_text.empty()) {
+        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+        tokenizer_st_partition(fragment_buffer, parse_special);
+    }
+
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            {
+                // OG tokenizer behavior:
+                //
+                // tokenizer.encode('', add_special_tokens=True)  returns [1]
+                // tokenizer.encode('', add_special_tokens=False) returns []
+
+                bool is_prev_special = true;  // prefix with space if first token
+
+                if (add_special && add_bos) {
+                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_bos_id);
+                    is_prev_special = true;
+                }
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text;
+
+                        // prefix with space if previous is special
+                        if (add_space_prefix && is_prev_special) {
+                            text = ' ';
+                        }
+
+                        text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        llama_escape_whitespace(text);
+                        llm_tokenizer_spm_session session(vocab);
+                        session.tokenize(text, output);
+                        is_prev_special = false;
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                        is_prev_special = true;
+                    }
+                }
+
+                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && add_eos) {
+                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_eos_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            {
+                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
+                // it calls some other methods that are not exist in llm_tokenizer,
+                // here just cast it to bpe tokenizer object
+                if (add_special) {
+                    session.append_bos(output);
+                }
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        session.append(fragment.token, output);
+                    }
+                }
+
+                if (add_special) {
+                    session.append_eos(output);
+                    session.check_double_bos_eos(output);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            {
+                if (add_special) {
+                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_bos_id);
+                }
+
+                llm_tokenizer_wpm_session session(vocab);
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special) {
+                    GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_sep_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            {
+                if (add_special && add_bos) {
+                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_bos_id);
+                }
+                llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && add_eos) {
+                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_eos_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            {
+                llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ABORT("fatal error");
+    }
+
+    return output;
+}
+
+int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+    const llama_token_attr attr = token_get_attr(token);
+    if (!special && (attr & attr_special)) {
+        return 0;
+    }
+
+    // copy piece chars to output text buffer
+    // skip up to 'lstrip' leading spaces before copying
+    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+            token++;
+            size--;
+        }
+        if (length < (int32_t)size) {
+            return -(int32_t) size;
+        }
+        memcpy(buf, token, size);
+        return (int32_t) size;
+    };
+
+    // if we have a cache - use it
+    {
+        const auto & cache = cache_token_to_piece;
+
+        if (!cache.empty()) {
+            const auto & result = cache.at(token);
+            return _try_copy(result.data(), result.size());
+        }
+    }
+
+    if (0 <= token && token < (int32_t) id_to_token.size()) {
+        const std::string & token_text = id_to_token[token].text;
+        switch (get_type()) {
+            case LLAMA_VOCAB_TYPE_WPM:
+            case LLAMA_VOCAB_TYPE_SPM:
+            case LLAMA_VOCAB_TYPE_UGM: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                }
+                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = token_text;
+                    llama_unescape_whitespace(result);
+                    return _try_copy(result.data(), result.size());
+                }
+                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                    char byte = (char) token_to_byte(token);
+                    return _try_copy((char*) &byte, 1);
+                }
+                break;
+            }
+            case LLAMA_VOCAB_TYPE_BPE: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                }
+                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = llama_decode_text(token_text);
+                    return _try_copy(result.data(), result.size());
+                }
+                break;
+            }
+            case LLAMA_VOCAB_TYPE_RWKV: {
+                std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
+
+                // If we don't have enough space, return an error
+                if (result.size() > (size_t)length) {
+                    return -(int)result.size();
+                }
+
+                memcpy(buf, result.data(), result.size());
+                return (int)result.size();
+            }
+            default:
+                GGML_ABORT("fatal error");
+        }
+    }
+
+    return 0;
+}
+
+const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
+    return cache_token_to_piece.at(token);
+}
+
+int32_t llama_vocab::impl::detokenize(
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special) const {
+    if (type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
+    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+    int32_t avail = text_len_max;
+    int32_t total = 0;
+
+    // remove the leading space
+    bool remove_space = add_space_prefix;
+
+    if (remove_special && add_bos) {
+        if (n_tokens > 0 && tokens[0] == special_bos_id) {
+            remove_space = false;
+            n_tokens--;
+            tokens++;
+        }
+    }
+
+    if (remove_special && add_eos) {
+        if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
+            n_tokens--;
+        }
+    }
+
+    for (int32_t i = 0; i < n_tokens; ++i) {
+        GGML_ASSERT(avail >= 0);
+        int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
+        remove_space = false;
+        if (n_chars < 0) {
+            avail = 0;
+            total -= n_chars;
+        } else if (n_chars > 0) {
+            avail -= n_chars;
+            text  += n_chars;
+            total += n_chars;
+        }
+    }
+
+    if (total > text_len_max) {
+        return -total;
+    }
+
+    if (clean_spaces) {
+        text -= total;  // restart text
+
+        // first pass: characters ?!.,  //TODO: where do these characters come from?
+        const int32_t total1 = total;
+        total = total ? 1 : 0;
+        for (int32_t i = 1; i < total1; ++i) {
+            const char x = text[i];
+            if (text[i - 1] == ' ') {
+                if (x == '?' || x == '!' || x == '.' || x == ',') {  // " ?", " !", " .", " ,"
+                    total--;  // remove space
+                }
+            }
+            text[total++] = x;
+        }
+
+        // second pass: strip single apostrophe between spaces
+        const int32_t total2 = total;
+        total = total ? 1 : 0;
+        for (int32_t i = 1; i < total2; ++i) {
+            const char x = text[i];
+            if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') {  // " ' "
+                total--;           // remove prev space
+                text[++i] = '\0';  // remove next space
+            }
+            text[total++] = x;
+        }
+
+        // third pass: apostrophe contractions  //NOTE: this makes sense?
+        const int32_t total3 = total;
+        total = total ? 1 : 0;
+        for (int32_t i = 1; i < total3; ++i) {
+            const char x = text[i];
+            if (text[i - 1] == ' ') {
+                if (x == '\'' && i + 1 < total3) {
+                    const char x1 = text[i + 1];
+                    if (x1 == 't' || x1 == 'd') {  // " 't", " 'd"
+                        //total--;  // remove space
+                    } else if (x1 == 's' || x1 == 'm') {  // " 's", " 'm"
+                        total--;  // remove space
+                    } else if (i + 2 < total3) {
+                        const char x2 = text[i + 2];
+                        if ((x1 == 'l' && x2 == 'l')) {  // " 'll"
+                            //total--;  // remove space
+                        } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) {  // " 're", " 've"
+                            total--;  // remove space
+                        } else {
+                            //total--;  // remove space
+                        }
+                    } else {
+                        //total--;  // remove space
+                    }
+                }
+            }
+            text[total++] = x;
+        }
+    }
+
+    return total <= text_len_max ? total : -total;
+}
+
+void llama_vocab::impl::print_info() const {
+    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, type_name().c_str());
+    LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, vocab.n_tokens());
+    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (uint32_t) bpe_ranks.size());
+
+    // special tokens
+    if (special_bos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, special_bos_id,     id_to_token[special_bos_id].text.c_str() );  }
+    if (special_eos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, special_eos_id,     id_to_token[special_eos_id].text.c_str() );  }
+    if (special_eot_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, special_eot_id,     id_to_token[special_eot_id].text.c_str() );  }
+    if (special_eom_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, special_eom_id,     id_to_token[special_eom_id].text.c_str() );  }
+    if (special_unk_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, special_unk_id,     id_to_token[special_unk_id].text.c_str() );  }
+    if (special_sep_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, special_sep_id,     id_to_token[special_sep_id].text.c_str() );  }
+    if (special_pad_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, special_pad_id,     id_to_token[special_pad_id].text.c_str() );  }
+    if (special_mask_id != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, special_mask_id,    id_to_token[special_mask_id].text.c_str() ); }
+
+    if (linefeed_id != LLAMA_TOKEN_NULL)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, linefeed_id,        id_to_token[linefeed_id].text.c_str() ); }
+
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
+
+    for (const auto & id : special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
+    }
+
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
+}
+
+llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
+}
+
+llama_vocab::~llama_vocab() {
+}
+
+void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
+    pimpl->load(ml, kv);
+}
+
+enum llama_vocab_type llama_vocab::get_type() const {
+    return pimpl->type;
+}
+
+enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
+    return pimpl->pre_type;
+}
+
+uint32_t llama_vocab::n_tokens() const {
+    return (uint32_t) pimpl->id_to_token.size();
+}
+
+uint32_t llama_vocab::n_token_types() const {
+    return (uint32_t) pimpl->n_token_types;
+}
+
+std::string llama_vocab::type_name() const{
+    return pimpl->type_name();
+}
+
+bool llama_vocab::is_normal(llama_token id) const {
+    return pimpl->is_normal(id);
+}
+
+bool llama_vocab::is_unknown(llama_token id) const {
+    return pimpl->is_unknown(id);
+}
+
+bool llama_vocab::is_control(llama_token id) const {
+    return pimpl->is_control(id);
+}
+
+bool llama_vocab::is_byte(llama_token id) const {
+    return pimpl->is_byte(id);
+}
+
+bool llama_vocab::is_user_defined(llama_token id) const {
+    return pimpl->is_user_defined(id);
+}
+
+bool llama_vocab::is_unused(llama_token id) const {
+    return pimpl->is_unused(id);
+}
+
+bool llama_vocab::is_eog(llama_token id) const {
+    return pimpl->is_eog(id);
+}
+
+uint8_t llama_vocab::token_to_byte(llama_token id) const {
+    return pimpl->token_to_byte(id);
+}
+
+llama_token llama_vocab::byte_to_token(uint8_t ch) const {
+    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+    static const char * hex = "0123456789ABCDEF";
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+            auto token = pimpl->token_to_id.find(buf);
+            if (token != pimpl->token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return pimpl->token_to_id.at(buf2);
+        }
+        case LLAMA_VOCAB_TYPE_WPM:
+        case LLAMA_VOCAB_TYPE_BPE: {
+            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
+        }
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+llama_token llama_vocab::text_to_token(const std::string & text) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    auto it = pimpl->token_to_id.find(text);
+    if (it != pimpl->token_to_id.end()) {
+        return (*it).second;
+    }
+    return LLAMA_TOKEN_NULL;
+}
+
+const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id);
+}
+
+const char * llama_vocab::token_get_text(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id).text.c_str();
+}
+
+float llama_vocab::token_get_score(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id).score;
+}
+
+llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
+    return pimpl->token_get_attr(id);
+}
+
+llama_token llama_vocab::token_bos() const {
+    return pimpl->special_bos_id;
+}
+
+llama_token llama_vocab::token_eos() const {
+    return pimpl->special_eos_id;
+}
+
+llama_token llama_vocab::token_eot() const {
+    return pimpl->special_eot_id;
+}
+
+llama_token llama_vocab::token_eom() const {
+    return pimpl->special_eom_id;
+}
+
+llama_token llama_vocab::token_unk() const {
+    return pimpl->special_unk_id;
+}
+
+llama_token llama_vocab::token_sep() const {
+    return pimpl->special_sep_id;
+}
+
+llama_token llama_vocab::token_nl() const {
+    return pimpl->linefeed_id;
+}
+
+llama_token llama_vocab::token_pad() const {
+    return pimpl->special_pad_id;
+}
+
+llama_token llama_vocab::token_prefix() const {
+    return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_middle() const {
+    return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_suffix() const {
+    return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_pre() const {
+    return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_fim_suf() const {
+    return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_mid() const {
+    return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_fim_pad() const {
+    return pimpl->special_fim_pad_id;
+}
+
+llama_token llama_vocab::token_fim_rep() const {
+    return pimpl->special_fim_rep_id;
+}
+
+llama_token llama_vocab::token_fim_sep() const {
+    return pimpl->special_fim_sep_id;
+}
+
+bool llama_vocab::get_add_space_prefix() const {
+    return pimpl->add_space_prefix;
+}
+
+bool llama_vocab::get_add_bos() const {
+    return pimpl->add_bos;
+}
+
+bool llama_vocab::get_add_eos() const {
+    return pimpl->add_eos;
+}
+
+bool llama_vocab::get_ignore_merges() const {
+    return pimpl->ignore_merges;
+}
+
+bool llama_vocab::get_clean_spaces() const {
+    return pimpl->clean_spaces;
+}
+
+bool llama_vocab::get_remove_extra_whitespaces() const {
+    return pimpl->remove_extra_whitespaces;
+}
+
+bool llama_vocab::get_escape_whitespaces() const {
+    return pimpl->escape_whitespaces;
+}
+
+bool llama_vocab::get_treat_whitespace_as_suffix() const {
+    return pimpl->treat_whitespace_as_suffix;
+}
+
+int llama_vocab::max_token_len() const {
+    return pimpl->max_token_len;
+}
+
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
+    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
+    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
+    GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+    auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
+    if (it == pimpl->bpe_ranks.end()) {
+        return -1;
+    }
+
+    return it->second;
+}
+
+int32_t llama_vocab::tokenize(
+                  const char * text,
+                     int32_t   text_len,
+                 llama_token * tokens,
+                     int32_t   n_tokens_max,
+                        bool   add_special,
+                        bool   parse_special) const {
+    auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (n_tokens_max < (int) res.size()) {
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
+std::vector<llama_token> llama_vocab::tokenize(
+        const std::string & raw_text,
+        bool add_special,
+        bool parse_special) const {
+    return pimpl->tokenize(raw_text, add_special, parse_special);
+}
+
+const std::string & llama_vocab::token_to_piece(llama_token token) const {
+    return pimpl->token_to_piece(token);
+}
+
+int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+    return pimpl->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_vocab::detokenize(
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special) const {
+    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
+    }
+
+    text.resize(n_chars);
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
+
+void llama_vocab::print_info() const {
+    pimpl->print_info();
+}
+
+//
+// interface implementation
+//
+
+int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
+    return vocab->n_tokens();
+}
+
+// deprecated
+int32_t llama_n_vocab(const struct llama_vocab * vocab) {
+    return llama_vocab_n_tokens(vocab);
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
+    return vocab->get_type();
+}
+
+const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_text(token);
+}
+
+float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_score(token);
+}
+
+enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_attr(token);
+}
+
+bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->is_eog(token);
+}
+
+bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->is_control(token);
+}
+
+llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
+    return vocab->token_bos();
+}
+
+llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
+    return vocab->token_eos();
+}
+
+llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
+    return vocab->token_eot();
+}
+
+// deprecated
+llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
+    return vocab->token_bos();
+}
+
+llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
+    return vocab->token_sep();
+}
+
+llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
+    return vocab->token_nl();
+}
+
+llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
+    return vocab->token_pad();
+}
+
+bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
+    return vocab->get_add_bos();
+}
+
+bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
+    return vocab->get_add_eos();
+}
+
+llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
+    return vocab->token_fim_pre();
+}
+
+llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
+    return vocab->token_fim_suf();
+}
+
+llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
+    return vocab->token_fim_mid();
+}
+
+llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
+    return vocab->token_fim_pad();
+}
+
+llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
+    return vocab->token_fim_rep();
+}
+
+llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
+    return vocab->token_fim_sep();
+}
+
+// deprecated
+const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_text(vocab, token);
+}
+
+// deprecated
+float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_score(vocab, token);
+}
+
+// deprecated
+enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_attr(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_is_eog(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_is_control(vocab, token);
+}
+
+// deprecated
+llama_token llama_token_bos(const struct llama_vocab * vocab) {
+    return llama_vocab_bos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eos(const struct llama_vocab * vocab) {
+    return llama_vocab_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eot(const struct llama_vocab * vocab) {
+    return llama_vocab_eot(vocab);
+}
+
+// deprecated
+llama_token llama_token_cls(const struct llama_vocab * vocab) {
+    //return llama_vocab_cls(vocab);
+    return llama_vocab_bos(vocab); // avoid deprecation warning
+}
+
+// deprecated
+llama_token llama_token_sep(const struct llama_vocab * vocab) {
+    return llama_vocab_sep(vocab);
+}
+
+// deprecated
+llama_token llama_token_nl (const struct llama_vocab * vocab) {
+    return llama_vocab_nl(vocab);
+}
+
+// deprecated
+llama_token llama_token_pad(const struct llama_vocab * vocab) {
+    return llama_vocab_pad(vocab);
+}
+
+// deprecated
+bool llama_add_bos_token(const struct llama_vocab * vocab) {
+    return llama_vocab_get_add_bos(vocab);
+}
+
+// deprecated
+bool llama_add_eos_token(const struct llama_vocab * vocab) {
+    return llama_vocab_get_add_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_pre(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_suf(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_mid(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_pad(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_rep(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_sep(vocab);
+}
+
+//
+// tokenization
+//
+
+int32_t llama_tokenize(
+    const struct llama_vocab * vocab,
+                  const char * text,
+                     int32_t   text_len,
+                 llama_token * tokens,
+                     int32_t   n_tokens_max,
+                        bool   add_special,
+                        bool   parse_special) {
+    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
+}
+
+int32_t llama_token_to_piece(
+    const struct llama_vocab * vocab,
+                 llama_token   token,
+                        char * buf,
+                     int32_t   length,
+                     int32_t   lstrip,
+                        bool   special) {
+    return vocab->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_detokenize(
+    const struct llama_vocab * vocab,
+           const llama_token * tokens,
+                     int32_t   n_tokens,
+                        char * text,
+                     int32_t   text_len_max,
+                        bool   remove_special,
+                        bool   unparse_special) {
+    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
new file mode 100644
index 000000000..5ce355214
--- /dev/null
+++ b/src/llama-vocab.h
@@ -0,0 +1,125 @@
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <memory>
+
+struct LLM_KV;
+struct llama_model_loader;
+
+struct llama_vocab {
+    struct token_data {
+        std::string      text;
+        float            score;
+        llama_token_attr attr;
+    };
+
+    llama_vocab();
+    ~llama_vocab();
+
+    void load(llama_model_loader & ml, const LLM_KV & kv);
+
+    enum llama_vocab_type     get_type()     const;
+    enum llama_vocab_pre_type get_pre_type() const;
+
+    uint32_t n_tokens() const;
+    uint32_t n_token_types() const;
+
+    std::string type_name() const;
+
+    bool is_normal      (llama_token id) const;
+    bool is_unknown     (llama_token id) const;
+    bool is_control     (llama_token id) const;
+    bool is_byte        (llama_token id) const;
+    bool is_user_defined(llama_token id) const;
+    bool is_unused      (llama_token id) const;
+    bool is_eog         (llama_token id) const;
+
+    uint8_t     token_to_byte(llama_token id) const;
+    llama_token byte_to_token(uint8_t ch)     const;
+
+    llama_token text_to_token(const std::string & text) const;
+
+    const token_data & get_token_data(llama_token id) const;
+
+    const char *     token_get_text (llama_token id) const;
+    float            token_get_score(llama_token id) const;
+    llama_token_attr token_get_attr (llama_token id) const;
+
+    llama_token token_bos() const;
+    llama_token token_eos() const;
+    llama_token token_eot() const;
+    llama_token token_eom() const;
+    llama_token token_unk() const;
+    llama_token token_sep() const;
+    llama_token token_nl () const;
+    llama_token token_pad() const;
+
+    llama_token token_prefix() const;
+    llama_token token_middle() const;
+    llama_token token_suffix() const;
+
+    llama_token token_fim_pre() const;
+    llama_token token_fim_suf() const;
+    llama_token token_fim_mid() const;
+    llama_token token_fim_pad() const;
+    llama_token token_fim_rep() const;
+    llama_token token_fim_sep() const;
+
+    bool get_add_space_prefix          () const;
+    bool get_add_bos                   () const;
+    bool get_add_eos                   () const;
+    bool get_ignore_merges             () const;
+    bool get_clean_spaces              () const;
+    bool get_remove_extra_whitespaces  () const;
+    bool get_escape_whitespaces        () const;
+    bool get_treat_whitespace_as_suffix() const;
+
+    int max_token_len() const;
+
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    int32_t tokenize(
+                   const char * text,
+                      int32_t   text_len,
+                  llama_token * tokens,
+                      int32_t   n_tokens_max,
+                         bool   add_special,
+                         bool   parse_special) const;
+
+    std::vector<llama_token> tokenize(
+            const std::string & raw_text,
+                         bool   add_special,
+                         bool   parse_special = false) const;
+
+    // does not write null-terminator to buf
+    int32_t token_to_piece(
+                  llama_token   token,
+                         char * buf,
+                      int32_t   length,
+                      int32_t   lstrip,
+                         bool   special) const;
+
+    // use cached data
+    const std::string & token_to_piece(llama_token token) const;
+
+    int32_t detokenize(
+            const llama_token * tokens,
+                      int32_t   n_tokens,
+                         char * text,
+                      int32_t   text_len_max,
+                         bool   remove_special,
+                         bool   unparse_special) const;
+
+    std::string detokenize(
+            const std::vector<llama_token> & tokens,
+                                      bool   special) const;
+
+    void print_info() const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
diff --git a/src/llama.cpp b/src/llama.cpp
new file mode 100644
index 000000000..607f27861
--- /dev/null
+++ b/src/llama.cpp
@@ -0,0 +1,10126 @@
+#include "llama-impl.h"
+
+#include "llama-chat.h"
+#include "llama-mmap.h"
+#include "llama-context.h"
+#include "llama-vocab.h"
+#include "llama-sampling.h"
+#include "llama-kv-cache.h"
+#include "llama-model-loader.h"
+#include "llama-model.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cpp.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cfloat>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <functional>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+    // loading time will be recalculated after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = 0;
+    time_meas tm(model.t_load_us);
+
+    model.t_start_us = tm.t_start_us;
+
+    try {
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+
+        ml.print_info();
+
+        model.hparams.vocab_only = params.vocab_only;
+
+        try {
+            model.load_arch(ml);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+        }
+        try {
+            model.load_hparams(ml);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+        }
+        try {
+            model.load_vocab(ml);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+        }
+
+        model.load_stats(ml);
+        model.print_info();
+
+        if (params.vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return 0;
+        }
+
+        if (!model.load_tensors(ml)) {
+            return -2;
+        }
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+        return -1;
+    }
+
+    return 0;
+}
+
+//
+// llm_build
+//
+
+using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
+
+enum llm_ffn_op_type {
+    LLM_FFN_SILU,
+    LLM_FFN_GELU,
+    LLM_FFN_RELU,
+    LLM_FFN_RELU_SQR,
+    LLM_FFN_SWIGLU,
+};
+
+enum llm_ffn_gate_type {
+    LLM_FFN_SEQ,
+    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
+};
+
+enum llm_norm_type {
+    LLM_NORM,
+    LLM_NORM_RMS,
+    LLM_NORM_GROUP,
+};
+
+static struct ggml_tensor * llm_build_inp_embd(
+        struct ggml_context * ctx,
+       struct llama_context & lctx,
+        const llama_hparams & hparams,
+         const llama_ubatch & ubatch,
+         struct ggml_tensor * tok_embd,
+         const llm_build_cb & cb) {
+    const int64_t n_embd = hparams.n_embd;
+
+    struct ggml_tensor * inpL;
+
+    if (ubatch.token) {
+        lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens);
+        cb(lctx.inp_tokens, "inp_tokens", -1);
+        ggml_set_input(lctx.inp_tokens);
+
+        inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+
+        // apply lora for embedding tokens if needed
+        for (auto & it : lctx.lora) {
+            struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd);
+            if (lw == nullptr) {
+                continue;
+            }
+            const float adapter_scale = it.second;
+            const float scale = lw->get_scale(it.first->alpha, adapter_scale);
+            struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+                ctx, lw->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lw->a, lctx.inp_tokens)
+            ), scale);
+            inpL = ggml_add(ctx, inpL, inpL_delta);
+        }
+    } else {
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
+        inpL = lctx.inp_embd;
+        ggml_set_input(lctx.inp_embd);
+    }
+
+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
+    cb(inpL, "inp_embd", -1);
+
+    return inpL;
+}
+
+static void llm_build_kv_store(
+        struct ggml_context * ctx,
+        const llama_hparams & hparams,
+        const llama_cparams & cparams,
+       const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * k_cur,
+         struct ggml_tensor * v_cur,
+                    int32_t   n_tokens,
+                    int32_t   kv_head,
+         const llm_build_cb & cb,
+                    int64_t   il) {
+    const int64_t n_ctx = cparams.n_ctx;
+
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+    GGML_ASSERT(kv.size == n_ctx);
+
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
+    cb(k_cache_view, "k_cache_view", il);
+
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    if (cparams.flash_attn) {
+        v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+                (  n_ctx)*ggml_element_size(kv.v_l[il]),
+                (kv_head)*ggml_element_size(kv.v_l[il]));
+
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
+    cb(v_cache_view, "v_cache_view", il);
+
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
+}
+
+// do mat_mul, while optionally apply lora
+static struct ggml_tensor * llm_build_lora_mm(
+        struct llama_context & lctx,
+         struct ggml_context * ctx0,
+          struct ggml_tensor * w,
+          struct ggml_tensor * cur) {
+    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
+    for (auto & it : lctx.lora) {
+        struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+        if (lw == nullptr) {
+            continue;
+        }
+        const float adapter_scale = it.second;
+        const float scale = lw->get_scale(it.first->alpha, adapter_scale);
+        struct ggml_tensor * ab_cur = ggml_mul_mat(
+            ctx0, lw->b,
+            ggml_mul_mat(ctx0, lw->a, cur)
+        );
+        ab_cur = ggml_scale(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+    return res;
+}
+
+// do mat_mul_id, while optionally apply lora
+static struct ggml_tensor * llm_build_lora_mm_id(
+        struct llama_context & lctx,
+         struct ggml_context * ctx0,
+          struct ggml_tensor * w,   // struct ggml_tensor * as
+          struct ggml_tensor * cur, // struct ggml_tensor * b
+          struct ggml_tensor * ids) {
+    struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
+    for (auto & it : lctx.lora) {
+        struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+        if (lw == nullptr) {
+            continue;
+        }
+        const float alpha = it.first->alpha;
+        const float rank  = (float) lw->b->ne[0];
+        const float scale = alpha ? it.second * alpha / rank : it.second;
+        struct ggml_tensor * ab_cur = ggml_mul_mat_id(
+            ctx0, lw->b,
+            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
+            ids
+        );
+        ab_cur = ggml_scale(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+    return res;
+}
+
+static struct ggml_tensor * llm_build_norm(
+        struct ggml_context * ctx,
+         struct ggml_tensor * cur,
+        const llama_hparams & hparams,
+         struct ggml_tensor * mw,
+         struct ggml_tensor * mb,
+              llm_norm_type   type,
+         const llm_build_cb & cb,
+                        int   il) {
+    switch (type) {
+        case LLM_NORM:       cur = ggml_norm      (ctx, cur, hparams.f_norm_eps);     break;
+        case LLM_NORM_RMS:   cur = ggml_rms_norm  (ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM_GROUP:
+            {
+                cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
+                cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
+                cur = ggml_reshape_2d(ctx, cur, cur->ne[0],    cur->ne[2]);
+            } break;
+    }
+
+    if (mw || mb) {
+        cb(cur, "norm", il);
+    }
+
+    if (mw) {
+        cur = ggml_mul(ctx, cur, mw);
+        if (mb) {
+            cb(cur, "norm_w", il);
+        }
+    }
+
+    if (mb) {
+        cur = ggml_add(ctx, cur, mb);
+    }
+
+    return cur;
+}
+
+static struct ggml_tensor * llm_build_ffn(
+        struct ggml_context * ctx,
+       struct llama_context & lctx,
+         struct ggml_tensor * cur,
+         struct ggml_tensor * up,
+         struct ggml_tensor * up_b,
+         struct ggml_tensor * up_s,
+         struct ggml_tensor * gate,
+         struct ggml_tensor * gate_b,
+         struct ggml_tensor * gate_s,
+         struct ggml_tensor * down,
+         struct ggml_tensor * down_b,
+         struct ggml_tensor * down_s,
+         struct ggml_tensor * act_scales,
+            llm_ffn_op_type   type_op,
+          llm_ffn_gate_type   type_gate,
+         const llm_build_cb & cb,
+                        int   il) {
+    struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
+    cb(tmp, "ffn_up", il);
+
+    if (up_b) {
+        tmp = ggml_add(ctx, tmp, up_b);
+        cb(tmp, "ffn_up_b", il);
+    }
+
+    if (up_s) {
+        tmp = ggml_mul(ctx, tmp, up_s);
+        cb(tmp, "ffn_up_s", il);
+    }
+
+    if (gate) {
+        switch (type_gate) {
+            case LLM_FFN_SEQ:
+                {
+                    cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
+                    cb(cur, "ffn_gate", il);
+                } break;
+            case LLM_FFN_PAR:
+                {
+                    cur = llm_build_lora_mm(lctx, ctx, gate, cur);
+                    cb(cur, "ffn_gate", il);
+                } break;
+        }
+
+        if (gate_b) {
+            cur = ggml_add(ctx, cur, gate_b);
+            cb(cur, "ffn_gate_b", il);
+        }
+
+        if (gate_s) {
+            cur = ggml_mul(ctx, cur, gate_s);
+            cb(cur, "ffn_gate_s", il);
+        }
+
+    } else {
+        cur = tmp;
+    }
+
+    switch (type_op) {
+        case LLM_FFN_SILU:
+            {
+                cur = ggml_silu(ctx, cur);
+                cb(cur, "ffn_silu", il);
+            } break;
+        case LLM_FFN_GELU:
+            {
+                cur = ggml_gelu(ctx, cur);
+                cb(cur, "ffn_gelu", il);
+                if (act_scales != NULL) {
+                    cur = ggml_div(ctx, cur, act_scales);
+                    cb(cur, "ffn_act", il);
+                }
+            } break;
+        case LLM_FFN_RELU:
+            {
+                cur = ggml_relu(ctx, cur);
+                cb(cur, "ffn_relu", il);
+            } break;
+        case LLM_FFN_RELU_SQR:
+            {
+                cur = ggml_relu(ctx, cur);
+                cb(cur, "ffn_relu", il);
+
+                cur = ggml_sqr(ctx, cur);
+                cb(cur, "ffn_sqr(relu)", il);
+            } break;
+        case LLM_FFN_SWIGLU:
+            {
+                // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
+                int64_t split_point = cur->ne[0] / 2;
+                struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                x0 = ggml_silu(ctx, x0);
+                cb(cur, "ffn_silu", il);
+
+                cur = ggml_mul(ctx, x0, x1);
+                cb(cur, "ffn_mul", il);
+            } break;
+    }
+
+    if (type_gate == LLM_FFN_PAR) {
+        cur = ggml_mul(ctx, cur, tmp);
+        cb(cur, "ffn_gate_par", il);
+    }
+
+    if (down) {
+        cur = llm_build_lora_mm(lctx, ctx, down, cur);
+    }
+
+    if (down_b) {
+        cb(cur, "ffn_down", il);
+    }
+
+    if (down_b) {
+        cur = ggml_add(ctx, cur, down_b);
+    }
+
+    if (down_s) {
+        cur = ggml_mul(ctx, cur, down_s);
+        cb(cur, "ffn_down_s", il);
+    }
+
+    return cur;
+}
+
+static struct ggml_tensor * llm_build_moe_ffn(
+        struct ggml_context * ctx,
+       struct llama_context & lctx,
+         struct ggml_tensor * cur,
+         struct ggml_tensor * gate_inp,
+         struct ggml_tensor * up_exps,
+         struct ggml_tensor * gate_exps,
+         struct ggml_tensor * down_exps,
+         struct ggml_tensor * exp_probs_b,
+                    int64_t   n_expert,
+                    int64_t   n_expert_used,
+            llm_ffn_op_type   type_op,
+                       bool   norm_w,
+                       bool   scale_w,
+                      float   w_scale,
+llama_expert_gating_func_type gating_op,
+         const llm_build_cb & cb,
+                        int   il) {
+    int64_t n_embd = cur->ne[0];
+    int64_t n_tokens = cur->ne[1];
+
+    ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
+    cb(logits, "ffn_moe_logits", il);
+
+    ggml_tensor * probs = nullptr;
+    switch (gating_op) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
+            {
+                probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
+            {
+                probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+    cb(probs, "ffn_moe_probs", il);
+
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
+    // select experts
+    ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    cb(selected_experts, "ffn_moe_topk", il);
+
+    ggml_tensor * weights = ggml_get_rows(ctx,
+            ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    cb(weights, "ffn_moe_weights", il);
+
+    if (norm_w) {
+        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+        cb(weights_sum, "ffn_moe_weights_sum", il);
+
+        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+        cb(weights, "ffn_moe_weights_norm", il);
+
+        weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+    }
+    if (scale_w) {
+        weights = ggml_scale(ctx, weights, w_scale);
+        cb(weights, "ffn_moe_weights_scaled", il);
+    }
+
+    cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+    ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(up, "ffn_moe_up", il);
+
+    ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    cb(gate, "ffn_moe_gate", il);
+
+    switch (type_op) {
+        case LLM_FFN_SILU:
+            {
+                gate = ggml_silu(ctx, gate);
+                cb(gate, "ffn_moe_silu", il);
+            } break;
+        case LLM_FFN_GELU:
+            {
+                gate = ggml_gelu(ctx, gate);
+                cb(gate, "ffn_moe_gelu", il);
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+
+    ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+    cb(par, "ffn_moe_gate_par", il);
+
+    ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    cb(experts, "ffn_moe_down", il);
+
+    experts = ggml_mul(ctx, experts, weights);
+
+    // aggregate experts
+    ggml_tensor * moe_out = nullptr;
+    for (int i = 0; i < n_expert_used; ++i) {
+        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+                experts->nb[2], i*experts->nb[1]);
+
+        if (i == 0) {
+            moe_out = cur_expert;
+        } else {
+            moe_out = ggml_add(ctx, moe_out, cur_expert);
+        }
+    }
+
+    if (n_expert_used == 1) {
+        // avoid returning a non-contiguous tensor
+        moe_out = ggml_cont(ctx, moe_out);
+    }
+
+    return moe_out;
+}
+
+static struct ggml_tensor * llm_build_kqv(
+        struct ggml_context * ctx,
+       struct llama_context & lctx,
+       const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * wo,
+         struct ggml_tensor * wo_b,
+         struct ggml_tensor * q_cur,
+         struct ggml_tensor * kq_mask,
+                    int32_t   n_tokens,
+                    int32_t   n_kv,
+                    float     kq_scale,
+         const llm_build_cb & cb,
+                    int       il) {
+    const llama_model   & model   = lctx.model;
+    const llama_hparams & hparams = lctx.model.hparams;
+    const llama_cparams & cparams = lctx.cparams;
+
+    const int64_t n_ctx         = cparams.n_ctx;
+    const int64_t n_head        = hparams.n_head(il);
+    const int64_t n_head_kv     = hparams.n_head_kv(il);
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(il);
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(il);
+
+    struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
+    cb(q, "q", il);
+
+    struct ggml_tensor * k =
+        ggml_view_3d(ctx, kv.k_l[il],
+                n_embd_head_k, n_kv, n_head_kv,
+                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+                0);
+    cb(k, "k", il);
+
+    struct ggml_tensor * cur;
+
+    if (cparams.flash_attn) {
+        GGML_UNUSED(model);
+        GGML_UNUSED(n_ctx);
+
+        // split cached v into n_head heads (not transposed)
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_embd_head_v, n_kv, n_head_kv,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
+                    0);
+        cb(v, "v", il);
+
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
+                                  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
+
+        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
+    } else {
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+        cb(kq, "kq", il);
+
+        // note: this op tends to require high floating point range
+        //       while for some models F16 is enough, for others it is not, so we default to F32 here
+        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+        if (model.arch == LLM_ARCH_GROK) {
+            // need to do the following:
+            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // and then :
+            // kq = 30 * tanh(kq / 30)
+            // before the softmax below
+
+            kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+            kq = ggml_scale(ctx, kq, 30);
+        }
+
+        if (hparams.attn_soft_cap) {
+            kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
+            kq = ggml_tanh(ctx, kq);
+            kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
+        }
+
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+        cb(kq, "kq_soft_max_ext", il);
+
+        GGML_ASSERT(kv.size == n_ctx);
+
+        // split cached v into n_head heads
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_kv, n_embd_head_v, n_head_kv,
+                    ggml_element_size(kv.v_l[il])*n_ctx,
+                    ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+                    0);
+        cb(v, "v", il);
+
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+        cb(kqv, "kqv", il);
+
+        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+        cb(kqv_merged, "kqv_merged", il);
+
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
+        cb(cur, "kqv_merged_cont", il);
+    }
+
+    ggml_build_forward_expand(graph, cur);
+
+    if (wo) {
+        cur = llm_build_lora_mm(lctx, ctx, wo, cur);
+    }
+
+    if (wo_b) {
+        cb(cur, "kqv_wo", il);
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx, cur, wo_b);
+    }
+
+    return cur;
+}
+
+static struct ggml_tensor * llm_build_kv(
+        struct ggml_context * ctx,
+       struct llama_context & lctx,
+       const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * wo,
+         struct ggml_tensor * wo_b,
+         struct ggml_tensor * k_cur,
+         struct ggml_tensor * v_cur,
+         struct ggml_tensor * q_cur,
+         struct ggml_tensor * kq_mask,
+                    int32_t   n_tokens,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+                    float     kq_scale,
+         const llm_build_cb & cb,
+                    int       il) {
+    const llama_hparams & hparams = lctx.model.hparams;
+    const llama_cparams & cparams = lctx.cparams;
+
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(graph, q_cur);
+    ggml_build_forward_expand(graph, k_cur);
+    ggml_build_forward_expand(graph, v_cur);
+
+    llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
+
+    struct ggml_tensor * cur;
+
+    cur  = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
+    cb(cur, "kqv_out", il);
+
+    return cur;
+}
+
+static struct ggml_tensor * llm_build_copy_mask_state(
+        struct ggml_context * ctx,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * s,
+         struct ggml_tensor * state_copy,
+         struct ggml_tensor * state_mask,
+                    int32_t   n_state,
+                    int32_t   kv_size,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+                    int32_t   n_seqs) {
+    struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size);
+
+    // copy states
+    // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
+    // this shrinks the tensors's ne[1] to n_kv
+    states = ggml_get_rows(ctx, states, state_copy);
+
+    // clear states of sequences which are starting at the beginning of this batch
+    // FIXME: zero-out NANs?
+    states = ggml_mul(ctx, states, state_mask);
+
+    // copy states which won't be changed further (between n_seqs and n_kv)
+    ggml_build_forward_expand(graph,
+        ggml_cpy(ctx,
+            ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
+            ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
+
+    // the part of the states that will be used and modified
+    return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0);
+}
+
+// TODO: split
+static struct ggml_tensor * llm_build_mamba(
+        struct ggml_context * ctx,
+       struct llama_context & lctx,
+         const llama_ubatch & ubatch,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * cur,
+         struct ggml_tensor * state_copy,
+         struct ggml_tensor * state_mask,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+         const llm_build_cb & cb,
+                    int       il) {
+    const llama_model    & model   = lctx.model;
+    const llama_hparams  & hparams = model.hparams;
+    const llama_kv_cache & kv      = lctx.kv_self;
+    const int64_t d_conv  = hparams.ssm_d_conv;
+    const int64_t d_inner = hparams.ssm_d_inner;
+    const int64_t d_state = hparams.ssm_d_state;
+    const int64_t dt_rank = hparams.ssm_dt_rank;
+    const int64_t n_seqs  = ubatch.n_seqs;
+    // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
+    const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+    // Use the same RMS norm as the final layer norm
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+    GGML_ASSERT(n_seqs != 0);
+    GGML_ASSERT(ubatch.equal_seqs);
+    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+    struct ggml_tensor * conv_states_all = kv.k_l[il];
+    struct ggml_tensor * ssm_states_all  = kv.v_l[il];
+
+    // (ab)using the KV cache to store the states
+    struct ggml_tensor * conv = llm_build_copy_mask_state(ctx,
+            graph, conv_states_all, state_copy, state_mask,
+            hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs);
+    conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs);
+    struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx,
+            graph, ssm_states_all, state_copy, state_mask,
+            hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs);
+    ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs);
+
+    // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+    cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+    // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+    struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur);
+    // split the above in two
+    // => {d_inner, n_seq_tokens, n_seqs}
+    struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
+    struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
+
+    // conv
+    {
+        // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+        struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0);
+
+        // copy last (d_conv - 1) columns back into the state cache
+        struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+        ggml_build_forward_expand(graph,
+            ggml_cpy(ctx, last_conv,
+                ggml_view_1d(ctx, conv_states_all,
+                    (d_conv - 1)*(d_inner)*(n_seqs),
+                    kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+
+        // 1D convolution
+        // The equivalent is to make a self-overlapping view of conv_x
+        // over d_conv columns at each stride in the 3rd dimension,
+        // then element-wise multiply that with the conv1d weight,
+        // then sum the elements of each row,
+        // (the last two steps are a dot product over rows (also doable with mul_mat))
+        // then permute away the ne[0] dimension,
+        // and then you're left with the resulting x tensor.
+        // For simultaneous sequences, all sequences need to have the same length.
+        x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
+
+        // bias
+        x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b);
+
+        x = ggml_silu(ctx, x);
+    }
+
+    // ssm
+    {
+        // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+        struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x);
+        // split
+        struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
+        struct ggml_tensor * B  = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
+        struct ggml_tensor * C  = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
+
+        // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
+        if (ssm_dt_b_c_rms) {
+            dt = ggml_rms_norm(ctx, dt, norm_rms_eps);
+            B = ggml_rms_norm(ctx, B, norm_rms_eps);
+            C = ggml_rms_norm(ctx, C, norm_rms_eps);
+        }
+
+        // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+        dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt);
+        dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b);
+
+        // Custom operator to optimize the parallel associative scan
+        // as described in the Annex D of the Mamba paper.
+        // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+        struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
+
+        // store last states
+        ggml_build_forward_expand(graph,
+            ggml_cpy(ctx,
+                ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
+                ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+
+        struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
+
+        // TODO: skip computing output earlier for unused tokens
+
+        // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
+        y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d));
+        y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z)));
+
+        // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+        cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y);
+    }
+
+    // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+    cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs);
+    cb(cur, "mamba_out", il);
+
+    return cur;
+}
+
+static struct ggml_tensor * llm_build_rwkv6_time_mix(
+        struct llama_context & lctx,
+        struct ggml_context * ctx,
+        const struct llama_layer * layer,
+        struct ggml_tensor * cur,
+        struct ggml_tensor * x_prev,
+        struct ggml_tensor ** wkv_state,
+        size_t wkv_head_size,
+        size_t head_count_kv) {
+    size_t n_embd       = cur->ne[0];
+    size_t n_seq_tokens = cur->ne[1];
+    size_t n_seqs       = cur->ne[2];
+
+    size_t head_size  = wkv_head_size;
+    size_t head_count = n_embd / head_size;
+
+    size_t n_tokens = n_seqs * n_seq_tokens;
+
+    bool is_qrwkv = layer->time_mix_first == nullptr;
+
+    struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
+
+    sx  = ggml_reshape_2d(ctx, sx,  n_embd, n_tokens);
+    cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+
+    struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
+
+    xxx = ggml_reshape_4d(
+        ctx,
+        ggml_tanh(
+            ctx,
+            ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
+        ),
+        layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
+    );
+
+    xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2));
+
+    xxx = ggml_mul_mat(
+        ctx,
+        ggml_reshape_4d(
+            ctx,
+            layer->time_mix_w2,
+            layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
+        ),
+        xxx
+    );
+
+    struct ggml_tensor *xw, *xk, *xv, *xr, *xg;
+    if (layer->time_mix_lerp_fused) {
+        // fusing these weights makes some performance improvement
+        sx  = ggml_reshape_3d(ctx, sx,  n_embd, 1, n_tokens);
+        cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+        xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur);
+        xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+        xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+        xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+        xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+        xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+    } else {
+        // for backward compatibility
+        xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+        xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+        xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+        xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+        xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+
+        xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur);
+        xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur);
+        xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur);
+        xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur);
+        xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur);
+    }
+
+    struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr);
+    struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key,        xk);
+    struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value,      xv);
+    if (layer->time_mix_receptance_b) {
+        r = ggml_add(ctx, r, layer->time_mix_receptance_b);
+    }
+    if (layer->time_mix_key_b) {
+        k = ggml_add(ctx, k, layer->time_mix_key_b);
+    }
+    if (layer->time_mix_value_b) {
+        v = ggml_add(ctx, v, layer->time_mix_value_b);
+    }
+
+    struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg);
+    if (is_qrwkv) {
+        g = ggml_sigmoid(ctx, g);
+    } else {
+        g = ggml_silu(ctx, g);
+    }
+
+    if (head_count_kv != head_count) {
+        GGML_ASSERT(head_count % head_count_kv == 0);
+        k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens);
+        v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens);
+        struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens);
+        k = ggml_repeat(ctx, k, tmp);
+        v = ggml_repeat(ctx, v, tmp);
+    }
+
+    k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens);
+    v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens);
+    r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens);
+
+    struct ggml_tensor * w = ggml_mul_mat(
+        ctx,
+        layer->time_mix_decay_w2,
+        ggml_tanh(
+            ctx,
+            ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
+        )
+    );
+
+    w = ggml_add(ctx, w, layer->time_mix_decay);
+    w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
+    w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens);
+
+    if (is_qrwkv) {
+        // k = k * (1 - w)
+        k = ggml_sub(ctx, k, ggml_mul(ctx, k, w));
+    }
+
+    struct ggml_tensor * wkv_output;
+    if (!layer->time_mix_first) {
+        wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f));
+    } else {
+        wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+    }
+    cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
+    *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
+
+    if (!is_qrwkv) {
+        // group norm with head_count groups
+        cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
+        cur = ggml_norm(ctx, cur, 64e-5f);
+
+        // Convert back to regular vectors.
+        cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+        cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+    } else {
+        cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+    }
+
+    cur = ggml_mul(ctx, cur, g);
+    cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
+
+    return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
+}
+
+static struct ggml_tensor * llm_build_rwkv6_channel_mix(
+        struct llama_context & lctx,
+        struct ggml_context * ctx,
+        const struct llama_layer * layer,
+        struct ggml_tensor * cur,
+        struct ggml_tensor * x_prev) {
+    struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
+    struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
+    struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
+
+    struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
+    struct ggml_tensor * k = ggml_sqr(
+        ctx,
+        ggml_relu(
+            ctx,
+            llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
+        )
+    );
+
+    return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
+}
+
+struct llm_build_context {
+    const llama_model    & model;
+          llama_context  & lctx;
+    const llama_hparams  & hparams;
+    const llama_cparams  & cparams;
+    const llama_ubatch   & ubatch;
+    const llama_kv_cache & kv_self;
+
+    const int64_t n_embd;
+    const int64_t n_layer;
+    const int64_t n_rot;
+    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
+    const int64_t n_head;
+    const int64_t n_head_kv;
+    const int64_t n_embd_head_k;
+    const int64_t n_embd_k_gqa;
+    const int64_t n_embd_head_v;
+    const int64_t n_embd_v_gqa;
+    const int64_t n_expert;
+    const int64_t n_expert_used;
+
+    const float freq_base;
+    const float freq_scale;
+    const float ext_factor;
+    const float attn_factor;
+    const float beta_fast;
+    const float beta_slow;
+    const float norm_eps;
+    const float norm_rms_eps;
+
+    const int32_t n_tokens;
+    const int32_t n_kv;     // size of KV cache to consider (n_kv <= kv_self.size)
+    const int32_t n_outputs;
+    const int32_t n_outputs_enc;
+    const int32_t kv_head;  // index of where we store new KV data in the cache
+    const int32_t n_ctx_orig;
+
+    const bool flash_attn;
+
+    const enum llama_pooling_type pooling_type;
+    const enum llama_rope_type    rope_type;
+
+    const llm_build_cb & cb;
+
+    std::vector<uint8_t> & buf_compute_meta;
+
+    struct ggml_context * ctx0 = nullptr;
+
+    // TODO: consider making the entire interface noexcept
+    llm_build_context(
+        llama_context  & lctx,
+    const llama_ubatch & ubatch,
+    const llm_build_cb & cb,
+                  bool   worst_case) :
+        model            (lctx.model),
+        lctx             (lctx),
+        hparams          (model.hparams),
+        cparams          (lctx.cparams),
+        ubatch           (ubatch),
+        kv_self          (lctx.kv_self),
+        n_embd           (hparams.n_embd),
+        n_layer          (hparams.n_layer),
+        n_rot            (hparams.n_rot),
+        n_ctx            (cparams.n_ctx),
+        n_head           (hparams.n_head()),
+        n_head_kv        (hparams.n_head_kv()),
+        n_embd_head_k    (hparams.n_embd_head_k),
+        n_embd_k_gqa     (hparams.n_embd_k_gqa()),
+        n_embd_head_v    (hparams.n_embd_head_v),
+        n_embd_v_gqa     (hparams.n_embd_v_gqa()),
+        n_expert         (hparams.n_expert),
+        n_expert_used    (hparams.n_expert_used),
+        freq_base        (cparams.rope_freq_base),
+        freq_scale       (cparams.rope_freq_scale),
+        ext_factor       (cparams.yarn_ext_factor),
+        attn_factor      (cparams.yarn_attn_factor),
+        beta_fast        (cparams.yarn_beta_fast),
+        beta_slow        (cparams.yarn_beta_slow),
+        norm_eps         (hparams.f_norm_eps),
+        norm_rms_eps     (hparams.f_norm_rms_eps),
+        n_tokens         (ubatch.n_tokens),
+        n_kv             (worst_case ? kv_self.size : kv_self.n),
+        n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
+        n_outputs_enc    (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
+        kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
+        n_ctx_orig       (cparams.n_ctx_orig_yarn),
+        flash_attn       (cparams.flash_attn),
+        pooling_type     (cparams.pooling_type),
+        rope_type        (hparams.rope_type),
+        cb               (cb),
+        buf_compute_meta (lctx.buf_compute_meta) {
+            // all initializations should be done in init()
+        }
+
+    void init() {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ buf_compute_meta.size(),
+            /*.mem_buffer =*/ buf_compute_meta.data(),
+            /*.no_alloc   =*/ true,
+        };
+
+        ctx0 = ggml_init(params);
+
+        lctx.inp_tokens      = nullptr;
+        lctx.inp_embd        = nullptr;
+        lctx.inp_pos         = nullptr;
+        lctx.inp_out_ids     = nullptr;
+        lctx.inp_KQ_mask     = nullptr;
+        lctx.inp_KQ_mask_swa = nullptr;
+        lctx.inp_K_shift     = nullptr;
+        lctx.inp_mean        = nullptr;
+        lctx.inp_cls         = nullptr;
+        lctx.inp_s_copy      = nullptr;
+        lctx.inp_s_mask      = nullptr;
+        lctx.inp_s_seq       = nullptr;
+        lctx.inp_pos_bucket    = nullptr;
+        lctx.inp_embd_enc      = nullptr;
+        lctx.inp_KQ_mask_cross = nullptr;
+    }
+
+    void free() {
+        ggml_free(ctx0);
+        ctx0 = nullptr;
+    }
+
+    struct ggml_cgraph * build_k_shift() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        GGML_ASSERT(kv_self.size == n_ctx);
+
+        lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        cb(lctx.inp_K_shift, "K_shift", -1);
+        ggml_set_input(lctx.inp_K_shift);
+
+        for (int il = 0; il < n_layer; ++il) {
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
+            struct ggml_tensor * k =
+                ggml_view_3d(ctx0, kv_self.k_l[il],
+                    n_embd_head_k, n_head_kv, n_ctx,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    0);
+
+            struct ggml_tensor * tmp;
+            if (ggml_is_quantized(k->type)) {
+                // dequantize to f32 -> RoPE -> quantize back
+                tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
+                cb(tmp, "K_f32", il);
+                for (auto & backend : lctx.backends) {
+                    // Figure out which backend KV cache belongs to
+                    if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get());
+                        break;
+                    }
+                }
+                tmp = ggml_rope_ext_inplace(ctx0, tmp,
+                        lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(tmp, "K_shifted_f32", il);
+                tmp = ggml_cpy(ctx0, tmp, k);
+            } else {
+                // we rotate only the first n_rot dimensions
+                tmp = ggml_rope_ext_inplace(ctx0, k,
+                        lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+            cb(tmp, "K_shifted", il);
+            ggml_build_forward_expand(gf, tmp);
+        }
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        for (uint32_t i = 0; i < ids.size(); ++i) {
+            const uint32_t id = ids[i];
+
+            if (i == id || id == ids.size()) {
+                continue;
+            }
+
+            uint32_t nm = 1;
+
+            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+                nm++;
+            }
+
+            for (int il = 0; il < n_layer; ++il) {
+                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+                ggml_tensor * view_v_src;
+                ggml_tensor * view_v_dst;
+
+                if (flash_attn) {
+                    // NOTE: the V cache is not transposed when using flash attention
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                } else {
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, id));
+                }
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+            }
+
+            i += nm - 1;
+        }
+
+        //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+
+        return gf;
+    }
+
+    struct ggml_tensor * build_inp_pos() {
+        lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(lctx.inp_pos, "inp_pos", -1);
+        ggml_set_input(lctx.inp_pos);
+        return lctx.inp_pos;
+    }
+
+    struct ggml_tensor * build_rope_factors(int il) {
+        // choose long/short freq factors based on the context size
+        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (model.layers[il].rope_freqs != nullptr) {
+            return model.layers[il].rope_freqs;
+        }
+
+        if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+            return model.layers[il].rope_long;
+        }
+
+        return model.layers[il].rope_short;
+    }
+
+    struct ggml_tensor * build_inp_out_ids() {
+        lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
+        cb(lctx.inp_out_ids, "inp_out_ids", -1);
+        ggml_set_input(lctx.inp_out_ids);
+        return lctx.inp_out_ids;
+    }
+
+    struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
+        lctx.inp_KQ_mask = causal
+            ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
+            : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        cb(lctx.inp_KQ_mask, "KQ_mask", -1);
+        ggml_set_input(lctx.inp_KQ_mask);
+
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
+    }
+
+    struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) {
+        GGML_ASSERT(hparams.n_swa > 0);
+
+        lctx.inp_KQ_mask_swa = causal
+            ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
+            : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1);
+        ggml_set_input(lctx.inp_KQ_mask_swa);
+
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa;
+    }
+
+    struct ggml_tensor * build_inp_mean() {
+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+        cb(lctx.inp_mean, "inp_mean", -1);
+        ggml_set_input(lctx.inp_mean);
+        return lctx.inp_mean;
+    }
+
+    struct ggml_tensor * build_inp_cls() {
+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(lctx.inp_cls, "inp_cls", -1);
+        ggml_set_input(lctx.inp_cls);
+        return lctx.inp_cls;
+    }
+
+    struct ggml_tensor * build_inp_s_copy() {
+        lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
+        cb(lctx.inp_s_copy, "inp_s_copy", -1);
+        ggml_set_input(lctx.inp_s_copy);
+        return lctx.inp_s_copy;
+    }
+
+    struct ggml_tensor * build_inp_s_mask() {
+        lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
+        cb(lctx.inp_s_mask, "inp_s_mask", -1);
+        ggml_set_input(lctx.inp_s_mask);
+        return lctx.inp_s_mask;
+    }
+
+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = ggml_graph_node(gf, i);
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_RANK:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    inp = ggml_get_rows(ctx0, inp, inp_cls);
+
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    GGML_ASSERT(model.cls       != nullptr);
+                    GGML_ASSERT(model.cls_b     != nullptr);
+
+                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (model.cls_out) {
+                        GGML_ASSERT(model.cls_out_b != nullptr);
+
+                        cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
+                    }
+                } break;
+            default:
+                {
+                    GGML_ABORT("unknown pooling type");
+                }
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_tensor * llm_build_pos_bucket(bool causal) {
+        if (causal) {
+            lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv,     n_tokens);
+        } else {
+            lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
+        }
+
+        ggml_set_input(lctx.inp_pos_bucket);
+        cb(lctx.inp_pos_bucket, "pos_bucket", -1);
+
+        return lctx.inp_pos_bucket;
+    }
+
+    struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) {
+        struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0);
+        cb(pos_bucket_1d, "pos_bucket_1d", -1);
+
+        struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
+        cb(pos_bias, "pos_bias", -1);
+
+        pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0],  0);
+        cb(pos_bias, "pos_bias", -1);
+
+        pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3);
+        cb(pos_bias, "pos_bias", -1);
+
+        pos_bias = ggml_cont(ctx0, pos_bias);
+        cb(pos_bias, "pos_bias", -1);
+
+        return pos_bias;
+    }
+
+    struct ggml_tensor * llm_build_inp_embd_enc() {
+        const int64_t n_embd = hparams.n_embd;
+        lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
+        ggml_set_input(lctx.inp_embd_enc);
+        cb(lctx.inp_embd_enc, "embd_enc", -1);
+        return lctx.inp_embd_enc;
+    }
+
+    struct ggml_tensor * llm_build_inp_KQ_mask_cross() {
+        lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        ggml_set_input(lctx.inp_KQ_mask_cross);
+        cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1);
+        return lctx.inp_KQ_mask_cross;
+    }
+
+    struct ggml_cgraph * build_llama() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        cb, il);
+                cb(cur, "ffn_moe_out", il);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_deci() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_head    = hparams.n_head(il);
+
+            if (n_head == 0) {
+                // attention-free layer of Llama-3_1-Nemotron-51B
+                cur = inpL;
+            } else {
+                // norm
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            if (n_head > 0 && n_head_kv == 0) {
+                // "linear attention" of Llama-3_1-Nemotron-51B
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "wo", il);
+            } else if (n_head > 0) {
+                // self-attention
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            // modified to support attention-free layer of Llama-3_1-Nemotron-51B
+            struct ggml_tensor * ffn_inp = cur;
+            if (n_head > 0) {
+                ffn_inp = ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+            }
+
+            // feed-forward network
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_baichuan() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                switch (model.type) {
+                    case LLM_TYPE_7B:
+                        Qcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                        Kcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                        break;
+                    case LLM_TYPE_13B:
+                        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
+                        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
+                        break;
+                    default:
+                        GGML_ABORT("fatal error");
+                }
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_xverse() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,      cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_falcon() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * attn_norm;
+
+            attn_norm = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(attn_norm, "attn_norm", il);
+
+            // self-attention
+            {
+                if (model.layers[il].attn_norm_2) {
+                    // Falcon-40B
+                    cur = llm_build_norm(ctx0, inpL, hparams,
+                            model.layers[il].attn_norm_2,
+                            model.layers[il].attn_norm_2_b,
+                            LLM_NORM, cb, il);
+                    cb(cur, "attn_norm_2", il);
+                } else {
+                    cur = attn_norm;
+                }
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur       = ggml_get_rows(ctx0,       cur, inp_out_ids);
+                inpL      = ggml_get_rows(ctx0,      inpL, inp_out_ids);
+                attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = cur;
+
+            // feed forward
+            {
+                cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        NULL,                      NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        // norm
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_grok() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // multiply by embedding_multiplier_scale of 78.38367176906169
+        inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // Grok
+            // if attn_out_norm is present then apply it before adding the input
+            if (model.layers[il].attn_out_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].attn_out_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_out_norm", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_GELU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            // Grok
+            // if layer_out_norm is present then apply it before adding the input
+            // Idea: maybe ffn_out_norm is a better name
+            if (model.layers[il].layer_out_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].layer_out_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "layer_out_norm", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // Grok
+        // multiply logits by output_multiplier_scale of 0.5773502691896257
+
+        cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_dbrx() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                                 model.layers[il].attn_norm, NULL,
+                                 LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                cb(cur, "wqkv_clamped", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                                 model.layers[il].attn_out_norm, NULL,
+                                 LLM_NORM, cb, il);
+            cb(cur, "attn_out_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                             model.output_norm, NULL,
+                             LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_starcoder() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+        cb(pos, "pos_embd", -1);
+
+        inpL = ggml_add(ctx0, inpL, pos);
+        cb(inpL, "inpL", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_refact() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                cb(Qcur, "Qcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_bert() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+        struct ggml_tensor * inp_pos = nullptr;
+
+        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+            inp_pos = build_inp_pos();
+        }
+
+        // construct input embeddings (token, type, position)
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // token types are hardcoded to zero ("Sentence A")
+        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+        inpL = ggml_add(ctx0, inpL, type_row0);
+        if (model.arch == LLM_ARCH_BERT) {
+            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+        }
+        cb(inpL, "inp_embd", -1);
+
+        // embed layer norm
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+        cb(inpL, "inp_norm", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
+
+        // iterate layers
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * cur = inpL;
+
+            struct ggml_tensor * Qcur;
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Vcur;
+
+            // self-attention
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
+                Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                }
+
+                Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                }
+                Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            } else {
+                // compute Q and K and RoPE them
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+            }
+
+            struct ggml_tensor * q =                 ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+            struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            cb(kq, "kq", il);
+
+            kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+
+            struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+            cb(v, "v", il);
+
+            struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+            cb(kqv, "kqv", il);
+
+            struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+            cb(kqv_merged, "kqv_merged", il);
+
+            cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+            cb(cur, "kqv_merged_cont", il);
+
+            ggml_build_forward_expand(gf, cur);
+
+            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+            if (model.layers[il].bo) {
+                cb(cur, "kqv_wo", il);
+            }
+
+            if (model.layers[il].bo) {
+                cur = ggml_add(ctx0, cur, model.layers[il].bo);
+            }
+            cb(cur, "kqv_out", il);
+
+            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // re-add the layer input
+            cur = ggml_add(ctx0, cur, inpL);
+
+            // attention layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
+
+            if (model.layers[il].attn_norm_2 != nullptr) {
+                cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+            }
+
+            struct ggml_tensor * ffn_inp = cur;
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            if (model.arch == LLM_ARCH_BERT) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL,                        NULL,
+                        model.layers[il].ffn_gate, NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+            } else {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            }
+            cb(cur, "ffn_out", il);
+
+            // attentions bypass the intermediate layer
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            // output layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cb(cur, "result_embd", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_bloom() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        inpL = llm_build_norm(ctx0, inpL, hparams,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, cb, -1);
+        cb(inpL, "inp_norm", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // Add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_mpt() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * pos;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        if (model.pos_embd) {
+            // inp_pos - contains the positions
+            struct ggml_tensor * inp_pos = build_inp_pos();
+            pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+            cb(pos, "pos_embd", -1);
+
+            inpL = ggml_add(ctx0, inpL, pos);
+            cb(inpL, "inpL", -1);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * attn_norm;
+
+            attn_norm = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(attn_norm, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = attn_norm;
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                if (model.layers[il].bqkv){
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(cur, "wqkv_clamped", il);
+                }
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                // Q/K Layernorm
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                    cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                            model.layers[il].wo, model.layers[il].bo,
+                            Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                } else {
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                    cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                            model.layers[il].wo, model.layers[il].bo,
+                            Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                }
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // Add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed forward
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        model.layers[il].ffn_act,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_stablelm() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            struct ggml_tensor * inpSA = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                cb(Qcur, "Qcur", il);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+                }
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpL  = ggml_get_rows(ctx0,  inpL, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                if (model.layers[il].ffn_norm) {
+                    cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                            model.layers[il].ffn_norm,
+                            model.layers[il].ffn_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(cur, "ffn_norm", il);
+                } else {
+                    // parallel residual
+                    cur = inpSA;
+                }
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_qwen() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward forward
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_qwen2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_qwen2vl() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
+        cb(lctx.inp_pos, "inp_pos", -1);
+        ggml_set_input(lctx.inp_pos);
+        struct ggml_tensor * inp_pos = lctx.inp_pos;
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[4];
+        std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_qwen2moe() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * moe_out =
+                    llm_build_moe_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, false,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            // FFN shared expert
+            {
+                ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
+                cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+                // sigmoid
+                ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
+                cb(cur_gate, "ffn_shexp_gate", il);
+
+                ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur_ffn, "ffn_shexp", il);
+
+                ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+                cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+                moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+                cb(moe_out, "ffn_out", il);
+
+                cur = moe_out;
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_phi2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * attn_norm_output;
+        struct ggml_tensor * ffn_output;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(attn_norm_output, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv) {
+                    cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
+
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                } else {
+                    Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                // with phi2, we scale the Q to avoid precision issues
+                // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur              = ggml_get_rows(ctx0,              cur, inp_out_ids);
+                inpL             = ggml_get_rows(ctx0,             inpL, inp_out_ids);
+                attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+            }
+
+            // FF
+            {
+                ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(ffn_output, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_output);
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output_no_bias", -1);
+
+        cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_output", -1);
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
+    struct ggml_cgraph * build_phi3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            auto residual = inpL;
+
+            // self-attention
+            {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM_RMS, cb, il);
+                cb(attn_norm_output, "attn_norm", il);
+
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv) {
+                    cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+                } else {
+                    Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, residual);
+            residual = cur;
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        NULL,                      NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        cb, il);
+                cb(cur, "ffn_moe_out", il);
+            }
+
+            cur = ggml_add(ctx0, residual, cur);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+            model.output_norm,
+            model.output_norm_b,
+            LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        if (model.output_b != nullptr) {
+            cb(cur, "result_output_no_bias", -1);
+            cur = ggml_add(ctx0, cur, model.output_b);
+        }
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+
+    struct ggml_cgraph * build_plamo() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            struct ggml_tensor * attention_norm = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens), inp_pos, nullptr,
+                        n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+            struct ggml_tensor * sa_out = cur;
+
+            cur = attention_norm;
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur    = ggml_get_rows(ctx0,    cur, inp_out_ids);
+                sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
+                inpL   = ggml_get_rows(ctx0,   inpL, inp_out_ids);
+            }
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_gpt2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * pos;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+        cb(pos, "pos_embd", -1);
+
+        inpL = ggml_add(ctx0, inpL, pos);
+        cb(inpL, "inpL", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_codeshell() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(tmpq, "tmpq", il);
+                cb(tmpk, "tmpk", il);
+                cb(Vcur, "Vcur", il);
+
+                struct ggml_tensor * Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head,    n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_internlm2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_minicpm3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        //TODO: if the model varies, these parameters need to be read from the model
+        const int64_t n_embd_base = 256;
+        const float scale_embd  = 12.0f;
+        const float scale_depth = 1.4f;
+        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // scale the input embeddings
+        inpL = ggml_scale(ctx0, inpL, scale_embd);
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                struct ggml_tensor * q = NULL;
+                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                cb(q, "q", il);
+
+                q = llm_build_norm(ctx0, q, hparams,
+                        model.layers[il].attn_q_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(q, "q", il);
+
+                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                cb(q, "q", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+                kv_compressed = ggml_cont(ctx0, kv_compressed);
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                    ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                    0);
+                cb(v_states, "v_states", il);
+
+                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
+                q_pe = ggml_rope_ext(
+                    ctx0, q_pe, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
+                k_pe = ggml_rope_ext(
+                    ctx0, k_pe, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
+                struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // scale_res - scale the hidden states for residual connection
+            const float scale_res = scale_depth/sqrtf(float(n_layer));
+            cur = ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled", il);
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // scale the hidden states for residual connection
+            cur = ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled_ffn", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head scaling
+        const float scale_lmhead = float(n_embd_base)/float(n_embd);
+        cur = ggml_scale(ctx0, cur, scale_lmhead);
+        cb(cur, "lmhead_scaling", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_gemma() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head,    n_tokens), inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_gemma2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // gemma 2 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask     = build_inp_KQ_mask(true);
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        for (int il = 0; il < n_layer; ++il) {
+            // (il % 2) layers use SWA
+            struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head,    n_tokens), inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
+                switch (model.type) {
+                    case LLM_TYPE_2B:
+                    case LLM_TYPE_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
+                    case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+                    default: GGML_ABORT("fatal error");
+                };
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // final logit soft-capping
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+
+    struct ggml_cgraph * build_starcoder2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_mamba() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // {n_embd, n_tokens}
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        struct ggml_tensor * state_copy = build_inp_s_copy();
+        struct ggml_tensor * state_mask = build_inp_s_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur,
+                    state_copy, state_mask,
+                    kv_head, n_kv, cb, il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // residual
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        // final rmsnorm
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_command_r() {
+
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+            struct ggml_tensor * ffn_inp = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_element_size(Qcur) * n_embd_head,
+                                ggml_element_size(Qcur) * n_embd_head * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_element_size(Kcur) * n_embd_head,
+                                ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                model.layers[il].attn_q_norm,
+                                NULL,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+
+    }
+
+    struct ggml_cgraph * build_cohere2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        // cohere2 requires different mask for layers using sliding window (SWA)
+        struct ggml_tensor * KQ_mask     = build_inp_KQ_mask();
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+
+        // sliding window switch pattern
+        const int32_t sliding_window_pattern = 4;
+
+        for (int il = 0; il < n_layer; ++il) {
+            // three layers sliding window attention (window size 4096) and ROPE
+            // fourth layer uses global attention without positional embeddings
+            const bool           is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+            struct ggml_tensor * ffn_inp = cur;
+
+            // self-attention
+            {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                if (is_sliding) {
+                    Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
+                                        beta_fast, beta_slow);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                                        rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+                                        attn_factor, beta_fast, beta_slow);
+                    cb(Kcur, "Kcur", il);
+                } else {
+                    // For non-sliding layers, just reshape without applying RoPE
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
+                                   KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur                              = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL                             = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                ffn_inp                          = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
+                                    NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
+                                    cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    // ref: https://allenai.org/olmo
+    // based on the original build_llama() function, changes:
+    //   * non-parametric layer norm
+    //   * clamp qkv
+    //   * removed bias
+    //   * removed MoE
+    struct ggml_cgraph * build_olmo() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    NULL, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    NULL, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                NULL, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_olmo2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = inpL;
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    // based on the build_qwen2moe() function, changes:
+    //   * removed shared experts
+    //   * removed bias
+    //   * added q, k norm
+    struct ggml_cgraph * build_olmoe() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_openelm() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            const int64_t n_head    = hparams.n_head(il);
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+
+            cur = inpL;
+            struct ggml_tensor * residual = cur;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                        model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                        model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+                cb(Qcur, "Vcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        // norm
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_gptneox() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // ffn
+            if (hparams.use_par_res) {
+                // attention and ffn are computed in parallel
+                // x = x + attn(ln1(x)) + ffn(ln2(x))
+
+                struct ggml_tensor * attn_out = cur;
+
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, inpL);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, attn_out);
+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            } else {
+                // attention and ffn are computed sequentially
+                // x = x + attn(ln1(x))
+                // x = x + ffn(ln2(x))
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+                cb(ffn_inp, "ffn_inp", il);
+
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        NULL,                      NULL,                        NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, ffn_inp);
+                cur = lctx.cvec.apply_to(ctx0, cur, il);
+                cb(cur, "l_out", il);
+
+                // input for next layer
+                inpL = cur;
+            }
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_arctic() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+            cb(ffn_out, "ffn_out", il);
+
+            // MoE
+            cur = llm_build_norm(ctx0, inpSA, hparams,
+                    model.layers[il].ffn_norm_exps, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm_exps", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_deepseek() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            nullptr,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, false,
+                            false, hparams.expert_weights_scale,
+                            LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_deepseek2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        bool is_lite = (hparams.n_layer == 27);
+
+        // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+        // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+        const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+        const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
+        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // {n_embd, n_tokens}
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                struct ggml_tensor * q = NULL;
+                if (!is_lite) {
+                    // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                    cb(q, "q", il);
+
+                    q = llm_build_norm(ctx0, q, hparams,
+                            model.layers[il].attn_q_a_norm, NULL,
+                            LLM_NORM_RMS, cb, il);
+                    cb(q, "q", il);
+
+                    // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    cb(q, "q", il);
+                } else {
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                    cb(q, "q", il);
+                }
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+                kv_compressed = ggml_cont(ctx0, kv_compressed);
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                    ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                    0);
+                cb(v_states, "v_states", il);
+
+                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
+                q_pe = ggml_rope_ext(
+                    ctx0, q_pe, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
+                k_pe = ggml_rope_ext(
+                    ctx0, k_pe, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
+                struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            true, hparams.expert_weights_scale,
+                            (enum llama_expert_gating_func_type) hparams.expert_gating_func,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_bitnet() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                if (model.layers[il].wq_scale) {
+                    Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+                }
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                // B1.K
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                if (model.layers[il].wk_scale) {
+                    Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+                }
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                // B1.V
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                if (model.layers[il].wv_scale) {
+                    Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+                }
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        NULL, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].attn_sub_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_sub_norm", il);
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                if (model.layers[il].wo_scale) {
+                    cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+                }
+                if (model.layers[il].bo) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bo);
+                }
+                cb(cur, "attn_o_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward forward
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, model.layers[il].ffn_up_scale,
+                    model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
+                    NULL,                      NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_sub_out", il);
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                            model.layers[il].ffn_sub_norm, NULL,
+                            LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_sub_norm", il);
+
+            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
+            if (model.layers[il].ffn_down_scale) {
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            }
+            cb(cur, "ffn_down", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        // FIXME: do not use model.tok_embd directly, duplicate as model.output
+        cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
+    struct ggml_cgraph * build_t5_enc() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        GGML_ASSERT(lctx.is_encoding);
+        struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm_enc, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                struct ggml_tensor * q =                 ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);
+
+                struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+                struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+                struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+                cb(kq_b, "kq_b", il);
+
+                kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);
+
+                struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+                cb(v, "v", il);
+
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+                cb(kqv, "kqv", il);
+
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
+
+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
+
+                ggml_build_forward_expand(gf, cur);
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
+                cb(cur, "kqv_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm_enc, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                // T5 uses relu, flan-T5 uses gelu-gated
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up_enc,   NULL, NULL,
+                        model.layers[il].ffn_gate_enc, NULL, NULL,
+                        model.layers[il].ffn_down_enc, NULL, NULL,
+                        NULL,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR  : LLM_FFN_SEQ,
+                        cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cb(cur, "result_embd", -1);
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm_enc, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_t5_dec() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        GGML_ASSERT(!lctx.is_encoding);
+        GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+
+        struct ggml_tensor * embd_enc       = llm_build_inp_embd_enc();
+        struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
+
+        struct ggml_tensor * KQ_mask_dec   = build_inp_KQ_mask();
+        struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+
+                struct ggml_tensor * k =
+                    ggml_view_3d(ctx0, kv_self.k_l[il],
+                            n_embd_head_k, n_kv, n_head_kv,
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                            0);
+                cb(k, "k", il);
+
+                struct ggml_tensor * v =
+                    ggml_view_3d(ctx0, kv_self.v_l[il],
+                            n_kv, n_embd_head_v, n_head_kv,
+                            ggml_element_size(kv_self.v_l[il])*n_ctx,
+                            ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+                            0);
+                cb(v, "v", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+
+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);
+
+                struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+                struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+                struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+                cb(kq_b, "kq_b", il);
+
+                kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);
+
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+                cb(kqv, "kqv", il);
+
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
+
+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
+
+                ggml_build_forward_expand(gf, cur);
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "kqv_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "cross_inp", il);
+
+            struct ggml_tensor * inpCA = cur;
+
+            // norm
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_norm_cross, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm_cross", il);
+
+            // cross-attention
+            {
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+
+                struct ggml_tensor * q =                 ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+                struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+                cb(kq, "kq", il);
+
+                kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+                cb(kq, "kq_soft_max_ext", il);
+
+                struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+                cb(v, "v", il);
+
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+                cb(kqv, "kqv", il);
+
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                cb(kqv_merged, "kqv_merged", il);
+
+                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+                cb(cur, "kqv_merged_cont", il);
+
+                ggml_build_forward_expand(gf, cur);
+
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
+                cb(cur, "kqv_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                // T5 uses relu, flan-T5 uses gelu-gated
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+                        model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+                        cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cb(cur, "result_embd", -1);
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_jais() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_chatglm() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+                if (model.layers[il].wqkv == nullptr) {
+                    Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                    if (model.layers[il].bq) {
+                        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    }
+                    Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                    if (model.layers[il].bk) {
+                        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    }
+                    Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                    if (model.layers[il].bv) {
+                        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    }
+                } else {
+                    cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+                    cb(cur, "wqkv", il);
+                    if (model.layers[il].bqkv) {
+                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                }
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+                //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // Add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        NULL,                      NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_nemotron() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        //GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm,
+                    model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                    NULL,                      NULL,                        NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_exaone() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    ggml_cgraph * build_rwkv6() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // Token shift state dimensions should be 2 * n_emb
+        GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
+
+        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+        struct ggml_tensor * state_copy = build_inp_s_copy();
+        struct ggml_tensor * state_mask = build_inp_s_mask();
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];
+
+            // (ab)using the KV cache to store the states
+            struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.k_l[il], state_copy, state_mask,
+                    hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+            struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.v_l[il], state_copy, state_mask,
+                    hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+            cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+            token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
+
+            struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+            struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
+
+            struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
+            struct ggml_tensor * x_prev = ggml_concat(
+                ctx0,
+                att_shift,
+                ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+                1
+            );
+
+            cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size));
+            ggml_build_forward_expand(gf, cur);
+            ggml_build_forward_expand(
+                gf,
+                ggml_cpy(
+                    ctx0,
+                    wkv_states,
+                    ggml_view_1d(
+                        ctx0,
+                        kv_self.v_l[il],
+                        hparams.n_embd_v_s() * n_seqs,
+                        hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
+                    )
+                )
+            );
+
+            struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
+            x_prev = ggml_concat(
+                ctx0,
+                ffn_shift,
+                ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
+                1
+            );
+            cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+            ggml_build_forward_expand(gf, cur);
+
+            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
+            struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));
+
+            token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
+            ggml_build_forward_expand(
+                gf,
+                ggml_cpy(
+                    ctx0,
+                    ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
+                    ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
+                )
+            );
+
+            if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+                cur = ggml_scale(ctx0, cur, 0.5F);
+            }
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
+    ggml_cgraph * build_rwkv6qwen2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+
+        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t n_tokens = ubatch.n_tokens;
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+        struct ggml_tensor * state_copy = build_inp_s_copy();
+        struct ggml_tensor * state_mask = build_inp_s_mask();
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];
+
+            // (ab)using the KV cache to store the states
+            struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.k_l[il], state_copy, state_mask,
+                    hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+            struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+                    gf, kv_self.v_l[il], state_copy, state_mask,
+                    hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+            cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+            token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs);
+
+            struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il);
+            struct ggml_tensor * x_prev = ggml_concat(
+                ctx0,
+                token_shift,
+                ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+                1
+            );
+
+            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
+            ggml_build_forward_expand(
+                gf,
+                ggml_cpy(
+                    ctx0,
+                    ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0),
+                    ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
+                )
+            );
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv()));
+            ggml_build_forward_expand(gf, ffn_inp);
+            ggml_build_forward_expand(
+                gf,
+                ggml_cpy(
+                    ctx0,
+                    wkv_states,
+                    ggml_view_1d(
+                        ctx0,
+                        kv_self.v_l[il],
+                        hparams.n_embd_v_s() * n_seqs,
+                        hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
+                    )
+                )
+            );
+
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    // ref: https://github.com/facebookresearch/chameleon
+    // based on the original build_llama() function, changes:
+    //   * qk-norm
+    //   * swin-norm
+    //   * removed bias
+    //   * removed MoE
+    struct ggml_cgraph * build_chameleon() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            if (hparams.swin_norm) {
+                cur = inpL;
+            } else {
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_element_size(Qcur) * n_embd_head,
+                                ggml_element_size(Qcur) * n_embd_head * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                model.layers[il].attn_q_norm,
+                                model.layers[il].attn_q_norm_b,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_element_size(Kcur) * n_embd_head,
+                                ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                               model.layers[il].attn_k_norm,
+                               model.layers[il].attn_k_norm_b,
+                               LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+                if (hparams.swin_norm) {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                }
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            if (!hparams.swin_norm) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            if (hparams.swin_norm) {
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output_with_img_logits", -1);
+
+        // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+        // Needs to be removed once image outputs are supported.
+        int img_token_end_idx = 8196;
+        int img_token_start_idx = 4;
+        int num_img_tokens = img_token_end_idx - img_token_start_idx;
+        // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
+        // which ensures that text token values are always at least larger than image token values
+        struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+        img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+        cb(img_logits, "img_logits", -1);
+        cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_wavtokenizer_dec() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
+
+        cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+        cur = ggml_add(ctx0, cur, model.conv1d_b);
+
+        // posnet
+        for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+            const auto & layer = model.layers[il].posnet;
+
+            inpL = cur;
+
+            switch (il) {
+                case 0:
+                case 1:
+                case 3:
+                case 4:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm1,
+                                layer.norm1_b,
+                                LLM_NORM_GROUP, cb, 0);
+
+                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+
+                        cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.conv1_b);
+
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm2,
+                                layer.norm2_b,
+                                LLM_NORM_GROUP, cb, 0);
+
+                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+
+                        cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.conv2_b);
+
+                        cur = ggml_add(ctx0, cur, inpL);
+                    } break;
+                case 2:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.attn_norm,
+                                layer.attn_norm_b,
+                                LLM_NORM_GROUP, cb, 0);
+
+                        struct ggml_tensor * q;
+                        struct ggml_tensor * k;
+                        struct ggml_tensor * v;
+
+                        q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
+                        k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+                        v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+
+                        q = ggml_add(ctx0, q, layer.attn_q_b);
+                        k = ggml_add(ctx0, k, layer.attn_k_b);
+                        v = ggml_add(ctx0, v, layer.attn_v_b);
+
+                        q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
+                        k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
+
+                        struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+
+                        kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+
+                        cur = ggml_mul_mat(ctx0, kq, v);
+
+                        cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.attn_o_b);
+
+                        cur = ggml_add(ctx0, cur, inpL);
+                    } break;
+                case 5:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm,
+                                layer.norm_b,
+                                LLM_NORM_GROUP, cb, 0);
+                    } break;
+                default: GGML_ABORT("unknown posnet layer");
+            };
+        }
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, cb, -1);
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+        inpL = cur;
+
+        // convnext
+        for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+            const auto & layer = model.layers[il].convnext;
+
+            cur = inpL;
+
+            cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, layer.dw_b);
+
+            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    layer.norm,
+                    layer.norm_b,
+                    LLM_NORM, cb, -1);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    layer.pw1, layer.pw1_b, NULL,
+                    NULL,      NULL,        NULL,
+                    layer.pw2, layer.pw2_b, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+
+            cur = ggml_mul(ctx0, cur, layer.gamma);
+
+            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+            inpL = ggml_add(ctx0, cur, inpL);
+        }
+
+        cur = inpL;
+
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_embd", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+};
+
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+    llama_ubatch dummy = {};
+    dummy.equal_seqs = true;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_defrag(ids);
+
+    llm.free();
+
+    return result;
+}
+
+static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
+    llama_ubatch dummy = {};
+    dummy.equal_seqs = true;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_k_shift();
+
+    llm.free();
+
+    return result;
+}
+
+static struct ggml_cgraph * llama_build_graph(
+         llama_context & lctx,
+    const llama_ubatch & ubatch,
+                  bool   worst_case) {
+    const auto & model = lctx.model;
+
+    // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
+        if (il >= 0) {
+            ggml_format_name(cur, "%s-%d", name, il);
+        } else {
+            ggml_set_name(cur, name);
+        }
+
+        if (!lctx.cparams.offload_kqv) {
+            if (strcmp(name, "kqv_merged_cont") == 0) {
+                // all nodes between the KV store and the attention output are run on the CPU
+                ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu);
+            }
+        }
+
+        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
+        // FIXME: fix in ggml_backend_sched
+        const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer;
+        if (ubatch.n_tokens < 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                const auto & dev_layer = lctx.model.dev_layer(il);
+                for (auto & backend : lctx.backends) {
+                    if (ggml_backend_get_device(backend.get()) == dev_layer) {
+                        if (ggml_backend_supports_op(backend.get(), cur)) {
+                            ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
+                        }
+                    }
+                }
+            }
+        }
+    };
+
+    struct ggml_cgraph * result = NULL;
+
+    struct llm_build_context llm(lctx, ubatch, cb, worst_case);
+
+    llm.init();
+
+    switch (model.arch) {
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                result = llm.build_llama();
+            } break;
+        case LLM_ARCH_DECI:
+            {
+                result = llm.build_deci();
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm.build_baichuan();
+            } break;
+        case LLM_ARCH_FALCON:
+            {
+                result = llm.build_falcon();
+            } break;
+        case LLM_ARCH_GROK:
+            {
+                result = llm.build_grok();
+            } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm.build_starcoder();
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                result = llm.build_refact();
+            } break;
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                result = llm.build_bert();
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm.build_bloom();
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm.build_mpt();
+            } break;
+         case LLM_ARCH_STABLELM:
+            {
+                result = llm.build_stablelm();
+            } break;
+        case LLM_ARCH_QWEN:
+            {
+                result = llm.build_qwen();
+            } break;
+        case LLM_ARCH_QWEN2:
+            {
+                result = llm.build_qwen2();
+            } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                lctx.n_pos_per_token = 4;
+                result = llm.build_qwen2vl();
+            } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                result = llm.build_qwen2moe();
+            } break;
+        case LLM_ARCH_PHI2:
+            {
+                result = llm.build_phi2();
+            } break;
+        case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
+            {
+                result = llm.build_phi3();
+            } break;
+        case LLM_ARCH_PLAMO:
+            {
+                result = llm.build_plamo();
+            } break;
+        case LLM_ARCH_GPT2:
+            {
+                result = llm.build_gpt2();
+            } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                result = llm.build_codeshell();
+            } break;
+        case LLM_ARCH_ORION:
+            {
+                result = llm.build_orion();
+            } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                result = llm.build_internlm2();
+            } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                result = llm.build_minicpm3();
+            } break;
+        case LLM_ARCH_GEMMA:
+            {
+                result = llm.build_gemma();
+            } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                result = llm.build_gemma2();
+            } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                result = llm.build_starcoder2();
+            } break;
+        case LLM_ARCH_MAMBA:
+            {
+                result = llm.build_mamba();
+            } break;
+        case LLM_ARCH_XVERSE:
+            {
+                result = llm.build_xverse();
+            } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                result = llm.build_command_r();
+            } break;
+        case LLM_ARCH_COHERE2:
+            {
+                result = llm.build_cohere2();
+            } break;
+        case LLM_ARCH_DBRX:
+            {
+                result = llm.build_dbrx();
+            } break;
+        case LLM_ARCH_OLMO:
+            {
+                result = llm.build_olmo();
+            } break;
+        case LLM_ARCH_OLMO2:
+            {
+                result = llm.build_olmo2();
+            } break;
+        case LLM_ARCH_OLMOE:
+            {
+                result = llm.build_olmoe();
+            } break;
+        case LLM_ARCH_OPENELM:
+            {
+                result = llm.build_openelm();
+            } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                result = llm.build_gptneox();
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                result = llm.build_arctic();
+            } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
+        case LLM_ARCH_DEEPSEEK2:
+            {
+                result = llm.build_deepseek2();
+            } break;
+        case LLM_ARCH_CHATGLM:
+            {
+                result = llm.build_chatglm();
+            } break;
+        case LLM_ARCH_BITNET:
+            {
+                result = llm.build_bitnet();
+            } break;
+        case LLM_ARCH_T5:
+            {
+                if (lctx.is_encoding) {
+                    result = llm.build_t5_enc();
+                } else {
+                    result = llm.build_t5_dec();
+                }
+            } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                result = llm.build_t5_enc();
+            } break;
+        case LLM_ARCH_JAIS:
+            {
+                result = llm.build_jais();
+            } break;
+        case LLM_ARCH_NEMOTRON:
+            {
+                result = llm.build_nemotron();
+            } break;
+        case LLM_ARCH_EXAONE:
+            {
+                result = llm.build_exaone();
+            } break;
+        case LLM_ARCH_RWKV6:
+            {
+                result = llm.build_rwkv6();
+            } break;
+        case LLM_ARCH_RWKV6QWEN2:
+            {
+                result = llm.build_rwkv6qwen2();
+            } break;
+        case LLM_ARCH_CHAMELEON:
+            {
+                result = llm.build_chameleon();
+            } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                result = llm.build_wavtokenizer_dec();
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+
+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
+    llm.free();
+
+    return result;
+}
+
+// returns the result of ggml_backend_sched_graph_compute_async execution
+static enum ggml_status llama_graph_compute(
+          llama_context & lctx,
+            ggml_cgraph * gf,
+                    int   n_threads,
+        ggml_threadpool * threadpool) {
+    if (lctx.backend_cpu != nullptr) {
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
+    }
+
+    // set the number of threads for all the backends
+    for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
+        set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
+    }
+
+    auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
+    }
+
+    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+    return status;
+}
+
+static int llama_prepare_sbatch(
+        llama_context     & lctx,
+        const llama_batch & batch,
+        uint32_t          & n_outputs) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const uint32_t n_tokens_all = batch.n_tokens;
+    const  int64_t n_embd       = hparams.n_embd;
+
+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+    GGML_ASSERT(n_tokens_all <= cparams.n_batch);
+    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
+
+    lctx.n_queued_tokens += n_tokens_all;
+    lctx.embd_seq.clear();
+
+    // count outputs
+    if (batch.logits && !embd_pooled) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            n_outputs += batch.logits[i] != 0;
+        }
+    } else if (lctx.logits_all || embd_pooled) {
+        n_outputs = n_tokens_all;
+    } else {
+        // keep last output only
+        n_outputs = 1;
+    }
+
+    lctx.sbatch.from_batch(batch, n_embd,
+        /* simple_split */ !lctx.kv_self.recurrent,
+        /* logits_all   */ n_outputs == n_tokens_all);
+
+    // reserve output buffer
+    if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
+        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
+        return -2;
+    };
+
+    return 0;
+}
+
+static int llama_prepare_ubatch(
+        llama_context          & lctx,
+        llama_kv_slot_restorer & kv_slot_restorer,
+        llama_ubatch           & ubatch,
+        const uint32_t           n_outputs,
+        const uint32_t           n_tokens_all) {
+    GGML_ASSERT(lctx.sbatch.n_tokens > 0);
+
+    auto       & kv_self = lctx.kv_self;
+    const auto & cparams = lctx.cparams;
+    const auto & hparams = lctx.model.hparams;
+
+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+    if (lctx.kv_self.recurrent) {
+        if (embd_pooled) {
+            // Pooled embeddings cannot be split across ubatches (yet)
+            ubatch = lctx.sbatch.split_seq(cparams.n_ubatch);
+        } else {
+            // recurrent model architectures are easier to implement
+            // with equal-length sequences
+            ubatch = lctx.sbatch.split_equal(cparams.n_ubatch);
+        }
+    } else {
+        ubatch = lctx.sbatch.split_simple(cparams.n_ubatch);
+    }
+
+    // count the outputs in this u_batch
+    {
+        int32_t n_outputs_new = 0;
+
+        if (n_outputs == n_tokens_all) {
+            n_outputs_new = ubatch.n_tokens;
+        } else {
+            GGML_ASSERT(ubatch.output);
+            for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+                n_outputs_new += int32_t(ubatch.output[i] != 0);
+            }
+        }
+
+        // needs to happen before the graph is built
+        lctx.n_outputs = n_outputs_new;
+    }
+
+    // non-causal masks do not use the KV cache
+    if (hparams.causal_attn) {
+        llama_kv_cache_update(&lctx);
+
+        // if we have enough unused cells before the current head ->
+        //   better to start searching from the beginning of the cache, hoping to fill it
+        if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) {
+            kv_self.head = 0;
+        }
+
+        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        if (!slot) {
+            return 1;
+        }
+        kv_slot_restorer.save(slot);
+
+        if (!kv_self.recurrent) {
+            // a heuristic, to avoid attending the full cache if it is not yet utilized
+            // after enough generations, the benefit from this heuristic disappears
+            // if we start defragmenting the cache, the benefit from this will be more important
+            const uint32_t pad = llama_kv_cache_get_padding(cparams);
+            kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
+            //kv_self.n = llama_kv_cache_cell_max(kv_self);
+        }
+    }
+
+    return 0;
+}
+
+// decode a batch of tokens by evaluating the transformer
+// in case of unsuccessful decoding (error or warning),
+// the kv_cache state will be returned to its original state
+// (for non-recurrent models) or cleaned (for recurrent models)
+//
+//   - lctx:      llama context
+//   - inp_batch: batch to evaluate
+//
+// return 0 on success
+// return positive int on warning
+// return negative int on error
+//
+static int llama_decode_impl(
+         llama_context & lctx,
+           llama_batch   inp_batch) {
+
+    lctx.is_encoding = false;
+
+    if (inp_batch.n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+        return -1;
+    }
+
+    // temporarily allocate memory for the input batch if needed
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
+    const llama_batch & batch = batch_allocr.batch;
+
+    const auto & model   = lctx.model;
+    const auto & vocab   = model.vocab;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    if (lctx.t_compute_start_us == 0) {
+        lctx.t_compute_start_us = ggml_time_us();
+    }
+    auto & kv_self = lctx.kv_self;
+    llama_kv_slot_restorer kv_slot_restorer(kv_self);
+
+    const int64_t n_embd  = hparams.n_embd;
+    const int64_t n_vocab = vocab.n_tokens();
+
+    uint32_t n_outputs = 0;
+    uint32_t n_outputs_prev = 0;
+
+    {
+        const int ret = llama_prepare_sbatch(lctx, batch, n_outputs);
+        if (ret != 0) {
+            return ret;
+        }
+    }
+
+    while (lctx.sbatch.n_tokens > 0) {
+        llama_ubatch ubatch;
+        {
+            const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens);
+            if (ret != 0) {
+                return ret;
+            }
+        }
+
+        const int         n_threads  = ubatch.n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        ggml_threadpool_t threadpool = ubatch.n_tokens == 1 ? lctx.threadpool   : lctx.threadpool_batch;
+
+        GGML_ASSERT(n_threads > 0);
+
+        //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
+
+        ggml_backend_sched_reset(lctx.sched.get());
+        ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+
+        ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
+
+        // the output is always the last tensor in the graph
+        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
+        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
+
+        if (lctx.n_outputs == 0) {
+            // no output
+            res  = nullptr;
+            embd = nullptr;
+        } else if (cparams.embeddings) {
+            res  = nullptr; // do not extract logits for embedding case
+            embd = nullptr;
+            for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = ggml_graph_node(gf, i);
+                    break;
+                }
+            }
+            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
+        } else {
+            embd = nullptr; // do not extract embeddings when not needed
+            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
+        }
+
+        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+        ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
+
+        llama_set_inputs(lctx, ubatch);
+
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        if (compute_status != GGML_STATUS_SUCCESS) {
+            kv_slot_restorer.restore(kv_self);
+            switch (compute_status) {
+                case GGML_STATUS_ABORTED:
+                    return 2;
+                case GGML_STATUS_ALLOC_FAILED:
+                    return -2;
+                case GGML_STATUS_FAILED:
+                default:
+                    return -3;
+            }
+        }
+
+        // update the kv ring buffer
+        {
+            kv_self.head += ubatch.n_tokens;
+
+            // Ensure kv cache head points to a valid index.
+            if (kv_self.head >= kv_self.size) {
+                kv_self.head = 0;
+            }
+        }
+
+        // plot the computation graph in dot format (for debugging purposes)
+        //if (n_past%100 == 0) {
+        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
+        //}
+
+        // extract logits
+        if (res) {
+            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res);
+            GGML_ASSERT(backend_res != nullptr);
+            GGML_ASSERT(lctx.logits != nullptr);
+
+            float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
+            const int32_t n_outputs_new = lctx.n_outputs;
+
+            if (n_outputs_new) {
+                GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+                GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
+                ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
+            }
+        }
+
+        // extract embeddings
+        if (embd) {
+            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
+            GGML_ASSERT(backend_embd != nullptr);
+
+            switch (cparams.pooling_type) {
+                case LLAMA_POOLING_TYPE_NONE:
+                    {
+                        // extract token embeddings
+                        GGML_ASSERT(lctx.embd != nullptr);
+                        float * embd_out = lctx.embd + n_outputs_prev*n_embd;
+                        const int32_t n_outputs_new = lctx.n_outputs;
+
+                        if (n_outputs_new) {
+                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+                            GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_MEAN:
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_LAST:
+                    {
+                        // extract sequence embeddings (cleared before processing each batch)
+                        auto & embd_seq_out = lctx.embd_seq;
+
+                        for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(n_embd);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // extract the rerank score - a single float per sequence
+                        auto & embd_seq_out = lctx.embd_seq;
+
+                        for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                            const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(1);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                    {
+                        GGML_ABORT("unknown pooling type");
+                    }
+            }
+        }
+        n_outputs_prev += lctx.n_outputs;
+    }
+
+    // set output mappings
+    {
+        bool sorted_output = true;
+
+        GGML_ASSERT(lctx.sbatch.out_ids.size() == n_outputs);
+
+        for (size_t i = 0; i < n_outputs; ++i) {
+            size_t out_id = lctx.sbatch.out_ids[i];
+            lctx.output_ids[out_id] = i;
+            if (out_id != i) {
+                sorted_output = false;
+            }
+        }
+
+        if (sorted_output) {
+            lctx.sbatch.out_ids.clear();
+        }
+    }
+
+    // set to total number of outputs in the batch, for use in llama_get_logits_ith
+    lctx.n_outputs = n_outputs;
+
+    // wait for the computation to finish (automatically done when obtaining the model output)
+    //llama_synchronize(&lctx);
+
+    // decide if we need to defrag the kv cache
+    if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+        // - do not defrag small contexts (i.e. < 2048 tokens)
+        // - count the padding towards the number of used tokens
+        const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
+
+        // queue defragmentation for next llama_kv_cache_update
+        if (fragmentation > cparams.defrag_thold) {
+            LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
+
+            llama_kv_cache_defrag(kv_self);
+        }
+    }
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched.get());
+
+    return 0;
+}
+
+// encode a batch of tokens by evaluating the encoder part of the transformer
+//
+//   - lctx:      llama context
+//   - batch:     batch to evaluate
+//
+// return 0 on success
+// return positive int on warning
+// return negative int on error
+//
+static int llama_encode_impl(
+         llama_context & lctx,
+           llama_batch   inp_batch) {
+
+    lctx.is_encoding = true;
+
+    if (inp_batch.n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+        return -1;
+    }
+
+    // temporary allocate memory for the input batch if needed
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
+
+    const llama_batch & batch = batch_allocr.batch;
+    const uint32_t n_tokens = batch.n_tokens;
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
+    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
+    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
+
+    if (lctx.t_compute_start_us == 0) {
+        lctx.t_compute_start_us = ggml_time_us();
+    }
+
+    lctx.n_queued_tokens += n_tokens;
+
+    const int64_t n_embd = hparams.n_embd;
+
+    lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+
+    const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
+
+    // reserve output buffer
+    if (llama_output_reserve(lctx, n_tokens) < n_tokens) {
+        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
+        return -2;
+    };
+
+    for (uint32_t i = 0; i < n_tokens; ++i) {
+        lctx.output_ids[i] = i;
+    }
+
+    lctx.inp_embd_enc = NULL;
+    lctx.n_outputs = n_tokens;
+
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
+    GGML_ASSERT(n_threads > 0);
+
+    ggml_backend_sched_reset(lctx.sched.get());
+    ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
+
+    ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
+
+    // the output embeddings after the final encoder normalization
+    struct ggml_tensor * embd = nullptr;
+
+    // there are two cases here
+    if (llama_model_has_decoder(&lctx.model)) {
+        // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+        embd = ggml_graph_node(gf, -1);
+        GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
+    } else {
+        // second case is an encoder-only T5 model
+        if (cparams.embeddings) {
+            // only output embeddings if required
+            embd = ggml_graph_node(gf, -1);
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = ggml_graph_node(gf, -2);
+            }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+        }
+    }
+
+    ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
+
+    llama_set_inputs(lctx, ubatch);
+
+    const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+    switch (compute_status) {
+        case GGML_STATUS_SUCCESS:
+            break;
+        case GGML_STATUS_ABORTED:
+            return 2;
+        case GGML_STATUS_ALLOC_FAILED:
+            return -2;
+        case GGML_STATUS_FAILED:
+        default:
+            return -3;
+    }
+
+    // extract embeddings
+    if (embd) {
+        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
+        GGML_ASSERT(backend_embd != nullptr);
+
+        if (llama_model_has_decoder(&lctx.model)) {
+            lctx.embd_enc.resize(n_tokens*n_embd);
+            float * embd_out = lctx.embd_enc.data();
+
+            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+            GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
+
+            // remember the sequence ids used during the encoding - needed for cross attention later
+            lctx.seq_ids_enc.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
+                    llama_seq_id seq_id = ubatch.seq_id[i][s];
+                    lctx.seq_ids_enc[i].insert(seq_id);
+                }
+            }
+        } else {
+            GGML_ASSERT(lctx.embd != nullptr);
+
+            switch (cparams.pooling_type) {
+                case LLAMA_POOLING_TYPE_NONE:
+                    {
+                        // extract token embeddings
+                        GGML_ASSERT(lctx.embd != nullptr);
+                        float * embd_out = lctx.embd;
+
+                        GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+                        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+                    } break;
+                case LLAMA_POOLING_TYPE_MEAN:
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_LAST:
+                    {
+                        // extract sequence embeddings
+                        auto & embd_seq_out = lctx.embd_seq;
+                        embd_seq_out.clear();
+
+                        GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
+
+                        for (uint32_t i = 0; i < n_tokens; i++) {
+                            const llama_seq_id seq_id = ubatch.seq_id[i][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(n_embd);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_RANK:
+                    {
+                        // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
+                        //       wait for an encoder model that requires this pooling type in order to test it
+                        //       https://github.com/ggerganov/llama.cpp/pull/9510
+                        GGML_ABORT("RANK pooling not implemented yet");
+                    }
+                case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                    {
+                        GGML_ABORT("unknown pooling type");
+                    }
+            }
+        }
+    }
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched.get());
+
+    return 0;
+}
+
+// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
+    auto & kv_self = lctx.kv_self;
+
+    const auto & hparams = lctx.model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+
+    const uint32_t n_kv   = llama_kv_cache_cell_max(kv_self);
+    const uint32_t n_used = kv_self.used;
+
+    assert(n_used <= n_kv);
+
+    //const int64_t t_start = ggml_time_us();
+
+    // number of cells moved
+    uint32_t n_moves = 0;
+
+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer);
+
+    // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
+    std::vector<uint32_t> ids(n_kv, n_kv);
+
+    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+        const auto & cell0 = kv_self.cells[i0];
+
+        if (!cell0.is_empty()) {
+            ids[i0] = i0;
+
+            continue;
+        }
+
+        // found a hole - fill it with data from the end of the cache
+
+        uint32_t nh = 1;
+
+        // determine the size of the hole
+        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
+            nh++;
+        }
+
+        uint32_t nf = 0;
+        uint32_t is = n_kv - 1;
+
+        // starting from the end, find nh non-empty cells
+        for (; is > i0; --is) {
+            const auto & cell1 = kv_self.cells[is];
+
+            if (cell1.is_empty() || ids[is] != n_kv) {
+                continue;
+            }
+
+            // non-empty cell which is not yet moved
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        // this can only happen if `n_used` is not accurate, which would be a bug
+        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
+
+        nf = 0;
+
+        uint32_t i1 = is;
+
+        // are we moving a continuous block of memory?
+        bool cont = false;
+
+        // should we stop searching for the next move?
+        bool stop = false;
+
+        // go back and move the nf cells to the hole
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = kv_self.cells[i1];
+
+            if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
+                cont = false;
+                continue;
+            }
+
+            // this cell goes to (i0 + nf)
+            ids[i1] = i0 + nf;
+
+            // move the cell meta data
+            kv_self.cells[i0 + nf] = cell1;
+
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            kv_self.head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+
+        i0 += nh - 1;
+    }
+
+    if (n_moves == 0) {
+        return;
+    }
+
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
+
+    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+
+#if 0
+    // CPU defrag
+    //
+    // TODO: optimizations are possible:
+    //       - multiple threads
+    //       - avoid copying to the host memory when already there
+    //
+    // likely not worth the effort, as we have ggml_graph based defrag
+    //
+
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+    const uint32_t kv_size = kv_self.size;
+
+    std::vector<uint8_t> buf_k;
+    std::vector<uint8_t> buf_v;
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
+
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        const size_t v_size    = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
+
+        buf_k.resize(k_size);
+        buf_v.resize(v_size);
+
+        ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
+        ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
+
+        // batch move [i, i+nm) to [id, id+nm)
+        // note: cells can move only to a lower index
+        for (uint32_t i = 0; i < n_kv; ++i) {
+            const uint32_t id = ids[i];
+
+            if (i == id || id == n_kv) {
+                continue;
+            }
+
+            uint32_t nm = 1;
+
+            while (i + nm < n_kv && ids[i + nm] == id + nm) {
+                nm++;
+            }
+
+            // move keys
+            {
+                const int64_t os =  i*k_size_row;
+                const int64_t od = id*k_size_row;
+
+                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
+            }
+
+            // move values (note: they are transposed)
+            {
+                const int64_t os =  i;
+                const int64_t od = id;
+
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
+                }
+            }
+
+            i += nm - 1;
+        }
+
+        ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
+        ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
+    }
+#else
+    // ggml_graph defrag
+
+    ggml_backend_sched_reset(lctx.sched.get());
+
+    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+#endif
+
+    //const int64_t t_end = ggml_time_us();
+
+    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
+}
+
+static void llama_kv_cache_update_impl(struct llama_context & lctx) {
+    bool need_reserve = false;
+
+    if (lctx.kv_self.has_shift) {
+        if (!llama_kv_cache_can_shift(&lctx)) {
+            GGML_ABORT("The current context does not support K-shift");
+        }
+
+        // apply K-shift if needed
+        if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
+            ggml_backend_sched_reset(lctx.sched.get());
+
+            ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
+
+            ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
+
+            llama_set_k_shift(lctx);
+
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+
+            need_reserve = true;
+        }
+
+        {
+            auto & kv_self = lctx.kv_self;
+
+            kv_self.has_shift = false;
+
+            for (uint32_t i = 0; i < kv_self.size; ++i) {
+                kv_self.cells[i].delta = 0;
+            }
+        }
+    }
+
+    // defragment the KV cache if needed
+    if (lctx.kv_self.do_defrag) {
+        llama_kv_cache_defrag_impl(lctx);
+
+        need_reserve = true;
+
+        lctx.kv_self.do_defrag = false;
+    }
+
+    // reserve a worst case graph again
+    if (need_reserve) {
+        // TODO: extract to a function
+        // build worst-case graph
+        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
+        llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
+
+        // initialize scheduler with the worst-case graph
+        ggml_backend_sched_reset(lctx.sched.get());
+        if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+        }
+    }
+}
+
+int32_t llama_set_adapter_lora(
+            struct llama_context * ctx,
+            struct llama_adapter_lora * adapter,
+            float scale) {
+    ctx->lora[adapter] = scale;
+    return 0;
+}
+
+int32_t llama_rm_adapter_lora(
+            struct llama_context * ctx,
+            struct llama_adapter_lora * adapter) {
+    auto pos = ctx->lora.find(adapter);
+    if (pos != ctx->lora.end()) {
+        ctx->lora.erase(pos);
+        return 0;
+    }
+
+    return -1;
+}
+
+void llama_clear_adapter_lora(struct llama_context * ctx) {
+    ctx->lora.clear();
+}
+
+int32_t llama_apply_adapter_cvec(
+        struct llama_context * ctx,
+                 const float * data,
+                      size_t   len,
+                     int32_t   n_embd,
+                     int32_t   il_start,
+                     int32_t   il_end) {
+    return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end);
+}
+
+//
+// interface implementation
+//
+
+struct llama_context_params llama_context_default_params() {
+    struct llama_context_params result = {
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 2048,
+        /*.n_ubatch                    =*/ 512,
+        /*.n_seq_max                   =*/ 1,
+        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
+        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        /*.pooling_type                =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
+        /*.attention_type              =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
+        /*.yarn_ext_factor             =*/ -1.0f,
+        /*.yarn_attn_factor            =*/ 1.0f,
+        /*.yarn_beta_fast              =*/ 32.0f,
+        /*.yarn_beta_slow              =*/ 1.0f,
+        /*.yarn_orig_ctx               =*/ 0,
+        /*.defrag_thold                =*/ -1.0f,
+        /*.cb_eval                     =*/ nullptr,
+        /*.cb_eval_user_data           =*/ nullptr,
+        /*.type_k                      =*/ GGML_TYPE_F16,
+        /*.type_v                      =*/ GGML_TYPE_F16,
+        /*.logits_all                  =*/ false,
+        /*.embeddings                  =*/ false,
+        /*.offload_kqv                 =*/ true,
+        /*.flash_attn                  =*/ false,
+        /*.no_perf                     =*/ true,
+        /*.abort_callback              =*/ nullptr,
+        /*.abort_callback_data         =*/ nullptr,
+    };
+
+    return result;
+}
+
+struct llama_sampler_chain_params llama_sampler_chain_default_params() {
+    struct llama_sampler_chain_params result = {
+        /*.no_perf                     =*/ true,
+    };
+
+    return result;
+}
+
+size_t llama_max_devices(void) {
+    return 16;
+}
+
+bool llama_supports_mmap(void) {
+    return llama_mmap::SUPPORTED;
+}
+
+bool llama_supports_mlock(void) {
+    return llama_mlock::SUPPORTED;
+}
+
+bool llama_supports_gpu_offload(void) {
+    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           llama_supports_rpc();
+}
+
+bool llama_supports_rpc(void) {
+    return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
+void llama_backend_init(void) {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+void llama_numa_init(enum ggml_numa_strategy numa) {
+    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
+    }
+}
+
+void llama_backend_free(void) {
+    ggml_quantize_free();
+}
+
+int64_t llama_time_us(void) {
+    return ggml_time_us();
+}
+
+static struct llama_model * llama_model_load_from_file_impl(
+        const std::string & path_model,
+        std::vector<std::string> & splits,
+        struct llama_model_params params) {
+    ggml_time_init();
+
+    unsigned cur_percentage = 0;
+    if (params.progress_callback == NULL) {
+        params.progress_callback_user_data = &cur_percentage;
+        params.progress_callback = [](float progress, void * ctx) {
+            unsigned * cur_percentage_p = (unsigned *) ctx;
+            unsigned percentage = (unsigned) (100 * progress);
+            while (percentage > *cur_percentage_p) {
+                *cur_percentage_p = percentage;
+                LLAMA_LOG_CONT(".");
+                if (percentage >= 100) {
+                    LLAMA_LOG_CONT("\n");
+                }
+            }
+            return true;
+        };
+    }
+
+    llama_model * model = new llama_model(params);
+
+    // create list of devices to use with this model
+    if (params.devices) {
+        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        std::vector<ggml_backend_dev_t> rpc_servers;
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
+
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_servers.push_back(dev);
+                    } else {
+                        model->devices.push_back(dev);
+                    }
+                    break;
+            }
+        }
+        // add RPC servers at the front of the list
+        if (!rpc_servers.empty()) {
+            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        }
+    }
+
+    // if using single GPU mode, remove all except the main GPU
+    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
+            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
+            llama_model_free(model);
+            return nullptr;
+        }
+        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+        model->devices.clear();
+        model->devices.push_back(main_gpu);
+    }
+
+    for (auto * dev : model->devices) {
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(dev, &free, &total);
+        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+    }
+
+    const int status = llama_model_load(path_model, splits, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
+
+        llama_model_free(model);
+        return nullptr;
+    }
+
+    return model;
+}
+
+// deprecated
+struct llama_model * llama_load_model_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
+    return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
+    std::vector<std::string> splits = {};
+    return llama_model_load_from_file_impl(path_model, splits, params);
+}
+
+struct llama_model * llama_model_load_from_splits(
+        const char ** paths,
+        size_t n_paths,
+        struct llama_model_params params) {
+    std::vector<std::string> splits;
+    if (n_paths == 0) {
+        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
+        return nullptr;
+    }
+    for (size_t i = 0; i < n_paths; ++i) {
+        splits.push_back(paths[i]);
+    }
+    return llama_model_load_from_file_impl(splits.front(), splits, params);
+}
+
+struct llama_context * llama_init_from_model(
+                 struct llama_model * model,
+        struct llama_context_params   params) {
+
+    if (!model) {
+        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
+        return nullptr;
+    }
+
+    if (params.n_batch == 0 && params.n_ubatch == 0) {
+        LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
+        return nullptr;
+    }
+
+    if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
+        LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
+        return nullptr;
+    }
+
+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
+    if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
+        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model);
+
+    const auto & hparams = model->hparams;
+    auto       & cparams = ctx->cparams;
+
+    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
+    cparams.n_threads        = params.n_threads;
+    cparams.n_threads_batch  = params.n_threads_batch;
+    cparams.yarn_ext_factor  = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast   = params.yarn_beta_fast;
+    cparams.yarn_beta_slow   = params.yarn_beta_slow;
+    cparams.defrag_thold     = params.defrag_thold;
+    cparams.embeddings       = params.embeddings;
+    cparams.offload_kqv      = params.offload_kqv;
+    cparams.flash_attn       = params.flash_attn;
+    cparams.no_perf          = params.no_perf;
+    cparams.pooling_type     = params.pooling_type;
+
+    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+
+    // this is necessary due to kv_self.n being padded later during inference
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
+
+    // with causal attention, the batch size is limited by the context size
+    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
+    cparams.n_ctx_orig_yarn  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
+                               hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
+                                                              hparams.n_ctx_train;
+
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
+    auto rope_scaling_type = params.rope_scaling_type;
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
+        rope_scaling_type = hparams.rope_scaling_type_train;
+    }
+
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
+        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
+    }
+
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
+    }
+
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
+
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
+    }
+
+    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+        cparams.causal_attn = hparams.causal_attn;
+    } else {
+        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+    }
+
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
+    LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
+
+    if (n_ctx_per_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    ctx->logits_all = params.logits_all;
+
+    // build worst-case graph for encoder if a model contains encoder
+    ctx->is_encoding = llama_model_has_encoder(model);
+
+    uint32_t kv_size = cparams.n_ctx;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
+
+    // Mamba only needs a constant number of KV cache cells per sequence
+    if (llama_model_is_recurrent(model)) {
+        // Mamba needs at least as many KV cells as there are sequences kept at any time
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
+        // it's probably best to keep as much precision as possible for the states
+        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
+        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
+    }
+
+    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
+
+    if (!hparams.vocab_only) {
+        // GPU backends
+        for (auto * dev : model->devices) {
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.emplace_back(backend);
+        }
+
+        // add ACCEL backends (such as BLAS)
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.emplace_back(backend);
+            }
+        }
+
+        // add CPU backend
+        ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        if (ctx->backend_cpu == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.emplace_back(ctx->backend_cpu);
+
+        // create a list of the set_n_threads functions in the backends
+        for (auto & backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                if (ggml_backend_set_n_threads_fn) {
+                    ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
+                }
+            }
+        }
+
+        llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
+        if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
+            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+
+        {
+            size_t memory_size_k = 0;
+            size_t memory_size_v = 0;
+
+            for (auto & k : ctx->kv_self.k_l) {
+                memory_size_k += ggml_nbytes(k);
+            }
+
+            for (auto & v : ctx->kv_self.v_l) {
+                memory_size_v += ggml_nbytes(v);
+            }
+
+            LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                      (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
+        }
+
+        // graph outputs buffer
+        {
+            // resized during inference when a batch uses more outputs
+            if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
+                LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+
+            LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
+                    ggml_backend_buffer_name(ctx->buf_output.get()),
+                    ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0);
+        }
+
+        // scheduler and compute buffers
+        {
+            // buffer types used for the compute buffer of each backend
+            std::vector<ggml_backend_buffer_type_t> backend_buft;
+            std::vector<ggml_backend_t> backend_ptrs;
+            for (auto & backend : ctx->backends) {
+                auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+                auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
+                    // use the host buffer of the first device CPU for faster transfer of the intermediate state
+                    auto * dev = model->devices[0];
+                    auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+                    if (host_buft) {
+                        buft = host_buft;
+                    }
+                }
+                backend_buft.push_back(buft);
+                backend_ptrs.push_back(backend.get());
+            }
+
+            const size_t max_nodes = model->max_nodes();
+
+            // buffer used to store the computation graph and the tensor meta data
+            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+            // TODO: move these checks to ggml_backend_sched
+            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
+            bool pipeline_parallel =
+                model->n_devices() > 1 &&
+                model->params.n_gpu_layers > (int)model->hparams.n_layer &&
+                model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+                params.offload_kqv;
+
+            // pipeline parallelism requires support for async compute and events in all devices
+            if (pipeline_parallel) {
+                for (auto & backend : ctx->backends) {
+                    auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                    if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                        // ignore CPU backend
+                        continue;
+                    }
+                    auto * dev = ggml_backend_get_device(backend.get());
+                    ggml_backend_dev_props props;
+                    ggml_backend_dev_get_props(dev, &props);
+                    if (!props.caps.async || !props.caps.events) {
+                        // device does not support async compute or events
+                        pipeline_parallel = false;
+                        break;
+                    }
+                }
+            }
+
+            ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+
+            if (pipeline_parallel) {
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
+            }
+
+            // initialize scheduler with the worst-case graph
+            uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+            uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+            llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+
+            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+            ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
+
+            // reserve pp graph first so that buffers are only allocated once
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get());
+            int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
+
+            // reserve with tg graph to get the number of splits and nodes
+            llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+            ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
+            ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
+            int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
+
+            // reserve again with pp graph to avoid ggml-alloc reallocations during inference
+            gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
+            if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) {
+                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+
+            for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+                ggml_backend_t backend = backend_ptrs[i];
+                ggml_backend_buffer_type_t buft = backend_buft[i];
+                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend);
+                if (size > 1) {
+                    LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                            ggml_backend_buft_name(buft),
+                            size / 1024.0 / 1024.0);
+                }
+            }
+
+            if (n_nodes_pp == n_nodes_tg) {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, n_nodes_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+            }
+            if (n_splits_pp == n_splits_tg) {
+                LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+            }
+        }
+    }
+
+    return ctx;
+}
+
+struct llama_context * llama_new_context_with_model(
+                 struct llama_model * model,
+        struct llama_context_params   params) {
+    return llama_init_from_model(model, params);
+}
+
+//
+// kv cache
+//
+
+// TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API
+
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
+    return llama_kv_cache_view_init(ctx->kv_self, n_seq_max);
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    llama_kv_cache_view_update(view, ctx->kv_self);
+}
+
+int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+    return llama_get_kv_cache_token_count(ctx->kv_self);
+}
+
+int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return llama_get_kv_cache_used_cells(ctx->kv_self);
+}
+
+void llama_kv_cache_clear(struct llama_context * ctx) {
+    llama_kv_cache_clear(ctx->kv_self);
+}
+
+bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+}
+
+void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
+    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
+    llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
+}
+
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
+    return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_defrag(struct llama_context * ctx) {
+    llama_kv_cache_defrag(ctx->kv_self);
+}
+
+void llama_kv_cache_update(struct llama_context * ctx) {
+    llama_kv_cache_update_impl(*ctx);
+}
+
+bool llama_kv_cache_can_shift(struct llama_context * ctx) {
+    return llama_kv_cache_can_shift(ctx->kv_self);
+}
+
+///
+
+int32_t llama_encode(
+        struct llama_context * ctx,
+          struct llama_batch   batch) {
+    const int ret = llama_encode_impl(*ctx, batch);
+    if (ret != 0) {
+        LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
+    }
+
+    return ret;
+}
+
+int32_t llama_decode(
+        struct llama_context * ctx,
+          struct llama_batch   batch) {
+    const int ret = llama_decode_impl(*ctx, batch);
+    if (ret != 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+    }
+
+    return ret;
+}
+
+//
+// chat templates
+//
+
+int32_t llama_chat_apply_template(
+                              const char * tmpl,
+         const struct llama_chat_message * chat,
+                                  size_t   n_msg,
+                                    bool   add_ass,
+                                    char * buf,
+                                 int32_t   length) {
+    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
+
+    // format the chat to string
+    std::vector<const llama_chat_message *> chat_vec;
+    chat_vec.resize(n_msg);
+    for (size_t i = 0; i < n_msg; i++) {
+        chat_vec[i] = &chat[i];
+    }
+
+    std::string formatted_chat;
+    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
+    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+        return -1;
+    }
+    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
+    if (res < 0) {
+        return res;
+    }
+    if (buf && length > 0) {
+        strncpy(buf, formatted_chat.c_str(), length);
+    }
+    return res;
+}
+
+//
+// model split
+//
+
+int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
+    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
+    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
+        return strlen(split_path);
+    }
+    return 0;
+}
+
+int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
+    std::string str_split_path(split_path);
+    char postfix[32];
+    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
+    std::string str_postfix(postfix);
+
+    // check if split_prefix ends with postfix
+    int size_prefix = str_split_path.size() - str_postfix.size();
+    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
+        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+        return size_prefix;
+    }
+
+    return 0;
+}
+
+const char * llama_print_system_info(void) {
+    static std::string s;
+    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
+
+
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
+
+    return s.c_str();
+}
+
+//
+// perf
+//
+
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
+    data.t_start_ms  = 1e-3 * ctx->t_start_us;
+    data.t_load_ms   = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval    = std::max(1, ctx->n_p_eval);
+    data.n_eval      = std::max(1, ctx->n_eval);
+
+    return data;
+}
+
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
+
+    const double t_end_ms = 1e-3 * ggml_time_us();
+
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us  = ggml_time_us();
+    ctx->t_eval_us   = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+}
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
new file mode 100644
index 000000000..04dcd7fcf
--- /dev/null
+++ b/src/unicode-data.cpp
@@ -0,0 +1,7034 @@
+// generated with scripts/gen-unicode-data.py
+
+#include "unicode-data.h"
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
+{0x000000, 0x0080},
+{0x000020, 0x0008},
+{0x000021, 0x0020},
+{0x000024, 0x0040},
+{0x000025, 0x0020},
+{0x00002B, 0x0040},
+{0x00002C, 0x0020},
+{0x000030, 0x0002},
+{0x00003A, 0x0020},
+{0x00003C, 0x0040},
+{0x00003F, 0x0020},
+{0x000041, 0x0004},
+{0x00005B, 0x0020},
+{0x00005E, 0x0040},
+{0x00005F, 0x0020},
+{0x000060, 0x0040},
+{0x000061, 0x0004},
+{0x00007B, 0x0020},
+{0x00007C, 0x0040},
+{0x00007D, 0x0020},
+{0x00007E, 0x0040},
+{0x00007F, 0x0080},
+{0x0000A0, 0x0008},
+{0x0000A1, 0x0020},
+{0x0000A2, 0x0040},
+{0x0000A7, 0x0020},
+{0x0000A8, 0x0040},
+{0x0000AA, 0x0004},
+{0x0000AB, 0x0020},
+{0x0000AC, 0x0040},
+{0x0000AD, 0x0080},
+{0x0000AE, 0x0040},
+{0x0000B2, 0x0002},
+{0x0000B4, 0x0040},
+{0x0000B5, 0x0004},
+{0x0000B6, 0x0020},
+{0x0000B8, 0x0040},
+{0x0000B9, 0x0002},
+{0x0000BA, 0x0004},
+{0x0000BB, 0x0020},
+{0x0000BC, 0x0002},
+{0x0000BF, 0x0020},
+{0x0000C0, 0x0004},
+{0x0000D7, 0x0040},
+{0x0000D8, 0x0004},
+{0x0000F7, 0x0040},
+{0x0000F8, 0x0004},
+{0x0002C2, 0x0040},
+{0x0002C6, 0x0004},
+{0x0002D2, 0x0040},
+{0x0002E0, 0x0004},
+{0x0002E5, 0x0040},
+{0x0002EC, 0x0004},
+{0x0002ED, 0x0040},
+{0x0002EE, 0x0004},
+{0x0002EF, 0x0040},
+{0x000300, 0x0010},
+{0x000370, 0x0004},
+{0x000375, 0x0040},
+{0x000376, 0x0004},
+{0x000378, 0x0001},
+{0x00037A, 0x0004},
+{0x00037E, 0x0020},
+{0x00037F, 0x0004},
+{0x000380, 0x0001},
+{0x000384, 0x0040},
+{0x000386, 0x0004},
+{0x000387, 0x0020},
+{0x000388, 0x0004},
+{0x00038B, 0x0001},
+{0x00038C, 0x0004},
+{0x00038D, 0x0001},
+{0x00038E, 0x0004},
+{0x0003A2, 0x0001},
+{0x0003A3, 0x0004},
+{0x0003F6, 0x0040},
+{0x0003F7, 0x0004},
+{0x000482, 0x0040},
+{0x000483, 0x0010},
+{0x00048A, 0x0004},
+{0x000530, 0x0001},
+{0x000531, 0x0004},
+{0x000557, 0x0001},
+{0x000559, 0x0004},
+{0x00055A, 0x0020},
+{0x000560, 0x0004},
+{0x000589, 0x0020},
+{0x00058B, 0x0001},
+{0x00058D, 0x0040},
+{0x000590, 0x0001},
+{0x000591, 0x0010},
+{0x0005BE, 0x0020},
+{0x0005BF, 0x0010},
+{0x0005C0, 0x0020},
+{0x0005C1, 0x0010},
+{0x0005C3, 0x0020},
+{0x0005C4, 0x0010},
+{0x0005C6, 0x0020},
+{0x0005C7, 0x0010},
+{0x0005C8, 0x0001},
+{0x0005D0, 0x0004},
+{0x0005EB, 0x0001},
+{0x0005EF, 0x0004},
+{0x0005F3, 0x0020},
+{0x0005F5, 0x0001},
+{0x000600, 0x0080},
+{0x000606, 0x0040},
+{0x000609, 0x0020},
+{0x00060B, 0x0040},
+{0x00060C, 0x0020},
+{0x00060E, 0x0040},
+{0x000610, 0x0010},
+{0x00061B, 0x0020},
+{0x00061C, 0x0080},
+{0x00061D, 0x0020},
+{0x000620, 0x0004},
+{0x00064B, 0x0010},
+{0x000660, 0x0002},
+{0x00066A, 0x0020},
+{0x00066E, 0x0004},
+{0x000670, 0x0010},
+{0x000671, 0x0004},
+{0x0006D4, 0x0020},
+{0x0006D5, 0x0004},
+{0x0006D6, 0x0010},
+{0x0006DD, 0x0080},
+{0x0006DE, 0x0040},
+{0x0006DF, 0x0010},
+{0x0006E5, 0x0004},
+{0x0006E7, 0x0010},
+{0x0006E9, 0x0040},
+{0x0006EA, 0x0010},
+{0x0006EE, 0x0004},
+{0x0006F0, 0x0002},
+{0x0006FA, 0x0004},
+{0x0006FD, 0x0040},
+{0x0006FF, 0x0004},
+{0x000700, 0x0020},
+{0x00070E, 0x0001},
+{0x00070F, 0x0080},
+{0x000710, 0x0004},
+{0x000711, 0x0010},
+{0x000712, 0x0004},
+{0x000730, 0x0010},
+{0x00074B, 0x0001},
+{0x00074D, 0x0004},
+{0x0007A6, 0x0010},
+{0x0007B1, 0x0004},
+{0x0007B2, 0x0001},
+{0x0007C0, 0x0002},
+{0x0007CA, 0x0004},
+{0x0007EB, 0x0010},
+{0x0007F4, 0x0004},
+{0x0007F6, 0x0040},
+{0x0007F7, 0x0020},
+{0x0007FA, 0x0004},
+{0x0007FB, 0x0001},
+{0x0007FD, 0x0010},
+{0x0007FE, 0x0040},
+{0x000800, 0x0004},
+{0x000816, 0x0010},
+{0x00081A, 0x0004},
+{0x00081B, 0x0010},
+{0x000824, 0x0004},
+{0x000825, 0x0010},
+{0x000828, 0x0004},
+{0x000829, 0x0010},
+{0x00082E, 0x0001},
+{0x000830, 0x0020},
+{0x00083F, 0x0001},
+{0x000840, 0x0004},
+{0x000859, 0x0010},
+{0x00085C, 0x0001},
+{0x00085E, 0x0020},
+{0x00085F, 0x0001},
+{0x000860, 0x0004},
+{0x00086B, 0x0001},
+{0x000870, 0x0004},
+{0x000888, 0x0040},
+{0x000889, 0x0004},
+{0x00088F, 0x0001},
+{0x000890, 0x0080},
+{0x000892, 0x0001},
+{0x000898, 0x0010},
+{0x0008A0, 0x0004},
+{0x0008CA, 0x0010},
+{0x0008E2, 0x0080},
+{0x0008E3, 0x0010},
+{0x000904, 0x0004},
+{0x00093A, 0x0010},
+{0x00093D, 0x0004},
+{0x00093E, 0x0010},
+{0x000950, 0x0004},
+{0x000951, 0x0010},
+{0x000958, 0x0004},
+{0x000962, 0x0010},
+{0x000964, 0x0020},
+{0x000966, 0x0002},
+{0x000970, 0x0020},
+{0x000971, 0x0004},
+{0x000981, 0x0010},
+{0x000984, 0x0001},
+{0x000985, 0x0004},
+{0x00098D, 0x0001},
+{0x00098F, 0x0004},
+{0x000991, 0x0001},
+{0x000993, 0x0004},
+{0x0009A9, 0x0001},
+{0x0009AA, 0x0004},
+{0x0009B1, 0x0001},
+{0x0009B2, 0x0004},
+{0x0009B3, 0x0001},
+{0x0009B6, 0x0004},
+{0x0009BA, 0x0001},
+{0x0009BC, 0x0010},
+{0x0009BD, 0x0004},
+{0x0009BE, 0x0010},
+{0x0009C5, 0x0001},
+{0x0009C7, 0x0010},
+{0x0009C9, 0x0001},
+{0x0009CB, 0x0010},
+{0x0009CE, 0x0004},
+{0x0009CF, 0x0001},
+{0x0009D7, 0x0010},
+{0x0009D8, 0x0001},
+{0x0009DC, 0x0004},
+{0x0009DE, 0x0001},
+{0x0009DF, 0x0004},
+{0x0009E2, 0x0010},
+{0x0009E4, 0x0001},
+{0x0009E6, 0x0002},
+{0x0009F0, 0x0004},
+{0x0009F2, 0x0040},
+{0x0009F4, 0x0002},
+{0x0009FA, 0x0040},
+{0x0009FC, 0x0004},
+{0x0009FD, 0x0020},
+{0x0009FE, 0x0010},
+{0x0009FF, 0x0001},
+{0x000A01, 0x0010},
+{0x000A04, 0x0001},
+{0x000A05, 0x0004},
+{0x000A0B, 0x0001},
+{0x000A0F, 0x0004},
+{0x000A11, 0x0001},
+{0x000A13, 0x0004},
+{0x000A29, 0x0001},
+{0x000A2A, 0x0004},
+{0x000A31, 0x0001},
+{0x000A32, 0x0004},
+{0x000A34, 0x0001},
+{0x000A35, 0x0004},
+{0x000A37, 0x0001},
+{0x000A38, 0x0004},
+{0x000A3A, 0x0001},
+{0x000A3C, 0x0010},
+{0x000A3D, 0x0001},
+{0x000A3E, 0x0010},
+{0x000A43, 0x0001},
+{0x000A47, 0x0010},
+{0x000A49, 0x0001},
+{0x000A4B, 0x0010},
+{0x000A4E, 0x0001},
+{0x000A51, 0x0010},
+{0x000A52, 0x0001},
+{0x000A59, 0x0004},
+{0x000A5D, 0x0001},
+{0x000A5E, 0x0004},
+{0x000A5F, 0x0001},
+{0x000A66, 0x0002},
+{0x000A70, 0x0010},
+{0x000A72, 0x0004},
+{0x000A75, 0x0010},
+{0x000A76, 0x0020},
+{0x000A77, 0x0001},
+{0x000A81, 0x0010},
+{0x000A84, 0x0001},
+{0x000A85, 0x0004},
+{0x000A8E, 0x0001},
+{0x000A8F, 0x0004},
+{0x000A92, 0x0001},
+{0x000A93, 0x0004},
+{0x000AA9, 0x0001},
+{0x000AAA, 0x0004},
+{0x000AB1, 0x0001},
+{0x000AB2, 0x0004},
+{0x000AB4, 0x0001},
+{0x000AB5, 0x0004},
+{0x000ABA, 0x0001},
+{0x000ABC, 0x0010},
+{0x000ABD, 0x0004},
+{0x000ABE, 0x0010},
+{0x000AC6, 0x0001},
+{0x000AC7, 0x0010},
+{0x000ACA, 0x0001},
+{0x000ACB, 0x0010},
+{0x000ACE, 0x0001},
+{0x000AD0, 0x0004},
+{0x000AD1, 0x0001},
+{0x000AE0, 0x0004},
+{0x000AE2, 0x0010},
+{0x000AE4, 0x0001},
+{0x000AE6, 0x0002},
+{0x000AF0, 0x0020},
+{0x000AF1, 0x0040},
+{0x000AF2, 0x0001},
+{0x000AF9, 0x0004},
+{0x000AFA, 0x0010},
+{0x000B00, 0x0001},
+{0x000B01, 0x0010},
+{0x000B04, 0x0001},
+{0x000B05, 0x0004},
+{0x000B0D, 0x0001},
+{0x000B0F, 0x0004},
+{0x000B11, 0x0001},
+{0x000B13, 0x0004},
+{0x000B29, 0x0001},
+{0x000B2A, 0x0004},
+{0x000B31, 0x0001},
+{0x000B32, 0x0004},
+{0x000B34, 0x0001},
+{0x000B35, 0x0004},
+{0x000B3A, 0x0001},
+{0x000B3C, 0x0010},
+{0x000B3D, 0x0004},
+{0x000B3E, 0x0010},
+{0x000B45, 0x0001},
+{0x000B47, 0x0010},
+{0x000B49, 0x0001},
+{0x000B4B, 0x0010},
+{0x000B4E, 0x0001},
+{0x000B55, 0x0010},
+{0x000B58, 0x0001},
+{0x000B5C, 0x0004},
+{0x000B5E, 0x0001},
+{0x000B5F, 0x0004},
+{0x000B62, 0x0010},
+{0x000B64, 0x0001},
+{0x000B66, 0x0002},
+{0x000B70, 0x0040},
+{0x000B71, 0x0004},
+{0x000B72, 0x0002},
+{0x000B78, 0x0001},
+{0x000B82, 0x0010},
+{0x000B83, 0x0004},
+{0x000B84, 0x0001},
+{0x000B85, 0x0004},
+{0x000B8B, 0x0001},
+{0x000B8E, 0x0004},
+{0x000B91, 0x0001},
+{0x000B92, 0x0004},
+{0x000B96, 0x0001},
+{0x000B99, 0x0004},
+{0x000B9B, 0x0001},
+{0x000B9C, 0x0004},
+{0x000B9D, 0x0001},
+{0x000B9E, 0x0004},
+{0x000BA0, 0x0001},
+{0x000BA3, 0x0004},
+{0x000BA5, 0x0001},
+{0x000BA8, 0x0004},
+{0x000BAB, 0x0001},
+{0x000BAE, 0x0004},
+{0x000BBA, 0x0001},
+{0x000BBE, 0x0010},
+{0x000BC3, 0x0001},
+{0x000BC6, 0x0010},
+{0x000BC9, 0x0001},
+{0x000BCA, 0x0010},
+{0x000BCE, 0x0001},
+{0x000BD0, 0x0004},
+{0x000BD1, 0x0001},
+{0x000BD7, 0x0010},
+{0x000BD8, 0x0001},
+{0x000BE6, 0x0002},
+{0x000BF3, 0x0040},
+{0x000BFB, 0x0001},
+{0x000C00, 0x0010},
+{0x000C05, 0x0004},
+{0x000C0D, 0x0001},
+{0x000C0E, 0x0004},
+{0x000C11, 0x0001},
+{0x000C12, 0x0004},
+{0x000C29, 0x0001},
+{0x000C2A, 0x0004},
+{0x000C3A, 0x0001},
+{0x000C3C, 0x0010},
+{0x000C3D, 0x0004},
+{0x000C3E, 0x0010},
+{0x000C45, 0x0001},
+{0x000C46, 0x0010},
+{0x000C49, 0x0001},
+{0x000C4A, 0x0010},
+{0x000C4E, 0x0001},
+{0x000C55, 0x0010},
+{0x000C57, 0x0001},
+{0x000C58, 0x0004},
+{0x000C5B, 0x0001},
+{0x000C5D, 0x0004},
+{0x000C5E, 0x0001},
+{0x000C60, 0x0004},
+{0x000C62, 0x0010},
+{0x000C64, 0x0001},
+{0x000C66, 0x0002},
+{0x000C70, 0x0001},
+{0x000C77, 0x0020},
+{0x000C78, 0x0002},
+{0x000C7F, 0x0040},
+{0x000C80, 0x0004},
+{0x000C81, 0x0010},
+{0x000C84, 0x0020},
+{0x000C85, 0x0004},
+{0x000C8D, 0x0001},
+{0x000C8E, 0x0004},
+{0x000C91, 0x0001},
+{0x000C92, 0x0004},
+{0x000CA9, 0x0001},
+{0x000CAA, 0x0004},
+{0x000CB4, 0x0001},
+{0x000CB5, 0x0004},
+{0x000CBA, 0x0001},
+{0x000CBC, 0x0010},
+{0x000CBD, 0x0004},
+{0x000CBE, 0x0010},
+{0x000CC5, 0x0001},
+{0x000CC6, 0x0010},
+{0x000CC9, 0x0001},
+{0x000CCA, 0x0010},
+{0x000CCE, 0x0001},
+{0x000CD5, 0x0010},
+{0x000CD7, 0x0001},
+{0x000CDD, 0x0004},
+{0x000CDF, 0x0001},
+{0x000CE0, 0x0004},
+{0x000CE2, 0x0010},
+{0x000CE4, 0x0001},
+{0x000CE6, 0x0002},
+{0x000CF0, 0x0001},
+{0x000CF1, 0x0004},
+{0x000CF3, 0x0010},
+{0x000CF4, 0x0001},
+{0x000D00, 0x0010},
+{0x000D04, 0x0004},
+{0x000D0D, 0x0001},
+{0x000D0E, 0x0004},
+{0x000D11, 0x0001},
+{0x000D12, 0x0004},
+{0x000D3B, 0x0010},
+{0x000D3D, 0x0004},
+{0x000D3E, 0x0010},
+{0x000D45, 0x0001},
+{0x000D46, 0x0010},
+{0x000D49, 0x0001},
+{0x000D4A, 0x0010},
+{0x000D4E, 0x0004},
+{0x000D4F, 0x0040},
+{0x000D50, 0x0001},
+{0x000D54, 0x0004},
+{0x000D57, 0x0010},
+{0x000D58, 0x0002},
+{0x000D5F, 0x0004},
+{0x000D62, 0x0010},
+{0x000D64, 0x0001},
+{0x000D66, 0x0002},
+{0x000D79, 0x0040},
+{0x000D7A, 0x0004},
+{0x000D80, 0x0001},
+{0x000D81, 0x0010},
+{0x000D84, 0x0001},
+{0x000D85, 0x0004},
+{0x000D97, 0x0001},
+{0x000D9A, 0x0004},
+{0x000DB2, 0x0001},
+{0x000DB3, 0x0004},
+{0x000DBC, 0x0001},
+{0x000DBD, 0x0004},
+{0x000DBE, 0x0001},
+{0x000DC0, 0x0004},
+{0x000DC7, 0x0001},
+{0x000DCA, 0x0010},
+{0x000DCB, 0x0001},
+{0x000DCF, 0x0010},
+{0x000DD5, 0x0001},
+{0x000DD6, 0x0010},
+{0x000DD7, 0x0001},
+{0x000DD8, 0x0010},
+{0x000DE0, 0x0001},
+{0x000DE6, 0x0002},
+{0x000DF0, 0x0001},
+{0x000DF2, 0x0010},
+{0x000DF4, 0x0020},
+{0x000DF5, 0x0001},
+{0x000E01, 0x0004},
+{0x000E31, 0x0010},
+{0x000E32, 0x0004},
+{0x000E34, 0x0010},
+{0x000E3B, 0x0001},
+{0x000E3F, 0x0040},
+{0x000E40, 0x0004},
+{0x000E47, 0x0010},
+{0x000E4F, 0x0020},
+{0x000E50, 0x0002},
+{0x000E5A, 0x0020},
+{0x000E5C, 0x0001},
+{0x000E81, 0x0004},
+{0x000E83, 0x0001},
+{0x000E84, 0x0004},
+{0x000E85, 0x0001},
+{0x000E86, 0x0004},
+{0x000E8B, 0x0001},
+{0x000E8C, 0x0004},
+{0x000EA4, 0x0001},
+{0x000EA5, 0x0004},
+{0x000EA6, 0x0001},
+{0x000EA7, 0x0004},
+{0x000EB1, 0x0010},
+{0x000EB2, 0x0004},
+{0x000EB4, 0x0010},
+{0x000EBD, 0x0004},
+{0x000EBE, 0x0001},
+{0x000EC0, 0x0004},
+{0x000EC5, 0x0001},
+{0x000EC6, 0x0004},
+{0x000EC7, 0x0001},
+{0x000EC8, 0x0010},
+{0x000ECF, 0x0001},
+{0x000ED0, 0x0002},
+{0x000EDA, 0x0001},
+{0x000EDC, 0x0004},
+{0x000EE0, 0x0001},
+{0x000F00, 0x0004},
+{0x000F01, 0x0040},
+{0x000F04, 0x0020},
+{0x000F13, 0x0040},
+{0x000F14, 0x0020},
+{0x000F15, 0x0040},
+{0x000F18, 0x0010},
+{0x000F1A, 0x0040},
+{0x000F20, 0x0002},
+{0x000F34, 0x0040},
+{0x000F35, 0x0010},
+{0x000F36, 0x0040},
+{0x000F37, 0x0010},
+{0x000F38, 0x0040},
+{0x000F39, 0x0010},
+{0x000F3A, 0x0020},
+{0x000F3E, 0x0010},
+{0x000F40, 0x0004},
+{0x000F48, 0x0001},
+{0x000F49, 0x0004},
+{0x000F6D, 0x0001},
+{0x000F71, 0x0010},
+{0x000F85, 0x0020},
+{0x000F86, 0x0010},
+{0x000F88, 0x0004},
+{0x000F8D, 0x0010},
+{0x000F98, 0x0001},
+{0x000F99, 0x0010},
+{0x000FBD, 0x0001},
+{0x000FBE, 0x0040},
+{0x000FC6, 0x0010},
+{0x000FC7, 0x0040},
+{0x000FCD, 0x0001},
+{0x000FCE, 0x0040},
+{0x000FD0, 0x0020},
+{0x000FD5, 0x0040},
+{0x000FD9, 0x0020},
+{0x000FDB, 0x0001},
+{0x001000, 0x0004},
+{0x00102B, 0x0010},
+{0x00103F, 0x0004},
+{0x001040, 0x0002},
+{0x00104A, 0x0020},
+{0x001050, 0x0004},
+{0x001056, 0x0010},
+{0x00105A, 0x0004},
+{0x00105E, 0x0010},
+{0x001061, 0x0004},
+{0x001062, 0x0010},
+{0x001065, 0x0004},
+{0x001067, 0x0010},
+{0x00106E, 0x0004},
+{0x001071, 0x0010},
+{0x001075, 0x0004},
+{0x001082, 0x0010},
+{0x00108E, 0x0004},
+{0x00108F, 0x0010},
+{0x001090, 0x0002},
+{0x00109A, 0x0010},
+{0x00109E, 0x0040},
+{0x0010A0, 0x0004},
+{0x0010C6, 0x0001},
+{0x0010C7, 0x0004},
+{0x0010C8, 0x0001},
+{0x0010CD, 0x0004},
+{0x0010CE, 0x0001},
+{0x0010D0, 0x0004},
+{0x0010FB, 0x0020},
+{0x0010FC, 0x0004},
+{0x001249, 0x0001},
+{0x00124A, 0x0004},
+{0x00124E, 0x0001},
+{0x001250, 0x0004},
+{0x001257, 0x0001},
+{0x001258, 0x0004},
+{0x001259, 0x0001},
+{0x00125A, 0x0004},
+{0x00125E, 0x0001},
+{0x001260, 0x0004},
+{0x001289, 0x0001},
+{0x00128A, 0x0004},
+{0x00128E, 0x0001},
+{0x001290, 0x0004},
+{0x0012B1, 0x0001},
+{0x0012B2, 0x0004},
+{0x0012B6, 0x0001},
+{0x0012B8, 0x0004},
+{0x0012BF, 0x0001},
+{0x0012C0, 0x0004},
+{0x0012C1, 0x0001},
+{0x0012C2, 0x0004},
+{0x0012C6, 0x0001},
+{0x0012C8, 0x0004},
+{0x0012D7, 0x0001},
+{0x0012D8, 0x0004},
+{0x001311, 0x0001},
+{0x001312, 0x0004},
+{0x001316, 0x0001},
+{0x001318, 0x0004},
+{0x00135B, 0x0001},
+{0x00135D, 0x0010},
+{0x001360, 0x0020},
+{0x001369, 0x0002},
+{0x00137D, 0x0001},
+{0x001380, 0x0004},
+{0x001390, 0x0040},
+{0x00139A, 0x0001},
+{0x0013A0, 0x0004},
+{0x0013F6, 0x0001},
+{0x0013F8, 0x0004},
+{0x0013FE, 0x0001},
+{0x001400, 0x0020},
+{0x001401, 0x0004},
+{0x00166D, 0x0040},
+{0x00166E, 0x0020},
+{0x00166F, 0x0004},
+{0x001680, 0x0008},
+{0x001681, 0x0004},
+{0x00169B, 0x0020},
+{0x00169D, 0x0001},
+{0x0016A0, 0x0004},
+{0x0016EB, 0x0020},
+{0x0016EE, 0x0002},
+{0x0016F1, 0x0004},
+{0x0016F9, 0x0001},
+{0x001700, 0x0004},
+{0x001712, 0x0010},
+{0x001716, 0x0001},
+{0x00171F, 0x0004},
+{0x001732, 0x0010},
+{0x001735, 0x0020},
+{0x001737, 0x0001},
+{0x001740, 0x0004},
+{0x001752, 0x0010},
+{0x001754, 0x0001},
+{0x001760, 0x0004},
+{0x00176D, 0x0001},
+{0x00176E, 0x0004},
+{0x001771, 0x0001},
+{0x001772, 0x0010},
+{0x001774, 0x0001},
+{0x001780, 0x0004},
+{0x0017B4, 0x0010},
+{0x0017D4, 0x0020},
+{0x0017D7, 0x0004},
+{0x0017D8, 0x0020},
+{0x0017DB, 0x0040},
+{0x0017DC, 0x0004},
+{0x0017DD, 0x0010},
+{0x0017DE, 0x0001},
+{0x0017E0, 0x0002},
+{0x0017EA, 0x0001},
+{0x0017F0, 0x0002},
+{0x0017FA, 0x0001},
+{0x001800, 0x0020},
+{0x00180B, 0x0010},
+{0x00180E, 0x0080},
+{0x00180F, 0x0010},
+{0x001810, 0x0002},
+{0x00181A, 0x0001},
+{0x001820, 0x0004},
+{0x001879, 0x0001},
+{0x001880, 0x0004},
+{0x001885, 0x0010},
+{0x001887, 0x0004},
+{0x0018A9, 0x0010},
+{0x0018AA, 0x0004},
+{0x0018AB, 0x0001},
+{0x0018B0, 0x0004},
+{0x0018F6, 0x0001},
+{0x001900, 0x0004},
+{0x00191F, 0x0001},
+{0x001920, 0x0010},
+{0x00192C, 0x0001},
+{0x001930, 0x0010},
+{0x00193C, 0x0001},
+{0x001940, 0x0040},
+{0x001941, 0x0001},
+{0x001944, 0x0020},
+{0x001946, 0x0002},
+{0x001950, 0x0004},
+{0x00196E, 0x0001},
+{0x001970, 0x0004},
+{0x001975, 0x0001},
+{0x001980, 0x0004},
+{0x0019AC, 0x0001},
+{0x0019B0, 0x0004},
+{0x0019CA, 0x0001},
+{0x0019D0, 0x0002},
+{0x0019DB, 0x0001},
+{0x0019DE, 0x0040},
+{0x001A00, 0x0004},
+{0x001A17, 0x0010},
+{0x001A1C, 0x0001},
+{0x001A1E, 0x0020},
+{0x001A20, 0x0004},
+{0x001A55, 0x0010},
+{0x001A5F, 0x0001},
+{0x001A60, 0x0010},
+{0x001A7D, 0x0001},
+{0x001A7F, 0x0010},
+{0x001A80, 0x0002},
+{0x001A8A, 0x0001},
+{0x001A90, 0x0002},
+{0x001A9A, 0x0001},
+{0x001AA0, 0x0020},
+{0x001AA7, 0x0004},
+{0x001AA8, 0x0020},
+{0x001AAE, 0x0001},
+{0x001AB0, 0x0010},
+{0x001ACF, 0x0001},
+{0x001B00, 0x0010},
+{0x001B05, 0x0004},
+{0x001B34, 0x0010},
+{0x001B45, 0x0004},
+{0x001B4D, 0x0001},
+{0x001B50, 0x0002},
+{0x001B5A, 0x0020},
+{0x001B61, 0x0040},
+{0x001B6B, 0x0010},
+{0x001B74, 0x0040},
+{0x001B7D, 0x0020},
+{0x001B7F, 0x0001},
+{0x001B80, 0x0010},
+{0x001B83, 0x0004},
+{0x001BA1, 0x0010},
+{0x001BAE, 0x0004},
+{0x001BB0, 0x0002},
+{0x001BBA, 0x0004},
+{0x001BE6, 0x0010},
+{0x001BF4, 0x0001},
+{0x001BFC, 0x0020},
+{0x001C00, 0x0004},
+{0x001C24, 0x0010},
+{0x001C38, 0x0001},
+{0x001C3B, 0x0020},
+{0x001C40, 0x0002},
+{0x001C4A, 0x0001},
+{0x001C4D, 0x0004},
+{0x001C50, 0x0002},
+{0x001C5A, 0x0004},
+{0x001C7E, 0x0020},
+{0x001C80, 0x0004},
+{0x001C89, 0x0001},
+{0x001C90, 0x0004},
+{0x001CBB, 0x0001},
+{0x001CBD, 0x0004},
+{0x001CC0, 0x0020},
+{0x001CC8, 0x0001},
+{0x001CD0, 0x0010},
+{0x001CD3, 0x0020},
+{0x001CD4, 0x0010},
+{0x001CE9, 0x0004},
+{0x001CED, 0x0010},
+{0x001CEE, 0x0004},
+{0x001CF4, 0x0010},
+{0x001CF5, 0x0004},
+{0x001CF7, 0x0010},
+{0x001CFA, 0x0004},
+{0x001CFB, 0x0001},
+{0x001D00, 0x0004},
+{0x001DC0, 0x0010},
+{0x001E00, 0x0004},
+{0x001F16, 0x0001},
+{0x001F18, 0x0004},
+{0x001F1E, 0x0001},
+{0x001F20, 0x0004},
+{0x001F46, 0x0001},
+{0x001F48, 0x0004},
+{0x001F4E, 0x0001},
+{0x001F50, 0x0004},
+{0x001F58, 0x0001},
+{0x001F59, 0x0004},
+{0x001F5A, 0x0001},
+{0x001F5B, 0x0004},
+{0x001F5C, 0x0001},
+{0x001F5D, 0x0004},
+{0x001F5E, 0x0001},
+{0x001F5F, 0x0004},
+{0x001F7E, 0x0001},
+{0x001F80, 0x0004},
+{0x001FB5, 0x0001},
+{0x001FB6, 0x0004},
+{0x001FBD, 0x0040},
+{0x001FBE, 0x0004},
+{0x001FBF, 0x0040},
+{0x001FC2, 0x0004},
+{0x001FC5, 0x0001},
+{0x001FC6, 0x0004},
+{0x001FCD, 0x0040},
+{0x001FD0, 0x0004},
+{0x001FD4, 0x0001},
+{0x001FD6, 0x0004},
+{0x001FDC, 0x0001},
+{0x001FDD, 0x0040},
+{0x001FE0, 0x0004},
+{0x001FED, 0x0040},
+{0x001FF0, 0x0001},
+{0x001FF2, 0x0004},
+{0x001FF5, 0x0001},
+{0x001FF6, 0x0004},
+{0x001FFD, 0x0040},
+{0x001FFF, 0x0001},
+{0x002000, 0x0008},
+{0x00200B, 0x0080},
+{0x002010, 0x0020},
+{0x002028, 0x0008},
+{0x00202A, 0x0080},
+{0x00202F, 0x0008},
+{0x002030, 0x0020},
+{0x002044, 0x0040},
+{0x002045, 0x0020},
+{0x002052, 0x0040},
+{0x002053, 0x0020},
+{0x00205F, 0x0008},
+{0x002060, 0x0080},
+{0x002065, 0x0001},
+{0x002066, 0x0080},
+{0x002070, 0x0002},
+{0x002071, 0x0004},
+{0x002072, 0x0001},
+{0x002074, 0x0002},
+{0x00207A, 0x0040},
+{0x00207D, 0x0020},
+{0x00207F, 0x0004},
+{0x002080, 0x0002},
+{0x00208A, 0x0040},
+{0x00208D, 0x0020},
+{0x00208F, 0x0001},
+{0x002090, 0x0004},
+{0x00209D, 0x0001},
+{0x0020A0, 0x0040},
+{0x0020C1, 0x0001},
+{0x0020D0, 0x0010},
+{0x0020F1, 0x0001},
+{0x002100, 0x0040},
+{0x002102, 0x0004},
+{0x002103, 0x0040},
+{0x002107, 0x0004},
+{0x002108, 0x0040},
+{0x00210A, 0x0004},
+{0x002114, 0x0040},
+{0x002115, 0x0004},
+{0x002116, 0x0040},
+{0x002119, 0x0004},
+{0x00211E, 0x0040},
+{0x002124, 0x0004},
+{0x002125, 0x0040},
+{0x002126, 0x0004},
+{0x002127, 0x0040},
+{0x002128, 0x0004},
+{0x002129, 0x0040},
+{0x00212A, 0x0004},
+{0x00212E, 0x0040},
+{0x00212F, 0x0004},
+{0x00213A, 0x0040},
+{0x00213C, 0x0004},
+{0x002140, 0x0040},
+{0x002145, 0x0004},
+{0x00214A, 0x0040},
+{0x00214E, 0x0004},
+{0x00214F, 0x0040},
+{0x002150, 0x0002},
+{0x002183, 0x0004},
+{0x002185, 0x0002},
+{0x00218A, 0x0040},
+{0x00218C, 0x0001},
+{0x002190, 0x0040},
+{0x002308, 0x0020},
+{0x00230C, 0x0040},
+{0x002329, 0x0020},
+{0x00232B, 0x0040},
+{0x002427, 0x0001},
+{0x002440, 0x0040},
+{0x00244B, 0x0001},
+{0x002460, 0x0002},
+{0x00249C, 0x0040},
+{0x0024EA, 0x0002},
+{0x002500, 0x0040},
+{0x002768, 0x0020},
+{0x002776, 0x0002},
+{0x002794, 0x0040},
+{0x0027C5, 0x0020},
+{0x0027C7, 0x0040},
+{0x0027E6, 0x0020},
+{0x0027F0, 0x0040},
+{0x002983, 0x0020},
+{0x002999, 0x0040},
+{0x0029D8, 0x0020},
+{0x0029DC, 0x0040},
+{0x0029FC, 0x0020},
+{0x0029FE, 0x0040},
+{0x002B74, 0x0001},
+{0x002B76, 0x0040},
+{0x002B96, 0x0001},
+{0x002B97, 0x0040},
+{0x002C00, 0x0004},
+{0x002CE5, 0x0040},
+{0x002CEB, 0x0004},
+{0x002CEF, 0x0010},
+{0x002CF2, 0x0004},
+{0x002CF4, 0x0001},
+{0x002CF9, 0x0020},
+{0x002CFD, 0x0002},
+{0x002CFE, 0x0020},
+{0x002D00, 0x0004},
+{0x002D26, 0x0001},
+{0x002D27, 0x0004},
+{0x002D28, 0x0001},
+{0x002D2D, 0x0004},
+{0x002D2E, 0x0001},
+{0x002D30, 0x0004},
+{0x002D68, 0x0001},
+{0x002D6F, 0x0004},
+{0x002D70, 0x0020},
+{0x002D71, 0x0001},
+{0x002D7F, 0x0010},
+{0x002D80, 0x0004},
+{0x002D97, 0x0001},
+{0x002DA0, 0x0004},
+{0x002DA7, 0x0001},
+{0x002DA8, 0x0004},
+{0x002DAF, 0x0001},
+{0x002DB0, 0x0004},
+{0x002DB7, 0x0001},
+{0x002DB8, 0x0004},
+{0x002DBF, 0x0001},
+{0x002DC0, 0x0004},
+{0x002DC7, 0x0001},
+{0x002DC8, 0x0004},
+{0x002DCF, 0x0001},
+{0x002DD0, 0x0004},
+{0x002DD7, 0x0001},
+{0x002DD8, 0x0004},
+{0x002DDF, 0x0001},
+{0x002DE0, 0x0010},
+{0x002E00, 0x0020},
+{0x002E2F, 0x0004},
+{0x002E30, 0x0020},
+{0x002E50, 0x0040},
+{0x002E52, 0x0020},
+{0x002E5E, 0x0001},
+{0x002E80, 0x0040},
+{0x002E9A, 0x0001},
+{0x002E9B, 0x0040},
+{0x002EF4, 0x0001},
+{0x002F00, 0x0040},
+{0x002FD6, 0x0001},
+{0x002FF0, 0x0040},
+{0x003000, 0x0008},
+{0x003001, 0x0020},
+{0x003004, 0x0040},
+{0x003005, 0x0004},
+{0x003007, 0x0002},
+{0x003008, 0x0020},
+{0x003012, 0x0040},
+{0x003014, 0x0020},
+{0x003020, 0x0040},
+{0x003021, 0x0002},
+{0x00302A, 0x0010},
+{0x003030, 0x0020},
+{0x003031, 0x0004},
+{0x003036, 0x0040},
+{0x003038, 0x0002},
+{0x00303B, 0x0004},
+{0x00303D, 0x0020},
+{0x00303E, 0x0040},
+{0x003040, 0x0001},
+{0x003041, 0x0004},
+{0x003097, 0x0001},
+{0x003099, 0x0010},
+{0x00309B, 0x0040},
+{0x00309D, 0x0004},
+{0x0030A0, 0x0020},
+{0x0030A1, 0x0004},
+{0x0030FB, 0x0020},
+{0x0030FC, 0x0004},
+{0x003100, 0x0001},
+{0x003105, 0x0004},
+{0x003130, 0x0001},
+{0x003131, 0x0004},
+{0x00318F, 0x0001},
+{0x003190, 0x0040},
+{0x003192, 0x0002},
+{0x003196, 0x0040},
+{0x0031A0, 0x0004},
+{0x0031C0, 0x0040},
+{0x0031E4, 0x0001},
+{0x0031EF, 0x0040},
+{0x0031F0, 0x0004},
+{0x003200, 0x0040},
+{0x00321F, 0x0001},
+{0x003220, 0x0002},
+{0x00322A, 0x0040},
+{0x003248, 0x0002},
+{0x003250, 0x0040},
+{0x003251, 0x0002},
+{0x003260, 0x0040},
+{0x003280, 0x0002},
+{0x00328A, 0x0040},
+{0x0032B1, 0x0002},
+{0x0032C0, 0x0040},
+{0x003400, 0x0004},
+{0x004DC0, 0x0040},
+{0x004E00, 0x0004},
+{0x00A48D, 0x0001},
+{0x00A490, 0x0040},
+{0x00A4C7, 0x0001},
+{0x00A4D0, 0x0004},
+{0x00A4FE, 0x0020},
+{0x00A500, 0x0004},
+{0x00A60D, 0x0020},
+{0x00A610, 0x0004},
+{0x00A620, 0x0002},
+{0x00A62A, 0x0004},
+{0x00A62C, 0x0001},
+{0x00A640, 0x0004},
+{0x00A66F, 0x0010},
+{0x00A673, 0x0020},
+{0x00A674, 0x0010},
+{0x00A67E, 0x0020},
+{0x00A67F, 0x0004},
+{0x00A69E, 0x0010},
+{0x00A6A0, 0x0004},
+{0x00A6E6, 0x0002},
+{0x00A6F0, 0x0010},
+{0x00A6F2, 0x0020},
+{0x00A6F8, 0x0001},
+{0x00A700, 0x0040},
+{0x00A717, 0x0004},
+{0x00A720, 0x0040},
+{0x00A722, 0x0004},
+{0x00A789, 0x0040},
+{0x00A78B, 0x0004},
+{0x00A7CB, 0x0001},
+{0x00A7D0, 0x0004},
+{0x00A7D2, 0x0001},
+{0x00A7D3, 0x0004},
+{0x00A7D4, 0x0001},
+{0x00A7D5, 0x0004},
+{0x00A7DA, 0x0001},
+{0x00A7F2, 0x0004},
+{0x00A802, 0x0010},
+{0x00A803, 0x0004},
+{0x00A806, 0x0010},
+{0x00A807, 0x0004},
+{0x00A80B, 0x0010},
+{0x00A80C, 0x0004},
+{0x00A823, 0x0010},
+{0x00A828, 0x0040},
+{0x00A82C, 0x0010},
+{0x00A82D, 0x0001},
+{0x00A830, 0x0002},
+{0x00A836, 0x0040},
+{0x00A83A, 0x0001},
+{0x00A840, 0x0004},
+{0x00A874, 0x0020},
+{0x00A878, 0x0001},
+{0x00A880, 0x0010},
+{0x00A882, 0x0004},
+{0x00A8B4, 0x0010},
+{0x00A8C6, 0x0001},
+{0x00A8CE, 0x0020},
+{0x00A8D0, 0x0002},
+{0x00A8DA, 0x0001},
+{0x00A8E0, 0x0010},
+{0x00A8F2, 0x0004},
+{0x00A8F8, 0x0020},
+{0x00A8FB, 0x0004},
+{0x00A8FC, 0x0020},
+{0x00A8FD, 0x0004},
+{0x00A8FF, 0x0010},
+{0x00A900, 0x0002},
+{0x00A90A, 0x0004},
+{0x00A926, 0x0010},
+{0x00A92E, 0x0020},
+{0x00A930, 0x0004},
+{0x00A947, 0x0010},
+{0x00A954, 0x0001},
+{0x00A95F, 0x0020},
+{0x00A960, 0x0004},
+{0x00A97D, 0x0001},
+{0x00A980, 0x0010},
+{0x00A984, 0x0004},
+{0x00A9B3, 0x0010},
+{0x00A9C1, 0x0020},
+{0x00A9CE, 0x0001},
+{0x00A9CF, 0x0004},
+{0x00A9D0, 0x0002},
+{0x00A9DA, 0x0001},
+{0x00A9DE, 0x0020},
+{0x00A9E0, 0x0004},
+{0x00A9E5, 0x0010},
+{0x00A9E6, 0x0004},
+{0x00A9F0, 0x0002},
+{0x00A9FA, 0x0004},
+{0x00A9FF, 0x0001},
+{0x00AA00, 0x0004},
+{0x00AA29, 0x0010},
+{0x00AA37, 0x0001},
+{0x00AA40, 0x0004},
+{0x00AA43, 0x0010},
+{0x00AA44, 0x0004},
+{0x00AA4C, 0x0010},
+{0x00AA4E, 0x0001},
+{0x00AA50, 0x0002},
+{0x00AA5A, 0x0001},
+{0x00AA5C, 0x0020},
+{0x00AA60, 0x0004},
+{0x00AA77, 0x0040},
+{0x00AA7A, 0x0004},
+{0x00AA7B, 0x0010},
+{0x00AA7E, 0x0004},
+{0x00AAB0, 0x0010},
+{0x00AAB1, 0x0004},
+{0x00AAB2, 0x0010},
+{0x00AAB5, 0x0004},
+{0x00AAB7, 0x0010},
+{0x00AAB9, 0x0004},
+{0x00AABE, 0x0010},
+{0x00AAC0, 0x0004},
+{0x00AAC1, 0x0010},
+{0x00AAC2, 0x0004},
+{0x00AAC3, 0x0001},
+{0x00AADB, 0x0004},
+{0x00AADE, 0x0020},
+{0x00AAE0, 0x0004},
+{0x00AAEB, 0x0010},
+{0x00AAF0, 0x0020},
+{0x00AAF2, 0x0004},
+{0x00AAF5, 0x0010},
+{0x00AAF7, 0x0001},
+{0x00AB01, 0x0004},
+{0x00AB07, 0x0001},
+{0x00AB09, 0x0004},
+{0x00AB0F, 0x0001},
+{0x00AB11, 0x0004},
+{0x00AB17, 0x0001},
+{0x00AB20, 0x0004},
+{0x00AB27, 0x0001},
+{0x00AB28, 0x0004},
+{0x00AB2F, 0x0001},
+{0x00AB30, 0x0004},
+{0x00AB5B, 0x0040},
+{0x00AB5C, 0x0004},
+{0x00AB6A, 0x0040},
+{0x00AB6C, 0x0001},
+{0x00AB70, 0x0004},
+{0x00ABE3, 0x0010},
+{0x00ABEB, 0x0020},
+{0x00ABEC, 0x0010},
+{0x00ABEE, 0x0001},
+{0x00ABF0, 0x0002},
+{0x00ABFA, 0x0001},
+{0x00AC00, 0x0004},
+{0x00D7A4, 0x0001},
+{0x00D7B0, 0x0004},
+{0x00D7C7, 0x0001},
+{0x00D7CB, 0x0004},
+{0x00D7FC, 0x0001},
+{0x00D800, 0x0080},
+{0x00F900, 0x0004},
+{0x00FA6E, 0x0001},
+{0x00FA70, 0x0004},
+{0x00FADA, 0x0001},
+{0x00FB00, 0x0004},
+{0x00FB07, 0x0001},
+{0x00FB13, 0x0004},
+{0x00FB18, 0x0001},
+{0x00FB1D, 0x0004},
+{0x00FB1E, 0x0010},
+{0x00FB1F, 0x0004},
+{0x00FB29, 0x0040},
+{0x00FB2A, 0x0004},
+{0x00FB37, 0x0001},
+{0x00FB38, 0x0004},
+{0x00FB3D, 0x0001},
+{0x00FB3E, 0x0004},
+{0x00FB3F, 0x0001},
+{0x00FB40, 0x0004},
+{0x00FB42, 0x0001},
+{0x00FB43, 0x0004},
+{0x00FB45, 0x0001},
+{0x00FB46, 0x0004},
+{0x00FBB2, 0x0040},
+{0x00FBC3, 0x0001},
+{0x00FBD3, 0x0004},
+{0x00FD3E, 0x0020},
+{0x00FD40, 0x0040},
+{0x00FD50, 0x0004},
+{0x00FD90, 0x0001},
+{0x00FD92, 0x0004},
+{0x00FDC8, 0x0001},
+{0x00FDCF, 0x0040},
+{0x00FDD0, 0x0001},
+{0x00FDF0, 0x0004},
+{0x00FDFC, 0x0040},
+{0x00FE00, 0x0010},
+{0x00FE10, 0x0020},
+{0x00FE1A, 0x0001},
+{0x00FE20, 0x0010},
+{0x00FE30, 0x0020},
+{0x00FE53, 0x0001},
+{0x00FE54, 0x0020},
+{0x00FE62, 0x0040},
+{0x00FE63, 0x0020},
+{0x00FE64, 0x0040},
+{0x00FE67, 0x0001},
+{0x00FE68, 0x0020},
+{0x00FE69, 0x0040},
+{0x00FE6A, 0x0020},
+{0x00FE6C, 0x0001},
+{0x00FE70, 0x0004},
+{0x00FE75, 0x0001},
+{0x00FE76, 0x0004},
+{0x00FEFD, 0x0001},
+{0x00FEFF, 0x0080},
+{0x00FF00, 0x0001},
+{0x00FF01, 0x0020},
+{0x00FF04, 0x0040},
+{0x00FF05, 0x0020},
+{0x00FF0B, 0x0040},
+{0x00FF0C, 0x0020},
+{0x00FF10, 0x0002},
+{0x00FF1A, 0x0020},
+{0x00FF1C, 0x0040},
+{0x00FF1F, 0x0020},
+{0x00FF21, 0x0004},
+{0x00FF3B, 0x0020},
+{0x00FF3E, 0x0040},
+{0x00FF3F, 0x0020},
+{0x00FF40, 0x0040},
+{0x00FF41, 0x0004},
+{0x00FF5B, 0x0020},
+{0x00FF5C, 0x0040},
+{0x00FF5D, 0x0020},
+{0x00FF5E, 0x0040},
+{0x00FF5F, 0x0020},
+{0x00FF66, 0x0004},
+{0x00FFBF, 0x0001},
+{0x00FFC2, 0x0004},
+{0x00FFC8, 0x0001},
+{0x00FFCA, 0x0004},
+{0x00FFD0, 0x0001},
+{0x00FFD2, 0x0004},
+{0x00FFD8, 0x0001},
+{0x00FFDA, 0x0004},
+{0x00FFDD, 0x0001},
+{0x00FFE0, 0x0040},
+{0x00FFE7, 0x0001},
+{0x00FFE8, 0x0040},
+{0x00FFEF, 0x0001},
+{0x00FFF9, 0x0080},
+{0x00FFFC, 0x0040},
+{0x00FFFE, 0x0001},
+{0x010000, 0x0004},
+{0x01000C, 0x0001},
+{0x01000D, 0x0004},
+{0x010027, 0x0001},
+{0x010028, 0x0004},
+{0x01003B, 0x0001},
+{0x01003C, 0x0004},
+{0x01003E, 0x0001},
+{0x01003F, 0x0004},
+{0x01004E, 0x0001},
+{0x010050, 0x0004},
+{0x01005E, 0x0001},
+{0x010080, 0x0004},
+{0x0100FB, 0x0001},
+{0x010100, 0x0020},
+{0x010103, 0x0001},
+{0x010107, 0x0002},
+{0x010134, 0x0001},
+{0x010137, 0x0040},
+{0x010140, 0x0002},
+{0x010179, 0x0040},
+{0x01018A, 0x0002},
+{0x01018C, 0x0040},
+{0x01018F, 0x0001},
+{0x010190, 0x0040},
+{0x01019D, 0x0001},
+{0x0101A0, 0x0040},
+{0x0101A1, 0x0001},
+{0x0101D0, 0x0040},
+{0x0101FD, 0x0010},
+{0x0101FE, 0x0001},
+{0x010280, 0x0004},
+{0x01029D, 0x0001},
+{0x0102A0, 0x0004},
+{0x0102D1, 0x0001},
+{0x0102E0, 0x0010},
+{0x0102E1, 0x0002},
+{0x0102FC, 0x0001},
+{0x010300, 0x0004},
+{0x010320, 0x0002},
+{0x010324, 0x0001},
+{0x01032D, 0x0004},
+{0x010341, 0x0002},
+{0x010342, 0x0004},
+{0x01034A, 0x0002},
+{0x01034B, 0x0001},
+{0x010350, 0x0004},
+{0x010376, 0x0010},
+{0x01037B, 0x0001},
+{0x010380, 0x0004},
+{0x01039E, 0x0001},
+{0x01039F, 0x0020},
+{0x0103A0, 0x0004},
+{0x0103C4, 0x0001},
+{0x0103C8, 0x0004},
+{0x0103D0, 0x0020},
+{0x0103D1, 0x0002},
+{0x0103D6, 0x0001},
+{0x010400, 0x0004},
+{0x01049E, 0x0001},
+{0x0104A0, 0x0002},
+{0x0104AA, 0x0001},
+{0x0104B0, 0x0004},
+{0x0104D4, 0x0001},
+{0x0104D8, 0x0004},
+{0x0104FC, 0x0001},
+{0x010500, 0x0004},
+{0x010528, 0x0001},
+{0x010530, 0x0004},
+{0x010564, 0x0001},
+{0x01056F, 0x0020},
+{0x010570, 0x0004},
+{0x01057B, 0x0001},
+{0x01057C, 0x0004},
+{0x01058B, 0x0001},
+{0x01058C, 0x0004},
+{0x010593, 0x0001},
+{0x010594, 0x0004},
+{0x010596, 0x0001},
+{0x010597, 0x0004},
+{0x0105A2, 0x0001},
+{0x0105A3, 0x0004},
+{0x0105B2, 0x0001},
+{0x0105B3, 0x0004},
+{0x0105BA, 0x0001},
+{0x0105BB, 0x0004},
+{0x0105BD, 0x0001},
+{0x010600, 0x0004},
+{0x010737, 0x0001},
+{0x010740, 0x0004},
+{0x010756, 0x0001},
+{0x010760, 0x0004},
+{0x010768, 0x0001},
+{0x010780, 0x0004},
+{0x010786, 0x0001},
+{0x010787, 0x0004},
+{0x0107B1, 0x0001},
+{0x0107B2, 0x0004},
+{0x0107BB, 0x0001},
+{0x010800, 0x0004},
+{0x010806, 0x0001},
+{0x010808, 0x0004},
+{0x010809, 0x0001},
+{0x01080A, 0x0004},
+{0x010836, 0x0001},
+{0x010837, 0x0004},
+{0x010839, 0x0001},
+{0x01083C, 0x0004},
+{0x01083D, 0x0001},
+{0x01083F, 0x0004},
+{0x010856, 0x0001},
+{0x010857, 0x0020},
+{0x010858, 0x0002},
+{0x010860, 0x0004},
+{0x010877, 0x0040},
+{0x010879, 0x0002},
+{0x010880, 0x0004},
+{0x01089F, 0x0001},
+{0x0108A7, 0x0002},
+{0x0108B0, 0x0001},
+{0x0108E0, 0x0004},
+{0x0108F3, 0x0001},
+{0x0108F4, 0x0004},
+{0x0108F6, 0x0001},
+{0x0108FB, 0x0002},
+{0x010900, 0x0004},
+{0x010916, 0x0002},
+{0x01091C, 0x0001},
+{0x01091F, 0x0020},
+{0x010920, 0x0004},
+{0x01093A, 0x0001},
+{0x01093F, 0x0020},
+{0x010940, 0x0001},
+{0x010980, 0x0004},
+{0x0109B8, 0x0001},
+{0x0109BC, 0x0002},
+{0x0109BE, 0x0004},
+{0x0109C0, 0x0002},
+{0x0109D0, 0x0001},
+{0x0109D2, 0x0002},
+{0x010A00, 0x0004},
+{0x010A01, 0x0010},
+{0x010A04, 0x0001},
+{0x010A05, 0x0010},
+{0x010A07, 0x0001},
+{0x010A0C, 0x0010},
+{0x010A10, 0x0004},
+{0x010A14, 0x0001},
+{0x010A15, 0x0004},
+{0x010A18, 0x0001},
+{0x010A19, 0x0004},
+{0x010A36, 0x0001},
+{0x010A38, 0x0010},
+{0x010A3B, 0x0001},
+{0x010A3F, 0x0010},
+{0x010A40, 0x0002},
+{0x010A49, 0x0001},
+{0x010A50, 0x0020},
+{0x010A59, 0x0001},
+{0x010A60, 0x0004},
+{0x010A7D, 0x0002},
+{0x010A7F, 0x0020},
+{0x010A80, 0x0004},
+{0x010A9D, 0x0002},
+{0x010AA0, 0x0001},
+{0x010AC0, 0x0004},
+{0x010AC8, 0x0040},
+{0x010AC9, 0x0004},
+{0x010AE5, 0x0010},
+{0x010AE7, 0x0001},
+{0x010AEB, 0x0002},
+{0x010AF0, 0x0020},
+{0x010AF7, 0x0001},
+{0x010B00, 0x0004},
+{0x010B36, 0x0001},
+{0x010B39, 0x0020},
+{0x010B40, 0x0004},
+{0x010B56, 0x0001},
+{0x010B58, 0x0002},
+{0x010B60, 0x0004},
+{0x010B73, 0x0001},
+{0x010B78, 0x0002},
+{0x010B80, 0x0004},
+{0x010B92, 0x0001},
+{0x010B99, 0x0020},
+{0x010B9D, 0x0001},
+{0x010BA9, 0x0002},
+{0x010BB0, 0x0001},
+{0x010C00, 0x0004},
+{0x010C49, 0x0001},
+{0x010C80, 0x0004},
+{0x010CB3, 0x0001},
+{0x010CC0, 0x0004},
+{0x010CF3, 0x0001},
+{0x010CFA, 0x0002},
+{0x010D00, 0x0004},
+{0x010D24, 0x0010},
+{0x010D28, 0x0001},
+{0x010D30, 0x0002},
+{0x010D3A, 0x0001},
+{0x010E60, 0x0002},
+{0x010E7F, 0x0001},
+{0x010E80, 0x0004},
+{0x010EAA, 0x0001},
+{0x010EAB, 0x0010},
+{0x010EAD, 0x0020},
+{0x010EAE, 0x0001},
+{0x010EB0, 0x0004},
+{0x010EB2, 0x0001},
+{0x010EFD, 0x0010},
+{0x010F00, 0x0004},
+{0x010F1D, 0x0002},
+{0x010F27, 0x0004},
+{0x010F28, 0x0001},
+{0x010F30, 0x0004},
+{0x010F46, 0x0010},
+{0x010F51, 0x0002},
+{0x010F55, 0x0020},
+{0x010F5A, 0x0001},
+{0x010F70, 0x0004},
+{0x010F82, 0x0010},
+{0x010F86, 0x0020},
+{0x010F8A, 0x0001},
+{0x010FB0, 0x0004},
+{0x010FC5, 0x0002},
+{0x010FCC, 0x0001},
+{0x010FE0, 0x0004},
+{0x010FF7, 0x0001},
+{0x011000, 0x0010},
+{0x011003, 0x0004},
+{0x011038, 0x0010},
+{0x011047, 0x0020},
+{0x01104E, 0x0001},
+{0x011052, 0x0002},
+{0x011070, 0x0010},
+{0x011071, 0x0004},
+{0x011073, 0x0010},
+{0x011075, 0x0004},
+{0x011076, 0x0001},
+{0x01107F, 0x0010},
+{0x011083, 0x0004},
+{0x0110B0, 0x0010},
+{0x0110BB, 0x0020},
+{0x0110BD, 0x0080},
+{0x0110BE, 0x0020},
+{0x0110C2, 0x0010},
+{0x0110C3, 0x0001},
+{0x0110CD, 0x0080},
+{0x0110CE, 0x0001},
+{0x0110D0, 0x0004},
+{0x0110E9, 0x0001},
+{0x0110F0, 0x0002},
+{0x0110FA, 0x0001},
+{0x011100, 0x0010},
+{0x011103, 0x0004},
+{0x011127, 0x0010},
+{0x011135, 0x0001},
+{0x011136, 0x0002},
+{0x011140, 0x0020},
+{0x011144, 0x0004},
+{0x011145, 0x0010},
+{0x011147, 0x0004},
+{0x011148, 0x0001},
+{0x011150, 0x0004},
+{0x011173, 0x0010},
+{0x011174, 0x0020},
+{0x011176, 0x0004},
+{0x011177, 0x0001},
+{0x011180, 0x0010},
+{0x011183, 0x0004},
+{0x0111B3, 0x0010},
+{0x0111C1, 0x0004},
+{0x0111C5, 0x0020},
+{0x0111C9, 0x0010},
+{0x0111CD, 0x0020},
+{0x0111CE, 0x0010},
+{0x0111D0, 0x0002},
+{0x0111DA, 0x0004},
+{0x0111DB, 0x0020},
+{0x0111DC, 0x0004},
+{0x0111DD, 0x0020},
+{0x0111E0, 0x0001},
+{0x0111E1, 0x0002},
+{0x0111F5, 0x0001},
+{0x011200, 0x0004},
+{0x011212, 0x0001},
+{0x011213, 0x0004},
+{0x01122C, 0x0010},
+{0x011238, 0x0020},
+{0x01123E, 0x0010},
+{0x01123F, 0x0004},
+{0x011241, 0x0010},
+{0x011242, 0x0001},
+{0x011280, 0x0004},
+{0x011287, 0x0001},
+{0x011288, 0x0004},
+{0x011289, 0x0001},
+{0x01128A, 0x0004},
+{0x01128E, 0x0001},
+{0x01128F, 0x0004},
+{0x01129E, 0x0001},
+{0x01129F, 0x0004},
+{0x0112A9, 0x0020},
+{0x0112AA, 0x0001},
+{0x0112B0, 0x0004},
+{0x0112DF, 0x0010},
+{0x0112EB, 0x0001},
+{0x0112F0, 0x0002},
+{0x0112FA, 0x0001},
+{0x011300, 0x0010},
+{0x011304, 0x0001},
+{0x011305, 0x0004},
+{0x01130D, 0x0001},
+{0x01130F, 0x0004},
+{0x011311, 0x0001},
+{0x011313, 0x0004},
+{0x011329, 0x0001},
+{0x01132A, 0x0004},
+{0x011331, 0x0001},
+{0x011332, 0x0004},
+{0x011334, 0x0001},
+{0x011335, 0x0004},
+{0x01133A, 0x0001},
+{0x01133B, 0x0010},
+{0x01133D, 0x0004},
+{0x01133E, 0x0010},
+{0x011345, 0x0001},
+{0x011347, 0x0010},
+{0x011349, 0x0001},
+{0x01134B, 0x0010},
+{0x01134E, 0x0001},
+{0x011350, 0x0004},
+{0x011351, 0x0001},
+{0x011357, 0x0010},
+{0x011358, 0x0001},
+{0x01135D, 0x0004},
+{0x011362, 0x0010},
+{0x011364, 0x0001},
+{0x011366, 0x0010},
+{0x01136D, 0x0001},
+{0x011370, 0x0010},
+{0x011375, 0x0001},
+{0x011400, 0x0004},
+{0x011435, 0x0010},
+{0x011447, 0x0004},
+{0x01144B, 0x0020},
+{0x011450, 0x0002},
+{0x01145A, 0x0020},
+{0x01145C, 0x0001},
+{0x01145D, 0x0020},
+{0x01145E, 0x0010},
+{0x01145F, 0x0004},
+{0x011462, 0x0001},
+{0x011480, 0x0004},
+{0x0114B0, 0x0010},
+{0x0114C4, 0x0004},
+{0x0114C6, 0x0020},
+{0x0114C7, 0x0004},
+{0x0114C8, 0x0001},
+{0x0114D0, 0x0002},
+{0x0114DA, 0x0001},
+{0x011580, 0x0004},
+{0x0115AF, 0x0010},
+{0x0115B6, 0x0001},
+{0x0115B8, 0x0010},
+{0x0115C1, 0x0020},
+{0x0115D8, 0x0004},
+{0x0115DC, 0x0010},
+{0x0115DE, 0x0001},
+{0x011600, 0x0004},
+{0x011630, 0x0010},
+{0x011641, 0x0020},
+{0x011644, 0x0004},
+{0x011645, 0x0001},
+{0x011650, 0x0002},
+{0x01165A, 0x0001},
+{0x011660, 0x0020},
+{0x01166D, 0x0001},
+{0x011680, 0x0004},
+{0x0116AB, 0x0010},
+{0x0116B8, 0x0004},
+{0x0116B9, 0x0020},
+{0x0116BA, 0x0001},
+{0x0116C0, 0x0002},
+{0x0116CA, 0x0001},
+{0x011700, 0x0004},
+{0x01171B, 0x0001},
+{0x01171D, 0x0010},
+{0x01172C, 0x0001},
+{0x011730, 0x0002},
+{0x01173C, 0x0020},
+{0x01173F, 0x0040},
+{0x011740, 0x0004},
+{0x011747, 0x0001},
+{0x011800, 0x0004},
+{0x01182C, 0x0010},
+{0x01183B, 0x0020},
+{0x01183C, 0x0001},
+{0x0118A0, 0x0004},
+{0x0118E0, 0x0002},
+{0x0118F3, 0x0001},
+{0x0118FF, 0x0004},
+{0x011907, 0x0001},
+{0x011909, 0x0004},
+{0x01190A, 0x0001},
+{0x01190C, 0x0004},
+{0x011914, 0x0001},
+{0x011915, 0x0004},
+{0x011917, 0x0001},
+{0x011918, 0x0004},
+{0x011930, 0x0010},
+{0x011936, 0x0001},
+{0x011937, 0x0010},
+{0x011939, 0x0001},
+{0x01193B, 0x0010},
+{0x01193F, 0x0004},
+{0x011940, 0x0010},
+{0x011941, 0x0004},
+{0x011942, 0x0010},
+{0x011944, 0x0020},
+{0x011947, 0x0001},
+{0x011950, 0x0002},
+{0x01195A, 0x0001},
+{0x0119A0, 0x0004},
+{0x0119A8, 0x0001},
+{0x0119AA, 0x0004},
+{0x0119D1, 0x0010},
+{0x0119D8, 0x0001},
+{0x0119DA, 0x0010},
+{0x0119E1, 0x0004},
+{0x0119E2, 0x0020},
+{0x0119E3, 0x0004},
+{0x0119E4, 0x0010},
+{0x0119E5, 0x0001},
+{0x011A00, 0x0004},
+{0x011A01, 0x0010},
+{0x011A0B, 0x0004},
+{0x011A33, 0x0010},
+{0x011A3A, 0x0004},
+{0x011A3B, 0x0010},
+{0x011A3F, 0x0020},
+{0x011A47, 0x0010},
+{0x011A48, 0x0001},
+{0x011A50, 0x0004},
+{0x011A51, 0x0010},
+{0x011A5C, 0x0004},
+{0x011A8A, 0x0010},
+{0x011A9A, 0x0020},
+{0x011A9D, 0x0004},
+{0x011A9E, 0x0020},
+{0x011AA3, 0x0001},
+{0x011AB0, 0x0004},
+{0x011AF9, 0x0001},
+{0x011B00, 0x0020},
+{0x011B0A, 0x0001},
+{0x011C00, 0x0004},
+{0x011C09, 0x0001},
+{0x011C0A, 0x0004},
+{0x011C2F, 0x0010},
+{0x011C37, 0x0001},
+{0x011C38, 0x0010},
+{0x011C40, 0x0004},
+{0x011C41, 0x0020},
+{0x011C46, 0x0001},
+{0x011C50, 0x0002},
+{0x011C6D, 0x0001},
+{0x011C70, 0x0020},
+{0x011C72, 0x0004},
+{0x011C90, 0x0001},
+{0x011C92, 0x0010},
+{0x011CA8, 0x0001},
+{0x011CA9, 0x0010},
+{0x011CB7, 0x0001},
+{0x011D00, 0x0004},
+{0x011D07, 0x0001},
+{0x011D08, 0x0004},
+{0x011D0A, 0x0001},
+{0x011D0B, 0x0004},
+{0x011D31, 0x0010},
+{0x011D37, 0x0001},
+{0x011D3A, 0x0010},
+{0x011D3B, 0x0001},
+{0x011D3C, 0x0010},
+{0x011D3E, 0x0001},
+{0x011D3F, 0x0010},
+{0x011D46, 0x0004},
+{0x011D47, 0x0010},
+{0x011D48, 0x0001},
+{0x011D50, 0x0002},
+{0x011D5A, 0x0001},
+{0x011D60, 0x0004},
+{0x011D66, 0x0001},
+{0x011D67, 0x0004},
+{0x011D69, 0x0001},
+{0x011D6A, 0x0004},
+{0x011D8A, 0x0010},
+{0x011D8F, 0x0001},
+{0x011D90, 0x0010},
+{0x011D92, 0x0001},
+{0x011D93, 0x0010},
+{0x011D98, 0x0004},
+{0x011D99, 0x0001},
+{0x011DA0, 0x0002},
+{0x011DAA, 0x0001},
+{0x011EE0, 0x0004},
+{0x011EF3, 0x0010},
+{0x011EF7, 0x0020},
+{0x011EF9, 0x0001},
+{0x011F00, 0x0010},
+{0x011F02, 0x0004},
+{0x011F03, 0x0010},
+{0x011F04, 0x0004},
+{0x011F11, 0x0001},
+{0x011F12, 0x0004},
+{0x011F34, 0x0010},
+{0x011F3B, 0x0001},
+{0x011F3E, 0x0010},
+{0x011F43, 0x0020},
+{0x011F50, 0x0002},
+{0x011F5A, 0x0001},
+{0x011FB0, 0x0004},
+{0x011FB1, 0x0001},
+{0x011FC0, 0x0002},
+{0x011FD5, 0x0040},
+{0x011FF2, 0x0001},
+{0x011FFF, 0x0020},
+{0x012000, 0x0004},
+{0x01239A, 0x0001},
+{0x012400, 0x0002},
+{0x01246F, 0x0001},
+{0x012470, 0x0020},
+{0x012475, 0x0001},
+{0x012480, 0x0004},
+{0x012544, 0x0001},
+{0x012F90, 0x0004},
+{0x012FF1, 0x0020},
+{0x012FF3, 0x0001},
+{0x013000, 0x0004},
+{0x013430, 0x0080},
+{0x013440, 0x0010},
+{0x013441, 0x0004},
+{0x013447, 0x0010},
+{0x013456, 0x0001},
+{0x014400, 0x0004},
+{0x014647, 0x0001},
+{0x016800, 0x0004},
+{0x016A39, 0x0001},
+{0x016A40, 0x0004},
+{0x016A5F, 0x0001},
+{0x016A60, 0x0002},
+{0x016A6A, 0x0001},
+{0x016A6E, 0x0020},
+{0x016A70, 0x0004},
+{0x016ABF, 0x0001},
+{0x016AC0, 0x0002},
+{0x016ACA, 0x0001},
+{0x016AD0, 0x0004},
+{0x016AEE, 0x0001},
+{0x016AF0, 0x0010},
+{0x016AF5, 0x0020},
+{0x016AF6, 0x0001},
+{0x016B00, 0x0004},
+{0x016B30, 0x0010},
+{0x016B37, 0x0020},
+{0x016B3C, 0x0040},
+{0x016B40, 0x0004},
+{0x016B44, 0x0020},
+{0x016B45, 0x0040},
+{0x016B46, 0x0001},
+{0x016B50, 0x0002},
+{0x016B5A, 0x0001},
+{0x016B5B, 0x0002},
+{0x016B62, 0x0001},
+{0x016B63, 0x0004},
+{0x016B78, 0x0001},
+{0x016B7D, 0x0004},
+{0x016B90, 0x0001},
+{0x016E40, 0x0004},
+{0x016E80, 0x0002},
+{0x016E97, 0x0020},
+{0x016E9B, 0x0001},
+{0x016F00, 0x0004},
+{0x016F4B, 0x0001},
+{0x016F4F, 0x0010},
+{0x016F50, 0x0004},
+{0x016F51, 0x0010},
+{0x016F88, 0x0001},
+{0x016F8F, 0x0010},
+{0x016F93, 0x0004},
+{0x016FA0, 0x0001},
+{0x016FE0, 0x0004},
+{0x016FE2, 0x0020},
+{0x016FE3, 0x0004},
+{0x016FE4, 0x0010},
+{0x016FE5, 0x0001},
+{0x016FF0, 0x0010},
+{0x016FF2, 0x0001},
+{0x017000, 0x0004},
+{0x0187F8, 0x0001},
+{0x018800, 0x0004},
+{0x018CD6, 0x0001},
+{0x018D00, 0x0004},
+{0x018D09, 0x0001},
+{0x01AFF0, 0x0004},
+{0x01AFF4, 0x0001},
+{0x01AFF5, 0x0004},
+{0x01AFFC, 0x0001},
+{0x01AFFD, 0x0004},
+{0x01AFFF, 0x0001},
+{0x01B000, 0x0004},
+{0x01B123, 0x0001},
+{0x01B132, 0x0004},
+{0x01B133, 0x0001},
+{0x01B150, 0x0004},
+{0x01B153, 0x0001},
+{0x01B155, 0x0004},
+{0x01B156, 0x0001},
+{0x01B164, 0x0004},
+{0x01B168, 0x0001},
+{0x01B170, 0x0004},
+{0x01B2FC, 0x0001},
+{0x01BC00, 0x0004},
+{0x01BC6B, 0x0001},
+{0x01BC70, 0x0004},
+{0x01BC7D, 0x0001},
+{0x01BC80, 0x0004},
+{0x01BC89, 0x0001},
+{0x01BC90, 0x0004},
+{0x01BC9A, 0x0001},
+{0x01BC9C, 0x0040},
+{0x01BC9D, 0x0010},
+{0x01BC9F, 0x0020},
+{0x01BCA0, 0x0080},
+{0x01BCA4, 0x0001},
+{0x01CF00, 0x0010},
+{0x01CF2E, 0x0001},
+{0x01CF30, 0x0010},
+{0x01CF47, 0x0001},
+{0x01CF50, 0x0040},
+{0x01CFC4, 0x0001},
+{0x01D000, 0x0040},
+{0x01D0F6, 0x0001},
+{0x01D100, 0x0040},
+{0x01D127, 0x0001},
+{0x01D129, 0x0040},
+{0x01D165, 0x0010},
+{0x01D16A, 0x0040},
+{0x01D16D, 0x0010},
+{0x01D173, 0x0080},
+{0x01D17B, 0x0010},
+{0x01D183, 0x0040},
+{0x01D185, 0x0010},
+{0x01D18C, 0x0040},
+{0x01D1AA, 0x0010},
+{0x01D1AE, 0x0040},
+{0x01D1EB, 0x0001},
+{0x01D200, 0x0040},
+{0x01D242, 0x0010},
+{0x01D245, 0x0040},
+{0x01D246, 0x0001},
+{0x01D2C0, 0x0002},
+{0x01D2D4, 0x0001},
+{0x01D2E0, 0x0002},
+{0x01D2F4, 0x0001},
+{0x01D300, 0x0040},
+{0x01D357, 0x0001},
+{0x01D360, 0x0002},
+{0x01D379, 0x0001},
+{0x01D400, 0x0004},
+{0x01D455, 0x0001},
+{0x01D456, 0x0004},
+{0x01D49D, 0x0001},
+{0x01D49E, 0x0004},
+{0x01D4A0, 0x0001},
+{0x01D4A2, 0x0004},
+{0x01D4A3, 0x0001},
+{0x01D4A5, 0x0004},
+{0x01D4A7, 0x0001},
+{0x01D4A9, 0x0004},
+{0x01D4AD, 0x0001},
+{0x01D4AE, 0x0004},
+{0x01D4BA, 0x0001},
+{0x01D4BB, 0x0004},
+{0x01D4BC, 0x0001},
+{0x01D4BD, 0x0004},
+{0x01D4C4, 0x0001},
+{0x01D4C5, 0x0004},
+{0x01D506, 0x0001},
+{0x01D507, 0x0004},
+{0x01D50B, 0x0001},
+{0x01D50D, 0x0004},
+{0x01D515, 0x0001},
+{0x01D516, 0x0004},
+{0x01D51D, 0x0001},
+{0x01D51E, 0x0004},
+{0x01D53A, 0x0001},
+{0x01D53B, 0x0004},
+{0x01D53F, 0x0001},
+{0x01D540, 0x0004},
+{0x01D545, 0x0001},
+{0x01D546, 0x0004},
+{0x01D547, 0x0001},
+{0x01D54A, 0x0004},
+{0x01D551, 0x0001},
+{0x01D552, 0x0004},
+{0x01D6A6, 0x0001},
+{0x01D6A8, 0x0004},
+{0x01D6C1, 0x0040},
+{0x01D6C2, 0x0004},
+{0x01D6DB, 0x0040},
+{0x01D6DC, 0x0004},
+{0x01D6FB, 0x0040},
+{0x01D6FC, 0x0004},
+{0x01D715, 0x0040},
+{0x01D716, 0x0004},
+{0x01D735, 0x0040},
+{0x01D736, 0x0004},
+{0x01D74F, 0x0040},
+{0x01D750, 0x0004},
+{0x01D76F, 0x0040},
+{0x01D770, 0x0004},
+{0x01D789, 0x0040},
+{0x01D78A, 0x0004},
+{0x01D7A9, 0x0040},
+{0x01D7AA, 0x0004},
+{0x01D7C3, 0x0040},
+{0x01D7C4, 0x0004},
+{0x01D7CC, 0x0001},
+{0x01D7CE, 0x0002},
+{0x01D800, 0x0040},
+{0x01DA00, 0x0010},
+{0x01DA37, 0x0040},
+{0x01DA3B, 0x0010},
+{0x01DA6D, 0x0040},
+{0x01DA75, 0x0010},
+{0x01DA76, 0x0040},
+{0x01DA84, 0x0010},
+{0x01DA85, 0x0040},
+{0x01DA87, 0x0020},
+{0x01DA8C, 0x0001},
+{0x01DA9B, 0x0010},
+{0x01DAA0, 0x0001},
+{0x01DAA1, 0x0010},
+{0x01DAB0, 0x0001},
+{0x01DF00, 0x0004},
+{0x01DF1F, 0x0001},
+{0x01DF25, 0x0004},
+{0x01DF2B, 0x0001},
+{0x01E000, 0x0010},
+{0x01E007, 0x0001},
+{0x01E008, 0x0010},
+{0x01E019, 0x0001},
+{0x01E01B, 0x0010},
+{0x01E022, 0x0001},
+{0x01E023, 0x0010},
+{0x01E025, 0x0001},
+{0x01E026, 0x0010},
+{0x01E02B, 0x0001},
+{0x01E030, 0x0004},
+{0x01E06E, 0x0001},
+{0x01E08F, 0x0010},
+{0x01E090, 0x0001},
+{0x01E100, 0x0004},
+{0x01E12D, 0x0001},
+{0x01E130, 0x0010},
+{0x01E137, 0x0004},
+{0x01E13E, 0x0001},
+{0x01E140, 0x0002},
+{0x01E14A, 0x0001},
+{0x01E14E, 0x0004},
+{0x01E14F, 0x0040},
+{0x01E150, 0x0001},
+{0x01E290, 0x0004},
+{0x01E2AE, 0x0010},
+{0x01E2AF, 0x0001},
+{0x01E2C0, 0x0004},
+{0x01E2EC, 0x0010},
+{0x01E2F0, 0x0002},
+{0x01E2FA, 0x0001},
+{0x01E2FF, 0x0040},
+{0x01E300, 0x0001},
+{0x01E4D0, 0x0004},
+{0x01E4EC, 0x0010},
+{0x01E4F0, 0x0002},
+{0x01E4FA, 0x0001},
+{0x01E7E0, 0x0004},
+{0x01E7E7, 0x0001},
+{0x01E7E8, 0x0004},
+{0x01E7EC, 0x0001},
+{0x01E7ED, 0x0004},
+{0x01E7EF, 0x0001},
+{0x01E7F0, 0x0004},
+{0x01E7FF, 0x0001},
+{0x01E800, 0x0004},
+{0x01E8C5, 0x0001},
+{0x01E8C7, 0x0002},
+{0x01E8D0, 0x0010},
+{0x01E8D7, 0x0001},
+{0x01E900, 0x0004},
+{0x01E944, 0x0010},
+{0x01E94B, 0x0004},
+{0x01E94C, 0x0001},
+{0x01E950, 0x0002},
+{0x01E95A, 0x0001},
+{0x01E95E, 0x0020},
+{0x01E960, 0x0001},
+{0x01EC71, 0x0002},
+{0x01ECAC, 0x0040},
+{0x01ECAD, 0x0002},
+{0x01ECB0, 0x0040},
+{0x01ECB1, 0x0002},
+{0x01ECB5, 0x0001},
+{0x01ED01, 0x0002},
+{0x01ED2E, 0x0040},
+{0x01ED2F, 0x0002},
+{0x01ED3E, 0x0001},
+{0x01EE00, 0x0004},
+{0x01EE04, 0x0001},
+{0x01EE05, 0x0004},
+{0x01EE20, 0x0001},
+{0x01EE21, 0x0004},
+{0x01EE23, 0x0001},
+{0x01EE24, 0x0004},
+{0x01EE25, 0x0001},
+{0x01EE27, 0x0004},
+{0x01EE28, 0x0001},
+{0x01EE29, 0x0004},
+{0x01EE33, 0x0001},
+{0x01EE34, 0x0004},
+{0x01EE38, 0x0001},
+{0x01EE39, 0x0004},
+{0x01EE3A, 0x0001},
+{0x01EE3B, 0x0004},
+{0x01EE3C, 0x0001},
+{0x01EE42, 0x0004},
+{0x01EE43, 0x0001},
+{0x01EE47, 0x0004},
+{0x01EE48, 0x0001},
+{0x01EE49, 0x0004},
+{0x01EE4A, 0x0001},
+{0x01EE4B, 0x0004},
+{0x01EE4C, 0x0001},
+{0x01EE4D, 0x0004},
+{0x01EE50, 0x0001},
+{0x01EE51, 0x0004},
+{0x01EE53, 0x0001},
+{0x01EE54, 0x0004},
+{0x01EE55, 0x0001},
+{0x01EE57, 0x0004},
+{0x01EE58, 0x0001},
+{0x01EE59, 0x0004},
+{0x01EE5A, 0x0001},
+{0x01EE5B, 0x0004},
+{0x01EE5C, 0x0001},
+{0x01EE5D, 0x0004},
+{0x01EE5E, 0x0001},
+{0x01EE5F, 0x0004},
+{0x01EE60, 0x0001},
+{0x01EE61, 0x0004},
+{0x01EE63, 0x0001},
+{0x01EE64, 0x0004},
+{0x01EE65, 0x0001},
+{0x01EE67, 0x0004},
+{0x01EE6B, 0x0001},
+{0x01EE6C, 0x0004},
+{0x01EE73, 0x0001},
+{0x01EE74, 0x0004},
+{0x01EE78, 0x0001},
+{0x01EE79, 0x0004},
+{0x01EE7D, 0x0001},
+{0x01EE7E, 0x0004},
+{0x01EE7F, 0x0001},
+{0x01EE80, 0x0004},
+{0x01EE8A, 0x0001},
+{0x01EE8B, 0x0004},
+{0x01EE9C, 0x0001},
+{0x01EEA1, 0x0004},
+{0x01EEA4, 0x0001},
+{0x01EEA5, 0x0004},
+{0x01EEAA, 0x0001},
+{0x01EEAB, 0x0004},
+{0x01EEBC, 0x0001},
+{0x01EEF0, 0x0040},
+{0x01EEF2, 0x0001},
+{0x01F000, 0x0040},
+{0x01F02C, 0x0001},
+{0x01F030, 0x0040},
+{0x01F094, 0x0001},
+{0x01F0A0, 0x0040},
+{0x01F0AF, 0x0001},
+{0x01F0B1, 0x0040},
+{0x01F0C0, 0x0001},
+{0x01F0C1, 0x0040},
+{0x01F0D0, 0x0001},
+{0x01F0D1, 0x0040},
+{0x01F0F6, 0x0001},
+{0x01F100, 0x0002},
+{0x01F10D, 0x0040},
+{0x01F1AE, 0x0001},
+{0x01F1E6, 0x0040},
+{0x01F203, 0x0001},
+{0x01F210, 0x0040},
+{0x01F23C, 0x0001},
+{0x01F240, 0x0040},
+{0x01F249, 0x0001},
+{0x01F250, 0x0040},
+{0x01F252, 0x0001},
+{0x01F260, 0x0040},
+{0x01F266, 0x0001},
+{0x01F300, 0x0040},
+{0x01F6D8, 0x0001},
+{0x01F6DC, 0x0040},
+{0x01F6ED, 0x0001},
+{0x01F6F0, 0x0040},
+{0x01F6FD, 0x0001},
+{0x01F700, 0x0040},
+{0x01F777, 0x0001},
+{0x01F77B, 0x0040},
+{0x01F7DA, 0x0001},
+{0x01F7E0, 0x0040},
+{0x01F7EC, 0x0001},
+{0x01F7F0, 0x0040},
+{0x01F7F1, 0x0001},
+{0x01F800, 0x0040},
+{0x01F80C, 0x0001},
+{0x01F810, 0x0040},
+{0x01F848, 0x0001},
+{0x01F850, 0x0040},
+{0x01F85A, 0x0001},
+{0x01F860, 0x0040},
+{0x01F888, 0x0001},
+{0x01F890, 0x0040},
+{0x01F8AE, 0x0001},
+{0x01F8B0, 0x0040},
+{0x01F8B2, 0x0001},
+{0x01F900, 0x0040},
+{0x01FA54, 0x0001},
+{0x01FA60, 0x0040},
+{0x01FA6E, 0x0001},
+{0x01FA70, 0x0040},
+{0x01FA7D, 0x0001},
+{0x01FA80, 0x0040},
+{0x01FA89, 0x0001},
+{0x01FA90, 0x0040},
+{0x01FABE, 0x0001},
+{0x01FABF, 0x0040},
+{0x01FAC6, 0x0001},
+{0x01FACE, 0x0040},
+{0x01FADC, 0x0001},
+{0x01FAE0, 0x0040},
+{0x01FAE9, 0x0001},
+{0x01FAF0, 0x0040},
+{0x01FAF9, 0x0001},
+{0x01FB00, 0x0040},
+{0x01FB93, 0x0001},
+{0x01FB94, 0x0040},
+{0x01FBCB, 0x0001},
+{0x01FBF0, 0x0002},
+{0x01FBFA, 0x0001},
+{0x020000, 0x0004},
+{0x02A6E0, 0x0001},
+{0x02A700, 0x0004},
+{0x02B73A, 0x0001},
+{0x02B740, 0x0004},
+{0x02B81E, 0x0001},
+{0x02B820, 0x0004},
+{0x02CEA2, 0x0001},
+{0x02CEB0, 0x0004},
+{0x02EBE1, 0x0001},
+{0x02EBF0, 0x0004},
+{0x02EE5E, 0x0001},
+{0x02F800, 0x0004},
+{0x02FA1E, 0x0001},
+{0x030000, 0x0004},
+{0x03134B, 0x0001},
+{0x031350, 0x0004},
+{0x0323B0, 0x0001},
+{0x0E0001, 0x0080},
+{0x0E0002, 0x0001},
+{0x0E0020, 0x0080},
+{0x0E0080, 0x0001},
+{0x0E0100, 0x0010},
+{0x0E01F0, 0x0001},
+{0x0F0000, 0x0080},
+{0x0FFFFE, 0x0001},
+{0x100000, 0x0080},
+{0x10FFFE, 0x0001},
+{0x110000, 0x0000},
+};
+
+const std::unordered_set<uint32_t> unicode_set_whitespace = {
+0x000009,
+0x00000A,
+0x00000B,
+0x00000C,
+0x00000D,
+0x000020,
+0x000085,
+0x0000A0,
+0x001680,
+0x002000,
+0x002001,
+0x002002,
+0x002003,
+0x002004,
+0x002005,
+0x002006,
+0x002007,
+0x002008,
+0x002009,
+0x00200A,
+0x002028,
+0x002029,
+0x00202F,
+0x00205F,
+0x003000,
+};
+
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
+{0x000041, 0x000061},
+{0x000042, 0x000062},
+{0x000043, 0x000063},
+{0x000044, 0x000064},
+{0x000045, 0x000065},
+{0x000046, 0x000066},
+{0x000047, 0x000067},
+{0x000048, 0x000068},
+{0x000049, 0x000069},
+{0x00004A, 0x00006A},
+{0x00004B, 0x00006B},
+{0x00004C, 0x00006C},
+{0x00004D, 0x00006D},
+{0x00004E, 0x00006E},
+{0x00004F, 0x00006F},
+{0x000050, 0x000070},
+{0x000051, 0x000071},
+{0x000052, 0x000072},
+{0x000053, 0x000073},
+{0x000054, 0x000074},
+{0x000055, 0x000075},
+{0x000056, 0x000076},
+{0x000057, 0x000077},
+{0x000058, 0x000078},
+{0x000059, 0x000079},
+{0x00005A, 0x00007A},
+{0x0000C0, 0x0000E0},
+{0x0000C1, 0x0000E1},
+{0x0000C2, 0x0000E2},
+{0x0000C3, 0x0000E3},
+{0x0000C4, 0x0000E4},
+{0x0000C5, 0x0000E5},
+{0x0000C6, 0x0000E6},
+{0x0000C7, 0x0000E7},
+{0x0000C8, 0x0000E8},
+{0x0000C9, 0x0000E9},
+{0x0000CA, 0x0000EA},
+{0x0000CB, 0x0000EB},
+{0x0000CC, 0x0000EC},
+{0x0000CD, 0x0000ED},
+{0x0000CE, 0x0000EE},
+{0x0000CF, 0x0000EF},
+{0x0000D0, 0x0000F0},
+{0x0000D1, 0x0000F1},
+{0x0000D2, 0x0000F2},
+{0x0000D3, 0x0000F3},
+{0x0000D4, 0x0000F4},
+{0x0000D5, 0x0000F5},
+{0x0000D6, 0x0000F6},
+{0x0000D8, 0x0000F8},
+{0x0000D9, 0x0000F9},
+{0x0000DA, 0x0000FA},
+{0x0000DB, 0x0000FB},
+{0x0000DC, 0x0000FC},
+{0x0000DD, 0x0000FD},
+{0x0000DE, 0x0000FE},
+{0x000100, 0x000101},
+{0x000102, 0x000103},
+{0x000104, 0x000105},
+{0x000106, 0x000107},
+{0x000108, 0x000109},
+{0x00010A, 0x00010B},
+{0x00010C, 0x00010D},
+{0x00010E, 0x00010F},
+{0x000110, 0x000111},
+{0x000112, 0x000113},
+{0x000114, 0x000115},
+{0x000116, 0x000117},
+{0x000118, 0x000119},
+{0x00011A, 0x00011B},
+{0x00011C, 0x00011D},
+{0x00011E, 0x00011F},
+{0x000120, 0x000121},
+{0x000122, 0x000123},
+{0x000124, 0x000125},
+{0x000126, 0x000127},
+{0x000128, 0x000129},
+{0x00012A, 0x00012B},
+{0x00012C, 0x00012D},
+{0x00012E, 0x00012F},
+{0x000130, 0x000069},
+{0x000132, 0x000133},
+{0x000134, 0x000135},
+{0x000136, 0x000137},
+{0x000139, 0x00013A},
+{0x00013B, 0x00013C},
+{0x00013D, 0x00013E},
+{0x00013F, 0x000140},
+{0x000141, 0x000142},
+{0x000143, 0x000144},
+{0x000145, 0x000146},
+{0x000147, 0x000148},
+{0x00014A, 0x00014B},
+{0x00014C, 0x00014D},
+{0x00014E, 0x00014F},
+{0x000150, 0x000151},
+{0x000152, 0x000153},
+{0x000154, 0x000155},
+{0x000156, 0x000157},
+{0x000158, 0x000159},
+{0x00015A, 0x00015B},
+{0x00015C, 0x00015D},
+{0x00015E, 0x00015F},
+{0x000160, 0x000161},
+{0x000162, 0x000163},
+{0x000164, 0x000165},
+{0x000166, 0x000167},
+{0x000168, 0x000169},
+{0x00016A, 0x00016B},
+{0x00016C, 0x00016D},
+{0x00016E, 0x00016F},
+{0x000170, 0x000171},
+{0x000172, 0x000173},
+{0x000174, 0x000175},
+{0x000176, 0x000177},
+{0x000178, 0x0000FF},
+{0x000179, 0x00017A},
+{0x00017B, 0x00017C},
+{0x00017D, 0x00017E},
+{0x000181, 0x000253},
+{0x000182, 0x000183},
+{0x000184, 0x000185},
+{0x000186, 0x000254},
+{0x000187, 0x000188},
+{0x000189, 0x000256},
+{0x00018A, 0x000257},
+{0x00018B, 0x00018C},
+{0x00018E, 0x0001DD},
+{0x00018F, 0x000259},
+{0x000190, 0x00025B},
+{0x000191, 0x000192},
+{0x000193, 0x000260},
+{0x000194, 0x000263},
+{0x000196, 0x000269},
+{0x000197, 0x000268},
+{0x000198, 0x000199},
+{0x00019C, 0x00026F},
+{0x00019D, 0x000272},
+{0x00019F, 0x000275},
+{0x0001A0, 0x0001A1},
+{0x0001A2, 0x0001A3},
+{0x0001A4, 0x0001A5},
+{0x0001A6, 0x000280},
+{0x0001A7, 0x0001A8},
+{0x0001A9, 0x000283},
+{0x0001AC, 0x0001AD},
+{0x0001AE, 0x000288},
+{0x0001AF, 0x0001B0},
+{0x0001B1, 0x00028A},
+{0x0001B2, 0x00028B},
+{0x0001B3, 0x0001B4},
+{0x0001B5, 0x0001B6},
+{0x0001B7, 0x000292},
+{0x0001B8, 0x0001B9},
+{0x0001BC, 0x0001BD},
+{0x0001C4, 0x0001C6},
+{0x0001C5, 0x0001C6},
+{0x0001C7, 0x0001C9},
+{0x0001C8, 0x0001C9},
+{0x0001CA, 0x0001CC},
+{0x0001CB, 0x0001CC},
+{0x0001CD, 0x0001CE},
+{0x0001CF, 0x0001D0},
+{0x0001D1, 0x0001D2},
+{0x0001D3, 0x0001D4},
+{0x0001D5, 0x0001D6},
+{0x0001D7, 0x0001D8},
+{0x0001D9, 0x0001DA},
+{0x0001DB, 0x0001DC},
+{0x0001DE, 0x0001DF},
+{0x0001E0, 0x0001E1},
+{0x0001E2, 0x0001E3},
+{0x0001E4, 0x0001E5},
+{0x0001E6, 0x0001E7},
+{0x0001E8, 0x0001E9},
+{0x0001EA, 0x0001EB},
+{0x0001EC, 0x0001ED},
+{0x0001EE, 0x0001EF},
+{0x0001F1, 0x0001F3},
+{0x0001F2, 0x0001F3},
+{0x0001F4, 0x0001F5},
+{0x0001F6, 0x000195},
+{0x0001F7, 0x0001BF},
+{0x0001F8, 0x0001F9},
+{0x0001FA, 0x0001FB},
+{0x0001FC, 0x0001FD},
+{0x0001FE, 0x0001FF},
+{0x000200, 0x000201},
+{0x000202, 0x000203},
+{0x000204, 0x000205},
+{0x000206, 0x000207},
+{0x000208, 0x000209},
+{0x00020A, 0x00020B},
+{0x00020C, 0x00020D},
+{0x00020E, 0x00020F},
+{0x000210, 0x000211},
+{0x000212, 0x000213},
+{0x000214, 0x000215},
+{0x000216, 0x000217},
+{0x000218, 0x000219},
+{0x00021A, 0x00021B},
+{0x00021C, 0x00021D},
+{0x00021E, 0x00021F},
+{0x000220, 0x00019E},
+{0x000222, 0x000223},
+{0x000224, 0x000225},
+{0x000226, 0x000227},
+{0x000228, 0x000229},
+{0x00022A, 0x00022B},
+{0x00022C, 0x00022D},
+{0x00022E, 0x00022F},
+{0x000230, 0x000231},
+{0x000232, 0x000233},
+{0x00023A, 0x002C65},
+{0x00023B, 0x00023C},
+{0x00023D, 0x00019A},
+{0x00023E, 0x002C66},
+{0x000241, 0x000242},
+{0x000243, 0x000180},
+{0x000244, 0x000289},
+{0x000245, 0x00028C},
+{0x000246, 0x000247},
+{0x000248, 0x000249},
+{0x00024A, 0x00024B},
+{0x00024C, 0x00024D},
+{0x00024E, 0x00024F},
+{0x000370, 0x000371},
+{0x000372, 0x000373},
+{0x000376, 0x000377},
+{0x00037F, 0x0003F3},
+{0x000386, 0x0003AC},
+{0x000388, 0x0003AD},
+{0x000389, 0x0003AE},
+{0x00038A, 0x0003AF},
+{0x00038C, 0x0003CC},
+{0x00038E, 0x0003CD},
+{0x00038F, 0x0003CE},
+{0x000391, 0x0003B1},
+{0x000392, 0x0003B2},
+{0x000393, 0x0003B3},
+{0x000394, 0x0003B4},
+{0x000395, 0x0003B5},
+{0x000396, 0x0003B6},
+{0x000397, 0x0003B7},
+{0x000398, 0x0003B8},
+{0x000399, 0x0003B9},
+{0x00039A, 0x0003BA},
+{0x00039B, 0x0003BB},
+{0x00039C, 0x0003BC},
+{0x00039D, 0x0003BD},
+{0x00039E, 0x0003BE},
+{0x00039F, 0x0003BF},
+{0x0003A0, 0x0003C0},
+{0x0003A1, 0x0003C1},
+{0x0003A3, 0x0003C3},
+{0x0003A4, 0x0003C4},
+{0x0003A5, 0x0003C5},
+{0x0003A6, 0x0003C6},
+{0x0003A7, 0x0003C7},
+{0x0003A8, 0x0003C8},
+{0x0003A9, 0x0003C9},
+{0x0003AA, 0x0003CA},
+{0x0003AB, 0x0003CB},
+{0x0003CF, 0x0003D7},
+{0x0003D8, 0x0003D9},
+{0x0003DA, 0x0003DB},
+{0x0003DC, 0x0003DD},
+{0x0003DE, 0x0003DF},
+{0x0003E0, 0x0003E1},
+{0x0003E2, 0x0003E3},
+{0x0003E4, 0x0003E5},
+{0x0003E6, 0x0003E7},
+{0x0003E8, 0x0003E9},
+{0x0003EA, 0x0003EB},
+{0x0003EC, 0x0003ED},
+{0x0003EE, 0x0003EF},
+{0x0003F4, 0x0003B8},
+{0x0003F7, 0x0003F8},
+{0x0003F9, 0x0003F2},
+{0x0003FA, 0x0003FB},
+{0x0003FD, 0x00037B},
+{0x0003FE, 0x00037C},
+{0x0003FF, 0x00037D},
+{0x000400, 0x000450},
+{0x000401, 0x000451},
+{0x000402, 0x000452},
+{0x000403, 0x000453},
+{0x000404, 0x000454},
+{0x000405, 0x000455},
+{0x000406, 0x000456},
+{0x000407, 0x000457},
+{0x000408, 0x000458},
+{0x000409, 0x000459},
+{0x00040A, 0x00045A},
+{0x00040B, 0x00045B},
+{0x00040C, 0x00045C},
+{0x00040D, 0x00045D},
+{0x00040E, 0x00045E},
+{0x00040F, 0x00045F},
+{0x000410, 0x000430},
+{0x000411, 0x000431},
+{0x000412, 0x000432},
+{0x000413, 0x000433},
+{0x000414, 0x000434},
+{0x000415, 0x000435},
+{0x000416, 0x000436},
+{0x000417, 0x000437},
+{0x000418, 0x000438},
+{0x000419, 0x000439},
+{0x00041A, 0x00043A},
+{0x00041B, 0x00043B},
+{0x00041C, 0x00043C},
+{0x00041D, 0x00043D},
+{0x00041E, 0x00043E},
+{0x00041F, 0x00043F},
+{0x000420, 0x000440},
+{0x000421, 0x000441},
+{0x000422, 0x000442},
+{0x000423, 0x000443},
+{0x000424, 0x000444},
+{0x000425, 0x000445},
+{0x000426, 0x000446},
+{0x000427, 0x000447},
+{0x000428, 0x000448},
+{0x000429, 0x000449},
+{0x00042A, 0x00044A},
+{0x00042B, 0x00044B},
+{0x00042C, 0x00044C},
+{0x00042D, 0x00044D},
+{0x00042E, 0x00044E},
+{0x00042F, 0x00044F},
+{0x000460, 0x000461},
+{0x000462, 0x000463},
+{0x000464, 0x000465},
+{0x000466, 0x000467},
+{0x000468, 0x000469},
+{0x00046A, 0x00046B},
+{0x00046C, 0x00046D},
+{0x00046E, 0x00046F},
+{0x000470, 0x000471},
+{0x000472, 0x000473},
+{0x000474, 0x000475},
+{0x000476, 0x000477},
+{0x000478, 0x000479},
+{0x00047A, 0x00047B},
+{0x00047C, 0x00047D},
+{0x00047E, 0x00047F},
+{0x000480, 0x000481},
+{0x00048A, 0x00048B},
+{0x00048C, 0x00048D},
+{0x00048E, 0x00048F},
+{0x000490, 0x000491},
+{0x000492, 0x000493},
+{0x000494, 0x000495},
+{0x000496, 0x000497},
+{0x000498, 0x000499},
+{0x00049A, 0x00049B},
+{0x00049C, 0x00049D},
+{0x00049E, 0x00049F},
+{0x0004A0, 0x0004A1},
+{0x0004A2, 0x0004A3},
+{0x0004A4, 0x0004A5},
+{0x0004A6, 0x0004A7},
+{0x0004A8, 0x0004A9},
+{0x0004AA, 0x0004AB},
+{0x0004AC, 0x0004AD},
+{0x0004AE, 0x0004AF},
+{0x0004B0, 0x0004B1},
+{0x0004B2, 0x0004B3},
+{0x0004B4, 0x0004B5},
+{0x0004B6, 0x0004B7},
+{0x0004B8, 0x0004B9},
+{0x0004BA, 0x0004BB},
+{0x0004BC, 0x0004BD},
+{0x0004BE, 0x0004BF},
+{0x0004C0, 0x0004CF},
+{0x0004C1, 0x0004C2},
+{0x0004C3, 0x0004C4},
+{0x0004C5, 0x0004C6},
+{0x0004C7, 0x0004C8},
+{0x0004C9, 0x0004CA},
+{0x0004CB, 0x0004CC},
+{0x0004CD, 0x0004CE},
+{0x0004D0, 0x0004D1},
+{0x0004D2, 0x0004D3},
+{0x0004D4, 0x0004D5},
+{0x0004D6, 0x0004D7},
+{0x0004D8, 0x0004D9},
+{0x0004DA, 0x0004DB},
+{0x0004DC, 0x0004DD},
+{0x0004DE, 0x0004DF},
+{0x0004E0, 0x0004E1},
+{0x0004E2, 0x0004E3},
+{0x0004E4, 0x0004E5},
+{0x0004E6, 0x0004E7},
+{0x0004E8, 0x0004E9},
+{0x0004EA, 0x0004EB},
+{0x0004EC, 0x0004ED},
+{0x0004EE, 0x0004EF},
+{0x0004F0, 0x0004F1},
+{0x0004F2, 0x0004F3},
+{0x0004F4, 0x0004F5},
+{0x0004F6, 0x0004F7},
+{0x0004F8, 0x0004F9},
+{0x0004FA, 0x0004FB},
+{0x0004FC, 0x0004FD},
+{0x0004FE, 0x0004FF},
+{0x000500, 0x000501},
+{0x000502, 0x000503},
+{0x000504, 0x000505},
+{0x000506, 0x000507},
+{0x000508, 0x000509},
+{0x00050A, 0x00050B},
+{0x00050C, 0x00050D},
+{0x00050E, 0x00050F},
+{0x000510, 0x000511},
+{0x000512, 0x000513},
+{0x000514, 0x000515},
+{0x000516, 0x000517},
+{0x000518, 0x000519},
+{0x00051A, 0x00051B},
+{0x00051C, 0x00051D},
+{0x00051E, 0x00051F},
+{0x000520, 0x000521},
+{0x000522, 0x000523},
+{0x000524, 0x000525},
+{0x000526, 0x000527},
+{0x000528, 0x000529},
+{0x00052A, 0x00052B},
+{0x00052C, 0x00052D},
+{0x00052E, 0x00052F},
+{0x000531, 0x000561},
+{0x000532, 0x000562},
+{0x000533, 0x000563},
+{0x000534, 0x000564},
+{0x000535, 0x000565},
+{0x000536, 0x000566},
+{0x000537, 0x000567},
+{0x000538, 0x000568},
+{0x000539, 0x000569},
+{0x00053A, 0x00056A},
+{0x00053B, 0x00056B},
+{0x00053C, 0x00056C},
+{0x00053D, 0x00056D},
+{0x00053E, 0x00056E},
+{0x00053F, 0x00056F},
+{0x000540, 0x000570},
+{0x000541, 0x000571},
+{0x000542, 0x000572},
+{0x000543, 0x000573},
+{0x000544, 0x000574},
+{0x000545, 0x000575},
+{0x000546, 0x000576},
+{0x000547, 0x000577},
+{0x000548, 0x000578},
+{0x000549, 0x000579},
+{0x00054A, 0x00057A},
+{0x00054B, 0x00057B},
+{0x00054C, 0x00057C},
+{0x00054D, 0x00057D},
+{0x00054E, 0x00057E},
+{0x00054F, 0x00057F},
+{0x000550, 0x000580},
+{0x000551, 0x000581},
+{0x000552, 0x000582},
+{0x000553, 0x000583},
+{0x000554, 0x000584},
+{0x000555, 0x000585},
+{0x000556, 0x000586},
+{0x0010A0, 0x002D00},
+{0x0010A1, 0x002D01},
+{0x0010A2, 0x002D02},
+{0x0010A3, 0x002D03},
+{0x0010A4, 0x002D04},
+{0x0010A5, 0x002D05},
+{0x0010A6, 0x002D06},
+{0x0010A7, 0x002D07},
+{0x0010A8, 0x002D08},
+{0x0010A9, 0x002D09},
+{0x0010AA, 0x002D0A},
+{0x0010AB, 0x002D0B},
+{0x0010AC, 0x002D0C},
+{0x0010AD, 0x002D0D},
+{0x0010AE, 0x002D0E},
+{0x0010AF, 0x002D0F},
+{0x0010B0, 0x002D10},
+{0x0010B1, 0x002D11},
+{0x0010B2, 0x002D12},
+{0x0010B3, 0x002D13},
+{0x0010B4, 0x002D14},
+{0x0010B5, 0x002D15},
+{0x0010B6, 0x002D16},
+{0x0010B7, 0x002D17},
+{0x0010B8, 0x002D18},
+{0x0010B9, 0x002D19},
+{0x0010BA, 0x002D1A},
+{0x0010BB, 0x002D1B},
+{0x0010BC, 0x002D1C},
+{0x0010BD, 0x002D1D},
+{0x0010BE, 0x002D1E},
+{0x0010BF, 0x002D1F},
+{0x0010C0, 0x002D20},
+{0x0010C1, 0x002D21},
+{0x0010C2, 0x002D22},
+{0x0010C3, 0x002D23},
+{0x0010C4, 0x002D24},
+{0x0010C5, 0x002D25},
+{0x0010C7, 0x002D27},
+{0x0010CD, 0x002D2D},
+{0x0013A0, 0x00AB70},
+{0x0013A1, 0x00AB71},
+{0x0013A2, 0x00AB72},
+{0x0013A3, 0x00AB73},
+{0x0013A4, 0x00AB74},
+{0x0013A5, 0x00AB75},
+{0x0013A6, 0x00AB76},
+{0x0013A7, 0x00AB77},
+{0x0013A8, 0x00AB78},
+{0x0013A9, 0x00AB79},
+{0x0013AA, 0x00AB7A},
+{0x0013AB, 0x00AB7B},
+{0x0013AC, 0x00AB7C},
+{0x0013AD, 0x00AB7D},
+{0x0013AE, 0x00AB7E},
+{0x0013AF, 0x00AB7F},
+{0x0013B0, 0x00AB80},
+{0x0013B1, 0x00AB81},
+{0x0013B2, 0x00AB82},
+{0x0013B3, 0x00AB83},
+{0x0013B4, 0x00AB84},
+{0x0013B5, 0x00AB85},
+{0x0013B6, 0x00AB86},
+{0x0013B7, 0x00AB87},
+{0x0013B8, 0x00AB88},
+{0x0013B9, 0x00AB89},
+{0x0013BA, 0x00AB8A},
+{0x0013BB, 0x00AB8B},
+{0x0013BC, 0x00AB8C},
+{0x0013BD, 0x00AB8D},
+{0x0013BE, 0x00AB8E},
+{0x0013BF, 0x00AB8F},
+{0x0013C0, 0x00AB90},
+{0x0013C1, 0x00AB91},
+{0x0013C2, 0x00AB92},
+{0x0013C3, 0x00AB93},
+{0x0013C4, 0x00AB94},
+{0x0013C5, 0x00AB95},
+{0x0013C6, 0x00AB96},
+{0x0013C7, 0x00AB97},
+{0x0013C8, 0x00AB98},
+{0x0013C9, 0x00AB99},
+{0x0013CA, 0x00AB9A},
+{0x0013CB, 0x00AB9B},
+{0x0013CC, 0x00AB9C},
+{0x0013CD, 0x00AB9D},
+{0x0013CE, 0x00AB9E},
+{0x0013CF, 0x00AB9F},
+{0x0013D0, 0x00ABA0},
+{0x0013D1, 0x00ABA1},
+{0x0013D2, 0x00ABA2},
+{0x0013D3, 0x00ABA3},
+{0x0013D4, 0x00ABA4},
+{0x0013D5, 0x00ABA5},
+{0x0013D6, 0x00ABA6},
+{0x0013D7, 0x00ABA7},
+{0x0013D8, 0x00ABA8},
+{0x0013D9, 0x00ABA9},
+{0x0013DA, 0x00ABAA},
+{0x0013DB, 0x00ABAB},
+{0x0013DC, 0x00ABAC},
+{0x0013DD, 0x00ABAD},
+{0x0013DE, 0x00ABAE},
+{0x0013DF, 0x00ABAF},
+{0x0013E0, 0x00ABB0},
+{0x0013E1, 0x00ABB1},
+{0x0013E2, 0x00ABB2},
+{0x0013E3, 0x00ABB3},
+{0x0013E4, 0x00ABB4},
+{0x0013E5, 0x00ABB5},
+{0x0013E6, 0x00ABB6},
+{0x0013E7, 0x00ABB7},
+{0x0013E8, 0x00ABB8},
+{0x0013E9, 0x00ABB9},
+{0x0013EA, 0x00ABBA},
+{0x0013EB, 0x00ABBB},
+{0x0013EC, 0x00ABBC},
+{0x0013ED, 0x00ABBD},
+{0x0013EE, 0x00ABBE},
+{0x0013EF, 0x00ABBF},
+{0x0013F0, 0x0013F8},
+{0x0013F1, 0x0013F9},
+{0x0013F2, 0x0013FA},
+{0x0013F3, 0x0013FB},
+{0x0013F4, 0x0013FC},
+{0x0013F5, 0x0013FD},
+{0x001C90, 0x0010D0},
+{0x001C91, 0x0010D1},
+{0x001C92, 0x0010D2},
+{0x001C93, 0x0010D3},
+{0x001C94, 0x0010D4},
+{0x001C95, 0x0010D5},
+{0x001C96, 0x0010D6},
+{0x001C97, 0x0010D7},
+{0x001C98, 0x0010D8},
+{0x001C99, 0x0010D9},
+{0x001C9A, 0x0010DA},
+{0x001C9B, 0x0010DB},
+{0x001C9C, 0x0010DC},
+{0x001C9D, 0x0010DD},
+{0x001C9E, 0x0010DE},
+{0x001C9F, 0x0010DF},
+{0x001CA0, 0x0010E0},
+{0x001CA1, 0x0010E1},
+{0x001CA2, 0x0010E2},
+{0x001CA3, 0x0010E3},
+{0x001CA4, 0x0010E4},
+{0x001CA5, 0x0010E5},
+{0x001CA6, 0x0010E6},
+{0x001CA7, 0x0010E7},
+{0x001CA8, 0x0010E8},
+{0x001CA9, 0x0010E9},
+{0x001CAA, 0x0010EA},
+{0x001CAB, 0x0010EB},
+{0x001CAC, 0x0010EC},
+{0x001CAD, 0x0010ED},
+{0x001CAE, 0x0010EE},
+{0x001CAF, 0x0010EF},
+{0x001CB0, 0x0010F0},
+{0x001CB1, 0x0010F1},
+{0x001CB2, 0x0010F2},
+{0x001CB3, 0x0010F3},
+{0x001CB4, 0x0010F4},
+{0x001CB5, 0x0010F5},
+{0x001CB6, 0x0010F6},
+{0x001CB7, 0x0010F7},
+{0x001CB8, 0x0010F8},
+{0x001CB9, 0x0010F9},
+{0x001CBA, 0x0010FA},
+{0x001CBD, 0x0010FD},
+{0x001CBE, 0x0010FE},
+{0x001CBF, 0x0010FF},
+{0x001E00, 0x001E01},
+{0x001E02, 0x001E03},
+{0x001E04, 0x001E05},
+{0x001E06, 0x001E07},
+{0x001E08, 0x001E09},
+{0x001E0A, 0x001E0B},
+{0x001E0C, 0x001E0D},
+{0x001E0E, 0x001E0F},
+{0x001E10, 0x001E11},
+{0x001E12, 0x001E13},
+{0x001E14, 0x001E15},
+{0x001E16, 0x001E17},
+{0x001E18, 0x001E19},
+{0x001E1A, 0x001E1B},
+{0x001E1C, 0x001E1D},
+{0x001E1E, 0x001E1F},
+{0x001E20, 0x001E21},
+{0x001E22, 0x001E23},
+{0x001E24, 0x001E25},
+{0x001E26, 0x001E27},
+{0x001E28, 0x001E29},
+{0x001E2A, 0x001E2B},
+{0x001E2C, 0x001E2D},
+{0x001E2E, 0x001E2F},
+{0x001E30, 0x001E31},
+{0x001E32, 0x001E33},
+{0x001E34, 0x001E35},
+{0x001E36, 0x001E37},
+{0x001E38, 0x001E39},
+{0x001E3A, 0x001E3B},
+{0x001E3C, 0x001E3D},
+{0x001E3E, 0x001E3F},
+{0x001E40, 0x001E41},
+{0x001E42, 0x001E43},
+{0x001E44, 0x001E45},
+{0x001E46, 0x001E47},
+{0x001E48, 0x001E49},
+{0x001E4A, 0x001E4B},
+{0x001E4C, 0x001E4D},
+{0x001E4E, 0x001E4F},
+{0x001E50, 0x001E51},
+{0x001E52, 0x001E53},
+{0x001E54, 0x001E55},
+{0x001E56, 0x001E57},
+{0x001E58, 0x001E59},
+{0x001E5A, 0x001E5B},
+{0x001E5C, 0x001E5D},
+{0x001E5E, 0x001E5F},
+{0x001E60, 0x001E61},
+{0x001E62, 0x001E63},
+{0x001E64, 0x001E65},
+{0x001E66, 0x001E67},
+{0x001E68, 0x001E69},
+{0x001E6A, 0x001E6B},
+{0x001E6C, 0x001E6D},
+{0x001E6E, 0x001E6F},
+{0x001E70, 0x001E71},
+{0x001E72, 0x001E73},
+{0x001E74, 0x001E75},
+{0x001E76, 0x001E77},
+{0x001E78, 0x001E79},
+{0x001E7A, 0x001E7B},
+{0x001E7C, 0x001E7D},
+{0x001E7E, 0x001E7F},
+{0x001E80, 0x001E81},
+{0x001E82, 0x001E83},
+{0x001E84, 0x001E85},
+{0x001E86, 0x001E87},
+{0x001E88, 0x001E89},
+{0x001E8A, 0x001E8B},
+{0x001E8C, 0x001E8D},
+{0x001E8E, 0x001E8F},
+{0x001E90, 0x001E91},
+{0x001E92, 0x001E93},
+{0x001E94, 0x001E95},
+{0x001E9E, 0x0000DF},
+{0x001EA0, 0x001EA1},
+{0x001EA2, 0x001EA3},
+{0x001EA4, 0x001EA5},
+{0x001EA6, 0x001EA7},
+{0x001EA8, 0x001EA9},
+{0x001EAA, 0x001EAB},
+{0x001EAC, 0x001EAD},
+{0x001EAE, 0x001EAF},
+{0x001EB0, 0x001EB1},
+{0x001EB2, 0x001EB3},
+{0x001EB4, 0x001EB5},
+{0x001EB6, 0x001EB7},
+{0x001EB8, 0x001EB9},
+{0x001EBA, 0x001EBB},
+{0x001EBC, 0x001EBD},
+{0x001EBE, 0x001EBF},
+{0x001EC0, 0x001EC1},
+{0x001EC2, 0x001EC3},
+{0x001EC4, 0x001EC5},
+{0x001EC6, 0x001EC7},
+{0x001EC8, 0x001EC9},
+{0x001ECA, 0x001ECB},
+{0x001ECC, 0x001ECD},
+{0x001ECE, 0x001ECF},
+{0x001ED0, 0x001ED1},
+{0x001ED2, 0x001ED3},
+{0x001ED4, 0x001ED5},
+{0x001ED6, 0x001ED7},
+{0x001ED8, 0x001ED9},
+{0x001EDA, 0x001EDB},
+{0x001EDC, 0x001EDD},
+{0x001EDE, 0x001EDF},
+{0x001EE0, 0x001EE1},
+{0x001EE2, 0x001EE3},
+{0x001EE4, 0x001EE5},
+{0x001EE6, 0x001EE7},
+{0x001EE8, 0x001EE9},
+{0x001EEA, 0x001EEB},
+{0x001EEC, 0x001EED},
+{0x001EEE, 0x001EEF},
+{0x001EF0, 0x001EF1},
+{0x001EF2, 0x001EF3},
+{0x001EF4, 0x001EF5},
+{0x001EF6, 0x001EF7},
+{0x001EF8, 0x001EF9},
+{0x001EFA, 0x001EFB},
+{0x001EFC, 0x001EFD},
+{0x001EFE, 0x001EFF},
+{0x001F08, 0x001F00},
+{0x001F09, 0x001F01},
+{0x001F0A, 0x001F02},
+{0x001F0B, 0x001F03},
+{0x001F0C, 0x001F04},
+{0x001F0D, 0x001F05},
+{0x001F0E, 0x001F06},
+{0x001F0F, 0x001F07},
+{0x001F18, 0x001F10},
+{0x001F19, 0x001F11},
+{0x001F1A, 0x001F12},
+{0x001F1B, 0x001F13},
+{0x001F1C, 0x001F14},
+{0x001F1D, 0x001F15},
+{0x001F28, 0x001F20},
+{0x001F29, 0x001F21},
+{0x001F2A, 0x001F22},
+{0x001F2B, 0x001F23},
+{0x001F2C, 0x001F24},
+{0x001F2D, 0x001F25},
+{0x001F2E, 0x001F26},
+{0x001F2F, 0x001F27},
+{0x001F38, 0x001F30},
+{0x001F39, 0x001F31},
+{0x001F3A, 0x001F32},
+{0x001F3B, 0x001F33},
+{0x001F3C, 0x001F34},
+{0x001F3D, 0x001F35},
+{0x001F3E, 0x001F36},
+{0x001F3F, 0x001F37},
+{0x001F48, 0x001F40},
+{0x001F49, 0x001F41},
+{0x001F4A, 0x001F42},
+{0x001F4B, 0x001F43},
+{0x001F4C, 0x001F44},
+{0x001F4D, 0x001F45},
+{0x001F59, 0x001F51},
+{0x001F5B, 0x001F53},
+{0x001F5D, 0x001F55},
+{0x001F5F, 0x001F57},
+{0x001F68, 0x001F60},
+{0x001F69, 0x001F61},
+{0x001F6A, 0x001F62},
+{0x001F6B, 0x001F63},
+{0x001F6C, 0x001F64},
+{0x001F6D, 0x001F65},
+{0x001F6E, 0x001F66},
+{0x001F6F, 0x001F67},
+{0x001F88, 0x001F80},
+{0x001F89, 0x001F81},
+{0x001F8A, 0x001F82},
+{0x001F8B, 0x001F83},
+{0x001F8C, 0x001F84},
+{0x001F8D, 0x001F85},
+{0x001F8E, 0x001F86},
+{0x001F8F, 0x001F87},
+{0x001F98, 0x001F90},
+{0x001F99, 0x001F91},
+{0x001F9A, 0x001F92},
+{0x001F9B, 0x001F93},
+{0x001F9C, 0x001F94},
+{0x001F9D, 0x001F95},
+{0x001F9E, 0x001F96},
+{0x001F9F, 0x001F97},
+{0x001FA8, 0x001FA0},
+{0x001FA9, 0x001FA1},
+{0x001FAA, 0x001FA2},
+{0x001FAB, 0x001FA3},
+{0x001FAC, 0x001FA4},
+{0x001FAD, 0x001FA5},
+{0x001FAE, 0x001FA6},
+{0x001FAF, 0x001FA7},
+{0x001FB8, 0x001FB0},
+{0x001FB9, 0x001FB1},
+{0x001FBA, 0x001F70},
+{0x001FBB, 0x001F71},
+{0x001FBC, 0x001FB3},
+{0x001FC8, 0x001F72},
+{0x001FC9, 0x001F73},
+{0x001FCA, 0x001F74},
+{0x001FCB, 0x001F75},
+{0x001FCC, 0x001FC3},
+{0x001FD8, 0x001FD0},
+{0x001FD9, 0x001FD1},
+{0x001FDA, 0x001F76},
+{0x001FDB, 0x001F77},
+{0x001FE8, 0x001FE0},
+{0x001FE9, 0x001FE1},
+{0x001FEA, 0x001F7A},
+{0x001FEB, 0x001F7B},
+{0x001FEC, 0x001FE5},
+{0x001FF8, 0x001F78},
+{0x001FF9, 0x001F79},
+{0x001FFA, 0x001F7C},
+{0x001FFB, 0x001F7D},
+{0x001FFC, 0x001FF3},
+{0x002126, 0x0003C9},
+{0x00212A, 0x00006B},
+{0x00212B, 0x0000E5},
+{0x002132, 0x00214E},
+{0x002160, 0x002170},
+{0x002161, 0x002171},
+{0x002162, 0x002172},
+{0x002163, 0x002173},
+{0x002164, 0x002174},
+{0x002165, 0x002175},
+{0x002166, 0x002176},
+{0x002167, 0x002177},
+{0x002168, 0x002178},
+{0x002169, 0x002179},
+{0x00216A, 0x00217A},
+{0x00216B, 0x00217B},
+{0x00216C, 0x00217C},
+{0x00216D, 0x00217D},
+{0x00216E, 0x00217E},
+{0x00216F, 0x00217F},
+{0x002183, 0x002184},
+{0x0024B6, 0x0024D0},
+{0x0024B7, 0x0024D1},
+{0x0024B8, 0x0024D2},
+{0x0024B9, 0x0024D3},
+{0x0024BA, 0x0024D4},
+{0x0024BB, 0x0024D5},
+{0x0024BC, 0x0024D6},
+{0x0024BD, 0x0024D7},
+{0x0024BE, 0x0024D8},
+{0x0024BF, 0x0024D9},
+{0x0024C0, 0x0024DA},
+{0x0024C1, 0x0024DB},
+{0x0024C2, 0x0024DC},
+{0x0024C3, 0x0024DD},
+{0x0024C4, 0x0024DE},
+{0x0024C5, 0x0024DF},
+{0x0024C6, 0x0024E0},
+{0x0024C7, 0x0024E1},
+{0x0024C8, 0x0024E2},
+{0x0024C9, 0x0024E3},
+{0x0024CA, 0x0024E4},
+{0x0024CB, 0x0024E5},
+{0x0024CC, 0x0024E6},
+{0x0024CD, 0x0024E7},
+{0x0024CE, 0x0024E8},
+{0x0024CF, 0x0024E9},
+{0x002C00, 0x002C30},
+{0x002C01, 0x002C31},
+{0x002C02, 0x002C32},
+{0x002C03, 0x002C33},
+{0x002C04, 0x002C34},
+{0x002C05, 0x002C35},
+{0x002C06, 0x002C36},
+{0x002C07, 0x002C37},
+{0x002C08, 0x002C38},
+{0x002C09, 0x002C39},
+{0x002C0A, 0x002C3A},
+{0x002C0B, 0x002C3B},
+{0x002C0C, 0x002C3C},
+{0x002C0D, 0x002C3D},
+{0x002C0E, 0x002C3E},
+{0x002C0F, 0x002C3F},
+{0x002C10, 0x002C40},
+{0x002C11, 0x002C41},
+{0x002C12, 0x002C42},
+{0x002C13, 0x002C43},
+{0x002C14, 0x002C44},
+{0x002C15, 0x002C45},
+{0x002C16, 0x002C46},
+{0x002C17, 0x002C47},
+{0x002C18, 0x002C48},
+{0x002C19, 0x002C49},
+{0x002C1A, 0x002C4A},
+{0x002C1B, 0x002C4B},
+{0x002C1C, 0x002C4C},
+{0x002C1D, 0x002C4D},
+{0x002C1E, 0x002C4E},
+{0x002C1F, 0x002C4F},
+{0x002C20, 0x002C50},
+{0x002C21, 0x002C51},
+{0x002C22, 0x002C52},
+{0x002C23, 0x002C53},
+{0x002C24, 0x002C54},
+{0x002C25, 0x002C55},
+{0x002C26, 0x002C56},
+{0x002C27, 0x002C57},
+{0x002C28, 0x002C58},
+{0x002C29, 0x002C59},
+{0x002C2A, 0x002C5A},
+{0x002C2B, 0x002C5B},
+{0x002C2C, 0x002C5C},
+{0x002C2D, 0x002C5D},
+{0x002C2E, 0x002C5E},
+{0x002C2F, 0x002C5F},
+{0x002C60, 0x002C61},
+{0x002C62, 0x00026B},
+{0x002C63, 0x001D7D},
+{0x002C64, 0x00027D},
+{0x002C67, 0x002C68},
+{0x002C69, 0x002C6A},
+{0x002C6B, 0x002C6C},
+{0x002C6D, 0x000251},
+{0x002C6E, 0x000271},
+{0x002C6F, 0x000250},
+{0x002C70, 0x000252},
+{0x002C72, 0x002C73},
+{0x002C75, 0x002C76},
+{0x002C7E, 0x00023F},
+{0x002C7F, 0x000240},
+{0x002C80, 0x002C81},
+{0x002C82, 0x002C83},
+{0x002C84, 0x002C85},
+{0x002C86, 0x002C87},
+{0x002C88, 0x002C89},
+{0x002C8A, 0x002C8B},
+{0x002C8C, 0x002C8D},
+{0x002C8E, 0x002C8F},
+{0x002C90, 0x002C91},
+{0x002C92, 0x002C93},
+{0x002C94, 0x002C95},
+{0x002C96, 0x002C97},
+{0x002C98, 0x002C99},
+{0x002C9A, 0x002C9B},
+{0x002C9C, 0x002C9D},
+{0x002C9E, 0x002C9F},
+{0x002CA0, 0x002CA1},
+{0x002CA2, 0x002CA3},
+{0x002CA4, 0x002CA5},
+{0x002CA6, 0x002CA7},
+{0x002CA8, 0x002CA9},
+{0x002CAA, 0x002CAB},
+{0x002CAC, 0x002CAD},
+{0x002CAE, 0x002CAF},
+{0x002CB0, 0x002CB1},
+{0x002CB2, 0x002CB3},
+{0x002CB4, 0x002CB5},
+{0x002CB6, 0x002CB7},
+{0x002CB8, 0x002CB9},
+{0x002CBA, 0x002CBB},
+{0x002CBC, 0x002CBD},
+{0x002CBE, 0x002CBF},
+{0x002CC0, 0x002CC1},
+{0x002CC2, 0x002CC3},
+{0x002CC4, 0x002CC5},
+{0x002CC6, 0x002CC7},
+{0x002CC8, 0x002CC9},
+{0x002CCA, 0x002CCB},
+{0x002CCC, 0x002CCD},
+{0x002CCE, 0x002CCF},
+{0x002CD0, 0x002CD1},
+{0x002CD2, 0x002CD3},
+{0x002CD4, 0x002CD5},
+{0x002CD6, 0x002CD7},
+{0x002CD8, 0x002CD9},
+{0x002CDA, 0x002CDB},
+{0x002CDC, 0x002CDD},
+{0x002CDE, 0x002CDF},
+{0x002CE0, 0x002CE1},
+{0x002CE2, 0x002CE3},
+{0x002CEB, 0x002CEC},
+{0x002CED, 0x002CEE},
+{0x002CF2, 0x002CF3},
+{0x00A640, 0x00A641},
+{0x00A642, 0x00A643},
+{0x00A644, 0x00A645},
+{0x00A646, 0x00A647},
+{0x00A648, 0x00A649},
+{0x00A64A, 0x00A64B},
+{0x00A64C, 0x00A64D},
+{0x00A64E, 0x00A64F},
+{0x00A650, 0x00A651},
+{0x00A652, 0x00A653},
+{0x00A654, 0x00A655},
+{0x00A656, 0x00A657},
+{0x00A658, 0x00A659},
+{0x00A65A, 0x00A65B},
+{0x00A65C, 0x00A65D},
+{0x00A65E, 0x00A65F},
+{0x00A660, 0x00A661},
+{0x00A662, 0x00A663},
+{0x00A664, 0x00A665},
+{0x00A666, 0x00A667},
+{0x00A668, 0x00A669},
+{0x00A66A, 0x00A66B},
+{0x00A66C, 0x00A66D},
+{0x00A680, 0x00A681},
+{0x00A682, 0x00A683},
+{0x00A684, 0x00A685},
+{0x00A686, 0x00A687},
+{0x00A688, 0x00A689},
+{0x00A68A, 0x00A68B},
+{0x00A68C, 0x00A68D},
+{0x00A68E, 0x00A68F},
+{0x00A690, 0x00A691},
+{0x00A692, 0x00A693},
+{0x00A694, 0x00A695},
+{0x00A696, 0x00A697},
+{0x00A698, 0x00A699},
+{0x00A69A, 0x00A69B},
+{0x00A722, 0x00A723},
+{0x00A724, 0x00A725},
+{0x00A726, 0x00A727},
+{0x00A728, 0x00A729},
+{0x00A72A, 0x00A72B},
+{0x00A72C, 0x00A72D},
+{0x00A72E, 0x00A72F},
+{0x00A732, 0x00A733},
+{0x00A734, 0x00A735},
+{0x00A736, 0x00A737},
+{0x00A738, 0x00A739},
+{0x00A73A, 0x00A73B},
+{0x00A73C, 0x00A73D},
+{0x00A73E, 0x00A73F},
+{0x00A740, 0x00A741},
+{0x00A742, 0x00A743},
+{0x00A744, 0x00A745},
+{0x00A746, 0x00A747},
+{0x00A748, 0x00A749},
+{0x00A74A, 0x00A74B},
+{0x00A74C, 0x00A74D},
+{0x00A74E, 0x00A74F},
+{0x00A750, 0x00A751},
+{0x00A752, 0x00A753},
+{0x00A754, 0x00A755},
+{0x00A756, 0x00A757},
+{0x00A758, 0x00A759},
+{0x00A75A, 0x00A75B},
+{0x00A75C, 0x00A75D},
+{0x00A75E, 0x00A75F},
+{0x00A760, 0x00A761},
+{0x00A762, 0x00A763},
+{0x00A764, 0x00A765},
+{0x00A766, 0x00A767},
+{0x00A768, 0x00A769},
+{0x00A76A, 0x00A76B},
+{0x00A76C, 0x00A76D},
+{0x00A76E, 0x00A76F},
+{0x00A779, 0x00A77A},
+{0x00A77B, 0x00A77C},
+{0x00A77D, 0x001D79},
+{0x00A77E, 0x00A77F},
+{0x00A780, 0x00A781},
+{0x00A782, 0x00A783},
+{0x00A784, 0x00A785},
+{0x00A786, 0x00A787},
+{0x00A78B, 0x00A78C},
+{0x00A78D, 0x000265},
+{0x00A790, 0x00A791},
+{0x00A792, 0x00A793},
+{0x00A796, 0x00A797},
+{0x00A798, 0x00A799},
+{0x00A79A, 0x00A79B},
+{0x00A79C, 0x00A79D},
+{0x00A79E, 0x00A79F},
+{0x00A7A0, 0x00A7A1},
+{0x00A7A2, 0x00A7A3},
+{0x00A7A4, 0x00A7A5},
+{0x00A7A6, 0x00A7A7},
+{0x00A7A8, 0x00A7A9},
+{0x00A7AA, 0x000266},
+{0x00A7AB, 0x00025C},
+{0x00A7AC, 0x000261},
+{0x00A7AD, 0x00026C},
+{0x00A7AE, 0x00026A},
+{0x00A7B0, 0x00029E},
+{0x00A7B1, 0x000287},
+{0x00A7B2, 0x00029D},
+{0x00A7B3, 0x00AB53},
+{0x00A7B4, 0x00A7B5},
+{0x00A7B6, 0x00A7B7},
+{0x00A7B8, 0x00A7B9},
+{0x00A7BA, 0x00A7BB},
+{0x00A7BC, 0x00A7BD},
+{0x00A7BE, 0x00A7BF},
+{0x00A7C0, 0x00A7C1},
+{0x00A7C2, 0x00A7C3},
+{0x00A7C4, 0x00A794},
+{0x00A7C5, 0x000282},
+{0x00A7C6, 0x001D8E},
+{0x00A7C7, 0x00A7C8},
+{0x00A7C9, 0x00A7CA},
+{0x00A7D0, 0x00A7D1},
+{0x00A7D6, 0x00A7D7},
+{0x00A7D8, 0x00A7D9},
+{0x00A7F5, 0x00A7F6},
+{0x00FF21, 0x00FF41},
+{0x00FF22, 0x00FF42},
+{0x00FF23, 0x00FF43},
+{0x00FF24, 0x00FF44},
+{0x00FF25, 0x00FF45},
+{0x00FF26, 0x00FF46},
+{0x00FF27, 0x00FF47},
+{0x00FF28, 0x00FF48},
+{0x00FF29, 0x00FF49},
+{0x00FF2A, 0x00FF4A},
+{0x00FF2B, 0x00FF4B},
+{0x00FF2C, 0x00FF4C},
+{0x00FF2D, 0x00FF4D},
+{0x00FF2E, 0x00FF4E},
+{0x00FF2F, 0x00FF4F},
+{0x00FF30, 0x00FF50},
+{0x00FF31, 0x00FF51},
+{0x00FF32, 0x00FF52},
+{0x00FF33, 0x00FF53},
+{0x00FF34, 0x00FF54},
+{0x00FF35, 0x00FF55},
+{0x00FF36, 0x00FF56},
+{0x00FF37, 0x00FF57},
+{0x00FF38, 0x00FF58},
+{0x00FF39, 0x00FF59},
+{0x00FF3A, 0x00FF5A},
+{0x010400, 0x010428},
+{0x010401, 0x010429},
+{0x010402, 0x01042A},
+{0x010403, 0x01042B},
+{0x010404, 0x01042C},
+{0x010405, 0x01042D},
+{0x010406, 0x01042E},
+{0x010407, 0x01042F},
+{0x010408, 0x010430},
+{0x010409, 0x010431},
+{0x01040A, 0x010432},
+{0x01040B, 0x010433},
+{0x01040C, 0x010434},
+{0x01040D, 0x010435},
+{0x01040E, 0x010436},
+{0x01040F, 0x010437},
+{0x010410, 0x010438},
+{0x010411, 0x010439},
+{0x010412, 0x01043A},
+{0x010413, 0x01043B},
+{0x010414, 0x01043C},
+{0x010415, 0x01043D},
+{0x010416, 0x01043E},
+{0x010417, 0x01043F},
+{0x010418, 0x010440},
+{0x010419, 0x010441},
+{0x01041A, 0x010442},
+{0x01041B, 0x010443},
+{0x01041C, 0x010444},
+{0x01041D, 0x010445},
+{0x01041E, 0x010446},
+{0x01041F, 0x010447},
+{0x010420, 0x010448},
+{0x010421, 0x010449},
+{0x010422, 0x01044A},
+{0x010423, 0x01044B},
+{0x010424, 0x01044C},
+{0x010425, 0x01044D},
+{0x010426, 0x01044E},
+{0x010427, 0x01044F},
+{0x0104B0, 0x0104D8},
+{0x0104B1, 0x0104D9},
+{0x0104B2, 0x0104DA},
+{0x0104B3, 0x0104DB},
+{0x0104B4, 0x0104DC},
+{0x0104B5, 0x0104DD},
+{0x0104B6, 0x0104DE},
+{0x0104B7, 0x0104DF},
+{0x0104B8, 0x0104E0},
+{0x0104B9, 0x0104E1},
+{0x0104BA, 0x0104E2},
+{0x0104BB, 0x0104E3},
+{0x0104BC, 0x0104E4},
+{0x0104BD, 0x0104E5},
+{0x0104BE, 0x0104E6},
+{0x0104BF, 0x0104E7},
+{0x0104C0, 0x0104E8},
+{0x0104C1, 0x0104E9},
+{0x0104C2, 0x0104EA},
+{0x0104C3, 0x0104EB},
+{0x0104C4, 0x0104EC},
+{0x0104C5, 0x0104ED},
+{0x0104C6, 0x0104EE},
+{0x0104C7, 0x0104EF},
+{0x0104C8, 0x0104F0},
+{0x0104C9, 0x0104F1},
+{0x0104CA, 0x0104F2},
+{0x0104CB, 0x0104F3},
+{0x0104CC, 0x0104F4},
+{0x0104CD, 0x0104F5},
+{0x0104CE, 0x0104F6},
+{0x0104CF, 0x0104F7},
+{0x0104D0, 0x0104F8},
+{0x0104D1, 0x0104F9},
+{0x0104D2, 0x0104FA},
+{0x0104D3, 0x0104FB},
+{0x010570, 0x010597},
+{0x010571, 0x010598},
+{0x010572, 0x010599},
+{0x010573, 0x01059A},
+{0x010574, 0x01059B},
+{0x010575, 0x01059C},
+{0x010576, 0x01059D},
+{0x010577, 0x01059E},
+{0x010578, 0x01059F},
+{0x010579, 0x0105A0},
+{0x01057A, 0x0105A1},
+{0x01057C, 0x0105A3},
+{0x01057D, 0x0105A4},
+{0x01057E, 0x0105A5},
+{0x01057F, 0x0105A6},
+{0x010580, 0x0105A7},
+{0x010581, 0x0105A8},
+{0x010582, 0x0105A9},
+{0x010583, 0x0105AA},
+{0x010584, 0x0105AB},
+{0x010585, 0x0105AC},
+{0x010586, 0x0105AD},
+{0x010587, 0x0105AE},
+{0x010588, 0x0105AF},
+{0x010589, 0x0105B0},
+{0x01058A, 0x0105B1},
+{0x01058C, 0x0105B3},
+{0x01058D, 0x0105B4},
+{0x01058E, 0x0105B5},
+{0x01058F, 0x0105B6},
+{0x010590, 0x0105B7},
+{0x010591, 0x0105B8},
+{0x010592, 0x0105B9},
+{0x010594, 0x0105BB},
+{0x010595, 0x0105BC},
+{0x010C80, 0x010CC0},
+{0x010C81, 0x010CC1},
+{0x010C82, 0x010CC2},
+{0x010C83, 0x010CC3},
+{0x010C84, 0x010CC4},
+{0x010C85, 0x010CC5},
+{0x010C86, 0x010CC6},
+{0x010C87, 0x010CC7},
+{0x010C88, 0x010CC8},
+{0x010C89, 0x010CC9},
+{0x010C8A, 0x010CCA},
+{0x010C8B, 0x010CCB},
+{0x010C8C, 0x010CCC},
+{0x010C8D, 0x010CCD},
+{0x010C8E, 0x010CCE},
+{0x010C8F, 0x010CCF},
+{0x010C90, 0x010CD0},
+{0x010C91, 0x010CD1},
+{0x010C92, 0x010CD2},
+{0x010C93, 0x010CD3},
+{0x010C94, 0x010CD4},
+{0x010C95, 0x010CD5},
+{0x010C96, 0x010CD6},
+{0x010C97, 0x010CD7},
+{0x010C98, 0x010CD8},
+{0x010C99, 0x010CD9},
+{0x010C9A, 0x010CDA},
+{0x010C9B, 0x010CDB},
+{0x010C9C, 0x010CDC},
+{0x010C9D, 0x010CDD},
+{0x010C9E, 0x010CDE},
+{0x010C9F, 0x010CDF},
+{0x010CA0, 0x010CE0},
+{0x010CA1, 0x010CE1},
+{0x010CA2, 0x010CE2},
+{0x010CA3, 0x010CE3},
+{0x010CA4, 0x010CE4},
+{0x010CA5, 0x010CE5},
+{0x010CA6, 0x010CE6},
+{0x010CA7, 0x010CE7},
+{0x010CA8, 0x010CE8},
+{0x010CA9, 0x010CE9},
+{0x010CAA, 0x010CEA},
+{0x010CAB, 0x010CEB},
+{0x010CAC, 0x010CEC},
+{0x010CAD, 0x010CED},
+{0x010CAE, 0x010CEE},
+{0x010CAF, 0x010CEF},
+{0x010CB0, 0x010CF0},
+{0x010CB1, 0x010CF1},
+{0x010CB2, 0x010CF2},
+{0x0118A0, 0x0118C0},
+{0x0118A1, 0x0118C1},
+{0x0118A2, 0x0118C2},
+{0x0118A3, 0x0118C3},
+{0x0118A4, 0x0118C4},
+{0x0118A5, 0x0118C5},
+{0x0118A6, 0x0118C6},
+{0x0118A7, 0x0118C7},
+{0x0118A8, 0x0118C8},
+{0x0118A9, 0x0118C9},
+{0x0118AA, 0x0118CA},
+{0x0118AB, 0x0118CB},
+{0x0118AC, 0x0118CC},
+{0x0118AD, 0x0118CD},
+{0x0118AE, 0x0118CE},
+{0x0118AF, 0x0118CF},
+{0x0118B0, 0x0118D0},
+{0x0118B1, 0x0118D1},
+{0x0118B2, 0x0118D2},
+{0x0118B3, 0x0118D3},
+{0x0118B4, 0x0118D4},
+{0x0118B5, 0x0118D5},
+{0x0118B6, 0x0118D6},
+{0x0118B7, 0x0118D7},
+{0x0118B8, 0x0118D8},
+{0x0118B9, 0x0118D9},
+{0x0118BA, 0x0118DA},
+{0x0118BB, 0x0118DB},
+{0x0118BC, 0x0118DC},
+{0x0118BD, 0x0118DD},
+{0x0118BE, 0x0118DE},
+{0x0118BF, 0x0118DF},
+{0x016E40, 0x016E60},
+{0x016E41, 0x016E61},
+{0x016E42, 0x016E62},
+{0x016E43, 0x016E63},
+{0x016E44, 0x016E64},
+{0x016E45, 0x016E65},
+{0x016E46, 0x016E66},
+{0x016E47, 0x016E67},
+{0x016E48, 0x016E68},
+{0x016E49, 0x016E69},
+{0x016E4A, 0x016E6A},
+{0x016E4B, 0x016E6B},
+{0x016E4C, 0x016E6C},
+{0x016E4D, 0x016E6D},
+{0x016E4E, 0x016E6E},
+{0x016E4F, 0x016E6F},
+{0x016E50, 0x016E70},
+{0x016E51, 0x016E71},
+{0x016E52, 0x016E72},
+{0x016E53, 0x016E73},
+{0x016E54, 0x016E74},
+{0x016E55, 0x016E75},
+{0x016E56, 0x016E76},
+{0x016E57, 0x016E77},
+{0x016E58, 0x016E78},
+{0x016E59, 0x016E79},
+{0x016E5A, 0x016E7A},
+{0x016E5B, 0x016E7B},
+{0x016E5C, 0x016E7C},
+{0x016E5D, 0x016E7D},
+{0x016E5E, 0x016E7E},
+{0x016E5F, 0x016E7F},
+{0x01E900, 0x01E922},
+{0x01E901, 0x01E923},
+{0x01E902, 0x01E924},
+{0x01E903, 0x01E925},
+{0x01E904, 0x01E926},
+{0x01E905, 0x01E927},
+{0x01E906, 0x01E928},
+{0x01E907, 0x01E929},
+{0x01E908, 0x01E92A},
+{0x01E909, 0x01E92B},
+{0x01E90A, 0x01E92C},
+{0x01E90B, 0x01E92D},
+{0x01E90C, 0x01E92E},
+{0x01E90D, 0x01E92F},
+{0x01E90E, 0x01E930},
+{0x01E90F, 0x01E931},
+{0x01E910, 0x01E932},
+{0x01E911, 0x01E933},
+{0x01E912, 0x01E934},
+{0x01E913, 0x01E935},
+{0x01E914, 0x01E936},
+{0x01E915, 0x01E937},
+{0x01E916, 0x01E938},
+{0x01E917, 0x01E939},
+{0x01E918, 0x01E93A},
+{0x01E919, 0x01E93B},
+{0x01E91A, 0x01E93C},
+{0x01E91B, 0x01E93D},
+{0x01E91C, 0x01E93E},
+{0x01E91D, 0x01E93F},
+{0x01E91E, 0x01E940},
+{0x01E91F, 0x01E941},
+{0x01E920, 0x01E942},
+{0x01E921, 0x01E943},
+};
+
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
+{0x000061, 0x000041},
+{0x000062, 0x000042},
+{0x000063, 0x000043},
+{0x000064, 0x000044},
+{0x000065, 0x000045},
+{0x000066, 0x000046},
+{0x000067, 0x000047},
+{0x000068, 0x000048},
+{0x000069, 0x000049},
+{0x00006A, 0x00004A},
+{0x00006B, 0x00004B},
+{0x00006C, 0x00004C},
+{0x00006D, 0x00004D},
+{0x00006E, 0x00004E},
+{0x00006F, 0x00004F},
+{0x000070, 0x000050},
+{0x000071, 0x000051},
+{0x000072, 0x000052},
+{0x000073, 0x000053},
+{0x000074, 0x000054},
+{0x000075, 0x000055},
+{0x000076, 0x000056},
+{0x000077, 0x000057},
+{0x000078, 0x000058},
+{0x000079, 0x000059},
+{0x00007A, 0x00005A},
+{0x0000B5, 0x00039C},
+{0x0000E0, 0x0000C0},
+{0x0000E1, 0x0000C1},
+{0x0000E2, 0x0000C2},
+{0x0000E3, 0x0000C3},
+{0x0000E4, 0x0000C4},
+{0x0000E5, 0x0000C5},
+{0x0000E6, 0x0000C6},
+{0x0000E7, 0x0000C7},
+{0x0000E8, 0x0000C8},
+{0x0000E9, 0x0000C9},
+{0x0000EA, 0x0000CA},
+{0x0000EB, 0x0000CB},
+{0x0000EC, 0x0000CC},
+{0x0000ED, 0x0000CD},
+{0x0000EE, 0x0000CE},
+{0x0000EF, 0x0000CF},
+{0x0000F0, 0x0000D0},
+{0x0000F1, 0x0000D1},
+{0x0000F2, 0x0000D2},
+{0x0000F3, 0x0000D3},
+{0x0000F4, 0x0000D4},
+{0x0000F5, 0x0000D5},
+{0x0000F6, 0x0000D6},
+{0x0000F8, 0x0000D8},
+{0x0000F9, 0x0000D9},
+{0x0000FA, 0x0000DA},
+{0x0000FB, 0x0000DB},
+{0x0000FC, 0x0000DC},
+{0x0000FD, 0x0000DD},
+{0x0000FE, 0x0000DE},
+{0x0000FF, 0x000178},
+{0x000101, 0x000100},
+{0x000103, 0x000102},
+{0x000105, 0x000104},
+{0x000107, 0x000106},
+{0x000109, 0x000108},
+{0x00010B, 0x00010A},
+{0x00010D, 0x00010C},
+{0x00010F, 0x00010E},
+{0x000111, 0x000110},
+{0x000113, 0x000112},
+{0x000115, 0x000114},
+{0x000117, 0x000116},
+{0x000119, 0x000118},
+{0x00011B, 0x00011A},
+{0x00011D, 0x00011C},
+{0x00011F, 0x00011E},
+{0x000121, 0x000120},
+{0x000123, 0x000122},
+{0x000125, 0x000124},
+{0x000127, 0x000126},
+{0x000129, 0x000128},
+{0x00012B, 0x00012A},
+{0x00012D, 0x00012C},
+{0x00012F, 0x00012E},
+{0x000131, 0x000049},
+{0x000133, 0x000132},
+{0x000135, 0x000134},
+{0x000137, 0x000136},
+{0x00013A, 0x000139},
+{0x00013C, 0x00013B},
+{0x00013E, 0x00013D},
+{0x000140, 0x00013F},
+{0x000142, 0x000141},
+{0x000144, 0x000143},
+{0x000146, 0x000145},
+{0x000148, 0x000147},
+{0x00014B, 0x00014A},
+{0x00014D, 0x00014C},
+{0x00014F, 0x00014E},
+{0x000151, 0x000150},
+{0x000153, 0x000152},
+{0x000155, 0x000154},
+{0x000157, 0x000156},
+{0x000159, 0x000158},
+{0x00015B, 0x00015A},
+{0x00015D, 0x00015C},
+{0x00015F, 0x00015E},
+{0x000161, 0x000160},
+{0x000163, 0x000162},
+{0x000165, 0x000164},
+{0x000167, 0x000166},
+{0x000169, 0x000168},
+{0x00016B, 0x00016A},
+{0x00016D, 0x00016C},
+{0x00016F, 0x00016E},
+{0x000171, 0x000170},
+{0x000173, 0x000172},
+{0x000175, 0x000174},
+{0x000177, 0x000176},
+{0x00017A, 0x000179},
+{0x00017C, 0x00017B},
+{0x00017E, 0x00017D},
+{0x00017F, 0x000053},
+{0x000180, 0x000243},
+{0x000183, 0x000182},
+{0x000185, 0x000184},
+{0x000188, 0x000187},
+{0x00018C, 0x00018B},
+{0x000192, 0x000191},
+{0x000195, 0x0001F6},
+{0x000199, 0x000198},
+{0x00019A, 0x00023D},
+{0x00019E, 0x000220},
+{0x0001A1, 0x0001A0},
+{0x0001A3, 0x0001A2},
+{0x0001A5, 0x0001A4},
+{0x0001A8, 0x0001A7},
+{0x0001AD, 0x0001AC},
+{0x0001B0, 0x0001AF},
+{0x0001B4, 0x0001B3},
+{0x0001B6, 0x0001B5},
+{0x0001B9, 0x0001B8},
+{0x0001BD, 0x0001BC},
+{0x0001BF, 0x0001F7},
+{0x0001C5, 0x0001C4},
+{0x0001C6, 0x0001C4},
+{0x0001C8, 0x0001C7},
+{0x0001C9, 0x0001C7},
+{0x0001CB, 0x0001CA},
+{0x0001CC, 0x0001CA},
+{0x0001CE, 0x0001CD},
+{0x0001D0, 0x0001CF},
+{0x0001D2, 0x0001D1},
+{0x0001D4, 0x0001D3},
+{0x0001D6, 0x0001D5},
+{0x0001D8, 0x0001D7},
+{0x0001DA, 0x0001D9},
+{0x0001DC, 0x0001DB},
+{0x0001DD, 0x00018E},
+{0x0001DF, 0x0001DE},
+{0x0001E1, 0x0001E0},
+{0x0001E3, 0x0001E2},
+{0x0001E5, 0x0001E4},
+{0x0001E7, 0x0001E6},
+{0x0001E9, 0x0001E8},
+{0x0001EB, 0x0001EA},
+{0x0001ED, 0x0001EC},
+{0x0001EF, 0x0001EE},
+{0x0001F2, 0x0001F1},
+{0x0001F3, 0x0001F1},
+{0x0001F5, 0x0001F4},
+{0x0001F9, 0x0001F8},
+{0x0001FB, 0x0001FA},
+{0x0001FD, 0x0001FC},
+{0x0001FF, 0x0001FE},
+{0x000201, 0x000200},
+{0x000203, 0x000202},
+{0x000205, 0x000204},
+{0x000207, 0x000206},
+{0x000209, 0x000208},
+{0x00020B, 0x00020A},
+{0x00020D, 0x00020C},
+{0x00020F, 0x00020E},
+{0x000211, 0x000210},
+{0x000213, 0x000212},
+{0x000215, 0x000214},
+{0x000217, 0x000216},
+{0x000219, 0x000218},
+{0x00021B, 0x00021A},
+{0x00021D, 0x00021C},
+{0x00021F, 0x00021E},
+{0x000223, 0x000222},
+{0x000225, 0x000224},
+{0x000227, 0x000226},
+{0x000229, 0x000228},
+{0x00022B, 0x00022A},
+{0x00022D, 0x00022C},
+{0x00022F, 0x00022E},
+{0x000231, 0x000230},
+{0x000233, 0x000232},
+{0x00023C, 0x00023B},
+{0x00023F, 0x002C7E},
+{0x000240, 0x002C7F},
+{0x000242, 0x000241},
+{0x000247, 0x000246},
+{0x000249, 0x000248},
+{0x00024B, 0x00024A},
+{0x00024D, 0x00024C},
+{0x00024F, 0x00024E},
+{0x000250, 0x002C6F},
+{0x000251, 0x002C6D},
+{0x000252, 0x002C70},
+{0x000253, 0x000181},
+{0x000254, 0x000186},
+{0x000256, 0x000189},
+{0x000257, 0x00018A},
+{0x000259, 0x00018F},
+{0x00025B, 0x000190},
+{0x00025C, 0x00A7AB},
+{0x000260, 0x000193},
+{0x000261, 0x00A7AC},
+{0x000263, 0x000194},
+{0x000265, 0x00A78D},
+{0x000266, 0x00A7AA},
+{0x000268, 0x000197},
+{0x000269, 0x000196},
+{0x00026A, 0x00A7AE},
+{0x00026B, 0x002C62},
+{0x00026C, 0x00A7AD},
+{0x00026F, 0x00019C},
+{0x000271, 0x002C6E},
+{0x000272, 0x00019D},
+{0x000275, 0x00019F},
+{0x00027D, 0x002C64},
+{0x000280, 0x0001A6},
+{0x000282, 0x00A7C5},
+{0x000283, 0x0001A9},
+{0x000287, 0x00A7B1},
+{0x000288, 0x0001AE},
+{0x000289, 0x000244},
+{0x00028A, 0x0001B1},
+{0x00028B, 0x0001B2},
+{0x00028C, 0x000245},
+{0x000292, 0x0001B7},
+{0x00029D, 0x00A7B2},
+{0x00029E, 0x00A7B0},
+{0x000345, 0x000399},
+{0x000371, 0x000370},
+{0x000373, 0x000372},
+{0x000377, 0x000376},
+{0x00037B, 0x0003FD},
+{0x00037C, 0x0003FE},
+{0x00037D, 0x0003FF},
+{0x0003AC, 0x000386},
+{0x0003AD, 0x000388},
+{0x0003AE, 0x000389},
+{0x0003AF, 0x00038A},
+{0x0003B1, 0x000391},
+{0x0003B2, 0x000392},
+{0x0003B3, 0x000393},
+{0x0003B4, 0x000394},
+{0x0003B5, 0x000395},
+{0x0003B6, 0x000396},
+{0x0003B7, 0x000397},
+{0x0003B8, 0x000398},
+{0x0003B9, 0x000399},
+{0x0003BA, 0x00039A},
+{0x0003BB, 0x00039B},
+{0x0003BC, 0x00039C},
+{0x0003BD, 0x00039D},
+{0x0003BE, 0x00039E},
+{0x0003BF, 0x00039F},
+{0x0003C0, 0x0003A0},
+{0x0003C1, 0x0003A1},
+{0x0003C2, 0x0003A3},
+{0x0003C3, 0x0003A3},
+{0x0003C4, 0x0003A4},
+{0x0003C5, 0x0003A5},
+{0x0003C6, 0x0003A6},
+{0x0003C7, 0x0003A7},
+{0x0003C8, 0x0003A8},
+{0x0003C9, 0x0003A9},
+{0x0003CA, 0x0003AA},
+{0x0003CB, 0x0003AB},
+{0x0003CC, 0x00038C},
+{0x0003CD, 0x00038E},
+{0x0003CE, 0x00038F},
+{0x0003D0, 0x000392},
+{0x0003D1, 0x000398},
+{0x0003D5, 0x0003A6},
+{0x0003D6, 0x0003A0},
+{0x0003D7, 0x0003CF},
+{0x0003D9, 0x0003D8},
+{0x0003DB, 0x0003DA},
+{0x0003DD, 0x0003DC},
+{0x0003DF, 0x0003DE},
+{0x0003E1, 0x0003E0},
+{0x0003E3, 0x0003E2},
+{0x0003E5, 0x0003E4},
+{0x0003E7, 0x0003E6},
+{0x0003E9, 0x0003E8},
+{0x0003EB, 0x0003EA},
+{0x0003ED, 0x0003EC},
+{0x0003EF, 0x0003EE},
+{0x0003F0, 0x00039A},
+{0x0003F1, 0x0003A1},
+{0x0003F2, 0x0003F9},
+{0x0003F3, 0x00037F},
+{0x0003F5, 0x000395},
+{0x0003F8, 0x0003F7},
+{0x0003FB, 0x0003FA},
+{0x000430, 0x000410},
+{0x000431, 0x000411},
+{0x000432, 0x000412},
+{0x000433, 0x000413},
+{0x000434, 0x000414},
+{0x000435, 0x000415},
+{0x000436, 0x000416},
+{0x000437, 0x000417},
+{0x000438, 0x000418},
+{0x000439, 0x000419},
+{0x00043A, 0x00041A},
+{0x00043B, 0x00041B},
+{0x00043C, 0x00041C},
+{0x00043D, 0x00041D},
+{0x00043E, 0x00041E},
+{0x00043F, 0x00041F},
+{0x000440, 0x000420},
+{0x000441, 0x000421},
+{0x000442, 0x000422},
+{0x000443, 0x000423},
+{0x000444, 0x000424},
+{0x000445, 0x000425},
+{0x000446, 0x000426},
+{0x000447, 0x000427},
+{0x000448, 0x000428},
+{0x000449, 0x000429},
+{0x00044A, 0x00042A},
+{0x00044B, 0x00042B},
+{0x00044C, 0x00042C},
+{0x00044D, 0x00042D},
+{0x00044E, 0x00042E},
+{0x00044F, 0x00042F},
+{0x000450, 0x000400},
+{0x000451, 0x000401},
+{0x000452, 0x000402},
+{0x000453, 0x000403},
+{0x000454, 0x000404},
+{0x000455, 0x000405},
+{0x000456, 0x000406},
+{0x000457, 0x000407},
+{0x000458, 0x000408},
+{0x000459, 0x000409},
+{0x00045A, 0x00040A},
+{0x00045B, 0x00040B},
+{0x00045C, 0x00040C},
+{0x00045D, 0x00040D},
+{0x00045E, 0x00040E},
+{0x00045F, 0x00040F},
+{0x000461, 0x000460},
+{0x000463, 0x000462},
+{0x000465, 0x000464},
+{0x000467, 0x000466},
+{0x000469, 0x000468},
+{0x00046B, 0x00046A},
+{0x00046D, 0x00046C},
+{0x00046F, 0x00046E},
+{0x000471, 0x000470},
+{0x000473, 0x000472},
+{0x000475, 0x000474},
+{0x000477, 0x000476},
+{0x000479, 0x000478},
+{0x00047B, 0x00047A},
+{0x00047D, 0x00047C},
+{0x00047F, 0x00047E},
+{0x000481, 0x000480},
+{0x00048B, 0x00048A},
+{0x00048D, 0x00048C},
+{0x00048F, 0x00048E},
+{0x000491, 0x000490},
+{0x000493, 0x000492},
+{0x000495, 0x000494},
+{0x000497, 0x000496},
+{0x000499, 0x000498},
+{0x00049B, 0x00049A},
+{0x00049D, 0x00049C},
+{0x00049F, 0x00049E},
+{0x0004A1, 0x0004A0},
+{0x0004A3, 0x0004A2},
+{0x0004A5, 0x0004A4},
+{0x0004A7, 0x0004A6},
+{0x0004A9, 0x0004A8},
+{0x0004AB, 0x0004AA},
+{0x0004AD, 0x0004AC},
+{0x0004AF, 0x0004AE},
+{0x0004B1, 0x0004B0},
+{0x0004B3, 0x0004B2},
+{0x0004B5, 0x0004B4},
+{0x0004B7, 0x0004B6},
+{0x0004B9, 0x0004B8},
+{0x0004BB, 0x0004BA},
+{0x0004BD, 0x0004BC},
+{0x0004BF, 0x0004BE},
+{0x0004C2, 0x0004C1},
+{0x0004C4, 0x0004C3},
+{0x0004C6, 0x0004C5},
+{0x0004C8, 0x0004C7},
+{0x0004CA, 0x0004C9},
+{0x0004CC, 0x0004CB},
+{0x0004CE, 0x0004CD},
+{0x0004CF, 0x0004C0},
+{0x0004D1, 0x0004D0},
+{0x0004D3, 0x0004D2},
+{0x0004D5, 0x0004D4},
+{0x0004D7, 0x0004D6},
+{0x0004D9, 0x0004D8},
+{0x0004DB, 0x0004DA},
+{0x0004DD, 0x0004DC},
+{0x0004DF, 0x0004DE},
+{0x0004E1, 0x0004E0},
+{0x0004E3, 0x0004E2},
+{0x0004E5, 0x0004E4},
+{0x0004E7, 0x0004E6},
+{0x0004E9, 0x0004E8},
+{0x0004EB, 0x0004EA},
+{0x0004ED, 0x0004EC},
+{0x0004EF, 0x0004EE},
+{0x0004F1, 0x0004F0},
+{0x0004F3, 0x0004F2},
+{0x0004F5, 0x0004F4},
+{0x0004F7, 0x0004F6},
+{0x0004F9, 0x0004F8},
+{0x0004FB, 0x0004FA},
+{0x0004FD, 0x0004FC},
+{0x0004FF, 0x0004FE},
+{0x000501, 0x000500},
+{0x000503, 0x000502},
+{0x000505, 0x000504},
+{0x000507, 0x000506},
+{0x000509, 0x000508},
+{0x00050B, 0x00050A},
+{0x00050D, 0x00050C},
+{0x00050F, 0x00050E},
+{0x000511, 0x000510},
+{0x000513, 0x000512},
+{0x000515, 0x000514},
+{0x000517, 0x000516},
+{0x000519, 0x000518},
+{0x00051B, 0x00051A},
+{0x00051D, 0x00051C},
+{0x00051F, 0x00051E},
+{0x000521, 0x000520},
+{0x000523, 0x000522},
+{0x000525, 0x000524},
+{0x000527, 0x000526},
+{0x000529, 0x000528},
+{0x00052B, 0x00052A},
+{0x00052D, 0x00052C},
+{0x00052F, 0x00052E},
+{0x000561, 0x000531},
+{0x000562, 0x000532},
+{0x000563, 0x000533},
+{0x000564, 0x000534},
+{0x000565, 0x000535},
+{0x000566, 0x000536},
+{0x000567, 0x000537},
+{0x000568, 0x000538},
+{0x000569, 0x000539},
+{0x00056A, 0x00053A},
+{0x00056B, 0x00053B},
+{0x00056C, 0x00053C},
+{0x00056D, 0x00053D},
+{0x00056E, 0x00053E},
+{0x00056F, 0x00053F},
+{0x000570, 0x000540},
+{0x000571, 0x000541},
+{0x000572, 0x000542},
+{0x000573, 0x000543},
+{0x000574, 0x000544},
+{0x000575, 0x000545},
+{0x000576, 0x000546},
+{0x000577, 0x000547},
+{0x000578, 0x000548},
+{0x000579, 0x000549},
+{0x00057A, 0x00054A},
+{0x00057B, 0x00054B},
+{0x00057C, 0x00054C},
+{0x00057D, 0x00054D},
+{0x00057E, 0x00054E},
+{0x00057F, 0x00054F},
+{0x000580, 0x000550},
+{0x000581, 0x000551},
+{0x000582, 0x000552},
+{0x000583, 0x000553},
+{0x000584, 0x000554},
+{0x000585, 0x000555},
+{0x000586, 0x000556},
+{0x0010D0, 0x001C90},
+{0x0010D1, 0x001C91},
+{0x0010D2, 0x001C92},
+{0x0010D3, 0x001C93},
+{0x0010D4, 0x001C94},
+{0x0010D5, 0x001C95},
+{0x0010D6, 0x001C96},
+{0x0010D7, 0x001C97},
+{0x0010D8, 0x001C98},
+{0x0010D9, 0x001C99},
+{0x0010DA, 0x001C9A},
+{0x0010DB, 0x001C9B},
+{0x0010DC, 0x001C9C},
+{0x0010DD, 0x001C9D},
+{0x0010DE, 0x001C9E},
+{0x0010DF, 0x001C9F},
+{0x0010E0, 0x001CA0},
+{0x0010E1, 0x001CA1},
+{0x0010E2, 0x001CA2},
+{0x0010E3, 0x001CA3},
+{0x0010E4, 0x001CA4},
+{0x0010E5, 0x001CA5},
+{0x0010E6, 0x001CA6},
+{0x0010E7, 0x001CA7},
+{0x0010E8, 0x001CA8},
+{0x0010E9, 0x001CA9},
+{0x0010EA, 0x001CAA},
+{0x0010EB, 0x001CAB},
+{0x0010EC, 0x001CAC},
+{0x0010ED, 0x001CAD},
+{0x0010EE, 0x001CAE},
+{0x0010EF, 0x001CAF},
+{0x0010F0, 0x001CB0},
+{0x0010F1, 0x001CB1},
+{0x0010F2, 0x001CB2},
+{0x0010F3, 0x001CB3},
+{0x0010F4, 0x001CB4},
+{0x0010F5, 0x001CB5},
+{0x0010F6, 0x001CB6},
+{0x0010F7, 0x001CB7},
+{0x0010F8, 0x001CB8},
+{0x0010F9, 0x001CB9},
+{0x0010FA, 0x001CBA},
+{0x0010FD, 0x001CBD},
+{0x0010FE, 0x001CBE},
+{0x0010FF, 0x001CBF},
+{0x0013F8, 0x0013F0},
+{0x0013F9, 0x0013F1},
+{0x0013FA, 0x0013F2},
+{0x0013FB, 0x0013F3},
+{0x0013FC, 0x0013F4},
+{0x0013FD, 0x0013F5},
+{0x001C80, 0x000412},
+{0x001C81, 0x000414},
+{0x001C82, 0x00041E},
+{0x001C83, 0x000421},
+{0x001C84, 0x000422},
+{0x001C85, 0x000422},
+{0x001C86, 0x00042A},
+{0x001C87, 0x000462},
+{0x001C88, 0x00A64A},
+{0x001D79, 0x00A77D},
+{0x001D7D, 0x002C63},
+{0x001D8E, 0x00A7C6},
+{0x001E01, 0x001E00},
+{0x001E03, 0x001E02},
+{0x001E05, 0x001E04},
+{0x001E07, 0x001E06},
+{0x001E09, 0x001E08},
+{0x001E0B, 0x001E0A},
+{0x001E0D, 0x001E0C},
+{0x001E0F, 0x001E0E},
+{0x001E11, 0x001E10},
+{0x001E13, 0x001E12},
+{0x001E15, 0x001E14},
+{0x001E17, 0x001E16},
+{0x001E19, 0x001E18},
+{0x001E1B, 0x001E1A},
+{0x001E1D, 0x001E1C},
+{0x001E1F, 0x001E1E},
+{0x001E21, 0x001E20},
+{0x001E23, 0x001E22},
+{0x001E25, 0x001E24},
+{0x001E27, 0x001E26},
+{0x001E29, 0x001E28},
+{0x001E2B, 0x001E2A},
+{0x001E2D, 0x001E2C},
+{0x001E2F, 0x001E2E},
+{0x001E31, 0x001E30},
+{0x001E33, 0x001E32},
+{0x001E35, 0x001E34},
+{0x001E37, 0x001E36},
+{0x001E39, 0x001E38},
+{0x001E3B, 0x001E3A},
+{0x001E3D, 0x001E3C},
+{0x001E3F, 0x001E3E},
+{0x001E41, 0x001E40},
+{0x001E43, 0x001E42},
+{0x001E45, 0x001E44},
+{0x001E47, 0x001E46},
+{0x001E49, 0x001E48},
+{0x001E4B, 0x001E4A},
+{0x001E4D, 0x001E4C},
+{0x001E4F, 0x001E4E},
+{0x001E51, 0x001E50},
+{0x001E53, 0x001E52},
+{0x001E55, 0x001E54},
+{0x001E57, 0x001E56},
+{0x001E59, 0x001E58},
+{0x001E5B, 0x001E5A},
+{0x001E5D, 0x001E5C},
+{0x001E5F, 0x001E5E},
+{0x001E61, 0x001E60},
+{0x001E63, 0x001E62},
+{0x001E65, 0x001E64},
+{0x001E67, 0x001E66},
+{0x001E69, 0x001E68},
+{0x001E6B, 0x001E6A},
+{0x001E6D, 0x001E6C},
+{0x001E6F, 0x001E6E},
+{0x001E71, 0x001E70},
+{0x001E73, 0x001E72},
+{0x001E75, 0x001E74},
+{0x001E77, 0x001E76},
+{0x001E79, 0x001E78},
+{0x001E7B, 0x001E7A},
+{0x001E7D, 0x001E7C},
+{0x001E7F, 0x001E7E},
+{0x001E81, 0x001E80},
+{0x001E83, 0x001E82},
+{0x001E85, 0x001E84},
+{0x001E87, 0x001E86},
+{0x001E89, 0x001E88},
+{0x001E8B, 0x001E8A},
+{0x001E8D, 0x001E8C},
+{0x001E8F, 0x001E8E},
+{0x001E91, 0x001E90},
+{0x001E93, 0x001E92},
+{0x001E95, 0x001E94},
+{0x001E9B, 0x001E60},
+{0x001EA1, 0x001EA0},
+{0x001EA3, 0x001EA2},
+{0x001EA5, 0x001EA4},
+{0x001EA7, 0x001EA6},
+{0x001EA9, 0x001EA8},
+{0x001EAB, 0x001EAA},
+{0x001EAD, 0x001EAC},
+{0x001EAF, 0x001EAE},
+{0x001EB1, 0x001EB0},
+{0x001EB3, 0x001EB2},
+{0x001EB5, 0x001EB4},
+{0x001EB7, 0x001EB6},
+{0x001EB9, 0x001EB8},
+{0x001EBB, 0x001EBA},
+{0x001EBD, 0x001EBC},
+{0x001EBF, 0x001EBE},
+{0x001EC1, 0x001EC0},
+{0x001EC3, 0x001EC2},
+{0x001EC5, 0x001EC4},
+{0x001EC7, 0x001EC6},
+{0x001EC9, 0x001EC8},
+{0x001ECB, 0x001ECA},
+{0x001ECD, 0x001ECC},
+{0x001ECF, 0x001ECE},
+{0x001ED1, 0x001ED0},
+{0x001ED3, 0x001ED2},
+{0x001ED5, 0x001ED4},
+{0x001ED7, 0x001ED6},
+{0x001ED9, 0x001ED8},
+{0x001EDB, 0x001EDA},
+{0x001EDD, 0x001EDC},
+{0x001EDF, 0x001EDE},
+{0x001EE1, 0x001EE0},
+{0x001EE3, 0x001EE2},
+{0x001EE5, 0x001EE4},
+{0x001EE7, 0x001EE6},
+{0x001EE9, 0x001EE8},
+{0x001EEB, 0x001EEA},
+{0x001EED, 0x001EEC},
+{0x001EEF, 0x001EEE},
+{0x001EF1, 0x001EF0},
+{0x001EF3, 0x001EF2},
+{0x001EF5, 0x001EF4},
+{0x001EF7, 0x001EF6},
+{0x001EF9, 0x001EF8},
+{0x001EFB, 0x001EFA},
+{0x001EFD, 0x001EFC},
+{0x001EFF, 0x001EFE},
+{0x001F00, 0x001F08},
+{0x001F01, 0x001F09},
+{0x001F02, 0x001F0A},
+{0x001F03, 0x001F0B},
+{0x001F04, 0x001F0C},
+{0x001F05, 0x001F0D},
+{0x001F06, 0x001F0E},
+{0x001F07, 0x001F0F},
+{0x001F10, 0x001F18},
+{0x001F11, 0x001F19},
+{0x001F12, 0x001F1A},
+{0x001F13, 0x001F1B},
+{0x001F14, 0x001F1C},
+{0x001F15, 0x001F1D},
+{0x001F20, 0x001F28},
+{0x001F21, 0x001F29},
+{0x001F22, 0x001F2A},
+{0x001F23, 0x001F2B},
+{0x001F24, 0x001F2C},
+{0x001F25, 0x001F2D},
+{0x001F26, 0x001F2E},
+{0x001F27, 0x001F2F},
+{0x001F30, 0x001F38},
+{0x001F31, 0x001F39},
+{0x001F32, 0x001F3A},
+{0x001F33, 0x001F3B},
+{0x001F34, 0x001F3C},
+{0x001F35, 0x001F3D},
+{0x001F36, 0x001F3E},
+{0x001F37, 0x001F3F},
+{0x001F40, 0x001F48},
+{0x001F41, 0x001F49},
+{0x001F42, 0x001F4A},
+{0x001F43, 0x001F4B},
+{0x001F44, 0x001F4C},
+{0x001F45, 0x001F4D},
+{0x001F51, 0x001F59},
+{0x001F53, 0x001F5B},
+{0x001F55, 0x001F5D},
+{0x001F57, 0x001F5F},
+{0x001F60, 0x001F68},
+{0x001F61, 0x001F69},
+{0x001F62, 0x001F6A},
+{0x001F63, 0x001F6B},
+{0x001F64, 0x001F6C},
+{0x001F65, 0x001F6D},
+{0x001F66, 0x001F6E},
+{0x001F67, 0x001F6F},
+{0x001F70, 0x001FBA},
+{0x001F71, 0x001FBB},
+{0x001F72, 0x001FC8},
+{0x001F73, 0x001FC9},
+{0x001F74, 0x001FCA},
+{0x001F75, 0x001FCB},
+{0x001F76, 0x001FDA},
+{0x001F77, 0x001FDB},
+{0x001F78, 0x001FF8},
+{0x001F79, 0x001FF9},
+{0x001F7A, 0x001FEA},
+{0x001F7B, 0x001FEB},
+{0x001F7C, 0x001FFA},
+{0x001F7D, 0x001FFB},
+{0x001F80, 0x001F88},
+{0x001F81, 0x001F89},
+{0x001F82, 0x001F8A},
+{0x001F83, 0x001F8B},
+{0x001F84, 0x001F8C},
+{0x001F85, 0x001F8D},
+{0x001F86, 0x001F8E},
+{0x001F87, 0x001F8F},
+{0x001F90, 0x001F98},
+{0x001F91, 0x001F99},
+{0x001F92, 0x001F9A},
+{0x001F93, 0x001F9B},
+{0x001F94, 0x001F9C},
+{0x001F95, 0x001F9D},
+{0x001F96, 0x001F9E},
+{0x001F97, 0x001F9F},
+{0x001FA0, 0x001FA8},
+{0x001FA1, 0x001FA9},
+{0x001FA2, 0x001FAA},
+{0x001FA3, 0x001FAB},
+{0x001FA4, 0x001FAC},
+{0x001FA5, 0x001FAD},
+{0x001FA6, 0x001FAE},
+{0x001FA7, 0x001FAF},
+{0x001FB0, 0x001FB8},
+{0x001FB1, 0x001FB9},
+{0x001FB3, 0x001FBC},
+{0x001FBE, 0x000399},
+{0x001FC3, 0x001FCC},
+{0x001FD0, 0x001FD8},
+{0x001FD1, 0x001FD9},
+{0x001FE0, 0x001FE8},
+{0x001FE1, 0x001FE9},
+{0x001FE5, 0x001FEC},
+{0x001FF3, 0x001FFC},
+{0x00214E, 0x002132},
+{0x002170, 0x002160},
+{0x002171, 0x002161},
+{0x002172, 0x002162},
+{0x002173, 0x002163},
+{0x002174, 0x002164},
+{0x002175, 0x002165},
+{0x002176, 0x002166},
+{0x002177, 0x002167},
+{0x002178, 0x002168},
+{0x002179, 0x002169},
+{0x00217A, 0x00216A},
+{0x00217B, 0x00216B},
+{0x00217C, 0x00216C},
+{0x00217D, 0x00216D},
+{0x00217E, 0x00216E},
+{0x00217F, 0x00216F},
+{0x002184, 0x002183},
+{0x0024D0, 0x0024B6},
+{0x0024D1, 0x0024B7},
+{0x0024D2, 0x0024B8},
+{0x0024D3, 0x0024B9},
+{0x0024D4, 0x0024BA},
+{0x0024D5, 0x0024BB},
+{0x0024D6, 0x0024BC},
+{0x0024D7, 0x0024BD},
+{0x0024D8, 0x0024BE},
+{0x0024D9, 0x0024BF},
+{0x0024DA, 0x0024C0},
+{0x0024DB, 0x0024C1},
+{0x0024DC, 0x0024C2},
+{0x0024DD, 0x0024C3},
+{0x0024DE, 0x0024C4},
+{0x0024DF, 0x0024C5},
+{0x0024E0, 0x0024C6},
+{0x0024E1, 0x0024C7},
+{0x0024E2, 0x0024C8},
+{0x0024E3, 0x0024C9},
+{0x0024E4, 0x0024CA},
+{0x0024E5, 0x0024CB},
+{0x0024E6, 0x0024CC},
+{0x0024E7, 0x0024CD},
+{0x0024E8, 0x0024CE},
+{0x0024E9, 0x0024CF},
+{0x002C30, 0x002C00},
+{0x002C31, 0x002C01},
+{0x002C32, 0x002C02},
+{0x002C33, 0x002C03},
+{0x002C34, 0x002C04},
+{0x002C35, 0x002C05},
+{0x002C36, 0x002C06},
+{0x002C37, 0x002C07},
+{0x002C38, 0x002C08},
+{0x002C39, 0x002C09},
+{0x002C3A, 0x002C0A},
+{0x002C3B, 0x002C0B},
+{0x002C3C, 0x002C0C},
+{0x002C3D, 0x002C0D},
+{0x002C3E, 0x002C0E},
+{0x002C3F, 0x002C0F},
+{0x002C40, 0x002C10},
+{0x002C41, 0x002C11},
+{0x002C42, 0x002C12},
+{0x002C43, 0x002C13},
+{0x002C44, 0x002C14},
+{0x002C45, 0x002C15},
+{0x002C46, 0x002C16},
+{0x002C47, 0x002C17},
+{0x002C48, 0x002C18},
+{0x002C49, 0x002C19},
+{0x002C4A, 0x002C1A},
+{0x002C4B, 0x002C1B},
+{0x002C4C, 0x002C1C},
+{0x002C4D, 0x002C1D},
+{0x002C4E, 0x002C1E},
+{0x002C4F, 0x002C1F},
+{0x002C50, 0x002C20},
+{0x002C51, 0x002C21},
+{0x002C52, 0x002C22},
+{0x002C53, 0x002C23},
+{0x002C54, 0x002C24},
+{0x002C55, 0x002C25},
+{0x002C56, 0x002C26},
+{0x002C57, 0x002C27},
+{0x002C58, 0x002C28},
+{0x002C59, 0x002C29},
+{0x002C5A, 0x002C2A},
+{0x002C5B, 0x002C2B},
+{0x002C5C, 0x002C2C},
+{0x002C5D, 0x002C2D},
+{0x002C5E, 0x002C2E},
+{0x002C5F, 0x002C2F},
+{0x002C61, 0x002C60},
+{0x002C65, 0x00023A},
+{0x002C66, 0x00023E},
+{0x002C68, 0x002C67},
+{0x002C6A, 0x002C69},
+{0x002C6C, 0x002C6B},
+{0x002C73, 0x002C72},
+{0x002C76, 0x002C75},
+{0x002C81, 0x002C80},
+{0x002C83, 0x002C82},
+{0x002C85, 0x002C84},
+{0x002C87, 0x002C86},
+{0x002C89, 0x002C88},
+{0x002C8B, 0x002C8A},
+{0x002C8D, 0x002C8C},
+{0x002C8F, 0x002C8E},
+{0x002C91, 0x002C90},
+{0x002C93, 0x002C92},
+{0x002C95, 0x002C94},
+{0x002C97, 0x002C96},
+{0x002C99, 0x002C98},
+{0x002C9B, 0x002C9A},
+{0x002C9D, 0x002C9C},
+{0x002C9F, 0x002C9E},
+{0x002CA1, 0x002CA0},
+{0x002CA3, 0x002CA2},
+{0x002CA5, 0x002CA4},
+{0x002CA7, 0x002CA6},
+{0x002CA9, 0x002CA8},
+{0x002CAB, 0x002CAA},
+{0x002CAD, 0x002CAC},
+{0x002CAF, 0x002CAE},
+{0x002CB1, 0x002CB0},
+{0x002CB3, 0x002CB2},
+{0x002CB5, 0x002CB4},
+{0x002CB7, 0x002CB6},
+{0x002CB9, 0x002CB8},
+{0x002CBB, 0x002CBA},
+{0x002CBD, 0x002CBC},
+{0x002CBF, 0x002CBE},
+{0x002CC1, 0x002CC0},
+{0x002CC3, 0x002CC2},
+{0x002CC5, 0x002CC4},
+{0x002CC7, 0x002CC6},
+{0x002CC9, 0x002CC8},
+{0x002CCB, 0x002CCA},
+{0x002CCD, 0x002CCC},
+{0x002CCF, 0x002CCE},
+{0x002CD1, 0x002CD0},
+{0x002CD3, 0x002CD2},
+{0x002CD5, 0x002CD4},
+{0x002CD7, 0x002CD6},
+{0x002CD9, 0x002CD8},
+{0x002CDB, 0x002CDA},
+{0x002CDD, 0x002CDC},
+{0x002CDF, 0x002CDE},
+{0x002CE1, 0x002CE0},
+{0x002CE3, 0x002CE2},
+{0x002CEC, 0x002CEB},
+{0x002CEE, 0x002CED},
+{0x002CF3, 0x002CF2},
+{0x002D00, 0x0010A0},
+{0x002D01, 0x0010A1},
+{0x002D02, 0x0010A2},
+{0x002D03, 0x0010A3},
+{0x002D04, 0x0010A4},
+{0x002D05, 0x0010A5},
+{0x002D06, 0x0010A6},
+{0x002D07, 0x0010A7},
+{0x002D08, 0x0010A8},
+{0x002D09, 0x0010A9},
+{0x002D0A, 0x0010AA},
+{0x002D0B, 0x0010AB},
+{0x002D0C, 0x0010AC},
+{0x002D0D, 0x0010AD},
+{0x002D0E, 0x0010AE},
+{0x002D0F, 0x0010AF},
+{0x002D10, 0x0010B0},
+{0x002D11, 0x0010B1},
+{0x002D12, 0x0010B2},
+{0x002D13, 0x0010B3},
+{0x002D14, 0x0010B4},
+{0x002D15, 0x0010B5},
+{0x002D16, 0x0010B6},
+{0x002D17, 0x0010B7},
+{0x002D18, 0x0010B8},
+{0x002D19, 0x0010B9},
+{0x002D1A, 0x0010BA},
+{0x002D1B, 0x0010BB},
+{0x002D1C, 0x0010BC},
+{0x002D1D, 0x0010BD},
+{0x002D1E, 0x0010BE},
+{0x002D1F, 0x0010BF},
+{0x002D20, 0x0010C0},
+{0x002D21, 0x0010C1},
+{0x002D22, 0x0010C2},
+{0x002D23, 0x0010C3},
+{0x002D24, 0x0010C4},
+{0x002D25, 0x0010C5},
+{0x002D27, 0x0010C7},
+{0x002D2D, 0x0010CD},
+{0x00A641, 0x00A640},
+{0x00A643, 0x00A642},
+{0x00A645, 0x00A644},
+{0x00A647, 0x00A646},
+{0x00A649, 0x00A648},
+{0x00A64B, 0x00A64A},
+{0x00A64D, 0x00A64C},
+{0x00A64F, 0x00A64E},
+{0x00A651, 0x00A650},
+{0x00A653, 0x00A652},
+{0x00A655, 0x00A654},
+{0x00A657, 0x00A656},
+{0x00A659, 0x00A658},
+{0x00A65B, 0x00A65A},
+{0x00A65D, 0x00A65C},
+{0x00A65F, 0x00A65E},
+{0x00A661, 0x00A660},
+{0x00A663, 0x00A662},
+{0x00A665, 0x00A664},
+{0x00A667, 0x00A666},
+{0x00A669, 0x00A668},
+{0x00A66B, 0x00A66A},
+{0x00A66D, 0x00A66C},
+{0x00A681, 0x00A680},
+{0x00A683, 0x00A682},
+{0x00A685, 0x00A684},
+{0x00A687, 0x00A686},
+{0x00A689, 0x00A688},
+{0x00A68B, 0x00A68A},
+{0x00A68D, 0x00A68C},
+{0x00A68F, 0x00A68E},
+{0x00A691, 0x00A690},
+{0x00A693, 0x00A692},
+{0x00A695, 0x00A694},
+{0x00A697, 0x00A696},
+{0x00A699, 0x00A698},
+{0x00A69B, 0x00A69A},
+{0x00A723, 0x00A722},
+{0x00A725, 0x00A724},
+{0x00A727, 0x00A726},
+{0x00A729, 0x00A728},
+{0x00A72B, 0x00A72A},
+{0x00A72D, 0x00A72C},
+{0x00A72F, 0x00A72E},
+{0x00A733, 0x00A732},
+{0x00A735, 0x00A734},
+{0x00A737, 0x00A736},
+{0x00A739, 0x00A738},
+{0x00A73B, 0x00A73A},
+{0x00A73D, 0x00A73C},
+{0x00A73F, 0x00A73E},
+{0x00A741, 0x00A740},
+{0x00A743, 0x00A742},
+{0x00A745, 0x00A744},
+{0x00A747, 0x00A746},
+{0x00A749, 0x00A748},
+{0x00A74B, 0x00A74A},
+{0x00A74D, 0x00A74C},
+{0x00A74F, 0x00A74E},
+{0x00A751, 0x00A750},
+{0x00A753, 0x00A752},
+{0x00A755, 0x00A754},
+{0x00A757, 0x00A756},
+{0x00A759, 0x00A758},
+{0x00A75B, 0x00A75A},
+{0x00A75D, 0x00A75C},
+{0x00A75F, 0x00A75E},
+{0x00A761, 0x00A760},
+{0x00A763, 0x00A762},
+{0x00A765, 0x00A764},
+{0x00A767, 0x00A766},
+{0x00A769, 0x00A768},
+{0x00A76B, 0x00A76A},
+{0x00A76D, 0x00A76C},
+{0x00A76F, 0x00A76E},
+{0x00A77A, 0x00A779},
+{0x00A77C, 0x00A77B},
+{0x00A77F, 0x00A77E},
+{0x00A781, 0x00A780},
+{0x00A783, 0x00A782},
+{0x00A785, 0x00A784},
+{0x00A787, 0x00A786},
+{0x00A78C, 0x00A78B},
+{0x00A791, 0x00A790},
+{0x00A793, 0x00A792},
+{0x00A794, 0x00A7C4},
+{0x00A797, 0x00A796},
+{0x00A799, 0x00A798},
+{0x00A79B, 0x00A79A},
+{0x00A79D, 0x00A79C},
+{0x00A79F, 0x00A79E},
+{0x00A7A1, 0x00A7A0},
+{0x00A7A3, 0x00A7A2},
+{0x00A7A5, 0x00A7A4},
+{0x00A7A7, 0x00A7A6},
+{0x00A7A9, 0x00A7A8},
+{0x00A7B5, 0x00A7B4},
+{0x00A7B7, 0x00A7B6},
+{0x00A7B9, 0x00A7B8},
+{0x00A7BB, 0x00A7BA},
+{0x00A7BD, 0x00A7BC},
+{0x00A7BF, 0x00A7BE},
+{0x00A7C1, 0x00A7C0},
+{0x00A7C3, 0x00A7C2},
+{0x00A7C8, 0x00A7C7},
+{0x00A7CA, 0x00A7C9},
+{0x00A7D1, 0x00A7D0},
+{0x00A7D7, 0x00A7D6},
+{0x00A7D9, 0x00A7D8},
+{0x00A7F6, 0x00A7F5},
+{0x00AB53, 0x00A7B3},
+{0x00AB70, 0x0013A0},
+{0x00AB71, 0x0013A1},
+{0x00AB72, 0x0013A2},
+{0x00AB73, 0x0013A3},
+{0x00AB74, 0x0013A4},
+{0x00AB75, 0x0013A5},
+{0x00AB76, 0x0013A6},
+{0x00AB77, 0x0013A7},
+{0x00AB78, 0x0013A8},
+{0x00AB79, 0x0013A9},
+{0x00AB7A, 0x0013AA},
+{0x00AB7B, 0x0013AB},
+{0x00AB7C, 0x0013AC},
+{0x00AB7D, 0x0013AD},
+{0x00AB7E, 0x0013AE},
+{0x00AB7F, 0x0013AF},
+{0x00AB80, 0x0013B0},
+{0x00AB81, 0x0013B1},
+{0x00AB82, 0x0013B2},
+{0x00AB83, 0x0013B3},
+{0x00AB84, 0x0013B4},
+{0x00AB85, 0x0013B5},
+{0x00AB86, 0x0013B6},
+{0x00AB87, 0x0013B7},
+{0x00AB88, 0x0013B8},
+{0x00AB89, 0x0013B9},
+{0x00AB8A, 0x0013BA},
+{0x00AB8B, 0x0013BB},
+{0x00AB8C, 0x0013BC},
+{0x00AB8D, 0x0013BD},
+{0x00AB8E, 0x0013BE},
+{0x00AB8F, 0x0013BF},
+{0x00AB90, 0x0013C0},
+{0x00AB91, 0x0013C1},
+{0x00AB92, 0x0013C2},
+{0x00AB93, 0x0013C3},
+{0x00AB94, 0x0013C4},
+{0x00AB95, 0x0013C5},
+{0x00AB96, 0x0013C6},
+{0x00AB97, 0x0013C7},
+{0x00AB98, 0x0013C8},
+{0x00AB99, 0x0013C9},
+{0x00AB9A, 0x0013CA},
+{0x00AB9B, 0x0013CB},
+{0x00AB9C, 0x0013CC},
+{0x00AB9D, 0x0013CD},
+{0x00AB9E, 0x0013CE},
+{0x00AB9F, 0x0013CF},
+{0x00ABA0, 0x0013D0},
+{0x00ABA1, 0x0013D1},
+{0x00ABA2, 0x0013D2},
+{0x00ABA3, 0x0013D3},
+{0x00ABA4, 0x0013D4},
+{0x00ABA5, 0x0013D5},
+{0x00ABA6, 0x0013D6},
+{0x00ABA7, 0x0013D7},
+{0x00ABA8, 0x0013D8},
+{0x00ABA9, 0x0013D9},
+{0x00ABAA, 0x0013DA},
+{0x00ABAB, 0x0013DB},
+{0x00ABAC, 0x0013DC},
+{0x00ABAD, 0x0013DD},
+{0x00ABAE, 0x0013DE},
+{0x00ABAF, 0x0013DF},
+{0x00ABB0, 0x0013E0},
+{0x00ABB1, 0x0013E1},
+{0x00ABB2, 0x0013E2},
+{0x00ABB3, 0x0013E3},
+{0x00ABB4, 0x0013E4},
+{0x00ABB5, 0x0013E5},
+{0x00ABB6, 0x0013E6},
+{0x00ABB7, 0x0013E7},
+{0x00ABB8, 0x0013E8},
+{0x00ABB9, 0x0013E9},
+{0x00ABBA, 0x0013EA},
+{0x00ABBB, 0x0013EB},
+{0x00ABBC, 0x0013EC},
+{0x00ABBD, 0x0013ED},
+{0x00ABBE, 0x0013EE},
+{0x00ABBF, 0x0013EF},
+{0x00FF41, 0x00FF21},
+{0x00FF42, 0x00FF22},
+{0x00FF43, 0x00FF23},
+{0x00FF44, 0x00FF24},
+{0x00FF45, 0x00FF25},
+{0x00FF46, 0x00FF26},
+{0x00FF47, 0x00FF27},
+{0x00FF48, 0x00FF28},
+{0x00FF49, 0x00FF29},
+{0x00FF4A, 0x00FF2A},
+{0x00FF4B, 0x00FF2B},
+{0x00FF4C, 0x00FF2C},
+{0x00FF4D, 0x00FF2D},
+{0x00FF4E, 0x00FF2E},
+{0x00FF4F, 0x00FF2F},
+{0x00FF50, 0x00FF30},
+{0x00FF51, 0x00FF31},
+{0x00FF52, 0x00FF32},
+{0x00FF53, 0x00FF33},
+{0x00FF54, 0x00FF34},
+{0x00FF55, 0x00FF35},
+{0x00FF56, 0x00FF36},
+{0x00FF57, 0x00FF37},
+{0x00FF58, 0x00FF38},
+{0x00FF59, 0x00FF39},
+{0x00FF5A, 0x00FF3A},
+{0x010428, 0x010400},
+{0x010429, 0x010401},
+{0x01042A, 0x010402},
+{0x01042B, 0x010403},
+{0x01042C, 0x010404},
+{0x01042D, 0x010405},
+{0x01042E, 0x010406},
+{0x01042F, 0x010407},
+{0x010430, 0x010408},
+{0x010431, 0x010409},
+{0x010432, 0x01040A},
+{0x010433, 0x01040B},
+{0x010434, 0x01040C},
+{0x010435, 0x01040D},
+{0x010436, 0x01040E},
+{0x010437, 0x01040F},
+{0x010438, 0x010410},
+{0x010439, 0x010411},
+{0x01043A, 0x010412},
+{0x01043B, 0x010413},
+{0x01043C, 0x010414},
+{0x01043D, 0x010415},
+{0x01043E, 0x010416},
+{0x01043F, 0x010417},
+{0x010440, 0x010418},
+{0x010441, 0x010419},
+{0x010442, 0x01041A},
+{0x010443, 0x01041B},
+{0x010444, 0x01041C},
+{0x010445, 0x01041D},
+{0x010446, 0x01041E},
+{0x010447, 0x01041F},
+{0x010448, 0x010420},
+{0x010449, 0x010421},
+{0x01044A, 0x010422},
+{0x01044B, 0x010423},
+{0x01044C, 0x010424},
+{0x01044D, 0x010425},
+{0x01044E, 0x010426},
+{0x01044F, 0x010427},
+{0x0104D8, 0x0104B0},
+{0x0104D9, 0x0104B1},
+{0x0104DA, 0x0104B2},
+{0x0104DB, 0x0104B3},
+{0x0104DC, 0x0104B4},
+{0x0104DD, 0x0104B5},
+{0x0104DE, 0x0104B6},
+{0x0104DF, 0x0104B7},
+{0x0104E0, 0x0104B8},
+{0x0104E1, 0x0104B9},
+{0x0104E2, 0x0104BA},
+{0x0104E3, 0x0104BB},
+{0x0104E4, 0x0104BC},
+{0x0104E5, 0x0104BD},
+{0x0104E6, 0x0104BE},
+{0x0104E7, 0x0104BF},
+{0x0104E8, 0x0104C0},
+{0x0104E9, 0x0104C1},
+{0x0104EA, 0x0104C2},
+{0x0104EB, 0x0104C3},
+{0x0104EC, 0x0104C4},
+{0x0104ED, 0x0104C5},
+{0x0104EE, 0x0104C6},
+{0x0104EF, 0x0104C7},
+{0x0104F0, 0x0104C8},
+{0x0104F1, 0x0104C9},
+{0x0104F2, 0x0104CA},
+{0x0104F3, 0x0104CB},
+{0x0104F4, 0x0104CC},
+{0x0104F5, 0x0104CD},
+{0x0104F6, 0x0104CE},
+{0x0104F7, 0x0104CF},
+{0x0104F8, 0x0104D0},
+{0x0104F9, 0x0104D1},
+{0x0104FA, 0x0104D2},
+{0x0104FB, 0x0104D3},
+{0x010597, 0x010570},
+{0x010598, 0x010571},
+{0x010599, 0x010572},
+{0x01059A, 0x010573},
+{0x01059B, 0x010574},
+{0x01059C, 0x010575},
+{0x01059D, 0x010576},
+{0x01059E, 0x010577},
+{0x01059F, 0x010578},
+{0x0105A0, 0x010579},
+{0x0105A1, 0x01057A},
+{0x0105A3, 0x01057C},
+{0x0105A4, 0x01057D},
+{0x0105A5, 0x01057E},
+{0x0105A6, 0x01057F},
+{0x0105A7, 0x010580},
+{0x0105A8, 0x010581},
+{0x0105A9, 0x010582},
+{0x0105AA, 0x010583},
+{0x0105AB, 0x010584},
+{0x0105AC, 0x010585},
+{0x0105AD, 0x010586},
+{0x0105AE, 0x010587},
+{0x0105AF, 0x010588},
+{0x0105B0, 0x010589},
+{0x0105B1, 0x01058A},
+{0x0105B3, 0x01058C},
+{0x0105B4, 0x01058D},
+{0x0105B5, 0x01058E},
+{0x0105B6, 0x01058F},
+{0x0105B7, 0x010590},
+{0x0105B8, 0x010591},
+{0x0105B9, 0x010592},
+{0x0105BB, 0x010594},
+{0x0105BC, 0x010595},
+{0x010CC0, 0x010C80},
+{0x010CC1, 0x010C81},
+{0x010CC2, 0x010C82},
+{0x010CC3, 0x010C83},
+{0x010CC4, 0x010C84},
+{0x010CC5, 0x010C85},
+{0x010CC6, 0x010C86},
+{0x010CC7, 0x010C87},
+{0x010CC8, 0x010C88},
+{0x010CC9, 0x010C89},
+{0x010CCA, 0x010C8A},
+{0x010CCB, 0x010C8B},
+{0x010CCC, 0x010C8C},
+{0x010CCD, 0x010C8D},
+{0x010CCE, 0x010C8E},
+{0x010CCF, 0x010C8F},
+{0x010CD0, 0x010C90},
+{0x010CD1, 0x010C91},
+{0x010CD2, 0x010C92},
+{0x010CD3, 0x010C93},
+{0x010CD4, 0x010C94},
+{0x010CD5, 0x010C95},
+{0x010CD6, 0x010C96},
+{0x010CD7, 0x010C97},
+{0x010CD8, 0x010C98},
+{0x010CD9, 0x010C99},
+{0x010CDA, 0x010C9A},
+{0x010CDB, 0x010C9B},
+{0x010CDC, 0x010C9C},
+{0x010CDD, 0x010C9D},
+{0x010CDE, 0x010C9E},
+{0x010CDF, 0x010C9F},
+{0x010CE0, 0x010CA0},
+{0x010CE1, 0x010CA1},
+{0x010CE2, 0x010CA2},
+{0x010CE3, 0x010CA3},
+{0x010CE4, 0x010CA4},
+{0x010CE5, 0x010CA5},
+{0x010CE6, 0x010CA6},
+{0x010CE7, 0x010CA7},
+{0x010CE8, 0x010CA8},
+{0x010CE9, 0x010CA9},
+{0x010CEA, 0x010CAA},
+{0x010CEB, 0x010CAB},
+{0x010CEC, 0x010CAC},
+{0x010CED, 0x010CAD},
+{0x010CEE, 0x010CAE},
+{0x010CEF, 0x010CAF},
+{0x010CF0, 0x010CB0},
+{0x010CF1, 0x010CB1},
+{0x010CF2, 0x010CB2},
+{0x0118C0, 0x0118A0},
+{0x0118C1, 0x0118A1},
+{0x0118C2, 0x0118A2},
+{0x0118C3, 0x0118A3},
+{0x0118C4, 0x0118A4},
+{0x0118C5, 0x0118A5},
+{0x0118C6, 0x0118A6},
+{0x0118C7, 0x0118A7},
+{0x0118C8, 0x0118A8},
+{0x0118C9, 0x0118A9},
+{0x0118CA, 0x0118AA},
+{0x0118CB, 0x0118AB},
+{0x0118CC, 0x0118AC},
+{0x0118CD, 0x0118AD},
+{0x0118CE, 0x0118AE},
+{0x0118CF, 0x0118AF},
+{0x0118D0, 0x0118B0},
+{0x0118D1, 0x0118B1},
+{0x0118D2, 0x0118B2},
+{0x0118D3, 0x0118B3},
+{0x0118D4, 0x0118B4},
+{0x0118D5, 0x0118B5},
+{0x0118D6, 0x0118B6},
+{0x0118D7, 0x0118B7},
+{0x0118D8, 0x0118B8},
+{0x0118D9, 0x0118B9},
+{0x0118DA, 0x0118BA},
+{0x0118DB, 0x0118BB},
+{0x0118DC, 0x0118BC},
+{0x0118DD, 0x0118BD},
+{0x0118DE, 0x0118BE},
+{0x0118DF, 0x0118BF},
+{0x016E60, 0x016E40},
+{0x016E61, 0x016E41},
+{0x016E62, 0x016E42},
+{0x016E63, 0x016E43},
+{0x016E64, 0x016E44},
+{0x016E65, 0x016E45},
+{0x016E66, 0x016E46},
+{0x016E67, 0x016E47},
+{0x016E68, 0x016E48},
+{0x016E69, 0x016E49},
+{0x016E6A, 0x016E4A},
+{0x016E6B, 0x016E4B},
+{0x016E6C, 0x016E4C},
+{0x016E6D, 0x016E4D},
+{0x016E6E, 0x016E4E},
+{0x016E6F, 0x016E4F},
+{0x016E70, 0x016E50},
+{0x016E71, 0x016E51},
+{0x016E72, 0x016E52},
+{0x016E73, 0x016E53},
+{0x016E74, 0x016E54},
+{0x016E75, 0x016E55},
+{0x016E76, 0x016E56},
+{0x016E77, 0x016E57},
+{0x016E78, 0x016E58},
+{0x016E79, 0x016E59},
+{0x016E7A, 0x016E5A},
+{0x016E7B, 0x016E5B},
+{0x016E7C, 0x016E5C},
+{0x016E7D, 0x016E5D},
+{0x016E7E, 0x016E5E},
+{0x016E7F, 0x016E5F},
+{0x01E922, 0x01E900},
+{0x01E923, 0x01E901},
+{0x01E924, 0x01E902},
+{0x01E925, 0x01E903},
+{0x01E926, 0x01E904},
+{0x01E927, 0x01E905},
+{0x01E928, 0x01E906},
+{0x01E929, 0x01E907},
+{0x01E92A, 0x01E908},
+{0x01E92B, 0x01E909},
+{0x01E92C, 0x01E90A},
+{0x01E92D, 0x01E90B},
+{0x01E92E, 0x01E90C},
+{0x01E92F, 0x01E90D},
+{0x01E930, 0x01E90E},
+{0x01E931, 0x01E90F},
+{0x01E932, 0x01E910},
+{0x01E933, 0x01E911},
+{0x01E934, 0x01E912},
+{0x01E935, 0x01E913},
+{0x01E936, 0x01E914},
+{0x01E937, 0x01E915},
+{0x01E938, 0x01E916},
+{0x01E939, 0x01E917},
+{0x01E93A, 0x01E918},
+{0x01E93B, 0x01E919},
+{0x01E93C, 0x01E91A},
+{0x01E93D, 0x01E91B},
+{0x01E93E, 0x01E91C},
+{0x01E93F, 0x01E91D},
+{0x01E940, 0x01E91E},
+{0x01E941, 0x01E91F},
+{0x01E942, 0x01E920},
+{0x01E943, 0x01E921},
+};
+
+const std::initializer_list<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
+{0x000000, 0x000000, 0x000000},
+{0x0000C0, 0x0000C5, 0x000041},
+{0x0000C7, 0x0000C7, 0x000043},
+{0x0000C8, 0x0000CB, 0x000045},
+{0x0000CC, 0x0000CF, 0x000049},
+{0x0000D1, 0x0000D1, 0x00004E},
+{0x0000D2, 0x0000D6, 0x00004F},
+{0x0000D9, 0x0000DC, 0x000055},
+{0x0000DD, 0x0000DD, 0x000059},
+{0x0000E0, 0x0000E5, 0x000061},
+{0x0000E7, 0x0000E7, 0x000063},
+{0x0000E8, 0x0000EB, 0x000065},
+{0x0000EC, 0x0000EF, 0x000069},
+{0x0000F1, 0x0000F1, 0x00006E},
+{0x0000F2, 0x0000F6, 0x00006F},
+{0x0000F9, 0x0000FC, 0x000075},
+{0x0000FD, 0x0000FD, 0x000079},
+{0x0000FF, 0x0000FF, 0x000079},
+{0x000100, 0x000100, 0x000041},
+{0x000101, 0x000101, 0x000061},
+{0x000102, 0x000102, 0x000041},
+{0x000103, 0x000103, 0x000061},
+{0x000104, 0x000104, 0x000041},
+{0x000105, 0x000105, 0x000061},
+{0x000106, 0x000106, 0x000043},
+{0x000107, 0x000107, 0x000063},
+{0x000108, 0x000108, 0x000043},
+{0x000109, 0x000109, 0x000063},
+{0x00010A, 0x00010A, 0x000043},
+{0x00010B, 0x00010B, 0x000063},
+{0x00010C, 0x00010C, 0x000043},
+{0x00010D, 0x00010D, 0x000063},
+{0x00010E, 0x00010E, 0x000044},
+{0x00010F, 0x00010F, 0x000064},
+{0x000112, 0x000112, 0x000045},
+{0x000113, 0x000113, 0x000065},
+{0x000114, 0x000114, 0x000045},
+{0x000115, 0x000115, 0x000065},
+{0x000116, 0x000116, 0x000045},
+{0x000117, 0x000117, 0x000065},
+{0x000118, 0x000118, 0x000045},
+{0x000119, 0x000119, 0x000065},
+{0x00011A, 0x00011A, 0x000045},
+{0x00011B, 0x00011B, 0x000065},
+{0x00011C, 0x00011C, 0x000047},
+{0x00011D, 0x00011D, 0x000067},
+{0x00011E, 0x00011E, 0x000047},
+{0x00011F, 0x00011F, 0x000067},
+{0x000120, 0x000120, 0x000047},
+{0x000121, 0x000121, 0x000067},
+{0x000122, 0x000122, 0x000047},
+{0x000123, 0x000123, 0x000067},
+{0x000124, 0x000124, 0x000048},
+{0x000125, 0x000125, 0x000068},
+{0x000128, 0x000128, 0x000049},
+{0x000129, 0x000129, 0x000069},
+{0x00012A, 0x00012A, 0x000049},
+{0x00012B, 0x00012B, 0x000069},
+{0x00012C, 0x00012C, 0x000049},
+{0x00012D, 0x00012D, 0x000069},
+{0x00012E, 0x00012E, 0x000049},
+{0x00012F, 0x00012F, 0x000069},
+{0x000130, 0x000130, 0x000049},
+{0x000134, 0x000134, 0x00004A},
+{0x000135, 0x000135, 0x00006A},
+{0x000136, 0x000136, 0x00004B},
+{0x000137, 0x000137, 0x00006B},
+{0x000139, 0x000139, 0x00004C},
+{0x00013A, 0x00013A, 0x00006C},
+{0x00013B, 0x00013B, 0x00004C},
+{0x00013C, 0x00013C, 0x00006C},
+{0x00013D, 0x00013D, 0x00004C},
+{0x00013E, 0x00013E, 0x00006C},
+{0x000143, 0x000143, 0x00004E},
+{0x000144, 0x000144, 0x00006E},
+{0x000145, 0x000145, 0x00004E},
+{0x000146, 0x000146, 0x00006E},
+{0x000147, 0x000147, 0x00004E},
+{0x000148, 0x000148, 0x00006E},
+{0x00014C, 0x00014C, 0x00004F},
+{0x00014D, 0x00014D, 0x00006F},
+{0x00014E, 0x00014E, 0x00004F},
+{0x00014F, 0x00014F, 0x00006F},
+{0x000150, 0x000150, 0x00004F},
+{0x000151, 0x000151, 0x00006F},
+{0x000154, 0x000154, 0x000052},
+{0x000155, 0x000155, 0x000072},
+{0x000156, 0x000156, 0x000052},
+{0x000157, 0x000157, 0x000072},
+{0x000158, 0x000158, 0x000052},
+{0x000159, 0x000159, 0x000072},
+{0x00015A, 0x00015A, 0x000053},
+{0x00015B, 0x00015B, 0x000073},
+{0x00015C, 0x00015C, 0x000053},
+{0x00015D, 0x00015D, 0x000073},
+{0x00015E, 0x00015E, 0x000053},
+{0x00015F, 0x00015F, 0x000073},
+{0x000160, 0x000160, 0x000053},
+{0x000161, 0x000161, 0x000073},
+{0x000162, 0x000162, 0x000054},
+{0x000163, 0x000163, 0x000074},
+{0x000164, 0x000164, 0x000054},
+{0x000165, 0x000165, 0x000074},
+{0x000168, 0x000168, 0x000055},
+{0x000169, 0x000169, 0x000075},
+{0x00016A, 0x00016A, 0x000055},
+{0x00016B, 0x00016B, 0x000075},
+{0x00016C, 0x00016C, 0x000055},
+{0x00016D, 0x00016D, 0x000075},
+{0x00016E, 0x00016E, 0x000055},
+{0x00016F, 0x00016F, 0x000075},
+{0x000170, 0x000170, 0x000055},
+{0x000171, 0x000171, 0x000075},
+{0x000172, 0x000172, 0x000055},
+{0x000173, 0x000173, 0x000075},
+{0x000174, 0x000174, 0x000057},
+{0x000175, 0x000175, 0x000077},
+{0x000176, 0x000176, 0x000059},
+{0x000177, 0x000177, 0x000079},
+{0x000178, 0x000178, 0x000059},
+{0x000179, 0x000179, 0x00005A},
+{0x00017A, 0x00017A, 0x00007A},
+{0x00017B, 0x00017B, 0x00005A},
+{0x00017C, 0x00017C, 0x00007A},
+{0x00017D, 0x00017D, 0x00005A},
+{0x00017E, 0x00017E, 0x00007A},
+{0x0001A0, 0x0001A0, 0x00004F},
+{0x0001A1, 0x0001A1, 0x00006F},
+{0x0001AF, 0x0001AF, 0x000055},
+{0x0001B0, 0x0001B0, 0x000075},
+{0x0001CD, 0x0001CD, 0x000041},
+{0x0001CE, 0x0001CE, 0x000061},
+{0x0001CF, 0x0001CF, 0x000049},
+{0x0001D0, 0x0001D0, 0x000069},
+{0x0001D1, 0x0001D1, 0x00004F},
+{0x0001D2, 0x0001D2, 0x00006F},
+{0x0001D3, 0x0001D3, 0x000055},
+{0x0001D4, 0x0001D4, 0x000075},
+{0x0001D5, 0x0001D5, 0x000055},
+{0x0001D6, 0x0001D6, 0x000075},
+{0x0001D7, 0x0001D7, 0x000055},
+{0x0001D8, 0x0001D8, 0x000075},
+{0x0001D9, 0x0001D9, 0x000055},
+{0x0001DA, 0x0001DA, 0x000075},
+{0x0001DB, 0x0001DB, 0x000055},
+{0x0001DC, 0x0001DC, 0x000075},
+{0x0001DE, 0x0001DE, 0x000041},
+{0x0001DF, 0x0001DF, 0x000061},
+{0x0001E0, 0x0001E0, 0x000041},
+{0x0001E1, 0x0001E1, 0x000061},
+{0x0001E2, 0x0001E2, 0x0000C6},
+{0x0001E3, 0x0001E3, 0x0000E6},
+{0x0001E6, 0x0001E6, 0x000047},
+{0x0001E7, 0x0001E7, 0x000067},
+{0x0001E8, 0x0001E8, 0x00004B},
+{0x0001E9, 0x0001E9, 0x00006B},
+{0x0001EA, 0x0001EA, 0x00004F},
+{0x0001EB, 0x0001EB, 0x00006F},
+{0x0001EC, 0x0001EC, 0x00004F},
+{0x0001ED, 0x0001ED, 0x00006F},
+{0x0001EE, 0x0001EE, 0x0001B7},
+{0x0001EF, 0x0001EF, 0x000292},
+{0x0001F0, 0x0001F0, 0x00006A},
+{0x0001F4, 0x0001F4, 0x000047},
+{0x0001F5, 0x0001F5, 0x000067},
+{0x0001F8, 0x0001F8, 0x00004E},
+{0x0001F9, 0x0001F9, 0x00006E},
+{0x0001FA, 0x0001FA, 0x000041},
+{0x0001FB, 0x0001FB, 0x000061},
+{0x0001FC, 0x0001FC, 0x0000C6},
+{0x0001FD, 0x0001FD, 0x0000E6},
+{0x0001FE, 0x0001FE, 0x0000D8},
+{0x0001FF, 0x0001FF, 0x0000F8},
+{0x000200, 0x000200, 0x000041},
+{0x000201, 0x000201, 0x000061},
+{0x000202, 0x000202, 0x000041},
+{0x000203, 0x000203, 0x000061},
+{0x000204, 0x000204, 0x000045},
+{0x000205, 0x000205, 0x000065},
+{0x000206, 0x000206, 0x000045},
+{0x000207, 0x000207, 0x000065},
+{0x000208, 0x000208, 0x000049},
+{0x000209, 0x000209, 0x000069},
+{0x00020A, 0x00020A, 0x000049},
+{0x00020B, 0x00020B, 0x000069},
+{0x00020C, 0x00020C, 0x00004F},
+{0x00020D, 0x00020D, 0x00006F},
+{0x00020E, 0x00020E, 0x00004F},
+{0x00020F, 0x00020F, 0x00006F},
+{0x000210, 0x000210, 0x000052},
+{0x000211, 0x000211, 0x000072},
+{0x000212, 0x000212, 0x000052},
+{0x000213, 0x000213, 0x000072},
+{0x000214, 0x000214, 0x000055},
+{0x000215, 0x000215, 0x000075},
+{0x000216, 0x000216, 0x000055},
+{0x000217, 0x000217, 0x000075},
+{0x000218, 0x000218, 0x000053},
+{0x000219, 0x000219, 0x000073},
+{0x00021A, 0x00021A, 0x000054},
+{0x00021B, 0x00021B, 0x000074},
+{0x00021E, 0x00021E, 0x000048},
+{0x00021F, 0x00021F, 0x000068},
+{0x000226, 0x000226, 0x000041},
+{0x000227, 0x000227, 0x000061},
+{0x000228, 0x000228, 0x000045},
+{0x000229, 0x000229, 0x000065},
+{0x00022A, 0x00022A, 0x00004F},
+{0x00022B, 0x00022B, 0x00006F},
+{0x00022C, 0x00022C, 0x00004F},
+{0x00022D, 0x00022D, 0x00006F},
+{0x00022E, 0x00022E, 0x00004F},
+{0x00022F, 0x00022F, 0x00006F},
+{0x000230, 0x000230, 0x00004F},
+{0x000231, 0x000231, 0x00006F},
+{0x000232, 0x000232, 0x000059},
+{0x000233, 0x000233, 0x000079},
+{0x000340, 0x000340, 0x000300},
+{0x000341, 0x000341, 0x000301},
+{0x000343, 0x000343, 0x000313},
+{0x000344, 0x000344, 0x000308},
+{0x000374, 0x000374, 0x0002B9},
+{0x00037E, 0x00037E, 0x00003B},
+{0x000385, 0x000385, 0x0000A8},
+{0x000386, 0x000386, 0x000391},
+{0x000387, 0x000387, 0x0000B7},
+{0x000388, 0x000388, 0x000395},
+{0x000389, 0x000389, 0x000397},
+{0x00038A, 0x00038A, 0x000399},
+{0x00038C, 0x00038C, 0x00039F},
+{0x00038E, 0x00038E, 0x0003A5},
+{0x00038F, 0x00038F, 0x0003A9},
+{0x000390, 0x000390, 0x0003B9},
+{0x0003AA, 0x0003AA, 0x000399},
+{0x0003AB, 0x0003AB, 0x0003A5},
+{0x0003AC, 0x0003AC, 0x0003B1},
+{0x0003AD, 0x0003AD, 0x0003B5},
+{0x0003AE, 0x0003AE, 0x0003B7},
+{0x0003AF, 0x0003AF, 0x0003B9},
+{0x0003B0, 0x0003B0, 0x0003C5},
+{0x0003CA, 0x0003CA, 0x0003B9},
+{0x0003CB, 0x0003CB, 0x0003C5},
+{0x0003CC, 0x0003CC, 0x0003BF},
+{0x0003CD, 0x0003CD, 0x0003C5},
+{0x0003CE, 0x0003CE, 0x0003C9},
+{0x0003D3, 0x0003D4, 0x0003D2},
+{0x000400, 0x000401, 0x000415},
+{0x000403, 0x000403, 0x000413},
+{0x000407, 0x000407, 0x000406},
+{0x00040C, 0x00040C, 0x00041A},
+{0x00040D, 0x00040D, 0x000418},
+{0x00040E, 0x00040E, 0x000423},
+{0x000419, 0x000419, 0x000418},
+{0x000439, 0x000439, 0x000438},
+{0x000450, 0x000451, 0x000435},
+{0x000453, 0x000453, 0x000433},
+{0x000457, 0x000457, 0x000456},
+{0x00045C, 0x00045C, 0x00043A},
+{0x00045D, 0x00045D, 0x000438},
+{0x00045E, 0x00045E, 0x000443},
+{0x000476, 0x000476, 0x000474},
+{0x000477, 0x000477, 0x000475},
+{0x0004C1, 0x0004C1, 0x000416},
+{0x0004C2, 0x0004C2, 0x000436},
+{0x0004D0, 0x0004D0, 0x000410},
+{0x0004D1, 0x0004D1, 0x000430},
+{0x0004D2, 0x0004D2, 0x000410},
+{0x0004D3, 0x0004D3, 0x000430},
+{0x0004D6, 0x0004D6, 0x000415},
+{0x0004D7, 0x0004D7, 0x000435},
+{0x0004DA, 0x0004DA, 0x0004D8},
+{0x0004DB, 0x0004DB, 0x0004D9},
+{0x0004DC, 0x0004DC, 0x000416},
+{0x0004DD, 0x0004DD, 0x000436},
+{0x0004DE, 0x0004DE, 0x000417},
+{0x0004DF, 0x0004DF, 0x000437},
+{0x0004E2, 0x0004E2, 0x000418},
+{0x0004E3, 0x0004E3, 0x000438},
+{0x0004E4, 0x0004E4, 0x000418},
+{0x0004E5, 0x0004E5, 0x000438},
+{0x0004E6, 0x0004E6, 0x00041E},
+{0x0004E7, 0x0004E7, 0x00043E},
+{0x0004EA, 0x0004EA, 0x0004E8},
+{0x0004EB, 0x0004EB, 0x0004E9},
+{0x0004EC, 0x0004EC, 0x00042D},
+{0x0004ED, 0x0004ED, 0x00044D},
+{0x0004EE, 0x0004EE, 0x000423},
+{0x0004EF, 0x0004EF, 0x000443},
+{0x0004F0, 0x0004F0, 0x000423},
+{0x0004F1, 0x0004F1, 0x000443},
+{0x0004F2, 0x0004F2, 0x000423},
+{0x0004F3, 0x0004F3, 0x000443},
+{0x0004F4, 0x0004F4, 0x000427},
+{0x0004F5, 0x0004F5, 0x000447},
+{0x0004F8, 0x0004F8, 0x00042B},
+{0x0004F9, 0x0004F9, 0x00044B},
+{0x000622, 0x000623, 0x000627},
+{0x000624, 0x000624, 0x000648},
+{0x000625, 0x000625, 0x000627},
+{0x000626, 0x000626, 0x00064A},
+{0x0006C0, 0x0006C0, 0x0006D5},
+{0x0006C2, 0x0006C2, 0x0006C1},
+{0x0006D3, 0x0006D3, 0x0006D2},
+{0x000929, 0x000929, 0x000928},
+{0x000931, 0x000931, 0x000930},
+{0x000934, 0x000934, 0x000933},
+{0x000958, 0x000958, 0x000915},
+{0x000959, 0x000959, 0x000916},
+{0x00095A, 0x00095A, 0x000917},
+{0x00095B, 0x00095B, 0x00091C},
+{0x00095C, 0x00095C, 0x000921},
+{0x00095D, 0x00095D, 0x000922},
+{0x00095E, 0x00095E, 0x00092B},
+{0x00095F, 0x00095F, 0x00092F},
+{0x0009CB, 0x0009CC, 0x0009C7},
+{0x0009DC, 0x0009DC, 0x0009A1},
+{0x0009DD, 0x0009DD, 0x0009A2},
+{0x0009DF, 0x0009DF, 0x0009AF},
+{0x000A33, 0x000A33, 0x000A32},
+{0x000A36, 0x000A36, 0x000A38},
+{0x000A59, 0x000A59, 0x000A16},
+{0x000A5A, 0x000A5A, 0x000A17},
+{0x000A5B, 0x000A5B, 0x000A1C},
+{0x000A5E, 0x000A5E, 0x000A2B},
+{0x000B48, 0x000B48, 0x000B47},
+{0x000B4B, 0x000B4C, 0x000B47},
+{0x000B5C, 0x000B5C, 0x000B21},
+{0x000B5D, 0x000B5D, 0x000B22},
+{0x000B94, 0x000B94, 0x000B92},
+{0x000BCA, 0x000BCA, 0x000BC6},
+{0x000BCB, 0x000BCB, 0x000BC7},
+{0x000BCC, 0x000BCC, 0x000BC6},
+{0x000C48, 0x000C48, 0x000C46},
+{0x000CC0, 0x000CC0, 0x000CBF},
+{0x000CC7, 0x000CC8, 0x000CC6},
+{0x000CCA, 0x000CCB, 0x000CC6},
+{0x000D4A, 0x000D4A, 0x000D46},
+{0x000D4B, 0x000D4B, 0x000D47},
+{0x000D4C, 0x000D4C, 0x000D46},
+{0x000DDA, 0x000DDA, 0x000DD9},
+{0x000DDC, 0x000DDE, 0x000DD9},
+{0x000F43, 0x000F43, 0x000F42},
+{0x000F4D, 0x000F4D, 0x000F4C},
+{0x000F52, 0x000F52, 0x000F51},
+{0x000F57, 0x000F57, 0x000F56},
+{0x000F5C, 0x000F5C, 0x000F5B},
+{0x000F69, 0x000F69, 0x000F40},
+{0x000F73, 0x000F73, 0x000F71},
+{0x000F75, 0x000F75, 0x000F71},
+{0x000F76, 0x000F76, 0x000FB2},
+{0x000F78, 0x000F78, 0x000FB3},
+{0x000F81, 0x000F81, 0x000F71},
+{0x000F93, 0x000F93, 0x000F92},
+{0x000F9D, 0x000F9D, 0x000F9C},
+{0x000FA2, 0x000FA2, 0x000FA1},
+{0x000FA7, 0x000FA7, 0x000FA6},
+{0x000FAC, 0x000FAC, 0x000FAB},
+{0x000FB9, 0x000FB9, 0x000F90},
+{0x001026, 0x001026, 0x001025},
+{0x001B06, 0x001B06, 0x001B05},
+{0x001B08, 0x001B08, 0x001B07},
+{0x001B0A, 0x001B0A, 0x001B09},
+{0x001B0C, 0x001B0C, 0x001B0B},
+{0x001B0E, 0x001B0E, 0x001B0D},
+{0x001B12, 0x001B12, 0x001B11},
+{0x001B3B, 0x001B3B, 0x001B3A},
+{0x001B3D, 0x001B3D, 0x001B3C},
+{0x001B40, 0x001B40, 0x001B3E},
+{0x001B41, 0x001B41, 0x001B3F},
+{0x001B43, 0x001B43, 0x001B42},
+{0x001E00, 0x001E00, 0x000041},
+{0x001E01, 0x001E01, 0x000061},
+{0x001E02, 0x001E02, 0x000042},
+{0x001E03, 0x001E03, 0x000062},
+{0x001E04, 0x001E04, 0x000042},
+{0x001E05, 0x001E05, 0x000062},
+{0x001E06, 0x001E06, 0x000042},
+{0x001E07, 0x001E07, 0x000062},
+{0x001E08, 0x001E08, 0x000043},
+{0x001E09, 0x001E09, 0x000063},
+{0x001E0A, 0x001E0A, 0x000044},
+{0x001E0B, 0x001E0B, 0x000064},
+{0x001E0C, 0x001E0C, 0x000044},
+{0x001E0D, 0x001E0D, 0x000064},
+{0x001E0E, 0x001E0E, 0x000044},
+{0x001E0F, 0x001E0F, 0x000064},
+{0x001E10, 0x001E10, 0x000044},
+{0x001E11, 0x001E11, 0x000064},
+{0x001E12, 0x001E12, 0x000044},
+{0x001E13, 0x001E13, 0x000064},
+{0x001E14, 0x001E14, 0x000045},
+{0x001E15, 0x001E15, 0x000065},
+{0x001E16, 0x001E16, 0x000045},
+{0x001E17, 0x001E17, 0x000065},
+{0x001E18, 0x001E18, 0x000045},
+{0x001E19, 0x001E19, 0x000065},
+{0x001E1A, 0x001E1A, 0x000045},
+{0x001E1B, 0x001E1B, 0x000065},
+{0x001E1C, 0x001E1C, 0x000045},
+{0x001E1D, 0x001E1D, 0x000065},
+{0x001E1E, 0x001E1E, 0x000046},
+{0x001E1F, 0x001E1F, 0x000066},
+{0x001E20, 0x001E20, 0x000047},
+{0x001E21, 0x001E21, 0x000067},
+{0x001E22, 0x001E22, 0x000048},
+{0x001E23, 0x001E23, 0x000068},
+{0x001E24, 0x001E24, 0x000048},
+{0x001E25, 0x001E25, 0x000068},
+{0x001E26, 0x001E26, 0x000048},
+{0x001E27, 0x001E27, 0x000068},
+{0x001E28, 0x001E28, 0x000048},
+{0x001E29, 0x001E29, 0x000068},
+{0x001E2A, 0x001E2A, 0x000048},
+{0x001E2B, 0x001E2B, 0x000068},
+{0x001E2C, 0x001E2C, 0x000049},
+{0x001E2D, 0x001E2D, 0x000069},
+{0x001E2E, 0x001E2E, 0x000049},
+{0x001E2F, 0x001E2F, 0x000069},
+{0x001E30, 0x001E30, 0x00004B},
+{0x001E31, 0x001E31, 0x00006B},
+{0x001E32, 0x001E32, 0x00004B},
+{0x001E33, 0x001E33, 0x00006B},
+{0x001E34, 0x001E34, 0x00004B},
+{0x001E35, 0x001E35, 0x00006B},
+{0x001E36, 0x001E36, 0x00004C},
+{0x001E37, 0x001E37, 0x00006C},
+{0x001E38, 0x001E38, 0x00004C},
+{0x001E39, 0x001E39, 0x00006C},
+{0x001E3A, 0x001E3A, 0x00004C},
+{0x001E3B, 0x001E3B, 0x00006C},
+{0x001E3C, 0x001E3C, 0x00004C},
+{0x001E3D, 0x001E3D, 0x00006C},
+{0x001E3E, 0x001E3E, 0x00004D},
+{0x001E3F, 0x001E3F, 0x00006D},
+{0x001E40, 0x001E40, 0x00004D},
+{0x001E41, 0x001E41, 0x00006D},
+{0x001E42, 0x001E42, 0x00004D},
+{0x001E43, 0x001E43, 0x00006D},
+{0x001E44, 0x001E44, 0x00004E},
+{0x001E45, 0x001E45, 0x00006E},
+{0x001E46, 0x001E46, 0x00004E},
+{0x001E47, 0x001E47, 0x00006E},
+{0x001E48, 0x001E48, 0x00004E},
+{0x001E49, 0x001E49, 0x00006E},
+{0x001E4A, 0x001E4A, 0x00004E},
+{0x001E4B, 0x001E4B, 0x00006E},
+{0x001E4C, 0x001E4C, 0x00004F},
+{0x001E4D, 0x001E4D, 0x00006F},
+{0x001E4E, 0x001E4E, 0x00004F},
+{0x001E4F, 0x001E4F, 0x00006F},
+{0x001E50, 0x001E50, 0x00004F},
+{0x001E51, 0x001E51, 0x00006F},
+{0x001E52, 0x001E52, 0x00004F},
+{0x001E53, 0x001E53, 0x00006F},
+{0x001E54, 0x001E54, 0x000050},
+{0x001E55, 0x001E55, 0x000070},
+{0x001E56, 0x001E56, 0x000050},
+{0x001E57, 0x001E57, 0x000070},
+{0x001E58, 0x001E58, 0x000052},
+{0x001E59, 0x001E59, 0x000072},
+{0x001E5A, 0x001E5A, 0x000052},
+{0x001E5B, 0x001E5B, 0x000072},
+{0x001E5C, 0x001E5C, 0x000052},
+{0x001E5D, 0x001E5D, 0x000072},
+{0x001E5E, 0x001E5E, 0x000052},
+{0x001E5F, 0x001E5F, 0x000072},
+{0x001E60, 0x001E60, 0x000053},
+{0x001E61, 0x001E61, 0x000073},
+{0x001E62, 0x001E62, 0x000053},
+{0x001E63, 0x001E63, 0x000073},
+{0x001E64, 0x001E64, 0x000053},
+{0x001E65, 0x001E65, 0x000073},
+{0x001E66, 0x001E66, 0x000053},
+{0x001E67, 0x001E67, 0x000073},
+{0x001E68, 0x001E68, 0x000053},
+{0x001E69, 0x001E69, 0x000073},
+{0x001E6A, 0x001E6A, 0x000054},
+{0x001E6B, 0x001E6B, 0x000074},
+{0x001E6C, 0x001E6C, 0x000054},
+{0x001E6D, 0x001E6D, 0x000074},
+{0x001E6E, 0x001E6E, 0x000054},
+{0x001E6F, 0x001E6F, 0x000074},
+{0x001E70, 0x001E70, 0x000054},
+{0x001E71, 0x001E71, 0x000074},
+{0x001E72, 0x001E72, 0x000055},
+{0x001E73, 0x001E73, 0x000075},
+{0x001E74, 0x001E74, 0x000055},
+{0x001E75, 0x001E75, 0x000075},
+{0x001E76, 0x001E76, 0x000055},
+{0x001E77, 0x001E77, 0x000075},
+{0x001E78, 0x001E78, 0x000055},
+{0x001E79, 0x001E79, 0x000075},
+{0x001E7A, 0x001E7A, 0x000055},
+{0x001E7B, 0x001E7B, 0x000075},
+{0x001E7C, 0x001E7C, 0x000056},
+{0x001E7D, 0x001E7D, 0x000076},
+{0x001E7E, 0x001E7E, 0x000056},
+{0x001E7F, 0x001E7F, 0x000076},
+{0x001E80, 0x001E80, 0x000057},
+{0x001E81, 0x001E81, 0x000077},
+{0x001E82, 0x001E82, 0x000057},
+{0x001E83, 0x001E83, 0x000077},
+{0x001E84, 0x001E84, 0x000057},
+{0x001E85, 0x001E85, 0x000077},
+{0x001E86, 0x001E86, 0x000057},
+{0x001E87, 0x001E87, 0x000077},
+{0x001E88, 0x001E88, 0x000057},
+{0x001E89, 0x001E89, 0x000077},
+{0x001E8A, 0x001E8A, 0x000058},
+{0x001E8B, 0x001E8B, 0x000078},
+{0x001E8C, 0x001E8C, 0x000058},
+{0x001E8D, 0x001E8D, 0x000078},
+{0x001E8E, 0x001E8E, 0x000059},
+{0x001E8F, 0x001E8F, 0x000079},
+{0x001E90, 0x001E90, 0x00005A},
+{0x001E91, 0x001E91, 0x00007A},
+{0x001E92, 0x001E92, 0x00005A},
+{0x001E93, 0x001E93, 0x00007A},
+{0x001E94, 0x001E94, 0x00005A},
+{0x001E95, 0x001E95, 0x00007A},
+{0x001E96, 0x001E96, 0x000068},
+{0x001E97, 0x001E97, 0x000074},
+{0x001E98, 0x001E98, 0x000077},
+{0x001E99, 0x001E99, 0x000079},
+{0x001E9B, 0x001E9B, 0x00017F},
+{0x001EA0, 0x001EA0, 0x000041},
+{0x001EA1, 0x001EA1, 0x000061},
+{0x001EA2, 0x001EA2, 0x000041},
+{0x001EA3, 0x001EA3, 0x000061},
+{0x001EA4, 0x001EA4, 0x000041},
+{0x001EA5, 0x001EA5, 0x000061},
+{0x001EA6, 0x001EA6, 0x000041},
+{0x001EA7, 0x001EA7, 0x000061},
+{0x001EA8, 0x001EA8, 0x000041},
+{0x001EA9, 0x001EA9, 0x000061},
+{0x001EAA, 0x001EAA, 0x000041},
+{0x001EAB, 0x001EAB, 0x000061},
+{0x001EAC, 0x001EAC, 0x000041},
+{0x001EAD, 0x001EAD, 0x000061},
+{0x001EAE, 0x001EAE, 0x000041},
+{0x001EAF, 0x001EAF, 0x000061},
+{0x001EB0, 0x001EB0, 0x000041},
+{0x001EB1, 0x001EB1, 0x000061},
+{0x001EB2, 0x001EB2, 0x000041},
+{0x001EB3, 0x001EB3, 0x000061},
+{0x001EB4, 0x001EB4, 0x000041},
+{0x001EB5, 0x001EB5, 0x000061},
+{0x001EB6, 0x001EB6, 0x000041},
+{0x001EB7, 0x001EB7, 0x000061},
+{0x001EB8, 0x001EB8, 0x000045},
+{0x001EB9, 0x001EB9, 0x000065},
+{0x001EBA, 0x001EBA, 0x000045},
+{0x001EBB, 0x001EBB, 0x000065},
+{0x001EBC, 0x001EBC, 0x000045},
+{0x001EBD, 0x001EBD, 0x000065},
+{0x001EBE, 0x001EBE, 0x000045},
+{0x001EBF, 0x001EBF, 0x000065},
+{0x001EC0, 0x001EC0, 0x000045},
+{0x001EC1, 0x001EC1, 0x000065},
+{0x001EC2, 0x001EC2, 0x000045},
+{0x001EC3, 0x001EC3, 0x000065},
+{0x001EC4, 0x001EC4, 0x000045},
+{0x001EC5, 0x001EC5, 0x000065},
+{0x001EC6, 0x001EC6, 0x000045},
+{0x001EC7, 0x001EC7, 0x000065},
+{0x001EC8, 0x001EC8, 0x000049},
+{0x001EC9, 0x001EC9, 0x000069},
+{0x001ECA, 0x001ECA, 0x000049},
+{0x001ECB, 0x001ECB, 0x000069},
+{0x001ECC, 0x001ECC, 0x00004F},
+{0x001ECD, 0x001ECD, 0x00006F},
+{0x001ECE, 0x001ECE, 0x00004F},
+{0x001ECF, 0x001ECF, 0x00006F},
+{0x001ED0, 0x001ED0, 0x00004F},
+{0x001ED1, 0x001ED1, 0x00006F},
+{0x001ED2, 0x001ED2, 0x00004F},
+{0x001ED3, 0x001ED3, 0x00006F},
+{0x001ED4, 0x001ED4, 0x00004F},
+{0x001ED5, 0x001ED5, 0x00006F},
+{0x001ED6, 0x001ED6, 0x00004F},
+{0x001ED7, 0x001ED7, 0x00006F},
+{0x001ED8, 0x001ED8, 0x00004F},
+{0x001ED9, 0x001ED9, 0x00006F},
+{0x001EDA, 0x001EDA, 0x00004F},
+{0x001EDB, 0x001EDB, 0x00006F},
+{0x001EDC, 0x001EDC, 0x00004F},
+{0x001EDD, 0x001EDD, 0x00006F},
+{0x001EDE, 0x001EDE, 0x00004F},
+{0x001EDF, 0x001EDF, 0x00006F},
+{0x001EE0, 0x001EE0, 0x00004F},
+{0x001EE1, 0x001EE1, 0x00006F},
+{0x001EE2, 0x001EE2, 0x00004F},
+{0x001EE3, 0x001EE3, 0x00006F},
+{0x001EE4, 0x001EE4, 0x000055},
+{0x001EE5, 0x001EE5, 0x000075},
+{0x001EE6, 0x001EE6, 0x000055},
+{0x001EE7, 0x001EE7, 0x000075},
+{0x001EE8, 0x001EE8, 0x000055},
+{0x001EE9, 0x001EE9, 0x000075},
+{0x001EEA, 0x001EEA, 0x000055},
+{0x001EEB, 0x001EEB, 0x000075},
+{0x001EEC, 0x001EEC, 0x000055},
+{0x001EED, 0x001EED, 0x000075},
+{0x001EEE, 0x001EEE, 0x000055},
+{0x001EEF, 0x001EEF, 0x000075},
+{0x001EF0, 0x001EF0, 0x000055},
+{0x001EF1, 0x001EF1, 0x000075},
+{0x001EF2, 0x001EF2, 0x000059},
+{0x001EF3, 0x001EF3, 0x000079},
+{0x001EF4, 0x001EF4, 0x000059},
+{0x001EF5, 0x001EF5, 0x000079},
+{0x001EF6, 0x001EF6, 0x000059},
+{0x001EF7, 0x001EF7, 0x000079},
+{0x001EF8, 0x001EF8, 0x000059},
+{0x001EF9, 0x001EF9, 0x000079},
+{0x001F00, 0x001F07, 0x0003B1},
+{0x001F08, 0x001F0F, 0x000391},
+{0x001F10, 0x001F15, 0x0003B5},
+{0x001F18, 0x001F1D, 0x000395},
+{0x001F20, 0x001F27, 0x0003B7},
+{0x001F28, 0x001F2F, 0x000397},
+{0x001F30, 0x001F37, 0x0003B9},
+{0x001F38, 0x001F3F, 0x000399},
+{0x001F40, 0x001F45, 0x0003BF},
+{0x001F48, 0x001F4D, 0x00039F},
+{0x001F50, 0x001F57, 0x0003C5},
+{0x001F59, 0x001F59, 0x0003A5},
+{0x001F5B, 0x001F5B, 0x0003A5},
+{0x001F5D, 0x001F5D, 0x0003A5},
+{0x001F5F, 0x001F5F, 0x0003A5},
+{0x001F60, 0x001F67, 0x0003C9},
+{0x001F68, 0x001F6F, 0x0003A9},
+{0x001F70, 0x001F71, 0x0003B1},
+{0x001F72, 0x001F73, 0x0003B5},
+{0x001F74, 0x001F75, 0x0003B7},
+{0x001F76, 0x001F77, 0x0003B9},
+{0x001F78, 0x001F79, 0x0003BF},
+{0x001F7A, 0x001F7B, 0x0003C5},
+{0x001F7C, 0x001F7D, 0x0003C9},
+{0x001F80, 0x001F87, 0x0003B1},
+{0x001F88, 0x001F8F, 0x000391},
+{0x001F90, 0x001F97, 0x0003B7},
+{0x001F98, 0x001F9F, 0x000397},
+{0x001FA0, 0x001FA7, 0x0003C9},
+{0x001FA8, 0x001FAF, 0x0003A9},
+{0x001FB0, 0x001FB4, 0x0003B1},
+{0x001FB6, 0x001FB7, 0x0003B1},
+{0x001FB8, 0x001FBC, 0x000391},
+{0x001FBE, 0x001FBE, 0x0003B9},
+{0x001FC1, 0x001FC1, 0x0000A8},
+{0x001FC2, 0x001FC4, 0x0003B7},
+{0x001FC6, 0x001FC7, 0x0003B7},
+{0x001FC8, 0x001FC9, 0x000395},
+{0x001FCA, 0x001FCC, 0x000397},
+{0x001FCD, 0x001FCF, 0x001FBF},
+{0x001FD0, 0x001FD3, 0x0003B9},
+{0x001FD6, 0x001FD7, 0x0003B9},
+{0x001FD8, 0x001FDB, 0x000399},
+{0x001FDD, 0x001FDF, 0x001FFE},
+{0x001FE0, 0x001FE3, 0x0003C5},
+{0x001FE4, 0x001FE5, 0x0003C1},
+{0x001FE6, 0x001FE7, 0x0003C5},
+{0x001FE8, 0x001FEB, 0x0003A5},
+{0x001FEC, 0x001FEC, 0x0003A1},
+{0x001FED, 0x001FEE, 0x0000A8},
+{0x001FEF, 0x001FEF, 0x000060},
+{0x001FF2, 0x001FF4, 0x0003C9},
+{0x001FF6, 0x001FF7, 0x0003C9},
+{0x001FF8, 0x001FF9, 0x00039F},
+{0x001FFA, 0x001FFC, 0x0003A9},
+{0x001FFD, 0x001FFD, 0x0000B4},
+{0x002000, 0x002000, 0x002002},
+{0x002001, 0x002001, 0x002003},
+{0x002126, 0x002126, 0x0003A9},
+{0x00212A, 0x00212A, 0x00004B},
+{0x00212B, 0x00212B, 0x000041},
+{0x00219A, 0x00219A, 0x002190},
+{0x00219B, 0x00219B, 0x002192},
+{0x0021AE, 0x0021AE, 0x002194},
+{0x0021CD, 0x0021CD, 0x0021D0},
+{0x0021CE, 0x0021CE, 0x0021D4},
+{0x0021CF, 0x0021CF, 0x0021D2},
+{0x002204, 0x002204, 0x002203},
+{0x002209, 0x002209, 0x002208},
+{0x00220C, 0x00220C, 0x00220B},
+{0x002224, 0x002224, 0x002223},
+{0x002226, 0x002226, 0x002225},
+{0x002241, 0x002241, 0x00223C},
+{0x002244, 0x002244, 0x002243},
+{0x002247, 0x002247, 0x002245},
+{0x002249, 0x002249, 0x002248},
+{0x002260, 0x002260, 0x00003D},
+{0x002262, 0x002262, 0x002261},
+{0x00226D, 0x00226D, 0x00224D},
+{0x00226E, 0x00226E, 0x00003C},
+{0x00226F, 0x00226F, 0x00003E},
+{0x002270, 0x002270, 0x002264},
+{0x002271, 0x002271, 0x002265},
+{0x002274, 0x002274, 0x002272},
+{0x002275, 0x002275, 0x002273},
+{0x002278, 0x002278, 0x002276},
+{0x002279, 0x002279, 0x002277},
+{0x002280, 0x002280, 0x00227A},
+{0x002281, 0x002281, 0x00227B},
+{0x002284, 0x002284, 0x002282},
+{0x002285, 0x002285, 0x002283},
+{0x002288, 0x002288, 0x002286},
+{0x002289, 0x002289, 0x002287},
+{0x0022AC, 0x0022AC, 0x0022A2},
+{0x0022AD, 0x0022AD, 0x0022A8},
+{0x0022AE, 0x0022AE, 0x0022A9},
+{0x0022AF, 0x0022AF, 0x0022AB},
+{0x0022E0, 0x0022E0, 0x00227C},
+{0x0022E1, 0x0022E1, 0x00227D},
+{0x0022E2, 0x0022E2, 0x002291},
+{0x0022E3, 0x0022E3, 0x002292},
+{0x0022EA, 0x0022EA, 0x0022B2},
+{0x0022EB, 0x0022EB, 0x0022B3},
+{0x0022EC, 0x0022EC, 0x0022B4},
+{0x0022ED, 0x0022ED, 0x0022B5},
+{0x002329, 0x002329, 0x003008},
+{0x00232A, 0x00232A, 0x003009},
+{0x002ADC, 0x002ADC, 0x002ADD},
+{0x00304C, 0x00304C, 0x00304B},
+{0x00304E, 0x00304E, 0x00304D},
+{0x003050, 0x003050, 0x00304F},
+{0x003052, 0x003052, 0x003051},
+{0x003054, 0x003054, 0x003053},
+{0x003056, 0x003056, 0x003055},
+{0x003058, 0x003058, 0x003057},
+{0x00305A, 0x00305A, 0x003059},
+{0x00305C, 0x00305C, 0x00305B},
+{0x00305E, 0x00305E, 0x00305D},
+{0x003060, 0x003060, 0x00305F},
+{0x003062, 0x003062, 0x003061},
+{0x003065, 0x003065, 0x003064},
+{0x003067, 0x003067, 0x003066},
+{0x003069, 0x003069, 0x003068},
+{0x003070, 0x003071, 0x00306F},
+{0x003073, 0x003074, 0x003072},
+{0x003076, 0x003077, 0x003075},
+{0x003079, 0x00307A, 0x003078},
+{0x00307C, 0x00307D, 0x00307B},
+{0x003094, 0x003094, 0x003046},
+{0x00309E, 0x00309E, 0x00309D},
+{0x0030AC, 0x0030AC, 0x0030AB},
+{0x0030AE, 0x0030AE, 0x0030AD},
+{0x0030B0, 0x0030B0, 0x0030AF},
+{0x0030B2, 0x0030B2, 0x0030B1},
+{0x0030B4, 0x0030B4, 0x0030B3},
+{0x0030B6, 0x0030B6, 0x0030B5},
+{0x0030B8, 0x0030B8, 0x0030B7},
+{0x0030BA, 0x0030BA, 0x0030B9},
+{0x0030BC, 0x0030BC, 0x0030BB},
+{0x0030BE, 0x0030BE, 0x0030BD},
+{0x0030C0, 0x0030C0, 0x0030BF},
+{0x0030C2, 0x0030C2, 0x0030C1},
+{0x0030C5, 0x0030C5, 0x0030C4},
+{0x0030C7, 0x0030C7, 0x0030C6},
+{0x0030C9, 0x0030C9, 0x0030C8},
+{0x0030D0, 0x0030D1, 0x0030CF},
+{0x0030D3, 0x0030D4, 0x0030D2},
+{0x0030D6, 0x0030D7, 0x0030D5},
+{0x0030D9, 0x0030DA, 0x0030D8},
+{0x0030DC, 0x0030DD, 0x0030DB},
+{0x0030F4, 0x0030F4, 0x0030A6},
+{0x0030F7, 0x0030F7, 0x0030EF},
+{0x0030F8, 0x0030F8, 0x0030F0},
+{0x0030F9, 0x0030F9, 0x0030F1},
+{0x0030FA, 0x0030FA, 0x0030F2},
+{0x0030FE, 0x0030FE, 0x0030FD},
+{0x00AC00, 0x00AE4B, 0x001100},
+{0x00AE4C, 0x00B097, 0x001101},
+{0x00B098, 0x00B2E3, 0x001102},
+{0x00B2E4, 0x00B52F, 0x001103},
+{0x00B530, 0x00B77B, 0x001104},
+{0x00B77C, 0x00B9C7, 0x001105},
+{0x00B9C8, 0x00BC13, 0x001106},
+{0x00BC14, 0x00BE5F, 0x001107},
+{0x00BE60, 0x00C0AB, 0x001108},
+{0x00C0AC, 0x00C2F7, 0x001109},
+{0x00C2F8, 0x00C543, 0x00110A},
+{0x00C544, 0x00C78F, 0x00110B},
+{0x00C790, 0x00C9DB, 0x00110C},
+{0x00C9DC, 0x00CC27, 0x00110D},
+{0x00CC28, 0x00CE73, 0x00110E},
+{0x00CE74, 0x00D0BF, 0x00110F},
+{0x00D0C0, 0x00D30B, 0x001110},
+{0x00D30C, 0x00D557, 0x001111},
+{0x00D558, 0x00D7A3, 0x001112},
+{0x00F900, 0x00F900, 0x008C48},
+{0x00F901, 0x00F901, 0x0066F4},
+{0x00F902, 0x00F902, 0x008ECA},
+{0x00F903, 0x00F903, 0x008CC8},
+{0x00F904, 0x00F904, 0x006ED1},
+{0x00F905, 0x00F905, 0x004E32},
+{0x00F906, 0x00F906, 0x0053E5},
+{0x00F907, 0x00F908, 0x009F9C},
+{0x00F909, 0x00F909, 0x005951},
+{0x00F90A, 0x00F90A, 0x0091D1},
+{0x00F90B, 0x00F90B, 0x005587},
+{0x00F90C, 0x00F90C, 0x005948},
+{0x00F90D, 0x00F90D, 0x0061F6},
+{0x00F90E, 0x00F90E, 0x007669},
+{0x00F90F, 0x00F90F, 0x007F85},
+{0x00F910, 0x00F910, 0x00863F},
+{0x00F911, 0x00F911, 0x0087BA},
+{0x00F912, 0x00F912, 0x0088F8},
+{0x00F913, 0x00F913, 0x00908F},
+{0x00F914, 0x00F914, 0x006A02},
+{0x00F915, 0x00F915, 0x006D1B},
+{0x00F916, 0x00F916, 0x0070D9},
+{0x00F917, 0x00F917, 0x0073DE},
+{0x00F918, 0x00F918, 0x00843D},
+{0x00F919, 0x00F919, 0x00916A},
+{0x00F91A, 0x00F91A, 0x0099F1},
+{0x00F91B, 0x00F91B, 0x004E82},
+{0x00F91C, 0x00F91C, 0x005375},
+{0x00F91D, 0x00F91D, 0x006B04},
+{0x00F91E, 0x00F91E, 0x00721B},
+{0x00F91F, 0x00F91F, 0x00862D},
+{0x00F920, 0x00F920, 0x009E1E},
+{0x00F921, 0x00F921, 0x005D50},
+{0x00F922, 0x00F922, 0x006FEB},
+{0x00F923, 0x00F923, 0x0085CD},
+{0x00F924, 0x00F924, 0x008964},
+{0x00F925, 0x00F925, 0x0062C9},
+{0x00F926, 0x00F926, 0x0081D8},
+{0x00F927, 0x00F927, 0x00881F},
+{0x00F928, 0x00F928, 0x005ECA},
+{0x00F929, 0x00F929, 0x006717},
+{0x00F92A, 0x00F92A, 0x006D6A},
+{0x00F92B, 0x00F92B, 0x0072FC},
+{0x00F92C, 0x00F92C, 0x0090CE},
+{0x00F92D, 0x00F92D, 0x004F86},
+{0x00F92E, 0x00F92E, 0x0051B7},
+{0x00F92F, 0x00F92F, 0x0052DE},
+{0x00F930, 0x00F930, 0x0064C4},
+{0x00F931, 0x00F931, 0x006AD3},
+{0x00F932, 0x00F932, 0x007210},
+{0x00F933, 0x00F933, 0x0076E7},
+{0x00F934, 0x00F934, 0x008001},
+{0x00F935, 0x00F935, 0x008606},
+{0x00F936, 0x00F936, 0x00865C},
+{0x00F937, 0x00F937, 0x008DEF},
+{0x00F938, 0x00F938, 0x009732},
+{0x00F939, 0x00F939, 0x009B6F},
+{0x00F93A, 0x00F93A, 0x009DFA},
+{0x00F93B, 0x00F93B, 0x00788C},
+{0x00F93C, 0x00F93C, 0x00797F},
+{0x00F93D, 0x00F93D, 0x007DA0},
+{0x00F93E, 0x00F93E, 0x0083C9},
+{0x00F93F, 0x00F93F, 0x009304},
+{0x00F940, 0x00F940, 0x009E7F},
+{0x00F941, 0x00F941, 0x008AD6},
+{0x00F942, 0x00F942, 0x0058DF},
+{0x00F943, 0x00F943, 0x005F04},
+{0x00F944, 0x00F944, 0x007C60},
+{0x00F945, 0x00F945, 0x00807E},
+{0x00F946, 0x00F946, 0x007262},
+{0x00F947, 0x00F947, 0x0078CA},
+{0x00F948, 0x00F948, 0x008CC2},
+{0x00F949, 0x00F949, 0x0096F7},
+{0x00F94A, 0x00F94A, 0x0058D8},
+{0x00F94B, 0x00F94B, 0x005C62},
+{0x00F94C, 0x00F94C, 0x006A13},
+{0x00F94D, 0x00F94D, 0x006DDA},
+{0x00F94E, 0x00F94E, 0x006F0F},
+{0x00F94F, 0x00F94F, 0x007D2F},
+{0x00F950, 0x00F950, 0x007E37},
+{0x00F951, 0x00F951, 0x00964B},
+{0x00F952, 0x00F952, 0x0052D2},
+{0x00F953, 0x00F953, 0x00808B},
+{0x00F954, 0x00F954, 0x0051DC},
+{0x00F955, 0x00F955, 0x0051CC},
+{0x00F956, 0x00F956, 0x007A1C},
+{0x00F957, 0x00F957, 0x007DBE},
+{0x00F958, 0x00F958, 0x0083F1},
+{0x00F959, 0x00F959, 0x009675},
+{0x00F95A, 0x00F95A, 0x008B80},
+{0x00F95B, 0x00F95B, 0x0062CF},
+{0x00F95C, 0x00F95C, 0x006A02},
+{0x00F95D, 0x00F95D, 0x008AFE},
+{0x00F95E, 0x00F95E, 0x004E39},
+{0x00F95F, 0x00F95F, 0x005BE7},
+{0x00F960, 0x00F960, 0x006012},
+{0x00F961, 0x00F961, 0x007387},
+{0x00F962, 0x00F962, 0x007570},
+{0x00F963, 0x00F963, 0x005317},
+{0x00F964, 0x00F964, 0x0078FB},
+{0x00F965, 0x00F965, 0x004FBF},
+{0x00F966, 0x00F966, 0x005FA9},
+{0x00F967, 0x00F967, 0x004E0D},
+{0x00F968, 0x00F968, 0x006CCC},
+{0x00F969, 0x00F969, 0x006578},
+{0x00F96A, 0x00F96A, 0x007D22},
+{0x00F96B, 0x00F96B, 0x0053C3},
+{0x00F96C, 0x00F96C, 0x00585E},
+{0x00F96D, 0x00F96D, 0x007701},
+{0x00F96E, 0x00F96E, 0x008449},
+{0x00F96F, 0x00F96F, 0x008AAA},
+{0x00F970, 0x00F970, 0x006BBA},
+{0x00F971, 0x00F971, 0x008FB0},
+{0x00F972, 0x00F972, 0x006C88},
+{0x00F973, 0x00F973, 0x0062FE},
+{0x00F974, 0x00F974, 0x0082E5},
+{0x00F975, 0x00F975, 0x0063A0},
+{0x00F976, 0x00F976, 0x007565},
+{0x00F977, 0x00F977, 0x004EAE},
+{0x00F978, 0x00F978, 0x005169},
+{0x00F979, 0x00F979, 0x0051C9},
+{0x00F97A, 0x00F97A, 0x006881},
+{0x00F97B, 0x00F97B, 0x007CE7},
+{0x00F97C, 0x00F97C, 0x00826F},
+{0x00F97D, 0x00F97D, 0x008AD2},
+{0x00F97E, 0x00F97E, 0x0091CF},
+{0x00F97F, 0x00F97F, 0x0052F5},
+{0x00F980, 0x00F980, 0x005442},
+{0x00F981, 0x00F981, 0x005973},
+{0x00F982, 0x00F982, 0x005EEC},
+{0x00F983, 0x00F983, 0x0065C5},
+{0x00F984, 0x00F984, 0x006FFE},
+{0x00F985, 0x00F985, 0x00792A},
+{0x00F986, 0x00F986, 0x0095AD},
+{0x00F987, 0x00F987, 0x009A6A},
+{0x00F988, 0x00F988, 0x009E97},
+{0x00F989, 0x00F989, 0x009ECE},
+{0x00F98A, 0x00F98A, 0x00529B},
+{0x00F98B, 0x00F98B, 0x0066C6},
+{0x00F98C, 0x00F98C, 0x006B77},
+{0x00F98D, 0x00F98D, 0x008F62},
+{0x00F98E, 0x00F98E, 0x005E74},
+{0x00F98F, 0x00F98F, 0x006190},
+{0x00F990, 0x00F990, 0x006200},
+{0x00F991, 0x00F991, 0x00649A},
+{0x00F992, 0x00F992, 0x006F23},
+{0x00F993, 0x00F993, 0x007149},
+{0x00F994, 0x00F994, 0x007489},
+{0x00F995, 0x00F995, 0x0079CA},
+{0x00F996, 0x00F996, 0x007DF4},
+{0x00F997, 0x00F997, 0x00806F},
+{0x00F998, 0x00F998, 0x008F26},
+{0x00F999, 0x00F999, 0x0084EE},
+{0x00F99A, 0x00F99A, 0x009023},
+{0x00F99B, 0x00F99B, 0x00934A},
+{0x00F99C, 0x00F99C, 0x005217},
+{0x00F99D, 0x00F99D, 0x0052A3},
+{0x00F99E, 0x00F99E, 0x0054BD},
+{0x00F99F, 0x00F99F, 0x0070C8},
+{0x00F9A0, 0x00F9A0, 0x0088C2},
+{0x00F9A1, 0x00F9A1, 0x008AAA},
+{0x00F9A2, 0x00F9A2, 0x005EC9},
+{0x00F9A3, 0x00F9A3, 0x005FF5},
+{0x00F9A4, 0x00F9A4, 0x00637B},
+{0x00F9A5, 0x00F9A5, 0x006BAE},
+{0x00F9A6, 0x00F9A6, 0x007C3E},
+{0x00F9A7, 0x00F9A7, 0x007375},
+{0x00F9A8, 0x00F9A8, 0x004EE4},
+{0x00F9A9, 0x00F9A9, 0x0056F9},
+{0x00F9AA, 0x00F9AA, 0x005BE7},
+{0x00F9AB, 0x00F9AB, 0x005DBA},
+{0x00F9AC, 0x00F9AC, 0x00601C},
+{0x00F9AD, 0x00F9AD, 0x0073B2},
+{0x00F9AE, 0x00F9AE, 0x007469},
+{0x00F9AF, 0x00F9AF, 0x007F9A},
+{0x00F9B0, 0x00F9B0, 0x008046},
+{0x00F9B1, 0x00F9B1, 0x009234},
+{0x00F9B2, 0x00F9B2, 0x0096F6},
+{0x00F9B3, 0x00F9B3, 0x009748},
+{0x00F9B4, 0x00F9B4, 0x009818},
+{0x00F9B5, 0x00F9B5, 0x004F8B},
+{0x00F9B6, 0x00F9B6, 0x0079AE},
+{0x00F9B7, 0x00F9B7, 0x0091B4},
+{0x00F9B8, 0x00F9B8, 0x0096B8},
+{0x00F9B9, 0x00F9B9, 0x0060E1},
+{0x00F9BA, 0x00F9BA, 0x004E86},
+{0x00F9BB, 0x00F9BB, 0x0050DA},
+{0x00F9BC, 0x00F9BC, 0x005BEE},
+{0x00F9BD, 0x00F9BD, 0x005C3F},
+{0x00F9BE, 0x00F9BE, 0x006599},
+{0x00F9BF, 0x00F9BF, 0x006A02},
+{0x00F9C0, 0x00F9C0, 0x0071CE},
+{0x00F9C1, 0x00F9C1, 0x007642},
+{0x00F9C2, 0x00F9C2, 0x0084FC},
+{0x00F9C3, 0x00F9C3, 0x00907C},
+{0x00F9C4, 0x00F9C4, 0x009F8D},
+{0x00F9C5, 0x00F9C5, 0x006688},
+{0x00F9C6, 0x00F9C6, 0x00962E},
+{0x00F9C7, 0x00F9C7, 0x005289},
+{0x00F9C8, 0x00F9C8, 0x00677B},
+{0x00F9C9, 0x00F9C9, 0x0067F3},
+{0x00F9CA, 0x00F9CA, 0x006D41},
+{0x00F9CB, 0x00F9CB, 0x006E9C},
+{0x00F9CC, 0x00F9CC, 0x007409},
+{0x00F9CD, 0x00F9CD, 0x007559},
+{0x00F9CE, 0x00F9CE, 0x00786B},
+{0x00F9CF, 0x00F9CF, 0x007D10},
+{0x00F9D0, 0x00F9D0, 0x00985E},
+{0x00F9D1, 0x00F9D1, 0x00516D},
+{0x00F9D2, 0x00F9D2, 0x00622E},
+{0x00F9D3, 0x00F9D3, 0x009678},
+{0x00F9D4, 0x00F9D4, 0x00502B},
+{0x00F9D5, 0x00F9D5, 0x005D19},
+{0x00F9D6, 0x00F9D6, 0x006DEA},
+{0x00F9D7, 0x00F9D7, 0x008F2A},
+{0x00F9D8, 0x00F9D8, 0x005F8B},
+{0x00F9D9, 0x00F9D9, 0x006144},
+{0x00F9DA, 0x00F9DA, 0x006817},
+{0x00F9DB, 0x00F9DB, 0x007387},
+{0x00F9DC, 0x00F9DC, 0x009686},
+{0x00F9DD, 0x00F9DD, 0x005229},
+{0x00F9DE, 0x00F9DE, 0x00540F},
+{0x00F9DF, 0x00F9DF, 0x005C65},
+{0x00F9E0, 0x00F9E0, 0x006613},
+{0x00F9E1, 0x00F9E1, 0x00674E},
+{0x00F9E2, 0x00F9E2, 0x0068A8},
+{0x00F9E3, 0x00F9E3, 0x006CE5},
+{0x00F9E4, 0x00F9E4, 0x007406},
+{0x00F9E5, 0x00F9E5, 0x0075E2},
+{0x00F9E6, 0x00F9E6, 0x007F79},
+{0x00F9E7, 0x00F9E7, 0x0088CF},
+{0x00F9E8, 0x00F9E8, 0x0088E1},
+{0x00F9E9, 0x00F9E9, 0x0091CC},
+{0x00F9EA, 0x00F9EA, 0x0096E2},
+{0x00F9EB, 0x00F9EB, 0x00533F},
+{0x00F9EC, 0x00F9EC, 0x006EBA},
+{0x00F9ED, 0x00F9ED, 0x00541D},
+{0x00F9EE, 0x00F9EE, 0x0071D0},
+{0x00F9EF, 0x00F9EF, 0x007498},
+{0x00F9F0, 0x00F9F0, 0x0085FA},
+{0x00F9F1, 0x00F9F1, 0x0096A3},
+{0x00F9F2, 0x00F9F2, 0x009C57},
+{0x00F9F3, 0x00F9F3, 0x009E9F},
+{0x00F9F4, 0x00F9F4, 0x006797},
+{0x00F9F5, 0x00F9F5, 0x006DCB},
+{0x00F9F6, 0x00F9F6, 0x0081E8},
+{0x00F9F7, 0x00F9F7, 0x007ACB},
+{0x00F9F8, 0x00F9F8, 0x007B20},
+{0x00F9F9, 0x00F9F9, 0x007C92},
+{0x00F9FA, 0x00F9FA, 0x0072C0},
+{0x00F9FB, 0x00F9FB, 0x007099},
+{0x00F9FC, 0x00F9FC, 0x008B58},
+{0x00F9FD, 0x00F9FD, 0x004EC0},
+{0x00F9FE, 0x00F9FE, 0x008336},
+{0x00F9FF, 0x00F9FF, 0x00523A},
+{0x00FA00, 0x00FA00, 0x005207},
+{0x00FA01, 0x00FA01, 0x005EA6},
+{0x00FA02, 0x00FA02, 0x0062D3},
+{0x00FA03, 0x00FA03, 0x007CD6},
+{0x00FA04, 0x00FA04, 0x005B85},
+{0x00FA05, 0x00FA05, 0x006D1E},
+{0x00FA06, 0x00FA06, 0x0066B4},
+{0x00FA07, 0x00FA07, 0x008F3B},
+{0x00FA08, 0x00FA08, 0x00884C},
+{0x00FA09, 0x00FA09, 0x00964D},
+{0x00FA0A, 0x00FA0A, 0x00898B},
+{0x00FA0B, 0x00FA0B, 0x005ED3},
+{0x00FA0C, 0x00FA0C, 0x005140},
+{0x00FA0D, 0x00FA0D, 0x0055C0},
+{0x00FA10, 0x00FA10, 0x00585A},
+{0x00FA12, 0x00FA12, 0x006674},
+{0x00FA15, 0x00FA15, 0x0051DE},
+{0x00FA16, 0x00FA16, 0x00732A},
+{0x00FA17, 0x00FA17, 0x0076CA},
+{0x00FA18, 0x00FA18, 0x00793C},
+{0x00FA19, 0x00FA19, 0x00795E},
+{0x00FA1A, 0x00FA1A, 0x007965},
+{0x00FA1B, 0x00FA1B, 0x00798F},
+{0x00FA1C, 0x00FA1C, 0x009756},
+{0x00FA1D, 0x00FA1D, 0x007CBE},
+{0x00FA1E, 0x00FA1E, 0x007FBD},
+{0x00FA20, 0x00FA20, 0x008612},
+{0x00FA22, 0x00FA22, 0x008AF8},
+{0x00FA25, 0x00FA25, 0x009038},
+{0x00FA26, 0x00FA26, 0x0090FD},
+{0x00FA2A, 0x00FA2A, 0x0098EF},
+{0x00FA2B, 0x00FA2B, 0x0098FC},
+{0x00FA2C, 0x00FA2C, 0x009928},
+{0x00FA2D, 0x00FA2D, 0x009DB4},
+{0x00FA2E, 0x00FA2E, 0x0090DE},
+{0x00FA2F, 0x00FA2F, 0x0096B7},
+{0x00FA30, 0x00FA30, 0x004FAE},
+{0x00FA31, 0x00FA31, 0x0050E7},
+{0x00FA32, 0x00FA32, 0x00514D},
+{0x00FA33, 0x00FA33, 0x0052C9},
+{0x00FA34, 0x00FA34, 0x0052E4},
+{0x00FA35, 0x00FA35, 0x005351},
+{0x00FA36, 0x00FA36, 0x00559D},
+{0x00FA37, 0x00FA37, 0x005606},
+{0x00FA38, 0x00FA38, 0x005668},
+{0x00FA39, 0x00FA39, 0x005840},
+{0x00FA3A, 0x00FA3A, 0x0058A8},
+{0x00FA3B, 0x00FA3B, 0x005C64},
+{0x00FA3C, 0x00FA3C, 0x005C6E},
+{0x00FA3D, 0x00FA3D, 0x006094},
+{0x00FA3E, 0x00FA3E, 0x006168},
+{0x00FA3F, 0x00FA3F, 0x00618E},
+{0x00FA40, 0x00FA40, 0x0061F2},
+{0x00FA41, 0x00FA41, 0x00654F},
+{0x00FA42, 0x00FA42, 0x0065E2},
+{0x00FA43, 0x00FA43, 0x006691},
+{0x00FA44, 0x00FA44, 0x006885},
+{0x00FA45, 0x00FA45, 0x006D77},
+{0x00FA46, 0x00FA46, 0x006E1A},
+{0x00FA47, 0x00FA47, 0x006F22},
+{0x00FA48, 0x00FA48, 0x00716E},
+{0x00FA49, 0x00FA49, 0x00722B},
+{0x00FA4A, 0x00FA4A, 0x007422},
+{0x00FA4B, 0x00FA4B, 0x007891},
+{0x00FA4C, 0x00FA4C, 0x00793E},
+{0x00FA4D, 0x00FA4D, 0x007949},
+{0x00FA4E, 0x00FA4E, 0x007948},
+{0x00FA4F, 0x00FA4F, 0x007950},
+{0x00FA50, 0x00FA50, 0x007956},
+{0x00FA51, 0x00FA51, 0x00795D},
+{0x00FA52, 0x00FA52, 0x00798D},
+{0x00FA53, 0x00FA53, 0x00798E},
+{0x00FA54, 0x00FA54, 0x007A40},
+{0x00FA55, 0x00FA55, 0x007A81},
+{0x00FA56, 0x00FA56, 0x007BC0},
+{0x00FA57, 0x00FA57, 0x007DF4},
+{0x00FA58, 0x00FA58, 0x007E09},
+{0x00FA59, 0x00FA59, 0x007E41},
+{0x00FA5A, 0x00FA5A, 0x007F72},
+{0x00FA5B, 0x00FA5B, 0x008005},
+{0x00FA5C, 0x00FA5C, 0x0081ED},
+{0x00FA5D, 0x00FA5E, 0x008279},
+{0x00FA5F, 0x00FA5F, 0x008457},
+{0x00FA60, 0x00FA60, 0x008910},
+{0x00FA61, 0x00FA61, 0x008996},
+{0x00FA62, 0x00FA62, 0x008B01},
+{0x00FA63, 0x00FA63, 0x008B39},
+{0x00FA64, 0x00FA64, 0x008CD3},
+{0x00FA65, 0x00FA65, 0x008D08},
+{0x00FA66, 0x00FA66, 0x008FB6},
+{0x00FA67, 0x00FA67, 0x009038},
+{0x00FA68, 0x00FA68, 0x0096E3},
+{0x00FA69, 0x00FA69, 0x0097FF},
+{0x00FA6A, 0x00FA6A, 0x00983B},
+{0x00FA6B, 0x00FA6B, 0x006075},
+{0x00FA6C, 0x00FA6C, 0x0242EE},
+{0x00FA6D, 0x00FA6D, 0x008218},
+{0x00FA70, 0x00FA70, 0x004E26},
+{0x00FA71, 0x00FA71, 0x0051B5},
+{0x00FA72, 0x00FA72, 0x005168},
+{0x00FA73, 0x00FA73, 0x004F80},
+{0x00FA74, 0x00FA74, 0x005145},
+{0x00FA75, 0x00FA75, 0x005180},
+{0x00FA76, 0x00FA76, 0x0052C7},
+{0x00FA77, 0x00FA77, 0x0052FA},
+{0x00FA78, 0x00FA78, 0x00559D},
+{0x00FA79, 0x00FA79, 0x005555},
+{0x00FA7A, 0x00FA7A, 0x005599},
+{0x00FA7B, 0x00FA7B, 0x0055E2},
+{0x00FA7C, 0x00FA7C, 0x00585A},
+{0x00FA7D, 0x00FA7D, 0x0058B3},
+{0x00FA7E, 0x00FA7E, 0x005944},
+{0x00FA7F, 0x00FA7F, 0x005954},
+{0x00FA80, 0x00FA80, 0x005A62},
+{0x00FA81, 0x00FA81, 0x005B28},
+{0x00FA82, 0x00FA82, 0x005ED2},
+{0x00FA83, 0x00FA83, 0x005ED9},
+{0x00FA84, 0x00FA84, 0x005F69},
+{0x00FA85, 0x00FA85, 0x005FAD},
+{0x00FA86, 0x00FA86, 0x0060D8},
+{0x00FA87, 0x00FA87, 0x00614E},
+{0x00FA88, 0x00FA88, 0x006108},
+{0x00FA89, 0x00FA89, 0x00618E},
+{0x00FA8A, 0x00FA8A, 0x006160},
+{0x00FA8B, 0x00FA8B, 0x0061F2},
+{0x00FA8C, 0x00FA8C, 0x006234},
+{0x00FA8D, 0x00FA8D, 0x0063C4},
+{0x00FA8E, 0x00FA8E, 0x00641C},
+{0x00FA8F, 0x00FA8F, 0x006452},
+{0x00FA90, 0x00FA90, 0x006556},
+{0x00FA91, 0x00FA91, 0x006674},
+{0x00FA92, 0x00FA92, 0x006717},
+{0x00FA93, 0x00FA93, 0x00671B},
+{0x00FA94, 0x00FA94, 0x006756},
+{0x00FA95, 0x00FA95, 0x006B79},
+{0x00FA96, 0x00FA96, 0x006BBA},
+{0x00FA97, 0x00FA97, 0x006D41},
+{0x00FA98, 0x00FA98, 0x006EDB},
+{0x00FA99, 0x00FA99, 0x006ECB},
+{0x00FA9A, 0x00FA9A, 0x006F22},
+{0x00FA9B, 0x00FA9B, 0x00701E},
+{0x00FA9C, 0x00FA9C, 0x00716E},
+{0x00FA9D, 0x00FA9D, 0x0077A7},
+{0x00FA9E, 0x00FA9E, 0x007235},
+{0x00FA9F, 0x00FA9F, 0x0072AF},
+{0x00FAA0, 0x00FAA0, 0x00732A},
+{0x00FAA1, 0x00FAA1, 0x007471},
+{0x00FAA2, 0x00FAA2, 0x007506},
+{0x00FAA3, 0x00FAA3, 0x00753B},
+{0x00FAA4, 0x00FAA4, 0x00761D},
+{0x00FAA5, 0x00FAA5, 0x00761F},
+{0x00FAA6, 0x00FAA6, 0x0076CA},
+{0x00FAA7, 0x00FAA7, 0x0076DB},
+{0x00FAA8, 0x00FAA8, 0x0076F4},
+{0x00FAA9, 0x00FAA9, 0x00774A},
+{0x00FAAA, 0x00FAAA, 0x007740},
+{0x00FAAB, 0x00FAAB, 0x0078CC},
+{0x00FAAC, 0x00FAAC, 0x007AB1},
+{0x00FAAD, 0x00FAAD, 0x007BC0},
+{0x00FAAE, 0x00FAAE, 0x007C7B},
+{0x00FAAF, 0x00FAAF, 0x007D5B},
+{0x00FAB0, 0x00FAB0, 0x007DF4},
+{0x00FAB1, 0x00FAB1, 0x007F3E},
+{0x00FAB2, 0x00FAB2, 0x008005},
+{0x00FAB3, 0x00FAB3, 0x008352},
+{0x00FAB4, 0x00FAB4, 0x0083EF},
+{0x00FAB5, 0x00FAB5, 0x008779},
+{0x00FAB6, 0x00FAB6, 0x008941},
+{0x00FAB7, 0x00FAB7, 0x008986},
+{0x00FAB8, 0x00FAB8, 0x008996},
+{0x00FAB9, 0x00FAB9, 0x008ABF},
+{0x00FABA, 0x00FABA, 0x008AF8},
+{0x00FABB, 0x00FABB, 0x008ACB},
+{0x00FABC, 0x00FABC, 0x008B01},
+{0x00FABD, 0x00FABD, 0x008AFE},
+{0x00FABE, 0x00FABE, 0x008AED},
+{0x00FABF, 0x00FABF, 0x008B39},
+{0x00FAC0, 0x00FAC0, 0x008B8A},
+{0x00FAC1, 0x00FAC1, 0x008D08},
+{0x00FAC2, 0x00FAC2, 0x008F38},
+{0x00FAC3, 0x00FAC3, 0x009072},
+{0x00FAC4, 0x00FAC4, 0x009199},
+{0x00FAC5, 0x00FAC5, 0x009276},
+{0x00FAC6, 0x00FAC6, 0x00967C},
+{0x00FAC7, 0x00FAC7, 0x0096E3},
+{0x00FAC8, 0x00FAC8, 0x009756},
+{0x00FAC9, 0x00FAC9, 0x0097DB},
+{0x00FACA, 0x00FACA, 0x0097FF},
+{0x00FACB, 0x00FACB, 0x00980B},
+{0x00FACC, 0x00FACC, 0x00983B},
+{0x00FACD, 0x00FACD, 0x009B12},
+{0x00FACE, 0x00FACE, 0x009F9C},
+{0x00FACF, 0x00FACF, 0x02284A},
+{0x00FAD0, 0x00FAD0, 0x022844},
+{0x00FAD1, 0x00FAD1, 0x0233D5},
+{0x00FAD2, 0x00FAD2, 0x003B9D},
+{0x00FAD3, 0x00FAD3, 0x004018},
+{0x00FAD4, 0x00FAD4, 0x004039},
+{0x00FAD5, 0x00FAD5, 0x025249},
+{0x00FAD6, 0x00FAD6, 0x025CD0},
+{0x00FAD7, 0x00FAD7, 0x027ED3},
+{0x00FAD8, 0x00FAD8, 0x009F43},
+{0x00FAD9, 0x00FAD9, 0x009F8E},
+{0x00FB1D, 0x00FB1D, 0x0005D9},
+{0x00FB1F, 0x00FB1F, 0x0005F2},
+{0x00FB2A, 0x00FB2D, 0x0005E9},
+{0x00FB2E, 0x00FB30, 0x0005D0},
+{0x00FB31, 0x00FB31, 0x0005D1},
+{0x00FB32, 0x00FB32, 0x0005D2},
+{0x00FB33, 0x00FB33, 0x0005D3},
+{0x00FB34, 0x00FB34, 0x0005D4},
+{0x00FB35, 0x00FB35, 0x0005D5},
+{0x00FB36, 0x00FB36, 0x0005D6},
+{0x00FB38, 0x00FB38, 0x0005D8},
+{0x00FB39, 0x00FB39, 0x0005D9},
+{0x00FB3A, 0x00FB3A, 0x0005DA},
+{0x00FB3B, 0x00FB3B, 0x0005DB},
+{0x00FB3C, 0x00FB3C, 0x0005DC},
+{0x00FB3E, 0x00FB3E, 0x0005DE},
+{0x00FB40, 0x00FB40, 0x0005E0},
+{0x00FB41, 0x00FB41, 0x0005E1},
+{0x00FB43, 0x00FB43, 0x0005E3},
+{0x00FB44, 0x00FB44, 0x0005E4},
+{0x00FB46, 0x00FB46, 0x0005E6},
+{0x00FB47, 0x00FB47, 0x0005E7},
+{0x00FB48, 0x00FB48, 0x0005E8},
+{0x00FB49, 0x00FB49, 0x0005E9},
+{0x00FB4A, 0x00FB4A, 0x0005EA},
+{0x00FB4B, 0x00FB4B, 0x0005D5},
+{0x00FB4C, 0x00FB4C, 0x0005D1},
+{0x00FB4D, 0x00FB4D, 0x0005DB},
+{0x00FB4E, 0x00FB4E, 0x0005E4},
+{0x01109A, 0x01109A, 0x011099},
+{0x01109C, 0x01109C, 0x01109B},
+{0x0110AB, 0x0110AB, 0x0110A5},
+{0x01112E, 0x01112E, 0x011131},
+{0x01112F, 0x01112F, 0x011132},
+{0x01134B, 0x01134C, 0x011347},
+{0x0114BB, 0x0114BC, 0x0114B9},
+{0x0114BE, 0x0114BE, 0x0114B9},
+{0x0115BA, 0x0115BA, 0x0115B8},
+{0x0115BB, 0x0115BB, 0x0115B9},
+{0x011938, 0x011938, 0x011935},
+{0x01D15E, 0x01D15E, 0x01D157},
+{0x01D15F, 0x01D164, 0x01D158},
+{0x01D1BB, 0x01D1BB, 0x01D1B9},
+{0x01D1BC, 0x01D1BC, 0x01D1BA},
+{0x01D1BD, 0x01D1BD, 0x01D1B9},
+{0x01D1BE, 0x01D1BE, 0x01D1BA},
+{0x01D1BF, 0x01D1BF, 0x01D1B9},
+{0x01D1C0, 0x01D1C0, 0x01D1BA},
+{0x02F800, 0x02F800, 0x004E3D},
+{0x02F801, 0x02F801, 0x004E38},
+{0x02F802, 0x02F802, 0x004E41},
+{0x02F803, 0x02F803, 0x020122},
+{0x02F804, 0x02F804, 0x004F60},
+{0x02F805, 0x02F805, 0x004FAE},
+{0x02F806, 0x02F806, 0x004FBB},
+{0x02F807, 0x02F807, 0x005002},
+{0x02F808, 0x02F808, 0x00507A},
+{0x02F809, 0x02F809, 0x005099},
+{0x02F80A, 0x02F80A, 0x0050E7},
+{0x02F80B, 0x02F80B, 0x0050CF},
+{0x02F80C, 0x02F80C, 0x00349E},
+{0x02F80D, 0x02F80D, 0x02063A},
+{0x02F80E, 0x02F80E, 0x00514D},
+{0x02F80F, 0x02F80F, 0x005154},
+{0x02F810, 0x02F810, 0x005164},
+{0x02F811, 0x02F811, 0x005177},
+{0x02F812, 0x02F812, 0x02051C},
+{0x02F813, 0x02F813, 0x0034B9},
+{0x02F814, 0x02F814, 0x005167},
+{0x02F815, 0x02F815, 0x00518D},
+{0x02F816, 0x02F816, 0x02054B},
+{0x02F817, 0x02F817, 0x005197},
+{0x02F818, 0x02F818, 0x0051A4},
+{0x02F819, 0x02F819, 0x004ECC},
+{0x02F81A, 0x02F81A, 0x0051AC},
+{0x02F81B, 0x02F81B, 0x0051B5},
+{0x02F81C, 0x02F81C, 0x0291DF},
+{0x02F81D, 0x02F81D, 0x0051F5},
+{0x02F81E, 0x02F81E, 0x005203},
+{0x02F81F, 0x02F81F, 0x0034DF},
+{0x02F820, 0x02F820, 0x00523B},
+{0x02F821, 0x02F821, 0x005246},
+{0x02F822, 0x02F822, 0x005272},
+{0x02F823, 0x02F823, 0x005277},
+{0x02F824, 0x02F824, 0x003515},
+{0x02F825, 0x02F825, 0x0052C7},
+{0x02F826, 0x02F826, 0x0052C9},
+{0x02F827, 0x02F827, 0x0052E4},
+{0x02F828, 0x02F828, 0x0052FA},
+{0x02F829, 0x02F829, 0x005305},
+{0x02F82A, 0x02F82A, 0x005306},
+{0x02F82B, 0x02F82B, 0x005317},
+{0x02F82C, 0x02F82C, 0x005349},
+{0x02F82D, 0x02F82D, 0x005351},
+{0x02F82E, 0x02F82E, 0x00535A},
+{0x02F82F, 0x02F82F, 0x005373},
+{0x02F830, 0x02F830, 0x00537D},
+{0x02F831, 0x02F833, 0x00537F},
+{0x02F834, 0x02F834, 0x020A2C},
+{0x02F835, 0x02F835, 0x007070},
+{0x02F836, 0x02F836, 0x0053CA},
+{0x02F837, 0x02F837, 0x0053DF},
+{0x02F838, 0x02F838, 0x020B63},
+{0x02F839, 0x02F839, 0x0053EB},
+{0x02F83A, 0x02F83A, 0x0053F1},
+{0x02F83B, 0x02F83B, 0x005406},
+{0x02F83C, 0x02F83C, 0x00549E},
+{0x02F83D, 0x02F83D, 0x005438},
+{0x02F83E, 0x02F83E, 0x005448},
+{0x02F83F, 0x02F83F, 0x005468},
+{0x02F840, 0x02F840, 0x0054A2},
+{0x02F841, 0x02F841, 0x0054F6},
+{0x02F842, 0x02F842, 0x005510},
+{0x02F843, 0x02F843, 0x005553},
+{0x02F844, 0x02F844, 0x005563},
+{0x02F845, 0x02F846, 0x005584},
+{0x02F847, 0x02F847, 0x005599},
+{0x02F848, 0x02F848, 0x0055AB},
+{0x02F849, 0x02F849, 0x0055B3},
+{0x02F84A, 0x02F84A, 0x0055C2},
+{0x02F84B, 0x02F84B, 0x005716},
+{0x02F84C, 0x02F84C, 0x005606},
+{0x02F84D, 0x02F84D, 0x005717},
+{0x02F84E, 0x02F84E, 0x005651},
+{0x02F84F, 0x02F84F, 0x005674},
+{0x02F850, 0x02F850, 0x005207},
+{0x02F851, 0x02F851, 0x0058EE},
+{0x02F852, 0x02F852, 0x0057CE},
+{0x02F853, 0x02F853, 0x0057F4},
+{0x02F854, 0x02F854, 0x00580D},
+{0x02F855, 0x02F855, 0x00578B},
+{0x02F856, 0x02F856, 0x005832},
+{0x02F857, 0x02F857, 0x005831},
+{0x02F858, 0x02F858, 0x0058AC},
+{0x02F859, 0x02F859, 0x0214E4},
+{0x02F85A, 0x02F85A, 0x0058F2},
+{0x02F85B, 0x02F85B, 0x0058F7},
+{0x02F85C, 0x02F85C, 0x005906},
+{0x02F85D, 0x02F85D, 0x00591A},
+{0x02F85E, 0x02F85E, 0x005922},
+{0x02F85F, 0x02F85F, 0x005962},
+{0x02F860, 0x02F860, 0x0216A8},
+{0x02F861, 0x02F861, 0x0216EA},
+{0x02F862, 0x02F862, 0x0059EC},
+{0x02F863, 0x02F863, 0x005A1B},
+{0x02F864, 0x02F864, 0x005A27},
+{0x02F865, 0x02F865, 0x0059D8},
+{0x02F866, 0x02F866, 0x005A66},
+{0x02F867, 0x02F867, 0x0036EE},
+{0x02F868, 0x02F868, 0x0036FC},
+{0x02F869, 0x02F869, 0x005B08},
+{0x02F86A, 0x02F86B, 0x005B3E},
+{0x02F86C, 0x02F86C, 0x0219C8},
+{0x02F86D, 0x02F86D, 0x005BC3},
+{0x02F86E, 0x02F86E, 0x005BD8},
+{0x02F86F, 0x02F86F, 0x005BE7},
+{0x02F870, 0x02F870, 0x005BF3},
+{0x02F871, 0x02F871, 0x021B18},
+{0x02F872, 0x02F872, 0x005BFF},
+{0x02F873, 0x02F873, 0x005C06},
+{0x02F874, 0x02F874, 0x005F53},
+{0x02F875, 0x02F875, 0x005C22},
+{0x02F876, 0x02F876, 0x003781},
+{0x02F877, 0x02F877, 0x005C60},
+{0x02F878, 0x02F878, 0x005C6E},
+{0x02F879, 0x02F879, 0x005CC0},
+{0x02F87A, 0x02F87A, 0x005C8D},
+{0x02F87B, 0x02F87B, 0x021DE4},
+{0x02F87C, 0x02F87C, 0x005D43},
+{0x02F87D, 0x02F87D, 0x021DE6},
+{0x02F87E, 0x02F87E, 0x005D6E},
+{0x02F87F, 0x02F87F, 0x005D6B},
+{0x02F880, 0x02F880, 0x005D7C},
+{0x02F881, 0x02F881, 0x005DE1},
+{0x02F882, 0x02F882, 0x005DE2},
+{0x02F883, 0x02F883, 0x00382F},
+{0x02F884, 0x02F884, 0x005DFD},
+{0x02F885, 0x02F885, 0x005E28},
+{0x02F886, 0x02F886, 0x005E3D},
+{0x02F887, 0x02F887, 0x005E69},
+{0x02F888, 0x02F888, 0x003862},
+{0x02F889, 0x02F889, 0x022183},
+{0x02F88A, 0x02F88A, 0x00387C},
+{0x02F88B, 0x02F88B, 0x005EB0},
+{0x02F88C, 0x02F88C, 0x005EB3},
+{0x02F88D, 0x02F88D, 0x005EB6},
+{0x02F88E, 0x02F88E, 0x005ECA},
+{0x02F88F, 0x02F88F, 0x02A392},
+{0x02F890, 0x02F890, 0x005EFE},
+{0x02F891, 0x02F892, 0x022331},
+{0x02F893, 0x02F893, 0x008201},
+{0x02F894, 0x02F895, 0x005F22},
+{0x02F896, 0x02F896, 0x0038C7},
+{0x02F897, 0x02F897, 0x0232B8},
+{0x02F898, 0x02F898, 0x0261DA},
+{0x02F899, 0x02F899, 0x005F62},
+{0x02F89A, 0x02F89A, 0x005F6B},
+{0x02F89B, 0x02F89B, 0x0038E3},
+{0x02F89C, 0x02F89C, 0x005F9A},
+{0x02F89D, 0x02F89D, 0x005FCD},
+{0x02F89E, 0x02F89E, 0x005FD7},
+{0x02F89F, 0x02F89F, 0x005FF9},
+{0x02F8A0, 0x02F8A0, 0x006081},
+{0x02F8A1, 0x02F8A1, 0x00393A},
+{0x02F8A2, 0x02F8A2, 0x00391C},
+{0x02F8A3, 0x02F8A3, 0x006094},
+{0x02F8A4, 0x02F8A4, 0x0226D4},
+{0x02F8A5, 0x02F8A5, 0x0060C7},
+{0x02F8A6, 0x02F8A6, 0x006148},
+{0x02F8A7, 0x02F8A7, 0x00614C},
+{0x02F8A8, 0x02F8A8, 0x00614E},
+{0x02F8A9, 0x02F8A9, 0x00614C},
+{0x02F8AA, 0x02F8AA, 0x00617A},
+{0x02F8AB, 0x02F8AB, 0x00618E},
+{0x02F8AC, 0x02F8AC, 0x0061B2},
+{0x02F8AD, 0x02F8AD, 0x0061A4},
+{0x02F8AE, 0x02F8AE, 0x0061AF},
+{0x02F8AF, 0x02F8AF, 0x0061DE},
+{0x02F8B0, 0x02F8B0, 0x0061F2},
+{0x02F8B1, 0x02F8B1, 0x0061F6},
+{0x02F8B2, 0x02F8B2, 0x006210},
+{0x02F8B3, 0x02F8B3, 0x00621B},
+{0x02F8B4, 0x02F8B4, 0x00625D},
+{0x02F8B5, 0x02F8B5, 0x0062B1},
+{0x02F8B6, 0x02F8B6, 0x0062D4},
+{0x02F8B7, 0x02F8B7, 0x006350},
+{0x02F8B8, 0x02F8B8, 0x022B0C},
+{0x02F8B9, 0x02F8B9, 0x00633D},
+{0x02F8BA, 0x02F8BA, 0x0062FC},
+{0x02F8BB, 0x02F8BB, 0x006368},
+{0x02F8BC, 0x02F8BC, 0x006383},
+{0x02F8BD, 0x02F8BD, 0x0063E4},
+{0x02F8BE, 0x02F8BE, 0x022BF1},
+{0x02F8BF, 0x02F8BF, 0x006422},
+{0x02F8C0, 0x02F8C0, 0x0063C5},
+{0x02F8C1, 0x02F8C1, 0x0063A9},
+{0x02F8C2, 0x02F8C2, 0x003A2E},
+{0x02F8C3, 0x02F8C3, 0x006469},
+{0x02F8C4, 0x02F8C4, 0x00647E},
+{0x02F8C5, 0x02F8C5, 0x00649D},
+{0x02F8C6, 0x02F8C6, 0x006477},
+{0x02F8C7, 0x02F8C7, 0x003A6C},
+{0x02F8C8, 0x02F8C8, 0x00654F},
+{0x02F8C9, 0x02F8C9, 0x00656C},
+{0x02F8CA, 0x02F8CA, 0x02300A},
+{0x02F8CB, 0x02F8CB, 0x0065E3},
+{0x02F8CC, 0x02F8CC, 0x0066F8},
+{0x02F8CD, 0x02F8CD, 0x006649},
+{0x02F8CE, 0x02F8CE, 0x003B19},
+{0x02F8CF, 0x02F8CF, 0x006691},
+{0x02F8D0, 0x02F8D0, 0x003B08},
+{0x02F8D1, 0x02F8D1, 0x003AE4},
+{0x02F8D2, 0x02F8D2, 0x005192},
+{0x02F8D3, 0x02F8D3, 0x005195},
+{0x02F8D4, 0x02F8D4, 0x006700},
+{0x02F8D5, 0x02F8D5, 0x00669C},
+{0x02F8D6, 0x02F8D6, 0x0080AD},
+{0x02F8D7, 0x02F8D7, 0x0043D9},
+{0x02F8D8, 0x02F8D8, 0x006717},
+{0x02F8D9, 0x02F8D9, 0x00671B},
+{0x02F8DA, 0x02F8DA, 0x006721},
+{0x02F8DB, 0x02F8DB, 0x00675E},
+{0x02F8DC, 0x02F8DC, 0x006753},
+{0x02F8DD, 0x02F8DD, 0x0233C3},
+{0x02F8DE, 0x02F8DE, 0x003B49},
+{0x02F8DF, 0x02F8DF, 0x0067FA},
+{0x02F8E0, 0x02F8E0, 0x006785},
+{0x02F8E1, 0x02F8E1, 0x006852},
+{0x02F8E2, 0x02F8E2, 0x006885},
+{0x02F8E3, 0x02F8E3, 0x02346D},
+{0x02F8E4, 0x02F8E4, 0x00688E},
+{0x02F8E5, 0x02F8E5, 0x00681F},
+{0x02F8E6, 0x02F8E6, 0x006914},
+{0x02F8E7, 0x02F8E7, 0x003B9D},
+{0x02F8E8, 0x02F8E8, 0x006942},
+{0x02F8E9, 0x02F8E9, 0x0069A3},
+{0x02F8EA, 0x02F8EA, 0x0069EA},
+{0x02F8EB, 0x02F8EB, 0x006AA8},
+{0x02F8EC, 0x02F8EC, 0x0236A3},
+{0x02F8ED, 0x02F8ED, 0x006ADB},
+{0x02F8EE, 0x02F8EE, 0x003C18},
+{0x02F8EF, 0x02F8EF, 0x006B21},
+{0x02F8F0, 0x02F8F0, 0x0238A7},
+{0x02F8F1, 0x02F8F1, 0x006B54},
+{0x02F8F2, 0x02F8F2, 0x003C4E},
+{0x02F8F3, 0x02F8F3, 0x006B72},
+{0x02F8F4, 0x02F8F4, 0x006B9F},
+{0x02F8F5, 0x02F8F5, 0x006BBA},
+{0x02F8F6, 0x02F8F6, 0x006BBB},
+{0x02F8F7, 0x02F8F7, 0x023A8D},
+{0x02F8F8, 0x02F8F8, 0x021D0B},
+{0x02F8F9, 0x02F8F9, 0x023AFA},
+{0x02F8FA, 0x02F8FA, 0x006C4E},
+{0x02F8FB, 0x02F8FB, 0x023CBC},
+{0x02F8FC, 0x02F8FC, 0x006CBF},
+{0x02F8FD, 0x02F8FD, 0x006CCD},
+{0x02F8FE, 0x02F8FE, 0x006C67},
+{0x02F8FF, 0x02F8FF, 0x006D16},
+{0x02F900, 0x02F900, 0x006D3E},
+{0x02F901, 0x02F901, 0x006D77},
+{0x02F902, 0x02F902, 0x006D41},
+{0x02F903, 0x02F903, 0x006D69},
+{0x02F904, 0x02F904, 0x006D78},
+{0x02F905, 0x02F905, 0x006D85},
+{0x02F906, 0x02F906, 0x023D1E},
+{0x02F907, 0x02F907, 0x006D34},
+{0x02F908, 0x02F908, 0x006E2F},
+{0x02F909, 0x02F909, 0x006E6E},
+{0x02F90A, 0x02F90A, 0x003D33},
+{0x02F90B, 0x02F90B, 0x006ECB},
+{0x02F90C, 0x02F90C, 0x006EC7},
+{0x02F90D, 0x02F90D, 0x023ED1},
+{0x02F90E, 0x02F90E, 0x006DF9},
+{0x02F90F, 0x02F90F, 0x006F6E},
+{0x02F910, 0x02F910, 0x023F5E},
+{0x02F911, 0x02F911, 0x023F8E},
+{0x02F912, 0x02F912, 0x006FC6},
+{0x02F913, 0x02F913, 0x007039},
+{0x02F914, 0x02F914, 0x00701E},
+{0x02F915, 0x02F915, 0x00701B},
+{0x02F916, 0x02F916, 0x003D96},
+{0x02F917, 0x02F917, 0x00704A},
+{0x02F918, 0x02F918, 0x00707D},
+{0x02F919, 0x02F919, 0x007077},
+{0x02F91A, 0x02F91A, 0x0070AD},
+{0x02F91B, 0x02F91B, 0x020525},
+{0x02F91C, 0x02F91C, 0x007145},
+{0x02F91D, 0x02F91D, 0x024263},
+{0x02F91E, 0x02F91E, 0x00719C},
+{0x02F91F, 0x02F91F, 0x0243AB},
+{0x02F920, 0x02F920, 0x007228},
+{0x02F921, 0x02F921, 0x007235},
+{0x02F922, 0x02F922, 0x007250},
+{0x02F923, 0x02F923, 0x024608},
+{0x02F924, 0x02F924, 0x007280},
+{0x02F925, 0x02F925, 0x007295},
+{0x02F926, 0x02F926, 0x024735},
+{0x02F927, 0x02F927, 0x024814},
+{0x02F928, 0x02F928, 0x00737A},
+{0x02F929, 0x02F929, 0x00738B},
+{0x02F92A, 0x02F92A, 0x003EAC},
+{0x02F92B, 0x02F92B, 0x0073A5},
+{0x02F92C, 0x02F92D, 0x003EB8},
+{0x02F92E, 0x02F92E, 0x007447},
+{0x02F92F, 0x02F92F, 0x00745C},
+{0x02F930, 0x02F930, 0x007471},
+{0x02F931, 0x02F931, 0x007485},
+{0x02F932, 0x02F932, 0x0074CA},
+{0x02F933, 0x02F933, 0x003F1B},
+{0x02F934, 0x02F934, 0x007524},
+{0x02F935, 0x02F935, 0x024C36},
+{0x02F936, 0x02F936, 0x00753E},
+{0x02F937, 0x02F937, 0x024C92},
+{0x02F938, 0x02F938, 0x007570},
+{0x02F939, 0x02F939, 0x02219F},
+{0x02F93A, 0x02F93A, 0x007610},
+{0x02F93B, 0x02F93B, 0x024FA1},
+{0x02F93C, 0x02F93C, 0x024FB8},
+{0x02F93D, 0x02F93D, 0x025044},
+{0x02F93E, 0x02F93E, 0x003FFC},
+{0x02F93F, 0x02F93F, 0x004008},
+{0x02F940, 0x02F940, 0x0076F4},
+{0x02F941, 0x02F941, 0x0250F3},
+{0x02F942, 0x02F942, 0x0250F2},
+{0x02F943, 0x02F943, 0x025119},
+{0x02F944, 0x02F944, 0x025133},
+{0x02F945, 0x02F945, 0x00771E},
+{0x02F946, 0x02F947, 0x00771F},
+{0x02F948, 0x02F948, 0x00774A},
+{0x02F949, 0x02F949, 0x004039},
+{0x02F94A, 0x02F94A, 0x00778B},
+{0x02F94B, 0x02F94B, 0x004046},
+{0x02F94C, 0x02F94C, 0x004096},
+{0x02F94D, 0x02F94D, 0x02541D},
+{0x02F94E, 0x02F94E, 0x00784E},
+{0x02F94F, 0x02F94F, 0x00788C},
+{0x02F950, 0x02F950, 0x0078CC},
+{0x02F951, 0x02F951, 0x0040E3},
+{0x02F952, 0x02F952, 0x025626},
+{0x02F953, 0x02F953, 0x007956},
+{0x02F954, 0x02F954, 0x02569A},
+{0x02F955, 0x02F955, 0x0256C5},
+{0x02F956, 0x02F956, 0x00798F},
+{0x02F957, 0x02F957, 0x0079EB},
+{0x02F958, 0x02F958, 0x00412F},
+{0x02F959, 0x02F959, 0x007A40},
+{0x02F95A, 0x02F95A, 0x007A4A},
+{0x02F95B, 0x02F95B, 0x007A4F},
+{0x02F95C, 0x02F95C, 0x02597C},
+{0x02F95D, 0x02F95E, 0x025AA7},
+{0x02F95F, 0x02F95F, 0x007AEE},
+{0x02F960, 0x02F960, 0x004202},
+{0x02F961, 0x02F961, 0x025BAB},
+{0x02F962, 0x02F962, 0x007BC6},
+{0x02F963, 0x02F963, 0x007BC9},
+{0x02F964, 0x02F964, 0x004227},
+{0x02F965, 0x02F965, 0x025C80},
+{0x02F966, 0x02F966, 0x007CD2},
+{0x02F967, 0x02F967, 0x0042A0},
+{0x02F968, 0x02F968, 0x007CE8},
+{0x02F969, 0x02F969, 0x007CE3},
+{0x02F96A, 0x02F96A, 0x007D00},
+{0x02F96B, 0x02F96B, 0x025F86},
+{0x02F96C, 0x02F96C, 0x007D63},
+{0x02F96D, 0x02F96D, 0x004301},
+{0x02F96E, 0x02F96E, 0x007DC7},
+{0x02F96F, 0x02F96F, 0x007E02},
+{0x02F970, 0x02F970, 0x007E45},
+{0x02F971, 0x02F971, 0x004334},
+{0x02F972, 0x02F972, 0x026228},
+{0x02F973, 0x02F973, 0x026247},
+{0x02F974, 0x02F974, 0x004359},
+{0x02F975, 0x02F975, 0x0262D9},
+{0x02F976, 0x02F976, 0x007F7A},
+{0x02F977, 0x02F977, 0x02633E},
+{0x02F978, 0x02F978, 0x007F95},
+{0x02F979, 0x02F979, 0x007FFA},
+{0x02F97A, 0x02F97A, 0x008005},
+{0x02F97B, 0x02F97B, 0x0264DA},
+{0x02F97C, 0x02F97C, 0x026523},
+{0x02F97D, 0x02F97D, 0x008060},
+{0x02F97E, 0x02F97E, 0x0265A8},
+{0x02F97F, 0x02F97F, 0x008070},
+{0x02F980, 0x02F980, 0x02335F},
+{0x02F981, 0x02F981, 0x0043D5},
+{0x02F982, 0x02F982, 0x0080B2},
+{0x02F983, 0x02F983, 0x008103},
+{0x02F984, 0x02F984, 0x00440B},
+{0x02F985, 0x02F985, 0x00813E},
+{0x02F986, 0x02F986, 0x005AB5},
+{0x02F987, 0x02F987, 0x0267A7},
+{0x02F988, 0x02F988, 0x0267B5},
+{0x02F989, 0x02F989, 0x023393},
+{0x02F98A, 0x02F98A, 0x02339C},
+{0x02F98B, 0x02F98B, 0x008201},
+{0x02F98C, 0x02F98C, 0x008204},
+{0x02F98D, 0x02F98D, 0x008F9E},
+{0x02F98E, 0x02F98E, 0x00446B},
+{0x02F98F, 0x02F98F, 0x008291},
+{0x02F990, 0x02F990, 0x00828B},
+{0x02F991, 0x02F991, 0x00829D},
+{0x02F992, 0x02F992, 0x0052B3},
+{0x02F993, 0x02F993, 0x0082B1},
+{0x02F994, 0x02F994, 0x0082B3},
+{0x02F995, 0x02F995, 0x0082BD},
+{0x02F996, 0x02F996, 0x0082E6},
+{0x02F997, 0x02F997, 0x026B3C},
+{0x02F998, 0x02F998, 0x0082E5},
+{0x02F999, 0x02F999, 0x00831D},
+{0x02F99A, 0x02F99A, 0x008363},
+{0x02F99B, 0x02F99B, 0x0083AD},
+{0x02F99C, 0x02F99C, 0x008323},
+{0x02F99D, 0x02F99D, 0x0083BD},
+{0x02F99E, 0x02F99E, 0x0083E7},
+{0x02F99F, 0x02F99F, 0x008457},
+{0x02F9A0, 0x02F9A0, 0x008353},
+{0x02F9A1, 0x02F9A1, 0x0083CA},
+{0x02F9A2, 0x02F9A2, 0x0083CC},
+{0x02F9A3, 0x02F9A3, 0x0083DC},
+{0x02F9A4, 0x02F9A4, 0x026C36},
+{0x02F9A5, 0x02F9A5, 0x026D6B},
+{0x02F9A6, 0x02F9A6, 0x026CD5},
+{0x02F9A7, 0x02F9A7, 0x00452B},
+{0x02F9A8, 0x02F9A8, 0x0084F1},
+{0x02F9A9, 0x02F9A9, 0x0084F3},
+{0x02F9AA, 0x02F9AA, 0x008516},
+{0x02F9AB, 0x02F9AB, 0x0273CA},
+{0x02F9AC, 0x02F9AC, 0x008564},
+{0x02F9AD, 0x02F9AD, 0x026F2C},
+{0x02F9AE, 0x02F9AE, 0x00455D},
+{0x02F9AF, 0x02F9AF, 0x004561},
+{0x02F9B0, 0x02F9B0, 0x026FB1},
+{0x02F9B1, 0x02F9B1, 0x0270D2},
+{0x02F9B2, 0x02F9B2, 0x00456B},
+{0x02F9B3, 0x02F9B3, 0x008650},
+{0x02F9B4, 0x02F9B4, 0x00865C},
+{0x02F9B5, 0x02F9B5, 0x008667},
+{0x02F9B6, 0x02F9B6, 0x008669},
+{0x02F9B7, 0x02F9B7, 0x0086A9},
+{0x02F9B8, 0x02F9B8, 0x008688},
+{0x02F9B9, 0x02F9B9, 0x00870E},
+{0x02F9BA, 0x02F9BA, 0x0086E2},
+{0x02F9BB, 0x02F9BB, 0x008779},
+{0x02F9BC, 0x02F9BC, 0x008728},
+{0x02F9BD, 0x02F9BD, 0x00876B},
+{0x02F9BE, 0x02F9BE, 0x008786},
+{0x02F9BF, 0x02F9BF, 0x0045D7},
+{0x02F9C0, 0x02F9C0, 0x0087E1},
+{0x02F9C1, 0x02F9C1, 0x008801},
+{0x02F9C2, 0x02F9C2, 0x0045F9},
+{0x02F9C3, 0x02F9C3, 0x008860},
+{0x02F9C4, 0x02F9C4, 0x008863},
+{0x02F9C5, 0x02F9C5, 0x027667},
+{0x02F9C6, 0x02F9C6, 0x0088D7},
+{0x02F9C7, 0x02F9C7, 0x0088DE},
+{0x02F9C8, 0x02F9C8, 0x004635},
+{0x02F9C9, 0x02F9C9, 0x0088FA},
+{0x02F9CA, 0x02F9CA, 0x0034BB},
+{0x02F9CB, 0x02F9CB, 0x0278AE},
+{0x02F9CC, 0x02F9CC, 0x027966},
+{0x02F9CD, 0x02F9CD, 0x0046BE},
+{0x02F9CE, 0x02F9CE, 0x0046C7},
+{0x02F9CF, 0x02F9CF, 0x008AA0},
+{0x02F9D0, 0x02F9D0, 0x008AED},
+{0x02F9D1, 0x02F9D1, 0x008B8A},
+{0x02F9D2, 0x02F9D2, 0x008C55},
+{0x02F9D3, 0x02F9D3, 0x027CA8},
+{0x02F9D4, 0x02F9D4, 0x008CAB},
+{0x02F9D5, 0x02F9D5, 0x008CC1},
+{0x02F9D6, 0x02F9D6, 0x008D1B},
+{0x02F9D7, 0x02F9D7, 0x008D77},
+{0x02F9D8, 0x02F9D8, 0x027F2F},
+{0x02F9D9, 0x02F9D9, 0x020804},
+{0x02F9DA, 0x02F9DA, 0x008DCB},
+{0x02F9DB, 0x02F9DB, 0x008DBC},
+{0x02F9DC, 0x02F9DC, 0x008DF0},
+{0x02F9DD, 0x02F9DD, 0x0208DE},
+{0x02F9DE, 0x02F9DE, 0x008ED4},
+{0x02F9DF, 0x02F9DF, 0x008F38},
+{0x02F9E0, 0x02F9E0, 0x0285D2},
+{0x02F9E1, 0x02F9E1, 0x0285ED},
+{0x02F9E2, 0x02F9E2, 0x009094},
+{0x02F9E3, 0x02F9E3, 0x0090F1},
+{0x02F9E4, 0x02F9E4, 0x009111},
+{0x02F9E5, 0x02F9E5, 0x02872E},
+{0x02F9E6, 0x02F9E6, 0x00911B},
+{0x02F9E7, 0x02F9E7, 0x009238},
+{0x02F9E8, 0x02F9E8, 0x0092D7},
+{0x02F9E9, 0x02F9E9, 0x0092D8},
+{0x02F9EA, 0x02F9EA, 0x00927C},
+{0x02F9EB, 0x02F9EB, 0x0093F9},
+{0x02F9EC, 0x02F9EC, 0x009415},
+{0x02F9ED, 0x02F9ED, 0x028BFA},
+{0x02F9EE, 0x02F9EE, 0x00958B},
+{0x02F9EF, 0x02F9EF, 0x004995},
+{0x02F9F0, 0x02F9F0, 0x0095B7},
+{0x02F9F1, 0x02F9F1, 0x028D77},
+{0x02F9F2, 0x02F9F2, 0x0049E6},
+{0x02F9F3, 0x02F9F3, 0x0096C3},
+{0x02F9F4, 0x02F9F4, 0x005DB2},
+{0x02F9F5, 0x02F9F5, 0x009723},
+{0x02F9F6, 0x02F9F6, 0x029145},
+{0x02F9F7, 0x02F9F7, 0x02921A},
+{0x02F9F8, 0x02F9F8, 0x004A6E},
+{0x02F9F9, 0x02F9F9, 0x004A76},
+{0x02F9FA, 0x02F9FA, 0x0097E0},
+{0x02F9FB, 0x02F9FB, 0x02940A},
+{0x02F9FC, 0x02F9FC, 0x004AB2},
+{0x02F9FD, 0x02F9FD, 0x029496},
+{0x02F9FE, 0x02F9FF, 0x00980B},
+{0x02FA00, 0x02FA00, 0x009829},
+{0x02FA01, 0x02FA01, 0x0295B6},
+{0x02FA02, 0x02FA02, 0x0098E2},
+{0x02FA03, 0x02FA03, 0x004B33},
+{0x02FA04, 0x02FA04, 0x009929},
+{0x02FA05, 0x02FA05, 0x0099A7},
+{0x02FA06, 0x02FA06, 0x0099C2},
+{0x02FA07, 0x02FA07, 0x0099FE},
+{0x02FA08, 0x02FA08, 0x004BCE},
+{0x02FA09, 0x02FA09, 0x029B30},
+{0x02FA0A, 0x02FA0A, 0x009B12},
+{0x02FA0B, 0x02FA0B, 0x009C40},
+{0x02FA0C, 0x02FA0C, 0x009CFD},
+{0x02FA0D, 0x02FA0D, 0x004CCE},
+{0x02FA0E, 0x02FA0E, 0x004CED},
+{0x02FA0F, 0x02FA0F, 0x009D67},
+{0x02FA10, 0x02FA10, 0x02A0CE},
+{0x02FA11, 0x02FA11, 0x004CF8},
+{0x02FA12, 0x02FA12, 0x02A105},
+{0x02FA13, 0x02FA13, 0x02A20E},
+{0x02FA14, 0x02FA14, 0x02A291},
+{0x02FA15, 0x02FA15, 0x009EBB},
+{0x02FA16, 0x02FA16, 0x004D56},
+{0x02FA17, 0x02FA17, 0x009EF9},
+{0x02FA18, 0x02FA18, 0x009EFE},
+{0x02FA19, 0x02FA19, 0x009F05},
+{0x02FA1A, 0x02FA1A, 0x009F0F},
+{0x02FA1B, 0x02FA1B, 0x009F16},
+{0x02FA1C, 0x02FA1C, 0x009F3B},
+{0x02FA1D, 0x02FA1D, 0x02A600},
+};
diff --git a/src/unicode-data.h b/src/unicode-data.h
new file mode 100644
index 000000000..f6973ebd2
--- /dev/null
+++ b/src/unicode-data.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+struct range_nfd {
+    uint32_t first;
+    uint32_t last;
+    uint32_t nfd;
+};
+
+static const uint32_t MAX_CODEPOINTS = 0x110000;
+
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
diff --git a/src/unicode.cpp b/src/unicode.cpp
new file mode 100644
index 000000000..a32ae6d08
--- /dev/null
+++ b/src/unicode.cpp
@@ -0,0 +1,849 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
+#include "unicode.h"
+#include "unicode-data.h"
+
+#include <algorithm>
+#include <cassert>
+#include <codecvt>
+#include <cstddef>
+#include <cstdint>
+#include <locale>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+size_t unicode_len_utf8(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
+    std::string result;
+    for (size_t i = 0; i < cps.size(); ++i) {
+        result.append(unicode_cpt_to_utf8(cps[i]));
+    }
+    return result;
+}
+
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+    assert(offset < utf8.size());
+    if (!(utf8[offset + 0] & 0x80)) {
+        auto result = utf8[offset + 0];
+        offset += 1;
+        return result;
+    }
+    if (!(utf8[offset + 0] & 0x40)) {
+        throw std::invalid_argument("invalid character");
+    }
+    if (!(utf8[offset + 0] & 0x20)) {
+        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
+            throw std::invalid_argument("invalid character");
+        }
+        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
+        offset += 2;
+        return result;
+    }
+    if (!(utf8[offset + 0] & 0x10)) {
+        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
+            throw std::invalid_argument("invalid character");
+        }
+        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
+        offset += 3;
+        return result;
+    }
+    if (!(utf8[offset + 0] & 0x08)) {
+        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
+            throw std::invalid_argument("invalid character");
+        }
+        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
+        offset += 4;
+        return result;
+    }
+    throw std::invalid_argument("failed to convert utf8 to codepoint");
+}
+
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
+//    std::vector<uint16_t> result;
+//    if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+//        result.emplace_back(cpt);
+//        return result;
+//    }
+//    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+//        result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+//        result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
+//        return result;
+//    }
+//    throw std::invalid_argument("failed to convert codepoint to utf16");
+//}
+
+//static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
+//    std::vector<uint16_t> result;
+//    for (size_t i = 0; i < cps.size(); ++i) {
+//        auto temp = unicode_cpt_to_utf16(cps[i]);
+//        result.insert(result.end(), temp.begin(), temp.end());
+//    }
+//    return result;
+//}
+
+//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
+//    assert(offset < utf16.size());
+//    if (((utf16[0] >> 10) << 10) != 0xd800) {
+//        auto result = utf16[offset + 0];
+//        offset += 1;
+//        return result;
+//    }
+//
+//    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+//        throw std::invalid_argument("invalid character");
+//    }
+//
+//    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+//    offset += 2;
+//    return result;
+//}
+
+//static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
+//    std::vector<uint32_t> result;
+//    size_t offset = 0;
+//    while (offset < utf16.size()) {
+//        result.push_back(unicode_cpt_from_utf16(utf16, offset));
+//    }
+//    return result;
+//}
+
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+    std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
+
+    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
+    for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
+        const auto range_ini = unicode_ranges_flags.begin()[i-1];  // codepoint_ini, flags
+        const auto range_end = unicode_ranges_flags.begin()[i];    // codepoint_end, flags
+        for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
+            cpt_flags[cpt] = range_ini.second;
+        }
+    }
+
+    for (auto cpt : unicode_set_whitespace) {
+        cpt_flags[cpt].is_whitespace = true;
+    }
+
+    for (auto p : unicode_map_lowercase) {
+        cpt_flags[p.second].is_lowercase = true;
+    }
+
+    for (auto p : unicode_map_uppercase) {
+        cpt_flags[p.second].is_uppercase = true;
+    }
+
+    for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
+        cpt_flags[range.nfd].is_nfd = true;
+    }
+
+    return cpt_flags;
+}
+
+static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
+    std::unordered_map<uint8_t, std::string> map;
+    for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
+        assert(0 <= ch && ch < 256);
+        map[ch] = unicode_cpt_to_utf8(ch);
+    }
+    for (int ch = 0xA1; ch <= 0xAC; ++ch) {  // u'¡' to u'¬'
+        assert(0 <= ch && ch < 256);
+        map[ch] = unicode_cpt_to_utf8(ch);
+    }
+    for (int ch = 0xAE; ch <= 0xFF; ++ch) {  // u'®' to u'ÿ'
+        assert(0 <= ch && ch < 256);
+        map[ch] = unicode_cpt_to_utf8(ch);
+    }
+    auto n = 0;
+    for (int ch = 0; ch < 256; ++ch) {
+        if (map.find(ch) == map.end()) {
+            map[ch] = unicode_cpt_to_utf8(256 + n);
+            ++n;
+        }
+    }
+    return map;
+}
+
+static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
+    std::unordered_map<std::string, uint8_t> map;
+    for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
+        assert(0 <= ch && ch < 256);
+        map[unicode_cpt_to_utf8(ch)] = ch;
+    }
+    for (int ch = 0xA1; ch <= 0xAC; ++ch) {  // u'¡' to u'¬'
+        assert(0 <= ch && ch < 256);
+        map[unicode_cpt_to_utf8(ch)] = ch;
+    }
+    for (int ch = 0xAE; ch <= 0xFF; ++ch) {  // u'®' to u'ÿ'
+        assert(0 <= ch && ch < 256);
+        map[unicode_cpt_to_utf8(ch)] = ch;
+    }
+    auto n = 0;
+    for (int ch = 0; ch < 256; ++ch) {
+        if (map.find(unicode_cpt_to_utf8(ch)) == map.end()) {
+            map[unicode_cpt_to_utf8(256 + n)] = ch;
+            ++n;
+        }
+    }
+    return map;
+}
+
+static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#if defined(__clang__)
+    // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
+    return conv.from_bytes(s);
+}
+
+static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
+    std::vector<std::string> bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word =  unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
+// GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
+        };
+
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            //if (len > 0) {
+            //    std::string s = "";
+            //    for(size_t p = end-len; p < end; p++)
+            //        s += unicode_cpt_to_utf8(cpts[p]);
+            //    printf(">>> '%s'\n", s.c_str());
+            //}
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const uint32_t cpt = _get_cpt(pos);
+            const auto flags = _get_flags(pos);
+
+            // regex: 's|'t|'re|'ve|'m|'ll|'d
+            if (cpt == '\'' && pos+1 < offset_end) {
+                uint32_t cpt_next = _get_cpt(pos+1);
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                    pos += _add_token(pos+2);
+                    continue;
+                }
+                if (pos+2 < offset_end) {
+                    uint32_t cpt_next_next = _get_cpt(pos+2);
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
+                        pos += _add_token(pos+3);
+                        continue;
+                    }
+                }
+            }
+
+            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
+            // regex: <space>?\p{L}+
+            if (flags2.is_letter) {
+                pos += (cpt == ' ');
+                while (flags2.is_letter) {
+                    flags2 = _get_flags(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+            // regex: <space>?\p{N}+
+            if (flags2.is_number) {
+                pos += (cpt == ' ');
+                while (flags2.is_number) {
+                    flags2 = _get_flags(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+            // regex: <space>?[^\s\p{L}\p{N}]+
+            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                pos += (cpt == ' ');
+                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                    flags2 = _get_flags(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            size_t num_whitespaces = 0;
+            while (_get_flags(pos+num_whitespaces).is_whitespace) {
+                num_whitespaces++;
+            }
+
+            // regex: \s+(?!\S)
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // no matches
+            _add_token(++pos);
+        }
+    }
+
+    return bpe_offsets;
+}
+
+// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
+        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
+        };
+
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            //if (len > 0) {
+            //    std::string s = "";
+            //    for(size_t p = end-len; p < end; p++)
+            //        s += unicode_cpt_to_utf8(cpts[p]);
+            //    printf(">>> '%s'\n", s.c_str());
+            //}
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const uint32_t cpt = _get_cpt(pos);
+            const auto flags = _get_flags(pos);
+
+            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
+            if (cpt == '\'' && pos+1 < offset_end) {
+                uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                    pos += _add_token(pos+2);
+                    continue;
+                }
+                if (pos+2 < offset_end) {
+                    uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
+                        pos += _add_token(pos+3);
+                        continue;
+                    }
+                }
+            }
+
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+            if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
+                if (flags.is_letter || _get_flags(pos+1).is_letter) {  // one or more letters
+                    pos++;
+                    while (_get_flags(pos).is_letter) {
+                        pos++;
+                    }
+                    _add_token(pos);
+                    continue;
+                }
+            }
+
+            // regex: \p{N}{1,3}
+            if (flags.is_number) {
+                size_t ini = pos;
+                while (_get_flags(pos).is_number) {
+                    if (++pos - ini >= 3 ) {
+                        _add_token(pos);
+                        ini = pos;
+                    }
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
+            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
+            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
+                pos += (cpt == ' ');
+                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                    flags2 = _get_flags(++pos);
+                }
+                uint32_t cpt2 = _get_cpt(pos);
+                while (cpt2 == '\r' || cpt2 == '\n') {
+                    cpt2 = _get_cpt(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            size_t num_whitespaces = 0;
+            size_t last_end_r_or_n = 0;
+            while (_get_flags(pos+num_whitespaces).is_whitespace) {
+                uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
+                if (cpt2 == '\r' || cpt2 == '\n') {
+                    last_end_r_or_n = pos + num_whitespaces + 1;
+                }
+                num_whitespaces++;
+            }
+
+            // regex: \s*[\r\n]+
+            if (last_end_r_or_n > 0) {
+                pos = last_end_r_or_n;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+(?!\S)
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // no matches
+            _add_token(++pos);
+        }
+    }
+
+    return bpe_offsets;
+}
+
+// use std::wregex to split the text
+static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
+    std::wregex expr(regex_expr);
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+    size_t start = 0;
+    for (auto offset : offsets) {
+        std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
+        std::wcregex_iterator end;
+
+        int64_t start_idx = 0;
+        while (it != end) {
+            std::wcmatch match = *it;
+            if (match.position() > start_idx) {
+                bpe_offsets.emplace_back(match.position() - start_idx);
+            }
+            bpe_offsets.emplace_back(match.length());
+            start_idx = match.position() + match.length();
+            ++it;
+        }
+
+        if (start_idx < (int64_t) offset) {
+            bpe_offsets.emplace_back(offset - start_idx);
+        }
+        start += offset;
+    }
+
+    return bpe_offsets;
+}
+
+// use std::regex to split the text
+static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
+    std::regex expr(regex_expr);
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+    size_t start = 0;
+    for (auto offset : offsets) {
+        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
+        std::cregex_iterator end;
+
+        int64_t start_idx = 0;
+        while (it != end) {
+            std::cmatch match = *it;
+            if (match.position() > start_idx) {
+                bpe_offsets.emplace_back(match.position() - start_idx);
+            }
+            bpe_offsets.emplace_back(match.length());
+            start_idx = match.position() + match.length();
+            ++it;
+        }
+
+        if (start_idx < (int64_t) offset) {
+            bpe_offsets.emplace_back(offset - start_idx);
+        }
+        start += offset;
+    }
+
+    return bpe_offsets;
+}
+
+static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
+
+    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    } else if (
+            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
+            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+
+        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+    }
+
+    return bpe_offsets;
+}
+
+//
+// interface
+//
+
+std::string unicode_cpt_to_utf8(uint32_t cpt) {
+    std::string result;
+
+    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
+        result.push_back(cpt);
+        return result;
+    }
+    if (0x80 <= cpt && cpt <= 0x7ff) {
+        result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
+        result.push_back(0x80 | (cpt & 0x3f));
+        return result;
+    }
+    if (0x800 <= cpt && cpt <= 0xffff) {
+        result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
+        return result;
+    }
+    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+        result.push_back(0xf0 | ((cpt >> 18) & 0x07));
+        result.push_back(0x80 | ((cpt >> 12) & 0x3f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
+        return result;
+    }
+
+    throw std::invalid_argument("invalid codepoint");
+}
+
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
+    auto comp = [] (const uint32_t cpt, const range_nfd & range) {
+        return cpt < range.first;
+    };
+    std::vector<uint32_t> result(cpts.size());
+    for (size_t i = 0; i < cpts.size(); ++i) {
+        const uint32_t cpt = cpts[i];
+        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
+        result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
+    }
+    return result;
+}
+
+std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
+    std::vector<uint32_t> result;
+    result.reserve(utf8.size());
+    size_t offset = 0;
+    while (offset < utf8.size()) {
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
+    }
+    return result;
+}
+
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
+    static const auto cpt_flags = unicode_cpt_flags_array();
+    return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
+}
+
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
+    if (utf8.empty()) {
+        return undef;  // undefined
+    }
+    size_t offset = 0;
+    return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
+}
+
+std::string unicode_byte_to_utf8(uint8_t byte) {
+    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
+    return map.at(byte);
+}
+
+uint8_t unicode_utf8_to_byte(const std::string & utf8) {
+    static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
+    return map.at(utf8);
+}
+
+uint32_t unicode_tolower(uint32_t cpt) {
+    // binary search
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
+        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
+            return pair.first < value;
+        });
+    if (it != unicode_map_lowercase.end() && it->first == cpt) {
+        return it->second;
+    }
+    return cpt;  // Return the original code point if no lowercase mapping is found
+}
+
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+    // unicode categories
+    static const std::map<std::string, int> k_ucat_enum = {
+        { "\\p{N}", unicode_cpt_flags::NUMBER },
+        { "\\p{L}", unicode_cpt_flags::LETTER },
+        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
+        { "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
+        { "\\p{S}", unicode_cpt_flags::SYMBOL },
+    };
+
+    static const std::map<int, int> k_ucat_cpt = {
+        { unicode_cpt_flags::NUMBER,      0xD1 },
+        { unicode_cpt_flags::LETTER,      0xD2 },
+        { unicode_cpt_flags::PUNCTUATION, 0xD3 },
+        { unicode_cpt_flags::ACCENT_MARK, 0xD4 },
+        { unicode_cpt_flags::SYMBOL,      0xD5 },
+    };
+
+    static const std::map<int, std::string> k_ucat_map = {
+        { unicode_cpt_flags::NUMBER,      "\x30-\x39" }, // 0-9
+        { unicode_cpt_flags::LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { unicode_cpt_flags::ACCENT_MARK, "" }, // no sub-128 codepoints
+        { unicode_cpt_flags::SYMBOL,      "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`|
+    };
+
+    // compute collapsed codepoints only if needed by at least one regex
+    bool need_collapse = false;
+    for (const auto & regex_expr : regex_exprs) {
+        // search for unicode categories
+        for (const auto & ucat : k_ucat_enum) {
+            if (std::string::npos != regex_expr.find(ucat.first)) {
+                need_collapse = true;
+                break;
+            }
+        }
+    }
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    std::string text_collapsed;
+    if (need_collapse) {
+        // collapse all unicode categories
+        text_collapsed.resize(cpts.size());
+
+        for (size_t i = 0; i < cpts.size(); ++i) {
+            // keep single-byte codepoints as is
+            if (cpts[i] < 128) {
+                text_collapsed[i] = cpts[i];
+                continue;
+            }
+
+            const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
+
+            if (flags.is_whitespace) {
+                //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
+                //text_collapsed[i] = (char) 0x85;  // <Next Line> as whitespace fallback
+                text_collapsed[i] = (char) 0x0B;    // <vertical tab> as whitespace fallback
+            } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
+                text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
+            } else {
+                text_collapsed[i] = (char) 0xD0; // fallback
+            }
+        }
+    }
+
+    std::vector<size_t> bpe_offsets = { cpts.size() };
+
+    for (const auto & regex_expr : regex_exprs) {
+        // first, see if we have an efficient custom regex implementation
+        auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
+
+        if (!tmp.empty()) {
+            bpe_offsets = std::move(tmp);
+            continue;
+        }
+
+        // fallback to general-purpose std::regex / std::wregex
+        try {
+            // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
+            // with the corresponding collapsed representation
+            bool use_collapsed = false;
+            for (const auto & ucat : k_ucat_enum) {
+                if (std::string::npos != regex_expr.find(ucat.first)) {
+                    use_collapsed = true;
+                    break;
+                }
+            }
+
+            if (use_collapsed) {
+                // sanity-check that the original regex does not contain any non-ASCII characters
+                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
+                for (size_t i = 0; i < cpts_regex.size(); ++i) {
+                    if (cpts_regex[i] >= 128) {
+                        throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
+                    }
+                }
+
+                // generate a collapsed representation of the regex
+                std::string regex_expr_collapsed;
+
+                // track if we are inside [], because nested [] are not allowed
+                bool inside = false;
+                for (size_t i = 0; i < regex_expr.size(); ++i) {
+                    if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
+                        regex_expr_collapsed += '[';
+                        inside = true;
+                        continue;
+                    }
+
+                    if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
+                        regex_expr_collapsed += ']';
+                        inside = false;
+                        continue;
+                    }
+
+                    if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
+                        regex_expr[i + 1] == 'p' &&
+                        regex_expr[i + 2] == '{' &&
+                        regex_expr[i + 4] == '}') {
+                        const std::string pat = regex_expr.substr(i, 5);
+                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+                            if (!inside) {
+                                regex_expr_collapsed += '[';
+                            }
+                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+                            if (!inside) {
+                                regex_expr_collapsed += ']';
+                            }
+                            i += 4;
+                            continue;
+                        }
+                    }
+
+                    regex_expr_collapsed += regex_expr[i];
+                }
+
+                //printf("text_collapsed: %s\n", text_collapsed.c_str());
+                //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
+                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
+            } else {
+                // no unicode category used, we can use std::wregex directly
+                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+
+                // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
+                std::wstring wtext(cpts.begin(), cpts.end());
+                for (size_t i = 0; i < wtext.size(); ++i) {
+                    if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
+                        wtext[i] = 0x0B;
+                    }
+                }
+
+                //printf("text: %s\n", text.c_str());
+                //printf("regex_expr: %s\n", regex_expr.c_str());
+                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
+            }
+        } catch (std::regex_error & e) {
+            fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
+            fprintf(stderr, "Regex error: %s\n", e.what());
+            throw std::runtime_error("Failed to process regex");
+        }
+    }
+
+    std::vector<std::string> bpe_words;
+    bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size
+
+    size_t start = 0;
+    for (size_t & offset : bpe_offsets) {
+        bpe_words.emplace_back();
+        for (size_t i = start; i < start + offset; ++i) {
+            bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
+        }
+        start += offset;
+    }
+
+    return unicode_byte_encoding_process(bpe_words);
+}
diff --git a/src/unicode.h b/src/unicode.h
new file mode 100644
index 000000000..c27098df7
--- /dev/null
+++ b/src/unicode.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+struct unicode_cpt_flags {
+    enum {
+        UNDEFINED       = 0x0001,
+        NUMBER          = 0x0002,  // regex: \p{N}
+        LETTER          = 0x0004,  // regex: \p{L}
+        SEPARATOR       = 0x0008,  // regex: \p{Z}
+        ACCENT_MARK     = 0x0010,  // regex: \p{M}
+        PUNCTUATION     = 0x0020,  // regex: \p{P}
+        SYMBOL          = 0x0040,  // regex: \p{S}
+        CONTROL         = 0x0080,  // regex: \p{C}
+        MASK_CATEGORIES = 0x00FF,
+    };
+
+    // codepoint type
+    uint16_t is_undefined   : 1;
+    uint16_t is_number      : 1;  // regex: \p{N}
+    uint16_t is_letter      : 1;  // regex: \p{L}
+    uint16_t is_separator   : 1;  // regex: \p{Z}
+    uint16_t is_accent_mark : 1;  // regex: \p{M}
+    uint16_t is_punctuation : 1;  // regex: \p{P}
+    uint16_t is_symbol      : 1;  // regex: \p{S}
+    uint16_t is_control     : 1;  // regex: \p{C}
+    // helper flags
+    uint16_t is_whitespace  : 1;  // regex: \s
+    uint16_t is_lowercase   : 1;
+    uint16_t is_uppercase   : 1;
+    uint16_t is_nfd         : 1;
+
+    // decode from uint16
+    inline unicode_cpt_flags(const uint16_t flags = 0) {
+        *reinterpret_cast<uint16_t*>(this) = flags;
+    }
+
+    inline uint16_t as_uint() const {
+        return *reinterpret_cast<const uint16_t*>(this);
+    }
+
+    inline uint16_t category_flag() const {
+        return this->as_uint() & MASK_CATEGORIES;
+    }
+};
+
+size_t unicode_len_utf8(char src);
+
+std::string unicode_cpt_to_utf8  (uint32_t cpt);
+uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+
+std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
+
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
+
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
+
+std::string unicode_byte_to_utf8(uint8_t byte);
+uint8_t     unicode_utf8_to_byte(const std::string & utf8);
+
+uint32_t unicode_tolower(uint32_t cpt);
+
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
diff --git a/tests/.gitignore b/tests/.gitignore
index 9427cf13d..620a48ee4 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,3 +1,4 @@
 *
 !*.*
 *.o
+ggml-common.h
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 10326d531..7a158d602 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,66 +1,154 @@
-function(llama_build_executable source)
-    get_filename_component(TEST_TARGET ${source} NAME_WE)
+llama_add_compile_flags()
+
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    set(TEST_TARGET ${target})
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
+# Builds and runs a test source file.
+# Optional args:
+# - NAME: name of the executable & test target (defaults to the source file name without extension)
+# - LABEL: label for the test (defaults to main)
+# - ARGS: arguments to pass to the test executable
+# - WORKING_DIRECTORY
+function(llama_target_and_test source)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_TARGET ${LLAMA_TEST_NAME})
+    else()
+        get_filename_component(TEST_TARGET ${source} NAME_WE)
+    endif()
+
     add_executable(${TEST_TARGET} ${source} get-model.cpp)
     install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE common)
+    add_test(
+        NAME ${TEST_TARGET}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-function(llama_test_executable name source)
-    get_filename_component(TEST_TARGET ${source} NAME_WE)
-    add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
-    set_property(TEST ${name} PROPERTY LABELS "main")
-endfunction()
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
 
-function(llama_build_and_test_executable source)
-    llama_build_and_test_executable_with_label(${source} "main")
-endfunction()
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt               ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
-function(llama_build_and_test_executable_with_label source label)
-    get_filename_component(TEST_TARGET ${source} NAME_WE)
-    add_executable(${TEST_TARGET} ${source} get-model.cpp)
-    install(TARGETS ${TEST_TARGET} RUNTIME)
-    target_link_libraries(${TEST_TARGET} PRIVATE common)
-    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
-    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
-endfunction()
+if (LLAMA_LLGUIDANCE)
+    llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+endif ()
 
-# llama_build_and_test_executable(test-double-float.cpp) # SLOW
-llama_build_and_test_executable(test-quantize-fns.cpp)
-llama_build_and_test_executable(test-quantize-perf.cpp)
-llama_build_and_test_executable(test-sampling.cpp)
-llama_build_and_test_executable(test-chat-template.cpp)
+if (NOT WIN32)
+    # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
+    llama_target_and_test(test-sampling.cpp)
+    llama_target_and_test(test-grammar-parser.cpp)
+    llama_target_and_test(test-grammar-integration.cpp)
+    llama_target_and_test(test-llama-grammar.cpp)
+    llama_target_and_test(test-chat.cpp)
+    # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+    if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+    endif()
 
-llama_build_executable(test-tokenizer-0-llama.cpp)
-llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 
-llama_build_executable(test-tokenizer-0-falcon.cpp)
-llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+    # build test-tokenizer-1-bpe target once and add many tests
+    add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+    target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+    install(TARGETS test-tokenizer-1-bpe RUNTIME)
 
-llama_build_executable(test-tokenizer-1-llama.cpp)
-llama_test_executable (test-tokenizer-1-llama    test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+    # TODO: disabled due to slowness
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
-llama_build_executable(test-tokenizer-1-bpe.cpp)
-llama_test_executable (test-tokenizer-1-falcon           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test_executable (test-tokenizer-1-aquila           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable (test-tokenizer-1-mpt              test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test_executable (test-tokenizer-1-gpt-neox         test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable (test-tokenizer-1-refact           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable (test-tokenizer-1-starcoder        test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test_executable (test-tokenizer-1-gpt2             test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+    # build test-tokenizer-1-spm target once and add many tests
+    add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+    target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+    install(TARGETS test-tokenizer-1-spm RUNTIME)
 
-llama_build_and_test_executable(test-grammar-parser.cpp)
-llama_build_and_test_executable(test-llama-grammar.cpp)
-llama_build_and_test_executable(test-grad0.cpp)
-# llama_build_and_test_executable(test-opt.cpp) # SLOW
-llama_build_and_test_executable(test-backend-ops.cpp)
+    llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+    #llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 
-llama_build_and_test_executable(test-rope.cpp)
+    # llama_target_and_test(test-double-float.cpp) # SLOW
+endif()
+
+llama_target_and_test(test-log.cpp)
+llama_target_and_test(test-arg-parser.cpp)
+llama_target_and_test(test-chat-template.cpp)
+
+# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-gguf.cpp)
+llama_target_and_test(test-backend-ops.cpp)
+
+llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
+llama_target_and_test(test-autorelease.cpp        LABEL "model")
+
+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_target_and_test(test-barrier.cpp)
+    llama_target_and_test(test-quantize-fns.cpp)
+    llama_target_and_test(test-quantize-perf.cpp)
+    llama_target_and_test(test-rope.cpp)
+endif()
 
-llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
-llama_build_and_test_executable_with_label(test-autorelease.cpp "model")
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
diff --git a/tests/run-json-schema-to-grammar.mjs b/tests/run-json-schema-to-grammar.mjs
new file mode 100644
index 000000000..b20ac1d6b
--- /dev/null
+++ b/tests/run-json-schema-to-grammar.mjs
@@ -0,0 +1,10 @@
+import { readFileSync } from "fs"
+import { SchemaConverter } from "../examples/server/public_legacy/json-schema-to-grammar.mjs"
+
+const [, , file] = process.argv
+const url = `file://${file}`
+let schema = JSON.parse(readFileSync(file, "utf8"));
+const converter = new SchemaConverter({})
+schema = await converter.resolveRefs(schema, url)
+converter.visit(schema, '')
+console.log(converter.formatGrammar())
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
new file mode 100644
index 000000000..69604b87c
--- /dev/null
+++ b/tests/test-arg-parser.cpp
@@ -0,0 +1,131 @@
+#include "arg.h"
+#include "common.h"
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <unordered_set>
+
+#undef NDEBUG
+#include <cassert>
+
+int main(void) {
+    common_params params;
+
+    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
+    for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
+        try {
+            auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
+            std::unordered_set<std::string> seen_args;
+            std::unordered_set<std::string> seen_env_vars;
+            for (const auto & opt : ctx_arg.options) {
+                // check for args duplications
+                for (const auto & arg : opt.args) {
+                    if (seen_args.find(arg) == seen_args.end()) {
+                        seen_args.insert(arg);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
+                        exit(1);
+                    }
+                }
+                // check for env var duplications
+                if (opt.env) {
+                    if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
+                        seen_env_vars.insert(opt.env);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
+                        exit(1);
+                    }
+                }
+            }
+        } catch (std::exception & e) {
+            printf("%s\n", e.what());
+            assert(false);
+        }
+    }
+
+    auto list_str_to_char = [](std::vector<std::string> & argv) -> std::vector<char *> {
+        std::vector<char *> res;
+        for (auto & arg : argv) {
+            res.push_back(const_cast<char *>(arg.data()));
+        }
+        return res;
+    };
+
+    std::vector<std::string> argv;
+
+    printf("test-arg-parser: test invalid usage\n\n");
+
+    // missing value
+    argv = {"binary_name", "-m"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // wrong value (int)
+    argv = {"binary_name", "-ngl", "hello"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // wrong value (enum)
+    argv = {"binary_name", "-sm", "hello"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+    argv = {"binary_name", "--draft", "123"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));
+
+
+    printf("test-arg-parser: test valid usage\n\n");
+
+    argv = {"binary_name", "-m", "model_file.gguf"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model == "model_file.gguf");
+
+    argv = {"binary_name", "-t", "1234"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.cpuparams.n_threads == 1234);
+
+    argv = {"binary_name", "--verbose"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.verbosity > 1);
+
+    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model == "abc.gguf");
+    assert(params.n_predict == 6789);
+    assert(params.n_batch == 9090);
+
+    // --draft cannot be used outside llama-speculative
+    argv = {"binary_name", "--draft", "123"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(params.speculative.n_max == 123);
+
+// skip this part on windows, because setenv is not supported
+#ifdef _WIN32
+    printf("test-arg-parser: skip on windows build\n");
+#else
+    printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n");
+
+    setenv("LLAMA_ARG_THREADS", "blah", true);
+    argv = {"binary_name"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
+    setenv("LLAMA_ARG_THREADS", "1010", true);
+    argv = {"binary_name"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model == "blah.gguf");
+    assert(params.cpuparams.n_threads == 1010);
+
+
+    printf("test-arg-parser: test environment variables being overwritten\n\n");
+
+    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
+    setenv("LLAMA_ARG_THREADS", "1010", true);
+    argv = {"binary_name", "-m", "overwritten.gguf"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model == "overwritten.gguf");
+    assert(params.cpuparams.n_threads == 1010);
+#endif // _WIN32
+
+
+    printf("test-arg-parser: all tests OK\n\n");
+}
diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp
index 57fa00011..35b09aaea 100644
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@@ -13,10 +13,10 @@ int main(int argc, char ** argv) {
 
     std::thread([&model_path]() {
         llama_backend_init();
-        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
-        auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
+        auto * model = llama_model_load_from_file(model_path, llama_model_default_params());
+        auto * ctx = llama_init_from_model(model, llama_context_default_params());
         llama_free(ctx);
-        llama_free_model(model);
+        llama_model_free(model);
         llama_backend_free();
     }).join();
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index d4cea805f..1bfd41254 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1,60 +1,82 @@
+// This file defines tests for various GGML ops and backends.
+// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
+// For the backward pass it asserts that the gradients from backpropagation are consistent
+// with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
+// It is also possible to check the performance ("perf" mode).
+//
+// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested,
+// and section 3 defines which tests to run.
+// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case,
+// then go to section 3 and add an instantiation of your struct.
+
+
+// ##############################
+// ## Section 1: General Setup ##
+// ##############################
+
+
 #include <ggml.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
-#include <ggml-backend-impl.h>
+
 #include <algorithm>
 #include <array>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
-#include <functional>
+#include <cinttypes>
 #include <memory>
 #include <random>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string>
 #include <thread>
+#include <future>
 #include <vector>
 
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    // static RNG initialization (revisit if n_threads stops being constant)
-    static const size_t n_threads = std::thread::hardware_concurrency();
-    static std::vector<std::default_random_engine> generators = []() {
-        std::random_device rd;
-        std::vector<std::default_random_engine> vec;
-        vec.reserve(n_threads);
-        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-        return vec;
-    }();
+    size_t nels = ggml_nelements(tensor);
+    std::vector<float> data(nels);
+    {
+        // parallel initialization
+        static const size_t n_threads = std::thread::hardware_concurrency();
+        // static RNG initialization (revisit if n_threads stops being constant)
+        static std::vector<std::default_random_engine> generators = []() {
+            std::random_device rd;
+            std::vector<std::default_random_engine> vec;
+            vec.reserve(n_threads);
+            //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+            for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+            return vec;
+        }();
 
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
+        auto init_thread = [&](size_t ith, size_t start, size_t end) {
+            std::uniform_real_distribution<float> distribution(min, max);
+            auto & gen = generators[ith];
+            for (size_t i = start; i < end; i++) {
+                data[i] = distribution(gen);
+            }
+        };
 
-    auto init_thread = [&](size_t ith, size_t start, size_t end) {
-        std::uniform_real_distribution<float> distribution(min, max);
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generators[ith]);
+        std::vector<std::future<void>> tasks;
+        tasks.reserve(n_threads);
+        for (size_t i = 0; i < n_threads; i++) {
+            size_t start =     i*nels/n_threads;
+            size_t end   = (i+1)*nels/n_threads;
+            tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
+        }
+        for (auto & t : tasks) {
+            t.get();
         }
-    };
-
-    std::vector<std::thread> threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start =     i*size/n_threads;
-        size_t end   = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, i, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
     }
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        int64_t hist[16];
-        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
+        GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0);
+
+         // dummy importance matrix
+        std::vector<float> imatrix(tensor->ne[0], 1.0f);
         const float * im = imatrix.data();
         if (!ggml_quantize_requires_imatrix(tensor->type)) {
             // when the imatrix is optional, we want to test both quantization with and without imatrix
@@ -63,13 +85,43 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                 im = nullptr;
             }
         }
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
+
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, nels));
+        {
+            // parallel quantization by block
+            size_t blck_size = ggml_blck_size(tensor->type);
+            size_t n_blocks = nels / blck_size;
+
+            auto quantize_thread = [&](size_t start, size_t end) {
+                ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+                    start * blck_size, end - start, blck_size, im);
+            };
+
+            const size_t min_blocks_per_thread = 1;
+            const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
+                                                      std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+            std::vector<std::future<void>> tasks;
+            tasks.reserve(n_threads);
+            for (size_t i = 0; i < n_threads; i++) {
+                size_t start =     i*n_blocks/n_threads;
+                size_t end   = (i+1)*n_blocks/n_threads;
+                tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
+            }
+            for (auto & t : tasks) {
+                t.get();
+            }
+        }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+    } else if (tensor->type == GGML_TYPE_I64) {
+        // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
+        const size_t nbytes_half = ggml_nbytes(tensor)/2;
+        ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
+        ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -80,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     std::vector<uint8_t> buf(ggml_nbytes(t));
     ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
 
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+    const auto * tt = ggml_get_type_traits(t->type);
     size_t bs = ggml_blck_size(t->type);
     std::vector<float> vq(ggml_blck_size(t->type));
     bool quantized = ggml_is_quantized(t->type);
@@ -93,8 +145,12 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                     if (t->type == GGML_TYPE_F16) {
                         tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+                    } else if (t->type == GGML_TYPE_BF16) {
+                        tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                     } else if (t->type == GGML_TYPE_F32) {
                         tv.push_back(*(float *) &buf[i]);
+                    } else if (t->type == GGML_TYPE_I64) {
+                        tv.push_back((float)*(int64_t *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I32) {
                         tv.push_back((float)*(int32_t *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I16) {
@@ -102,10 +158,10 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     } else if (t->type == GGML_TYPE_I8) {
                         tv.push_back((float)*(int8_t *) &buf[i]);
                     } else if (quantized) {
-                        tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
+                        tt->to_float(&buf[i], vq.data(), bs);
                         tv.insert(tv.end(), vq.begin(), vq.end());
                     } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
                 }
             }
@@ -115,60 +171,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     return tv;
 }
 
-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot  += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
 // normalized mean squared error = mse(a, b) / mse(a, 0)
 static double nmse(const float * a, const float * b, size_t n) {
     double mse_a_b = 0.0;
@@ -185,8 +187,40 @@ static double nmse(const float * a, const float * b, size_t n) {
     return mse_a_b / mse_a_0;
 }
 
+// maximum absolute asymmetry between a and b
+// asymmetry: (a - b) / (a + b)
+// This is more stable than relative error if one of the values fluctuates towards zero.
+// n: number of values to compare.
+// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where
+//     a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail.
+static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector<float> & expected_vals) {
+    double sum = 0.0f;
+
+    size_t nvalid = 0;
+    for (size_t i = 0; i < n; i++) {
+        if (!expected_vals.empty()) {
+            bool matches_any = false;
+            for (const float & ev : expected_vals) {
+                if (fabsf(a[i] - ev) < 1e-3f) {
+                    matches_any = true;
+                    break;
+                }
+            }
+            if (!matches_any) {
+                continue;
+            }
+        }
+
+        const float asymm = (a[i] - b[i]) / (a[i] + b[i]);
+
+        sum += fabsf(asymm);
+        nvalid++;
+    }
+
+    return sum/nvalid;
+}
+
 // utils for printing the variables of the test cases
-#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 template<typename T>
 static std::string var_to_str(const T & x) {
@@ -219,10 +253,6 @@ static std::string var_to_str(const std::array<T, N> & x) {
     return s;
 }
 
-//static std::string var_to_str(ggml_unary_op unary_op) {
-//    return ggml_unary_op_name(unary_op);
-//}
-
 static std::string var_to_str(ggml_type type) {
     return ggml_type_name(type);
 }
@@ -235,6 +265,8 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }
 
+#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
+
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
 #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
 #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
@@ -268,6 +300,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 enum test_mode {
     MODE_TEST,
     MODE_PERF,
+    MODE_GRAD,
 };
 
 struct test_case {
@@ -287,6 +320,32 @@ struct test_case {
         return 1e-7;
     }
 
+    virtual double max_maa_err() {
+        return 1e-4;
+    }
+
+    virtual float grad_eps() {
+        return 1e-1f;
+    }
+
+    // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
+    // If true,  estimate gradient with 4 points, neglects 5th order derivative and higher.
+    virtual bool grad_precise() {
+        return false;
+    }
+
+    // Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests).
+    virtual int64_t grad_nmax() {
+        return 10000;
+    }
+
+    // No effect if empty.
+    // If not empty, skip all gradient checks where the numerical result does not match any of the values.
+    // Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable.
+    virtual std::vector<float> grad_expect() {
+        return {};
+    }
+
     virtual void initialize_tensors(ggml_context * ctx) {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
             init_tensor_uniform(t);
@@ -304,7 +363,13 @@ struct test_case {
         return size;
     }
 
+    virtual uint64_t op_flops(ggml_tensor * t) {
+        GGML_UNUSED(t);
+        return 0;
+    }
+
     ggml_cgraph * gf = nullptr;
+    ggml_cgraph * gb = nullptr;
 
     static const int sentinel_size = 1024;
 
@@ -313,7 +378,7 @@ struct test_case {
     std::vector<ggml_tensor *> sentinels;
 
     void add_sentinel(ggml_context * ctx) {
-        if (mode == MODE_PERF) {
+        if (mode == MODE_PERF || mode == MODE_GRAD) {
             return;
         }
         ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
@@ -362,6 +427,7 @@ struct test_case {
             /* .no_alloc = */ true,
         };
         ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);
 
         gf = ggml_new_graph(ctx);
 
@@ -412,7 +478,7 @@ struct test_case {
 
         // add sentinels as graph nodes so that they are checked in the callback
         for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
         }
 
         // randomize tensors
@@ -523,6 +589,7 @@ struct test_case {
             /* .no_alloc = */ true,
         };
         ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);
 
         ggml_tensor * out = build_graph(ctx);
 
@@ -543,12 +610,11 @@ struct test_case {
         }
 
         // align while also leaving some margin for variations in parameters
-        int align = 20;
+        int align = 8;
         int last = (len + align - 1) / align * align;
         if (last - len < 5) {
             last += align;
         }
-        last = std::max(last, 60);
         printf("%*s", last - len, "");
 
         // allocate
@@ -569,11 +635,28 @@ struct test_case {
         // warmup run
         ggml_backend_graph_compute(backend, gf);
 
+        // determine number of runs
+        int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
+        if (op_flops(out) > 0) {
+            // based on flops
+            const uint64_t GFLOP = 1000 * 1000 * 1000;
+            const uint64_t target_flops_cpu =   8ULL * GFLOP;
+            const uint64_t target_flops_gpu = 100ULL * GFLOP;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
+        } else {
+            // based on memory size
+            const size_t GB = 1ULL << 30;
+            const size_t target_size_cpu =  8 * GB;
+            const size_t target_size_gpu = 32 * GB;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
+        }
+
         // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
         }
 
         // calculate memory
@@ -588,27 +671,56 @@ struct test_case {
             }
             return size;
         };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
                 continue;
             }
-            mem += tensor_op_size(gf->nodes[i]);
+            mem += tensor_op_size(ggml_graph_node(gf, i));
         }
 
         // run
-        ggml_backend_synchronize(backend);
+        int64_t total_time_us = 0;
+        int64_t total_mem = 0;
+        int total_runs = 0;
+        do {
+            int64_t start_time = ggml_time_us();
+            ggml_backend_graph_compute(backend, gf);
+            int64_t end_time = ggml_time_us();
 
-        int64_t start_time = ggml_time_us();
-        ggml_backend_graph_compute(backend, gf);
-        ggml_backend_synchronize(backend);
-        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
+            total_time_us += end_time - start_time;
+            total_mem += mem;
+            total_runs += n_runs;
+        } while (total_time_us < 1000*1000); // run for at least 1 second
 
-        printf("    %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
-            n_runs,
-            time_us / n_runs,
-            op_size(out) / 1024,
-            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
+        printf("    %8d runs - %8.2f us/run - ",
+            total_runs,
+            (double)total_time_us / total_runs);
+
+        if (op_flops(out) > 0) {
+            double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6);
+            auto format_flops = [](double flops) -> std::string {
+                char buf[256];
+                if (flops >= 1e12) {
+                    snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12);
+                } else if (flops >= 1e9) {
+                    snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9);
+                } else if (flops >= 1e6) {
+                    snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3);
+                }
+                return buf;
+            };
+            printf("%s/run - \033[1;34m%sS\033[0m",
+                format_flops(op_flops(out)).c_str(),
+                format_flops(flops_per_sec).c_str());
+
+        } else {
+            printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
+                op_size(out) / 1024,
+                total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+        }
+        printf("\n");
 
         ggml_backend_buffer_free(buf);
 
@@ -616,26 +728,323 @@ struct test_case {
 
         return true;
     }
+
+    bool eval_grad(ggml_backend_t backend, const char * op_name) {
+        mode = MODE_GRAD;
+        const std::vector<float> expect = grad_expect();
+
+        ggml_init_params params = {
+            /* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true),
+            /* .mem_base = */ NULL,
+            /* .no_alloc = */ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);
+
+        gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
+        gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
+
+        ggml_tensor * out = build_graph(ctx);
+
+        if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) {
+            //printf("  %s: skipping\n", op_desc(out).c_str());
+            ggml_free(ctx);
+            return true;
+        }
+
+        printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
+        fflush(stdout);
+
+        if (out->type != GGML_TYPE_F32) {
+            ggml_free(ctx);
+            printf("not supported [%s->type != FP32]\n", out->name);
+            return true;
+        }
+
+        // check if the backend supports the ops
+        bool supported = true;
+        bool any_params = false;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (!ggml_backend_supports_op(backend, t)) {
+                printf("not supported [%s] ", ggml_backend_name(backend));
+                supported = false;
+                break;
+            }
+            if ((t->flags & GGML_TENSOR_FLAG_PARAM)) {
+                any_params = true;
+                if (t->type != GGML_TYPE_F32) {
+                    printf("not supported [%s->type != FP32] ", t->name);
+                    supported = false;
+                    break;
+                }
+            }
+        }
+        if (!any_params) {
+            printf("not supported [%s] \n", op_desc(out).c_str());
+            supported = false;
+        }
+        if (!supported) {
+            printf("\n");
+            ggml_free(ctx);
+            return true;
+        }
+
+        int64_t ngrads = 0;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->flags & GGML_TENSOR_FLAG_PARAM) {
+                ngrads += ggml_nelements(t);
+            }
+        }
+        if (ngrads > grad_nmax()) {
+            printf("skipping large tensors for speed \n");
+            ggml_free(ctx);
+            return true;
+        }
+
+
+        if (!ggml_is_scalar(out)) {
+            out = ggml_sum(ctx, out);
+            ggml_set_name(out, "sum_of_out");
+        }
+        ggml_set_loss(out);
+
+        ggml_build_forward_expand(gf, out);
+        ggml_graph_cpy(gf, gb);
+        ggml_build_backward_expand(ctx, ctx, gb, false);
+        if (expect.size() != 1 || expect[0] != 0.0f) {
+            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
+            }
+        }
+
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (!ggml_backend_supports_op(backend, t)) {
+                printf("not supported [%s] ", ggml_backend_name(backend));
+                supported = false;
+                break;
+            }
+            if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
+                printf("not supported [%s->type != FP32] ", t->name);
+                supported = false;
+                break;
+            }
+        }
+        if (!supported) {
+            printf("\n");
+            ggml_free(ctx);
+            return true;
+        }
+
+        // allocate
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        if (buf == NULL) {
+            printf("failed to allocate tensors [%s] ", ggml_backend_name(backend));
+            ggml_free(ctx);
+            return false;
+        }
+
+
+        initialize_tensors(ctx); // Randomizes all tensors (including gradients).
+        ggml_graph_reset(gb);    // Sets gradients to 1 if loss, 0 otherwise.
+
+        ggml_backend_graph_compute(backend, gf);
+        ggml_backend_graph_compute(backend, gb);
+
+        bool ok = true;
+        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+            if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) {
+                continue;
+            }
+
+            const char * bn = ggml_backend_name(backend);
+            const int64_t ne = ggml_nelements(t);
+
+            std::vector<float> ga;
+            struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
+            if (grad) {
+                ga = tensor_to_float(grad);
+            } else {
+                ga.resize(ne); // default value is 0.0f
+            }
+
+            for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
+                // check for nans
+                if (!std::isfinite(ga[i])) {
+                    printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]);
+                    ok = false;
+                    break;
+                }
+            }
+            if (!ok) {
+                break;
+            }
+
+            std::vector<float> gn(ne); // gradient numeric
+            GGML_ASSERT(ga.size() == gn.size());
+
+            std::vector<float> x0 = tensor_to_float(t); // original t data
+            GGML_ASSERT(ggml_is_scalar(out));
+            GGML_ASSERT(out->type == GGML_TYPE_F32);
+
+            const float eps = grad_eps();
+            for (int64_t i = 0; i < ne; ++i) {
+                const float xiu  = x0[i] + 1.0f*eps; // x, index i, up
+                const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half
+                const float xidh = x0[i] - 0.5f*eps; // x, index i, down half
+                const float xid  = x0[i] - 1.0f*eps; // x, index i, down
+
+                float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh
+
+                ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float));
+                ggml_backend_graph_compute(backend, gf);
+                ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out));
+
+                ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float));
+                ggml_backend_graph_compute(backend, gf);
+                ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out));
+
+                if (grad_precise()) {
+                    ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float));
+                    ggml_backend_graph_compute(backend, gf);
+                    ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out));
+
+                    ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float));
+                    ggml_backend_graph_compute(backend, gf);
+                    ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out));
+
+                    gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps);
+                } else {
+                    gn[i] = (fu - fd) / (2.0f*eps);
+                }
+
+                ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t));
+            }
+
+            const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect);
+            if (err > max_maa_err()) {
+                printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err());
+                ok = false;
+                break;
+            }
+            if (!ok) {
+                break;
+            }
+        }
+
+        if (!ok) {
+            printf("compare failed ");
+        }
+
+        ggml_backend_buffer_free(buf);
+
+        ggml_free(ctx);
+
+        if (ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            return true;
+        }
+
+        printf("\033[1;31mFAIL\033[0m\n");
+        return false;
+    }
 };
 
+
+// ###################################
+// ## Section 2: GGML Op Defintions ##
+// ###################################
+
+
+// The following is an example showing the bare minimum for creating a test for a GGML op.
+
+// GGML_OP_EXAMPLE
+struct test_example : public test_case {
+    // Always define these 2 or variants thereof:
+    const ggml_type type; // The type of the input tensors.
+    const std::array<int64_t, 4> ne; // The shape of the input tensors.
+    // For some ops it's necessary to define multiple types or shapes for the inputs.
+    // Or they may need additional parameters.
+
+    // Put all parameters needed to fully define the test into one of the VARS_TO_STR macros.
+    // In most cases these are just the properties of the struct that you defined above.
+    // This is needed for info prints.
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    // Define a constructor for the struct.
+    // In most cases it will be sufficient to have the same arguments as the struct has properties
+    // and just use initializer lists.
+    test_example(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    // Define how a simple GGML compute graph can be constructed for the new GGML op.
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Step 1: create input tensors that don't depend on any other tensors:
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging.
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        // Step 2: use the op that you want to test in the GGML compute graph.
+        ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition.
+        ggml_set_name(out, "out");
+
+        // Step 3: return the output tensor.
+        return out;
+    }
+    // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
+    // immediately after you create the tensors.
+    // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
+};
+
+
 // GGML_OP_UNARY
 struct test_unary : public test_case {
     const ggml_unary_op op;
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_a;
+    int v; // view (1 : non-contiguous a)
 
     std::string vars() override {
-        return VARS_TO_STR2(type, ne);
+        return VARS_TO_STR3(type, ne_a, v);
     }
 
     test_unary(ggml_unary_op op,
             ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {128, 10, 10, 10})
-        : op(op), type(type), ne(ne) {}
+            std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
+            int v = 0)
+        : op(op), type(type), ne_a(ne_a), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_unary(ctx, in, op);
+        const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG ||
+            op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU;
+
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            if (grad_supported) {
+                ggml_set_param(ctx, a);
+            }
+            ggml_set_name(a, "a");
+
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view_of_a");
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            if (grad_supported) {
+                ggml_set_param(ctx, a);
+            }
+            ggml_set_name(a, "a");
+        }
+
+        ggml_tensor * out = ggml_unary(ctx, a, op);
+        ggml_set_name(out, "out");
+
         return out;
     }
 
@@ -645,6 +1054,24 @@ struct test_unary : public test_case {
             init_tensor_uniform(t, -150.f, 150.f);
         }
     }
+
+    float grad_eps() override {
+        return 15.0f;
+    }
+
+    std::vector<float> grad_expect() override {
+        if (op == GGML_UNARY_OP_ABS) {
+            return {-1.0f, 1.0f};
+        }
+        if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) {
+            return {0.0f};
+        }
+        if (op == GGML_UNARY_OP_RELU) {
+            return {0.0f, 1.0f};
+        }
+        return {};
+    }
+
 };
 
 // GGML_OP_GET_ROWS
@@ -665,11 +1092,24 @@ struct test_get_rows : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
+        ggml_set_name(in, "in");
+
         ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
+        ggml_set_name(rows, "rows");
         if (v) {
             rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
+            ggml_set_name(rows, "view_of_rows");
         }
+
+        const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
+        if (grad_supported) {
+            ggml_set_param(ctx, in);
+            // rows is a constant input -> no gradients
+        }
+
         ggml_tensor * out = ggml_get_rows(ctx, in, rows);
+        ggml_set_name(out, "out");
+
         return out;
     }
 
@@ -690,6 +1130,144 @@ struct test_get_rows : public test_case {
     }
 };
 
+// GGML_OP_GET_ROWS_BACK
+struct test_get_rows_back : public test_case {
+    const ggml_type type;
+    const int n; // cols
+    const int m; // rows
+    const int r; // rows to get
+    const int b; // batch size
+    const bool v; // view (non-contiguous src1)
+
+    std::string vars() override {
+        return VARS_TO_STR6(type, n, m, r, b, v);
+    }
+
+    test_get_rows_back(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
+        : type(type), n(n), m(m), r(r), b(b), v(v) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * in_forward = ggml_new_tensor_3d(ctx, type, n, m, b);
+        ggml_set_name(in_forward, "in_forward");
+
+        ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
+        ggml_set_name(rows, "rows");
+        if (v) {
+            rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
+            ggml_set_name(rows, "view_of_rows");
+        }
+
+        ggml_tensor * grad = ggml_new_tensor_3d(ctx, type, n, r, b);
+        ggml_set_name(grad, "grad");
+
+        ggml_tensor * out = ggml_get_rows_back(ctx, grad, rows, in_forward);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                if (ggml_is_view_op(t->op)) { continue; }
+                // rows
+                std::vector<int> data(r*b);
+                for (int i = 0; i < r*b; i++) {
+                    data[i] = rand() % m;
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
+// GGML_OP_ARGMAX
+struct test_argmax : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_argmax(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 100, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_argmax(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        std::random_device rd;
+        std::default_random_engine rng(rd());
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_F32) {
+                // initialize with unique values to avoid ties
+                for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                    std::vector<float> data(t->ne[0]);
+                    for (int i = 0; i < t->ne[0]; i++) {
+                        data[i] = i;
+                    }
+                    std::shuffle(data.begin(), data.end(), rng);
+                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
+                }
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
+// GGML_OP_COUNT_EQUAL
+struct test_count_equal : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_count_equal(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {4, 500, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * a_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(a_argmax, "a_argmax");
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        ggml_tensor * b_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(b_argmax, "b_argmax");
+
+        ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
 // GGML_OP_REPEAT
 struct test_repeat : public test_case {
     const ggml_type type;
@@ -705,14 +1283,74 @@ struct test_repeat : public test_case {
     }
 
     test_repeat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
             std::array<int, 4> nr = {2, 2, 2, 2})
         : type(type), ne(ne), nr(nr) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+        ggml_set_name(target, "target");
+
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         ggml_tensor * out = ggml_repeat(ctx, src, target);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_REPEAT_BACK
+struct test_repeat_back : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int, 4> nr;
+    const bool v; // whether src is a noncontiguous view
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne, nr, v);
+    }
+
+    size_t op_size(ggml_tensor * t) override {
+        return ggml_nbytes(t) * 2;
+    }
+
+    test_repeat_back(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {8, 6, 4, 2},
+            std::array<int, 4> nr = {2, 2, 2, 2},
+            bool v = false)
+        : type(type), ne(ne), nr(nr), v(v) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+        ggml_set_name(src, "src");
+
+        if (v) {
+            GGML_ASSERT(ne[0] % 2 == 0);
+            GGML_ASSERT(ne[1] % 2 == 0);
+            GGML_ASSERT(ne[2] % 2 == 0);
+            GGML_ASSERT(ne[3] % 2 == 0);
+            GGML_ASSERT(nr[0] % 2 == 0 || nr[0] == 1);
+            GGML_ASSERT(nr[1] % 2 == 0 || nr[1] == 1);
+            GGML_ASSERT(nr[2] % 2 == 0 || nr[2] == 1);
+            GGML_ASSERT(nr[3] % 2 == 0 || nr[3] == 1);
+
+            const int64_t ne00 = nr[0] == 1 ? src->ne[0] : src->ne[0] / 2;
+            const int64_t ne01 = nr[1] == 1 ? src->ne[1] : src->ne[1] / 2;
+            const int64_t ne02 = nr[2] == 1 ? src->ne[2] : src->ne[2] / 2;
+            const int64_t ne03 = nr[3] == 1 ? src->ne[3] : src->ne[3] / 2;
+
+            src = ggml_view_4d(ctx, src, ne00, ne01, ne02, ne03, src->nb[1], src->nb[2], src->nb[3], 0);
+        }
+
+        ggml_tensor * target = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(target, "target");
+
+        ggml_tensor * out = ggml_repeat_back(ctx, src, target);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -731,17 +1369,69 @@ struct test_dup : public test_case {
     }
 
     test_dup(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne = {10, 10, 20, 1},
             std::array<int64_t, 4> permute = {0, 0, 0, 0})
         : type(type), ne(ne), permute(permute),
             _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         if (_use_permute) {
             src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+            ggml_set_name(src, "src_permuted");
         }
+
         ggml_tensor * out = ggml_dup(ctx, src);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_SET
+struct test_set : public test_case {
+    const ggml_type type_src;
+    const ggml_type type_dst;
+    const std::array<int64_t, 4> ne;
+    const int dim;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type_src, type_dst, ne, dim);
+    }
+
+    size_t op_size(ggml_tensor * t) override {
+        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
+    }
+
+    test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1)
+        : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
+        auto ne_dst = ne;
+        for (int i = 0; i < dim; ++i) {
+            ne_dst[i] *= 2;
+        }
+        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
+        ggml_set_param(ctx, dst);
+        ggml_set_name(dst, "dst");
+
+        size_t offset = 0;
+        for (int i = 0; i < dim; ++i) {
+            offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
+        }
+        ggml_tensor * out = ggml_set(ctx, dst, src,
+            // The backward pass requires setting a contiguous region:
+            src->nb[1], src->nb[2], src->nb[3], offset);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -751,9 +1441,15 @@ struct test_cpy : public test_case {
     const ggml_type type_src;
     const ggml_type type_dst;
     const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> permute;
+    bool _src_use_permute;
 
     std::string vars() override {
-        return VARS_TO_STR3(type_src, type_dst, ne);
+        return VARS_TO_STR4(type_src, type_dst, ne, permute);
+    }
+
+    double max_nmse_err() override {
+        return 1e-6;
     }
 
     size_t op_size(ggml_tensor * t) override {
@@ -761,13 +1457,27 @@ struct test_cpy : public test_case {
     }
 
     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type_src(type_src), type_dst(type_dst), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> permute = {0, 0, 0, 0})
+        : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
+          _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
+        if (_src_use_permute) {
+            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+            ggml_set_name(src, "src_permuted");
+        }
+
+        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
+        ggml_set_name(dst, "dst");
+
         ggml_tensor * out = ggml_cpy(ctx, src, dst);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -787,8 +1497,14 @@ struct test_cont : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         src = ggml_transpose(ctx, src);
+        ggml_set_name(src, "src_transposed");
+
         ggml_tensor * out = ggml_cont(ctx, src);
+        ggml_set_name(out, "out");
 
         return out;
     }
@@ -819,21 +1535,79 @@ struct test_bin_bcast : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+        ggml_set_name(a, "a");
+
         ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        // The backward pass supports broadcasting only for GGML_ADD:
+        const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
+        if (grad_supported) {
+            ggml_set_param(ctx, a);
+            ggml_set_param(ctx, b);
+        }
+
         ggml_tensor * out = op(ctx, a, b);
+        ggml_set_name(out, "out");
+
         return out;
     }
 
     void initialize_tensors(ggml_context * ctx) override {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (op == ggml_div) {
-                // avoid division by zero
-                init_tensor_uniform(t, 1.0f, 2.0f);
+            if (op == ggml_mul || op == ggml_div) {
+                // MUL and DIV have numerical issues around zero:
+                init_tensor_uniform(t, 0.9f, 1.1f);
             } else {
                 init_tensor_uniform(t);
             }
         }
     }
+
+    float grad_eps() override {
+        return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1);
+    }
+
+    bool grad_precise() override {
+        return op == ggml_div;
+    }
+
+    double max_maa_err() override {
+        return op == ggml_add ? 1e-4 : 1e-3;
+    }
+};
+
+// GGML_OP_ADD1
+struct test_add1 : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_add1(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
+        // ggml_set_param(ctx, b); // TODO: implement
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_add1(ctx, a, b);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    float grad_eps() override {
+        return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
+    }
 };
 
 // GGML_OP_SCALE
@@ -853,13 +1627,18 @@ struct test_scale : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_scale(ctx, a, scale);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
 
-// GGML_OP_NORM
-struct test_norm : public test_case {
+// GGML_OP_SILU_BACK
+struct test_silu_back : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     float eps;
@@ -868,14 +1647,58 @@ struct test_norm : public test_case {
         return VARS_TO_STR3(type, ne, eps);
     }
 
-    test_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 10, 10, 10},
+    test_silu_back(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {64, 5, 4, 3},
             float eps = 1e-6f)
         : type(type), ne(ne), eps(eps) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * grad = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(grad, "grad");
+
+        ggml_tensor * out = ggml_silu_back(ctx, a, grad);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_NORM
+struct test_norm : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const bool v; // whether a is a non-contiguous view
+    const float eps;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne, v, eps);
+    }
+
+    test_norm(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {64, 5, 4, 3},
+            bool v = false,
+            float eps = 1e-6f)
+        : type(type), ne(ne), v(v), eps(eps) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        if (v) {
+            a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view of a");
+        }
+
         ggml_tensor * out = ggml_norm(ctx, a, eps);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -884,20 +1707,192 @@ struct test_norm : public test_case {
 struct test_rms_norm : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    float eps;
+    const bool v; // whether a is a non-contiguous view
+    const float eps;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne, v, eps);
+    }
+
+    test_rms_norm(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {64, 5, 4, 3},
+            bool v = false,
+            float eps = 1e-6f)
+        : type(type), ne(ne), v(v), eps(eps) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        if (v) {
+            a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view of a");
+        }
+
+        ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.f, 10.f);
+        }
+    }
+
+    float grad_eps() override {
+        return 1.0f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_RMS_NORM_BACK
+struct test_rms_norm_back : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const float eps;
 
     std::string vars() override {
         return VARS_TO_STR3(type, ne, eps);
     }
 
-    test_rms_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 10, 10, 10},
+    test_rms_norm_back(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {64, 5, 4, 3},
             float eps = 1e-6f)
         : type(type), ne(ne), eps(eps) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_rms_norm_back(ctx, a, b, eps);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.f, 10.f);
+        }
+    }
+};
+
+// GGML_OP_SSM_CONV
+struct test_ssm_conv : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    const std::array<int64_t, 4> ne_b;
+
+    std::string vars() override {
+        return VARS_TO_STR3(type, ne_a, ne_b);
+    }
+
+    test_ssm_conv(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
+        : type(type), ne_a(ne_a), ne_b(ne_b) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a   = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_tensor * b   = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
+        return out;
+    }
+};
+
+// GGML_OP_SSM_SCAN
+struct test_ssm_scan : public test_case {
+    const ggml_type type;
+
+    const int64_t d_state;
+    const int64_t d_inner;
+    const int64_t n_seq_tokens;
+    const int64_t n_seqs;
+
+    std::string vars() override {
+        return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
+    }
+
+    test_ssm_scan(ggml_type type = GGML_TYPE_F32,
+            int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
+        : type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * s   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner,      n_seqs, 1 }.data());
+        ggml_tensor * x   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
+        ggml_tensor * dt  = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
+        ggml_tensor * A   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner,      1     , 1 }.data());
+        ggml_tensor * B   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
+        ggml_tensor * C   = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
+        ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
+        return out;
+    }
+};
+
+// GGML_OP_RWKV_WKV6
+struct test_rwkv_wkv6 : public test_case {
+    const ggml_type type;
+
+    const int64_t head_count;
+    const int64_t head_size;
+    const int64_t n_seq_tokens;
+    const int64_t n_seqs;
+
+    std::string vars() override {
+        return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
+    }
+
+    test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
+            int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
+        : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int64_t n_tokens = n_seq_tokens * n_seqs;
+        ggml_tensor * r   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * k   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * v   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * tf  = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
+        ggml_tensor * td  = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * s   = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
+        ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
+        return out;
+    }
+};
+
+// GGML_OP_GATED_LINEAR_ATTN
+struct test_gla : public test_case {
+    const ggml_type type;
+
+    const int64_t head_count;
+    const int64_t head_size;
+    const int64_t n_seq_tokens;
+    const int64_t n_seqs;
+
+    std::string vars() override {
+        return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
+    }
+
+    test_gla(ggml_type type = GGML_TYPE_F32,
+            int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
+        : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int64_t n_tokens = n_seq_tokens * n_seqs;
+        ggml_tensor * q   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * k   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * v   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * g   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * s   = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
+        ggml_tensor * out = ggml_gated_linear_attn(ctx, k, v, q, g, s, pow(head_size, -0.5));
         return out;
     }
 };
@@ -909,37 +1904,80 @@ struct test_mul_mat : public test_case {
     const int64_t m;
     const int64_t n;
     const int64_t k;
-    const std::array<int64_t, 2> bs; // dims 3 and 4
-    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
+    const std::array<int64_t, 2> bs;  // dims 3 and 4
+    const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
+    const std::array<int64_t, 4> per; // permutation of dimensions
 
     std::string vars() override {
-        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
+        return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
     }
 
     double max_nmse_err() override {
         return 5e-4;
     }
 
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
+    int64_t grad_nmax() override {
+        return 20000;
+    }
 
+    uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
+        return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
     }
 
     test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int64_t m = 32, int64_t n = 32, int64_t k = 32,
             std::array<int64_t, 2> bs = {10, 10},
-            std::array<int64_t, 2> nr = {2, 2})
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
+            std::array<int64_t, 2> nr = {2, 2},
+            std::array<int64_t, 4> per = {0, 1, 2, 3})
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0]      , bs[1]);
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+        ggml_tensor * a;
+        ggml_tensor * b;
+
+        const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
+        if (npermuted > 0) {
+            GGML_ASSERT(npermuted == 2);
+            GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
+            GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
+
+            // Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k.
+            const int64_t ne_a[4] = {k, m, bs[0],       bs[1]};
+            const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]};
+
+            a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
+            b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
+            if (!ggml_is_quantized(type_a)) {
+                if (bs[1] == 1 && nr[1] == 1) {
+                    ggml_set_param(ctx, a);
+                }
+                ggml_set_param(ctx, b);
+            }
+            ggml_set_name(a, "a");
+            ggml_set_name(b, "b");
+
+            a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]);
+            b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]);
+            ggml_set_name(a, "a_permuted");
+            ggml_set_name(b, "b_permuted");
+        } else {
+            a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0],       bs[1]);
+            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+            if (!ggml_is_quantized(type_a)) {
+                if (bs[1] == 1 && nr[1] == 1) {
+                    ggml_set_param(ctx, a);
+                }
+                ggml_set_param(ctx, b);
+            }
+            ggml_set_name(a, "a");
+            ggml_set_name(b, "b");
+        }
+
         ggml_tensor * out = ggml_mul_mat(ctx, a, b);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -949,48 +1987,51 @@ struct test_mul_mat_id : public test_case {
     const ggml_type type_a;
     const ggml_type type_b;
     const int n_mats;
-    const int id;
+    const int n_used;
+    const bool b; // brodcast b matrix
     const int64_t m;
     const int64_t n;
     const int64_t k;
-    const bool v; // view (non-contiguous ids)
 
     std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
+        return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
     }
 
     double max_nmse_err() override {
         return 5e-4;
     }
 
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n;
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
+    uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
+        return 2 * m * k * n * n_used;
     }
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
-            int n_mats = 2, int id = 0,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
-        : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
-            m(m), n(n), k(k), v(v) {}
+            int n_mats = 8, int n_used = 2, bool b = false,
+            int64_t m = 32, int64_t n = 32, int64_t k = 32)
+        : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
+            m(m), n(n), k(k) {
+            GGML_ASSERT(n_used <= n_mats);
+        }
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        std::vector<ggml_tensor *> mats;
-        for (int i = 0; i < n_mats; i++) {
-            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
-            mats.push_back(a);
-        }
+        ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+        ggml_set_name(as, "as");
+
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
-        if (v) {
-            ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
+        ggml_set_name(ids, "ids");
+        if (n_used != n_mats) {
+            ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
+            ggml_set_name(ids, "view_of_ids");
         }
-        ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
-        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
+
+        ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
+        ggml_set_name(out, "out");
+
         return out;
     }
 
@@ -1016,6 +2057,52 @@ struct test_mul_mat_id : public test_case {
     }
 };
 
+// GGML_OP_OUT_PROD
+struct test_out_prod : public test_case {
+    const ggml_type type_a;
+    const ggml_type type_b;
+    const int64_t m;
+    const int64_t n;
+    const int64_t k;
+    const std::array<int64_t, 2> bs; // dims 3 and 4
+    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
+    const bool trans_b;
+
+    std::string vars() override {
+        return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, trans_b);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
+            int64_t m = 32, int64_t n = 32, int64_t k = 32,
+            std::array<int64_t, 2> bs = {10, 10},
+            std::array<int64_t, 2> nr = {2, 2},
+            bool trans_b = false)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), trans_b(trans_b) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b;
+        if (trans_b) {
+            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+            b = ggml_transpose(ctx, b);
+        } else {
+            b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0]*nr[0], bs[1]*nr[1]);
+        }
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_out_prod(ctx, a, b);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_SQR
 struct test_sqr : public test_case {
     const ggml_type type;
@@ -1026,14 +2113,185 @@ struct test_sqr : public test_case {
     }
 
     test_sqr(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
         : type(type), ne(ne) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_sqr(ctx, a);
+        ggml_set_name(out, "out");
+
         return out;
     }
+
+    float grad_eps() override {
+        return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum.
+    }
+};
+
+// GGML_OP_SQRT
+struct test_sqrt : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_sqrt(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 3, 3, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_sqrt(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        // fill with positive values
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, 50.0f, 100.0f);
+        }
+    }
+
+    float grad_eps() override {
+        return 20.0f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_LOG
+struct test_log : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_log(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_log(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass:
+            init_tensor_uniform(t, 0.9f, 1.1f);
+        }
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_SIN
+struct test_sin : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_sin(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_sin(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    float grad_eps() override {
+        return 0.2f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_COS
+struct test_cos : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_cos(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_cos(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    float grad_eps() override {
+        return 0.2f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
 };
 
 // GGML_OP_CLAMP
@@ -1048,15 +2306,27 @@ struct test_clamp : public test_case {
     }
 
     test_clamp(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
             float min = -0.5f, float max = 0.5f)
         : type(type), ne(ne), min(min), max(max) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_clamp(ctx, a, min, max);
+        ggml_set_name(out, "out");
+
         return out;
     }
+
+    float grad_eps() override {
+        return 1e-2f;
+    }
+
+    std::vector<float> grad_expect() override {
+        return {0.0f, 1.0f};
+    }
 };
 
 // GGML_OP_DIAG_MASK_INF
@@ -1070,13 +2340,18 @@ struct test_diag_mask_inf : public test_case {
     }
 
     test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne = {10, 10, 3, 2},
             int n_past = 5)
         : type(type), ne(ne), n_past(n_past) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1086,56 +2361,171 @@ struct test_soft_max : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const bool mask;
+    const ggml_type m_prec;
     const float scale;
     const float max_bias;
 
     std::string vars() override {
-        return VARS_TO_STR5(type, ne, mask, scale, max_bias);
+        return VARS_TO_STR6(type, ne, mask, m_prec, scale, max_bias);
+    }
+
+    // the 1024 test with bias occasionally fails:
+    // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
+    virtual double max_nmse_err() override {
+        return 1e-6;
     }
 
     test_soft_max(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
             bool mask = false,
+            ggml_type m_prec = GGML_TYPE_F32,
             float scale = 1.0f,
             float max_bias = 0.0f)
-        : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
+        : type(type), ne(ne), mask(mask), m_prec(m_prec), scale(scale), max_bias(max_bias) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
         ggml_tensor * mask = nullptr;
         if (this->mask) {
-            mask = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]);
+            mask = ggml_new_tensor_2d(ctx, m_prec, ne[0], ne[1]);
+            ggml_set_name(mask, "mask");
         }
-        ggml_tensor * pos = nullptr;
-        if (max_bias > 0.0f) {
-            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
-        }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
+
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_SOFT_MAX_BACK
+struct test_soft_max_back : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const float scale;
+    const float max_bias;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne, scale, max_bias);
+    }
+
+    test_soft_max_back(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
+            float scale = 1.0f,
+            float max_bias = 0.0f)
+        : type(type), ne(ne), scale(scale), max_bias(max_bias) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_soft_max_ext_back(ctx, a, b, scale, max_bias);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
 
-// GGML_OP_ROPE
+// GGML_OP_ROPE + GGML_OP_ROPE_BACK
 struct test_rope : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_a;
     int n_dims;
     int mode;
-    int n_ctx;
+    int n_ctx; // used to generate positions
+    float fs; // freq_scale
+    float ef; // ext_factor
+    float af; // attn_factor
+    bool ff;
+    int v; // view (1 : non-contiguous a)
+    bool forward;
 
     std::string vars() override {
-        return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
+        // forward can be inferred from the op, does not need to be printed
+        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
     }
 
     test_rope(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512)
-        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
+            std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
+            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f,
+            float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true)
+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
-        ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            if (forward) {
+                ggml_set_param(ctx, a);
+            }
+            ggml_set_name(a, "a");
+
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view_of_a");
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            if (forward) {
+                ggml_set_param(ctx, a);
+            }
+            ggml_set_name(a, "a");
+        }
+
+        const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+        const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+        ggml_tensor * pos;
+        if (is_mrope || is_vision) {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
+        } else {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        }
+        ggml_set_name(pos, "pos");
+
+        ggml_tensor * freq = nullptr;
+        if (ff) {
+            freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);
+            ggml_set_name(freq, "freq");
+        }
+
+        ggml_tensor * out;
+        if (is_mrope) {
+            if (is_vision) {
+                GGML_ASSERT(n_dims/4 > 0);
+                int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate
+                if (forward) {
+                    out = ggml_rope_multi     (ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+                } else {
+                    out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+                }
+            } else {
+                GGML_ASSERT(n_dims/3 > 0);
+                int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
+                if (forward) {
+                    out = ggml_rope_multi     (ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+                } else {
+                    out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+                }
+            }
+        } else {
+            if (forward) {
+                out = ggml_rope_ext     (ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            } else {
+                out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            }
+        }
+        ggml_set_name(out, "out");
+
         return out;
     }
 
@@ -1143,16 +2533,30 @@ struct test_rope : public test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // pos
-                std::vector<int> data(ne[2]);
-                for (int i = 0; i < ne[2]; i++) {
+                const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
+                std::vector<int> data(num_pos_ids);
+                for (int i = 0; i < num_pos_ids; i++) {
                     data[i] = rand() % n_ctx;
                 }
-                ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
             } else {
-                init_tensor_uniform(t);
+                if (t->ne[0] == n_dims/2) {
+                    // frequency factors in the range [0.9f, 1.1f]
+                    init_tensor_uniform(t, 0.9f, 1.1f);
+                } else {
+                    init_tensor_uniform(t);
+                }
             }
         }
     }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
 };
 
 // GGML_OP_POOL2D
@@ -1184,7 +2588,44 @@ struct test_pool2d : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+        ggml_set_param(ctx, input);
+        ggml_set_name(input, "input");
+
         ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_CONV_TRANSPOSE_1D
+struct test_conv_transpose_1d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+
+    const int s0; // stride
+    const int p0; // padding
+    const int d0; // dilation
+
+    std::string vars() override {
+        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
+    }
+
+    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
+                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
+                           int s0 = 1, int p0 = 0, int d0 = 1)
+        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_set_name(input, "input");
+
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
+        ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1202,7 +2643,7 @@ struct test_im2col : public test_case {
     // padding
     const int p0;
     const int p1;
-    // dilatation
+    // dilation
     const int d0;
     const int d1;
     // mode
@@ -1223,8 +2664,15 @@ struct test_im2col : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+        ggml_set_param(ctx, input);
+        ggml_set_name(input, "input");
+
         ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
         ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1232,22 +2680,52 @@ struct test_im2col : public test_case {
 // GGML_OP_CONCAT
 struct test_concat : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int64_t b_ne2;
+    const std::array<int64_t, 4> ne_a;
+    const int64_t ne_b_d;
+    const int dim;
+    const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, b_ne2);
+        return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
     }
 
     test_concat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int64_t b_ne2 = 10)
-        : type(type), ne(ne), b_ne2(b_ne2) {}
+            std::array<int64_t, 4> ne_a = {10, 5, 5, 5},
+            int64_t ne_b_d = 5,
+            int dim = 2, int v = 0)
+        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
-        ggml_tensor * out = ggml_concat(ctx, a, b);
+        auto ne_b = ne_a;
+        ne_b[dim] = ne_b_d;
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            ggml_set_name(a, "a");
+
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view_of_a");
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            ggml_set_name(a, "a");
+        }
+        ggml_tensor * b;
+        if (v & 2) {
+            auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
+            b = ggml_new_tensor(ctx, type, 4, ne.data());
+            ggml_set_name(b, "b");
+
+            b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
+            ggml_set_name(b, "view_of_b");
+        } else {
+            b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+            ggml_set_name(b, "b");
+        }
+
+        ggml_tensor * out = ggml_concat(ctx, a, b, dim);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1269,7 +2747,11 @@ struct test_argsort : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_argsort(ctx, a, order);
+        ggml_set_name(out, "out");
+
         return out;
     }
 
@@ -1296,12 +2778,41 @@ struct test_argsort : public test_case {
                     ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
                 }
             } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
         }
     }
 };
 
+// GGML_OP_SUM
+struct test_sum : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_sum(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_sum(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    float grad_eps() override {
+        return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]);
+    }
+};
+
 // GGML_OP_SUM_ROWS
 struct test_sum_rows : public test_case {
     const ggml_type type;
@@ -1312,34 +2823,104 @@ struct test_sum_rows : public test_case {
     }
 
     test_sum_rows(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
         : type(type), ne(ne) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_sum_rows(ctx, a);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
 
+// GGML_OP_MEAN
+struct test_mean : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_mean(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_mean(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    float grad_eps() override {
+        return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
+    }
+};
+
 // GGML_OP_UPSCALE
 struct test_upscale : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const int32_t scale_factor;
+    const bool transpose;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale_factor);
+        return VARS_TO_STR4(type, ne, scale_factor, transpose);
     }
 
     test_upscale(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2)
-        : type(type), ne(ne), scale_factor(scale_factor) {}
+            int32_t scale_factor = 2, bool transpose = false)
+        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        if (transpose) {
+            a = ggml_transpose(ctx, a);
+            ggml_set_name(a, "a_transposed");
+        }
+
         ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_UPSCALE (ext)
+struct test_upscale_ext : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_tgt;
+
+    std::string vars() override {
+        return VARS_TO_STR3(type, ne, ne_tgt);
+    }
+
+    test_upscale_ext(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne     = {2, 5,  7, 11},
+            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
+        : type(type), ne(ne), ne_tgt(ne_tgt) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1349,19 +2930,25 @@ struct test_group_norm : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const int32_t num_groups;
+    const float eps;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, num_groups);
+        return VARS_TO_STR4(type, ne, num_groups, eps);
     }
 
     test_group_norm(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {64, 64, 320, 1},
-            int32_t num_groups = 32)
-        : type(type), ne(ne), num_groups(num_groups) {}
+            int32_t num_groups = 32,
+            float eps = 1e-6f)
+        : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1377,14 +2964,22 @@ struct test_acc : public test_case {
     }
 
     test_acc(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
-            std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
+            std::array<int64_t, 4> ne_a = {256, 17, 1, 1},
+            std::array<int64_t, 4> ne_b = {256, 16, 1, 1})
         : type(type), ne_a(ne_a), ne_b(ne_b) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_set_param(ctx, a);
+        ggml_set_name(a, "a");
+
         ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        ggml_set_param(ctx, b);
+        ggml_set_name(b, "b");
+
         ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1407,7 +3002,88 @@ struct test_pad : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_PAD_REFLECT_1D
+struct test_pad_reflect_1d : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    const int pad_0;
+    const int pad_1;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
+    }
+
+    test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {512, 34, 2, 1},
+            int pad_0 = 10, int pad_1 = 9)
+        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1)  {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_ARANGE
+struct test_arange : public test_case {
+    const ggml_type type;
+    const float start;
+    const float stop;
+    const float step;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, start, stop, step);
+    }
+
+    test_arange(ggml_type type = GGML_TYPE_F32,
+            float start = 0.f, float stop = 10.f, float step = 1.f)
+        : type(type), start(start), stop(stop), step(step)  {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * out = ggml_arange(ctx, start, stop, step);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_TIMESTEP_EMBEDDING
+struct test_timestep_embedding : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    const int dim;
+    const int max_period;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne_a, dim, max_period);
+    }
+
+    test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
+            int dim = 320, int max_period=10000)
+        : type(type), ne_a(ne_a), dim(dim), max_period(max_period)  {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -1423,101 +3099,226 @@ struct test_leaky_relu : public test_case {
     }
 
     test_leaky_relu(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne_a = {10, 5, 4, 3},
             float negative_slope = 0.1f)
         : type(type), ne_a(ne_a), negative_slope(negative_slope)  {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_set_name(a, "a");
+
         ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
 
-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_experts;
-    const int n_experts_per_tok;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
+// GGML_OP_FLASH_ATTN_EXT
+struct test_flash_attn_ext : public test_case {
+    const int64_t hs; // head size
+    const int64_t nh; // num heads
+    const int64_t kv; // kv size
+    const int64_t nb; // batch size
 
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
+    const bool mask; // use mask
 
-        GGML_UNUSED(t);
-    }
+    const float max_bias; // ALiBi
+    const float logit_softcap; // Gemma 2
+
+    const ggml_type type_KV;
+    std::array<int32_t, 4> permute;
 
     std::string vars() override {
-        return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
+        return VARS_TO_STR9(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV, permute);
     }
 
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
+    double max_nmse_err() override {
+        return 5e-4;
     }
 
+    uint64_t op_flops(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        // Just counting matmul costs:
+        // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
+        return 2 * 2 * nh * nb * hs * kv;
+    }
+
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
+                        bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16,
+                        std::array<int32_t, 4> permute = {0, 1, 2, 3})
+        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV), permute(permute) {}
+
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
+        const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
 
-        std::vector<ggml_tensor *> ffn_up_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_down_exp(n_experts);
-
-        for (int i = 0; i < n_experts; ++i) {
-            ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        }
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-                ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
-
-        weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
-
-        weights = ggml_div(ctx, weights, weights_sum);
-
-        // compute expert outputs
-        ggml_tensor * moe_out = nullptr;
-
-        for (int i = 0; i < n_experts_per_tok; ++i) {
-            ggml_tensor * cur_expert;
-
-            ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
-
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
-
-            cur_gate = ggml_silu(ctx, cur_gate);
-
-            cur_expert = ggml_mul(ctx, cur_up, cur_gate);
-
-            cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
-
-            cur_expert = ggml_mul(ctx, cur_expert,
-                    ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
+        auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * {
+            int64_t ne[4] = {ne0, ne1, ne2, ne3};
+            int64_t ne_perm[4];
+            for (int i = 0; i < 4; ++i) {
+                ne_perm[permute[i]] = ne[i];
             }
+            ggml_tensor * t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]);
+            if (permute != std::array<int32_t, 4>{0, 1, 2, 3}) {
+                t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]);
+            }
+            return t;
+        };
+
+        ggml_tensor * q = create_permuted(GGML_TYPE_F32, hs_padded, nb, nh, 1);
+        ggml_set_name(q, "q");
+
+        ggml_tensor * k = create_permuted(type_KV,       hs_padded, kv, nh, 1);
+        ggml_set_name(k, "k");
+
+        ggml_tensor * v = create_permuted(type_KV,       hs_padded, kv, nh, 1);
+        ggml_set_name(v, "v");
+
+        ggml_tensor * m = nullptr;
+        if (mask) {
+            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
+            ggml_set_name(m, "m");
         }
 
-        cur = moe_out;
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
+        ggml_set_name(out, "out");
 
-        return cur;
+        return out;
+    }
+
+    bool grad_precise() override {
+        return true;
     }
 };
 
+// GGML_OP_CROSS_ENTROPY_LOSS
+struct test_cross_entropy_loss : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, logits);
+        ggml_set_name(logits, "logits");
+
+        ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
+        // The labels are assumed to be constant -> no gradients.
+        ggml_set_name(labels, "labels");
+
+        // Ensure labels add up to 1:
+        labels = ggml_soft_max(ctx, labels);
+        ggml_set_name(labels, "labels_normalized");
+
+        ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        // For larger abs. diffs between logits softmax is more linear, therefore more precise num. gradients.
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -100.0f, 100.0f);
+        }
+    }
+
+    float grad_eps() override {
+        return 1.0f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_CROSS_ENTROPY_LOSS_BACK
+struct test_cross_entropy_loss_back : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_cross_entropy_loss_back(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * grad = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+        ggml_set_name(grad, "grad");
+
+        ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(logits, "logits");
+
+        ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(labels, "labels");
+
+        // Ensure labels add up to 1:
+        labels = ggml_soft_max(ctx, labels);
+        ggml_set_name(labels, "labels_normalized");
+
+        ggml_tensor * out = ggml_cross_entropy_loss_back(ctx, grad, logits, labels);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_OPT_STEP_ADAMW
+struct test_opt_step_adamw : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
+        ggml_set_name(a, "a");
+
+        ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad, "grad");
+
+        ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad_m, "grad_m");
+
+        ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_name(grad_v, "grad_v");
+
+        ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
+        ggml_set_name(adamw_params, "adamw_params");
+
+        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
+        }
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
 
 enum llm_norm_type {
     LLM_NORM,
@@ -1539,7 +3340,7 @@ struct llama_hparams {
 
     // cparams
     static constexpr uint32_t n_ctx = 512; // user-specified context size
-    static constexpr uint32_t n_orig_ctx = n_ctx;
+    static constexpr uint32_t n_ctx_orig = n_ctx;
 
     // batch
     int32_t n_tokens;
@@ -1619,7 +3420,7 @@ public:
 
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
 
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
 
         // split cached v into n_head heads
         struct ggml_tensor * v =
@@ -1657,7 +3458,6 @@ public:
     }
 };
 
-
 // Llama
 struct test_llama : public test_llm {
     static constexpr float freq_base = 10000.0f;
@@ -1706,7 +3506,7 @@ struct test_llama : public test_llm {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
@@ -1729,15 +3529,15 @@ struct test_llama : public test_llm {
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
 
-                Qcur = ggml_rope_custom(
-                    ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens), inp_pos,
-                    hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
+                Qcur = ggml_rope_ext(
+                    ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens), inp_pos, nullptr,
+                    hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
-                Kcur = ggml_rope_custom(
-                    ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
-                    hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
+                Kcur = ggml_rope_ext(
+                    ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
+                    hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
@@ -1828,7 +3628,7 @@ struct test_falcon : public test_llm {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
@@ -1855,13 +3655,13 @@ struct test_falcon : public test_llm {
                 Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
 
                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
-                Kcur = ggml_rope_custom(
-                    ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
@@ -1904,26 +3704,58 @@ struct test_falcon : public test_llm {
     }
 };
 
-static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
+
+// ###########################################
+// ## Section 3: GGML Op Test Instantiation ##
+// ###########################################
+static const ggml_type all_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+    GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+};
+
+static const ggml_type base_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16,
+    GGML_TYPE_Q8_0, // for I8MM tests
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1, // for I8MM tests
+    GGML_TYPE_Q4_K,
+    GGML_TYPE_IQ2_XXS
+};
+
+static const ggml_type other_types[] = {
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+    GGML_TYPE_BF16,
+};
+
+// Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
+static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     std::vector<std::unique_ptr<test_case>> test_cases;
     std::default_random_engine rng(0);
 
-    const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
-        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
-        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
-        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K,
-        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
-        GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
-    };
-
     // unary ops
-    for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
-        test_cases.emplace_back(new test_unary((ggml_unary_op) op));
+    for (int v : {0, 1}) {
+        for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
+            test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v));
+            test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v));
+        }
     }
 
     test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
@@ -1940,6 +3772,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     }
 
+    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
+    for (ggml_type type : all_types) {
+        for (bool v : {false, true}) {
+            test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
+        }
+    }
+    for (bool v : {false, true}) {
+        test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_I32, 256, 5, 4, 1, v));
+    }
+
     for (ggml_type type_input : {GGML_TYPE_F32}) {
         for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
             for (int k0 : {1, 3}) {
@@ -1958,31 +3800,142 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     }
 
+    // im2col 1D
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
+    for (int s0 : {1, 3}) {
+        for (int p0 : {0, 3}) {
+            for (int d0 : {1, 3}) {
+                test_cases.emplace_back(new test_im2col(
+                    GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1},
+                    s0, 0, p0, 0, d0, 0, false));
+            }
+        }
+    }
+
+    // im2col 2D
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
+    for (int s0 : {1, 3}) {
+        for (int s1 : {1, 3}) {
+            for (int p0 : {0, 3}) {
+                for (int p1 : {0, 3}) {
+                    for (int d0 : {1, 3}) {
+                        for (int d1 : {1, 3}) {
+                            test_cases.emplace_back(new test_im2col(
+                                GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2},
+                                s0, s1, p0, p1, d0, d1, true));
+                        }
+                    }
+                }
+            }
+        }
+    }
 
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
+    // extra tests for im2col 2D
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
+
+    // sycl backend will limit task global_range < MAX_INT
+    // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
+    // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
+    // these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
+    // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
+    // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
+
+    test_cases.emplace_back(new test_conv_transpose_1d());
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
+    test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
+
+    test_cases.emplace_back(new test_count_equal());
+
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32,    1, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100,  10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438,  3, 1, 1}));
+
+    for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
+        test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
+    }
+
+    for (bool view : {false, true}) {
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {2, 1, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 2, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 2, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 2}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_I32, {8, 6, 4, 2}, {2, 1, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_I16, {8, 6, 4, 2}, {1, 1, 1, 2}, view));
+    }
 
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
     test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
     test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
+    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10,  8, 3, 1}, {1, 2, 0, 3}));
+
+    for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
+        test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
+    }
+
+    for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
+        test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim));
+    }
 
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : all_types) {
-           test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+        }
+    }
+    for (ggml_type type_dst : {GGML_TYPE_F32}) {
+        for (ggml_type type_src : all_types) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+        }
+    }
+    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
         }
     }
 
     test_cases.emplace_back(new test_cont());
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
 
     auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
         for (auto op : {ggml_add, ggml_mul, ggml_div}) {
@@ -1993,16 +3946,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2});
+    add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2});
 
     // stable diffusion
     add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
@@ -2021,39 +3974,162 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
     //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
 
+    test_cases.emplace_back(new test_add1());
     test_cases.emplace_back(new test_scale());
+    test_cases.emplace_back(new test_silu_back());
 
-    for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
-        test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
-        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
+    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
+        for (bool v : {false, true}) {
+            test_cases.emplace_back(new test_norm    (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
+            test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
+        }
+        test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
     }
 
-    for (ggml_type type_a : all_types) {
-        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10,  1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
 
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
+    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
+
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
+
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 128, 4));
+
+    for (ggml_type type_a : all_types) {
+        for (int i = 1; i < 10; ++i) {
+            test_cases.emplace_back(new test_mul_mat(type_a,    GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
         }
     }
 
-    for (ggml_type type_a : all_types) {
+#if 1
+    for (ggml_type type_a : base_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            // test cases without permutation
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {1, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {1, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {1, 1}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {3, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {3, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {3, 2}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {3, 2}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {3, 2}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {3, 2}, {2, 2}));
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 2}));
+
+            // test cases with permutation
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+        }
+    }
+    for (ggml_type type_a : other_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32}) {
+            if (ggml_blck_size(type_a) != 256) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1,  1}, {1, 1}));
+            }
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1,  1}, {1, 1}));
+        }
+    }
+#else
+    // m = a rows
+    // n = b rows
+    // k = cols
+    std::uniform_int_distribution<> dist_m(1, 128);
+    std::uniform_int_distribution<> dist_n(16, 128);
+    std::uniform_int_distribution<> dist_k(1, 16);
+    for (int i = 0; i < 1000; i++) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                int m = dist_m(rng);
+                int n = dist_n(rng);
+                int k = dist_k(rng) * ggml_blck_size(type_a);
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1,  1}, {1, 1}));
+            }
+        }
+    }
+#endif
+
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 2,  128, { 8,  1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83, 2,  128, { 8,  1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 2,   64, { 8,  1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83, 2,   64, { 8,  1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 45, 128, { 8,  1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8,  1}, {4, 1}));
+
+    // sycl backend will limit task global_range < MAX_INT
+    // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
+    // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
+    // this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
+    // test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
+
+    for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
-            for (int n_mats : {2, 4, 8}) {
-                for (int id = 0; id < n_mats; id++) {
-                    for (bool v : {false, true}) {
-                        test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
+            for (int n_mats : {4, 8}) {
+                for (int n_used : {1, 2, 4}) {
+                    for (bool b : {false, true}) {
+                        for (int n : {1, 32}) {
+                            int m = 512;
+                            int k = 256;
+                            test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (ggml_type type_a : other_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
+            for (int n_mats : {4}) {
+                for (int n_used : {2}) {
+                    for (bool b : {false}) {
+                        for (int n : {1, 32}) {
+                            int m = 512;
+                            int k = 256;
+                            test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (ggml_type type_a : base_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            for (int n : {1, 16}) {
+                for (int k : {1, 16}) {
+                    for (int bs2 : {1, 3}) {
+                        for (int bs3 : {1, 3}) {
+                            for (int nr2 : {1, 2}) {
+                                for (int nr3 : {1, 2}) {
+                                    test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, n, k, {bs2, bs3}, {nr2, nr3}));
+                                }
+                            }
+                        }
                     }
                 }
             }
@@ -2061,11 +4137,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     test_cases.emplace_back(new test_sqr());
+    test_cases.emplace_back(new test_sqrt());
+    test_cases.emplace_back(new test_log());
+    test_cases.emplace_back(new test_sin());
+    test_cases.emplace_back(new test_cos());
     test_cases.emplace_back(new test_clamp());
 
-    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10,  1,  1}, 5));
-    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
-    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
+    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
+    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5));
+    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5));
 
 #if 0
     std::uniform_int_distribution<> dist_ne1(1, 50);
@@ -2076,7 +4156,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         for (int n = 0; n < 10; ++n) {
             int64_t ne0 = dist_ne0(rng);
             int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
+            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
         }
 
         exponent <<= 1;
@@ -2084,66 +4164,210 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 #endif
     for (bool mask : {false, true}) {
         for (float max_bias : {0.0f, 8.0f}) {
+            if (!mask && max_bias > 0.0f) continue;
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
+                        if (mask) {
+                            for (ggml_type m_prec : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                                test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0,   ne1,   1, 1}, mask, m_prec, scale, max_bias));
+                                test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, m_prec, scale, max_bias));
+                            }
+                        } else {
+                            /* The precision of mask here doesn't matter as boolean mask is false */
+                            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0,   ne1,   1, 1}, mask, GGML_TYPE_F32, scale, max_bias));
+                            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias));
+                        }
+                    }
+                }
+            }
+        }
+    }
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F32,  0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F16,  0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32,  0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16,  0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32,  0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16,  0.1f, 8.0f));
+
+    for (float max_bias : {0.0f, 8.0f}) {
+        for (float scale : {1.0f, 0.1f}) {
+            for (int64_t ne0 : {16, 1024}) {
+                for (int64_t ne1 : {16, 1024}) {
+                    test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0,   ne1,   1, 1}, scale, max_bias));
+                    test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias));
+                }
+            }
+        }
+    }
+
+    for (bool fw : {true, false}) { // fw == forward
+        bool all = true;
+
+        for (float v : { 0, 1 }) {
+            for (float fs : { 1.0f, 1.4245f }) {
+                for (float ef : { 0.0f, 0.7465f }) {
+                    for (float af : { 1.0f, 1.4245f }) {
+                        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                            for (bool ff : {false, true}) { // freq_factors
+                                test_cases.emplace_back(new test_rope(type, {128,  32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 7B
+
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, {128,  40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 13B
+                                    test_cases.emplace_back(new test_rope(type, {128,  52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 30B
+                                    test_cases.emplace_back(new test_rope(type, {128,  64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 65B
+                                }
+
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, { 64,   1, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64,  71, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64,   8, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  20, 2, 512, fs, ef, af, ff, v, fw)); // neox (stablelm)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
+                                }
+
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
+                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
+                                }
+
+                                test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
+                            }
+                        }
+
+                        all = false;
                     }
                 }
             }
         }
     }
 
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 8.0f));
-
-    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-        test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
-        test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512)); // llama 13B
-        test_cases.emplace_back(new test_rope(type, {128,  52, 10, 1}, 128, 0, 512)); // llama 30B
-        test_cases.emplace_back(new test_rope(type, {128,  64, 10, 1}, 128, 0, 512)); // llama 65B
-        test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512)); // neox (falcon 7B)
-        test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512)); // neox (falcon 7B)
-        test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512)); // neox (falcon 40B)
-        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512)); // neox (falcon 40B)
-        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512)); // neox (stablelm)
-        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512)); // neox (phi-2)
+    for (int v : { 0, 1, 2, 3 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
+        }
     }
 
-    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
-
     for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
     }
 
+    test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
+    test_cases.emplace_back(new test_mean());
     test_cases.emplace_back(new test_upscale());
-    test_cases.emplace_back(new test_group_norm());
+    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
+    test_cases.emplace_back(new test_upscale_ext());
+    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
+    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
     test_cases.emplace_back(new test_pad());
+    test_cases.emplace_back(new test_pad_reflect_1d());
+    test_cases.emplace_back(new test_arange());
+    test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
 
+    for (int hs : { 64, 80, 128, 256, }) {
+        for (bool mask : { true, false } ) {
+            for (float max_bias : { 0.0f, 8.0f }) {
+                if (!mask && max_bias > 0.0f) continue;
+                for (float logit_softcap : {0.0f, 10.0f}) {
+                    if (hs != 128 && logit_softcap != 0.0f) continue;
+                    for (int nh : { 32, }) {
+                        for (int kv : { 512, 1024, }) {
+                            for (int nb : { 1, 3, 32, 35, }) {
+                                for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
+                                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
+                                    // run fewer test cases permuted
+                                    if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) {
+                                        test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV, {0, 2, 1, 3}));
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    test_cases.emplace_back(new test_cross_entropy_loss     (GGML_TYPE_F32, {   10, 5, 4, 3}));
+    test_cases.emplace_back(new test_cross_entropy_loss     (GGML_TYPE_F32, {30000, 1, 1, 1}));
+    test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {   10, 5, 4, 3}));
+    test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {30000, 1, 1, 1}));
+
+    test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
+
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
-#if !defined(__SANITIZE_THREAD__)
-    // FIXME: these tests use too much memory with thread sanitizer
-    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
-#endif
     test_cases.emplace_back(new test_llama(1));
     test_cases.emplace_back(new test_llama(2));
     test_cases.emplace_back(new test_falcon(1));
     test_cases.emplace_back(new test_falcon(2));
 #endif
 
-    // run tests
+    return test_cases;
+}
+
+// Test cases for performance evaluation: should be representative of real-world use cases
+static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1,   1, 1, 1}));
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
+
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
+    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
+
+    for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1,  1}, {1, 1}));
+            }
+        }
+    }
+
+    for (int K : {3, 5}) {
+        for (int IC : {256, 2560}) {
+            for (int IW_IH : {32, 64, 256}) {
+                if (IC == 2560 && IW_IH == 256) {
+                    // too big
+                    continue;
+                }
+                test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {IW_IH, IW_IH, IC, 1}, {K, K, IC, 1}, 1, 1, 1, 1, 1, 1, true));
+            }
+        }
+    }
+
+    return test_cases;
+}
+
+static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        auto test_cases = make_test_cases_eval();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }
 
         size_t n_ok = 0;
         for (auto & test : test_cases) {
@@ -2158,43 +4382,61 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         return n_ok == test_cases.size();
     }
 
+    if (mode == MODE_GRAD) {
+        auto test_cases = make_test_cases_eval();
+        size_t n_ok = 0;
+        for (auto & test : test_cases) {
+            if (test->eval_grad(backend, op_name)) {
+                n_ok++;
+            }
+        }
+        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());
+
+        return n_ok == test_cases.size();
+    }
+
     if (mode == MODE_PERF) {
+        auto test_cases = make_test_cases_perf();
         for (auto & test : test_cases) {
             test->eval_perf(backend, op_name);
         }
         return true;
     }
 
-    GGML_ASSERT(false);
-    return false;
+    GGML_ABORT("fatal error");
 }
 
 static void usage(char ** argv) {
     printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
-    printf("  valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
-    printf("  op names are as given by ggml_op_desc()\n");
+    printf("    valid modes:\n");
+    printf("      - test (default, compare with CPU backend for correctness)\n");
+    printf("      - grad (compare gradients from backpropagation with method of finite differences)\n");
+    printf("      - perf (performance evaluation)\n");
+    printf("    op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
 }
 
 int main(int argc, char ** argv) {
     test_mode mode = MODE_TEST;
-    const char * op_name = NULL;
-    const char * backend = NULL;
+    const char * op_name_filter = NULL;
+    const char * backend_filter = NULL;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "test") == 0) {
             mode = MODE_TEST;
         } else if (strcmp(argv[i], "perf") == 0) {
             mode = MODE_PERF;
+        } else if (strcmp(argv[i], "grad") == 0) {
+            mode = MODE_GRAD;
         } else if (strcmp(argv[i], "-o") == 0) {
             if (i + 1 < argc) {
-                op_name = argv[++i];
+                op_name_filter = argv[++i];
             } else {
                 usage(argv);
                 return 1;
             }
         } else if (strcmp(argv[i], "-b") == 0) {
             if (i + 1 < argc) {
-                backend = argv[++i];
+                backend_filter = argv[++i];
             } else {
                 usage(argv);
                 return 1;
@@ -2205,25 +4447,47 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // enumerate backends
-    printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
+    // load and enumerate backends
+    ggml_backend_load_all();
+
+    printf("Testing %zu devices\n\n", ggml_backend_dev_count());
 
     size_t n_ok = 0;
 
-    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
-        printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 
-        if (backend != NULL && strcmp(backend, ggml_backend_reg_get_name(i)) != 0) {
+        printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
+
+        if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
             printf("  Skipping\n");
             n_ok++;
             continue;
         }
 
-        ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
-        GGML_ASSERT(backend != NULL);
-        printf("  Backend name: %s\n", ggml_backend_name(backend));
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
+            printf("  Skipping CPU backend\n");
+            n_ok++;
+            continue;
+        }
 
-        bool ok = test_backend(backend, mode, op_name);
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            // TODO: better value for n_threads
+            ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+        }
+
+        printf("  Device description: %s\n", ggml_backend_dev_description(dev));
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(dev, &free, &total);
+        printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+        printf("\n");
+
+        bool ok = test_backend(backend, mode, op_name_filter);
 
         printf("  Backend %s: ", ggml_backend_name(backend));
         if (ok) {
@@ -2238,15 +4502,15 @@ int main(int argc, char ** argv) {
         ggml_backend_free(backend);
     }
 
-    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
+    ggml_quantize_free();
 
-    if (n_ok != ggml_backend_reg_get_count()) {
+    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
+
+    if (n_ok != ggml_backend_dev_count()) {
         printf("\033[1;31mFAIL\033[0m\n");
         return 1;
     }
 
-    ggml_quantize_free();
-
     printf("\033[1;32mOK\033[0m\n");
     return 0;
 }
diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp
new file mode 100644
index 000000000..d85bf912b
--- /dev/null
+++ b/tests/test-barrier.cpp
@@ -0,0 +1,94 @@
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-backend.h"
+
+#include <chrono>
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <vector>
+
+#define MAX_NARGS 2
+
+int main(int argc, char *argv[]) {
+
+    int n_threads = 4;
+    int n_rounds  = 100;
+
+    if (argc > 1) {
+        n_threads = std::atoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        n_rounds  = std::atoi(argv[2]);
+    }
+
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graph
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // Lots of small, parallel ops where barriers in between will dominate
+    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
+    for (int i = 0; i < 1000; i++) {
+        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+        out = ggml_mul_mat(ctx, a, out);
+
+        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+        out = ggml_mul_mat(ctx, d, out);
+    }
+
+    ggml_build_forward_expand(gf, out);
+    int n_nodes = ggml_graph_n_nodes(gf);
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    // Create compute plan
+    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
+
+    std::vector<uint8_t> work_data(cplan.work_size);
+    cplan.work_data = work_data.data();
+
+    std::cerr << "graph-compute with"
+              << "\n n_threads: " << n_threads
+              << "\n   n_nodes: " << n_nodes
+              << "\n  n_rounds: " << n_rounds
+              << "\n";
+    // ggml_graph_print(gf);
+
+    // Warmup
+    ggml_graph_compute(gf, &cplan);
+
+    auto t0 = std::chrono::high_resolution_clock::now();
+
+    for (int i=0; i < n_rounds; i++) {
+        ggml_graph_compute(gf, &cplan);
+    }
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+
+    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
+    auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
+    std::cerr << "graph-compute took " << usec << " usec "
+              << "\n " << (float) usec / n_rounds << " usec per-iter"
+              << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
+              << "\n";
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+
+    return 0;
+}
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index fa2eb577b..e0314ae1d 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -7,9 +6,20 @@
 #include <cassert>
 
 #include "llama.h"
+#include "common.h"
+#include "chat-template.hpp"
+
+static std::string normalize_newlines(const std::string & s) {
+#ifdef _WIN32
+  static const std::regex nl_regex("\r\n");
+  return std::regex_replace(s, nl_regex, "\n");
+#else
+  return s;
+#endif
+}
 
 int main(void) {
-    llama_chat_message conversation[] = {
+    std::vector<llama_chat_message> conversation {
         {"system", "You are a helpful assistant"},
         {"user", "Hello"},
         {"assistant", "Hi there"},
@@ -17,59 +27,361 @@ int main(void) {
         {"assistant", "   I am an assistant   "},
         {"user", "Another question"},
     };
-    size_t message_count = 6;
-    std::vector<std::string> templates = {
-        // teknium/OpenHermes-2.5-Mistral-7B
-        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
-        // mistralai/Mistral-7B-Instruct-v0.2
-        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
-        // TheBloke/FusionNet_34Bx2_MoE-AWQ
-        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
-        // bofenghuang/vigogne-2-70b-chat
-        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
-        // mlabonne/AlphaMonarch-7B
-        "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
-        // google/gemma-7b-it
-        "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+    struct TestCase {
+        std::string name;
+        std::string template_str;
+        std::string expected_output;
+        std::string expected_output_jinja;
+        std::string bos_token = "";
+        std::string eos_token = "";
+        bool supported_with_jinja = true;
     };
-    std::vector<std::string> expected_output = {
-        // teknium/OpenHermes-2.5-Mistral-7B
-        "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
-        // mistralai/Mistral-7B-Instruct-v0.2
-        "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
-        // TheBloke/FusionNet_34Bx2_MoE-AWQ
-        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
-        // bofenghuang/vigogne-2-70b-chat
-        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
-        // mlabonne/AlphaMonarch-7B
-        "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
-        // google/gemma-7b-it
-        "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+    std::vector<TestCase> test_cases {
+        {
+            /* .name= */ "teknium/OpenHermes-2.5-Mistral-7B",
+            /* .template_str= */ "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+            /* .expected_output= */ "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt)",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "TheBloke/FusionNet_34Bx2_MoE-AWQ",
+            /* .template_str= */ "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
+            /* .expected_output= */       "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s><s>[INST] Who are you [/INST]   I am an assistant   </s><s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "bofenghuang/vigogne-2-70b-chat",
+            /* .template_str= */ "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+            /* .expected_output= */       "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s>[INST] Who are you [/INST]I am an assistant</s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "mlabonne/AlphaMonarch-7B",
+            /* .template_str= */ "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
+            /* .expected_output= */          "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+            /* .expected_output_jinja= */ "<s>system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "google/gemma-7b-it",
+            /* .template_str= */ "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+            /* .expected_output= */       "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+            /* .expected_output_jinja= */ "<start_of_turn>user\nYou are a helpful assistant\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+        },
+        {
+            /* .name= */ "OrionStarAI/Orion-14B-Chat",
+            /* .template_str= */ "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+            /* .expected_output= */       "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
+            /* .expected_output_jinja= */ "Human: You are a helpful assistant\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "openchat/openchat-3.5-0106",
+            // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d
+            // So we match against the included template but implement the suggested version.
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+            /* .expected_output= */                            "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant:    I am an assistant   <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
+            /* .expected_output_jinja= */ "GPT4 Correct System: You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant:    I am an assistant   <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
+        },
+        {
+            /* .name= */ "deepseek-ai/deepseek-coder-33b-instruct",
+            /* .template_str= */ "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
+            /* .expected_output= */ "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n   I am an assistant   \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "eachadea/vicuna-13b-1.1",
+            // No template included in tokenizer_config.json, so this template likely needs to be manually set.
+            /* .template_str= */ "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
+            /* .expected_output= */ "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "Orca-Vicuna",
+            // No template included in tokenizer_config.json, so this template likely needs to be manually set.
+            /* .template_str= */ "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
+            /* .expected_output= */ "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "CohereForAI/c4ai-command-r-plus",
+            /* .template_str= */ "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
+            /* .expected_output= */ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "Llama-3",
+            /* .template_str= */ "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+            /* .expected_output= */ "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "Phi-3-mini",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+            /* .expected_output= */     "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "<|user|>\nYou are a helpful assistant\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        },
+        {
+            /* .name= */ "Phi-3-small",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+            /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "Phi-3-medium",
+            /* .template_str= */ "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+            /* .expected_output= */     "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "<|user|>\nYou are a helpful assistant\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        },
+        {
+            /* .name= */ "Phi-3-vision",
+            /* .template_str= */ "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
+            /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "ChatGLM3",
+            /* .template_str= */ "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
+            /* .expected_output= */       "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n    I am an assistant   <|user|>\n Another question<|assistant|>",
+            /* .expected_output_jinja= */ "[gMASK]sop<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
+        },
+        {
+            /* .name= */ "ChatGLM4",
+            /* .template_str= */ u8"[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
+            /* .expected_output= */ "[gMASK]<sop><|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "GLMEdge",
+            /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>",
+            /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
+            /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF",
+            /* .template_str= */ u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
+            /* .expected_output= */ u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "DeepSeek-V2",
+            /* .template_str= */ "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+            /* .expected_output= */ u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "<｜end▁of▁sentence｜>",
+        },
+        {
+            /* .name= */ "ibm-granite/granite-3.0-8b-instruct",
+            /* .template_str= */ "{%- if tools %}\n    {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n    {%- for tool in tools %}\n    {{- tool | tojson(indent=4) }}\n    {%- if not loop.last %}\n        {{- '\n\n' }}\n    {%- endif %}\n    {%- endfor %}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'user' %}\n    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>'  + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'tool_response' %}\n    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}",
+            /* .expected_output= */       "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>\n",
+            /* .expected_output_jinja= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt)",
+            /* .template_str= */ "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */ " [INST] You are a helpful assistant\n\nHello [/INST] Hi there</s> [INST] Who are you [/INST]    I am an assistant   </s> [INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "Mistral-Large-Instruct-2407 (mistralai 'v3' template; modified to have system prompt at start)",
+            /* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS] [\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n        {{- \"[TOOL_CALLS] [\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- \" \" + message[\"content\"]|trim + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */       "[INST] You are a helpful assistant\n\nHello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] Another question[/INST]",
+            /* .expected_output_jinja= */ "[INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] You are a helpful assistant\n\nAnother question[/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template; modified to have system prompt at start)",
+            /* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS][\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n        {{- \"[TOOL_CALLS][\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- message[\"content\"] + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */       "[INST]You are a helpful assistant\n\nHello[/INST]Hi there</s>[INST]Who are you[/INST]   I am an assistant   </s>[INST]Another question[/INST]",
+            /* .expected_output_jinja= */ "[INST]Hello[/INST]Hi there</s>[INST]Who are you[/INST]   I am an assistant   </s>[INST]You are a helpful assistant\n\nAnother question[/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST]    I am an assistant   </s>[INST] Another question[/INST]",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "ai-sage/GigaChat-20B-A3B-instruct",
+            /* .template_str= */ "{% if messages[0]['role'] == 'system' -%}\n    {%- set loop_messages = messages[1:] -%}\n    {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n    {%- set loop_messages = messages -%}\n    {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n    \n    {%- if loop.index0 == 0 -%}\n        {{ system_message -}}\n    {%- endif -%}\n    {%- if message['role'] == 'user' -%}\n        {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n        {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3]  + additional_special_tokens[1] -}}\n    {%- endif -%}\n    {%- if message['role'] == 'assistant' -%}\n        {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n    {%- endif -%}\n    {%- if loop.last and add_generation_prompt -%}\n        {{ 'assistant' + additional_special_tokens[0] -}}\n    {%- endif -%}\n{%- endfor %}",
+            /* .expected_output= */ "<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>   I am an assistant   <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ false, // Requires additional_special_tokens as extra context
+        },
+        {
+            /* .name= */ "Infinigence/Megrez-3B-Instruct",
+            /* .template_str= */ u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct，将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}",
+            /* .expected_output= */ "<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|>   I am an assistant   <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "phi-4",
+            /* .template_str= */ "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}",
+            /* .expected_output= */ "<|im_start|>system<|im_sep|>You are a helpful assistant<|im_end|><|im_start|>user<|im_sep|>Hello<|im_end|><|im_start|>assistant<|im_sep|>Hi there<|im_end|><|im_start|>user<|im_sep|>Who are you<|im_end|><|im_start|>assistant<|im_sep|>   I am an assistant   <|im_end|><|im_start|>user<|im_sep|>Another question<|im_end|><|im_start|>assistant<|im_sep|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
 
-    // test invalid chat template
-    res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
-    assert(res < 0);
+    // list all supported templates
+    std::vector<const char *> supported_tmpl;
+    res = llama_chat_builtin_templates(nullptr, 0);
+    assert(res > 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    printf("Built-in chat templates:\n");
+    for (auto tmpl : supported_tmpl) {
+        printf("  %s\n", tmpl);
+    }
 
-    for (size_t i = 0; i < templates.size(); i++) {
-        std::string custom_template = templates[i];
-        std::string expected = expected_output[i];
+    // test invalid chat template
+    res = llama_chat_apply_template("INVALID TEMPLATE", conversation.data(), conversation.size(), true, formatted_chat.data(), formatted_chat.size());
+    assert(res < 0);
+    const auto add_generation_prompt = true;
+
+    for (const auto & test_case : test_cases) {
+        printf("\n\n=== %s ===\n\n", test_case.name.c_str());
         formatted_chat.resize(1024);
         res = llama_chat_apply_template(
-            nullptr,
-            custom_template.c_str(),
-            conversation,
-            message_count,
-            true,
+            test_case.template_str.c_str(),
+            conversation.data(),
+            conversation.size(),
+            add_generation_prompt,
             formatted_chat.data(),
             formatted_chat.size()
         );
         formatted_chat.resize(res);
         std::string output(formatted_chat.data(), formatted_chat.size());
-        std::cout << output << "\n-------------------------\n";
-        assert(output == expected);
+        if (output != test_case.expected_output) {
+            printf("Expected:\n%s\n", test_case.expected_output.c_str());
+            printf("-------------------------\n");
+            printf("Actual:\n%s\n", output.c_str());
+            fflush(stdout);
+            assert(output == test_case.expected_output);
+        }
     }
+
+    json messages = json::array();
+    for (const auto & msg : conversation) {
+        messages.push_back({
+            {"role", msg.role},
+            {"content", msg.content},
+        });
+    }
+    for (const auto & test_case : test_cases) {
+        if (!test_case.supported_with_jinja) {
+            continue;
+        }
+        printf("\n\n=== %s (jinja) ===\n\n", test_case.name.c_str());
+        try {
+            minja::chat_template tmpl(test_case.template_str, test_case.bos_token, test_case.eos_token);
+            auto output = normalize_newlines(tmpl.apply(messages, json(), add_generation_prompt));
+            auto expected_output = normalize_newlines(test_case.expected_output_jinja.empty() ? test_case.expected_output : test_case.expected_output_jinja);
+            if (output != expected_output) {
+                printf("Expected:\n%s\n", expected_output.c_str());
+                printf("-------------------------\n");
+                printf("Actual:\n%s\n", output.c_str());
+                fflush(stdout);
+                assert(output == expected_output);
+            }
+        } catch (const std::exception & e) {
+            printf("ERROR: %s\n", e.what());
+            assert(false);
+        }
+    }
+
+    // test llama_chat_format_single for system message
+    printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
+    std::vector<common_chat_msg> chat2;
+    common_chat_msg sys_msg{"system", "You are a helpful assistant", {}};
+
+    auto fmt_sys = [&](std::string tmpl_str) {
+        minja::chat_template tmpl(tmpl_str, "", "");
+        auto output = common_chat_format_single(tmpl, chat2, sys_msg, false, /* use_jinja= */ false);
+        printf("fmt_sys(%s) : %s\n", tmpl_str.c_str(), output.c_str());
+        printf("-------------------------\n");
+        return output;
+    };
+    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
+    assert(fmt_sys("mistral-v1") == " [INST] You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v3") == "[INST] You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v3-tekken") == "[INST]You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v7") == "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT]");
+    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("llama2-sys") == "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\n");
+    assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
+    assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
+    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+    assert(fmt_sys("gigachat") == "<s>You are a helpful assistant<|message_sep|>");
+
+
+    // test llama_chat_format_single for user message
+    printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
+    chat2.push_back({"system", "You are a helpful assistant", {}});
+    chat2.push_back({"user", "Hello", {}});
+    chat2.push_back({"assistant", "I am assistant", {}});
+    common_chat_msg new_msg{"user", "How are you", {}};
+
+    auto fmt_single = [&](std::string tmpl_str) {
+        minja::chat_template tmpl(tmpl_str, "", "");
+        auto output = common_chat_format_single(tmpl, chat2, new_msg, true, /* use_jinja= */ false);
+        printf("fmt_single(%s) : %s\n", tmpl_str.c_str(), output.c_str());
+        printf("-------------------------\n");
+        return output;
+    };
+    assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
+    assert(fmt_single("mistral-v1") == " [INST] How are you [/INST]");
+    assert(fmt_single("mistral-v3") == "[INST] How are you[/INST]");
+    assert(fmt_single("mistral-v3-tekken") == "[INST]How are you[/INST]");
+    assert(fmt_single("mistral-v7") == "[INST] How are you[/INST]");
+    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
+    assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates
+    assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
+    assert(fmt_single("gigachat") == "user<|role_sep|>How are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>");
+
     return 0;
 }
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
new file mode 100644
index 000000000..b78da2cdb
--- /dev/null
+++ b/tests/test-chat.cpp
@@ -0,0 +1,607 @@
+//  Tests chat handling, including grammar generation and parsing for tool calling, for various templates.
+//
+//  Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates,
+//  e.g. given Minja (http://github.com/google/minja) checked out in parent dir:
+//
+//    cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+//
+#include <fstream>
+#include <iostream>
+#include <json.hpp>
+#include <string>
+
+#include "chat-template.hpp"
+#include "chat.hpp"
+#include "llama-grammar.h"
+#include "unicode.h"
+
+using json = nlohmann::ordered_json;
+
+static common_chat_msg msg_from_json(const json & message) {
+    common_chat_msg ret;
+    ret.role = "assistant";
+    if (message.contains("content") && !message.at("content").is_null()) {
+        ret.content = message.at("content");
+    }
+    if (message.contains("tool_plan")) {
+        ret.tool_plan = message.at("tool_plan");
+    }
+    auto has_tool_calls = message.contains("tool_calls");
+    if (has_tool_calls) {
+        for (const auto & tc : message.at("tool_calls")) {
+            const auto & arguments = tc.at("function").at("arguments");
+            ret.tool_calls.push_back({
+                tc.at("function").at("name").get<std::string>(),
+                arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
+                tc.contains("id") ? tc.at("id").get<std::string>() : "",
+            });
+        }
+    }
+    return ret;
+}
+
+template <class T> static void assert_equals(const T & expected, const T & actual) {
+    if (expected != actual) {
+        std::cerr << "Expected: " << expected << std::endl;
+        std::cerr << "Actual: " << actual << std::endl;
+        std::cerr << std::flush;
+        throw std::runtime_error("Test failed");
+    }
+}
+
+static std::string read_file(const std::string & path) {
+    std::cerr << "# Reading: " << path << std::endl << std::flush;
+    std::ifstream fs(path, std::ios_base::binary);
+    if (!fs.is_open()) {
+        fs = std::ifstream("../" + path, std::ios_base::binary);
+        if (!fs.is_open()) {
+            throw std::runtime_error("Failed to open file: " + path);
+        }
+    }
+    fs.seekg(0, std::ios_base::end);
+    auto size = fs.tellg();
+    fs.seekg(0);
+    std::string out;
+    out.resize(static_cast<size_t>(size));
+    fs.read(&out[0], static_cast<std::streamsize>(size));
+    return out;
+}
+
+static std::unique_ptr<llama_grammar> build_grammar(const std::string & grammar_str) {
+    return std::unique_ptr<llama_grammar>(
+        llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0));
+}
+
+// TODO: extract to common helper (copied from test-grammar-integration.cpp)
+static bool match_string(const std::string & input, llama_grammar * grammar) {
+    const auto cpts = unicode_cpts_from_utf8(input);
+
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    for (const auto & cpt : cpts) {
+        llama_grammar_accept(grammar, cpt);
+
+        if (stacks_cur.empty()) {
+            // no stacks means that the grammar failed to match at this point
+            return false;
+        }
+    }
+
+    for (const auto & stack : stacks_cur) {
+        if (stack.empty()) {
+            // An empty stack means that the grammar has been completed
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Dumps `{"a": 1}` as `"{\"a\": 1}"`, unlike nlohmann::json::dump which would dump it as `"{\"a\":1}"`.
+static std::string dump(const json & j) {
+    return minja::Value(j).dump(-1, /* to_json= */ true);
+}
+
+static void assert_msg_equals(const common_chat_msg & expected, const common_chat_msg & actual) {
+    assert_equals(expected.role, actual.role);
+    assert_equals(expected.content, actual.content);
+    assert_equals(expected.tool_calls.size(), actual.tool_calls.size());
+    for (size_t i = 0; i < expected.tool_calls.size(); i++) {
+        const auto & expected_tool_call = expected.tool_calls[i];
+        const auto & actual_tool_call   = actual.tool_calls[i];
+        assert_equals(expected_tool_call.name, actual_tool_call.name);
+        assert_equals(dump(json::parse(expected_tool_call.arguments)), dump(json::parse(actual_tool_call.arguments)));
+        assert_equals(expected_tool_call.id, actual_tool_call.id);
+    }
+}
+
+const auto special_function_tool = json::parse(R"({
+  "type": "function",
+  "function": {
+    "name": "special_function",
+    "description": "I'm special",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "arg1": {
+          "type": "integer",
+          "description": "The arg."
+        }
+      },
+      "required": ["arg1"]
+    }
+  }
+})");
+const auto python_tool           = json::parse(R"({
+  "type": "function",
+  "function": {
+    "name": "python",
+    "description": "an ipython interpreter",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "code": {
+          "type": "string",
+          "description": "Python code to execute."
+        }
+      },
+      "required": ["code"]
+    }
+  }
+})");
+const auto code_interpreter_tool = json::parse(R"({
+  "type": "function",
+  "function": {
+    "name": "code_interpreter",
+    "description": "an ipython interpreter",
+    "parameters": {
+      "type": "object",
+      "properties": {
+        "code": {
+          "type": "string",
+          "description": "Python code to execute."
+        }
+      },
+      "required": ["code"]
+    }
+  }
+})");
+const json tools                 = { special_function_tool, python_tool };
+const json llama_3_1_tools       = { special_function_tool, code_interpreter_tool };
+
+struct delta_data {
+    std::string        delta;
+    common_chat_params params;
+};
+
+static delta_data init_delta(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens,
+                             const json & user_message, const json & delta_message, const json & tools,
+                             const json & tool_choice) {
+    common_chat_inputs inputs;
+    inputs.parallel_tool_calls = true;
+    inputs.messages            = json::array();
+    inputs.messages.push_back(user_message);
+    inputs.tools       = tools;
+    inputs.tool_choice = tool_choice;
+    auto params_prefix = common_chat_params_init(tmpl, inputs);
+
+    inputs.messages.push_back(delta_message);
+    inputs.add_generation_prompt = false;
+    auto params_full             = common_chat_params_init(tmpl, inputs);
+
+    std::string prefix = params_prefix.prompt;
+    std::string full   = params_full.prompt;
+
+    // Check full starts with prefix
+    if (full.find(prefix) != 0) {
+        fprintf(stderr, "Full:\n%s\n\nPrefix:\n%s\n\n", full.c_str(), prefix.c_str());
+        throw std::runtime_error("Full message does not start with prefix");
+    }
+
+    if (full == prefix) {
+        throw std::runtime_error("Full message is the same as the prefix");
+    }
+
+    auto delta = full.substr(prefix.size());
+
+    // Strip end tokens
+    for (const auto & end_token : end_tokens) {
+        // rfind to find the last occurrence
+        auto pos = delta.rfind(end_token);
+        if (pos != std::string::npos) {
+            delta = delta.substr(0, pos);
+            break;
+        }
+    }
+    return { delta, params_full };
+}
+
+/*
+  Applies the template to 1 user message w/ add_generation_prompt=true, then w/ the test message w/ add_generation_prompt=false,
+  gets the diff, removes any end tokens and parses the result w/ the grammar, checking that
+  the parsed message is the same as the test_message
+*/
+static void test_template(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens,
+                          const json & test_message, const json & tools = {}, const std::string & expected_delta = "",
+                          bool expect_grammar_triggered = true) {
+    common_chat_msg expected_msg = msg_from_json(test_message);
+
+    auto user_message = json{
+        { "role",    "user"          },
+        { "content", "Hello, world!" }
+    };
+
+    for (const auto & tool_choice : json({ "auto", "required" })) {
+        auto data = init_delta(tmpl, end_tokens, user_message, test_message, tools, tool_choice);
+        if (!expected_delta.empty()) {
+            assert_equals(expected_delta, data.delta);
+        }
+
+        if (expect_grammar_triggered) {
+            const auto msg = common_chat_parse(data.delta, data.params.format);
+            assert_msg_equals(expected_msg, msg);
+        }
+
+        if (!expected_msg.tool_calls.empty()) {
+            GGML_ASSERT(!data.params.grammar.empty());
+        }
+        if (!data.params.grammar.empty()) {
+            auto grammar = build_grammar(data.params.grammar);
+            if (!grammar) {
+                throw std::runtime_error("Failed to build grammar");
+            }
+            auto earliest_trigger_pos = std::string::npos;
+            auto constrained = data.delta;
+            for (const auto & trigger : data.params.grammar_triggers) {
+                auto pos = constrained.find(trigger.word);
+                if (pos == std::string::npos) {
+                    continue;
+                }
+                if (pos > 0 && trigger.at_start) {
+                    fprintf(stderr, "Trigger %s not at start of message, skipping:\n\n%s\n\n", trigger.word.c_str(), constrained.c_str());
+                    continue;
+                }
+                if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
+                    earliest_trigger_pos = pos;
+                }
+            }
+            auto grammar_triggered = false;
+            if (earliest_trigger_pos != std::string::npos) {
+                constrained = constrained.substr(earliest_trigger_pos);
+                grammar_triggered = true;
+            }
+            if (data.params.grammar_lazy) {
+                assert_equals(expect_grammar_triggered, grammar_triggered);
+            }
+
+            if (grammar_triggered && !match_string(constrained, grammar.get())) {
+                throw std::runtime_error("Failed to match delta against grammar:\n\n" + data.delta +
+                                            "\n\nGrammar: " + data.params.grammar);
+            }
+        }
+    }
+}
+
+static void test_template_output_parsers() {
+    json text_message {
+        { "role",    "assistant"     },
+        { "content", "Hello, world!\nWhat's up?" },
+    };
+    json tool_calls = json::array({{
+        { "type", "function" },
+        { "function", { { "name", "special_function" }, { "arguments", "{\"arg1\": 1}" } } },
+    }});
+
+    json tool_call_message {
+        { "role",       "assistant"},
+        { "content",    {}},
+        { "tool_calls", {
+            {
+                { "type", "function" },
+                { "function", {
+                    { "name", "special_function" },
+                    { "arguments", "{\"arg1\": 1}" },
+                }},
+            },
+        }},
+    };
+    json tool_call_message_with_id {
+        { "role",       "assistant"},
+        { "content",    {}},
+        { "tool_calls", {
+            {
+                { "type", "function" },
+                { "function", {
+                    { "name", "special_function" },
+                    { "arguments", "{\"arg1\": 1}" },
+                }},
+                {"id", "123456789"},
+            },
+        }},
+        { "role",       "assistant"                },
+        { "content",    {}                         },
+        { "tool_calls", tool_calls                  }
+    };
+    json tool_call_plan_message_with_idx {
+        { "role",       "assistant"},
+        { "content",    {}},
+        { "tool_plan",  "I'm not so sure"},
+        { "tool_calls", {
+            {
+                { "type", "function" },
+                { "function", {
+                    { "name", "special_function" },
+                    { "arguments", "{\"arg1\": 1}" },
+                }},
+                // Index of the tool call in the tool_calls array
+                {"id", "0"},
+            },
+        }},
+        { "role",       "assistant"                },
+        { "content",    {}                         },
+        { "tool_calls", tool_calls                  }
+    };
+
+    auto python_tool_call_message = json{
+        { "role",       "assistant"                },
+        { "content",    {}                         },
+        { "tool_calls", json{ {
+                            { "type", "function" },
+                            { "function",
+                              {
+                                  { "name", "python" },
+                                  { "arguments",
+                                    {
+                                        { "code", "print('hey')" },
+                                    } },
+                              } },
+                        } } }
+    };
+    auto code_interpreter_tool_call_message = json{
+        { "role",       "assistant"                },
+        { "content",    {}                         },
+        { "tool_calls", json{ {
+                            { "type", "function" },
+                            { "function",
+                              {
+                                  { "name", "code_interpreter" },
+                                  { "arguments",
+                                    {
+                                        { "code", "print('hey')" },
+                                    } },
+                              } },
+                        } } }
+    };
+
+    common_chat_inputs inputs_no_tools;
+    inputs_no_tools.messages = {
+        { { "role", "user" }, { "content", "Hey\nThere" } }
+    };
+
+    common_chat_inputs inputs_tools = inputs_no_tools;
+    inputs_tools.tools              = json::array();
+    inputs_tools.tools.push_back(special_function_tool);
+
+    common_chat_inputs inputs_tools_builtin = inputs_no_tools;
+    inputs_tools_builtin.tools              = json::array();
+    inputs_tools_builtin.tools.push_back(python_tool);
+
+    {
+        // Not supported yet
+        const common_chat_template tmpl(read_file("models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja"), "<s>", "</s>");
+        assert_equals(COMMON_CHAT_FORMAT_GENERIC, common_chat_params_init(tmpl, inputs_tools).format);
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"), "<s>", "</s>");
+        std::vector<std::string>   end_tokens{ "<|END_OF_TURN_TOKEN|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_params_init(tmpl, inputs_no_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B,    common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, tool_call_plan_message_with_idx, tools,
+                      "<|START_THINKING|>I'm not so sure<|END_THINKING|>"
+                      "<|START_ACTION|>[\n"
+                      "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+                      "]<|END_ACTION|>");
+        test_template(tmpl, end_tokens, text_message, tools,
+                      "<|START_RESPONSE|>Hello, world!\n"
+                      "What's up?<|END_RESPONSE|>",
+                      /* expect_grammar_triggered= */ false);
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/google-gemma-2-2b-it.jinja"), "<s>", "</s>");
+        std::vector<std::string>   end_tokens{ "<end_of_turn>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_params_init(tmpl, inputs_no_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_GENERIC, common_chat_params_init(tmpl, inputs_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_GENERIC,
+                      common_chat_params_init(
+                          common_chat_template(read_file("models/templates/microsoft-Phi-3.5-mini-instruct.jinja"),
+                                               "<s>", "</s>"),
+                          inputs_tools)
+                          .format);
+
+        // Generic tool calls doesn't generate / parse content-only messages symmetrically.
+
+        assert_msg_equals(msg_from_json(text_message),
+                          common_chat_parse("{\n"
+                                            "  \"response\": \"Hello, world!\\nWhat's up?\"\n"
+                                            "}",
+                                            common_chat_params_init(tmpl, inputs_tools).format));
+        test_template(tmpl, end_tokens, tool_call_message_with_id, tools,
+                      "{\n"
+                      "  \"tool_calls\": [\n"
+                      "    {\n"
+                      "      \"name\": \"special_function\",\n"
+                      "      \"arguments\": {\n"
+                      "        \"arg1\": 1\n"
+                      "      },\n"
+                      "      \"id\": \"123456789\"\n"
+                      "    }\n"
+                      "  ]\n"
+                      "}");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja"), "<s>",
+                                        "</s>");
+        std::vector<std::string>   end_tokens{ "</s>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_MISTRAL_NEMO, common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(
+            tmpl, end_tokens, tool_call_message_with_id, tools,
+            "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]");
+    }
+    {
+        const common_chat_template tmpl(
+            read_file("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja"), "<s>", "</s>");
+        std::vector<std::string> end_tokens{ "<|im_end|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_params_init(tmpl, inputs_tools).format);
+        assert_equals(
+            COMMON_CHAT_FORMAT_HERMES_2_PRO,
+            common_chat_params_init(
+                common_chat_template(read_file("models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja"),
+                                     "<s>", "</s>"),
+                inputs_tools)
+                .format);
+        assert_equals(
+            COMMON_CHAT_FORMAT_HERMES_2_PRO,
+            common_chat_params_init(
+                common_chat_template(read_file("models/templates/Qwen-Qwen2.5-7B-Instruct.jinja"), "<s>", "</s>"),
+                inputs_tools)
+                .format);
+
+        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      "<tool_call>\n"
+                      "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
+                      "</tool_call>");
+        test_template(tmpl, end_tokens, python_tool_call_message, tools,
+                      "<tool_call>\n"
+                      "{\"name\": \"python\", \"arguments\": {\"code\": \"print('hey')\"}}\n"
+                      "</tool_call>");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja"), "<s>",
+                                        "</s>");
+        std::vector<std::string>   end_tokens{ "<|eom_id|>", "<|eot_id|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_params_init(tmpl, inputs_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+                      common_chat_params_init(tmpl, inputs_tools_builtin).format);
+        assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+                      common_chat_params_init(
+                          common_chat_template(read_file("models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja"),
+                                               "<s>", "</s>"),
+                          inputs_tools_builtin)
+                          .format);
+
+        // test_template(tmpl, end_tokens, text_message, tools, R"(?)", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, code_interpreter_tool_call_message, llama_3_1_tools,
+                      "<|python_tag|>code_interpreter.call(code=\"print('hey')\")");
+        test_template(tmpl, end_tokens, python_tool_call_message, tools,
+                      "<|python_tag|>python.call(code=\"print('hey')\")");
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja"), "<s>",
+                                        "</s>");
+        std::vector<std::string>   end_tokens{ "<|eom_id|>", "<|eot_id|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/meetkai-functionary-medium-v3.1.jinja"), "<s>",
+                                        "</s>");
+        std::vector<std::string>   end_tokens{ "<|eom_id|>", "<|eot_id|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+                      common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      "<function=special_function>{\"arg1\": 1}</function>");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/meetkai-functionary-medium-v3.2.jinja"), "<s>",
+                                        "</s>");
+        std::vector<std::string>   end_tokens{ "<|eom_id|>", "<|eot_id|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_params_init(tmpl, inputs_no_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, text_message, {},
+                      "all\n"
+                      "Hello, world!\n"
+                      "What's up?",
+                      /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      "special_function\n"
+                      "{\"arg1\": 1}");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/fireworks-ai-llama-3-firefunction-v2.jinja"), "<s>",
+                                        "</s>");
+        std::vector<std::string>   end_tokens{ "<|eot_id|>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_FIREFUNCTION_V2, common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      " functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]");
+    }
+    {
+        const common_chat_template tmpl(read_file("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja"),
+                                        "<s>", "</s>");
+        std::vector<std::string>   end_tokens{ "<｜end▁of▁sentence｜>" };
+
+        assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format);
+
+        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, tool_call_message, tools,
+                      "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>special_function\n"
+                      "```json\n"
+                      "{\"arg1\": 1}\n"
+                      "```<｜tool▁call▁end｜>");
+    }
+}
+
+int main(int argc, char ** argv) {
+#ifndef _WIN32
+    if (argc > 1) {
+        common_chat_inputs inputs;
+        inputs.messages = {
+            { { "role", "user" }, { "content", "Hey" } }
+        };
+        inputs.tools = json::array({ special_function_tool });
+
+        std::cout << "| Template | Format |\n";
+        std::cout << "|----------|--------|\n";
+
+        for (int i = 1; i < argc; i++) {
+            std::string path = argv[i];
+            if (path.rfind(".jinja") != path.size() - 6) {
+                std::cerr << "Skipping non-jinja file: " << path << std::endl;
+                continue;
+            }
+            common_chat_template tmpl(read_file(path), "", "");
+            auto                 parts = string_split(path, "/");
+            auto                 name  = parts[parts.size() - 1];
+            std::cout << "| " << name << " | " << common_chat_format_name(common_chat_params_init(tmpl, inputs).format)
+                      << " |\n";
+        }
+    } else
+#endif
+    {
+        test_template_output_parsers();
+        std::cout << "\n[chat] All tests passed!" << std::endl;
+    }
+    return 0;
+}
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp
index 753dae911..6aac4737a 100644
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -14,7 +14,7 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdouble-promotion"
 
-// ggml.c::quantize_row_q4_0_reference
+// ggml.c::quantize_row_q4_0_ref
 inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
 
 // ggml.c::ggml_silu_f32
@@ -24,7 +24,7 @@ inline static float silu_orig(float x) {
 
 #pragma GCC diagnostic pop
 
-// ggml.c::quantize_row_q4_0_reference
+// ggml.c::quantize_row_q4_0_ref
 inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
 
 // ggml.c::ggml_silu_f32
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
new file mode 100644
index 000000000..6ed696328
--- /dev/null
+++ b/tests/test-gguf.cpp
@@ -0,0 +1,1338 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "../ggml/src/ggml-impl.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdio>
+#include <random>
+#include <string>
+#include <vector>
+
+constexpr int offset_has_kv      = 1000;
+constexpr int offset_has_tensors = 2000;
+constexpr int offset_has_data    = 3000;
+
+enum handcrafted_file_type {
+    HANDCRAFTED_HEADER_BAD_MAGIC           =  10,
+    HANDCRAFTED_HEADER_BAD_VERSION_1       =  20,
+    HANDCRAFTED_HEADER_BAD_VERSION_FUTURE  =  30,
+    HANDCRAFTED_HEADER_BAD_N_TENSORS       =  40,
+    HANDCRAFTED_HEADER_BAD_N_KV            =  50,
+    HANDCRAFTED_HEADER_EMPTY               = 800,
+
+    HANDCRAFTED_KV_BAD_KEY_SIZE            =  10 + offset_has_kv,
+    HANDCRAFTED_KV_BAD_TYPE                =  20 + offset_has_kv,
+    // HANDCRAFTED_KV_BAD_VALUE_SIZE          =  30 + offset_has_kv, // removed because it can result in allocations > 1 TB (default sanitizer limit)
+    HANDCRAFTED_KV_DUPLICATE_KEY           =  40 + offset_has_kv,
+    HANDCRAFTED_KV_BAD_ALIGN               =  50 + offset_has_kv,
+    HANDCRAFTED_KV_SUCCESS                 = 800 + offset_has_kv,
+
+    HANDCRAFTED_TENSORS_BAD_NAME_SIZE      =  10 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_BAD_N_DIMS         =  20 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_BAD_SHAPE          =  30 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_NE_TOO_BIG         =  40 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_BAD_TYPE           =  50 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_BAD_OFFSET         =  60 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_DUPLICATE_NAME     =  70 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_BAD_ALIGN          =  75 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN =  80 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_SUCCESS            = 800 + offset_has_tensors,
+    HANDCRAFTED_TENSORS_CUSTOM_ALIGN       = 810 + offset_has_tensors,
+
+    HANDCRAFTED_DATA_NOT_ENOUGH_DATA       =  10 + offset_has_data,
+    HANDCRAFTED_DATA_BAD_ALIGN             =  15 + offset_has_data,
+    HANDCRAFTED_DATA_INCONSISTENT_ALIGN    =  20 + offset_has_data,
+    HANDCRAFTED_DATA_SUCCESS               = 800 + offset_has_data,
+    HANDCRAFTED_DATA_CUSTOM_ALIGN          = 810 + offset_has_data,
+};
+
+static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
+    switch (hft) {
+        case HANDCRAFTED_HEADER_BAD_MAGIC:           return "HEADER_BAD_MAGIC";
+        case HANDCRAFTED_HEADER_BAD_VERSION_1:       return "HEADER_BAD_VERSION_1";
+        case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE:  return "HEADER_BAD_VERSION_FUTURE";
+        case HANDCRAFTED_HEADER_BAD_N_KV:            return "HEADER_BAD_N_KV";
+        case HANDCRAFTED_HEADER_BAD_N_TENSORS:       return "HEADER_BAD_N_TENSORS";
+        case HANDCRAFTED_HEADER_EMPTY:               return "HEADER_EMPTY";
+
+        case HANDCRAFTED_KV_BAD_KEY_SIZE:            return "KV_BAD_KEY_SIZE";
+        case HANDCRAFTED_KV_BAD_TYPE:                return "KV_BAD_TYPE";
+        case HANDCRAFTED_KV_DUPLICATE_KEY:           return "KV_DUPLICATE_KEY";
+        case HANDCRAFTED_KV_BAD_ALIGN:               return "KV_BAD_ALIGN";
+        case HANDCRAFTED_KV_SUCCESS:                 return "KV_RANDOM_KV";
+
+        case HANDCRAFTED_TENSORS_BAD_NAME_SIZE:      return "TENSORS_BAD_NAME_SIZE";
+        case HANDCRAFTED_TENSORS_BAD_N_DIMS:         return "TENSORS_BAD_N_DIMS";
+        case HANDCRAFTED_TENSORS_BAD_SHAPE:          return "TENSORS_BAD_SHAPE";
+        case HANDCRAFTED_TENSORS_NE_TOO_BIG:         return "TENSORS_NE_TOO_BIG";
+        case HANDCRAFTED_TENSORS_BAD_TYPE:           return "TENSORS_BAD_TYPE";
+        case HANDCRAFTED_TENSORS_BAD_OFFSET:         return "TENSORS_BAD_OFFSET";
+        case HANDCRAFTED_TENSORS_DUPLICATE_NAME:     return "TENSORS_DUPLICATE_NAME";
+        case HANDCRAFTED_TENSORS_BAD_ALIGN:          return "TENSORS_BAD_ALIGN";
+        case HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN: return "TENSORS_INCONSISTENT_ALIGN";
+        case HANDCRAFTED_TENSORS_SUCCESS:            return "TENSORS_SUCCESS";
+        case HANDCRAFTED_TENSORS_CUSTOM_ALIGN:       return "TENSORS_CUSTOM_ALIGN";
+
+        case HANDCRAFTED_DATA_NOT_ENOUGH_DATA:       return "DATA_NOT_ENOUGH_DATA";
+        case HANDCRAFTED_DATA_BAD_ALIGN:             return "DATA_BAD_ALIGN";
+        case HANDCRAFTED_DATA_INCONSISTENT_ALIGN:    return "DATA_INCONSISTENT_ALIGN";
+        case HANDCRAFTED_DATA_SUCCESS:               return "DATA_SUCCESS";
+        case HANDCRAFTED_DATA_CUSTOM_ALIGN:          return "DATA_CUSTOM_ALIGN";
+    }
+    GGML_ABORT("fatal error");
+}
+
+static bool expect_context_not_null(const enum handcrafted_file_type hft) {
+    if (hft < offset_has_kv) {
+        return hft >= HANDCRAFTED_HEADER_EMPTY;
+    }
+    if (hft < offset_has_tensors) {
+        return hft >= HANDCRAFTED_KV_SUCCESS;
+    }
+    if (hft < offset_has_data) {
+        return hft >= HANDCRAFTED_TENSORS_SUCCESS;
+    }
+    return hft >= HANDCRAFTED_DATA_SUCCESS;
+}
+
+typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;
+
+static std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
+    std::vector<tensor_config_t> tensor_configs;
+    tensor_configs.reserve(100);
+
+    for (int i = 0; i < 100; ++i) {
+        const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+        if (ggml_type_size(type) == 0) {
+            continue;
+        }
+
+        std::array<int64_t, GGML_MAX_DIMS> shape = {1, 1, 1, 1};
+        shape[0] = (1 + rng() % 10) * ggml_blck_size(type);
+        const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+        for (int i = 1; i < n_dims; ++i) {
+            shape[i] = 1 + rng() % 10;
+        }
+
+        tensor_configs.push_back(std::make_pair(type, shape));
+    }
+
+    return tensor_configs;
+}
+
+static std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+    kv_types.reserve(100);
+
+    for (int i = 0; i < 100; ++i) {
+        const gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
+
+        if (type == GGUF_TYPE_ARRAY) {
+            const gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT);
+            if (type_arr == GGUF_TYPE_ARRAY) {
+                continue;
+            }
+            kv_types.push_back(std::make_pair(type, type_arr));
+            continue;
+        }
+
+        kv_types.push_back(std::make_pair(type, gguf_type(-1)));
+    }
+    std::shuffle(kv_types.begin(), kv_types.end(), rng);
+
+    return kv_types;
+}
+
+template <typename T>
+static void helper_write(FILE * file, const T & val) {
+    GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val));
+}
+
+static void helper_write(FILE * file, const void * data, const size_t nbytes) {
+    GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
+}
+
+static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
+    FILE * file = tmpfile();
+
+    if (!file) {
+        return file;
+    }
+
+    std::mt19937 rng(seed);
+    uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
+
+    if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) {
+        const char bad_magic[4] = {'F', 'U', 'G', 'G'};
+        helper_write(file, bad_magic, sizeof(bad_magic));
+    } else {
+        helper_write(file, GGUF_MAGIC, 4);
+    }
+
+    if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
+        const uint32_t version = 1;
+        helper_write(file, version);
+    } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
+        const uint32_t version = GGUF_VERSION + 1;
+        helper_write(file, version);
+    } else {
+        const uint32_t version = GGUF_VERSION;
+        helper_write(file, version);
+    }
+
+    std::vector<tensor_config_t> tensor_configs;
+    if (hft >= offset_has_tensors) {
+        tensor_configs = get_tensor_configs(rng);
+    }
+
+    if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) {
+        const uint64_t n_tensors = -1;
+        helper_write(file, n_tensors);
+    } else {
+        const uint64_t n_tensors = tensor_configs.size();
+        helper_write(file, n_tensors);
+    }
+
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+    if (hft >= offset_has_kv) {
+        kv_types = get_kv_types(rng);
+    }
+    {
+        uint64_t n_kv = kv_types.size();
+        if (hft == HANDCRAFTED_KV_BAD_ALIGN      ||
+            hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
+            hft == HANDCRAFTED_DATA_BAD_ALIGN    || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+
+            n_kv += 1;
+        } else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
+            n_kv = -1;
+        }
+        helper_write(file, n_kv);
+    }
+
+    if (hft < offset_has_kv) {
+        while (ftell(file) % alignment != 0) {
+            const char pad = 0;
+            helper_write(file, pad);
+        }
+
+        for (int i = 0; i < extra_bytes; ++i) {
+            const char tmp = 0;
+            helper_write(file, tmp);
+        }
+        rewind(file);
+        return file;
+    }
+
+    for (int i = 0; i < int(kv_types.size()); ++i) {
+        const enum gguf_type type     = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].first);
+        const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].second);
+
+        const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i));
+
+        if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
+            const uint64_t n = -1;
+            helper_write(file, n);
+        } else {
+            const uint64_t n = key.length();
+            helper_write(file, n);
+        }
+        helper_write(file, key.data(), key.length());
+
+        {
+            const int32_t type32 = int32_t(type);
+            helper_write(file, type32);
+        }
+
+        uint32_t data[16];
+        for (int j = 0; j < 16; ++j) {
+            data[j] = rng();
+            if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+                data[j] |= 0x01010101; // avoid random null-termination of string
+            }
+        }
+
+        if (type == GGUF_TYPE_STRING) {
+            const uint64_t n = rng() % sizeof(data);
+            helper_write(file, n);
+            helper_write(file, data, n);
+            continue;
+        }
+
+        if (type == GGUF_TYPE_ARRAY) {
+            {
+                const int32_t type32 = int32_t(type_arr);
+                helper_write(file, type32);
+            }
+            if (type_arr == GGUF_TYPE_STRING) {
+                const uint64_t nstr = rng() % (16 + 1);
+                helper_write(file, nstr);
+                for (uint64_t istr = 0; istr < nstr; ++istr) {
+                    const uint64_t n = rng() % (sizeof(uint32_t) + 1);
+                    helper_write(file, n);
+                    helper_write(file, &data[istr], n);
+                }
+                continue;
+            }
+            const size_t type_size = gguf_type_size(type_arr);
+            const uint64_t n = (rng() % sizeof(data)) / type_size;
+            helper_write(file, n);
+            helper_write(file, &data, n*type_size);
+            continue;
+        }
+
+        helper_write(file, data, hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type));
+    }
+
+    if (hft == HANDCRAFTED_KV_BAD_ALIGN      ||
+        hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
+        hft == HANDCRAFTED_DATA_BAD_ALIGN    || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+
+        const uint64_t n = strlen(GGUF_KEY_GENERAL_ALIGNMENT);
+        helper_write(file, n);
+        helper_write(file, GGUF_KEY_GENERAL_ALIGNMENT, n);
+
+        const int32_t type = gguf_type(GGUF_TYPE_UINT32);
+        helper_write(file, type);
+
+        alignment = expect_context_not_null(hft) ? 1 : 13;
+        helper_write(file, alignment);
+    }
+
+    if (hft < offset_has_tensors) {
+        while (ftell(file) % alignment != 0) {
+            const char pad = 0;
+            helper_write(file, pad);
+        }
+
+        for (int i = 0; i < extra_bytes; ++i) {
+            const char tmp = 0;
+            helper_write(file, tmp);
+        }
+        rewind(file);
+        return file;
+    }
+
+    if (hft == HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN || hft == HANDCRAFTED_DATA_INCONSISTENT_ALIGN) {
+        alignment = 1;
+    }
+
+    uint64_t offset = 0;
+    for (int i = 0; i < int(tensor_configs.size()); ++i) {
+        const ggml_type                          type  = tensor_configs[i].first;
+        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+        std::string name = "my_tensor";
+        if (hft != HANDCRAFTED_TENSORS_DUPLICATE_NAME) {
+            name += "_" + std::to_string(i);
+        }
+        if (hft == HANDCRAFTED_TENSORS_BAD_NAME_SIZE) {
+            name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors";
+            GGML_ASSERT(name.length() >= GGML_MAX_NAME);
+        }
+        {
+            const uint64_t n = name.length();
+            helper_write(file, n);
+        }
+        helper_write(file, name.data(), name.length());
+
+        uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1;
+        for (int i = GGML_MAX_DIMS-1; i >= 1; --i) {
+            if (shape[i] != 1) {
+                n_dims = i + 1;
+                break;
+            }
+        }
+        if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) {
+            const uint32_t n_dims_bad = GGML_MAX_DIMS + 1;
+            helper_write(file, n_dims_bad);
+        } else {
+            helper_write(file, n_dims);
+        }
+
+        if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) {
+            for (uint32_t j = 0; j < n_dims; ++j) {
+                const int64_t bad_dim = -1;
+                helper_write(file, bad_dim);
+            }
+        } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){
+            for (uint32_t j = 0; j < n_dims; ++j) {
+                const int64_t big_dim = 4*int64_t(INT32_MAX);
+                helper_write(file, big_dim);
+            }
+        } else {
+            helper_write(file, shape.data(), n_dims*sizeof(int64_t));
+        }
+
+        {
+            const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? GGML_TYPE_COUNT : int32_t(type);
+            helper_write(file, type32);
+        }
+
+        if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) {
+            const uint64_t bad_offset = -1;
+            helper_write(file, bad_offset);
+        } else {
+            helper_write(file, offset);
+        }
+
+        int64_t ne = shape[0];
+        for (uint32_t i = 1; i < n_dims; ++i) {
+            ne *= shape[i];
+        }
+        offset += GGML_PAD(ggml_row_size(type, ne), alignment);
+    }
+
+    while (ftell(file) % alignment != 0) {
+        const char pad = 0;
+        helper_write(file, pad);
+    }
+
+    if (hft >= offset_has_data) {
+        rng.seed(seed + 1);
+        uint64_t nbytes = offset;
+        if (hft == HANDCRAFTED_DATA_NOT_ENOUGH_DATA) {
+            nbytes -= 1;
+        }
+        for (uint64_t i = 0; i < nbytes; ++i) {
+            const uint8_t random_byte = i % 256;
+            helper_write(file, random_byte);
+        }
+    }
+
+    for (int i = 0; i < extra_bytes; ++i) {
+        const char tmp = 0;
+        helper_write(file, tmp);
+    }
+    rewind(file);
+    return file;
+}
+
+static bool handcrafted_check_header(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_kv, const bool has_tensors, const bool alignment_defined) {
+    if (!gguf_ctx) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    std::vector<tensor_config_t> tensor_configs;
+    if (has_tensors) {
+        tensor_configs = get_tensor_configs(rng);
+    }
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+    if (has_kv) {
+        kv_types = get_kv_types(rng);
+    }
+
+    bool ok = true;
+
+    if (gguf_get_version(gguf_ctx) != GGUF_VERSION) {
+        ok = false;
+    }
+    if (gguf_get_n_tensors(gguf_ctx) != int(tensor_configs.size())) {
+        ok = false;
+    }
+    if (gguf_get_n_kv(gguf_ctx) != int(alignment_defined ? kv_types.size() + 1 : kv_types.size())) {
+        ok = false;
+    }
+
+    return ok;
+}
+
+static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_tensors, const bool alignment_defined) {
+    if (!gguf_ctx) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    std::vector<tensor_config_t> tensor_configs;
+    if (has_tensors) {
+        tensor_configs = get_tensor_configs(rng);
+    }
+
+    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types = get_kv_types(rng);
+
+    bool ok = true;
+
+    for (int i = 0; i < int(kv_types.size()); ++i) {
+        const enum gguf_type type     = gguf_type(kv_types[i].first);
+        const enum gguf_type type_arr = gguf_type(kv_types[i].second);
+
+        const std::string key = "my_key_" + std::to_string(i);
+
+        uint32_t data[16];
+        for (int j = 0; j < 16; ++j) {
+            data[j] = rng();
+            if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+                data[j] |= 0x01010101; // avoid random null-termination of string
+            }
+        }
+
+        const char * data8 = reinterpret_cast<const char *>(data);
+        const int id = gguf_find_key(gguf_ctx, key.c_str());
+
+        if (type == GGUF_TYPE_STRING) {
+            const char * str = gguf_get_val_str(gguf_ctx, id);
+            const uint64_t n = strlen(str);
+            const uint64_t n_expected = rng() % sizeof(data);
+            if (n != n_expected) {
+                ok = false;
+                continue;
+            }
+            if (!std::equal(str, str + n, data8)) {
+                ok = false;
+            }
+            continue;
+        }
+
+        if (type == GGUF_TYPE_ARRAY) {
+            const size_t type_size = gguf_type_size(type_arr);
+            const uint64_t arr_n = gguf_get_arr_n(gguf_ctx, id);
+
+            if (type_arr == GGUF_TYPE_STRING) {
+                const uint64_t nstr_expected = rng() % (16 + 1);
+                if (arr_n != nstr_expected) {
+                    ok = false;
+                    continue;
+                }
+                for (uint64_t istr = 0; istr < nstr_expected; ++istr) {
+                    const char * str = gguf_get_arr_str(gguf_ctx, id, istr);
+                    const uint64_t n = strlen(str);
+                    const uint64_t n_expected = rng() % (sizeof(uint32_t) + 1);
+
+                    if (n != n_expected) {
+                        ok = false;
+                        continue;
+                    }
+                    const char * str_expected = reinterpret_cast<const char *>(&data[istr]);
+                    if (strncmp(str, str_expected, n) != 0) {
+                        ok = false;
+                        continue;
+                    }
+                }
+                continue;
+            }
+
+            const uint64_t arr_n_expected = (rng() % sizeof(data)) / type_size;
+            if (arr_n != arr_n_expected) {
+                ok = false;
+                continue;
+            }
+
+            const char * data_gguf = reinterpret_cast<const char *>(gguf_get_arr_data(gguf_ctx, id));
+
+            if (type_arr == GGUF_TYPE_BOOL) {
+                for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
+                    if (bool(data8[arr_i]) != bool(data_gguf[arr_i])) {
+                        ok = false;
+                    }
+                }
+                continue;
+            }
+
+            if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) {
+                ok = false;
+            }
+            continue;
+        }
+
+        const char * data_gguf = reinterpret_cast<const char *>(gguf_get_val_data(gguf_ctx, id));
+
+        if (type == GGUF_TYPE_BOOL) {
+            if (bool(*data8) != bool(*data_gguf)) {
+                ok = false;
+            }
+            continue;
+        }
+
+        if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) {
+            ok = false;
+        }
+    }
+
+    const uint32_t expected_alignment = alignment_defined ? 1 : GGUF_DEFAULT_ALIGNMENT;
+    if (gguf_get_alignment(gguf_ctx) != expected_alignment) {
+        ok = false;
+    }
+
+    return ok;
+}
+
+static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsigned int seed) {
+    if (!gguf_ctx) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    std::vector<tensor_config_t> tensor_configs = get_tensor_configs(rng);
+
+    // Call get_kv_types to get the same RNG state:
+    get_kv_types(rng);
+
+    bool ok = true;
+
+    const int id_alignment = gguf_find_key(gguf_ctx, GGUF_KEY_GENERAL_ALIGNMENT);
+    const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT;
+
+    uint64_t expected_offset = 0;
+    for (int i = 0; i < int(tensor_configs.size()); ++i) {
+        const ggml_type                          type  = tensor_configs[i].first;
+        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+        const std::string name = "my_tensor_" + std::to_string(i);
+        const int id = gguf_find_tensor(gguf_ctx, name.c_str());
+
+        if (id >= 0) {
+            if (std::string(gguf_get_tensor_name(gguf_ctx, id)) != name) {
+                ok = false;
+            }
+
+            if (gguf_get_tensor_type(gguf_ctx, id) != type) {
+                ok = false;
+            }
+        } else {
+            ok = false;
+            continue;
+        }
+
+        const size_t offset = gguf_get_tensor_offset(gguf_ctx, id);
+
+        if (offset != expected_offset) {
+            ok = false;
+        }
+
+        int64_t ne = shape[0];
+        for (size_t j = 1; j < GGML_MAX_DIMS; ++j) {
+            ne *= shape[j];
+        }
+        expected_offset += GGML_PAD(ggml_row_size(type, ne), alignment);
+    }
+
+    return ok;
+}
+
+static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const unsigned int seed, FILE * file) {
+    if (!gguf_ctx) {
+        return false;
+    }
+
+    std::mt19937 rng(seed);
+
+    std::vector<tensor_config_t> tensor_configs = get_tensor_configs(rng);
+
+    bool ok = true;
+
+    for (int i = 0; i < int(tensor_configs.size()); ++i) {
+        const ggml_type                          type  = tensor_configs[i].first;
+        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+        int64_t ne = shape[0];
+        for (size_t j = 1; j < GGML_MAX_DIMS; ++j) {
+            ne *= shape[j];
+        }
+        const size_t size = ggml_row_size(type, ne);
+
+        const std::string name = "my_tensor_" + std::to_string(i);
+        const size_t offset = gguf_get_tensor_offset(gguf_ctx, gguf_find_tensor(gguf_ctx, name.c_str()));
+
+        std::vector<uint8_t> data(size);
+        GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0);
+        GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
+
+        for (size_t j = 0; j < size; ++j) {
+            const uint8_t expected_byte = (j + offset) % 256;
+            if (data[j] != expected_byte) {
+                ok = false;
+            }
+        }
+    }
+
+    return ok;
+}
+
+static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
+    int npass = 0;
+    int ntest = 0;
+
+    const std::vector<handcrafted_file_type> hfts = {
+        HANDCRAFTED_HEADER_BAD_MAGIC,
+        HANDCRAFTED_HEADER_BAD_VERSION_1,
+        HANDCRAFTED_HEADER_BAD_VERSION_FUTURE,
+        HANDCRAFTED_HEADER_BAD_N_KV,
+        HANDCRAFTED_HEADER_BAD_N_TENSORS,
+        HANDCRAFTED_HEADER_EMPTY,
+
+        HANDCRAFTED_KV_BAD_KEY_SIZE,
+        HANDCRAFTED_KV_BAD_TYPE,
+        HANDCRAFTED_KV_DUPLICATE_KEY,
+        HANDCRAFTED_KV_BAD_ALIGN,
+        HANDCRAFTED_KV_SUCCESS,
+
+        HANDCRAFTED_TENSORS_BAD_NAME_SIZE,
+        HANDCRAFTED_TENSORS_BAD_N_DIMS,
+        HANDCRAFTED_TENSORS_BAD_SHAPE,
+        HANDCRAFTED_TENSORS_NE_TOO_BIG,
+        HANDCRAFTED_TENSORS_BAD_TYPE,
+        HANDCRAFTED_TENSORS_BAD_OFFSET,
+        HANDCRAFTED_TENSORS_DUPLICATE_NAME,
+        HANDCRAFTED_TENSORS_BAD_ALIGN,
+        HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN,
+        HANDCRAFTED_TENSORS_SUCCESS,
+        HANDCRAFTED_TENSORS_CUSTOM_ALIGN,
+
+        HANDCRAFTED_DATA_NOT_ENOUGH_DATA,
+        HANDCRAFTED_DATA_BAD_ALIGN,
+        HANDCRAFTED_DATA_INCONSISTENT_ALIGN,
+        HANDCRAFTED_DATA_SUCCESS,
+        HANDCRAFTED_DATA_CUSTOM_ALIGN,
+    };
+
+    for (enum handcrafted_file_type hft : hfts) {
+        printf("%s: handcrafted_file_type=%s\n", __func__, handcrafted_file_type_name(hft).c_str());
+        FILE * file = get_handcrafted_file(seed, hft);
+
+#ifdef _WIN32
+        if (!file) {
+            printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
+            printf("%s: skipping tests");
+            continue;
+        }
+#else
+        GGML_ASSERT(file);
+#endif // _WIN32
+
+        struct ggml_context * ctx = nullptr;
+        struct gguf_init_params gguf_params = {
+            /*no_alloc =*/ false,
+            /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
+        };
+
+        struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
+
+        if (expect_context_not_null(hft)) {
+            printf("%s:   - context_not_null: ", __func__);
+        } else {
+            printf("%s:   - context_null: ", __func__);
+        }
+        if (bool(gguf_ctx) == expect_context_not_null(hft)) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+
+        if (hft >= offset_has_data && !expect_context_not_null(hft)) {
+            printf("%s:   - no_dangling_ggml_context_pointer: ", __func__);
+            if (ctx) {
+                printf("\033[1;31mFAIL\033[0m\n");
+            } else {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            }
+            ntest++;
+        }
+
+        const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN;
+
+        if (expect_context_not_null(hft)) {
+            printf("%s:   - check_header: ", __func__);
+            if (handcrafted_check_header(gguf_ctx, seed, hft >= offset_has_kv, hft >= offset_has_tensors, alignment_defined)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (expect_context_not_null(hft) && hft >= offset_has_kv) {
+            printf("%s:   - check_kv: ", __func__);
+            if (handcrafted_check_kv(gguf_ctx, seed, hft >= offset_has_tensors, alignment_defined)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (expect_context_not_null(hft) && hft >= offset_has_tensors) {
+            printf("%s:   - check_tensors: ", __func__);
+            if (handcrafted_check_tensors(gguf_ctx, seed)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        if (expect_context_not_null(hft) && hft >= offset_has_data) {
+            printf("%s:   - check_tensor_data: ", __func__);
+            if (handcrafted_check_tensor_data(gguf_ctx, seed, file)) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+
+        fclose(file);
+        if (gguf_ctx) {
+            ggml_free(ctx);
+            gguf_free(gguf_ctx);
+        }
+        printf("\n");
+    }
+
+
+    return std::make_pair(npass, ntest);
+}
+
+struct random_gguf_context_result {
+    struct gguf_context * gguf_ctx;
+    struct ggml_context * ctx;
+    ggml_backend_buffer_t buffer;
+};
+
+static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t backend, const unsigned int seed) {
+    std::mt19937 rng(seed);
+
+    struct gguf_context * gguf_ctx = gguf_init_empty();
+
+    for (int i = 0; i < 256; ++i) {
+        const std::string key = "my_key_" + std::to_string(rng() % 1024);
+        const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
+
+        switch (type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (gguf_ctx, key.c_str(), rng() % (1 <<  7));             break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (gguf_ctx, key.c_str(), rng() % (1 <<  7) - (1 <<  6)); break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (gguf_ctx, key.c_str(), rng() % (1 << 15));             break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (gguf_ctx, key.c_str(), rng() % (1 << 15) - (1 << 14)); break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (gguf_ctx, key.c_str(), rng());                         break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (gguf_ctx, key.c_str(), rng()             - (1 << 30)); break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024      - 512);       break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(gguf_ctx, key.c_str(), rng() % 2 == 0);                break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (gguf_ctx, key.c_str(), std::to_string(rng()).c_str()); break;
+            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (gguf_ctx, key.c_str(), rng());                         break;
+            case GGUF_TYPE_INT64:   gguf_set_val_i64 (gguf_ctx, key.c_str(), rng()             - (1 << 30)); break;
+            case GGUF_TYPE_FLOAT64: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024      - 512);       break;
+            case GGUF_TYPE_ARRAY: {
+                const enum gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT);
+                const uint64_t ne = rng() % 1024;
+
+                switch (type_arr) {
+                    case GGUF_TYPE_UINT8:
+                    case GGUF_TYPE_INT8:
+                    case GGUF_TYPE_UINT16:
+                    case GGUF_TYPE_INT16:
+                    case GGUF_TYPE_UINT32:
+                    case GGUF_TYPE_INT32:
+                    case GGUF_TYPE_FLOAT32:
+                    case GGUF_TYPE_BOOL:
+                    case GGUF_TYPE_UINT64:
+                    case GGUF_TYPE_INT64:
+                    case GGUF_TYPE_FLOAT64: {
+                        const size_t nbytes = ne*gguf_type_size(type_arr);
+                        std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
+                        for (size_t j = 0; j < random_data.size(); ++j) {
+                            random_data[j] = rng();
+                            if (type_arr == GGUF_TYPE_BOOL) {
+                                random_data[j] &= 0x01010101; // the sanitizer complains if booleans are not 0 or 1
+                            }
+                        }
+                        gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne);
+                    } break;
+                    case GGUF_TYPE_STRING: {
+                        std::vector<std::string>  data_cpp(ne);
+                        std::vector<const char *> data_c(ne);
+                        for (size_t j = 0; j < data_cpp.size(); ++j) {
+                            data_cpp[j] = std::to_string(rng());
+                            data_c[j]   = data_cpp[j].c_str();
+                        }
+                        gguf_set_arr_str(gguf_ctx, key.c_str(), data_c.data(), ne);
+                    } break;
+                    case GGUF_TYPE_ARRAY: {
+                        break; // not supported
+                    }
+                    case GGUF_TYPE_COUNT:
+                    default: {
+                        GGML_ABORT("fatal error");
+                    }
+                }
+            } break;
+            case GGUF_TYPE_COUNT:
+            default: {
+                GGML_ABORT("fatal error");
+            }
+        }
+    }
+
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ 256*ggml_tensor_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (int i = 0; i < 256; ++i) {
+        const std::string name = "my_tensor_" + std::to_string(i);
+        const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+        const size_t type_size = ggml_type_size(type);
+
+        if (type_size == 0) {
+            continue;
+        }
+
+        const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+        int64_t ne[GGML_MAX_DIMS];
+        ne[0] = (1 + rng() % 10) * ggml_blck_size(type);
+        for (int j = 1; j < n_dims; ++j) {
+            ne[j] = 1 + rng() % 10;
+        }
+
+        struct ggml_tensor * tensor = ggml_new_tensor(ctx, type, n_dims, ne);
+        ggml_set_name(tensor, name.c_str());
+    }
+
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+        const size_t nbytes = ggml_nbytes(t);
+        std::vector<uint32_t> random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
+        for (size_t j = 0; j < random_data.size(); ++j) {
+            random_data[j] = rng();
+        }
+        ggml_backend_tensor_set(t, random_data.data(), 0, nbytes);
+
+        gguf_add_tensor(gguf_ctx, t);
+    }
+
+    return {gguf_ctx, ctx, buf};
+}
+
+static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other) {
+    bool ok = true;
+
+    const int n_kv = gguf_get_n_kv(ctx);
+    for (int id = 0; id < n_kv; ++id) {
+        const char * name = gguf_get_key(ctx, id);
+
+        const int idx_other = gguf_find_key(other, name);
+        if (idx_other < 0) {
+            ok = false;
+            continue;
+        }
+
+        const gguf_type type = gguf_get_kv_type(ctx, id);
+        if (type != gguf_get_kv_type(other, idx_other)) {
+            ok = false;
+            continue;
+        }
+
+        if (type == GGUF_TYPE_ARRAY) {
+            const size_t arr_n = gguf_get_arr_n(ctx, id);
+            if (arr_n != gguf_get_arr_n(other, idx_other)) {
+                ok = false;
+                continue;
+            }
+
+            const gguf_type type_arr = gguf_get_arr_type(ctx, id);
+            if (type_arr != gguf_get_arr_type(other, idx_other)) {
+                ok = false;
+                continue;
+            }
+
+            if (type_arr == GGUF_TYPE_BOOL) {
+                const int8_t * data       = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx,   id));
+                const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
+                for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
+                    if (bool(data[arr_i]) != bool(data_other[arr_i])) {
+                        ok = false;
+                    }
+                }
+                continue;
+            }
+
+            if (type_arr == GGUF_TYPE_STRING) {
+                for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
+                    const std::string str       = gguf_get_arr_str(ctx,   id,       arr_i);
+                    const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i);
+                    if (str != str_other) {
+                        ok = false;
+                    }
+                }
+                continue;
+            }
+
+            const int8_t * data       = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx,   id));
+            const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
+            if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) {
+                ok = false;
+            }
+            continue;
+        }
+
+        if (type == GGUF_TYPE_STRING) {
+            const std::string str       = gguf_get_val_str(ctx,   id);
+            const std::string str_other = gguf_get_val_str(other, idx_other);
+            if (str != str_other) {
+                ok = false;
+            }
+            continue;
+        }
+
+        const char * data       = reinterpret_cast<const char *>(gguf_get_val_data(ctx,   id));
+        const char * data_other = reinterpret_cast<const char *>(gguf_get_val_data(other, idx_other));
+        if (!std::equal(data, data + gguf_type_size(type), data_other)) {
+            ok = false;
+        }
+    }
+
+    return ok;
+}
+
+static bool all_tensors_in_other(const gguf_context * ctx, const gguf_context * other) {
+    bool ok = true;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+    for (int id = 0; id < n_tensors; ++id) {
+        const std::string name = gguf_get_tensor_name(ctx, id);
+
+        const int idx_other = gguf_find_tensor(other, name.c_str());
+        if (id != idx_other) {
+            ok = false;
+            if (idx_other < 0) {
+                continue;
+            }
+        }
+
+        const ggml_type type = gguf_get_tensor_type(ctx, id);
+        if (type != gguf_get_tensor_type(other, id)) {
+            ok = false;
+        }
+
+        const size_t offset = gguf_get_tensor_offset(ctx, id);
+        if (offset != gguf_get_tensor_offset(other, id)) {
+            ok = false;
+        }
+    }
+
+    return ok;
+}
+
+static bool same_tensor_data(const struct ggml_context * orig, const struct ggml_context * read) {
+    bool ok = true;
+
+    struct ggml_tensor * t_orig = ggml_get_first_tensor(orig);
+    struct ggml_tensor * t_read = ggml_get_first_tensor(read);
+
+    if (std::string(t_read->name) != "GGUF tensor data binary blob") {
+        return false;
+    }
+    t_read = ggml_get_next_tensor(read, t_read);
+
+    while (t_orig) {
+        if (!t_read) {
+            ok = false;
+            break;
+        }
+
+        const size_t nbytes = ggml_nbytes(t_orig);
+        if (ggml_nbytes(t_read) != nbytes) {
+            ok = false;
+            break;
+        }
+        std::vector<char> data_orig(nbytes);
+        ggml_backend_tensor_get(t_orig, data_orig.data(), 0, nbytes);
+        if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast<const char *>(t_read->data))) {
+            ok = false;
+        }
+
+        t_orig = ggml_get_next_tensor(orig, t_orig);
+        t_read = ggml_get_next_tensor(read, t_read);
+    }
+    if (t_read) {
+        ok = false;
+    }
+
+    return ok;
+}
+
+static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
+    ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+    printf("%s: device=%s, backend=%s, only_meta=%s\n",
+        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
+
+    int npass = 0;
+    int ntest = 0;
+
+    struct gguf_context * gguf_ctx_0;
+    struct ggml_context * ctx_0;
+    ggml_backend_buffer_t bbuf;
+    {
+        struct random_gguf_context_result result = get_random_gguf_context(backend, seed);
+        gguf_ctx_0 = result.gguf_ctx;
+        ctx_0      = result.ctx;
+        bbuf       = result.buffer;
+    }
+
+    FILE * file = tmpfile();
+
+#ifdef _WIN32
+    if (!file) {
+        printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
+        printf("%s: skipping tests");
+        return std::make_pair(0, 0);
+    }
+#else
+    GGML_ASSERT(file);
+#endif // _WIN32
+
+    {
+        std::vector<int8_t> buf;
+        gguf_write_to_buf(gguf_ctx_0, buf, only_meta);
+        GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size());
+        rewind(file);
+    }
+
+    struct ggml_context * ctx_1 = nullptr;
+    struct gguf_init_params gguf_params = {
+        /*no_alloc =*/ false,
+        /*ctx      =*/ only_meta ? nullptr : &ctx_1,
+    };
+    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
+
+    printf("%s: same_version: ", __func__);
+    if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: same_n_kv: ", __func__);
+    if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: same_n_tensors: ", __func__);
+    if (gguf_get_n_tensors(gguf_ctx_0) == gguf_get_n_tensors(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_orig_kv_in_read: ", __func__);
+    if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_read_kv_in_orig: ", __func__);
+    if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_orig_tensors_in_read: ", __func__);
+    if (all_tensors_in_other(gguf_ctx_0, gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_read_tensors_in_orig: ", __func__);
+    if (all_tensors_in_other(gguf_ctx_1, gguf_ctx_0)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    if (!only_meta) {
+        printf("%s: same_tensor_data: ", __func__);
+        if (same_tensor_data(ctx_0, ctx_1)) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    ggml_backend_buffer_free(bbuf);
+    ggml_free(ctx_0);
+    ggml_free(ctx_1);
+    gguf_free(gguf_ctx_0);
+    gguf_free(gguf_ctx_1);
+    ggml_backend_free(backend);
+    fclose(file);
+
+    printf("\n");
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_gguf_set_kv(ggml_backend_dev_t dev, const unsigned int seed) {
+    ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+    printf("%s: device=%s, backend=%s\n", __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend));
+
+    int npass = 0;
+    int ntest = 0;
+
+    struct gguf_context * gguf_ctx_0;
+    struct ggml_context * ctx_0;
+    ggml_backend_buffer_t bbuf_0;
+    {
+        struct random_gguf_context_result result = get_random_gguf_context(backend, seed);
+        gguf_ctx_0 = result.gguf_ctx;
+        ctx_0      = result.ctx;
+        bbuf_0     = result.buffer;
+    }
+
+    struct gguf_context * gguf_ctx_1;
+    struct ggml_context * ctx_1;
+    ggml_backend_buffer_t bbuf_1;
+    {
+        struct random_gguf_context_result result = get_random_gguf_context(backend, seed + 1);
+        gguf_ctx_1 = result.gguf_ctx;
+        ctx_1      = result.ctx;
+        bbuf_1     = result.buffer;
+    }
+
+    struct gguf_context * gguf_ctx_2 = gguf_init_empty();
+
+    gguf_set_kv(gguf_ctx_1, gguf_ctx_0);
+    gguf_set_kv(gguf_ctx_2, gguf_ctx_0);
+
+    printf("%s: same_n_kv: ", __func__);
+    if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_2)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_kv_0_in_1: ", __func__);
+    if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_kv_0_in_2: ", __func__);
+    if (all_kv_in_other(gguf_ctx_0, gguf_ctx_2)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    gguf_set_kv(gguf_ctx_0, gguf_ctx_1);
+
+    printf("%s: same_n_kv_after_double_copy: ", __func__);
+    if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_kv_1_in_0_after_double_copy: ", __func__);
+    if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    ggml_backend_buffer_free(bbuf_0);
+    ggml_backend_buffer_free(bbuf_1);
+    ggml_free(ctx_0);
+    ggml_free(ctx_1);
+    gguf_free(gguf_ctx_0);
+    gguf_free(gguf_ctx_1);
+    gguf_free(gguf_ctx_2);
+    ggml_backend_free(backend);
+
+    printf("\n");
+    return std::make_pair(npass, ntest);
+}
+
+static void print_usage() {
+    printf("usage: test-gguf [seed]\n");
+    printf("  if no seed is unspecified then a random seed is used\n");
+}
+
+int main(int argc, char ** argv) {
+    if (argc > 2) {
+        print_usage();
+        return 1;
+    }
+
+    std::random_device rd;
+    const unsigned int seed = argc < 2 ? rd() : std::stoi(argv[1]);
+
+    // Initialize ggml backends early so the prints aren't interleaved with the test results:
+    ggml_backend_dev_count();
+    fprintf(stderr, "\n");
+
+    int npass = 0;
+    int ntest = 0;
+    {
+        std::pair<int, int> result = test_handcrafted_file(seed);
+        npass += result.first;
+        ntest += result.second;
+    }
+
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+
+        for (bool only_meta : {true, false}) {
+            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
+            npass += result.first;
+            ntest += result.second;
+        }
+
+        {
+            std::pair<int, int> result = test_gguf_set_kv(dev, seed);
+            npass += result.first;
+            ntest += result.second;
+        }
+    }
+
+    printf("%d/%d tests passed\n", npass, ntest);
+    if (npass != ntest) {
+        printf("\033[1;31mFAIL\033[0m\n");
+        return 1;
+    }
+    printf("\033[1;32mOK\033[0m\n");
+    return 0;
+}
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
deleted file mode 100644
index 8ff76c891..000000000
--- a/tests/test-grad0.cpp
+++ /dev/null
@@ -1,1606 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-#define MAX_NARGS 3
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define GGML_SILU_FP16
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-static float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-static int irand(int n) {
-    if (n == 0) return 0;
-    return rand()%n;
-}
-
-static void get_random_dims(int64_t * dims, int ndims) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = 1 + irand(4);
-    }
-}
-
-static struct ggml_tensor * get_random_tensor_f32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static struct ggml_tensor * get_random_tensor_f16(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static struct ggml_tensor * get_random_tensor_i32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        int32_t imin,
-        int32_t imax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static bool check_gradient(
-        const char * op_name,
-        struct ggml_context * ctx0,
-        struct ggml_tensor * x[],
-        struct ggml_tensor * f,
-        int ndims,
-        int nargs,
-        float eps,
-        float max_error_abs,
-        float max_error_rel) {
-
-    static int n_threads = -1;
-    if (n_threads < 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
-
-        const char *env = getenv("GGML_N_THREADS");
-        if (env) {
-            n_threads = atoi(env);
-        }
-
-        printf("GGML_N_THREADS = %d\n", n_threads);
-    }
-
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(gf, f);
-    ggml_graph_cpy(gf, gb);
-    ggml_build_backward_expand(ctx0, gf, gb, false);
-
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    ggml_graph_reset  (gf);
-    ggml_set_f32      (f->grad, 1.0f);
-
-    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");
-
-    for (int i = 0; i < nargs; ++i) {
-        const int nelements = ggml_nelements(x[i]);
-        for (int k = 0; k < nelements; ++k) {
-            // compute gradient using finite differences
-            const float x0 = ggml_get_f32_1d(x[i], k);
-            const float xm = x0 - eps;
-            const float xp = x0 + eps;
-            ggml_set_f32_1d(x[i], k, xp);
-
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const double f0 = ggml_get_f32_1d(f, 0);
-
-            ggml_set_f32_1d(x[i], k, xm);
-
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const double f1 = ggml_get_f32_1d(f, 0);
-            const double g0 = (f0 - f1)/(2.0*(double) eps);
-
-            ggml_set_f32_1d(x[i], k, x0);
-
-            // compute gradient using backward graph
-            ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-
-            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-            const double g1 = ggml_get_f32_1d(x[i]->grad, k);
-
-            const double error_abs = fabs(g0 - g1);
-            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;
-
-            if (error_abs > max_error_abs || error_rel > max_error_rel) {
-                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
-                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
-                //assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-// TODO: clean-up this ..
-static bool check_mat_mul(
-        const struct ggml_tensor * y,
-        const struct ggml_tensor * x0,
-        const struct ggml_tensor * x1) {
-    float * dst  = (float *) y->data;
-    float * src0 = (float *) x0->data;
-    float * src1 = (float *) x1->data;
-
-    const int nc = x0->ne[1];
-    const int nr = x1->ne[1];
-    const int nk = x0->ne[0];
-
-    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
-
-    GGML_PRINT_DEBUG("x0:\n");
-    for (int j = 0; j < x0->ne[1]; ++j) {
-        for (int i = 0; i < x0->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("x1:\n");
-    for (int j = 0; j < x1->ne[1]; ++j) {
-        for (int i = 0; i < x1->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
-    for (int j = 0; j < y->ne[1]; ++j) {
-        for (int i = 0; i < y->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    for (int i = 0; i < nr; ++i) {
-        for (int j = 0; j < nc; ++j) {
-            float sum = 0.0f;
-
-            for (int k = 0; k < nk; ++k) {
-                sum += src0[j*nk + k]*src1[i*nk + k];
-            }
-
-            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
-                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
-                assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-#define NUM_PERMUTATIONS (4*3*2*1)
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 256*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    int64_t ne[4];
-
-    int all_permutations[4 * NUM_PERMUTATIONS];
-    {
-        int count = 0;
-        for (int ax0=0; ax0<4; ++ax0) {
-            for (int ax1=0; ax1<4; ++ax1) {
-                if (ax1 == ax0) continue;
-                for (int ax2=0; ax2<4; ++ax2) {
-                    if (ax2 == ax0) continue;
-                    if (ax2 == ax1) continue;
-                    for (int ax3=0; ax3<4; ++ax3) {
-                        if (ax3 == ax0) continue;
-                        if (ax3 == ax1) continue;
-                        if (ax3 == ax2) continue;
-                        assert(count < NUM_PERMUTATIONS);
-                        all_permutations[count*4+0] = ax0;
-                        all_permutations[count*4+1] = ax1;
-                        all_permutations[count*4+2] = ax2;
-                        all_permutations[count*4+3] = ax3;
-                        ++count;
-                    }
-                }
-            }
-        }
-    }
-
-    unsigned seed_iter = 1;
-
-    // original loop: 1000
-    int niter = 4;
-    const char *env = getenv("GGML_NLOOP");
-    if (env != NULL) {
-        niter = atoi(env);
-    }
-    if (argc > 1) {
-        niter = atoi(argv[1]);
-    }
-    for (int iter = 0; iter < niter; ++iter) {
-        srand(seed_iter);
-        seed_iter = rand();
-        unsigned seed = rand();
-
-        printf("test-grad0: iter:%d/%d\n", iter, niter);
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        get_random_dims(ne, 4);
-
-        struct ggml_tensor * x[MAX_NARGS];
-
-        // add f32
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
-            }
-        }
-
-        // add f16
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
-            }
-        }
-
-        // sub
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
-
-                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // mul
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
-
-                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // div
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
-
-                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
-            }
-        }
-
-        // sqr
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
-
-                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // sqrt
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
-
-                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
-            }
-        }
-
-        // log
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
-
-                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
-            }
-        }
-
-        // sum
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
-
-                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-
-        // sum_rows
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
-
-                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // mean, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
-
-                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // argmax
-        if (0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
-
-                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // repeat
-        {
-            srand(seed);
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
-
-                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // repeat back
-        {
-            srand(seed);
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
-
-                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // abs (finite differences do not work)
-        //{
-        //    const int nargs = 1;
-
-        //    for (int ndims = 1; ndims <= 2; ++ndims) {
-        //        for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-        //            ggml_set_param(ctx0, x[i]);
-        //        }
-
-        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
-
-        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
-        //    }
-        //}
-
-        // sgn
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
-
-                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // neg
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
-
-                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // step
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
-
-                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // tanh, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
-
-                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // mul_mat
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-                int max_nrep = (ndims >= 3) ? 2 : 1;
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
-                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
-                        {
-                            int64_t ne2[4];
-                            get_random_dims(ne2, 4);
-                            ne2[0] = ne[0];
-                            ne2[2] = nrep2 * ne[2];
-                            ne2[3] = nrep3 * ne[3];
-                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-
-                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                        struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                        if (ndims == 2) {
-                            // check_mat_mul does not support ndims > 2
-                            check_mat_mul(m, x[1], x[0]);
-                        }
-                    }
-                }
-            }
-        }
-
-        // elu, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
-
-                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // relu
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
-
-                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // gelu, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
-
-                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // silu
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
-
-#ifdef GGML_SILU_FP16
-                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
-#else
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-#endif
-            }
-        }
-
-        // rms_norm
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
-
-                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
-            }
-        }
-
-        // scale
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                const float s = -1.0f + 2.0f*frand();
-
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
-
-                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cpy f32
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cpy f16
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
-            }
-        }
-
-        // reshape (1d->nd)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // reshape (nd->1d)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 1d
-        {
-            srand(seed);
-            int64_t ne2[4] = { 1, 1, 1, 1 };
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 2d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 3d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 3);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 3);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                const int offset = offsets[0] + offsets[1] + offsets[2];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 4d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 4; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 4);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 4);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
-                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // set_1d
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
-
-                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // set_2d
-        {
-            srand(seed);
-            int64_t ne2[4];
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 1;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
-
-                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_1d
-        {
-            srand(seed);
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int k0 = irand(ggml_nelements(x[0]));
-                const int k1 = irand(ggml_nelements(x[0]));
-                const int i0 = MIN(k0, k1);
-                const int i1 = MAX(k0, k1);
-
-                const int offset = i0 * sizeof(float);
-                const int nelem  = i1 - i0;
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
-
-                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_2d
-        {
-            srand(seed);
-            int64_t ne2[4];
-            int64_t nb2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 2);
-                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 2);
-                }
-                const int count = ne2[0]*ne2[1];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
-
-                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_3d
-        {
-            srand(seed);
-            int64_t ne2[4] = {1,1,1,1};
-            int64_t nb2[4] = {0,0,0,0};
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 3);
-                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 3);
-                }
-                const int count = ne2[0]*ne2[1]*ne2[2];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-                nb2[2] = nb2[1]*ne2[1];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
-
-                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // permute
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_permute will set axes of dimensions below n_dims to 1.
-                // to make ggml_permute work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int p = irand(NUM_PERMUTATIONS);
-                const int ax0 = all_permutations[p*4+0];
-                const int ax1 = all_permutations[p*4+1];
-                const int ax2 = all_permutations[p*4+2];
-                const int ax3 = all_permutations[p*4+3];
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
-
-                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // transpose
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_transpose will set axes of dimensions below n_dims to 1.
-                // to make ggml_transpose work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
-
-                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // get_rows
-        {
-            srand(seed);
-            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
-            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
-            const int nargs = 1;
-            const int ndims = 2;
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
-
-            ggml_set_param(ctx0, x[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
-
-            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // diag_mask_inf
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // diag_mask_zero
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // softmax
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                float eps = 1e-6f;
-                // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
-                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
-                struct ggml_tensor * f = ggml_sum(ctx0,
-                                            ggml_log(ctx0,
-                                                ggml_add1(ctx0,
-                                                    ggml_scale(ctx0,
-                                                        ggml_soft_max(ctx0, x[0]),
-                                                        1.0f - eps),
-                                                    ggml_new_f32(ctx0, eps))));
-
-                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
-                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
-                // this may result in different gradients too finite differences.
-                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
-                // if only the table lookup causes gradients to differ this is acceptable.
-            }
-        }
-
-        // cross_entropy_loss
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
-                // the second argument to cross_entropy_loss must sum up to 1 for each row
-                int nr = ggml_nrows(x[1]);
-                int nc = ggml_nelements(x[1]) / nr;
-                for (int ir = 0; ir < nr; ++ir) {
-                    float sum = 0;
-                    for (int ic = 0; ic < nc; ++ic) {
-                        sum += ((float *) x[1]->data)[ic + ir*nc];
-                    }
-                    for (int ic = 0; ic < nc; ++ic) {
-                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
-                    }
-                }
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
-
-                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
-            }
-        }
-
-        // rope f32
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
-                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
-
-                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // rope f16
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
-                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
-
-                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // flash_attn f32
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int max_nrep = (ndims >= 3) ? 2 : 1;
-                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
-                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
-                        int64_t nek[4] = { D, M, B, ne[3] };
-                        int64_t nev[4] = { M, D, B, ne[3] };
-                        if (ndims == 2) {
-                            neq[2] = 1; neq[3] = 1;
-                            nek[2] = 1; nek[3] = 1;
-                            nev[2] = 1; nev[3] = 1;
-                        } else if (ndims == 3) {
-                            neq[3] = 1;
-                            nek[3] = 1;
-                            nev[3] = 1;
-                        }
-                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-                        ggml_set_param(ctx0, x[2]);
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // flash_attn f16, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                }
-            }
-        }
-        ggml_free(ctx0);
-    }
-
-    return 0;
-}
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
new file mode 100644
index 000000000..890608648
--- /dev/null
+++ b/tests/test-grammar-integration.cpp
@@ -0,0 +1,1307 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "unicode.h"
+#include "llama-grammar.h"
+#include "json-schema-to-grammar.h"
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+static llama_grammar * build_grammar(const std::string & grammar_str) {
+    return llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
+}
+
+static bool test_build_grammar_fails(const std::string & grammar_str) {
+    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
+    bool grammar_fails = false;
+    llama_grammar * grammar = build_grammar(grammar_str);
+    if (grammar != nullptr) {
+        fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
+    } else {
+        grammar_fails = true;
+        fprintf(stdout, "  ✅︎\n");
+    }
+    return grammar_fails;
+}
+
+static bool match_string(const std::string & input, llama_grammar * grammar) {
+    const auto cpts = unicode_cpts_from_utf8(input);
+
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    for (const auto & cpt : cpts) {
+        llama_grammar_accept(grammar, cpt);
+
+        if (stacks_cur.empty()) {
+            // no stacks means that the grammar failed to match at this point
+            return false;
+        }
+    }
+
+    for (const auto & stack : stacks_cur) {
+        if (stack.empty()) {
+            // An empty stack means that the grammar has been completed
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
+    fflush(stderr);
+
+    auto * grammar = build_grammar(grammar_str);
+
+    // Save the original grammar stacks so that we can reset after every new string we want to test
+    const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy
+
+    llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    fprintf(stderr, "  🔵 Valid strings:\n");
+
+    // Passing strings
+    for (const auto & test_string : passing_strings) {
+        fprintf(stderr, "    \"%s\" ", test_string.c_str());
+        fflush(stderr);
+
+        bool matched = match_string(test_string, grammar);
+
+        if (!matched) {
+            fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
+            // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
+            FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
+            if (grammar_file) {
+                fprintf(grammar_file, "%s", grammar_str.c_str());
+                fclose(grammar_file);
+            }
+
+            // DEBUG: Write the test string to test-grammar-integration.string.txt
+            FILE* string_file = fopen("test-grammar-integration.string.txt", "w");
+            if (string_file) {
+                fprintf(string_file, "%s", test_string.c_str());
+                fclose(string_file);
+            }
+
+            fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command:     ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n");
+        } else {
+            fprintf(stdout, "✅︎\n");
+        }
+
+        assert(matched);
+
+        // Reset the grammar stacks
+        stacks_cur = stacks_org;
+    }
+
+    fprintf(stderr, "  🟠 Invalid strings:\n");
+
+    // Failing strings
+    for (const auto & test_string : failing_strings) {
+        fprintf(stderr, "    \"%s\" ", test_string.c_str());
+        fflush(stderr);
+
+        bool matched = match_string(test_string, grammar);
+
+        if (matched) {
+            fprintf(stderr, "❌ (incorrectly matched)\n");
+        } else {
+            fprintf(stdout, "✅︎\n");
+        }
+        assert(!matched);
+
+        // Reset the grammar stacks
+        stacks_cur = stacks_org;
+    }
+
+    // Clean up allocated memory
+    llama_grammar_free_impl(grammar);
+}
+static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
+}
+static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str), true), passing_strings, failing_strings);
+}
+
+static void test_simple_grammar() {
+    test_schema(
+        "min 0",
+        R"""({
+            "type": "integer",
+            "minimum": 0
+        })""",
+        // Passing strings
+        {
+            "0",
+            "10",
+            "12",
+            "10000",
+        },
+        // Failing strings
+        {
+            "-1",
+            "-10",
+            "-10000",
+            "-100000000000000000000000000000000",
+            "100000000000000000000000000000000",
+            "00",
+            "01",
+            "-0",
+        }
+    );
+    test_schema(
+        "min 2",
+        // Schema
+        R"""({
+            "type": "integer",
+            "minimum": 2
+        })""",
+        // Passing strings
+        {
+            "2",
+            "3",
+            "4",
+            "10",
+            "20",
+            "1234567890000000",
+        },
+        // Failing strings
+        {
+            "0",
+            "1",
+            "-1",
+            "-100",
+            "0",
+            "1",
+            "01",
+            "02",
+            "12345678900000000",
+        }
+    );
+    test_schema(
+        "min 456",
+        R"""({
+            "type": "integer",
+            "minimum": 456
+        })""",
+        // Passing strings
+        {
+            "456",
+            "4560",
+            "457",
+            "460",
+            "500",
+        },
+        // Failing strings
+        {
+            "455",
+            "356",
+            "50",
+            "050",
+            "-1",
+            "-456",
+        }
+    );
+    test_schema(
+        "min -123",
+        R"""({
+            "type": "integer",
+            "minimum": -123
+        })""",
+        // Passing strings
+        {
+            "-123",
+            "-122",
+            "-11",
+            "-1",
+            "0",
+            "1",
+            "123",
+            "1234",
+            "2345",
+        },
+        // Failing strings
+        {
+            "-1234",
+            "-124",
+        }
+    );
+
+    test_schema(
+        "max 9999",
+        // Schema
+        R"""({
+            "type": "integer",
+            "maximum": 9999
+        })""",
+        // Passing strings
+        {
+            "-99999",
+            "0",
+            "9999",
+        },
+        // Failing strings
+        {
+            "10000",
+            "99991",
+        }
+    );
+    test_schema(
+        "max -9999",
+        // Schema
+        R"""({
+            "type": "integer",
+            "maximum": -9999
+        })""",
+        // Passing strings
+        {
+            "-10000",
+            "-9999",
+        },
+        // Failing strings
+        {
+            "-9998",
+            "0",
+            "9999",
+        }
+    );
+    test_schema(
+        "min 5 max 30",
+        // Schema
+        R"""({
+            "type": "integer",
+            "minimum": 5,
+            "maximum": 30
+        })""",
+        // Passing strings
+        {
+            "5",
+            "10",
+            "30",
+        },
+        // Failing strings
+        {
+            "05",
+            "4",
+            "-1",
+            "31",
+            "123",
+            "0123",
+        }
+    );
+    test_schema(
+        "min -1 max 1",
+        R"""({
+            "type": "integer",
+            "minimum": -1,
+            "maximum": 1
+        })""",
+        // Passing strings
+        {
+            "-1",
+            "0",
+            "1",
+        },
+        // Failing strings
+        {
+            "-11",
+            "-10",
+            "-2",
+            "2",
+            "10",
+            "11",
+        }
+    );
+    test_schema(
+        "min -123 max 42",
+        R"""({
+            "type": "integer",
+            "minimum": -123,
+            "maximum": 42
+        })""",
+        // Passing strings
+        {
+            "-123",
+            "-122",
+            "-13",
+            "-11",
+            "-2",
+            "-1",
+            "0",
+            "1",
+            "5",
+            "10",
+            "39",
+            "40",
+            "42",
+        },
+        // Failing strings
+        {
+            "-0123",
+            "-124",
+            "-1123",
+            "-200",
+            "43",
+            "123",
+            "0123",
+        }
+    );
+    test_schema(
+        "exclusive min / max",
+        // Schema
+        R"""({
+            "type": "integer",
+            "exclusiveMinimum": 0,
+            "exclusiveMaximum": 10000
+        })""",
+        // Passing strings
+        {
+            "1",
+            "9999",
+        },
+        // Failing strings
+        {
+            "0",
+            "01",
+            "10000",
+            "99999",
+        }
+    );
+
+    // Test case for a simple grammar
+    test_grammar(
+        "simple grammar",
+        R"""(
+            root ::= expr
+            expr ::= term ("+" term)*
+            term ::= number
+            number ::= [0-9]+)""",
+        // Passing strings
+        {
+            "42",
+            "1+2+3+4+5",
+            "123+456",
+        },
+        // Failing strings
+        {
+            "+",
+            "/ 3",
+            "1+2+3+4+5+",
+            "12a45",
+        }
+    );
+}
+
+static void test_complex_grammar() {
+    // Test case for a more complex grammar, with both failure strings and success strings
+    test_grammar(
+        "medium complexity grammar",
+        // Grammar
+        R"""(
+            root ::= expression
+            expression ::= term ws (("+"|"-") ws term)*
+            term ::= factor ws (("*"|"/") ws factor)*
+            factor ::= number | variable | "(" expression ")" | function-call
+            number ::= [0-9]+
+            variable ::= [a-zA-Z_][a-zA-Z0-9_]*
+            function-call ::= variable ws "(" (expression ("," ws expression)*)? ")"
+            ws ::= [ \t\n\r]?)""",
+        // Passing strings
+        {
+            "42",
+            "1*2*3*4*5",
+            "x",
+            "x+10",
+            "x1+y2",
+            "(a+b)*(c-d)",
+            "func()",
+            "func(x,y+2)",
+            "a*(b+c)-d/e",
+            "f(g(x),h(y,z))",
+            "x + 10",
+            "x1 + y2",
+            "(a + b) * (c - d)",
+            "func()",
+            "func(x, y + 2)",
+            "a * (b + c) - d / e",
+            "f(g(x), h(y, z))",
+            "123+456",
+            "123*456*789-123/456+789*123",
+            "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456"
+        },
+        // Failing strings
+        {
+            "+",
+            "/ 3x",
+            "x + + y",
+            "a * / b",
+            "func(,)",
+            "func(x y)",
+            "(a + b",
+            "x + y)",
+            "a + b * (c - d",
+            "42 +",
+            "x +",
+            "x + 10 +",
+            "(a + b) * (c - d",
+            "func(",
+            "func(x, y + 2",
+            "a * (b + c) - d /",
+            "f(g(x), h(y, z)",
+            "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
+        }
+    );
+}
+
+static void test_special_chars() {
+    // A collection of tests to exercise special characters such as "."
+    test_grammar(
+        "special characters",
+        // Grammar
+        R"""(
+            root ::= ... "abc" ...
+            )""",
+        // Passing strings
+        {
+            "abcabcabc",
+            "aaaabcccc",
+            // NOTE: Also ensures that multi-byte characters still count as a single character
+            "🔵🟠✅abc❌🟠🔵"
+        },
+        // Failing strings
+        {
+            "aaabcccc",
+            "aaaaabcccc",
+            "aaaabccc",
+            "aaaabccccc",
+            "🔵🟠✅❌abc❌✅🟠🔵",
+            "🔵🟠abc🟠🔵"
+        }
+    );
+}
+
+static void test_quantifiers() {
+    // A collection of tests to exercise * + and ? quantifiers
+
+    test_grammar(
+        "* quantifier",
+        // Grammar
+        R"""(root ::= "a"*)""",
+        // Passing strings
+        {
+            "",
+            "a",
+            "aaaaa",
+            "aaaaaaaaaaaaaaaaaa",
+            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+        },
+        // Failing strings
+        {
+            "b",
+            "ab",
+            "aab",
+            "ba",
+            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"
+        }
+    );
+    test_grammar(
+        "+ quantifier",
+        // Grammar
+        R"""(root ::= "a"+)""",
+        // Passing strings
+        {
+            "a",
+            "aaaaa",
+            "aaaaaaaaaaaaaaaaaa",
+            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+        },
+        // Failing strings
+        {
+            "",
+            "b",
+            "ab",
+            "aab",
+            "ba",
+            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"
+        }
+    );
+    test_grammar(
+        "? quantifier",
+        // Grammar
+        R"""(root ::= "a"?)""",
+        // Passing strings
+        {
+            "",
+            "a"
+        },
+        // Failing strings
+        {
+            "b",
+            "ab",
+            "aa",
+            "ba",
+        }
+    );
+    test_grammar(
+        "mixed quantifiers",
+        // Grammar
+        R"""(
+            root ::= cons+ vowel* cons? (vowel cons)*
+            vowel ::= [aeiouy]
+            cons ::= [bcdfghjklmnpqrstvwxyz]
+            )""",
+        // Passing strings
+        {
+            "yes",
+            "no",
+            "noyes",
+            "crwth",
+            "four",
+            "bryyyy",
+        },
+        // Failing strings
+        {
+            "yess",
+            "yesno",
+            "forty",
+            "catyyy",
+        }
+    );
+    test_grammar(
+        "simple exact repetition",
+        // Grammar
+        R"""(
+            root ::= [ab]{4}
+        )""",
+        // Passing strings
+        {
+            "aaaa",
+            "bbbb",
+            "abab",
+        },
+        // Failing strings
+        {
+            "a",
+            "b",
+            "aaaaa",
+        }
+    );
+    test_grammar(
+        "simple min repetition",
+        // Grammar
+        R"""(
+            root ::= [ab]{4,}
+        )""",
+        // Passing strings
+        {
+            "aaaa",
+            "aaaaab",
+            "bbbb",
+            "ababab",
+        },
+        // Failing strings
+        {
+            "",
+            "aba",
+        }
+    );
+    test_grammar(
+        "simple max repetition",
+        // Grammar
+        R"""(
+            root ::= [ab]{0,4}
+        )""",
+        // Passing strings
+        {
+            "",
+            "a",
+            "aa",
+            "aaa",
+            "aaab",
+        },
+        // Failing strings
+        {
+            "aaaaa",
+        }
+    );
+    test_grammar(
+        "min / max repetition",
+        // Grammar
+        R"""(
+            root ::= ("0x" [A-F0-9]{2} " "?){3,5}
+        )""",
+        // Passing strings
+        {
+            "0xFF 0x12 0xAB",
+            "0xFF 0x12 0xAB 0x00 0x00",
+        },
+        // Failing strings
+        {
+            "",
+            "0xFF",
+            "0xFF 0x12",
+            "0xFF 0x12 0xAB 0x00 0x00 0x00",
+        }
+    );
+}
+
+static void test_failure_missing_root() {
+    fprintf(stderr, "⚫ Testing missing root node:\n");
+    // Test case for a grammar that is missing a root rule
+    const std::string grammar_str = R"""(
+        rot ::= expr
+        expr ::= term ("+" term)*
+        term ::= number
+        number ::= [0-9]+)""";
+
+    llama_grammar_parser parsed_grammar;
+    parsed_grammar.parse(grammar_str.c_str());
+
+    // Ensure we parsed correctly
+    assert(!parsed_grammar.rules.empty());
+
+    // Ensure we do NOT have a root node
+    assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end());
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
+static void test_failure_missing_reference() {
+    fprintf(stderr, "⚫ Testing missing reference node:\n");
+
+    // Test case for a grammar that is missing a referenced rule
+    const std::string grammar_str =
+        R"""(root ::= expr
+        expr ::= term ("+" term)*
+        term ::= numero
+        number ::= [0-9]+)""";
+
+    fprintf(stderr, "    Expected error:  ");
+
+    llama_grammar_parser parsed_grammar;
+    parsed_grammar.parse(grammar_str.c_str());
+
+    // Ensure we did NOT parsed correctly
+    assert(parsed_grammar.rules.empty());
+
+    fprintf(stderr, "    End of expected error.\n");
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
+static void test_failure_left_recursion() {
+    fprintf(stderr, "⚫ Testing left recursion detection:\n");
+
+    // Test simple left recursion detection
+    const std::string simple_str = R"""(root ::= "a" | root "a")""";
+    assert(test_build_grammar_fails(simple_str));
+
+    // Test more complicated left recursion detection
+    const std::string medium_str = R"""(
+        root ::= asdf
+        asdf ::= "a" | asdf "a"
+        )""";
+    assert(test_build_grammar_fails(medium_str));
+
+    // Test even more complicated left recursion detection
+    const std::string hard_str = R"""(
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | asdf "d" | "e")""";
+    assert(test_build_grammar_fails(hard_str));
+
+    // Test yet even more complicated left recursion detection
+    const std::string hardest_str = R"""(
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | empty asdf "d" | "e"
+        empty ::= "blah" | )""";
+    assert(test_build_grammar_fails(hardest_str));
+
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
+static void test_json_schema() {
+    // Note that this is similar to the regular grammar tests,
+    //  but we convert each json schema to a grammar before parsing.
+    // Otherwise, this test structure is the same.
+
+    test_schema(
+        "empty schema (object)",
+        // Schema
+        R"""(
+            {}
+        )""",
+        // Passing strings
+        {
+            R"""({})""",
+            R"""({"foo": "bar"})""",
+        },
+        // Failing strings
+        {
+            "",
+            "[]",
+            "null",
+            R"""("")""",
+            "true",
+        }
+    );
+
+    test_schema(
+        "exotic formats (list)",
+        // Schema
+        R"""({
+            "items": [
+                { "format": "date" },
+                { "format": "uuid" },
+                { "format": "time" },
+                { "format": "date-time" }
+            ]
+        })""",
+        // Passing strings
+        {
+            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
+            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        }
+    );
+
+    test_schema(
+        "string",
+        // Schema
+        R"""({
+            "type": "string"
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("")""",
+        },
+        // Failing strings
+        {
+            R"""({})""",
+            R"""("foo": "bar")""",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 1",
+        // Schema
+        R"""({
+            "type": "string",
+            "minLength": 1
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""({})""",
+            R"""("foo": "bar")""",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 3",
+        // Schema
+        R"""({
+                "type": "string",
+                "minLength": 3
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("foobar")""",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""("f")""",
+            R"""("fo")""",
+        }
+    );
+
+    test_schema(
+        "string w/ max length",
+        // Schema
+        R"""({
+            "type": "string",
+            "maxLength": 3
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("")""",
+            R"""("f")""",
+            R"""("fo")""",
+        },
+        // Failing strings
+        {
+            R"""("foobar")""",
+        }
+    );
+
+    test_schema(
+        "string w/ min & max length",
+        // Schema
+        R"""({
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 4
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("f")""",
+            R"""("barf")""",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""("barfo")""",
+            R"""("foobar")""",
+        }
+    );
+
+    test_schema(
+        "boolean",
+        // Schema
+        R"""({
+            "type": "boolean"
+        })""",
+        // Passing strings
+        {
+            "true",
+            "false",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""("true")""",
+            R"""(True)""",
+            R"""(FALSE)""",
+        }
+    );
+
+    test_schema(
+        "integer",
+        // Schema
+        R"""({
+            "type": "integer"
+        })""",
+        // Passing strings
+        {
+            R"""(0)""",
+            R"""(12345)""",
+            R"""(1234567890123456)""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""(01)""",
+            R"""(007)""",
+            R"""(12345678901234567  )""",
+        }
+    );
+
+    test_schema(
+        "string const",
+        // Schema
+        R"""({
+            "const": "foo"
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+        },
+        // Failing strings
+        {
+            R"""(foo)""",
+            R"""("bar")""",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""({
+            "const": true
+        })""",
+        // Passing strings
+        {
+            R"""(true)""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""(foo)""",
+            R"""("true")""",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""({
+            "enum": ["red", "amber", "green", null, 42, ["foo"]]
+        })""",
+        // Passing strings
+        {
+            R"""("red")""",
+            R"""(null)""",
+            R"""(42)""",
+            R"""(["foo"])""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""(420)""",
+            R"""(true)""",
+            R"""(foo)""",
+        }
+    );
+
+    test_schema(
+        "simple pattern",
+        // Schema
+        R"""({
+            "pattern": "^[a-zA-Z0-9_-]*$"
+        })""",
+        // Passing strings
+        {
+            R"""("")""",
+            R"""("He_llo-12")""",
+        },
+        // Failing strings
+        {
+            R"""("!")""",
+            R"""("Hello World")""",
+        }
+    );
+
+    test_schema(
+        "pattern with escapes",
+        // Schema
+        R"""({
+            "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
+        })""",
+        // Passing strings
+        {
+            R"""("a^$.[]()|{}*+?b")""",
+        },
+        // Failing strings
+        {
+            R"""("ab")""",
+        }
+    );
+
+    test_schema(
+        "",
+        // Schema
+        R"""(
+            {
+                "type": ["array", "null"],
+                "items": { "type": "string" }
+            }
+        )""",
+        // Passing strings
+        {
+            "null",
+            "[]",
+            "[\"123\"]",
+            "[\"foo\", \"bar\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "[123]",
+            "\"foo\"",
+            "[\"foo\", 42]",
+        }
+    );
+
+    test_schema(
+        "min+max items",
+        // Schema
+        R"""({
+            "items": {
+                "type": ["number", "integer"]
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        // Passing strings
+        {
+            R"""([1, 2, 3])""",
+            R"""([1, 2, 3, 4])""",
+            R"""([1, 2, 3, 4, 5])""",
+        },
+        // Failing strings
+        {
+            R"""([1, 2])""",
+            R"""([1, 2, 3, 4, 5, 6])""",
+            R"""(1)""",
+        }
+    );
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            }
+        })""",
+        // Passing strings
+        {
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+            // Reorder properties
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "Additional properties default to false for generation, even though the spec says true.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+
+        }
+    );
+
+    test_schema(
+        "additional properties can't override other properties",
+        R"""({
+            "properties": {
+                "a": {"type": "integer"},
+                "b": {"type": "integer"}
+            },
+            "additionalProperties": true
+        })""",
+        // Passing strings
+        {
+            R"""({"a": 42})""",
+            R"""({"c": ""})""",
+            R"""({"a": 42, "c": ""})""",
+            R"""({"a_": ""})""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""({"a": ""})""",
+            R"""({"a": "", "b": ""})""",
+        }
+    );
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties, additionalProperties: true",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": true
+        })""",
+        // Passing strings
+        {
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By default, providing additional properties is valid"
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
+        }
+    );
+
+    // Additional properties: false
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": false
+        })""",
+        // Passing strings
+        {
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_type":"Avenue"})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Spaces are permitted around enum values
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+        },
+        // Failing strings
+        {
+            // Reorder properties
+            R"""({ "street_type": "Avenue", "number": 1600 })""",
+            // Add "direction"
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""",
+        }
+    );
+
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""({
+            "properties": {
+                "b": {"type": "string"},
+                "a": {"type": "string"},
+                "d": {"type": "string"},
+                "c": {"type": "string"}
+            },
+            "required": ["a", "b"],
+            "additionalProperties": false
+        })""",
+        // Passing strings
+        {
+            R"""({"b": "foo", "a": "bar"})""",
+            R"""({"b":"foo","a":"bar","d":"qux"})""",
+            R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""",
+        },
+        // Failing strings
+        {
+            R"""({"a": "foo", "b": "bar"})""",
+            R"""({"b": "bar"})""",
+            R"""({"a": "foo", "c": "baz"})""",
+            R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""",
+        }
+    );
+
+    // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties
+    test_schema(
+        "required props",
+        // Schema
+        R"""({
+            "$schema": "https://json-schema.org/draft/2020-12/schema",
+            "$id": "https://example.com/product.schema.json",
+            "title": "Product",
+            "description": "A product from Acme's catalog",
+            "type": "object",
+            "properties": {
+                "productId": {
+                "description": "The unique identifier for a product",
+                "type": "integer"
+                },
+                "productName": {
+                "description": "Name of the product",
+                "type": "string"
+                },
+                "price": {
+                "description": "The price of the product",
+                "type": "number",
+                "exclusiveMinimum": 0
+                },
+                "tags": {
+                "description": "Tags for the product",
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": true
+                },
+                "dimensions": {
+                "type": "object",
+                "properties": {
+                    "length": {
+                    "type": "number"
+                    },
+                    "width": {
+                    "type": "number"
+                    },
+                    "height": {
+                    "type": "number"
+                    }
+                },
+                "required": [ "length", "width", "height" ]
+                }
+            },
+            "required": [ "productId", "productName", "price" ]
+        })""",
+        // Passing strings
+        {
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""",
+        },
+        // Failing strings
+        {
+            R"""({})""", // Missing all required properties
+            R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties
+            // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement.
+            //  Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex.
+            // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""",
+            R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price)
+            R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId)
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order
+            // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement.
+            // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""",
+        }
+    );
+}
+
+int main() {
+    fprintf(stdout, "Running grammar integration tests...\n");
+    test_simple_grammar();
+    test_complex_grammar();
+    test_special_chars();
+    test_quantifiers();
+    test_failure_missing_root();
+    test_failure_missing_reference();
+    test_failure_left_recursion();
+    test_json_schema();
+    fprintf(stdout, "All tests passed.\n");
+    return 0;
+}
diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp
new file mode 100644
index 000000000..8b696006b
--- /dev/null
+++ b/tests/test-grammar-llguidance.cpp
@@ -0,0 +1,1140 @@
+#ifdef NDEBUG
+#    undef NDEBUG
+#endif
+
+#include "unicode.h"
+#include "sampling.h"
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+static const llama_vocab * vocab;
+
+static bool match_string(const std::string & input, llama_sampler * grammar) {
+    llama_sampler_reset(grammar);
+    auto tokens = common_tokenize(vocab, input, false, false);
+
+    auto n_vocab = llama_vocab_n_tokens(vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
+        cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
+    }
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+
+    for (const auto token : tokens) {
+        for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
+            cur[token_id].logit = 0.0f;
+        }
+        llama_sampler_apply(grammar, &tok_arr);
+        if (cur[token].logit < 0.0f) {
+            return false;
+        }
+        llama_sampler_accept(grammar, token);
+    }
+
+    // do we allow EOS at the end? if so the grammar is accepting
+
+    auto tok_eos = llama_vocab_eot(vocab);
+    if (tok_eos == LLAMA_TOKEN_NULL) {
+        tok_eos = llama_vocab_eos(vocab);
+    }
+
+    cur[tok_eos].logit = 0.0f;
+    llama_sampler_apply(grammar, &tok_arr);
+
+    return cur[tok_eos].logit >= 0.0f;
+}
+
+static void test(const std::string & test_desc, const std::string & grammar_str,
+                 const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
+    fflush(stderr);
+
+    auto * grammar = llama_sampler_init_llg(vocab, "lark", grammar_str.c_str());
+
+    fprintf(stderr, "  🔵 Valid strings:\n");
+
+    // Passing strings
+    for (const auto & test_string : passing_strings) {
+        fprintf(stderr, "    \"%s\" ", test_string.c_str());
+        fflush(stderr);
+
+        bool matched = match_string(test_string, grammar);
+
+        if (!matched) {
+            fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
+            // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
+            FILE * grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
+            if (grammar_file) {
+                fprintf(grammar_file, "%s", grammar_str.c_str());
+                fclose(grammar_file);
+            }
+
+            // DEBUG: Write the test string to test-grammar-integration.string.txt
+            FILE * string_file = fopen("test-grammar-integration.string.txt", "w");
+            if (string_file) {
+                fprintf(string_file, "%s", test_string.c_str());
+                fclose(string_file);
+            }
+
+            fprintf(stderr,
+                    "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following "
+                    "command:     ./llama-gbnf-validator test-grammar-integration.grammar.gbnf "
+                    "test-grammar-integration.string.txt\n\n");
+        } else {
+            fprintf(stdout, "✅︎\n");
+        }
+
+        assert(matched);
+    }
+
+    fprintf(stderr, "  🟠 Invalid strings:\n");
+
+    // Failing strings
+    for (const auto & test_string : failing_strings) {
+        fprintf(stderr, "    \"%s\" ", test_string.c_str());
+        fflush(stderr);
+
+        bool matched = match_string(test_string, grammar);
+
+        if (matched) {
+            fprintf(stderr, "❌ (incorrectly matched)\n");
+        } else {
+            fprintf(stdout, "✅︎\n");
+        }
+        assert(!matched);
+    }
+
+    llama_sampler_free(grammar);
+}
+
+static void test_grammar(const std::string & test_desc, const std::string & grammar_str,
+                         const std::vector<std::string> & passing_strings,
+                         const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
+}
+
+static void test_schema(const std::string & test_desc, const std::string & schema_str,
+                        const std::vector<std::string> & passing_strings,
+                        const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Schema: " + schema_str, "%llguidance {}\nstart: %json " + schema_str, passing_strings,
+         failing_strings);
+}
+
+static void test_simple_grammar() {
+    test_schema("min 0",
+                R"""({
+            "type": "integer",
+            "minimum": 0
+        })""",
+                // Passing strings
+                {
+                    "0",
+                    "10",
+                    "12",
+                    "10000",
+                },
+                // Failing strings
+                {
+                    "-1",
+                    "-10",
+                    "-10000",
+                    "-100000000000000000000000000000000",
+                    // "100000000000000000000000000000000",
+                    "00",
+                    "01",
+                    "-0",
+                });
+    test_schema("min 2",
+                // Schema
+                R"""({
+            "type": "integer",
+            "minimum": 2
+        })""",
+                // Passing strings
+                {
+                    "2",
+                    "3",
+                    "4",
+                    "10",
+                    "20",
+                    "1234567890000000",
+                },
+                // Failing strings
+                {
+                    "0", "1", "-1", "-100", "0", "1", "01", "02",
+                    // "12345678900000000",
+                });
+    test_schema("min 456",
+                R"""({
+            "type": "integer",
+            "minimum": 456
+        })""",
+                // Passing strings
+                {
+                    "456",
+                    "4560",
+                    "457",
+                    "460",
+                    "500",
+                },
+                // Failing strings
+                {
+                    "455",
+                    "356",
+                    "50",
+                    "050",
+                    "-1",
+                    "-456",
+                });
+    test_schema("min -123",
+                R"""({
+            "type": "integer",
+            "minimum": -123
+        })""",
+                // Passing strings
+                {
+                    "-123",
+                    "-122",
+                    "-11",
+                    "-1",
+                    "0",
+                    "1",
+                    "123",
+                    "1234",
+                    "2345",
+                },
+                // Failing strings
+                {
+                    "-1234",
+                    "-124",
+                });
+
+    test_schema("max 9999",
+                // Schema
+                R"""({
+            "type": "integer",
+            "maximum": 9999
+        })""",
+                // Passing strings
+                {
+                    "-99999",
+                    "0",
+                    "9999",
+                },
+                // Failing strings
+                {
+                    "10000",
+                    "99991",
+                });
+    test_schema("max -9999",
+                // Schema
+                R"""({
+            "type": "integer",
+            "maximum": -9999
+        })""",
+                // Passing strings
+                {
+                    "-10000",
+                    "-9999",
+                },
+                // Failing strings
+                {
+                    "-9998",
+                    "0",
+                    "9999",
+                });
+    test_schema("min 5 max 30",
+                // Schema
+                R"""({
+            "type": "integer",
+            "minimum": 5,
+            "maximum": 30
+        })""",
+                // Passing strings
+                {
+                    "5",
+                    "10",
+                    "30",
+                },
+                // Failing strings
+                {
+                    "05",
+                    "4",
+                    "-1",
+                    "31",
+                    "123",
+                    "0123",
+                });
+    test_schema("min -1 max 1",
+                R"""({
+            "type": "integer",
+            "minimum": -1,
+            "maximum": 1
+        })""",
+                // Passing strings
+                {
+                    "-1",
+                    "0",
+                    "1",
+                },
+                // Failing strings
+                {
+                    "-11",
+                    "-10",
+                    "-2",
+                    "2",
+                    "10",
+                    "11",
+                });
+    test_schema("min -123 max 42",
+                R"""({
+            "type": "integer",
+            "minimum": -123,
+            "maximum": 42
+        })""",
+                // Passing strings
+                {
+                    "-123",
+                    "-122",
+                    "-13",
+                    "-11",
+                    "-2",
+                    "-1",
+                    "0",
+                    "1",
+                    "5",
+                    "10",
+                    "39",
+                    "40",
+                    "42",
+                },
+                // Failing strings
+                {
+                    "-0123",
+                    "-124",
+                    "-1123",
+                    "-200",
+                    "43",
+                    "123",
+                    "0123",
+                });
+    test_schema("exclusive min / max",
+                // Schema
+                R"""({
+            "type": "integer",
+            "exclusiveMinimum": 0,
+            "exclusiveMaximum": 10000
+        })""",
+                // Passing strings
+                {
+                    "1",
+                    "9999",
+                },
+                // Failing strings
+                {
+                    "0",
+                    "01",
+                    "10000",
+                    "99999",
+                });
+
+    // Test case for a simple grammar
+    test_grammar("simple grammar",
+                 R"""(
+            start: expr
+            expr: term ("+" term)*
+            term: number
+            number: /[0-9]+/ )""",
+                 // Passing strings
+                 {
+                     "42",
+                     "1+2+3+4+5",
+                     "123+456",
+                 },
+                 // Failing strings
+                 {
+                     "+",
+                     "/ 3",
+                     "1+2+3+4+5+",
+                     "12a45",
+                 });
+}
+
+static void test_complex_grammar() {
+    // Test case for a more complex grammar, with both failure strings and success strings
+    test_grammar("medium complexity grammar",
+                 // Grammar
+                 R"""(
+            start: expression
+            expression: term ws (("+"|"-") ws term)*
+            term: factor ws (("*"|"/") ws factor)*
+            factor: number | variable | "(" expression ")" | function-call
+            number: /[0-9]+/
+            variable: /[a-zA-Z_][a-zA-Z0-9_]*/
+            function-call: variable ws "(" (expression ("," ws expression)*)? ")"
+            ws: /[ \t\n\r]?/ )""",
+                 // Passing strings
+                 { "42",
+                   "1*2*3*4*5",
+                   "x",
+                   "x+10",
+                   "x1+y2",
+                   "(a+b)*(c-d)",
+                   "func()",
+                   "func(x,y+2)",
+                   "a*(b+c)-d/e",
+                   "f(g(x),h(y,z))",
+                   "x + 10",
+                   "x1 + y2",
+                   "(a + b) * (c - d)",
+                   "func()",
+                   "func(x, y + 2)",
+                   "a * (b + c) - d / e",
+                   "f(g(x), h(y, z))",
+                   "123+456",
+                   "123*456*789-123/456+789*123",
+                   "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456" },
+                 // Failing strings
+                 {
+                     "+",
+                     "/ 3x",
+                     "x + + y",
+                     "a * / b",
+                     "func(,)",
+                     "func(x y)",
+                     "(a + b",
+                     "x + y)",
+                     "a + b * (c - d",
+                     "42 +",
+                     "x +",
+                     "x + 10 +",
+                     "(a + b) * (c - d",
+                     "func(",
+                     "func(x, y + 2",
+                     "a * (b + c) - d /",
+                     "f(g(x), h(y, z)",
+                     "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
+                 });
+}
+
+static void test_special_chars() {
+    // A collection of tests to exercise special characters such as "."
+    test_grammar("special characters",
+                 // Grammar
+                 R"""(
+            start: /.../ "abc" /.../
+            )""",
+                 // Passing strings
+                 { "abcabcabc", "aaaabcccc",
+                   // NOTE: Also ensures that multi-byte characters still count as a single character
+                   "🔵🟠✅abc❌🟠🔵" },
+                 // Failing strings
+                 { "aaabcccc", "aaaaabcccc", "aaaabccc", "aaaabccccc", "🔵🟠✅❌abc❌✅🟠🔵", "🔵🟠abc🟠🔵" });
+}
+
+static void test_quantifiers() {
+    // A collection of tests to exercise * + and ? quantifiers
+
+    test_grammar(
+        "* quantifier",
+        // Grammar
+        R"""(start: "a"*)""",
+        // Passing strings
+        { "", "a", "aaaaa", "aaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" },
+        // Failing strings
+        { "b", "ab", "aab", "ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" });
+    test_grammar(
+        "+ quantifier",
+        // Grammar
+        R"""(start: "a"+)""",
+        // Passing strings
+        { "a", "aaaaa", "aaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" },
+        // Failing strings
+        { "", "b", "ab", "aab", "ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" });
+    test_grammar("? quantifier",
+                 // Grammar
+                 R"""(start: "a"?)""",
+                 // Passing strings
+                 { "", "a" },
+                 // Failing strings
+                 {
+                     "b",
+                     "ab",
+                     "aa",
+                     "ba",
+                 });
+    test_grammar("mixed quantifiers",
+                 // Grammar
+                 R"""(
+            start: cons+ vowel* cons? (vowel cons)*
+            vowel: /[aeiouy]/
+            cons: /[bcdfghjklmnpqrstvwxyz]/
+            )""",
+                 // Passing strings
+                 {
+                     "yes",
+                     "no",
+                     "noyes",
+                     "crwth",
+                     "four",
+                     "bryyyy",
+                 },
+                 // Failing strings
+                 {
+                     "yess",
+                     "yesno",
+                     "forty",
+                     "catyyy",
+                 });
+    test_grammar("simple exact repetition",
+                 // Grammar
+                 R"""(
+            start: /[ab]{4}/
+        )""",
+                 // Passing strings
+                 {
+                     "aaaa",
+                     "bbbb",
+                     "abab",
+                 },
+                 // Failing strings
+                 {
+                     "a",
+                     "b",
+                     "aaaaa",
+                 });
+    test_grammar("simple min repetition",
+                 // Grammar
+                 R"""(
+            start: /[ab]{4,}/
+        )""",
+                 // Passing strings
+                 {
+                     "aaaa",
+                     "aaaaab",
+                     "bbbb",
+                     "ababab",
+                 },
+                 // Failing strings
+                 {
+                     "",
+                     "aba",
+                 });
+    test_grammar("simple max repetition",
+                 // Grammar
+                 R"""(
+            start: /[ab]{0,4}/
+        )""",
+                 // Passing strings
+                 {
+                     "",
+                     "a",
+                     "aa",
+                     "aaa",
+                     "aaab",
+                 },
+                 // Failing strings
+                 {
+                     "aaaaa",
+                 });
+    // test_grammar("min / max repetition",
+    //              // Grammar
+    //              R"""(
+    //         start: ("0x" /[A-F0-9]{2}/ " "?){3,5}
+    //     )""",
+    //              // Passing strings
+    //              {
+    //                  "0xFF 0x12 0xAB",
+    //                  "0xFF 0x12 0xAB 0x00 0x00",
+    //              },
+    //              // Failing strings
+    //              {
+    //                  "",
+    //                  "0xFF",
+    //                  "0xFF 0x12",
+    //                  "0xFF 0x12 0xAB 0x00 0x00 0x00",
+    //              });
+}
+
+static void test_json_schema() {
+    // Note that this is similar to the regular grammar tests,
+    //  but we convert each json schema to a grammar before parsing.
+    // Otherwise, this test structure is the same.
+
+    test_schema("empty schema (object)",
+                // Schema
+                R"""(
+            {"type":"object"}
+        )""",
+                // Passing strings
+                {
+                    R"""({})""",
+                    R"""({"foo": "bar"})""",
+                },
+                // Failing strings
+                {
+                    "",
+                    "[]",
+                    "null",
+                    R"""("")""",
+                    "true",
+                });
+
+    test_schema(
+        "exotic formats (list)",
+        // Schema
+        R"""({
+            "items": [
+                { "format": "date" },
+                { "format": "uuid" },
+                { "format": "time" },
+                { "format": "date-time" }
+            ]
+        })""",
+        // Passing strings
+        {
+            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
+            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        });
+
+    test_schema("string",
+                // Schema
+                R"""({
+            "type": "string"
+        })""",
+                // Passing strings
+                {
+                    R"""("foo")""",
+                    R"""("bar")""",
+                    R"""("")""",
+                },
+                // Failing strings
+                {
+                    R"""({})""",
+                    R"""("foo": "bar")""",
+                });
+
+    test_schema("string w/ min length 1",
+                // Schema
+                R"""({
+            "type": "string",
+            "minLength": 1
+        })""",
+                // Passing strings
+                {
+                    R"""("foo")""",
+                    R"""("bar")""",
+                },
+                // Failing strings
+                {
+                    R"""("")""",
+                    R"""({})""",
+                    R"""("foo": "bar")""",
+                });
+
+    test_schema("string w/ min length 3",
+                // Schema
+                R"""({
+                "type": "string",
+                "minLength": 3
+        })""",
+                // Passing strings
+                {
+                    R"""("foo")""",
+                    R"""("bar")""",
+                    R"""("foobar")""",
+                },
+                // Failing strings
+                {
+                    R"""("")""",
+                    R"""("f")""",
+                    R"""("fo")""",
+                });
+
+    test_schema("string w/ max length",
+                // Schema
+                R"""({
+            "type": "string",
+            "maxLength": 3
+        })""",
+                // Passing strings
+                {
+                    R"""("foo")""",
+                    R"""("bar")""",
+                    R"""("")""",
+                    R"""("f")""",
+                    R"""("fo")""",
+                },
+                // Failing strings
+                {
+                    R"""("foobar")""",
+                });
+
+    test_schema("string w/ min & max length",
+                // Schema
+                R"""({
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 4
+        })""",
+                // Passing strings
+                {
+                    R"""("foo")""",
+                    R"""("bar")""",
+                    R"""("f")""",
+                    R"""("barf")""",
+                },
+                // Failing strings
+                {
+                    R"""("")""",
+                    R"""("barfo")""",
+                    R"""("foobar")""",
+                });
+
+    test_schema("boolean",
+                // Schema
+                R"""({
+            "type": "boolean"
+        })""",
+                // Passing strings
+                {
+                    "true",
+                    "false",
+                },
+                // Failing strings
+                {
+                    R"""("")""",
+                    R"""("true")""",
+                    R"""(True)""",
+                    R"""(FALSE)""",
+                });
+
+    test_schema("integer",
+                // Schema
+                R"""({
+            "type": "integer"
+        })""",
+                // Passing strings
+                {
+                    R"""(0)""",
+                    R"""(12345)""",
+                    R"""(1234567890123456)""",
+                },
+                // Failing strings
+                {
+                    R"""()""",
+                    R"""(01)""",
+                    R"""(007)""",
+                    R"""(12345678901234567  )""",
+                });
+
+    test_schema("string const",
+                // Schema
+                R"""({
+            "const": "foo"
+        })""",
+                // Passing strings
+                {
+                    R"""("foo")""",
+                },
+                // Failing strings
+                {
+                    R"""(foo)""",
+                    R"""("bar")""",
+                });
+
+    test_schema("non-string const",
+                // Schema
+                R"""({
+            "const": true
+        })""",
+                // Passing strings
+                {
+                    R"""(true)""",
+                },
+                // Failing strings
+                {
+                    R"""()""",
+                    R"""(foo)""",
+                    R"""("true")""",
+                });
+
+    test_schema("non-string const",
+                // Schema
+                R"""({
+            "enum": ["red", "amber", "green", null, 42, ["foo"]]
+        })""",
+                // Passing strings
+                {
+                    R"""("red")""",
+                    R"""(null)""",
+                    R"""(42)""",
+                    R"""(["foo"])""",
+                },
+                // Failing strings
+                {
+                    R"""()""",
+                    R"""(420)""",
+                    R"""(true)""",
+                    R"""(foo)""",
+                });
+
+    test_schema("simple pattern",
+                // Schema
+                R"""({
+            "pattern": "^[a-zA-Z0-9_-]*$"
+        })""",
+                // Passing strings
+                {
+                    R"""("")""",
+                    R"""("He_llo-12")""",
+                },
+                // Failing strings
+                {
+                    R"""("!")""",
+                    R"""("Hello World")""",
+                });
+
+    test_schema("pattern with escapes",
+                // Schema
+                R"""({
+            "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
+        })""",
+                // Passing strings
+                {
+                    R"""("a^$.[]()|{}*+?b")""",
+                },
+                // Failing strings
+                {
+                    R"""("ab")""",
+                });
+
+    test_schema("",
+                // Schema
+                R"""(
+            {
+                "type": ["array", "null"],
+                "items": { "type": "string" }
+            }
+        )""",
+                // Passing strings
+                {
+                    "null",
+                    "[]",
+                    "[\"123\"]",
+                    "[\"foo\", \"bar\"]",
+                },
+                // Failing strings
+                {
+                    "",
+                    "[123]",
+                    "\"foo\"",
+                    "[\"foo\", 42]",
+                });
+
+    test_schema("min+max items",
+                // Schema
+                R"""({
+            "items": {
+                "type": ["number", "integer"]
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+                // Passing strings
+                {
+                    R"""([1, 2, 3])""",
+                    R"""([1, 2, 3, 4])""",
+                    R"""([1, 2, 3, 4, 5])""",
+                    // this is in fact correct; keyword do not apply if the type is wrong
+                    R"""(1)""",
+                },
+                // Failing strings
+                {
+                    R"""([1, 2])""",
+                    R"""([1, 2, 3, 4, 5, 6])""",
+                });
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema("object properties",
+                // Schema
+                R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": false
+        })""",
+                // Passing strings
+                {
+                    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+                    // "By default, leaving out properties is valid"
+                    R"""({ "street_name": "Pennsylvania" })""",
+                    R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+                    // "By extension, even an empty object is valid"
+                    R"""({})""",
+                    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+                },
+                // Failing strings
+                {
+                    // Change datatype from number to string
+                    R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+                    // Reorder properties
+                    R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+                    // Reorder properties
+                    R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+                    // Additional properties set to false
+                    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+
+                });
+
+    test_schema("additional properties can't override other properties",
+                R"""({
+            "properties": {
+                "a": {"type": "integer"},
+                "b": {"type": "integer"}
+            },
+            "additionalProperties": true
+        })""",
+                // Passing strings
+                {
+                    R"""({"a": 42})""",
+                    R"""({"c": ""})""",
+                    R"""({"a": 42, "c": ""})""",
+                    R"""({"a_": ""})""",
+                },
+                // Failing strings
+                {
+                    R"""()""",
+                    R"""({"a": ""})""",
+                    R"""({"a": "", "b": ""})""",
+                });
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema("object properties, additionalProperties: true",
+                // Schema
+                R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": true
+        })""",
+                // Passing strings
+                {
+                    // "By extension, even an empty object is valid"
+                    R"""({})""",
+                    R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+                    // "By default, leaving out properties is valid"
+                    R"""({ "street_name": "Pennsylvania" })""",
+                    R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+                    // "By default, providing additional properties is valid"
+                    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+                    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+                },
+                // Failing strings
+                {
+                    // Change datatype from number to string
+                    R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+                    // Reorder properties
+                    R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
+                });
+
+    // Additional properties: false
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": false
+        })""",
+        // Passing strings
+        {
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_type":"Avenue"})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Spaces are permitted around enum values
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+        },
+        // Failing strings
+        {
+            // Reorder properties
+            R"""({ "street_type": "Avenue", "number": 1600 })""",
+            // Add "direction"
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""",
+        });
+
+    test_schema("required + optional props each in original order",
+                // Schema
+                R"""({
+            "properties": {
+                "b": {"type": "string"},
+                "a": {"type": "string"},
+                "d": {"type": "string"},
+                "c": {"type": "string"}
+            },
+            "required": ["a", "b"],
+            "additionalProperties": false
+        })""",
+                // Passing strings
+                {
+                    R"""({"b": "foo", "a": "bar"})""",
+                    R"""({"b":"foo","a":"bar","d":"qux"})""",
+                    R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""",
+                },
+                // Failing strings
+                {
+                    R"""({"a": "foo", "b": "bar"})""",
+                    R"""({"b": "bar"})""",
+                    R"""({"a": "foo", "c": "baz"})""",
+                    R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""",
+                });
+
+    // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties
+    test_schema(
+        "required props",
+        // Schema
+        R"""({
+            "$schema": "https://json-schema.org/draft/2020-12/schema",
+            "$id": "https://example.com/product.schema.json",
+            "title": "Product",
+            "description": "A product from Acme's catalog",
+            "type": "object",
+            "properties": {
+                "productId": {
+                "description": "The unique identifier for a product",
+                "type": "integer"
+                },
+                "productName": {
+                "description": "Name of the product",
+                "type": "string"
+                },
+                "price": {
+                "description": "The price of the product",
+                "type": "number",
+                "exclusiveMinimum": 0
+                },
+                "tags": {
+                "description": "Tags for the product",
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "DISABLED_uniqueItems": true
+                },
+                "dimensions": {
+                "type": "object",
+                "properties": {
+                    "length": {
+                    "type": "number"
+                    },
+                    "width": {
+                    "type": "number"
+                    },
+                    "height": {
+                    "type": "number"
+                    }
+                },
+                "required": [ "length", "width", "height" ]
+                }
+            },
+            "required": [ "productId", "productName", "price" ]
+        })""",
+        // Passing strings
+        {
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""",
+        },
+        // Failing strings
+        {
+            R"""({})""",  // Missing all required properties
+            R"""({"productName": "A green door", "price": 12.50, "productId": 1})""",  // Out of order properties
+            // `exclusiveMinimum` is OK for llg
+            R"""({"productId": 1, "productName": "A green door", "price": -12.50})""",
+            R"""({"productId": 1, "productName": "A green door"})""",  // Missing required property (price)
+            R"""({"productName": "A green door", "price": 12.50})""",  // Missing required property (productId)
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""",  // tags is empty, but minItems is 1
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""",  // Tags and dimensions are out of order
+            // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement.
+            // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""",
+        });
+}
+
+int main(int argc, const char ** argv) {
+    fprintf(stdout, "Running llguidance integration tests...\n");
+
+    if (argc != 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const char * vocab_file = argv[1];
+
+    fprintf(stderr, "reading vocab from: '%s'\n", vocab_file);
+
+    llama_model *   model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_model_load_from_file(vocab_file, mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, vocab_file);
+            return 1;
+        }
+
+        // needed?
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_init_from_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, vocab_file);
+            llama_model_free(model);
+            return 1;
+        }
+    }
+
+    vocab = llama_model_get_vocab(model);
+
+    test_simple_grammar();
+    test_complex_grammar();
+    test_special_chars();
+    test_quantifiers();
+    test_json_schema();
+    fprintf(stdout, "All tests passed.\n");
+    return 0;
+}
diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp
index 91939e276..259172d99 100644
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@@ -3,32 +3,84 @@
 #endif
 
 #include "llama.h"
-#include "grammar-parser.h"
+#include "llama-grammar.h"
 
 #include <cassert>
 
-int main()
-{
-    grammar_parser::parse_state parsed_grammar;
+static const char * type_str(llama_gretype type) {
+    switch (type) {
+        case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR";
+        case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT";
+        case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT";
+        case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER";
+        case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF";
+        case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT";
+        case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END";
+        default: return "?";
+    }
+}
 
-    const char *grammar_bytes = R"""(root  ::= (expr "=" term "\n")+
-expr  ::= term ([-+*/] term)*
-term  ::= [0-9]+)""";
+static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) {
+    uint32_t index = 0;
+    llama_grammar_parser parsed_grammar;
+    parsed_grammar.parse(grammar_bytes);
 
-    parsed_grammar = grammar_parser::parse(grammar_bytes);
+    std::map<uint32_t, std::string> symbol_names;
+    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
+        symbol_names[it->second] = it->first;
+    }
 
-    std::vector<std::pair<std::string, uint32_t>> expected = {
-        {"expr", 2},
-        {"expr_5", 5},
-        {"expr_6", 6},
-        {"root", 0},
-        {"root_1", 1},
-        {"root_4", 4},
-        {"term", 3},
-        {"term_7", 7},
+    auto print_all = [&]() {
+        fprintf(stderr, "    verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes);
+        for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
+            fprintf(stderr, "        {\"%s\", %u},\n", it->first.c_str(), it->second);
+        }
+        fprintf(stderr, "    }, {\n");
+        for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) {
+            fprintf(stderr, "        // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule);
+            auto & rule = parsed_grammar.rules[i_rule];
+            for (uint32_t i = 0; i < rule.size(); i++) {
+                std::string rule_str;
+                fprintf(stderr, "        {%s, ", type_str(rule[i].type));
+                if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT ||
+                    rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+                    char c = rule[i].value;
+                    if (c == '\n') {
+                        fprintf(stderr, "'\\n'");
+                    } else if (c == '\t') {
+                        fprintf(stderr, "'\\t'");
+                    } else if (c == '\r') {
+                        fprintf(stderr, "'\\r'");
+                    } else if (c == '\0') {
+                        fprintf(stderr, "'\\0'");
+                    } else {
+                        fprintf(stderr, "'%c'", c);
+                    }
+                } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) {
+                    fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value);
+                } else {
+                    fprintf(stderr, "%u", rule[i].value);
+                }
+                fprintf(stderr, "},\n");
+            }
+        }
+        fprintf(stderr, "    });\n");
     };
 
-    uint32_t index = 0;
+    if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) {
+        print_all();
+        fprintf(stderr, "\n");
+        return;
+    }
+
+    fprintf(stderr, "Testing grammar:%s\n", grammar_bytes);
+
+    if (parsed_grammar.symbol_ids.size() != expected.size()) {
+        fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
+        print_all();
+        assert(parsed_grammar.symbol_ids.size() == expected.size());
+    }
+
     for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
     {
         std::string key = it->first;
@@ -38,51 +90,18 @@ term  ::= [0-9]+)""";
         // pretty print error message before asserting
         if (expected_pair.first != key || expected_pair.second != value)
         {
+            fprintf(stderr, "index: %u\n", index);
             fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
             fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
             fprintf(stderr, "expected_pair != actual_pair\n");
+            fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
+            print_all();
         }
 
         assert(expected_pair.first == key && expected_pair.second == value);
 
         index++;
     }
-    std::vector<llama_grammar_element> expected_rules = {
-        {LLAMA_GRETYPE_RULE_REF, 4},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 2},
-        {LLAMA_GRETYPE_CHAR, 61},
-        {LLAMA_GRETYPE_RULE_REF, 3},
-        {LLAMA_GRETYPE_CHAR, 10},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 3},
-        {LLAMA_GRETYPE_RULE_REF, 6},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 7},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 1},
-        {LLAMA_GRETYPE_RULE_REF, 4},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_RULE_REF, 1},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 45},
-        {LLAMA_GRETYPE_CHAR_ALT, 43},
-        {LLAMA_GRETYPE_CHAR_ALT, 42},
-        {LLAMA_GRETYPE_CHAR_ALT, 47},
-        {LLAMA_GRETYPE_RULE_REF, 3},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 5},
-        {LLAMA_GRETYPE_RULE_REF, 6},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 48},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-        {LLAMA_GRETYPE_RULE_REF, 7},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_CHAR, 48},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-        {LLAMA_GRETYPE_END, 0},
-    };
 
     index = 0;
     for (auto rule : parsed_grammar.rules)
@@ -97,28 +116,307 @@ term  ::= [0-9]+)""";
             if (expected_element.type != element.type || expected_element.value != element.value)
             {
                 fprintf(stderr, "index: %u\n", index);
-                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
+                fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value);
+                fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value);
                 fprintf(stderr, "expected_element != actual_element\n");
+                fprintf(stderr, "all elements:\n");
+                fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
+                print_all();
             }
 
             assert(expected_element.type == element.type && expected_element.value == element.value);
             index++;
         }
     }
+}
 
-    const char *longer_grammar_bytes = R"""(
-    root  ::= (expr "=" ws term "\n")+
-    expr  ::= term ([-+*/] term)*
-    term  ::= ident | num | "(" ws expr ")" ws
-    ident ::= [a-z] [a-z0-9_]* ws
-    num   ::= [0-9]+ ws
-    ws    ::= [ \t\n]*
-    )""";
+static void verify_failure(const char * grammar_bytes) {
+    fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes);
+    llama_grammar_parser result;
+    result.parse(grammar_bytes);
+    assert(result.rules.empty() && "should have failed");
+}
 
-    parsed_grammar = grammar_parser::parse(longer_grammar_bytes);
+int main()
+{
+    verify_failure(R"""(
+        root ::= "a"{,}"
+    )""");
 
-    expected = {
+    verify_failure(R"""(
+        root ::= "a"{,10}"
+    )""");
+
+    verify_parsing(R"""(
+        root  ::= "a"
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a" | [bdx-z] | [^1-3]
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR, 'b'},
+        {LLAMA_GRETYPE_CHAR_ALT, 'd'},
+        {LLAMA_GRETYPE_CHAR_ALT, 'x'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR_NOT, '1'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= a+
+        a     ::= "a"
+    )""", {
+        {"a", 1},
+        {"root", 0},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // a (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"+
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= a?
+        a     ::= "a"
+    )""", {
+        {"a", 1},
+        {"root", 0},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // a (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"?
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= a*
+        a     ::= "a"
+    )""", {
+        {"a", 1},
+        {"root", 0},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // a (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"*
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{2}
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{2,}
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{ 4}
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{2,4}
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= (expr "=" term "\n")+
+        expr  ::= term ([-+*/] term)*
+        term  ::= [0-9]+
+    )""", {
+        {"expr", 2},
+        {"expr_5", 5},
+        {"expr_6", 6},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_4", 4},
+        {"term", 3},
+        {"term_7", 7},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
+        {LLAMA_GRETYPE_CHAR, '='},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
+        {LLAMA_GRETYPE_CHAR, '\n'},
+        {LLAMA_GRETYPE_END, 0},
+        // expr (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
+        {LLAMA_GRETYPE_END, 0},
+        // term (index 3)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
+        {LLAMA_GRETYPE_END, 0},
+        // root_4 (index 4)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // expr_5 (index 5)
+        {LLAMA_GRETYPE_CHAR, '-'},
+        {LLAMA_GRETYPE_CHAR_ALT, '+'},
+        {LLAMA_GRETYPE_CHAR_ALT, '*'},
+        {LLAMA_GRETYPE_CHAR_ALT, '/'},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
+        {LLAMA_GRETYPE_END, 0},
+        // expr_6 (index 6)
+        {LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // term_7 (index 7)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= (expr "=" ws term "\n")+
+        expr  ::= term ([-+*/] term)*
+        term  ::= ident | num | "(" ws expr ")" ws
+        ident ::= [a-z] [a-z0-9_]* ws
+        num   ::= [0-9]+ ws
+        ws    ::= [ \t\n]*
+    )""", {
         {"expr", 2},
         {"expr_6", 6},
         {"expr_7", 7},
@@ -132,119 +430,88 @@ term  ::= [0-9]+)""";
         {"term", 4},
         {"ws", 3},
         {"ws_12", 12},
-    };
-
-    index = 0;
-    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
-    {
-        std::string key = it->first;
-        uint32_t value = it->second;
-        std::pair<std::string, uint32_t> expected_pair = expected[index];
-
-        // pretty print error message before asserting
-        if (expected_pair.first != key || expected_pair.second != value)
-        {
-            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
-            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
-            fprintf(stderr, "expected_pair != actual_pair\n");
-        }
-
-        assert(expected_pair.first == key && expected_pair.second == value);
-
-        index++;
-    }
-    expected_rules = {
-        {LLAMA_GRETYPE_RULE_REF, 5},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 2},
-        {LLAMA_GRETYPE_CHAR, 61},
-        {LLAMA_GRETYPE_RULE_REF, 3},
-        {LLAMA_GRETYPE_RULE_REF, 4},
-        {LLAMA_GRETYPE_CHAR, 10},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
+        {LLAMA_GRETYPE_CHAR, '='},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
+        {LLAMA_GRETYPE_CHAR, '\n'},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 4},
-        {LLAMA_GRETYPE_RULE_REF, 7},
+        // expr (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 12},
+        // ws (index 3)
+        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 8},
+        // term (index 4)
+        {LLAMA_GRETYPE_RULE_REF, /* ident */ 8},
         {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_RULE_REF, 9},
+        {LLAMA_GRETYPE_RULE_REF, /* num */ 9},
         {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_CHAR, 40},
-        {LLAMA_GRETYPE_RULE_REF, 3},
-        {LLAMA_GRETYPE_RULE_REF, 2},
-        {LLAMA_GRETYPE_CHAR, 41},
-        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_CHAR, '('},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
+        {LLAMA_GRETYPE_CHAR, ')'},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 1},
-        {LLAMA_GRETYPE_RULE_REF, 5},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_RULE_REF, 1},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 45},
-        {LLAMA_GRETYPE_CHAR_ALT, 43},
-        {LLAMA_GRETYPE_CHAR_ALT, 42},
-        {LLAMA_GRETYPE_CHAR_ALT, 47},
-        {LLAMA_GRETYPE_RULE_REF, 4},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 6},
-        {LLAMA_GRETYPE_RULE_REF, 7},
+        // root_5 (index 5)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
         {LLAMA_GRETYPE_ALT, 0},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 97},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
-        {LLAMA_GRETYPE_RULE_REF, 10},
-        {LLAMA_GRETYPE_RULE_REF, 3},
+        // expr_6 (index 6)
+        {LLAMA_GRETYPE_CHAR, '-'},
+        {LLAMA_GRETYPE_CHAR_ALT, '+'},
+        {LLAMA_GRETYPE_CHAR_ALT, '*'},
+        {LLAMA_GRETYPE_CHAR_ALT, '/'},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_RULE_REF, 11},
-        {LLAMA_GRETYPE_RULE_REF, 3},
-        {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 97},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
-        {LLAMA_GRETYPE_CHAR_ALT, 48},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-        {LLAMA_GRETYPE_CHAR_ALT, 95},
-        {LLAMA_GRETYPE_RULE_REF, 10},
+        // expr_7 (index 7)
+        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
         {LLAMA_GRETYPE_ALT, 0},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 48},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-        {LLAMA_GRETYPE_RULE_REF, 11},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_CHAR, 48},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+        // ident (index 8)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
+        {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
         {LLAMA_GRETYPE_END, 0},
-        {LLAMA_GRETYPE_CHAR, 32},
-        {LLAMA_GRETYPE_CHAR_ALT, 9},
-        {LLAMA_GRETYPE_CHAR_ALT, 10},
-        {LLAMA_GRETYPE_RULE_REF, 12},
+        // num (index 9)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_END, 0},
+        // ident_10 (index 10)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
+        {LLAMA_GRETYPE_CHAR_ALT, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_CHAR_ALT, '_'},
+        {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
         {LLAMA_GRETYPE_ALT, 0},
         {LLAMA_GRETYPE_END, 0},
-    };
-
-    index = 0;
-    for (auto rule : parsed_grammar.rules)
-    {
-        // compare rule to expected rule
-        for (uint32_t i = 0; i < rule.size(); i++)
-        {
-            llama_grammar_element element = rule[i];
-            llama_grammar_element expected_element = expected_rules[index];
-
-            // pretty print error message before asserting
-            if (expected_element.type != element.type || expected_element.value != element.value)
-            {
-                fprintf(stderr, "index: %u\n", index);
-                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
-                fprintf(stderr, "expected_element != actual_element\n");
-            }
-
-            assert(expected_element.type == element.type && expected_element.value == element.value);
-            index++;
-        }
-    }
+        // num_11 (index 11)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // ws_12 (index 12)
+        {LLAMA_GRETYPE_CHAR, ' '},
+        {LLAMA_GRETYPE_CHAR_ALT, '\t'},
+        {LLAMA_GRETYPE_CHAR_ALT, '\n'},
+        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
 
     return 0;
 }
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
new file mode 100755
index 000000000..f38994c92
--- /dev/null
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -0,0 +1,1288 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "json-schema-to-grammar.h"
+
+#include "llama-grammar.h"
+
+#include <cassert>
+#include <fstream>
+#include <sstream>
+#include <regex>
+
+static std::string trim(const std::string & source) {
+    std::string s(source);
+    s.erase(0,s.find_first_not_of(" \n\r\t"));
+    s.erase(s.find_last_not_of(" \n\r\t")+1);
+    return std::regex_replace(s, std::regex("(^|\n)[ \t]+"), "$1");
+}
+
+enum TestCaseStatus {
+    SUCCESS,
+    FAILURE
+};
+
+struct TestCase {
+    TestCaseStatus expected_status;
+    std::string name;
+    std::string schema;
+    std::string expected_grammar;
+
+    void _print_failure_header() const {
+        fprintf(stderr, "#\n# Test '%s' failed.\n#\n%s\n", name.c_str(), schema.c_str());
+    }
+    void verify(const std::string & actual_grammar) const {
+        if (trim(actual_grammar) != trim(expected_grammar)) {
+        _print_failure_header();
+        fprintf(stderr, "# EXPECTED:\n%s\n# ACTUAL:\n%s\n", expected_grammar.c_str(), actual_grammar.c_str());
+        assert(false);
+        }
+    }
+    void verify_expectation_parseable() const {
+        try {
+            llama_grammar_parser state;
+            state.parse(expected_grammar.c_str());
+            if (state.symbol_ids.find("root") == state.symbol_ids.end()) {
+                throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar);
+            }
+        } catch (const std::runtime_error & ex) {
+            _print_failure_header();
+            fprintf(stderr, "# GRAMMAR ERROR: %s\n", ex.what());
+            assert(false);
+        }
+    }
+    void verify_status(TestCaseStatus status) const {
+        if (status != expected_status) {
+            _print_failure_header();
+            fprintf(stderr, "# EXPECTED STATUS: %s\n", expected_status == SUCCESS ? "SUCCESS" : "FAILURE");
+            fprintf(stderr, "# ACTUAL STATUS: %s\n", status == SUCCESS ? "SUCCESS" : "FAILURE");
+            assert(false);
+        }
+    }
+};
+
+static void write(const std::string & file, const std::string & content) {
+    std::ofstream f;
+    f.open(file.c_str());
+    f << content.c_str();
+    f.close();
+}
+
+static std::string read(const std::string & file) {
+    std::ostringstream actuals;
+    actuals << std::ifstream(file.c_str()).rdbuf();
+    return actuals.str();
+}
+
+static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
+    fprintf(stderr, "#\n# Testing JSON schema conversion (%s)\n#\n", lang.c_str());
+    auto test = [&](const TestCase & tc) {
+        fprintf(stderr, "- %s%s\n", tc.name.c_str(), tc.expected_status == FAILURE ? " (failure expected)" : "");
+        runner(tc);
+    };
+
+    test({
+        SUCCESS,
+        "min 0",
+        R"""({
+            "type": "integer",
+            "minimum": 0
+        })""",
+        R"""(
+            root ::= ([0] | [1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 1",
+        R"""({
+            "type": "integer",
+            "minimum": 1
+        })""",
+        R"""(
+            root ::= ([1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 3",
+        R"""({
+            "type": "integer",
+            "minimum": 3
+        })""",
+        R"""(
+            root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 9",
+        R"""({
+            "type": "integer",
+            "minimum": 9
+        })""",
+        R"""(
+            root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 10",
+        R"""({
+            "type": "integer",
+            "minimum": 10
+        })""",
+        R"""(
+            root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 25",
+        R"""({
+            "type": "integer",
+            "minimum": 25
+        })""",
+        R"""(
+            root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max 30",
+        R"""({
+            "type": "integer",
+            "maximum": 30
+        })""",
+        R"""(
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -5",
+        R"""({
+            "type": "integer",
+            "minimum": -5
+        })""",
+        R"""(
+            root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -123",
+        R"""({
+            "type": "integer",
+            "minimum": -123
+        })""",
+        R"""(
+            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max -5",
+        R"""({
+            "type": "integer",
+            "maximum": -5
+        })""",
+        R"""(
+            root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max 1",
+        R"""({
+            "type": "integer",
+            "maximum": 1
+        })""",
+        R"""(
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "max 100",
+        R"""({
+            "type": "integer",
+            "maximum": 100
+        })""",
+        R"""(
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 0 max 23",
+        R"""({
+            "type": "integer",
+            "minimum": 0,
+            "maximum": 23
+        })""",
+        R"""(
+            root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 15 max 300",
+        R"""({
+            "type": "integer",
+            "minimum": 15,
+            "maximum": 300
+        })""",
+        R"""(
+            root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min 5 max 30",
+        R"""({
+            "type": "integer",
+            "minimum": 5,
+            "maximum": 30
+        })""",
+        R"""(
+            root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -123 max 42",
+        R"""({
+            "type": "integer",
+            "minimum": -123,
+            "maximum": 42
+        })""",
+        R"""(
+            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min -10 max 10",
+        R"""({
+            "type": "integer",
+            "minimum": -10,
+            "maximum": 10
+        })""",
+        R"""(
+            root ::= ("-" ([0-9] | "10") | [0-9] | "10") space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        FAILURE,
+        "unknown type",
+        R"""({
+            "type": "kaboom"
+        })""",
+        ""
+    });
+
+    test({
+        FAILURE,
+        "invalid type",
+        R"""({
+            "type": 123
+        })""",
+        ""
+    });
+
+    test({
+        SUCCESS,
+        "empty schema (object)",
+        "{}",
+        R"""(
+            array ::= "[" space ( value ("," space value)* )? "]" space
+            boolean ::= ("true" | "false") space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            null ::= "null" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            root ::= object
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+            value ::= object | array | string | number | boolean | null
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "exotic formats",
+        R"""({
+            "items": [
+                { "format": "date" },
+                { "format": "uuid" },
+                { "format": "time" },
+                { "format": "date-time" }
+            ]
+        })""",
+        R"""(
+            date ::= [0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )
+            date-string ::= "\"" date "\"" space
+            date-time ::= date "T" time
+            date-time-string ::= "\"" date-time "\"" space
+            root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
+            time-string ::= "\"" time "\"" space
+            tuple-0 ::= date-string
+            tuple-2 ::= time-string
+            tuple-3 ::= date-time-string
+            uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string",
+        R"""({
+            "type": "string"
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "\"" char* "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string w/ min length 1",
+        R"""({
+            "type": "string",
+            "minLength": 1
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "\"" char+ "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string w/ min length 3",
+        R"""({
+            "type": "string",
+            "minLength": 3
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "\"" char{3,} "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string w/ max length",
+        R"""({
+            "type": "string",
+            "maxLength": 3
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "\"" char{0,3} "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string w/ min & max length",
+        R"""({
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 4
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "\"" char{1,4} "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "boolean",
+        R"""({
+            "type": "boolean"
+        })""",
+        R"""(
+            root ::= ("true" | "false") space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "integer",
+        R"""({
+            "type": "integer"
+        })""",
+        R"""(
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= ("-"? integral-part) space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string const",
+        R"""({
+            "const": "foo"
+        })""",
+        R"""(
+            root ::= "\"foo\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "non-string const",
+        R"""({
+            "const": 123
+        })""",
+        R"""(
+            root ::= "123" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "non-string enum",
+        R"""({
+            "enum": ["red", "amber", "green", null, 42, ["foo"]]
+        })""",
+        R"""(
+            root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]") space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "string array",
+        R"""({
+            "type": "array",
+            "prefixItems": { "type": "string" }
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "[" space (string ("," space string)*)? "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "nullable string array",
+        R"""({
+            "type": ["array", "null"],
+            "prefixItems": { "type": "string" }
+        })""",
+        R"""(
+            alternative-0 ::= "[" space (string ("," space string)*)? "]" space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            null ::= "null" space
+            root ::= alternative-0 | null
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "tuple1",
+        R"""({
+            "prefixItems": [{ "type": "string" }]
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "[" space string "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "tuple2",
+        R"""({
+            "prefixItems": [{ "type": "string" }, { "type": "number" }]
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "[" space string "," space number "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "number",
+        R"""({
+            "type": "number"
+        })""",
+        R"""(
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "minItems",
+        R"""({
+            "items": {
+                "type": "boolean"
+            },
+            "minItems": 2
+        })""",
+        R"""(
+            boolean ::= ("true" | "false") space
+            root ::= "[" space boolean ("," space boolean)+ "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "maxItems 1",
+        R"""({
+            "items": {
+                "type": "boolean"
+            },
+            "maxItems": 1
+        })""",
+        R"""(
+            boolean ::= ("true" | "false") space
+            root ::= "[" space boolean? "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "maxItems 2",
+        R"""({
+            "items": {
+                "type": "boolean"
+            },
+            "maxItems": 2
+        })""",
+        R"""(
+            boolean ::= ("true" | "false") space
+            root ::= "[" space (boolean ("," space boolean)?)? "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min + maxItems",
+        R"""({
+            "items": {
+                "type": ["number", "integer"]
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        R"""(
+            decimal-part ::= [0-9]{1,16}
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            item ::= number | integer
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "[" space item ("," space item){2,4} "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min + max items with min + max values across zero",
+        R"""({
+            "items": {
+                "type": "integer",
+                "minimum": -12,
+                "maximum": 207
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        R"""(
+            item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
+            root ::= "[" space item ("," space item){2,4} "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "min + max items with min + max values",
+        R"""({
+            "items": {
+                "type": "integer",
+                "minimum": 12,
+                "maximum": 207
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        R"""(
+            item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
+            root ::= "[" space item ("," space item){2,4} "]" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "simple regexp",
+        R"""({
+            "type": "string",
+            "pattern": "^abc?d*efg+(hij)?kl$"
+        })""",
+        R"""(
+            root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp escapes",
+        R"""({
+            "type": "string",
+            "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
+        })""",
+        R"""(
+            root ::= "\"" ("[]{}()|+*?") "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp quote",
+        R"""({
+            "type": "string",
+            "pattern": "^\"$"
+        })""",
+        R"""(
+            root ::= "\"" ("\"") "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp with top-level alternation",
+        R"""({
+            "type": "string",
+            "pattern": "^A|B|C|D$"
+        })""",
+        R"""(
+            root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp",
+        R"""({
+            "type": "string",
+            "pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} a{3,5}nd...$"
+        })""",
+        R"""(
+            dot ::= [^\x0A\x0D]
+            root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
+            root-1 ::= [0-9]
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "required props in original order",
+        R"""({
+            "type": "object",
+            "properties": {
+                "b": {"type": "string"},
+                "c": {"type": "string"},
+                "a": {"type": "string"}
+            },
+            "required": [
+                "a",
+                "b",
+                "c"
+            ],
+            "additionalProperties": false,
+            "definitions": {}
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space string
+            b-kv ::= "\"b\"" space ":" space string
+            c-kv ::= "\"c\"" space ":" space string
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "1 optional prop",
+        R"""({
+            "properties": {
+                "a": {
+                "type": "string"
+                }
+            },
+            "additionalProperties": false
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space string
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "{" space  (a-kv )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "N optional props",
+        R"""({
+            "properties": {
+                "a": {"type": "string"},
+                "b": {"type": "string"},
+                "c": {"type": "string"}
+            },
+            "additionalProperties": false
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space string
+            a-rest ::= ( "," space b-kv )? b-rest
+            b-kv ::= "\"b\"" space ":" space string
+            b-rest ::= ( "," space c-kv )?
+            c-kv ::= "\"c\"" space ":" space string
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            root ::= "{" space  (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "required + optional props each in original order",
+        R"""({
+            "properties": {
+                "b": {"type": "string"},
+                "a": {"type": "string"},
+                "d": {"type": "string"},
+                "c": {"type": "string"}
+            },
+            "required": ["a", "b"],
+            "additionalProperties": false
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space string
+            b-kv ::= "\"b\"" space ":" space string
+            c-kv ::= "\"c\"" space ":" space string
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            d-kv ::= "\"d\"" space ":" space string
+            d-rest ::= ( "," space c-kv )?
+            root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "additional props",
+        R"""({
+            "type": "object",
+            "additionalProperties": {"type": "array", "items": {"type": "number"}}
+        })""",
+        R"""(
+            additional-kv ::= string ":" space additional-value
+            additional-value ::= "[" space (number ("," space number)*)? "]" space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "{" space  (additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "additional props (true)",
+        R"""({
+            "type": "object",
+            "additionalProperties": true
+        })""",
+        R"""(
+            array ::= "[" space ( value ("," space value)* )? "]" space
+            boolean ::= ("true" | "false") space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            null ::= "null" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            root ::= object
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+            value ::= object | array | string | number | boolean | null
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "additional props (implicit)",
+        R"""({
+            "type": "object"
+        })""",
+        R"""(
+            array ::= "[" space ( value ("," space value)* )? "]" space
+            boolean ::= ("true" | "false") space
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            null ::= "null" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            root ::= object
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+            value ::= object | array | string | number | boolean | null
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "empty w/o additional props",
+        R"""({
+            "type": "object",
+            "additionalProperties": false
+        })""",
+        R"""(
+            root ::= "{" space  "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "required + additional props",
+        R"""({
+            "type": "object",
+            "properties": {
+                "a": {"type": "number"}
+            },
+            "required": ["a"],
+            "additionalProperties": {"type": "string"}
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space number
+            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space string
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional + additional props",
+        R"""({
+            "type": "object",
+            "properties": {
+                "a": {"type": "number"}
+            },
+            "additionalProperties": {"type": "number"}
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space number
+            a-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space number
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "{" space  (a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "required + optional + additional props",
+        R"""({
+            "type": "object",
+            "properties": {
+                "and": {"type": "number"},
+                "also": {"type": "number"}
+            },
+            "required": ["and"],
+            "additionalProperties": {"type": "number"}
+        })""",
+        R"""(
+            additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space number
+            also-kv ::= "\"also\"" space ":" space number
+            also-rest ::= ( "," space additional-kv )*
+            and-kv ::= "\"and\"" space ":" space number
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional props with empty name",
+        R"""({
+            "properties": {
+                "": {"type": "integer"},
+                "a": {"type": "integer"}
+            },
+            "additionalProperties": {"type": "integer"}
+        })""",
+        R"""(
+            -kv ::= "\"\"" space ":" space root
+            -rest ::= ( "," space a-kv )? a-rest
+            a-kv ::= "\"a\"" space ":" space integer
+            a-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["] space
+            additional-kv ::= additional-k ":" space integer
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= ("-"? integral-part) space
+            root0 ::= "{" space  (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional props with nested names",
+        R"""({
+            "properties": {
+                "a": {"type": "integer"},
+                "aa": {"type": "integer"}
+            },
+            "additionalProperties": {"type": "integer"}
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space integer
+            a-rest ::= ( "," space aa-kv )? aa-rest
+            aa-kv ::= "\"aa\"" space ":" space integer
+            aa-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space integer
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= "{" space  (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "optional props with common prefix",
+        R"""({
+            "properties": {
+                "ab": {"type": "integer"},
+                "ac": {"type": "integer"}
+            },
+            "additionalProperties": {"type": "integer"}
+        })""",
+        R"""(
+            ab-kv ::= "\"ab\"" space ":" space integer
+            ab-rest ::= ( "," space ac-kv )? ac-rest
+            ac-kv ::= "\"ac\"" space ":" space integer
+            ac-rest ::= ( "," space additional-kv )*
+            additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["] space
+            additional-kv ::= additional-k ":" space integer
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            integer ::= ("-"? integral-part) space
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            root ::= "{" space  (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "top-level $ref",
+        R"""({
+            "$ref": "#/definitions/foo",
+            "definitions": {
+                "foo": {
+                    "type": "object",
+                    "properties": {
+                        "a": {
+                            "type": "string"
+                        }
+                    },
+                    "required": [
+                        "a"
+                    ],
+                    "additionalProperties": false
+                }
+            }
+        })""",
+        R"""(
+            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+            foo ::= "{" space foo-a-kv "}" space
+            foo-a-kv ::= "\"a\"" space ":" space string
+            root ::= foo
+            space ::= | " " | "\n" [ \t]{0,20}
+            string ::= "\"" char* "\"" space
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "anyOf",
+        R"""({
+            "anyOf": [
+                {"$ref": "#/definitions/foo"},
+                {"$ref": "#/definitions/bar"}
+            ],
+            "definitions": {
+                "foo": {
+                    "properties": {"a": {"type": "number"}}
+                },
+                "bar": {
+                    "properties": {"b": {"type": "number"}}
+                }
+            },
+            "type": "object"
+        })""",
+        R"""(
+            alternative-0 ::= foo
+            alternative-1 ::= bar
+            bar ::= "{" space  (bar-b-kv )? "}" space
+            bar-b-kv ::= "\"b\"" space ":" space number
+            decimal-part ::= [0-9]{1,16}
+            foo ::= "{" space  (foo-a-kv )? "}" space
+            foo-a-kv ::= "\"a\"" space ":" space number
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= alternative-0 | alternative-1
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "mix of allOf, anyOf and $ref (similar to https://json.schemastore.org/tsconfig.json)",
+        R"""({
+            "allOf": [
+                {"$ref": "#/definitions/foo"},
+                {"$ref": "#/definitions/bar"},
+                {
+                "anyOf": [
+                    {"$ref": "#/definitions/baz"},
+                    {"$ref": "#/definitions/bam"}
+                ]
+                }
+            ],
+            "definitions": {
+                "foo": {
+                    "properties": {"a": {"type": "number"}}
+                },
+                "bar": {
+                    "properties": {"b": {"type": "number"}}
+                },
+                "bam": {
+                    "properties": {"c": {"type": "number"}}
+                },
+                "baz": {
+                    "properties": {"d": {"type": "number"}}
+                }
+            },
+            "type": "object"
+        })""",
+        R"""(
+            a-kv ::= "\"a\"" space ":" space number
+            b-kv ::= "\"b\"" space ":" space number
+            c-kv ::= "\"c\"" space ":" space number
+            d-kv ::= "\"d\"" space ":" space number
+            d-rest ::= ( "," space c-kv )?
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "conflicting names",
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": {
+                "type": "object",
+                "properties": {
+                    "number": {
+                    "type": "object",
+                        "properties": {
+                            "root": {
+                                "type": "number"
+                            }
+                        },
+                        "required": [
+                            "root"
+                        ],
+                        "additionalProperties": false
+                    }
+                },
+                "required": [
+                    "number"
+                ],
+                "additionalProperties": false
+                }
+            },
+            "required": [
+                "number"
+            ],
+            "additionalProperties": false,
+            "definitions": {}
+        })""",
+        R"""(
+            decimal-part ::= [0-9]{1,16}
+            integral-part ::= [0] | [1-9] [0-9]{0,15}
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            number- ::= "{" space number-number-kv "}" space
+            number-kv ::= "\"number\"" space ":" space number-
+            number-number ::= "{" space number-number-root-kv "}" space
+            number-number-kv ::= "\"number\"" space ":" space number-number
+            number-number-root-kv ::= "\"root\"" space ":" space number
+            root ::= "{" space number-kv "}" space
+            space ::= | " " | "\n" [ \t]{0,20}
+        )"""
+    });
+}
+
+int main() {
+    fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
+    fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
+
+    test_all("C++", [](const TestCase & tc) {
+        try {
+            tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
+            tc.verify_status(SUCCESS);
+        } catch (const std::runtime_error & ex) {
+            fprintf(stderr, "Error: %s\n", ex.what());
+            tc.verify_status(FAILURE);
+        }
+    });
+
+    if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
+        fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
+    } else {
+        if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
+            test_all("Python", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
+        }
+
+        if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
+            test_all("JavaScript", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
+        }
+    }
+
+    test_all("Check Expectations Validity", [](const TestCase & tc) {
+        if (tc.expected_status == SUCCESS) {
+            tc.verify_expectation_parseable();
+        }
+    });
+}
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index 27ca4d265..e2129206b 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -2,14 +2,15 @@
 #undef NDEBUG
 #endif
 
-#include "llama.cpp" // TODO: not great
-#include "grammar-parser.h"
+#include "llama.h"
+#include "llama-grammar.h"
 
 #include <cassert>
+#include <stdexcept>
 
 int main()
 {
-    grammar_parser::parse_state parsed_grammar;
+    llama_grammar_parser parsed_grammar;
 
     std::vector<std::pair<std::string, uint32_t>> expected = {
         {"expr", 2},
@@ -112,10 +113,12 @@ int main()
         }
     }
 
-    llama_grammar *grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-    grammar = llama_grammar_init(
-        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr) {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
 
     std::vector<std::vector<llama_grammar_element>> expected_stacks = {
         {
@@ -168,13 +171,13 @@ int main()
         }};
 
     auto index = 0;
-    for (auto stack : grammar->stacks)
+    for (const llama_grammar_stack & stack : llama_grammar_get_stacks(grammar))
     {
         // compare stack to expected_stack
         for (uint32_t i = 0; i < stack.size(); i++)
         {
-            auto element = stack[i];
-            auto expected_element = expected_stacks[index][i];
+            const llama_grammar_element * element = stack[i];
+            const llama_grammar_element & expected_element = expected_stacks[index][i];
 
             // pretty print error message before asserting
             if (expected_element.type != element->type || expected_element.value != element->value)
@@ -370,13 +373,13 @@ int main()
         },
     };
 
-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
 
     std::vector<std::vector<llama_grammar_candidate>> all_rejects;
 
-    for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
     {
-        rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
         all_rejects.push_back(rejects);
     }
 
@@ -397,6 +400,8 @@ int main()
         delete[] candidate.code_points;
         candidate.code_points = nullptr;
     }
-    delete grammar;
+
+    llama_grammar_free_impl(grammar);
+
     return 0;
 }
diff --git a/tests/test-log.cpp b/tests/test-log.cpp
new file mode 100644
index 000000000..306f28c61
--- /dev/null
+++ b/tests/test-log.cpp
@@ -0,0 +1,39 @@
+#include "log.h"
+
+#include <cstdlib>
+#include <thread>
+
+int main() {
+    const int n_thread = 8;
+
+    std::thread threads[n_thread];
+    for (int i = 0; i < n_thread; i++) {
+        threads[i] = std::thread([i]() {
+            const int n_msg = 1000;
+
+            for (int j = 0; j < n_msg; j++) {
+                const int log_type = std::rand() % 4;
+
+                switch (log_type) {
+                    case 0: LOG_INF("Thread %d: %d\n", i, j); break;
+                    case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
+                    case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
+                    case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
+                    default:
+                        break;
+                }
+
+                if (rand () % 10 < 5) {
+                    common_log_set_timestamps(common_log_main(), rand() % 2);
+                    common_log_set_prefix    (common_log_main(), rand() % 2);
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < n_thread; i++) {
+        threads[i].join();
+    }
+
+    return 0;
+}
diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh
new file mode 100755
index 000000000..1d1f4886c
--- /dev/null
+++ b/tests/test-lora-conversion-inference.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+set -e
+
+# Array of models to iterate over
+declare -a params=(
+    "Gemma2ForCausalLM 64"
+    "LlamaForCausalLM 64"
+    "Phi3ForCausalLM 64"
+)
+
+MODELS_REPO=lora-tests
+MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
+COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890
+
+# Clone the Hugging Face repository if the directory does not exist
+if [ ! -d "$MODELS_REPO" ]; then
+    echo "Cloning the Hugging Face repository..."
+    git clone $MODELS_REPO_URL --depth 1
+    cd $MODELS_REPO
+    git fetch --depth=1 origin $COMMIT
+    git reset --hard $COMMIT
+    cd -
+else
+    echo "Repository already exists. Skipping clone."
+fi
+
+# Array to store results to print
+results=()
+
+trim_leading_whitespace() {
+    local input_string="$1"
+    echo "${input_string#"${input_string%%[![:space:]]*}"}"
+}
+
+extract_starting_substring() {
+    local reference_string="$1"
+    local target_string="$2"
+
+    local target_length=${#target_string}
+    echo "${reference_string:0:$target_length}"
+}
+
+get_first_word() {
+    local input_string="$1"
+    read -r first_word _ <<< "$input_string"
+    echo "$first_word"
+}
+
+# Load the expected strings
+EXPECTED_BASE_FULL=$(cat $MODELS_REPO/data/pale_blue_dot.txt)
+EXPECTED_LORA_FULL=$(cat $MODELS_REPO/data/bohemian_rhapsody.txt)
+EXPECTED_BASE_FIRST_WORD=$(get_first_word "$EXPECTED_BASE_FULL")
+EXPECTED_LORA_FIRST_WORD=$(get_first_word "$EXPECTED_LORA_FULL")
+
+run_conversion_and_inference_lora() {
+    local model_name=$1
+    local hidden_size=$2
+
+    echo -e "\n\n-------- RUNNING TEST FOR MODEL $model_name --------\n\n"
+
+    # Convert safetensors to gguf
+    echo "Running convert_hf_to_gguf.py for $model_name with hidden_size $hidden_size..."
+    python convert_hf_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \
+        --outfile $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        --outtype f32
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running convert_lora_to_gguf.py for $model_name with hidden_size $hidden_size..."
+    python3 convert_lora_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora \
+        --base $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \
+        --outtype f32
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-export-lora with lora for $model_name with hidden_size $hidden_size..."
+    ./llama-export-lora \
+        -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        -o $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
+        --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf
+
+    # Run inference
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_BASE=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_LORA_HOT=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \
+        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_LORA_MERGED=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
+        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    # Remove any initial white space
+    OUTPUT_BASE=$(trim_leading_whitespace "$OUTPUT_BASE")
+    OUTPUT_LORA_HOT=$(trim_leading_whitespace "$OUTPUT_LORA_HOT")
+    OUTPUT_LORA_MERGED=$(trim_leading_whitespace "$OUTPUT_LORA_MERGED")
+    # Extract the corresponding substring from full string
+    EXPECTED_BASE=$(extract_starting_substring "$EXPECTED_BASE_FULL" "$OUTPUT_BASE")
+    EXPECTED_LORA=$(extract_starting_substring "$EXPECTED_LORA_FULL" "$OUTPUT_LORA_HOT")
+
+    # Assert output equals the expected output
+    if [[ "$OUTPUT_BASE" != "$EXPECTED_BASE" ]]; then
+        echo "Error: $model_name OUTPUT_BASE does not start with the expected string."
+        echo -e "Out=$OUTPUT_BASE\n\nExp=$EXPECTED_BASE"
+        exit 1
+    fi
+    if [[ "$OUTPUT_LORA_HOT" != "$EXPECTED_LORA" ]]; then
+        echo "Error: $model_name OUTPUT_LORA_HOT does not start with the expected string."
+        echo -e "Out=$OUTPUT_LORA_HOT\n\nExp=$EXPECTED_LORA"
+        exit 1
+    fi
+    if [[ "$OUTPUT_LORA_MERGED" != "$EXPECTED_LORA" ]]; then
+        echo "Error: $model_name OUTPUT_LORA_MERGED does not start with the expected string."
+        echo -e "Out=$OUTPUT_LORA_MERGED\n\nExp=$EXPECTED_LORA"
+        exit 1
+    fi
+
+    # Store the results
+    results+=("
+    \n\033[1mResults for $model_name with hidden_size $hidden_size:\033[0m
+    \n\033[32m  • Base:\n$OUTPUT_BASE
+    \n\033[34m  • Lora hot:\n$OUTPUT_LORA_HOT
+    \n\033[36m  • Lora merged:\n$OUTPUT_LORA_MERGED
+    \n \033[0m
+    ")
+
+    echo "All tests passed for $model_name with hidden_size $hidden_size!"
+}
+
+# Run test for each model
+for param in "${params[@]}"; do
+    run_conversion_and_inference_lora $param
+done
+
+# Print results
+echo -e "\n\n---------------------------\n\n"
+echo -e "\n\033[1mSummary of All Results:\033[0m"
+for result in "${results[@]}"; do
+    echo -e "$result"
+done
diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp
index 858535c3c..9095826fa 100644
--- a/tests/test-model-load-cancel.cpp
+++ b/tests/test-model-load-cancel.cpp
@@ -21,7 +21,7 @@ int main(int argc, char *argv[] ) {
         (void) ctx;
         return progress > 0.50;
     };
-    auto * model = llama_load_model_from_file(model_path, params);
+    auto * model = llama_model_load_from_file(model_path, params);
     llama_backend_free();
     return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
 }
diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp
index 546ca230b..f90c92b4b 100644
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@@ -1,181 +1,892 @@
 #include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "ggml-opt.h"
 
 #include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
+#include <cinttypes>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
 
-#define MAX_NARGS 2
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-//
-// logging
-//
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-
-static float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
+static bool almost_equal(const double a, const double b, const double atol) {
+    return fabs(a - b) < atol;
 }
 
-static struct ggml_tensor * get_random_tensor(
-    struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
-) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+constexpr int64_t ne_datapoint = 2;
+constexpr int64_t ne_label     = 1;
+constexpr int64_t ndata        = 6;
 
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+struct helper_ctx_data {
+    std::vector<ggml_opt_dataset_t>   datasets_supervised;
+    std::vector<struct ggml_tensor *> data_batch;
+    std::vector<struct ggml_tensor *> labels_batch;
+
+    ggml_opt_dataset_t       dataset_unsupervised;
+    struct ggml_context    * ctx_static;
+    struct ggml_context    * ctx_compute;
+    struct ggml_opt_params   opt_params;
+    ggml_opt_context_t       opt_ctx;
+    struct ggml_tensor     * inputs;
+    struct ggml_tensor     * weights;
+    struct ggml_tensor     * outputs;
+    ggml_backend_buffer_t    buf;
+    ggml_opt_result_t        result;
+    ggml_opt_result_t        result2;
+};
+
+// These default values make it easier to check optimization results vs. expected values.
+static ggml_opt_optimizer_params helper_get_test_opt_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+    result.adamw.alpha = 1.0f;
+    result.adamw.beta1 = 0.0f;
+    result.adamw.beta2 = 0.0f;
+    result.adamw.eps   = 0.0f;
+    return result;
+}
+
+static helper_ctx_data helper_get_ctx_data(
+        ggml_backend_sched_t    backend_sched,
+        ggml_backend_t          backend,
+        const bool              init_opt_ctx       = true,
+        const bool              optimizer_defaults = true,
+        int64_t                 nbatch_logical     = 1,
+        int64_t                 nbatch_physical    = 1,
+        enum ggml_opt_loss_type loss_type          = GGML_OPT_LOSS_TYPE_SUM) {
+    std::vector<ggml_opt_dataset_t> datasets(ndata);
+    for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
+        ggml_opt_dataset_t dataset = ggml_opt_dataset_init(ne_datapoint, ne_label, ndata, ndata_shard);
+
+        float * data   = ggml_get_data_f32(ggml_opt_dataset_data(  dataset));
+        float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset));
+
+        for (int64_t idata = 0; idata < ndata; ++idata) {
+            for (int64_t id = 0; id < ne_datapoint; ++id) {
+                data[  idata*ne_datapoint + id] =     16*idata + id;
             }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
+            for (int64_t il = 0; il < ne_label;     ++il) {
+                labels[idata*ne_label     + il] = 16*(16*idata + il);
             }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+        }
+
+        datasets[ndata_shard-1] = dataset;
+    }
+
+    ggml_opt_dataset_t dataset_unsupervised = ggml_opt_dataset_init(1, 0, ndata, /*ndata_shard =*/ 1);
+
+    float * data = ggml_get_data_f32(ggml_opt_dataset_data(dataset_unsupervised));
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        data[idata] = idata;
+    }
+
+    struct ggml_context * ctx_static;
+    struct ggml_context * ctx_compute;
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ (2*ndata + 2)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_static = ggml_init(params);
+    }
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_compute = ggml_init(params);
+    }
+
+    std::vector<struct ggml_tensor *>   data_batch(ndata);
+    std::vector<struct ggml_tensor *> labels_batch(ndata);
+    for (int64_t ndata_batch = 1; ndata_batch <= ndata; ++ndata_batch) {
+        data_batch[ndata_batch-1]   = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, ndata_batch*ne_datapoint);
+        labels_batch[ndata_batch-1] = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, ndata_batch*ne_label);
+    }
+
+    struct ggml_tensor * inputs = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, nbatch_physical);
+    ggml_set_name(inputs, "inputs");
+
+    struct ggml_tensor * weights = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(weights, "weights");
+    ggml_set_param(ctx_static, weights);
+
+    struct ggml_tensor * intermediary = ggml_add(ctx_compute, inputs, weights);
+
+    struct ggml_tensor * outputs = ggml_scale(ctx_compute, intermediary, 1.0f);
+    ggml_set_name(outputs, "outputs");
+
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_static, backend);
+    const float w0 = float(ndata)/2;
+    ggml_backend_tensor_set(weights, &w0, 0, sizeof(float));
+
+    GGML_ASSERT(nbatch_logical % nbatch_physical == 0);
+    const int32_t opt_period = nbatch_logical / nbatch_physical;
+
+    struct ggml_opt_params opt_params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    opt_params.opt_period = opt_period;
+    if (!optimizer_defaults) {
+        opt_params.get_opt_pars = helper_get_test_opt_pars;
+    }
+    ggml_opt_context_t opt_ctx = init_opt_ctx ? ggml_opt_init(opt_params) : nullptr;
+
+    ggml_opt_result_t result  = ggml_opt_result_init();
+    ggml_opt_result_t result2 = ggml_opt_result_init();
+
+    return {datasets, data_batch, labels_batch, dataset_unsupervised, ctx_static, ctx_compute, opt_params, opt_ctx, inputs, weights, outputs, buf, result, result2};
+}
+
+static void helper_free_ctx_data(struct helper_ctx_data ctx_data) {
+    ggml_opt_result_free(ctx_data.result);
+    ggml_opt_result_free(ctx_data.result2);
+    ggml_opt_free(ctx_data.opt_ctx);
+    ggml_backend_buffer_free(ctx_data.buf);
+    ggml_free(ctx_data.ctx_static);
+    ggml_free(ctx_data.ctx_compute);
+    for (ggml_opt_dataset_t dataset : ctx_data.datasets_supervised) {
+        ggml_opt_dataset_free(dataset);
+    }
+    ggml_opt_dataset_free(ctx_data.dataset_unsupervised);
+}
+
+static void helper_after_test(
+        const char * func, const bool high_level, const std::string options,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    printf("  %s(high_level=%s%s, subtest=%s): ",
+           func, high_level ? "yes" : "no", options.c_str(), subtest.c_str());
+    if (subtest_ok) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+}
+
+static std::pair<int, int> test_dataset(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend);
+
+    for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
+        ggml_opt_dataset_t dataset = cd.datasets_supervised[ndata_shard-1];
+
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+
+        for (int64_t ndata_batch = 1; ndata_batch <= ndata; ++ndata_batch) {
+            if (ndata_batch % ndata_shard != 0) {
+                continue;
+            }
+            bool subtest_ok = true;
+
+            struct ggml_tensor *   data_batch =   cd.data_batch[ndata_batch-1];
+            struct ggml_tensor * labels_batch = cd.labels_batch[ndata_batch-1];
+
+            std::vector<float>   data(ggml_nelements(  data_batch));
+            std::vector<float> labels(ggml_nelements(labels_batch));
+
+            std::vector<int64_t> idata_shuffled;
+            const int64_t nbatches = ndata / ndata_batch;
+            for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
+                ggml_opt_dataset_get_batch(dataset, data_batch, labels_batch, ibatch);
+
+                ggml_backend_tensor_get(  data_batch,   data.data(), 0, ggml_nbytes(  data_batch));
+                ggml_backend_tensor_get(labels_batch, labels.data(), 0, ggml_nbytes(labels_batch));
+
+                for (int64_t idata_batch = 0; idata_batch < ndata_batch; ++idata_batch) {
+                    const int64_t idata = ibatch*ndata_batch + idata_batch;
+                    const int64_t idata_found = data[idata_batch*ne_datapoint] / 16;
+                    subtest_ok = subtest_ok && (shuffle || idata_found == idata);
+                    idata_shuffled.push_back(idata_found);
+
+                    for (int64_t id = 0; id < ne_datapoint; ++id) {
+                        if (data[  idata_batch*ne_datapoint + id] != 16*idata_found + id) {
+                            subtest_ok = false;
+                        }
                     }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    for (int64_t il = 0; il < ne_label;     ++il) {
+                        if (labels[idata_batch*ne_label     + il] != 16*(16*idata_found + il)) {
+                            subtest_ok = false;
                         }
                     }
                 }
             }
-            break;
-        default:
-            assert(false);
+
+            if (!shuffle || ndata % ndata_batch == 0) {
+                const int ndata_max = (ndata / ndata_batch) * ndata_batch;
+
+                for (int64_t idata = 0; subtest_ok && idata < ndata_max; ++idata) {
+                    int ninstances = 0;
+                    for (int64_t id : idata_shuffled) {
+                        ninstances += id == idata;
+                    }
+                    if (ninstances != 1) {
+                        subtest_ok = false;
+                    }
+                }
+            }
+
+            printf("  %s(shuffle=%s, ndata_shard=%" PRId64 ", ndata_batch=%" PRId64 "): ",
+                   __func__, shuffle ? "yes" : "no", ndata_shard, ndata_batch);
+            if (subtest_ok) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
     }
 
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_grad(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
+    /*nbatch_logical =*/ 999999, /*nbatch_physical =*/ 1);
+
+    std::vector<float> grad_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        grad_history[idata] = NAN;
+    }
+
+    for (int idata = 0; idata < ndata; ++idata) {
+        const float idataf = idata;
+        ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+        ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+        ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, sizeof(float));
+    }
+
+    {
+        bool subtest_ok = true;
+        for (int idata = 0; idata < ndata; ++idata) {
+            if (grad_history[idata] != idata + 1) {
+                subtest_ok = false;
+            }
+        }
+        printf("  %s(): ", __func__);
+        if (subtest_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_forward_backward(
+        const char * func, const bool high_level, const bool shuffle,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", shuffle=";
+    options += shuffle ? "yes" : "no";
+    helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_forward_backward(
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level, const bool shuffle) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+
+    std::vector<float> loss_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
+    }
+
+    if (high_level) {
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+        ggml_opt_epoch(cd.opt_ctx, dataset, nullptr, cd.result, 0, nullptr, nullptr);
+    } else {
+        for (int idata = 0; idata < ndata; ++idata) {
+            const float idataf = idata;
+            ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+            ggml_opt_forward(cd.opt_ctx, cd.result);
+            ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+        }
+    }
+
+    {
+        float weights;
+        ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+        const bool subtest_ok = weights == ndata/2;
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
+    }
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        bool subtest_ok = ndata == 6;
+
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        subtest_ok = subtest_ok && loss == 33.0 && almost_equal(loss_unc, sqrt(3.5), 1e-10);
+
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass);
+    }
+
+    float w0;
+    ggml_backend_tensor_get(cd.weights, &w0, 0, sizeof(float));
+    for (int i = 0; i < 10; ++i) {
+        ggml_opt_forward_backward(cd.opt_ctx, nullptr);
+    }
+    ggml_backend_tensor_set(cd.weights, &w0, 0, sizeof(float));
+
+    ggml_opt_reset(cd.opt_ctx, /*optimizer =*/ false);
+    ggml_opt_result_reset(cd.result);
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    if (high_level) {
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+        ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
+    } else {
+        for (int idata = 0; idata < ndata; ++idata) {
+            const float idataf = idata;
+            ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+            ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+            ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+        }
+    }
+
+    {
+        float weights;
+        ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+        const bool subtest_ok = weights == -ndata/2;
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
+    }
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        bool subtest_ok = ndata == 6;
+
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        subtest_ok = subtest_ok && loss == 18.0 && (shuffle || loss_unc == 0.0);
+
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+        helper_after_test_forward_backward(__func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_epoch_vs_fit(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    float weights_epoch;
+    float weights_fit;
+
+    {
+        struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true);
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+
+        ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
+
+        ggml_backend_tensor_get(cd.weights, &weights_epoch, 0, ggml_nbytes(cd.weights));
+        helper_free_ctx_data(cd);
+    }
+    {
+        struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ false);
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+
+        ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset,
+            GGML_OPT_LOSS_TYPE_SUM, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
+
+        ggml_backend_tensor_get(cd.weights, &weights_fit, 0, ggml_nbytes(cd.weights));
+        helper_free_ctx_data(cd);
+    }
+
+    const bool subtest_ok = weights_epoch == weights_fit;
+
+    printf("  %s(): ", __func__);
+    if (subtest_ok) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_idata_split(
+        const char * func, const bool high_level, const int epoch,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", epoch=";
+    options += std::to_string(epoch);
+    helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+    const int idata_split = ndata * 2/3;
+
+    std::vector<float> loss_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    for (int epoch = 1; epoch <= 4; ++epoch) {
+        if (high_level) {
+            ggml_opt_epoch(cd.opt_ctx, cd.dataset_unsupervised, cd.result, cd.result2, idata_split, nullptr, nullptr);
+        } else {
+            int idata = 0;
+            for (; idata < idata_split; ++idata) {
+                const float idataf = idata;
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+                ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+                ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+            }
+            for (; idata < ndata; ++idata) {
+                const float idataf = idata;
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+                ggml_opt_forward(cd.opt_ctx, cd.result2);
+                ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+            }
+        }
+
+        {
+            float weights;
+            ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+            const bool subtest_ok = weights == ndata/2 - epoch*idata_split;
+            helper_after_test_idata_split(__func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
+        }
+        {
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result, &ndata_result);
+            bool subtest_ok = ndata_result == idata_split;
+
+            double loss;
+            double loss_unc;
+            ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+            subtest_ok = subtest_ok && loss == 28.0 - epoch*16.0 && loss_unc == 0.0;
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_idata_split(__func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
+        }
+        {
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result2, &ndata_result);
+            bool subtest_ok = ndata_result == ndata - idata_split;
+
+            double loss;
+            double loss_unc;
+            ggml_opt_result_loss(cd.result2, &loss, &loss_unc);
+            subtest_ok = subtest_ok && loss == 15.0 - epoch*8 && almost_equal(loss_unc, sqrt(0.5), 1e-10);
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result2, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_idata_split(__func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
+        }
+
+        ggml_opt_result_reset(cd.result);
+        ggml_opt_result_reset(cd.result2);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_gradient_accumulation(
+        const char * func, const int nbatch_physical, const enum ggml_opt_loss_type loss_type, const int epoch,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", nbatch_physical=";
+    options += std::to_string(nbatch_physical);
+    options += ", loss_type=";
+    options += loss_type == GGML_OPT_LOSS_TYPE_MEAN ? "mean" : "sum";
+    options += ", epoch=";
+    options += std::to_string(epoch);
+    helper_after_test(func, false, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_gradient_accumulation(
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const int32_t nbatch_physical, const enum ggml_opt_loss_type loss_type) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(
+        backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 6, nbatch_physical, loss_type);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+
+    std::vector<float> grad_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        grad_history[idata] = NAN;
+    }
+
+    for (int epoch = 1; epoch <= 4; ++epoch) {
+        if (nbatch_physical == 1) {
+            for (int idata = 0; idata < ndata; ++idata) {
+                const float idataf = idata;
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, 1*sizeof(float));
+                ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+                ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, 1*sizeof(float));
+            }
+        } else if (nbatch_physical == 2) {
+            for (int idata = 0; idata < ndata; idata += 2) {
+                const float idataf[2] = {float(idata + 0), float(idata + 1)};
+                ggml_backend_tensor_set(cd.inputs, idataf, 0, 2*sizeof(float));
+                ggml_opt_forward_backward(cd.opt_ctx, cd.result);
+
+                grad_history[idata + 0] = 0.0f;
+                ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata + 1, 0, 1*sizeof(float));
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+
+        {
+            GGML_ASSERT(ndata == 6);
+            constexpr double atol = 1e-6;
+            bool subtest_ok = true;
+            if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
+                if (nbatch_physical == 1) {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 1.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 3.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 5.0, atol);
+                } else {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 0.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 0.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 0.0, atol);
+                }
+                subtest_ok = subtest_ok && almost_equal(grad_history[1], 2.0, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[3], 4.0, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[5], 0.0, atol);
+            } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
+                if (nbatch_physical == 1) {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 1.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 3.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 5.0/ndata, atol);
+                } else {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 0.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 0.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 0.0/ndata, atol);
+                }
+                subtest_ok = subtest_ok && almost_equal(grad_history[1], 2.0/ndata, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[3], 4.0/ndata, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[5], 0.0/ndata, atol);
+            } else {
+                GGML_ASSERT(false);
+            }
+            helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
+        }
+        {
+            float weights;
+            ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+            const bool subtest_ok = weights == (ndata/2) - epoch;
+            helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
+        }
+        {
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result, &ndata_result);
+            bool subtest_ok = ndata_result == ndata/nbatch_physical;
+
+            double loss;
+            ggml_opt_result_loss(cd.result, &loss, /*loss_unc =*/ nullptr);
+            if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
+                subtest_ok = subtest_ok && loss == (39.0 - epoch*6.0);
+            } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
+                subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, 1e-6);
+            } else {
+                GGML_ASSERT(false);
+            }
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
+        }
+
+        ggml_opt_result_reset(cd.result);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static ggml_opt_optimizer_params helper_get_regression_opt_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+    result.adamw.alpha = 0.1f;
     return result;
 }
 
-int main(void) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 1024*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
+static std::pair<int, int> test_regression(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
 
-    struct ggml_context * ctx = ggml_init(params);
+    // Test for simple regression with f(x) = a*x + b
 
-    int64_t ne1[4] = {4, 128, 1, 1};
-    int64_t ne2[4] = {4, 256, 1, 1};
-    int64_t ne3[4] = {128, 256, 1, 1};
+    constexpr int64_t ndata_regression = 201;
+    constexpr float a_true = 1.2f;
+    constexpr float b_true = 3.4f;
 
-    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
-    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
-    ggml_set_param(ctx, a);
-    ggml_set_param(ctx, b);
+    std::mt19937 gen(12345);
+    std::normal_distribution<float> nd{0.0f, 0.1f};
 
-    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
+    ggml_opt_dataset_t dataset = ggml_opt_dataset_init(1, 1, ndata_regression, ndata_regression);
 
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
-    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
-    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
+    float * data   = ggml_get_data_f32(ggml_opt_dataset_data(  dataset));
+    float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset));
 
-    struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(ge, e);
-    ggml_graph_reset(ge);
+    constexpr float x_min = -100.0f;
+    constexpr float x_max =  100.0f;
 
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    for (int64_t idata = 0; idata < ndata_regression; ++idata) {
+        const float x = x_min + (x_max - x_min) * idata/(ndata_regression-1);
+        const float y = a_true*x + b_true + nd(gen);
 
-    const float fe = ggml_get_f32_1d(e, 0);
-    printf("%s: e = %.4f\n", __func__, fe);
+        data[idata]   = x;
+        labels[idata] = y;
+    }
 
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
+    struct ggml_context * ctx_static;
+    struct ggml_context * ctx_compute;
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ 3*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_static = ggml_init(params);
+    }
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_compute = ggml_init(params);
+    }
 
-    ggml_opt(ctx, opt_params, e);
+    // The first dimension is the dimension of the datapoints, the second dimension is the number of datapoints.
+    struct ggml_tensor * x = ggml_new_tensor_2d(ctx_static, GGML_TYPE_F32, 1, ndata_regression);
+    ggml_set_name(x, "x");
 
-    ggml_graph_reset(ge);
+    struct ggml_tensor * a = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(a, "a");
+    ggml_set_param(ctx_static, a);
 
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(b, "b");
+    ggml_set_param(ctx_static, b);
 
-    const float fe_opt = ggml_get_f32_1d(e, 0);
-    printf("%s: original  e = %.4f\n", __func__, fe);
-    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
+    struct ggml_tensor * f = ggml_add(ctx_compute, ggml_mul(ctx_compute, x, a), b);
+    ggml_set_name(f, "f");
+    ggml_set_param(ctx_static, f);
 
-    const bool success = (fe_opt <= fe);
-    assert(success);
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_static, backend);
+    const float a0 = 1.0f;
+    const float b0 = 3.0f;
+    ggml_backend_tensor_set(a, &a0, 0, sizeof(float));
+    ggml_backend_tensor_set(b, &b0, 0, sizeof(float));
 
-    ggml_free(ctx);
-    return success ? 0 : -1;
+    ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+        helper_get_regression_opt_pars, 100, ndata_regression, 0.0f, true);
+
+    {
+        float a_fit;
+        ggml_backend_tensor_get(a, &a_fit, 0, sizeof(float));
+        float b_fit;
+        ggml_backend_tensor_get(b, &b_fit, 0, sizeof(float));
+        const bool subtest_ok = almost_equal(a_fit, a_true, 1e-2) && almost_equal(b_fit, b_true, 1e-2);
+        printf("  %s(subtest=weights): ", __func__);
+        if (subtest_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    ggml_backend_buffer_free(buf);
+    ggml_free(ctx_static);
+    ggml_opt_dataset_free(dataset);
+
+    return std::make_pair(npass, ntest);
 }
-// int64_t ne1[4] = {4, 128, 1, 1};
-// int64_t ne2[4] = {4, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 25890.9375
-// main: optimized e = 10094.7031
 
-// int64_t ne1[4] = {8, 128, 1, 1};
-// int64_t ne2[4] = {8, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 39429.5078
-// main: optimized e = 9275.8936
+static std::pair<int, int> test_backend(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int npass = 0;
+    int ntest = 0;
 
-// int64_t ne1[4] = {16, 128, 1, 1};
-// int64_t ne2[4] = {16, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 68371.1328
-// main: optimized e = 7854.4502
+    for (bool shuffle : {false, true}) {
+        std::pair<int, int> partial = test_dataset(backend_sched, backend, shuffle);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    {
+        std::pair<int, int> partial = test_grad(backend_sched, backend);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    for (bool high_level : {false, true}){
+        for (bool shuffle : {false, true}) {
+            if (!high_level && shuffle) {
+                continue;
+            }
 
+            std::pair<int, int> partial = test_forward_backward(backend_sched, backend, high_level, shuffle);
+            npass += partial.first;
+            ntest += partial.second;
+        }
+    }
+    {
+        std::pair<int, int> partial = test_epoch_vs_fit(backend_sched, backend);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    for (bool high_level : {false, true}){
+        std::pair<int, int> partial = test_idata_split(backend_sched, backend, high_level);
+        npass += partial.first;
+        ntest += partial.second;
+    }
+    for (int32_t nbatch_physical : {2, 1}) {
+        for (enum ggml_opt_loss_type loss_type : {GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN}) {
+            std::pair<int, int> partial = test_gradient_accumulation(backend_sched, backend, nbatch_physical, loss_type);
+            npass += partial.first;
+            ntest += partial.second;
+        }
+    }
+    {
+        std::pair<int, int> partial = test_regression(backend_sched, backend);
+        npass += partial.first;
+        ntest += partial.second;
+    }
 
-// int64_t ne1[4] = {32, 128, 1, 1};
-// int64_t ne2[4] = {32, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 126061.1953
-// main: optimized e = 5451.0166
+    return std::make_pair(npass, ntest);
+}
 
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1620817.8750
-// main: optimized e = 698387.6875
+int main(void) {
+    const size_t dev_count = ggml_backend_dev_count();
+    printf("Testing %zu devices\n\n", dev_count);
+    size_t n_ok = 0;
 
-// another run on M1
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1629595.6250
-// main: optimized e = 698169.1250
+    std::vector<ggml_backend_dev_t> devs;
+    std::vector<ggml_backend_t>     backends;
 
-// int64_t ne1[4] = {32, 1024, 1, 1};
-// int64_t ne2[4] = {32, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 8146770.5000
-// main: optimized e = 651119.1250
+    for (size_t i = 0; i < dev_count; ++i) {
+        devs.push_back(ggml_backend_dev_get(i));
+
+        ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL);
+        GGML_ASSERT(backend != NULL);
+
+        if (ggml_backend_is_cpu(backend)) {
+            ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
+        }
+
+        backends.push_back(backend);
+    }
+
+    for (size_t i = 0; i < dev_count; ++i) {
+        // Put the backend to be tested in front so that it's prioritized:
+        std::vector<ggml_backend_t> backends_modded = {backends[i]};
+        backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
+
+        ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
+            backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false);
+
+        printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
+        printf("  Device description: %s\n", ggml_backend_dev_description(devs[i]));
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(devs[i], &free, &total);
+        printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+        printf("\n");
+
+        std::pair<int, int> result = test_backend(backend_sched, backends[i]);
+
+        printf("  %d/%d tests passed\n", result.first, result.second);
+        printf("  Backend %s: ", ggml_backend_name(backends[i]));
+        if (result.first == result.second) {
+            printf("\033[1;32mOK\033[0m\n");
+            n_ok++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+
+        printf("\n");
+
+        ggml_backend_sched_free(backend_sched);
+    }
+
+    for (ggml_backend_t backend : backends) {
+        ggml_backend_free(backend);
+    }
+
+    printf("%zu/%zu backends passed\n", n_ok, dev_count);
+    if (n_ok != dev_count) {
+        printf("\033[1;31mFAIL\033[0m\n");
+        return 1;
+    }
+    printf("\033[1;32mOK\033[0m\n");
+    return 0;
+}
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index f615b612d..c77c8ed13 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -1,6 +1,7 @@
 // Unit tests for quantization specific functions - quantize, dequantize and dot product
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #undef NDEBUG
 #include <assert.h>
@@ -15,11 +16,13 @@
 
 constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
 constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
 constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
+constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
 
 static const char* RESULT_STR[] = {"ok", "FAILED"};
 
@@ -42,26 +45,27 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    // FIXME: why is done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@@ -75,19 +79,19 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 }
 
 // Total dot product error
-static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
-) {
+static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
+    GGML_UNUSED(qfns);
+
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
-    qfns.from_float(test_data1, tmp_q1.data(), test_size);
-    vdot.from_float(test_data2, tmp_q2.data(), test_size);
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+    qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -129,26 +133,24 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
 
         // deprecated - skip
-        if (qfns.blck_size == 0) {
+        if (qfns->blck_size == 0) {
             continue;
         }
 
         const ggml_type ei = (ggml_type)i;
 
-        if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) {
-            printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei));
-            continue;
-        }
-
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns.from_float && qfns.to_float) {
-            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             const float max_quantization_error =
+                type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
+                type == GGML_TYPE_TQ2_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                 type == GGML_TYPE_Q2_K    ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
                 type == GGML_TYPE_IQ2_S   ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
                 type == GGML_TYPE_Q3_K    ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
@@ -160,17 +162,19 @@ int main(int argc, char * argv[]) {
                 printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
             }
 
-            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
             num_failed += failed;
             if (failed || verbose) {
                 printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
             }
 
-            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+            const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
             const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
                                             type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
                                           ? MAX_DOT_PRODUCT_ERROR_LOWBIT
+                                          : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
+                                          ? MAX_DOT_PRODUCT_ERROR_TERNARY
                                           : MAX_DOT_PRODUCT_ERROR;
             failed = !(vec_dot_error < max_allowed_error);
             num_failed += failed;
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 48d9fae3d..288288493 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -1,12 +1,12 @@
 // Benchmark quantization specific functions on synthetic data
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #undef NDEBUG
 #include <algorithm>
 #include <assert.h>
 #include <functional>
-#include <inttypes.h>
 #include <math.h>
 #include <memory>
 #include <stdio.h>
@@ -122,9 +122,10 @@ static void usage(char * argv[]) {
     printf("  --type TYPE           set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns.from_float && qfns.to_float) {
+            if (qfns_cpu->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -270,12 +271,13 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns_cpu->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -285,7 +287,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float_reference(test_data1, test_q1, size);
+                        qfns->from_float_ref(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +301,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float(test_data1, test_q1, size);
+                        qfns_cpu->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +312,11 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf("  dequantize_row_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.to_float(test_q1, test_out, size);
+                        qfns->to_float(test_q1, test_out, size);
                         return test_out[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +330,8 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                        vdot.from_float(test_data1, test_q1, size);
+                        const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
+                        vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +342,13 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf("  vec_dot_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
-                qfns.from_float(test_data2, test_q2, largest);
+                qfns_cpu->from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
                         float result;
-                        qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                        qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                         return result;
                     };
                     size_t quantized_size = ggml_row_size(type, size);
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 26c1f42dc..322b8bb99 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #include <cmath>
 #include <cstdio>
@@ -113,7 +114,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 }
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
@@ -137,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_tensor * x;
 
     // rope f32
-    for (int m = 0; m < 3; ++m) {
+    for (int m = 0; m < 5; ++m) {
         const int ndims = 4;
 
         const int64_t n_rot = 128;
@@ -146,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
         const int n_past_0 = 100;
         const int n_past_2 = 33;
 
-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-
-        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
-        }
-
-        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
-
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+        int mode = -1;
 
-        // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x,  p0, n_rot, mode, 1024);
-        // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+        if (m < 3) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
 
-        //  33,  34,  35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x,  p2, n_rot, mode, 1024);
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x,  p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+            //  33,  34,  35, ..., 105
+            r2 = ggml_rope(ctx0, x,  p2, n_rot, mode);
+        } else {
+            // testing multi-dimension rope position embedding mode
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+            int sections[4] = {16, 24, 24, 0};
+            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+
+            // [[100, 101, 102, ..., 172],
+            // [101, 102, 103, ..., 173],
+            // [102, 103, 104, ..., 174]]
+            r0 = ggml_rope_multi(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]]
+            r1 = ggml_rope_multi(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+            //  [[33,  34,  35, ..., 105]
+            //  [34,  35,  36, ..., 106]
+            //  [35,  36,  37, ..., 107]]
+            r2 = ggml_rope_multi(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }
 
         ggml_cgraph * gf = ggml_new_graph(ctx0);
 
@@ -218,4 +260,3 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
     return 0;
 }
-
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6374958fe..61bd67850 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -10,181 +10,207 @@
 #include <string>
 #include <vector>
 
-static void dump(const llama_token_data_array * candidates) {
-    for (size_t i = 0; i < candidates->size; i++) {
-        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
+extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers);
+
+static void dump(const llama_token_data_array * cur_p) {
+    for (size_t i = 0; i < cur_p->size; i++) {
+        printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
 }
 
-#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
+#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
 
-static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
-    const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+struct sampler_tester {
+    sampler_tester(size_t n_vocab) {
+        cur.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+            const float logit = logf(token_id);
+            cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        }
+
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-    llama_sample_top_k(nullptr, &candidates_p, k, 1);
-    DUMP(&candidates_p);
+    sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
+        cur.reserve(probs.size());
+        for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
+            const float logit = logf(probs[token_id]);
+            cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
+        }
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
     }
+
+    void apply(llama_sampler * sampler) {
+        llama_sampler_apply(sampler, &cur_p);
+        llama_sampler_free(sampler);
+    }
+
+    void check() {
+        GGML_ASSERT(cur_p.size == probs_expected.size());
+        for (size_t i = 0; i < cur_p.size; i++) {
+            GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
+        }
+    }
+
+    llama_token_data_array cur_p;
+
+private:
+    const std::vector<float> probs_expected;
+
+    std::vector<llama_token_data> cur;
+};
+
+static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp(temp));
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
-static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
+    sampler_tester tester(probs, probs_expected);
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-    llama_sample_top_p(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
-    const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
+    sampler_tester tester(probs, probs_expected);
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
-    DUMP(&candidates_p);
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_k(k));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_sample_min_p(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
-    llama_sample_softmax(nullptr, &candidates_p);
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_sample_typical(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_min_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_repetition_penalties(
+static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_typical(p, 1));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_penalties(
     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
+    const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
+) {
+    GGML_ASSERT(probs.size() == probs_expected.size());
+
+    sampler_tester tester(probs, probs_expected);
+
+    auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
+    }
+
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_dry(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float dry_multiplier, float dry_base,
+    int dry_allowed_length, int dry_penalty_last_n,
+    const std::vector<std::vector<llama_token>> & seq_breakers
 ) {
     GGML_ASSERT(probs.size() == expected_probs.size());
 
-    const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    sampler_tester tester(probs, expected_probs);
+
+    auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-    llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+    tester.check();
 }
 
-static void test_sampler_queue(
-    const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
+static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(token_id);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    sampler_tester tester(n_vocab);
 
           llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
-            case 'f': GGML_ASSERT(false && "tail_free test not implemented");   break;
-            case 'y': GGML_ASSERT(false && "typical test not implemented");     break;
-            case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
-            case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
-            case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-            default : GGML_ASSERT(false && "Unknown sampler");                  break;
+            case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
+            case 'y': GGML_ABORT("typical test not implemented");
+            case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
+            case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
+            case 't': GGML_ABORT("temperature test not implemented");
+            default : GGML_ABORT("Unknown sampler");
         }
 
-        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
+        tester.apply(llama_sampler_init_dist(0));
 
-        const int size = candidates_p.size;
+        auto & cur_p = tester.cur_p;
+
+        const int size = cur_p.size;
 
         if (s == 'k') {
             const int expected_size = std::min(size, top_k);
             min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'p') {
             const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
             const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
@@ -206,8 +232,8 @@ static void test_sampler_queue(
             }
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'm') {
             int expected_size = ceilf((1.0f-min_p) * n_vocab);
             expected_size = std::max(expected_size, 1);
@@ -219,29 +245,73 @@ static void test_sampler_queue(
             min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
-    printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
 
+static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
+    std::vector<llama_token_data> cur(data.size());
+    std::copy(data.begin(), data.end(), cur.begin());
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_sampler_apply(cnstr, &cur_p);
+    llama_sampler_reset(cnstr);
+    const int64_t t_start = ggml_time_us();
+    for (int i = 0; i < n_iter; i++) {
+        std::copy(data.begin(), data.end(), cur.begin());
+        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_sampler_apply(cnstr, &cur_p);
+        llama_sampler_reset(cnstr);
+    }
+    const int64_t t_end = ggml_time_us();
+    llama_sampler_free(cnstr);
+    printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+}
+
+#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
+
+static void test_perf() {
+    const int n_vocab = 1 << 17;
+
+    std::vector<llama_token_data> data;
+
+    data.reserve(n_vocab);
+    for (int i = 0; i < n_vocab; i++) {
+        const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
+        data.emplace_back(llama_token_data{i, logit, 0.0f});
+    }
+
+    BENCH(llama_sampler_init_top_k  (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p  (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p  (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc    (1.0f, 0.1f, 1, 1),       data, 32);
+}
+
 int main(void) {
     ggml_time_init();
 
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
+
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
+
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
 
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
 
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
@@ -252,20 +322,31 @@ int main(void) {
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
 
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
+    printf("XTC should:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.1f},                                0.99f, 0.09f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.2f, 0.1f},                          0.99f, 0.19f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.3f, 0.2f, 0.1f},                    0.99f, 0.29f);
+
+    printf("XTC should not:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
 
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
 
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0},       50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0},       50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
 
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
+
+
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
 
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
     test_sampler_queue(10000, "k",     1, 1.0f, 1.0f);
@@ -297,5 +378,7 @@ int main(void) {
 
     printf("OK\n");
 
+    test_perf();
+
     return 0;
 }
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
deleted file mode 100644
index 472b0b3a8..000000000
--- a/tests/test-tokenizer-0-falcon.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-falcon.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {  }, },
-        { " "                     , {     204, }, },
-        { "  "                    , {     258, }, },
-        { "   "                   , {     466, }, },
-        { "\t"                    , {     192, }, },
-        { "\n"                    , {     193, }, },
-        { "\t\n"                  , {   19125, }, },
-        { "Hello world"           , {    9856,   1079, }, },
-        { " Hello world"          , {   23090,   1079, }, },
-        { "Hello World"           , {    9856,   2889, }, },
-        { " Hello World"          , {   23090,   2889, }, },
-        { " Hello World!"         , {   23090,   2889,     12, }, },
-        { "Hello, world!"         , {    9856,     23,   1079,     12, }, },
-        { " Hello, world!"        , {   23090,     23,   1079,     12, }, },
-        { " this is 🦙.cpp"        , {     414,    304,   3346,    111,    231,     25,  29247, }, },
-        { "w048 7tuijk dsdfhu"    , {      98,  55866,    204,     34,  16682,   7149,  36190,   6869,  11481, }, },
-        { "нещо на Български"     , {     150,    133,   6207,    151,    215,    150,    134,   5052,    133,   6279,   5052,    223,    151,    216,  49679,    123,  53110,  47043,   7795, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {   38154,    206,  38154,    126,  38154,    225,    167,    237,    217,  38154,    221,    167,    237,    208,  38154,    228,  38154,    127,  38154,    237,    167,    237,    207,  38154,    237,  38154,    107,  38154,    126,  38154,    211,  38154,    207,  38154,    233,  38154,    211,    167,    237,    207,  38154,    215, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    2571,    232,    206,    204,     19,  11003,     20,   8196,    126,    283,    219,  48778,    116,  13392,    204,     19,  51831,    732,  63209,   1741,   7955,    522,     20,  22438,    211,    204,     19,   7927,  53360,    325,    504,    701,    946,  10930,     20, }, },
-        { "Hello"                 , {    9856, }, },
-        { " Hello"                , {   23090, }, },
-        { "  Hello"               , {     204,  23090, }, },
-        { "   Hello"              , {     258,  23090, }, },
-        { "    Hello"             , {     466,  23090, }, },
-        { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
-        { "\n ="                  , {    1212,     40, }, },
-        { "' era"                 , {      18,   4932, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py
deleted file mode 100644
index 4f06ec9bb..000000000
--- a/tests/test-tokenizer-0-falcon.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# tests with BPE tokenizer
-
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    "\n =",
-    "' era",
-]
-
-for text in tests:
-    print('text: ', text)
-    print(tokenizer.encode(text))
-    print(tokenizer.decode(tokenizer.encode(text)))
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
deleted file mode 100644
index 0a16cd7eb..000000000
--- a/tests/test-tokenizer-0-llama.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-llama.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {  }, },
-        { " "                     , {     259, }, },
-        { "  "                    , {    1678, }, },
-        { "   "                   , {     268, }, },
-        { "\t"                    , {   29871,     12, }, },
-        { "\n"                    , {   29871,     13, }, },
-        { "\t\n"                  , {   29871,     12,     13, }, },
-        { "Hello world"           , {   15043,   3186, }, },
-        { " Hello world"          , {   29871,  15043,   3186, }, },
-        { "Hello World"           , {   15043,   2787, }, },
-        { " Hello World"          , {   29871,  15043,   2787, }, },
-        { " Hello World!"         , {   29871,  15043,   2787,  29991, }, },
-        { "Hello, world!"         , {   15043,  29892,   3186,  29991, }, },
-        { " Hello, world!"        , {   29871,  15043,  29892,   3186,  29991, }, },
-        { " this is 🦙.cpp"        , {   29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-        { "w048 7tuijk dsdfhu"    , {     281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-        { "нещо на Български"     , {    1538,   4851,    665,   1386,  29713,   1305, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {   29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,    146,    228,    162,    133,    228,    161,    153,    228,    161,    186,  31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,    161,    136,    228,    161,    132,    228,    161,    158,    228,    161,    136,    228,    162,    132,    228,    161,    140, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {   29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,    243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,    313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,    313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, },
-        { "Hello"                 , {   15043, }, },
-        { " Hello"                , {   29871,  15043, }, },
-        { "  Hello"               , {     259,  15043, }, },
-        { "   Hello"              , {    1678,  15043, }, },
-        { "    Hello"             , {     268,  15043, }, },
-        { "    Hello\n    Hello"  , {     268,  15043,     13,   1678,  15043, }, },
-        { " ("                    , {   29871,  313, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
-        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
-        const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
-        printf("tok: ");
-        for (const auto & tok : res_bos) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
-
-        for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
-            if (test_kv.second[i] != res_bos[i + 1]) {
-                correct = false;
-            }
-            if (test_kv.second[i] != res_nobos[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_spm(ctx, res_nobos).c_str(),
-                llama_detokenize_spm(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res_nobos) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py
deleted file mode 100644
index f3d4d7e3d..000000000
--- a/tests/test-tokenizer-0-llama.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# tests with SPM tokenizer
-
-import argparse
-
-from sentencepiece import SentencePieceProcessor
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-]
-
-
-for text in tests:
-    print('text: ', text)
-    print('\nwith bos:')
-    print(tokenizer.encode(text, add_bos=True))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
-    print('\nwithout bos:')
-    print(tokenizer.encode(text, add_bos=False))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
-
-print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
-print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
-print("'" + tokenizer.decode([15043]) + "'")        # 'Hello'
-print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
-print("'" + tokenizer.decode([29871, 15043]) + "'")               # ' Hello'
-print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello  Hello'
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text, add_bos=False)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s, add_bos=True)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
new file mode 100644
index 000000000..59dda4877
--- /dev/null
+++ b/tests/test-tokenizer-0.cpp
@@ -0,0 +1,312 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+#include <thread>
+
+//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+//    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+//        { ""                      , {  }, },
+//        { " "                     , {     220, }, },
+//        { "  "                    , {     256, }, },
+//        { "   "                   , {     262, }, },
+//        { "\t"                    , {     197, }, },
+//        { "\n"                    , {     198, }, },
+//        { "\n\n"                  , {     271, }, },
+//        { "\n\n\n"                , {    1432, }, },
+//        { "\t\n"                  , {    1602, }, },
+//        { "Hello world"           , {    9906,   1917, }, },
+//        { " Hello world"          , {   22691,   1917, }, },
+//        { "Hello World"           , {    9906,   4435, }, },
+//        { " Hello World"          , {   22691,   4435, }, },
+//        { " Hello World!"         , {   22691,   4435,      0, }, },
+//        { "Hello, world!"         , {    9906,     11,   1917,      0, }, },
+//        { " Hello, world!"        , {   22691,     11,   1917,      0, }, },
+//        { " this is 🦙.cpp"        , {     420,    374,  11410,     99,    247,     13,  11055, }, },
+//        { "w048 7tuijk dsdfhu"    , {      86,  23904,    220,     22,     83,   2005,  42908,  11729,   3013,  17156, }, },
+//        { "нещо на Български"     , {   79862, 102118,  13373,  64571,  34694,   3114, 112203,  80112, }, },
+//        { "កាន់តែពិសេសអាចខលចេញ"   , {   21549,    222,  98629,    241,  45358,    233,  21549,    237,  45358,    224,  21549,    244,  21549,    115,  21549,    253,  45358,    223,  21549,    253,  21549,     95,  98629,    227,  21549,    223,  21549,    249,  21549,    227,  45358,    223,  21549,    231, }, },
+//        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    9468,    248,    222,    320,   8416,      8,  27623,    114, 102470,   9468,    234,    104,  31643,    320,  36773, 100166,  98634,      8,  26602,    227,    320,   3323,  43465,    430,    706,   1202,   1866,   4037,      8, }, },
+//        { "Hello"                 , {    9906, }, },
+//        { " Hello"                , {   22691, }, },
+//        { "  Hello"               , {     220,  22691, }, },
+//        { "   Hello"              , {     256,  22691, }, },
+//        { "    Hello"             , {     262,  22691, }, },
+//        { "    Hello\n    Hello"  , {     262,  22691,    198,    262,  22691, }, },
+//        { " ("                    , {     320, }, },
+//        { "\n ="                  , {     198,    284, }, },
+//        { "' era"                 , {       6,  11639, }, },
+//        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    9906,     11,    379,  65948,      0,   2650,    527,    499,  27623,    223,    949,  37046, 101067,  19000,  23182, 102301,   9263,  18136,     16,  36827,  21909, }, },
+//        { "3"                     , {      18, }, },
+//        { "33"                    , {    1644, }, },
+//        { "333"                   , {    8765, }, },
+//        { "3333"                  , {    8765,     18, }, },
+//        { "33333"                 , {    8765,   1644, }, },
+//        { "333333"                , {    8765,   8765, }, },
+//        { "3333333"               , {    8765,   8765,     18, }, },
+//        { "33333333"              , {    8765,   8765,   1644, }, },
+//        { "333333333"             , {    8765,   8765,   8765, }, },
+//    };
+//
+//    return _k_tests;
+//}
+
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
+
+    std::ifstream ifs_inp(fname_inp);
+    if (!ifs_inp) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
+        return tests;
+    }
+
+    std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
+
+    std::ifstream ifs_out(fname_out);
+    if (!ifs_out) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+        return tests;
+    }
+
+    std::vector<std::string> sout;
+    for (std::string line; std::getline(ifs_out, line);) {
+        sout.push_back(line);
+    }
+
+    const std::string sep = "\n__ggml_vocab_test__\n";
+
+    std::vector<std::string> sinp;
+
+    size_t pos = 0;
+    while (pos < sraw.size()) {
+        const size_t next = sraw.find(sep, pos);
+        if (next == std::string::npos) {
+            sinp.push_back(sraw.substr(pos));
+            break;
+        }
+        sinp.push_back(sraw.substr(pos, next - pos));
+        pos = next + sep.size();
+    }
+
+    if (sinp.size() != sout.size()) {
+        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
+        return tests;
+    }
+
+    for (size_t i = 0; i < sinp.size(); ++i) {
+        const std::string & s = sinp[i];
+        const std::string & o = string_strip(sout[i]);
+
+        std::vector<llama_token> toks;
+
+        size_t pos = 0;
+        while (pos < o.size()) {
+            size_t next = o.find(' ', pos);
+            if (next == std::string::npos) {
+                next = o.size();
+            }
+            const std::string stok = o.substr(pos, next - pos);
+            toks.push_back(std::stoi(stok));
+            pos = next + 1;
+        }
+
+        tests[s] = toks;
+    }
+
+    return tests;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    const std::string fname_inp = fname + ".inp";
+    const std::string fname_out = fname + ".out";
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_model_load_from_file(fname.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_init_from_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_model_free(model);
+            return 1;
+        }
+    }
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    bool success = true;
+
+    const auto k_tests = [&]() -> llama_tests {
+        if (!fname_text.empty()) {
+            return {};
+        }
+
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
+
+    const bool add_special = false;
+
+    // multi-threaded tokenization
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&, i]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
+
+                // here only print the result of the first thread
+                // because the other threads are running the same tests
+                if (i != 0) {
+                    continue;
+                }
+
+                printf("\n");
+                printf("src: '%s'\n", test_kv.first.c_str());
+                printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
+                printf("tok: ");
+                for (const auto & tok : res) {
+                    printf("%d ", tok);
+                }
+                printf("\n");
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+                    fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                        common_detokenize(ctx, res).c_str(),
+                        common_detokenize(ctx, test_kv.second).c_str());
+                    fprintf(stderr, "%s : expected tokens: ", __func__);
+                    for (const auto & t : test_kv.second) {
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+                    fprintf(stderr, "%s : got tokens:      ", __func__);
+                    for (const auto & t : res) {
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+
+                    success = false;
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
+    }
+
+    // single threaded tokenization
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = common_tokenize(ctx, text, add_special, false);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
+                ofs << tok << "\n";
+            }
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
+    }
+
+    llama_model_free(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    printf("\n");
+    printf("Tests %s\n", success ? "passed" : "failed");
+
+    return success ? 0 : 3;
+}
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
new file mode 100644
index 000000000..cd760d1ce
--- /dev/null
+++ b/tests/test-tokenizer-0.py
@@ -0,0 +1,46 @@
+import time
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok",   help="path to a text file to tokenize", required=True)
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+fname_tok = args.fname_tok
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+print('tokenizing file: ', fname_tok) # noqa: NP100
+fname_out = fname_tok + '.tok'
+with open(fname_tok, 'r', encoding='utf-8') as f:
+    lines = f.readlines()
+    s = ''.join(lines)
+    t_start = time.time()
+    res = tokenizer.encode(s, add_special_tokens=False)
+    t_end = time.time()
+    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
+    with open(fname_out, 'w', encoding='utf-8') as f:
+        for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
+            f.write(str(x) + '\n')
+    print('len(res): ', len(res)) # noqa: NP100
+    print('len(lines): ', len(lines)) # noqa: NP100
+print('results written to: ', fname_out) # noqa: NP100
diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh
new file mode 100755
index 000000000..4d2b83655
--- /dev/null
+++ b/tests/test-tokenizer-0.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# Usage:
+#
+#   test-tokenizer-0.sh <name> <input>
+#
+
+if [ $# -ne 2 ]; then
+    printf "Usage: $0 <name> <input>\n"
+    exit 1
+fi
+
+name=$1
+input=$2
+
+make -j tests/test-tokenizer-0
+
+printf "Testing %s on %s ...\n" $name $input
+
+set -e
+
+printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
+python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
+
+printf "Tokenizing using (cpp) llama.cpp ...\n"
+./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+
+cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
+cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
+
+set +e
+
+diff $input.tok $input.tokcpp > /dev/null 2>&1
+
+if [ $? -eq 0 ]; then
+    printf "Tokenization is correct!\n"
+else
+    diff $input.tok $input.tokcpp | head -n 32
+
+    printf "Tokenization differs!\n"
+fi
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 3596ce55a..55425d88a 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -11,17 +11,30 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;
 
@@ -33,7 +46,7 @@ int main(int argc, char **argv) {
 
         mparams.vocab_only = true;
 
-        model = llama_load_model_from_file(fname.c_str(), mparams);
+        model = llama_model_load_from_file(fname.c_str(), mparams);
 
         if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@@ -42,16 +55,21 @@ int main(int argc, char **argv) {
 
         auto cparams = llama_context_default_params();
 
-        ctx = llama_new_context_with_model(model, cparams);
+        ctx = llama_init_from_model(model, cparams);
 
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
     }
 
-    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    //GGML_ASSERT(llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_BPE);
+    if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_BPE) {
+        return 99;
+    }
 
 #ifdef _WIN32
     // We need this for unicode console support
@@ -59,14 +77,26 @@ int main(int argc, char **argv) {
     atexit([]() { console::cleanup(); });
 #endif
 
-    const int n_vocab = llama_n_vocab(model);
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i));
         try {
-            auto cps = codepoints_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_bpe(ctx, tokens);
+            auto cps = unicode_cpts_from_utf8(str);
+            std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
+            std::string check = common_detokenize(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                     __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -84,26 +114,23 @@ int main(int argc, char **argv) {
 
         std::vector<std::thread> threads(nthread);
 
+        std::atomic_int errcode = {};
+
         for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
-                    if (!( // NOLINT
-                                (cp < 0x03       || cp >  0x05)   && cp != 0x0b && cp != 0x11 &&
-                                (cp < 0x13       || cp >  0x17)   && cp != 0x19 &&
-                                (cp < 0x1c       || cp >  0x1e)   &&
-                                (cp < 0xd800     || cp >  0xdfff) &&
-                                (cp < 0x00040000 || cp >= 0x000e0000)
-                        )) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) ||  // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) {  // undefined  \p{Cn}
                         continue;
                     }
 
-                    std::string str = codepoint_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize_bpe(ctx, tokens);
+                    std::string str = unicode_cpt_to_utf8(cp);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        std::exit(3);
+                        errcode = 3;
                     }
                 }
             });
@@ -112,9 +139,13 @@ int main(int argc, char **argv) {
         for (auto & t : threads) {
             t.join();
         }
+
+        if (errcode) {
+            return errcode;
+        }
     }
 
-    llama_free_model(model);
+    llama_model_free(model);
     llama_free(ctx);
 
     llama_backend_free();
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-spm.cpp
similarity index 57%
rename from tests/test-tokenizer-1-llama.cpp
rename to tests/test-tokenizer-1-spm.cpp
index 9333f8686..9e7b77f31 100644
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -11,8 +11,9 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
-int main(int argc, char **argv) {
+int main(int argc, char ** argv) {
     if (argc < 2) {
         fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
         return 1;
@@ -33,7 +34,7 @@ int main(int argc, char **argv) {
 
         mparams.vocab_only = true;
 
-        model = llama_load_model_from_file(fname.c_str(), mparams);
+        model = llama_model_load_from_file(fname.c_str(), mparams);
 
         if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@@ -42,16 +43,21 @@ int main(int argc, char **argv) {
 
         auto cparams = llama_context_default_params();
 
-        ctx = llama_new_context_with_model(model, cparams);
+        ctx = llama_init_from_model(model, cparams);
 
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
     }
 
-    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_SPM) {
+        return 99;
+    }
 
 #ifdef _WIN32
     // We need this for unicode console support
@@ -59,12 +65,12 @@ int main(int argc, char **argv) {
     atexit([]() { console::cleanup(); });
 #endif
 
-    const int n_vocab = llama_n_vocab(model);
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_spm(ctx, tokens);
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -78,20 +84,23 @@ int main(int argc, char **argv) {
 
         std::vector<std::thread> threads(nthread);
 
+        std::atomic_int errcode = {};
+
         for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
-                    if (cp >= 0xd800 && cp <= 0xdfff) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) ||  // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) {  // undefined \p{Cn}
                         continue;
                     }
 
-                    std::string str = codepoint_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize_spm(ctx, tokens);
+                    std::string str = unicode_cpt_to_utf8(cp);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        std::exit(3);
+                        errcode = 3;
                     }
                 }
             });
@@ -100,9 +109,13 @@ int main(int argc, char **argv) {
         for (auto & t : threads) {
             t.join();
         }
+
+        if(errcode) {
+            return errcode;
+        }
     }
 
-    llama_free_model(model);
+    llama_model_free(model);
     llama_free(ctx);
 
     llama_backend_free();
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
new file mode 100644
index 000000000..c6cdcb554
--- /dev/null
+++ b/tests/test-tokenizer-random.py
@@ -0,0 +1,566 @@
+# Test libllama tokenizer == AutoTokenizer.
+# Brute force random words/text generation.
+#
+# Sample usage:
+#
+#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
+#
+
+from __future__ import annotations
+
+import time
+import logging
+import argparse
+import subprocess
+import random
+import unicodedata
+
+from pathlib import Path
+from typing import Any, Iterator, cast
+from typing_extensions import Buffer
+
+import cffi
+from transformers import AutoTokenizer, PreTrainedTokenizer
+
+
+logger = logging.getLogger("test-tokenizer-random")
+
+
+class LibLlama:
+
+    DEFAULT_PATH_LLAMA_H = "./include/llama.h"
+    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
+    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
+
+    def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
+        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
+        path_includes = path_includes or self.DEFAULT_PATH_INCLUDES
+        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
+        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama)
+        self.lib.llama_backend_init()
+
+    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
+        cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
+        cmd += ["-I" + path for path in path_includes] + [path_llama_h]
+        res = subprocess.run(cmd, stdout=subprocess.PIPE)
+        assert (res.returncode == 0)
+        source = res.stdout.decode()
+        ffi = cffi.FFI()
+        if True:  # workarounds for pycparser
+            source = "typedef struct { } __builtin_va_list;" + "\n" + source
+            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
+            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
+            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
+            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
+        ffi.cdef(source, override=True)
+        lib = ffi.dlopen(path_libllama)
+        return (ffi, lib)
+
+    def model_default_params(self, **kwargs):
+        mparams = self.lib.llama_model_default_params()
+        for k, v in kwargs.items():
+            setattr(mparams, k, v)
+        return mparams
+
+    def context_default_params(self, **kwargs):
+        cparams = self.lib.llama_context_default_params()
+        for k, v in kwargs.items():
+            setattr(cparams, k, v)
+        return cparams
+
+
+class LibLlamaModel:
+
+    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
+        self.lib: Any = libllama.lib
+        self.ffi = libllama.ffi
+        if isinstance(mparams, dict):
+            mparams = libllama.model_default_params(**mparams)
+        self.model = self.lib.llama_model_load_from_file(path_model.encode(), mparams)
+        if not self.model:
+            raise RuntimeError("error: failed to load model '%s'" % path_model)
+        if isinstance(cparams, dict):
+            cparams = libllama.context_default_params(**cparams)
+        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
+        if not self.ctx:
+            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
+        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
+        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
+        self.text_buff = self.ffi.new("uint8_t[]", 1024)
+
+    def free(self):
+        if self.ctx:
+            self.lib.llama_free(self.ctx)
+        if self.model:
+            self.lib.llama_model_free(self.model)
+        self.ctx = None
+        self.model = None
+        self.lib = None
+
+    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
+        encoded_text: bytes = text.encode("utf-8")
+        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        while num < 0 and len(self.token_ids) < (16 << 20):
+            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
+            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        return list(self.token_ids[0:num])
+
+    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
+        if len(self.token_ids) < len(ids):
+            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
+        for i, id in enumerate(ids):
+            self.token_ids[i] = id
+        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+        while num < 0 and len(self.text_buff) < (16 << 20):
+            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
+            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+        return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
+
+
+class Tokenizer:
+
+    def encode(self, text: str) -> list[int]:
+        raise NotImplementedError
+
+    def decode(self, ids: list[int]) -> str:
+        raise NotImplementedError
+
+
+class TokenizerGroundtruth (Tokenizer):
+
+    def __init__(self, dir_tokenizer: str):
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+        # guess BOS and EOS
+        ids = self.encode("a")
+        assert 1 <= len(ids) <= 3
+        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
+        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
+        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
+        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
+        # build vocab
+        tokens = list(self.model.get_vocab().values())
+        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
+        self.vocab = list(sorted(self.vocab))
+        # tokens and lists
+        self.special_tokens = list(self.model.all_special_tokens)
+        self.added_tokens   = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
+        self.bos_token = self.model.bos_token
+        self.eos_token = self.model.eos_token
+
+    def encode(self, text: str) -> list[int]:
+        return self.model.encode(text, add_special_tokens=True)
+
+    def decode(self, ids: list[int]) -> str:
+        return self.model.decode(ids, skip_special_tokens=False)
+
+
+class TokenizerLlamaCpp (Tokenizer):
+
+    libllama: LibLlama | None = None
+
+    def __init__(self, vocab_file: str):
+        if not self.libllama:
+            self.libllama = LibLlama()
+        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
+
+    def encode(self, text: str) -> list[int]:
+        return self.model.tokenize(text, add_special=True, parse_special=True)
+
+    def decode(self, ids: list[int]) -> str:
+        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
+
+
+def generator_custom_text() -> Iterator[str]:
+    """General tests"""
+    yield from [
+        "",
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\n\n",
+        "\n\n\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "нещо на Български",
+        "កាន់តែពិសេសអាចខលចេញ",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+        "33333333",
+        "333333333",
+    ]
+
+
+def generator_custom_text_edge_cases() -> Iterator[str]:
+    """Edge cases found while debugging"""
+    yield from [
+        '\x1f-a',     # unicode_ranges_control, {0x00001C, 0x00001F}
+        '¼-a',        # unicode_ranges_digit, 0x00BC
+        '½-a',        # unicode_ranges_digit, 0x00BD
+        '¾-a',        # unicode_ranges_digit, 0x00BE
+        'a 〇b',      # unicode_ranges_digit, 0x3007
+        'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
+        '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
+        'Cửa Việt',   # llama-3, ignore_merges = true
+        '<s>a',       # Phi-3 fail
+        '<unk><|endoftext|><s>',  # Phi-3 fail
+        'a\na',            # bert fail
+        '"`',              # falcon
+        ' \u2e4e',         # falcon
+        '\n\x0b  ',        # falcon
+        'a\xa0\xa0\x00b',  # jina-v2-es
+        'one <mask>',      # jina-v2-es  <mask> lstrip=true
+        'a </s> b',        # rstrip phi-3
+        'a <mask> b',      # lstrip jina-v2
+        '\xa0aC',          # deepseek
+        '\u2029 \uA3E4',   # deepseek-llm
+        "a ?",
+        'å',               # mpt
+        '\U000ac517',      # utf-8 encode error, falcon
+        '\U000522f4',      # utf-8 encode error, starcoder
+        "<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
+        "<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
+    ]
+
+
+def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
+    """Brute force check all vocab words"""
+    yield from tokenizer.vocab
+
+
+def generator_ascii_lr_strip() -> Iterator[str]:
+    WHITESPACES = ["", " ", "  "]
+    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
+    for char1 in CHARACTERS:
+        for char2 in CHARACTERS:
+            for lstrip in WHITESPACES:
+                for rstrip in WHITESPACES:
+                    yield lstrip + char1 + char2 + rstrip
+                    yield lstrip + char1 + rstrip + char2
+                    yield char1 + lstrip + char2 + rstrip
+
+
+def generator_apostrophe() -> Iterator[str]:
+    WHITESPACES = ["", " ", "  "]
+    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
+    for char1 in CHARACTERS:
+        for char2 in CHARACTERS:
+            for lstrip in WHITESPACES:
+                for rstrip in WHITESPACES:
+                    yield char1 + lstrip + "'" + rstrip + char2
+                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
+                    yield "a" + lstrip + "'" + rstrip + char1 + char2
+
+
+def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
+    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
+    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
+    for token in all_tokens:
+        for lstrip in WHITESPACES:
+            for rstrip in WHITESPACES:
+                yield lstrip + token + rstrip
+                yield "a" + lstrip + token + rstrip
+                yield lstrip + token + rstrip + "z"
+                yield "a" + lstrip + token + rstrip + "z"
+
+
+def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
+    all_tokens  = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(all_tokens, k=500)
+        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
+                words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
+                words.pop(0)
+        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
+            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
+                words.pop(-1)
+            if tokenizer.add_bos_token:  # drop all trailing EOS
+                words.pop(-1)
+        yield "".join(words)
+
+
+def generator_random_chars(iterations=100) -> Iterator[str]:
+    """Brute force random text with simple characters"""
+
+    NUM_WORDS = 400
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+    CHARS = list(sorted(set("""
+        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+        abcdefghijklmnopqrstuvwxyz
+        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
+        áéíóúàèìòùâêîôûäëïöü
+        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
+    """)))
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        for _ in range(NUM_WORDS):
+            k = rand.randint(1, 7)
+            word = rand.choices(CHARS, k=k)
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
+        yield "".join(text)
+
+
+def generator_unicodes() -> Iterator[str]:
+    """Iterate unicode characters"""
+
+    MAX_CODEPOINTS = 0x30000  # 0x110000
+
+    def _valid(cpt):
+        if cpt >= 0x30000:  # unassigned and supplement­ary
+            return False
+        # if cpt == 0x2029:  # deepseek-llm
+        #    return False
+        if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"):  # undefined, surrogates, private
+            return False
+        return True
+
+    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]
+
+    yield from characters
+
+
+def generator_random_unicodes(iterations=100) -> Iterator[str]:
+    """Brute force random text with unicode characters"""
+
+    NUM_WORDS = 200
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+
+    characters = list(generator_unicodes())
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        for _ in range(NUM_WORDS):
+            k = rand.randint(1, 7)
+            word = rand.choices(characters, k=k)
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
+        yield "".join(text)
+
+
+def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    """Brute force random text with vocab characters"""
+
+    vocab_chars = set()
+    for word in tokenizer.vocab:
+        vocab_chars.update(word)
+    vocab_chars = list(sorted(vocab_chars))
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = rand.choices(vocab_chars, k=1024)
+        yield "".join(text)
+
+
+def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    """Brute force random text from vocab words"""
+
+    vocab = [w.strip() for w in tokenizer.vocab]
+    yield from vocab
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        num_words = rand.randint(300, 400)
+        for i in range(num_words):
+            k = rand.randint(1, 3)
+            words = rand.choices(vocab, k=k)
+            sep = rand.choice("     \n\r\t")
+            text.append("".join(words) + sep)
+        yield "".join(text)
+
+
+def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):
+
+    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
+        for i, (a, b) in enumerate(zip(ids1, ids2)):
+            if a != b:
+                return i
+        if len(ids1) == len(ids2):
+            return -1
+        return min(len(ids1), len(ids2))
+
+    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
+        if text1 == text2:  # equal to TokenizerGroundtruth?
+            return True
+        # equal to source text?
+        if tokenizer1.add_bos_token:  # remove BOS
+            if text2.startswith(tokenizer1.bos_token):
+                text2 = text2[len(tokenizer1.bos_token):]
+        if tokenizer1.add_eos_token:  # remove EOS
+            if text2.endswith(tokenizer1.eos_token):
+                text2 = text2[:-len(tokenizer1.eos_token)]
+        return text == text2
+
+    t_encode1 = 0
+    t_encode2 = 0
+    t_decode1 = 0
+    t_decode2 = 0
+    t_start = time.perf_counter()
+    encode_errors = 0
+    decode_errors = 0
+    MAX_ERRORS = 10
+
+    logger.info("%s: %s" % (generator.__qualname__, "ini"))
+    for text in generator:
+        # print(repr(text), text.encode())
+        # print(repr(text), hex(ord(text[0])), text.encode())
+        t0 = time.perf_counter()
+        ids1 = tokenizer1.encode(text)
+        t1 = time.perf_counter()
+        ids2 = tokenizer2.encode(text)
+        t2 = time.perf_counter()
+        text1 = tokenizer1.decode(ids1)
+        t3 = time.perf_counter()
+        text2 = tokenizer2.decode(ids1)
+        t4 = time.perf_counter()
+        t_encode1 += t1 - t0
+        t_encode2 += t2 - t1
+        t_decode1 += t3 - t2
+        t_decode2 += t4 - t3
+        if encode_errors < MAX_ERRORS and ids1 != ids2:
+            i = find_first_mismatch(ids1, ids2)
+            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
+            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
+            logger.error(" Expected: " + str(ids1))
+            logger.error("   Result: " + str(ids2))
+            encode_errors += 1
+            logger.error(f" {encode_errors=}")
+        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
+            i = find_first_mismatch(text1, text2)
+            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
+            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
+            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
+            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
+            decode_errors += 1
+            logger.error(f" {decode_errors=}")
+        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
+            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
+            # raise Exception()
+            break
+
+    t_total = time.perf_counter() - t_start
+    logger.info(f"{generator.__qualname__}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
+
+
+def main(argv: list[str] | None = None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
+    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
+    logger.info(f"VOCABFILE: '{args.vocab_file}'")
+
+    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
+    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
+
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
+
+    tokenizer2.model.free()
+
+
+if __name__ == "__main__":
+    # main()
+
+    if True:
+        logging.basicConfig(
+            level    = logging.DEBUG,
+            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
+            datefmt  = "%Y-%m-%d %H:%M:%S",
+            filename = logger.name + ".log",
+            filemode = "a"
+        )
+    logging.basicConfig(
+        level    = logging.DEBUG,
+        format   = "%(levelname)s %(message)s",
+    )
+
+    path_tokenizers   = Path("./models/tokenizers/")
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    tokenizers = [
+        "llama-spm",      # SPM
+        "phi-3",          # SPM
+        "gemma",          # SPM
+        "gemma-2",        # SPM
+        "baichuan",       # SPM
+        "bert-bge",       # WPM
+        "jina-v2-en",     # WPM
+        "llama-bpe",      # BPE
+        "phi-2",          # BPE
+        "deepseek-llm",   # BPE
+        "deepseek-coder", # BPE
+        "falcon",         # BPE
+        "mpt",            # BPE
+        "starcoder",      # BPE
+        "gpt-2",          # BPE
+        "stablelm2",      # BPE
+        "refact",         # BPE
+        "qwen2",          # BPE
+        "olmo",           # BPE
+        "jina-v2-es",     # BPE
+        "jina-v2-de",     # BPE
+        "smaug-bpe",      # BPE
+        "poro-chat",      # BPE
+        "jina-v2-code",   # BPE
+        "viking",         # BPE
+        "jais",           # BPE
+    ]
+
+    logger.info("=" * 50)
+    for tokenizer in tokenizers:
+        logger.info("-" * 50)
+        logger.info(f"TOKENIZER: '{tokenizer}'")
+        vocab_file = Path(path_vocab_format % tokenizer)
+        dir_tokenizer = path_tokenizers / tokenizer
+        main([str(vocab_file), str(dir_tokenizer), "--verbose"])
diff --git a/unicode.h b/unicode.h
deleted file mode 100644
index f6be4549b..000000000
--- a/unicode.h
+++ /dev/null
@@ -1,784 +0,0 @@
-﻿#pragma once
-
-#include <cassert>
-#include <map>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
-{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
-{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
-{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
-{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
-{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
-{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
-{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
-{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
-};
-
-static const std::vector<std::pair<uint32_t, uint32_t>> letter_ranges = {
-{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
-{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
-{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
-{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
-{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
-{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
-{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
-{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
-{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
-{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
-{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
-{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
-{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
-{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
-{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
-{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
-{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
-{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
-{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
-{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
-{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
-{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
-{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
-{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
-{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
-{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
-{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
-{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
-{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
-{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
-{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
-{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
-{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
-{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
-{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
-{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
-{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
-{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
-{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
-{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
-{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
-{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
-{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
-{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
-{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
-{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
-{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
-{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
-{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
-{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
-{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
-{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
-{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
-{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
-{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
-{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
-{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
-{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
-};
-
-static const std::vector<std::pair<uint32_t, uint32_t>> whitespace_ranges = {
-{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
-};
-
-static const std::vector<std::pair<uint32_t, uint32_t>> accent_mark_ranges = {
-{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
-{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
-{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
-{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
-{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
-{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
-{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
-{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
-{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
-{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
-{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
-{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
-{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
-{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
-{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
-{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
-{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
-{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
-{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
-{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
-{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
-{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
-{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
-{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
-{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
-{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
-{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
-};
-
-static const std::vector<std::pair<uint32_t, uint32_t>> punctuation_ranges = {
-{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
-{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
-{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
-{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
-{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
-{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
-{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
-{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
-{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
-{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
-{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
-{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
-{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
-{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
-{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
-{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
-{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
-};
-
-static const std::vector<std::pair<uint32_t, uint32_t>> symbol_ranges = {
-{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
-{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
-{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
-{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
-{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
-{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
-{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
-{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
-{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
-{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
-{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
-{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
-{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
-{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
-{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
-{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
-{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
-{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
-{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
-{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
-{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
-};
-
-static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
-{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
-{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
-{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
-{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
-{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
-{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
-{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
-{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
-{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
-{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
-{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
-{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
-{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
-{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
-{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
-{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
-{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
-{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
-{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
-{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
-{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
-{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
-{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
-{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
-{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
-{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
-{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
-{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
-{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
-{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
-{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
-{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
-{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
-{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
-{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
-{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
-{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
-{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
-{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
-{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
-{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
-{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
-{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
-{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
-{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
-{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
-{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
-{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
-{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
-{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
-{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
-{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
-{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
-{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
-{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
-{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
-{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
-{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
-{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
-{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
-{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
-{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
-{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
-{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
-};
-
-static const std::multimap<uint32_t, uint32_t> nfd_map = {
-{0xC0, 0x41}, {0xC0, 0x300}, {0xC1, 0x41}, {0xC1, 0x301}, {0xC2, 0x41}, {0xC2, 0x302}, {0xC3, 0x41}, {0xC3, 0x303}, {0xC4, 0x41}, {0xC4, 0x308}, {0xC5, 0x41}, {0xC5, 0x30A}, {0xC7, 0x43},
-{0xC7, 0x327}, {0xC8, 0x45}, {0xC8, 0x300}, {0xC9, 0x45}, {0xC9, 0x301}, {0xCA, 0x45}, {0xCA, 0x302}, {0xCB, 0x45}, {0xCB, 0x308}, {0xCC, 0x49}, {0xCC, 0x300}, {0xCD, 0x49}, {0xCD, 0x301},
-{0xCE, 0x49}, {0xCE, 0x302}, {0xCF, 0x49}, {0xCF, 0x308}, {0xD1, 0x4E}, {0xD1, 0x303}, {0xD2, 0x4F}, {0xD2, 0x300}, {0xD3, 0x4F}, {0xD3, 0x301}, {0xD4, 0x4F}, {0xD4, 0x302}, {0xD5, 0x4F},
-{0xD5, 0x303}, {0xD6, 0x4F}, {0xD6, 0x308}, {0xD9, 0x55}, {0xD9, 0x300}, {0xDA, 0x55}, {0xDA, 0x301}, {0xDB, 0x55}, {0xDB, 0x302}, {0xDC, 0x55}, {0xDC, 0x308}, {0xDD, 0x59}, {0xDD, 0x301},
-{0xE0, 0x61}, {0xE0, 0x300}, {0xE1, 0x61}, {0xE1, 0x301}, {0xE2, 0x61}, {0xE2, 0x302}, {0xE3, 0x61}, {0xE3, 0x303}, {0xE4, 0x61}, {0xE4, 0x308}, {0xE5, 0x61}, {0xE5, 0x30A}, {0xE7, 0x63},
-{0xE7, 0x327}, {0xE8, 0x65}, {0xE8, 0x300}, {0xE9, 0x65}, {0xE9, 0x301}, {0xEA, 0x65}, {0xEA, 0x302}, {0xEB, 0x65}, {0xEB, 0x308}, {0xEC, 0x69}, {0xEC, 0x300}, {0xED, 0x69}, {0xED, 0x301},
-{0xEE, 0x69}, {0xEE, 0x302}, {0xEF, 0x69}, {0xEF, 0x308}, {0xF1, 0x6E}, {0xF1, 0x303}, {0xF2, 0x6F}, {0xF2, 0x300}, {0xF3, 0x6F}, {0xF3, 0x301}, {0xF4, 0x6F}, {0xF4, 0x302}, {0xF5, 0x6F},
-{0xF5, 0x303}, {0xF6, 0x6F}, {0xF6, 0x308}, {0xF9, 0x75}, {0xF9, 0x300}, {0xFA, 0x75}, {0xFA, 0x301}, {0xFB, 0x75}, {0xFB, 0x302}, {0xFC, 0x75}, {0xFC, 0x308}, {0xFD, 0x79}, {0xFD, 0x301},
-{0xFF, 0x79}, {0xFF, 0x308}, {0x100, 0x41}, {0x100, 0x304}, {0x101, 0x61}, {0x101, 0x304}, {0x102, 0x41}, {0x102, 0x306}, {0x103, 0x61}, {0x103, 0x306}, {0x104, 0x41}, {0x104, 0x328}, {0x105, 0x61},
-{0x105, 0x328}, {0x106, 0x43}, {0x106, 0x301}, {0x107, 0x63}, {0x107, 0x301}, {0x108, 0x43}, {0x108, 0x302}, {0x109, 0x63}, {0x109, 0x302}, {0x10A, 0x43}, {0x10A, 0x307}, {0x10B, 0x63},
-{0x10B, 0x307}, {0x10C, 0x43}, {0x10C, 0x30C}, {0x10D, 0x63}, {0x10D, 0x30C}, {0x10E, 0x44}, {0x10E, 0x30C}, {0x10F, 0x64}, {0x10F, 0x30C}, {0x112, 0x45}, {0x112, 0x304}, {0x113, 0x65},
-{0x113, 0x304}, {0x114, 0x45}, {0x114, 0x306}, {0x115, 0x65}, {0x115, 0x306}, {0x116, 0x45}, {0x116, 0x307}, {0x117, 0x65}, {0x117, 0x307}, {0x118, 0x45}, {0x118, 0x328}, {0x119, 0x65},
-{0x119, 0x328}, {0x11A, 0x45}, {0x11A, 0x30C}, {0x11B, 0x65}, {0x11B, 0x30C}, {0x11C, 0x47}, {0x11C, 0x302}, {0x11D, 0x67}, {0x11D, 0x302}, {0x11E, 0x47}, {0x11E, 0x306}, {0x11F, 0x67},
-{0x11F, 0x306}, {0x120, 0x47}, {0x120, 0x307}, {0x121, 0x67}, {0x121, 0x307}, {0x122, 0x47}, {0x122, 0x327}, {0x123, 0x67}, {0x123, 0x327}, {0x124, 0x48}, {0x124, 0x302}, {0x125, 0x68},
-{0x125, 0x302}, {0x128, 0x49}, {0x128, 0x303}, {0x129, 0x69}, {0x129, 0x303}, {0x12A, 0x49}, {0x12A, 0x304}, {0x12B, 0x69}, {0x12B, 0x304}, {0x12C, 0x49}, {0x12C, 0x306}, {0x12D, 0x69},
-{0x12D, 0x306}, {0x12E, 0x49}, {0x12E, 0x328}, {0x12F, 0x69}, {0x12F, 0x328}, {0x130, 0x49}, {0x130, 0x307}, {0x134, 0x4A}, {0x134, 0x302}, {0x135, 0x6A}, {0x135, 0x302}, {0x136, 0x4B},
-{0x136, 0x327}, {0x137, 0x6B}, {0x137, 0x327}, {0x139, 0x4C}, {0x139, 0x301}, {0x13A, 0x6C}, {0x13A, 0x301}, {0x13B, 0x4C}, {0x13B, 0x327}, {0x13C, 0x6C}, {0x13C, 0x327}, {0x13D, 0x4C},
-{0x13D, 0x30C}, {0x13E, 0x6C}, {0x13E, 0x30C}, {0x143, 0x4E}, {0x143, 0x301}, {0x144, 0x6E}, {0x144, 0x301}, {0x145, 0x4E}, {0x145, 0x327}, {0x146, 0x6E}, {0x146, 0x327}, {0x147, 0x4E},
-{0x147, 0x30C}, {0x148, 0x6E}, {0x148, 0x30C}, {0x14C, 0x4F}, {0x14C, 0x304}, {0x14D, 0x6F}, {0x14D, 0x304}, {0x14E, 0x4F}, {0x14E, 0x306}, {0x14F, 0x6F}, {0x14F, 0x306}, {0x150, 0x4F},
-{0x150, 0x30B}, {0x151, 0x6F}, {0x151, 0x30B}, {0x154, 0x52}, {0x154, 0x301}, {0x155, 0x72}, {0x155, 0x301}, {0x156, 0x52}, {0x156, 0x327}, {0x157, 0x72}, {0x157, 0x327}, {0x158, 0x52},
-{0x158, 0x30C}, {0x159, 0x72}, {0x159, 0x30C}, {0x15A, 0x53}, {0x15A, 0x301}, {0x15B, 0x73}, {0x15B, 0x301}, {0x15C, 0x53}, {0x15C, 0x302}, {0x15D, 0x73}, {0x15D, 0x302}, {0x15E, 0x53},
-{0x15E, 0x327}, {0x15F, 0x73}, {0x15F, 0x327}, {0x160, 0x53}, {0x160, 0x30C}, {0x161, 0x73}, {0x161, 0x30C}, {0x162, 0x54}, {0x162, 0x327}, {0x163, 0x74}, {0x163, 0x327}, {0x164, 0x54},
-{0x164, 0x30C}, {0x165, 0x74}, {0x165, 0x30C}, {0x168, 0x55}, {0x168, 0x303}, {0x169, 0x75}, {0x169, 0x303}, {0x16A, 0x55}, {0x16A, 0x304}, {0x16B, 0x75}, {0x16B, 0x304}, {0x16C, 0x55},
-{0x16C, 0x306}, {0x16D, 0x75}, {0x16D, 0x306}, {0x16E, 0x55}, {0x16E, 0x30A}, {0x16F, 0x75}, {0x16F, 0x30A}, {0x170, 0x55}, {0x170, 0x30B}, {0x171, 0x75}, {0x171, 0x30B}, {0x172, 0x55},
-{0x172, 0x328}, {0x173, 0x75}, {0x173, 0x328}, {0x174, 0x57}, {0x174, 0x302}, {0x175, 0x77}, {0x175, 0x302}, {0x176, 0x59}, {0x176, 0x302}, {0x177, 0x79}, {0x177, 0x302}, {0x178, 0x59},
-{0x178, 0x308}, {0x179, 0x5A}, {0x179, 0x301}, {0x17A, 0x7A}, {0x17A, 0x301}, {0x17B, 0x5A}, {0x17B, 0x307}, {0x17C, 0x7A}, {0x17C, 0x307}, {0x17D, 0x5A}, {0x17D, 0x30C}, {0x17E, 0x7A},
-{0x17E, 0x30C}, {0x1A0, 0x4F}, {0x1A0, 0x31B}, {0x1A1, 0x6F}, {0x1A1, 0x31B}, {0x1AF, 0x55}, {0x1AF, 0x31B}, {0x1B0, 0x75}, {0x1B0, 0x31B}, {0x1CD, 0x41}, {0x1CD, 0x30C}, {0x1CE, 0x61},
-{0x1CE, 0x30C}, {0x1CF, 0x49}, {0x1CF, 0x30C}, {0x1D0, 0x69}, {0x1D0, 0x30C}, {0x1D1, 0x4F}, {0x1D1, 0x30C}, {0x1D2, 0x6F}, {0x1D2, 0x30C}, {0x1D3, 0x55}, {0x1D3, 0x30C}, {0x1D4, 0x75},
-{0x1D4, 0x30C}, {0x1D5, 0x55}, {0x1D5, 0x308}, {0x1D5, 0x304}, {0x1D6, 0x75}, {0x1D6, 0x308}, {0x1D6, 0x304}, {0x1D7, 0x55}, {0x1D7, 0x308}, {0x1D7, 0x301}, {0x1D8, 0x75}, {0x1D8, 0x308},
-{0x1D8, 0x301}, {0x1D9, 0x55}, {0x1D9, 0x308}, {0x1D9, 0x30C}, {0x1DA, 0x75}, {0x1DA, 0x308}, {0x1DA, 0x30C}, {0x1DB, 0x55}, {0x1DB, 0x308}, {0x1DB, 0x300}, {0x1DC, 0x75}, {0x1DC, 0x308},
-{0x1DC, 0x300}, {0x1DE, 0x41}, {0x1DE, 0x308}, {0x1DE, 0x304}, {0x1DF, 0x61}, {0x1DF, 0x308}, {0x1DF, 0x304}, {0x1E0, 0x41}, {0x1E0, 0x307}, {0x1E0, 0x304}, {0x1E1, 0x61}, {0x1E1, 0x307},
-{0x1E1, 0x304}, {0x1E2, 0xC6}, {0x1E2, 0x304}, {0x1E3, 0xE6}, {0x1E3, 0x304}, {0x1E6, 0x47}, {0x1E6, 0x30C}, {0x1E7, 0x67}, {0x1E7, 0x30C}, {0x1E8, 0x4B}, {0x1E8, 0x30C}, {0x1E9, 0x6B},
-{0x1E9, 0x30C}, {0x1EA, 0x4F}, {0x1EA, 0x328}, {0x1EB, 0x6F}, {0x1EB, 0x328}, {0x1EC, 0x4F}, {0x1EC, 0x328}, {0x1EC, 0x304}, {0x1ED, 0x6F}, {0x1ED, 0x328}, {0x1ED, 0x304}, {0x1EE, 0x1B7},
-{0x1EE, 0x30C}, {0x1EF, 0x292}, {0x1EF, 0x30C}, {0x1F0, 0x6A}, {0x1F0, 0x30C}, {0x1F4, 0x47}, {0x1F4, 0x301}, {0x1F5, 0x67}, {0x1F5, 0x301}, {0x1F8, 0x4E}, {0x1F8, 0x300}, {0x1F9, 0x6E},
-{0x1F9, 0x300}, {0x1FA, 0x41}, {0x1FA, 0x30A}, {0x1FA, 0x301}, {0x1FB, 0x61}, {0x1FB, 0x30A}, {0x1FB, 0x301}, {0x1FC, 0xC6}, {0x1FC, 0x301}, {0x1FD, 0xE6}, {0x1FD, 0x301}, {0x1FE, 0xD8},
-{0x1FE, 0x301}, {0x1FF, 0xF8}, {0x1FF, 0x301}, {0x200, 0x41}, {0x200, 0x30F}, {0x201, 0x61}, {0x201, 0x30F}, {0x202, 0x41}, {0x202, 0x311}, {0x203, 0x61}, {0x203, 0x311}, {0x204, 0x45},
-{0x204, 0x30F}, {0x205, 0x65}, {0x205, 0x30F}, {0x206, 0x45}, {0x206, 0x311}, {0x207, 0x65}, {0x207, 0x311}, {0x208, 0x49}, {0x208, 0x30F}, {0x209, 0x69}, {0x209, 0x30F}, {0x20A, 0x49},
-{0x20A, 0x311}, {0x20B, 0x69}, {0x20B, 0x311}, {0x20C, 0x4F}, {0x20C, 0x30F}, {0x20D, 0x6F}, {0x20D, 0x30F}, {0x20E, 0x4F}, {0x20E, 0x311}, {0x20F, 0x6F}, {0x20F, 0x311}, {0x210, 0x52},
-{0x210, 0x30F}, {0x211, 0x72}, {0x211, 0x30F}, {0x212, 0x52}, {0x212, 0x311}, {0x213, 0x72}, {0x213, 0x311}, {0x214, 0x55}, {0x214, 0x30F}, {0x215, 0x75}, {0x215, 0x30F}, {0x216, 0x55},
-{0x216, 0x311}, {0x217, 0x75}, {0x217, 0x311}, {0x218, 0x53}, {0x218, 0x326}, {0x219, 0x73}, {0x219, 0x326}, {0x21A, 0x54}, {0x21A, 0x326}, {0x21B, 0x74}, {0x21B, 0x326}, {0x21E, 0x48},
-{0x21E, 0x30C}, {0x21F, 0x68}, {0x21F, 0x30C}, {0x226, 0x41}, {0x226, 0x307}, {0x227, 0x61}, {0x227, 0x307}, {0x228, 0x45}, {0x228, 0x327}, {0x229, 0x65}, {0x229, 0x327}, {0x22A, 0x4F},
-{0x22A, 0x308}, {0x22A, 0x304}, {0x22B, 0x6F}, {0x22B, 0x308}, {0x22B, 0x304}, {0x22C, 0x4F}, {0x22C, 0x303}, {0x22C, 0x304}, {0x22D, 0x6F}, {0x22D, 0x303}, {0x22D, 0x304}, {0x22E, 0x4F},
-{0x22E, 0x307}, {0x22F, 0x6F}, {0x22F, 0x307}, {0x230, 0x4F}, {0x230, 0x307}, {0x230, 0x304}, {0x231, 0x6F}, {0x231, 0x307}, {0x231, 0x304}, {0x232, 0x59}, {0x232, 0x304}, {0x233, 0x79},
-{0x233, 0x304}, {0x340, 0x300}, {0x341, 0x301}, {0x343, 0x313}, {0x344, 0x308}, {0x344, 0x301}, {0x374, 0x2B9}, {0x37E, 0x3B}, {0x385, 0xA8}, {0x385, 0x301}, {0x386, 0x391}, {0x386, 0x301},
-{0x387, 0xB7}, {0x388, 0x395}, {0x388, 0x301}, {0x389, 0x397}, {0x389, 0x301}, {0x38A, 0x399}, {0x38A, 0x301}, {0x38C, 0x39F}, {0x38C, 0x301}, {0x38E, 0x3A5}, {0x38E, 0x301}, {0x38F, 0x3A9},
-{0x38F, 0x301}, {0x390, 0x3B9}, {0x390, 0x308}, {0x390, 0x301}, {0x3AA, 0x399}, {0x3AA, 0x308}, {0x3AB, 0x3A5}, {0x3AB, 0x308}, {0x3AC, 0x3B1}, {0x3AC, 0x301}, {0x3AD, 0x3B5}, {0x3AD, 0x301},
-{0x3AE, 0x3B7}, {0x3AE, 0x301}, {0x3AF, 0x3B9}, {0x3AF, 0x301}, {0x3B0, 0x3C5}, {0x3B0, 0x308}, {0x3B0, 0x301}, {0x3CA, 0x3B9}, {0x3CA, 0x308}, {0x3CB, 0x3C5}, {0x3CB, 0x308}, {0x3CC, 0x3BF},
-{0x3CC, 0x301}, {0x3CD, 0x3C5}, {0x3CD, 0x301}, {0x3CE, 0x3C9}, {0x3CE, 0x301}, {0x3D3, 0x3D2}, {0x3D3, 0x301}, {0x3D4, 0x3D2}, {0x3D4, 0x308}, {0x400, 0x415}, {0x400, 0x300}, {0x401, 0x415},
-{0x401, 0x308}, {0x403, 0x413}, {0x403, 0x301}, {0x407, 0x406}, {0x407, 0x308}, {0x40C, 0x41A}, {0x40C, 0x301}, {0x40D, 0x418}, {0x40D, 0x300}, {0x40E, 0x423}, {0x40E, 0x306}, {0x419, 0x418},
-{0x419, 0x306}, {0x439, 0x438}, {0x439, 0x306}, {0x450, 0x435}, {0x450, 0x300}, {0x451, 0x435}, {0x451, 0x308}, {0x453, 0x433}, {0x453, 0x301}, {0x457, 0x456}, {0x457, 0x308}, {0x45C, 0x43A},
-{0x45C, 0x301}, {0x45D, 0x438}, {0x45D, 0x300}, {0x45E, 0x443}, {0x45E, 0x306}, {0x476, 0x474}, {0x476, 0x30F}, {0x477, 0x475}, {0x477, 0x30F}, {0x4C1, 0x416}, {0x4C1, 0x306}, {0x4C2, 0x436},
-{0x4C2, 0x306}, {0x4D0, 0x410}, {0x4D0, 0x306}, {0x4D1, 0x430}, {0x4D1, 0x306}, {0x4D2, 0x410}, {0x4D2, 0x308}, {0x4D3, 0x430}, {0x4D3, 0x308}, {0x4D6, 0x415}, {0x4D6, 0x306}, {0x4D7, 0x435},
-{0x4D7, 0x306}, {0x4DA, 0x4D8}, {0x4DA, 0x308}, {0x4DB, 0x4D9}, {0x4DB, 0x308}, {0x4DC, 0x416}, {0x4DC, 0x308}, {0x4DD, 0x436}, {0x4DD, 0x308}, {0x4DE, 0x417}, {0x4DE, 0x308}, {0x4DF, 0x437},
-{0x4DF, 0x308}, {0x4E2, 0x418}, {0x4E2, 0x304}, {0x4E3, 0x438}, {0x4E3, 0x304}, {0x4E4, 0x418}, {0x4E4, 0x308}, {0x4E5, 0x438}, {0x4E5, 0x308}, {0x4E6, 0x41E}, {0x4E6, 0x308}, {0x4E7, 0x43E},
-{0x4E7, 0x308}, {0x4EA, 0x4E8}, {0x4EA, 0x308}, {0x4EB, 0x4E9}, {0x4EB, 0x308}, {0x4EC, 0x42D}, {0x4EC, 0x308}, {0x4ED, 0x44D}, {0x4ED, 0x308}, {0x4EE, 0x423}, {0x4EE, 0x304}, {0x4EF, 0x443},
-{0x4EF, 0x304}, {0x4F0, 0x423}, {0x4F0, 0x308}, {0x4F1, 0x443}, {0x4F1, 0x308}, {0x4F2, 0x423}, {0x4F2, 0x30B}, {0x4F3, 0x443}, {0x4F3, 0x30B}, {0x4F4, 0x427}, {0x4F4, 0x308}, {0x4F5, 0x447},
-{0x4F5, 0x308}, {0x4F8, 0x42B}, {0x4F8, 0x308}, {0x4F9, 0x44B}, {0x4F9, 0x308}, {0x622, 0x627}, {0x622, 0x653}, {0x623, 0x627}, {0x623, 0x654}, {0x624, 0x648}, {0x624, 0x654}, {0x625, 0x627},
-{0x625, 0x655}, {0x626, 0x64A}, {0x626, 0x654}, {0x6C0, 0x6D5}, {0x6C0, 0x654}, {0x6C2, 0x6C1}, {0x6C2, 0x654}, {0x6D3, 0x6D2}, {0x6D3, 0x654}, {0x929, 0x928}, {0x929, 0x93C}, {0x931, 0x930},
-{0x931, 0x93C}, {0x934, 0x933}, {0x934, 0x93C}, {0x958, 0x915}, {0x958, 0x93C}, {0x959, 0x916}, {0x959, 0x93C}, {0x95A, 0x917}, {0x95A, 0x93C}, {0x95B, 0x91C}, {0x95B, 0x93C}, {0x95C, 0x921},
-{0x95C, 0x93C}, {0x95D, 0x922}, {0x95D, 0x93C}, {0x95E, 0x92B}, {0x95E, 0x93C}, {0x95F, 0x92F}, {0x95F, 0x93C}, {0x9CB, 0x9C7}, {0x9CB, 0x9BE}, {0x9CC, 0x9C7}, {0x9CC, 0x9D7}, {0x9DC, 0x9A1},
-{0x9DC, 0x9BC}, {0x9DD, 0x9A2}, {0x9DD, 0x9BC}, {0x9DF, 0x9AF}, {0x9DF, 0x9BC}, {0xA33, 0xA32}, {0xA33, 0xA3C}, {0xA36, 0xA38}, {0xA36, 0xA3C}, {0xA59, 0xA16}, {0xA59, 0xA3C}, {0xA5A, 0xA17},
-{0xA5A, 0xA3C}, {0xA5B, 0xA1C}, {0xA5B, 0xA3C}, {0xA5E, 0xA2B}, {0xA5E, 0xA3C}, {0xB48, 0xB47}, {0xB48, 0xB56}, {0xB4B, 0xB47}, {0xB4B, 0xB3E}, {0xB4C, 0xB47}, {0xB4C, 0xB57}, {0xB5C, 0xB21},
-{0xB5C, 0xB3C}, {0xB5D, 0xB22}, {0xB5D, 0xB3C}, {0xB94, 0xB92}, {0xB94, 0xBD7}, {0xBCA, 0xBC6}, {0xBCA, 0xBBE}, {0xBCB, 0xBC7}, {0xBCB, 0xBBE}, {0xBCC, 0xBC6}, {0xBCC, 0xBD7}, {0xC48, 0xC46},
-{0xC48, 0xC56}, {0xCC0, 0xCBF}, {0xCC0, 0xCD5}, {0xCC7, 0xCC6}, {0xCC7, 0xCD5}, {0xCC8, 0xCC6}, {0xCC8, 0xCD6}, {0xCCA, 0xCC6}, {0xCCA, 0xCC2}, {0xCCB, 0xCC6}, {0xCCB, 0xCC2}, {0xCCB, 0xCD5},
-{0xD4A, 0xD46}, {0xD4A, 0xD3E}, {0xD4B, 0xD47}, {0xD4B, 0xD3E}, {0xD4C, 0xD46}, {0xD4C, 0xD57}, {0xDDA, 0xDD9}, {0xDDA, 0xDCA}, {0xDDC, 0xDD9}, {0xDDC, 0xDCF}, {0xDDD, 0xDD9}, {0xDDD, 0xDCF},
-{0xDDD, 0xDCA}, {0xDDE, 0xDD9}, {0xDDE, 0xDDF}, {0xF43, 0xF42}, {0xF43, 0xFB7}, {0xF4D, 0xF4C}, {0xF4D, 0xFB7}, {0xF52, 0xF51}, {0xF52, 0xFB7}, {0xF57, 0xF56}, {0xF57, 0xFB7}, {0xF5C, 0xF5B},
-{0xF5C, 0xFB7}, {0xF69, 0xF40}, {0xF69, 0xFB5}, {0xF73, 0xF71}, {0xF73, 0xF72}, {0xF75, 0xF71}, {0xF75, 0xF74}, {0xF76, 0xFB2}, {0xF76, 0xF80}, {0xF78, 0xFB3}, {0xF78, 0xF80}, {0xF81, 0xF71},
-{0xF81, 0xF80}, {0xF93, 0xF92}, {0xF93, 0xFB7}, {0xF9D, 0xF9C}, {0xF9D, 0xFB7}, {0xFA2, 0xFA1}, {0xFA2, 0xFB7}, {0xFA7, 0xFA6}, {0xFA7, 0xFB7}, {0xFAC, 0xFAB}, {0xFAC, 0xFB7}, {0xFB9, 0xF90},
-{0xFB9, 0xFB5}, {0x1026, 0x1025}, {0x1026, 0x102E}, {0x1B06, 0x1B05}, {0x1B06, 0x1B35}, {0x1B08, 0x1B07}, {0x1B08, 0x1B35}, {0x1B0A, 0x1B09}, {0x1B0A, 0x1B35}, {0x1B0C, 0x1B0B}, {0x1B0C, 0x1B35},
-{0x1B0E, 0x1B0D}, {0x1B0E, 0x1B35}, {0x1B12, 0x1B11}, {0x1B12, 0x1B35}, {0x1B3B, 0x1B3A}, {0x1B3B, 0x1B35}, {0x1B3D, 0x1B3C}, {0x1B3D, 0x1B35}, {0x1B40, 0x1B3E}, {0x1B40, 0x1B35}, {0x1B41, 0x1B3F},
-{0x1B41, 0x1B35}, {0x1B43, 0x1B42}, {0x1B43, 0x1B35}, {0x1E00, 0x41}, {0x1E00, 0x325}, {0x1E01, 0x61}, {0x1E01, 0x325}, {0x1E02, 0x42}, {0x1E02, 0x307}, {0x1E03, 0x62}, {0x1E03, 0x307},
-{0x1E04, 0x42}, {0x1E04, 0x323}, {0x1E05, 0x62}, {0x1E05, 0x323}, {0x1E06, 0x42}, {0x1E06, 0x331}, {0x1E07, 0x62}, {0x1E07, 0x331}, {0x1E08, 0x43}, {0x1E08, 0x327}, {0x1E08, 0x301}, {0x1E09, 0x63},
-{0x1E09, 0x327}, {0x1E09, 0x301}, {0x1E0A, 0x44}, {0x1E0A, 0x307}, {0x1E0B, 0x64}, {0x1E0B, 0x307}, {0x1E0C, 0x44}, {0x1E0C, 0x323}, {0x1E0D, 0x64}, {0x1E0D, 0x323}, {0x1E0E, 0x44}, {0x1E0E, 0x331},
-{0x1E0F, 0x64}, {0x1E0F, 0x331}, {0x1E10, 0x44}, {0x1E10, 0x327}, {0x1E11, 0x64}, {0x1E11, 0x327}, {0x1E12, 0x44}, {0x1E12, 0x32D}, {0x1E13, 0x64}, {0x1E13, 0x32D}, {0x1E14, 0x45}, {0x1E14, 0x304},
-{0x1E14, 0x300}, {0x1E15, 0x65}, {0x1E15, 0x304}, {0x1E15, 0x300}, {0x1E16, 0x45}, {0x1E16, 0x304}, {0x1E16, 0x301}, {0x1E17, 0x65}, {0x1E17, 0x304}, {0x1E17, 0x301}, {0x1E18, 0x45}, {0x1E18, 0x32D},
-{0x1E19, 0x65}, {0x1E19, 0x32D}, {0x1E1A, 0x45}, {0x1E1A, 0x330}, {0x1E1B, 0x65}, {0x1E1B, 0x330}, {0x1E1C, 0x45}, {0x1E1C, 0x327}, {0x1E1C, 0x306}, {0x1E1D, 0x65}, {0x1E1D, 0x327}, {0x1E1D, 0x306},
-{0x1E1E, 0x46}, {0x1E1E, 0x307}, {0x1E1F, 0x66}, {0x1E1F, 0x307}, {0x1E20, 0x47}, {0x1E20, 0x304}, {0x1E21, 0x67}, {0x1E21, 0x304}, {0x1E22, 0x48}, {0x1E22, 0x307}, {0x1E23, 0x68}, {0x1E23, 0x307},
-{0x1E24, 0x48}, {0x1E24, 0x323}, {0x1E25, 0x68}, {0x1E25, 0x323}, {0x1E26, 0x48}, {0x1E26, 0x308}, {0x1E27, 0x68}, {0x1E27, 0x308}, {0x1E28, 0x48}, {0x1E28, 0x327}, {0x1E29, 0x68}, {0x1E29, 0x327},
-{0x1E2A, 0x48}, {0x1E2A, 0x32E}, {0x1E2B, 0x68}, {0x1E2B, 0x32E}, {0x1E2C, 0x49}, {0x1E2C, 0x330}, {0x1E2D, 0x69}, {0x1E2D, 0x330}, {0x1E2E, 0x49}, {0x1E2E, 0x308}, {0x1E2E, 0x301}, {0x1E2F, 0x69},
-{0x1E2F, 0x308}, {0x1E2F, 0x301}, {0x1E30, 0x4B}, {0x1E30, 0x301}, {0x1E31, 0x6B}, {0x1E31, 0x301}, {0x1E32, 0x4B}, {0x1E32, 0x323}, {0x1E33, 0x6B}, {0x1E33, 0x323}, {0x1E34, 0x4B}, {0x1E34, 0x331},
-{0x1E35, 0x6B}, {0x1E35, 0x331}, {0x1E36, 0x4C}, {0x1E36, 0x323}, {0x1E37, 0x6C}, {0x1E37, 0x323}, {0x1E38, 0x4C}, {0x1E38, 0x323}, {0x1E38, 0x304}, {0x1E39, 0x6C}, {0x1E39, 0x323}, {0x1E39, 0x304},
-{0x1E3A, 0x4C}, {0x1E3A, 0x331}, {0x1E3B, 0x6C}, {0x1E3B, 0x331}, {0x1E3C, 0x4C}, {0x1E3C, 0x32D}, {0x1E3D, 0x6C}, {0x1E3D, 0x32D}, {0x1E3E, 0x4D}, {0x1E3E, 0x301}, {0x1E3F, 0x6D}, {0x1E3F, 0x301},
-{0x1E40, 0x4D}, {0x1E40, 0x307}, {0x1E41, 0x6D}, {0x1E41, 0x307}, {0x1E42, 0x4D}, {0x1E42, 0x323}, {0x1E43, 0x6D}, {0x1E43, 0x323}, {0x1E44, 0x4E}, {0x1E44, 0x307}, {0x1E45, 0x6E}, {0x1E45, 0x307},
-{0x1E46, 0x4E}, {0x1E46, 0x323}, {0x1E47, 0x6E}, {0x1E47, 0x323}, {0x1E48, 0x4E}, {0x1E48, 0x331}, {0x1E49, 0x6E}, {0x1E49, 0x331}, {0x1E4A, 0x4E}, {0x1E4A, 0x32D}, {0x1E4B, 0x6E}, {0x1E4B, 0x32D},
-{0x1E4C, 0x4F}, {0x1E4C, 0x303}, {0x1E4C, 0x301}, {0x1E4D, 0x6F}, {0x1E4D, 0x303}, {0x1E4D, 0x301}, {0x1E4E, 0x4F}, {0x1E4E, 0x303}, {0x1E4E, 0x308}, {0x1E4F, 0x6F}, {0x1E4F, 0x303}, {0x1E4F, 0x308},
-{0x1E50, 0x4F}, {0x1E50, 0x304}, {0x1E50, 0x300}, {0x1E51, 0x6F}, {0x1E51, 0x304}, {0x1E51, 0x300}, {0x1E52, 0x4F}, {0x1E52, 0x304}, {0x1E52, 0x301}, {0x1E53, 0x6F}, {0x1E53, 0x304}, {0x1E53, 0x301},
-{0x1E54, 0x50}, {0x1E54, 0x301}, {0x1E55, 0x70}, {0x1E55, 0x301}, {0x1E56, 0x50}, {0x1E56, 0x307}, {0x1E57, 0x70}, {0x1E57, 0x307}, {0x1E58, 0x52}, {0x1E58, 0x307}, {0x1E59, 0x72}, {0x1E59, 0x307},
-{0x1E5A, 0x52}, {0x1E5A, 0x323}, {0x1E5B, 0x72}, {0x1E5B, 0x323}, {0x1E5C, 0x52}, {0x1E5C, 0x323}, {0x1E5C, 0x304}, {0x1E5D, 0x72}, {0x1E5D, 0x323}, {0x1E5D, 0x304}, {0x1E5E, 0x52}, {0x1E5E, 0x331},
-{0x1E5F, 0x72}, {0x1E5F, 0x331}, {0x1E60, 0x53}, {0x1E60, 0x307}, {0x1E61, 0x73}, {0x1E61, 0x307}, {0x1E62, 0x53}, {0x1E62, 0x323}, {0x1E63, 0x73}, {0x1E63, 0x323}, {0x1E64, 0x53}, {0x1E64, 0x301},
-{0x1E64, 0x307}, {0x1E65, 0x73}, {0x1E65, 0x301}, {0x1E65, 0x307}, {0x1E66, 0x53}, {0x1E66, 0x30C}, {0x1E66, 0x307}, {0x1E67, 0x73}, {0x1E67, 0x30C}, {0x1E67, 0x307}, {0x1E68, 0x53}, {0x1E68, 0x323},
-{0x1E68, 0x307}, {0x1E69, 0x73}, {0x1E69, 0x323}, {0x1E69, 0x307}, {0x1E6A, 0x54}, {0x1E6A, 0x307}, {0x1E6B, 0x74}, {0x1E6B, 0x307}, {0x1E6C, 0x54}, {0x1E6C, 0x323}, {0x1E6D, 0x74}, {0x1E6D, 0x323},
-{0x1E6E, 0x54}, {0x1E6E, 0x331}, {0x1E6F, 0x74}, {0x1E6F, 0x331}, {0x1E70, 0x54}, {0x1E70, 0x32D}, {0x1E71, 0x74}, {0x1E71, 0x32D}, {0x1E72, 0x55}, {0x1E72, 0x324}, {0x1E73, 0x75}, {0x1E73, 0x324},
-{0x1E74, 0x55}, {0x1E74, 0x330}, {0x1E75, 0x75}, {0x1E75, 0x330}, {0x1E76, 0x55}, {0x1E76, 0x32D}, {0x1E77, 0x75}, {0x1E77, 0x32D}, {0x1E78, 0x55}, {0x1E78, 0x303}, {0x1E78, 0x301}, {0x1E79, 0x75},
-{0x1E79, 0x303}, {0x1E79, 0x301}, {0x1E7A, 0x55}, {0x1E7A, 0x304}, {0x1E7A, 0x308}, {0x1E7B, 0x75}, {0x1E7B, 0x304}, {0x1E7B, 0x308}, {0x1E7C, 0x56}, {0x1E7C, 0x303}, {0x1E7D, 0x76}, {0x1E7D, 0x303},
-{0x1E7E, 0x56}, {0x1E7E, 0x323}, {0x1E7F, 0x76}, {0x1E7F, 0x323}, {0x1E80, 0x57}, {0x1E80, 0x300}, {0x1E81, 0x77}, {0x1E81, 0x300}, {0x1E82, 0x57}, {0x1E82, 0x301}, {0x1E83, 0x77}, {0x1E83, 0x301},
-{0x1E84, 0x57}, {0x1E84, 0x308}, {0x1E85, 0x77}, {0x1E85, 0x308}, {0x1E86, 0x57}, {0x1E86, 0x307}, {0x1E87, 0x77}, {0x1E87, 0x307}, {0x1E88, 0x57}, {0x1E88, 0x323}, {0x1E89, 0x77}, {0x1E89, 0x323},
-{0x1E8A, 0x58}, {0x1E8A, 0x307}, {0x1E8B, 0x78}, {0x1E8B, 0x307}, {0x1E8C, 0x58}, {0x1E8C, 0x308}, {0x1E8D, 0x78}, {0x1E8D, 0x308}, {0x1E8E, 0x59}, {0x1E8E, 0x307}, {0x1E8F, 0x79}, {0x1E8F, 0x307},
-{0x1E90, 0x5A}, {0x1E90, 0x302}, {0x1E91, 0x7A}, {0x1E91, 0x302}, {0x1E92, 0x5A}, {0x1E92, 0x323}, {0x1E93, 0x7A}, {0x1E93, 0x323}, {0x1E94, 0x5A}, {0x1E94, 0x331}, {0x1E95, 0x7A}, {0x1E95, 0x331},
-{0x1E96, 0x68}, {0x1E96, 0x331}, {0x1E97, 0x74}, {0x1E97, 0x308}, {0x1E98, 0x77}, {0x1E98, 0x30A}, {0x1E99, 0x79}, {0x1E99, 0x30A}, {0x1E9B, 0x17F}, {0x1E9B, 0x307}, {0x1EA0, 0x41}, {0x1EA0, 0x323},
-{0x1EA1, 0x61}, {0x1EA1, 0x323}, {0x1EA2, 0x41}, {0x1EA2, 0x309}, {0x1EA3, 0x61}, {0x1EA3, 0x309}, {0x1EA4, 0x41}, {0x1EA4, 0x302}, {0x1EA4, 0x301}, {0x1EA5, 0x61}, {0x1EA5, 0x302}, {0x1EA5, 0x301},
-{0x1EA6, 0x41}, {0x1EA6, 0x302}, {0x1EA6, 0x300}, {0x1EA7, 0x61}, {0x1EA7, 0x302}, {0x1EA7, 0x300}, {0x1EA8, 0x41}, {0x1EA8, 0x302}, {0x1EA8, 0x309}, {0x1EA9, 0x61}, {0x1EA9, 0x302}, {0x1EA9, 0x309},
-{0x1EAA, 0x41}, {0x1EAA, 0x302}, {0x1EAA, 0x303}, {0x1EAB, 0x61}, {0x1EAB, 0x302}, {0x1EAB, 0x303}, {0x1EAC, 0x41}, {0x1EAC, 0x323}, {0x1EAC, 0x302}, {0x1EAD, 0x61}, {0x1EAD, 0x323}, {0x1EAD, 0x302},
-{0x1EAE, 0x41}, {0x1EAE, 0x306}, {0x1EAE, 0x301}, {0x1EAF, 0x61}, {0x1EAF, 0x306}, {0x1EAF, 0x301}, {0x1EB0, 0x41}, {0x1EB0, 0x306}, {0x1EB0, 0x300}, {0x1EB1, 0x61}, {0x1EB1, 0x306}, {0x1EB1, 0x300},
-{0x1EB2, 0x41}, {0x1EB2, 0x306}, {0x1EB2, 0x309}, {0x1EB3, 0x61}, {0x1EB3, 0x306}, {0x1EB3, 0x309}, {0x1EB4, 0x41}, {0x1EB4, 0x306}, {0x1EB4, 0x303}, {0x1EB5, 0x61}, {0x1EB5, 0x306}, {0x1EB5, 0x303},
-{0x1EB6, 0x41}, {0x1EB6, 0x323}, {0x1EB6, 0x306}, {0x1EB7, 0x61}, {0x1EB7, 0x323}, {0x1EB7, 0x306}, {0x1EB8, 0x45}, {0x1EB8, 0x323}, {0x1EB9, 0x65}, {0x1EB9, 0x323}, {0x1EBA, 0x45}, {0x1EBA, 0x309},
-{0x1EBB, 0x65}, {0x1EBB, 0x309}, {0x1EBC, 0x45}, {0x1EBC, 0x303}, {0x1EBD, 0x65}, {0x1EBD, 0x303}, {0x1EBE, 0x45}, {0x1EBE, 0x302}, {0x1EBE, 0x301}, {0x1EBF, 0x65}, {0x1EBF, 0x302}, {0x1EBF, 0x301},
-{0x1EC0, 0x45}, {0x1EC0, 0x302}, {0x1EC0, 0x300}, {0x1EC1, 0x65}, {0x1EC1, 0x302}, {0x1EC1, 0x300}, {0x1EC2, 0x45}, {0x1EC2, 0x302}, {0x1EC2, 0x309}, {0x1EC3, 0x65}, {0x1EC3, 0x302}, {0x1EC3, 0x309},
-{0x1EC4, 0x45}, {0x1EC4, 0x302}, {0x1EC4, 0x303}, {0x1EC5, 0x65}, {0x1EC5, 0x302}, {0x1EC5, 0x303}, {0x1EC6, 0x45}, {0x1EC6, 0x323}, {0x1EC6, 0x302}, {0x1EC7, 0x65}, {0x1EC7, 0x323}, {0x1EC7, 0x302},
-{0x1EC8, 0x49}, {0x1EC8, 0x309}, {0x1EC9, 0x69}, {0x1EC9, 0x309}, {0x1ECA, 0x49}, {0x1ECA, 0x323}, {0x1ECB, 0x69}, {0x1ECB, 0x323}, {0x1ECC, 0x4F}, {0x1ECC, 0x323}, {0x1ECD, 0x6F}, {0x1ECD, 0x323},
-{0x1ECE, 0x4F}, {0x1ECE, 0x309}, {0x1ECF, 0x6F}, {0x1ECF, 0x309}, {0x1ED0, 0x4F}, {0x1ED0, 0x302}, {0x1ED0, 0x301}, {0x1ED1, 0x6F}, {0x1ED1, 0x302}, {0x1ED1, 0x301}, {0x1ED2, 0x4F}, {0x1ED2, 0x302},
-{0x1ED2, 0x300}, {0x1ED3, 0x6F}, {0x1ED3, 0x302}, {0x1ED3, 0x300}, {0x1ED4, 0x4F}, {0x1ED4, 0x302}, {0x1ED4, 0x309}, {0x1ED5, 0x6F}, {0x1ED5, 0x302}, {0x1ED5, 0x309}, {0x1ED6, 0x4F}, {0x1ED6, 0x302},
-{0x1ED6, 0x303}, {0x1ED7, 0x6F}, {0x1ED7, 0x302}, {0x1ED7, 0x303}, {0x1ED8, 0x4F}, {0x1ED8, 0x323}, {0x1ED8, 0x302}, {0x1ED9, 0x6F}, {0x1ED9, 0x323}, {0x1ED9, 0x302}, {0x1EDA, 0x4F}, {0x1EDA, 0x31B},
-{0x1EDA, 0x301}, {0x1EDB, 0x6F}, {0x1EDB, 0x31B}, {0x1EDB, 0x301}, {0x1EDC, 0x4F}, {0x1EDC, 0x31B}, {0x1EDC, 0x300}, {0x1EDD, 0x6F}, {0x1EDD, 0x31B}, {0x1EDD, 0x300}, {0x1EDE, 0x4F}, {0x1EDE, 0x31B},
-{0x1EDE, 0x309}, {0x1EDF, 0x6F}, {0x1EDF, 0x31B}, {0x1EDF, 0x309}, {0x1EE0, 0x4F}, {0x1EE0, 0x31B}, {0x1EE0, 0x303}, {0x1EE1, 0x6F}, {0x1EE1, 0x31B}, {0x1EE1, 0x303}, {0x1EE2, 0x4F}, {0x1EE2, 0x31B},
-{0x1EE2, 0x323}, {0x1EE3, 0x6F}, {0x1EE3, 0x31B}, {0x1EE3, 0x323}, {0x1EE4, 0x55}, {0x1EE4, 0x323}, {0x1EE5, 0x75}, {0x1EE5, 0x323}, {0x1EE6, 0x55}, {0x1EE6, 0x309}, {0x1EE7, 0x75}, {0x1EE7, 0x309},
-{0x1EE8, 0x55}, {0x1EE8, 0x31B}, {0x1EE8, 0x301}, {0x1EE9, 0x75}, {0x1EE9, 0x31B}, {0x1EE9, 0x301}, {0x1EEA, 0x55}, {0x1EEA, 0x31B}, {0x1EEA, 0x300}, {0x1EEB, 0x75}, {0x1EEB, 0x31B}, {0x1EEB, 0x300},
-{0x1EEC, 0x55}, {0x1EEC, 0x31B}, {0x1EEC, 0x309}, {0x1EED, 0x75}, {0x1EED, 0x31B}, {0x1EED, 0x309}, {0x1EEE, 0x55}, {0x1EEE, 0x31B}, {0x1EEE, 0x303}, {0x1EEF, 0x75}, {0x1EEF, 0x31B}, {0x1EEF, 0x303},
-{0x1EF0, 0x55}, {0x1EF0, 0x31B}, {0x1EF0, 0x323}, {0x1EF1, 0x75}, {0x1EF1, 0x31B}, {0x1EF1, 0x323}, {0x1EF2, 0x59}, {0x1EF2, 0x300}, {0x1EF3, 0x79}, {0x1EF3, 0x300}, {0x1EF4, 0x59}, {0x1EF4, 0x323},
-{0x1EF5, 0x79}, {0x1EF5, 0x323}, {0x1EF6, 0x59}, {0x1EF6, 0x309}, {0x1EF7, 0x79}, {0x1EF7, 0x309}, {0x1EF8, 0x59}, {0x1EF8, 0x303}, {0x1EF9, 0x79}, {0x1EF9, 0x303}, {0x1F00, 0x3B1}, {0x1F00, 0x313},
-{0x1F01, 0x3B1}, {0x1F01, 0x314}, {0x1F02, 0x3B1}, {0x1F02, 0x313}, {0x1F02, 0x300}, {0x1F03, 0x3B1}, {0x1F03, 0x314}, {0x1F03, 0x300}, {0x1F04, 0x3B1}, {0x1F04, 0x313}, {0x1F04, 0x301},
-{0x1F05, 0x3B1}, {0x1F05, 0x314}, {0x1F05, 0x301}, {0x1F06, 0x3B1}, {0x1F06, 0x313}, {0x1F06, 0x342}, {0x1F07, 0x3B1}, {0x1F07, 0x314}, {0x1F07, 0x342}, {0x1F08, 0x391}, {0x1F08, 0x313},
-{0x1F09, 0x391}, {0x1F09, 0x314}, {0x1F0A, 0x391}, {0x1F0A, 0x313}, {0x1F0A, 0x300}, {0x1F0B, 0x391}, {0x1F0B, 0x314}, {0x1F0B, 0x300}, {0x1F0C, 0x391}, {0x1F0C, 0x313}, {0x1F0C, 0x301},
-{0x1F0D, 0x391}, {0x1F0D, 0x314}, {0x1F0D, 0x301}, {0x1F0E, 0x391}, {0x1F0E, 0x313}, {0x1F0E, 0x342}, {0x1F0F, 0x391}, {0x1F0F, 0x314}, {0x1F0F, 0x342}, {0x1F10, 0x3B5}, {0x1F10, 0x313},
-{0x1F11, 0x3B5}, {0x1F11, 0x314}, {0x1F12, 0x3B5}, {0x1F12, 0x313}, {0x1F12, 0x300}, {0x1F13, 0x3B5}, {0x1F13, 0x314}, {0x1F13, 0x300}, {0x1F14, 0x3B5}, {0x1F14, 0x313}, {0x1F14, 0x301},
-{0x1F15, 0x3B5}, {0x1F15, 0x314}, {0x1F15, 0x301}, {0x1F18, 0x395}, {0x1F18, 0x313}, {0x1F19, 0x395}, {0x1F19, 0x314}, {0x1F1A, 0x395}, {0x1F1A, 0x313}, {0x1F1A, 0x300}, {0x1F1B, 0x395},
-{0x1F1B, 0x314}, {0x1F1B, 0x300}, {0x1F1C, 0x395}, {0x1F1C, 0x313}, {0x1F1C, 0x301}, {0x1F1D, 0x395}, {0x1F1D, 0x314}, {0x1F1D, 0x301}, {0x1F20, 0x3B7}, {0x1F20, 0x313}, {0x1F21, 0x3B7},
-{0x1F21, 0x314}, {0x1F22, 0x3B7}, {0x1F22, 0x313}, {0x1F22, 0x300}, {0x1F23, 0x3B7}, {0x1F23, 0x314}, {0x1F23, 0x300}, {0x1F24, 0x3B7}, {0x1F24, 0x313}, {0x1F24, 0x301}, {0x1F25, 0x3B7},
-{0x1F25, 0x314}, {0x1F25, 0x301}, {0x1F26, 0x3B7}, {0x1F26, 0x313}, {0x1F26, 0x342}, {0x1F27, 0x3B7}, {0x1F27, 0x314}, {0x1F27, 0x342}, {0x1F28, 0x397}, {0x1F28, 0x313}, {0x1F29, 0x397},
-{0x1F29, 0x314}, {0x1F2A, 0x397}, {0x1F2A, 0x313}, {0x1F2A, 0x300}, {0x1F2B, 0x397}, {0x1F2B, 0x314}, {0x1F2B, 0x300}, {0x1F2C, 0x397}, {0x1F2C, 0x313}, {0x1F2C, 0x301}, {0x1F2D, 0x397},
-{0x1F2D, 0x314}, {0x1F2D, 0x301}, {0x1F2E, 0x397}, {0x1F2E, 0x313}, {0x1F2E, 0x342}, {0x1F2F, 0x397}, {0x1F2F, 0x314}, {0x1F2F, 0x342}, {0x1F30, 0x3B9}, {0x1F30, 0x313}, {0x1F31, 0x3B9},
-{0x1F31, 0x314}, {0x1F32, 0x3B9}, {0x1F32, 0x313}, {0x1F32, 0x300}, {0x1F33, 0x3B9}, {0x1F33, 0x314}, {0x1F33, 0x300}, {0x1F34, 0x3B9}, {0x1F34, 0x313}, {0x1F34, 0x301}, {0x1F35, 0x3B9},
-{0x1F35, 0x314}, {0x1F35, 0x301}, {0x1F36, 0x3B9}, {0x1F36, 0x313}, {0x1F36, 0x342}, {0x1F37, 0x3B9}, {0x1F37, 0x314}, {0x1F37, 0x342}, {0x1F38, 0x399}, {0x1F38, 0x313}, {0x1F39, 0x399},
-{0x1F39, 0x314}, {0x1F3A, 0x399}, {0x1F3A, 0x313}, {0x1F3A, 0x300}, {0x1F3B, 0x399}, {0x1F3B, 0x314}, {0x1F3B, 0x300}, {0x1F3C, 0x399}, {0x1F3C, 0x313}, {0x1F3C, 0x301}, {0x1F3D, 0x399},
-{0x1F3D, 0x314}, {0x1F3D, 0x301}, {0x1F3E, 0x399}, {0x1F3E, 0x313}, {0x1F3E, 0x342}, {0x1F3F, 0x399}, {0x1F3F, 0x314}, {0x1F3F, 0x342}, {0x1F40, 0x3BF}, {0x1F40, 0x313}, {0x1F41, 0x3BF},
-{0x1F41, 0x314}, {0x1F42, 0x3BF}, {0x1F42, 0x313}, {0x1F42, 0x300}, {0x1F43, 0x3BF}, {0x1F43, 0x314}, {0x1F43, 0x300}, {0x1F44, 0x3BF}, {0x1F44, 0x313}, {0x1F44, 0x301}, {0x1F45, 0x3BF},
-{0x1F45, 0x314}, {0x1F45, 0x301}, {0x1F48, 0x39F}, {0x1F48, 0x313}, {0x1F49, 0x39F}, {0x1F49, 0x314}, {0x1F4A, 0x39F}, {0x1F4A, 0x313}, {0x1F4A, 0x300}, {0x1F4B, 0x39F}, {0x1F4B, 0x314},
-{0x1F4B, 0x300}, {0x1F4C, 0x39F}, {0x1F4C, 0x313}, {0x1F4C, 0x301}, {0x1F4D, 0x39F}, {0x1F4D, 0x314}, {0x1F4D, 0x301}, {0x1F50, 0x3C5}, {0x1F50, 0x313}, {0x1F51, 0x3C5}, {0x1F51, 0x314},
-{0x1F52, 0x3C5}, {0x1F52, 0x313}, {0x1F52, 0x300}, {0x1F53, 0x3C5}, {0x1F53, 0x314}, {0x1F53, 0x300}, {0x1F54, 0x3C5}, {0x1F54, 0x313}, {0x1F54, 0x301}, {0x1F55, 0x3C5}, {0x1F55, 0x314},
-{0x1F55, 0x301}, {0x1F56, 0x3C5}, {0x1F56, 0x313}, {0x1F56, 0x342}, {0x1F57, 0x3C5}, {0x1F57, 0x314}, {0x1F57, 0x342}, {0x1F59, 0x3A5}, {0x1F59, 0x314}, {0x1F5B, 0x3A5}, {0x1F5B, 0x314},
-{0x1F5B, 0x300}, {0x1F5D, 0x3A5}, {0x1F5D, 0x314}, {0x1F5D, 0x301}, {0x1F5F, 0x3A5}, {0x1F5F, 0x314}, {0x1F5F, 0x342}, {0x1F60, 0x3C9}, {0x1F60, 0x313}, {0x1F61, 0x3C9}, {0x1F61, 0x314},
-{0x1F62, 0x3C9}, {0x1F62, 0x313}, {0x1F62, 0x300}, {0x1F63, 0x3C9}, {0x1F63, 0x314}, {0x1F63, 0x300}, {0x1F64, 0x3C9}, {0x1F64, 0x313}, {0x1F64, 0x301}, {0x1F65, 0x3C9}, {0x1F65, 0x314},
-{0x1F65, 0x301}, {0x1F66, 0x3C9}, {0x1F66, 0x313}, {0x1F66, 0x342}, {0x1F67, 0x3C9}, {0x1F67, 0x314}, {0x1F67, 0x342}, {0x1F68, 0x3A9}, {0x1F68, 0x313}, {0x1F69, 0x3A9}, {0x1F69, 0x314},
-{0x1F6A, 0x3A9}, {0x1F6A, 0x313}, {0x1F6A, 0x300}, {0x1F6B, 0x3A9}, {0x1F6B, 0x314}, {0x1F6B, 0x300}, {0x1F6C, 0x3A9}, {0x1F6C, 0x313}, {0x1F6C, 0x301}, {0x1F6D, 0x3A9}, {0x1F6D, 0x314},
-{0x1F6D, 0x301}, {0x1F6E, 0x3A9}, {0x1F6E, 0x313}, {0x1F6E, 0x342}, {0x1F6F, 0x3A9}, {0x1F6F, 0x314}, {0x1F6F, 0x342}, {0x1F70, 0x3B1}, {0x1F70, 0x300}, {0x1F71, 0x3B1}, {0x1F71, 0x301},
-{0x1F72, 0x3B5}, {0x1F72, 0x300}, {0x1F73, 0x3B5}, {0x1F73, 0x301}, {0x1F74, 0x3B7}, {0x1F74, 0x300}, {0x1F75, 0x3B7}, {0x1F75, 0x301}, {0x1F76, 0x3B9}, {0x1F76, 0x300}, {0x1F77, 0x3B9},
-{0x1F77, 0x301}, {0x1F78, 0x3BF}, {0x1F78, 0x300}, {0x1F79, 0x3BF}, {0x1F79, 0x301}, {0x1F7A, 0x3C5}, {0x1F7A, 0x300}, {0x1F7B, 0x3C5}, {0x1F7B, 0x301}, {0x1F7C, 0x3C9}, {0x1F7C, 0x300},
-{0x1F7D, 0x3C9}, {0x1F7D, 0x301}, {0x1F80, 0x3B1}, {0x1F80, 0x313}, {0x1F80, 0x345}, {0x1F81, 0x3B1}, {0x1F81, 0x314}, {0x1F81, 0x345}, {0x1F82, 0x3B1}, {0x1F82, 0x313}, {0x1F82, 0x300},
-{0x1F82, 0x345}, {0x1F83, 0x3B1}, {0x1F83, 0x314}, {0x1F83, 0x300}, {0x1F83, 0x345}, {0x1F84, 0x3B1}, {0x1F84, 0x313}, {0x1F84, 0x301}, {0x1F84, 0x345}, {0x1F85, 0x3B1}, {0x1F85, 0x314},
-{0x1F85, 0x301}, {0x1F85, 0x345}, {0x1F86, 0x3B1}, {0x1F86, 0x313}, {0x1F86, 0x342}, {0x1F86, 0x345}, {0x1F87, 0x3B1}, {0x1F87, 0x314}, {0x1F87, 0x342}, {0x1F87, 0x345}, {0x1F88, 0x391},
-{0x1F88, 0x313}, {0x1F88, 0x345}, {0x1F89, 0x391}, {0x1F89, 0x314}, {0x1F89, 0x345}, {0x1F8A, 0x391}, {0x1F8A, 0x313}, {0x1F8A, 0x300}, {0x1F8A, 0x345}, {0x1F8B, 0x391}, {0x1F8B, 0x314},
-{0x1F8B, 0x300}, {0x1F8B, 0x345}, {0x1F8C, 0x391}, {0x1F8C, 0x313}, {0x1F8C, 0x301}, {0x1F8C, 0x345}, {0x1F8D, 0x391}, {0x1F8D, 0x314}, {0x1F8D, 0x301}, {0x1F8D, 0x345}, {0x1F8E, 0x391},
-{0x1F8E, 0x313}, {0x1F8E, 0x342}, {0x1F8E, 0x345}, {0x1F8F, 0x391}, {0x1F8F, 0x314}, {0x1F8F, 0x342}, {0x1F8F, 0x345}, {0x1F90, 0x3B7}, {0x1F90, 0x313}, {0x1F90, 0x345}, {0x1F91, 0x3B7},
-{0x1F91, 0x314}, {0x1F91, 0x345}, {0x1F92, 0x3B7}, {0x1F92, 0x313}, {0x1F92, 0x300}, {0x1F92, 0x345}, {0x1F93, 0x3B7}, {0x1F93, 0x314}, {0x1F93, 0x300}, {0x1F93, 0x345}, {0x1F94, 0x3B7},
-{0x1F94, 0x313}, {0x1F94, 0x301}, {0x1F94, 0x345}, {0x1F95, 0x3B7}, {0x1F95, 0x314}, {0x1F95, 0x301}, {0x1F95, 0x345}, {0x1F96, 0x3B7}, {0x1F96, 0x313}, {0x1F96, 0x342}, {0x1F96, 0x345},
-{0x1F97, 0x3B7}, {0x1F97, 0x314}, {0x1F97, 0x342}, {0x1F97, 0x345}, {0x1F98, 0x397}, {0x1F98, 0x313}, {0x1F98, 0x345}, {0x1F99, 0x397}, {0x1F99, 0x314}, {0x1F99, 0x345}, {0x1F9A, 0x397},
-{0x1F9A, 0x313}, {0x1F9A, 0x300}, {0x1F9A, 0x345}, {0x1F9B, 0x397}, {0x1F9B, 0x314}, {0x1F9B, 0x300}, {0x1F9B, 0x345}, {0x1F9C, 0x397}, {0x1F9C, 0x313}, {0x1F9C, 0x301}, {0x1F9C, 0x345},
-{0x1F9D, 0x397}, {0x1F9D, 0x314}, {0x1F9D, 0x301}, {0x1F9D, 0x345}, {0x1F9E, 0x397}, {0x1F9E, 0x313}, {0x1F9E, 0x342}, {0x1F9E, 0x345}, {0x1F9F, 0x397}, {0x1F9F, 0x314}, {0x1F9F, 0x342},
-{0x1F9F, 0x345}, {0x1FA0, 0x3C9}, {0x1FA0, 0x313}, {0x1FA0, 0x345}, {0x1FA1, 0x3C9}, {0x1FA1, 0x314}, {0x1FA1, 0x345}, {0x1FA2, 0x3C9}, {0x1FA2, 0x313}, {0x1FA2, 0x300}, {0x1FA2, 0x345},
-{0x1FA3, 0x3C9}, {0x1FA3, 0x314}, {0x1FA3, 0x300}, {0x1FA3, 0x345}, {0x1FA4, 0x3C9}, {0x1FA4, 0x313}, {0x1FA4, 0x301}, {0x1FA4, 0x345}, {0x1FA5, 0x3C9}, {0x1FA5, 0x314}, {0x1FA5, 0x301},
-{0x1FA5, 0x345}, {0x1FA6, 0x3C9}, {0x1FA6, 0x313}, {0x1FA6, 0x342}, {0x1FA6, 0x345}, {0x1FA7, 0x3C9}, {0x1FA7, 0x314}, {0x1FA7, 0x342}, {0x1FA7, 0x345}, {0x1FA8, 0x3A9}, {0x1FA8, 0x313},
-{0x1FA8, 0x345}, {0x1FA9, 0x3A9}, {0x1FA9, 0x314}, {0x1FA9, 0x345}, {0x1FAA, 0x3A9}, {0x1FAA, 0x313}, {0x1FAA, 0x300}, {0x1FAA, 0x345}, {0x1FAB, 0x3A9}, {0x1FAB, 0x314}, {0x1FAB, 0x300},
-{0x1FAB, 0x345}, {0x1FAC, 0x3A9}, {0x1FAC, 0x313}, {0x1FAC, 0x301}, {0x1FAC, 0x345}, {0x1FAD, 0x3A9}, {0x1FAD, 0x314}, {0x1FAD, 0x301}, {0x1FAD, 0x345}, {0x1FAE, 0x3A9}, {0x1FAE, 0x313},
-{0x1FAE, 0x342}, {0x1FAE, 0x345}, {0x1FAF, 0x3A9}, {0x1FAF, 0x314}, {0x1FAF, 0x342}, {0x1FAF, 0x345}, {0x1FB0, 0x3B1}, {0x1FB0, 0x306}, {0x1FB1, 0x3B1}, {0x1FB1, 0x304}, {0x1FB2, 0x3B1},
-{0x1FB2, 0x300}, {0x1FB2, 0x345}, {0x1FB3, 0x3B1}, {0x1FB3, 0x345}, {0x1FB4, 0x3B1}, {0x1FB4, 0x301}, {0x1FB4, 0x345}, {0x1FB6, 0x3B1}, {0x1FB6, 0x342}, {0x1FB7, 0x3B1}, {0x1FB7, 0x342},
-{0x1FB7, 0x345}, {0x1FB8, 0x391}, {0x1FB8, 0x306}, {0x1FB9, 0x391}, {0x1FB9, 0x304}, {0x1FBA, 0x391}, {0x1FBA, 0x300}, {0x1FBB, 0x391}, {0x1FBB, 0x301}, {0x1FBC, 0x391}, {0x1FBC, 0x345},
-{0x1FBE, 0x3B9}, {0x1FC1, 0xA8}, {0x1FC1, 0x342}, {0x1FC2, 0x3B7}, {0x1FC2, 0x300}, {0x1FC2, 0x345}, {0x1FC3, 0x3B7}, {0x1FC3, 0x345}, {0x1FC4, 0x3B7}, {0x1FC4, 0x301}, {0x1FC4, 0x345},
-{0x1FC6, 0x3B7}, {0x1FC6, 0x342}, {0x1FC7, 0x3B7}, {0x1FC7, 0x342}, {0x1FC7, 0x345}, {0x1FC8, 0x395}, {0x1FC8, 0x300}, {0x1FC9, 0x395}, {0x1FC9, 0x301}, {0x1FCA, 0x397}, {0x1FCA, 0x300},
-{0x1FCB, 0x397}, {0x1FCB, 0x301}, {0x1FCC, 0x397}, {0x1FCC, 0x345}, {0x1FCD, 0x1FBF}, {0x1FCD, 0x300}, {0x1FCE, 0x1FBF}, {0x1FCE, 0x301}, {0x1FCF, 0x1FBF}, {0x1FCF, 0x342}, {0x1FD0, 0x3B9},
-{0x1FD0, 0x306}, {0x1FD1, 0x3B9}, {0x1FD1, 0x304}, {0x1FD2, 0x3B9}, {0x1FD2, 0x308}, {0x1FD2, 0x300}, {0x1FD3, 0x3B9}, {0x1FD3, 0x308}, {0x1FD3, 0x301}, {0x1FD6, 0x3B9}, {0x1FD6, 0x342},
-{0x1FD7, 0x3B9}, {0x1FD7, 0x308}, {0x1FD7, 0x342}, {0x1FD8, 0x399}, {0x1FD8, 0x306}, {0x1FD9, 0x399}, {0x1FD9, 0x304}, {0x1FDA, 0x399}, {0x1FDA, 0x300}, {0x1FDB, 0x399}, {0x1FDB, 0x301},
-{0x1FDD, 0x1FFE}, {0x1FDD, 0x300}, {0x1FDE, 0x1FFE}, {0x1FDE, 0x301}, {0x1FDF, 0x1FFE}, {0x1FDF, 0x342}, {0x1FE0, 0x3C5}, {0x1FE0, 0x306}, {0x1FE1, 0x3C5}, {0x1FE1, 0x304}, {0x1FE2, 0x3C5},
-{0x1FE2, 0x308}, {0x1FE2, 0x300}, {0x1FE3, 0x3C5}, {0x1FE3, 0x308}, {0x1FE3, 0x301}, {0x1FE4, 0x3C1}, {0x1FE4, 0x313}, {0x1FE5, 0x3C1}, {0x1FE5, 0x314}, {0x1FE6, 0x3C5}, {0x1FE6, 0x342},
-{0x1FE7, 0x3C5}, {0x1FE7, 0x308}, {0x1FE7, 0x342}, {0x1FE8, 0x3A5}, {0x1FE8, 0x306}, {0x1FE9, 0x3A5}, {0x1FE9, 0x304}, {0x1FEA, 0x3A5}, {0x1FEA, 0x300}, {0x1FEB, 0x3A5}, {0x1FEB, 0x301},
-{0x1FEC, 0x3A1}, {0x1FEC, 0x314}, {0x1FED, 0xA8}, {0x1FED, 0x300}, {0x1FEE, 0xA8}, {0x1FEE, 0x301}, {0x1FEF, 0x60}, {0x1FF2, 0x3C9}, {0x1FF2, 0x300}, {0x1FF2, 0x345}, {0x1FF3, 0x3C9}, {0x1FF3, 0x345},
-{0x1FF4, 0x3C9}, {0x1FF4, 0x301}, {0x1FF4, 0x345}, {0x1FF6, 0x3C9}, {0x1FF6, 0x342}, {0x1FF7, 0x3C9}, {0x1FF7, 0x342}, {0x1FF7, 0x345}, {0x1FF8, 0x39F}, {0x1FF8, 0x300}, {0x1FF9, 0x39F},
-{0x1FF9, 0x301}, {0x1FFA, 0x3A9}, {0x1FFA, 0x300}, {0x1FFB, 0x3A9}, {0x1FFB, 0x301}, {0x1FFC, 0x3A9}, {0x1FFC, 0x345}, {0x1FFD, 0xB4}, {0x2000, 0x2002}, {0x2001, 0x2003}, {0x2126, 0x3A9},
-{0x212A, 0x4B}, {0x212B, 0x41}, {0x212B, 0x30A}, {0x219A, 0x2190}, {0x219A, 0x338}, {0x219B, 0x2192}, {0x219B, 0x338}, {0x21AE, 0x2194}, {0x21AE, 0x338}, {0x21CD, 0x21D0}, {0x21CD, 0x338},
-{0x21CE, 0x21D4}, {0x21CE, 0x338}, {0x21CF, 0x21D2}, {0x21CF, 0x338}, {0x2204, 0x2203}, {0x2204, 0x338}, {0x2209, 0x2208}, {0x2209, 0x338}, {0x220C, 0x220B}, {0x220C, 0x338}, {0x2224, 0x2223},
-{0x2224, 0x338}, {0x2226, 0x2225}, {0x2226, 0x338}, {0x2241, 0x223C}, {0x2241, 0x338}, {0x2244, 0x2243}, {0x2244, 0x338}, {0x2247, 0x2245}, {0x2247, 0x338}, {0x2249, 0x2248}, {0x2249, 0x338},
-{0x2260, 0x3D}, {0x2260, 0x338}, {0x2262, 0x2261}, {0x2262, 0x338}, {0x226D, 0x224D}, {0x226D, 0x338}, {0x226E, 0x3C}, {0x226E, 0x338}, {0x226F, 0x3E}, {0x226F, 0x338}, {0x2270, 0x2264},
-{0x2270, 0x338}, {0x2271, 0x2265}, {0x2271, 0x338}, {0x2274, 0x2272}, {0x2274, 0x338}, {0x2275, 0x2273}, {0x2275, 0x338}, {0x2278, 0x2276}, {0x2278, 0x338}, {0x2279, 0x2277}, {0x2279, 0x338},
-{0x2280, 0x227A}, {0x2280, 0x338}, {0x2281, 0x227B}, {0x2281, 0x338}, {0x2284, 0x2282}, {0x2284, 0x338}, {0x2285, 0x2283}, {0x2285, 0x338}, {0x2288, 0x2286}, {0x2288, 0x338}, {0x2289, 0x2287},
-{0x2289, 0x338}, {0x22AC, 0x22A2}, {0x22AC, 0x338}, {0x22AD, 0x22A8}, {0x22AD, 0x338}, {0x22AE, 0x22A9}, {0x22AE, 0x338}, {0x22AF, 0x22AB}, {0x22AF, 0x338}, {0x22E0, 0x227C}, {0x22E0, 0x338},
-{0x22E1, 0x227D}, {0x22E1, 0x338}, {0x22E2, 0x2291}, {0x22E2, 0x338}, {0x22E3, 0x2292}, {0x22E3, 0x338}, {0x22EA, 0x22B2}, {0x22EA, 0x338}, {0x22EB, 0x22B3}, {0x22EB, 0x338}, {0x22EC, 0x22B4},
-{0x22EC, 0x338}, {0x22ED, 0x22B5}, {0x22ED, 0x338}, {0x2329, 0x3008}, {0x232A, 0x3009}, {0x2ADC, 0x2ADD}, {0x2ADC, 0x338}, {0x304C, 0x304B}, {0x304C, 0x3099}, {0x304E, 0x304D}, {0x304E, 0x3099},
-{0x3050, 0x304F}, {0x3050, 0x3099}, {0x3052, 0x3051}, {0x3052, 0x3099}, {0x3054, 0x3053}, {0x3054, 0x3099}, {0x3056, 0x3055}, {0x3056, 0x3099}, {0x3058, 0x3057}, {0x3058, 0x3099}, {0x305A, 0x3059},
-{0x305A, 0x3099}, {0x305C, 0x305B}, {0x305C, 0x3099}, {0x305E, 0x305D}, {0x305E, 0x3099}, {0x3060, 0x305F}, {0x3060, 0x3099}, {0x3062, 0x3061}, {0x3062, 0x3099}, {0x3065, 0x3064}, {0x3065, 0x3099},
-{0x3067, 0x3066}, {0x3067, 0x3099}, {0x3069, 0x3068}, {0x3069, 0x3099}, {0x3070, 0x306F}, {0x3070, 0x3099}, {0x3071, 0x306F}, {0x3071, 0x309A}, {0x3073, 0x3072}, {0x3073, 0x3099}, {0x3074, 0x3072},
-{0x3074, 0x309A}, {0x3076, 0x3075}, {0x3076, 0x3099}, {0x3077, 0x3075}, {0x3077, 0x309A}, {0x3079, 0x3078}, {0x3079, 0x3099}, {0x307A, 0x3078}, {0x307A, 0x309A}, {0x307C, 0x307B}, {0x307C, 0x3099},
-{0x307D, 0x307B}, {0x307D, 0x309A}, {0x3094, 0x3046}, {0x3094, 0x3099}, {0x309E, 0x309D}, {0x309E, 0x3099}, {0x30AC, 0x30AB}, {0x30AC, 0x3099}, {0x30AE, 0x30AD}, {0x30AE, 0x3099}, {0x30B0, 0x30AF},
-{0x30B0, 0x3099}, {0x30B2, 0x30B1}, {0x30B2, 0x3099}, {0x30B4, 0x30B3}, {0x30B4, 0x3099}, {0x30B6, 0x30B5}, {0x30B6, 0x3099}, {0x30B8, 0x30B7}, {0x30B8, 0x3099}, {0x30BA, 0x30B9}, {0x30BA, 0x3099},
-{0x30BC, 0x30BB}, {0x30BC, 0x3099}, {0x30BE, 0x30BD}, {0x30BE, 0x3099}, {0x30C0, 0x30BF}, {0x30C0, 0x3099}, {0x30C2, 0x30C1}, {0x30C2, 0x3099}, {0x30C5, 0x30C4}, {0x30C5, 0x3099}, {0x30C7, 0x30C6},
-{0x30C7, 0x3099}, {0x30C9, 0x30C8}, {0x30C9, 0x3099}, {0x30D0, 0x30CF}, {0x30D0, 0x3099}, {0x30D1, 0x30CF}, {0x30D1, 0x309A}, {0x30D3, 0x30D2}, {0x30D3, 0x3099}, {0x30D4, 0x30D2}, {0x30D4, 0x309A},
-{0x30D6, 0x30D5}, {0x30D6, 0x3099}, {0x30D7, 0x30D5}, {0x30D7, 0x309A}, {0x30D9, 0x30D8}, {0x30D9, 0x3099}, {0x30DA, 0x30D8}, {0x30DA, 0x309A}, {0x30DC, 0x30DB}, {0x30DC, 0x3099}, {0x30DD, 0x30DB},
-{0x30DD, 0x309A}, {0x30F4, 0x30A6}, {0x30F4, 0x3099}, {0x30F7, 0x30EF}, {0x30F7, 0x3099}, {0x30F8, 0x30F0}, {0x30F8, 0x3099}, {0x30F9, 0x30F1}, {0x30F9, 0x3099}, {0x30FA, 0x30F2}, {0x30FA, 0x3099},
-{0x30FE, 0x30FD}, {0x30FE, 0x3099}, {0xF900, 0x8C48}, {0xF901, 0x66F4}, {0xF902, 0x8ECA}, {0xF903, 0x8CC8}, {0xF904, 0x6ED1}, {0xF905, 0x4E32}, {0xF906, 0x53E5}, {0xF907, 0x9F9C}, {0xF908, 0x9F9C},
-{0xF909, 0x5951}, {0xF90A, 0x91D1}, {0xF90B, 0x5587}, {0xF90C, 0x5948}, {0xF90D, 0x61F6}, {0xF90E, 0x7669}, {0xF90F, 0x7F85}, {0xF910, 0x863F}, {0xF911, 0x87BA}, {0xF912, 0x88F8}, {0xF913, 0x908F},
-{0xF914, 0x6A02}, {0xF915, 0x6D1B}, {0xF916, 0x70D9}, {0xF917, 0x73DE}, {0xF918, 0x843D}, {0xF919, 0x916A}, {0xF91A, 0x99F1}, {0xF91B, 0x4E82}, {0xF91C, 0x5375}, {0xF91D, 0x6B04}, {0xF91E, 0x721B},
-{0xF91F, 0x862D}, {0xF920, 0x9E1E}, {0xF921, 0x5D50}, {0xF922, 0x6FEB}, {0xF923, 0x85CD}, {0xF924, 0x8964}, {0xF925, 0x62C9}, {0xF926, 0x81D8}, {0xF927, 0x881F}, {0xF928, 0x5ECA}, {0xF929, 0x6717},
-{0xF92A, 0x6D6A}, {0xF92B, 0x72FC}, {0xF92C, 0x90CE}, {0xF92D, 0x4F86}, {0xF92E, 0x51B7}, {0xF92F, 0x52DE}, {0xF930, 0x64C4}, {0xF931, 0x6AD3}, {0xF932, 0x7210}, {0xF933, 0x76E7}, {0xF934, 0x8001},
-{0xF935, 0x8606}, {0xF936, 0x865C}, {0xF937, 0x8DEF}, {0xF938, 0x9732}, {0xF939, 0x9B6F}, {0xF93A, 0x9DFA}, {0xF93B, 0x788C}, {0xF93C, 0x797F}, {0xF93D, 0x7DA0}, {0xF93E, 0x83C9}, {0xF93F, 0x9304},
-{0xF940, 0x9E7F}, {0xF941, 0x8AD6}, {0xF942, 0x58DF}, {0xF943, 0x5F04}, {0xF944, 0x7C60}, {0xF945, 0x807E}, {0xF946, 0x7262}, {0xF947, 0x78CA}, {0xF948, 0x8CC2}, {0xF949, 0x96F7}, {0xF94A, 0x58D8},
-{0xF94B, 0x5C62}, {0xF94C, 0x6A13}, {0xF94D, 0x6DDA}, {0xF94E, 0x6F0F}, {0xF94F, 0x7D2F}, {0xF950, 0x7E37}, {0xF951, 0x964B}, {0xF952, 0x52D2}, {0xF953, 0x808B}, {0xF954, 0x51DC}, {0xF955, 0x51CC},
-{0xF956, 0x7A1C}, {0xF957, 0x7DBE}, {0xF958, 0x83F1}, {0xF959, 0x9675}, {0xF95A, 0x8B80}, {0xF95B, 0x62CF}, {0xF95C, 0x6A02}, {0xF95D, 0x8AFE}, {0xF95E, 0x4E39}, {0xF95F, 0x5BE7}, {0xF960, 0x6012},
-{0xF961, 0x7387}, {0xF962, 0x7570}, {0xF963, 0x5317}, {0xF964, 0x78FB}, {0xF965, 0x4FBF}, {0xF966, 0x5FA9}, {0xF967, 0x4E0D}, {0xF968, 0x6CCC}, {0xF969, 0x6578}, {0xF96A, 0x7D22}, {0xF96B, 0x53C3},
-{0xF96C, 0x585E}, {0xF96D, 0x7701}, {0xF96E, 0x8449}, {0xF96F, 0x8AAA}, {0xF970, 0x6BBA}, {0xF971, 0x8FB0}, {0xF972, 0x6C88}, {0xF973, 0x62FE}, {0xF974, 0x82E5}, {0xF975, 0x63A0}, {0xF976, 0x7565},
-{0xF977, 0x4EAE}, {0xF978, 0x5169}, {0xF979, 0x51C9}, {0xF97A, 0x6881}, {0xF97B, 0x7CE7}, {0xF97C, 0x826F}, {0xF97D, 0x8AD2}, {0xF97E, 0x91CF}, {0xF97F, 0x52F5}, {0xF980, 0x5442}, {0xF981, 0x5973},
-{0xF982, 0x5EEC}, {0xF983, 0x65C5}, {0xF984, 0x6FFE}, {0xF985, 0x792A}, {0xF986, 0x95AD}, {0xF987, 0x9A6A}, {0xF988, 0x9E97}, {0xF989, 0x9ECE}, {0xF98A, 0x529B}, {0xF98B, 0x66C6}, {0xF98C, 0x6B77},
-{0xF98D, 0x8F62}, {0xF98E, 0x5E74}, {0xF98F, 0x6190}, {0xF990, 0x6200}, {0xF991, 0x649A}, {0xF992, 0x6F23}, {0xF993, 0x7149}, {0xF994, 0x7489}, {0xF995, 0x79CA}, {0xF996, 0x7DF4}, {0xF997, 0x806F},
-{0xF998, 0x8F26}, {0xF999, 0x84EE}, {0xF99A, 0x9023}, {0xF99B, 0x934A}, {0xF99C, 0x5217}, {0xF99D, 0x52A3}, {0xF99E, 0x54BD}, {0xF99F, 0x70C8}, {0xF9A0, 0x88C2}, {0xF9A1, 0x8AAA}, {0xF9A2, 0x5EC9},
-{0xF9A3, 0x5FF5}, {0xF9A4, 0x637B}, {0xF9A5, 0x6BAE}, {0xF9A6, 0x7C3E}, {0xF9A7, 0x7375}, {0xF9A8, 0x4EE4}, {0xF9A9, 0x56F9}, {0xF9AA, 0x5BE7}, {0xF9AB, 0x5DBA}, {0xF9AC, 0x601C}, {0xF9AD, 0x73B2},
-{0xF9AE, 0x7469}, {0xF9AF, 0x7F9A}, {0xF9B0, 0x8046}, {0xF9B1, 0x9234}, {0xF9B2, 0x96F6}, {0xF9B3, 0x9748}, {0xF9B4, 0x9818}, {0xF9B5, 0x4F8B}, {0xF9B6, 0x79AE}, {0xF9B7, 0x91B4}, {0xF9B8, 0x96B8},
-{0xF9B9, 0x60E1}, {0xF9BA, 0x4E86}, {0xF9BB, 0x50DA}, {0xF9BC, 0x5BEE}, {0xF9BD, 0x5C3F}, {0xF9BE, 0x6599}, {0xF9BF, 0x6A02}, {0xF9C0, 0x71CE}, {0xF9C1, 0x7642}, {0xF9C2, 0x84FC}, {0xF9C3, 0x907C},
-{0xF9C4, 0x9F8D}, {0xF9C5, 0x6688}, {0xF9C6, 0x962E}, {0xF9C7, 0x5289}, {0xF9C8, 0x677B}, {0xF9C9, 0x67F3}, {0xF9CA, 0x6D41}, {0xF9CB, 0x6E9C}, {0xF9CC, 0x7409}, {0xF9CD, 0x7559}, {0xF9CE, 0x786B},
-{0xF9CF, 0x7D10}, {0xF9D0, 0x985E}, {0xF9D1, 0x516D}, {0xF9D2, 0x622E}, {0xF9D3, 0x9678}, {0xF9D4, 0x502B}, {0xF9D5, 0x5D19}, {0xF9D6, 0x6DEA}, {0xF9D7, 0x8F2A}, {0xF9D8, 0x5F8B}, {0xF9D9, 0x6144},
-{0xF9DA, 0x6817}, {0xF9DB, 0x7387}, {0xF9DC, 0x9686}, {0xF9DD, 0x5229}, {0xF9DE, 0x540F}, {0xF9DF, 0x5C65}, {0xF9E0, 0x6613}, {0xF9E1, 0x674E}, {0xF9E2, 0x68A8}, {0xF9E3, 0x6CE5}, {0xF9E4, 0x7406},
-{0xF9E5, 0x75E2}, {0xF9E6, 0x7F79}, {0xF9E7, 0x88CF}, {0xF9E8, 0x88E1}, {0xF9E9, 0x91CC}, {0xF9EA, 0x96E2}, {0xF9EB, 0x533F}, {0xF9EC, 0x6EBA}, {0xF9ED, 0x541D}, {0xF9EE, 0x71D0}, {0xF9EF, 0x7498},
-{0xF9F0, 0x85FA}, {0xF9F1, 0x96A3}, {0xF9F2, 0x9C57}, {0xF9F3, 0x9E9F}, {0xF9F4, 0x6797}, {0xF9F5, 0x6DCB}, {0xF9F6, 0x81E8}, {0xF9F7, 0x7ACB}, {0xF9F8, 0x7B20}, {0xF9F9, 0x7C92}, {0xF9FA, 0x72C0},
-{0xF9FB, 0x7099}, {0xF9FC, 0x8B58}, {0xF9FD, 0x4EC0}, {0xF9FE, 0x8336}, {0xF9FF, 0x523A}, {0xFA00, 0x5207}, {0xFA01, 0x5EA6}, {0xFA02, 0x62D3}, {0xFA03, 0x7CD6}, {0xFA04, 0x5B85}, {0xFA05, 0x6D1E},
-{0xFA06, 0x66B4}, {0xFA07, 0x8F3B}, {0xFA08, 0x884C}, {0xFA09, 0x964D}, {0xFA0A, 0x898B}, {0xFA0B, 0x5ED3}, {0xFA0C, 0x5140}, {0xFA0D, 0x55C0}, {0xFA10, 0x585A}, {0xFA12, 0x6674}, {0xFA15, 0x51DE},
-{0xFA16, 0x732A}, {0xFA17, 0x76CA}, {0xFA18, 0x793C}, {0xFA19, 0x795E}, {0xFA1A, 0x7965}, {0xFA1B, 0x798F}, {0xFA1C, 0x9756}, {0xFA1D, 0x7CBE}, {0xFA1E, 0x7FBD}, {0xFA20, 0x8612}, {0xFA22, 0x8AF8},
-{0xFA25, 0x9038}, {0xFA26, 0x90FD}, {0xFA2A, 0x98EF}, {0xFA2B, 0x98FC}, {0xFA2C, 0x9928}, {0xFA2D, 0x9DB4}, {0xFA2E, 0x90DE}, {0xFA2F, 0x96B7}, {0xFA30, 0x4FAE}, {0xFA31, 0x50E7}, {0xFA32, 0x514D},
-{0xFA33, 0x52C9}, {0xFA34, 0x52E4}, {0xFA35, 0x5351}, {0xFA36, 0x559D}, {0xFA37, 0x5606}, {0xFA38, 0x5668}, {0xFA39, 0x5840}, {0xFA3A, 0x58A8}, {0xFA3B, 0x5C64}, {0xFA3C, 0x5C6E}, {0xFA3D, 0x6094},
-{0xFA3E, 0x6168}, {0xFA3F, 0x618E}, {0xFA40, 0x61F2}, {0xFA41, 0x654F}, {0xFA42, 0x65E2}, {0xFA43, 0x6691}, {0xFA44, 0x6885}, {0xFA45, 0x6D77}, {0xFA46, 0x6E1A}, {0xFA47, 0x6F22}, {0xFA48, 0x716E},
-{0xFA49, 0x722B}, {0xFA4A, 0x7422}, {0xFA4B, 0x7891}, {0xFA4C, 0x793E}, {0xFA4D, 0x7949}, {0xFA4E, 0x7948}, {0xFA4F, 0x7950}, {0xFA50, 0x7956}, {0xFA51, 0x795D}, {0xFA52, 0x798D}, {0xFA53, 0x798E},
-{0xFA54, 0x7A40}, {0xFA55, 0x7A81}, {0xFA56, 0x7BC0}, {0xFA57, 0x7DF4}, {0xFA58, 0x7E09}, {0xFA59, 0x7E41}, {0xFA5A, 0x7F72}, {0xFA5B, 0x8005}, {0xFA5C, 0x81ED}, {0xFA5D, 0x8279}, {0xFA5E, 0x8279},
-{0xFA5F, 0x8457}, {0xFA60, 0x8910}, {0xFA61, 0x8996}, {0xFA62, 0x8B01}, {0xFA63, 0x8B39}, {0xFA64, 0x8CD3}, {0xFA65, 0x8D08}, {0xFA66, 0x8FB6}, {0xFA67, 0x9038}, {0xFA68, 0x96E3}, {0xFA69, 0x97FF},
-{0xFA6A, 0x983B}, {0xFA6B, 0x6075}, {0xFA6C, 0x242EE}, {0xFA6D, 0x8218}, {0xFA70, 0x4E26}, {0xFA71, 0x51B5}, {0xFA72, 0x5168}, {0xFA73, 0x4F80}, {0xFA74, 0x5145}, {0xFA75, 0x5180}, {0xFA76, 0x52C7},
-{0xFA77, 0x52FA}, {0xFA78, 0x559D}, {0xFA79, 0x5555}, {0xFA7A, 0x5599}, {0xFA7B, 0x55E2}, {0xFA7C, 0x585A}, {0xFA7D, 0x58B3}, {0xFA7E, 0x5944}, {0xFA7F, 0x5954}, {0xFA80, 0x5A62}, {0xFA81, 0x5B28},
-{0xFA82, 0x5ED2}, {0xFA83, 0x5ED9}, {0xFA84, 0x5F69}, {0xFA85, 0x5FAD}, {0xFA86, 0x60D8}, {0xFA87, 0x614E}, {0xFA88, 0x6108}, {0xFA89, 0x618E}, {0xFA8A, 0x6160}, {0xFA8B, 0x61F2}, {0xFA8C, 0x6234},
-{0xFA8D, 0x63C4}, {0xFA8E, 0x641C}, {0xFA8F, 0x6452}, {0xFA90, 0x6556}, {0xFA91, 0x6674}, {0xFA92, 0x6717}, {0xFA93, 0x671B}, {0xFA94, 0x6756}, {0xFA95, 0x6B79}, {0xFA96, 0x6BBA}, {0xFA97, 0x6D41},
-{0xFA98, 0x6EDB}, {0xFA99, 0x6ECB}, {0xFA9A, 0x6F22}, {0xFA9B, 0x701E}, {0xFA9C, 0x716E}, {0xFA9D, 0x77A7}, {0xFA9E, 0x7235}, {0xFA9F, 0x72AF}, {0xFAA0, 0x732A}, {0xFAA1, 0x7471}, {0xFAA2, 0x7506},
-{0xFAA3, 0x753B}, {0xFAA4, 0x761D}, {0xFAA5, 0x761F}, {0xFAA6, 0x76CA}, {0xFAA7, 0x76DB}, {0xFAA8, 0x76F4}, {0xFAA9, 0x774A}, {0xFAAA, 0x7740}, {0xFAAB, 0x78CC}, {0xFAAC, 0x7AB1}, {0xFAAD, 0x7BC0},
-{0xFAAE, 0x7C7B}, {0xFAAF, 0x7D5B}, {0xFAB0, 0x7DF4}, {0xFAB1, 0x7F3E}, {0xFAB2, 0x8005}, {0xFAB3, 0x8352}, {0xFAB4, 0x83EF}, {0xFAB5, 0x8779}, {0xFAB6, 0x8941}, {0xFAB7, 0x8986}, {0xFAB8, 0x8996},
-{0xFAB9, 0x8ABF}, {0xFABA, 0x8AF8}, {0xFABB, 0x8ACB}, {0xFABC, 0x8B01}, {0xFABD, 0x8AFE}, {0xFABE, 0x8AED}, {0xFABF, 0x8B39}, {0xFAC0, 0x8B8A}, {0xFAC1, 0x8D08}, {0xFAC2, 0x8F38}, {0xFAC3, 0x9072},
-{0xFAC4, 0x9199}, {0xFAC5, 0x9276}, {0xFAC6, 0x967C}, {0xFAC7, 0x96E3}, {0xFAC8, 0x9756}, {0xFAC9, 0x97DB}, {0xFACA, 0x97FF}, {0xFACB, 0x980B}, {0xFACC, 0x983B}, {0xFACD, 0x9B12}, {0xFACE, 0x9F9C},
-{0xFACF, 0x2284A}, {0xFAD0, 0x22844}, {0xFAD1, 0x233D5}, {0xFAD2, 0x3B9D}, {0xFAD3, 0x4018}, {0xFAD4, 0x4039}, {0xFAD5, 0x25249}, {0xFAD6, 0x25CD0}, {0xFAD7, 0x27ED3}, {0xFAD8, 0x9F43},
-{0xFAD9, 0x9F8E}, {0xFB1D, 0x5D9}, {0xFB1D, 0x5B4}, {0xFB1F, 0x5F2}, {0xFB1F, 0x5B7}, {0xFB2A, 0x5E9}, {0xFB2A, 0x5C1}, {0xFB2B, 0x5E9}, {0xFB2B, 0x5C2}, {0xFB2C, 0x5E9}, {0xFB2C, 0x5BC},
-{0xFB2C, 0x5C1}, {0xFB2D, 0x5E9}, {0xFB2D, 0x5BC}, {0xFB2D, 0x5C2}, {0xFB2E, 0x5D0}, {0xFB2E, 0x5B7}, {0xFB2F, 0x5D0}, {0xFB2F, 0x5B8}, {0xFB30, 0x5D0}, {0xFB30, 0x5BC}, {0xFB31, 0x5D1},
-{0xFB31, 0x5BC}, {0xFB32, 0x5D2}, {0xFB32, 0x5BC}, {0xFB33, 0x5D3}, {0xFB33, 0x5BC}, {0xFB34, 0x5D4}, {0xFB34, 0x5BC}, {0xFB35, 0x5D5}, {0xFB35, 0x5BC}, {0xFB36, 0x5D6}, {0xFB36, 0x5BC},
-{0xFB38, 0x5D8}, {0xFB38, 0x5BC}, {0xFB39, 0x5D9}, {0xFB39, 0x5BC}, {0xFB3A, 0x5DA}, {0xFB3A, 0x5BC}, {0xFB3B, 0x5DB}, {0xFB3B, 0x5BC}, {0xFB3C, 0x5DC}, {0xFB3C, 0x5BC}, {0xFB3E, 0x5DE},
-{0xFB3E, 0x5BC}, {0xFB40, 0x5E0}, {0xFB40, 0x5BC}, {0xFB41, 0x5E1}, {0xFB41, 0x5BC}, {0xFB43, 0x5E3}, {0xFB43, 0x5BC}, {0xFB44, 0x5E4}, {0xFB44, 0x5BC}, {0xFB46, 0x5E6}, {0xFB46, 0x5BC},
-{0xFB47, 0x5E7}, {0xFB47, 0x5BC}, {0xFB48, 0x5E8}, {0xFB48, 0x5BC}, {0xFB49, 0x5E9}, {0xFB49, 0x5BC}, {0xFB4A, 0x5EA}, {0xFB4A, 0x5BC}, {0xFB4B, 0x5D5}, {0xFB4B, 0x5B9}, {0xFB4C, 0x5D1},
-{0xFB4C, 0x5BF}, {0xFB4D, 0x5DB}, {0xFB4D, 0x5BF}, {0xFB4E, 0x5E4}, {0xFB4E, 0x5BF}, {0x1109A, 0x11099}, {0x1109A, 0x110BA}, {0x1109C, 0x1109B}, {0x1109C, 0x110BA}, {0x110AB, 0x110A5},
-{0x110AB, 0x110BA}, {0x1112E, 0x11131}, {0x1112E, 0x11127}, {0x1112F, 0x11132}, {0x1112F, 0x11127}, {0x1134B, 0x11347}, {0x1134B, 0x1133E}, {0x1134C, 0x11347}, {0x1134C, 0x11357}, {0x114BB, 0x114B9},
-{0x114BB, 0x114BA}, {0x114BC, 0x114B9}, {0x114BC, 0x114B0}, {0x114BE, 0x114B9}, {0x114BE, 0x114BD}, {0x115BA, 0x115B8}, {0x115BA, 0x115AF}, {0x115BB, 0x115B9}, {0x115BB, 0x115AF}, {0x1D15E, 0x1D157},
-{0x1D15E, 0x1D165}, {0x1D15F, 0x1D158}, {0x1D15F, 0x1D165}, {0x1D160, 0x1D158}, {0x1D160, 0x1D165}, {0x1D160, 0x1D16E}, {0x1D161, 0x1D158}, {0x1D161, 0x1D165}, {0x1D161, 0x1D16F}, {0x1D162, 0x1D158},
-{0x1D162, 0x1D165}, {0x1D162, 0x1D170}, {0x1D163, 0x1D158}, {0x1D163, 0x1D165}, {0x1D163, 0x1D171}, {0x1D164, 0x1D158}, {0x1D164, 0x1D165}, {0x1D164, 0x1D172}, {0x1D1BB, 0x1D1B9}, {0x1D1BB, 0x1D165},
-{0x1D1BC, 0x1D1BA}, {0x1D1BC, 0x1D165}, {0x1D1BD, 0x1D1B9}, {0x1D1BD, 0x1D165}, {0x1D1BD, 0x1D16E}, {0x1D1BE, 0x1D1BA}, {0x1D1BE, 0x1D165}, {0x1D1BE, 0x1D16E}, {0x1D1BF, 0x1D1B9}, {0x1D1BF, 0x1D165},
-{0x1D1BF, 0x1D16F}, {0x1D1C0, 0x1D1BA}, {0x1D1C0, 0x1D165}, {0x1D1C0, 0x1D16F}, {0x2F800, 0x4E3D}, {0x2F801, 0x4E38}, {0x2F802, 0x4E41}, {0x2F803, 0x20122}, {0x2F804, 0x4F60}, {0x2F805, 0x4FAE},
-{0x2F806, 0x4FBB}, {0x2F807, 0x5002}, {0x2F808, 0x507A}, {0x2F809, 0x5099}, {0x2F80A, 0x50E7}, {0x2F80B, 0x50CF}, {0x2F80C, 0x349E}, {0x2F80D, 0x2063A}, {0x2F80E, 0x514D}, {0x2F80F, 0x5154},
-{0x2F810, 0x5164}, {0x2F811, 0x5177}, {0x2F812, 0x2051C}, {0x2F813, 0x34B9}, {0x2F814, 0x5167}, {0x2F815, 0x518D}, {0x2F816, 0x2054B}, {0x2F817, 0x5197}, {0x2F818, 0x51A4}, {0x2F819, 0x4ECC},
-{0x2F81A, 0x51AC}, {0x2F81B, 0x51B5}, {0x2F81C, 0x291DF}, {0x2F81D, 0x51F5}, {0x2F81E, 0x5203}, {0x2F81F, 0x34DF}, {0x2F820, 0x523B}, {0x2F821, 0x5246}, {0x2F822, 0x5272}, {0x2F823, 0x5277},
-{0x2F824, 0x3515}, {0x2F825, 0x52C7}, {0x2F826, 0x52C9}, {0x2F827, 0x52E4}, {0x2F828, 0x52FA}, {0x2F829, 0x5305}, {0x2F82A, 0x5306}, {0x2F82B, 0x5317}, {0x2F82C, 0x5349}, {0x2F82D, 0x5351},
-{0x2F82E, 0x535A}, {0x2F82F, 0x5373}, {0x2F830, 0x537D}, {0x2F831, 0x537F}, {0x2F832, 0x537F}, {0x2F833, 0x537F}, {0x2F834, 0x20A2C}, {0x2F835, 0x7070}, {0x2F836, 0x53CA}, {0x2F837, 0x53DF},
-{0x2F838, 0x20B63}, {0x2F839, 0x53EB}, {0x2F83A, 0x53F1}, {0x2F83B, 0x5406}, {0x2F83C, 0x549E}, {0x2F83D, 0x5438}, {0x2F83E, 0x5448}, {0x2F83F, 0x5468}, {0x2F840, 0x54A2}, {0x2F841, 0x54F6},
-{0x2F842, 0x5510}, {0x2F843, 0x5553}, {0x2F844, 0x5563}, {0x2F845, 0x5584}, {0x2F846, 0x5584}, {0x2F847, 0x5599}, {0x2F848, 0x55AB}, {0x2F849, 0x55B3}, {0x2F84A, 0x55C2}, {0x2F84B, 0x5716},
-{0x2F84C, 0x5606}, {0x2F84D, 0x5717}, {0x2F84E, 0x5651}, {0x2F84F, 0x5674}, {0x2F850, 0x5207}, {0x2F851, 0x58EE}, {0x2F852, 0x57CE}, {0x2F853, 0x57F4}, {0x2F854, 0x580D}, {0x2F855, 0x578B},
-{0x2F856, 0x5832}, {0x2F857, 0x5831}, {0x2F858, 0x58AC}, {0x2F859, 0x214E4}, {0x2F85A, 0x58F2}, {0x2F85B, 0x58F7}, {0x2F85C, 0x5906}, {0x2F85D, 0x591A}, {0x2F85E, 0x5922}, {0x2F85F, 0x5962},
-{0x2F860, 0x216A8}, {0x2F861, 0x216EA}, {0x2F862, 0x59EC}, {0x2F863, 0x5A1B}, {0x2F864, 0x5A27}, {0x2F865, 0x59D8}, {0x2F866, 0x5A66}, {0x2F867, 0x36EE}, {0x2F868, 0x36FC}, {0x2F869, 0x5B08},
-{0x2F86A, 0x5B3E}, {0x2F86B, 0x5B3E}, {0x2F86C, 0x219C8}, {0x2F86D, 0x5BC3}, {0x2F86E, 0x5BD8}, {0x2F86F, 0x5BE7}, {0x2F870, 0x5BF3}, {0x2F871, 0x21B18}, {0x2F872, 0x5BFF}, {0x2F873, 0x5C06},
-{0x2F874, 0x5F53}, {0x2F875, 0x5C22}, {0x2F876, 0x3781}, {0x2F877, 0x5C60}, {0x2F878, 0x5C6E}, {0x2F879, 0x5CC0}, {0x2F87A, 0x5C8D}, {0x2F87B, 0x21DE4}, {0x2F87C, 0x5D43}, {0x2F87D, 0x21DE6},
-{0x2F87E, 0x5D6E}, {0x2F87F, 0x5D6B}, {0x2F880, 0x5D7C}, {0x2F881, 0x5DE1}, {0x2F882, 0x5DE2}, {0x2F883, 0x382F}, {0x2F884, 0x5DFD}, {0x2F885, 0x5E28}, {0x2F886, 0x5E3D}, {0x2F887, 0x5E69},
-{0x2F888, 0x3862}, {0x2F889, 0x22183}, {0x2F88A, 0x387C}, {0x2F88B, 0x5EB0}, {0x2F88C, 0x5EB3}, {0x2F88D, 0x5EB6}, {0x2F88E, 0x5ECA}, {0x2F88F, 0x2A392}, {0x2F890, 0x5EFE}, {0x2F891, 0x22331},
-{0x2F892, 0x22331}, {0x2F893, 0x8201}, {0x2F894, 0x5F22}, {0x2F895, 0x5F22}, {0x2F896, 0x38C7}, {0x2F897, 0x232B8}, {0x2F898, 0x261DA}, {0x2F899, 0x5F62}, {0x2F89A, 0x5F6B}, {0x2F89B, 0x38E3},
-{0x2F89C, 0x5F9A}, {0x2F89D, 0x5FCD}, {0x2F89E, 0x5FD7}, {0x2F89F, 0x5FF9}, {0x2F8A0, 0x6081}, {0x2F8A1, 0x393A}, {0x2F8A2, 0x391C}, {0x2F8A3, 0x6094}, {0x2F8A4, 0x226D4}, {0x2F8A5, 0x60C7},
-{0x2F8A6, 0x6148}, {0x2F8A7, 0x614C}, {0x2F8A8, 0x614E}, {0x2F8A9, 0x614C}, {0x2F8AA, 0x617A}, {0x2F8AB, 0x618E}, {0x2F8AC, 0x61B2}, {0x2F8AD, 0x61A4}, {0x2F8AE, 0x61AF}, {0x2F8AF, 0x61DE},
-{0x2F8B0, 0x61F2}, {0x2F8B1, 0x61F6}, {0x2F8B2, 0x6210}, {0x2F8B3, 0x621B}, {0x2F8B4, 0x625D}, {0x2F8B5, 0x62B1}, {0x2F8B6, 0x62D4}, {0x2F8B7, 0x6350}, {0x2F8B8, 0x22B0C}, {0x2F8B9, 0x633D},
-{0x2F8BA, 0x62FC}, {0x2F8BB, 0x6368}, {0x2F8BC, 0x6383}, {0x2F8BD, 0x63E4}, {0x2F8BE, 0x22BF1}, {0x2F8BF, 0x6422}, {0x2F8C0, 0x63C5}, {0x2F8C1, 0x63A9}, {0x2F8C2, 0x3A2E}, {0x2F8C3, 0x6469},
-{0x2F8C4, 0x647E}, {0x2F8C5, 0x649D}, {0x2F8C6, 0x6477}, {0x2F8C7, 0x3A6C}, {0x2F8C8, 0x654F}, {0x2F8C9, 0x656C}, {0x2F8CA, 0x2300A}, {0x2F8CB, 0x65E3}, {0x2F8CC, 0x66F8}, {0x2F8CD, 0x6649},
-{0x2F8CE, 0x3B19}, {0x2F8CF, 0x6691}, {0x2F8D0, 0x3B08}, {0x2F8D1, 0x3AE4}, {0x2F8D2, 0x5192}, {0x2F8D3, 0x5195}, {0x2F8D4, 0x6700}, {0x2F8D5, 0x669C}, {0x2F8D6, 0x80AD}, {0x2F8D7, 0x43D9},
-{0x2F8D8, 0x6717}, {0x2F8D9, 0x671B}, {0x2F8DA, 0x6721}, {0x2F8DB, 0x675E}, {0x2F8DC, 0x6753}, {0x2F8DD, 0x233C3}, {0x2F8DE, 0x3B49}, {0x2F8DF, 0x67FA}, {0x2F8E0, 0x6785}, {0x2F8E1, 0x6852},
-{0x2F8E2, 0x6885}, {0x2F8E3, 0x2346D}, {0x2F8E4, 0x688E}, {0x2F8E5, 0x681F}, {0x2F8E6, 0x6914}, {0x2F8E7, 0x3B9D}, {0x2F8E8, 0x6942}, {0x2F8E9, 0x69A3}, {0x2F8EA, 0x69EA}, {0x2F8EB, 0x6AA8},
-{0x2F8EC, 0x236A3}, {0x2F8ED, 0x6ADB}, {0x2F8EE, 0x3C18}, {0x2F8EF, 0x6B21}, {0x2F8F0, 0x238A7}, {0x2F8F1, 0x6B54}, {0x2F8F2, 0x3C4E}, {0x2F8F3, 0x6B72}, {0x2F8F4, 0x6B9F}, {0x2F8F5, 0x6BBA},
-{0x2F8F6, 0x6BBB}, {0x2F8F7, 0x23A8D}, {0x2F8F8, 0x21D0B}, {0x2F8F9, 0x23AFA}, {0x2F8FA, 0x6C4E}, {0x2F8FB, 0x23CBC}, {0x2F8FC, 0x6CBF}, {0x2F8FD, 0x6CCD}, {0x2F8FE, 0x6C67}, {0x2F8FF, 0x6D16},
-{0x2F900, 0x6D3E}, {0x2F901, 0x6D77}, {0x2F902, 0x6D41}, {0x2F903, 0x6D69}, {0x2F904, 0x6D78}, {0x2F905, 0x6D85}, {0x2F906, 0x23D1E}, {0x2F907, 0x6D34}, {0x2F908, 0x6E2F}, {0x2F909, 0x6E6E},
-{0x2F90A, 0x3D33}, {0x2F90B, 0x6ECB}, {0x2F90C, 0x6EC7}, {0x2F90D, 0x23ED1}, {0x2F90E, 0x6DF9}, {0x2F90F, 0x6F6E}, {0x2F910, 0x23F5E}, {0x2F911, 0x23F8E}, {0x2F912, 0x6FC6}, {0x2F913, 0x7039},
-{0x2F914, 0x701E}, {0x2F915, 0x701B}, {0x2F916, 0x3D96}, {0x2F917, 0x704A}, {0x2F918, 0x707D}, {0x2F919, 0x7077}, {0x2F91A, 0x70AD}, {0x2F91B, 0x20525}, {0x2F91C, 0x7145}, {0x2F91D, 0x24263},
-{0x2F91E, 0x719C}, {0x2F91F, 0x243AB}, {0x2F920, 0x7228}, {0x2F921, 0x7235}, {0x2F922, 0x7250}, {0x2F923, 0x24608}, {0x2F924, 0x7280}, {0x2F925, 0x7295}, {0x2F926, 0x24735}, {0x2F927, 0x24814},
-{0x2F928, 0x737A}, {0x2F929, 0x738B}, {0x2F92A, 0x3EAC}, {0x2F92B, 0x73A5}, {0x2F92C, 0x3EB8}, {0x2F92D, 0x3EB8}, {0x2F92E, 0x7447}, {0x2F92F, 0x745C}, {0x2F930, 0x7471}, {0x2F931, 0x7485},
-{0x2F932, 0x74CA}, {0x2F933, 0x3F1B}, {0x2F934, 0x7524}, {0x2F935, 0x24C36}, {0x2F936, 0x753E}, {0x2F937, 0x24C92}, {0x2F938, 0x7570}, {0x2F939, 0x2219F}, {0x2F93A, 0x7610}, {0x2F93B, 0x24FA1},
-{0x2F93C, 0x24FB8}, {0x2F93D, 0x25044}, {0x2F93E, 0x3FFC}, {0x2F93F, 0x4008}, {0x2F940, 0x76F4}, {0x2F941, 0x250F3}, {0x2F942, 0x250F2}, {0x2F943, 0x25119}, {0x2F944, 0x25133}, {0x2F945, 0x771E},
-{0x2F946, 0x771F}, {0x2F947, 0x771F}, {0x2F948, 0x774A}, {0x2F949, 0x4039}, {0x2F94A, 0x778B}, {0x2F94B, 0x4046}, {0x2F94C, 0x4096}, {0x2F94D, 0x2541D}, {0x2F94E, 0x784E}, {0x2F94F, 0x788C},
-{0x2F950, 0x78CC}, {0x2F951, 0x40E3}, {0x2F952, 0x25626}, {0x2F953, 0x7956}, {0x2F954, 0x2569A}, {0x2F955, 0x256C5}, {0x2F956, 0x798F}, {0x2F957, 0x79EB}, {0x2F958, 0x412F}, {0x2F959, 0x7A40},
-{0x2F95A, 0x7A4A}, {0x2F95B, 0x7A4F}, {0x2F95C, 0x2597C}, {0x2F95D, 0x25AA7}, {0x2F95E, 0x25AA7}, {0x2F95F, 0x7AEE}, {0x2F960, 0x4202}, {0x2F961, 0x25BAB}, {0x2F962, 0x7BC6}, {0x2F963, 0x7BC9},
-{0x2F964, 0x4227}, {0x2F965, 0x25C80}, {0x2F966, 0x7CD2}, {0x2F967, 0x42A0}, {0x2F968, 0x7CE8}, {0x2F969, 0x7CE3}, {0x2F96A, 0x7D00}, {0x2F96B, 0x25F86}, {0x2F96C, 0x7D63}, {0x2F96D, 0x4301},
-{0x2F96E, 0x7DC7}, {0x2F96F, 0x7E02}, {0x2F970, 0x7E45}, {0x2F971, 0x4334}, {0x2F972, 0x26228}, {0x2F973, 0x26247}, {0x2F974, 0x4359}, {0x2F975, 0x262D9}, {0x2F976, 0x7F7A}, {0x2F977, 0x2633E},
-{0x2F978, 0x7F95}, {0x2F979, 0x7FFA}, {0x2F97A, 0x8005}, {0x2F97B, 0x264DA}, {0x2F97C, 0x26523}, {0x2F97D, 0x8060}, {0x2F97E, 0x265A8}, {0x2F97F, 0x8070}, {0x2F980, 0x2335F}, {0x2F981, 0x43D5},
-{0x2F982, 0x80B2}, {0x2F983, 0x8103}, {0x2F984, 0x440B}, {0x2F985, 0x813E}, {0x2F986, 0x5AB5}, {0x2F987, 0x267A7}, {0x2F988, 0x267B5}, {0x2F989, 0x23393}, {0x2F98A, 0x2339C}, {0x2F98B, 0x8201},
-{0x2F98C, 0x8204}, {0x2F98D, 0x8F9E}, {0x2F98E, 0x446B}, {0x2F98F, 0x8291}, {0x2F990, 0x828B}, {0x2F991, 0x829D}, {0x2F992, 0x52B3}, {0x2F993, 0x82B1}, {0x2F994, 0x82B3}, {0x2F995, 0x82BD},
-{0x2F996, 0x82E6}, {0x2F997, 0x26B3C}, {0x2F998, 0x82E5}, {0x2F999, 0x831D}, {0x2F99A, 0x8363}, {0x2F99B, 0x83AD}, {0x2F99C, 0x8323}, {0x2F99D, 0x83BD}, {0x2F99E, 0x83E7}, {0x2F99F, 0x8457},
-{0x2F9A0, 0x8353}, {0x2F9A1, 0x83CA}, {0x2F9A2, 0x83CC}, {0x2F9A3, 0x83DC}, {0x2F9A4, 0x26C36}, {0x2F9A5, 0x26D6B}, {0x2F9A6, 0x26CD5}, {0x2F9A7, 0x452B}, {0x2F9A8, 0x84F1}, {0x2F9A9, 0x84F3},
-{0x2F9AA, 0x8516}, {0x2F9AB, 0x273CA}, {0x2F9AC, 0x8564}, {0x2F9AD, 0x26F2C}, {0x2F9AE, 0x455D}, {0x2F9AF, 0x4561}, {0x2F9B0, 0x26FB1}, {0x2F9B1, 0x270D2}, {0x2F9B2, 0x456B}, {0x2F9B3, 0x8650},
-{0x2F9B4, 0x865C}, {0x2F9B5, 0x8667}, {0x2F9B6, 0x8669}, {0x2F9B7, 0x86A9}, {0x2F9B8, 0x8688}, {0x2F9B9, 0x870E}, {0x2F9BA, 0x86E2}, {0x2F9BB, 0x8779}, {0x2F9BC, 0x8728}, {0x2F9BD, 0x876B},
-{0x2F9BE, 0x8786}, {0x2F9BF, 0x45D7}, {0x2F9C0, 0x87E1}, {0x2F9C1, 0x8801}, {0x2F9C2, 0x45F9}, {0x2F9C3, 0x8860}, {0x2F9C4, 0x8863}, {0x2F9C5, 0x27667}, {0x2F9C6, 0x88D7}, {0x2F9C7, 0x88DE},
-{0x2F9C8, 0x4635}, {0x2F9C9, 0x88FA}, {0x2F9CA, 0x34BB}, {0x2F9CB, 0x278AE}, {0x2F9CC, 0x27966}, {0x2F9CD, 0x46BE}, {0x2F9CE, 0x46C7}, {0x2F9CF, 0x8AA0}, {0x2F9D0, 0x8AED}, {0x2F9D1, 0x8B8A},
-{0x2F9D2, 0x8C55}, {0x2F9D3, 0x27CA8}, {0x2F9D4, 0x8CAB}, {0x2F9D5, 0x8CC1}, {0x2F9D6, 0x8D1B}, {0x2F9D7, 0x8D77}, {0x2F9D8, 0x27F2F}, {0x2F9D9, 0x20804}, {0x2F9DA, 0x8DCB}, {0x2F9DB, 0x8DBC},
-{0x2F9DC, 0x8DF0}, {0x2F9DD, 0x208DE}, {0x2F9DE, 0x8ED4}, {0x2F9DF, 0x8F38}, {0x2F9E0, 0x285D2}, {0x2F9E1, 0x285ED}, {0x2F9E2, 0x9094}, {0x2F9E3, 0x90F1}, {0x2F9E4, 0x9111}, {0x2F9E5, 0x2872E},
-{0x2F9E6, 0x911B}, {0x2F9E7, 0x9238}, {0x2F9E8, 0x92D7}, {0x2F9E9, 0x92D8}, {0x2F9EA, 0x927C}, {0x2F9EB, 0x93F9}, {0x2F9EC, 0x9415}, {0x2F9ED, 0x28BFA}, {0x2F9EE, 0x958B}, {0x2F9EF, 0x4995},
-{0x2F9F0, 0x95B7}, {0x2F9F1, 0x28D77}, {0x2F9F2, 0x49E6}, {0x2F9F3, 0x96C3}, {0x2F9F4, 0x5DB2}, {0x2F9F5, 0x9723}, {0x2F9F6, 0x29145}, {0x2F9F7, 0x2921A}, {0x2F9F8, 0x4A6E}, {0x2F9F9, 0x4A76},
-{0x2F9FA, 0x97E0}, {0x2F9FB, 0x2940A}, {0x2F9FC, 0x4AB2}, {0x2F9FD, 0x29496}, {0x2F9FE, 0x980B}, {0x2F9FF, 0x980B}, {0x2FA00, 0x9829}, {0x2FA01, 0x295B6}, {0x2FA02, 0x98E2}, {0x2FA03, 0x4B33},
-{0x2FA04, 0x9929}, {0x2FA05, 0x99A7}, {0x2FA06, 0x99C2}, {0x2FA07, 0x99FE}, {0x2FA08, 0x4BCE}, {0x2FA09, 0x29B30}, {0x2FA0A, 0x9B12}, {0x2FA0B, 0x9C40}, {0x2FA0C, 0x9CFD}, {0x2FA0D, 0x4CCE},
-{0x2FA0E, 0x4CED}, {0x2FA0F, 0x9D67}, {0x2FA10, 0x2A0CE}, {0x2FA11, 0x4CF8}, {0x2FA12, 0x2A105}, {0x2FA13, 0x2A20E}, {0x2FA14, 0x2A291}, {0x2FA15, 0x9EBB}, {0x2FA16, 0x4D56}, {0x2FA17, 0x9EF9},
-{0x2FA18, 0x9EFE}, {0x2FA19, 0x9F05}, {0x2FA1A, 0x9F0F}, {0x2FA1B, 0x9F16}, {0x2FA1D, 0x2A600},
-};
-
-static std::string codepoint_to_utf8(uint32_t cp) {
-    std::string result;
-    if (/* 0x00 <= cp && */ cp <= 0x7f) {
-        result.push_back(cp);
-    }
-    else if (0x80 <= cp && cp <= 0x7ff) {
-        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
-        result.push_back(0x80 | (cp & 0x3f));
-    }
-    else if (0x800 <= cp && cp <= 0xffff) {
-        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
-    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.push_back(0xf0 | ((cp >> 18) & 0x07));
-        result.push_back(0x80 | ((cp >> 12) & 0x3f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
-    }
-    else {
-        throw std::invalid_argument("invalid codepoint");
-    }
-    return result;
-}
-
-static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
-    std::string result;
-    for (size_t i = 0; i < cps.size(); ++i) {
-        result.append(codepoint_to_utf8(cps[i]));
-    }
-    return result;
-}
-
-static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
-    assert(offset < utf8.size());
-    if (!(utf8[offset + 0] & 0x80)) {
-        auto result = utf8[offset + 0];
-        offset += 1;
-        return result;
-    }
-    if (!(utf8[offset + 0] & 0x40)) {
-        throw std::invalid_argument("invalid character");
-    }
-    if (!(utf8[offset + 0] & 0x20)) {
-        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
-            throw std::invalid_argument("invalid character");
-        }
-        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
-        offset += 2;
-        return result;
-    }
-    if (!(utf8[offset + 0] & 0x10)) {
-        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
-            throw std::invalid_argument("invalid character");
-        }
-        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
-        offset += 3;
-        return result;
-    }
-    if (!(utf8[offset + 0] & 0x08)) {
-        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
-            throw std::invalid_argument("invalid character");
-        }
-        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
-        offset += 4;
-        return result;
-    }
-    throw std::invalid_argument("invalid string");
-}
-
-static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
-    std::vector<uint32_t> result;
-    size_t offset = 0;
-    while (offset < utf8.size()) {
-        result.push_back(codepoint_from_utf8(utf8, offset));
-    }
-    return result;
-}
-
-static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
-    std::vector<uint16_t> result;
-    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-        result.emplace_back(cp);
-    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
-    }
-    else {
-        throw std::invalid_argument("invalid codepoint");
-    }
-    return result;
-}
-
-static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
-    std::vector<uint16_t> result;
-    for (size_t i = 0; i < cps.size(); ++i) {
-        auto temp = codepoint_to_utf16(cps[i]);
-        result.insert(result.end(), temp.begin(), temp.end());
-    }
-    return result;
-}
-
-static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
-    assert(offset < utf16.size());
-    if (((utf16[0] >> 10) << 10) != 0xd800) {
-        auto result = utf16[offset + 0];
-        offset += 1;
-        return result;
-    }
-
-    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
-        throw std::invalid_argument("invalid character");
-    }
-
-    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-    offset += 2;
-    return result;
-}
-
-static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
-    std::vector<uint32_t> result;
-    size_t offset = 0;
-    while (offset < utf16.size()) {
-        result.push_back(codepoint_from_utf16(utf16, offset));
-    }
-    return result;
-}
-
-#define CODEPOINT_TYPE_UNIDENTIFIED 0
-#define CODEPOINT_TYPE_DIGIT 1
-#define CODEPOINT_TYPE_LETTER 2
-#define CODEPOINT_TYPE_WHITESPACE 3
-#define CODEPOINT_TYPE_ACCENT_MARK 4
-#define CODEPOINT_TYPE_PUNCTUATION 5
-#define CODEPOINT_TYPE_SYMBOL 6
-#define CODEPOINT_TYPE_CONTROL 7
-
-static std::unordered_map<uint32_t, int> codepoint_type_map() {
-    std::unordered_map<uint32_t, int> codepoint_types;
-    for (auto p : digit_ranges) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
-        }
-    }
-    for (auto p : letter_ranges) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            codepoint_types[i] = CODEPOINT_TYPE_LETTER;
-        }
-    }
-    for (auto p : whitespace_ranges) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
-        }
-    }
-    for (auto p : accent_mark_ranges) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
-        }
-    }
-    for (auto p : punctuation_ranges) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
-        }
-    }
-    for  (auto p : symbol_ranges) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
-        }
-    }
-    for (auto p : control_ranges) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
-        }
-    }
-    return codepoint_types;
-}
-
-static int codepoint_type(uint32_t cp) {
-    static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
-    const auto it = codepoint_types.find(cp);
-    return it == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
-}
-
-static int codepoint_type(const std::string & utf8) {
-    if (utf8.length() == 0) {
-        return CODEPOINT_TYPE_UNIDENTIFIED;
-    }
-    size_t offset = 0;
-    return codepoint_type(codepoint_from_utf8(utf8, offset));
-}
-
-static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
-    std::unordered_map<uint8_t, std::string> map;
-    for (int ch = u'!'; ch <= u'~'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[ch] = codepoint_to_utf8(ch);
-    }
-    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[ch] = codepoint_to_utf8(ch);
-    }
-    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[ch] = codepoint_to_utf8(ch);
-    }
-    auto n = 0;
-    for (int ch = 0; ch < 256; ++ch) {
-        if (map.find(ch) == map.end()) {
-            map[ch] = codepoint_to_utf8(256 + n);
-            ++n;
-        }
-    }
-    return map;
-}
-
-static std::string bytes_to_unicode_bpe(uint8_t byte) {
-    static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map_bpe();
-    return map.at(byte);
-}
-
-static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() {
-    std::unordered_map<std::string, uint8_t> map;
-    for (int ch = u'!'; ch <= u'~'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[codepoint_to_utf8(ch)] = ch;
-    }
-    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[codepoint_to_utf8(ch)] = ch;
-    }
-    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
-        assert(0 <= ch && ch < 256);
-        map[codepoint_to_utf8(ch)] = ch;
-    }
-    auto n = 0;
-    for (int ch = 0; ch < 256; ++ch) {
-        if (map.find(codepoint_to_utf8(ch)) == map.end()) {
-            map[codepoint_to_utf8(256 + n)] = ch;
-            ++n;
-        }
-    }
-    return map;
-}
-
-static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
-    static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map_bpe();
-    return map.at(utf8);
-}
-